{ "best_metric": 0.29454880952835083, "best_model_checkpoint": "./cifar100_outputs/checkpoint-47817", "epoch": 10.0, "eval_steps": 500, "global_step": 53130, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 2.1900806427001953, "learning_rate": 1.9996235648409565e-05, "loss": 4.6367, "step": 10 }, { "epoch": 0.0, "grad_norm": 2.6411144733428955, "learning_rate": 1.9992471296819125e-05, "loss": 4.5998, "step": 20 }, { "epoch": 0.01, "grad_norm": 2.594529151916504, "learning_rate": 1.9988706945228688e-05, "loss": 4.5914, "step": 30 }, { "epoch": 0.01, "grad_norm": 2.5973572731018066, "learning_rate": 1.9984942593638247e-05, "loss": 4.6124, "step": 40 }, { "epoch": 0.01, "grad_norm": 2.887483596801758, "learning_rate": 1.998117824204781e-05, "loss": 4.6123, "step": 50 }, { "epoch": 0.01, "grad_norm": 2.584811210632324, "learning_rate": 1.997741389045737e-05, "loss": 4.6138, "step": 60 }, { "epoch": 0.01, "grad_norm": 2.9465954303741455, "learning_rate": 1.9973649538866934e-05, "loss": 4.5905, "step": 70 }, { "epoch": 0.02, "grad_norm": 2.4353935718536377, "learning_rate": 1.9969885187276493e-05, "loss": 4.5707, "step": 80 }, { "epoch": 0.02, "grad_norm": 2.8148584365844727, "learning_rate": 1.9966120835686053e-05, "loss": 4.5688, "step": 90 }, { "epoch": 0.02, "grad_norm": 2.962184429168701, "learning_rate": 1.9962356484095616e-05, "loss": 4.5784, "step": 100 }, { "epoch": 0.02, "grad_norm": 2.464444637298584, "learning_rate": 1.9958592132505176e-05, "loss": 4.5557, "step": 110 }, { "epoch": 0.02, "grad_norm": 3.1651523113250732, "learning_rate": 1.995482778091474e-05, "loss": 4.5485, "step": 120 }, { "epoch": 0.02, "grad_norm": 2.603081226348877, "learning_rate": 1.99510634293243e-05, "loss": 4.5642, "step": 130 }, { "epoch": 0.03, "grad_norm": 2.5633766651153564, "learning_rate": 1.9947299077733862e-05, "loss": 4.5619, "step": 140 }, { "epoch": 0.03, "grad_norm": 3.072338342666626, "learning_rate": 1.9943534726143422e-05, "loss": 4.5694, "step": 150 }, { "epoch": 0.03, "grad_norm": 2.9631261825561523, "learning_rate": 1.9939770374552985e-05, "loss": 4.554, "step": 160 }, { "epoch": 0.03, "grad_norm": 2.998347043991089, "learning_rate": 1.9936006022962545e-05, "loss": 4.5413, "step": 170 }, { "epoch": 0.03, "grad_norm": 2.716461420059204, "learning_rate": 1.9932241671372108e-05, "loss": 4.5095, "step": 180 }, { "epoch": 0.04, "grad_norm": 2.613055944442749, "learning_rate": 1.992847731978167e-05, "loss": 4.5277, "step": 190 }, { "epoch": 0.04, "grad_norm": 2.56919002532959, "learning_rate": 1.992471296819123e-05, "loss": 4.5149, "step": 200 }, { "epoch": 0.04, "grad_norm": 3.0990872383117676, "learning_rate": 1.9920948616600794e-05, "loss": 4.5029, "step": 210 }, { "epoch": 0.04, "grad_norm": 2.658292770385742, "learning_rate": 1.9917184265010354e-05, "loss": 4.4927, "step": 220 }, { "epoch": 0.04, "grad_norm": 2.613175868988037, "learning_rate": 1.9913419913419917e-05, "loss": 4.46, "step": 230 }, { "epoch": 0.05, "grad_norm": 2.9422857761383057, "learning_rate": 1.9909655561829477e-05, "loss": 4.4698, "step": 240 }, { "epoch": 0.05, "grad_norm": 3.384212017059326, "learning_rate": 1.990589121023904e-05, "loss": 4.4693, "step": 250 }, { "epoch": 0.05, "grad_norm": 2.8845982551574707, "learning_rate": 1.99021268586486e-05, "loss": 4.4814, "step": 260 }, { "epoch": 0.05, "grad_norm": 2.9230496883392334, "learning_rate": 1.989836250705816e-05, "loss": 4.4504, "step": 270 }, { "epoch": 0.05, "grad_norm": 3.3872103691101074, "learning_rate": 1.9894598155467723e-05, "loss": 4.4218, "step": 280 }, { "epoch": 0.05, "grad_norm": 3.033067226409912, "learning_rate": 1.9890833803877282e-05, "loss": 4.4298, "step": 290 }, { "epoch": 0.06, "grad_norm": 2.819579601287842, "learning_rate": 1.9887069452286845e-05, "loss": 4.4118, "step": 300 }, { "epoch": 0.06, "grad_norm": 2.8507134914398193, "learning_rate": 1.9883305100696405e-05, "loss": 4.4365, "step": 310 }, { "epoch": 0.06, "grad_norm": 2.762634038925171, "learning_rate": 1.987954074910597e-05, "loss": 4.4232, "step": 320 }, { "epoch": 0.06, "grad_norm": 3.0568439960479736, "learning_rate": 1.9875776397515528e-05, "loss": 4.3622, "step": 330 }, { "epoch": 0.06, "grad_norm": 2.941974639892578, "learning_rate": 1.987201204592509e-05, "loss": 4.4076, "step": 340 }, { "epoch": 0.07, "grad_norm": 3.0038816928863525, "learning_rate": 1.986824769433465e-05, "loss": 4.3652, "step": 350 }, { "epoch": 0.07, "grad_norm": 2.681337356567383, "learning_rate": 1.9864483342744214e-05, "loss": 4.3819, "step": 360 }, { "epoch": 0.07, "grad_norm": 2.8940606117248535, "learning_rate": 1.9860718991153774e-05, "loss": 4.3638, "step": 370 }, { "epoch": 0.07, "grad_norm": 3.30637526512146, "learning_rate": 1.9856954639563337e-05, "loss": 4.385, "step": 380 }, { "epoch": 0.07, "grad_norm": 3.1481170654296875, "learning_rate": 1.98531902879729e-05, "loss": 4.3957, "step": 390 }, { "epoch": 0.08, "grad_norm": 2.795309543609619, "learning_rate": 1.984942593638246e-05, "loss": 4.3601, "step": 400 }, { "epoch": 0.08, "grad_norm": 3.1323978900909424, "learning_rate": 1.9845661584792023e-05, "loss": 4.3536, "step": 410 }, { "epoch": 0.08, "grad_norm": 3.0077526569366455, "learning_rate": 1.9841897233201583e-05, "loss": 4.3306, "step": 420 }, { "epoch": 0.08, "grad_norm": 3.095212936401367, "learning_rate": 1.9838132881611146e-05, "loss": 4.3473, "step": 430 }, { "epoch": 0.08, "grad_norm": 3.200291633605957, "learning_rate": 1.9834368530020706e-05, "loss": 4.2868, "step": 440 }, { "epoch": 0.08, "grad_norm": 3.210118532180786, "learning_rate": 1.9830604178430266e-05, "loss": 4.3114, "step": 450 }, { "epoch": 0.09, "grad_norm": 3.286947011947632, "learning_rate": 1.982683982683983e-05, "loss": 4.3222, "step": 460 }, { "epoch": 0.09, "grad_norm": 2.8678014278411865, "learning_rate": 1.982307547524939e-05, "loss": 4.2983, "step": 470 }, { "epoch": 0.09, "grad_norm": 3.726501703262329, "learning_rate": 1.9819311123658952e-05, "loss": 4.2952, "step": 480 }, { "epoch": 0.09, "grad_norm": 2.8604493141174316, "learning_rate": 1.981554677206851e-05, "loss": 4.2516, "step": 490 }, { "epoch": 0.09, "grad_norm": 2.8486504554748535, "learning_rate": 1.9811782420478075e-05, "loss": 4.2516, "step": 500 }, { "epoch": 0.1, "grad_norm": 2.9079558849334717, "learning_rate": 1.9808018068887634e-05, "loss": 4.2404, "step": 510 }, { "epoch": 0.1, "grad_norm": 3.027830123901367, "learning_rate": 1.9804253717297198e-05, "loss": 4.2424, "step": 520 }, { "epoch": 0.1, "grad_norm": 3.154351234436035, "learning_rate": 1.9800489365706757e-05, "loss": 4.21, "step": 530 }, { "epoch": 0.1, "grad_norm": 3.0818393230438232, "learning_rate": 1.979672501411632e-05, "loss": 4.242, "step": 540 }, { "epoch": 0.1, "grad_norm": 3.877248525619507, "learning_rate": 1.979296066252588e-05, "loss": 4.2019, "step": 550 }, { "epoch": 0.11, "grad_norm": 3.109707832336426, "learning_rate": 1.9789196310935443e-05, "loss": 4.1811, "step": 560 }, { "epoch": 0.11, "grad_norm": 3.032790184020996, "learning_rate": 1.9785431959345007e-05, "loss": 4.1958, "step": 570 }, { "epoch": 0.11, "grad_norm": 3.237541913986206, "learning_rate": 1.9781667607754566e-05, "loss": 4.1735, "step": 580 }, { "epoch": 0.11, "grad_norm": 3.851644992828369, "learning_rate": 1.977790325616413e-05, "loss": 4.1478, "step": 590 }, { "epoch": 0.11, "grad_norm": 2.9260008335113525, "learning_rate": 1.977413890457369e-05, "loss": 4.1436, "step": 600 }, { "epoch": 0.11, "grad_norm": 3.1182501316070557, "learning_rate": 1.977037455298325e-05, "loss": 4.1768, "step": 610 }, { "epoch": 0.12, "grad_norm": 3.236636161804199, "learning_rate": 1.9766610201392812e-05, "loss": 4.1488, "step": 620 }, { "epoch": 0.12, "grad_norm": 2.9359562397003174, "learning_rate": 1.9762845849802372e-05, "loss": 4.1619, "step": 630 }, { "epoch": 0.12, "grad_norm": 2.9437806606292725, "learning_rate": 1.9759081498211935e-05, "loss": 4.092, "step": 640 }, { "epoch": 0.12, "grad_norm": 3.566126585006714, "learning_rate": 1.9755317146621495e-05, "loss": 4.1258, "step": 650 }, { "epoch": 0.12, "grad_norm": 3.742216110229492, "learning_rate": 1.9751552795031058e-05, "loss": 4.0587, "step": 660 }, { "epoch": 0.13, "grad_norm": 2.9798004627227783, "learning_rate": 1.9747788443440618e-05, "loss": 4.0615, "step": 670 }, { "epoch": 0.13, "grad_norm": 3.074636697769165, "learning_rate": 1.974402409185018e-05, "loss": 4.0414, "step": 680 }, { "epoch": 0.13, "grad_norm": 3.1997222900390625, "learning_rate": 1.974025974025974e-05, "loss": 4.0918, "step": 690 }, { "epoch": 0.13, "grad_norm": 3.321722984313965, "learning_rate": 1.9736495388669304e-05, "loss": 4.0527, "step": 700 }, { "epoch": 0.13, "grad_norm": 3.0019032955169678, "learning_rate": 1.9732731037078864e-05, "loss": 3.9662, "step": 710 }, { "epoch": 0.14, "grad_norm": 2.8338370323181152, "learning_rate": 1.9728966685488427e-05, "loss": 4.0178, "step": 720 }, { "epoch": 0.14, "grad_norm": 3.2571487426757812, "learning_rate": 1.9725202333897987e-05, "loss": 4.0006, "step": 730 }, { "epoch": 0.14, "grad_norm": 2.9352447986602783, "learning_rate": 1.972143798230755e-05, "loss": 4.0078, "step": 740 }, { "epoch": 0.14, "grad_norm": 4.793808460235596, "learning_rate": 1.9717673630717113e-05, "loss": 4.0501, "step": 750 }, { "epoch": 0.14, "grad_norm": 6.661139488220215, "learning_rate": 1.9713909279126673e-05, "loss": 3.9689, "step": 760 }, { "epoch": 0.14, "grad_norm": 3.5628387928009033, "learning_rate": 1.9710144927536236e-05, "loss": 3.9937, "step": 770 }, { "epoch": 0.15, "grad_norm": 3.0853545665740967, "learning_rate": 1.9706380575945796e-05, "loss": 3.9623, "step": 780 }, { "epoch": 0.15, "grad_norm": 3.3460278511047363, "learning_rate": 1.9702616224355355e-05, "loss": 4.0009, "step": 790 }, { "epoch": 0.15, "grad_norm": 3.079176425933838, "learning_rate": 1.969885187276492e-05, "loss": 3.9128, "step": 800 }, { "epoch": 0.15, "grad_norm": 3.0661251544952393, "learning_rate": 1.9695087521174478e-05, "loss": 3.9185, "step": 810 }, { "epoch": 0.15, "grad_norm": 2.881580352783203, "learning_rate": 1.969132316958404e-05, "loss": 3.8848, "step": 820 }, { "epoch": 0.16, "grad_norm": 4.219837665557861, "learning_rate": 1.96875588179936e-05, "loss": 3.9397, "step": 830 }, { "epoch": 0.16, "grad_norm": 3.501627206802368, "learning_rate": 1.9683794466403164e-05, "loss": 3.9385, "step": 840 }, { "epoch": 0.16, "grad_norm": 3.5525853633880615, "learning_rate": 1.9680030114812724e-05, "loss": 3.9145, "step": 850 }, { "epoch": 0.16, "grad_norm": 3.5347719192504883, "learning_rate": 1.9676265763222287e-05, "loss": 3.9678, "step": 860 }, { "epoch": 0.16, "grad_norm": 5.558157920837402, "learning_rate": 1.9672501411631847e-05, "loss": 3.8212, "step": 870 }, { "epoch": 0.17, "grad_norm": 3.4307472705841064, "learning_rate": 1.966873706004141e-05, "loss": 3.8341, "step": 880 }, { "epoch": 0.17, "grad_norm": 4.318007469177246, "learning_rate": 1.966497270845097e-05, "loss": 3.8516, "step": 890 }, { "epoch": 0.17, "grad_norm": 4.074690341949463, "learning_rate": 1.9661208356860533e-05, "loss": 3.8625, "step": 900 }, { "epoch": 0.17, "grad_norm": 3.376126289367676, "learning_rate": 1.9657444005270093e-05, "loss": 3.8391, "step": 910 }, { "epoch": 0.17, "grad_norm": 5.645514011383057, "learning_rate": 1.9653679653679656e-05, "loss": 3.8403, "step": 920 }, { "epoch": 0.18, "grad_norm": 3.457108497619629, "learning_rate": 1.964991530208922e-05, "loss": 3.8908, "step": 930 }, { "epoch": 0.18, "grad_norm": 2.9738640785217285, "learning_rate": 1.964615095049878e-05, "loss": 3.7746, "step": 940 }, { "epoch": 0.18, "grad_norm": 4.399070739746094, "learning_rate": 1.9642386598908342e-05, "loss": 3.7559, "step": 950 }, { "epoch": 0.18, "grad_norm": 4.6740546226501465, "learning_rate": 1.96386222473179e-05, "loss": 3.7348, "step": 960 }, { "epoch": 0.18, "grad_norm": 3.3767216205596924, "learning_rate": 1.963485789572746e-05, "loss": 3.6984, "step": 970 }, { "epoch": 0.18, "grad_norm": 3.3283941745758057, "learning_rate": 1.9631093544137025e-05, "loss": 3.756, "step": 980 }, { "epoch": 0.19, "grad_norm": 8.651248931884766, "learning_rate": 1.9627329192546585e-05, "loss": 3.7782, "step": 990 }, { "epoch": 0.19, "grad_norm": 4.265860080718994, "learning_rate": 1.9623564840956148e-05, "loss": 3.787, "step": 1000 }, { "epoch": 0.19, "grad_norm": 3.8226921558380127, "learning_rate": 1.9619800489365707e-05, "loss": 3.8101, "step": 1010 }, { "epoch": 0.19, "grad_norm": 3.2111263275146484, "learning_rate": 1.961603613777527e-05, "loss": 3.737, "step": 1020 }, { "epoch": 0.19, "grad_norm": 5.9517107009887695, "learning_rate": 1.961227178618483e-05, "loss": 3.7126, "step": 1030 }, { "epoch": 0.2, "grad_norm": 5.050525665283203, "learning_rate": 1.9608507434594394e-05, "loss": 3.7004, "step": 1040 }, { "epoch": 0.2, "grad_norm": 2.8831429481506348, "learning_rate": 1.9604743083003953e-05, "loss": 3.6664, "step": 1050 }, { "epoch": 0.2, "grad_norm": 4.123471260070801, "learning_rate": 1.9600978731413516e-05, "loss": 3.7305, "step": 1060 }, { "epoch": 0.2, "grad_norm": 3.836909770965576, "learning_rate": 1.9597214379823076e-05, "loss": 3.7419, "step": 1070 }, { "epoch": 0.2, "grad_norm": 3.690175771713257, "learning_rate": 1.959345002823264e-05, "loss": 3.6338, "step": 1080 }, { "epoch": 0.21, "grad_norm": 3.598996639251709, "learning_rate": 1.95896856766422e-05, "loss": 3.7188, "step": 1090 }, { "epoch": 0.21, "grad_norm": 5.010788917541504, "learning_rate": 1.9585921325051762e-05, "loss": 3.7787, "step": 1100 }, { "epoch": 0.21, "grad_norm": 4.156946182250977, "learning_rate": 1.9582156973461322e-05, "loss": 3.6394, "step": 1110 }, { "epoch": 0.21, "grad_norm": 3.7809722423553467, "learning_rate": 1.9578392621870885e-05, "loss": 3.6945, "step": 1120 }, { "epoch": 0.21, "grad_norm": 7.646225929260254, "learning_rate": 1.957462827028045e-05, "loss": 3.5862, "step": 1130 }, { "epoch": 0.21, "grad_norm": 3.2124147415161133, "learning_rate": 1.9570863918690005e-05, "loss": 3.6443, "step": 1140 }, { "epoch": 0.22, "grad_norm": 8.297073364257812, "learning_rate": 1.9567099567099568e-05, "loss": 3.7029, "step": 1150 }, { "epoch": 0.22, "grad_norm": 4.9652862548828125, "learning_rate": 1.956333521550913e-05, "loss": 3.6561, "step": 1160 }, { "epoch": 0.22, "grad_norm": 5.712035179138184, "learning_rate": 1.955957086391869e-05, "loss": 3.5105, "step": 1170 }, { "epoch": 0.22, "grad_norm": 3.600637912750244, "learning_rate": 1.9555806512328254e-05, "loss": 3.5531, "step": 1180 }, { "epoch": 0.22, "grad_norm": 4.265107154846191, "learning_rate": 1.9552042160737814e-05, "loss": 3.6188, "step": 1190 }, { "epoch": 0.23, "grad_norm": 6.627386093139648, "learning_rate": 1.9548277809147377e-05, "loss": 3.5911, "step": 1200 }, { "epoch": 0.23, "grad_norm": 4.785702228546143, "learning_rate": 1.9544513457556937e-05, "loss": 3.5837, "step": 1210 }, { "epoch": 0.23, "grad_norm": 3.3369293212890625, "learning_rate": 1.95407491059665e-05, "loss": 3.6222, "step": 1220 }, { "epoch": 0.23, "grad_norm": 3.7859039306640625, "learning_rate": 1.953698475437606e-05, "loss": 3.5098, "step": 1230 }, { "epoch": 0.23, "grad_norm": 3.6498477458953857, "learning_rate": 1.9533220402785623e-05, "loss": 3.4831, "step": 1240 }, { "epoch": 0.24, "grad_norm": 4.522767543792725, "learning_rate": 1.9529456051195183e-05, "loss": 3.6801, "step": 1250 }, { "epoch": 0.24, "grad_norm": 3.2972137928009033, "learning_rate": 1.9525691699604746e-05, "loss": 3.3404, "step": 1260 }, { "epoch": 0.24, "grad_norm": 8.638528823852539, "learning_rate": 1.9521927348014305e-05, "loss": 3.5866, "step": 1270 }, { "epoch": 0.24, "grad_norm": 6.175476551055908, "learning_rate": 1.951816299642387e-05, "loss": 3.509, "step": 1280 }, { "epoch": 0.24, "grad_norm": 3.4966530799865723, "learning_rate": 1.951439864483343e-05, "loss": 3.4481, "step": 1290 }, { "epoch": 0.24, "grad_norm": 3.191817283630371, "learning_rate": 1.951063429324299e-05, "loss": 3.622, "step": 1300 }, { "epoch": 0.25, "grad_norm": 4.43389368057251, "learning_rate": 1.950686994165255e-05, "loss": 3.5212, "step": 1310 }, { "epoch": 0.25, "grad_norm": 7.5795392990112305, "learning_rate": 1.950310559006211e-05, "loss": 3.6054, "step": 1320 }, { "epoch": 0.25, "grad_norm": 3.8179938793182373, "learning_rate": 1.9499341238471674e-05, "loss": 3.3792, "step": 1330 }, { "epoch": 0.25, "grad_norm": 4.200837135314941, "learning_rate": 1.9495576886881234e-05, "loss": 3.5091, "step": 1340 }, { "epoch": 0.25, "grad_norm": 3.6297414302825928, "learning_rate": 1.9491812535290797e-05, "loss": 3.4068, "step": 1350 }, { "epoch": 0.26, "grad_norm": 3.7957377433776855, "learning_rate": 1.948804818370036e-05, "loss": 3.4862, "step": 1360 }, { "epoch": 0.26, "grad_norm": 3.713080644607544, "learning_rate": 1.948428383210992e-05, "loss": 3.4861, "step": 1370 }, { "epoch": 0.26, "grad_norm": 5.54100227355957, "learning_rate": 1.9480519480519483e-05, "loss": 3.4529, "step": 1380 }, { "epoch": 0.26, "grad_norm": 3.8583154678344727, "learning_rate": 1.9476755128929043e-05, "loss": 3.4531, "step": 1390 }, { "epoch": 0.26, "grad_norm": 4.6709675788879395, "learning_rate": 1.9472990777338606e-05, "loss": 3.4798, "step": 1400 }, { "epoch": 0.27, "grad_norm": 3.9218077659606934, "learning_rate": 1.9469226425748166e-05, "loss": 3.4589, "step": 1410 }, { "epoch": 0.27, "grad_norm": 3.0315423011779785, "learning_rate": 1.946546207415773e-05, "loss": 3.3474, "step": 1420 }, { "epoch": 0.27, "grad_norm": 4.207186222076416, "learning_rate": 1.946169772256729e-05, "loss": 3.3297, "step": 1430 }, { "epoch": 0.27, "grad_norm": 3.950998544692993, "learning_rate": 1.9457933370976852e-05, "loss": 3.4374, "step": 1440 }, { "epoch": 0.27, "grad_norm": 4.596767902374268, "learning_rate": 1.9454169019386412e-05, "loss": 3.417, "step": 1450 }, { "epoch": 0.27, "grad_norm": 4.453456878662109, "learning_rate": 1.9450404667795975e-05, "loss": 3.2764, "step": 1460 }, { "epoch": 0.28, "grad_norm": 4.382648944854736, "learning_rate": 1.9446640316205535e-05, "loss": 3.4189, "step": 1470 }, { "epoch": 0.28, "grad_norm": 9.689038276672363, "learning_rate": 1.9442875964615098e-05, "loss": 3.4015, "step": 1480 }, { "epoch": 0.28, "grad_norm": 4.036463737487793, "learning_rate": 1.9439111613024658e-05, "loss": 3.3156, "step": 1490 }, { "epoch": 0.28, "grad_norm": 3.8814964294433594, "learning_rate": 1.9435347261434217e-05, "loss": 3.1903, "step": 1500 }, { "epoch": 0.28, "grad_norm": 4.564530849456787, "learning_rate": 1.943158290984378e-05, "loss": 3.3632, "step": 1510 }, { "epoch": 0.29, "grad_norm": 4.282833576202393, "learning_rate": 1.942781855825334e-05, "loss": 3.4336, "step": 1520 }, { "epoch": 0.29, "grad_norm": 4.326153755187988, "learning_rate": 1.9424054206662903e-05, "loss": 3.4133, "step": 1530 }, { "epoch": 0.29, "grad_norm": 8.867426872253418, "learning_rate": 1.9420289855072467e-05, "loss": 3.3124, "step": 1540 }, { "epoch": 0.29, "grad_norm": 4.446542739868164, "learning_rate": 1.9416525503482026e-05, "loss": 3.2558, "step": 1550 }, { "epoch": 0.29, "grad_norm": 10.327835083007812, "learning_rate": 1.941276115189159e-05, "loss": 3.3257, "step": 1560 }, { "epoch": 0.3, "grad_norm": 4.354964256286621, "learning_rate": 1.940899680030115e-05, "loss": 3.2928, "step": 1570 }, { "epoch": 0.3, "grad_norm": 4.222498893737793, "learning_rate": 1.9405232448710712e-05, "loss": 3.3359, "step": 1580 }, { "epoch": 0.3, "grad_norm": 3.6612141132354736, "learning_rate": 1.9401468097120272e-05, "loss": 3.2173, "step": 1590 }, { "epoch": 0.3, "grad_norm": 3.745842695236206, "learning_rate": 1.9397703745529835e-05, "loss": 3.1372, "step": 1600 }, { "epoch": 0.3, "grad_norm": 8.753840446472168, "learning_rate": 1.9393939393939395e-05, "loss": 3.321, "step": 1610 }, { "epoch": 0.3, "grad_norm": 4.043251037597656, "learning_rate": 1.9390175042348958e-05, "loss": 3.3141, "step": 1620 }, { "epoch": 0.31, "grad_norm": 7.991616249084473, "learning_rate": 1.9386410690758518e-05, "loss": 3.2403, "step": 1630 }, { "epoch": 0.31, "grad_norm": 4.049126148223877, "learning_rate": 1.938264633916808e-05, "loss": 3.2457, "step": 1640 }, { "epoch": 0.31, "grad_norm": 3.2904117107391357, "learning_rate": 1.937888198757764e-05, "loss": 3.3252, "step": 1650 }, { "epoch": 0.31, "grad_norm": 4.309184551239014, "learning_rate": 1.93751176359872e-05, "loss": 3.1543, "step": 1660 }, { "epoch": 0.31, "grad_norm": 3.324444532394409, "learning_rate": 1.9371353284396764e-05, "loss": 3.3012, "step": 1670 }, { "epoch": 0.32, "grad_norm": 4.101102352142334, "learning_rate": 1.9367588932806324e-05, "loss": 3.2415, "step": 1680 }, { "epoch": 0.32, "grad_norm": 4.955221652984619, "learning_rate": 1.9363824581215887e-05, "loss": 3.1479, "step": 1690 }, { "epoch": 0.32, "grad_norm": 7.092071056365967, "learning_rate": 1.9360060229625447e-05, "loss": 3.2071, "step": 1700 }, { "epoch": 0.32, "grad_norm": 4.205142021179199, "learning_rate": 1.935629587803501e-05, "loss": 3.1963, "step": 1710 }, { "epoch": 0.32, "grad_norm": 4.192953109741211, "learning_rate": 1.9352531526444573e-05, "loss": 3.0651, "step": 1720 }, { "epoch": 0.33, "grad_norm": 4.651326656341553, "learning_rate": 1.9348767174854133e-05, "loss": 3.3063, "step": 1730 }, { "epoch": 0.33, "grad_norm": 5.780113697052002, "learning_rate": 1.9345002823263696e-05, "loss": 3.1667, "step": 1740 }, { "epoch": 0.33, "grad_norm": 5.112650394439697, "learning_rate": 1.9341238471673256e-05, "loss": 3.2213, "step": 1750 }, { "epoch": 0.33, "grad_norm": 5.035951137542725, "learning_rate": 1.933747412008282e-05, "loss": 3.2067, "step": 1760 }, { "epoch": 0.33, "grad_norm": 4.030355930328369, "learning_rate": 1.933370976849238e-05, "loss": 3.1413, "step": 1770 }, { "epoch": 0.34, "grad_norm": 6.994670391082764, "learning_rate": 1.932994541690194e-05, "loss": 3.1571, "step": 1780 }, { "epoch": 0.34, "grad_norm": 6.542640686035156, "learning_rate": 1.93261810653115e-05, "loss": 3.0721, "step": 1790 }, { "epoch": 0.34, "grad_norm": 3.707923173904419, "learning_rate": 1.9322416713721065e-05, "loss": 3.0077, "step": 1800 }, { "epoch": 0.34, "grad_norm": 9.954961776733398, "learning_rate": 1.9318652362130624e-05, "loss": 3.1853, "step": 1810 }, { "epoch": 0.34, "grad_norm": 4.040740013122559, "learning_rate": 1.9314888010540187e-05, "loss": 3.1621, "step": 1820 }, { "epoch": 0.34, "grad_norm": 3.269940137863159, "learning_rate": 1.9311123658949747e-05, "loss": 3.0776, "step": 1830 }, { "epoch": 0.35, "grad_norm": 6.951628684997559, "learning_rate": 1.9307359307359307e-05, "loss": 3.0865, "step": 1840 }, { "epoch": 0.35, "grad_norm": 4.079604625701904, "learning_rate": 1.930359495576887e-05, "loss": 3.1071, "step": 1850 }, { "epoch": 0.35, "grad_norm": 6.9071431159973145, "learning_rate": 1.929983060417843e-05, "loss": 3.1679, "step": 1860 }, { "epoch": 0.35, "grad_norm": 4.998239994049072, "learning_rate": 1.9296066252587993e-05, "loss": 2.9328, "step": 1870 }, { "epoch": 0.35, "grad_norm": 7.7196455001831055, "learning_rate": 1.9292301900997553e-05, "loss": 3.1693, "step": 1880 }, { "epoch": 0.36, "grad_norm": 6.726539611816406, "learning_rate": 1.9288537549407116e-05, "loss": 3.119, "step": 1890 }, { "epoch": 0.36, "grad_norm": 5.929136276245117, "learning_rate": 1.9284773197816676e-05, "loss": 3.0396, "step": 1900 }, { "epoch": 0.36, "grad_norm": 3.6181488037109375, "learning_rate": 1.928100884622624e-05, "loss": 2.9703, "step": 1910 }, { "epoch": 0.36, "grad_norm": 4.3397040367126465, "learning_rate": 1.9277244494635802e-05, "loss": 3.1033, "step": 1920 }, { "epoch": 0.36, "grad_norm": 7.7239484786987305, "learning_rate": 1.9273480143045362e-05, "loss": 3.001, "step": 1930 }, { "epoch": 0.37, "grad_norm": 3.317354440689087, "learning_rate": 1.9269715791454925e-05, "loss": 2.9419, "step": 1940 }, { "epoch": 0.37, "grad_norm": 9.748729705810547, "learning_rate": 1.9265951439864485e-05, "loss": 3.1594, "step": 1950 }, { "epoch": 0.37, "grad_norm": 4.6273064613342285, "learning_rate": 1.9262187088274048e-05, "loss": 2.9821, "step": 1960 }, { "epoch": 0.37, "grad_norm": 8.60408878326416, "learning_rate": 1.9258422736683608e-05, "loss": 3.1565, "step": 1970 }, { "epoch": 0.37, "grad_norm": 5.767194747924805, "learning_rate": 1.925465838509317e-05, "loss": 2.9479, "step": 1980 }, { "epoch": 0.37, "grad_norm": 13.294413566589355, "learning_rate": 1.925089403350273e-05, "loss": 3.0082, "step": 1990 }, { "epoch": 0.38, "grad_norm": 5.2884979248046875, "learning_rate": 1.9247129681912294e-05, "loss": 2.9004, "step": 2000 }, { "epoch": 0.38, "grad_norm": 6.24321174621582, "learning_rate": 1.9243365330321854e-05, "loss": 2.8892, "step": 2010 }, { "epoch": 0.38, "grad_norm": 4.944282054901123, "learning_rate": 1.9239600978731413e-05, "loss": 3.0362, "step": 2020 }, { "epoch": 0.38, "grad_norm": 4.824367046356201, "learning_rate": 1.9235836627140976e-05, "loss": 3.0755, "step": 2030 }, { "epoch": 0.38, "grad_norm": 5.201323509216309, "learning_rate": 1.9232072275550536e-05, "loss": 2.9256, "step": 2040 }, { "epoch": 0.39, "grad_norm": 6.465519428253174, "learning_rate": 1.92283079239601e-05, "loss": 2.8423, "step": 2050 }, { "epoch": 0.39, "grad_norm": 4.563433647155762, "learning_rate": 1.922454357236966e-05, "loss": 3.063, "step": 2060 }, { "epoch": 0.39, "grad_norm": 5.595127582550049, "learning_rate": 1.9220779220779222e-05, "loss": 2.89, "step": 2070 }, { "epoch": 0.39, "grad_norm": 5.29541540145874, "learning_rate": 1.9217014869188782e-05, "loss": 2.9488, "step": 2080 }, { "epoch": 0.39, "grad_norm": 10.494131088256836, "learning_rate": 1.9213250517598345e-05, "loss": 2.9473, "step": 2090 }, { "epoch": 0.4, "grad_norm": 7.640602111816406, "learning_rate": 1.920948616600791e-05, "loss": 2.9225, "step": 2100 }, { "epoch": 0.4, "grad_norm": 3.552557945251465, "learning_rate": 1.9205721814417468e-05, "loss": 2.9173, "step": 2110 }, { "epoch": 0.4, "grad_norm": 4.50587272644043, "learning_rate": 1.920195746282703e-05, "loss": 2.8535, "step": 2120 }, { "epoch": 0.4, "grad_norm": 7.200937271118164, "learning_rate": 1.919819311123659e-05, "loss": 2.9673, "step": 2130 }, { "epoch": 0.4, "grad_norm": 4.5765485763549805, "learning_rate": 1.9194428759646154e-05, "loss": 2.8064, "step": 2140 }, { "epoch": 0.4, "grad_norm": 5.502497673034668, "learning_rate": 1.9190664408055714e-05, "loss": 2.8947, "step": 2150 }, { "epoch": 0.41, "grad_norm": 19.691614151000977, "learning_rate": 1.9186900056465277e-05, "loss": 3.028, "step": 2160 }, { "epoch": 0.41, "grad_norm": 3.6259846687316895, "learning_rate": 1.9183135704874837e-05, "loss": 2.8456, "step": 2170 }, { "epoch": 0.41, "grad_norm": 4.246203422546387, "learning_rate": 1.9179371353284397e-05, "loss": 2.8903, "step": 2180 }, { "epoch": 0.41, "grad_norm": 5.460795879364014, "learning_rate": 1.917560700169396e-05, "loss": 2.9129, "step": 2190 }, { "epoch": 0.41, "grad_norm": 5.957250118255615, "learning_rate": 1.917184265010352e-05, "loss": 2.8096, "step": 2200 }, { "epoch": 0.42, "grad_norm": 9.38149642944336, "learning_rate": 1.9168078298513083e-05, "loss": 2.9384, "step": 2210 }, { "epoch": 0.42, "grad_norm": 3.8041586875915527, "learning_rate": 1.9164313946922643e-05, "loss": 2.7674, "step": 2220 }, { "epoch": 0.42, "grad_norm": 4.8668904304504395, "learning_rate": 1.9160549595332206e-05, "loss": 2.7463, "step": 2230 }, { "epoch": 0.42, "grad_norm": 4.3397698402404785, "learning_rate": 1.9156785243741765e-05, "loss": 2.8838, "step": 2240 }, { "epoch": 0.42, "grad_norm": 10.79181957244873, "learning_rate": 1.915302089215133e-05, "loss": 2.8304, "step": 2250 }, { "epoch": 0.43, "grad_norm": 6.324551582336426, "learning_rate": 1.914925654056089e-05, "loss": 2.8287, "step": 2260 }, { "epoch": 0.43, "grad_norm": 5.550736427307129, "learning_rate": 1.914549218897045e-05, "loss": 2.7671, "step": 2270 }, { "epoch": 0.43, "grad_norm": 5.587412357330322, "learning_rate": 1.9141727837380015e-05, "loss": 3.0649, "step": 2280 }, { "epoch": 0.43, "grad_norm": 4.18621301651001, "learning_rate": 1.9137963485789574e-05, "loss": 2.863, "step": 2290 }, { "epoch": 0.43, "grad_norm": 6.899294376373291, "learning_rate": 1.9134199134199138e-05, "loss": 2.9058, "step": 2300 }, { "epoch": 0.43, "grad_norm": 4.400335788726807, "learning_rate": 1.9130434782608697e-05, "loss": 2.7415, "step": 2310 }, { "epoch": 0.44, "grad_norm": 18.84127426147461, "learning_rate": 1.912667043101826e-05, "loss": 2.8094, "step": 2320 }, { "epoch": 0.44, "grad_norm": 11.325419425964355, "learning_rate": 1.912290607942782e-05, "loss": 2.7769, "step": 2330 }, { "epoch": 0.44, "grad_norm": 11.26507568359375, "learning_rate": 1.9119141727837383e-05, "loss": 2.8841, "step": 2340 }, { "epoch": 0.44, "grad_norm": 6.831374168395996, "learning_rate": 1.9115377376246943e-05, "loss": 2.832, "step": 2350 }, { "epoch": 0.44, "grad_norm": 14.735857963562012, "learning_rate": 1.9111613024656503e-05, "loss": 2.7335, "step": 2360 }, { "epoch": 0.45, "grad_norm": 8.749835968017578, "learning_rate": 1.9107848673066066e-05, "loss": 2.7282, "step": 2370 }, { "epoch": 0.45, "grad_norm": 4.772495269775391, "learning_rate": 1.9104084321475626e-05, "loss": 2.6789, "step": 2380 }, { "epoch": 0.45, "grad_norm": 3.5235466957092285, "learning_rate": 1.910031996988519e-05, "loss": 2.8611, "step": 2390 }, { "epoch": 0.45, "grad_norm": 3.360028028488159, "learning_rate": 1.909655561829475e-05, "loss": 2.6936, "step": 2400 }, { "epoch": 0.45, "grad_norm": 7.748390197753906, "learning_rate": 1.9092791266704312e-05, "loss": 2.6904, "step": 2410 }, { "epoch": 0.46, "grad_norm": 6.634457588195801, "learning_rate": 1.9089026915113872e-05, "loss": 2.9342, "step": 2420 }, { "epoch": 0.46, "grad_norm": 8.747628211975098, "learning_rate": 1.9085262563523435e-05, "loss": 2.7683, "step": 2430 }, { "epoch": 0.46, "grad_norm": 3.515625476837158, "learning_rate": 1.9081498211932995e-05, "loss": 2.7442, "step": 2440 }, { "epoch": 0.46, "grad_norm": 3.4130609035491943, "learning_rate": 1.9077733860342558e-05, "loss": 2.6693, "step": 2450 }, { "epoch": 0.46, "grad_norm": 4.92776346206665, "learning_rate": 1.907396950875212e-05, "loss": 2.733, "step": 2460 }, { "epoch": 0.46, "grad_norm": 11.010396957397461, "learning_rate": 1.907020515716168e-05, "loss": 2.6891, "step": 2470 }, { "epoch": 0.47, "grad_norm": 3.8040144443511963, "learning_rate": 1.9066440805571244e-05, "loss": 2.5664, "step": 2480 }, { "epoch": 0.47, "grad_norm": 6.726337909698486, "learning_rate": 1.9062676453980804e-05, "loss": 2.7416, "step": 2490 }, { "epoch": 0.47, "grad_norm": 7.524297714233398, "learning_rate": 1.9058912102390367e-05, "loss": 2.6905, "step": 2500 }, { "epoch": 0.47, "grad_norm": 3.6792001724243164, "learning_rate": 1.9055147750799927e-05, "loss": 2.5483, "step": 2510 }, { "epoch": 0.47, "grad_norm": 6.578388690948486, "learning_rate": 1.905138339920949e-05, "loss": 2.7227, "step": 2520 }, { "epoch": 0.48, "grad_norm": 2.9775335788726807, "learning_rate": 1.904761904761905e-05, "loss": 2.5024, "step": 2530 }, { "epoch": 0.48, "grad_norm": 10.870952606201172, "learning_rate": 1.904385469602861e-05, "loss": 2.622, "step": 2540 }, { "epoch": 0.48, "grad_norm": 8.206267356872559, "learning_rate": 1.9040090344438172e-05, "loss": 2.8171, "step": 2550 }, { "epoch": 0.48, "grad_norm": 6.320977210998535, "learning_rate": 1.9036325992847732e-05, "loss": 2.6359, "step": 2560 }, { "epoch": 0.48, "grad_norm": 7.017043113708496, "learning_rate": 1.9032561641257295e-05, "loss": 2.6382, "step": 2570 }, { "epoch": 0.49, "grad_norm": 4.2981061935424805, "learning_rate": 1.9028797289666855e-05, "loss": 2.5545, "step": 2580 }, { "epoch": 0.49, "grad_norm": 4.075060844421387, "learning_rate": 1.9025032938076418e-05, "loss": 2.6773, "step": 2590 }, { "epoch": 0.49, "grad_norm": 3.952000379562378, "learning_rate": 1.9021268586485978e-05, "loss": 2.4791, "step": 2600 }, { "epoch": 0.49, "grad_norm": 4.052979469299316, "learning_rate": 1.901750423489554e-05, "loss": 2.7196, "step": 2610 }, { "epoch": 0.49, "grad_norm": 4.638850212097168, "learning_rate": 1.90137398833051e-05, "loss": 2.7139, "step": 2620 }, { "epoch": 0.5, "grad_norm": 7.109199523925781, "learning_rate": 1.9009975531714664e-05, "loss": 2.5936, "step": 2630 }, { "epoch": 0.5, "grad_norm": 4.101054668426514, "learning_rate": 1.9006211180124224e-05, "loss": 2.5606, "step": 2640 }, { "epoch": 0.5, "grad_norm": 6.660862445831299, "learning_rate": 1.9002446828533787e-05, "loss": 2.5895, "step": 2650 }, { "epoch": 0.5, "grad_norm": 5.114608287811279, "learning_rate": 1.899868247694335e-05, "loss": 2.4963, "step": 2660 }, { "epoch": 0.5, "grad_norm": 11.300898551940918, "learning_rate": 1.899491812535291e-05, "loss": 2.7821, "step": 2670 }, { "epoch": 0.5, "grad_norm": 5.744706630706787, "learning_rate": 1.8991153773762473e-05, "loss": 2.5695, "step": 2680 }, { "epoch": 0.51, "grad_norm": 7.060439586639404, "learning_rate": 1.8987389422172033e-05, "loss": 2.4313, "step": 2690 }, { "epoch": 0.51, "grad_norm": 4.207651138305664, "learning_rate": 1.8983625070581596e-05, "loss": 2.4718, "step": 2700 }, { "epoch": 0.51, "grad_norm": 6.556578636169434, "learning_rate": 1.8979860718991156e-05, "loss": 2.5789, "step": 2710 }, { "epoch": 0.51, "grad_norm": 7.473084449768066, "learning_rate": 1.8976096367400716e-05, "loss": 2.6073, "step": 2720 }, { "epoch": 0.51, "grad_norm": 8.552806854248047, "learning_rate": 1.897233201581028e-05, "loss": 2.568, "step": 2730 }, { "epoch": 0.52, "grad_norm": 8.12427043914795, "learning_rate": 1.896856766421984e-05, "loss": 2.4706, "step": 2740 }, { "epoch": 0.52, "grad_norm": 7.1217474937438965, "learning_rate": 1.89648033126294e-05, "loss": 2.4005, "step": 2750 }, { "epoch": 0.52, "grad_norm": 3.9127049446105957, "learning_rate": 1.896103896103896e-05, "loss": 2.517, "step": 2760 }, { "epoch": 0.52, "grad_norm": 15.194258689880371, "learning_rate": 1.8957274609448525e-05, "loss": 2.5601, "step": 2770 }, { "epoch": 0.52, "grad_norm": 12.40319538116455, "learning_rate": 1.8953510257858084e-05, "loss": 2.3259, "step": 2780 }, { "epoch": 0.53, "grad_norm": 10.481034278869629, "learning_rate": 1.8949745906267647e-05, "loss": 2.4099, "step": 2790 }, { "epoch": 0.53, "grad_norm": 4.578127861022949, "learning_rate": 1.8945981554677207e-05, "loss": 2.416, "step": 2800 }, { "epoch": 0.53, "grad_norm": 15.104098320007324, "learning_rate": 1.894221720308677e-05, "loss": 2.4252, "step": 2810 }, { "epoch": 0.53, "grad_norm": 7.515500545501709, "learning_rate": 1.893845285149633e-05, "loss": 2.4741, "step": 2820 }, { "epoch": 0.53, "grad_norm": 3.786322593688965, "learning_rate": 1.8934688499905893e-05, "loss": 2.5193, "step": 2830 }, { "epoch": 0.53, "grad_norm": 3.358415365219116, "learning_rate": 1.8930924148315456e-05, "loss": 2.603, "step": 2840 }, { "epoch": 0.54, "grad_norm": 7.374852180480957, "learning_rate": 1.8927159796725016e-05, "loss": 2.5356, "step": 2850 }, { "epoch": 0.54, "grad_norm": 8.016167640686035, "learning_rate": 1.892339544513458e-05, "loss": 2.4606, "step": 2860 }, { "epoch": 0.54, "grad_norm": 5.327517509460449, "learning_rate": 1.891963109354414e-05, "loss": 2.3619, "step": 2870 }, { "epoch": 0.54, "grad_norm": 9.286544799804688, "learning_rate": 1.89158667419537e-05, "loss": 2.4749, "step": 2880 }, { "epoch": 0.54, "grad_norm": 7.052702903747559, "learning_rate": 1.8912102390363262e-05, "loss": 2.5101, "step": 2890 }, { "epoch": 0.55, "grad_norm": 24.803468704223633, "learning_rate": 1.8908338038772822e-05, "loss": 2.4248, "step": 2900 }, { "epoch": 0.55, "grad_norm": 2.5042014122009277, "learning_rate": 1.8904573687182385e-05, "loss": 2.3821, "step": 2910 }, { "epoch": 0.55, "grad_norm": 6.960639476776123, "learning_rate": 1.8900809335591945e-05, "loss": 2.3184, "step": 2920 }, { "epoch": 0.55, "grad_norm": 8.575072288513184, "learning_rate": 1.8897044984001508e-05, "loss": 2.5079, "step": 2930 }, { "epoch": 0.55, "grad_norm": 9.817319869995117, "learning_rate": 1.8893280632411068e-05, "loss": 2.3782, "step": 2940 }, { "epoch": 0.56, "grad_norm": 11.542531967163086, "learning_rate": 1.888951628082063e-05, "loss": 2.3612, "step": 2950 }, { "epoch": 0.56, "grad_norm": 8.432719230651855, "learning_rate": 1.888575192923019e-05, "loss": 2.6094, "step": 2960 }, { "epoch": 0.56, "grad_norm": 7.157316207885742, "learning_rate": 1.8881987577639754e-05, "loss": 2.3173, "step": 2970 }, { "epoch": 0.56, "grad_norm": 7.01808500289917, "learning_rate": 1.8878223226049314e-05, "loss": 2.4611, "step": 2980 }, { "epoch": 0.56, "grad_norm": 7.0088324546813965, "learning_rate": 1.8874458874458877e-05, "loss": 2.3906, "step": 2990 }, { "epoch": 0.56, "grad_norm": 5.18332052230835, "learning_rate": 1.8870694522868436e-05, "loss": 2.699, "step": 3000 }, { "epoch": 0.57, "grad_norm": 11.170907974243164, "learning_rate": 1.8866930171278e-05, "loss": 2.1945, "step": 3010 }, { "epoch": 0.57, "grad_norm": 8.540265083312988, "learning_rate": 1.8863165819687563e-05, "loss": 2.3214, "step": 3020 }, { "epoch": 0.57, "grad_norm": 6.900252342224121, "learning_rate": 1.8859401468097123e-05, "loss": 2.4009, "step": 3030 }, { "epoch": 0.57, "grad_norm": 7.43867826461792, "learning_rate": 1.8855637116506686e-05, "loss": 2.5143, "step": 3040 }, { "epoch": 0.57, "grad_norm": 5.265262126922607, "learning_rate": 1.8851872764916242e-05, "loss": 2.3603, "step": 3050 }, { "epoch": 0.58, "grad_norm": 7.436113357543945, "learning_rate": 1.8848108413325805e-05, "loss": 2.1487, "step": 3060 }, { "epoch": 0.58, "grad_norm": 6.59679651260376, "learning_rate": 1.884434406173537e-05, "loss": 2.3253, "step": 3070 }, { "epoch": 0.58, "grad_norm": 5.955831527709961, "learning_rate": 1.8840579710144928e-05, "loss": 2.2582, "step": 3080 }, { "epoch": 0.58, "grad_norm": 8.666378021240234, "learning_rate": 1.883681535855449e-05, "loss": 2.2552, "step": 3090 }, { "epoch": 0.58, "grad_norm": 8.003193855285645, "learning_rate": 1.883305100696405e-05, "loss": 2.2303, "step": 3100 }, { "epoch": 0.59, "grad_norm": 6.098687648773193, "learning_rate": 1.8829286655373614e-05, "loss": 2.4797, "step": 3110 }, { "epoch": 0.59, "grad_norm": 4.299673557281494, "learning_rate": 1.8825522303783174e-05, "loss": 2.3256, "step": 3120 }, { "epoch": 0.59, "grad_norm": 5.408268451690674, "learning_rate": 1.8821757952192737e-05, "loss": 2.2441, "step": 3130 }, { "epoch": 0.59, "grad_norm": 8.5181303024292, "learning_rate": 1.8817993600602297e-05, "loss": 2.2284, "step": 3140 }, { "epoch": 0.59, "grad_norm": 9.531221389770508, "learning_rate": 1.881422924901186e-05, "loss": 2.3657, "step": 3150 }, { "epoch": 0.59, "grad_norm": 5.5880889892578125, "learning_rate": 1.881046489742142e-05, "loss": 2.3858, "step": 3160 }, { "epoch": 0.6, "grad_norm": 8.181473731994629, "learning_rate": 1.8806700545830983e-05, "loss": 2.3284, "step": 3170 }, { "epoch": 0.6, "grad_norm": 11.959000587463379, "learning_rate": 1.8802936194240543e-05, "loss": 2.265, "step": 3180 }, { "epoch": 0.6, "grad_norm": 4.405294418334961, "learning_rate": 1.8799171842650106e-05, "loss": 2.4588, "step": 3190 }, { "epoch": 0.6, "grad_norm": 5.867427825927734, "learning_rate": 1.8795407491059666e-05, "loss": 2.2892, "step": 3200 }, { "epoch": 0.6, "grad_norm": 4.539100646972656, "learning_rate": 1.879164313946923e-05, "loss": 2.033, "step": 3210 }, { "epoch": 0.61, "grad_norm": 9.679889678955078, "learning_rate": 1.8787878787878792e-05, "loss": 2.344, "step": 3220 }, { "epoch": 0.61, "grad_norm": 9.749494552612305, "learning_rate": 1.878411443628835e-05, "loss": 2.1338, "step": 3230 }, { "epoch": 0.61, "grad_norm": 11.74128246307373, "learning_rate": 1.878035008469791e-05, "loss": 2.4541, "step": 3240 }, { "epoch": 0.61, "grad_norm": 18.931474685668945, "learning_rate": 1.8776585733107475e-05, "loss": 2.2487, "step": 3250 }, { "epoch": 0.61, "grad_norm": 3.824002265930176, "learning_rate": 1.8772821381517034e-05, "loss": 2.407, "step": 3260 }, { "epoch": 0.62, "grad_norm": 8.731551170349121, "learning_rate": 1.8769057029926598e-05, "loss": 2.2347, "step": 3270 }, { "epoch": 0.62, "grad_norm": 4.22593879699707, "learning_rate": 1.8765292678336157e-05, "loss": 2.0125, "step": 3280 }, { "epoch": 0.62, "grad_norm": 5.457214832305908, "learning_rate": 1.876152832674572e-05, "loss": 2.1522, "step": 3290 }, { "epoch": 0.62, "grad_norm": 9.144113540649414, "learning_rate": 1.875776397515528e-05, "loss": 2.4606, "step": 3300 }, { "epoch": 0.62, "grad_norm": 3.9879465103149414, "learning_rate": 1.8753999623564843e-05, "loss": 2.0853, "step": 3310 }, { "epoch": 0.62, "grad_norm": 8.082098960876465, "learning_rate": 1.8750235271974403e-05, "loss": 2.1247, "step": 3320 }, { "epoch": 0.63, "grad_norm": 13.393769264221191, "learning_rate": 1.8746470920383966e-05, "loss": 2.1507, "step": 3330 }, { "epoch": 0.63, "grad_norm": 3.922086477279663, "learning_rate": 1.8742706568793526e-05, "loss": 2.2208, "step": 3340 }, { "epoch": 0.63, "grad_norm": 7.737308025360107, "learning_rate": 1.873894221720309e-05, "loss": 2.2848, "step": 3350 }, { "epoch": 0.63, "grad_norm": 18.515275955200195, "learning_rate": 1.873517786561265e-05, "loss": 2.0712, "step": 3360 }, { "epoch": 0.63, "grad_norm": 9.918493270874023, "learning_rate": 1.8731413514022212e-05, "loss": 2.1302, "step": 3370 }, { "epoch": 0.64, "grad_norm": 3.196310043334961, "learning_rate": 1.8727649162431772e-05, "loss": 2.1687, "step": 3380 }, { "epoch": 0.64, "grad_norm": 5.755160808563232, "learning_rate": 1.8723884810841335e-05, "loss": 2.3199, "step": 3390 }, { "epoch": 0.64, "grad_norm": 6.229613780975342, "learning_rate": 1.8720120459250895e-05, "loss": 2.0273, "step": 3400 }, { "epoch": 0.64, "grad_norm": 4.063007831573486, "learning_rate": 1.8716356107660455e-05, "loss": 2.2429, "step": 3410 }, { "epoch": 0.64, "grad_norm": 4.2730302810668945, "learning_rate": 1.8712591756070018e-05, "loss": 2.2487, "step": 3420 }, { "epoch": 0.65, "grad_norm": 5.997701644897461, "learning_rate": 1.8708827404479578e-05, "loss": 2.288, "step": 3430 }, { "epoch": 0.65, "grad_norm": 13.065136909484863, "learning_rate": 1.870506305288914e-05, "loss": 2.2238, "step": 3440 }, { "epoch": 0.65, "grad_norm": 7.368946075439453, "learning_rate": 1.8701298701298704e-05, "loss": 2.2514, "step": 3450 }, { "epoch": 0.65, "grad_norm": 5.36475944519043, "learning_rate": 1.8697534349708264e-05, "loss": 2.1219, "step": 3460 }, { "epoch": 0.65, "grad_norm": 6.188838958740234, "learning_rate": 1.8693769998117827e-05, "loss": 2.2102, "step": 3470 }, { "epoch": 0.65, "grad_norm": 12.989961624145508, "learning_rate": 1.8690005646527387e-05, "loss": 2.2069, "step": 3480 }, { "epoch": 0.66, "grad_norm": 6.382188320159912, "learning_rate": 1.868624129493695e-05, "loss": 2.0466, "step": 3490 }, { "epoch": 0.66, "grad_norm": 6.724734306335449, "learning_rate": 1.868247694334651e-05, "loss": 2.0416, "step": 3500 }, { "epoch": 0.66, "grad_norm": 10.628301620483398, "learning_rate": 1.8678712591756073e-05, "loss": 2.3185, "step": 3510 }, { "epoch": 0.66, "grad_norm": 3.1391384601593018, "learning_rate": 1.8674948240165632e-05, "loss": 1.9034, "step": 3520 }, { "epoch": 0.66, "grad_norm": 9.9121732711792, "learning_rate": 1.8671183888575196e-05, "loss": 2.1419, "step": 3530 }, { "epoch": 0.67, "grad_norm": 11.588180541992188, "learning_rate": 1.8667419536984755e-05, "loss": 2.1772, "step": 3540 }, { "epoch": 0.67, "grad_norm": 8.538999557495117, "learning_rate": 1.866365518539432e-05, "loss": 2.1558, "step": 3550 }, { "epoch": 0.67, "grad_norm": 6.513119697570801, "learning_rate": 1.8659890833803878e-05, "loss": 2.2185, "step": 3560 }, { "epoch": 0.67, "grad_norm": 9.248276710510254, "learning_rate": 1.865612648221344e-05, "loss": 2.0644, "step": 3570 }, { "epoch": 0.67, "grad_norm": 8.285011291503906, "learning_rate": 1.8652362130623e-05, "loss": 2.1054, "step": 3580 }, { "epoch": 0.68, "grad_norm": 6.439683437347412, "learning_rate": 1.864859777903256e-05, "loss": 2.123, "step": 3590 }, { "epoch": 0.68, "grad_norm": 8.896665573120117, "learning_rate": 1.8644833427442124e-05, "loss": 2.1271, "step": 3600 }, { "epoch": 0.68, "grad_norm": 10.368461608886719, "learning_rate": 1.8641069075851684e-05, "loss": 1.907, "step": 3610 }, { "epoch": 0.68, "grad_norm": 3.7022368907928467, "learning_rate": 1.8637304724261247e-05, "loss": 1.9676, "step": 3620 }, { "epoch": 0.68, "grad_norm": 11.112764358520508, "learning_rate": 1.863354037267081e-05, "loss": 1.9633, "step": 3630 }, { "epoch": 0.69, "grad_norm": 10.671046257019043, "learning_rate": 1.862977602108037e-05, "loss": 2.1857, "step": 3640 }, { "epoch": 0.69, "grad_norm": 4.5004682540893555, "learning_rate": 1.8626011669489933e-05, "loss": 1.9923, "step": 3650 }, { "epoch": 0.69, "grad_norm": 27.06603240966797, "learning_rate": 1.8622247317899493e-05, "loss": 2.2683, "step": 3660 }, { "epoch": 0.69, "grad_norm": 10.998562812805176, "learning_rate": 1.8618482966309056e-05, "loss": 2.1817, "step": 3670 }, { "epoch": 0.69, "grad_norm": 6.846683979034424, "learning_rate": 1.8614718614718616e-05, "loss": 2.0747, "step": 3680 }, { "epoch": 0.69, "grad_norm": 8.303011894226074, "learning_rate": 1.861095426312818e-05, "loss": 2.0505, "step": 3690 }, { "epoch": 0.7, "grad_norm": 14.131027221679688, "learning_rate": 1.860718991153774e-05, "loss": 2.143, "step": 3700 }, { "epoch": 0.7, "grad_norm": 6.333893775939941, "learning_rate": 1.8603425559947302e-05, "loss": 2.0173, "step": 3710 }, { "epoch": 0.7, "grad_norm": 13.341752052307129, "learning_rate": 1.859966120835686e-05, "loss": 2.0513, "step": 3720 }, { "epoch": 0.7, "grad_norm": 8.092610359191895, "learning_rate": 1.8595896856766425e-05, "loss": 1.966, "step": 3730 }, { "epoch": 0.7, "grad_norm": 6.7696356773376465, "learning_rate": 1.8592132505175985e-05, "loss": 2.0219, "step": 3740 }, { "epoch": 0.71, "grad_norm": 5.491209030151367, "learning_rate": 1.8588368153585544e-05, "loss": 2.062, "step": 3750 }, { "epoch": 0.71, "grad_norm": 8.02304458618164, "learning_rate": 1.8584603801995107e-05, "loss": 2.1111, "step": 3760 }, { "epoch": 0.71, "grad_norm": 8.15708065032959, "learning_rate": 1.8580839450404667e-05, "loss": 1.8681, "step": 3770 }, { "epoch": 0.71, "grad_norm": 14.145309448242188, "learning_rate": 1.857707509881423e-05, "loss": 1.9796, "step": 3780 }, { "epoch": 0.71, "grad_norm": 3.182771682739258, "learning_rate": 1.857331074722379e-05, "loss": 2.2717, "step": 3790 }, { "epoch": 0.72, "grad_norm": 6.4951090812683105, "learning_rate": 1.8569546395633353e-05, "loss": 2.2107, "step": 3800 }, { "epoch": 0.72, "grad_norm": 15.642072677612305, "learning_rate": 1.8565782044042916e-05, "loss": 2.1561, "step": 3810 }, { "epoch": 0.72, "grad_norm": 3.078226327896118, "learning_rate": 1.8562017692452476e-05, "loss": 1.9989, "step": 3820 }, { "epoch": 0.72, "grad_norm": 4.49596643447876, "learning_rate": 1.855825334086204e-05, "loss": 1.9954, "step": 3830 }, { "epoch": 0.72, "grad_norm": 11.259866714477539, "learning_rate": 1.85544889892716e-05, "loss": 1.8364, "step": 3840 }, { "epoch": 0.72, "grad_norm": 9.790380477905273, "learning_rate": 1.8550724637681162e-05, "loss": 2.0872, "step": 3850 }, { "epoch": 0.73, "grad_norm": 10.723867416381836, "learning_rate": 1.8546960286090722e-05, "loss": 1.9063, "step": 3860 }, { "epoch": 0.73, "grad_norm": 5.424127101898193, "learning_rate": 1.8543195934500285e-05, "loss": 1.9285, "step": 3870 }, { "epoch": 0.73, "grad_norm": 6.509077548980713, "learning_rate": 1.8539431582909845e-05, "loss": 1.9326, "step": 3880 }, { "epoch": 0.73, "grad_norm": 9.685775756835938, "learning_rate": 1.8535667231319408e-05, "loss": 2.2902, "step": 3890 }, { "epoch": 0.73, "grad_norm": 11.874910354614258, "learning_rate": 1.8531902879728968e-05, "loss": 1.8581, "step": 3900 }, { "epoch": 0.74, "grad_norm": 6.72626256942749, "learning_rate": 1.852813852813853e-05, "loss": 2.0148, "step": 3910 }, { "epoch": 0.74, "grad_norm": 21.550472259521484, "learning_rate": 1.852437417654809e-05, "loss": 1.9228, "step": 3920 }, { "epoch": 0.74, "grad_norm": 10.180908203125, "learning_rate": 1.852060982495765e-05, "loss": 1.94, "step": 3930 }, { "epoch": 0.74, "grad_norm": 2.4823896884918213, "learning_rate": 1.8516845473367214e-05, "loss": 1.8551, "step": 3940 }, { "epoch": 0.74, "grad_norm": 37.56202697753906, "learning_rate": 1.8513081121776774e-05, "loss": 2.1671, "step": 3950 }, { "epoch": 0.75, "grad_norm": 13.235817909240723, "learning_rate": 1.8509316770186337e-05, "loss": 1.9974, "step": 3960 }, { "epoch": 0.75, "grad_norm": 13.139102935791016, "learning_rate": 1.8505552418595896e-05, "loss": 2.0052, "step": 3970 }, { "epoch": 0.75, "grad_norm": 4.566539287567139, "learning_rate": 1.850178806700546e-05, "loss": 1.8644, "step": 3980 }, { "epoch": 0.75, "grad_norm": 12.045722007751465, "learning_rate": 1.849802371541502e-05, "loss": 1.9597, "step": 3990 }, { "epoch": 0.75, "grad_norm": 9.117792129516602, "learning_rate": 1.8494259363824583e-05, "loss": 1.9613, "step": 4000 }, { "epoch": 0.75, "grad_norm": 30.70831298828125, "learning_rate": 1.8490495012234146e-05, "loss": 1.9947, "step": 4010 }, { "epoch": 0.76, "grad_norm": 8.068875312805176, "learning_rate": 1.8486730660643705e-05, "loss": 2.0513, "step": 4020 }, { "epoch": 0.76, "grad_norm": 6.033820629119873, "learning_rate": 1.848296630905327e-05, "loss": 2.1018, "step": 4030 }, { "epoch": 0.76, "grad_norm": 7.325448036193848, "learning_rate": 1.847920195746283e-05, "loss": 2.0123, "step": 4040 }, { "epoch": 0.76, "grad_norm": 4.722583770751953, "learning_rate": 1.847543760587239e-05, "loss": 2.086, "step": 4050 }, { "epoch": 0.76, "grad_norm": 5.869627952575684, "learning_rate": 1.847167325428195e-05, "loss": 1.8988, "step": 4060 }, { "epoch": 0.77, "grad_norm": 5.616646766662598, "learning_rate": 1.8467908902691514e-05, "loss": 1.9823, "step": 4070 }, { "epoch": 0.77, "grad_norm": 7.294058799743652, "learning_rate": 1.8464144551101074e-05, "loss": 1.97, "step": 4080 }, { "epoch": 0.77, "grad_norm": 3.1566827297210693, "learning_rate": 1.8460380199510637e-05, "loss": 1.9226, "step": 4090 }, { "epoch": 0.77, "grad_norm": 8.810595512390137, "learning_rate": 1.8456615847920197e-05, "loss": 2.0752, "step": 4100 }, { "epoch": 0.77, "grad_norm": 8.686856269836426, "learning_rate": 1.8452851496329757e-05, "loss": 2.0722, "step": 4110 }, { "epoch": 0.78, "grad_norm": 8.495172500610352, "learning_rate": 1.844908714473932e-05, "loss": 1.7657, "step": 4120 }, { "epoch": 0.78, "grad_norm": 7.293805122375488, "learning_rate": 1.844532279314888e-05, "loss": 1.6995, "step": 4130 }, { "epoch": 0.78, "grad_norm": 8.600557327270508, "learning_rate": 1.8441558441558443e-05, "loss": 1.7201, "step": 4140 }, { "epoch": 0.78, "grad_norm": 9.685958862304688, "learning_rate": 1.8437794089968003e-05, "loss": 1.7472, "step": 4150 }, { "epoch": 0.78, "grad_norm": 20.721675872802734, "learning_rate": 1.8434029738377566e-05, "loss": 1.8939, "step": 4160 }, { "epoch": 0.78, "grad_norm": 12.649245262145996, "learning_rate": 1.8430265386787126e-05, "loss": 1.8974, "step": 4170 }, { "epoch": 0.79, "grad_norm": 17.670217514038086, "learning_rate": 1.842650103519669e-05, "loss": 1.7713, "step": 4180 }, { "epoch": 0.79, "grad_norm": 13.489444732666016, "learning_rate": 1.8422736683606252e-05, "loss": 2.0021, "step": 4190 }, { "epoch": 0.79, "grad_norm": 15.630731582641602, "learning_rate": 1.8418972332015812e-05, "loss": 1.9024, "step": 4200 }, { "epoch": 0.79, "grad_norm": 11.856411933898926, "learning_rate": 1.8415207980425375e-05, "loss": 2.1115, "step": 4210 }, { "epoch": 0.79, "grad_norm": 8.349586486816406, "learning_rate": 1.8411443628834935e-05, "loss": 1.7986, "step": 4220 }, { "epoch": 0.8, "grad_norm": 10.628458023071289, "learning_rate": 1.8407679277244498e-05, "loss": 1.7519, "step": 4230 }, { "epoch": 0.8, "grad_norm": 5.205367565155029, "learning_rate": 1.8403914925654058e-05, "loss": 1.8736, "step": 4240 }, { "epoch": 0.8, "grad_norm": 11.882181167602539, "learning_rate": 1.840015057406362e-05, "loss": 1.9139, "step": 4250 }, { "epoch": 0.8, "grad_norm": 8.525898933410645, "learning_rate": 1.839638622247318e-05, "loss": 2.0817, "step": 4260 }, { "epoch": 0.8, "grad_norm": 11.283459663391113, "learning_rate": 1.839262187088274e-05, "loss": 1.8557, "step": 4270 }, { "epoch": 0.81, "grad_norm": 12.506168365478516, "learning_rate": 1.8388857519292303e-05, "loss": 1.9177, "step": 4280 }, { "epoch": 0.81, "grad_norm": 9.862067222595215, "learning_rate": 1.8385093167701863e-05, "loss": 2.0223, "step": 4290 }, { "epoch": 0.81, "grad_norm": 6.25103759765625, "learning_rate": 1.8381328816111426e-05, "loss": 2.045, "step": 4300 }, { "epoch": 0.81, "grad_norm": 7.032151699066162, "learning_rate": 1.8377564464520986e-05, "loss": 1.8646, "step": 4310 }, { "epoch": 0.81, "grad_norm": 9.486144065856934, "learning_rate": 1.837380011293055e-05, "loss": 1.6828, "step": 4320 }, { "epoch": 0.81, "grad_norm": 10.718717575073242, "learning_rate": 1.837003576134011e-05, "loss": 2.1125, "step": 4330 }, { "epoch": 0.82, "grad_norm": 12.391996383666992, "learning_rate": 1.8366271409749672e-05, "loss": 1.7862, "step": 4340 }, { "epoch": 0.82, "grad_norm": 9.340251922607422, "learning_rate": 1.8362507058159232e-05, "loss": 1.8666, "step": 4350 }, { "epoch": 0.82, "grad_norm": 13.326081275939941, "learning_rate": 1.8358742706568795e-05, "loss": 1.7263, "step": 4360 }, { "epoch": 0.82, "grad_norm": 7.439601898193359, "learning_rate": 1.8354978354978358e-05, "loss": 1.9045, "step": 4370 }, { "epoch": 0.82, "grad_norm": 7.711633682250977, "learning_rate": 1.8351214003387918e-05, "loss": 1.6444, "step": 4380 }, { "epoch": 0.83, "grad_norm": 27.964046478271484, "learning_rate": 1.834744965179748e-05, "loss": 1.9252, "step": 4390 }, { "epoch": 0.83, "grad_norm": 23.466896057128906, "learning_rate": 1.834368530020704e-05, "loss": 1.9257, "step": 4400 }, { "epoch": 0.83, "grad_norm": 8.017694473266602, "learning_rate": 1.8339920948616604e-05, "loss": 1.8272, "step": 4410 }, { "epoch": 0.83, "grad_norm": 9.13419246673584, "learning_rate": 1.8336156597026164e-05, "loss": 1.8617, "step": 4420 }, { "epoch": 0.83, "grad_norm": 4.761314392089844, "learning_rate": 1.8332392245435727e-05, "loss": 1.6775, "step": 4430 }, { "epoch": 0.84, "grad_norm": 19.422325134277344, "learning_rate": 1.8328627893845287e-05, "loss": 2.015, "step": 4440 }, { "epoch": 0.84, "grad_norm": 7.556143760681152, "learning_rate": 1.8324863542254847e-05, "loss": 1.7302, "step": 4450 }, { "epoch": 0.84, "grad_norm": 13.221558570861816, "learning_rate": 1.832109919066441e-05, "loss": 1.6569, "step": 4460 }, { "epoch": 0.84, "grad_norm": 8.56251335144043, "learning_rate": 1.831733483907397e-05, "loss": 1.7976, "step": 4470 }, { "epoch": 0.84, "grad_norm": 15.487319946289062, "learning_rate": 1.8313570487483533e-05, "loss": 2.0049, "step": 4480 }, { "epoch": 0.85, "grad_norm": 12.269038200378418, "learning_rate": 1.8309806135893092e-05, "loss": 1.6322, "step": 4490 }, { "epoch": 0.85, "grad_norm": 10.545343399047852, "learning_rate": 1.8306041784302656e-05, "loss": 1.7523, "step": 4500 }, { "epoch": 0.85, "grad_norm": 11.680045127868652, "learning_rate": 1.8302277432712215e-05, "loss": 1.7489, "step": 4510 }, { "epoch": 0.85, "grad_norm": 13.965753555297852, "learning_rate": 1.829851308112178e-05, "loss": 1.7063, "step": 4520 }, { "epoch": 0.85, "grad_norm": 14.397643089294434, "learning_rate": 1.8294748729531338e-05, "loss": 1.8438, "step": 4530 }, { "epoch": 0.85, "grad_norm": 15.942564010620117, "learning_rate": 1.82909843779409e-05, "loss": 1.8897, "step": 4540 }, { "epoch": 0.86, "grad_norm": 4.38701868057251, "learning_rate": 1.8287220026350465e-05, "loss": 1.8604, "step": 4550 }, { "epoch": 0.86, "grad_norm": 2.3334455490112305, "learning_rate": 1.8283455674760024e-05, "loss": 1.6651, "step": 4560 }, { "epoch": 0.86, "grad_norm": 12.184940338134766, "learning_rate": 1.8279691323169587e-05, "loss": 2.0425, "step": 4570 }, { "epoch": 0.86, "grad_norm": 29.6932430267334, "learning_rate": 1.8275926971579147e-05, "loss": 1.7136, "step": 4580 }, { "epoch": 0.86, "grad_norm": 9.31380844116211, "learning_rate": 1.827216261998871e-05, "loss": 1.6088, "step": 4590 }, { "epoch": 0.87, "grad_norm": 6.644775390625, "learning_rate": 1.826839826839827e-05, "loss": 1.7825, "step": 4600 }, { "epoch": 0.87, "grad_norm": 14.836319923400879, "learning_rate": 1.8264633916807833e-05, "loss": 1.7212, "step": 4610 }, { "epoch": 0.87, "grad_norm": 10.639488220214844, "learning_rate": 1.8260869565217393e-05, "loss": 1.6093, "step": 4620 }, { "epoch": 0.87, "grad_norm": 10.97711181640625, "learning_rate": 1.8257105213626953e-05, "loss": 1.6979, "step": 4630 }, { "epoch": 0.87, "grad_norm": 18.360660552978516, "learning_rate": 1.8253340862036516e-05, "loss": 1.8738, "step": 4640 }, { "epoch": 0.88, "grad_norm": 9.36877727508545, "learning_rate": 1.8249576510446076e-05, "loss": 1.6484, "step": 4650 }, { "epoch": 0.88, "grad_norm": 13.112272262573242, "learning_rate": 1.824581215885564e-05, "loss": 1.6616, "step": 4660 }, { "epoch": 0.88, "grad_norm": 6.932171821594238, "learning_rate": 1.82420478072652e-05, "loss": 1.6586, "step": 4670 }, { "epoch": 0.88, "grad_norm": 8.852755546569824, "learning_rate": 1.8238283455674762e-05, "loss": 1.8107, "step": 4680 }, { "epoch": 0.88, "grad_norm": 7.760582447052002, "learning_rate": 1.823451910408432e-05, "loss": 1.7808, "step": 4690 }, { "epoch": 0.88, "grad_norm": 8.635812759399414, "learning_rate": 1.8230754752493885e-05, "loss": 1.6713, "step": 4700 }, { "epoch": 0.89, "grad_norm": 16.030855178833008, "learning_rate": 1.8226990400903445e-05, "loss": 1.6924, "step": 4710 }, { "epoch": 0.89, "grad_norm": 16.42034149169922, "learning_rate": 1.8223226049313008e-05, "loss": 1.8233, "step": 4720 }, { "epoch": 0.89, "grad_norm": 7.589072227478027, "learning_rate": 1.8219461697722567e-05, "loss": 1.4428, "step": 4730 }, { "epoch": 0.89, "grad_norm": 14.824429512023926, "learning_rate": 1.821569734613213e-05, "loss": 1.7412, "step": 4740 }, { "epoch": 0.89, "grad_norm": 2.6960065364837646, "learning_rate": 1.8211932994541694e-05, "loss": 1.7846, "step": 4750 }, { "epoch": 0.9, "grad_norm": 9.252869606018066, "learning_rate": 1.8208168642951254e-05, "loss": 1.6731, "step": 4760 }, { "epoch": 0.9, "grad_norm": 12.624210357666016, "learning_rate": 1.8204404291360817e-05, "loss": 1.7592, "step": 4770 }, { "epoch": 0.9, "grad_norm": 5.478265762329102, "learning_rate": 1.8200639939770376e-05, "loss": 1.6657, "step": 4780 }, { "epoch": 0.9, "grad_norm": 11.12108039855957, "learning_rate": 1.819687558817994e-05, "loss": 1.6616, "step": 4790 }, { "epoch": 0.9, "grad_norm": 5.339282035827637, "learning_rate": 1.81931112365895e-05, "loss": 1.5382, "step": 4800 }, { "epoch": 0.91, "grad_norm": 30.758148193359375, "learning_rate": 1.818934688499906e-05, "loss": 1.7256, "step": 4810 }, { "epoch": 0.91, "grad_norm": 3.9476451873779297, "learning_rate": 1.8185582533408622e-05, "loss": 1.5978, "step": 4820 }, { "epoch": 0.91, "grad_norm": 2.4273171424865723, "learning_rate": 1.8181818181818182e-05, "loss": 1.4934, "step": 4830 }, { "epoch": 0.91, "grad_norm": 12.683277130126953, "learning_rate": 1.8178053830227745e-05, "loss": 1.8227, "step": 4840 }, { "epoch": 0.91, "grad_norm": 7.911778450012207, "learning_rate": 1.8174289478637305e-05, "loss": 1.5653, "step": 4850 }, { "epoch": 0.91, "grad_norm": 30.177553176879883, "learning_rate": 1.8170525127046868e-05, "loss": 1.6419, "step": 4860 }, { "epoch": 0.92, "grad_norm": 22.67075538635254, "learning_rate": 1.8166760775456428e-05, "loss": 1.8511, "step": 4870 }, { "epoch": 0.92, "grad_norm": 16.48545265197754, "learning_rate": 1.816299642386599e-05, "loss": 1.2912, "step": 4880 }, { "epoch": 0.92, "grad_norm": 13.775650978088379, "learning_rate": 1.815923207227555e-05, "loss": 1.7698, "step": 4890 }, { "epoch": 0.92, "grad_norm": 6.288990020751953, "learning_rate": 1.8155467720685114e-05, "loss": 1.5706, "step": 4900 }, { "epoch": 0.92, "grad_norm": 17.974271774291992, "learning_rate": 1.8151703369094674e-05, "loss": 1.6064, "step": 4910 }, { "epoch": 0.93, "grad_norm": 28.941722869873047, "learning_rate": 1.8147939017504237e-05, "loss": 1.6713, "step": 4920 }, { "epoch": 0.93, "grad_norm": 11.917637825012207, "learning_rate": 1.81441746659138e-05, "loss": 1.6204, "step": 4930 }, { "epoch": 0.93, "grad_norm": 24.066226959228516, "learning_rate": 1.814041031432336e-05, "loss": 1.7583, "step": 4940 }, { "epoch": 0.93, "grad_norm": 9.762685775756836, "learning_rate": 1.8136645962732923e-05, "loss": 1.4719, "step": 4950 }, { "epoch": 0.93, "grad_norm": 3.6561665534973145, "learning_rate": 1.8132881611142483e-05, "loss": 1.5726, "step": 4960 }, { "epoch": 0.94, "grad_norm": 8.348560333251953, "learning_rate": 1.8129117259552043e-05, "loss": 1.6183, "step": 4970 }, { "epoch": 0.94, "grad_norm": 13.388601303100586, "learning_rate": 1.8125352907961606e-05, "loss": 1.614, "step": 4980 }, { "epoch": 0.94, "grad_norm": 5.767553806304932, "learning_rate": 1.8121588556371165e-05, "loss": 1.5365, "step": 4990 }, { "epoch": 0.94, "grad_norm": 8.167011260986328, "learning_rate": 1.811782420478073e-05, "loss": 1.6598, "step": 5000 }, { "epoch": 0.94, "grad_norm": 33.257511138916016, "learning_rate": 1.811405985319029e-05, "loss": 1.9184, "step": 5010 }, { "epoch": 0.94, "grad_norm": 21.02366065979004, "learning_rate": 1.811029550159985e-05, "loss": 1.7108, "step": 5020 }, { "epoch": 0.95, "grad_norm": 9.491157531738281, "learning_rate": 1.810653115000941e-05, "loss": 1.5522, "step": 5030 }, { "epoch": 0.95, "grad_norm": 13.046422004699707, "learning_rate": 1.8102766798418974e-05, "loss": 1.7517, "step": 5040 }, { "epoch": 0.95, "grad_norm": 11.477202415466309, "learning_rate": 1.8099002446828534e-05, "loss": 1.6329, "step": 5050 }, { "epoch": 0.95, "grad_norm": 12.039795875549316, "learning_rate": 1.8095238095238097e-05, "loss": 1.7396, "step": 5060 }, { "epoch": 0.95, "grad_norm": 10.725845336914062, "learning_rate": 1.8091473743647657e-05, "loss": 1.5248, "step": 5070 }, { "epoch": 0.96, "grad_norm": 9.213160514831543, "learning_rate": 1.808770939205722e-05, "loss": 1.5598, "step": 5080 }, { "epoch": 0.96, "grad_norm": 5.273130893707275, "learning_rate": 1.808394504046678e-05, "loss": 1.5013, "step": 5090 }, { "epoch": 0.96, "grad_norm": 4.758931636810303, "learning_rate": 1.8080180688876343e-05, "loss": 1.6013, "step": 5100 }, { "epoch": 0.96, "grad_norm": 12.065877914428711, "learning_rate": 1.8076416337285906e-05, "loss": 1.5301, "step": 5110 }, { "epoch": 0.96, "grad_norm": 6.5146260261535645, "learning_rate": 1.8072651985695466e-05, "loss": 1.5861, "step": 5120 }, { "epoch": 0.97, "grad_norm": 8.56678295135498, "learning_rate": 1.806888763410503e-05, "loss": 1.5615, "step": 5130 }, { "epoch": 0.97, "grad_norm": 12.599604606628418, "learning_rate": 1.8065123282514586e-05, "loss": 1.5666, "step": 5140 }, { "epoch": 0.97, "grad_norm": 5.8899827003479, "learning_rate": 1.806135893092415e-05, "loss": 1.544, "step": 5150 }, { "epoch": 0.97, "grad_norm": 12.435726165771484, "learning_rate": 1.8057594579333712e-05, "loss": 1.4651, "step": 5160 }, { "epoch": 0.97, "grad_norm": 17.321407318115234, "learning_rate": 1.8053830227743272e-05, "loss": 1.7957, "step": 5170 }, { "epoch": 0.97, "grad_norm": 9.785209655761719, "learning_rate": 1.8050065876152835e-05, "loss": 1.5213, "step": 5180 }, { "epoch": 0.98, "grad_norm": 5.476792812347412, "learning_rate": 1.8046301524562395e-05, "loss": 1.4325, "step": 5190 }, { "epoch": 0.98, "grad_norm": 8.630148887634277, "learning_rate": 1.8042537172971958e-05, "loss": 1.4048, "step": 5200 }, { "epoch": 0.98, "grad_norm": 18.134206771850586, "learning_rate": 1.8038772821381518e-05, "loss": 1.6637, "step": 5210 }, { "epoch": 0.98, "grad_norm": 41.112239837646484, "learning_rate": 1.803500846979108e-05, "loss": 1.4036, "step": 5220 }, { "epoch": 0.98, "grad_norm": 3.7763662338256836, "learning_rate": 1.803124411820064e-05, "loss": 1.7532, "step": 5230 }, { "epoch": 0.99, "grad_norm": 24.516996383666992, "learning_rate": 1.8027479766610204e-05, "loss": 1.4504, "step": 5240 }, { "epoch": 0.99, "grad_norm": 18.604209899902344, "learning_rate": 1.8023715415019763e-05, "loss": 1.509, "step": 5250 }, { "epoch": 0.99, "grad_norm": 6.184398651123047, "learning_rate": 1.8019951063429327e-05, "loss": 1.4567, "step": 5260 }, { "epoch": 0.99, "grad_norm": 7.315374374389648, "learning_rate": 1.8016186711838886e-05, "loss": 1.5755, "step": 5270 }, { "epoch": 0.99, "grad_norm": 8.645417213439941, "learning_rate": 1.801242236024845e-05, "loss": 1.6434, "step": 5280 }, { "epoch": 1.0, "grad_norm": 12.885428428649902, "learning_rate": 1.800865800865801e-05, "loss": 1.4227, "step": 5290 }, { "epoch": 1.0, "grad_norm": 3.746971607208252, "learning_rate": 1.8004893657067572e-05, "loss": 1.2959, "step": 5300 }, { "epoch": 1.0, "grad_norm": 5.56287145614624, "learning_rate": 1.8001129305477136e-05, "loss": 1.3866, "step": 5310 }, { "epoch": 1.0, "eval_accuracy": 0.8746666666666667, "eval_loss": 1.0967950820922852, "eval_runtime": 33.6773, "eval_samples_per_second": 222.702, "eval_steps_per_second": 27.853, "step": 5313 }, { "epoch": 1.0, "grad_norm": 5.499802112579346, "learning_rate": 1.7997364953886692e-05, "loss": 1.676, "step": 5320 }, { "epoch": 1.0, "grad_norm": 16.520263671875, "learning_rate": 1.7993600602296255e-05, "loss": 1.5193, "step": 5330 }, { "epoch": 1.01, "grad_norm": 10.004297256469727, "learning_rate": 1.7989836250705818e-05, "loss": 1.5083, "step": 5340 }, { "epoch": 1.01, "grad_norm": 25.948406219482422, "learning_rate": 1.7986071899115378e-05, "loss": 1.2522, "step": 5350 }, { "epoch": 1.01, "grad_norm": 38.36083221435547, "learning_rate": 1.798230754752494e-05, "loss": 1.3645, "step": 5360 }, { "epoch": 1.01, "grad_norm": 7.533431529998779, "learning_rate": 1.79785431959345e-05, "loss": 1.3528, "step": 5370 }, { "epoch": 1.01, "grad_norm": 6.777375221252441, "learning_rate": 1.7974778844344064e-05, "loss": 1.5204, "step": 5380 }, { "epoch": 1.01, "grad_norm": 8.859736442565918, "learning_rate": 1.7971014492753624e-05, "loss": 1.3621, "step": 5390 }, { "epoch": 1.02, "grad_norm": 14.00033187866211, "learning_rate": 1.7967250141163187e-05, "loss": 1.6082, "step": 5400 }, { "epoch": 1.02, "grad_norm": 33.97581481933594, "learning_rate": 1.7963485789572747e-05, "loss": 1.6728, "step": 5410 }, { "epoch": 1.02, "grad_norm": 15.276714324951172, "learning_rate": 1.795972143798231e-05, "loss": 1.3271, "step": 5420 }, { "epoch": 1.02, "grad_norm": 9.051756858825684, "learning_rate": 1.795595708639187e-05, "loss": 1.6152, "step": 5430 }, { "epoch": 1.02, "grad_norm": 14.027779579162598, "learning_rate": 1.7952192734801433e-05, "loss": 1.3804, "step": 5440 }, { "epoch": 1.03, "grad_norm": 20.677108764648438, "learning_rate": 1.7948428383210993e-05, "loss": 1.4668, "step": 5450 }, { "epoch": 1.03, "grad_norm": 7.414674758911133, "learning_rate": 1.7944664031620556e-05, "loss": 1.3469, "step": 5460 }, { "epoch": 1.03, "grad_norm": 8.591064453125, "learning_rate": 1.7940899680030116e-05, "loss": 1.0537, "step": 5470 }, { "epoch": 1.03, "grad_norm": 14.56933307647705, "learning_rate": 1.793713532843968e-05, "loss": 1.2944, "step": 5480 }, { "epoch": 1.03, "grad_norm": 9.722732543945312, "learning_rate": 1.793337097684924e-05, "loss": 1.7081, "step": 5490 }, { "epoch": 1.04, "grad_norm": 22.593751907348633, "learning_rate": 1.7929606625258798e-05, "loss": 1.4611, "step": 5500 }, { "epoch": 1.04, "grad_norm": 4.620069980621338, "learning_rate": 1.792584227366836e-05, "loss": 1.3241, "step": 5510 }, { "epoch": 1.04, "grad_norm": 18.490928649902344, "learning_rate": 1.792207792207792e-05, "loss": 1.2878, "step": 5520 }, { "epoch": 1.04, "grad_norm": 4.842093467712402, "learning_rate": 1.7918313570487484e-05, "loss": 1.3626, "step": 5530 }, { "epoch": 1.04, "grad_norm": 2.8665945529937744, "learning_rate": 1.7914549218897047e-05, "loss": 1.3749, "step": 5540 }, { "epoch": 1.04, "grad_norm": 8.311807632446289, "learning_rate": 1.7910784867306607e-05, "loss": 1.0602, "step": 5550 }, { "epoch": 1.05, "grad_norm": 11.766958236694336, "learning_rate": 1.790702051571617e-05, "loss": 1.2955, "step": 5560 }, { "epoch": 1.05, "grad_norm": 4.863016605377197, "learning_rate": 1.790325616412573e-05, "loss": 1.7003, "step": 5570 }, { "epoch": 1.05, "grad_norm": 18.07254409790039, "learning_rate": 1.7899491812535293e-05, "loss": 1.3364, "step": 5580 }, { "epoch": 1.05, "grad_norm": 4.024524211883545, "learning_rate": 1.7895727460944853e-05, "loss": 1.3305, "step": 5590 }, { "epoch": 1.05, "grad_norm": 15.396920204162598, "learning_rate": 1.7891963109354416e-05, "loss": 1.309, "step": 5600 }, { "epoch": 1.06, "grad_norm": 9.364221572875977, "learning_rate": 1.7888198757763976e-05, "loss": 1.4572, "step": 5610 }, { "epoch": 1.06, "grad_norm": 21.09225082397461, "learning_rate": 1.788443440617354e-05, "loss": 1.4568, "step": 5620 }, { "epoch": 1.06, "grad_norm": 37.32011795043945, "learning_rate": 1.78806700545831e-05, "loss": 1.5728, "step": 5630 }, { "epoch": 1.06, "grad_norm": 6.8537468910217285, "learning_rate": 1.7876905702992662e-05, "loss": 1.4453, "step": 5640 }, { "epoch": 1.06, "grad_norm": 10.12431812286377, "learning_rate": 1.7873141351402222e-05, "loss": 1.4191, "step": 5650 }, { "epoch": 1.07, "grad_norm": 6.774322032928467, "learning_rate": 1.7869376999811785e-05, "loss": 1.3566, "step": 5660 }, { "epoch": 1.07, "grad_norm": 9.487374305725098, "learning_rate": 1.7865612648221345e-05, "loss": 1.2845, "step": 5670 }, { "epoch": 1.07, "grad_norm": 8.616371154785156, "learning_rate": 1.7861848296630905e-05, "loss": 1.2827, "step": 5680 }, { "epoch": 1.07, "grad_norm": 18.26116943359375, "learning_rate": 1.7858083945040468e-05, "loss": 1.279, "step": 5690 }, { "epoch": 1.07, "grad_norm": 6.658090114593506, "learning_rate": 1.7854319593450027e-05, "loss": 1.1976, "step": 5700 }, { "epoch": 1.07, "grad_norm": 2.6320950984954834, "learning_rate": 1.785055524185959e-05, "loss": 1.3875, "step": 5710 }, { "epoch": 1.08, "grad_norm": 13.106246948242188, "learning_rate": 1.7846790890269154e-05, "loss": 1.42, "step": 5720 }, { "epoch": 1.08, "grad_norm": 7.456079006195068, "learning_rate": 1.7843026538678714e-05, "loss": 1.106, "step": 5730 }, { "epoch": 1.08, "grad_norm": 13.17140007019043, "learning_rate": 1.7839262187088277e-05, "loss": 1.153, "step": 5740 }, { "epoch": 1.08, "grad_norm": 28.36896324157715, "learning_rate": 1.7835497835497836e-05, "loss": 1.6023, "step": 5750 }, { "epoch": 1.08, "grad_norm": 13.630576133728027, "learning_rate": 1.78317334839074e-05, "loss": 1.5223, "step": 5760 }, { "epoch": 1.09, "grad_norm": 3.064805030822754, "learning_rate": 1.782796913231696e-05, "loss": 1.3474, "step": 5770 }, { "epoch": 1.09, "grad_norm": 16.060394287109375, "learning_rate": 1.7824204780726523e-05, "loss": 1.3724, "step": 5780 }, { "epoch": 1.09, "grad_norm": 8.598823547363281, "learning_rate": 1.7820440429136082e-05, "loss": 1.2749, "step": 5790 }, { "epoch": 1.09, "grad_norm": 15.959213256835938, "learning_rate": 1.7816676077545645e-05, "loss": 1.5515, "step": 5800 }, { "epoch": 1.09, "grad_norm": 11.059527397155762, "learning_rate": 1.7812911725955205e-05, "loss": 1.3217, "step": 5810 }, { "epoch": 1.1, "grad_norm": 14.561382293701172, "learning_rate": 1.780914737436477e-05, "loss": 1.4639, "step": 5820 }, { "epoch": 1.1, "grad_norm": 7.75567626953125, "learning_rate": 1.7805383022774328e-05, "loss": 1.4613, "step": 5830 }, { "epoch": 1.1, "grad_norm": 5.322535037994385, "learning_rate": 1.7801618671183888e-05, "loss": 1.3931, "step": 5840 }, { "epoch": 1.1, "grad_norm": 8.112421035766602, "learning_rate": 1.779785431959345e-05, "loss": 1.3793, "step": 5850 }, { "epoch": 1.1, "grad_norm": 10.282398223876953, "learning_rate": 1.779408996800301e-05, "loss": 1.4861, "step": 5860 }, { "epoch": 1.1, "grad_norm": 16.15000343322754, "learning_rate": 1.7790325616412574e-05, "loss": 1.4151, "step": 5870 }, { "epoch": 1.11, "grad_norm": 16.440019607543945, "learning_rate": 1.7786561264822134e-05, "loss": 1.4436, "step": 5880 }, { "epoch": 1.11, "grad_norm": 8.281665802001953, "learning_rate": 1.7782796913231697e-05, "loss": 1.416, "step": 5890 }, { "epoch": 1.11, "grad_norm": 6.84842586517334, "learning_rate": 1.777903256164126e-05, "loss": 1.1374, "step": 5900 }, { "epoch": 1.11, "grad_norm": 6.96786642074585, "learning_rate": 1.777526821005082e-05, "loss": 1.0473, "step": 5910 }, { "epoch": 1.11, "grad_norm": 17.65937042236328, "learning_rate": 1.7771503858460383e-05, "loss": 1.1968, "step": 5920 }, { "epoch": 1.12, "grad_norm": 7.487143516540527, "learning_rate": 1.7767739506869943e-05, "loss": 1.4706, "step": 5930 }, { "epoch": 1.12, "grad_norm": 16.491477966308594, "learning_rate": 1.7763975155279506e-05, "loss": 1.3157, "step": 5940 }, { "epoch": 1.12, "grad_norm": 6.16838264465332, "learning_rate": 1.7760210803689066e-05, "loss": 1.1594, "step": 5950 }, { "epoch": 1.12, "grad_norm": 27.0855712890625, "learning_rate": 1.775644645209863e-05, "loss": 1.3721, "step": 5960 }, { "epoch": 1.12, "grad_norm": 7.84747314453125, "learning_rate": 1.775268210050819e-05, "loss": 1.3545, "step": 5970 }, { "epoch": 1.13, "grad_norm": 12.929715156555176, "learning_rate": 1.7748917748917752e-05, "loss": 1.2785, "step": 5980 }, { "epoch": 1.13, "grad_norm": 9.559564590454102, "learning_rate": 1.774515339732731e-05, "loss": 1.242, "step": 5990 }, { "epoch": 1.13, "grad_norm": 8.087242126464844, "learning_rate": 1.7741389045736875e-05, "loss": 1.1266, "step": 6000 }, { "epoch": 1.13, "grad_norm": 9.025975227355957, "learning_rate": 1.7737624694146434e-05, "loss": 1.4599, "step": 6010 }, { "epoch": 1.13, "grad_norm": 8.053915023803711, "learning_rate": 1.7733860342555994e-05, "loss": 0.8851, "step": 6020 }, { "epoch": 1.13, "grad_norm": 11.262309074401855, "learning_rate": 1.7730095990965557e-05, "loss": 1.1816, "step": 6030 }, { "epoch": 1.14, "grad_norm": 8.548508644104004, "learning_rate": 1.7726331639375117e-05, "loss": 1.2975, "step": 6040 }, { "epoch": 1.14, "grad_norm": 6.463540554046631, "learning_rate": 1.772256728778468e-05, "loss": 1.1834, "step": 6050 }, { "epoch": 1.14, "grad_norm": 8.140536308288574, "learning_rate": 1.771880293619424e-05, "loss": 1.0621, "step": 6060 }, { "epoch": 1.14, "grad_norm": 3.9653189182281494, "learning_rate": 1.7715038584603803e-05, "loss": 1.1374, "step": 6070 }, { "epoch": 1.14, "grad_norm": 16.860267639160156, "learning_rate": 1.7711274233013366e-05, "loss": 1.3068, "step": 6080 }, { "epoch": 1.15, "grad_norm": 7.63691520690918, "learning_rate": 1.7707509881422926e-05, "loss": 1.1172, "step": 6090 }, { "epoch": 1.15, "grad_norm": 12.925235748291016, "learning_rate": 1.770374552983249e-05, "loss": 0.7959, "step": 6100 }, { "epoch": 1.15, "grad_norm": 27.463420867919922, "learning_rate": 1.769998117824205e-05, "loss": 1.3255, "step": 6110 }, { "epoch": 1.15, "grad_norm": 9.057517051696777, "learning_rate": 1.7696216826651612e-05, "loss": 1.4139, "step": 6120 }, { "epoch": 1.15, "grad_norm": 5.9874982833862305, "learning_rate": 1.7692452475061172e-05, "loss": 1.3465, "step": 6130 }, { "epoch": 1.16, "grad_norm": 16.482505798339844, "learning_rate": 1.7688688123470735e-05, "loss": 0.9673, "step": 6140 }, { "epoch": 1.16, "grad_norm": 4.9901885986328125, "learning_rate": 1.7684923771880295e-05, "loss": 1.3453, "step": 6150 }, { "epoch": 1.16, "grad_norm": 18.844236373901367, "learning_rate": 1.7681159420289858e-05, "loss": 1.1033, "step": 6160 }, { "epoch": 1.16, "grad_norm": 18.83871841430664, "learning_rate": 1.7677395068699418e-05, "loss": 1.4439, "step": 6170 }, { "epoch": 1.16, "grad_norm": 10.778878211975098, "learning_rate": 1.767363071710898e-05, "loss": 1.2751, "step": 6180 }, { "epoch": 1.17, "grad_norm": 26.04317283630371, "learning_rate": 1.766986636551854e-05, "loss": 1.1188, "step": 6190 }, { "epoch": 1.17, "grad_norm": 14.1301851272583, "learning_rate": 1.76661020139281e-05, "loss": 1.376, "step": 6200 }, { "epoch": 1.17, "grad_norm": 8.929794311523438, "learning_rate": 1.7662337662337664e-05, "loss": 1.2898, "step": 6210 }, { "epoch": 1.17, "grad_norm": 24.299001693725586, "learning_rate": 1.7658573310747223e-05, "loss": 1.1609, "step": 6220 }, { "epoch": 1.17, "grad_norm": 15.158361434936523, "learning_rate": 1.7654808959156787e-05, "loss": 1.2065, "step": 6230 }, { "epoch": 1.17, "grad_norm": 7.862684726715088, "learning_rate": 1.7651044607566346e-05, "loss": 1.302, "step": 6240 }, { "epoch": 1.18, "grad_norm": 4.036644458770752, "learning_rate": 1.764728025597591e-05, "loss": 0.9381, "step": 6250 }, { "epoch": 1.18, "grad_norm": 17.739803314208984, "learning_rate": 1.764351590438547e-05, "loss": 0.9986, "step": 6260 }, { "epoch": 1.18, "grad_norm": 23.925811767578125, "learning_rate": 1.7639751552795032e-05, "loss": 1.4922, "step": 6270 }, { "epoch": 1.18, "grad_norm": 5.70884895324707, "learning_rate": 1.7635987201204596e-05, "loss": 1.2282, "step": 6280 }, { "epoch": 1.18, "grad_norm": 10.17802619934082, "learning_rate": 1.7632222849614155e-05, "loss": 1.3497, "step": 6290 }, { "epoch": 1.19, "grad_norm": 7.948269844055176, "learning_rate": 1.762845849802372e-05, "loss": 1.0691, "step": 6300 }, { "epoch": 1.19, "grad_norm": 19.185148239135742, "learning_rate": 1.7624694146433278e-05, "loss": 1.206, "step": 6310 }, { "epoch": 1.19, "grad_norm": 19.54599952697754, "learning_rate": 1.762092979484284e-05, "loss": 1.2538, "step": 6320 }, { "epoch": 1.19, "grad_norm": 5.534640789031982, "learning_rate": 1.76171654432524e-05, "loss": 0.9516, "step": 6330 }, { "epoch": 1.19, "grad_norm": 10.249228477478027, "learning_rate": 1.7613401091661964e-05, "loss": 1.1624, "step": 6340 }, { "epoch": 1.2, "grad_norm": 17.44942283630371, "learning_rate": 1.7609636740071524e-05, "loss": 1.4045, "step": 6350 }, { "epoch": 1.2, "grad_norm": 8.629911422729492, "learning_rate": 1.7605872388481084e-05, "loss": 1.3624, "step": 6360 }, { "epoch": 1.2, "grad_norm": 14.105700492858887, "learning_rate": 1.7602108036890647e-05, "loss": 1.2062, "step": 6370 }, { "epoch": 1.2, "grad_norm": 9.206585884094238, "learning_rate": 1.7598343685300207e-05, "loss": 1.4023, "step": 6380 }, { "epoch": 1.2, "grad_norm": 34.754150390625, "learning_rate": 1.759457933370977e-05, "loss": 1.1324, "step": 6390 }, { "epoch": 1.2, "grad_norm": 21.429977416992188, "learning_rate": 1.759081498211933e-05, "loss": 1.3611, "step": 6400 }, { "epoch": 1.21, "grad_norm": 13.360370635986328, "learning_rate": 1.7587050630528893e-05, "loss": 1.2232, "step": 6410 }, { "epoch": 1.21, "grad_norm": 7.976438522338867, "learning_rate": 1.7583286278938453e-05, "loss": 1.2726, "step": 6420 }, { "epoch": 1.21, "grad_norm": 3.3813695907592773, "learning_rate": 1.7579521927348016e-05, "loss": 1.3038, "step": 6430 }, { "epoch": 1.21, "grad_norm": 4.168612480163574, "learning_rate": 1.7575757575757576e-05, "loss": 1.0494, "step": 6440 }, { "epoch": 1.21, "grad_norm": 4.296903133392334, "learning_rate": 1.757199322416714e-05, "loss": 1.3312, "step": 6450 }, { "epoch": 1.22, "grad_norm": 9.388733863830566, "learning_rate": 1.7568228872576702e-05, "loss": 1.0723, "step": 6460 }, { "epoch": 1.22, "grad_norm": 12.61970043182373, "learning_rate": 1.756446452098626e-05, "loss": 1.1115, "step": 6470 }, { "epoch": 1.22, "grad_norm": 7.065197944641113, "learning_rate": 1.7560700169395825e-05, "loss": 1.173, "step": 6480 }, { "epoch": 1.22, "grad_norm": 20.004718780517578, "learning_rate": 1.7556935817805385e-05, "loss": 1.0418, "step": 6490 }, { "epoch": 1.22, "grad_norm": 5.888309001922607, "learning_rate": 1.7553171466214948e-05, "loss": 1.2062, "step": 6500 }, { "epoch": 1.23, "grad_norm": 31.428281784057617, "learning_rate": 1.7549407114624507e-05, "loss": 1.3629, "step": 6510 }, { "epoch": 1.23, "grad_norm": 37.77729034423828, "learning_rate": 1.754564276303407e-05, "loss": 1.1825, "step": 6520 }, { "epoch": 1.23, "grad_norm": 3.3252882957458496, "learning_rate": 1.754187841144363e-05, "loss": 1.3147, "step": 6530 }, { "epoch": 1.23, "grad_norm": 14.839897155761719, "learning_rate": 1.753811405985319e-05, "loss": 1.2901, "step": 6540 }, { "epoch": 1.23, "grad_norm": 13.03376293182373, "learning_rate": 1.7534349708262753e-05, "loss": 0.8962, "step": 6550 }, { "epoch": 1.23, "grad_norm": 15.040424346923828, "learning_rate": 1.7530585356672313e-05, "loss": 1.3194, "step": 6560 }, { "epoch": 1.24, "grad_norm": 12.113794326782227, "learning_rate": 1.7526821005081876e-05, "loss": 1.1243, "step": 6570 }, { "epoch": 1.24, "grad_norm": 13.116803169250488, "learning_rate": 1.7523056653491436e-05, "loss": 0.9943, "step": 6580 }, { "epoch": 1.24, "grad_norm": 6.835216522216797, "learning_rate": 1.7519292301901e-05, "loss": 1.2691, "step": 6590 }, { "epoch": 1.24, "grad_norm": 10.555135726928711, "learning_rate": 1.751552795031056e-05, "loss": 0.9653, "step": 6600 }, { "epoch": 1.24, "grad_norm": 20.608491897583008, "learning_rate": 1.7511763598720122e-05, "loss": 1.2499, "step": 6610 }, { "epoch": 1.25, "grad_norm": 10.590780258178711, "learning_rate": 1.7507999247129682e-05, "loss": 1.1381, "step": 6620 }, { "epoch": 1.25, "grad_norm": 1.9735676050186157, "learning_rate": 1.7504234895539245e-05, "loss": 0.9156, "step": 6630 }, { "epoch": 1.25, "grad_norm": 10.672266006469727, "learning_rate": 1.7500470543948808e-05, "loss": 1.3334, "step": 6640 }, { "epoch": 1.25, "grad_norm": 16.784568786621094, "learning_rate": 1.7496706192358368e-05, "loss": 1.4105, "step": 6650 }, { "epoch": 1.25, "grad_norm": 14.953975677490234, "learning_rate": 1.749294184076793e-05, "loss": 1.0408, "step": 6660 }, { "epoch": 1.26, "grad_norm": 14.848883628845215, "learning_rate": 1.748917748917749e-05, "loss": 1.1514, "step": 6670 }, { "epoch": 1.26, "grad_norm": 13.122929573059082, "learning_rate": 1.7485413137587054e-05, "loss": 1.1443, "step": 6680 }, { "epoch": 1.26, "grad_norm": 20.22705078125, "learning_rate": 1.7481648785996614e-05, "loss": 1.0515, "step": 6690 }, { "epoch": 1.26, "grad_norm": 11.579082489013672, "learning_rate": 1.7477884434406177e-05, "loss": 1.3199, "step": 6700 }, { "epoch": 1.26, "grad_norm": 20.08841323852539, "learning_rate": 1.7474120082815737e-05, "loss": 1.0948, "step": 6710 }, { "epoch": 1.26, "grad_norm": 23.67656135559082, "learning_rate": 1.7470355731225296e-05, "loss": 1.158, "step": 6720 }, { "epoch": 1.27, "grad_norm": 11.358781814575195, "learning_rate": 1.746659137963486e-05, "loss": 1.1823, "step": 6730 }, { "epoch": 1.27, "grad_norm": 11.688865661621094, "learning_rate": 1.746282702804442e-05, "loss": 1.3804, "step": 6740 }, { "epoch": 1.27, "grad_norm": 10.149205207824707, "learning_rate": 1.7459062676453983e-05, "loss": 0.9094, "step": 6750 }, { "epoch": 1.27, "grad_norm": 9.609593391418457, "learning_rate": 1.7455298324863542e-05, "loss": 1.2674, "step": 6760 }, { "epoch": 1.27, "grad_norm": 15.846113204956055, "learning_rate": 1.7451533973273105e-05, "loss": 1.2045, "step": 6770 }, { "epoch": 1.28, "grad_norm": 14.391237258911133, "learning_rate": 1.7447769621682665e-05, "loss": 1.0173, "step": 6780 }, { "epoch": 1.28, "grad_norm": 18.767101287841797, "learning_rate": 1.744400527009223e-05, "loss": 1.1205, "step": 6790 }, { "epoch": 1.28, "grad_norm": 8.494179725646973, "learning_rate": 1.7440240918501788e-05, "loss": 1.0519, "step": 6800 }, { "epoch": 1.28, "grad_norm": 7.717184543609619, "learning_rate": 1.743647656691135e-05, "loss": 1.0126, "step": 6810 }, { "epoch": 1.28, "grad_norm": 19.109561920166016, "learning_rate": 1.743271221532091e-05, "loss": 0.9986, "step": 6820 }, { "epoch": 1.29, "grad_norm": 7.908644199371338, "learning_rate": 1.7428947863730474e-05, "loss": 0.9898, "step": 6830 }, { "epoch": 1.29, "grad_norm": 7.440939426422119, "learning_rate": 1.7425183512140037e-05, "loss": 1.1074, "step": 6840 }, { "epoch": 1.29, "grad_norm": 6.752496719360352, "learning_rate": 1.7421419160549597e-05, "loss": 1.1364, "step": 6850 }, { "epoch": 1.29, "grad_norm": 16.471498489379883, "learning_rate": 1.741765480895916e-05, "loss": 1.385, "step": 6860 }, { "epoch": 1.29, "grad_norm": 5.562114715576172, "learning_rate": 1.741389045736872e-05, "loss": 1.0686, "step": 6870 }, { "epoch": 1.29, "grad_norm": 7.111942768096924, "learning_rate": 1.7410126105778283e-05, "loss": 1.0202, "step": 6880 }, { "epoch": 1.3, "grad_norm": 26.472091674804688, "learning_rate": 1.7406361754187843e-05, "loss": 1.2902, "step": 6890 }, { "epoch": 1.3, "grad_norm": 9.381438255310059, "learning_rate": 1.7402597402597403e-05, "loss": 1.2116, "step": 6900 }, { "epoch": 1.3, "grad_norm": 61.49063491821289, "learning_rate": 1.7398833051006966e-05, "loss": 1.4989, "step": 6910 }, { "epoch": 1.3, "grad_norm": 20.3765869140625, "learning_rate": 1.7395068699416526e-05, "loss": 0.8747, "step": 6920 }, { "epoch": 1.3, "grad_norm": 16.292556762695312, "learning_rate": 1.739130434782609e-05, "loss": 1.1964, "step": 6930 }, { "epoch": 1.31, "grad_norm": 6.413318634033203, "learning_rate": 1.738753999623565e-05, "loss": 0.9384, "step": 6940 }, { "epoch": 1.31, "grad_norm": 11.494325637817383, "learning_rate": 1.7383775644645212e-05, "loss": 1.4534, "step": 6950 }, { "epoch": 1.31, "grad_norm": 16.83256721496582, "learning_rate": 1.738001129305477e-05, "loss": 1.1801, "step": 6960 }, { "epoch": 1.31, "grad_norm": 10.633999824523926, "learning_rate": 1.7376246941464335e-05, "loss": 1.3302, "step": 6970 }, { "epoch": 1.31, "grad_norm": 23.226150512695312, "learning_rate": 1.7372482589873894e-05, "loss": 1.1845, "step": 6980 }, { "epoch": 1.32, "grad_norm": 14.6954345703125, "learning_rate": 1.7368718238283458e-05, "loss": 1.2775, "step": 6990 }, { "epoch": 1.32, "grad_norm": 11.616198539733887, "learning_rate": 1.7364953886693017e-05, "loss": 0.9493, "step": 7000 }, { "epoch": 1.32, "grad_norm": 5.6046929359436035, "learning_rate": 1.736118953510258e-05, "loss": 1.2158, "step": 7010 }, { "epoch": 1.32, "grad_norm": 7.577897071838379, "learning_rate": 1.7357425183512144e-05, "loss": 1.092, "step": 7020 }, { "epoch": 1.32, "grad_norm": 14.711727142333984, "learning_rate": 1.7353660831921703e-05, "loss": 1.3447, "step": 7030 }, { "epoch": 1.33, "grad_norm": 22.08068084716797, "learning_rate": 1.7349896480331267e-05, "loss": 0.9474, "step": 7040 }, { "epoch": 1.33, "grad_norm": 5.154607772827148, "learning_rate": 1.7346132128740826e-05, "loss": 1.0093, "step": 7050 }, { "epoch": 1.33, "grad_norm": 7.215937614440918, "learning_rate": 1.7342367777150386e-05, "loss": 1.178, "step": 7060 }, { "epoch": 1.33, "grad_norm": 3.9813942909240723, "learning_rate": 1.733860342555995e-05, "loss": 1.1342, "step": 7070 }, { "epoch": 1.33, "grad_norm": 8.915732383728027, "learning_rate": 1.733483907396951e-05, "loss": 1.1109, "step": 7080 }, { "epoch": 1.33, "grad_norm": 1.5986876487731934, "learning_rate": 1.7331074722379072e-05, "loss": 1.1772, "step": 7090 }, { "epoch": 1.34, "grad_norm": 5.484806537628174, "learning_rate": 1.7327310370788632e-05, "loss": 1.1488, "step": 7100 }, { "epoch": 1.34, "grad_norm": 15.327614784240723, "learning_rate": 1.7323546019198195e-05, "loss": 1.1301, "step": 7110 }, { "epoch": 1.34, "grad_norm": 8.505776405334473, "learning_rate": 1.7319781667607755e-05, "loss": 1.1446, "step": 7120 }, { "epoch": 1.34, "grad_norm": 11.313884735107422, "learning_rate": 1.7316017316017318e-05, "loss": 1.0336, "step": 7130 }, { "epoch": 1.34, "grad_norm": 8.289346694946289, "learning_rate": 1.7312252964426878e-05, "loss": 0.9494, "step": 7140 }, { "epoch": 1.35, "grad_norm": 14.154651641845703, "learning_rate": 1.730848861283644e-05, "loss": 0.8998, "step": 7150 }, { "epoch": 1.35, "grad_norm": 8.944982528686523, "learning_rate": 1.7304724261246e-05, "loss": 0.9689, "step": 7160 }, { "epoch": 1.35, "grad_norm": 9.77503490447998, "learning_rate": 1.7300959909655564e-05, "loss": 1.0465, "step": 7170 }, { "epoch": 1.35, "grad_norm": 28.841171264648438, "learning_rate": 1.7297195558065124e-05, "loss": 1.2161, "step": 7180 }, { "epoch": 1.35, "grad_norm": 8.071671485900879, "learning_rate": 1.7293431206474687e-05, "loss": 1.1167, "step": 7190 }, { "epoch": 1.36, "grad_norm": 11.291550636291504, "learning_rate": 1.728966685488425e-05, "loss": 1.2706, "step": 7200 }, { "epoch": 1.36, "grad_norm": 13.803507804870605, "learning_rate": 1.728590250329381e-05, "loss": 1.1404, "step": 7210 }, { "epoch": 1.36, "grad_norm": 9.958276748657227, "learning_rate": 1.7282138151703373e-05, "loss": 1.2336, "step": 7220 }, { "epoch": 1.36, "grad_norm": 8.445544242858887, "learning_rate": 1.727837380011293e-05, "loss": 1.16, "step": 7230 }, { "epoch": 1.36, "grad_norm": 20.41545867919922, "learning_rate": 1.7274609448522492e-05, "loss": 1.3856, "step": 7240 }, { "epoch": 1.36, "grad_norm": 9.721564292907715, "learning_rate": 1.7270845096932056e-05, "loss": 1.0896, "step": 7250 }, { "epoch": 1.37, "grad_norm": 18.577259063720703, "learning_rate": 1.7267080745341615e-05, "loss": 1.181, "step": 7260 }, { "epoch": 1.37, "grad_norm": 24.051130294799805, "learning_rate": 1.726331639375118e-05, "loss": 1.0202, "step": 7270 }, { "epoch": 1.37, "grad_norm": 12.581955909729004, "learning_rate": 1.7259552042160738e-05, "loss": 1.0709, "step": 7280 }, { "epoch": 1.37, "grad_norm": 6.8670854568481445, "learning_rate": 1.72557876905703e-05, "loss": 0.9722, "step": 7290 }, { "epoch": 1.37, "grad_norm": 9.2810640335083, "learning_rate": 1.725202333897986e-05, "loss": 1.1988, "step": 7300 }, { "epoch": 1.38, "grad_norm": 8.035595893859863, "learning_rate": 1.7248258987389424e-05, "loss": 1.059, "step": 7310 }, { "epoch": 1.38, "grad_norm": 5.461163520812988, "learning_rate": 1.7244494635798984e-05, "loss": 1.1715, "step": 7320 }, { "epoch": 1.38, "grad_norm": 6.015925407409668, "learning_rate": 1.7240730284208547e-05, "loss": 0.8844, "step": 7330 }, { "epoch": 1.38, "grad_norm": 17.79153060913086, "learning_rate": 1.7236965932618107e-05, "loss": 1.1316, "step": 7340 }, { "epoch": 1.38, "grad_norm": 65.43628692626953, "learning_rate": 1.723320158102767e-05, "loss": 0.9798, "step": 7350 }, { "epoch": 1.39, "grad_norm": 12.030508041381836, "learning_rate": 1.722943722943723e-05, "loss": 1.093, "step": 7360 }, { "epoch": 1.39, "grad_norm": 5.5272297859191895, "learning_rate": 1.7225672877846793e-05, "loss": 0.882, "step": 7370 }, { "epoch": 1.39, "grad_norm": 6.576135635375977, "learning_rate": 1.7221908526256356e-05, "loss": 1.0853, "step": 7380 }, { "epoch": 1.39, "grad_norm": 13.047865867614746, "learning_rate": 1.7218144174665916e-05, "loss": 1.0521, "step": 7390 }, { "epoch": 1.39, "grad_norm": 10.292452812194824, "learning_rate": 1.721437982307548e-05, "loss": 1.3856, "step": 7400 }, { "epoch": 1.39, "grad_norm": 4.0016326904296875, "learning_rate": 1.7210615471485036e-05, "loss": 0.9876, "step": 7410 }, { "epoch": 1.4, "grad_norm": 10.368799209594727, "learning_rate": 1.72068511198946e-05, "loss": 0.7977, "step": 7420 }, { "epoch": 1.4, "grad_norm": 12.0331449508667, "learning_rate": 1.7203086768304162e-05, "loss": 1.0317, "step": 7430 }, { "epoch": 1.4, "grad_norm": 4.535419940948486, "learning_rate": 1.719932241671372e-05, "loss": 1.2162, "step": 7440 }, { "epoch": 1.4, "grad_norm": 4.63490104675293, "learning_rate": 1.7195558065123285e-05, "loss": 0.9855, "step": 7450 }, { "epoch": 1.4, "grad_norm": 12.621866226196289, "learning_rate": 1.7191793713532845e-05, "loss": 1.1798, "step": 7460 }, { "epoch": 1.41, "grad_norm": 6.641757488250732, "learning_rate": 1.7188029361942408e-05, "loss": 1.2643, "step": 7470 }, { "epoch": 1.41, "grad_norm": 4.4592604637146, "learning_rate": 1.7184265010351967e-05, "loss": 0.8314, "step": 7480 }, { "epoch": 1.41, "grad_norm": 22.9270076751709, "learning_rate": 1.718050065876153e-05, "loss": 1.2297, "step": 7490 }, { "epoch": 1.41, "grad_norm": 13.613977432250977, "learning_rate": 1.717673630717109e-05, "loss": 0.957, "step": 7500 }, { "epoch": 1.41, "grad_norm": 18.736854553222656, "learning_rate": 1.7172971955580654e-05, "loss": 1.0854, "step": 7510 }, { "epoch": 1.42, "grad_norm": 15.172490119934082, "learning_rate": 1.7169207603990213e-05, "loss": 0.9997, "step": 7520 }, { "epoch": 1.42, "grad_norm": 14.035980224609375, "learning_rate": 1.7165443252399776e-05, "loss": 0.9522, "step": 7530 }, { "epoch": 1.42, "grad_norm": 6.1426310539245605, "learning_rate": 1.7161678900809336e-05, "loss": 0.8932, "step": 7540 }, { "epoch": 1.42, "grad_norm": 4.456810474395752, "learning_rate": 1.71579145492189e-05, "loss": 1.0575, "step": 7550 }, { "epoch": 1.42, "grad_norm": 18.907272338867188, "learning_rate": 1.715415019762846e-05, "loss": 0.9667, "step": 7560 }, { "epoch": 1.42, "grad_norm": 24.286773681640625, "learning_rate": 1.7150385846038022e-05, "loss": 1.1699, "step": 7570 }, { "epoch": 1.43, "grad_norm": 6.49676513671875, "learning_rate": 1.7146621494447582e-05, "loss": 1.1024, "step": 7580 }, { "epoch": 1.43, "grad_norm": 13.635810852050781, "learning_rate": 1.7142857142857142e-05, "loss": 0.9841, "step": 7590 }, { "epoch": 1.43, "grad_norm": 13.655981063842773, "learning_rate": 1.7139092791266705e-05, "loss": 1.0624, "step": 7600 }, { "epoch": 1.43, "grad_norm": 3.580477714538574, "learning_rate": 1.7135328439676268e-05, "loss": 0.943, "step": 7610 }, { "epoch": 1.43, "grad_norm": 19.59247589111328, "learning_rate": 1.7131564088085828e-05, "loss": 0.9837, "step": 7620 }, { "epoch": 1.44, "grad_norm": 3.9449715614318848, "learning_rate": 1.712779973649539e-05, "loss": 1.0507, "step": 7630 }, { "epoch": 1.44, "grad_norm": 10.523272514343262, "learning_rate": 1.712403538490495e-05, "loss": 1.0263, "step": 7640 }, { "epoch": 1.44, "grad_norm": 14.362699508666992, "learning_rate": 1.7120271033314514e-05, "loss": 1.1896, "step": 7650 }, { "epoch": 1.44, "grad_norm": 2.5842766761779785, "learning_rate": 1.7116506681724074e-05, "loss": 1.0641, "step": 7660 }, { "epoch": 1.44, "grad_norm": 17.754718780517578, "learning_rate": 1.7112742330133637e-05, "loss": 0.9287, "step": 7670 }, { "epoch": 1.45, "grad_norm": 10.024535179138184, "learning_rate": 1.7108977978543197e-05, "loss": 1.2929, "step": 7680 }, { "epoch": 1.45, "grad_norm": 14.328852653503418, "learning_rate": 1.710521362695276e-05, "loss": 1.2506, "step": 7690 }, { "epoch": 1.45, "grad_norm": 3.0855348110198975, "learning_rate": 1.710144927536232e-05, "loss": 0.9465, "step": 7700 }, { "epoch": 1.45, "grad_norm": 3.1114163398742676, "learning_rate": 1.7097684923771883e-05, "loss": 1.1016, "step": 7710 }, { "epoch": 1.45, "grad_norm": 7.601757526397705, "learning_rate": 1.7093920572181443e-05, "loss": 1.1521, "step": 7720 }, { "epoch": 1.45, "grad_norm": 8.17392349243164, "learning_rate": 1.7090156220591006e-05, "loss": 1.1438, "step": 7730 }, { "epoch": 1.46, "grad_norm": 14.822709083557129, "learning_rate": 1.7086391869000565e-05, "loss": 1.0884, "step": 7740 }, { "epoch": 1.46, "grad_norm": 12.926187515258789, "learning_rate": 1.708262751741013e-05, "loss": 1.2607, "step": 7750 }, { "epoch": 1.46, "grad_norm": 13.08968734741211, "learning_rate": 1.707886316581969e-05, "loss": 1.1789, "step": 7760 }, { "epoch": 1.46, "grad_norm": 13.06160831451416, "learning_rate": 1.7075098814229248e-05, "loss": 1.2074, "step": 7770 }, { "epoch": 1.46, "grad_norm": 9.818836212158203, "learning_rate": 1.707133446263881e-05, "loss": 1.2368, "step": 7780 }, { "epoch": 1.47, "grad_norm": 15.557538032531738, "learning_rate": 1.706757011104837e-05, "loss": 1.1518, "step": 7790 }, { "epoch": 1.47, "grad_norm": 12.945401191711426, "learning_rate": 1.7063805759457934e-05, "loss": 0.9489, "step": 7800 }, { "epoch": 1.47, "grad_norm": 8.303050994873047, "learning_rate": 1.7060041407867497e-05, "loss": 1.0736, "step": 7810 }, { "epoch": 1.47, "grad_norm": 19.958223342895508, "learning_rate": 1.7056277056277057e-05, "loss": 1.3085, "step": 7820 }, { "epoch": 1.47, "grad_norm": 13.087350845336914, "learning_rate": 1.705251270468662e-05, "loss": 1.2354, "step": 7830 }, { "epoch": 1.48, "grad_norm": 5.501480579376221, "learning_rate": 1.704874835309618e-05, "loss": 1.1402, "step": 7840 }, { "epoch": 1.48, "grad_norm": 16.800434112548828, "learning_rate": 1.7044984001505743e-05, "loss": 1.176, "step": 7850 }, { "epoch": 1.48, "grad_norm": 15.918781280517578, "learning_rate": 1.7041219649915303e-05, "loss": 1.0657, "step": 7860 }, { "epoch": 1.48, "grad_norm": 10.953459739685059, "learning_rate": 1.7037455298324866e-05, "loss": 0.9552, "step": 7870 }, { "epoch": 1.48, "grad_norm": 11.856039047241211, "learning_rate": 1.7033690946734426e-05, "loss": 0.9727, "step": 7880 }, { "epoch": 1.49, "grad_norm": 6.3316755294799805, "learning_rate": 1.702992659514399e-05, "loss": 0.9975, "step": 7890 }, { "epoch": 1.49, "grad_norm": 54.56752014160156, "learning_rate": 1.702616224355355e-05, "loss": 1.0682, "step": 7900 }, { "epoch": 1.49, "grad_norm": 16.042255401611328, "learning_rate": 1.7022397891963112e-05, "loss": 1.2348, "step": 7910 }, { "epoch": 1.49, "grad_norm": 14.975419044494629, "learning_rate": 1.7018633540372672e-05, "loss": 1.0173, "step": 7920 }, { "epoch": 1.49, "grad_norm": 11.27600383758545, "learning_rate": 1.701486918878223e-05, "loss": 1.1986, "step": 7930 }, { "epoch": 1.49, "grad_norm": 25.487232208251953, "learning_rate": 1.7011104837191795e-05, "loss": 1.5951, "step": 7940 }, { "epoch": 1.5, "grad_norm": 19.516244888305664, "learning_rate": 1.7007340485601354e-05, "loss": 1.0892, "step": 7950 }, { "epoch": 1.5, "grad_norm": 1.6688463687896729, "learning_rate": 1.7003576134010918e-05, "loss": 0.8421, "step": 7960 }, { "epoch": 1.5, "grad_norm": 7.081192493438721, "learning_rate": 1.6999811782420477e-05, "loss": 1.0691, "step": 7970 }, { "epoch": 1.5, "grad_norm": 10.31060791015625, "learning_rate": 1.699604743083004e-05, "loss": 0.8777, "step": 7980 }, { "epoch": 1.5, "grad_norm": 7.9369659423828125, "learning_rate": 1.6992283079239604e-05, "loss": 0.9824, "step": 7990 }, { "epoch": 1.51, "grad_norm": 13.844595909118652, "learning_rate": 1.6988518727649163e-05, "loss": 0.9572, "step": 8000 }, { "epoch": 1.51, "grad_norm": 32.09144592285156, "learning_rate": 1.6984754376058727e-05, "loss": 1.0308, "step": 8010 }, { "epoch": 1.51, "grad_norm": 11.795945167541504, "learning_rate": 1.6980990024468286e-05, "loss": 1.1595, "step": 8020 }, { "epoch": 1.51, "grad_norm": 22.064987182617188, "learning_rate": 1.697722567287785e-05, "loss": 0.7747, "step": 8030 }, { "epoch": 1.51, "grad_norm": 8.580666542053223, "learning_rate": 1.697346132128741e-05, "loss": 0.9154, "step": 8040 }, { "epoch": 1.52, "grad_norm": 9.91849136352539, "learning_rate": 1.6969696969696972e-05, "loss": 1.2361, "step": 8050 }, { "epoch": 1.52, "grad_norm": 13.927072525024414, "learning_rate": 1.6965932618106532e-05, "loss": 0.8912, "step": 8060 }, { "epoch": 1.52, "grad_norm": 7.275015830993652, "learning_rate": 1.6962168266516095e-05, "loss": 1.1334, "step": 8070 }, { "epoch": 1.52, "grad_norm": 24.040142059326172, "learning_rate": 1.6958403914925655e-05, "loss": 0.9843, "step": 8080 }, { "epoch": 1.52, "grad_norm": 22.510576248168945, "learning_rate": 1.6954639563335218e-05, "loss": 0.9083, "step": 8090 }, { "epoch": 1.52, "grad_norm": 7.329137325286865, "learning_rate": 1.6950875211744778e-05, "loss": 1.0092, "step": 8100 }, { "epoch": 1.53, "grad_norm": 5.5893330574035645, "learning_rate": 1.6947110860154338e-05, "loss": 0.9781, "step": 8110 }, { "epoch": 1.53, "grad_norm": 22.0462703704834, "learning_rate": 1.69433465085639e-05, "loss": 0.9981, "step": 8120 }, { "epoch": 1.53, "grad_norm": 10.5774564743042, "learning_rate": 1.693958215697346e-05, "loss": 1.0343, "step": 8130 }, { "epoch": 1.53, "grad_norm": 15.137490272521973, "learning_rate": 1.6935817805383024e-05, "loss": 1.0412, "step": 8140 }, { "epoch": 1.53, "grad_norm": 16.09028434753418, "learning_rate": 1.6932053453792584e-05, "loss": 0.8931, "step": 8150 }, { "epoch": 1.54, "grad_norm": 3.9936368465423584, "learning_rate": 1.6928289102202147e-05, "loss": 0.787, "step": 8160 }, { "epoch": 1.54, "grad_norm": 20.518054962158203, "learning_rate": 1.692452475061171e-05, "loss": 1.2866, "step": 8170 }, { "epoch": 1.54, "grad_norm": 6.0525689125061035, "learning_rate": 1.692076039902127e-05, "loss": 1.1158, "step": 8180 }, { "epoch": 1.54, "grad_norm": 9.403233528137207, "learning_rate": 1.6916996047430833e-05, "loss": 1.0815, "step": 8190 }, { "epoch": 1.54, "grad_norm": 22.706296920776367, "learning_rate": 1.6913231695840393e-05, "loss": 1.0685, "step": 8200 }, { "epoch": 1.55, "grad_norm": 10.937347412109375, "learning_rate": 1.6909467344249956e-05, "loss": 1.1039, "step": 8210 }, { "epoch": 1.55, "grad_norm": 20.09309959411621, "learning_rate": 1.6905702992659516e-05, "loss": 0.9035, "step": 8220 }, { "epoch": 1.55, "grad_norm": 8.693171501159668, "learning_rate": 1.690193864106908e-05, "loss": 1.0463, "step": 8230 }, { "epoch": 1.55, "grad_norm": 15.067790031433105, "learning_rate": 1.689817428947864e-05, "loss": 1.1981, "step": 8240 }, { "epoch": 1.55, "grad_norm": 5.470821857452393, "learning_rate": 1.68944099378882e-05, "loss": 1.1446, "step": 8250 }, { "epoch": 1.55, "grad_norm": 7.007253646850586, "learning_rate": 1.689064558629776e-05, "loss": 1.101, "step": 8260 }, { "epoch": 1.56, "grad_norm": 7.471108913421631, "learning_rate": 1.6886881234707325e-05, "loss": 0.7692, "step": 8270 }, { "epoch": 1.56, "grad_norm": 10.229903221130371, "learning_rate": 1.6883116883116884e-05, "loss": 0.9428, "step": 8280 }, { "epoch": 1.56, "grad_norm": 13.042211532592773, "learning_rate": 1.6879352531526444e-05, "loss": 1.2651, "step": 8290 }, { "epoch": 1.56, "grad_norm": 9.159402847290039, "learning_rate": 1.6875588179936007e-05, "loss": 0.7305, "step": 8300 }, { "epoch": 1.56, "grad_norm": 7.143734931945801, "learning_rate": 1.6871823828345567e-05, "loss": 1.3042, "step": 8310 }, { "epoch": 1.57, "grad_norm": 5.597439765930176, "learning_rate": 1.686805947675513e-05, "loss": 0.7694, "step": 8320 }, { "epoch": 1.57, "grad_norm": 24.404653549194336, "learning_rate": 1.686429512516469e-05, "loss": 0.7511, "step": 8330 }, { "epoch": 1.57, "grad_norm": 3.326371908187866, "learning_rate": 1.6860530773574253e-05, "loss": 0.8211, "step": 8340 }, { "epoch": 1.57, "grad_norm": 16.690340042114258, "learning_rate": 1.6856766421983813e-05, "loss": 1.0918, "step": 8350 }, { "epoch": 1.57, "grad_norm": 3.873743772506714, "learning_rate": 1.6853002070393376e-05, "loss": 0.8984, "step": 8360 }, { "epoch": 1.58, "grad_norm": 38.66802978515625, "learning_rate": 1.684923771880294e-05, "loss": 0.6514, "step": 8370 }, { "epoch": 1.58, "grad_norm": 23.641817092895508, "learning_rate": 1.68454733672125e-05, "loss": 0.9783, "step": 8380 }, { "epoch": 1.58, "grad_norm": 14.112013816833496, "learning_rate": 1.6841709015622062e-05, "loss": 0.8311, "step": 8390 }, { "epoch": 1.58, "grad_norm": 5.782837867736816, "learning_rate": 1.6837944664031622e-05, "loss": 0.9418, "step": 8400 }, { "epoch": 1.58, "grad_norm": 5.840491771697998, "learning_rate": 1.6834180312441185e-05, "loss": 0.9358, "step": 8410 }, { "epoch": 1.58, "grad_norm": 28.026111602783203, "learning_rate": 1.6830415960850745e-05, "loss": 1.5066, "step": 8420 }, { "epoch": 1.59, "grad_norm": 13.618431091308594, "learning_rate": 1.6826651609260308e-05, "loss": 1.0053, "step": 8430 }, { "epoch": 1.59, "grad_norm": 8.274946212768555, "learning_rate": 1.6822887257669868e-05, "loss": 1.0545, "step": 8440 }, { "epoch": 1.59, "grad_norm": 19.648679733276367, "learning_rate": 1.6819122906079427e-05, "loss": 0.8776, "step": 8450 }, { "epoch": 1.59, "grad_norm": 15.531411170959473, "learning_rate": 1.681535855448899e-05, "loss": 1.0128, "step": 8460 }, { "epoch": 1.59, "grad_norm": 4.063401222229004, "learning_rate": 1.681159420289855e-05, "loss": 1.0426, "step": 8470 }, { "epoch": 1.6, "grad_norm": 11.043712615966797, "learning_rate": 1.6807829851308114e-05, "loss": 0.7918, "step": 8480 }, { "epoch": 1.6, "grad_norm": 8.745186805725098, "learning_rate": 1.6804065499717673e-05, "loss": 0.8653, "step": 8490 }, { "epoch": 1.6, "grad_norm": 13.314781188964844, "learning_rate": 1.6800301148127236e-05, "loss": 0.904, "step": 8500 }, { "epoch": 1.6, "grad_norm": 4.7882208824157715, "learning_rate": 1.6796536796536796e-05, "loss": 0.8867, "step": 8510 }, { "epoch": 1.6, "grad_norm": 11.435511589050293, "learning_rate": 1.679277244494636e-05, "loss": 1.0631, "step": 8520 }, { "epoch": 1.61, "grad_norm": 30.225162506103516, "learning_rate": 1.678900809335592e-05, "loss": 1.235, "step": 8530 }, { "epoch": 1.61, "grad_norm": 17.80291175842285, "learning_rate": 1.6785243741765482e-05, "loss": 1.2388, "step": 8540 }, { "epoch": 1.61, "grad_norm": 19.908802032470703, "learning_rate": 1.6781479390175045e-05, "loss": 1.1334, "step": 8550 }, { "epoch": 1.61, "grad_norm": 4.993206977844238, "learning_rate": 1.6777715038584605e-05, "loss": 0.9747, "step": 8560 }, { "epoch": 1.61, "grad_norm": 27.299789428710938, "learning_rate": 1.677395068699417e-05, "loss": 1.1113, "step": 8570 }, { "epoch": 1.61, "grad_norm": 12.851706504821777, "learning_rate": 1.6770186335403728e-05, "loss": 0.907, "step": 8580 }, { "epoch": 1.62, "grad_norm": 24.122529983520508, "learning_rate": 1.676642198381329e-05, "loss": 0.9915, "step": 8590 }, { "epoch": 1.62, "grad_norm": 22.42790985107422, "learning_rate": 1.676265763222285e-05, "loss": 0.9087, "step": 8600 }, { "epoch": 1.62, "grad_norm": 10.046234130859375, "learning_rate": 1.6758893280632414e-05, "loss": 0.9194, "step": 8610 }, { "epoch": 1.62, "grad_norm": 11.75545883178711, "learning_rate": 1.6755128929041974e-05, "loss": 1.2203, "step": 8620 }, { "epoch": 1.62, "grad_norm": 12.967751502990723, "learning_rate": 1.6751364577451534e-05, "loss": 1.1011, "step": 8630 }, { "epoch": 1.63, "grad_norm": 6.418219566345215, "learning_rate": 1.6747600225861097e-05, "loss": 1.0657, "step": 8640 }, { "epoch": 1.63, "grad_norm": 39.12499237060547, "learning_rate": 1.6743835874270657e-05, "loss": 1.2006, "step": 8650 }, { "epoch": 1.63, "grad_norm": 2.25285267829895, "learning_rate": 1.674007152268022e-05, "loss": 0.8309, "step": 8660 }, { "epoch": 1.63, "grad_norm": 8.486536026000977, "learning_rate": 1.673630717108978e-05, "loss": 0.9349, "step": 8670 }, { "epoch": 1.63, "grad_norm": 3.4122467041015625, "learning_rate": 1.6732542819499343e-05, "loss": 1.1027, "step": 8680 }, { "epoch": 1.64, "grad_norm": 11.690104484558105, "learning_rate": 1.6728778467908903e-05, "loss": 0.9033, "step": 8690 }, { "epoch": 1.64, "grad_norm": 16.22601318359375, "learning_rate": 1.6725014116318466e-05, "loss": 1.0882, "step": 8700 }, { "epoch": 1.64, "grad_norm": 20.739831924438477, "learning_rate": 1.6721249764728025e-05, "loss": 0.9375, "step": 8710 }, { "epoch": 1.64, "grad_norm": 15.394549369812012, "learning_rate": 1.671748541313759e-05, "loss": 0.7345, "step": 8720 }, { "epoch": 1.64, "grad_norm": 26.133543014526367, "learning_rate": 1.6713721061547152e-05, "loss": 0.64, "step": 8730 }, { "epoch": 1.65, "grad_norm": 13.604958534240723, "learning_rate": 1.670995670995671e-05, "loss": 0.9136, "step": 8740 }, { "epoch": 1.65, "grad_norm": 42.65677261352539, "learning_rate": 1.6706192358366275e-05, "loss": 1.0659, "step": 8750 }, { "epoch": 1.65, "grad_norm": 8.75528621673584, "learning_rate": 1.6702428006775834e-05, "loss": 0.8149, "step": 8760 }, { "epoch": 1.65, "grad_norm": 5.066274642944336, "learning_rate": 1.6698663655185398e-05, "loss": 1.0136, "step": 8770 }, { "epoch": 1.65, "grad_norm": 22.15322494506836, "learning_rate": 1.6694899303594957e-05, "loss": 1.0216, "step": 8780 }, { "epoch": 1.65, "grad_norm": 14.151029586791992, "learning_rate": 1.669113495200452e-05, "loss": 1.1095, "step": 8790 }, { "epoch": 1.66, "grad_norm": 6.58624267578125, "learning_rate": 1.668737060041408e-05, "loss": 0.8408, "step": 8800 }, { "epoch": 1.66, "grad_norm": 25.148658752441406, "learning_rate": 1.668360624882364e-05, "loss": 1.0391, "step": 8810 }, { "epoch": 1.66, "grad_norm": 6.866160869598389, "learning_rate": 1.6679841897233203e-05, "loss": 0.9131, "step": 8820 }, { "epoch": 1.66, "grad_norm": 32.80194854736328, "learning_rate": 1.6676077545642763e-05, "loss": 0.8875, "step": 8830 }, { "epoch": 1.66, "grad_norm": 3.0241827964782715, "learning_rate": 1.6672313194052326e-05, "loss": 0.7572, "step": 8840 }, { "epoch": 1.67, "grad_norm": 14.243484497070312, "learning_rate": 1.6668548842461886e-05, "loss": 0.9565, "step": 8850 }, { "epoch": 1.67, "grad_norm": 18.31822395324707, "learning_rate": 1.666478449087145e-05, "loss": 1.0248, "step": 8860 }, { "epoch": 1.67, "grad_norm": 18.321674346923828, "learning_rate": 1.666102013928101e-05, "loss": 0.9848, "step": 8870 }, { "epoch": 1.67, "grad_norm": 14.554122924804688, "learning_rate": 1.6657255787690572e-05, "loss": 0.9658, "step": 8880 }, { "epoch": 1.67, "grad_norm": 12.484946250915527, "learning_rate": 1.6653491436100132e-05, "loss": 0.8559, "step": 8890 }, { "epoch": 1.68, "grad_norm": 5.653412342071533, "learning_rate": 1.6649727084509695e-05, "loss": 0.9627, "step": 8900 }, { "epoch": 1.68, "grad_norm": 7.155862331390381, "learning_rate": 1.6645962732919258e-05, "loss": 0.8445, "step": 8910 }, { "epoch": 1.68, "grad_norm": 12.255889892578125, "learning_rate": 1.6642198381328818e-05, "loss": 0.7791, "step": 8920 }, { "epoch": 1.68, "grad_norm": 8.364027976989746, "learning_rate": 1.663843402973838e-05, "loss": 1.0761, "step": 8930 }, { "epoch": 1.68, "grad_norm": 19.489355087280273, "learning_rate": 1.663466967814794e-05, "loss": 0.8686, "step": 8940 }, { "epoch": 1.68, "grad_norm": 12.537185668945312, "learning_rate": 1.6630905326557504e-05, "loss": 0.9145, "step": 8950 }, { "epoch": 1.69, "grad_norm": 26.021825790405273, "learning_rate": 1.6627140974967064e-05, "loss": 0.8122, "step": 8960 }, { "epoch": 1.69, "grad_norm": 6.992552757263184, "learning_rate": 1.6623376623376627e-05, "loss": 0.7718, "step": 8970 }, { "epoch": 1.69, "grad_norm": 10.073301315307617, "learning_rate": 1.6619612271786187e-05, "loss": 0.9453, "step": 8980 }, { "epoch": 1.69, "grad_norm": 25.186731338500977, "learning_rate": 1.6615847920195746e-05, "loss": 0.7758, "step": 8990 }, { "epoch": 1.69, "grad_norm": 15.207765579223633, "learning_rate": 1.661208356860531e-05, "loss": 1.0578, "step": 9000 }, { "epoch": 1.7, "grad_norm": 14.172220230102539, "learning_rate": 1.660831921701487e-05, "loss": 1.0341, "step": 9010 }, { "epoch": 1.7, "grad_norm": 10.55916976928711, "learning_rate": 1.6604554865424432e-05, "loss": 0.9354, "step": 9020 }, { "epoch": 1.7, "grad_norm": 13.917641639709473, "learning_rate": 1.6600790513833992e-05, "loss": 1.136, "step": 9030 }, { "epoch": 1.7, "grad_norm": 11.619650840759277, "learning_rate": 1.6597026162243555e-05, "loss": 0.7212, "step": 9040 }, { "epoch": 1.7, "grad_norm": 17.172563552856445, "learning_rate": 1.6593261810653115e-05, "loss": 0.7703, "step": 9050 }, { "epoch": 1.71, "grad_norm": 18.772756576538086, "learning_rate": 1.6589497459062678e-05, "loss": 0.8157, "step": 9060 }, { "epoch": 1.71, "grad_norm": 18.383899688720703, "learning_rate": 1.6585733107472238e-05, "loss": 1.1417, "step": 9070 }, { "epoch": 1.71, "grad_norm": 15.073466300964355, "learning_rate": 1.65819687558818e-05, "loss": 0.9564, "step": 9080 }, { "epoch": 1.71, "grad_norm": 23.901710510253906, "learning_rate": 1.657820440429136e-05, "loss": 1.1246, "step": 9090 }, { "epoch": 1.71, "grad_norm": 7.932938575744629, "learning_rate": 1.6574440052700924e-05, "loss": 0.949, "step": 9100 }, { "epoch": 1.71, "grad_norm": 4.0341410636901855, "learning_rate": 1.6570675701110487e-05, "loss": 1.0209, "step": 9110 }, { "epoch": 1.72, "grad_norm": 7.543979167938232, "learning_rate": 1.6566911349520047e-05, "loss": 1.0694, "step": 9120 }, { "epoch": 1.72, "grad_norm": 18.913318634033203, "learning_rate": 1.656314699792961e-05, "loss": 0.9295, "step": 9130 }, { "epoch": 1.72, "grad_norm": 7.830410003662109, "learning_rate": 1.655938264633917e-05, "loss": 0.7556, "step": 9140 }, { "epoch": 1.72, "grad_norm": 7.6431379318237305, "learning_rate": 1.655561829474873e-05, "loss": 0.8671, "step": 9150 }, { "epoch": 1.72, "grad_norm": 27.64036750793457, "learning_rate": 1.6551853943158293e-05, "loss": 0.6891, "step": 9160 }, { "epoch": 1.73, "grad_norm": 15.285219192504883, "learning_rate": 1.6548089591567853e-05, "loss": 1.1812, "step": 9170 }, { "epoch": 1.73, "grad_norm": 11.814650535583496, "learning_rate": 1.6544325239977416e-05, "loss": 0.8133, "step": 9180 }, { "epoch": 1.73, "grad_norm": 3.3252789974212646, "learning_rate": 1.6540560888386976e-05, "loss": 1.0719, "step": 9190 }, { "epoch": 1.73, "grad_norm": 2.2466318607330322, "learning_rate": 1.653679653679654e-05, "loss": 0.8765, "step": 9200 }, { "epoch": 1.73, "grad_norm": 34.163726806640625, "learning_rate": 1.65330321852061e-05, "loss": 1.2877, "step": 9210 }, { "epoch": 1.74, "grad_norm": 27.794078826904297, "learning_rate": 1.652926783361566e-05, "loss": 0.8177, "step": 9220 }, { "epoch": 1.74, "grad_norm": 16.112585067749023, "learning_rate": 1.652550348202522e-05, "loss": 1.1625, "step": 9230 }, { "epoch": 1.74, "grad_norm": 19.92578125, "learning_rate": 1.6521739130434785e-05, "loss": 1.1242, "step": 9240 }, { "epoch": 1.74, "grad_norm": 17.408260345458984, "learning_rate": 1.6517974778844344e-05, "loss": 1.1734, "step": 9250 }, { "epoch": 1.74, "grad_norm": 23.571901321411133, "learning_rate": 1.6514210427253907e-05, "loss": 0.9096, "step": 9260 }, { "epoch": 1.74, "grad_norm": 10.09569263458252, "learning_rate": 1.6510446075663467e-05, "loss": 0.8421, "step": 9270 }, { "epoch": 1.75, "grad_norm": 22.68370246887207, "learning_rate": 1.650668172407303e-05, "loss": 0.8, "step": 9280 }, { "epoch": 1.75, "grad_norm": 26.997875213623047, "learning_rate": 1.6502917372482594e-05, "loss": 0.9072, "step": 9290 }, { "epoch": 1.75, "grad_norm": 26.56907081604004, "learning_rate": 1.6499153020892153e-05, "loss": 1.1413, "step": 9300 }, { "epoch": 1.75, "grad_norm": 13.025582313537598, "learning_rate": 1.6495388669301716e-05, "loss": 0.712, "step": 9310 }, { "epoch": 1.75, "grad_norm": 2.8579206466674805, "learning_rate": 1.6491624317711273e-05, "loss": 1.0311, "step": 9320 }, { "epoch": 1.76, "grad_norm": 7.947895526885986, "learning_rate": 1.6487859966120836e-05, "loss": 0.8033, "step": 9330 }, { "epoch": 1.76, "grad_norm": 6.863089561462402, "learning_rate": 1.64840956145304e-05, "loss": 1.0714, "step": 9340 }, { "epoch": 1.76, "grad_norm": 10.841645240783691, "learning_rate": 1.648033126293996e-05, "loss": 0.6842, "step": 9350 }, { "epoch": 1.76, "grad_norm": 27.438404083251953, "learning_rate": 1.6476566911349522e-05, "loss": 0.851, "step": 9360 }, { "epoch": 1.76, "grad_norm": 2.9211061000823975, "learning_rate": 1.6472802559759082e-05, "loss": 0.8752, "step": 9370 }, { "epoch": 1.77, "grad_norm": 13.311325073242188, "learning_rate": 1.6469038208168645e-05, "loss": 1.0659, "step": 9380 }, { "epoch": 1.77, "grad_norm": 9.285094261169434, "learning_rate": 1.6465273856578205e-05, "loss": 0.8645, "step": 9390 }, { "epoch": 1.77, "grad_norm": 12.683711051940918, "learning_rate": 1.6461509504987768e-05, "loss": 0.8845, "step": 9400 }, { "epoch": 1.77, "grad_norm": 16.096887588500977, "learning_rate": 1.6457745153397328e-05, "loss": 0.9758, "step": 9410 }, { "epoch": 1.77, "grad_norm": 10.897958755493164, "learning_rate": 1.645398080180689e-05, "loss": 1.2507, "step": 9420 }, { "epoch": 1.77, "grad_norm": 22.48199462890625, "learning_rate": 1.645021645021645e-05, "loss": 0.7385, "step": 9430 }, { "epoch": 1.78, "grad_norm": 16.660545349121094, "learning_rate": 1.6446452098626014e-05, "loss": 0.8564, "step": 9440 }, { "epoch": 1.78, "grad_norm": 8.656999588012695, "learning_rate": 1.6442687747035574e-05, "loss": 0.7063, "step": 9450 }, { "epoch": 1.78, "grad_norm": 10.140769004821777, "learning_rate": 1.6438923395445137e-05, "loss": 1.0147, "step": 9460 }, { "epoch": 1.78, "grad_norm": 4.824342727661133, "learning_rate": 1.64351590438547e-05, "loss": 0.7609, "step": 9470 }, { "epoch": 1.78, "grad_norm": 9.159531593322754, "learning_rate": 1.643139469226426e-05, "loss": 1.0369, "step": 9480 }, { "epoch": 1.79, "grad_norm": 14.373122215270996, "learning_rate": 1.6427630340673823e-05, "loss": 1.1801, "step": 9490 }, { "epoch": 1.79, "grad_norm": 5.078024864196777, "learning_rate": 1.642386598908338e-05, "loss": 0.9217, "step": 9500 }, { "epoch": 1.79, "grad_norm": 7.723122596740723, "learning_rate": 1.6420101637492942e-05, "loss": 0.9727, "step": 9510 }, { "epoch": 1.79, "grad_norm": 9.917556762695312, "learning_rate": 1.6416337285902505e-05, "loss": 1.2133, "step": 9520 }, { "epoch": 1.79, "grad_norm": 6.582098007202148, "learning_rate": 1.6412572934312065e-05, "loss": 0.9397, "step": 9530 }, { "epoch": 1.8, "grad_norm": 10.489495277404785, "learning_rate": 1.640880858272163e-05, "loss": 1.093, "step": 9540 }, { "epoch": 1.8, "grad_norm": 21.351308822631836, "learning_rate": 1.6405044231131188e-05, "loss": 0.9417, "step": 9550 }, { "epoch": 1.8, "grad_norm": 14.546730041503906, "learning_rate": 1.640127987954075e-05, "loss": 1.0752, "step": 9560 }, { "epoch": 1.8, "grad_norm": 14.62718391418457, "learning_rate": 1.639751552795031e-05, "loss": 0.85, "step": 9570 }, { "epoch": 1.8, "grad_norm": 9.844496726989746, "learning_rate": 1.6393751176359874e-05, "loss": 1.3426, "step": 9580 }, { "epoch": 1.81, "grad_norm": 3.517404317855835, "learning_rate": 1.6389986824769434e-05, "loss": 0.8968, "step": 9590 }, { "epoch": 1.81, "grad_norm": 39.88374328613281, "learning_rate": 1.6386222473178997e-05, "loss": 0.6198, "step": 9600 }, { "epoch": 1.81, "grad_norm": 23.119050979614258, "learning_rate": 1.6382458121588557e-05, "loss": 0.7901, "step": 9610 }, { "epoch": 1.81, "grad_norm": 7.583620071411133, "learning_rate": 1.637869376999812e-05, "loss": 1.0529, "step": 9620 }, { "epoch": 1.81, "grad_norm": 29.128244400024414, "learning_rate": 1.637492941840768e-05, "loss": 1.237, "step": 9630 }, { "epoch": 1.81, "grad_norm": 3.6622776985168457, "learning_rate": 1.6371165066817243e-05, "loss": 0.9682, "step": 9640 }, { "epoch": 1.82, "grad_norm": 28.509607315063477, "learning_rate": 1.6367400715226803e-05, "loss": 1.0373, "step": 9650 }, { "epoch": 1.82, "grad_norm": 2.624483346939087, "learning_rate": 1.6363636363636366e-05, "loss": 0.9508, "step": 9660 }, { "epoch": 1.82, "grad_norm": 14.195404052734375, "learning_rate": 1.6359872012045926e-05, "loss": 1.0253, "step": 9670 }, { "epoch": 1.82, "grad_norm": 10.364255905151367, "learning_rate": 1.6356107660455485e-05, "loss": 1.1393, "step": 9680 }, { "epoch": 1.82, "grad_norm": 17.5921688079834, "learning_rate": 1.635234330886505e-05, "loss": 1.2439, "step": 9690 }, { "epoch": 1.83, "grad_norm": 20.8980712890625, "learning_rate": 1.6348578957274612e-05, "loss": 1.1815, "step": 9700 }, { "epoch": 1.83, "grad_norm": 5.478924751281738, "learning_rate": 1.634481460568417e-05, "loss": 1.0223, "step": 9710 }, { "epoch": 1.83, "grad_norm": 20.507476806640625, "learning_rate": 1.6341050254093735e-05, "loss": 0.6629, "step": 9720 }, { "epoch": 1.83, "grad_norm": 28.950838088989258, "learning_rate": 1.6337285902503294e-05, "loss": 0.858, "step": 9730 }, { "epoch": 1.83, "grad_norm": 14.215471267700195, "learning_rate": 1.6333521550912858e-05, "loss": 1.0337, "step": 9740 }, { "epoch": 1.84, "grad_norm": 9.36776065826416, "learning_rate": 1.6329757199322417e-05, "loss": 1.4074, "step": 9750 }, { "epoch": 1.84, "grad_norm": 7.266942501068115, "learning_rate": 1.632599284773198e-05, "loss": 0.8383, "step": 9760 }, { "epoch": 1.84, "grad_norm": 8.291203498840332, "learning_rate": 1.632222849614154e-05, "loss": 0.8997, "step": 9770 }, { "epoch": 1.84, "grad_norm": 28.071760177612305, "learning_rate": 1.6318464144551103e-05, "loss": 0.8101, "step": 9780 }, { "epoch": 1.84, "grad_norm": 8.149245262145996, "learning_rate": 1.6314699792960663e-05, "loss": 0.7052, "step": 9790 }, { "epoch": 1.84, "grad_norm": 34.350502014160156, "learning_rate": 1.6310935441370226e-05, "loss": 0.9759, "step": 9800 }, { "epoch": 1.85, "grad_norm": 4.613625526428223, "learning_rate": 1.6307171089779786e-05, "loss": 0.9999, "step": 9810 }, { "epoch": 1.85, "grad_norm": 8.235222816467285, "learning_rate": 1.630340673818935e-05, "loss": 0.8468, "step": 9820 }, { "epoch": 1.85, "grad_norm": 9.5585298538208, "learning_rate": 1.629964238659891e-05, "loss": 1.0039, "step": 9830 }, { "epoch": 1.85, "grad_norm": 23.900362014770508, "learning_rate": 1.6295878035008472e-05, "loss": 1.15, "step": 9840 }, { "epoch": 1.85, "grad_norm": 5.563354969024658, "learning_rate": 1.6292113683418032e-05, "loss": 0.723, "step": 9850 }, { "epoch": 1.86, "grad_norm": 38.29470443725586, "learning_rate": 1.6288349331827592e-05, "loss": 0.9061, "step": 9860 }, { "epoch": 1.86, "grad_norm": 17.7316837310791, "learning_rate": 1.6284584980237155e-05, "loss": 1.1306, "step": 9870 }, { "epoch": 1.86, "grad_norm": 22.314489364624023, "learning_rate": 1.6280820628646715e-05, "loss": 0.8234, "step": 9880 }, { "epoch": 1.86, "grad_norm": 10.5621337890625, "learning_rate": 1.6277056277056278e-05, "loss": 0.7809, "step": 9890 }, { "epoch": 1.86, "grad_norm": 13.799981117248535, "learning_rate": 1.627329192546584e-05, "loss": 0.9947, "step": 9900 }, { "epoch": 1.87, "grad_norm": 15.118837356567383, "learning_rate": 1.62695275738754e-05, "loss": 0.9937, "step": 9910 }, { "epoch": 1.87, "grad_norm": 8.086974143981934, "learning_rate": 1.6265763222284964e-05, "loss": 0.8989, "step": 9920 }, { "epoch": 1.87, "grad_norm": 24.501052856445312, "learning_rate": 1.6261998870694524e-05, "loss": 0.9246, "step": 9930 }, { "epoch": 1.87, "grad_norm": 17.838918685913086, "learning_rate": 1.6258234519104087e-05, "loss": 0.6852, "step": 9940 }, { "epoch": 1.87, "grad_norm": 14.205941200256348, "learning_rate": 1.6254470167513647e-05, "loss": 0.9937, "step": 9950 }, { "epoch": 1.87, "grad_norm": 37.4649658203125, "learning_rate": 1.625070581592321e-05, "loss": 1.1566, "step": 9960 }, { "epoch": 1.88, "grad_norm": 3.642307758331299, "learning_rate": 1.624694146433277e-05, "loss": 0.6511, "step": 9970 }, { "epoch": 1.88, "grad_norm": 8.570235252380371, "learning_rate": 1.6243177112742333e-05, "loss": 1.0627, "step": 9980 }, { "epoch": 1.88, "grad_norm": 49.44059753417969, "learning_rate": 1.6239412761151892e-05, "loss": 0.87, "step": 9990 }, { "epoch": 1.88, "grad_norm": 4.772658824920654, "learning_rate": 1.6235648409561456e-05, "loss": 0.9856, "step": 10000 }, { "epoch": 1.88, "grad_norm": 5.388487815856934, "learning_rate": 1.6231884057971015e-05, "loss": 0.8331, "step": 10010 }, { "epoch": 1.89, "grad_norm": 28.38582992553711, "learning_rate": 1.6228119706380575e-05, "loss": 0.7234, "step": 10020 }, { "epoch": 1.89, "grad_norm": 35.28871154785156, "learning_rate": 1.6224355354790138e-05, "loss": 1.2131, "step": 10030 }, { "epoch": 1.89, "grad_norm": 11.386420249938965, "learning_rate": 1.6220591003199698e-05, "loss": 0.5296, "step": 10040 }, { "epoch": 1.89, "grad_norm": 8.352224349975586, "learning_rate": 1.621682665160926e-05, "loss": 0.8543, "step": 10050 }, { "epoch": 1.89, "grad_norm": 45.16379928588867, "learning_rate": 1.621306230001882e-05, "loss": 1.0114, "step": 10060 }, { "epoch": 1.9, "grad_norm": 18.55881118774414, "learning_rate": 1.6209297948428384e-05, "loss": 0.8992, "step": 10070 }, { "epoch": 1.9, "grad_norm": 35.752506256103516, "learning_rate": 1.6205533596837947e-05, "loss": 1.1698, "step": 10080 }, { "epoch": 1.9, "grad_norm": 9.165136337280273, "learning_rate": 1.6201769245247507e-05, "loss": 0.809, "step": 10090 }, { "epoch": 1.9, "grad_norm": 6.1471781730651855, "learning_rate": 1.619800489365707e-05, "loss": 0.9719, "step": 10100 }, { "epoch": 1.9, "grad_norm": 6.279614448547363, "learning_rate": 1.619424054206663e-05, "loss": 0.8327, "step": 10110 }, { "epoch": 1.9, "grad_norm": 4.469071865081787, "learning_rate": 1.6190476190476193e-05, "loss": 0.7403, "step": 10120 }, { "epoch": 1.91, "grad_norm": 28.841690063476562, "learning_rate": 1.6186711838885753e-05, "loss": 0.9098, "step": 10130 }, { "epoch": 1.91, "grad_norm": 13.314472198486328, "learning_rate": 1.6182947487295316e-05, "loss": 0.8678, "step": 10140 }, { "epoch": 1.91, "grad_norm": 12.73503589630127, "learning_rate": 1.6179183135704876e-05, "loss": 0.9671, "step": 10150 }, { "epoch": 1.91, "grad_norm": 5.612974643707275, "learning_rate": 1.617541878411444e-05, "loss": 0.8328, "step": 10160 }, { "epoch": 1.91, "grad_norm": 16.328927993774414, "learning_rate": 1.6171654432524e-05, "loss": 0.9885, "step": 10170 }, { "epoch": 1.92, "grad_norm": 22.157751083374023, "learning_rate": 1.6167890080933562e-05, "loss": 0.9805, "step": 10180 }, { "epoch": 1.92, "grad_norm": 41.27034378051758, "learning_rate": 1.616412572934312e-05, "loss": 1.0366, "step": 10190 }, { "epoch": 1.92, "grad_norm": 9.670302391052246, "learning_rate": 1.616036137775268e-05, "loss": 1.0358, "step": 10200 }, { "epoch": 1.92, "grad_norm": 23.558927536010742, "learning_rate": 1.6156597026162245e-05, "loss": 1.0069, "step": 10210 }, { "epoch": 1.92, "grad_norm": 9.577465057373047, "learning_rate": 1.6152832674571804e-05, "loss": 0.9131, "step": 10220 }, { "epoch": 1.93, "grad_norm": 7.152612209320068, "learning_rate": 1.6149068322981367e-05, "loss": 0.9243, "step": 10230 }, { "epoch": 1.93, "grad_norm": 14.56041431427002, "learning_rate": 1.6145303971390927e-05, "loss": 1.2452, "step": 10240 }, { "epoch": 1.93, "grad_norm": 4.7469611167907715, "learning_rate": 1.614153961980049e-05, "loss": 0.8841, "step": 10250 }, { "epoch": 1.93, "grad_norm": 0.6863610148429871, "learning_rate": 1.6137775268210054e-05, "loss": 0.6211, "step": 10260 }, { "epoch": 1.93, "grad_norm": 18.57179832458496, "learning_rate": 1.6134010916619613e-05, "loss": 1.0319, "step": 10270 }, { "epoch": 1.93, "grad_norm": 11.881142616271973, "learning_rate": 1.6130246565029176e-05, "loss": 1.0036, "step": 10280 }, { "epoch": 1.94, "grad_norm": 12.16629695892334, "learning_rate": 1.6126482213438736e-05, "loss": 0.7445, "step": 10290 }, { "epoch": 1.94, "grad_norm": 26.047819137573242, "learning_rate": 1.61227178618483e-05, "loss": 1.1173, "step": 10300 }, { "epoch": 1.94, "grad_norm": 4.4914116859436035, "learning_rate": 1.611895351025786e-05, "loss": 0.9002, "step": 10310 }, { "epoch": 1.94, "grad_norm": 12.42726993560791, "learning_rate": 1.6115189158667422e-05, "loss": 0.9133, "step": 10320 }, { "epoch": 1.94, "grad_norm": 12.923616409301758, "learning_rate": 1.6111424807076982e-05, "loss": 0.4911, "step": 10330 }, { "epoch": 1.95, "grad_norm": 22.348865509033203, "learning_rate": 1.6107660455486545e-05, "loss": 0.8791, "step": 10340 }, { "epoch": 1.95, "grad_norm": 4.061618804931641, "learning_rate": 1.6103896103896105e-05, "loss": 0.9309, "step": 10350 }, { "epoch": 1.95, "grad_norm": 25.17974090576172, "learning_rate": 1.6100131752305668e-05, "loss": 0.995, "step": 10360 }, { "epoch": 1.95, "grad_norm": 25.17315673828125, "learning_rate": 1.6096367400715228e-05, "loss": 0.6279, "step": 10370 }, { "epoch": 1.95, "grad_norm": 33.93449020385742, "learning_rate": 1.6092603049124788e-05, "loss": 0.9279, "step": 10380 }, { "epoch": 1.96, "grad_norm": 8.33711051940918, "learning_rate": 1.608883869753435e-05, "loss": 1.0123, "step": 10390 }, { "epoch": 1.96, "grad_norm": 4.49125862121582, "learning_rate": 1.608507434594391e-05, "loss": 0.7202, "step": 10400 }, { "epoch": 1.96, "grad_norm": 21.362960815429688, "learning_rate": 1.6081309994353474e-05, "loss": 1.0845, "step": 10410 }, { "epoch": 1.96, "grad_norm": 22.759014129638672, "learning_rate": 1.6077545642763034e-05, "loss": 0.6752, "step": 10420 }, { "epoch": 1.96, "grad_norm": 20.3575382232666, "learning_rate": 1.6073781291172597e-05, "loss": 0.9548, "step": 10430 }, { "epoch": 1.96, "grad_norm": 5.1825947761535645, "learning_rate": 1.6070016939582156e-05, "loss": 0.7794, "step": 10440 }, { "epoch": 1.97, "grad_norm": 13.626837730407715, "learning_rate": 1.606625258799172e-05, "loss": 0.8118, "step": 10450 }, { "epoch": 1.97, "grad_norm": 20.532129287719727, "learning_rate": 1.6062488236401283e-05, "loss": 1.0943, "step": 10460 }, { "epoch": 1.97, "grad_norm": 23.677160263061523, "learning_rate": 1.6058723884810843e-05, "loss": 1.4279, "step": 10470 }, { "epoch": 1.97, "grad_norm": 30.627092361450195, "learning_rate": 1.6054959533220406e-05, "loss": 0.798, "step": 10480 }, { "epoch": 1.97, "grad_norm": 11.574738502502441, "learning_rate": 1.6051195181629965e-05, "loss": 0.7298, "step": 10490 }, { "epoch": 1.98, "grad_norm": 20.059284210205078, "learning_rate": 1.604743083003953e-05, "loss": 1.1131, "step": 10500 }, { "epoch": 1.98, "grad_norm": 14.673868179321289, "learning_rate": 1.604366647844909e-05, "loss": 0.8992, "step": 10510 }, { "epoch": 1.98, "grad_norm": 6.70554256439209, "learning_rate": 1.603990212685865e-05, "loss": 0.9923, "step": 10520 }, { "epoch": 1.98, "grad_norm": 22.268571853637695, "learning_rate": 1.603613777526821e-05, "loss": 0.9369, "step": 10530 }, { "epoch": 1.98, "grad_norm": 7.241332530975342, "learning_rate": 1.603237342367777e-05, "loss": 0.918, "step": 10540 }, { "epoch": 1.99, "grad_norm": 8.223257064819336, "learning_rate": 1.6028609072087334e-05, "loss": 0.7295, "step": 10550 }, { "epoch": 1.99, "grad_norm": 14.106549263000488, "learning_rate": 1.6024844720496894e-05, "loss": 0.9356, "step": 10560 }, { "epoch": 1.99, "grad_norm": 5.024641513824463, "learning_rate": 1.6021080368906457e-05, "loss": 1.0827, "step": 10570 }, { "epoch": 1.99, "grad_norm": 9.588906288146973, "learning_rate": 1.6017316017316017e-05, "loss": 0.9986, "step": 10580 }, { "epoch": 1.99, "grad_norm": 13.386589050292969, "learning_rate": 1.601355166572558e-05, "loss": 0.801, "step": 10590 }, { "epoch": 2.0, "grad_norm": 14.955891609191895, "learning_rate": 1.600978731413514e-05, "loss": 0.611, "step": 10600 }, { "epoch": 2.0, "grad_norm": 26.955785751342773, "learning_rate": 1.6006022962544703e-05, "loss": 0.8322, "step": 10610 }, { "epoch": 2.0, "grad_norm": 4.895423889160156, "learning_rate": 1.6002258610954263e-05, "loss": 0.6479, "step": 10620 }, { "epoch": 2.0, "eval_accuracy": 0.9004, "eval_loss": 0.43774810433387756, "eval_runtime": 33.2538, "eval_samples_per_second": 225.538, "eval_steps_per_second": 28.207, "step": 10626 }, { "epoch": 2.0, "grad_norm": 8.223067283630371, "learning_rate": 1.5998494259363826e-05, "loss": 0.8513, "step": 10630 }, { "epoch": 2.0, "grad_norm": 6.11199951171875, "learning_rate": 1.599472990777339e-05, "loss": 0.8083, "step": 10640 }, { "epoch": 2.0, "grad_norm": 7.889083385467529, "learning_rate": 1.599096555618295e-05, "loss": 0.6345, "step": 10650 }, { "epoch": 2.01, "grad_norm": 0.47759780287742615, "learning_rate": 1.5987201204592512e-05, "loss": 0.7421, "step": 10660 }, { "epoch": 2.01, "grad_norm": 7.3583879470825195, "learning_rate": 1.5983436853002072e-05, "loss": 0.791, "step": 10670 }, { "epoch": 2.01, "grad_norm": 14.362602233886719, "learning_rate": 1.5979672501411635e-05, "loss": 0.7772, "step": 10680 }, { "epoch": 2.01, "grad_norm": 14.467147827148438, "learning_rate": 1.5975908149821195e-05, "loss": 0.8084, "step": 10690 }, { "epoch": 2.01, "grad_norm": 27.702394485473633, "learning_rate": 1.5972143798230758e-05, "loss": 1.2076, "step": 10700 }, { "epoch": 2.02, "grad_norm": 29.06777572631836, "learning_rate": 1.5968379446640318e-05, "loss": 0.9421, "step": 10710 }, { "epoch": 2.02, "grad_norm": 17.13526725769043, "learning_rate": 1.5964615095049877e-05, "loss": 0.7547, "step": 10720 }, { "epoch": 2.02, "grad_norm": 5.489052772521973, "learning_rate": 1.596085074345944e-05, "loss": 0.6932, "step": 10730 }, { "epoch": 2.02, "grad_norm": 18.08340072631836, "learning_rate": 1.5957086391869e-05, "loss": 0.9143, "step": 10740 }, { "epoch": 2.02, "grad_norm": 7.827996730804443, "learning_rate": 1.5953322040278563e-05, "loss": 0.7774, "step": 10750 }, { "epoch": 2.03, "grad_norm": 29.571741104125977, "learning_rate": 1.5949557688688123e-05, "loss": 0.7913, "step": 10760 }, { "epoch": 2.03, "grad_norm": 12.85148811340332, "learning_rate": 1.5945793337097686e-05, "loss": 0.5545, "step": 10770 }, { "epoch": 2.03, "grad_norm": 4.041319847106934, "learning_rate": 1.5942028985507246e-05, "loss": 0.7221, "step": 10780 }, { "epoch": 2.03, "grad_norm": 13.464383125305176, "learning_rate": 1.593826463391681e-05, "loss": 1.0252, "step": 10790 }, { "epoch": 2.03, "grad_norm": 267.48870849609375, "learning_rate": 1.593450028232637e-05, "loss": 1.032, "step": 10800 }, { "epoch": 2.03, "grad_norm": 5.915716648101807, "learning_rate": 1.5930735930735932e-05, "loss": 0.6537, "step": 10810 }, { "epoch": 2.04, "grad_norm": 3.0029585361480713, "learning_rate": 1.5926971579145495e-05, "loss": 0.8683, "step": 10820 }, { "epoch": 2.04, "grad_norm": 51.99099349975586, "learning_rate": 1.5923207227555055e-05, "loss": 0.714, "step": 10830 }, { "epoch": 2.04, "grad_norm": 18.225906372070312, "learning_rate": 1.5919442875964618e-05, "loss": 0.6733, "step": 10840 }, { "epoch": 2.04, "grad_norm": 14.735111236572266, "learning_rate": 1.5915678524374178e-05, "loss": 0.7899, "step": 10850 }, { "epoch": 2.04, "grad_norm": 0.9460431337356567, "learning_rate": 1.591191417278374e-05, "loss": 0.697, "step": 10860 }, { "epoch": 2.05, "grad_norm": 5.448774814605713, "learning_rate": 1.59081498211933e-05, "loss": 0.699, "step": 10870 }, { "epoch": 2.05, "grad_norm": 12.799127578735352, "learning_rate": 1.5904385469602864e-05, "loss": 1.0552, "step": 10880 }, { "epoch": 2.05, "grad_norm": 44.27261734008789, "learning_rate": 1.5900621118012424e-05, "loss": 0.6706, "step": 10890 }, { "epoch": 2.05, "grad_norm": 23.952608108520508, "learning_rate": 1.5896856766421984e-05, "loss": 1.0494, "step": 10900 }, { "epoch": 2.05, "grad_norm": 29.93094825744629, "learning_rate": 1.5893092414831547e-05, "loss": 0.7551, "step": 10910 }, { "epoch": 2.06, "grad_norm": 12.698896408081055, "learning_rate": 1.5889328063241107e-05, "loss": 0.8114, "step": 10920 }, { "epoch": 2.06, "grad_norm": 6.4680867195129395, "learning_rate": 1.588556371165067e-05, "loss": 0.6495, "step": 10930 }, { "epoch": 2.06, "grad_norm": 20.92882537841797, "learning_rate": 1.588179936006023e-05, "loss": 0.5763, "step": 10940 }, { "epoch": 2.06, "grad_norm": 16.703866958618164, "learning_rate": 1.5878035008469793e-05, "loss": 0.7678, "step": 10950 }, { "epoch": 2.06, "grad_norm": 26.071792602539062, "learning_rate": 1.5874270656879352e-05, "loss": 0.7934, "step": 10960 }, { "epoch": 2.06, "grad_norm": 8.398687362670898, "learning_rate": 1.5870506305288916e-05, "loss": 0.6339, "step": 10970 }, { "epoch": 2.07, "grad_norm": 7.431175231933594, "learning_rate": 1.5866741953698475e-05, "loss": 0.4395, "step": 10980 }, { "epoch": 2.07, "grad_norm": 14.827509880065918, "learning_rate": 1.586297760210804e-05, "loss": 0.6793, "step": 10990 }, { "epoch": 2.07, "grad_norm": 3.574143409729004, "learning_rate": 1.58592132505176e-05, "loss": 1.0553, "step": 11000 }, { "epoch": 2.07, "grad_norm": 11.981563568115234, "learning_rate": 1.585544889892716e-05, "loss": 1.0152, "step": 11010 }, { "epoch": 2.07, "grad_norm": 33.9033088684082, "learning_rate": 1.5851684547336725e-05, "loss": 0.8337, "step": 11020 }, { "epoch": 2.08, "grad_norm": 3.125753879547119, "learning_rate": 1.5847920195746284e-05, "loss": 0.6738, "step": 11030 }, { "epoch": 2.08, "grad_norm": 16.212665557861328, "learning_rate": 1.5844155844155847e-05, "loss": 0.7257, "step": 11040 }, { "epoch": 2.08, "grad_norm": 8.9789457321167, "learning_rate": 1.5840391492565407e-05, "loss": 0.8129, "step": 11050 }, { "epoch": 2.08, "grad_norm": 27.197084426879883, "learning_rate": 1.583662714097497e-05, "loss": 0.8489, "step": 11060 }, { "epoch": 2.08, "grad_norm": 9.654715538024902, "learning_rate": 1.583286278938453e-05, "loss": 1.1031, "step": 11070 }, { "epoch": 2.09, "grad_norm": 7.9157514572143555, "learning_rate": 1.582909843779409e-05, "loss": 0.649, "step": 11080 }, { "epoch": 2.09, "grad_norm": 26.433883666992188, "learning_rate": 1.5825334086203653e-05, "loss": 1.0084, "step": 11090 }, { "epoch": 2.09, "grad_norm": 2.050842046737671, "learning_rate": 1.5821569734613213e-05, "loss": 0.7625, "step": 11100 }, { "epoch": 2.09, "grad_norm": 22.36996078491211, "learning_rate": 1.5817805383022776e-05, "loss": 0.877, "step": 11110 }, { "epoch": 2.09, "grad_norm": 1.3118711709976196, "learning_rate": 1.5814041031432336e-05, "loss": 0.5775, "step": 11120 }, { "epoch": 2.09, "grad_norm": 17.231550216674805, "learning_rate": 1.58102766798419e-05, "loss": 1.0835, "step": 11130 }, { "epoch": 2.1, "grad_norm": 8.116622924804688, "learning_rate": 1.580651232825146e-05, "loss": 0.7954, "step": 11140 }, { "epoch": 2.1, "grad_norm": 10.011574745178223, "learning_rate": 1.5802747976661022e-05, "loss": 0.817, "step": 11150 }, { "epoch": 2.1, "grad_norm": 17.308115005493164, "learning_rate": 1.579898362507058e-05, "loss": 0.4949, "step": 11160 }, { "epoch": 2.1, "grad_norm": 15.126243591308594, "learning_rate": 1.5795219273480145e-05, "loss": 0.5666, "step": 11170 }, { "epoch": 2.1, "grad_norm": 8.26230525970459, "learning_rate": 1.5791454921889705e-05, "loss": 0.5484, "step": 11180 }, { "epoch": 2.11, "grad_norm": 2.2589051723480225, "learning_rate": 1.5787690570299268e-05, "loss": 0.8625, "step": 11190 }, { "epoch": 2.11, "grad_norm": 18.615032196044922, "learning_rate": 1.578392621870883e-05, "loss": 1.1054, "step": 11200 }, { "epoch": 2.11, "grad_norm": 14.837967872619629, "learning_rate": 1.578016186711839e-05, "loss": 0.9737, "step": 11210 }, { "epoch": 2.11, "grad_norm": 11.36581802368164, "learning_rate": 1.5776397515527954e-05, "loss": 0.77, "step": 11220 }, { "epoch": 2.11, "grad_norm": 7.043651103973389, "learning_rate": 1.5772633163937514e-05, "loss": 0.7536, "step": 11230 }, { "epoch": 2.12, "grad_norm": 18.54761505126953, "learning_rate": 1.5768868812347073e-05, "loss": 1.0175, "step": 11240 }, { "epoch": 2.12, "grad_norm": 7.881554126739502, "learning_rate": 1.5765104460756636e-05, "loss": 0.6701, "step": 11250 }, { "epoch": 2.12, "grad_norm": 18.58554458618164, "learning_rate": 1.5761340109166196e-05, "loss": 1.0165, "step": 11260 }, { "epoch": 2.12, "grad_norm": 6.169960021972656, "learning_rate": 1.575757575757576e-05, "loss": 0.7932, "step": 11270 }, { "epoch": 2.12, "grad_norm": 13.55213737487793, "learning_rate": 1.575381140598532e-05, "loss": 0.6286, "step": 11280 }, { "epoch": 2.12, "grad_norm": 26.40308952331543, "learning_rate": 1.5750047054394882e-05, "loss": 1.0074, "step": 11290 }, { "epoch": 2.13, "grad_norm": 14.90792465209961, "learning_rate": 1.5746282702804442e-05, "loss": 0.7594, "step": 11300 }, { "epoch": 2.13, "grad_norm": 25.512937545776367, "learning_rate": 1.5742518351214005e-05, "loss": 0.8933, "step": 11310 }, { "epoch": 2.13, "grad_norm": 20.684497833251953, "learning_rate": 1.5738753999623565e-05, "loss": 0.8769, "step": 11320 }, { "epoch": 2.13, "grad_norm": 26.62982940673828, "learning_rate": 1.5734989648033128e-05, "loss": 1.0551, "step": 11330 }, { "epoch": 2.13, "grad_norm": 12.02119255065918, "learning_rate": 1.5731225296442688e-05, "loss": 0.8013, "step": 11340 }, { "epoch": 2.14, "grad_norm": 4.007152080535889, "learning_rate": 1.572746094485225e-05, "loss": 0.7307, "step": 11350 }, { "epoch": 2.14, "grad_norm": 11.850004196166992, "learning_rate": 1.572369659326181e-05, "loss": 0.7143, "step": 11360 }, { "epoch": 2.14, "grad_norm": 11.220576286315918, "learning_rate": 1.5719932241671374e-05, "loss": 0.7637, "step": 11370 }, { "epoch": 2.14, "grad_norm": 3.0567941665649414, "learning_rate": 1.5716167890080937e-05, "loss": 0.9187, "step": 11380 }, { "epoch": 2.14, "grad_norm": 24.867557525634766, "learning_rate": 1.5712403538490497e-05, "loss": 0.86, "step": 11390 }, { "epoch": 2.15, "grad_norm": 13.429787635803223, "learning_rate": 1.570863918690006e-05, "loss": 0.919, "step": 11400 }, { "epoch": 2.15, "grad_norm": 1.4714537858963013, "learning_rate": 1.5704874835309616e-05, "loss": 0.8753, "step": 11410 }, { "epoch": 2.15, "grad_norm": 26.924331665039062, "learning_rate": 1.570111048371918e-05, "loss": 0.61, "step": 11420 }, { "epoch": 2.15, "grad_norm": 19.854873657226562, "learning_rate": 1.5697346132128743e-05, "loss": 0.765, "step": 11430 }, { "epoch": 2.15, "grad_norm": 10.933598518371582, "learning_rate": 1.5693581780538303e-05, "loss": 0.9729, "step": 11440 }, { "epoch": 2.16, "grad_norm": 12.186927795410156, "learning_rate": 1.5689817428947866e-05, "loss": 1.1194, "step": 11450 }, { "epoch": 2.16, "grad_norm": 15.089810371398926, "learning_rate": 1.5686053077357425e-05, "loss": 0.979, "step": 11460 }, { "epoch": 2.16, "grad_norm": 34.98917007446289, "learning_rate": 1.568228872576699e-05, "loss": 0.7631, "step": 11470 }, { "epoch": 2.16, "grad_norm": 7.224375247955322, "learning_rate": 1.567852437417655e-05, "loss": 0.7461, "step": 11480 }, { "epoch": 2.16, "grad_norm": 3.452284812927246, "learning_rate": 1.567476002258611e-05, "loss": 0.6214, "step": 11490 }, { "epoch": 2.16, "grad_norm": 12.092110633850098, "learning_rate": 1.567099567099567e-05, "loss": 0.6558, "step": 11500 }, { "epoch": 2.17, "grad_norm": 10.905387878417969, "learning_rate": 1.5667231319405234e-05, "loss": 0.6047, "step": 11510 }, { "epoch": 2.17, "grad_norm": 10.892648696899414, "learning_rate": 1.5663466967814794e-05, "loss": 0.8369, "step": 11520 }, { "epoch": 2.17, "grad_norm": 2.8297064304351807, "learning_rate": 1.5659702616224357e-05, "loss": 0.6804, "step": 11530 }, { "epoch": 2.17, "grad_norm": 6.064371585845947, "learning_rate": 1.5655938264633917e-05, "loss": 0.6657, "step": 11540 }, { "epoch": 2.17, "grad_norm": 8.448015213012695, "learning_rate": 1.565217391304348e-05, "loss": 0.6027, "step": 11550 }, { "epoch": 2.18, "grad_norm": 39.2706413269043, "learning_rate": 1.5648409561453043e-05, "loss": 0.7446, "step": 11560 }, { "epoch": 2.18, "grad_norm": 17.710491180419922, "learning_rate": 1.5644645209862603e-05, "loss": 0.9606, "step": 11570 }, { "epoch": 2.18, "grad_norm": 8.026297569274902, "learning_rate": 1.5640880858272166e-05, "loss": 0.6021, "step": 11580 }, { "epoch": 2.18, "grad_norm": 5.3070855140686035, "learning_rate": 1.5637116506681723e-05, "loss": 0.5271, "step": 11590 }, { "epoch": 2.18, "grad_norm": 10.307164192199707, "learning_rate": 1.5633352155091286e-05, "loss": 0.6096, "step": 11600 }, { "epoch": 2.19, "grad_norm": 5.217769145965576, "learning_rate": 1.562958780350085e-05, "loss": 0.4626, "step": 11610 }, { "epoch": 2.19, "grad_norm": 33.302310943603516, "learning_rate": 1.562582345191041e-05, "loss": 0.5173, "step": 11620 }, { "epoch": 2.19, "grad_norm": 5.040510654449463, "learning_rate": 1.5622059100319972e-05, "loss": 0.9346, "step": 11630 }, { "epoch": 2.19, "grad_norm": 30.57442855834961, "learning_rate": 1.5618294748729532e-05, "loss": 0.8653, "step": 11640 }, { "epoch": 2.19, "grad_norm": 4.821202754974365, "learning_rate": 1.5614530397139095e-05, "loss": 0.8027, "step": 11650 }, { "epoch": 2.19, "grad_norm": 3.6210570335388184, "learning_rate": 1.5610766045548655e-05, "loss": 0.7766, "step": 11660 }, { "epoch": 2.2, "grad_norm": 5.999725818634033, "learning_rate": 1.5607001693958218e-05, "loss": 0.9771, "step": 11670 }, { "epoch": 2.2, "grad_norm": 9.624933242797852, "learning_rate": 1.5603237342367778e-05, "loss": 0.5936, "step": 11680 }, { "epoch": 2.2, "grad_norm": 10.115823745727539, "learning_rate": 1.559947299077734e-05, "loss": 1.1893, "step": 11690 }, { "epoch": 2.2, "grad_norm": 7.536423683166504, "learning_rate": 1.55957086391869e-05, "loss": 0.8263, "step": 11700 }, { "epoch": 2.2, "grad_norm": 15.722282409667969, "learning_rate": 1.5591944287596464e-05, "loss": 0.5882, "step": 11710 }, { "epoch": 2.21, "grad_norm": 22.392576217651367, "learning_rate": 1.5588179936006023e-05, "loss": 0.6934, "step": 11720 }, { "epoch": 2.21, "grad_norm": 28.885343551635742, "learning_rate": 1.5584415584415587e-05, "loss": 1.0459, "step": 11730 }, { "epoch": 2.21, "grad_norm": 20.01675796508789, "learning_rate": 1.5580651232825146e-05, "loss": 0.7343, "step": 11740 }, { "epoch": 2.21, "grad_norm": 3.547024726867676, "learning_rate": 1.557688688123471e-05, "loss": 0.8088, "step": 11750 }, { "epoch": 2.21, "grad_norm": 7.1423726081848145, "learning_rate": 1.557312252964427e-05, "loss": 0.5475, "step": 11760 }, { "epoch": 2.22, "grad_norm": 29.471176147460938, "learning_rate": 1.556935817805383e-05, "loss": 0.8382, "step": 11770 }, { "epoch": 2.22, "grad_norm": 16.62720489501953, "learning_rate": 1.5565593826463392e-05, "loss": 0.7917, "step": 11780 }, { "epoch": 2.22, "grad_norm": 12.257611274719238, "learning_rate": 1.5561829474872955e-05, "loss": 0.7627, "step": 11790 }, { "epoch": 2.22, "grad_norm": 16.232980728149414, "learning_rate": 1.5558065123282515e-05, "loss": 0.5723, "step": 11800 }, { "epoch": 2.22, "grad_norm": 15.463409423828125, "learning_rate": 1.5554300771692078e-05, "loss": 0.6709, "step": 11810 }, { "epoch": 2.22, "grad_norm": 11.793865203857422, "learning_rate": 1.5550536420101638e-05, "loss": 0.6177, "step": 11820 }, { "epoch": 2.23, "grad_norm": 23.931665420532227, "learning_rate": 1.55467720685112e-05, "loss": 0.8577, "step": 11830 }, { "epoch": 2.23, "grad_norm": 12.713443756103516, "learning_rate": 1.554300771692076e-05, "loss": 0.7765, "step": 11840 }, { "epoch": 2.23, "grad_norm": 19.967254638671875, "learning_rate": 1.5539243365330324e-05, "loss": 1.0167, "step": 11850 }, { "epoch": 2.23, "grad_norm": 15.038551330566406, "learning_rate": 1.5535479013739884e-05, "loss": 1.0065, "step": 11860 }, { "epoch": 2.23, "grad_norm": 11.947111129760742, "learning_rate": 1.5531714662149447e-05, "loss": 0.7426, "step": 11870 }, { "epoch": 2.24, "grad_norm": 23.100839614868164, "learning_rate": 1.5527950310559007e-05, "loss": 1.2186, "step": 11880 }, { "epoch": 2.24, "grad_norm": 18.139780044555664, "learning_rate": 1.552418595896857e-05, "loss": 0.5974, "step": 11890 }, { "epoch": 2.24, "grad_norm": 12.937055587768555, "learning_rate": 1.552042160737813e-05, "loss": 1.0542, "step": 11900 }, { "epoch": 2.24, "grad_norm": 25.850690841674805, "learning_rate": 1.5516657255787693e-05, "loss": 0.6753, "step": 11910 }, { "epoch": 2.24, "grad_norm": 11.147032737731934, "learning_rate": 1.5512892904197253e-05, "loss": 0.8321, "step": 11920 }, { "epoch": 2.25, "grad_norm": 16.01250648498535, "learning_rate": 1.5509128552606816e-05, "loss": 0.6835, "step": 11930 }, { "epoch": 2.25, "grad_norm": 14.47998332977295, "learning_rate": 1.5505364201016376e-05, "loss": 0.4995, "step": 11940 }, { "epoch": 2.25, "grad_norm": 29.964704513549805, "learning_rate": 1.5501599849425935e-05, "loss": 0.6967, "step": 11950 }, { "epoch": 2.25, "grad_norm": 17.843643188476562, "learning_rate": 1.54978354978355e-05, "loss": 0.8046, "step": 11960 }, { "epoch": 2.25, "grad_norm": 10.963820457458496, "learning_rate": 1.5494071146245058e-05, "loss": 0.8361, "step": 11970 }, { "epoch": 2.25, "grad_norm": 15.843602180480957, "learning_rate": 1.549030679465462e-05, "loss": 0.6498, "step": 11980 }, { "epoch": 2.26, "grad_norm": 0.7258365154266357, "learning_rate": 1.5486542443064185e-05, "loss": 0.8831, "step": 11990 }, { "epoch": 2.26, "grad_norm": 1.571528434753418, "learning_rate": 1.5482778091473744e-05, "loss": 0.8198, "step": 12000 }, { "epoch": 2.26, "grad_norm": 8.469624519348145, "learning_rate": 1.5479013739883307e-05, "loss": 0.7245, "step": 12010 }, { "epoch": 2.26, "grad_norm": 3.154909133911133, "learning_rate": 1.5475249388292867e-05, "loss": 0.7674, "step": 12020 }, { "epoch": 2.26, "grad_norm": 21.369901657104492, "learning_rate": 1.547148503670243e-05, "loss": 0.9809, "step": 12030 }, { "epoch": 2.27, "grad_norm": 31.945472717285156, "learning_rate": 1.546772068511199e-05, "loss": 0.7597, "step": 12040 }, { "epoch": 2.27, "grad_norm": 25.05544662475586, "learning_rate": 1.5463956333521553e-05, "loss": 0.5625, "step": 12050 }, { "epoch": 2.27, "grad_norm": 21.71693992614746, "learning_rate": 1.5460191981931113e-05, "loss": 0.5271, "step": 12060 }, { "epoch": 2.27, "grad_norm": 24.646568298339844, "learning_rate": 1.5456427630340676e-05, "loss": 0.7604, "step": 12070 }, { "epoch": 2.27, "grad_norm": 3.2769088745117188, "learning_rate": 1.5452663278750236e-05, "loss": 0.7491, "step": 12080 }, { "epoch": 2.28, "grad_norm": 15.733813285827637, "learning_rate": 1.54488989271598e-05, "loss": 0.9759, "step": 12090 }, { "epoch": 2.28, "grad_norm": 17.16128158569336, "learning_rate": 1.544513457556936e-05, "loss": 0.8524, "step": 12100 }, { "epoch": 2.28, "grad_norm": 11.384461402893066, "learning_rate": 1.544137022397892e-05, "loss": 0.7966, "step": 12110 }, { "epoch": 2.28, "grad_norm": 13.934250831604004, "learning_rate": 1.5437605872388482e-05, "loss": 0.8525, "step": 12120 }, { "epoch": 2.28, "grad_norm": 11.015745162963867, "learning_rate": 1.543384152079804e-05, "loss": 0.617, "step": 12130 }, { "epoch": 2.28, "grad_norm": 31.421207427978516, "learning_rate": 1.5430077169207605e-05, "loss": 0.4724, "step": 12140 }, { "epoch": 2.29, "grad_norm": 27.373411178588867, "learning_rate": 1.5426312817617165e-05, "loss": 0.6447, "step": 12150 }, { "epoch": 2.29, "grad_norm": 4.831554889678955, "learning_rate": 1.5422548466026728e-05, "loss": 0.7196, "step": 12160 }, { "epoch": 2.29, "grad_norm": 17.245033264160156, "learning_rate": 1.541878411443629e-05, "loss": 0.9213, "step": 12170 }, { "epoch": 2.29, "grad_norm": 12.337189674377441, "learning_rate": 1.541501976284585e-05, "loss": 0.8848, "step": 12180 }, { "epoch": 2.29, "grad_norm": 14.200904846191406, "learning_rate": 1.5411255411255414e-05, "loss": 0.9418, "step": 12190 }, { "epoch": 2.3, "grad_norm": 25.789546966552734, "learning_rate": 1.5407491059664974e-05, "loss": 0.9909, "step": 12200 }, { "epoch": 2.3, "grad_norm": 67.17901611328125, "learning_rate": 1.5403726708074537e-05, "loss": 0.7414, "step": 12210 }, { "epoch": 2.3, "grad_norm": 11.221137046813965, "learning_rate": 1.5399962356484096e-05, "loss": 1.1104, "step": 12220 }, { "epoch": 2.3, "grad_norm": 13.688097953796387, "learning_rate": 1.539619800489366e-05, "loss": 0.6626, "step": 12230 }, { "epoch": 2.3, "grad_norm": 16.81260871887207, "learning_rate": 1.539243365330322e-05, "loss": 0.6778, "step": 12240 }, { "epoch": 2.31, "grad_norm": 7.933690071105957, "learning_rate": 1.5388669301712783e-05, "loss": 0.913, "step": 12250 }, { "epoch": 2.31, "grad_norm": 30.486112594604492, "learning_rate": 1.5384904950122342e-05, "loss": 1.057, "step": 12260 }, { "epoch": 2.31, "grad_norm": 19.049509048461914, "learning_rate": 1.5381140598531905e-05, "loss": 0.7562, "step": 12270 }, { "epoch": 2.31, "grad_norm": 21.68576431274414, "learning_rate": 1.5377376246941465e-05, "loss": 0.696, "step": 12280 }, { "epoch": 2.31, "grad_norm": 21.92389678955078, "learning_rate": 1.5373611895351025e-05, "loss": 0.845, "step": 12290 }, { "epoch": 2.32, "grad_norm": 11.414402961730957, "learning_rate": 1.5369847543760588e-05, "loss": 0.7288, "step": 12300 }, { "epoch": 2.32, "grad_norm": 12.205060005187988, "learning_rate": 1.5366083192170148e-05, "loss": 0.7567, "step": 12310 }, { "epoch": 2.32, "grad_norm": 21.142671585083008, "learning_rate": 1.536231884057971e-05, "loss": 0.6949, "step": 12320 }, { "epoch": 2.32, "grad_norm": 3.8780596256256104, "learning_rate": 1.535855448898927e-05, "loss": 0.5205, "step": 12330 }, { "epoch": 2.32, "grad_norm": 2.610788106918335, "learning_rate": 1.5354790137398834e-05, "loss": 0.4817, "step": 12340 }, { "epoch": 2.32, "grad_norm": 5.773406505584717, "learning_rate": 1.5351025785808397e-05, "loss": 0.9445, "step": 12350 }, { "epoch": 2.33, "grad_norm": 4.580637454986572, "learning_rate": 1.5347261434217957e-05, "loss": 0.7737, "step": 12360 }, { "epoch": 2.33, "grad_norm": 20.938676834106445, "learning_rate": 1.534349708262752e-05, "loss": 0.8886, "step": 12370 }, { "epoch": 2.33, "grad_norm": 16.42961311340332, "learning_rate": 1.533973273103708e-05, "loss": 0.48, "step": 12380 }, { "epoch": 2.33, "grad_norm": 4.3253493309021, "learning_rate": 1.5335968379446643e-05, "loss": 0.7892, "step": 12390 }, { "epoch": 2.33, "grad_norm": 5.634984970092773, "learning_rate": 1.5332204027856203e-05, "loss": 0.805, "step": 12400 }, { "epoch": 2.34, "grad_norm": 19.899145126342773, "learning_rate": 1.5328439676265766e-05, "loss": 0.7257, "step": 12410 }, { "epoch": 2.34, "grad_norm": 11.208027839660645, "learning_rate": 1.5324675324675326e-05, "loss": 0.732, "step": 12420 }, { "epoch": 2.34, "grad_norm": 33.87583541870117, "learning_rate": 1.532091097308489e-05, "loss": 0.6432, "step": 12430 }, { "epoch": 2.34, "grad_norm": 8.736035346984863, "learning_rate": 1.531714662149445e-05, "loss": 0.7492, "step": 12440 }, { "epoch": 2.34, "grad_norm": 13.03171443939209, "learning_rate": 1.5313382269904012e-05, "loss": 0.9966, "step": 12450 }, { "epoch": 2.35, "grad_norm": 9.763032913208008, "learning_rate": 1.530961791831357e-05, "loss": 0.8488, "step": 12460 }, { "epoch": 2.35, "grad_norm": 0.931615948677063, "learning_rate": 1.530585356672313e-05, "loss": 0.6715, "step": 12470 }, { "epoch": 2.35, "grad_norm": 12.652198791503906, "learning_rate": 1.5302089215132694e-05, "loss": 1.1259, "step": 12480 }, { "epoch": 2.35, "grad_norm": 6.345569133758545, "learning_rate": 1.5298324863542254e-05, "loss": 0.7731, "step": 12490 }, { "epoch": 2.35, "grad_norm": 22.567358016967773, "learning_rate": 1.5294560511951817e-05, "loss": 0.6017, "step": 12500 }, { "epoch": 2.35, "grad_norm": 3.477397918701172, "learning_rate": 1.5290796160361377e-05, "loss": 1.0937, "step": 12510 }, { "epoch": 2.36, "grad_norm": 24.150218963623047, "learning_rate": 1.528703180877094e-05, "loss": 0.8269, "step": 12520 }, { "epoch": 2.36, "grad_norm": 33.526763916015625, "learning_rate": 1.5283267457180503e-05, "loss": 0.7596, "step": 12530 }, { "epoch": 2.36, "grad_norm": 11.938372611999512, "learning_rate": 1.5279503105590063e-05, "loss": 0.7197, "step": 12540 }, { "epoch": 2.36, "grad_norm": 0.7802831530570984, "learning_rate": 1.5275738753999626e-05, "loss": 0.5031, "step": 12550 }, { "epoch": 2.36, "grad_norm": 33.820281982421875, "learning_rate": 1.5271974402409186e-05, "loss": 0.7919, "step": 12560 }, { "epoch": 2.37, "grad_norm": 16.24278450012207, "learning_rate": 1.526821005081875e-05, "loss": 0.5727, "step": 12570 }, { "epoch": 2.37, "grad_norm": 8.651189804077148, "learning_rate": 1.526444569922831e-05, "loss": 0.6429, "step": 12580 }, { "epoch": 2.37, "grad_norm": 42.32835388183594, "learning_rate": 1.5260681347637872e-05, "loss": 0.9006, "step": 12590 }, { "epoch": 2.37, "grad_norm": 15.94575309753418, "learning_rate": 1.5256916996047434e-05, "loss": 0.7776, "step": 12600 }, { "epoch": 2.37, "grad_norm": 34.81913375854492, "learning_rate": 1.5253152644456995e-05, "loss": 1.2596, "step": 12610 }, { "epoch": 2.38, "grad_norm": 5.684762954711914, "learning_rate": 1.5249388292866557e-05, "loss": 0.6622, "step": 12620 }, { "epoch": 2.38, "grad_norm": 14.016319274902344, "learning_rate": 1.5245623941276115e-05, "loss": 0.7527, "step": 12630 }, { "epoch": 2.38, "grad_norm": 24.80596351623535, "learning_rate": 1.5241859589685676e-05, "loss": 1.1414, "step": 12640 }, { "epoch": 2.38, "grad_norm": 11.010673522949219, "learning_rate": 1.523809523809524e-05, "loss": 0.6522, "step": 12650 }, { "epoch": 2.38, "grad_norm": 11.180171966552734, "learning_rate": 1.52343308865048e-05, "loss": 0.4195, "step": 12660 }, { "epoch": 2.38, "grad_norm": 18.080059051513672, "learning_rate": 1.5230566534914362e-05, "loss": 0.532, "step": 12670 }, { "epoch": 2.39, "grad_norm": 16.197202682495117, "learning_rate": 1.5226802183323924e-05, "loss": 1.0089, "step": 12680 }, { "epoch": 2.39, "grad_norm": 13.255231857299805, "learning_rate": 1.5223037831733485e-05, "loss": 0.9259, "step": 12690 }, { "epoch": 2.39, "grad_norm": 6.038920879364014, "learning_rate": 1.5219273480143047e-05, "loss": 0.7511, "step": 12700 }, { "epoch": 2.39, "grad_norm": 6.505653381347656, "learning_rate": 1.5215509128552608e-05, "loss": 0.8873, "step": 12710 }, { "epoch": 2.39, "grad_norm": 10.185829162597656, "learning_rate": 1.521174477696217e-05, "loss": 0.9914, "step": 12720 }, { "epoch": 2.4, "grad_norm": 22.103660583496094, "learning_rate": 1.5207980425371731e-05, "loss": 1.0945, "step": 12730 }, { "epoch": 2.4, "grad_norm": 5.609415531158447, "learning_rate": 1.5204216073781292e-05, "loss": 0.7768, "step": 12740 }, { "epoch": 2.4, "grad_norm": 21.126070022583008, "learning_rate": 1.5200451722190854e-05, "loss": 0.8569, "step": 12750 }, { "epoch": 2.4, "grad_norm": 3.481779098510742, "learning_rate": 1.5196687370600415e-05, "loss": 0.6686, "step": 12760 }, { "epoch": 2.4, "grad_norm": 5.01765775680542, "learning_rate": 1.5192923019009977e-05, "loss": 0.6434, "step": 12770 }, { "epoch": 2.41, "grad_norm": 8.629416465759277, "learning_rate": 1.5189158667419538e-05, "loss": 0.6469, "step": 12780 }, { "epoch": 2.41, "grad_norm": 14.828179359436035, "learning_rate": 1.5185394315829101e-05, "loss": 0.7293, "step": 12790 }, { "epoch": 2.41, "grad_norm": 10.851454734802246, "learning_rate": 1.5181629964238663e-05, "loss": 0.7607, "step": 12800 }, { "epoch": 2.41, "grad_norm": 22.300085067749023, "learning_rate": 1.5177865612648221e-05, "loss": 0.8268, "step": 12810 }, { "epoch": 2.41, "grad_norm": 8.352980613708496, "learning_rate": 1.5174101261057782e-05, "loss": 0.6191, "step": 12820 }, { "epoch": 2.41, "grad_norm": 10.153834342956543, "learning_rate": 1.5170336909467346e-05, "loss": 0.8384, "step": 12830 }, { "epoch": 2.42, "grad_norm": 9.206123352050781, "learning_rate": 1.5166572557876907e-05, "loss": 0.7043, "step": 12840 }, { "epoch": 2.42, "grad_norm": 6.315232276916504, "learning_rate": 1.5162808206286468e-05, "loss": 0.6646, "step": 12850 }, { "epoch": 2.42, "grad_norm": 34.999691009521484, "learning_rate": 1.515904385469603e-05, "loss": 0.874, "step": 12860 }, { "epoch": 2.42, "grad_norm": 9.736954689025879, "learning_rate": 1.5155279503105591e-05, "loss": 1.0045, "step": 12870 }, { "epoch": 2.42, "grad_norm": 13.932580947875977, "learning_rate": 1.5151515151515153e-05, "loss": 0.8253, "step": 12880 }, { "epoch": 2.43, "grad_norm": 34.856239318847656, "learning_rate": 1.5147750799924714e-05, "loss": 0.9148, "step": 12890 }, { "epoch": 2.43, "grad_norm": 8.915196418762207, "learning_rate": 1.5143986448334276e-05, "loss": 0.799, "step": 12900 }, { "epoch": 2.43, "grad_norm": 23.154916763305664, "learning_rate": 1.5140222096743837e-05, "loss": 0.7314, "step": 12910 }, { "epoch": 2.43, "grad_norm": 16.326953887939453, "learning_rate": 1.5136457745153399e-05, "loss": 0.7348, "step": 12920 }, { "epoch": 2.43, "grad_norm": 25.118789672851562, "learning_rate": 1.513269339356296e-05, "loss": 0.6603, "step": 12930 }, { "epoch": 2.44, "grad_norm": 24.18308448791504, "learning_rate": 1.5128929041972522e-05, "loss": 0.7526, "step": 12940 }, { "epoch": 2.44, "grad_norm": 5.591280937194824, "learning_rate": 1.5125164690382083e-05, "loss": 0.8457, "step": 12950 }, { "epoch": 2.44, "grad_norm": 13.854071617126465, "learning_rate": 1.5121400338791645e-05, "loss": 0.7127, "step": 12960 }, { "epoch": 2.44, "grad_norm": 1.194824457168579, "learning_rate": 1.5117635987201206e-05, "loss": 0.9448, "step": 12970 }, { "epoch": 2.44, "grad_norm": 10.191553115844727, "learning_rate": 1.5113871635610766e-05, "loss": 0.4985, "step": 12980 }, { "epoch": 2.44, "grad_norm": 21.121976852416992, "learning_rate": 1.5110107284020327e-05, "loss": 0.5263, "step": 12990 }, { "epoch": 2.45, "grad_norm": 15.498823165893555, "learning_rate": 1.5106342932429889e-05, "loss": 0.9089, "step": 13000 }, { "epoch": 2.45, "grad_norm": 10.255945205688477, "learning_rate": 1.510257858083945e-05, "loss": 0.7219, "step": 13010 }, { "epoch": 2.45, "grad_norm": 4.906744003295898, "learning_rate": 1.5098814229249013e-05, "loss": 0.8106, "step": 13020 }, { "epoch": 2.45, "grad_norm": 4.751705169677734, "learning_rate": 1.5095049877658575e-05, "loss": 0.742, "step": 13030 }, { "epoch": 2.45, "grad_norm": 10.848084449768066, "learning_rate": 1.5091285526068136e-05, "loss": 0.6807, "step": 13040 }, { "epoch": 2.46, "grad_norm": 31.987140655517578, "learning_rate": 1.5087521174477698e-05, "loss": 0.4382, "step": 13050 }, { "epoch": 2.46, "grad_norm": 23.37934112548828, "learning_rate": 1.5083756822887259e-05, "loss": 0.7205, "step": 13060 }, { "epoch": 2.46, "grad_norm": 2.923102855682373, "learning_rate": 1.507999247129682e-05, "loss": 0.9366, "step": 13070 }, { "epoch": 2.46, "grad_norm": 2.3516485691070557, "learning_rate": 1.5076228119706382e-05, "loss": 0.6969, "step": 13080 }, { "epoch": 2.46, "grad_norm": 16.754817962646484, "learning_rate": 1.5072463768115944e-05, "loss": 0.6468, "step": 13090 }, { "epoch": 2.47, "grad_norm": 0.9715334177017212, "learning_rate": 1.5068699416525505e-05, "loss": 0.9202, "step": 13100 }, { "epoch": 2.47, "grad_norm": 16.75336456298828, "learning_rate": 1.5064935064935066e-05, "loss": 0.5497, "step": 13110 }, { "epoch": 2.47, "grad_norm": 11.597676277160645, "learning_rate": 1.5061170713344628e-05, "loss": 1.031, "step": 13120 }, { "epoch": 2.47, "grad_norm": 19.306570053100586, "learning_rate": 1.505740636175419e-05, "loss": 0.5285, "step": 13130 }, { "epoch": 2.47, "grad_norm": 11.259316444396973, "learning_rate": 1.505364201016375e-05, "loss": 0.6111, "step": 13140 }, { "epoch": 2.48, "grad_norm": 5.920182704925537, "learning_rate": 1.5049877658573312e-05, "loss": 0.9721, "step": 13150 }, { "epoch": 2.48, "grad_norm": 4.26685905456543, "learning_rate": 1.5046113306982872e-05, "loss": 0.5111, "step": 13160 }, { "epoch": 2.48, "grad_norm": 1.7324987649917603, "learning_rate": 1.5042348955392434e-05, "loss": 0.3585, "step": 13170 }, { "epoch": 2.48, "grad_norm": 12.388806343078613, "learning_rate": 1.5038584603801995e-05, "loss": 0.6431, "step": 13180 }, { "epoch": 2.48, "grad_norm": 26.238208770751953, "learning_rate": 1.5034820252211556e-05, "loss": 0.881, "step": 13190 }, { "epoch": 2.48, "grad_norm": 17.09404945373535, "learning_rate": 1.5031055900621118e-05, "loss": 0.7259, "step": 13200 }, { "epoch": 2.49, "grad_norm": 24.81346321105957, "learning_rate": 1.5027291549030681e-05, "loss": 0.8976, "step": 13210 }, { "epoch": 2.49, "grad_norm": 18.42681121826172, "learning_rate": 1.5023527197440243e-05, "loss": 0.6566, "step": 13220 }, { "epoch": 2.49, "grad_norm": 14.247604370117188, "learning_rate": 1.5019762845849804e-05, "loss": 0.5223, "step": 13230 }, { "epoch": 2.49, "grad_norm": 10.424066543579102, "learning_rate": 1.5015998494259365e-05, "loss": 0.7308, "step": 13240 }, { "epoch": 2.49, "grad_norm": 0.2744300067424774, "learning_rate": 1.5012234142668927e-05, "loss": 0.5581, "step": 13250 }, { "epoch": 2.5, "grad_norm": 13.965598106384277, "learning_rate": 1.5008469791078488e-05, "loss": 0.6971, "step": 13260 }, { "epoch": 2.5, "grad_norm": 15.680059432983398, "learning_rate": 1.500470543948805e-05, "loss": 0.823, "step": 13270 }, { "epoch": 2.5, "grad_norm": 64.32691955566406, "learning_rate": 1.5000941087897611e-05, "loss": 0.7521, "step": 13280 }, { "epoch": 2.5, "grad_norm": 17.173147201538086, "learning_rate": 1.4997176736307173e-05, "loss": 0.7143, "step": 13290 }, { "epoch": 2.5, "grad_norm": 12.266512870788574, "learning_rate": 1.4993412384716734e-05, "loss": 0.5146, "step": 13300 }, { "epoch": 2.51, "grad_norm": 27.415990829467773, "learning_rate": 1.4989648033126296e-05, "loss": 0.6267, "step": 13310 }, { "epoch": 2.51, "grad_norm": 38.314414978027344, "learning_rate": 1.4985883681535857e-05, "loss": 0.8287, "step": 13320 }, { "epoch": 2.51, "grad_norm": 14.848710060119629, "learning_rate": 1.4982119329945417e-05, "loss": 0.9447, "step": 13330 }, { "epoch": 2.51, "grad_norm": 16.18121910095215, "learning_rate": 1.4978354978354978e-05, "loss": 0.8354, "step": 13340 }, { "epoch": 2.51, "grad_norm": 18.633420944213867, "learning_rate": 1.497459062676454e-05, "loss": 0.5659, "step": 13350 }, { "epoch": 2.51, "grad_norm": 8.951250076293945, "learning_rate": 1.4970826275174101e-05, "loss": 0.7976, "step": 13360 }, { "epoch": 2.52, "grad_norm": 8.146002769470215, "learning_rate": 1.4967061923583663e-05, "loss": 0.5475, "step": 13370 }, { "epoch": 2.52, "grad_norm": 10.315155982971191, "learning_rate": 1.4963297571993224e-05, "loss": 0.7532, "step": 13380 }, { "epoch": 2.52, "grad_norm": 6.590341567993164, "learning_rate": 1.4959533220402787e-05, "loss": 0.7397, "step": 13390 }, { "epoch": 2.52, "grad_norm": 7.777267932891846, "learning_rate": 1.4955768868812349e-05, "loss": 0.7557, "step": 13400 }, { "epoch": 2.52, "grad_norm": 13.434835433959961, "learning_rate": 1.495200451722191e-05, "loss": 0.6464, "step": 13410 }, { "epoch": 2.53, "grad_norm": 15.336316108703613, "learning_rate": 1.4948240165631472e-05, "loss": 1.0214, "step": 13420 }, { "epoch": 2.53, "grad_norm": 9.77111530303955, "learning_rate": 1.4944475814041033e-05, "loss": 0.4507, "step": 13430 }, { "epoch": 2.53, "grad_norm": 10.342262268066406, "learning_rate": 1.4940711462450595e-05, "loss": 0.993, "step": 13440 }, { "epoch": 2.53, "grad_norm": 22.338380813598633, "learning_rate": 1.4936947110860156e-05, "loss": 1.1721, "step": 13450 }, { "epoch": 2.53, "grad_norm": 12.006577491760254, "learning_rate": 1.4933182759269718e-05, "loss": 0.7968, "step": 13460 }, { "epoch": 2.54, "grad_norm": 27.795074462890625, "learning_rate": 1.4929418407679279e-05, "loss": 0.9014, "step": 13470 }, { "epoch": 2.54, "grad_norm": 38.55152893066406, "learning_rate": 1.492565405608884e-05, "loss": 0.8012, "step": 13480 }, { "epoch": 2.54, "grad_norm": 4.785322666168213, "learning_rate": 1.4921889704498402e-05, "loss": 0.7915, "step": 13490 }, { "epoch": 2.54, "grad_norm": 11.960060119628906, "learning_rate": 1.4918125352907963e-05, "loss": 0.949, "step": 13500 }, { "epoch": 2.54, "grad_norm": 11.774707794189453, "learning_rate": 1.4914361001317523e-05, "loss": 0.5137, "step": 13510 }, { "epoch": 2.54, "grad_norm": 10.362529754638672, "learning_rate": 1.4910596649727085e-05, "loss": 0.7564, "step": 13520 }, { "epoch": 2.55, "grad_norm": 14.903637886047363, "learning_rate": 1.4906832298136646e-05, "loss": 1.0717, "step": 13530 }, { "epoch": 2.55, "grad_norm": 24.547834396362305, "learning_rate": 1.4903067946546208e-05, "loss": 0.675, "step": 13540 }, { "epoch": 2.55, "grad_norm": 15.174468040466309, "learning_rate": 1.4899303594955769e-05, "loss": 0.7592, "step": 13550 }, { "epoch": 2.55, "grad_norm": 5.759738922119141, "learning_rate": 1.489553924336533e-05, "loss": 0.8173, "step": 13560 }, { "epoch": 2.55, "grad_norm": 30.83299446105957, "learning_rate": 1.4891774891774892e-05, "loss": 0.6815, "step": 13570 }, { "epoch": 2.56, "grad_norm": 18.987077713012695, "learning_rate": 1.4888010540184455e-05, "loss": 0.7051, "step": 13580 }, { "epoch": 2.56, "grad_norm": 25.075693130493164, "learning_rate": 1.4884246188594017e-05, "loss": 0.9777, "step": 13590 }, { "epoch": 2.56, "grad_norm": 12.43553352355957, "learning_rate": 1.4880481837003578e-05, "loss": 0.6671, "step": 13600 }, { "epoch": 2.56, "grad_norm": 7.164165496826172, "learning_rate": 1.487671748541314e-05, "loss": 0.7385, "step": 13610 }, { "epoch": 2.56, "grad_norm": 12.529688835144043, "learning_rate": 1.4872953133822701e-05, "loss": 0.9938, "step": 13620 }, { "epoch": 2.57, "grad_norm": 11.181074142456055, "learning_rate": 1.4869188782232262e-05, "loss": 1.0485, "step": 13630 }, { "epoch": 2.57, "grad_norm": 15.510049819946289, "learning_rate": 1.4865424430641824e-05, "loss": 0.6744, "step": 13640 }, { "epoch": 2.57, "grad_norm": 14.156461715698242, "learning_rate": 1.4861660079051385e-05, "loss": 0.8552, "step": 13650 }, { "epoch": 2.57, "grad_norm": 17.33963394165039, "learning_rate": 1.4857895727460947e-05, "loss": 0.9491, "step": 13660 }, { "epoch": 2.57, "grad_norm": 0.6199512481689453, "learning_rate": 1.4854131375870508e-05, "loss": 0.7211, "step": 13670 }, { "epoch": 2.57, "grad_norm": 6.893851280212402, "learning_rate": 1.4850367024280068e-05, "loss": 0.717, "step": 13680 }, { "epoch": 2.58, "grad_norm": 11.729772567749023, "learning_rate": 1.484660267268963e-05, "loss": 0.4342, "step": 13690 }, { "epoch": 2.58, "grad_norm": 0.8101961612701416, "learning_rate": 1.4842838321099191e-05, "loss": 0.8445, "step": 13700 }, { "epoch": 2.58, "grad_norm": 28.97067642211914, "learning_rate": 1.4839073969508752e-05, "loss": 0.8899, "step": 13710 }, { "epoch": 2.58, "grad_norm": 50.73062515258789, "learning_rate": 1.4835309617918314e-05, "loss": 0.7837, "step": 13720 }, { "epoch": 2.58, "grad_norm": 15.3909330368042, "learning_rate": 1.4831545266327875e-05, "loss": 0.9568, "step": 13730 }, { "epoch": 2.59, "grad_norm": 30.514009475708008, "learning_rate": 1.4827780914737437e-05, "loss": 0.9959, "step": 13740 }, { "epoch": 2.59, "grad_norm": 8.172021865844727, "learning_rate": 1.4824016563146998e-05, "loss": 0.7495, "step": 13750 }, { "epoch": 2.59, "grad_norm": 20.11758041381836, "learning_rate": 1.4820252211556561e-05, "loss": 0.549, "step": 13760 }, { "epoch": 2.59, "grad_norm": 0.79815274477005, "learning_rate": 1.4816487859966123e-05, "loss": 0.5851, "step": 13770 }, { "epoch": 2.59, "grad_norm": 60.54981231689453, "learning_rate": 1.4812723508375684e-05, "loss": 0.7835, "step": 13780 }, { "epoch": 2.6, "grad_norm": 10.375343322753906, "learning_rate": 1.4808959156785246e-05, "loss": 0.9386, "step": 13790 }, { "epoch": 2.6, "grad_norm": 11.545114517211914, "learning_rate": 1.4805194805194807e-05, "loss": 0.585, "step": 13800 }, { "epoch": 2.6, "grad_norm": 18.797042846679688, "learning_rate": 1.4801430453604369e-05, "loss": 0.7882, "step": 13810 }, { "epoch": 2.6, "grad_norm": 18.879865646362305, "learning_rate": 1.479766610201393e-05, "loss": 0.6514, "step": 13820 }, { "epoch": 2.6, "grad_norm": 21.993356704711914, "learning_rate": 1.4793901750423492e-05, "loss": 0.7005, "step": 13830 }, { "epoch": 2.6, "grad_norm": 27.9875431060791, "learning_rate": 1.4790137398833053e-05, "loss": 0.9488, "step": 13840 }, { "epoch": 2.61, "grad_norm": 11.482915878295898, "learning_rate": 1.4786373047242613e-05, "loss": 0.7769, "step": 13850 }, { "epoch": 2.61, "grad_norm": 7.644073486328125, "learning_rate": 1.4782608695652174e-05, "loss": 0.3767, "step": 13860 }, { "epoch": 2.61, "grad_norm": 15.943230628967285, "learning_rate": 1.4778844344061736e-05, "loss": 0.7765, "step": 13870 }, { "epoch": 2.61, "grad_norm": 23.832786560058594, "learning_rate": 1.4775079992471297e-05, "loss": 0.7166, "step": 13880 }, { "epoch": 2.61, "grad_norm": 9.894569396972656, "learning_rate": 1.4771315640880859e-05, "loss": 0.7431, "step": 13890 }, { "epoch": 2.62, "grad_norm": 18.379024505615234, "learning_rate": 1.476755128929042e-05, "loss": 0.8358, "step": 13900 }, { "epoch": 2.62, "grad_norm": 1.7742643356323242, "learning_rate": 1.4763786937699982e-05, "loss": 0.5125, "step": 13910 }, { "epoch": 2.62, "grad_norm": 10.236210823059082, "learning_rate": 1.4760022586109543e-05, "loss": 0.6742, "step": 13920 }, { "epoch": 2.62, "grad_norm": 21.775062561035156, "learning_rate": 1.4756258234519105e-05, "loss": 0.5442, "step": 13930 }, { "epoch": 2.62, "grad_norm": 12.80639362335205, "learning_rate": 1.4752493882928666e-05, "loss": 0.8378, "step": 13940 }, { "epoch": 2.63, "grad_norm": 12.556268692016602, "learning_rate": 1.4748729531338229e-05, "loss": 0.7747, "step": 13950 }, { "epoch": 2.63, "grad_norm": 9.039406776428223, "learning_rate": 1.474496517974779e-05, "loss": 0.7077, "step": 13960 }, { "epoch": 2.63, "grad_norm": 15.263288497924805, "learning_rate": 1.4741200828157352e-05, "loss": 0.6179, "step": 13970 }, { "epoch": 2.63, "grad_norm": 18.868026733398438, "learning_rate": 1.4737436476566914e-05, "loss": 0.8249, "step": 13980 }, { "epoch": 2.63, "grad_norm": 35.89374923706055, "learning_rate": 1.4733672124976475e-05, "loss": 0.9226, "step": 13990 }, { "epoch": 2.64, "grad_norm": 15.616324424743652, "learning_rate": 1.4729907773386036e-05, "loss": 1.0765, "step": 14000 }, { "epoch": 2.64, "grad_norm": 1.1768288612365723, "learning_rate": 1.4726143421795598e-05, "loss": 0.7274, "step": 14010 }, { "epoch": 2.64, "grad_norm": 24.081043243408203, "learning_rate": 1.472237907020516e-05, "loss": 0.8053, "step": 14020 }, { "epoch": 2.64, "grad_norm": 4.48431396484375, "learning_rate": 1.4718614718614719e-05, "loss": 0.5182, "step": 14030 }, { "epoch": 2.64, "grad_norm": 15.65145492553711, "learning_rate": 1.471485036702428e-05, "loss": 0.4912, "step": 14040 }, { "epoch": 2.64, "grad_norm": 7.0879316329956055, "learning_rate": 1.4711086015433842e-05, "loss": 0.6764, "step": 14050 }, { "epoch": 2.65, "grad_norm": 12.771845817565918, "learning_rate": 1.4707321663843404e-05, "loss": 0.9551, "step": 14060 }, { "epoch": 2.65, "grad_norm": 14.33671760559082, "learning_rate": 1.4703557312252965e-05, "loss": 0.8178, "step": 14070 }, { "epoch": 2.65, "grad_norm": 7.756180286407471, "learning_rate": 1.4699792960662526e-05, "loss": 0.8128, "step": 14080 }, { "epoch": 2.65, "grad_norm": 3.8817169666290283, "learning_rate": 1.4696028609072088e-05, "loss": 0.6818, "step": 14090 }, { "epoch": 2.65, "grad_norm": 25.685022354125977, "learning_rate": 1.469226425748165e-05, "loss": 1.0637, "step": 14100 }, { "epoch": 2.66, "grad_norm": 7.975883483886719, "learning_rate": 1.468849990589121e-05, "loss": 0.6875, "step": 14110 }, { "epoch": 2.66, "grad_norm": 11.329687118530273, "learning_rate": 1.4684735554300772e-05, "loss": 0.7874, "step": 14120 }, { "epoch": 2.66, "grad_norm": 28.03009033203125, "learning_rate": 1.4680971202710335e-05, "loss": 0.3694, "step": 14130 }, { "epoch": 2.66, "grad_norm": 15.965397834777832, "learning_rate": 1.4677206851119897e-05, "loss": 0.8764, "step": 14140 }, { "epoch": 2.66, "grad_norm": 25.158184051513672, "learning_rate": 1.4673442499529458e-05, "loss": 0.7737, "step": 14150 }, { "epoch": 2.67, "grad_norm": 6.6749725341796875, "learning_rate": 1.466967814793902e-05, "loss": 0.587, "step": 14160 }, { "epoch": 2.67, "grad_norm": 11.725072860717773, "learning_rate": 1.4665913796348581e-05, "loss": 0.7353, "step": 14170 }, { "epoch": 2.67, "grad_norm": 17.462539672851562, "learning_rate": 1.4662149444758143e-05, "loss": 0.9037, "step": 14180 }, { "epoch": 2.67, "grad_norm": 7.446224212646484, "learning_rate": 1.4658385093167704e-05, "loss": 0.7478, "step": 14190 }, { "epoch": 2.67, "grad_norm": 26.2742919921875, "learning_rate": 1.4654620741577264e-05, "loss": 0.8547, "step": 14200 }, { "epoch": 2.67, "grad_norm": 26.45722770690918, "learning_rate": 1.4650856389986825e-05, "loss": 1.2598, "step": 14210 }, { "epoch": 2.68, "grad_norm": 30.59162139892578, "learning_rate": 1.4647092038396387e-05, "loss": 0.9227, "step": 14220 }, { "epoch": 2.68, "grad_norm": 2.223261594772339, "learning_rate": 1.4643327686805948e-05, "loss": 0.6894, "step": 14230 }, { "epoch": 2.68, "grad_norm": 12.304634094238281, "learning_rate": 1.463956333521551e-05, "loss": 0.938, "step": 14240 }, { "epoch": 2.68, "grad_norm": 18.860149383544922, "learning_rate": 1.4635798983625071e-05, "loss": 0.7099, "step": 14250 }, { "epoch": 2.68, "grad_norm": 26.4725399017334, "learning_rate": 1.4632034632034633e-05, "loss": 0.6649, "step": 14260 }, { "epoch": 2.69, "grad_norm": 16.648344039916992, "learning_rate": 1.4628270280444194e-05, "loss": 0.7711, "step": 14270 }, { "epoch": 2.69, "grad_norm": 10.517045974731445, "learning_rate": 1.4624505928853756e-05, "loss": 0.9333, "step": 14280 }, { "epoch": 2.69, "grad_norm": 11.410792350769043, "learning_rate": 1.4620741577263317e-05, "loss": 0.5923, "step": 14290 }, { "epoch": 2.69, "grad_norm": 16.357027053833008, "learning_rate": 1.4616977225672879e-05, "loss": 0.9859, "step": 14300 }, { "epoch": 2.69, "grad_norm": 16.898576736450195, "learning_rate": 1.461321287408244e-05, "loss": 0.5869, "step": 14310 }, { "epoch": 2.7, "grad_norm": 24.384197235107422, "learning_rate": 1.4609448522492003e-05, "loss": 1.1913, "step": 14320 }, { "epoch": 2.7, "grad_norm": 19.52752685546875, "learning_rate": 1.4605684170901565e-05, "loss": 0.4424, "step": 14330 }, { "epoch": 2.7, "grad_norm": 24.645038604736328, "learning_rate": 1.4601919819311126e-05, "loss": 0.8484, "step": 14340 }, { "epoch": 2.7, "grad_norm": 6.663630485534668, "learning_rate": 1.4598155467720688e-05, "loss": 0.8926, "step": 14350 }, { "epoch": 2.7, "grad_norm": 36.7334098815918, "learning_rate": 1.4594391116130249e-05, "loss": 0.8592, "step": 14360 }, { "epoch": 2.7, "grad_norm": 8.041152954101562, "learning_rate": 1.459062676453981e-05, "loss": 0.4931, "step": 14370 }, { "epoch": 2.71, "grad_norm": 4.520535469055176, "learning_rate": 1.458686241294937e-05, "loss": 0.6731, "step": 14380 }, { "epoch": 2.71, "grad_norm": 18.924875259399414, "learning_rate": 1.4583098061358932e-05, "loss": 0.6853, "step": 14390 }, { "epoch": 2.71, "grad_norm": 8.557247161865234, "learning_rate": 1.4579333709768493e-05, "loss": 0.7192, "step": 14400 }, { "epoch": 2.71, "grad_norm": 14.18856143951416, "learning_rate": 1.4575569358178055e-05, "loss": 0.6096, "step": 14410 }, { "epoch": 2.71, "grad_norm": 8.3154296875, "learning_rate": 1.4571805006587616e-05, "loss": 0.6547, "step": 14420 }, { "epoch": 2.72, "grad_norm": 22.346403121948242, "learning_rate": 1.4568040654997178e-05, "loss": 0.6901, "step": 14430 }, { "epoch": 2.72, "grad_norm": 26.471521377563477, "learning_rate": 1.4564276303406739e-05, "loss": 0.8754, "step": 14440 }, { "epoch": 2.72, "grad_norm": 18.310527801513672, "learning_rate": 1.45605119518163e-05, "loss": 0.6152, "step": 14450 }, { "epoch": 2.72, "grad_norm": 9.209930419921875, "learning_rate": 1.4556747600225862e-05, "loss": 0.8826, "step": 14460 }, { "epoch": 2.72, "grad_norm": 7.652730464935303, "learning_rate": 1.4552983248635423e-05, "loss": 0.4942, "step": 14470 }, { "epoch": 2.73, "grad_norm": 0.7166558504104614, "learning_rate": 1.4549218897044985e-05, "loss": 0.6347, "step": 14480 }, { "epoch": 2.73, "grad_norm": 23.243921279907227, "learning_rate": 1.4545454545454546e-05, "loss": 0.9109, "step": 14490 }, { "epoch": 2.73, "grad_norm": 24.10265350341797, "learning_rate": 1.4541690193864108e-05, "loss": 0.5445, "step": 14500 }, { "epoch": 2.73, "grad_norm": 7.200760364532471, "learning_rate": 1.4537925842273671e-05, "loss": 0.8351, "step": 14510 }, { "epoch": 2.73, "grad_norm": 23.413759231567383, "learning_rate": 1.4534161490683232e-05, "loss": 0.9889, "step": 14520 }, { "epoch": 2.73, "grad_norm": 11.395479202270508, "learning_rate": 1.4530397139092794e-05, "loss": 0.548, "step": 14530 }, { "epoch": 2.74, "grad_norm": 18.534324645996094, "learning_rate": 1.4526632787502355e-05, "loss": 0.5068, "step": 14540 }, { "epoch": 2.74, "grad_norm": 22.6841983795166, "learning_rate": 1.4522868435911915e-05, "loss": 0.933, "step": 14550 }, { "epoch": 2.74, "grad_norm": 26.522497177124023, "learning_rate": 1.4519104084321477e-05, "loss": 0.9139, "step": 14560 }, { "epoch": 2.74, "grad_norm": 0.6627997756004333, "learning_rate": 1.4515339732731038e-05, "loss": 0.7262, "step": 14570 }, { "epoch": 2.74, "grad_norm": 8.96130084991455, "learning_rate": 1.45115753811406e-05, "loss": 0.8326, "step": 14580 }, { "epoch": 2.75, "grad_norm": 8.547167778015137, "learning_rate": 1.4507811029550161e-05, "loss": 0.8872, "step": 14590 }, { "epoch": 2.75, "grad_norm": 8.127227783203125, "learning_rate": 1.4504046677959722e-05, "loss": 0.7324, "step": 14600 }, { "epoch": 2.75, "grad_norm": 33.58380126953125, "learning_rate": 1.4500282326369284e-05, "loss": 0.8574, "step": 14610 }, { "epoch": 2.75, "grad_norm": 3.6204919815063477, "learning_rate": 1.4496517974778845e-05, "loss": 0.6359, "step": 14620 }, { "epoch": 2.75, "grad_norm": 32.16131591796875, "learning_rate": 1.4492753623188407e-05, "loss": 0.9616, "step": 14630 }, { "epoch": 2.76, "grad_norm": 12.819900512695312, "learning_rate": 1.4488989271597968e-05, "loss": 0.6547, "step": 14640 }, { "epoch": 2.76, "grad_norm": 22.329343795776367, "learning_rate": 1.448522492000753e-05, "loss": 0.5535, "step": 14650 }, { "epoch": 2.76, "grad_norm": 24.37099266052246, "learning_rate": 1.4481460568417091e-05, "loss": 0.935, "step": 14660 }, { "epoch": 2.76, "grad_norm": 24.315109252929688, "learning_rate": 1.4477696216826653e-05, "loss": 0.8781, "step": 14670 }, { "epoch": 2.76, "grad_norm": 2.761948823928833, "learning_rate": 1.4473931865236214e-05, "loss": 0.7095, "step": 14680 }, { "epoch": 2.76, "grad_norm": 20.340417861938477, "learning_rate": 1.4470167513645777e-05, "loss": 0.9713, "step": 14690 }, { "epoch": 2.77, "grad_norm": 14.535849571228027, "learning_rate": 1.4466403162055339e-05, "loss": 0.5641, "step": 14700 }, { "epoch": 2.77, "grad_norm": 18.312532424926758, "learning_rate": 1.44626388104649e-05, "loss": 0.8118, "step": 14710 }, { "epoch": 2.77, "grad_norm": 15.691104888916016, "learning_rate": 1.4458874458874458e-05, "loss": 0.6316, "step": 14720 }, { "epoch": 2.77, "grad_norm": 10.579209327697754, "learning_rate": 1.445511010728402e-05, "loss": 0.4896, "step": 14730 }, { "epoch": 2.77, "grad_norm": 13.167963981628418, "learning_rate": 1.4451345755693583e-05, "loss": 0.9028, "step": 14740 }, { "epoch": 2.78, "grad_norm": 18.209197998046875, "learning_rate": 1.4447581404103144e-05, "loss": 0.7617, "step": 14750 }, { "epoch": 2.78, "grad_norm": 7.098175525665283, "learning_rate": 1.4443817052512706e-05, "loss": 0.9601, "step": 14760 }, { "epoch": 2.78, "grad_norm": 29.88236427307129, "learning_rate": 1.4440052700922267e-05, "loss": 0.9673, "step": 14770 }, { "epoch": 2.78, "grad_norm": 14.183192253112793, "learning_rate": 1.4436288349331829e-05, "loss": 0.9759, "step": 14780 }, { "epoch": 2.78, "grad_norm": 1.7883967161178589, "learning_rate": 1.443252399774139e-05, "loss": 0.4751, "step": 14790 }, { "epoch": 2.79, "grad_norm": 25.07923698425293, "learning_rate": 1.4428759646150952e-05, "loss": 0.8102, "step": 14800 }, { "epoch": 2.79, "grad_norm": 0.43187662959098816, "learning_rate": 1.4424995294560513e-05, "loss": 0.6616, "step": 14810 }, { "epoch": 2.79, "grad_norm": 3.4543023109436035, "learning_rate": 1.4421230942970075e-05, "loss": 0.8503, "step": 14820 }, { "epoch": 2.79, "grad_norm": 9.902512550354004, "learning_rate": 1.4417466591379636e-05, "loss": 0.7454, "step": 14830 }, { "epoch": 2.79, "grad_norm": 14.120229721069336, "learning_rate": 1.4413702239789197e-05, "loss": 0.7271, "step": 14840 }, { "epoch": 2.8, "grad_norm": 42.34109878540039, "learning_rate": 1.4409937888198759e-05, "loss": 0.6191, "step": 14850 }, { "epoch": 2.8, "grad_norm": 8.808544158935547, "learning_rate": 1.440617353660832e-05, "loss": 0.7787, "step": 14860 }, { "epoch": 2.8, "grad_norm": 18.57027244567871, "learning_rate": 1.4402409185017882e-05, "loss": 0.6487, "step": 14870 }, { "epoch": 2.8, "grad_norm": 12.555221557617188, "learning_rate": 1.4398644833427445e-05, "loss": 0.5236, "step": 14880 }, { "epoch": 2.8, "grad_norm": 4.785314083099365, "learning_rate": 1.4394880481837006e-05, "loss": 0.6572, "step": 14890 }, { "epoch": 2.8, "grad_norm": 4.444519996643066, "learning_rate": 1.4391116130246565e-05, "loss": 0.4675, "step": 14900 }, { "epoch": 2.81, "grad_norm": 16.588285446166992, "learning_rate": 1.4387351778656126e-05, "loss": 0.9234, "step": 14910 }, { "epoch": 2.81, "grad_norm": 6.433644771575928, "learning_rate": 1.4383587427065689e-05, "loss": 0.7184, "step": 14920 }, { "epoch": 2.81, "grad_norm": 27.862564086914062, "learning_rate": 1.437982307547525e-05, "loss": 0.8789, "step": 14930 }, { "epoch": 2.81, "grad_norm": 8.804747581481934, "learning_rate": 1.4376058723884812e-05, "loss": 0.8987, "step": 14940 }, { "epoch": 2.81, "grad_norm": 15.815366744995117, "learning_rate": 1.4372294372294374e-05, "loss": 0.601, "step": 14950 }, { "epoch": 2.82, "grad_norm": 16.371736526489258, "learning_rate": 1.4368530020703935e-05, "loss": 0.9209, "step": 14960 }, { "epoch": 2.82, "grad_norm": 20.936168670654297, "learning_rate": 1.4364765669113496e-05, "loss": 0.6826, "step": 14970 }, { "epoch": 2.82, "grad_norm": 0.5109462141990662, "learning_rate": 1.4361001317523058e-05, "loss": 0.5467, "step": 14980 }, { "epoch": 2.82, "grad_norm": 15.911187171936035, "learning_rate": 1.435723696593262e-05, "loss": 0.7498, "step": 14990 }, { "epoch": 2.82, "grad_norm": 10.85964584350586, "learning_rate": 1.435347261434218e-05, "loss": 0.7946, "step": 15000 }, { "epoch": 2.83, "grad_norm": 12.648409843444824, "learning_rate": 1.4349708262751742e-05, "loss": 0.8198, "step": 15010 }, { "epoch": 2.83, "grad_norm": 25.404829025268555, "learning_rate": 1.4345943911161304e-05, "loss": 0.6896, "step": 15020 }, { "epoch": 2.83, "grad_norm": 21.891569137573242, "learning_rate": 1.4342179559570865e-05, "loss": 0.5017, "step": 15030 }, { "epoch": 2.83, "grad_norm": 8.692461967468262, "learning_rate": 1.4338415207980427e-05, "loss": 0.6032, "step": 15040 }, { "epoch": 2.83, "grad_norm": 6.623054027557373, "learning_rate": 1.4334650856389988e-05, "loss": 0.5454, "step": 15050 }, { "epoch": 2.83, "grad_norm": 14.266157150268555, "learning_rate": 1.4330886504799551e-05, "loss": 0.8083, "step": 15060 }, { "epoch": 2.84, "grad_norm": 21.779327392578125, "learning_rate": 1.432712215320911e-05, "loss": 0.7511, "step": 15070 }, { "epoch": 2.84, "grad_norm": 14.891936302185059, "learning_rate": 1.432335780161867e-05, "loss": 0.7927, "step": 15080 }, { "epoch": 2.84, "grad_norm": 3.073430061340332, "learning_rate": 1.4319593450028232e-05, "loss": 0.6556, "step": 15090 }, { "epoch": 2.84, "grad_norm": 13.47904109954834, "learning_rate": 1.4315829098437794e-05, "loss": 0.8525, "step": 15100 }, { "epoch": 2.84, "grad_norm": 22.618297576904297, "learning_rate": 1.4312064746847357e-05, "loss": 0.7522, "step": 15110 }, { "epoch": 2.85, "grad_norm": 21.820899963378906, "learning_rate": 1.4308300395256918e-05, "loss": 1.0761, "step": 15120 }, { "epoch": 2.85, "grad_norm": 0.3761250078678131, "learning_rate": 1.430453604366648e-05, "loss": 0.5225, "step": 15130 }, { "epoch": 2.85, "grad_norm": 4.551104545593262, "learning_rate": 1.4300771692076041e-05, "loss": 0.7417, "step": 15140 }, { "epoch": 2.85, "grad_norm": 17.213136672973633, "learning_rate": 1.4297007340485603e-05, "loss": 0.7196, "step": 15150 }, { "epoch": 2.85, "grad_norm": 29.140161514282227, "learning_rate": 1.4293242988895164e-05, "loss": 0.6101, "step": 15160 }, { "epoch": 2.86, "grad_norm": 23.715343475341797, "learning_rate": 1.4289478637304726e-05, "loss": 0.9656, "step": 15170 }, { "epoch": 2.86, "grad_norm": 16.146928787231445, "learning_rate": 1.4285714285714287e-05, "loss": 0.5416, "step": 15180 }, { "epoch": 2.86, "grad_norm": 23.387487411499023, "learning_rate": 1.4281949934123849e-05, "loss": 1.0218, "step": 15190 }, { "epoch": 2.86, "grad_norm": 19.16897201538086, "learning_rate": 1.427818558253341e-05, "loss": 0.7901, "step": 15200 }, { "epoch": 2.86, "grad_norm": 20.92551040649414, "learning_rate": 1.4274421230942972e-05, "loss": 1.0405, "step": 15210 }, { "epoch": 2.86, "grad_norm": 4.3972930908203125, "learning_rate": 1.4270656879352533e-05, "loss": 1.0935, "step": 15220 }, { "epoch": 2.87, "grad_norm": 9.615336418151855, "learning_rate": 1.4266892527762094e-05, "loss": 0.7947, "step": 15230 }, { "epoch": 2.87, "grad_norm": 12.70883560180664, "learning_rate": 1.4263128176171656e-05, "loss": 0.7954, "step": 15240 }, { "epoch": 2.87, "grad_norm": 19.39516830444336, "learning_rate": 1.4259363824581216e-05, "loss": 0.8201, "step": 15250 }, { "epoch": 2.87, "grad_norm": 16.59307098388672, "learning_rate": 1.4255599472990777e-05, "loss": 0.766, "step": 15260 }, { "epoch": 2.87, "grad_norm": 7.716126441955566, "learning_rate": 1.4251835121400339e-05, "loss": 0.7935, "step": 15270 }, { "epoch": 2.88, "grad_norm": 3.0978472232818604, "learning_rate": 1.42480707698099e-05, "loss": 0.7058, "step": 15280 }, { "epoch": 2.88, "grad_norm": 2.391120195388794, "learning_rate": 1.4244306418219463e-05, "loss": 0.88, "step": 15290 }, { "epoch": 2.88, "grad_norm": 16.929407119750977, "learning_rate": 1.4240542066629025e-05, "loss": 0.6826, "step": 15300 }, { "epoch": 2.88, "grad_norm": 15.712681770324707, "learning_rate": 1.4236777715038586e-05, "loss": 0.9275, "step": 15310 }, { "epoch": 2.88, "grad_norm": 2.698317527770996, "learning_rate": 1.4233013363448148e-05, "loss": 0.7053, "step": 15320 }, { "epoch": 2.89, "grad_norm": 4.682209014892578, "learning_rate": 1.4229249011857709e-05, "loss": 0.3361, "step": 15330 }, { "epoch": 2.89, "grad_norm": 18.427043914794922, "learning_rate": 1.422548466026727e-05, "loss": 0.5728, "step": 15340 }, { "epoch": 2.89, "grad_norm": 5.598134994506836, "learning_rate": 1.4221720308676832e-05, "loss": 0.6496, "step": 15350 }, { "epoch": 2.89, "grad_norm": 13.333558082580566, "learning_rate": 1.4217955957086393e-05, "loss": 0.7265, "step": 15360 }, { "epoch": 2.89, "grad_norm": 15.676292419433594, "learning_rate": 1.4214191605495955e-05, "loss": 0.6903, "step": 15370 }, { "epoch": 2.89, "grad_norm": 15.788107872009277, "learning_rate": 1.4210427253905516e-05, "loss": 0.7214, "step": 15380 }, { "epoch": 2.9, "grad_norm": 18.944698333740234, "learning_rate": 1.4206662902315078e-05, "loss": 0.6256, "step": 15390 }, { "epoch": 2.9, "grad_norm": 17.18710708618164, "learning_rate": 1.420289855072464e-05, "loss": 0.9801, "step": 15400 }, { "epoch": 2.9, "grad_norm": 24.525888442993164, "learning_rate": 1.41991341991342e-05, "loss": 0.8311, "step": 15410 }, { "epoch": 2.9, "grad_norm": 15.199677467346191, "learning_rate": 1.419536984754376e-05, "loss": 0.6035, "step": 15420 }, { "epoch": 2.9, "grad_norm": 1.7926441431045532, "learning_rate": 1.4191605495953322e-05, "loss": 0.7877, "step": 15430 }, { "epoch": 2.91, "grad_norm": 10.499088287353516, "learning_rate": 1.4187841144362883e-05, "loss": 0.5538, "step": 15440 }, { "epoch": 2.91, "grad_norm": 3.2541844844818115, "learning_rate": 1.4184076792772445e-05, "loss": 0.5928, "step": 15450 }, { "epoch": 2.91, "grad_norm": 0.6842057108879089, "learning_rate": 1.4180312441182006e-05, "loss": 0.8708, "step": 15460 }, { "epoch": 2.91, "grad_norm": 11.153387069702148, "learning_rate": 1.4176548089591568e-05, "loss": 0.55, "step": 15470 }, { "epoch": 2.91, "grad_norm": 14.324100494384766, "learning_rate": 1.4172783738001131e-05, "loss": 0.9598, "step": 15480 }, { "epoch": 2.92, "grad_norm": 9.769564628601074, "learning_rate": 1.4169019386410692e-05, "loss": 0.6915, "step": 15490 }, { "epoch": 2.92, "grad_norm": 5.7416863441467285, "learning_rate": 1.4165255034820254e-05, "loss": 1.0917, "step": 15500 }, { "epoch": 2.92, "grad_norm": 8.69757080078125, "learning_rate": 1.4161490683229815e-05, "loss": 0.5941, "step": 15510 }, { "epoch": 2.92, "grad_norm": 5.992401123046875, "learning_rate": 1.4157726331639377e-05, "loss": 0.5866, "step": 15520 }, { "epoch": 2.92, "grad_norm": 18.785144805908203, "learning_rate": 1.4153961980048938e-05, "loss": 0.721, "step": 15530 }, { "epoch": 2.92, "grad_norm": 18.443172454833984, "learning_rate": 1.41501976284585e-05, "loss": 0.513, "step": 15540 }, { "epoch": 2.93, "grad_norm": 8.198022842407227, "learning_rate": 1.4146433276868061e-05, "loss": 0.8079, "step": 15550 }, { "epoch": 2.93, "grad_norm": 8.908957481384277, "learning_rate": 1.4142668925277623e-05, "loss": 0.656, "step": 15560 }, { "epoch": 2.93, "grad_norm": 16.126298904418945, "learning_rate": 1.4138904573687184e-05, "loss": 0.9063, "step": 15570 }, { "epoch": 2.93, "grad_norm": 20.169885635375977, "learning_rate": 1.4135140222096746e-05, "loss": 0.741, "step": 15580 }, { "epoch": 2.93, "grad_norm": 0.3554701507091522, "learning_rate": 1.4131375870506307e-05, "loss": 0.614, "step": 15590 }, { "epoch": 2.94, "grad_norm": 9.904111862182617, "learning_rate": 1.4127611518915867e-05, "loss": 0.8123, "step": 15600 }, { "epoch": 2.94, "grad_norm": 24.91057586669922, "learning_rate": 1.4123847167325428e-05, "loss": 0.7551, "step": 15610 }, { "epoch": 2.94, "grad_norm": 5.963438510894775, "learning_rate": 1.412008281573499e-05, "loss": 0.6714, "step": 15620 }, { "epoch": 2.94, "grad_norm": 15.89360523223877, "learning_rate": 1.4116318464144551e-05, "loss": 0.8974, "step": 15630 }, { "epoch": 2.94, "grad_norm": 12.206485748291016, "learning_rate": 1.4112554112554113e-05, "loss": 1.0773, "step": 15640 }, { "epoch": 2.95, "grad_norm": 2.8578970432281494, "learning_rate": 1.4108789760963674e-05, "loss": 0.5777, "step": 15650 }, { "epoch": 2.95, "grad_norm": 3.7192718982696533, "learning_rate": 1.4105025409373236e-05, "loss": 0.6443, "step": 15660 }, { "epoch": 2.95, "grad_norm": 15.652264595031738, "learning_rate": 1.4101261057782799e-05, "loss": 0.7183, "step": 15670 }, { "epoch": 2.95, "grad_norm": 4.038358688354492, "learning_rate": 1.409749670619236e-05, "loss": 0.3951, "step": 15680 }, { "epoch": 2.95, "grad_norm": 23.8483829498291, "learning_rate": 1.4093732354601922e-05, "loss": 0.7791, "step": 15690 }, { "epoch": 2.96, "grad_norm": 3.3910269737243652, "learning_rate": 1.4089968003011483e-05, "loss": 0.5433, "step": 15700 }, { "epoch": 2.96, "grad_norm": 3.8397445678710938, "learning_rate": 1.4086203651421045e-05, "loss": 0.7548, "step": 15710 }, { "epoch": 2.96, "grad_norm": 11.651528358459473, "learning_rate": 1.4082439299830606e-05, "loss": 0.4432, "step": 15720 }, { "epoch": 2.96, "grad_norm": 11.280261039733887, "learning_rate": 1.4078674948240167e-05, "loss": 0.5902, "step": 15730 }, { "epoch": 2.96, "grad_norm": 22.70057487487793, "learning_rate": 1.4074910596649729e-05, "loss": 0.5237, "step": 15740 }, { "epoch": 2.96, "grad_norm": 12.416793823242188, "learning_rate": 1.407114624505929e-05, "loss": 0.5404, "step": 15750 }, { "epoch": 2.97, "grad_norm": 23.81106948852539, "learning_rate": 1.4067381893468852e-05, "loss": 0.7493, "step": 15760 }, { "epoch": 2.97, "grad_norm": 21.926897048950195, "learning_rate": 1.4063617541878412e-05, "loss": 0.8389, "step": 15770 }, { "epoch": 2.97, "grad_norm": 4.98796272277832, "learning_rate": 1.4059853190287973e-05, "loss": 0.7776, "step": 15780 }, { "epoch": 2.97, "grad_norm": 3.577707052230835, "learning_rate": 1.4056088838697535e-05, "loss": 0.3931, "step": 15790 }, { "epoch": 2.97, "grad_norm": 6.863688945770264, "learning_rate": 1.4052324487107096e-05, "loss": 0.8432, "step": 15800 }, { "epoch": 2.98, "grad_norm": 1.5216230154037476, "learning_rate": 1.4048560135516657e-05, "loss": 0.8054, "step": 15810 }, { "epoch": 2.98, "grad_norm": 30.641324996948242, "learning_rate": 1.4044795783926219e-05, "loss": 0.7257, "step": 15820 }, { "epoch": 2.98, "grad_norm": 10.028465270996094, "learning_rate": 1.404103143233578e-05, "loss": 0.9489, "step": 15830 }, { "epoch": 2.98, "grad_norm": 27.563535690307617, "learning_rate": 1.4037267080745342e-05, "loss": 0.8339, "step": 15840 }, { "epoch": 2.98, "grad_norm": 8.942625045776367, "learning_rate": 1.4033502729154905e-05, "loss": 0.7258, "step": 15850 }, { "epoch": 2.99, "grad_norm": 16.081558227539062, "learning_rate": 1.4029738377564466e-05, "loss": 0.6341, "step": 15860 }, { "epoch": 2.99, "grad_norm": 6.025300025939941, "learning_rate": 1.4025974025974028e-05, "loss": 0.6362, "step": 15870 }, { "epoch": 2.99, "grad_norm": 15.980204582214355, "learning_rate": 1.402220967438359e-05, "loss": 0.8119, "step": 15880 }, { "epoch": 2.99, "grad_norm": 34.32817840576172, "learning_rate": 1.401844532279315e-05, "loss": 0.8354, "step": 15890 }, { "epoch": 2.99, "grad_norm": 9.960022926330566, "learning_rate": 1.4014680971202712e-05, "loss": 0.727, "step": 15900 }, { "epoch": 2.99, "grad_norm": 17.459871292114258, "learning_rate": 1.4010916619612274e-05, "loss": 0.4824, "step": 15910 }, { "epoch": 3.0, "grad_norm": 7.697139263153076, "learning_rate": 1.4007152268021835e-05, "loss": 0.4821, "step": 15920 }, { "epoch": 3.0, "grad_norm": 21.81731605529785, "learning_rate": 1.4003387916431397e-05, "loss": 0.6092, "step": 15930 }, { "epoch": 3.0, "eval_accuracy": 0.9081333333333333, "eval_loss": 0.3438812494277954, "eval_runtime": 51.1646, "eval_samples_per_second": 146.586, "eval_steps_per_second": 18.333, "step": 15939 }, { "epoch": 3.0, "grad_norm": 30.34982681274414, "learning_rate": 1.3999623564840956e-05, "loss": 1.0665, "step": 15940 }, { "epoch": 3.0, "grad_norm": 11.997414588928223, "learning_rate": 1.3995859213250518e-05, "loss": 0.8418, "step": 15950 }, { "epoch": 3.0, "grad_norm": 12.47726058959961, "learning_rate": 1.399209486166008e-05, "loss": 0.6524, "step": 15960 }, { "epoch": 3.01, "grad_norm": 5.827723026275635, "learning_rate": 1.398833051006964e-05, "loss": 0.6862, "step": 15970 }, { "epoch": 3.01, "grad_norm": 11.972338676452637, "learning_rate": 1.3984566158479202e-05, "loss": 0.6641, "step": 15980 }, { "epoch": 3.01, "grad_norm": 16.457120895385742, "learning_rate": 1.3980801806888764e-05, "loss": 0.9417, "step": 15990 }, { "epoch": 3.01, "grad_norm": 11.473155975341797, "learning_rate": 1.3977037455298325e-05, "loss": 0.6272, "step": 16000 }, { "epoch": 3.01, "grad_norm": 26.96430778503418, "learning_rate": 1.3973273103707887e-05, "loss": 0.3731, "step": 16010 }, { "epoch": 3.02, "grad_norm": 12.995346069335938, "learning_rate": 1.3969508752117448e-05, "loss": 0.7276, "step": 16020 }, { "epoch": 3.02, "grad_norm": 9.100690841674805, "learning_rate": 1.396574440052701e-05, "loss": 0.4041, "step": 16030 }, { "epoch": 3.02, "grad_norm": 29.41590690612793, "learning_rate": 1.3961980048936573e-05, "loss": 0.5657, "step": 16040 }, { "epoch": 3.02, "grad_norm": 1.4971169233322144, "learning_rate": 1.3958215697346134e-05, "loss": 0.8166, "step": 16050 }, { "epoch": 3.02, "grad_norm": 27.66693687438965, "learning_rate": 1.3954451345755696e-05, "loss": 0.5622, "step": 16060 }, { "epoch": 3.02, "grad_norm": 19.55706214904785, "learning_rate": 1.3950686994165257e-05, "loss": 0.4681, "step": 16070 }, { "epoch": 3.03, "grad_norm": 8.878369331359863, "learning_rate": 1.3946922642574819e-05, "loss": 0.4866, "step": 16080 }, { "epoch": 3.03, "grad_norm": 1.7334586381912231, "learning_rate": 1.394315829098438e-05, "loss": 0.52, "step": 16090 }, { "epoch": 3.03, "grad_norm": 11.960335731506348, "learning_rate": 1.3939393939393942e-05, "loss": 1.061, "step": 16100 }, { "epoch": 3.03, "grad_norm": 9.888663291931152, "learning_rate": 1.3935629587803503e-05, "loss": 0.6879, "step": 16110 }, { "epoch": 3.03, "grad_norm": 5.383222579956055, "learning_rate": 1.3931865236213063e-05, "loss": 0.8313, "step": 16120 }, { "epoch": 3.04, "grad_norm": 0.5276015996932983, "learning_rate": 1.3928100884622624e-05, "loss": 0.6315, "step": 16130 }, { "epoch": 3.04, "grad_norm": 12.056757926940918, "learning_rate": 1.3924336533032186e-05, "loss": 0.5598, "step": 16140 }, { "epoch": 3.04, "grad_norm": 6.8872971534729, "learning_rate": 1.3920572181441747e-05, "loss": 0.3837, "step": 16150 }, { "epoch": 3.04, "grad_norm": 3.6625053882598877, "learning_rate": 1.3916807829851309e-05, "loss": 0.6365, "step": 16160 }, { "epoch": 3.04, "grad_norm": 0.40750113129615784, "learning_rate": 1.391304347826087e-05, "loss": 0.545, "step": 16170 }, { "epoch": 3.05, "grad_norm": 5.1755170822143555, "learning_rate": 1.3909279126670432e-05, "loss": 0.5662, "step": 16180 }, { "epoch": 3.05, "grad_norm": 27.666248321533203, "learning_rate": 1.3905514775079993e-05, "loss": 0.7077, "step": 16190 }, { "epoch": 3.05, "grad_norm": 6.176600456237793, "learning_rate": 1.3901750423489554e-05, "loss": 0.5436, "step": 16200 }, { "epoch": 3.05, "grad_norm": 40.43038558959961, "learning_rate": 1.3897986071899116e-05, "loss": 0.6344, "step": 16210 }, { "epoch": 3.05, "grad_norm": 13.843911170959473, "learning_rate": 1.3894221720308679e-05, "loss": 0.7547, "step": 16220 }, { "epoch": 3.05, "grad_norm": 17.818954467773438, "learning_rate": 1.389045736871824e-05, "loss": 0.8285, "step": 16230 }, { "epoch": 3.06, "grad_norm": 23.41729164123535, "learning_rate": 1.3886693017127802e-05, "loss": 0.5006, "step": 16240 }, { "epoch": 3.06, "grad_norm": 24.877290725708008, "learning_rate": 1.3882928665537363e-05, "loss": 1.136, "step": 16250 }, { "epoch": 3.06, "grad_norm": 12.05526065826416, "learning_rate": 1.3879164313946925e-05, "loss": 0.703, "step": 16260 }, { "epoch": 3.06, "grad_norm": 12.669425964355469, "learning_rate": 1.3875399962356486e-05, "loss": 0.7638, "step": 16270 }, { "epoch": 3.06, "grad_norm": 4.3127899169921875, "learning_rate": 1.3871635610766048e-05, "loss": 0.8595, "step": 16280 }, { "epoch": 3.07, "grad_norm": 27.62041473388672, "learning_rate": 1.3867871259175608e-05, "loss": 0.8843, "step": 16290 }, { "epoch": 3.07, "grad_norm": 19.740711212158203, "learning_rate": 1.3864106907585169e-05, "loss": 0.6221, "step": 16300 }, { "epoch": 3.07, "grad_norm": 4.2288312911987305, "learning_rate": 1.386034255599473e-05, "loss": 0.6524, "step": 16310 }, { "epoch": 3.07, "grad_norm": 15.323369026184082, "learning_rate": 1.3856578204404292e-05, "loss": 0.9442, "step": 16320 }, { "epoch": 3.07, "grad_norm": 7.15553617477417, "learning_rate": 1.3852813852813853e-05, "loss": 0.6764, "step": 16330 }, { "epoch": 3.08, "grad_norm": 22.282184600830078, "learning_rate": 1.3849049501223415e-05, "loss": 0.6296, "step": 16340 }, { "epoch": 3.08, "grad_norm": 20.944698333740234, "learning_rate": 1.3845285149632976e-05, "loss": 0.6328, "step": 16350 }, { "epoch": 3.08, "grad_norm": 11.877099990844727, "learning_rate": 1.3841520798042538e-05, "loss": 0.668, "step": 16360 }, { "epoch": 3.08, "grad_norm": 7.8392863273620605, "learning_rate": 1.38377564464521e-05, "loss": 0.4543, "step": 16370 }, { "epoch": 3.08, "grad_norm": 19.019250869750977, "learning_rate": 1.383399209486166e-05, "loss": 0.8038, "step": 16380 }, { "epoch": 3.08, "grad_norm": 0.9338988065719604, "learning_rate": 1.3830227743271222e-05, "loss": 0.4974, "step": 16390 }, { "epoch": 3.09, "grad_norm": 0.640573263168335, "learning_rate": 1.3826463391680784e-05, "loss": 0.6984, "step": 16400 }, { "epoch": 3.09, "grad_norm": 15.033812522888184, "learning_rate": 1.3822699040090347e-05, "loss": 0.9574, "step": 16410 }, { "epoch": 3.09, "grad_norm": 12.06335163116455, "learning_rate": 1.3818934688499908e-05, "loss": 0.6254, "step": 16420 }, { "epoch": 3.09, "grad_norm": 4.687410354614258, "learning_rate": 1.381517033690947e-05, "loss": 0.6764, "step": 16430 }, { "epoch": 3.09, "grad_norm": 27.31790542602539, "learning_rate": 1.3811405985319031e-05, "loss": 0.5824, "step": 16440 }, { "epoch": 3.1, "grad_norm": 9.797171592712402, "learning_rate": 1.3807641633728593e-05, "loss": 0.4788, "step": 16450 }, { "epoch": 3.1, "grad_norm": 17.67564582824707, "learning_rate": 1.3803877282138154e-05, "loss": 1.209, "step": 16460 }, { "epoch": 3.1, "grad_norm": 0.271230548620224, "learning_rate": 1.3800112930547714e-05, "loss": 0.4775, "step": 16470 }, { "epoch": 3.1, "grad_norm": 33.21747589111328, "learning_rate": 1.3796348578957275e-05, "loss": 1.0736, "step": 16480 }, { "epoch": 3.1, "grad_norm": 88.40038299560547, "learning_rate": 1.3792584227366837e-05, "loss": 0.8484, "step": 16490 }, { "epoch": 3.11, "grad_norm": 17.692691802978516, "learning_rate": 1.3788819875776398e-05, "loss": 0.7804, "step": 16500 }, { "epoch": 3.11, "grad_norm": 9.918412208557129, "learning_rate": 1.378505552418596e-05, "loss": 0.7316, "step": 16510 }, { "epoch": 3.11, "grad_norm": 5.417355537414551, "learning_rate": 1.3781291172595521e-05, "loss": 0.6182, "step": 16520 }, { "epoch": 3.11, "grad_norm": 11.936298370361328, "learning_rate": 1.3777526821005083e-05, "loss": 0.4077, "step": 16530 }, { "epoch": 3.11, "grad_norm": 13.960946083068848, "learning_rate": 1.3773762469414644e-05, "loss": 0.526, "step": 16540 }, { "epoch": 3.12, "grad_norm": 22.762529373168945, "learning_rate": 1.3769998117824206e-05, "loss": 0.6749, "step": 16550 }, { "epoch": 3.12, "grad_norm": 14.109274864196777, "learning_rate": 1.3766233766233767e-05, "loss": 0.5186, "step": 16560 }, { "epoch": 3.12, "grad_norm": 35.54779815673828, "learning_rate": 1.3762469414643328e-05, "loss": 0.7046, "step": 16570 }, { "epoch": 3.12, "grad_norm": 12.966293334960938, "learning_rate": 1.375870506305289e-05, "loss": 0.7402, "step": 16580 }, { "epoch": 3.12, "grad_norm": 8.81118392944336, "learning_rate": 1.3754940711462453e-05, "loss": 0.6679, "step": 16590 }, { "epoch": 3.12, "grad_norm": 0.5349300503730774, "learning_rate": 1.3751176359872015e-05, "loss": 0.6342, "step": 16600 }, { "epoch": 3.13, "grad_norm": 14.064860343933105, "learning_rate": 1.3747412008281576e-05, "loss": 0.8408, "step": 16610 }, { "epoch": 3.13, "grad_norm": 13.386101722717285, "learning_rate": 1.3743647656691137e-05, "loss": 0.6574, "step": 16620 }, { "epoch": 3.13, "grad_norm": 0.2948550879955292, "learning_rate": 1.3739883305100699e-05, "loss": 0.8882, "step": 16630 }, { "epoch": 3.13, "grad_norm": 8.500995635986328, "learning_rate": 1.3736118953510259e-05, "loss": 0.6037, "step": 16640 }, { "epoch": 3.13, "grad_norm": 18.741573333740234, "learning_rate": 1.373235460191982e-05, "loss": 0.629, "step": 16650 }, { "epoch": 3.14, "grad_norm": 0.3394733667373657, "learning_rate": 1.3728590250329382e-05, "loss": 0.7729, "step": 16660 }, { "epoch": 3.14, "grad_norm": 11.697025299072266, "learning_rate": 1.3724825898738943e-05, "loss": 0.512, "step": 16670 }, { "epoch": 3.14, "grad_norm": 9.116700172424316, "learning_rate": 1.3721061547148505e-05, "loss": 0.6958, "step": 16680 }, { "epoch": 3.14, "grad_norm": 59.24357223510742, "learning_rate": 1.3717297195558066e-05, "loss": 0.9625, "step": 16690 }, { "epoch": 3.14, "grad_norm": 20.930126190185547, "learning_rate": 1.3713532843967627e-05, "loss": 0.7139, "step": 16700 }, { "epoch": 3.15, "grad_norm": 46.4968147277832, "learning_rate": 1.3709768492377189e-05, "loss": 0.8418, "step": 16710 }, { "epoch": 3.15, "grad_norm": 0.5635331273078918, "learning_rate": 1.370600414078675e-05, "loss": 0.7676, "step": 16720 }, { "epoch": 3.15, "grad_norm": 0.2128181755542755, "learning_rate": 1.3702239789196312e-05, "loss": 0.9451, "step": 16730 }, { "epoch": 3.15, "grad_norm": 21.21966552734375, "learning_rate": 1.3698475437605873e-05, "loss": 0.5834, "step": 16740 }, { "epoch": 3.15, "grad_norm": 37.33884811401367, "learning_rate": 1.3694711086015435e-05, "loss": 0.8985, "step": 16750 }, { "epoch": 3.15, "grad_norm": 7.059642314910889, "learning_rate": 1.3690946734424996e-05, "loss": 0.7002, "step": 16760 }, { "epoch": 3.16, "grad_norm": 14.130393981933594, "learning_rate": 1.3687182382834558e-05, "loss": 0.8018, "step": 16770 }, { "epoch": 3.16, "grad_norm": 15.686850547790527, "learning_rate": 1.368341803124412e-05, "loss": 0.7179, "step": 16780 }, { "epoch": 3.16, "grad_norm": 7.120452404022217, "learning_rate": 1.3679653679653682e-05, "loss": 0.6912, "step": 16790 }, { "epoch": 3.16, "grad_norm": 15.659395217895508, "learning_rate": 1.3675889328063244e-05, "loss": 0.8925, "step": 16800 }, { "epoch": 3.16, "grad_norm": 27.15504264831543, "learning_rate": 1.3672124976472802e-05, "loss": 0.4575, "step": 16810 }, { "epoch": 3.17, "grad_norm": 7.112409591674805, "learning_rate": 1.3668360624882365e-05, "loss": 0.7047, "step": 16820 }, { "epoch": 3.17, "grad_norm": 21.725502014160156, "learning_rate": 1.3664596273291926e-05, "loss": 0.5739, "step": 16830 }, { "epoch": 3.17, "grad_norm": 17.80892562866211, "learning_rate": 1.3660831921701488e-05, "loss": 0.8959, "step": 16840 }, { "epoch": 3.17, "grad_norm": 1.4555948972702026, "learning_rate": 1.365706757011105e-05, "loss": 0.4714, "step": 16850 }, { "epoch": 3.17, "grad_norm": 2.813807725906372, "learning_rate": 1.365330321852061e-05, "loss": 0.5298, "step": 16860 }, { "epoch": 3.18, "grad_norm": 15.255325317382812, "learning_rate": 1.3649538866930172e-05, "loss": 0.7289, "step": 16870 }, { "epoch": 3.18, "grad_norm": 10.276988983154297, "learning_rate": 1.3645774515339734e-05, "loss": 0.3811, "step": 16880 }, { "epoch": 3.18, "grad_norm": 4.02056884765625, "learning_rate": 1.3642010163749295e-05, "loss": 0.7805, "step": 16890 }, { "epoch": 3.18, "grad_norm": 20.98756217956543, "learning_rate": 1.3638245812158857e-05, "loss": 0.7219, "step": 16900 }, { "epoch": 3.18, "grad_norm": 1.9777402877807617, "learning_rate": 1.3634481460568418e-05, "loss": 0.4574, "step": 16910 }, { "epoch": 3.18, "grad_norm": 17.15428924560547, "learning_rate": 1.363071710897798e-05, "loss": 0.6216, "step": 16920 }, { "epoch": 3.19, "grad_norm": 26.12958335876465, "learning_rate": 1.3626952757387541e-05, "loss": 0.5892, "step": 16930 }, { "epoch": 3.19, "grad_norm": 2.257452964782715, "learning_rate": 1.3623188405797103e-05, "loss": 0.6007, "step": 16940 }, { "epoch": 3.19, "grad_norm": 21.12044334411621, "learning_rate": 1.3619424054206664e-05, "loss": 0.6436, "step": 16950 }, { "epoch": 3.19, "grad_norm": 27.5797061920166, "learning_rate": 1.3615659702616225e-05, "loss": 0.6577, "step": 16960 }, { "epoch": 3.19, "grad_norm": 0.14470890164375305, "learning_rate": 1.3611895351025789e-05, "loss": 0.7087, "step": 16970 }, { "epoch": 3.2, "grad_norm": 25.093280792236328, "learning_rate": 1.360813099943535e-05, "loss": 0.8523, "step": 16980 }, { "epoch": 3.2, "grad_norm": 9.746846199035645, "learning_rate": 1.3604366647844908e-05, "loss": 0.6063, "step": 16990 }, { "epoch": 3.2, "grad_norm": 26.40338897705078, "learning_rate": 1.360060229625447e-05, "loss": 0.7969, "step": 17000 }, { "epoch": 3.2, "grad_norm": 16.754243850708008, "learning_rate": 1.3596837944664033e-05, "loss": 0.7463, "step": 17010 }, { "epoch": 3.2, "grad_norm": 30.17478370666504, "learning_rate": 1.3593073593073594e-05, "loss": 0.8098, "step": 17020 }, { "epoch": 3.21, "grad_norm": 10.640960693359375, "learning_rate": 1.3589309241483156e-05, "loss": 0.5837, "step": 17030 }, { "epoch": 3.21, "grad_norm": 21.33949089050293, "learning_rate": 1.3585544889892717e-05, "loss": 0.627, "step": 17040 }, { "epoch": 3.21, "grad_norm": 11.06455135345459, "learning_rate": 1.3581780538302279e-05, "loss": 0.6147, "step": 17050 }, { "epoch": 3.21, "grad_norm": 25.937108993530273, "learning_rate": 1.357801618671184e-05, "loss": 1.1128, "step": 17060 }, { "epoch": 3.21, "grad_norm": 6.419862270355225, "learning_rate": 1.3574251835121402e-05, "loss": 0.8132, "step": 17070 }, { "epoch": 3.21, "grad_norm": 26.92257308959961, "learning_rate": 1.3570487483530963e-05, "loss": 0.4862, "step": 17080 }, { "epoch": 3.22, "grad_norm": 10.812932968139648, "learning_rate": 1.3566723131940524e-05, "loss": 1.0484, "step": 17090 }, { "epoch": 3.22, "grad_norm": 23.167081832885742, "learning_rate": 1.3562958780350086e-05, "loss": 0.815, "step": 17100 }, { "epoch": 3.22, "grad_norm": 19.610149383544922, "learning_rate": 1.3559194428759647e-05, "loss": 0.788, "step": 17110 }, { "epoch": 3.22, "grad_norm": 13.85695743560791, "learning_rate": 1.3555430077169209e-05, "loss": 0.65, "step": 17120 }, { "epoch": 3.22, "grad_norm": 4.279960632324219, "learning_rate": 1.355166572557877e-05, "loss": 0.6214, "step": 17130 }, { "epoch": 3.23, "grad_norm": 21.195491790771484, "learning_rate": 1.3547901373988332e-05, "loss": 0.5917, "step": 17140 }, { "epoch": 3.23, "grad_norm": 43.27781295776367, "learning_rate": 1.3544137022397895e-05, "loss": 0.5481, "step": 17150 }, { "epoch": 3.23, "grad_norm": 32.89248275756836, "learning_rate": 1.3540372670807453e-05, "loss": 0.7128, "step": 17160 }, { "epoch": 3.23, "grad_norm": 1.7565685510635376, "learning_rate": 1.3536608319217014e-05, "loss": 0.5305, "step": 17170 }, { "epoch": 3.23, "grad_norm": 0.9114441871643066, "learning_rate": 1.3532843967626576e-05, "loss": 0.4674, "step": 17180 }, { "epoch": 3.24, "grad_norm": 14.061701774597168, "learning_rate": 1.3529079616036137e-05, "loss": 0.947, "step": 17190 }, { "epoch": 3.24, "grad_norm": 14.889267921447754, "learning_rate": 1.35253152644457e-05, "loss": 0.8785, "step": 17200 }, { "epoch": 3.24, "grad_norm": 10.14665412902832, "learning_rate": 1.3521550912855262e-05, "loss": 0.728, "step": 17210 }, { "epoch": 3.24, "grad_norm": 16.317060470581055, "learning_rate": 1.3517786561264823e-05, "loss": 0.615, "step": 17220 }, { "epoch": 3.24, "grad_norm": 14.949483871459961, "learning_rate": 1.3514022209674385e-05, "loss": 0.4112, "step": 17230 }, { "epoch": 3.24, "grad_norm": 42.36807632446289, "learning_rate": 1.3510257858083946e-05, "loss": 0.5082, "step": 17240 }, { "epoch": 3.25, "grad_norm": 10.184826850891113, "learning_rate": 1.3506493506493508e-05, "loss": 0.63, "step": 17250 }, { "epoch": 3.25, "grad_norm": 7.961637496948242, "learning_rate": 1.350272915490307e-05, "loss": 0.373, "step": 17260 }, { "epoch": 3.25, "grad_norm": 15.121786117553711, "learning_rate": 1.349896480331263e-05, "loss": 1.0031, "step": 17270 }, { "epoch": 3.25, "grad_norm": 17.208942413330078, "learning_rate": 1.3495200451722192e-05, "loss": 0.7613, "step": 17280 }, { "epoch": 3.25, "grad_norm": 3.7287890911102295, "learning_rate": 1.3491436100131754e-05, "loss": 0.4566, "step": 17290 }, { "epoch": 3.26, "grad_norm": 34.4078254699707, "learning_rate": 1.3487671748541315e-05, "loss": 0.8477, "step": 17300 }, { "epoch": 3.26, "grad_norm": 6.368019104003906, "learning_rate": 1.3483907396950877e-05, "loss": 0.6715, "step": 17310 }, { "epoch": 3.26, "grad_norm": 6.000946044921875, "learning_rate": 1.3480143045360438e-05, "loss": 0.3316, "step": 17320 }, { "epoch": 3.26, "grad_norm": 8.809701919555664, "learning_rate": 1.347637869377e-05, "loss": 0.9712, "step": 17330 }, { "epoch": 3.26, "grad_norm": 2.8006772994995117, "learning_rate": 1.347261434217956e-05, "loss": 0.5571, "step": 17340 }, { "epoch": 3.27, "grad_norm": 3.8963782787323, "learning_rate": 1.346884999058912e-05, "loss": 0.8123, "step": 17350 }, { "epoch": 3.27, "grad_norm": 21.654504776000977, "learning_rate": 1.3465085638998682e-05, "loss": 0.46, "step": 17360 }, { "epoch": 3.27, "grad_norm": 21.95523452758789, "learning_rate": 1.3461321287408244e-05, "loss": 0.8436, "step": 17370 }, { "epoch": 3.27, "grad_norm": 21.384435653686523, "learning_rate": 1.3457556935817807e-05, "loss": 0.6596, "step": 17380 }, { "epoch": 3.27, "grad_norm": 2.9826455116271973, "learning_rate": 1.3453792584227368e-05, "loss": 0.7622, "step": 17390 }, { "epoch": 3.27, "grad_norm": 1.4264588356018066, "learning_rate": 1.345002823263693e-05, "loss": 0.739, "step": 17400 }, { "epoch": 3.28, "grad_norm": 7.517381191253662, "learning_rate": 1.3446263881046491e-05, "loss": 0.6091, "step": 17410 }, { "epoch": 3.28, "grad_norm": 32.621761322021484, "learning_rate": 1.3442499529456053e-05, "loss": 0.6992, "step": 17420 }, { "epoch": 3.28, "grad_norm": 4.760457992553711, "learning_rate": 1.3438735177865614e-05, "loss": 1.0, "step": 17430 }, { "epoch": 3.28, "grad_norm": 0.4995521008968353, "learning_rate": 1.3434970826275176e-05, "loss": 0.523, "step": 17440 }, { "epoch": 3.28, "grad_norm": 8.695629119873047, "learning_rate": 1.3431206474684737e-05, "loss": 0.646, "step": 17450 }, { "epoch": 3.29, "grad_norm": 0.28502389788627625, "learning_rate": 1.3427442123094298e-05, "loss": 0.5188, "step": 17460 }, { "epoch": 3.29, "grad_norm": 0.6979949474334717, "learning_rate": 1.342367777150386e-05, "loss": 0.5536, "step": 17470 }, { "epoch": 3.29, "grad_norm": 24.936214447021484, "learning_rate": 1.3419913419913421e-05, "loss": 0.647, "step": 17480 }, { "epoch": 3.29, "grad_norm": 8.776481628417969, "learning_rate": 1.3416149068322983e-05, "loss": 0.8732, "step": 17490 }, { "epoch": 3.29, "grad_norm": 19.038969039916992, "learning_rate": 1.3412384716732544e-05, "loss": 0.77, "step": 17500 }, { "epoch": 3.3, "grad_norm": 4.335228443145752, "learning_rate": 1.3408620365142104e-05, "loss": 0.6023, "step": 17510 }, { "epoch": 3.3, "grad_norm": 7.746741771697998, "learning_rate": 1.3404856013551666e-05, "loss": 0.6851, "step": 17520 }, { "epoch": 3.3, "grad_norm": 24.021316528320312, "learning_rate": 1.3401091661961227e-05, "loss": 0.8683, "step": 17530 }, { "epoch": 3.3, "grad_norm": 40.04768371582031, "learning_rate": 1.3397327310370788e-05, "loss": 0.7139, "step": 17540 }, { "epoch": 3.3, "grad_norm": 28.369369506835938, "learning_rate": 1.339356295878035e-05, "loss": 0.7984, "step": 17550 }, { "epoch": 3.31, "grad_norm": 12.895668983459473, "learning_rate": 1.3389798607189911e-05, "loss": 0.8768, "step": 17560 }, { "epoch": 3.31, "grad_norm": 13.10739517211914, "learning_rate": 1.3386034255599475e-05, "loss": 0.9014, "step": 17570 }, { "epoch": 3.31, "grad_norm": 6.7018537521362305, "learning_rate": 1.3382269904009036e-05, "loss": 0.6438, "step": 17580 }, { "epoch": 3.31, "grad_norm": 20.47855567932129, "learning_rate": 1.3378505552418597e-05, "loss": 0.5956, "step": 17590 }, { "epoch": 3.31, "grad_norm": 34.99126052856445, "learning_rate": 1.3374741200828159e-05, "loss": 0.6245, "step": 17600 }, { "epoch": 3.31, "grad_norm": 31.813350677490234, "learning_rate": 1.337097684923772e-05, "loss": 0.9122, "step": 17610 }, { "epoch": 3.32, "grad_norm": 15.438579559326172, "learning_rate": 1.3367212497647282e-05, "loss": 0.7581, "step": 17620 }, { "epoch": 3.32, "grad_norm": 10.1848726272583, "learning_rate": 1.3363448146056843e-05, "loss": 0.3745, "step": 17630 }, { "epoch": 3.32, "grad_norm": 0.6416832804679871, "learning_rate": 1.3359683794466405e-05, "loss": 0.5186, "step": 17640 }, { "epoch": 3.32, "grad_norm": 17.22964859008789, "learning_rate": 1.3355919442875966e-05, "loss": 0.517, "step": 17650 }, { "epoch": 3.32, "grad_norm": 23.54407501220703, "learning_rate": 1.3352155091285528e-05, "loss": 0.6123, "step": 17660 }, { "epoch": 3.33, "grad_norm": 24.60756492614746, "learning_rate": 1.3348390739695089e-05, "loss": 0.6709, "step": 17670 }, { "epoch": 3.33, "grad_norm": 18.41277313232422, "learning_rate": 1.334462638810465e-05, "loss": 0.724, "step": 17680 }, { "epoch": 3.33, "grad_norm": 13.058307647705078, "learning_rate": 1.334086203651421e-05, "loss": 0.6962, "step": 17690 }, { "epoch": 3.33, "grad_norm": 13.01867389678955, "learning_rate": 1.3337097684923772e-05, "loss": 0.5346, "step": 17700 }, { "epoch": 3.33, "grad_norm": 13.48055362701416, "learning_rate": 1.3333333333333333e-05, "loss": 0.5472, "step": 17710 }, { "epoch": 3.34, "grad_norm": 18.029436111450195, "learning_rate": 1.3329568981742895e-05, "loss": 0.4834, "step": 17720 }, { "epoch": 3.34, "grad_norm": 2.22222900390625, "learning_rate": 1.3325804630152456e-05, "loss": 0.6628, "step": 17730 }, { "epoch": 3.34, "grad_norm": 7.793396472930908, "learning_rate": 1.3322040278562018e-05, "loss": 0.866, "step": 17740 }, { "epoch": 3.34, "grad_norm": 42.029598236083984, "learning_rate": 1.331827592697158e-05, "loss": 0.5733, "step": 17750 }, { "epoch": 3.34, "grad_norm": 8.039556503295898, "learning_rate": 1.3314511575381142e-05, "loss": 0.6282, "step": 17760 }, { "epoch": 3.34, "grad_norm": 13.458028793334961, "learning_rate": 1.3310747223790704e-05, "loss": 0.911, "step": 17770 }, { "epoch": 3.35, "grad_norm": 14.425552368164062, "learning_rate": 1.3306982872200265e-05, "loss": 1.0583, "step": 17780 }, { "epoch": 3.35, "grad_norm": 3.239820718765259, "learning_rate": 1.3303218520609827e-05, "loss": 0.5288, "step": 17790 }, { "epoch": 3.35, "grad_norm": 49.50566482543945, "learning_rate": 1.3299454169019388e-05, "loss": 0.9413, "step": 17800 }, { "epoch": 3.35, "grad_norm": 15.337508201599121, "learning_rate": 1.329568981742895e-05, "loss": 0.6259, "step": 17810 }, { "epoch": 3.35, "grad_norm": 22.253278732299805, "learning_rate": 1.3291925465838511e-05, "loss": 0.8045, "step": 17820 }, { "epoch": 3.36, "grad_norm": 8.542914390563965, "learning_rate": 1.3288161114248073e-05, "loss": 0.5947, "step": 17830 }, { "epoch": 3.36, "grad_norm": 9.067444801330566, "learning_rate": 1.3284396762657634e-05, "loss": 0.5303, "step": 17840 }, { "epoch": 3.36, "grad_norm": 15.519441604614258, "learning_rate": 1.3280632411067195e-05, "loss": 1.0939, "step": 17850 }, { "epoch": 3.36, "grad_norm": 8.470025062561035, "learning_rate": 1.3276868059476755e-05, "loss": 0.7933, "step": 17860 }, { "epoch": 3.36, "grad_norm": 17.665695190429688, "learning_rate": 1.3273103707886317e-05, "loss": 0.7463, "step": 17870 }, { "epoch": 3.37, "grad_norm": 17.912128448486328, "learning_rate": 1.3269339356295878e-05, "loss": 0.6041, "step": 17880 }, { "epoch": 3.37, "grad_norm": 26.95820426940918, "learning_rate": 1.326557500470544e-05, "loss": 0.7265, "step": 17890 }, { "epoch": 3.37, "grad_norm": 13.88226318359375, "learning_rate": 1.3261810653115001e-05, "loss": 0.7233, "step": 17900 }, { "epoch": 3.37, "grad_norm": 9.494647979736328, "learning_rate": 1.3258046301524563e-05, "loss": 0.5738, "step": 17910 }, { "epoch": 3.37, "grad_norm": 23.288188934326172, "learning_rate": 1.3254281949934124e-05, "loss": 0.9191, "step": 17920 }, { "epoch": 3.37, "grad_norm": 12.423943519592285, "learning_rate": 1.3250517598343685e-05, "loss": 0.5722, "step": 17930 }, { "epoch": 3.38, "grad_norm": 20.487674713134766, "learning_rate": 1.3246753246753249e-05, "loss": 0.5988, "step": 17940 }, { "epoch": 3.38, "grad_norm": 8.707905769348145, "learning_rate": 1.324298889516281e-05, "loss": 0.3331, "step": 17950 }, { "epoch": 3.38, "grad_norm": 0.4977474808692932, "learning_rate": 1.3239224543572372e-05, "loss": 0.5852, "step": 17960 }, { "epoch": 3.38, "grad_norm": 6.157559871673584, "learning_rate": 1.3235460191981933e-05, "loss": 0.6341, "step": 17970 }, { "epoch": 3.38, "grad_norm": 23.23993682861328, "learning_rate": 1.3231695840391494e-05, "loss": 0.9592, "step": 17980 }, { "epoch": 3.39, "grad_norm": 10.312979698181152, "learning_rate": 1.3227931488801056e-05, "loss": 0.7303, "step": 17990 }, { "epoch": 3.39, "grad_norm": 24.67930030822754, "learning_rate": 1.3224167137210617e-05, "loss": 0.5402, "step": 18000 }, { "epoch": 3.39, "grad_norm": 22.200210571289062, "learning_rate": 1.3220402785620179e-05, "loss": 0.8094, "step": 18010 }, { "epoch": 3.39, "grad_norm": 5.099223613739014, "learning_rate": 1.321663843402974e-05, "loss": 0.8314, "step": 18020 }, { "epoch": 3.39, "grad_norm": 19.834396362304688, "learning_rate": 1.32128740824393e-05, "loss": 1.009, "step": 18030 }, { "epoch": 3.4, "grad_norm": 11.00228214263916, "learning_rate": 1.3209109730848861e-05, "loss": 0.8001, "step": 18040 }, { "epoch": 3.4, "grad_norm": 19.782005310058594, "learning_rate": 1.3205345379258423e-05, "loss": 0.5065, "step": 18050 }, { "epoch": 3.4, "grad_norm": 9.906347274780273, "learning_rate": 1.3201581027667984e-05, "loss": 0.4765, "step": 18060 }, { "epoch": 3.4, "grad_norm": 28.395952224731445, "learning_rate": 1.3197816676077546e-05, "loss": 0.9283, "step": 18070 }, { "epoch": 3.4, "grad_norm": 4.160763740539551, "learning_rate": 1.3194052324487107e-05, "loss": 0.6412, "step": 18080 }, { "epoch": 3.4, "grad_norm": 16.857194900512695, "learning_rate": 1.3190287972896669e-05, "loss": 0.888, "step": 18090 }, { "epoch": 3.41, "grad_norm": 3.8556270599365234, "learning_rate": 1.318652362130623e-05, "loss": 0.8731, "step": 18100 }, { "epoch": 3.41, "grad_norm": 29.509401321411133, "learning_rate": 1.3182759269715792e-05, "loss": 0.6451, "step": 18110 }, { "epoch": 3.41, "grad_norm": 8.988554954528809, "learning_rate": 1.3178994918125355e-05, "loss": 0.9483, "step": 18120 }, { "epoch": 3.41, "grad_norm": 19.900251388549805, "learning_rate": 1.3175230566534916e-05, "loss": 1.0578, "step": 18130 }, { "epoch": 3.41, "grad_norm": 11.77758502960205, "learning_rate": 1.3171466214944478e-05, "loss": 0.6442, "step": 18140 }, { "epoch": 3.42, "grad_norm": 6.527004718780518, "learning_rate": 1.316770186335404e-05, "loss": 0.7697, "step": 18150 }, { "epoch": 3.42, "grad_norm": 5.472777843475342, "learning_rate": 1.31639375117636e-05, "loss": 0.9177, "step": 18160 }, { "epoch": 3.42, "grad_norm": 11.783276557922363, "learning_rate": 1.3160173160173162e-05, "loss": 0.6803, "step": 18170 }, { "epoch": 3.42, "grad_norm": 9.348353385925293, "learning_rate": 1.3156408808582724e-05, "loss": 0.9163, "step": 18180 }, { "epoch": 3.42, "grad_norm": 8.337559700012207, "learning_rate": 1.3152644456992285e-05, "loss": 0.5074, "step": 18190 }, { "epoch": 3.43, "grad_norm": 18.181156158447266, "learning_rate": 1.3148880105401847e-05, "loss": 0.6039, "step": 18200 }, { "epoch": 3.43, "grad_norm": 24.175315856933594, "learning_rate": 1.3145115753811406e-05, "loss": 0.6791, "step": 18210 }, { "epoch": 3.43, "grad_norm": 11.990982055664062, "learning_rate": 1.3141351402220968e-05, "loss": 0.5122, "step": 18220 }, { "epoch": 3.43, "grad_norm": 12.18565845489502, "learning_rate": 1.313758705063053e-05, "loss": 0.8138, "step": 18230 }, { "epoch": 3.43, "grad_norm": 20.374452590942383, "learning_rate": 1.313382269904009e-05, "loss": 0.4286, "step": 18240 }, { "epoch": 3.43, "grad_norm": 11.977636337280273, "learning_rate": 1.3130058347449652e-05, "loss": 0.6633, "step": 18250 }, { "epoch": 3.44, "grad_norm": 5.555273056030273, "learning_rate": 1.3126293995859214e-05, "loss": 0.3768, "step": 18260 }, { "epoch": 3.44, "grad_norm": 12.093500137329102, "learning_rate": 1.3122529644268775e-05, "loss": 0.5081, "step": 18270 }, { "epoch": 3.44, "grad_norm": 12.523153305053711, "learning_rate": 1.3118765292678337e-05, "loss": 0.7281, "step": 18280 }, { "epoch": 3.44, "grad_norm": 34.439266204833984, "learning_rate": 1.3115000941087898e-05, "loss": 0.7471, "step": 18290 }, { "epoch": 3.44, "grad_norm": 28.79136848449707, "learning_rate": 1.311123658949746e-05, "loss": 0.6478, "step": 18300 }, { "epoch": 3.45, "grad_norm": 5.4620561599731445, "learning_rate": 1.3107472237907023e-05, "loss": 0.8856, "step": 18310 }, { "epoch": 3.45, "grad_norm": 14.058579444885254, "learning_rate": 1.3103707886316584e-05, "loss": 0.4122, "step": 18320 }, { "epoch": 3.45, "grad_norm": 17.870243072509766, "learning_rate": 1.3099943534726146e-05, "loss": 0.4202, "step": 18330 }, { "epoch": 3.45, "grad_norm": 32.40336990356445, "learning_rate": 1.3096179183135707e-05, "loss": 0.9052, "step": 18340 }, { "epoch": 3.45, "grad_norm": 10.193222999572754, "learning_rate": 1.3092414831545268e-05, "loss": 0.5433, "step": 18350 }, { "epoch": 3.46, "grad_norm": 7.227964401245117, "learning_rate": 1.308865047995483e-05, "loss": 0.6235, "step": 18360 }, { "epoch": 3.46, "grad_norm": 17.942949295043945, "learning_rate": 1.3084886128364391e-05, "loss": 0.9279, "step": 18370 }, { "epoch": 3.46, "grad_norm": 2.4786839485168457, "learning_rate": 1.3081121776773951e-05, "loss": 0.5688, "step": 18380 }, { "epoch": 3.46, "grad_norm": 13.7864408493042, "learning_rate": 1.3077357425183513e-05, "loss": 0.6254, "step": 18390 }, { "epoch": 3.46, "grad_norm": 11.290862083435059, "learning_rate": 1.3073593073593074e-05, "loss": 0.5499, "step": 18400 }, { "epoch": 3.47, "grad_norm": 27.211687088012695, "learning_rate": 1.3069828722002636e-05, "loss": 0.7081, "step": 18410 }, { "epoch": 3.47, "grad_norm": 31.036930084228516, "learning_rate": 1.3066064370412197e-05, "loss": 0.692, "step": 18420 }, { "epoch": 3.47, "grad_norm": 9.828187942504883, "learning_rate": 1.3062300018821758e-05, "loss": 0.848, "step": 18430 }, { "epoch": 3.47, "grad_norm": 22.125829696655273, "learning_rate": 1.305853566723132e-05, "loss": 0.7898, "step": 18440 }, { "epoch": 3.47, "grad_norm": 15.01488971710205, "learning_rate": 1.3054771315640881e-05, "loss": 0.4647, "step": 18450 }, { "epoch": 3.47, "grad_norm": 14.60596752166748, "learning_rate": 1.3051006964050443e-05, "loss": 0.7542, "step": 18460 }, { "epoch": 3.48, "grad_norm": 23.092918395996094, "learning_rate": 1.3047242612460004e-05, "loss": 0.8983, "step": 18470 }, { "epoch": 3.48, "grad_norm": 12.916792869567871, "learning_rate": 1.3043478260869566e-05, "loss": 0.6377, "step": 18480 }, { "epoch": 3.48, "grad_norm": 30.594213485717773, "learning_rate": 1.3039713909279127e-05, "loss": 0.9676, "step": 18490 }, { "epoch": 3.48, "grad_norm": 18.934587478637695, "learning_rate": 1.303594955768869e-05, "loss": 0.8782, "step": 18500 }, { "epoch": 3.48, "grad_norm": 2.794344663619995, "learning_rate": 1.3032185206098252e-05, "loss": 0.3121, "step": 18510 }, { "epoch": 3.49, "grad_norm": 2.3938238620758057, "learning_rate": 1.3028420854507813e-05, "loss": 0.9087, "step": 18520 }, { "epoch": 3.49, "grad_norm": 32.515846252441406, "learning_rate": 1.3024656502917375e-05, "loss": 0.6834, "step": 18530 }, { "epoch": 3.49, "grad_norm": 4.718508720397949, "learning_rate": 1.3020892151326936e-05, "loss": 0.6034, "step": 18540 }, { "epoch": 3.49, "grad_norm": 16.135196685791016, "learning_rate": 1.3017127799736498e-05, "loss": 0.7444, "step": 18550 }, { "epoch": 3.49, "grad_norm": 15.95462417602539, "learning_rate": 1.3013363448146057e-05, "loss": 0.482, "step": 18560 }, { "epoch": 3.5, "grad_norm": 15.838091850280762, "learning_rate": 1.3009599096555619e-05, "loss": 0.7012, "step": 18570 }, { "epoch": 3.5, "grad_norm": 9.506869316101074, "learning_rate": 1.300583474496518e-05, "loss": 0.5384, "step": 18580 }, { "epoch": 3.5, "grad_norm": 11.965920448303223, "learning_rate": 1.3002070393374742e-05, "loss": 0.5091, "step": 18590 }, { "epoch": 3.5, "grad_norm": 21.009666442871094, "learning_rate": 1.2998306041784303e-05, "loss": 0.9806, "step": 18600 }, { "epoch": 3.5, "grad_norm": 19.624370574951172, "learning_rate": 1.2994541690193865e-05, "loss": 0.7226, "step": 18610 }, { "epoch": 3.5, "grad_norm": 18.199827194213867, "learning_rate": 1.2990777338603426e-05, "loss": 0.7008, "step": 18620 }, { "epoch": 3.51, "grad_norm": 16.78691864013672, "learning_rate": 1.2987012987012988e-05, "loss": 0.8391, "step": 18630 }, { "epoch": 3.51, "grad_norm": 11.214224815368652, "learning_rate": 1.2983248635422549e-05, "loss": 0.7674, "step": 18640 }, { "epoch": 3.51, "grad_norm": 12.816788673400879, "learning_rate": 1.297948428383211e-05, "loss": 0.6571, "step": 18650 }, { "epoch": 3.51, "grad_norm": 26.343637466430664, "learning_rate": 1.2975719932241672e-05, "loss": 0.7407, "step": 18660 }, { "epoch": 3.51, "grad_norm": 10.422934532165527, "learning_rate": 1.2971955580651234e-05, "loss": 0.571, "step": 18670 }, { "epoch": 3.52, "grad_norm": 4.7122015953063965, "learning_rate": 1.2968191229060797e-05, "loss": 0.4944, "step": 18680 }, { "epoch": 3.52, "grad_norm": 10.654007911682129, "learning_rate": 1.2964426877470358e-05, "loss": 0.4871, "step": 18690 }, { "epoch": 3.52, "grad_norm": 16.5805606842041, "learning_rate": 1.296066252587992e-05, "loss": 0.7417, "step": 18700 }, { "epoch": 3.52, "grad_norm": 12.002159118652344, "learning_rate": 1.2956898174289481e-05, "loss": 0.686, "step": 18710 }, { "epoch": 3.52, "grad_norm": 22.57682991027832, "learning_rate": 1.2953133822699043e-05, "loss": 0.9398, "step": 18720 }, { "epoch": 3.53, "grad_norm": 8.608962059020996, "learning_rate": 1.2949369471108602e-05, "loss": 0.5677, "step": 18730 }, { "epoch": 3.53, "grad_norm": 14.006396293640137, "learning_rate": 1.2945605119518164e-05, "loss": 0.635, "step": 18740 }, { "epoch": 3.53, "grad_norm": 10.348053932189941, "learning_rate": 1.2941840767927725e-05, "loss": 0.7357, "step": 18750 }, { "epoch": 3.53, "grad_norm": 18.81407928466797, "learning_rate": 1.2938076416337287e-05, "loss": 0.9314, "step": 18760 }, { "epoch": 3.53, "grad_norm": 11.178011894226074, "learning_rate": 1.2934312064746848e-05, "loss": 0.454, "step": 18770 }, { "epoch": 3.53, "grad_norm": 23.306005477905273, "learning_rate": 1.293054771315641e-05, "loss": 0.5233, "step": 18780 }, { "epoch": 3.54, "grad_norm": 5.833727836608887, "learning_rate": 1.2926783361565971e-05, "loss": 0.5184, "step": 18790 }, { "epoch": 3.54, "grad_norm": 11.050354957580566, "learning_rate": 1.2923019009975533e-05, "loss": 0.6627, "step": 18800 }, { "epoch": 3.54, "grad_norm": 17.933313369750977, "learning_rate": 1.2919254658385094e-05, "loss": 0.6955, "step": 18810 }, { "epoch": 3.54, "grad_norm": 27.6488037109375, "learning_rate": 1.2915490306794655e-05, "loss": 0.7085, "step": 18820 }, { "epoch": 3.54, "grad_norm": 19.928560256958008, "learning_rate": 1.2911725955204217e-05, "loss": 0.4012, "step": 18830 }, { "epoch": 3.55, "grad_norm": 12.084012985229492, "learning_rate": 1.2907961603613778e-05, "loss": 0.679, "step": 18840 }, { "epoch": 3.55, "grad_norm": 36.998104095458984, "learning_rate": 1.290419725202334e-05, "loss": 0.8037, "step": 18850 }, { "epoch": 3.55, "grad_norm": 15.250351905822754, "learning_rate": 1.2900432900432901e-05, "loss": 0.5382, "step": 18860 }, { "epoch": 3.55, "grad_norm": 30.874340057373047, "learning_rate": 1.2896668548842464e-05, "loss": 0.4945, "step": 18870 }, { "epoch": 3.55, "grad_norm": 22.66957664489746, "learning_rate": 1.2892904197252026e-05, "loss": 0.5362, "step": 18880 }, { "epoch": 3.56, "grad_norm": 29.47343635559082, "learning_rate": 1.2889139845661587e-05, "loss": 0.6591, "step": 18890 }, { "epoch": 3.56, "grad_norm": 11.1913423538208, "learning_rate": 1.2885375494071149e-05, "loss": 0.6665, "step": 18900 }, { "epoch": 3.56, "grad_norm": 7.629205703735352, "learning_rate": 1.2881611142480709e-05, "loss": 0.879, "step": 18910 }, { "epoch": 3.56, "grad_norm": 9.654862403869629, "learning_rate": 1.287784679089027e-05, "loss": 0.5893, "step": 18920 }, { "epoch": 3.56, "grad_norm": 0.7786961793899536, "learning_rate": 1.2874082439299832e-05, "loss": 0.7301, "step": 18930 }, { "epoch": 3.56, "grad_norm": 18.76505470275879, "learning_rate": 1.2870318087709393e-05, "loss": 0.8441, "step": 18940 }, { "epoch": 3.57, "grad_norm": 34.90388870239258, "learning_rate": 1.2866553736118954e-05, "loss": 0.7965, "step": 18950 }, { "epoch": 3.57, "grad_norm": 14.218279838562012, "learning_rate": 1.2862789384528516e-05, "loss": 0.7871, "step": 18960 }, { "epoch": 3.57, "grad_norm": 12.61451244354248, "learning_rate": 1.2859025032938077e-05, "loss": 0.3458, "step": 18970 }, { "epoch": 3.57, "grad_norm": 7.381354331970215, "learning_rate": 1.2855260681347639e-05, "loss": 0.3608, "step": 18980 }, { "epoch": 3.57, "grad_norm": 5.548383712768555, "learning_rate": 1.28514963297572e-05, "loss": 0.3053, "step": 18990 }, { "epoch": 3.58, "grad_norm": 24.012208938598633, "learning_rate": 1.2847731978166762e-05, "loss": 0.73, "step": 19000 }, { "epoch": 3.58, "grad_norm": 8.774457931518555, "learning_rate": 1.2843967626576323e-05, "loss": 0.6298, "step": 19010 }, { "epoch": 3.58, "grad_norm": 30.910158157348633, "learning_rate": 1.2840203274985885e-05, "loss": 0.7751, "step": 19020 }, { "epoch": 3.58, "grad_norm": 34.78523635864258, "learning_rate": 1.2836438923395446e-05, "loss": 0.7953, "step": 19030 }, { "epoch": 3.58, "grad_norm": 19.161802291870117, "learning_rate": 1.2832674571805008e-05, "loss": 0.6131, "step": 19040 }, { "epoch": 3.59, "grad_norm": 0.4744579792022705, "learning_rate": 1.282891022021457e-05, "loss": 0.5262, "step": 19050 }, { "epoch": 3.59, "grad_norm": 12.062616348266602, "learning_rate": 1.2825145868624132e-05, "loss": 0.8151, "step": 19060 }, { "epoch": 3.59, "grad_norm": 11.607182502746582, "learning_rate": 1.2821381517033694e-05, "loss": 0.5112, "step": 19070 }, { "epoch": 3.59, "grad_norm": 1.1499428749084473, "learning_rate": 1.2817617165443252e-05, "loss": 0.9765, "step": 19080 }, { "epoch": 3.59, "grad_norm": 4.771254062652588, "learning_rate": 1.2813852813852813e-05, "loss": 0.5044, "step": 19090 }, { "epoch": 3.59, "grad_norm": 11.462512016296387, "learning_rate": 1.2810088462262376e-05, "loss": 0.7876, "step": 19100 }, { "epoch": 3.6, "grad_norm": 29.608722686767578, "learning_rate": 1.2806324110671938e-05, "loss": 0.9717, "step": 19110 }, { "epoch": 3.6, "grad_norm": 18.42778968811035, "learning_rate": 1.28025597590815e-05, "loss": 0.4388, "step": 19120 }, { "epoch": 3.6, "grad_norm": 8.988313674926758, "learning_rate": 1.279879540749106e-05, "loss": 0.5285, "step": 19130 }, { "epoch": 3.6, "grad_norm": 17.52735710144043, "learning_rate": 1.2795031055900622e-05, "loss": 0.5965, "step": 19140 }, { "epoch": 3.6, "grad_norm": 24.257164001464844, "learning_rate": 1.2791266704310184e-05, "loss": 0.8865, "step": 19150 }, { "epoch": 3.61, "grad_norm": 5.7428460121154785, "learning_rate": 1.2787502352719745e-05, "loss": 0.5267, "step": 19160 }, { "epoch": 3.61, "grad_norm": 8.993688583374023, "learning_rate": 1.2783738001129307e-05, "loss": 0.8054, "step": 19170 }, { "epoch": 3.61, "grad_norm": 6.0230631828308105, "learning_rate": 1.2779973649538868e-05, "loss": 0.4008, "step": 19180 }, { "epoch": 3.61, "grad_norm": 7.1093220710754395, "learning_rate": 1.277620929794843e-05, "loss": 0.5139, "step": 19190 }, { "epoch": 3.61, "grad_norm": 6.246382713317871, "learning_rate": 1.2772444946357991e-05, "loss": 0.7167, "step": 19200 }, { "epoch": 3.62, "grad_norm": 9.685704231262207, "learning_rate": 1.2768680594767552e-05, "loss": 0.6331, "step": 19210 }, { "epoch": 3.62, "grad_norm": 3.0230801105499268, "learning_rate": 1.2764916243177114e-05, "loss": 0.6397, "step": 19220 }, { "epoch": 3.62, "grad_norm": 17.838150024414062, "learning_rate": 1.2761151891586675e-05, "loss": 0.5742, "step": 19230 }, { "epoch": 3.62, "grad_norm": 21.397781372070312, "learning_rate": 1.2757387539996238e-05, "loss": 0.8132, "step": 19240 }, { "epoch": 3.62, "grad_norm": 8.271471977233887, "learning_rate": 1.2753623188405797e-05, "loss": 0.6171, "step": 19250 }, { "epoch": 3.63, "grad_norm": 2.5334246158599854, "learning_rate": 1.2749858836815358e-05, "loss": 0.8327, "step": 19260 }, { "epoch": 3.63, "grad_norm": 4.004415988922119, "learning_rate": 1.274609448522492e-05, "loss": 0.5323, "step": 19270 }, { "epoch": 3.63, "grad_norm": 22.41325569152832, "learning_rate": 1.2742330133634483e-05, "loss": 0.5696, "step": 19280 }, { "epoch": 3.63, "grad_norm": 10.288199424743652, "learning_rate": 1.2738565782044044e-05, "loss": 0.6255, "step": 19290 }, { "epoch": 3.63, "grad_norm": 7.55498743057251, "learning_rate": 1.2734801430453606e-05, "loss": 0.6258, "step": 19300 }, { "epoch": 3.63, "grad_norm": 5.852587699890137, "learning_rate": 1.2731037078863167e-05, "loss": 0.6389, "step": 19310 }, { "epoch": 3.64, "grad_norm": 27.851564407348633, "learning_rate": 1.2727272727272728e-05, "loss": 0.8596, "step": 19320 }, { "epoch": 3.64, "grad_norm": 9.760847091674805, "learning_rate": 1.272350837568229e-05, "loss": 0.5892, "step": 19330 }, { "epoch": 3.64, "grad_norm": 14.357726097106934, "learning_rate": 1.2719744024091851e-05, "loss": 0.4372, "step": 19340 }, { "epoch": 3.64, "grad_norm": 2.770073175430298, "learning_rate": 1.2715979672501413e-05, "loss": 0.6819, "step": 19350 }, { "epoch": 3.64, "grad_norm": 32.54544448852539, "learning_rate": 1.2712215320910974e-05, "loss": 0.8689, "step": 19360 }, { "epoch": 3.65, "grad_norm": 28.392282485961914, "learning_rate": 1.2708450969320536e-05, "loss": 0.6886, "step": 19370 }, { "epoch": 3.65, "grad_norm": 29.35589599609375, "learning_rate": 1.2704686617730097e-05, "loss": 0.5073, "step": 19380 }, { "epoch": 3.65, "grad_norm": 4.598623752593994, "learning_rate": 1.2700922266139659e-05, "loss": 0.3873, "step": 19390 }, { "epoch": 3.65, "grad_norm": 31.010652542114258, "learning_rate": 1.269715791454922e-05, "loss": 0.6336, "step": 19400 }, { "epoch": 3.65, "grad_norm": 17.593690872192383, "learning_rate": 1.2693393562958782e-05, "loss": 0.7767, "step": 19410 }, { "epoch": 3.66, "grad_norm": 6.044388771057129, "learning_rate": 1.2689629211368343e-05, "loss": 0.683, "step": 19420 }, { "epoch": 3.66, "grad_norm": 9.18665599822998, "learning_rate": 1.2685864859777903e-05, "loss": 0.9063, "step": 19430 }, { "epoch": 3.66, "grad_norm": 12.083257675170898, "learning_rate": 1.2682100508187464e-05, "loss": 1.0481, "step": 19440 }, { "epoch": 3.66, "grad_norm": 18.31049919128418, "learning_rate": 1.2678336156597026e-05, "loss": 0.8629, "step": 19450 }, { "epoch": 3.66, "grad_norm": 5.21190881729126, "learning_rate": 1.2674571805006587e-05, "loss": 0.5297, "step": 19460 }, { "epoch": 3.66, "grad_norm": 14.538198471069336, "learning_rate": 1.267080745341615e-05, "loss": 0.6776, "step": 19470 }, { "epoch": 3.67, "grad_norm": 3.3181121349334717, "learning_rate": 1.2667043101825712e-05, "loss": 0.6773, "step": 19480 }, { "epoch": 3.67, "grad_norm": 1.1416178941726685, "learning_rate": 1.2663278750235273e-05, "loss": 0.4913, "step": 19490 }, { "epoch": 3.67, "grad_norm": 4.539183139801025, "learning_rate": 1.2659514398644835e-05, "loss": 0.8529, "step": 19500 }, { "epoch": 3.67, "grad_norm": 11.008049011230469, "learning_rate": 1.2655750047054396e-05, "loss": 0.5791, "step": 19510 }, { "epoch": 3.67, "grad_norm": 26.218887329101562, "learning_rate": 1.2651985695463958e-05, "loss": 0.6222, "step": 19520 }, { "epoch": 3.68, "grad_norm": 9.582514762878418, "learning_rate": 1.2648221343873519e-05, "loss": 0.5567, "step": 19530 }, { "epoch": 3.68, "grad_norm": 33.36174011230469, "learning_rate": 1.264445699228308e-05, "loss": 0.5457, "step": 19540 }, { "epoch": 3.68, "grad_norm": 5.197388648986816, "learning_rate": 1.2640692640692642e-05, "loss": 1.0242, "step": 19550 }, { "epoch": 3.68, "grad_norm": 29.88850212097168, "learning_rate": 1.2636928289102204e-05, "loss": 0.6045, "step": 19560 }, { "epoch": 3.68, "grad_norm": 14.333205223083496, "learning_rate": 1.2633163937511765e-05, "loss": 0.8825, "step": 19570 }, { "epoch": 3.69, "grad_norm": 16.74696159362793, "learning_rate": 1.2629399585921326e-05, "loss": 0.5199, "step": 19580 }, { "epoch": 3.69, "grad_norm": 10.93972110748291, "learning_rate": 1.2625635234330888e-05, "loss": 0.5204, "step": 19590 }, { "epoch": 3.69, "grad_norm": 26.37276268005371, "learning_rate": 1.2621870882740448e-05, "loss": 0.6918, "step": 19600 }, { "epoch": 3.69, "grad_norm": 30.152820587158203, "learning_rate": 1.2618106531150009e-05, "loss": 0.6616, "step": 19610 }, { "epoch": 3.69, "grad_norm": 4.047630310058594, "learning_rate": 1.261434217955957e-05, "loss": 0.3501, "step": 19620 }, { "epoch": 3.69, "grad_norm": 23.427507400512695, "learning_rate": 1.2610577827969132e-05, "loss": 0.7386, "step": 19630 }, { "epoch": 3.7, "grad_norm": 17.632530212402344, "learning_rate": 1.2606813476378694e-05, "loss": 0.3864, "step": 19640 }, { "epoch": 3.7, "grad_norm": 13.861688613891602, "learning_rate": 1.2603049124788255e-05, "loss": 0.4736, "step": 19650 }, { "epoch": 3.7, "grad_norm": 0.12637591361999512, "learning_rate": 1.2599284773197818e-05, "loss": 0.459, "step": 19660 }, { "epoch": 3.7, "grad_norm": 12.556920051574707, "learning_rate": 1.259552042160738e-05, "loss": 0.5324, "step": 19670 }, { "epoch": 3.7, "grad_norm": 15.041160583496094, "learning_rate": 1.2591756070016941e-05, "loss": 0.7857, "step": 19680 }, { "epoch": 3.71, "grad_norm": 8.056427955627441, "learning_rate": 1.2587991718426503e-05, "loss": 0.5911, "step": 19690 }, { "epoch": 3.71, "grad_norm": 6.120177268981934, "learning_rate": 1.2584227366836064e-05, "loss": 0.4515, "step": 19700 }, { "epoch": 3.71, "grad_norm": 5.58326530456543, "learning_rate": 1.2580463015245625e-05, "loss": 0.8182, "step": 19710 }, { "epoch": 3.71, "grad_norm": 13.465167045593262, "learning_rate": 1.2576698663655187e-05, "loss": 0.48, "step": 19720 }, { "epoch": 3.71, "grad_norm": 2.778507709503174, "learning_rate": 1.2572934312064748e-05, "loss": 0.5068, "step": 19730 }, { "epoch": 3.72, "grad_norm": 7.899831771850586, "learning_rate": 1.256916996047431e-05, "loss": 0.5613, "step": 19740 }, { "epoch": 3.72, "grad_norm": 8.313310623168945, "learning_rate": 1.2565405608883871e-05, "loss": 0.583, "step": 19750 }, { "epoch": 3.72, "grad_norm": 9.49411678314209, "learning_rate": 1.2561641257293433e-05, "loss": 1.0025, "step": 19760 }, { "epoch": 3.72, "grad_norm": 24.08152198791504, "learning_rate": 1.2557876905702994e-05, "loss": 0.7991, "step": 19770 }, { "epoch": 3.72, "grad_norm": 6.793327808380127, "learning_rate": 1.2554112554112554e-05, "loss": 0.8257, "step": 19780 }, { "epoch": 3.72, "grad_norm": 17.370943069458008, "learning_rate": 1.2550348202522115e-05, "loss": 0.8301, "step": 19790 }, { "epoch": 3.73, "grad_norm": 9.847330093383789, "learning_rate": 1.2546583850931677e-05, "loss": 0.6801, "step": 19800 }, { "epoch": 3.73, "grad_norm": 7.884547710418701, "learning_rate": 1.2542819499341238e-05, "loss": 0.4344, "step": 19810 }, { "epoch": 3.73, "grad_norm": 0.6122704744338989, "learning_rate": 1.25390551477508e-05, "loss": 0.649, "step": 19820 }, { "epoch": 3.73, "grad_norm": 12.32701301574707, "learning_rate": 1.2535290796160361e-05, "loss": 0.4236, "step": 19830 }, { "epoch": 3.73, "grad_norm": 11.68048095703125, "learning_rate": 1.2531526444569924e-05, "loss": 0.6277, "step": 19840 }, { "epoch": 3.74, "grad_norm": 27.56481170654297, "learning_rate": 1.2527762092979486e-05, "loss": 0.6161, "step": 19850 }, { "epoch": 3.74, "grad_norm": 10.342185020446777, "learning_rate": 1.2523997741389047e-05, "loss": 0.7714, "step": 19860 }, { "epoch": 3.74, "grad_norm": 15.999653816223145, "learning_rate": 1.2520233389798609e-05, "loss": 0.4018, "step": 19870 }, { "epoch": 3.74, "grad_norm": 14.883124351501465, "learning_rate": 1.251646903820817e-05, "loss": 0.9517, "step": 19880 }, { "epoch": 3.74, "grad_norm": 4.749094009399414, "learning_rate": 1.2512704686617732e-05, "loss": 0.4089, "step": 19890 }, { "epoch": 3.75, "grad_norm": 23.341344833374023, "learning_rate": 1.2508940335027293e-05, "loss": 0.5633, "step": 19900 }, { "epoch": 3.75, "grad_norm": 15.799015045166016, "learning_rate": 1.2505175983436855e-05, "loss": 0.662, "step": 19910 }, { "epoch": 3.75, "grad_norm": 25.26270294189453, "learning_rate": 1.2501411631846416e-05, "loss": 0.6803, "step": 19920 }, { "epoch": 3.75, "grad_norm": 19.431108474731445, "learning_rate": 1.2497647280255978e-05, "loss": 0.6431, "step": 19930 }, { "epoch": 3.75, "grad_norm": 6.340080261230469, "learning_rate": 1.2493882928665539e-05, "loss": 0.6813, "step": 19940 }, { "epoch": 3.75, "grad_norm": 10.563408851623535, "learning_rate": 1.2490118577075099e-05, "loss": 0.5349, "step": 19950 }, { "epoch": 3.76, "grad_norm": 18.89433479309082, "learning_rate": 1.248635422548466e-05, "loss": 0.4407, "step": 19960 }, { "epoch": 3.76, "grad_norm": 5.784712791442871, "learning_rate": 1.2482589873894222e-05, "loss": 0.7272, "step": 19970 }, { "epoch": 3.76, "grad_norm": 24.42642593383789, "learning_rate": 1.2478825522303783e-05, "loss": 0.6009, "step": 19980 }, { "epoch": 3.76, "grad_norm": 21.03069496154785, "learning_rate": 1.2475061170713345e-05, "loss": 0.652, "step": 19990 }, { "epoch": 3.76, "grad_norm": 0.11219408363103867, "learning_rate": 1.2471296819122906e-05, "loss": 0.5116, "step": 20000 }, { "epoch": 3.77, "grad_norm": 10.447319030761719, "learning_rate": 1.2467532467532468e-05, "loss": 0.3867, "step": 20010 }, { "epoch": 3.77, "grad_norm": 8.39681625366211, "learning_rate": 1.2463768115942029e-05, "loss": 0.7164, "step": 20020 }, { "epoch": 3.77, "grad_norm": 13.545207977294922, "learning_rate": 1.2460003764351592e-05, "loss": 0.7469, "step": 20030 }, { "epoch": 3.77, "grad_norm": 8.322916030883789, "learning_rate": 1.2456239412761154e-05, "loss": 0.6016, "step": 20040 }, { "epoch": 3.77, "grad_norm": 10.224037170410156, "learning_rate": 1.2452475061170715e-05, "loss": 0.6915, "step": 20050 }, { "epoch": 3.78, "grad_norm": 10.355185508728027, "learning_rate": 1.2448710709580277e-05, "loss": 0.5082, "step": 20060 }, { "epoch": 3.78, "grad_norm": 11.303367614746094, "learning_rate": 1.2444946357989838e-05, "loss": 0.574, "step": 20070 }, { "epoch": 3.78, "grad_norm": 15.811086654663086, "learning_rate": 1.24411820063994e-05, "loss": 0.8375, "step": 20080 }, { "epoch": 3.78, "grad_norm": 19.942611694335938, "learning_rate": 1.2437417654808961e-05, "loss": 0.6268, "step": 20090 }, { "epoch": 3.78, "grad_norm": 8.685022354125977, "learning_rate": 1.2433653303218522e-05, "loss": 0.5048, "step": 20100 }, { "epoch": 3.79, "grad_norm": 30.918336868286133, "learning_rate": 1.2429888951628084e-05, "loss": 0.5821, "step": 20110 }, { "epoch": 3.79, "grad_norm": 22.617633819580078, "learning_rate": 1.2426124600037644e-05, "loss": 0.6512, "step": 20120 }, { "epoch": 3.79, "grad_norm": 7.3372483253479, "learning_rate": 1.2422360248447205e-05, "loss": 0.958, "step": 20130 }, { "epoch": 3.79, "grad_norm": 9.846896171569824, "learning_rate": 1.2418595896856767e-05, "loss": 0.61, "step": 20140 }, { "epoch": 3.79, "grad_norm": 27.524892807006836, "learning_rate": 1.2414831545266328e-05, "loss": 0.885, "step": 20150 }, { "epoch": 3.79, "grad_norm": 5.012969017028809, "learning_rate": 1.241106719367589e-05, "loss": 0.5705, "step": 20160 }, { "epoch": 3.8, "grad_norm": 14.11467456817627, "learning_rate": 1.2407302842085451e-05, "loss": 0.7462, "step": 20170 }, { "epoch": 3.8, "grad_norm": 23.304222106933594, "learning_rate": 1.2403538490495012e-05, "loss": 0.8118, "step": 20180 }, { "epoch": 3.8, "grad_norm": 5.021602630615234, "learning_rate": 1.2399774138904574e-05, "loss": 0.6483, "step": 20190 }, { "epoch": 3.8, "grad_norm": 4.954014301300049, "learning_rate": 1.2396009787314135e-05, "loss": 0.5144, "step": 20200 }, { "epoch": 3.8, "grad_norm": 15.003413200378418, "learning_rate": 1.2392245435723698e-05, "loss": 0.6549, "step": 20210 }, { "epoch": 3.81, "grad_norm": 0.8703759908676147, "learning_rate": 1.238848108413326e-05, "loss": 0.477, "step": 20220 }, { "epoch": 3.81, "grad_norm": 19.85989761352539, "learning_rate": 1.2384716732542821e-05, "loss": 0.6608, "step": 20230 }, { "epoch": 3.81, "grad_norm": 10.10875129699707, "learning_rate": 1.2380952380952383e-05, "loss": 0.7041, "step": 20240 }, { "epoch": 3.81, "grad_norm": 6.1762471199035645, "learning_rate": 1.2377188029361944e-05, "loss": 0.9546, "step": 20250 }, { "epoch": 3.81, "grad_norm": 44.439823150634766, "learning_rate": 1.2373423677771506e-05, "loss": 1.0269, "step": 20260 }, { "epoch": 3.82, "grad_norm": 17.293577194213867, "learning_rate": 1.2369659326181067e-05, "loss": 0.6694, "step": 20270 }, { "epoch": 3.82, "grad_norm": 5.511439800262451, "learning_rate": 1.2365894974590629e-05, "loss": 0.502, "step": 20280 }, { "epoch": 3.82, "grad_norm": 18.506452560424805, "learning_rate": 1.236213062300019e-05, "loss": 1.0257, "step": 20290 }, { "epoch": 3.82, "grad_norm": 10.748970031738281, "learning_rate": 1.235836627140975e-05, "loss": 0.7194, "step": 20300 }, { "epoch": 3.82, "grad_norm": 39.78062057495117, "learning_rate": 1.2354601919819311e-05, "loss": 0.5527, "step": 20310 }, { "epoch": 3.82, "grad_norm": 16.632009506225586, "learning_rate": 1.2350837568228873e-05, "loss": 0.7186, "step": 20320 }, { "epoch": 3.83, "grad_norm": 3.820341110229492, "learning_rate": 1.2347073216638434e-05, "loss": 0.6947, "step": 20330 }, { "epoch": 3.83, "grad_norm": 1.6036685705184937, "learning_rate": 1.2343308865047996e-05, "loss": 0.4921, "step": 20340 }, { "epoch": 3.83, "grad_norm": 1.2304280996322632, "learning_rate": 1.2339544513457557e-05, "loss": 0.6795, "step": 20350 }, { "epoch": 3.83, "grad_norm": 18.970306396484375, "learning_rate": 1.2335780161867119e-05, "loss": 0.6016, "step": 20360 }, { "epoch": 3.83, "grad_norm": 8.561296463012695, "learning_rate": 1.233201581027668e-05, "loss": 0.9288, "step": 20370 }, { "epoch": 3.84, "grad_norm": 11.71761417388916, "learning_rate": 1.2328251458686242e-05, "loss": 0.7948, "step": 20380 }, { "epoch": 3.84, "grad_norm": 38.39409637451172, "learning_rate": 1.2324487107095803e-05, "loss": 0.6868, "step": 20390 }, { "epoch": 3.84, "grad_norm": 19.950315475463867, "learning_rate": 1.2320722755505366e-05, "loss": 0.4955, "step": 20400 }, { "epoch": 3.84, "grad_norm": 6.911773204803467, "learning_rate": 1.2316958403914928e-05, "loss": 0.3525, "step": 20410 }, { "epoch": 3.84, "grad_norm": 13.918087005615234, "learning_rate": 1.2313194052324489e-05, "loss": 0.3229, "step": 20420 }, { "epoch": 3.85, "grad_norm": 14.926342964172363, "learning_rate": 1.230942970073405e-05, "loss": 0.6098, "step": 20430 }, { "epoch": 3.85, "grad_norm": 16.889162063598633, "learning_rate": 1.2305665349143612e-05, "loss": 0.6025, "step": 20440 }, { "epoch": 3.85, "grad_norm": 17.502153396606445, "learning_rate": 1.2301900997553174e-05, "loss": 0.4296, "step": 20450 }, { "epoch": 3.85, "grad_norm": 17.864023208618164, "learning_rate": 1.2298136645962735e-05, "loss": 0.9841, "step": 20460 }, { "epoch": 3.85, "grad_norm": 8.081547737121582, "learning_rate": 1.2294372294372295e-05, "loss": 0.5448, "step": 20470 }, { "epoch": 3.85, "grad_norm": 10.248896598815918, "learning_rate": 1.2290607942781856e-05, "loss": 0.7463, "step": 20480 }, { "epoch": 3.86, "grad_norm": 43.104915618896484, "learning_rate": 1.2286843591191418e-05, "loss": 0.868, "step": 20490 }, { "epoch": 3.86, "grad_norm": 23.549795150756836, "learning_rate": 1.2283079239600979e-05, "loss": 0.551, "step": 20500 }, { "epoch": 3.86, "grad_norm": 18.552968978881836, "learning_rate": 1.227931488801054e-05, "loss": 0.5959, "step": 20510 }, { "epoch": 3.86, "grad_norm": 15.869778633117676, "learning_rate": 1.2275550536420102e-05, "loss": 0.7179, "step": 20520 }, { "epoch": 3.86, "grad_norm": 14.710973739624023, "learning_rate": 1.2271786184829664e-05, "loss": 0.6882, "step": 20530 }, { "epoch": 3.87, "grad_norm": 8.942631721496582, "learning_rate": 1.2268021833239225e-05, "loss": 0.8396, "step": 20540 }, { "epoch": 3.87, "grad_norm": 6.654508113861084, "learning_rate": 1.2264257481648786e-05, "loss": 0.3994, "step": 20550 }, { "epoch": 3.87, "grad_norm": 47.291954040527344, "learning_rate": 1.2260493130058348e-05, "loss": 0.7838, "step": 20560 }, { "epoch": 3.87, "grad_norm": 13.667283058166504, "learning_rate": 1.225672877846791e-05, "loss": 0.4754, "step": 20570 }, { "epoch": 3.87, "grad_norm": 41.05614471435547, "learning_rate": 1.2252964426877473e-05, "loss": 0.643, "step": 20580 }, { "epoch": 3.88, "grad_norm": 16.537818908691406, "learning_rate": 1.2249200075287034e-05, "loss": 0.7026, "step": 20590 }, { "epoch": 3.88, "grad_norm": 16.900470733642578, "learning_rate": 1.2245435723696595e-05, "loss": 0.4249, "step": 20600 }, { "epoch": 3.88, "grad_norm": 11.43319034576416, "learning_rate": 1.2241671372106157e-05, "loss": 0.8047, "step": 20610 }, { "epoch": 3.88, "grad_norm": 1.8628902435302734, "learning_rate": 1.2237907020515718e-05, "loss": 0.5949, "step": 20620 }, { "epoch": 3.88, "grad_norm": 2.925947427749634, "learning_rate": 1.223414266892528e-05, "loss": 0.6901, "step": 20630 }, { "epoch": 3.88, "grad_norm": 10.759583473205566, "learning_rate": 1.2230378317334841e-05, "loss": 0.4635, "step": 20640 }, { "epoch": 3.89, "grad_norm": 13.309684753417969, "learning_rate": 1.2226613965744401e-05, "loss": 0.5326, "step": 20650 }, { "epoch": 3.89, "grad_norm": 23.810779571533203, "learning_rate": 1.2222849614153963e-05, "loss": 0.603, "step": 20660 }, { "epoch": 3.89, "grad_norm": 14.398448944091797, "learning_rate": 1.2219085262563524e-05, "loss": 0.9449, "step": 20670 }, { "epoch": 3.89, "grad_norm": 10.191229820251465, "learning_rate": 1.2215320910973085e-05, "loss": 0.4746, "step": 20680 }, { "epoch": 3.89, "grad_norm": 18.595684051513672, "learning_rate": 1.2211556559382647e-05, "loss": 1.0004, "step": 20690 }, { "epoch": 3.9, "grad_norm": 32.1865119934082, "learning_rate": 1.2207792207792208e-05, "loss": 0.4889, "step": 20700 }, { "epoch": 3.9, "grad_norm": 0.28670892119407654, "learning_rate": 1.220402785620177e-05, "loss": 0.6854, "step": 20710 }, { "epoch": 3.9, "grad_norm": 5.30502986907959, "learning_rate": 1.2200263504611331e-05, "loss": 0.5467, "step": 20720 }, { "epoch": 3.9, "grad_norm": 12.79511547088623, "learning_rate": 1.2196499153020893e-05, "loss": 0.4574, "step": 20730 }, { "epoch": 3.9, "grad_norm": 17.650203704833984, "learning_rate": 1.2192734801430454e-05, "loss": 0.5639, "step": 20740 }, { "epoch": 3.91, "grad_norm": 0.7680536508560181, "learning_rate": 1.2188970449840016e-05, "loss": 0.6494, "step": 20750 }, { "epoch": 3.91, "grad_norm": 0.08082833886146545, "learning_rate": 1.2185206098249577e-05, "loss": 0.4382, "step": 20760 }, { "epoch": 3.91, "grad_norm": 14.408023834228516, "learning_rate": 1.218144174665914e-05, "loss": 0.8581, "step": 20770 }, { "epoch": 3.91, "grad_norm": 10.851282119750977, "learning_rate": 1.2177677395068702e-05, "loss": 0.7481, "step": 20780 }, { "epoch": 3.91, "grad_norm": 29.99077796936035, "learning_rate": 1.2173913043478263e-05, "loss": 1.0, "step": 20790 }, { "epoch": 3.91, "grad_norm": 0.19973182678222656, "learning_rate": 1.2170148691887825e-05, "loss": 0.6387, "step": 20800 }, { "epoch": 3.92, "grad_norm": 7.756618499755859, "learning_rate": 1.2166384340297386e-05, "loss": 0.5351, "step": 20810 }, { "epoch": 3.92, "grad_norm": 20.255647659301758, "learning_rate": 1.2162619988706946e-05, "loss": 0.7969, "step": 20820 }, { "epoch": 3.92, "grad_norm": 0.8674776554107666, "learning_rate": 1.2158855637116507e-05, "loss": 0.7177, "step": 20830 }, { "epoch": 3.92, "grad_norm": 8.670698165893555, "learning_rate": 1.2155091285526069e-05, "loss": 0.624, "step": 20840 }, { "epoch": 3.92, "grad_norm": 0.19249022006988525, "learning_rate": 1.215132693393563e-05, "loss": 0.4953, "step": 20850 }, { "epoch": 3.93, "grad_norm": 26.02955436706543, "learning_rate": 1.2147562582345192e-05, "loss": 0.5161, "step": 20860 }, { "epoch": 3.93, "grad_norm": 29.887535095214844, "learning_rate": 1.2143798230754753e-05, "loss": 0.5798, "step": 20870 }, { "epoch": 3.93, "grad_norm": 17.09107208251953, "learning_rate": 1.2140033879164315e-05, "loss": 0.5887, "step": 20880 }, { "epoch": 3.93, "grad_norm": 21.360456466674805, "learning_rate": 1.2136269527573876e-05, "loss": 0.76, "step": 20890 }, { "epoch": 3.93, "grad_norm": 8.325711250305176, "learning_rate": 1.2132505175983438e-05, "loss": 0.6259, "step": 20900 }, { "epoch": 3.94, "grad_norm": 8.927617073059082, "learning_rate": 1.2128740824392999e-05, "loss": 0.5466, "step": 20910 }, { "epoch": 3.94, "grad_norm": 9.248173713684082, "learning_rate": 1.212497647280256e-05, "loss": 0.4414, "step": 20920 }, { "epoch": 3.94, "grad_norm": 9.122405052185059, "learning_rate": 1.2121212121212122e-05, "loss": 0.5872, "step": 20930 }, { "epoch": 3.94, "grad_norm": 8.157663345336914, "learning_rate": 1.2117447769621683e-05, "loss": 0.577, "step": 20940 }, { "epoch": 3.94, "grad_norm": 19.186445236206055, "learning_rate": 1.2113683418031245e-05, "loss": 0.7963, "step": 20950 }, { "epoch": 3.95, "grad_norm": 8.585948944091797, "learning_rate": 1.2109919066440808e-05, "loss": 0.6799, "step": 20960 }, { "epoch": 3.95, "grad_norm": 11.789979934692383, "learning_rate": 1.210615471485037e-05, "loss": 0.5179, "step": 20970 }, { "epoch": 3.95, "grad_norm": 5.875610828399658, "learning_rate": 1.2102390363259931e-05, "loss": 0.437, "step": 20980 }, { "epoch": 3.95, "grad_norm": 18.89061164855957, "learning_rate": 1.2098626011669492e-05, "loss": 0.577, "step": 20990 }, { "epoch": 3.95, "grad_norm": 4.297598838806152, "learning_rate": 1.2094861660079052e-05, "loss": 0.7225, "step": 21000 }, { "epoch": 3.95, "grad_norm": 0.36552512645721436, "learning_rate": 1.2091097308488614e-05, "loss": 0.2754, "step": 21010 }, { "epoch": 3.96, "grad_norm": 13.013652801513672, "learning_rate": 1.2087332956898175e-05, "loss": 0.669, "step": 21020 }, { "epoch": 3.96, "grad_norm": 6.945061683654785, "learning_rate": 1.2083568605307737e-05, "loss": 0.5892, "step": 21030 }, { "epoch": 3.96, "grad_norm": 4.39546012878418, "learning_rate": 1.2079804253717298e-05, "loss": 0.6006, "step": 21040 }, { "epoch": 3.96, "grad_norm": 24.449954986572266, "learning_rate": 1.207603990212686e-05, "loss": 0.503, "step": 21050 }, { "epoch": 3.96, "grad_norm": 9.902664184570312, "learning_rate": 1.2072275550536421e-05, "loss": 0.6385, "step": 21060 }, { "epoch": 3.97, "grad_norm": 44.686317443847656, "learning_rate": 1.2068511198945982e-05, "loss": 0.4779, "step": 21070 }, { "epoch": 3.97, "grad_norm": 13.766404151916504, "learning_rate": 1.2064746847355544e-05, "loss": 0.6803, "step": 21080 }, { "epoch": 3.97, "grad_norm": 41.97696304321289, "learning_rate": 1.2060982495765105e-05, "loss": 1.1873, "step": 21090 }, { "epoch": 3.97, "grad_norm": 15.863570213317871, "learning_rate": 1.2057218144174667e-05, "loss": 0.5433, "step": 21100 }, { "epoch": 3.97, "grad_norm": 10.411752700805664, "learning_rate": 1.2053453792584228e-05, "loss": 0.5845, "step": 21110 }, { "epoch": 3.98, "grad_norm": 4.70853853225708, "learning_rate": 1.204968944099379e-05, "loss": 0.9748, "step": 21120 }, { "epoch": 3.98, "grad_norm": 41.68354034423828, "learning_rate": 1.2045925089403351e-05, "loss": 0.7451, "step": 21130 }, { "epoch": 3.98, "grad_norm": 12.279452323913574, "learning_rate": 1.2042160737812914e-05, "loss": 0.6239, "step": 21140 }, { "epoch": 3.98, "grad_norm": 1.5319571495056152, "learning_rate": 1.2038396386222476e-05, "loss": 0.6755, "step": 21150 }, { "epoch": 3.98, "grad_norm": 19.07404899597168, "learning_rate": 1.2034632034632037e-05, "loss": 0.9424, "step": 21160 }, { "epoch": 3.98, "grad_norm": 23.491230010986328, "learning_rate": 1.2030867683041595e-05, "loss": 0.7338, "step": 21170 }, { "epoch": 3.99, "grad_norm": 30.06789207458496, "learning_rate": 1.2027103331451157e-05, "loss": 0.705, "step": 21180 }, { "epoch": 3.99, "grad_norm": 7.1863789558410645, "learning_rate": 1.202333897986072e-05, "loss": 0.964, "step": 21190 }, { "epoch": 3.99, "grad_norm": 19.22167205810547, "learning_rate": 1.2019574628270281e-05, "loss": 0.5426, "step": 21200 }, { "epoch": 3.99, "grad_norm": 18.454984664916992, "learning_rate": 1.2015810276679843e-05, "loss": 0.5774, "step": 21210 }, { "epoch": 3.99, "grad_norm": 12.360136985778809, "learning_rate": 1.2012045925089404e-05, "loss": 0.9418, "step": 21220 }, { "epoch": 4.0, "grad_norm": 11.939126968383789, "learning_rate": 1.2008281573498966e-05, "loss": 0.7091, "step": 21230 }, { "epoch": 4.0, "grad_norm": 32.17779541015625, "learning_rate": 1.2004517221908527e-05, "loss": 0.7587, "step": 21240 }, { "epoch": 4.0, "grad_norm": 7.594665050506592, "learning_rate": 1.2000752870318089e-05, "loss": 0.4173, "step": 21250 }, { "epoch": 4.0, "eval_accuracy": 0.9169333333333334, "eval_loss": 0.3204882740974426, "eval_runtime": 51.2733, "eval_samples_per_second": 146.275, "eval_steps_per_second": 18.294, "step": 21252 }, { "epoch": 4.0, "grad_norm": 20.44147491455078, "learning_rate": 1.199698851872765e-05, "loss": 0.7655, "step": 21260 }, { "epoch": 4.0, "grad_norm": 5.429800987243652, "learning_rate": 1.1993224167137212e-05, "loss": 0.507, "step": 21270 }, { "epoch": 4.01, "grad_norm": 2.7339699268341064, "learning_rate": 1.1989459815546773e-05, "loss": 0.5326, "step": 21280 }, { "epoch": 4.01, "grad_norm": 10.058930397033691, "learning_rate": 1.1985695463956335e-05, "loss": 0.923, "step": 21290 }, { "epoch": 4.01, "grad_norm": 17.565692901611328, "learning_rate": 1.1981931112365896e-05, "loss": 0.5113, "step": 21300 }, { "epoch": 4.01, "grad_norm": 6.418165683746338, "learning_rate": 1.1978166760775457e-05, "loss": 0.7591, "step": 21310 }, { "epoch": 4.01, "grad_norm": 8.731622695922852, "learning_rate": 1.1974402409185019e-05, "loss": 0.4488, "step": 21320 }, { "epoch": 4.01, "grad_norm": 6.82095193862915, "learning_rate": 1.1970638057594582e-05, "loss": 0.5737, "step": 21330 }, { "epoch": 4.02, "grad_norm": 19.75332260131836, "learning_rate": 1.196687370600414e-05, "loss": 0.5355, "step": 21340 }, { "epoch": 4.02, "grad_norm": 8.986883163452148, "learning_rate": 1.1963109354413702e-05, "loss": 0.749, "step": 21350 }, { "epoch": 4.02, "grad_norm": 5.045528411865234, "learning_rate": 1.1959345002823263e-05, "loss": 0.7225, "step": 21360 }, { "epoch": 4.02, "grad_norm": 24.6765079498291, "learning_rate": 1.1955580651232826e-05, "loss": 0.5938, "step": 21370 }, { "epoch": 4.02, "grad_norm": 14.547659873962402, "learning_rate": 1.1951816299642388e-05, "loss": 0.554, "step": 21380 }, { "epoch": 4.03, "grad_norm": 21.172176361083984, "learning_rate": 1.1948051948051949e-05, "loss": 0.3325, "step": 21390 }, { "epoch": 4.03, "grad_norm": 35.6104736328125, "learning_rate": 1.194428759646151e-05, "loss": 0.8367, "step": 21400 }, { "epoch": 4.03, "grad_norm": 4.693765163421631, "learning_rate": 1.1940523244871072e-05, "loss": 0.6465, "step": 21410 }, { "epoch": 4.03, "grad_norm": 12.738019943237305, "learning_rate": 1.1936758893280634e-05, "loss": 0.5504, "step": 21420 }, { "epoch": 4.03, "grad_norm": 26.901885986328125, "learning_rate": 1.1932994541690195e-05, "loss": 0.4753, "step": 21430 }, { "epoch": 4.04, "grad_norm": 21.4942569732666, "learning_rate": 1.1929230190099756e-05, "loss": 0.8649, "step": 21440 }, { "epoch": 4.04, "grad_norm": 14.137928009033203, "learning_rate": 1.1925465838509318e-05, "loss": 0.5552, "step": 21450 }, { "epoch": 4.04, "grad_norm": 18.820674896240234, "learning_rate": 1.192170148691888e-05, "loss": 0.577, "step": 21460 }, { "epoch": 4.04, "grad_norm": 3.413235664367676, "learning_rate": 1.191793713532844e-05, "loss": 0.5026, "step": 21470 }, { "epoch": 4.04, "grad_norm": 1.3790395259857178, "learning_rate": 1.1914172783738002e-05, "loss": 0.4477, "step": 21480 }, { "epoch": 4.04, "grad_norm": 16.56000328063965, "learning_rate": 1.1910408432147564e-05, "loss": 0.6547, "step": 21490 }, { "epoch": 4.05, "grad_norm": 6.222777843475342, "learning_rate": 1.1906644080557125e-05, "loss": 0.6857, "step": 21500 }, { "epoch": 4.05, "grad_norm": 9.765985488891602, "learning_rate": 1.1902879728966688e-05, "loss": 0.6043, "step": 21510 }, { "epoch": 4.05, "grad_norm": 21.691728591918945, "learning_rate": 1.1899115377376246e-05, "loss": 0.3465, "step": 21520 }, { "epoch": 4.05, "grad_norm": 0.24102507531642914, "learning_rate": 1.1895351025785808e-05, "loss": 0.4715, "step": 21530 }, { "epoch": 4.05, "grad_norm": 6.748645305633545, "learning_rate": 1.189158667419537e-05, "loss": 0.4595, "step": 21540 }, { "epoch": 4.06, "grad_norm": 6.551207065582275, "learning_rate": 1.188782232260493e-05, "loss": 0.8763, "step": 21550 }, { "epoch": 4.06, "grad_norm": 9.870513916015625, "learning_rate": 1.1884057971014494e-05, "loss": 0.4955, "step": 21560 }, { "epoch": 4.06, "grad_norm": 15.267050743103027, "learning_rate": 1.1880293619424055e-05, "loss": 0.5343, "step": 21570 }, { "epoch": 4.06, "grad_norm": 16.510562896728516, "learning_rate": 1.1876529267833617e-05, "loss": 0.7292, "step": 21580 }, { "epoch": 4.06, "grad_norm": 5.880744457244873, "learning_rate": 1.1872764916243178e-05, "loss": 0.8015, "step": 21590 }, { "epoch": 4.07, "grad_norm": 12.049949645996094, "learning_rate": 1.186900056465274e-05, "loss": 0.5739, "step": 21600 }, { "epoch": 4.07, "grad_norm": 13.833176612854004, "learning_rate": 1.1865236213062301e-05, "loss": 0.9261, "step": 21610 }, { "epoch": 4.07, "grad_norm": 21.508180618286133, "learning_rate": 1.1861471861471863e-05, "loss": 0.6697, "step": 21620 }, { "epoch": 4.07, "grad_norm": 0.2058361917734146, "learning_rate": 1.1857707509881424e-05, "loss": 0.7873, "step": 21630 }, { "epoch": 4.07, "grad_norm": 31.943452835083008, "learning_rate": 1.1853943158290986e-05, "loss": 0.5298, "step": 21640 }, { "epoch": 4.07, "grad_norm": 11.333353042602539, "learning_rate": 1.1850178806700547e-05, "loss": 0.3821, "step": 21650 }, { "epoch": 4.08, "grad_norm": 8.284579277038574, "learning_rate": 1.1846414455110109e-05, "loss": 0.4522, "step": 21660 }, { "epoch": 4.08, "grad_norm": 11.12885570526123, "learning_rate": 1.184265010351967e-05, "loss": 0.9707, "step": 21670 }, { "epoch": 4.08, "grad_norm": 5.9140191078186035, "learning_rate": 1.1838885751929231e-05, "loss": 0.5, "step": 21680 }, { "epoch": 4.08, "grad_norm": 6.177058696746826, "learning_rate": 1.1835121400338791e-05, "loss": 0.6024, "step": 21690 }, { "epoch": 4.08, "grad_norm": 2.904001235961914, "learning_rate": 1.1831357048748353e-05, "loss": 0.8046, "step": 21700 }, { "epoch": 4.09, "grad_norm": 16.1057186126709, "learning_rate": 1.1827592697157914e-05, "loss": 0.5458, "step": 21710 }, { "epoch": 4.09, "grad_norm": 19.81020736694336, "learning_rate": 1.1823828345567476e-05, "loss": 0.3633, "step": 21720 }, { "epoch": 4.09, "grad_norm": 16.992023468017578, "learning_rate": 1.1820063993977037e-05, "loss": 0.9944, "step": 21730 }, { "epoch": 4.09, "grad_norm": 24.561140060424805, "learning_rate": 1.18162996423866e-05, "loss": 0.4752, "step": 21740 }, { "epoch": 4.09, "grad_norm": 0.726672887802124, "learning_rate": 1.1812535290796162e-05, "loss": 0.4632, "step": 21750 }, { "epoch": 4.1, "grad_norm": 0.690126359462738, "learning_rate": 1.1808770939205723e-05, "loss": 0.5868, "step": 21760 }, { "epoch": 4.1, "grad_norm": 36.9278564453125, "learning_rate": 1.1805006587615285e-05, "loss": 0.772, "step": 21770 }, { "epoch": 4.1, "grad_norm": 16.300630569458008, "learning_rate": 1.1801242236024846e-05, "loss": 0.4303, "step": 21780 }, { "epoch": 4.1, "grad_norm": 0.3904343843460083, "learning_rate": 1.1797477884434408e-05, "loss": 0.5117, "step": 21790 }, { "epoch": 4.1, "grad_norm": 26.473928451538086, "learning_rate": 1.1793713532843969e-05, "loss": 0.4777, "step": 21800 }, { "epoch": 4.11, "grad_norm": 12.805113792419434, "learning_rate": 1.178994918125353e-05, "loss": 0.7928, "step": 21810 }, { "epoch": 4.11, "grad_norm": 26.79267692565918, "learning_rate": 1.1786184829663092e-05, "loss": 0.6495, "step": 21820 }, { "epoch": 4.11, "grad_norm": 18.316246032714844, "learning_rate": 1.1782420478072653e-05, "loss": 0.7989, "step": 21830 }, { "epoch": 4.11, "grad_norm": 15.82396411895752, "learning_rate": 1.1778656126482215e-05, "loss": 0.6794, "step": 21840 }, { "epoch": 4.11, "grad_norm": 15.773408889770508, "learning_rate": 1.1774891774891776e-05, "loss": 0.7149, "step": 21850 }, { "epoch": 4.11, "grad_norm": 9.06588077545166, "learning_rate": 1.1771127423301338e-05, "loss": 0.7622, "step": 21860 }, { "epoch": 4.12, "grad_norm": 0.6174959540367126, "learning_rate": 1.1767363071710898e-05, "loss": 0.7434, "step": 21870 }, { "epoch": 4.12, "grad_norm": 4.278200149536133, "learning_rate": 1.1763598720120459e-05, "loss": 0.6226, "step": 21880 }, { "epoch": 4.12, "grad_norm": 16.598983764648438, "learning_rate": 1.175983436853002e-05, "loss": 0.5363, "step": 21890 }, { "epoch": 4.12, "grad_norm": 25.885684967041016, "learning_rate": 1.1756070016939582e-05, "loss": 0.5998, "step": 21900 }, { "epoch": 4.12, "grad_norm": 7.993864059448242, "learning_rate": 1.1752305665349143e-05, "loss": 0.3801, "step": 21910 }, { "epoch": 4.13, "grad_norm": 0.9169643521308899, "learning_rate": 1.1748541313758705e-05, "loss": 0.6745, "step": 21920 }, { "epoch": 4.13, "grad_norm": 48.847991943359375, "learning_rate": 1.1744776962168268e-05, "loss": 0.5772, "step": 21930 }, { "epoch": 4.13, "grad_norm": 34.63701629638672, "learning_rate": 1.174101261057783e-05, "loss": 0.6175, "step": 21940 }, { "epoch": 4.13, "grad_norm": 55.32223129272461, "learning_rate": 1.1737248258987391e-05, "loss": 1.0425, "step": 21950 }, { "epoch": 4.13, "grad_norm": 8.96568489074707, "learning_rate": 1.1733483907396952e-05, "loss": 0.5169, "step": 21960 }, { "epoch": 4.14, "grad_norm": 13.289989471435547, "learning_rate": 1.1729719555806514e-05, "loss": 0.6216, "step": 21970 }, { "epoch": 4.14, "grad_norm": 0.6537827253341675, "learning_rate": 1.1725955204216075e-05, "loss": 0.7422, "step": 21980 }, { "epoch": 4.14, "grad_norm": 3.4758856296539307, "learning_rate": 1.1722190852625637e-05, "loss": 0.6289, "step": 21990 }, { "epoch": 4.14, "grad_norm": 3.0647480487823486, "learning_rate": 1.1718426501035198e-05, "loss": 0.598, "step": 22000 }, { "epoch": 4.14, "grad_norm": 9.152379989624023, "learning_rate": 1.171466214944476e-05, "loss": 0.4765, "step": 22010 }, { "epoch": 4.14, "grad_norm": 8.172050476074219, "learning_rate": 1.1710897797854321e-05, "loss": 0.6093, "step": 22020 }, { "epoch": 4.15, "grad_norm": 9.752901077270508, "learning_rate": 1.1707133446263883e-05, "loss": 0.6086, "step": 22030 }, { "epoch": 4.15, "grad_norm": 12.649826049804688, "learning_rate": 1.1703369094673442e-05, "loss": 0.7863, "step": 22040 }, { "epoch": 4.15, "grad_norm": 11.14371109008789, "learning_rate": 1.1699604743083004e-05, "loss": 0.6756, "step": 22050 }, { "epoch": 4.15, "grad_norm": 0.7610763311386108, "learning_rate": 1.1695840391492565e-05, "loss": 0.6398, "step": 22060 }, { "epoch": 4.15, "grad_norm": 1.436449408531189, "learning_rate": 1.1692076039902127e-05, "loss": 0.5272, "step": 22070 }, { "epoch": 4.16, "grad_norm": 22.43182373046875, "learning_rate": 1.1688311688311688e-05, "loss": 0.5553, "step": 22080 }, { "epoch": 4.16, "grad_norm": 0.680665910243988, "learning_rate": 1.168454733672125e-05, "loss": 0.3977, "step": 22090 }, { "epoch": 4.16, "grad_norm": 12.118298530578613, "learning_rate": 1.1680782985130811e-05, "loss": 0.482, "step": 22100 }, { "epoch": 4.16, "grad_norm": 19.042518615722656, "learning_rate": 1.1677018633540373e-05, "loss": 0.5045, "step": 22110 }, { "epoch": 4.16, "grad_norm": 9.090872764587402, "learning_rate": 1.1673254281949936e-05, "loss": 0.4631, "step": 22120 }, { "epoch": 4.17, "grad_norm": 2.18812894821167, "learning_rate": 1.1669489930359497e-05, "loss": 0.6592, "step": 22130 }, { "epoch": 4.17, "grad_norm": 36.4178466796875, "learning_rate": 1.1665725578769059e-05, "loss": 0.6483, "step": 22140 }, { "epoch": 4.17, "grad_norm": 28.83828353881836, "learning_rate": 1.166196122717862e-05, "loss": 0.4226, "step": 22150 }, { "epoch": 4.17, "grad_norm": 31.52997398376465, "learning_rate": 1.1658196875588182e-05, "loss": 0.3637, "step": 22160 }, { "epoch": 4.17, "grad_norm": 8.760178565979004, "learning_rate": 1.1654432523997743e-05, "loss": 0.7955, "step": 22170 }, { "epoch": 4.17, "grad_norm": 0.6389203071594238, "learning_rate": 1.1650668172407305e-05, "loss": 0.4444, "step": 22180 }, { "epoch": 4.18, "grad_norm": 29.18195915222168, "learning_rate": 1.1646903820816866e-05, "loss": 0.7053, "step": 22190 }, { "epoch": 4.18, "grad_norm": 22.56359100341797, "learning_rate": 1.1643139469226427e-05, "loss": 0.4031, "step": 22200 }, { "epoch": 4.18, "grad_norm": 23.801780700683594, "learning_rate": 1.1639375117635989e-05, "loss": 0.624, "step": 22210 }, { "epoch": 4.18, "grad_norm": 13.610224723815918, "learning_rate": 1.1635610766045549e-05, "loss": 0.9822, "step": 22220 }, { "epoch": 4.18, "grad_norm": 18.63134765625, "learning_rate": 1.163184641445511e-05, "loss": 0.6779, "step": 22230 }, { "epoch": 4.19, "grad_norm": 16.268205642700195, "learning_rate": 1.1628082062864672e-05, "loss": 0.5979, "step": 22240 }, { "epoch": 4.19, "grad_norm": 8.651780128479004, "learning_rate": 1.1624317711274233e-05, "loss": 0.7667, "step": 22250 }, { "epoch": 4.19, "grad_norm": 42.6129035949707, "learning_rate": 1.1620553359683795e-05, "loss": 0.6944, "step": 22260 }, { "epoch": 4.19, "grad_norm": 3.1881678104400635, "learning_rate": 1.1616789008093356e-05, "loss": 0.471, "step": 22270 }, { "epoch": 4.19, "grad_norm": 0.7028786540031433, "learning_rate": 1.1613024656502917e-05, "loss": 0.5197, "step": 22280 }, { "epoch": 4.2, "grad_norm": 47.69763946533203, "learning_rate": 1.1609260304912479e-05, "loss": 0.6065, "step": 22290 }, { "epoch": 4.2, "grad_norm": 6.314722061157227, "learning_rate": 1.1605495953322042e-05, "loss": 0.5556, "step": 22300 }, { "epoch": 4.2, "grad_norm": 8.165750503540039, "learning_rate": 1.1601731601731604e-05, "loss": 0.7189, "step": 22310 }, { "epoch": 4.2, "grad_norm": 7.349857807159424, "learning_rate": 1.1597967250141165e-05, "loss": 0.5367, "step": 22320 }, { "epoch": 4.2, "grad_norm": 10.950774192810059, "learning_rate": 1.1594202898550726e-05, "loss": 0.773, "step": 22330 }, { "epoch": 4.2, "grad_norm": 16.20650291442871, "learning_rate": 1.1590438546960288e-05, "loss": 0.6576, "step": 22340 }, { "epoch": 4.21, "grad_norm": 20.38640594482422, "learning_rate": 1.158667419536985e-05, "loss": 0.5126, "step": 22350 }, { "epoch": 4.21, "grad_norm": 8.257250785827637, "learning_rate": 1.158290984377941e-05, "loss": 0.591, "step": 22360 }, { "epoch": 4.21, "grad_norm": 5.747903823852539, "learning_rate": 1.1579145492188972e-05, "loss": 0.6314, "step": 22370 }, { "epoch": 4.21, "grad_norm": 19.206281661987305, "learning_rate": 1.1575381140598534e-05, "loss": 0.6481, "step": 22380 }, { "epoch": 4.21, "grad_norm": 22.839046478271484, "learning_rate": 1.1571616789008094e-05, "loss": 0.5929, "step": 22390 }, { "epoch": 4.22, "grad_norm": 22.537134170532227, "learning_rate": 1.1567852437417655e-05, "loss": 0.4775, "step": 22400 }, { "epoch": 4.22, "grad_norm": 0.13030314445495605, "learning_rate": 1.1564088085827216e-05, "loss": 0.4585, "step": 22410 }, { "epoch": 4.22, "grad_norm": 11.866349220275879, "learning_rate": 1.1560323734236778e-05, "loss": 0.6309, "step": 22420 }, { "epoch": 4.22, "grad_norm": 15.4808349609375, "learning_rate": 1.155655938264634e-05, "loss": 0.7321, "step": 22430 }, { "epoch": 4.22, "grad_norm": 14.688508033752441, "learning_rate": 1.15527950310559e-05, "loss": 0.8127, "step": 22440 }, { "epoch": 4.23, "grad_norm": 8.376343727111816, "learning_rate": 1.1549030679465462e-05, "loss": 0.6156, "step": 22450 }, { "epoch": 4.23, "grad_norm": 22.739774703979492, "learning_rate": 1.1545266327875024e-05, "loss": 0.6053, "step": 22460 }, { "epoch": 4.23, "grad_norm": 1.1516119241714478, "learning_rate": 1.1541501976284585e-05, "loss": 0.5797, "step": 22470 }, { "epoch": 4.23, "grad_norm": 38.92045974731445, "learning_rate": 1.1537737624694147e-05, "loss": 0.7583, "step": 22480 }, { "epoch": 4.23, "grad_norm": 11.236977577209473, "learning_rate": 1.153397327310371e-05, "loss": 0.7248, "step": 22490 }, { "epoch": 4.23, "grad_norm": 1.5737559795379639, "learning_rate": 1.1530208921513271e-05, "loss": 0.7463, "step": 22500 }, { "epoch": 4.24, "grad_norm": 1.3805655241012573, "learning_rate": 1.1526444569922833e-05, "loss": 0.4989, "step": 22510 }, { "epoch": 4.24, "grad_norm": 115.92575073242188, "learning_rate": 1.1522680218332394e-05, "loss": 0.4902, "step": 22520 }, { "epoch": 4.24, "grad_norm": 1.0640676021575928, "learning_rate": 1.1518915866741956e-05, "loss": 0.5065, "step": 22530 }, { "epoch": 4.24, "grad_norm": 9.37736988067627, "learning_rate": 1.1515151515151517e-05, "loss": 0.493, "step": 22540 }, { "epoch": 4.24, "grad_norm": 11.277339935302734, "learning_rate": 1.1511387163561079e-05, "loss": 0.5904, "step": 22550 }, { "epoch": 4.25, "grad_norm": 33.85553741455078, "learning_rate": 1.1507622811970638e-05, "loss": 0.5691, "step": 22560 }, { "epoch": 4.25, "grad_norm": 16.795196533203125, "learning_rate": 1.15038584603802e-05, "loss": 0.6498, "step": 22570 }, { "epoch": 4.25, "grad_norm": 6.320994853973389, "learning_rate": 1.1500094108789761e-05, "loss": 0.5735, "step": 22580 }, { "epoch": 4.25, "grad_norm": 0.7785674929618835, "learning_rate": 1.1496329757199323e-05, "loss": 0.5169, "step": 22590 }, { "epoch": 4.25, "grad_norm": 1.8020360469818115, "learning_rate": 1.1492565405608884e-05, "loss": 0.3877, "step": 22600 }, { "epoch": 4.26, "grad_norm": 28.67214584350586, "learning_rate": 1.1488801054018446e-05, "loss": 0.6102, "step": 22610 }, { "epoch": 4.26, "grad_norm": 33.07846450805664, "learning_rate": 1.1485036702428007e-05, "loss": 0.6309, "step": 22620 }, { "epoch": 4.26, "grad_norm": 41.755577087402344, "learning_rate": 1.1481272350837569e-05, "loss": 0.9355, "step": 22630 }, { "epoch": 4.26, "grad_norm": 7.226475238800049, "learning_rate": 1.147750799924713e-05, "loss": 0.6734, "step": 22640 }, { "epoch": 4.26, "grad_norm": 20.88840675354004, "learning_rate": 1.1473743647656691e-05, "loss": 0.4802, "step": 22650 }, { "epoch": 4.27, "grad_norm": 18.926301956176758, "learning_rate": 1.1469979296066253e-05, "loss": 0.6868, "step": 22660 }, { "epoch": 4.27, "grad_norm": 9.090205192565918, "learning_rate": 1.1466214944475816e-05, "loss": 0.7658, "step": 22670 }, { "epoch": 4.27, "grad_norm": 55.62081527709961, "learning_rate": 1.1462450592885378e-05, "loss": 0.6087, "step": 22680 }, { "epoch": 4.27, "grad_norm": 20.29065704345703, "learning_rate": 1.1458686241294939e-05, "loss": 0.4497, "step": 22690 }, { "epoch": 4.27, "grad_norm": 6.561438083648682, "learning_rate": 1.14549218897045e-05, "loss": 0.5936, "step": 22700 }, { "epoch": 4.27, "grad_norm": 12.46176815032959, "learning_rate": 1.1451157538114062e-05, "loss": 0.8636, "step": 22710 }, { "epoch": 4.28, "grad_norm": 31.131755828857422, "learning_rate": 1.1447393186523623e-05, "loss": 0.7126, "step": 22720 }, { "epoch": 4.28, "grad_norm": 20.14628028869629, "learning_rate": 1.1443628834933185e-05, "loss": 0.4201, "step": 22730 }, { "epoch": 4.28, "grad_norm": 8.134221076965332, "learning_rate": 1.1439864483342745e-05, "loss": 0.595, "step": 22740 }, { "epoch": 4.28, "grad_norm": 23.64768409729004, "learning_rate": 1.1436100131752306e-05, "loss": 0.8233, "step": 22750 }, { "epoch": 4.28, "grad_norm": 3.0115549564361572, "learning_rate": 1.1432335780161868e-05, "loss": 0.3933, "step": 22760 }, { "epoch": 4.29, "grad_norm": 0.10152573138475418, "learning_rate": 1.1428571428571429e-05, "loss": 0.4964, "step": 22770 }, { "epoch": 4.29, "grad_norm": 12.97737979888916, "learning_rate": 1.142480707698099e-05, "loss": 0.281, "step": 22780 }, { "epoch": 4.29, "grad_norm": 10.339591979980469, "learning_rate": 1.1421042725390552e-05, "loss": 0.7338, "step": 22790 }, { "epoch": 4.29, "grad_norm": 21.024656295776367, "learning_rate": 1.1417278373800113e-05, "loss": 0.4871, "step": 22800 }, { "epoch": 4.29, "grad_norm": 0.6640139818191528, "learning_rate": 1.1413514022209675e-05, "loss": 0.5691, "step": 22810 }, { "epoch": 4.3, "grad_norm": 19.123470306396484, "learning_rate": 1.1409749670619236e-05, "loss": 0.8221, "step": 22820 }, { "epoch": 4.3, "grad_norm": 16.33519172668457, "learning_rate": 1.1405985319028798e-05, "loss": 0.6879, "step": 22830 }, { "epoch": 4.3, "grad_norm": 19.19475746154785, "learning_rate": 1.140222096743836e-05, "loss": 0.5056, "step": 22840 }, { "epoch": 4.3, "grad_norm": 4.6960883140563965, "learning_rate": 1.139845661584792e-05, "loss": 0.6242, "step": 22850 }, { "epoch": 4.3, "grad_norm": 0.4895837604999542, "learning_rate": 1.1394692264257484e-05, "loss": 0.6206, "step": 22860 }, { "epoch": 4.3, "grad_norm": 18.621294021606445, "learning_rate": 1.1390927912667045e-05, "loss": 0.6418, "step": 22870 }, { "epoch": 4.31, "grad_norm": 12.699724197387695, "learning_rate": 1.1387163561076607e-05, "loss": 0.4555, "step": 22880 }, { "epoch": 4.31, "grad_norm": 8.417510986328125, "learning_rate": 1.1383399209486168e-05, "loss": 0.5236, "step": 22890 }, { "epoch": 4.31, "grad_norm": 9.543535232543945, "learning_rate": 1.137963485789573e-05, "loss": 0.5137, "step": 22900 }, { "epoch": 4.31, "grad_norm": 21.985715866088867, "learning_rate": 1.137587050630529e-05, "loss": 0.4621, "step": 22910 }, { "epoch": 4.31, "grad_norm": 2.261923313140869, "learning_rate": 1.1372106154714851e-05, "loss": 0.6878, "step": 22920 }, { "epoch": 4.32, "grad_norm": 12.996893882751465, "learning_rate": 1.1368341803124412e-05, "loss": 0.7822, "step": 22930 }, { "epoch": 4.32, "grad_norm": 1.7661455869674683, "learning_rate": 1.1364577451533974e-05, "loss": 0.627, "step": 22940 }, { "epoch": 4.32, "grad_norm": 16.084827423095703, "learning_rate": 1.1360813099943535e-05, "loss": 0.4727, "step": 22950 }, { "epoch": 4.32, "grad_norm": 10.136256217956543, "learning_rate": 1.1357048748353097e-05, "loss": 0.5857, "step": 22960 }, { "epoch": 4.32, "grad_norm": 13.618847846984863, "learning_rate": 1.1353284396762658e-05, "loss": 0.5905, "step": 22970 }, { "epoch": 4.33, "grad_norm": 1.0733072757720947, "learning_rate": 1.134952004517222e-05, "loss": 0.6132, "step": 22980 }, { "epoch": 4.33, "grad_norm": 4.069360733032227, "learning_rate": 1.1345755693581781e-05, "loss": 0.3998, "step": 22990 }, { "epoch": 4.33, "grad_norm": 27.945581436157227, "learning_rate": 1.1341991341991343e-05, "loss": 0.5658, "step": 23000 }, { "epoch": 4.33, "grad_norm": 0.5599634647369385, "learning_rate": 1.1338226990400904e-05, "loss": 0.3289, "step": 23010 }, { "epoch": 4.33, "grad_norm": 21.056047439575195, "learning_rate": 1.1334462638810466e-05, "loss": 0.4886, "step": 23020 }, { "epoch": 4.33, "grad_norm": 19.576793670654297, "learning_rate": 1.1330698287220027e-05, "loss": 0.584, "step": 23030 }, { "epoch": 4.34, "grad_norm": 8.226250648498535, "learning_rate": 1.132693393562959e-05, "loss": 0.7993, "step": 23040 }, { "epoch": 4.34, "grad_norm": 7.203392028808594, "learning_rate": 1.1323169584039152e-05, "loss": 0.5483, "step": 23050 }, { "epoch": 4.34, "grad_norm": 27.118764877319336, "learning_rate": 1.1319405232448713e-05, "loss": 0.6039, "step": 23060 }, { "epoch": 4.34, "grad_norm": 8.184642791748047, "learning_rate": 1.1315640880858275e-05, "loss": 0.5206, "step": 23070 }, { "epoch": 4.34, "grad_norm": 1.8194717168807983, "learning_rate": 1.1311876529267836e-05, "loss": 0.7129, "step": 23080 }, { "epoch": 4.35, "grad_norm": 7.509031772613525, "learning_rate": 1.1308112177677396e-05, "loss": 0.7681, "step": 23090 }, { "epoch": 4.35, "grad_norm": 19.41481590270996, "learning_rate": 1.1304347826086957e-05, "loss": 0.5878, "step": 23100 }, { "epoch": 4.35, "grad_norm": 9.031009674072266, "learning_rate": 1.1300583474496519e-05, "loss": 0.4608, "step": 23110 }, { "epoch": 4.35, "grad_norm": 3.5254902839660645, "learning_rate": 1.129681912290608e-05, "loss": 0.6231, "step": 23120 }, { "epoch": 4.35, "grad_norm": 14.69057559967041, "learning_rate": 1.1293054771315642e-05, "loss": 0.7666, "step": 23130 }, { "epoch": 4.36, "grad_norm": 26.575754165649414, "learning_rate": 1.1289290419725203e-05, "loss": 0.5632, "step": 23140 }, { "epoch": 4.36, "grad_norm": 12.611248016357422, "learning_rate": 1.1285526068134765e-05, "loss": 0.6975, "step": 23150 }, { "epoch": 4.36, "grad_norm": 19.788602828979492, "learning_rate": 1.1281761716544326e-05, "loss": 0.6446, "step": 23160 }, { "epoch": 4.36, "grad_norm": 20.97464370727539, "learning_rate": 1.1277997364953887e-05, "loss": 0.7142, "step": 23170 }, { "epoch": 4.36, "grad_norm": 11.11438274383545, "learning_rate": 1.1274233013363449e-05, "loss": 0.4329, "step": 23180 }, { "epoch": 4.36, "grad_norm": 6.7174296379089355, "learning_rate": 1.127046866177301e-05, "loss": 0.642, "step": 23190 }, { "epoch": 4.37, "grad_norm": 12.077559471130371, "learning_rate": 1.1266704310182572e-05, "loss": 0.6088, "step": 23200 }, { "epoch": 4.37, "grad_norm": 0.6835032105445862, "learning_rate": 1.1262939958592133e-05, "loss": 0.7096, "step": 23210 }, { "epoch": 4.37, "grad_norm": 13.644143104553223, "learning_rate": 1.1259175607001695e-05, "loss": 0.5963, "step": 23220 }, { "epoch": 4.37, "grad_norm": 17.367053985595703, "learning_rate": 1.1255411255411258e-05, "loss": 0.5982, "step": 23230 }, { "epoch": 4.37, "grad_norm": 19.097530364990234, "learning_rate": 1.125164690382082e-05, "loss": 0.7949, "step": 23240 }, { "epoch": 4.38, "grad_norm": 11.06638240814209, "learning_rate": 1.124788255223038e-05, "loss": 0.5742, "step": 23250 }, { "epoch": 4.38, "grad_norm": 6.701301097869873, "learning_rate": 1.1244118200639939e-05, "loss": 0.3653, "step": 23260 }, { "epoch": 4.38, "grad_norm": 7.139585018157959, "learning_rate": 1.1240353849049502e-05, "loss": 0.2213, "step": 23270 }, { "epoch": 4.38, "grad_norm": 15.305197715759277, "learning_rate": 1.1236589497459064e-05, "loss": 0.4505, "step": 23280 }, { "epoch": 4.38, "grad_norm": 9.9895601272583, "learning_rate": 1.1232825145868625e-05, "loss": 0.7347, "step": 23290 }, { "epoch": 4.39, "grad_norm": 30.742244720458984, "learning_rate": 1.1229060794278186e-05, "loss": 0.7636, "step": 23300 }, { "epoch": 4.39, "grad_norm": 24.35445785522461, "learning_rate": 1.1225296442687748e-05, "loss": 0.7456, "step": 23310 }, { "epoch": 4.39, "grad_norm": 24.246612548828125, "learning_rate": 1.122153209109731e-05, "loss": 0.6049, "step": 23320 }, { "epoch": 4.39, "grad_norm": 33.06283950805664, "learning_rate": 1.121776773950687e-05, "loss": 0.5307, "step": 23330 }, { "epoch": 4.39, "grad_norm": 25.850997924804688, "learning_rate": 1.1214003387916432e-05, "loss": 0.7956, "step": 23340 }, { "epoch": 4.39, "grad_norm": 11.392745018005371, "learning_rate": 1.1210239036325994e-05, "loss": 0.6775, "step": 23350 }, { "epoch": 4.4, "grad_norm": 10.8051118850708, "learning_rate": 1.1206474684735555e-05, "loss": 0.4612, "step": 23360 }, { "epoch": 4.4, "grad_norm": 32.45104217529297, "learning_rate": 1.1202710333145117e-05, "loss": 0.6303, "step": 23370 }, { "epoch": 4.4, "grad_norm": 5.765473365783691, "learning_rate": 1.1198945981554678e-05, "loss": 0.7854, "step": 23380 }, { "epoch": 4.4, "grad_norm": 0.4003254771232605, "learning_rate": 1.119518162996424e-05, "loss": 0.5774, "step": 23390 }, { "epoch": 4.4, "grad_norm": 8.617371559143066, "learning_rate": 1.1191417278373801e-05, "loss": 1.0461, "step": 23400 }, { "epoch": 4.41, "grad_norm": 17.002267837524414, "learning_rate": 1.1187652926783363e-05, "loss": 0.7655, "step": 23410 }, { "epoch": 4.41, "grad_norm": 20.015209197998047, "learning_rate": 1.1183888575192926e-05, "loss": 0.6856, "step": 23420 }, { "epoch": 4.41, "grad_norm": 5.477625846862793, "learning_rate": 1.1180124223602484e-05, "loss": 0.7369, "step": 23430 }, { "epoch": 4.41, "grad_norm": 9.53620719909668, "learning_rate": 1.1176359872012045e-05, "loss": 0.4399, "step": 23440 }, { "epoch": 4.41, "grad_norm": 3.9531502723693848, "learning_rate": 1.1172595520421607e-05, "loss": 0.9006, "step": 23450 }, { "epoch": 4.42, "grad_norm": 5.668755054473877, "learning_rate": 1.116883116883117e-05, "loss": 0.432, "step": 23460 }, { "epoch": 4.42, "grad_norm": 36.14069366455078, "learning_rate": 1.1165066817240731e-05, "loss": 0.7544, "step": 23470 }, { "epoch": 4.42, "grad_norm": 9.506393432617188, "learning_rate": 1.1161302465650293e-05, "loss": 0.7204, "step": 23480 }, { "epoch": 4.42, "grad_norm": 22.080291748046875, "learning_rate": 1.1157538114059854e-05, "loss": 0.5264, "step": 23490 }, { "epoch": 4.42, "grad_norm": 6.617203712463379, "learning_rate": 1.1153773762469416e-05, "loss": 0.5839, "step": 23500 }, { "epoch": 4.42, "grad_norm": 13.228835105895996, "learning_rate": 1.1150009410878977e-05, "loss": 0.7058, "step": 23510 }, { "epoch": 4.43, "grad_norm": 0.6176514625549316, "learning_rate": 1.1146245059288539e-05, "loss": 0.4557, "step": 23520 }, { "epoch": 4.43, "grad_norm": 0.5672368407249451, "learning_rate": 1.11424807076981e-05, "loss": 0.4541, "step": 23530 }, { "epoch": 4.43, "grad_norm": 2.1428656578063965, "learning_rate": 1.1138716356107661e-05, "loss": 0.7391, "step": 23540 }, { "epoch": 4.43, "grad_norm": 10.963018417358398, "learning_rate": 1.1134952004517223e-05, "loss": 0.5223, "step": 23550 }, { "epoch": 4.43, "grad_norm": 9.472241401672363, "learning_rate": 1.1131187652926784e-05, "loss": 0.5409, "step": 23560 }, { "epoch": 4.44, "grad_norm": 18.68202781677246, "learning_rate": 1.1127423301336346e-05, "loss": 0.6596, "step": 23570 }, { "epoch": 4.44, "grad_norm": 4.82119083404541, "learning_rate": 1.1123658949745907e-05, "loss": 0.6226, "step": 23580 }, { "epoch": 4.44, "grad_norm": 5.042635917663574, "learning_rate": 1.1119894598155469e-05, "loss": 0.6738, "step": 23590 }, { "epoch": 4.44, "grad_norm": 12.62334156036377, "learning_rate": 1.1116130246565032e-05, "loss": 0.7358, "step": 23600 }, { "epoch": 4.44, "grad_norm": 15.830984115600586, "learning_rate": 1.111236589497459e-05, "loss": 0.7408, "step": 23610 }, { "epoch": 4.45, "grad_norm": 9.996474266052246, "learning_rate": 1.1108601543384151e-05, "loss": 0.3432, "step": 23620 }, { "epoch": 4.45, "grad_norm": 42.45836639404297, "learning_rate": 1.1104837191793713e-05, "loss": 0.7615, "step": 23630 }, { "epoch": 4.45, "grad_norm": 32.250850677490234, "learning_rate": 1.1101072840203274e-05, "loss": 0.4661, "step": 23640 }, { "epoch": 4.45, "grad_norm": 24.006662368774414, "learning_rate": 1.1097308488612838e-05, "loss": 0.7126, "step": 23650 }, { "epoch": 4.45, "grad_norm": 7.966846942901611, "learning_rate": 1.1093544137022399e-05, "loss": 0.7873, "step": 23660 }, { "epoch": 4.46, "grad_norm": 16.041297912597656, "learning_rate": 1.108977978543196e-05, "loss": 0.5328, "step": 23670 }, { "epoch": 4.46, "grad_norm": 12.257124900817871, "learning_rate": 1.1086015433841522e-05, "loss": 0.549, "step": 23680 }, { "epoch": 4.46, "grad_norm": 9.333507537841797, "learning_rate": 1.1082251082251083e-05, "loss": 0.4787, "step": 23690 }, { "epoch": 4.46, "grad_norm": 15.94845199584961, "learning_rate": 1.1078486730660645e-05, "loss": 0.5384, "step": 23700 }, { "epoch": 4.46, "grad_norm": 15.848910331726074, "learning_rate": 1.1074722379070206e-05, "loss": 0.6707, "step": 23710 }, { "epoch": 4.46, "grad_norm": 12.358846664428711, "learning_rate": 1.1070958027479768e-05, "loss": 0.6222, "step": 23720 }, { "epoch": 4.47, "grad_norm": 16.612165451049805, "learning_rate": 1.106719367588933e-05, "loss": 0.4296, "step": 23730 }, { "epoch": 4.47, "grad_norm": 7.621057987213135, "learning_rate": 1.106342932429889e-05, "loss": 0.4142, "step": 23740 }, { "epoch": 4.47, "grad_norm": 26.88300895690918, "learning_rate": 1.1059664972708452e-05, "loss": 0.3773, "step": 23750 }, { "epoch": 4.47, "grad_norm": 14.327881813049316, "learning_rate": 1.1055900621118014e-05, "loss": 0.5598, "step": 23760 }, { "epoch": 4.47, "grad_norm": 7.642137050628662, "learning_rate": 1.1052136269527575e-05, "loss": 0.5747, "step": 23770 }, { "epoch": 4.48, "grad_norm": 11.110989570617676, "learning_rate": 1.1048371917937135e-05, "loss": 0.4744, "step": 23780 }, { "epoch": 4.48, "grad_norm": 11.560811042785645, "learning_rate": 1.1044607566346696e-05, "loss": 0.7039, "step": 23790 }, { "epoch": 4.48, "grad_norm": 17.159273147583008, "learning_rate": 1.1040843214756258e-05, "loss": 0.7643, "step": 23800 }, { "epoch": 4.48, "grad_norm": 15.362770080566406, "learning_rate": 1.103707886316582e-05, "loss": 0.6629, "step": 23810 }, { "epoch": 4.48, "grad_norm": 0.8261994123458862, "learning_rate": 1.103331451157538e-05, "loss": 0.5071, "step": 23820 }, { "epoch": 4.49, "grad_norm": 6.509442329406738, "learning_rate": 1.1029550159984944e-05, "loss": 0.4378, "step": 23830 }, { "epoch": 4.49, "grad_norm": 15.329565048217773, "learning_rate": 1.1025785808394505e-05, "loss": 0.5858, "step": 23840 }, { "epoch": 4.49, "grad_norm": 5.65818977355957, "learning_rate": 1.1022021456804067e-05, "loss": 0.4484, "step": 23850 }, { "epoch": 4.49, "grad_norm": 4.112398624420166, "learning_rate": 1.1018257105213628e-05, "loss": 0.4658, "step": 23860 }, { "epoch": 4.49, "grad_norm": 12.523710250854492, "learning_rate": 1.101449275362319e-05, "loss": 0.5546, "step": 23870 }, { "epoch": 4.49, "grad_norm": 0.8319380879402161, "learning_rate": 1.1010728402032751e-05, "loss": 0.4997, "step": 23880 }, { "epoch": 4.5, "grad_norm": 17.31648063659668, "learning_rate": 1.1006964050442313e-05, "loss": 0.4323, "step": 23890 }, { "epoch": 4.5, "grad_norm": 34.52608871459961, "learning_rate": 1.1003199698851874e-05, "loss": 0.544, "step": 23900 }, { "epoch": 4.5, "grad_norm": 9.72608757019043, "learning_rate": 1.0999435347261436e-05, "loss": 0.6388, "step": 23910 }, { "epoch": 4.5, "grad_norm": 17.97516441345215, "learning_rate": 1.0995670995670997e-05, "loss": 0.8557, "step": 23920 }, { "epoch": 4.5, "grad_norm": 0.5164489150047302, "learning_rate": 1.0991906644080558e-05, "loss": 0.7708, "step": 23930 }, { "epoch": 4.51, "grad_norm": 4.0153117179870605, "learning_rate": 1.098814229249012e-05, "loss": 0.6138, "step": 23940 }, { "epoch": 4.51, "grad_norm": 1.1666315793991089, "learning_rate": 1.0984377940899681e-05, "loss": 0.7278, "step": 23950 }, { "epoch": 4.51, "grad_norm": 9.93372917175293, "learning_rate": 1.0980613589309241e-05, "loss": 0.4048, "step": 23960 }, { "epoch": 4.51, "grad_norm": 21.520671844482422, "learning_rate": 1.0976849237718803e-05, "loss": 0.6345, "step": 23970 }, { "epoch": 4.51, "grad_norm": 3.4470133781433105, "learning_rate": 1.0973084886128364e-05, "loss": 1.0712, "step": 23980 }, { "epoch": 4.52, "grad_norm": 29.81308364868164, "learning_rate": 1.0969320534537926e-05, "loss": 0.4262, "step": 23990 }, { "epoch": 4.52, "grad_norm": 4.830698490142822, "learning_rate": 1.0965556182947487e-05, "loss": 0.7396, "step": 24000 }, { "epoch": 4.52, "grad_norm": 9.33254623413086, "learning_rate": 1.0961791831357048e-05, "loss": 0.8639, "step": 24010 }, { "epoch": 4.52, "grad_norm": 14.065550804138184, "learning_rate": 1.0958027479766612e-05, "loss": 0.623, "step": 24020 }, { "epoch": 4.52, "grad_norm": 11.378796577453613, "learning_rate": 1.0954263128176173e-05, "loss": 0.581, "step": 24030 }, { "epoch": 4.52, "grad_norm": 13.996289253234863, "learning_rate": 1.0950498776585735e-05, "loss": 0.5295, "step": 24040 }, { "epoch": 4.53, "grad_norm": 8.590402603149414, "learning_rate": 1.0946734424995296e-05, "loss": 0.7987, "step": 24050 }, { "epoch": 4.53, "grad_norm": 11.498170852661133, "learning_rate": 1.0942970073404857e-05, "loss": 0.6092, "step": 24060 }, { "epoch": 4.53, "grad_norm": 10.157093048095703, "learning_rate": 1.0939205721814419e-05, "loss": 0.3375, "step": 24070 }, { "epoch": 4.53, "grad_norm": 10.8042573928833, "learning_rate": 1.093544137022398e-05, "loss": 0.7559, "step": 24080 }, { "epoch": 4.53, "grad_norm": 0.13831891119480133, "learning_rate": 1.0931677018633542e-05, "loss": 0.655, "step": 24090 }, { "epoch": 4.54, "grad_norm": 21.07738494873047, "learning_rate": 1.0927912667043103e-05, "loss": 0.8755, "step": 24100 }, { "epoch": 4.54, "grad_norm": 25.695125579833984, "learning_rate": 1.0924148315452665e-05, "loss": 0.4189, "step": 24110 }, { "epoch": 4.54, "grad_norm": 23.678091049194336, "learning_rate": 1.0920383963862226e-05, "loss": 0.6966, "step": 24120 }, { "epoch": 4.54, "grad_norm": 17.928577423095703, "learning_rate": 1.0916619612271786e-05, "loss": 0.4332, "step": 24130 }, { "epoch": 4.54, "grad_norm": 40.44157409667969, "learning_rate": 1.0912855260681347e-05, "loss": 0.4427, "step": 24140 }, { "epoch": 4.55, "grad_norm": 4.393035411834717, "learning_rate": 1.0909090909090909e-05, "loss": 0.5062, "step": 24150 }, { "epoch": 4.55, "grad_norm": 46.50747299194336, "learning_rate": 1.090532655750047e-05, "loss": 0.7385, "step": 24160 }, { "epoch": 4.55, "grad_norm": 0.6289504170417786, "learning_rate": 1.0901562205910032e-05, "loss": 0.3068, "step": 24170 }, { "epoch": 4.55, "grad_norm": 26.921764373779297, "learning_rate": 1.0897797854319593e-05, "loss": 0.6155, "step": 24180 }, { "epoch": 4.55, "grad_norm": 38.250587463378906, "learning_rate": 1.0894033502729155e-05, "loss": 0.576, "step": 24190 }, { "epoch": 4.55, "grad_norm": 2.182755470275879, "learning_rate": 1.0890269151138718e-05, "loss": 0.5502, "step": 24200 }, { "epoch": 4.56, "grad_norm": 8.932013511657715, "learning_rate": 1.088650479954828e-05, "loss": 0.4656, "step": 24210 }, { "epoch": 4.56, "grad_norm": 8.519400596618652, "learning_rate": 1.088274044795784e-05, "loss": 0.3739, "step": 24220 }, { "epoch": 4.56, "grad_norm": 21.846633911132812, "learning_rate": 1.0878976096367402e-05, "loss": 0.5867, "step": 24230 }, { "epoch": 4.56, "grad_norm": 19.62921142578125, "learning_rate": 1.0875211744776964e-05, "loss": 0.5858, "step": 24240 }, { "epoch": 4.56, "grad_norm": 60.666866302490234, "learning_rate": 1.0871447393186525e-05, "loss": 0.6691, "step": 24250 }, { "epoch": 4.57, "grad_norm": 12.690692901611328, "learning_rate": 1.0867683041596087e-05, "loss": 0.5031, "step": 24260 }, { "epoch": 4.57, "grad_norm": 22.332693099975586, "learning_rate": 1.0863918690005648e-05, "loss": 0.6551, "step": 24270 }, { "epoch": 4.57, "grad_norm": 8.846254348754883, "learning_rate": 1.086015433841521e-05, "loss": 0.5339, "step": 24280 }, { "epoch": 4.57, "grad_norm": 30.030580520629883, "learning_rate": 1.0856389986824771e-05, "loss": 0.8483, "step": 24290 }, { "epoch": 4.57, "grad_norm": 30.104228973388672, "learning_rate": 1.0852625635234333e-05, "loss": 0.4223, "step": 24300 }, { "epoch": 4.58, "grad_norm": 26.719762802124023, "learning_rate": 1.0848861283643892e-05, "loss": 0.5728, "step": 24310 }, { "epoch": 4.58, "grad_norm": 8.01773452758789, "learning_rate": 1.0845096932053454e-05, "loss": 0.7319, "step": 24320 }, { "epoch": 4.58, "grad_norm": 17.984601974487305, "learning_rate": 1.0841332580463015e-05, "loss": 0.6263, "step": 24330 }, { "epoch": 4.58, "grad_norm": 1.179288625717163, "learning_rate": 1.0837568228872577e-05, "loss": 0.2777, "step": 24340 }, { "epoch": 4.58, "grad_norm": 17.420080184936523, "learning_rate": 1.0833803877282138e-05, "loss": 0.3534, "step": 24350 }, { "epoch": 4.58, "grad_norm": 20.7567138671875, "learning_rate": 1.08300395256917e-05, "loss": 1.0827, "step": 24360 }, { "epoch": 4.59, "grad_norm": 28.857418060302734, "learning_rate": 1.0826275174101261e-05, "loss": 0.6296, "step": 24370 }, { "epoch": 4.59, "grad_norm": 20.736568450927734, "learning_rate": 1.0822510822510823e-05, "loss": 0.9884, "step": 24380 }, { "epoch": 4.59, "grad_norm": 22.843528747558594, "learning_rate": 1.0818746470920386e-05, "loss": 0.4609, "step": 24390 }, { "epoch": 4.59, "grad_norm": 14.99477767944336, "learning_rate": 1.0814982119329947e-05, "loss": 0.7755, "step": 24400 }, { "epoch": 4.59, "grad_norm": 13.172017097473145, "learning_rate": 1.0811217767739509e-05, "loss": 0.4842, "step": 24410 }, { "epoch": 4.6, "grad_norm": 15.066924095153809, "learning_rate": 1.080745341614907e-05, "loss": 0.4145, "step": 24420 }, { "epoch": 4.6, "grad_norm": 0.11462697386741638, "learning_rate": 1.0803689064558631e-05, "loss": 0.5677, "step": 24430 }, { "epoch": 4.6, "grad_norm": 0.14985047280788422, "learning_rate": 1.0799924712968193e-05, "loss": 0.4298, "step": 24440 }, { "epoch": 4.6, "grad_norm": 13.569608688354492, "learning_rate": 1.0796160361377754e-05, "loss": 0.5217, "step": 24450 }, { "epoch": 4.6, "grad_norm": 9.481203079223633, "learning_rate": 1.0792396009787316e-05, "loss": 0.8526, "step": 24460 }, { "epoch": 4.61, "grad_norm": 4.702198028564453, "learning_rate": 1.0788631658196877e-05, "loss": 0.5461, "step": 24470 }, { "epoch": 4.61, "grad_norm": 10.113847732543945, "learning_rate": 1.0784867306606437e-05, "loss": 0.7229, "step": 24480 }, { "epoch": 4.61, "grad_norm": 9.467721939086914, "learning_rate": 1.0781102955015999e-05, "loss": 0.6535, "step": 24490 }, { "epoch": 4.61, "grad_norm": 18.867504119873047, "learning_rate": 1.077733860342556e-05, "loss": 0.7685, "step": 24500 }, { "epoch": 4.61, "grad_norm": 8.213113784790039, "learning_rate": 1.0773574251835121e-05, "loss": 0.7004, "step": 24510 }, { "epoch": 4.62, "grad_norm": 2.475691795349121, "learning_rate": 1.0769809900244683e-05, "loss": 0.6569, "step": 24520 }, { "epoch": 4.62, "grad_norm": 17.931053161621094, "learning_rate": 1.0766045548654244e-05, "loss": 0.6393, "step": 24530 }, { "epoch": 4.62, "grad_norm": 8.332486152648926, "learning_rate": 1.0762281197063806e-05, "loss": 0.6192, "step": 24540 }, { "epoch": 4.62, "grad_norm": 0.999433159828186, "learning_rate": 1.0758516845473367e-05, "loss": 0.5856, "step": 24550 }, { "epoch": 4.62, "grad_norm": 5.174107074737549, "learning_rate": 1.0754752493882929e-05, "loss": 0.6075, "step": 24560 }, { "epoch": 4.62, "grad_norm": 9.829113006591797, "learning_rate": 1.0750988142292492e-05, "loss": 0.5431, "step": 24570 }, { "epoch": 4.63, "grad_norm": 41.149696350097656, "learning_rate": 1.0747223790702053e-05, "loss": 0.4506, "step": 24580 }, { "epoch": 4.63, "grad_norm": 15.11819839477539, "learning_rate": 1.0743459439111615e-05, "loss": 0.7719, "step": 24590 }, { "epoch": 4.63, "grad_norm": 7.7300124168396, "learning_rate": 1.0739695087521176e-05, "loss": 0.5971, "step": 24600 }, { "epoch": 4.63, "grad_norm": 20.490882873535156, "learning_rate": 1.0735930735930738e-05, "loss": 0.4271, "step": 24610 }, { "epoch": 4.63, "grad_norm": 7.432632923126221, "learning_rate": 1.07321663843403e-05, "loss": 0.7363, "step": 24620 }, { "epoch": 4.64, "grad_norm": 15.051981925964355, "learning_rate": 1.072840203274986e-05, "loss": 0.5787, "step": 24630 }, { "epoch": 4.64, "grad_norm": 7.251710414886475, "learning_rate": 1.0724637681159422e-05, "loss": 0.7642, "step": 24640 }, { "epoch": 4.64, "grad_norm": 17.7536678314209, "learning_rate": 1.0720873329568982e-05, "loss": 0.6854, "step": 24650 }, { "epoch": 4.64, "grad_norm": 16.003955841064453, "learning_rate": 1.0717108977978543e-05, "loss": 0.409, "step": 24660 }, { "epoch": 4.64, "grad_norm": 4.870722770690918, "learning_rate": 1.0713344626388105e-05, "loss": 0.7751, "step": 24670 }, { "epoch": 4.65, "grad_norm": 17.760967254638672, "learning_rate": 1.0709580274797666e-05, "loss": 0.3988, "step": 24680 }, { "epoch": 4.65, "grad_norm": 1.264788031578064, "learning_rate": 1.0705815923207228e-05, "loss": 0.5068, "step": 24690 }, { "epoch": 4.65, "grad_norm": 18.01626205444336, "learning_rate": 1.070205157161679e-05, "loss": 0.4463, "step": 24700 }, { "epoch": 4.65, "grad_norm": 6.05813455581665, "learning_rate": 1.069828722002635e-05, "loss": 0.6254, "step": 24710 }, { "epoch": 4.65, "grad_norm": 25.01399803161621, "learning_rate": 1.0694522868435912e-05, "loss": 0.5808, "step": 24720 }, { "epoch": 4.65, "grad_norm": 5.349250316619873, "learning_rate": 1.0690758516845474e-05, "loss": 0.5773, "step": 24730 }, { "epoch": 4.66, "grad_norm": 0.2522965967655182, "learning_rate": 1.0686994165255035e-05, "loss": 0.3343, "step": 24740 }, { "epoch": 4.66, "grad_norm": 18.957963943481445, "learning_rate": 1.0683229813664597e-05, "loss": 0.6543, "step": 24750 }, { "epoch": 4.66, "grad_norm": 40.8657112121582, "learning_rate": 1.067946546207416e-05, "loss": 0.4904, "step": 24760 }, { "epoch": 4.66, "grad_norm": 20.800670623779297, "learning_rate": 1.0675701110483721e-05, "loss": 0.4968, "step": 24770 }, { "epoch": 4.66, "grad_norm": 7.221461296081543, "learning_rate": 1.0671936758893283e-05, "loss": 0.4546, "step": 24780 }, { "epoch": 4.67, "grad_norm": 2.7894575595855713, "learning_rate": 1.0668172407302844e-05, "loss": 0.4324, "step": 24790 }, { "epoch": 4.67, "grad_norm": 2.2078590393066406, "learning_rate": 1.0664408055712406e-05, "loss": 0.3639, "step": 24800 }, { "epoch": 4.67, "grad_norm": 13.342238426208496, "learning_rate": 1.0660643704121967e-05, "loss": 0.3975, "step": 24810 }, { "epoch": 4.67, "grad_norm": 15.394166946411133, "learning_rate": 1.0656879352531528e-05, "loss": 0.696, "step": 24820 }, { "epoch": 4.67, "grad_norm": 13.436577796936035, "learning_rate": 1.0653115000941088e-05, "loss": 0.8646, "step": 24830 }, { "epoch": 4.68, "grad_norm": 10.768505096435547, "learning_rate": 1.064935064935065e-05, "loss": 0.6416, "step": 24840 }, { "epoch": 4.68, "grad_norm": 21.5819034576416, "learning_rate": 1.0645586297760211e-05, "loss": 0.4017, "step": 24850 }, { "epoch": 4.68, "grad_norm": 4.942729473114014, "learning_rate": 1.0641821946169773e-05, "loss": 0.5493, "step": 24860 }, { "epoch": 4.68, "grad_norm": 28.3702392578125, "learning_rate": 1.0638057594579334e-05, "loss": 0.4549, "step": 24870 }, { "epoch": 4.68, "grad_norm": 18.81791877746582, "learning_rate": 1.0634293242988896e-05, "loss": 0.4436, "step": 24880 }, { "epoch": 4.68, "grad_norm": 14.989437103271484, "learning_rate": 1.0630528891398457e-05, "loss": 0.5157, "step": 24890 }, { "epoch": 4.69, "grad_norm": 10.192904472351074, "learning_rate": 1.0626764539808018e-05, "loss": 0.6498, "step": 24900 }, { "epoch": 4.69, "grad_norm": 11.987055778503418, "learning_rate": 1.062300018821758e-05, "loss": 0.5607, "step": 24910 }, { "epoch": 4.69, "grad_norm": 8.632203102111816, "learning_rate": 1.0619235836627141e-05, "loss": 0.7684, "step": 24920 }, { "epoch": 4.69, "grad_norm": 18.148733139038086, "learning_rate": 1.0615471485036703e-05, "loss": 0.4516, "step": 24930 }, { "epoch": 4.69, "grad_norm": 13.001415252685547, "learning_rate": 1.0611707133446264e-05, "loss": 0.6813, "step": 24940 }, { "epoch": 4.7, "grad_norm": 9.432195663452148, "learning_rate": 1.0607942781855827e-05, "loss": 0.7401, "step": 24950 }, { "epoch": 4.7, "grad_norm": 14.515077590942383, "learning_rate": 1.0604178430265389e-05, "loss": 0.411, "step": 24960 }, { "epoch": 4.7, "grad_norm": 7.635622024536133, "learning_rate": 1.060041407867495e-05, "loss": 0.55, "step": 24970 }, { "epoch": 4.7, "grad_norm": 21.15432357788086, "learning_rate": 1.0596649727084512e-05, "loss": 0.9953, "step": 24980 }, { "epoch": 4.7, "grad_norm": 12.600613594055176, "learning_rate": 1.0592885375494073e-05, "loss": 0.5475, "step": 24990 }, { "epoch": 4.71, "grad_norm": 15.378485679626465, "learning_rate": 1.0589121023903633e-05, "loss": 0.6201, "step": 25000 }, { "epoch": 4.71, "grad_norm": 16.676759719848633, "learning_rate": 1.0585356672313195e-05, "loss": 0.6947, "step": 25010 }, { "epoch": 4.71, "grad_norm": 9.91386604309082, "learning_rate": 1.0581592320722756e-05, "loss": 0.7228, "step": 25020 }, { "epoch": 4.71, "grad_norm": 21.984731674194336, "learning_rate": 1.0577827969132317e-05, "loss": 0.6805, "step": 25030 }, { "epoch": 4.71, "grad_norm": 0.7576162219047546, "learning_rate": 1.0574063617541879e-05, "loss": 0.4934, "step": 25040 }, { "epoch": 4.71, "grad_norm": 4.221212387084961, "learning_rate": 1.057029926595144e-05, "loss": 0.5943, "step": 25050 }, { "epoch": 4.72, "grad_norm": 11.483154296875, "learning_rate": 1.0566534914361002e-05, "loss": 0.3277, "step": 25060 }, { "epoch": 4.72, "grad_norm": 6.1331024169921875, "learning_rate": 1.0562770562770563e-05, "loss": 0.9332, "step": 25070 }, { "epoch": 4.72, "grad_norm": 1.2644256353378296, "learning_rate": 1.0559006211180125e-05, "loss": 0.894, "step": 25080 }, { "epoch": 4.72, "grad_norm": 4.269558906555176, "learning_rate": 1.0555241859589686e-05, "loss": 0.6306, "step": 25090 }, { "epoch": 4.72, "grad_norm": 17.586618423461914, "learning_rate": 1.0551477507999248e-05, "loss": 0.5255, "step": 25100 }, { "epoch": 4.73, "grad_norm": 0.7638131976127625, "learning_rate": 1.0547713156408809e-05, "loss": 0.6211, "step": 25110 }, { "epoch": 4.73, "grad_norm": 6.860489368438721, "learning_rate": 1.054394880481837e-05, "loss": 0.5254, "step": 25120 }, { "epoch": 4.73, "grad_norm": 3.1576449871063232, "learning_rate": 1.0540184453227934e-05, "loss": 0.7581, "step": 25130 }, { "epoch": 4.73, "grad_norm": 23.722333908081055, "learning_rate": 1.0536420101637495e-05, "loss": 0.3437, "step": 25140 }, { "epoch": 4.73, "grad_norm": 2.296964168548584, "learning_rate": 1.0532655750047057e-05, "loss": 0.4786, "step": 25150 }, { "epoch": 4.74, "grad_norm": 18.260684967041016, "learning_rate": 1.0528891398456618e-05, "loss": 0.8271, "step": 25160 }, { "epoch": 4.74, "grad_norm": 11.189855575561523, "learning_rate": 1.052512704686618e-05, "loss": 0.4455, "step": 25170 }, { "epoch": 4.74, "grad_norm": 25.472238540649414, "learning_rate": 1.052136269527574e-05, "loss": 0.5042, "step": 25180 }, { "epoch": 4.74, "grad_norm": 3.388648271560669, "learning_rate": 1.05175983436853e-05, "loss": 0.6337, "step": 25190 }, { "epoch": 4.74, "grad_norm": 6.780930042266846, "learning_rate": 1.0513833992094862e-05, "loss": 0.7147, "step": 25200 }, { "epoch": 4.74, "grad_norm": 28.922266006469727, "learning_rate": 1.0510069640504424e-05, "loss": 0.4112, "step": 25210 }, { "epoch": 4.75, "grad_norm": 14.36142349243164, "learning_rate": 1.0506305288913985e-05, "loss": 0.7538, "step": 25220 }, { "epoch": 4.75, "grad_norm": 16.995763778686523, "learning_rate": 1.0502540937323547e-05, "loss": 0.7889, "step": 25230 }, { "epoch": 4.75, "grad_norm": 0.24860745668411255, "learning_rate": 1.0498776585733108e-05, "loss": 0.4844, "step": 25240 }, { "epoch": 4.75, "grad_norm": 7.232578754425049, "learning_rate": 1.049501223414267e-05, "loss": 0.9642, "step": 25250 }, { "epoch": 4.75, "grad_norm": 6.162290573120117, "learning_rate": 1.0491247882552231e-05, "loss": 0.4363, "step": 25260 }, { "epoch": 4.76, "grad_norm": 4.756533145904541, "learning_rate": 1.0487483530961793e-05, "loss": 0.7687, "step": 25270 }, { "epoch": 4.76, "grad_norm": 5.880683422088623, "learning_rate": 1.0483719179371354e-05, "loss": 0.3593, "step": 25280 }, { "epoch": 4.76, "grad_norm": 12.597949981689453, "learning_rate": 1.0479954827780915e-05, "loss": 0.5685, "step": 25290 }, { "epoch": 4.76, "grad_norm": 4.098395824432373, "learning_rate": 1.0476190476190477e-05, "loss": 0.4729, "step": 25300 }, { "epoch": 4.76, "grad_norm": 30.62727928161621, "learning_rate": 1.0472426124600038e-05, "loss": 0.8119, "step": 25310 }, { "epoch": 4.77, "grad_norm": 18.5235595703125, "learning_rate": 1.0468661773009602e-05, "loss": 0.4133, "step": 25320 }, { "epoch": 4.77, "grad_norm": 26.45856285095215, "learning_rate": 1.0464897421419163e-05, "loss": 0.5182, "step": 25330 }, { "epoch": 4.77, "grad_norm": 16.52037811279297, "learning_rate": 1.0461133069828724e-05, "loss": 0.5143, "step": 25340 }, { "epoch": 4.77, "grad_norm": 0.0581025630235672, "learning_rate": 1.0457368718238282e-05, "loss": 0.6238, "step": 25350 }, { "epoch": 4.77, "grad_norm": 2.4997506141662598, "learning_rate": 1.0453604366647846e-05, "loss": 0.4463, "step": 25360 }, { "epoch": 4.78, "grad_norm": 11.980353355407715, "learning_rate": 1.0449840015057407e-05, "loss": 0.4376, "step": 25370 }, { "epoch": 4.78, "grad_norm": 14.3879976272583, "learning_rate": 1.0446075663466969e-05, "loss": 1.0135, "step": 25380 }, { "epoch": 4.78, "grad_norm": 20.742895126342773, "learning_rate": 1.044231131187653e-05, "loss": 0.5509, "step": 25390 }, { "epoch": 4.78, "grad_norm": 7.918363094329834, "learning_rate": 1.0438546960286091e-05, "loss": 0.6053, "step": 25400 }, { "epoch": 4.78, "grad_norm": 15.34104061126709, "learning_rate": 1.0434782608695653e-05, "loss": 0.7682, "step": 25410 }, { "epoch": 4.78, "grad_norm": 10.88797378540039, "learning_rate": 1.0431018257105214e-05, "loss": 0.6793, "step": 25420 }, { "epoch": 4.79, "grad_norm": 21.449735641479492, "learning_rate": 1.0427253905514776e-05, "loss": 0.7585, "step": 25430 }, { "epoch": 4.79, "grad_norm": 13.024362564086914, "learning_rate": 1.0423489553924337e-05, "loss": 0.6198, "step": 25440 }, { "epoch": 4.79, "grad_norm": 1.5722072124481201, "learning_rate": 1.0419725202333899e-05, "loss": 0.466, "step": 25450 }, { "epoch": 4.79, "grad_norm": 4.080865383148193, "learning_rate": 1.041596085074346e-05, "loss": 0.5095, "step": 25460 }, { "epoch": 4.79, "grad_norm": 23.28003692626953, "learning_rate": 1.0412196499153022e-05, "loss": 0.8175, "step": 25470 }, { "epoch": 4.8, "grad_norm": 8.450470924377441, "learning_rate": 1.0408432147562583e-05, "loss": 0.5053, "step": 25480 }, { "epoch": 4.8, "grad_norm": 18.8877010345459, "learning_rate": 1.0404667795972145e-05, "loss": 0.6964, "step": 25490 }, { "epoch": 4.8, "grad_norm": 4.335949420928955, "learning_rate": 1.0400903444381708e-05, "loss": 0.4675, "step": 25500 }, { "epoch": 4.8, "grad_norm": 6.426523685455322, "learning_rate": 1.039713909279127e-05, "loss": 0.6782, "step": 25510 }, { "epoch": 4.8, "grad_norm": 2.119450330734253, "learning_rate": 1.0393374741200827e-05, "loss": 0.4604, "step": 25520 }, { "epoch": 4.81, "grad_norm": 59.50654983520508, "learning_rate": 1.0389610389610389e-05, "loss": 0.6908, "step": 25530 }, { "epoch": 4.81, "grad_norm": 1.951970100402832, "learning_rate": 1.038584603801995e-05, "loss": 0.6409, "step": 25540 }, { "epoch": 4.81, "grad_norm": 1.0901389122009277, "learning_rate": 1.0382081686429513e-05, "loss": 0.5373, "step": 25550 }, { "epoch": 4.81, "grad_norm": 7.754991054534912, "learning_rate": 1.0378317334839075e-05, "loss": 0.4142, "step": 25560 }, { "epoch": 4.81, "grad_norm": 12.59786605834961, "learning_rate": 1.0374552983248636e-05, "loss": 0.544, "step": 25570 }, { "epoch": 4.81, "grad_norm": 32.1705436706543, "learning_rate": 1.0370788631658198e-05, "loss": 0.4405, "step": 25580 }, { "epoch": 4.82, "grad_norm": 12.26833438873291, "learning_rate": 1.036702428006776e-05, "loss": 0.9665, "step": 25590 }, { "epoch": 4.82, "grad_norm": 7.950512886047363, "learning_rate": 1.036325992847732e-05, "loss": 0.7382, "step": 25600 }, { "epoch": 4.82, "grad_norm": 14.049162864685059, "learning_rate": 1.0359495576886882e-05, "loss": 0.4283, "step": 25610 }, { "epoch": 4.82, "grad_norm": 19.716205596923828, "learning_rate": 1.0355731225296444e-05, "loss": 0.3962, "step": 25620 }, { "epoch": 4.82, "grad_norm": 10.237818717956543, "learning_rate": 1.0351966873706005e-05, "loss": 0.5062, "step": 25630 }, { "epoch": 4.83, "grad_norm": 27.731966018676758, "learning_rate": 1.0348202522115567e-05, "loss": 0.4489, "step": 25640 }, { "epoch": 4.83, "grad_norm": 2.626387596130371, "learning_rate": 1.0344438170525128e-05, "loss": 0.3363, "step": 25650 }, { "epoch": 4.83, "grad_norm": 23.135208129882812, "learning_rate": 1.034067381893469e-05, "loss": 0.7282, "step": 25660 }, { "epoch": 4.83, "grad_norm": 17.033761978149414, "learning_rate": 1.0336909467344251e-05, "loss": 0.5404, "step": 25670 }, { "epoch": 4.83, "grad_norm": 5.097890377044678, "learning_rate": 1.0333145115753812e-05, "loss": 1.1042, "step": 25680 }, { "epoch": 4.84, "grad_norm": 10.225035667419434, "learning_rate": 1.0329380764163376e-05, "loss": 0.4477, "step": 25690 }, { "epoch": 4.84, "grad_norm": 7.208095550537109, "learning_rate": 1.0325616412572934e-05, "loss": 0.699, "step": 25700 }, { "epoch": 4.84, "grad_norm": 9.896465301513672, "learning_rate": 1.0321852060982495e-05, "loss": 0.4655, "step": 25710 }, { "epoch": 4.84, "grad_norm": 12.159602165222168, "learning_rate": 1.0318087709392057e-05, "loss": 0.7253, "step": 25720 }, { "epoch": 4.84, "grad_norm": 0.5531998872756958, "learning_rate": 1.031432335780162e-05, "loss": 0.3742, "step": 25730 }, { "epoch": 4.84, "grad_norm": 8.790497779846191, "learning_rate": 1.0310559006211181e-05, "loss": 0.5695, "step": 25740 }, { "epoch": 4.85, "grad_norm": 3.4259729385375977, "learning_rate": 1.0306794654620743e-05, "loss": 0.7616, "step": 25750 }, { "epoch": 4.85, "grad_norm": 21.889638900756836, "learning_rate": 1.0303030303030304e-05, "loss": 0.6106, "step": 25760 }, { "epoch": 4.85, "grad_norm": 13.903435707092285, "learning_rate": 1.0299265951439866e-05, "loss": 0.6131, "step": 25770 }, { "epoch": 4.85, "grad_norm": 19.042236328125, "learning_rate": 1.0295501599849427e-05, "loss": 0.6028, "step": 25780 }, { "epoch": 4.85, "grad_norm": 25.542591094970703, "learning_rate": 1.0291737248258988e-05, "loss": 0.5859, "step": 25790 }, { "epoch": 4.86, "grad_norm": 12.444345474243164, "learning_rate": 1.028797289666855e-05, "loss": 0.6576, "step": 25800 }, { "epoch": 4.86, "grad_norm": 0.6395838260650635, "learning_rate": 1.0284208545078111e-05, "loss": 0.7174, "step": 25810 }, { "epoch": 4.86, "grad_norm": 13.468929290771484, "learning_rate": 1.0280444193487673e-05, "loss": 0.6934, "step": 25820 }, { "epoch": 4.86, "grad_norm": 16.97991180419922, "learning_rate": 1.0276679841897234e-05, "loss": 0.9544, "step": 25830 }, { "epoch": 4.86, "grad_norm": 22.151426315307617, "learning_rate": 1.0272915490306796e-05, "loss": 0.5222, "step": 25840 }, { "epoch": 4.87, "grad_norm": 9.480589866638184, "learning_rate": 1.0269151138716357e-05, "loss": 0.6157, "step": 25850 }, { "epoch": 4.87, "grad_norm": 15.794319152832031, "learning_rate": 1.0265386787125919e-05, "loss": 0.8519, "step": 25860 }, { "epoch": 4.87, "grad_norm": 9.91077709197998, "learning_rate": 1.0261622435535478e-05, "loss": 0.3128, "step": 25870 }, { "epoch": 4.87, "grad_norm": 10.460318565368652, "learning_rate": 1.025785808394504e-05, "loss": 0.7739, "step": 25880 }, { "epoch": 4.87, "grad_norm": 29.492027282714844, "learning_rate": 1.0254093732354601e-05, "loss": 0.6284, "step": 25890 }, { "epoch": 4.87, "grad_norm": 7.003443241119385, "learning_rate": 1.0250329380764163e-05, "loss": 0.4957, "step": 25900 }, { "epoch": 4.88, "grad_norm": 36.19068145751953, "learning_rate": 1.0246565029173724e-05, "loss": 0.5009, "step": 25910 }, { "epoch": 4.88, "grad_norm": 17.596288681030273, "learning_rate": 1.0242800677583287e-05, "loss": 0.5891, "step": 25920 }, { "epoch": 4.88, "grad_norm": 34.26020812988281, "learning_rate": 1.0239036325992849e-05, "loss": 0.767, "step": 25930 }, { "epoch": 4.88, "grad_norm": 0.572287380695343, "learning_rate": 1.023527197440241e-05, "loss": 0.4655, "step": 25940 }, { "epoch": 4.88, "grad_norm": 11.611014366149902, "learning_rate": 1.0231507622811972e-05, "loss": 0.6033, "step": 25950 }, { "epoch": 4.89, "grad_norm": 5.741128444671631, "learning_rate": 1.0227743271221533e-05, "loss": 0.2666, "step": 25960 }, { "epoch": 4.89, "grad_norm": 27.423297882080078, "learning_rate": 1.0223978919631095e-05, "loss": 0.5137, "step": 25970 }, { "epoch": 4.89, "grad_norm": 7.721019744873047, "learning_rate": 1.0220214568040656e-05, "loss": 0.8591, "step": 25980 }, { "epoch": 4.89, "grad_norm": 5.386199474334717, "learning_rate": 1.0216450216450218e-05, "loss": 0.5424, "step": 25990 }, { "epoch": 4.89, "grad_norm": 16.02313804626465, "learning_rate": 1.0212685864859779e-05, "loss": 0.9359, "step": 26000 }, { "epoch": 4.9, "grad_norm": 28.932086944580078, "learning_rate": 1.020892151326934e-05, "loss": 0.4697, "step": 26010 }, { "epoch": 4.9, "grad_norm": 7.247927665710449, "learning_rate": 1.0205157161678902e-05, "loss": 0.5748, "step": 26020 }, { "epoch": 4.9, "grad_norm": 0.293780118227005, "learning_rate": 1.0201392810088464e-05, "loss": 0.4207, "step": 26030 }, { "epoch": 4.9, "grad_norm": 9.770951271057129, "learning_rate": 1.0197628458498025e-05, "loss": 0.6643, "step": 26040 }, { "epoch": 4.9, "grad_norm": 17.128610610961914, "learning_rate": 1.0193864106907585e-05, "loss": 0.4047, "step": 26050 }, { "epoch": 4.9, "grad_norm": 20.82026481628418, "learning_rate": 1.0190099755317146e-05, "loss": 0.7626, "step": 26060 }, { "epoch": 4.91, "grad_norm": 9.654380798339844, "learning_rate": 1.0186335403726708e-05, "loss": 0.4447, "step": 26070 }, { "epoch": 4.91, "grad_norm": 19.486652374267578, "learning_rate": 1.0182571052136269e-05, "loss": 0.4849, "step": 26080 }, { "epoch": 4.91, "grad_norm": 15.955955505371094, "learning_rate": 1.017880670054583e-05, "loss": 0.7018, "step": 26090 }, { "epoch": 4.91, "grad_norm": 6.0235915184021, "learning_rate": 1.0175042348955392e-05, "loss": 0.5864, "step": 26100 }, { "epoch": 4.91, "grad_norm": 41.60502624511719, "learning_rate": 1.0171277997364955e-05, "loss": 0.9113, "step": 26110 }, { "epoch": 4.92, "grad_norm": 10.696873664855957, "learning_rate": 1.0167513645774517e-05, "loss": 0.4178, "step": 26120 }, { "epoch": 4.92, "grad_norm": 27.66557502746582, "learning_rate": 1.0163749294184078e-05, "loss": 0.5074, "step": 26130 }, { "epoch": 4.92, "grad_norm": 4.632658958435059, "learning_rate": 1.015998494259364e-05, "loss": 0.3351, "step": 26140 }, { "epoch": 4.92, "grad_norm": 26.457387924194336, "learning_rate": 1.0156220591003201e-05, "loss": 1.3971, "step": 26150 }, { "epoch": 4.92, "grad_norm": 3.868096351623535, "learning_rate": 1.0152456239412763e-05, "loss": 0.5699, "step": 26160 }, { "epoch": 4.93, "grad_norm": 1.3630057573318481, "learning_rate": 1.0148691887822324e-05, "loss": 0.4219, "step": 26170 }, { "epoch": 4.93, "grad_norm": 34.00837326049805, "learning_rate": 1.0144927536231885e-05, "loss": 0.8625, "step": 26180 }, { "epoch": 4.93, "grad_norm": 2.9621551036834717, "learning_rate": 1.0141163184641447e-05, "loss": 0.6725, "step": 26190 }, { "epoch": 4.93, "grad_norm": 0.44564133882522583, "learning_rate": 1.0137398833051008e-05, "loss": 0.6897, "step": 26200 }, { "epoch": 4.93, "grad_norm": 10.090829849243164, "learning_rate": 1.013363448146057e-05, "loss": 0.7626, "step": 26210 }, { "epoch": 4.94, "grad_norm": 10.721175193786621, "learning_rate": 1.012987012987013e-05, "loss": 0.7807, "step": 26220 }, { "epoch": 4.94, "grad_norm": 8.438685417175293, "learning_rate": 1.0126105778279691e-05, "loss": 0.6646, "step": 26230 }, { "epoch": 4.94, "grad_norm": 20.982872009277344, "learning_rate": 1.0122341426689252e-05, "loss": 0.7103, "step": 26240 }, { "epoch": 4.94, "grad_norm": 24.98394012451172, "learning_rate": 1.0118577075098814e-05, "loss": 0.658, "step": 26250 }, { "epoch": 4.94, "grad_norm": 10.647464752197266, "learning_rate": 1.0114812723508375e-05, "loss": 0.5797, "step": 26260 }, { "epoch": 4.94, "grad_norm": 5.082032203674316, "learning_rate": 1.0111048371917937e-05, "loss": 0.3246, "step": 26270 }, { "epoch": 4.95, "grad_norm": 34.345130920410156, "learning_rate": 1.0107284020327498e-05, "loss": 0.7128, "step": 26280 }, { "epoch": 4.95, "grad_norm": 1.867757797241211, "learning_rate": 1.0103519668737061e-05, "loss": 0.4022, "step": 26290 }, { "epoch": 4.95, "grad_norm": 17.364198684692383, "learning_rate": 1.0099755317146623e-05, "loss": 0.4057, "step": 26300 }, { "epoch": 4.95, "grad_norm": 0.3039032518863678, "learning_rate": 1.0095990965556184e-05, "loss": 0.6772, "step": 26310 }, { "epoch": 4.95, "grad_norm": 7.0659050941467285, "learning_rate": 1.0092226613965746e-05, "loss": 0.4634, "step": 26320 }, { "epoch": 4.96, "grad_norm": 6.5546135902404785, "learning_rate": 1.0088462262375307e-05, "loss": 0.6775, "step": 26330 }, { "epoch": 4.96, "grad_norm": 19.550182342529297, "learning_rate": 1.0084697910784869e-05, "loss": 0.3757, "step": 26340 }, { "epoch": 4.96, "grad_norm": 10.655951499938965, "learning_rate": 1.008093355919443e-05, "loss": 0.5657, "step": 26350 }, { "epoch": 4.96, "grad_norm": 7.180771350860596, "learning_rate": 1.0077169207603992e-05, "loss": 0.7091, "step": 26360 }, { "epoch": 4.96, "grad_norm": 10.868316650390625, "learning_rate": 1.0073404856013553e-05, "loss": 0.6567, "step": 26370 }, { "epoch": 4.97, "grad_norm": 43.5999641418457, "learning_rate": 1.0069640504423115e-05, "loss": 0.4698, "step": 26380 }, { "epoch": 4.97, "grad_norm": 12.528432846069336, "learning_rate": 1.0065876152832676e-05, "loss": 0.7588, "step": 26390 }, { "epoch": 4.97, "grad_norm": 13.703747749328613, "learning_rate": 1.0062111801242236e-05, "loss": 0.7794, "step": 26400 }, { "epoch": 4.97, "grad_norm": 0.5737854838371277, "learning_rate": 1.0058347449651797e-05, "loss": 0.6142, "step": 26410 }, { "epoch": 4.97, "grad_norm": 15.537364959716797, "learning_rate": 1.0054583098061359e-05, "loss": 0.585, "step": 26420 }, { "epoch": 4.97, "grad_norm": 14.01577377319336, "learning_rate": 1.005081874647092e-05, "loss": 0.7088, "step": 26430 }, { "epoch": 4.98, "grad_norm": 5.803639888763428, "learning_rate": 1.0047054394880482e-05, "loss": 0.7963, "step": 26440 }, { "epoch": 4.98, "grad_norm": 16.50397300720215, "learning_rate": 1.0043290043290043e-05, "loss": 0.5466, "step": 26450 }, { "epoch": 4.98, "grad_norm": 10.755285263061523, "learning_rate": 1.0039525691699605e-05, "loss": 0.4802, "step": 26460 }, { "epoch": 4.98, "grad_norm": 4.0640869140625, "learning_rate": 1.0035761340109166e-05, "loss": 0.8407, "step": 26470 }, { "epoch": 4.98, "grad_norm": 10.815974235534668, "learning_rate": 1.003199698851873e-05, "loss": 0.5498, "step": 26480 }, { "epoch": 4.99, "grad_norm": 21.308177947998047, "learning_rate": 1.002823263692829e-05, "loss": 0.6421, "step": 26490 }, { "epoch": 4.99, "grad_norm": 20.949392318725586, "learning_rate": 1.0024468285337852e-05, "loss": 0.4232, "step": 26500 }, { "epoch": 4.99, "grad_norm": 6.295169830322266, "learning_rate": 1.0020703933747414e-05, "loss": 0.3888, "step": 26510 }, { "epoch": 4.99, "grad_norm": 22.155059814453125, "learning_rate": 1.0016939582156975e-05, "loss": 0.4725, "step": 26520 }, { "epoch": 4.99, "grad_norm": 11.023420333862305, "learning_rate": 1.0013175230566537e-05, "loss": 0.9657, "step": 26530 }, { "epoch": 5.0, "grad_norm": 7.431711673736572, "learning_rate": 1.0009410878976098e-05, "loss": 0.8442, "step": 26540 }, { "epoch": 5.0, "grad_norm": 7.200937271118164, "learning_rate": 1.000564652738566e-05, "loss": 0.4408, "step": 26550 }, { "epoch": 5.0, "grad_norm": 16.981725692749023, "learning_rate": 1.0001882175795221e-05, "loss": 0.4665, "step": 26560 }, { "epoch": 5.0, "eval_accuracy": 0.9174666666666667, "eval_loss": 0.3038625717163086, "eval_runtime": 51.1218, "eval_samples_per_second": 146.708, "eval_steps_per_second": 18.348, "step": 26565 }, { "epoch": 5.0, "grad_norm": 11.93405532836914, "learning_rate": 9.998117824204782e-06, "loss": 0.5697, "step": 26570 }, { "epoch": 5.0, "grad_norm": 5.752691745758057, "learning_rate": 9.994353472614344e-06, "loss": 0.407, "step": 26580 }, { "epoch": 5.0, "grad_norm": 0.38019922375679016, "learning_rate": 9.990589121023905e-06, "loss": 0.5588, "step": 26590 }, { "epoch": 5.01, "grad_norm": 23.82176399230957, "learning_rate": 9.986824769433467e-06, "loss": 0.7845, "step": 26600 }, { "epoch": 5.01, "grad_norm": 18.493980407714844, "learning_rate": 9.983060417843027e-06, "loss": 1.0395, "step": 26610 }, { "epoch": 5.01, "grad_norm": 0.250558078289032, "learning_rate": 9.979296066252588e-06, "loss": 0.2666, "step": 26620 }, { "epoch": 5.01, "grad_norm": 12.60075569152832, "learning_rate": 9.97553171466215e-06, "loss": 0.4569, "step": 26630 }, { "epoch": 5.01, "grad_norm": 12.024946212768555, "learning_rate": 9.971767363071711e-06, "loss": 0.4128, "step": 26640 }, { "epoch": 5.02, "grad_norm": 0.44587206840515137, "learning_rate": 9.968003011481272e-06, "loss": 0.5314, "step": 26650 }, { "epoch": 5.02, "grad_norm": 3.9360930919647217, "learning_rate": 9.964238659890836e-06, "loss": 0.5811, "step": 26660 }, { "epoch": 5.02, "grad_norm": 5.4166035652160645, "learning_rate": 9.960474308300397e-06, "loss": 0.5869, "step": 26670 }, { "epoch": 5.02, "grad_norm": 13.620014190673828, "learning_rate": 9.956709956709958e-06, "loss": 0.5772, "step": 26680 }, { "epoch": 5.02, "grad_norm": 18.73905372619629, "learning_rate": 9.95294560511952e-06, "loss": 0.4458, "step": 26690 }, { "epoch": 5.03, "grad_norm": 9.41979694366455, "learning_rate": 9.94918125352908e-06, "loss": 0.6342, "step": 26700 }, { "epoch": 5.03, "grad_norm": 6.560514450073242, "learning_rate": 9.945416901938641e-06, "loss": 0.8343, "step": 26710 }, { "epoch": 5.03, "grad_norm": 24.214908599853516, "learning_rate": 9.941652550348203e-06, "loss": 0.3422, "step": 26720 }, { "epoch": 5.03, "grad_norm": 13.99776554107666, "learning_rate": 9.937888198757764e-06, "loss": 0.6253, "step": 26730 }, { "epoch": 5.03, "grad_norm": 13.242351531982422, "learning_rate": 9.934123847167326e-06, "loss": 0.543, "step": 26740 }, { "epoch": 5.03, "grad_norm": 33.52753448486328, "learning_rate": 9.930359495576887e-06, "loss": 0.4902, "step": 26750 }, { "epoch": 5.04, "grad_norm": 23.321645736694336, "learning_rate": 9.92659514398645e-06, "loss": 0.656, "step": 26760 }, { "epoch": 5.04, "grad_norm": 16.174564361572266, "learning_rate": 9.922830792396012e-06, "loss": 0.5337, "step": 26770 }, { "epoch": 5.04, "grad_norm": 4.621311664581299, "learning_rate": 9.919066440805573e-06, "loss": 0.7531, "step": 26780 }, { "epoch": 5.04, "grad_norm": 29.9796085357666, "learning_rate": 9.915302089215133e-06, "loss": 0.6874, "step": 26790 }, { "epoch": 5.04, "grad_norm": 26.092121124267578, "learning_rate": 9.911537737624694e-06, "loss": 0.7789, "step": 26800 }, { "epoch": 5.05, "grad_norm": 7.301915645599365, "learning_rate": 9.907773386034256e-06, "loss": 0.385, "step": 26810 }, { "epoch": 5.05, "grad_norm": 19.605470657348633, "learning_rate": 9.904009034443817e-06, "loss": 0.702, "step": 26820 }, { "epoch": 5.05, "grad_norm": 13.46226692199707, "learning_rate": 9.900244682853379e-06, "loss": 0.24, "step": 26830 }, { "epoch": 5.05, "grad_norm": 5.802454948425293, "learning_rate": 9.89648033126294e-06, "loss": 0.2608, "step": 26840 }, { "epoch": 5.05, "grad_norm": 8.033814430236816, "learning_rate": 9.892715979672503e-06, "loss": 0.6069, "step": 26850 }, { "epoch": 5.06, "grad_norm": 0.2305469959974289, "learning_rate": 9.888951628082065e-06, "loss": 0.4146, "step": 26860 }, { "epoch": 5.06, "grad_norm": 7.512425422668457, "learning_rate": 9.885187276491625e-06, "loss": 0.6316, "step": 26870 }, { "epoch": 5.06, "grad_norm": 3.023247480392456, "learning_rate": 9.881422924901186e-06, "loss": 0.7018, "step": 26880 }, { "epoch": 5.06, "grad_norm": 20.460777282714844, "learning_rate": 9.877658573310747e-06, "loss": 0.5622, "step": 26890 }, { "epoch": 5.06, "grad_norm": 5.8153839111328125, "learning_rate": 9.873894221720309e-06, "loss": 0.5251, "step": 26900 }, { "epoch": 5.06, "grad_norm": 27.608034133911133, "learning_rate": 9.87012987012987e-06, "loss": 0.6028, "step": 26910 }, { "epoch": 5.07, "grad_norm": 33.49705123901367, "learning_rate": 9.866365518539432e-06, "loss": 0.5239, "step": 26920 }, { "epoch": 5.07, "grad_norm": 1.4094599485397339, "learning_rate": 9.862601166948993e-06, "loss": 0.7576, "step": 26930 }, { "epoch": 5.07, "grad_norm": 8.930601119995117, "learning_rate": 9.858836815358556e-06, "loss": 0.4088, "step": 26940 }, { "epoch": 5.07, "grad_norm": 0.6943548321723938, "learning_rate": 9.855072463768118e-06, "loss": 0.633, "step": 26950 }, { "epoch": 5.07, "grad_norm": 33.55091094970703, "learning_rate": 9.851308112177678e-06, "loss": 0.7098, "step": 26960 }, { "epoch": 5.08, "grad_norm": 18.824798583984375, "learning_rate": 9.847543760587239e-06, "loss": 0.5024, "step": 26970 }, { "epoch": 5.08, "grad_norm": 19.024484634399414, "learning_rate": 9.8437794089968e-06, "loss": 0.5722, "step": 26980 }, { "epoch": 5.08, "grad_norm": 4.507168292999268, "learning_rate": 9.840015057406362e-06, "loss": 0.2062, "step": 26990 }, { "epoch": 5.08, "grad_norm": 31.550439834594727, "learning_rate": 9.836250705815924e-06, "loss": 0.4145, "step": 27000 }, { "epoch": 5.08, "grad_norm": 11.541763305664062, "learning_rate": 9.832486354225485e-06, "loss": 0.522, "step": 27010 }, { "epoch": 5.09, "grad_norm": 14.351038932800293, "learning_rate": 9.828722002635046e-06, "loss": 0.5481, "step": 27020 }, { "epoch": 5.09, "grad_norm": 4.900013446807861, "learning_rate": 9.82495765104461e-06, "loss": 0.4472, "step": 27030 }, { "epoch": 5.09, "grad_norm": 14.692139625549316, "learning_rate": 9.821193299454171e-06, "loss": 0.6838, "step": 27040 }, { "epoch": 5.09, "grad_norm": 0.5007912516593933, "learning_rate": 9.81742894786373e-06, "loss": 0.7123, "step": 27050 }, { "epoch": 5.09, "grad_norm": 29.906484603881836, "learning_rate": 9.813664596273292e-06, "loss": 0.3299, "step": 27060 }, { "epoch": 5.1, "grad_norm": 21.051406860351562, "learning_rate": 9.809900244682854e-06, "loss": 0.471, "step": 27070 }, { "epoch": 5.1, "grad_norm": 8.312321662902832, "learning_rate": 9.806135893092415e-06, "loss": 0.5687, "step": 27080 }, { "epoch": 5.1, "grad_norm": 0.8224940896034241, "learning_rate": 9.802371541501977e-06, "loss": 0.3544, "step": 27090 }, { "epoch": 5.1, "grad_norm": 18.394412994384766, "learning_rate": 9.798607189911538e-06, "loss": 0.4271, "step": 27100 }, { "epoch": 5.1, "grad_norm": 14.561628341674805, "learning_rate": 9.7948428383211e-06, "loss": 0.7428, "step": 27110 }, { "epoch": 5.1, "grad_norm": 17.597429275512695, "learning_rate": 9.791078486730661e-06, "loss": 0.322, "step": 27120 }, { "epoch": 5.11, "grad_norm": 0.4523058831691742, "learning_rate": 9.787314135140224e-06, "loss": 0.4471, "step": 27130 }, { "epoch": 5.11, "grad_norm": 57.31825637817383, "learning_rate": 9.783549783549784e-06, "loss": 1.022, "step": 27140 }, { "epoch": 5.11, "grad_norm": 15.83234977722168, "learning_rate": 9.779785431959345e-06, "loss": 0.5429, "step": 27150 }, { "epoch": 5.11, "grad_norm": 17.558889389038086, "learning_rate": 9.776021080368907e-06, "loss": 0.4771, "step": 27160 }, { "epoch": 5.11, "grad_norm": 6.830644607543945, "learning_rate": 9.772256728778468e-06, "loss": 0.3859, "step": 27170 }, { "epoch": 5.12, "grad_norm": 23.56614875793457, "learning_rate": 9.76849237718803e-06, "loss": 0.2237, "step": 27180 }, { "epoch": 5.12, "grad_norm": 17.846912384033203, "learning_rate": 9.764728025597591e-06, "loss": 0.5612, "step": 27190 }, { "epoch": 5.12, "grad_norm": 22.106182098388672, "learning_rate": 9.760963674007153e-06, "loss": 0.4336, "step": 27200 }, { "epoch": 5.12, "grad_norm": 34.99475860595703, "learning_rate": 9.757199322416714e-06, "loss": 0.5471, "step": 27210 }, { "epoch": 5.12, "grad_norm": 22.88408660888672, "learning_rate": 9.753434970826276e-06, "loss": 0.5702, "step": 27220 }, { "epoch": 5.13, "grad_norm": 0.2636334300041199, "learning_rate": 9.749670619235837e-06, "loss": 0.6926, "step": 27230 }, { "epoch": 5.13, "grad_norm": 0.4821383059024811, "learning_rate": 9.745906267645399e-06, "loss": 0.4507, "step": 27240 }, { "epoch": 5.13, "grad_norm": 0.9522204399108887, "learning_rate": 9.74214191605496e-06, "loss": 0.7467, "step": 27250 }, { "epoch": 5.13, "grad_norm": 0.07715526968240738, "learning_rate": 9.738377564464521e-06, "loss": 0.5735, "step": 27260 }, { "epoch": 5.13, "grad_norm": 2.0689947605133057, "learning_rate": 9.734613212874083e-06, "loss": 0.621, "step": 27270 }, { "epoch": 5.13, "grad_norm": 4.146627902984619, "learning_rate": 9.730848861283644e-06, "loss": 0.3251, "step": 27280 }, { "epoch": 5.14, "grad_norm": 25.737661361694336, "learning_rate": 9.727084509693206e-06, "loss": 0.34, "step": 27290 }, { "epoch": 5.14, "grad_norm": 10.852874755859375, "learning_rate": 9.723320158102767e-06, "loss": 0.6041, "step": 27300 }, { "epoch": 5.14, "grad_norm": 18.879383087158203, "learning_rate": 9.719555806512329e-06, "loss": 0.6012, "step": 27310 }, { "epoch": 5.14, "grad_norm": 0.9953970909118652, "learning_rate": 9.71579145492189e-06, "loss": 0.62, "step": 27320 }, { "epoch": 5.14, "grad_norm": 1.2048535346984863, "learning_rate": 9.712027103331452e-06, "loss": 0.5219, "step": 27330 }, { "epoch": 5.15, "grad_norm": 13.980968475341797, "learning_rate": 9.708262751741013e-06, "loss": 0.7282, "step": 27340 }, { "epoch": 5.15, "grad_norm": 9.173256874084473, "learning_rate": 9.704498400150575e-06, "loss": 0.432, "step": 27350 }, { "epoch": 5.15, "grad_norm": 11.977608680725098, "learning_rate": 9.700734048560136e-06, "loss": 0.5185, "step": 27360 }, { "epoch": 5.15, "grad_norm": 9.067453384399414, "learning_rate": 9.696969696969698e-06, "loss": 0.4714, "step": 27370 }, { "epoch": 5.15, "grad_norm": 4.337044715881348, "learning_rate": 9.693205345379259e-06, "loss": 0.3863, "step": 27380 }, { "epoch": 5.16, "grad_norm": 6.323875427246094, "learning_rate": 9.68944099378882e-06, "loss": 0.7175, "step": 27390 }, { "epoch": 5.16, "grad_norm": 2.4866652488708496, "learning_rate": 9.685676642198382e-06, "loss": 0.3135, "step": 27400 }, { "epoch": 5.16, "grad_norm": 2.433683395385742, "learning_rate": 9.681912290607943e-06, "loss": 0.576, "step": 27410 }, { "epoch": 5.16, "grad_norm": 10.03541374206543, "learning_rate": 9.678147939017505e-06, "loss": 0.4971, "step": 27420 }, { "epoch": 5.16, "grad_norm": 22.778186798095703, "learning_rate": 9.674383587427066e-06, "loss": 0.5237, "step": 27430 }, { "epoch": 5.16, "grad_norm": 4.677865028381348, "learning_rate": 9.670619235836628e-06, "loss": 0.7147, "step": 27440 }, { "epoch": 5.17, "grad_norm": 21.216205596923828, "learning_rate": 9.66685488424619e-06, "loss": 0.5606, "step": 27450 }, { "epoch": 5.17, "grad_norm": 10.490550994873047, "learning_rate": 9.66309053265575e-06, "loss": 0.7098, "step": 27460 }, { "epoch": 5.17, "grad_norm": 7.556525230407715, "learning_rate": 9.659326181065312e-06, "loss": 0.6452, "step": 27470 }, { "epoch": 5.17, "grad_norm": 0.8397094011306763, "learning_rate": 9.655561829474874e-06, "loss": 0.1972, "step": 27480 }, { "epoch": 5.17, "grad_norm": 9.92435073852539, "learning_rate": 9.651797477884435e-06, "loss": 0.4522, "step": 27490 }, { "epoch": 5.18, "grad_norm": 1.3592374324798584, "learning_rate": 9.648033126293997e-06, "loss": 0.4904, "step": 27500 }, { "epoch": 5.18, "grad_norm": 10.651955604553223, "learning_rate": 9.644268774703558e-06, "loss": 0.5157, "step": 27510 }, { "epoch": 5.18, "grad_norm": 18.821256637573242, "learning_rate": 9.64050442311312e-06, "loss": 0.5196, "step": 27520 }, { "epoch": 5.18, "grad_norm": 3.651150703430176, "learning_rate": 9.636740071522681e-06, "loss": 0.5902, "step": 27530 }, { "epoch": 5.18, "grad_norm": 13.892976760864258, "learning_rate": 9.632975719932242e-06, "loss": 0.8091, "step": 27540 }, { "epoch": 5.19, "grad_norm": 0.7897984981536865, "learning_rate": 9.629211368341804e-06, "loss": 0.4273, "step": 27550 }, { "epoch": 5.19, "grad_norm": 16.25598907470703, "learning_rate": 9.625447016751365e-06, "loss": 0.3544, "step": 27560 }, { "epoch": 5.19, "grad_norm": 15.311908721923828, "learning_rate": 9.621682665160927e-06, "loss": 0.3885, "step": 27570 }, { "epoch": 5.19, "grad_norm": 13.913825988769531, "learning_rate": 9.617918313570488e-06, "loss": 0.3768, "step": 27580 }, { "epoch": 5.19, "grad_norm": 0.829640805721283, "learning_rate": 9.61415396198005e-06, "loss": 0.3235, "step": 27590 }, { "epoch": 5.19, "grad_norm": 11.236851692199707, "learning_rate": 9.610389610389611e-06, "loss": 0.73, "step": 27600 }, { "epoch": 5.2, "grad_norm": 9.920482635498047, "learning_rate": 9.606625258799173e-06, "loss": 0.379, "step": 27610 }, { "epoch": 5.2, "grad_norm": 5.420136451721191, "learning_rate": 9.602860907208734e-06, "loss": 0.7466, "step": 27620 }, { "epoch": 5.2, "grad_norm": 32.80744934082031, "learning_rate": 9.599096555618296e-06, "loss": 0.906, "step": 27630 }, { "epoch": 5.2, "grad_norm": 21.18132209777832, "learning_rate": 9.595332204027857e-06, "loss": 0.8318, "step": 27640 }, { "epoch": 5.2, "grad_norm": 7.528763771057129, "learning_rate": 9.591567852437418e-06, "loss": 0.6328, "step": 27650 }, { "epoch": 5.21, "grad_norm": 24.74061393737793, "learning_rate": 9.58780350084698e-06, "loss": 0.5783, "step": 27660 }, { "epoch": 5.21, "grad_norm": 15.430523872375488, "learning_rate": 9.584039149256541e-06, "loss": 0.554, "step": 27670 }, { "epoch": 5.21, "grad_norm": 23.323516845703125, "learning_rate": 9.580274797666103e-06, "loss": 0.246, "step": 27680 }, { "epoch": 5.21, "grad_norm": 29.96780776977539, "learning_rate": 9.576510446075664e-06, "loss": 0.7304, "step": 27690 }, { "epoch": 5.21, "grad_norm": 19.847814559936523, "learning_rate": 9.572746094485226e-06, "loss": 0.4962, "step": 27700 }, { "epoch": 5.22, "grad_norm": 11.185002326965332, "learning_rate": 9.568981742894787e-06, "loss": 0.4532, "step": 27710 }, { "epoch": 5.22, "grad_norm": 12.05192756652832, "learning_rate": 9.565217391304349e-06, "loss": 0.6854, "step": 27720 }, { "epoch": 5.22, "grad_norm": 18.75062370300293, "learning_rate": 9.56145303971391e-06, "loss": 0.7951, "step": 27730 }, { "epoch": 5.22, "grad_norm": 5.429780960083008, "learning_rate": 9.557688688123472e-06, "loss": 0.5807, "step": 27740 }, { "epoch": 5.22, "grad_norm": 6.862247943878174, "learning_rate": 9.553924336533033e-06, "loss": 0.4013, "step": 27750 }, { "epoch": 5.22, "grad_norm": 2.499727487564087, "learning_rate": 9.550159984942595e-06, "loss": 0.5839, "step": 27760 }, { "epoch": 5.23, "grad_norm": 20.708763122558594, "learning_rate": 9.546395633352156e-06, "loss": 0.5312, "step": 27770 }, { "epoch": 5.23, "grad_norm": 35.32229995727539, "learning_rate": 9.542631281761717e-06, "loss": 0.5386, "step": 27780 }, { "epoch": 5.23, "grad_norm": 4.731060028076172, "learning_rate": 9.538866930171279e-06, "loss": 0.3741, "step": 27790 }, { "epoch": 5.23, "grad_norm": 15.941679000854492, "learning_rate": 9.53510257858084e-06, "loss": 0.5086, "step": 27800 }, { "epoch": 5.23, "grad_norm": 33.2602653503418, "learning_rate": 9.531338226990402e-06, "loss": 0.6115, "step": 27810 }, { "epoch": 5.24, "grad_norm": 10.409682273864746, "learning_rate": 9.527573875399963e-06, "loss": 0.3425, "step": 27820 }, { "epoch": 5.24, "grad_norm": 18.776081085205078, "learning_rate": 9.523809523809525e-06, "loss": 0.4186, "step": 27830 }, { "epoch": 5.24, "grad_norm": 45.04935073852539, "learning_rate": 9.520045172219086e-06, "loss": 0.5253, "step": 27840 }, { "epoch": 5.24, "grad_norm": 8.127985954284668, "learning_rate": 9.516280820628648e-06, "loss": 0.5575, "step": 27850 }, { "epoch": 5.24, "grad_norm": 3.796123504638672, "learning_rate": 9.512516469038209e-06, "loss": 0.3674, "step": 27860 }, { "epoch": 5.25, "grad_norm": 3.369060754776001, "learning_rate": 9.50875211744777e-06, "loss": 0.539, "step": 27870 }, { "epoch": 5.25, "grad_norm": 1.8295410871505737, "learning_rate": 9.504987765857332e-06, "loss": 0.3522, "step": 27880 }, { "epoch": 5.25, "grad_norm": 21.568384170532227, "learning_rate": 9.501223414266894e-06, "loss": 0.4866, "step": 27890 }, { "epoch": 5.25, "grad_norm": 6.660335540771484, "learning_rate": 9.497459062676455e-06, "loss": 0.6279, "step": 27900 }, { "epoch": 5.25, "grad_norm": 0.142924964427948, "learning_rate": 9.493694711086016e-06, "loss": 0.7068, "step": 27910 }, { "epoch": 5.26, "grad_norm": 0.3119409680366516, "learning_rate": 9.489930359495578e-06, "loss": 0.5291, "step": 27920 }, { "epoch": 5.26, "grad_norm": 41.5748405456543, "learning_rate": 9.48616600790514e-06, "loss": 0.6962, "step": 27930 }, { "epoch": 5.26, "grad_norm": 21.246931076049805, "learning_rate": 9.4824016563147e-06, "loss": 0.5247, "step": 27940 }, { "epoch": 5.26, "grad_norm": 9.162127494812012, "learning_rate": 9.478637304724262e-06, "loss": 0.6259, "step": 27950 }, { "epoch": 5.26, "grad_norm": 14.135149955749512, "learning_rate": 9.474872953133824e-06, "loss": 0.5191, "step": 27960 }, { "epoch": 5.26, "grad_norm": 14.310225486755371, "learning_rate": 9.471108601543385e-06, "loss": 0.6222, "step": 27970 }, { "epoch": 5.27, "grad_norm": 23.264753341674805, "learning_rate": 9.467344249952947e-06, "loss": 0.8251, "step": 27980 }, { "epoch": 5.27, "grad_norm": 18.3626766204834, "learning_rate": 9.463579898362508e-06, "loss": 0.6605, "step": 27990 }, { "epoch": 5.27, "grad_norm": 1.1121855974197388, "learning_rate": 9.45981554677207e-06, "loss": 0.396, "step": 28000 }, { "epoch": 5.27, "grad_norm": 0.5820768475532532, "learning_rate": 9.456051195181631e-06, "loss": 0.3876, "step": 28010 }, { "epoch": 5.27, "grad_norm": 19.84071159362793, "learning_rate": 9.452286843591193e-06, "loss": 0.2798, "step": 28020 }, { "epoch": 5.28, "grad_norm": 14.78951644897461, "learning_rate": 9.448522492000754e-06, "loss": 0.7496, "step": 28030 }, { "epoch": 5.28, "grad_norm": 1.7860188484191895, "learning_rate": 9.444758140410315e-06, "loss": 0.3007, "step": 28040 }, { "epoch": 5.28, "grad_norm": 23.34420394897461, "learning_rate": 9.440993788819877e-06, "loss": 0.6097, "step": 28050 }, { "epoch": 5.28, "grad_norm": 11.595169067382812, "learning_rate": 9.437229437229438e-06, "loss": 0.7458, "step": 28060 }, { "epoch": 5.28, "grad_norm": 13.248278617858887, "learning_rate": 9.433465085639e-06, "loss": 0.4254, "step": 28070 }, { "epoch": 5.29, "grad_norm": 9.730267524719238, "learning_rate": 9.429700734048561e-06, "loss": 0.2484, "step": 28080 }, { "epoch": 5.29, "grad_norm": 10.137773513793945, "learning_rate": 9.425936382458121e-06, "loss": 0.5563, "step": 28090 }, { "epoch": 5.29, "grad_norm": 9.176275253295898, "learning_rate": 9.422172030867684e-06, "loss": 0.7219, "step": 28100 }, { "epoch": 5.29, "grad_norm": 18.387922286987305, "learning_rate": 9.418407679277246e-06, "loss": 0.4597, "step": 28110 }, { "epoch": 5.29, "grad_norm": 5.12867546081543, "learning_rate": 9.414643327686807e-06, "loss": 0.6968, "step": 28120 }, { "epoch": 5.29, "grad_norm": 5.446651935577393, "learning_rate": 9.410878976096369e-06, "loss": 0.5488, "step": 28130 }, { "epoch": 5.3, "grad_norm": 11.691588401794434, "learning_rate": 9.40711462450593e-06, "loss": 0.3528, "step": 28140 }, { "epoch": 5.3, "grad_norm": 5.106471538543701, "learning_rate": 9.403350272915491e-06, "loss": 0.5703, "step": 28150 }, { "epoch": 5.3, "grad_norm": 22.144344329833984, "learning_rate": 9.399585921325053e-06, "loss": 0.5555, "step": 28160 }, { "epoch": 5.3, "grad_norm": 11.630510330200195, "learning_rate": 9.395821569734614e-06, "loss": 0.6737, "step": 28170 }, { "epoch": 5.3, "grad_norm": 0.5806583166122437, "learning_rate": 9.392057218144174e-06, "loss": 0.5265, "step": 28180 }, { "epoch": 5.31, "grad_norm": 4.3490214347839355, "learning_rate": 9.388292866553737e-06, "loss": 0.7698, "step": 28190 }, { "epoch": 5.31, "grad_norm": 19.73464584350586, "learning_rate": 9.384528514963299e-06, "loss": 0.3166, "step": 28200 }, { "epoch": 5.31, "grad_norm": 8.744988441467285, "learning_rate": 9.38076416337286e-06, "loss": 0.5361, "step": 28210 }, { "epoch": 5.31, "grad_norm": 13.170635223388672, "learning_rate": 9.376999811782422e-06, "loss": 0.5091, "step": 28220 }, { "epoch": 5.31, "grad_norm": 3.8502869606018066, "learning_rate": 9.373235460191983e-06, "loss": 0.6684, "step": 28230 }, { "epoch": 5.32, "grad_norm": 4.719302177429199, "learning_rate": 9.369471108601545e-06, "loss": 0.3989, "step": 28240 }, { "epoch": 5.32, "grad_norm": 32.28798294067383, "learning_rate": 9.365706757011106e-06, "loss": 0.509, "step": 28250 }, { "epoch": 5.32, "grad_norm": 17.933258056640625, "learning_rate": 9.361942405420668e-06, "loss": 0.5806, "step": 28260 }, { "epoch": 5.32, "grad_norm": 4.121888637542725, "learning_rate": 9.358178053830227e-06, "loss": 0.4916, "step": 28270 }, { "epoch": 5.32, "grad_norm": 7.130378723144531, "learning_rate": 9.354413702239789e-06, "loss": 0.4107, "step": 28280 }, { "epoch": 5.32, "grad_norm": 14.70533561706543, "learning_rate": 9.350649350649352e-06, "loss": 0.581, "step": 28290 }, { "epoch": 5.33, "grad_norm": 11.548562049865723, "learning_rate": 9.346884999058913e-06, "loss": 0.3762, "step": 28300 }, { "epoch": 5.33, "grad_norm": 15.485182762145996, "learning_rate": 9.343120647468475e-06, "loss": 0.7006, "step": 28310 }, { "epoch": 5.33, "grad_norm": 26.412918090820312, "learning_rate": 9.339356295878036e-06, "loss": 0.2112, "step": 28320 }, { "epoch": 5.33, "grad_norm": 13.118706703186035, "learning_rate": 9.335591944287598e-06, "loss": 0.3918, "step": 28330 }, { "epoch": 5.33, "grad_norm": 6.297987937927246, "learning_rate": 9.33182759269716e-06, "loss": 0.4208, "step": 28340 }, { "epoch": 5.34, "grad_norm": 5.268683433532715, "learning_rate": 9.32806324110672e-06, "loss": 0.5968, "step": 28350 }, { "epoch": 5.34, "grad_norm": 6.9022955894470215, "learning_rate": 9.32429888951628e-06, "loss": 0.5602, "step": 28360 }, { "epoch": 5.34, "grad_norm": 1.084697961807251, "learning_rate": 9.320534537925842e-06, "loss": 0.9877, "step": 28370 }, { "epoch": 5.34, "grad_norm": 18.268627166748047, "learning_rate": 9.316770186335405e-06, "loss": 0.5616, "step": 28380 }, { "epoch": 5.34, "grad_norm": 11.595245361328125, "learning_rate": 9.313005834744967e-06, "loss": 0.3982, "step": 28390 }, { "epoch": 5.35, "grad_norm": 15.181175231933594, "learning_rate": 9.309241483154528e-06, "loss": 0.4555, "step": 28400 }, { "epoch": 5.35, "grad_norm": 20.354490280151367, "learning_rate": 9.30547713156409e-06, "loss": 0.4182, "step": 28410 }, { "epoch": 5.35, "grad_norm": 2.795022487640381, "learning_rate": 9.301712779973651e-06, "loss": 0.629, "step": 28420 }, { "epoch": 5.35, "grad_norm": 3.4192261695861816, "learning_rate": 9.297948428383212e-06, "loss": 0.6771, "step": 28430 }, { "epoch": 5.35, "grad_norm": 18.82028579711914, "learning_rate": 9.294184076792772e-06, "loss": 0.6471, "step": 28440 }, { "epoch": 5.35, "grad_norm": 10.711726188659668, "learning_rate": 9.290419725202334e-06, "loss": 0.707, "step": 28450 }, { "epoch": 5.36, "grad_norm": 18.926769256591797, "learning_rate": 9.286655373611895e-06, "loss": 0.499, "step": 28460 }, { "epoch": 5.36, "grad_norm": 6.421844482421875, "learning_rate": 9.282891022021458e-06, "loss": 0.506, "step": 28470 }, { "epoch": 5.36, "grad_norm": 30.580154418945312, "learning_rate": 9.27912667043102e-06, "loss": 0.4126, "step": 28480 }, { "epoch": 5.36, "grad_norm": 1.4444148540496826, "learning_rate": 9.275362318840581e-06, "loss": 0.321, "step": 28490 }, { "epoch": 5.36, "grad_norm": 0.1973607838153839, "learning_rate": 9.271597967250143e-06, "loss": 0.6311, "step": 28500 }, { "epoch": 5.37, "grad_norm": 4.095947265625, "learning_rate": 9.267833615659704e-06, "loss": 0.3348, "step": 28510 }, { "epoch": 5.37, "grad_norm": 3.3935463428497314, "learning_rate": 9.264069264069266e-06, "loss": 0.7266, "step": 28520 }, { "epoch": 5.37, "grad_norm": 1.5833213329315186, "learning_rate": 9.260304912478825e-06, "loss": 0.2602, "step": 28530 }, { "epoch": 5.37, "grad_norm": 32.14379119873047, "learning_rate": 9.256540560888387e-06, "loss": 0.6631, "step": 28540 }, { "epoch": 5.37, "grad_norm": 6.494132041931152, "learning_rate": 9.252776209297948e-06, "loss": 0.686, "step": 28550 }, { "epoch": 5.38, "grad_norm": 7.422737121582031, "learning_rate": 9.24901185770751e-06, "loss": 0.7162, "step": 28560 }, { "epoch": 5.38, "grad_norm": 21.754234313964844, "learning_rate": 9.245247506117073e-06, "loss": 0.5544, "step": 28570 }, { "epoch": 5.38, "grad_norm": 11.023734092712402, "learning_rate": 9.241483154526634e-06, "loss": 0.7434, "step": 28580 }, { "epoch": 5.38, "grad_norm": 14.888821601867676, "learning_rate": 9.237718802936196e-06, "loss": 0.4337, "step": 28590 }, { "epoch": 5.38, "grad_norm": 6.61778450012207, "learning_rate": 9.233954451345757e-06, "loss": 0.7, "step": 28600 }, { "epoch": 5.38, "grad_norm": 10.373224258422852, "learning_rate": 9.230190099755319e-06, "loss": 0.5232, "step": 28610 }, { "epoch": 5.39, "grad_norm": 6.890915870666504, "learning_rate": 9.226425748164878e-06, "loss": 0.4981, "step": 28620 }, { "epoch": 5.39, "grad_norm": 7.358579158782959, "learning_rate": 9.22266139657444e-06, "loss": 0.7168, "step": 28630 }, { "epoch": 5.39, "grad_norm": 26.474361419677734, "learning_rate": 9.218897044984001e-06, "loss": 0.4434, "step": 28640 }, { "epoch": 5.39, "grad_norm": 0.49774327874183655, "learning_rate": 9.215132693393563e-06, "loss": 0.4762, "step": 28650 }, { "epoch": 5.39, "grad_norm": 0.7938456535339355, "learning_rate": 9.211368341803126e-06, "loss": 0.4004, "step": 28660 }, { "epoch": 5.4, "grad_norm": 16.776710510253906, "learning_rate": 9.207603990212687e-06, "loss": 0.6284, "step": 28670 }, { "epoch": 5.4, "grad_norm": 28.070096969604492, "learning_rate": 9.203839638622249e-06, "loss": 0.6981, "step": 28680 }, { "epoch": 5.4, "grad_norm": 10.797050476074219, "learning_rate": 9.20007528703181e-06, "loss": 0.4466, "step": 28690 }, { "epoch": 5.4, "grad_norm": 25.648643493652344, "learning_rate": 9.19631093544137e-06, "loss": 0.7307, "step": 28700 }, { "epoch": 5.4, "grad_norm": 20.66429901123047, "learning_rate": 9.192546583850932e-06, "loss": 0.5789, "step": 28710 }, { "epoch": 5.41, "grad_norm": 21.309707641601562, "learning_rate": 9.188782232260493e-06, "loss": 0.3682, "step": 28720 }, { "epoch": 5.41, "grad_norm": 7.1550703048706055, "learning_rate": 9.185017880670055e-06, "loss": 0.3696, "step": 28730 }, { "epoch": 5.41, "grad_norm": 19.179853439331055, "learning_rate": 9.181253529079616e-06, "loss": 0.4966, "step": 28740 }, { "epoch": 5.41, "grad_norm": 12.53691291809082, "learning_rate": 9.177489177489179e-06, "loss": 0.4346, "step": 28750 }, { "epoch": 5.41, "grad_norm": 17.41402244567871, "learning_rate": 9.17372482589874e-06, "loss": 0.469, "step": 28760 }, { "epoch": 5.42, "grad_norm": 9.987686157226562, "learning_rate": 9.169960474308302e-06, "loss": 0.4145, "step": 28770 }, { "epoch": 5.42, "grad_norm": 9.735391616821289, "learning_rate": 9.166196122717864e-06, "loss": 0.4125, "step": 28780 }, { "epoch": 5.42, "grad_norm": 40.975223541259766, "learning_rate": 9.162431771127423e-06, "loss": 0.6056, "step": 28790 }, { "epoch": 5.42, "grad_norm": 22.280542373657227, "learning_rate": 9.158667419536985e-06, "loss": 0.6244, "step": 28800 }, { "epoch": 5.42, "grad_norm": 8.667016983032227, "learning_rate": 9.154903067946546e-06, "loss": 0.3564, "step": 28810 }, { "epoch": 5.42, "grad_norm": 0.26233333349227905, "learning_rate": 9.151138716356108e-06, "loss": 0.5011, "step": 28820 }, { "epoch": 5.43, "grad_norm": 8.251745223999023, "learning_rate": 9.147374364765669e-06, "loss": 0.5049, "step": 28830 }, { "epoch": 5.43, "grad_norm": 18.818889617919922, "learning_rate": 9.143610013175232e-06, "loss": 0.3172, "step": 28840 }, { "epoch": 5.43, "grad_norm": 26.497833251953125, "learning_rate": 9.139845661584794e-06, "loss": 0.5095, "step": 28850 }, { "epoch": 5.43, "grad_norm": 8.361761093139648, "learning_rate": 9.136081309994355e-06, "loss": 0.5354, "step": 28860 }, { "epoch": 5.43, "grad_norm": 17.610157012939453, "learning_rate": 9.132316958403917e-06, "loss": 0.6836, "step": 28870 }, { "epoch": 5.44, "grad_norm": 7.032655715942383, "learning_rate": 9.128552606813476e-06, "loss": 0.5749, "step": 28880 }, { "epoch": 5.44, "grad_norm": 18.669261932373047, "learning_rate": 9.124788255223038e-06, "loss": 0.5949, "step": 28890 }, { "epoch": 5.44, "grad_norm": 12.058159828186035, "learning_rate": 9.1210239036326e-06, "loss": 0.5494, "step": 28900 }, { "epoch": 5.44, "grad_norm": 0.7247989773750305, "learning_rate": 9.11725955204216e-06, "loss": 0.4002, "step": 28910 }, { "epoch": 5.44, "grad_norm": 6.094273090362549, "learning_rate": 9.113495200451722e-06, "loss": 0.5376, "step": 28920 }, { "epoch": 5.45, "grad_norm": 17.795515060424805, "learning_rate": 9.109730848861284e-06, "loss": 0.5065, "step": 28930 }, { "epoch": 5.45, "grad_norm": 13.030588150024414, "learning_rate": 9.105966497270847e-06, "loss": 0.5984, "step": 28940 }, { "epoch": 5.45, "grad_norm": 0.7247048020362854, "learning_rate": 9.102202145680408e-06, "loss": 0.6524, "step": 28950 }, { "epoch": 5.45, "grad_norm": 0.13299456238746643, "learning_rate": 9.09843779408997e-06, "loss": 0.616, "step": 28960 }, { "epoch": 5.45, "grad_norm": 5.874953746795654, "learning_rate": 9.09467344249953e-06, "loss": 0.6393, "step": 28970 }, { "epoch": 5.45, "grad_norm": 14.227991104125977, "learning_rate": 9.090909090909091e-06, "loss": 0.3868, "step": 28980 }, { "epoch": 5.46, "grad_norm": 8.069024085998535, "learning_rate": 9.087144739318652e-06, "loss": 0.6467, "step": 28990 }, { "epoch": 5.46, "grad_norm": 12.728198051452637, "learning_rate": 9.083380387728214e-06, "loss": 0.8101, "step": 29000 }, { "epoch": 5.46, "grad_norm": 31.68919563293457, "learning_rate": 9.079616036137775e-06, "loss": 0.4018, "step": 29010 }, { "epoch": 5.46, "grad_norm": 37.730751037597656, "learning_rate": 9.075851684547337e-06, "loss": 0.3918, "step": 29020 }, { "epoch": 5.46, "grad_norm": 32.44590759277344, "learning_rate": 9.0720873329569e-06, "loss": 0.5716, "step": 29030 }, { "epoch": 5.47, "grad_norm": 3.4160537719726562, "learning_rate": 9.068322981366461e-06, "loss": 0.6425, "step": 29040 }, { "epoch": 5.47, "grad_norm": 15.794535636901855, "learning_rate": 9.064558629776021e-06, "loss": 0.4578, "step": 29050 }, { "epoch": 5.47, "grad_norm": 14.576927185058594, "learning_rate": 9.060794278185583e-06, "loss": 0.2114, "step": 29060 }, { "epoch": 5.47, "grad_norm": 12.429288864135742, "learning_rate": 9.057029926595144e-06, "loss": 0.571, "step": 29070 }, { "epoch": 5.47, "grad_norm": 12.511153221130371, "learning_rate": 9.053265575004706e-06, "loss": 0.5288, "step": 29080 }, { "epoch": 5.48, "grad_norm": 11.03250789642334, "learning_rate": 9.049501223414267e-06, "loss": 0.6037, "step": 29090 }, { "epoch": 5.48, "grad_norm": 14.617107391357422, "learning_rate": 9.045736871823829e-06, "loss": 0.7927, "step": 29100 }, { "epoch": 5.48, "grad_norm": 10.177559852600098, "learning_rate": 9.04197252023339e-06, "loss": 0.5179, "step": 29110 }, { "epoch": 5.48, "grad_norm": 1.5865446329116821, "learning_rate": 9.038208168642953e-06, "loss": 0.6091, "step": 29120 }, { "epoch": 5.48, "grad_norm": 18.313032150268555, "learning_rate": 9.034443817052515e-06, "loss": 0.4665, "step": 29130 }, { "epoch": 5.48, "grad_norm": 23.01014518737793, "learning_rate": 9.030679465462074e-06, "loss": 0.6638, "step": 29140 }, { "epoch": 5.49, "grad_norm": 18.016878128051758, "learning_rate": 9.026915113871636e-06, "loss": 0.437, "step": 29150 }, { "epoch": 5.49, "grad_norm": 19.193099975585938, "learning_rate": 9.023150762281197e-06, "loss": 0.864, "step": 29160 }, { "epoch": 5.49, "grad_norm": 22.776002883911133, "learning_rate": 9.019386410690759e-06, "loss": 0.3799, "step": 29170 }, { "epoch": 5.49, "grad_norm": 19.30347442626953, "learning_rate": 9.01562205910032e-06, "loss": 0.4335, "step": 29180 }, { "epoch": 5.49, "grad_norm": 8.978057861328125, "learning_rate": 9.011857707509882e-06, "loss": 0.6437, "step": 29190 }, { "epoch": 5.5, "grad_norm": 24.524974822998047, "learning_rate": 9.008093355919443e-06, "loss": 0.6654, "step": 29200 }, { "epoch": 5.5, "grad_norm": 29.421937942504883, "learning_rate": 9.004329004329005e-06, "loss": 1.0022, "step": 29210 }, { "epoch": 5.5, "grad_norm": 9.427165985107422, "learning_rate": 9.000564652738568e-06, "loss": 0.618, "step": 29220 }, { "epoch": 5.5, "grad_norm": 3.5719804763793945, "learning_rate": 8.996800301148128e-06, "loss": 0.4899, "step": 29230 }, { "epoch": 5.5, "grad_norm": 4.902514934539795, "learning_rate": 8.993035949557689e-06, "loss": 0.6078, "step": 29240 }, { "epoch": 5.51, "grad_norm": 7.195416450500488, "learning_rate": 8.98927159796725e-06, "loss": 0.5264, "step": 29250 }, { "epoch": 5.51, "grad_norm": 5.761614799499512, "learning_rate": 8.985507246376812e-06, "loss": 0.5512, "step": 29260 }, { "epoch": 5.51, "grad_norm": 20.03443145751953, "learning_rate": 8.981742894786373e-06, "loss": 0.3887, "step": 29270 }, { "epoch": 5.51, "grad_norm": 20.721759796142578, "learning_rate": 8.977978543195935e-06, "loss": 0.4945, "step": 29280 }, { "epoch": 5.51, "grad_norm": 3.0783581733703613, "learning_rate": 8.974214191605496e-06, "loss": 0.5722, "step": 29290 }, { "epoch": 5.51, "grad_norm": 5.727044105529785, "learning_rate": 8.970449840015058e-06, "loss": 0.9616, "step": 29300 }, { "epoch": 5.52, "grad_norm": 12.99439811706543, "learning_rate": 8.96668548842462e-06, "loss": 0.282, "step": 29310 }, { "epoch": 5.52, "grad_norm": 22.195281982421875, "learning_rate": 8.96292113683418e-06, "loss": 0.8487, "step": 29320 }, { "epoch": 5.52, "grad_norm": 16.877878189086914, "learning_rate": 8.959156785243742e-06, "loss": 0.4369, "step": 29330 }, { "epoch": 5.52, "grad_norm": 3.1448419094085693, "learning_rate": 8.955392433653304e-06, "loss": 0.4618, "step": 29340 }, { "epoch": 5.52, "grad_norm": 11.687005996704102, "learning_rate": 8.951628082062865e-06, "loss": 0.5836, "step": 29350 }, { "epoch": 5.53, "grad_norm": 52.1478385925293, "learning_rate": 8.947863730472427e-06, "loss": 0.8731, "step": 29360 }, { "epoch": 5.53, "grad_norm": 8.80933952331543, "learning_rate": 8.944099378881988e-06, "loss": 0.6717, "step": 29370 }, { "epoch": 5.53, "grad_norm": 18.5314884185791, "learning_rate": 8.94033502729155e-06, "loss": 0.4894, "step": 29380 }, { "epoch": 5.53, "grad_norm": 11.329646110534668, "learning_rate": 8.936570675701111e-06, "loss": 0.4815, "step": 29390 }, { "epoch": 5.53, "grad_norm": 21.230852127075195, "learning_rate": 8.932806324110672e-06, "loss": 0.4756, "step": 29400 }, { "epoch": 5.54, "grad_norm": 8.987369537353516, "learning_rate": 8.929041972520234e-06, "loss": 0.553, "step": 29410 }, { "epoch": 5.54, "grad_norm": 9.444402694702148, "learning_rate": 8.925277620929795e-06, "loss": 0.4838, "step": 29420 }, { "epoch": 5.54, "grad_norm": 0.40830114483833313, "learning_rate": 8.921513269339357e-06, "loss": 0.7624, "step": 29430 }, { "epoch": 5.54, "grad_norm": 0.5162389874458313, "learning_rate": 8.917748917748918e-06, "loss": 0.2015, "step": 29440 }, { "epoch": 5.54, "grad_norm": 37.219818115234375, "learning_rate": 8.91398456615848e-06, "loss": 0.5263, "step": 29450 }, { "epoch": 5.54, "grad_norm": 8.195032119750977, "learning_rate": 8.910220214568041e-06, "loss": 0.6158, "step": 29460 }, { "epoch": 5.55, "grad_norm": 5.147323131561279, "learning_rate": 8.906455862977603e-06, "loss": 0.5521, "step": 29470 }, { "epoch": 5.55, "grad_norm": 3.7888503074645996, "learning_rate": 8.902691511387164e-06, "loss": 0.357, "step": 29480 }, { "epoch": 5.55, "grad_norm": 6.756317138671875, "learning_rate": 8.898927159796726e-06, "loss": 0.4016, "step": 29490 }, { "epoch": 5.55, "grad_norm": 26.7462158203125, "learning_rate": 8.895162808206287e-06, "loss": 0.6225, "step": 29500 }, { "epoch": 5.55, "grad_norm": 3.3406825065612793, "learning_rate": 8.891398456615848e-06, "loss": 0.5504, "step": 29510 }, { "epoch": 5.56, "grad_norm": 7.413740634918213, "learning_rate": 8.88763410502541e-06, "loss": 0.4144, "step": 29520 }, { "epoch": 5.56, "grad_norm": 0.5586540699005127, "learning_rate": 8.883869753434971e-06, "loss": 0.394, "step": 29530 }, { "epoch": 5.56, "grad_norm": 28.930585861206055, "learning_rate": 8.880105401844533e-06, "loss": 0.436, "step": 29540 }, { "epoch": 5.56, "grad_norm": 4.1949849128723145, "learning_rate": 8.876341050254094e-06, "loss": 0.7621, "step": 29550 }, { "epoch": 5.56, "grad_norm": 15.377513885498047, "learning_rate": 8.872576698663656e-06, "loss": 0.3309, "step": 29560 }, { "epoch": 5.57, "grad_norm": 10.37800121307373, "learning_rate": 8.868812347073217e-06, "loss": 0.5609, "step": 29570 }, { "epoch": 5.57, "grad_norm": 16.737653732299805, "learning_rate": 8.865047995482779e-06, "loss": 0.5195, "step": 29580 }, { "epoch": 5.57, "grad_norm": 10.057021141052246, "learning_rate": 8.86128364389234e-06, "loss": 0.5856, "step": 29590 }, { "epoch": 5.57, "grad_norm": 18.19561195373535, "learning_rate": 8.857519292301902e-06, "loss": 0.6032, "step": 29600 }, { "epoch": 5.57, "grad_norm": 8.191587448120117, "learning_rate": 8.853754940711463e-06, "loss": 0.3631, "step": 29610 }, { "epoch": 5.58, "grad_norm": 14.722908973693848, "learning_rate": 8.849990589121025e-06, "loss": 0.9588, "step": 29620 }, { "epoch": 5.58, "grad_norm": 25.523221969604492, "learning_rate": 8.846226237530586e-06, "loss": 0.7066, "step": 29630 }, { "epoch": 5.58, "grad_norm": 9.714911460876465, "learning_rate": 8.842461885940147e-06, "loss": 0.4797, "step": 29640 }, { "epoch": 5.58, "grad_norm": 5.88856315612793, "learning_rate": 8.838697534349709e-06, "loss": 0.4359, "step": 29650 }, { "epoch": 5.58, "grad_norm": 12.061199188232422, "learning_rate": 8.83493318275927e-06, "loss": 0.5571, "step": 29660 }, { "epoch": 5.58, "grad_norm": 18.20134162902832, "learning_rate": 8.831168831168832e-06, "loss": 0.3449, "step": 29670 }, { "epoch": 5.59, "grad_norm": 12.034881591796875, "learning_rate": 8.827404479578393e-06, "loss": 0.7451, "step": 29680 }, { "epoch": 5.59, "grad_norm": 4.537769317626953, "learning_rate": 8.823640127987955e-06, "loss": 0.7454, "step": 29690 }, { "epoch": 5.59, "grad_norm": 39.31631088256836, "learning_rate": 8.819875776397516e-06, "loss": 0.3897, "step": 29700 }, { "epoch": 5.59, "grad_norm": 20.764333724975586, "learning_rate": 8.816111424807078e-06, "loss": 0.4239, "step": 29710 }, { "epoch": 5.59, "grad_norm": 4.3475823402404785, "learning_rate": 8.812347073216639e-06, "loss": 0.4626, "step": 29720 }, { "epoch": 5.6, "grad_norm": 26.93927001953125, "learning_rate": 8.8085827216262e-06, "loss": 0.3852, "step": 29730 }, { "epoch": 5.6, "grad_norm": 23.00481414794922, "learning_rate": 8.804818370035762e-06, "loss": 0.5371, "step": 29740 }, { "epoch": 5.6, "grad_norm": 1.6085634231567383, "learning_rate": 8.801054018445324e-06, "loss": 0.3698, "step": 29750 }, { "epoch": 5.6, "grad_norm": 13.103279113769531, "learning_rate": 8.797289666854885e-06, "loss": 0.6233, "step": 29760 }, { "epoch": 5.6, "grad_norm": 14.591599464416504, "learning_rate": 8.793525315264446e-06, "loss": 0.4826, "step": 29770 }, { "epoch": 5.61, "grad_norm": 23.947324752807617, "learning_rate": 8.789760963674008e-06, "loss": 0.5319, "step": 29780 }, { "epoch": 5.61, "grad_norm": 11.421167373657227, "learning_rate": 8.78599661208357e-06, "loss": 0.5641, "step": 29790 }, { "epoch": 5.61, "grad_norm": 4.573395252227783, "learning_rate": 8.78223226049313e-06, "loss": 0.6296, "step": 29800 }, { "epoch": 5.61, "grad_norm": 30.208024978637695, "learning_rate": 8.778467908902692e-06, "loss": 0.4905, "step": 29810 }, { "epoch": 5.61, "grad_norm": 33.18610763549805, "learning_rate": 8.774703557312254e-06, "loss": 0.7144, "step": 29820 }, { "epoch": 5.61, "grad_norm": 34.4425048828125, "learning_rate": 8.770939205721815e-06, "loss": 0.8056, "step": 29830 }, { "epoch": 5.62, "grad_norm": 9.860750198364258, "learning_rate": 8.767174854131377e-06, "loss": 0.4445, "step": 29840 }, { "epoch": 5.62, "grad_norm": 10.247167587280273, "learning_rate": 8.763410502540938e-06, "loss": 0.472, "step": 29850 }, { "epoch": 5.62, "grad_norm": 30.189830780029297, "learning_rate": 8.7596461509505e-06, "loss": 0.4546, "step": 29860 }, { "epoch": 5.62, "grad_norm": 4.302022933959961, "learning_rate": 8.755881799360061e-06, "loss": 0.6976, "step": 29870 }, { "epoch": 5.62, "grad_norm": 0.30874672532081604, "learning_rate": 8.752117447769623e-06, "loss": 0.711, "step": 29880 }, { "epoch": 5.63, "grad_norm": 5.02794075012207, "learning_rate": 8.748353096179184e-06, "loss": 0.6327, "step": 29890 }, { "epoch": 5.63, "grad_norm": 1.8799618482589722, "learning_rate": 8.744588744588745e-06, "loss": 0.3782, "step": 29900 }, { "epoch": 5.63, "grad_norm": 12.888080596923828, "learning_rate": 8.740824392998307e-06, "loss": 0.4036, "step": 29910 }, { "epoch": 5.63, "grad_norm": 24.87213897705078, "learning_rate": 8.737060041407868e-06, "loss": 0.51, "step": 29920 }, { "epoch": 5.63, "grad_norm": 20.8959903717041, "learning_rate": 8.73329568981743e-06, "loss": 0.434, "step": 29930 }, { "epoch": 5.64, "grad_norm": 15.75831413269043, "learning_rate": 8.729531338226991e-06, "loss": 0.5039, "step": 29940 }, { "epoch": 5.64, "grad_norm": 6.654936790466309, "learning_rate": 8.725766986636553e-06, "loss": 0.6428, "step": 29950 }, { "epoch": 5.64, "grad_norm": 12.773100852966309, "learning_rate": 8.722002635046114e-06, "loss": 0.5853, "step": 29960 }, { "epoch": 5.64, "grad_norm": 19.850297927856445, "learning_rate": 8.718238283455676e-06, "loss": 0.7984, "step": 29970 }, { "epoch": 5.64, "grad_norm": 11.2599458694458, "learning_rate": 8.714473931865237e-06, "loss": 0.7433, "step": 29980 }, { "epoch": 5.64, "grad_norm": 14.740336418151855, "learning_rate": 8.710709580274799e-06, "loss": 0.448, "step": 29990 }, { "epoch": 5.65, "grad_norm": 11.278069496154785, "learning_rate": 8.70694522868436e-06, "loss": 0.5628, "step": 30000 }, { "epoch": 5.65, "grad_norm": 17.312114715576172, "learning_rate": 8.703180877093921e-06, "loss": 0.8434, "step": 30010 }, { "epoch": 5.65, "grad_norm": 10.707963943481445, "learning_rate": 8.699416525503483e-06, "loss": 0.5479, "step": 30020 }, { "epoch": 5.65, "grad_norm": 33.91600036621094, "learning_rate": 8.695652173913044e-06, "loss": 1.1438, "step": 30030 }, { "epoch": 5.65, "grad_norm": 26.10776138305664, "learning_rate": 8.691887822322606e-06, "loss": 0.6822, "step": 30040 }, { "epoch": 5.66, "grad_norm": 4.60282564163208, "learning_rate": 8.688123470732167e-06, "loss": 0.5197, "step": 30050 }, { "epoch": 5.66, "grad_norm": 1.051890254020691, "learning_rate": 8.684359119141729e-06, "loss": 0.6023, "step": 30060 }, { "epoch": 5.66, "grad_norm": 20.802509307861328, "learning_rate": 8.68059476755129e-06, "loss": 0.3724, "step": 30070 }, { "epoch": 5.66, "grad_norm": 11.538596153259277, "learning_rate": 8.676830415960852e-06, "loss": 0.5301, "step": 30080 }, { "epoch": 5.66, "grad_norm": 31.12389373779297, "learning_rate": 8.673066064370413e-06, "loss": 0.6386, "step": 30090 }, { "epoch": 5.67, "grad_norm": 17.99003028869629, "learning_rate": 8.669301712779975e-06, "loss": 0.4503, "step": 30100 }, { "epoch": 5.67, "grad_norm": 25.43760108947754, "learning_rate": 8.665537361189536e-06, "loss": 0.6649, "step": 30110 }, { "epoch": 5.67, "grad_norm": 0.21140538156032562, "learning_rate": 8.661773009599098e-06, "loss": 0.3958, "step": 30120 }, { "epoch": 5.67, "grad_norm": 6.105637550354004, "learning_rate": 8.658008658008659e-06, "loss": 0.7797, "step": 30130 }, { "epoch": 5.67, "grad_norm": 3.348720073699951, "learning_rate": 8.65424430641822e-06, "loss": 0.3967, "step": 30140 }, { "epoch": 5.67, "grad_norm": 17.45372772216797, "learning_rate": 8.650479954827782e-06, "loss": 0.4381, "step": 30150 }, { "epoch": 5.68, "grad_norm": 5.134333610534668, "learning_rate": 8.646715603237343e-06, "loss": 0.6194, "step": 30160 }, { "epoch": 5.68, "grad_norm": 28.561365127563477, "learning_rate": 8.642951251646905e-06, "loss": 0.8043, "step": 30170 }, { "epoch": 5.68, "grad_norm": 9.468137741088867, "learning_rate": 8.639186900056465e-06, "loss": 0.5759, "step": 30180 }, { "epoch": 5.68, "grad_norm": 5.760190010070801, "learning_rate": 8.635422548466028e-06, "loss": 0.6533, "step": 30190 }, { "epoch": 5.68, "grad_norm": 22.225126266479492, "learning_rate": 8.63165819687559e-06, "loss": 0.7, "step": 30200 }, { "epoch": 5.69, "grad_norm": 14.245889663696289, "learning_rate": 8.62789384528515e-06, "loss": 0.6829, "step": 30210 }, { "epoch": 5.69, "grad_norm": 30.38525390625, "learning_rate": 8.624129493694712e-06, "loss": 0.5493, "step": 30220 }, { "epoch": 5.69, "grad_norm": 16.88604164123535, "learning_rate": 8.620365142104274e-06, "loss": 0.6734, "step": 30230 }, { "epoch": 5.69, "grad_norm": 23.054962158203125, "learning_rate": 8.616600790513835e-06, "loss": 0.6553, "step": 30240 }, { "epoch": 5.69, "grad_norm": 16.953598022460938, "learning_rate": 8.612836438923397e-06, "loss": 0.6512, "step": 30250 }, { "epoch": 5.7, "grad_norm": 22.24566078186035, "learning_rate": 8.609072087332958e-06, "loss": 0.5785, "step": 30260 }, { "epoch": 5.7, "grad_norm": 0.3672504723072052, "learning_rate": 8.605307735742518e-06, "loss": 0.6382, "step": 30270 }, { "epoch": 5.7, "grad_norm": 0.8988174796104431, "learning_rate": 8.601543384152081e-06, "loss": 0.4421, "step": 30280 }, { "epoch": 5.7, "grad_norm": 15.05488395690918, "learning_rate": 8.597779032561642e-06, "loss": 0.4611, "step": 30290 }, { "epoch": 5.7, "grad_norm": 24.66227912902832, "learning_rate": 8.594014680971204e-06, "loss": 0.5693, "step": 30300 }, { "epoch": 5.7, "grad_norm": 24.09734344482422, "learning_rate": 8.590250329380765e-06, "loss": 0.4477, "step": 30310 }, { "epoch": 5.71, "grad_norm": 9.480891227722168, "learning_rate": 8.586485977790327e-06, "loss": 0.5086, "step": 30320 }, { "epoch": 5.71, "grad_norm": 41.71274185180664, "learning_rate": 8.582721626199888e-06, "loss": 0.8377, "step": 30330 }, { "epoch": 5.71, "grad_norm": 30.149093627929688, "learning_rate": 8.57895727460945e-06, "loss": 0.6889, "step": 30340 }, { "epoch": 5.71, "grad_norm": 14.241212844848633, "learning_rate": 8.575192923019011e-06, "loss": 0.6828, "step": 30350 }, { "epoch": 5.71, "grad_norm": 8.023870468139648, "learning_rate": 8.571428571428571e-06, "loss": 0.5508, "step": 30360 }, { "epoch": 5.72, "grad_norm": 1.6237149238586426, "learning_rate": 8.567664219838134e-06, "loss": 0.3401, "step": 30370 }, { "epoch": 5.72, "grad_norm": 7.518566608428955, "learning_rate": 8.563899868247696e-06, "loss": 0.4684, "step": 30380 }, { "epoch": 5.72, "grad_norm": 15.331060409545898, "learning_rate": 8.560135516657257e-06, "loss": 0.5097, "step": 30390 }, { "epoch": 5.72, "grad_norm": 6.36647891998291, "learning_rate": 8.556371165066818e-06, "loss": 0.5214, "step": 30400 }, { "epoch": 5.72, "grad_norm": 17.332448959350586, "learning_rate": 8.55260681347638e-06, "loss": 0.4498, "step": 30410 }, { "epoch": 5.73, "grad_norm": 63.40143966674805, "learning_rate": 8.548842461885941e-06, "loss": 0.6639, "step": 30420 }, { "epoch": 5.73, "grad_norm": 9.465449333190918, "learning_rate": 8.545078110295503e-06, "loss": 0.5251, "step": 30430 }, { "epoch": 5.73, "grad_norm": 11.202866554260254, "learning_rate": 8.541313758705064e-06, "loss": 0.4721, "step": 30440 }, { "epoch": 5.73, "grad_norm": 66.58721923828125, "learning_rate": 8.537549407114624e-06, "loss": 0.6034, "step": 30450 }, { "epoch": 5.73, "grad_norm": 36.35587692260742, "learning_rate": 8.533785055524186e-06, "loss": 0.5666, "step": 30460 }, { "epoch": 5.73, "grad_norm": 0.15961378812789917, "learning_rate": 8.530020703933749e-06, "loss": 0.7364, "step": 30470 }, { "epoch": 5.74, "grad_norm": 6.215275287628174, "learning_rate": 8.52625635234331e-06, "loss": 0.4973, "step": 30480 }, { "epoch": 5.74, "grad_norm": 29.853361129760742, "learning_rate": 8.522492000752872e-06, "loss": 0.6356, "step": 30490 }, { "epoch": 5.74, "grad_norm": 18.131460189819336, "learning_rate": 8.518727649162433e-06, "loss": 0.7122, "step": 30500 }, { "epoch": 5.74, "grad_norm": 17.9971866607666, "learning_rate": 8.514963297571995e-06, "loss": 0.784, "step": 30510 }, { "epoch": 5.74, "grad_norm": 21.481882095336914, "learning_rate": 8.511198945981556e-06, "loss": 0.6354, "step": 30520 }, { "epoch": 5.75, "grad_norm": 7.704098224639893, "learning_rate": 8.507434594391116e-06, "loss": 0.5242, "step": 30530 }, { "epoch": 5.75, "grad_norm": 0.10782869905233383, "learning_rate": 8.503670242800677e-06, "loss": 0.3119, "step": 30540 }, { "epoch": 5.75, "grad_norm": 9.13038158416748, "learning_rate": 8.499905891210239e-06, "loss": 0.5088, "step": 30550 }, { "epoch": 5.75, "grad_norm": 1.3244730234146118, "learning_rate": 8.496141539619802e-06, "loss": 0.3711, "step": 30560 }, { "epoch": 5.75, "grad_norm": 12.541553497314453, "learning_rate": 8.492377188029363e-06, "loss": 0.4415, "step": 30570 }, { "epoch": 5.76, "grad_norm": 6.784567832946777, "learning_rate": 8.488612836438925e-06, "loss": 0.3377, "step": 30580 }, { "epoch": 5.76, "grad_norm": 29.57703971862793, "learning_rate": 8.484848484848486e-06, "loss": 0.5329, "step": 30590 }, { "epoch": 5.76, "grad_norm": 16.579954147338867, "learning_rate": 8.481084133258048e-06, "loss": 0.4068, "step": 30600 }, { "epoch": 5.76, "grad_norm": 13.801152229309082, "learning_rate": 8.477319781667609e-06, "loss": 0.5752, "step": 30610 }, { "epoch": 5.76, "grad_norm": 4.930255889892578, "learning_rate": 8.473555430077169e-06, "loss": 0.6852, "step": 30620 }, { "epoch": 5.77, "grad_norm": 6.195820331573486, "learning_rate": 8.46979107848673e-06, "loss": 0.4997, "step": 30630 }, { "epoch": 5.77, "grad_norm": 14.934310913085938, "learning_rate": 8.466026726896292e-06, "loss": 0.4644, "step": 30640 }, { "epoch": 5.77, "grad_norm": 13.503046989440918, "learning_rate": 8.462262375305855e-06, "loss": 0.6239, "step": 30650 }, { "epoch": 5.77, "grad_norm": 7.415577411651611, "learning_rate": 8.458498023715416e-06, "loss": 0.4476, "step": 30660 }, { "epoch": 5.77, "grad_norm": 11.47219181060791, "learning_rate": 8.454733672124978e-06, "loss": 0.4326, "step": 30670 }, { "epoch": 5.77, "grad_norm": 31.040847778320312, "learning_rate": 8.45096932053454e-06, "loss": 0.5231, "step": 30680 }, { "epoch": 5.78, "grad_norm": 14.181180953979492, "learning_rate": 8.4472049689441e-06, "loss": 0.6028, "step": 30690 }, { "epoch": 5.78, "grad_norm": 14.366339683532715, "learning_rate": 8.443440617353662e-06, "loss": 0.5395, "step": 30700 }, { "epoch": 5.78, "grad_norm": 1.154944896697998, "learning_rate": 8.439676265763222e-06, "loss": 0.6325, "step": 30710 }, { "epoch": 5.78, "grad_norm": 5.166085243225098, "learning_rate": 8.435911914172784e-06, "loss": 0.5345, "step": 30720 }, { "epoch": 5.78, "grad_norm": 2.16483211517334, "learning_rate": 8.432147562582345e-06, "loss": 0.4621, "step": 30730 }, { "epoch": 5.79, "grad_norm": 24.309677124023438, "learning_rate": 8.428383210991906e-06, "loss": 0.4354, "step": 30740 }, { "epoch": 5.79, "grad_norm": 30.56849479675293, "learning_rate": 8.42461885940147e-06, "loss": 0.6885, "step": 30750 }, { "epoch": 5.79, "grad_norm": 30.05738067626953, "learning_rate": 8.420854507811031e-06, "loss": 0.3176, "step": 30760 }, { "epoch": 5.79, "grad_norm": 1.265407919883728, "learning_rate": 8.417090156220593e-06, "loss": 0.415, "step": 30770 }, { "epoch": 5.79, "grad_norm": 1.8372256755828857, "learning_rate": 8.413325804630154e-06, "loss": 0.3897, "step": 30780 }, { "epoch": 5.8, "grad_norm": 5.470216274261475, "learning_rate": 8.409561453039714e-06, "loss": 0.7885, "step": 30790 }, { "epoch": 5.8, "grad_norm": 28.494230270385742, "learning_rate": 8.405797101449275e-06, "loss": 0.5577, "step": 30800 }, { "epoch": 5.8, "grad_norm": 3.192377805709839, "learning_rate": 8.402032749858837e-06, "loss": 0.6976, "step": 30810 }, { "epoch": 5.8, "grad_norm": 16.5463924407959, "learning_rate": 8.398268398268398e-06, "loss": 0.4397, "step": 30820 }, { "epoch": 5.8, "grad_norm": 28.271276473999023, "learning_rate": 8.39450404667796e-06, "loss": 0.8356, "step": 30830 }, { "epoch": 5.8, "grad_norm": 1.6185463666915894, "learning_rate": 8.390739695087523e-06, "loss": 0.5331, "step": 30840 }, { "epoch": 5.81, "grad_norm": 27.62196159362793, "learning_rate": 8.386975343497084e-06, "loss": 0.4344, "step": 30850 }, { "epoch": 5.81, "grad_norm": 47.22684097290039, "learning_rate": 8.383210991906646e-06, "loss": 0.7706, "step": 30860 }, { "epoch": 5.81, "grad_norm": 9.242682456970215, "learning_rate": 8.379446640316207e-06, "loss": 0.6522, "step": 30870 }, { "epoch": 5.81, "grad_norm": 9.148713111877441, "learning_rate": 8.375682288725767e-06, "loss": 0.4746, "step": 30880 }, { "epoch": 5.81, "grad_norm": 24.679914474487305, "learning_rate": 8.371917937135328e-06, "loss": 0.6837, "step": 30890 }, { "epoch": 5.82, "grad_norm": 14.974303245544434, "learning_rate": 8.36815358554489e-06, "loss": 0.6234, "step": 30900 }, { "epoch": 5.82, "grad_norm": 17.962249755859375, "learning_rate": 8.364389233954451e-06, "loss": 0.507, "step": 30910 }, { "epoch": 5.82, "grad_norm": 10.956331253051758, "learning_rate": 8.360624882364013e-06, "loss": 0.5623, "step": 30920 }, { "epoch": 5.82, "grad_norm": 0.729061484336853, "learning_rate": 8.356860530773576e-06, "loss": 0.6084, "step": 30930 }, { "epoch": 5.82, "grad_norm": 12.048168182373047, "learning_rate": 8.353096179183137e-06, "loss": 0.3989, "step": 30940 }, { "epoch": 5.83, "grad_norm": 7.992116928100586, "learning_rate": 8.349331827592699e-06, "loss": 0.4077, "step": 30950 }, { "epoch": 5.83, "grad_norm": 19.040918350219727, "learning_rate": 8.34556747600226e-06, "loss": 0.9477, "step": 30960 }, { "epoch": 5.83, "grad_norm": 12.26267147064209, "learning_rate": 8.34180312441182e-06, "loss": 0.5282, "step": 30970 }, { "epoch": 5.83, "grad_norm": 7.242198467254639, "learning_rate": 8.338038772821381e-06, "loss": 0.4611, "step": 30980 }, { "epoch": 5.83, "grad_norm": 27.804534912109375, "learning_rate": 8.334274421230943e-06, "loss": 0.5615, "step": 30990 }, { "epoch": 5.83, "grad_norm": 0.4633367955684662, "learning_rate": 8.330510069640504e-06, "loss": 0.533, "step": 31000 }, { "epoch": 5.84, "grad_norm": 7.059324741363525, "learning_rate": 8.326745718050066e-06, "loss": 0.1811, "step": 31010 }, { "epoch": 5.84, "grad_norm": 13.364731788635254, "learning_rate": 8.322981366459629e-06, "loss": 0.4192, "step": 31020 }, { "epoch": 5.84, "grad_norm": 10.441876411437988, "learning_rate": 8.31921701486919e-06, "loss": 0.4543, "step": 31030 }, { "epoch": 5.84, "grad_norm": 1.0615036487579346, "learning_rate": 8.315452663278752e-06, "loss": 0.4342, "step": 31040 }, { "epoch": 5.84, "grad_norm": 0.598838210105896, "learning_rate": 8.311688311688313e-06, "loss": 0.6021, "step": 31050 }, { "epoch": 5.85, "grad_norm": 32.29551315307617, "learning_rate": 8.307923960097873e-06, "loss": 0.5029, "step": 31060 }, { "epoch": 5.85, "grad_norm": 42.38307571411133, "learning_rate": 8.304159608507435e-06, "loss": 0.4835, "step": 31070 }, { "epoch": 5.85, "grad_norm": 0.41502639651298523, "learning_rate": 8.300395256916996e-06, "loss": 0.2942, "step": 31080 }, { "epoch": 5.85, "grad_norm": 1.5998517274856567, "learning_rate": 8.296630905326558e-06, "loss": 0.4603, "step": 31090 }, { "epoch": 5.85, "grad_norm": 18.18465232849121, "learning_rate": 8.292866553736119e-06, "loss": 0.5552, "step": 31100 }, { "epoch": 5.86, "grad_norm": 19.617231369018555, "learning_rate": 8.28910220214568e-06, "loss": 0.5419, "step": 31110 }, { "epoch": 5.86, "grad_norm": 33.43571472167969, "learning_rate": 8.285337850555244e-06, "loss": 0.3924, "step": 31120 }, { "epoch": 5.86, "grad_norm": 14.464301109313965, "learning_rate": 8.281573498964805e-06, "loss": 0.5375, "step": 31130 }, { "epoch": 5.86, "grad_norm": 15.446751594543457, "learning_rate": 8.277809147374365e-06, "loss": 0.2812, "step": 31140 }, { "epoch": 5.86, "grad_norm": 24.76715087890625, "learning_rate": 8.274044795783926e-06, "loss": 0.4512, "step": 31150 }, { "epoch": 5.86, "grad_norm": 19.724828720092773, "learning_rate": 8.270280444193488e-06, "loss": 0.5171, "step": 31160 }, { "epoch": 5.87, "grad_norm": 6.281129837036133, "learning_rate": 8.26651609260305e-06, "loss": 0.3646, "step": 31170 }, { "epoch": 5.87, "grad_norm": 0.5755620002746582, "learning_rate": 8.26275174101261e-06, "loss": 0.6023, "step": 31180 }, { "epoch": 5.87, "grad_norm": 24.553237915039062, "learning_rate": 8.258987389422172e-06, "loss": 0.4937, "step": 31190 }, { "epoch": 5.87, "grad_norm": 25.367462158203125, "learning_rate": 8.255223037831734e-06, "loss": 0.4216, "step": 31200 }, { "epoch": 5.87, "grad_norm": 5.50634765625, "learning_rate": 8.251458686241297e-06, "loss": 0.3745, "step": 31210 }, { "epoch": 5.88, "grad_norm": 31.35790252685547, "learning_rate": 8.247694334650858e-06, "loss": 0.6542, "step": 31220 }, { "epoch": 5.88, "grad_norm": 18.49880027770996, "learning_rate": 8.243929983060418e-06, "loss": 1.0202, "step": 31230 }, { "epoch": 5.88, "grad_norm": 0.294521301984787, "learning_rate": 8.24016563146998e-06, "loss": 0.2461, "step": 31240 }, { "epoch": 5.88, "grad_norm": 2.930119276046753, "learning_rate": 8.236401279879541e-06, "loss": 0.4741, "step": 31250 }, { "epoch": 5.88, "grad_norm": 14.062408447265625, "learning_rate": 8.232636928289102e-06, "loss": 0.5901, "step": 31260 }, { "epoch": 5.89, "grad_norm": 1.9135679006576538, "learning_rate": 8.228872576698664e-06, "loss": 0.3728, "step": 31270 }, { "epoch": 5.89, "grad_norm": 12.453121185302734, "learning_rate": 8.225108225108225e-06, "loss": 0.612, "step": 31280 }, { "epoch": 5.89, "grad_norm": 18.037458419799805, "learning_rate": 8.221343873517787e-06, "loss": 0.6464, "step": 31290 }, { "epoch": 5.89, "grad_norm": 14.36330795288086, "learning_rate": 8.21757952192735e-06, "loss": 0.6451, "step": 31300 }, { "epoch": 5.89, "grad_norm": 17.178071975708008, "learning_rate": 8.213815170336911e-06, "loss": 0.4489, "step": 31310 }, { "epoch": 5.89, "grad_norm": 3.543149948120117, "learning_rate": 8.210050818746471e-06, "loss": 0.8048, "step": 31320 }, { "epoch": 5.9, "grad_norm": 8.747088432312012, "learning_rate": 8.206286467156033e-06, "loss": 0.5323, "step": 31330 }, { "epoch": 5.9, "grad_norm": 10.16124439239502, "learning_rate": 8.202522115565594e-06, "loss": 0.2721, "step": 31340 }, { "epoch": 5.9, "grad_norm": 25.606826782226562, "learning_rate": 8.198757763975156e-06, "loss": 0.3302, "step": 31350 }, { "epoch": 5.9, "grad_norm": 10.258896827697754, "learning_rate": 8.194993412384717e-06, "loss": 0.4167, "step": 31360 }, { "epoch": 5.9, "grad_norm": 11.452646255493164, "learning_rate": 8.191229060794278e-06, "loss": 0.5729, "step": 31370 }, { "epoch": 5.91, "grad_norm": 20.775026321411133, "learning_rate": 8.18746470920384e-06, "loss": 0.6972, "step": 31380 }, { "epoch": 5.91, "grad_norm": 0.9261711239814758, "learning_rate": 8.183700357613401e-06, "loss": 0.4341, "step": 31390 }, { "epoch": 5.91, "grad_norm": 23.06140899658203, "learning_rate": 8.179936006022963e-06, "loss": 0.6072, "step": 31400 }, { "epoch": 5.91, "grad_norm": 0.12934058904647827, "learning_rate": 8.176171654432524e-06, "loss": 0.5984, "step": 31410 }, { "epoch": 5.91, "grad_norm": 11.607454299926758, "learning_rate": 8.172407302842086e-06, "loss": 0.4704, "step": 31420 }, { "epoch": 5.92, "grad_norm": 17.176578521728516, "learning_rate": 8.168642951251647e-06, "loss": 0.6911, "step": 31430 }, { "epoch": 5.92, "grad_norm": 21.022741317749023, "learning_rate": 8.164878599661209e-06, "loss": 0.5338, "step": 31440 }, { "epoch": 5.92, "grad_norm": 1.4555150270462036, "learning_rate": 8.16111424807077e-06, "loss": 0.5478, "step": 31450 }, { "epoch": 5.92, "grad_norm": 0.8523018956184387, "learning_rate": 8.157349896480332e-06, "loss": 0.3781, "step": 31460 }, { "epoch": 5.92, "grad_norm": 0.12009706348180771, "learning_rate": 8.153585544889893e-06, "loss": 0.5797, "step": 31470 }, { "epoch": 5.93, "grad_norm": 3.4244158267974854, "learning_rate": 8.149821193299455e-06, "loss": 0.2886, "step": 31480 }, { "epoch": 5.93, "grad_norm": 5.182470798492432, "learning_rate": 8.146056841709016e-06, "loss": 0.4928, "step": 31490 }, { "epoch": 5.93, "grad_norm": 37.03120040893555, "learning_rate": 8.142292490118577e-06, "loss": 0.5429, "step": 31500 }, { "epoch": 5.93, "grad_norm": 10.818685531616211, "learning_rate": 8.138528138528139e-06, "loss": 0.7055, "step": 31510 }, { "epoch": 5.93, "grad_norm": 19.673473358154297, "learning_rate": 8.1347637869377e-06, "loss": 0.7041, "step": 31520 }, { "epoch": 5.93, "grad_norm": 19.44447135925293, "learning_rate": 8.130999435347262e-06, "loss": 0.4307, "step": 31530 }, { "epoch": 5.94, "grad_norm": 15.414891242980957, "learning_rate": 8.127235083756823e-06, "loss": 0.433, "step": 31540 }, { "epoch": 5.94, "grad_norm": 1.3869212865829468, "learning_rate": 8.123470732166385e-06, "loss": 0.4351, "step": 31550 }, { "epoch": 5.94, "grad_norm": 6.919072151184082, "learning_rate": 8.119706380575946e-06, "loss": 0.4183, "step": 31560 }, { "epoch": 5.94, "grad_norm": 18.920053482055664, "learning_rate": 8.115942028985508e-06, "loss": 0.6036, "step": 31570 }, { "epoch": 5.94, "grad_norm": 23.368011474609375, "learning_rate": 8.112177677395069e-06, "loss": 0.4688, "step": 31580 }, { "epoch": 5.95, "grad_norm": 43.60259246826172, "learning_rate": 8.10841332580463e-06, "loss": 0.2959, "step": 31590 }, { "epoch": 5.95, "grad_norm": 3.7617850303649902, "learning_rate": 8.104648974214192e-06, "loss": 0.3229, "step": 31600 }, { "epoch": 5.95, "grad_norm": 28.480266571044922, "learning_rate": 8.100884622623754e-06, "loss": 0.5828, "step": 31610 }, { "epoch": 5.95, "grad_norm": 6.059977054595947, "learning_rate": 8.097120271033315e-06, "loss": 0.6493, "step": 31620 }, { "epoch": 5.95, "grad_norm": 21.183731079101562, "learning_rate": 8.093355919442876e-06, "loss": 0.4919, "step": 31630 }, { "epoch": 5.96, "grad_norm": 0.6658665537834167, "learning_rate": 8.089591567852438e-06, "loss": 0.3223, "step": 31640 }, { "epoch": 5.96, "grad_norm": 6.641610622406006, "learning_rate": 8.085827216262e-06, "loss": 0.452, "step": 31650 }, { "epoch": 5.96, "grad_norm": 2.345966339111328, "learning_rate": 8.08206286467156e-06, "loss": 0.4293, "step": 31660 }, { "epoch": 5.96, "grad_norm": 21.061580657958984, "learning_rate": 8.078298513081122e-06, "loss": 0.5237, "step": 31670 }, { "epoch": 5.96, "grad_norm": 20.903779983520508, "learning_rate": 8.074534161490684e-06, "loss": 0.4802, "step": 31680 }, { "epoch": 5.96, "grad_norm": 13.212108612060547, "learning_rate": 8.070769809900245e-06, "loss": 0.674, "step": 31690 }, { "epoch": 5.97, "grad_norm": 12.111379623413086, "learning_rate": 8.067005458309807e-06, "loss": 0.519, "step": 31700 }, { "epoch": 5.97, "grad_norm": 21.36750030517578, "learning_rate": 8.063241106719368e-06, "loss": 0.6218, "step": 31710 }, { "epoch": 5.97, "grad_norm": 35.358192443847656, "learning_rate": 8.05947675512893e-06, "loss": 0.5728, "step": 31720 }, { "epoch": 5.97, "grad_norm": 16.899436950683594, "learning_rate": 8.055712403538491e-06, "loss": 0.3583, "step": 31730 }, { "epoch": 5.97, "grad_norm": 27.319355010986328, "learning_rate": 8.051948051948052e-06, "loss": 0.5901, "step": 31740 }, { "epoch": 5.98, "grad_norm": 14.794363021850586, "learning_rate": 8.048183700357614e-06, "loss": 0.5036, "step": 31750 }, { "epoch": 5.98, "grad_norm": 0.04856634885072708, "learning_rate": 8.044419348767175e-06, "loss": 0.4173, "step": 31760 }, { "epoch": 5.98, "grad_norm": 16.557096481323242, "learning_rate": 8.040654997176737e-06, "loss": 0.6479, "step": 31770 }, { "epoch": 5.98, "grad_norm": 12.90949821472168, "learning_rate": 8.036890645586298e-06, "loss": 0.5475, "step": 31780 }, { "epoch": 5.98, "grad_norm": 23.995361328125, "learning_rate": 8.03312629399586e-06, "loss": 0.4879, "step": 31790 }, { "epoch": 5.99, "grad_norm": 15.855318069458008, "learning_rate": 8.029361942405421e-06, "loss": 0.5636, "step": 31800 }, { "epoch": 5.99, "grad_norm": 17.926624298095703, "learning_rate": 8.025597590814983e-06, "loss": 0.6181, "step": 31810 }, { "epoch": 5.99, "grad_norm": 25.674468994140625, "learning_rate": 8.021833239224544e-06, "loss": 0.5092, "step": 31820 }, { "epoch": 5.99, "grad_norm": 15.86432933807373, "learning_rate": 8.018068887634106e-06, "loss": 0.5039, "step": 31830 }, { "epoch": 5.99, "grad_norm": 16.778894424438477, "learning_rate": 8.014304536043667e-06, "loss": 0.53, "step": 31840 }, { "epoch": 5.99, "grad_norm": 20.491600036621094, "learning_rate": 8.010540184453229e-06, "loss": 0.5336, "step": 31850 }, { "epoch": 6.0, "grad_norm": 22.663936614990234, "learning_rate": 8.00677583286279e-06, "loss": 0.2657, "step": 31860 }, { "epoch": 6.0, "grad_norm": 30.6973876953125, "learning_rate": 8.003011481272351e-06, "loss": 0.3944, "step": 31870 }, { "epoch": 6.0, "eval_accuracy": 0.9201333333333334, "eval_loss": 0.3082124590873718, "eval_runtime": 51.3947, "eval_samples_per_second": 145.929, "eval_steps_per_second": 18.251, "step": 31878 }, { "epoch": 6.0, "grad_norm": 16.97609519958496, "learning_rate": 7.999247129681913e-06, "loss": 0.5414, "step": 31880 }, { "epoch": 6.0, "grad_norm": 34.15602111816406, "learning_rate": 7.995482778091474e-06, "loss": 0.7876, "step": 31890 }, { "epoch": 6.0, "grad_norm": 18.55365753173828, "learning_rate": 7.991718426501036e-06, "loss": 0.5426, "step": 31900 }, { "epoch": 6.01, "grad_norm": 23.530202865600586, "learning_rate": 7.987954074910597e-06, "loss": 0.5966, "step": 31910 }, { "epoch": 6.01, "grad_norm": 22.947160720825195, "learning_rate": 7.984189723320159e-06, "loss": 0.5899, "step": 31920 }, { "epoch": 6.01, "grad_norm": 21.2280216217041, "learning_rate": 7.98042537172972e-06, "loss": 0.2858, "step": 31930 }, { "epoch": 6.01, "grad_norm": 13.020065307617188, "learning_rate": 7.976661020139282e-06, "loss": 0.6148, "step": 31940 }, { "epoch": 6.01, "grad_norm": 25.60552978515625, "learning_rate": 7.972896668548843e-06, "loss": 0.4152, "step": 31950 }, { "epoch": 6.02, "grad_norm": 0.053030744194984436, "learning_rate": 7.969132316958405e-06, "loss": 0.4397, "step": 31960 }, { "epoch": 6.02, "grad_norm": 18.835439682006836, "learning_rate": 7.965367965367966e-06, "loss": 0.574, "step": 31970 }, { "epoch": 6.02, "grad_norm": 13.809188842773438, "learning_rate": 7.961603613777528e-06, "loss": 0.3238, "step": 31980 }, { "epoch": 6.02, "grad_norm": 17.327247619628906, "learning_rate": 7.957839262187089e-06, "loss": 0.7307, "step": 31990 }, { "epoch": 6.02, "grad_norm": 9.818562507629395, "learning_rate": 7.95407491059665e-06, "loss": 0.4293, "step": 32000 }, { "epoch": 6.02, "grad_norm": 0.10741060227155685, "learning_rate": 7.950310559006212e-06, "loss": 0.3728, "step": 32010 }, { "epoch": 6.03, "grad_norm": 12.27418041229248, "learning_rate": 7.946546207415773e-06, "loss": 0.5624, "step": 32020 }, { "epoch": 6.03, "grad_norm": 24.412097930908203, "learning_rate": 7.942781855825335e-06, "loss": 0.3476, "step": 32030 }, { "epoch": 6.03, "grad_norm": 11.939266204833984, "learning_rate": 7.939017504234896e-06, "loss": 0.6026, "step": 32040 }, { "epoch": 6.03, "grad_norm": 1.6670972108840942, "learning_rate": 7.935253152644458e-06, "loss": 0.461, "step": 32050 }, { "epoch": 6.03, "grad_norm": 18.398563385009766, "learning_rate": 7.93148880105402e-06, "loss": 0.8113, "step": 32060 }, { "epoch": 6.04, "grad_norm": 7.880075454711914, "learning_rate": 7.92772444946358e-06, "loss": 0.3885, "step": 32070 }, { "epoch": 6.04, "grad_norm": 12.342472076416016, "learning_rate": 7.923960097873142e-06, "loss": 0.251, "step": 32080 }, { "epoch": 6.04, "grad_norm": 49.20840072631836, "learning_rate": 7.920195746282704e-06, "loss": 0.5007, "step": 32090 }, { "epoch": 6.04, "grad_norm": 17.772216796875, "learning_rate": 7.916431394692265e-06, "loss": 0.3225, "step": 32100 }, { "epoch": 6.04, "grad_norm": 53.0165901184082, "learning_rate": 7.912667043101827e-06, "loss": 0.597, "step": 32110 }, { "epoch": 6.05, "grad_norm": 16.379194259643555, "learning_rate": 7.908902691511388e-06, "loss": 0.4052, "step": 32120 }, { "epoch": 6.05, "grad_norm": 0.09049464017152786, "learning_rate": 7.90513833992095e-06, "loss": 0.777, "step": 32130 }, { "epoch": 6.05, "grad_norm": 8.671985626220703, "learning_rate": 7.901373988330511e-06, "loss": 0.8194, "step": 32140 }, { "epoch": 6.05, "grad_norm": 34.3663215637207, "learning_rate": 7.897609636740072e-06, "loss": 0.6592, "step": 32150 }, { "epoch": 6.05, "grad_norm": 1.3141288757324219, "learning_rate": 7.893845285149634e-06, "loss": 0.3544, "step": 32160 }, { "epoch": 6.05, "grad_norm": 17.053375244140625, "learning_rate": 7.890080933559195e-06, "loss": 0.7546, "step": 32170 }, { "epoch": 6.06, "grad_norm": 40.75886917114258, "learning_rate": 7.886316581968757e-06, "loss": 0.5418, "step": 32180 }, { "epoch": 6.06, "grad_norm": 9.297530174255371, "learning_rate": 7.882552230378318e-06, "loss": 0.6633, "step": 32190 }, { "epoch": 6.06, "grad_norm": 13.147290229797363, "learning_rate": 7.87878787878788e-06, "loss": 0.4542, "step": 32200 }, { "epoch": 6.06, "grad_norm": 31.284879684448242, "learning_rate": 7.875023527197441e-06, "loss": 0.4669, "step": 32210 }, { "epoch": 6.06, "grad_norm": 15.669341087341309, "learning_rate": 7.871259175607003e-06, "loss": 0.1938, "step": 32220 }, { "epoch": 6.07, "grad_norm": 44.40217590332031, "learning_rate": 7.867494824016564e-06, "loss": 0.3318, "step": 32230 }, { "epoch": 6.07, "grad_norm": 6.310189247131348, "learning_rate": 7.863730472426126e-06, "loss": 0.6378, "step": 32240 }, { "epoch": 6.07, "grad_norm": 13.212310791015625, "learning_rate": 7.859966120835687e-06, "loss": 0.6015, "step": 32250 }, { "epoch": 6.07, "grad_norm": 15.912529945373535, "learning_rate": 7.856201769245248e-06, "loss": 0.5288, "step": 32260 }, { "epoch": 6.07, "grad_norm": 1.8978145122528076, "learning_rate": 7.852437417654808e-06, "loss": 0.4826, "step": 32270 }, { "epoch": 6.08, "grad_norm": 39.089073181152344, "learning_rate": 7.848673066064371e-06, "loss": 0.5948, "step": 32280 }, { "epoch": 6.08, "grad_norm": 8.845064163208008, "learning_rate": 7.844908714473933e-06, "loss": 0.4591, "step": 32290 }, { "epoch": 6.08, "grad_norm": 20.97511100769043, "learning_rate": 7.841144362883494e-06, "loss": 0.6119, "step": 32300 }, { "epoch": 6.08, "grad_norm": 8.488319396972656, "learning_rate": 7.837380011293056e-06, "loss": 0.6892, "step": 32310 }, { "epoch": 6.08, "grad_norm": 38.96818542480469, "learning_rate": 7.833615659702617e-06, "loss": 0.3597, "step": 32320 }, { "epoch": 6.09, "grad_norm": 1.3680989742279053, "learning_rate": 7.829851308112179e-06, "loss": 0.5009, "step": 32330 }, { "epoch": 6.09, "grad_norm": 26.558027267456055, "learning_rate": 7.82608695652174e-06, "loss": 0.6395, "step": 32340 }, { "epoch": 6.09, "grad_norm": 17.90494155883789, "learning_rate": 7.822322604931302e-06, "loss": 0.6256, "step": 32350 }, { "epoch": 6.09, "grad_norm": 11.778217315673828, "learning_rate": 7.818558253340861e-06, "loss": 0.3306, "step": 32360 }, { "epoch": 6.09, "grad_norm": 18.40743637084961, "learning_rate": 7.814793901750425e-06, "loss": 0.4534, "step": 32370 }, { "epoch": 6.09, "grad_norm": 16.67788314819336, "learning_rate": 7.811029550159986e-06, "loss": 0.6644, "step": 32380 }, { "epoch": 6.1, "grad_norm": 0.07633399218320847, "learning_rate": 7.807265198569547e-06, "loss": 0.5102, "step": 32390 }, { "epoch": 6.1, "grad_norm": 15.977582931518555, "learning_rate": 7.803500846979109e-06, "loss": 0.5423, "step": 32400 }, { "epoch": 6.1, "grad_norm": 53.08889389038086, "learning_rate": 7.79973649538867e-06, "loss": 0.596, "step": 32410 }, { "epoch": 6.1, "grad_norm": 7.981726169586182, "learning_rate": 7.795972143798232e-06, "loss": 0.6868, "step": 32420 }, { "epoch": 6.1, "grad_norm": 22.594266891479492, "learning_rate": 7.792207792207793e-06, "loss": 0.6394, "step": 32430 }, { "epoch": 6.11, "grad_norm": 18.69639778137207, "learning_rate": 7.788443440617355e-06, "loss": 0.4663, "step": 32440 }, { "epoch": 6.11, "grad_norm": 11.259078025817871, "learning_rate": 7.784679089026915e-06, "loss": 0.4948, "step": 32450 }, { "epoch": 6.11, "grad_norm": 6.536806583404541, "learning_rate": 7.780914737436478e-06, "loss": 0.5363, "step": 32460 }, { "epoch": 6.11, "grad_norm": 24.416154861450195, "learning_rate": 7.777150385846039e-06, "loss": 0.6788, "step": 32470 }, { "epoch": 6.11, "grad_norm": 7.047547340393066, "learning_rate": 7.7733860342556e-06, "loss": 0.6391, "step": 32480 }, { "epoch": 6.12, "grad_norm": 32.992122650146484, "learning_rate": 7.769621682665162e-06, "loss": 0.4505, "step": 32490 }, { "epoch": 6.12, "grad_norm": 21.74233055114746, "learning_rate": 7.765857331074724e-06, "loss": 0.441, "step": 32500 }, { "epoch": 6.12, "grad_norm": 2.9683542251586914, "learning_rate": 7.762092979484285e-06, "loss": 0.4137, "step": 32510 }, { "epoch": 6.12, "grad_norm": 6.940702438354492, "learning_rate": 7.758328627893846e-06, "loss": 0.3561, "step": 32520 }, { "epoch": 6.12, "grad_norm": 13.663592338562012, "learning_rate": 7.754564276303408e-06, "loss": 0.4385, "step": 32530 }, { "epoch": 6.12, "grad_norm": 4.1214752197265625, "learning_rate": 7.750799924712968e-06, "loss": 0.5538, "step": 32540 }, { "epoch": 6.13, "grad_norm": 41.12890625, "learning_rate": 7.747035573122529e-06, "loss": 0.4315, "step": 32550 }, { "epoch": 6.13, "grad_norm": 6.651817798614502, "learning_rate": 7.743271221532092e-06, "loss": 0.5182, "step": 32560 }, { "epoch": 6.13, "grad_norm": 22.12759780883789, "learning_rate": 7.739506869941654e-06, "loss": 0.4464, "step": 32570 }, { "epoch": 6.13, "grad_norm": 9.155705451965332, "learning_rate": 7.735742518351215e-06, "loss": 0.5285, "step": 32580 }, { "epoch": 6.13, "grad_norm": 22.846324920654297, "learning_rate": 7.731978166760777e-06, "loss": 0.5287, "step": 32590 }, { "epoch": 6.14, "grad_norm": 2.597435474395752, "learning_rate": 7.728213815170338e-06, "loss": 0.064, "step": 32600 }, { "epoch": 6.14, "grad_norm": 10.483798027038574, "learning_rate": 7.7244494635799e-06, "loss": 0.3093, "step": 32610 }, { "epoch": 6.14, "grad_norm": 9.434378623962402, "learning_rate": 7.72068511198946e-06, "loss": 0.6346, "step": 32620 }, { "epoch": 6.14, "grad_norm": 0.27151790261268616, "learning_rate": 7.71692076039902e-06, "loss": 0.3794, "step": 32630 }, { "epoch": 6.14, "grad_norm": 7.006466388702393, "learning_rate": 7.713156408808582e-06, "loss": 0.5533, "step": 32640 }, { "epoch": 6.15, "grad_norm": 6.906270980834961, "learning_rate": 7.709392057218145e-06, "loss": 0.4613, "step": 32650 }, { "epoch": 6.15, "grad_norm": 1.0680428743362427, "learning_rate": 7.705627705627707e-06, "loss": 0.474, "step": 32660 }, { "epoch": 6.15, "grad_norm": 0.06817318499088287, "learning_rate": 7.701863354037268e-06, "loss": 0.4843, "step": 32670 }, { "epoch": 6.15, "grad_norm": 9.567086219787598, "learning_rate": 7.69809900244683e-06, "loss": 0.6909, "step": 32680 }, { "epoch": 6.15, "grad_norm": 4.2329301834106445, "learning_rate": 7.694334650856391e-06, "loss": 0.4636, "step": 32690 }, { "epoch": 6.15, "grad_norm": 0.9662070870399475, "learning_rate": 7.690570299265953e-06, "loss": 0.4107, "step": 32700 }, { "epoch": 6.16, "grad_norm": 10.737784385681152, "learning_rate": 7.686805947675512e-06, "loss": 0.3815, "step": 32710 }, { "epoch": 6.16, "grad_norm": 16.375762939453125, "learning_rate": 7.683041596085074e-06, "loss": 0.4073, "step": 32720 }, { "epoch": 6.16, "grad_norm": 3.9611103534698486, "learning_rate": 7.679277244494635e-06, "loss": 0.5408, "step": 32730 }, { "epoch": 6.16, "grad_norm": 3.900883674621582, "learning_rate": 7.675512892904199e-06, "loss": 0.5193, "step": 32740 }, { "epoch": 6.16, "grad_norm": 13.367103576660156, "learning_rate": 7.67174854131376e-06, "loss": 0.7197, "step": 32750 }, { "epoch": 6.17, "grad_norm": 0.168531134724617, "learning_rate": 7.667984189723321e-06, "loss": 0.3934, "step": 32760 }, { "epoch": 6.17, "grad_norm": 2.455349922180176, "learning_rate": 7.664219838132883e-06, "loss": 0.5187, "step": 32770 }, { "epoch": 6.17, "grad_norm": 17.506908416748047, "learning_rate": 7.660455486542444e-06, "loss": 0.5227, "step": 32780 }, { "epoch": 6.17, "grad_norm": 7.857856750488281, "learning_rate": 7.656691134952006e-06, "loss": 0.6049, "step": 32790 }, { "epoch": 6.17, "grad_norm": 10.179645538330078, "learning_rate": 7.652926783361566e-06, "loss": 0.5264, "step": 32800 }, { "epoch": 6.18, "grad_norm": 15.510612487792969, "learning_rate": 7.649162431771127e-06, "loss": 0.3707, "step": 32810 }, { "epoch": 6.18, "grad_norm": 4.712235927581787, "learning_rate": 7.645398080180689e-06, "loss": 0.5458, "step": 32820 }, { "epoch": 6.18, "grad_norm": 7.241546630859375, "learning_rate": 7.641633728590252e-06, "loss": 0.5487, "step": 32830 }, { "epoch": 6.18, "grad_norm": 1.5072888135910034, "learning_rate": 7.637869376999813e-06, "loss": 0.6038, "step": 32840 }, { "epoch": 6.18, "grad_norm": 0.652369499206543, "learning_rate": 7.634105025409375e-06, "loss": 0.5724, "step": 32850 }, { "epoch": 6.18, "grad_norm": 4.111026287078857, "learning_rate": 7.630340673818936e-06, "loss": 0.2281, "step": 32860 }, { "epoch": 6.19, "grad_norm": 0.2701318860054016, "learning_rate": 7.6265763222284976e-06, "loss": 0.6504, "step": 32870 }, { "epoch": 6.19, "grad_norm": 13.367432594299316, "learning_rate": 7.622811970638057e-06, "loss": 0.2885, "step": 32880 }, { "epoch": 6.19, "grad_norm": 5.478610515594482, "learning_rate": 7.61904761904762e-06, "loss": 0.5181, "step": 32890 }, { "epoch": 6.19, "grad_norm": 8.030369758605957, "learning_rate": 7.615283267457181e-06, "loss": 0.5897, "step": 32900 }, { "epoch": 6.19, "grad_norm": 13.834616661071777, "learning_rate": 7.6115189158667426e-06, "loss": 0.5119, "step": 32910 }, { "epoch": 6.2, "grad_norm": 5.764062881469727, "learning_rate": 7.607754564276304e-06, "loss": 0.5504, "step": 32920 }, { "epoch": 6.2, "grad_norm": 51.650718688964844, "learning_rate": 7.6039902126858655e-06, "loss": 0.4928, "step": 32930 }, { "epoch": 6.2, "grad_norm": 43.39817428588867, "learning_rate": 7.600225861095427e-06, "loss": 0.3655, "step": 32940 }, { "epoch": 6.2, "grad_norm": 1.6382009983062744, "learning_rate": 7.596461509504988e-06, "loss": 0.4539, "step": 32950 }, { "epoch": 6.2, "grad_norm": 0.764748215675354, "learning_rate": 7.592697157914551e-06, "loss": 0.6964, "step": 32960 }, { "epoch": 6.21, "grad_norm": 46.74039077758789, "learning_rate": 7.5889328063241105e-06, "loss": 0.4344, "step": 32970 }, { "epoch": 6.21, "grad_norm": 24.714452743530273, "learning_rate": 7.585168454733673e-06, "loss": 0.7972, "step": 32980 }, { "epoch": 6.21, "grad_norm": 15.897273063659668, "learning_rate": 7.581404103143234e-06, "loss": 0.443, "step": 32990 }, { "epoch": 6.21, "grad_norm": 5.593212604522705, "learning_rate": 7.577639751552796e-06, "loss": 0.5032, "step": 33000 }, { "epoch": 6.21, "grad_norm": 10.625405311584473, "learning_rate": 7.573875399962357e-06, "loss": 0.4239, "step": 33010 }, { "epoch": 6.21, "grad_norm": 28.671268463134766, "learning_rate": 7.570111048371919e-06, "loss": 0.3406, "step": 33020 }, { "epoch": 6.22, "grad_norm": 1.3727935552597046, "learning_rate": 7.56634669678148e-06, "loss": 0.4927, "step": 33030 }, { "epoch": 6.22, "grad_norm": 8.277888298034668, "learning_rate": 7.5625823451910415e-06, "loss": 0.5651, "step": 33040 }, { "epoch": 6.22, "grad_norm": 23.557905197143555, "learning_rate": 7.558817993600603e-06, "loss": 0.4431, "step": 33050 }, { "epoch": 6.22, "grad_norm": 27.223262786865234, "learning_rate": 7.555053642010164e-06, "loss": 0.8412, "step": 33060 }, { "epoch": 6.22, "grad_norm": 14.346407890319824, "learning_rate": 7.551289290419725e-06, "loss": 0.5623, "step": 33070 }, { "epoch": 6.23, "grad_norm": 0.6994918584823608, "learning_rate": 7.547524938829287e-06, "loss": 0.6004, "step": 33080 }, { "epoch": 6.23, "grad_norm": 3.5773377418518066, "learning_rate": 7.543760587238849e-06, "loss": 0.4514, "step": 33090 }, { "epoch": 6.23, "grad_norm": 15.664129257202148, "learning_rate": 7.53999623564841e-06, "loss": 0.437, "step": 33100 }, { "epoch": 6.23, "grad_norm": 14.454998016357422, "learning_rate": 7.536231884057972e-06, "loss": 0.3146, "step": 33110 }, { "epoch": 6.23, "grad_norm": 14.287149429321289, "learning_rate": 7.532467532467533e-06, "loss": 0.3651, "step": 33120 }, { "epoch": 6.24, "grad_norm": 35.363502502441406, "learning_rate": 7.528703180877095e-06, "loss": 0.3985, "step": 33130 }, { "epoch": 6.24, "grad_norm": 11.074187278747559, "learning_rate": 7.524938829286656e-06, "loss": 0.7611, "step": 33140 }, { "epoch": 6.24, "grad_norm": 2.34920597076416, "learning_rate": 7.521174477696217e-06, "loss": 0.6438, "step": 33150 }, { "epoch": 6.24, "grad_norm": 3.194157361984253, "learning_rate": 7.517410126105778e-06, "loss": 0.9574, "step": 33160 }, { "epoch": 6.24, "grad_norm": 15.871007919311523, "learning_rate": 7.5136457745153405e-06, "loss": 0.8127, "step": 33170 }, { "epoch": 6.25, "grad_norm": 0.2083834558725357, "learning_rate": 7.509881422924902e-06, "loss": 0.438, "step": 33180 }, { "epoch": 6.25, "grad_norm": 9.858641624450684, "learning_rate": 7.5061170713344635e-06, "loss": 0.4504, "step": 33190 }, { "epoch": 6.25, "grad_norm": 9.694896697998047, "learning_rate": 7.502352719744025e-06, "loss": 0.6191, "step": 33200 }, { "epoch": 6.25, "grad_norm": 18.69541358947754, "learning_rate": 7.498588368153586e-06, "loss": 0.9686, "step": 33210 }, { "epoch": 6.25, "grad_norm": 18.836008071899414, "learning_rate": 7.494824016563148e-06, "loss": 0.4195, "step": 33220 }, { "epoch": 6.25, "grad_norm": 8.28152084350586, "learning_rate": 7.4910596649727084e-06, "loss": 0.4661, "step": 33230 }, { "epoch": 6.26, "grad_norm": 0.8841540813446045, "learning_rate": 7.48729531338227e-06, "loss": 1.1192, "step": 33240 }, { "epoch": 6.26, "grad_norm": 21.22022247314453, "learning_rate": 7.483530961791831e-06, "loss": 0.5756, "step": 33250 }, { "epoch": 6.26, "grad_norm": 5.069668769836426, "learning_rate": 7.479766610201394e-06, "loss": 0.5685, "step": 33260 }, { "epoch": 6.26, "grad_norm": 1.734766960144043, "learning_rate": 7.476002258610955e-06, "loss": 0.2923, "step": 33270 }, { "epoch": 6.26, "grad_norm": 15.850728988647461, "learning_rate": 7.472237907020517e-06, "loss": 0.6755, "step": 33280 }, { "epoch": 6.27, "grad_norm": 0.8242527842521667, "learning_rate": 7.468473555430078e-06, "loss": 0.5514, "step": 33290 }, { "epoch": 6.27, "grad_norm": 22.578418731689453, "learning_rate": 7.4647092038396395e-06, "loss": 0.5968, "step": 33300 }, { "epoch": 6.27, "grad_norm": 17.16265869140625, "learning_rate": 7.460944852249201e-06, "loss": 0.4647, "step": 33310 }, { "epoch": 6.27, "grad_norm": 32.857601165771484, "learning_rate": 7.457180500658762e-06, "loss": 0.511, "step": 33320 }, { "epoch": 6.27, "grad_norm": 10.078173637390137, "learning_rate": 7.453416149068323e-06, "loss": 0.3435, "step": 33330 }, { "epoch": 6.28, "grad_norm": 18.10398292541504, "learning_rate": 7.4496517974778845e-06, "loss": 0.348, "step": 33340 }, { "epoch": 6.28, "grad_norm": 9.601995468139648, "learning_rate": 7.445887445887446e-06, "loss": 0.4952, "step": 33350 }, { "epoch": 6.28, "grad_norm": 13.778106689453125, "learning_rate": 7.442123094297008e-06, "loss": 0.4809, "step": 33360 }, { "epoch": 6.28, "grad_norm": 19.068946838378906, "learning_rate": 7.43835874270657e-06, "loss": 0.3458, "step": 33370 }, { "epoch": 6.28, "grad_norm": 1.7670868635177612, "learning_rate": 7.434594391116131e-06, "loss": 0.6565, "step": 33380 }, { "epoch": 6.28, "grad_norm": 25.797090530395508, "learning_rate": 7.430830039525693e-06, "loss": 0.7225, "step": 33390 }, { "epoch": 6.29, "grad_norm": 30.813005447387695, "learning_rate": 7.427065687935254e-06, "loss": 0.8055, "step": 33400 }, { "epoch": 6.29, "grad_norm": 7.973330497741699, "learning_rate": 7.423301336344815e-06, "loss": 0.4666, "step": 33410 }, { "epoch": 6.29, "grad_norm": 9.906311988830566, "learning_rate": 7.419536984754376e-06, "loss": 0.3954, "step": 33420 }, { "epoch": 6.29, "grad_norm": 43.313724517822266, "learning_rate": 7.415772633163938e-06, "loss": 0.6149, "step": 33430 }, { "epoch": 6.29, "grad_norm": 7.175143241882324, "learning_rate": 7.412008281573499e-06, "loss": 0.6234, "step": 33440 }, { "epoch": 6.3, "grad_norm": 28.30812644958496, "learning_rate": 7.4082439299830614e-06, "loss": 0.3256, "step": 33450 }, { "epoch": 6.3, "grad_norm": 12.158953666687012, "learning_rate": 7.404479578392623e-06, "loss": 0.3962, "step": 33460 }, { "epoch": 6.3, "grad_norm": 17.756189346313477, "learning_rate": 7.400715226802184e-06, "loss": 0.5152, "step": 33470 }, { "epoch": 6.3, "grad_norm": 18.10654067993164, "learning_rate": 7.396950875211746e-06, "loss": 0.6907, "step": 33480 }, { "epoch": 6.3, "grad_norm": 18.467599868774414, "learning_rate": 7.3931865236213064e-06, "loss": 0.4874, "step": 33490 }, { "epoch": 6.31, "grad_norm": 1.991639256477356, "learning_rate": 7.389422172030868e-06, "loss": 0.3471, "step": 33500 }, { "epoch": 6.31, "grad_norm": 20.875844955444336, "learning_rate": 7.385657820440429e-06, "loss": 0.8077, "step": 33510 }, { "epoch": 6.31, "grad_norm": 4.168043613433838, "learning_rate": 7.381893468849991e-06, "loss": 0.3689, "step": 33520 }, { "epoch": 6.31, "grad_norm": 0.10714370012283325, "learning_rate": 7.378129117259552e-06, "loss": 0.4633, "step": 33530 }, { "epoch": 6.31, "grad_norm": 14.68402099609375, "learning_rate": 7.3743647656691146e-06, "loss": 0.5519, "step": 33540 }, { "epoch": 6.31, "grad_norm": 1.1338722705841064, "learning_rate": 7.370600414078676e-06, "loss": 0.6535, "step": 33550 }, { "epoch": 6.32, "grad_norm": 20.80348014831543, "learning_rate": 7.3668360624882375e-06, "loss": 0.6073, "step": 33560 }, { "epoch": 6.32, "grad_norm": 17.18090057373047, "learning_rate": 7.363071710897799e-06, "loss": 0.5032, "step": 33570 }, { "epoch": 6.32, "grad_norm": 0.5403892397880554, "learning_rate": 7.3593073593073596e-06, "loss": 0.4669, "step": 33580 }, { "epoch": 6.32, "grad_norm": 10.763050079345703, "learning_rate": 7.355543007716921e-06, "loss": 0.5767, "step": 33590 }, { "epoch": 6.32, "grad_norm": 46.35898971557617, "learning_rate": 7.3517786561264825e-06, "loss": 0.6032, "step": 33600 }, { "epoch": 6.33, "grad_norm": 12.066878318786621, "learning_rate": 7.348014304536044e-06, "loss": 0.3331, "step": 33610 }, { "epoch": 6.33, "grad_norm": 7.480466365814209, "learning_rate": 7.344249952945605e-06, "loss": 0.6212, "step": 33620 }, { "epoch": 6.33, "grad_norm": 18.744464874267578, "learning_rate": 7.340485601355168e-06, "loss": 0.7442, "step": 33630 }, { "epoch": 6.33, "grad_norm": 7.432648658752441, "learning_rate": 7.336721249764729e-06, "loss": 0.358, "step": 33640 }, { "epoch": 6.33, "grad_norm": 8.940939903259277, "learning_rate": 7.332956898174291e-06, "loss": 0.7015, "step": 33650 }, { "epoch": 6.34, "grad_norm": 0.28902769088745117, "learning_rate": 7.329192546583852e-06, "loss": 0.5588, "step": 33660 }, { "epoch": 6.34, "grad_norm": 0.46211880445480347, "learning_rate": 7.325428194993413e-06, "loss": 0.6197, "step": 33670 }, { "epoch": 6.34, "grad_norm": 25.7103328704834, "learning_rate": 7.321663843402974e-06, "loss": 0.4847, "step": 33680 }, { "epoch": 6.34, "grad_norm": 8.652647018432617, "learning_rate": 7.317899491812536e-06, "loss": 0.3872, "step": 33690 }, { "epoch": 6.34, "grad_norm": 10.216687202453613, "learning_rate": 7.314135140222097e-06, "loss": 0.4059, "step": 33700 }, { "epoch": 6.34, "grad_norm": 2.1355504989624023, "learning_rate": 7.3103707886316586e-06, "loss": 0.5013, "step": 33710 }, { "epoch": 6.35, "grad_norm": 18.312965393066406, "learning_rate": 7.30660643704122e-06, "loss": 0.536, "step": 33720 }, { "epoch": 6.35, "grad_norm": 11.597451210021973, "learning_rate": 7.302842085450782e-06, "loss": 0.4923, "step": 33730 }, { "epoch": 6.35, "grad_norm": 21.869190216064453, "learning_rate": 7.299077733860344e-06, "loss": 0.6294, "step": 33740 }, { "epoch": 6.35, "grad_norm": 9.284852981567383, "learning_rate": 7.295313382269905e-06, "loss": 0.4809, "step": 33750 }, { "epoch": 6.35, "grad_norm": 26.03097152709961, "learning_rate": 7.291549030679466e-06, "loss": 0.6448, "step": 33760 }, { "epoch": 6.36, "grad_norm": 22.911418914794922, "learning_rate": 7.287784679089027e-06, "loss": 0.5017, "step": 33770 }, { "epoch": 6.36, "grad_norm": 50.75450897216797, "learning_rate": 7.284020327498589e-06, "loss": 0.6046, "step": 33780 }, { "epoch": 6.36, "grad_norm": 22.943689346313477, "learning_rate": 7.28025597590815e-06, "loss": 0.4832, "step": 33790 }, { "epoch": 6.36, "grad_norm": 7.2119460105896, "learning_rate": 7.276491624317712e-06, "loss": 0.852, "step": 33800 }, { "epoch": 6.36, "grad_norm": 9.327651023864746, "learning_rate": 7.272727272727273e-06, "loss": 0.5135, "step": 33810 }, { "epoch": 6.37, "grad_norm": 20.863204956054688, "learning_rate": 7.2689629211368355e-06, "loss": 0.5866, "step": 33820 }, { "epoch": 6.37, "grad_norm": 6.156462669372559, "learning_rate": 7.265198569546397e-06, "loss": 0.62, "step": 33830 }, { "epoch": 6.37, "grad_norm": 24.723724365234375, "learning_rate": 7.2614342179559576e-06, "loss": 0.6837, "step": 33840 }, { "epoch": 6.37, "grad_norm": 0.5178024172782898, "learning_rate": 7.257669866365519e-06, "loss": 0.2405, "step": 33850 }, { "epoch": 6.37, "grad_norm": 7.65985107421875, "learning_rate": 7.2539055147750805e-06, "loss": 0.3353, "step": 33860 }, { "epoch": 6.37, "grad_norm": 21.34691619873047, "learning_rate": 7.250141163184642e-06, "loss": 0.6229, "step": 33870 }, { "epoch": 6.38, "grad_norm": 20.477985382080078, "learning_rate": 7.246376811594203e-06, "loss": 0.4679, "step": 33880 }, { "epoch": 6.38, "grad_norm": 22.93824005126953, "learning_rate": 7.242612460003765e-06, "loss": 0.5659, "step": 33890 }, { "epoch": 6.38, "grad_norm": 0.7325233221054077, "learning_rate": 7.238848108413326e-06, "loss": 0.5032, "step": 33900 }, { "epoch": 6.38, "grad_norm": 14.224431037902832, "learning_rate": 7.235083756822889e-06, "loss": 0.674, "step": 33910 }, { "epoch": 6.38, "grad_norm": 19.057111740112305, "learning_rate": 7.23131940523245e-06, "loss": 0.4854, "step": 33920 }, { "epoch": 6.39, "grad_norm": 16.500669479370117, "learning_rate": 7.22755505364201e-06, "loss": 0.4265, "step": 33930 }, { "epoch": 6.39, "grad_norm": 18.075031280517578, "learning_rate": 7.223790702051572e-06, "loss": 0.8634, "step": 33940 }, { "epoch": 6.39, "grad_norm": 1.375807523727417, "learning_rate": 7.220026350461134e-06, "loss": 0.5841, "step": 33950 }, { "epoch": 6.39, "grad_norm": 2.372751474380493, "learning_rate": 7.216261998870695e-06, "loss": 0.3938, "step": 33960 }, { "epoch": 6.39, "grad_norm": 0.7880887985229492, "learning_rate": 7.2124976472802565e-06, "loss": 0.5251, "step": 33970 }, { "epoch": 6.4, "grad_norm": 10.042675018310547, "learning_rate": 7.208733295689818e-06, "loss": 0.3012, "step": 33980 }, { "epoch": 6.4, "grad_norm": 41.02911376953125, "learning_rate": 7.2049689440993795e-06, "loss": 0.6843, "step": 33990 }, { "epoch": 6.4, "grad_norm": 0.09337029606103897, "learning_rate": 7.201204592508941e-06, "loss": 0.6831, "step": 34000 }, { "epoch": 6.4, "grad_norm": 0.04473143443465233, "learning_rate": 7.197440240918503e-06, "loss": 0.5183, "step": 34010 }, { "epoch": 6.4, "grad_norm": 30.65087127685547, "learning_rate": 7.193675889328063e-06, "loss": 0.5636, "step": 34020 }, { "epoch": 6.41, "grad_norm": 22.237075805664062, "learning_rate": 7.189911537737625e-06, "loss": 0.6722, "step": 34030 }, { "epoch": 6.41, "grad_norm": 23.21628761291504, "learning_rate": 7.186147186147187e-06, "loss": 0.4067, "step": 34040 }, { "epoch": 6.41, "grad_norm": 6.441122055053711, "learning_rate": 7.182382834556748e-06, "loss": 0.373, "step": 34050 }, { "epoch": 6.41, "grad_norm": 17.526185989379883, "learning_rate": 7.17861848296631e-06, "loss": 0.4955, "step": 34060 }, { "epoch": 6.41, "grad_norm": 5.90300178527832, "learning_rate": 7.174854131375871e-06, "loss": 0.6357, "step": 34070 }, { "epoch": 6.41, "grad_norm": 14.964542388916016, "learning_rate": 7.171089779785433e-06, "loss": 0.6288, "step": 34080 }, { "epoch": 6.42, "grad_norm": 14.822508811950684, "learning_rate": 7.167325428194994e-06, "loss": 0.6348, "step": 34090 }, { "epoch": 6.42, "grad_norm": 40.991546630859375, "learning_rate": 7.163561076604555e-06, "loss": 0.3295, "step": 34100 }, { "epoch": 6.42, "grad_norm": 13.501119613647461, "learning_rate": 7.159796725014116e-06, "loss": 0.5754, "step": 34110 }, { "epoch": 6.42, "grad_norm": 18.85485076904297, "learning_rate": 7.1560323734236784e-06, "loss": 0.5432, "step": 34120 }, { "epoch": 6.42, "grad_norm": 8.995746612548828, "learning_rate": 7.15226802183324e-06, "loss": 0.9828, "step": 34130 }, { "epoch": 6.43, "grad_norm": 13.933249473571777, "learning_rate": 7.148503670242801e-06, "loss": 0.5424, "step": 34140 }, { "epoch": 6.43, "grad_norm": 13.510233879089355, "learning_rate": 7.144739318652363e-06, "loss": 0.6617, "step": 34150 }, { "epoch": 6.43, "grad_norm": 13.763504981994629, "learning_rate": 7.140974967061924e-06, "loss": 0.4808, "step": 34160 }, { "epoch": 6.43, "grad_norm": 18.854583740234375, "learning_rate": 7.137210615471486e-06, "loss": 0.3383, "step": 34170 }, { "epoch": 6.43, "grad_norm": 0.48108717799186707, "learning_rate": 7.133446263881047e-06, "loss": 0.4345, "step": 34180 }, { "epoch": 6.44, "grad_norm": 50.08021926879883, "learning_rate": 7.129681912290608e-06, "loss": 0.4781, "step": 34190 }, { "epoch": 6.44, "grad_norm": 18.643566131591797, "learning_rate": 7.125917560700169e-06, "loss": 0.4631, "step": 34200 }, { "epoch": 6.44, "grad_norm": 12.921839714050293, "learning_rate": 7.122153209109732e-06, "loss": 0.5039, "step": 34210 }, { "epoch": 6.44, "grad_norm": 27.30744171142578, "learning_rate": 7.118388857519293e-06, "loss": 0.5912, "step": 34220 }, { "epoch": 6.44, "grad_norm": 27.566017150878906, "learning_rate": 7.1146245059288545e-06, "loss": 0.4702, "step": 34230 }, { "epoch": 6.44, "grad_norm": 12.484590530395508, "learning_rate": 7.110860154338416e-06, "loss": 0.5907, "step": 34240 }, { "epoch": 6.45, "grad_norm": 19.984474182128906, "learning_rate": 7.1070958027479774e-06, "loss": 0.2223, "step": 34250 }, { "epoch": 6.45, "grad_norm": 6.380549430847168, "learning_rate": 7.103331451157539e-06, "loss": 0.3962, "step": 34260 }, { "epoch": 6.45, "grad_norm": 8.179642677307129, "learning_rate": 7.0995670995671e-06, "loss": 0.778, "step": 34270 }, { "epoch": 6.45, "grad_norm": 0.15643277764320374, "learning_rate": 7.095802747976661e-06, "loss": 0.316, "step": 34280 }, { "epoch": 6.45, "grad_norm": 17.682392120361328, "learning_rate": 7.0920383963862224e-06, "loss": 0.3378, "step": 34290 }, { "epoch": 6.46, "grad_norm": 10.891948699951172, "learning_rate": 7.088274044795784e-06, "loss": 0.345, "step": 34300 }, { "epoch": 6.46, "grad_norm": 37.315853118896484, "learning_rate": 7.084509693205346e-06, "loss": 0.5888, "step": 34310 }, { "epoch": 6.46, "grad_norm": 0.2774854302406311, "learning_rate": 7.080745341614908e-06, "loss": 0.2816, "step": 34320 }, { "epoch": 6.46, "grad_norm": 15.914167404174805, "learning_rate": 7.076980990024469e-06, "loss": 0.6068, "step": 34330 }, { "epoch": 6.46, "grad_norm": 17.352615356445312, "learning_rate": 7.073216638434031e-06, "loss": 0.4122, "step": 34340 }, { "epoch": 6.47, "grad_norm": 21.448284149169922, "learning_rate": 7.069452286843592e-06, "loss": 0.2653, "step": 34350 }, { "epoch": 6.47, "grad_norm": 21.003995895385742, "learning_rate": 7.0656879352531535e-06, "loss": 0.3752, "step": 34360 }, { "epoch": 6.47, "grad_norm": 12.033404350280762, "learning_rate": 7.061923583662714e-06, "loss": 0.5186, "step": 34370 }, { "epoch": 6.47, "grad_norm": 9.405254364013672, "learning_rate": 7.058159232072276e-06, "loss": 0.7365, "step": 34380 }, { "epoch": 6.47, "grad_norm": 25.861106872558594, "learning_rate": 7.054394880481837e-06, "loss": 0.7919, "step": 34390 }, { "epoch": 6.47, "grad_norm": 6.728229522705078, "learning_rate": 7.050630528891399e-06, "loss": 0.4589, "step": 34400 }, { "epoch": 6.48, "grad_norm": 29.374160766601562, "learning_rate": 7.046866177300961e-06, "loss": 0.7846, "step": 34410 }, { "epoch": 6.48, "grad_norm": 31.664583206176758, "learning_rate": 7.043101825710522e-06, "loss": 0.9016, "step": 34420 }, { "epoch": 6.48, "grad_norm": 9.76760482788086, "learning_rate": 7.039337474120084e-06, "loss": 0.5766, "step": 34430 }, { "epoch": 6.48, "grad_norm": 19.36331558227539, "learning_rate": 7.035573122529645e-06, "loss": 0.4637, "step": 34440 }, { "epoch": 6.48, "grad_norm": 7.009848594665527, "learning_rate": 7.031808770939206e-06, "loss": 0.6186, "step": 34450 }, { "epoch": 6.49, "grad_norm": 11.30031967163086, "learning_rate": 7.028044419348767e-06, "loss": 0.859, "step": 34460 }, { "epoch": 6.49, "grad_norm": 23.84954833984375, "learning_rate": 7.024280067758329e-06, "loss": 0.4672, "step": 34470 }, { "epoch": 6.49, "grad_norm": 1.282763957977295, "learning_rate": 7.02051571616789e-06, "loss": 0.3136, "step": 34480 }, { "epoch": 6.49, "grad_norm": 0.5825543999671936, "learning_rate": 7.0167513645774525e-06, "loss": 0.4409, "step": 34490 }, { "epoch": 6.49, "grad_norm": 23.45937728881836, "learning_rate": 7.012987012987014e-06, "loss": 0.5154, "step": 34500 }, { "epoch": 6.5, "grad_norm": 0.2989109456539154, "learning_rate": 7.009222661396575e-06, "loss": 0.5457, "step": 34510 }, { "epoch": 6.5, "grad_norm": 12.64752197265625, "learning_rate": 7.005458309806137e-06, "loss": 0.4991, "step": 34520 }, { "epoch": 6.5, "grad_norm": 56.25718688964844, "learning_rate": 7.001693958215698e-06, "loss": 0.6106, "step": 34530 }, { "epoch": 6.5, "grad_norm": 16.893884658813477, "learning_rate": 6.997929606625259e-06, "loss": 0.4511, "step": 34540 }, { "epoch": 6.5, "grad_norm": 8.532828330993652, "learning_rate": 6.99416525503482e-06, "loss": 0.388, "step": 34550 }, { "epoch": 6.5, "grad_norm": 12.58056926727295, "learning_rate": 6.990400903444382e-06, "loss": 0.8358, "step": 34560 }, { "epoch": 6.51, "grad_norm": 20.3173885345459, "learning_rate": 6.986636551853943e-06, "loss": 0.3383, "step": 34570 }, { "epoch": 6.51, "grad_norm": 7.896786689758301, "learning_rate": 6.982872200263505e-06, "loss": 0.5333, "step": 34580 }, { "epoch": 6.51, "grad_norm": 6.4286065101623535, "learning_rate": 6.979107848673067e-06, "loss": 0.4064, "step": 34590 }, { "epoch": 6.51, "grad_norm": 23.140363693237305, "learning_rate": 6.9753434970826286e-06, "loss": 0.623, "step": 34600 }, { "epoch": 6.51, "grad_norm": 12.2300443649292, "learning_rate": 6.97157914549219e-06, "loss": 0.4408, "step": 34610 }, { "epoch": 6.52, "grad_norm": 30.439172744750977, "learning_rate": 6.9678147939017515e-06, "loss": 0.3854, "step": 34620 }, { "epoch": 6.52, "grad_norm": 13.196409225463867, "learning_rate": 6.964050442311312e-06, "loss": 0.4278, "step": 34630 }, { "epoch": 6.52, "grad_norm": 8.042349815368652, "learning_rate": 6.9602860907208736e-06, "loss": 0.4624, "step": 34640 }, { "epoch": 6.52, "grad_norm": 7.406027793884277, "learning_rate": 6.956521739130435e-06, "loss": 0.4726, "step": 34650 }, { "epoch": 6.52, "grad_norm": 8.683379173278809, "learning_rate": 6.9527573875399965e-06, "loss": 0.6854, "step": 34660 }, { "epoch": 6.53, "grad_norm": 23.16596794128418, "learning_rate": 6.948993035949558e-06, "loss": 0.6081, "step": 34670 }, { "epoch": 6.53, "grad_norm": 0.07282250374555588, "learning_rate": 6.94522868435912e-06, "loss": 0.5896, "step": 34680 }, { "epoch": 6.53, "grad_norm": 39.00768280029297, "learning_rate": 6.941464332768682e-06, "loss": 0.5436, "step": 34690 }, { "epoch": 6.53, "grad_norm": 35.062660217285156, "learning_rate": 6.937699981178243e-06, "loss": 0.6628, "step": 34700 }, { "epoch": 6.53, "grad_norm": 9.127306938171387, "learning_rate": 6.933935629587804e-06, "loss": 0.7176, "step": 34710 }, { "epoch": 6.53, "grad_norm": 0.9268749952316284, "learning_rate": 6.930171277997365e-06, "loss": 0.438, "step": 34720 }, { "epoch": 6.54, "grad_norm": 2.197774648666382, "learning_rate": 6.926406926406927e-06, "loss": 0.4989, "step": 34730 }, { "epoch": 6.54, "grad_norm": 32.574371337890625, "learning_rate": 6.922642574816488e-06, "loss": 0.586, "step": 34740 }, { "epoch": 6.54, "grad_norm": 31.461170196533203, "learning_rate": 6.91887822322605e-06, "loss": 0.5052, "step": 34750 }, { "epoch": 6.54, "grad_norm": 12.898176193237305, "learning_rate": 6.915113871635611e-06, "loss": 0.5327, "step": 34760 }, { "epoch": 6.54, "grad_norm": 2.2167086601257324, "learning_rate": 6.911349520045173e-06, "loss": 0.5105, "step": 34770 }, { "epoch": 6.55, "grad_norm": 15.682035446166992, "learning_rate": 6.907585168454735e-06, "loss": 0.3067, "step": 34780 }, { "epoch": 6.55, "grad_norm": 41.14264678955078, "learning_rate": 6.903820816864296e-06, "loss": 0.4904, "step": 34790 }, { "epoch": 6.55, "grad_norm": 0.08841241896152496, "learning_rate": 6.900056465273857e-06, "loss": 0.4581, "step": 34800 }, { "epoch": 6.55, "grad_norm": 38.545257568359375, "learning_rate": 6.896292113683418e-06, "loss": 0.4896, "step": 34810 }, { "epoch": 6.55, "grad_norm": 35.339149475097656, "learning_rate": 6.89252776209298e-06, "loss": 0.5188, "step": 34820 }, { "epoch": 6.56, "grad_norm": 14.379655838012695, "learning_rate": 6.888763410502541e-06, "loss": 0.3942, "step": 34830 }, { "epoch": 6.56, "grad_norm": 21.1137752532959, "learning_rate": 6.884999058912103e-06, "loss": 0.5441, "step": 34840 }, { "epoch": 6.56, "grad_norm": 20.75609588623047, "learning_rate": 6.881234707321664e-06, "loss": 0.6482, "step": 34850 }, { "epoch": 6.56, "grad_norm": 19.010169982910156, "learning_rate": 6.8774703557312265e-06, "loss": 0.4973, "step": 34860 }, { "epoch": 6.56, "grad_norm": 19.815196990966797, "learning_rate": 6.873706004140788e-06, "loss": 0.6047, "step": 34870 }, { "epoch": 6.57, "grad_norm": 8.65109634399414, "learning_rate": 6.8699416525503495e-06, "loss": 0.3104, "step": 34880 }, { "epoch": 6.57, "grad_norm": 1.1485213041305542, "learning_rate": 6.86617730095991e-06, "loss": 0.4565, "step": 34890 }, { "epoch": 6.57, "grad_norm": 1.1633775234222412, "learning_rate": 6.8624129493694715e-06, "loss": 0.5844, "step": 34900 }, { "epoch": 6.57, "grad_norm": 7.708075523376465, "learning_rate": 6.858648597779033e-06, "loss": 0.6197, "step": 34910 }, { "epoch": 6.57, "grad_norm": 4.411716938018799, "learning_rate": 6.8548842461885945e-06, "loss": 0.5093, "step": 34920 }, { "epoch": 6.57, "grad_norm": 18.344188690185547, "learning_rate": 6.851119894598156e-06, "loss": 0.693, "step": 34930 }, { "epoch": 6.58, "grad_norm": 40.74114990234375, "learning_rate": 6.847355543007717e-06, "loss": 0.6018, "step": 34940 }, { "epoch": 6.58, "grad_norm": 0.057777296751737595, "learning_rate": 6.843591191417279e-06, "loss": 0.5193, "step": 34950 }, { "epoch": 6.58, "grad_norm": 13.32892894744873, "learning_rate": 6.839826839826841e-06, "loss": 0.5995, "step": 34960 }, { "epoch": 6.58, "grad_norm": 28.153160095214844, "learning_rate": 6.836062488236401e-06, "loss": 0.5686, "step": 34970 }, { "epoch": 6.58, "grad_norm": 7.822689056396484, "learning_rate": 6.832298136645963e-06, "loss": 0.4162, "step": 34980 }, { "epoch": 6.59, "grad_norm": 25.546289443969727, "learning_rate": 6.828533785055525e-06, "loss": 0.3894, "step": 34990 }, { "epoch": 6.59, "grad_norm": 0.47211772203445435, "learning_rate": 6.824769433465086e-06, "loss": 0.2991, "step": 35000 }, { "epoch": 6.59, "grad_norm": 6.658637523651123, "learning_rate": 6.821005081874648e-06, "loss": 0.6118, "step": 35010 }, { "epoch": 6.59, "grad_norm": 4.372053146362305, "learning_rate": 6.817240730284209e-06, "loss": 0.239, "step": 35020 }, { "epoch": 6.59, "grad_norm": 25.087793350219727, "learning_rate": 6.8134763786937705e-06, "loss": 0.6427, "step": 35030 }, { "epoch": 6.6, "grad_norm": 18.80016326904297, "learning_rate": 6.809712027103332e-06, "loss": 0.3743, "step": 35040 }, { "epoch": 6.6, "grad_norm": 0.2986818552017212, "learning_rate": 6.805947675512894e-06, "loss": 0.3221, "step": 35050 }, { "epoch": 6.6, "grad_norm": 15.778828620910645, "learning_rate": 6.802183323922454e-06, "loss": 0.855, "step": 35060 }, { "epoch": 6.6, "grad_norm": 1.3159215450286865, "learning_rate": 6.798418972332016e-06, "loss": 0.5559, "step": 35070 }, { "epoch": 6.6, "grad_norm": 0.06491217762231827, "learning_rate": 6.794654620741578e-06, "loss": 0.2105, "step": 35080 }, { "epoch": 6.6, "grad_norm": 19.969703674316406, "learning_rate": 6.790890269151139e-06, "loss": 0.4636, "step": 35090 }, { "epoch": 6.61, "grad_norm": 11.362780570983887, "learning_rate": 6.787125917560701e-06, "loss": 0.5571, "step": 35100 }, { "epoch": 6.61, "grad_norm": 8.18557357788086, "learning_rate": 6.783361565970262e-06, "loss": 0.4913, "step": 35110 }, { "epoch": 6.61, "grad_norm": 10.069347381591797, "learning_rate": 6.779597214379824e-06, "loss": 0.2839, "step": 35120 }, { "epoch": 6.61, "grad_norm": 2.1053032875061035, "learning_rate": 6.775832862789385e-06, "loss": 0.6116, "step": 35130 }, { "epoch": 6.61, "grad_norm": 12.72517204284668, "learning_rate": 6.7720685111989474e-06, "loss": 0.4671, "step": 35140 }, { "epoch": 6.62, "grad_norm": 19.65533447265625, "learning_rate": 6.768304159608507e-06, "loss": 0.5756, "step": 35150 }, { "epoch": 6.62, "grad_norm": 15.998537063598633, "learning_rate": 6.764539808018069e-06, "loss": 0.5428, "step": 35160 }, { "epoch": 6.62, "grad_norm": 24.56063461303711, "learning_rate": 6.760775456427631e-06, "loss": 0.6397, "step": 35170 }, { "epoch": 6.62, "grad_norm": 0.9871610403060913, "learning_rate": 6.7570111048371924e-06, "loss": 0.5336, "step": 35180 }, { "epoch": 6.62, "grad_norm": 0.14848846197128296, "learning_rate": 6.753246753246754e-06, "loss": 0.4377, "step": 35190 }, { "epoch": 6.63, "grad_norm": 55.045509338378906, "learning_rate": 6.749482401656315e-06, "loss": 0.5341, "step": 35200 }, { "epoch": 6.63, "grad_norm": 13.329508781433105, "learning_rate": 6.745718050065877e-06, "loss": 0.5933, "step": 35210 }, { "epoch": 6.63, "grad_norm": 3.753176212310791, "learning_rate": 6.741953698475438e-06, "loss": 0.301, "step": 35220 }, { "epoch": 6.63, "grad_norm": 0.3469744622707367, "learning_rate": 6.738189346885e-06, "loss": 0.3636, "step": 35230 }, { "epoch": 6.63, "grad_norm": 8.834991455078125, "learning_rate": 6.73442499529456e-06, "loss": 0.7362, "step": 35240 }, { "epoch": 6.63, "grad_norm": 13.375589370727539, "learning_rate": 6.730660643704122e-06, "loss": 0.4198, "step": 35250 }, { "epoch": 6.64, "grad_norm": 0.9780624508857727, "learning_rate": 6.726896292113684e-06, "loss": 0.2504, "step": 35260 }, { "epoch": 6.64, "grad_norm": 12.835625648498535, "learning_rate": 6.723131940523246e-06, "loss": 0.7162, "step": 35270 }, { "epoch": 6.64, "grad_norm": 1.3344264030456543, "learning_rate": 6.719367588932807e-06, "loss": 0.3267, "step": 35280 }, { "epoch": 6.64, "grad_norm": 33.066993713378906, "learning_rate": 6.7156032373423685e-06, "loss": 0.367, "step": 35290 }, { "epoch": 6.64, "grad_norm": 5.983303546905518, "learning_rate": 6.71183888575193e-06, "loss": 0.8041, "step": 35300 }, { "epoch": 6.65, "grad_norm": 34.84541320800781, "learning_rate": 6.7080745341614914e-06, "loss": 0.4726, "step": 35310 }, { "epoch": 6.65, "grad_norm": 15.427559852600098, "learning_rate": 6.704310182571052e-06, "loss": 0.2845, "step": 35320 }, { "epoch": 6.65, "grad_norm": 8.32028865814209, "learning_rate": 6.7005458309806135e-06, "loss": 0.5386, "step": 35330 }, { "epoch": 6.65, "grad_norm": 49.66559982299805, "learning_rate": 6.696781479390175e-06, "loss": 0.4041, "step": 35340 }, { "epoch": 6.65, "grad_norm": 3.6853106021881104, "learning_rate": 6.693017127799737e-06, "loss": 0.6245, "step": 35350 }, { "epoch": 6.66, "grad_norm": 2.9729301929473877, "learning_rate": 6.689252776209299e-06, "loss": 0.5603, "step": 35360 }, { "epoch": 6.66, "grad_norm": 7.048079490661621, "learning_rate": 6.68548842461886e-06, "loss": 0.5224, "step": 35370 }, { "epoch": 6.66, "grad_norm": 4.4709296226501465, "learning_rate": 6.681724073028422e-06, "loss": 0.2556, "step": 35380 }, { "epoch": 6.66, "grad_norm": 29.859085083007812, "learning_rate": 6.677959721437983e-06, "loss": 0.5444, "step": 35390 }, { "epoch": 6.66, "grad_norm": 6.1535797119140625, "learning_rate": 6.6741953698475446e-06, "loss": 0.5421, "step": 35400 }, { "epoch": 6.66, "grad_norm": 5.314403533935547, "learning_rate": 6.670431018257105e-06, "loss": 0.5939, "step": 35410 }, { "epoch": 6.67, "grad_norm": 17.58977508544922, "learning_rate": 6.666666666666667e-06, "loss": 0.4557, "step": 35420 }, { "epoch": 6.67, "grad_norm": 1.5328501462936401, "learning_rate": 6.662902315076228e-06, "loss": 0.4192, "step": 35430 }, { "epoch": 6.67, "grad_norm": 16.774066925048828, "learning_rate": 6.65913796348579e-06, "loss": 0.644, "step": 35440 }, { "epoch": 6.67, "grad_norm": 56.260440826416016, "learning_rate": 6.655373611895352e-06, "loss": 0.7726, "step": 35450 }, { "epoch": 6.67, "grad_norm": 20.503341674804688, "learning_rate": 6.651609260304913e-06, "loss": 0.5263, "step": 35460 }, { "epoch": 6.68, "grad_norm": 4.7927422523498535, "learning_rate": 6.647844908714475e-06, "loss": 0.5455, "step": 35470 }, { "epoch": 6.68, "grad_norm": 0.0458342470228672, "learning_rate": 6.644080557124036e-06, "loss": 0.3213, "step": 35480 }, { "epoch": 6.68, "grad_norm": 12.544048309326172, "learning_rate": 6.640316205533598e-06, "loss": 0.7539, "step": 35490 }, { "epoch": 6.68, "grad_norm": 23.94100570678711, "learning_rate": 6.636551853943158e-06, "loss": 0.7003, "step": 35500 }, { "epoch": 6.68, "grad_norm": 1.0551177263259888, "learning_rate": 6.63278750235272e-06, "loss": 0.3386, "step": 35510 }, { "epoch": 6.69, "grad_norm": 18.300785064697266, "learning_rate": 6.629023150762281e-06, "loss": 0.5915, "step": 35520 }, { "epoch": 6.69, "grad_norm": 2.241436243057251, "learning_rate": 6.625258799171843e-06, "loss": 0.3854, "step": 35530 }, { "epoch": 6.69, "grad_norm": 2.476534366607666, "learning_rate": 6.621494447581405e-06, "loss": 0.5499, "step": 35540 }, { "epoch": 6.69, "grad_norm": 14.764324188232422, "learning_rate": 6.6177300959909665e-06, "loss": 0.4017, "step": 35550 }, { "epoch": 6.69, "grad_norm": 20.741302490234375, "learning_rate": 6.613965744400528e-06, "loss": 0.533, "step": 35560 }, { "epoch": 6.69, "grad_norm": 20.3760929107666, "learning_rate": 6.610201392810089e-06, "loss": 0.7259, "step": 35570 }, { "epoch": 6.7, "grad_norm": 22.055103302001953, "learning_rate": 6.60643704121965e-06, "loss": 0.7339, "step": 35580 }, { "epoch": 6.7, "grad_norm": 0.13808929920196533, "learning_rate": 6.6026726896292115e-06, "loss": 0.2974, "step": 35590 }, { "epoch": 6.7, "grad_norm": 0.07140693813562393, "learning_rate": 6.598908338038773e-06, "loss": 0.3145, "step": 35600 }, { "epoch": 6.7, "grad_norm": 24.929866790771484, "learning_rate": 6.595143986448334e-06, "loss": 0.2889, "step": 35610 }, { "epoch": 6.7, "grad_norm": 16.1369571685791, "learning_rate": 6.591379634857896e-06, "loss": 1.012, "step": 35620 }, { "epoch": 6.71, "grad_norm": 29.283899307250977, "learning_rate": 6.587615283267458e-06, "loss": 0.5044, "step": 35630 }, { "epoch": 6.71, "grad_norm": 8.903387069702148, "learning_rate": 6.58385093167702e-06, "loss": 0.3735, "step": 35640 }, { "epoch": 6.71, "grad_norm": 32.65473937988281, "learning_rate": 6.580086580086581e-06, "loss": 0.3131, "step": 35650 }, { "epoch": 6.71, "grad_norm": 3.421217203140259, "learning_rate": 6.5763222284961426e-06, "loss": 0.4596, "step": 35660 }, { "epoch": 6.71, "grad_norm": 22.2224178314209, "learning_rate": 6.572557876905703e-06, "loss": 0.6354, "step": 35670 }, { "epoch": 6.72, "grad_norm": 10.610343933105469, "learning_rate": 6.568793525315265e-06, "loss": 0.6314, "step": 35680 }, { "epoch": 6.72, "grad_norm": 18.17767906188965, "learning_rate": 6.565029173724826e-06, "loss": 0.4187, "step": 35690 }, { "epoch": 6.72, "grad_norm": 11.569199562072754, "learning_rate": 6.5612648221343875e-06, "loss": 0.3357, "step": 35700 }, { "epoch": 6.72, "grad_norm": 2.6818912029266357, "learning_rate": 6.557500470543949e-06, "loss": 0.3558, "step": 35710 }, { "epoch": 6.72, "grad_norm": 7.722105026245117, "learning_rate": 6.553736118953511e-06, "loss": 0.3008, "step": 35720 }, { "epoch": 6.73, "grad_norm": 47.493011474609375, "learning_rate": 6.549971767363073e-06, "loss": 0.6678, "step": 35730 }, { "epoch": 6.73, "grad_norm": 21.441015243530273, "learning_rate": 6.546207415772634e-06, "loss": 0.5969, "step": 35740 }, { "epoch": 6.73, "grad_norm": 0.7386395931243896, "learning_rate": 6.542443064182196e-06, "loss": 0.6374, "step": 35750 }, { "epoch": 6.73, "grad_norm": 12.457733154296875, "learning_rate": 6.538678712591756e-06, "loss": 0.5366, "step": 35760 }, { "epoch": 6.73, "grad_norm": 16.376632690429688, "learning_rate": 6.534914361001318e-06, "loss": 0.3374, "step": 35770 }, { "epoch": 6.73, "grad_norm": 16.77738380432129, "learning_rate": 6.531150009410879e-06, "loss": 0.3973, "step": 35780 }, { "epoch": 6.74, "grad_norm": 3.2299344539642334, "learning_rate": 6.527385657820441e-06, "loss": 0.3064, "step": 35790 }, { "epoch": 6.74, "grad_norm": 29.193862915039062, "learning_rate": 6.523621306230002e-06, "loss": 0.5058, "step": 35800 }, { "epoch": 6.74, "grad_norm": 0.34984686970710754, "learning_rate": 6.519856954639564e-06, "loss": 0.4511, "step": 35810 }, { "epoch": 6.74, "grad_norm": 14.813493728637695, "learning_rate": 6.516092603049126e-06, "loss": 0.4569, "step": 35820 }, { "epoch": 6.74, "grad_norm": 15.381288528442383, "learning_rate": 6.512328251458687e-06, "loss": 0.5122, "step": 35830 }, { "epoch": 6.75, "grad_norm": 4.633648872375488, "learning_rate": 6.508563899868249e-06, "loss": 0.4984, "step": 35840 }, { "epoch": 6.75, "grad_norm": 9.600768089294434, "learning_rate": 6.5047995482778095e-06, "loss": 0.4427, "step": 35850 }, { "epoch": 6.75, "grad_norm": 10.763100624084473, "learning_rate": 6.501035196687371e-06, "loss": 0.4402, "step": 35860 }, { "epoch": 6.75, "grad_norm": 11.68506145477295, "learning_rate": 6.497270845096932e-06, "loss": 0.4209, "step": 35870 }, { "epoch": 6.75, "grad_norm": 14.833422660827637, "learning_rate": 6.493506493506494e-06, "loss": 0.6574, "step": 35880 }, { "epoch": 6.76, "grad_norm": 4.88935661315918, "learning_rate": 6.489742141916055e-06, "loss": 0.4479, "step": 35890 }, { "epoch": 6.76, "grad_norm": 10.467138290405273, "learning_rate": 6.485977790325617e-06, "loss": 0.5853, "step": 35900 }, { "epoch": 6.76, "grad_norm": 1.4499626159667969, "learning_rate": 6.482213438735179e-06, "loss": 0.4039, "step": 35910 }, { "epoch": 6.76, "grad_norm": 14.744542121887207, "learning_rate": 6.4784490871447405e-06, "loss": 0.333, "step": 35920 }, { "epoch": 6.76, "grad_norm": 0.04979345574975014, "learning_rate": 6.474684735554301e-06, "loss": 0.4814, "step": 35930 }, { "epoch": 6.76, "grad_norm": 15.655257225036621, "learning_rate": 6.470920383963863e-06, "loss": 0.4449, "step": 35940 }, { "epoch": 6.77, "grad_norm": 0.3292839229106903, "learning_rate": 6.467156032373424e-06, "loss": 0.2991, "step": 35950 }, { "epoch": 6.77, "grad_norm": 22.464662551879883, "learning_rate": 6.4633916807829855e-06, "loss": 0.9058, "step": 35960 }, { "epoch": 6.77, "grad_norm": 26.670724868774414, "learning_rate": 6.459627329192547e-06, "loss": 0.5506, "step": 35970 }, { "epoch": 6.77, "grad_norm": 20.745296478271484, "learning_rate": 6.4558629776021084e-06, "loss": 0.2994, "step": 35980 }, { "epoch": 6.77, "grad_norm": 25.35480499267578, "learning_rate": 6.45209862601167e-06, "loss": 0.5347, "step": 35990 }, { "epoch": 6.78, "grad_norm": 0.18468712270259857, "learning_rate": 6.448334274421232e-06, "loss": 0.4672, "step": 36000 }, { "epoch": 6.78, "grad_norm": 27.896060943603516, "learning_rate": 6.444569922830794e-06, "loss": 0.7579, "step": 36010 }, { "epoch": 6.78, "grad_norm": 8.810677528381348, "learning_rate": 6.440805571240354e-06, "loss": 0.4414, "step": 36020 }, { "epoch": 6.78, "grad_norm": 32.96968078613281, "learning_rate": 6.437041219649916e-06, "loss": 0.336, "step": 36030 }, { "epoch": 6.78, "grad_norm": 35.82292556762695, "learning_rate": 6.433276868059477e-06, "loss": 0.5106, "step": 36040 }, { "epoch": 6.79, "grad_norm": 27.85030746459961, "learning_rate": 6.429512516469039e-06, "loss": 0.5962, "step": 36050 }, { "epoch": 6.79, "grad_norm": 27.592071533203125, "learning_rate": 6.4257481648786e-06, "loss": 0.7332, "step": 36060 }, { "epoch": 6.79, "grad_norm": 23.796886444091797, "learning_rate": 6.421983813288162e-06, "loss": 0.4692, "step": 36070 }, { "epoch": 6.79, "grad_norm": 14.783629417419434, "learning_rate": 6.418219461697723e-06, "loss": 0.4037, "step": 36080 }, { "epoch": 6.79, "grad_norm": 14.800186157226562, "learning_rate": 6.414455110107285e-06, "loss": 0.3501, "step": 36090 }, { "epoch": 6.79, "grad_norm": 31.679777145385742, "learning_rate": 6.410690758516847e-06, "loss": 0.3348, "step": 36100 }, { "epoch": 6.8, "grad_norm": 27.46072769165039, "learning_rate": 6.406926406926407e-06, "loss": 0.7376, "step": 36110 }, { "epoch": 6.8, "grad_norm": 5.877180099487305, "learning_rate": 6.403162055335969e-06, "loss": 0.6983, "step": 36120 }, { "epoch": 6.8, "grad_norm": 2.0558340549468994, "learning_rate": 6.39939770374553e-06, "loss": 0.499, "step": 36130 }, { "epoch": 6.8, "grad_norm": 7.006868839263916, "learning_rate": 6.395633352155092e-06, "loss": 0.544, "step": 36140 }, { "epoch": 6.8, "grad_norm": 15.996200561523438, "learning_rate": 6.391869000564653e-06, "loss": 0.3828, "step": 36150 }, { "epoch": 6.81, "grad_norm": 11.61379337310791, "learning_rate": 6.388104648974215e-06, "loss": 0.4883, "step": 36160 }, { "epoch": 6.81, "grad_norm": 7.964717388153076, "learning_rate": 6.384340297383776e-06, "loss": 0.3079, "step": 36170 }, { "epoch": 6.81, "grad_norm": 0.344844788312912, "learning_rate": 6.380575945793338e-06, "loss": 0.5976, "step": 36180 }, { "epoch": 6.81, "grad_norm": 5.613010883331299, "learning_rate": 6.376811594202898e-06, "loss": 0.5082, "step": 36190 }, { "epoch": 6.81, "grad_norm": 33.136268615722656, "learning_rate": 6.37304724261246e-06, "loss": 0.4065, "step": 36200 }, { "epoch": 6.82, "grad_norm": 26.772239685058594, "learning_rate": 6.369282891022022e-06, "loss": 0.6971, "step": 36210 }, { "epoch": 6.82, "grad_norm": 21.456703186035156, "learning_rate": 6.3655185394315835e-06, "loss": 0.3556, "step": 36220 }, { "epoch": 6.82, "grad_norm": 26.07500648498535, "learning_rate": 6.361754187841145e-06, "loss": 0.5151, "step": 36230 }, { "epoch": 6.82, "grad_norm": 19.035114288330078, "learning_rate": 6.3579898362507064e-06, "loss": 0.897, "step": 36240 }, { "epoch": 6.82, "grad_norm": 11.198712348937988, "learning_rate": 6.354225484660268e-06, "loss": 0.6124, "step": 36250 }, { "epoch": 6.82, "grad_norm": 7.805197715759277, "learning_rate": 6.350461133069829e-06, "loss": 0.6911, "step": 36260 }, { "epoch": 6.83, "grad_norm": 29.66564178466797, "learning_rate": 6.346696781479391e-06, "loss": 0.2914, "step": 36270 }, { "epoch": 6.83, "grad_norm": 42.64144515991211, "learning_rate": 6.342932429888951e-06, "loss": 0.8512, "step": 36280 }, { "epoch": 6.83, "grad_norm": 6.411401748657227, "learning_rate": 6.339168078298513e-06, "loss": 0.5668, "step": 36290 }, { "epoch": 6.83, "grad_norm": 11.873297691345215, "learning_rate": 6.335403726708075e-06, "loss": 0.6048, "step": 36300 }, { "epoch": 6.83, "grad_norm": 18.666488647460938, "learning_rate": 6.331639375117637e-06, "loss": 0.6428, "step": 36310 }, { "epoch": 6.84, "grad_norm": 13.76421070098877, "learning_rate": 6.327875023527198e-06, "loss": 0.4083, "step": 36320 }, { "epoch": 6.84, "grad_norm": 10.075658798217773, "learning_rate": 6.3241106719367596e-06, "loss": 0.3769, "step": 36330 }, { "epoch": 6.84, "grad_norm": 19.852584838867188, "learning_rate": 6.320346320346321e-06, "loss": 0.6485, "step": 36340 }, { "epoch": 6.84, "grad_norm": 15.061224937438965, "learning_rate": 6.3165819687558825e-06, "loss": 0.4621, "step": 36350 }, { "epoch": 6.84, "grad_norm": 15.020421981811523, "learning_rate": 6.312817617165444e-06, "loss": 0.5403, "step": 36360 }, { "epoch": 6.85, "grad_norm": 0.03311420977115631, "learning_rate": 6.3090532655750046e-06, "loss": 0.2076, "step": 36370 }, { "epoch": 6.85, "grad_norm": 40.68346405029297, "learning_rate": 6.305288913984566e-06, "loss": 0.529, "step": 36380 }, { "epoch": 6.85, "grad_norm": 6.1792521476745605, "learning_rate": 6.3015245623941275e-06, "loss": 0.5218, "step": 36390 }, { "epoch": 6.85, "grad_norm": 18.217483520507812, "learning_rate": 6.29776021080369e-06, "loss": 0.3166, "step": 36400 }, { "epoch": 6.85, "grad_norm": 0.7558429837226868, "learning_rate": 6.293995859213251e-06, "loss": 0.6442, "step": 36410 }, { "epoch": 6.85, "grad_norm": 2.1329829692840576, "learning_rate": 6.290231507622813e-06, "loss": 0.5574, "step": 36420 }, { "epoch": 6.86, "grad_norm": 19.163063049316406, "learning_rate": 6.286467156032374e-06, "loss": 0.4113, "step": 36430 }, { "epoch": 6.86, "grad_norm": 21.05096435546875, "learning_rate": 6.282702804441936e-06, "loss": 0.644, "step": 36440 }, { "epoch": 6.86, "grad_norm": 15.979854583740234, "learning_rate": 6.278938452851497e-06, "loss": 0.4465, "step": 36450 }, { "epoch": 6.86, "grad_norm": 0.3811950087547302, "learning_rate": 6.275174101261058e-06, "loss": 0.472, "step": 36460 }, { "epoch": 6.86, "grad_norm": 2.4271154403686523, "learning_rate": 6.271409749670619e-06, "loss": 0.6266, "step": 36470 }, { "epoch": 6.87, "grad_norm": 21.372413635253906, "learning_rate": 6.267645398080181e-06, "loss": 0.5725, "step": 36480 }, { "epoch": 6.87, "grad_norm": 13.645919799804688, "learning_rate": 6.263881046489743e-06, "loss": 0.4645, "step": 36490 }, { "epoch": 6.87, "grad_norm": 56.878578186035156, "learning_rate": 6.260116694899304e-06, "loss": 0.5547, "step": 36500 }, { "epoch": 6.87, "grad_norm": 2.6566834449768066, "learning_rate": 6.256352343308866e-06, "loss": 0.3839, "step": 36510 }, { "epoch": 6.87, "grad_norm": 18.267131805419922, "learning_rate": 6.252587991718427e-06, "loss": 0.3353, "step": 36520 }, { "epoch": 6.88, "grad_norm": 29.942182540893555, "learning_rate": 6.248823640127989e-06, "loss": 0.8058, "step": 36530 }, { "epoch": 6.88, "grad_norm": 44.51432418823242, "learning_rate": 6.245059288537549e-06, "loss": 0.7491, "step": 36540 }, { "epoch": 6.88, "grad_norm": 22.656213760375977, "learning_rate": 6.241294936947111e-06, "loss": 0.3901, "step": 36550 }, { "epoch": 6.88, "grad_norm": 4.600666046142578, "learning_rate": 6.237530585356672e-06, "loss": 0.4892, "step": 36560 }, { "epoch": 6.88, "grad_norm": 0.16996584832668304, "learning_rate": 6.233766233766234e-06, "loss": 0.3893, "step": 36570 }, { "epoch": 6.88, "grad_norm": 7.973287105560303, "learning_rate": 6.230001882175796e-06, "loss": 0.4334, "step": 36580 }, { "epoch": 6.89, "grad_norm": 6.781386375427246, "learning_rate": 6.2262375305853575e-06, "loss": 0.4581, "step": 36590 }, { "epoch": 6.89, "grad_norm": 17.54666519165039, "learning_rate": 6.222473178994919e-06, "loss": 0.351, "step": 36600 }, { "epoch": 6.89, "grad_norm": 1.0639405250549316, "learning_rate": 6.2187088274044805e-06, "loss": 0.514, "step": 36610 }, { "epoch": 6.89, "grad_norm": 0.03612279146909714, "learning_rate": 6.214944475814042e-06, "loss": 0.6789, "step": 36620 }, { "epoch": 6.89, "grad_norm": 13.270979881286621, "learning_rate": 6.2111801242236025e-06, "loss": 0.4822, "step": 36630 }, { "epoch": 6.9, "grad_norm": 1.6162692308425903, "learning_rate": 6.207415772633164e-06, "loss": 0.4827, "step": 36640 }, { "epoch": 6.9, "grad_norm": 28.315080642700195, "learning_rate": 6.2036514210427255e-06, "loss": 0.616, "step": 36650 }, { "epoch": 6.9, "grad_norm": 11.296304702758789, "learning_rate": 6.199887069452287e-06, "loss": 0.6246, "step": 36660 }, { "epoch": 6.9, "grad_norm": 10.235584259033203, "learning_rate": 6.196122717861849e-06, "loss": 0.433, "step": 36670 }, { "epoch": 6.9, "grad_norm": 26.53406524658203, "learning_rate": 6.192358366271411e-06, "loss": 0.3869, "step": 36680 }, { "epoch": 6.91, "grad_norm": 3.4729597568511963, "learning_rate": 6.188594014680972e-06, "loss": 0.3534, "step": 36690 }, { "epoch": 6.91, "grad_norm": 20.50278091430664, "learning_rate": 6.184829663090534e-06, "loss": 0.3664, "step": 36700 }, { "epoch": 6.91, "grad_norm": 7.9125566482543945, "learning_rate": 6.181065311500095e-06, "loss": 0.3867, "step": 36710 }, { "epoch": 6.91, "grad_norm": 17.5158634185791, "learning_rate": 6.177300959909656e-06, "loss": 0.315, "step": 36720 }, { "epoch": 6.91, "grad_norm": 30.33664321899414, "learning_rate": 6.173536608319217e-06, "loss": 0.4541, "step": 36730 }, { "epoch": 6.92, "grad_norm": 24.297866821289062, "learning_rate": 6.169772256728779e-06, "loss": 0.772, "step": 36740 }, { "epoch": 6.92, "grad_norm": 16.661861419677734, "learning_rate": 6.16600790513834e-06, "loss": 0.5398, "step": 36750 }, { "epoch": 6.92, "grad_norm": 0.9989810585975647, "learning_rate": 6.1622435535479015e-06, "loss": 0.4729, "step": 36760 }, { "epoch": 6.92, "grad_norm": 20.895919799804688, "learning_rate": 6.158479201957464e-06, "loss": 0.5922, "step": 36770 }, { "epoch": 6.92, "grad_norm": 13.476861953735352, "learning_rate": 6.154714850367025e-06, "loss": 0.6265, "step": 36780 }, { "epoch": 6.92, "grad_norm": 29.94508171081543, "learning_rate": 6.150950498776587e-06, "loss": 0.5422, "step": 36790 }, { "epoch": 6.93, "grad_norm": 6.413869380950928, "learning_rate": 6.147186147186147e-06, "loss": 0.4729, "step": 36800 }, { "epoch": 6.93, "grad_norm": 26.468042373657227, "learning_rate": 6.143421795595709e-06, "loss": 0.4204, "step": 36810 }, { "epoch": 6.93, "grad_norm": 5.048950672149658, "learning_rate": 6.13965744400527e-06, "loss": 0.5322, "step": 36820 }, { "epoch": 6.93, "grad_norm": 20.66354751586914, "learning_rate": 6.135893092414832e-06, "loss": 0.4366, "step": 36830 }, { "epoch": 6.93, "grad_norm": 14.539073944091797, "learning_rate": 6.132128740824393e-06, "loss": 0.5468, "step": 36840 }, { "epoch": 6.94, "grad_norm": 19.072439193725586, "learning_rate": 6.128364389233955e-06, "loss": 0.2972, "step": 36850 }, { "epoch": 6.94, "grad_norm": 0.2176668494939804, "learning_rate": 6.124600037643517e-06, "loss": 0.4319, "step": 36860 }, { "epoch": 6.94, "grad_norm": 1.570940613746643, "learning_rate": 6.1208356860530784e-06, "loss": 0.2465, "step": 36870 }, { "epoch": 6.94, "grad_norm": 0.4804990291595459, "learning_rate": 6.11707133446264e-06, "loss": 0.5241, "step": 36880 }, { "epoch": 6.94, "grad_norm": 20.45847511291504, "learning_rate": 6.1133069828722005e-06, "loss": 0.5885, "step": 36890 }, { "epoch": 6.95, "grad_norm": 0.5880195498466492, "learning_rate": 6.109542631281762e-06, "loss": 0.6118, "step": 36900 }, { "epoch": 6.95, "grad_norm": 19.450361251831055, "learning_rate": 6.1057782796913234e-06, "loss": 0.6389, "step": 36910 }, { "epoch": 6.95, "grad_norm": 6.449407577514648, "learning_rate": 6.102013928100885e-06, "loss": 0.4828, "step": 36920 }, { "epoch": 6.95, "grad_norm": 19.77794075012207, "learning_rate": 6.098249576510446e-06, "loss": 0.5739, "step": 36930 }, { "epoch": 6.95, "grad_norm": 19.867918014526367, "learning_rate": 6.094485224920008e-06, "loss": 0.5181, "step": 36940 }, { "epoch": 6.95, "grad_norm": 11.12002944946289, "learning_rate": 6.09072087332957e-06, "loss": 0.526, "step": 36950 }, { "epoch": 6.96, "grad_norm": 2.037919521331787, "learning_rate": 6.086956521739132e-06, "loss": 0.4911, "step": 36960 }, { "epoch": 6.96, "grad_norm": 31.55478286743164, "learning_rate": 6.083192170148693e-06, "loss": 0.856, "step": 36970 }, { "epoch": 6.96, "grad_norm": 18.756759643554688, "learning_rate": 6.079427818558254e-06, "loss": 0.7955, "step": 36980 }, { "epoch": 6.96, "grad_norm": 18.737565994262695, "learning_rate": 6.075663466967815e-06, "loss": 0.5185, "step": 36990 }, { "epoch": 6.96, "grad_norm": 8.580978393554688, "learning_rate": 6.071899115377377e-06, "loss": 0.5442, "step": 37000 }, { "epoch": 6.97, "grad_norm": 16.809736251831055, "learning_rate": 6.068134763786938e-06, "loss": 0.7495, "step": 37010 }, { "epoch": 6.97, "grad_norm": 11.04421329498291, "learning_rate": 6.0643704121964995e-06, "loss": 0.2769, "step": 37020 }, { "epoch": 6.97, "grad_norm": 11.780550956726074, "learning_rate": 6.060606060606061e-06, "loss": 0.8881, "step": 37030 }, { "epoch": 6.97, "grad_norm": 1.5381724834442139, "learning_rate": 6.0568417090156224e-06, "loss": 0.4434, "step": 37040 }, { "epoch": 6.97, "grad_norm": 2.1745412349700928, "learning_rate": 6.053077357425185e-06, "loss": 0.3855, "step": 37050 }, { "epoch": 6.98, "grad_norm": 1.2057762145996094, "learning_rate": 6.049313005834746e-06, "loss": 0.5586, "step": 37060 }, { "epoch": 6.98, "grad_norm": 4.387953758239746, "learning_rate": 6.045548654244307e-06, "loss": 0.6215, "step": 37070 }, { "epoch": 6.98, "grad_norm": 15.834346771240234, "learning_rate": 6.041784302653868e-06, "loss": 0.6887, "step": 37080 }, { "epoch": 6.98, "grad_norm": 22.549379348754883, "learning_rate": 6.03801995106343e-06, "loss": 0.5226, "step": 37090 }, { "epoch": 6.98, "grad_norm": 21.140119552612305, "learning_rate": 6.034255599472991e-06, "loss": 0.8414, "step": 37100 }, { "epoch": 6.98, "grad_norm": 8.247200012207031, "learning_rate": 6.030491247882553e-06, "loss": 0.49, "step": 37110 }, { "epoch": 6.99, "grad_norm": 7.904429912567139, "learning_rate": 6.026726896292114e-06, "loss": 0.2786, "step": 37120 }, { "epoch": 6.99, "grad_norm": 26.10804557800293, "learning_rate": 6.022962544701676e-06, "loss": 0.3379, "step": 37130 }, { "epoch": 6.99, "grad_norm": 12.558807373046875, "learning_rate": 6.019198193111238e-06, "loss": 0.6979, "step": 37140 }, { "epoch": 6.99, "grad_norm": 4.539823055267334, "learning_rate": 6.015433841520798e-06, "loss": 0.4851, "step": 37150 }, { "epoch": 6.99, "grad_norm": 17.593862533569336, "learning_rate": 6.01166948993036e-06, "loss": 0.4093, "step": 37160 }, { "epoch": 7.0, "grad_norm": 6.471458435058594, "learning_rate": 6.007905138339921e-06, "loss": 0.4581, "step": 37170 }, { "epoch": 7.0, "grad_norm": 6.4814348220825195, "learning_rate": 6.004140786749483e-06, "loss": 0.4462, "step": 37180 }, { "epoch": 7.0, "grad_norm": 0.26597660779953003, "learning_rate": 6.000376435159044e-06, "loss": 0.303, "step": 37190 }, { "epoch": 7.0, "eval_accuracy": 0.9241333333333334, "eval_loss": 0.30108216404914856, "eval_runtime": 52.0717, "eval_samples_per_second": 144.032, "eval_steps_per_second": 18.014, "step": 37191 }, { "epoch": 7.0, "grad_norm": 4.4914164543151855, "learning_rate": 5.996612083568606e-06, "loss": 0.5521, "step": 37200 }, { "epoch": 7.0, "grad_norm": 20.4888973236084, "learning_rate": 5.992847731978167e-06, "loss": 0.3498, "step": 37210 }, { "epoch": 7.01, "grad_norm": 18.38087272644043, "learning_rate": 5.989083380387729e-06, "loss": 0.6529, "step": 37220 }, { "epoch": 7.01, "grad_norm": 9.481815338134766, "learning_rate": 5.985319028797291e-06, "loss": 0.5024, "step": 37230 }, { "epoch": 7.01, "grad_norm": 2.8433196544647217, "learning_rate": 5.981554677206851e-06, "loss": 0.3982, "step": 37240 }, { "epoch": 7.01, "grad_norm": 26.133316040039062, "learning_rate": 5.977790325616413e-06, "loss": 0.9014, "step": 37250 }, { "epoch": 7.01, "grad_norm": 17.805519104003906, "learning_rate": 5.9740259740259746e-06, "loss": 0.4383, "step": 37260 }, { "epoch": 7.01, "grad_norm": 8.60024642944336, "learning_rate": 5.970261622435536e-06, "loss": 0.7659, "step": 37270 }, { "epoch": 7.02, "grad_norm": 1.4241914749145508, "learning_rate": 5.9664972708450975e-06, "loss": 0.5184, "step": 37280 }, { "epoch": 7.02, "grad_norm": 3.293872356414795, "learning_rate": 5.962732919254659e-06, "loss": 0.4679, "step": 37290 }, { "epoch": 7.02, "grad_norm": 9.334550857543945, "learning_rate": 5.95896856766422e-06, "loss": 0.3055, "step": 37300 }, { "epoch": 7.02, "grad_norm": 12.182844161987305, "learning_rate": 5.955204216073782e-06, "loss": 0.4598, "step": 37310 }, { "epoch": 7.02, "grad_norm": 21.008893966674805, "learning_rate": 5.951439864483344e-06, "loss": 0.5906, "step": 37320 }, { "epoch": 7.03, "grad_norm": 21.775415420532227, "learning_rate": 5.947675512892904e-06, "loss": 0.4748, "step": 37330 }, { "epoch": 7.03, "grad_norm": 33.95808792114258, "learning_rate": 5.943911161302465e-06, "loss": 0.6582, "step": 37340 }, { "epoch": 7.03, "grad_norm": 28.52260971069336, "learning_rate": 5.940146809712028e-06, "loss": 0.4331, "step": 37350 }, { "epoch": 7.03, "grad_norm": 0.2158019095659256, "learning_rate": 5.936382458121589e-06, "loss": 0.4771, "step": 37360 }, { "epoch": 7.03, "grad_norm": 11.500349998474121, "learning_rate": 5.932618106531151e-06, "loss": 0.2201, "step": 37370 }, { "epoch": 7.04, "grad_norm": 13.823716163635254, "learning_rate": 5.928853754940712e-06, "loss": 0.429, "step": 37380 }, { "epoch": 7.04, "grad_norm": 20.627649307250977, "learning_rate": 5.9250894033502736e-06, "loss": 0.526, "step": 37390 }, { "epoch": 7.04, "grad_norm": 1.0602643489837646, "learning_rate": 5.921325051759835e-06, "loss": 0.3343, "step": 37400 }, { "epoch": 7.04, "grad_norm": 34.25511932373047, "learning_rate": 5.917560700169396e-06, "loss": 0.7096, "step": 37410 }, { "epoch": 7.04, "grad_norm": 58.332923889160156, "learning_rate": 5.913796348578957e-06, "loss": 0.5007, "step": 37420 }, { "epoch": 7.04, "grad_norm": 0.520494282245636, "learning_rate": 5.9100319969885186e-06, "loss": 0.6852, "step": 37430 }, { "epoch": 7.05, "grad_norm": 14.561441421508789, "learning_rate": 5.906267645398081e-06, "loss": 0.7221, "step": 37440 }, { "epoch": 7.05, "grad_norm": 7.289056777954102, "learning_rate": 5.902503293807642e-06, "loss": 0.4541, "step": 37450 }, { "epoch": 7.05, "grad_norm": 15.227532386779785, "learning_rate": 5.898738942217204e-06, "loss": 0.6697, "step": 37460 }, { "epoch": 7.05, "grad_norm": 33.787776947021484, "learning_rate": 5.894974590626765e-06, "loss": 0.5081, "step": 37470 }, { "epoch": 7.05, "grad_norm": 18.24729347229004, "learning_rate": 5.891210239036327e-06, "loss": 0.5493, "step": 37480 }, { "epoch": 7.06, "grad_norm": 17.62848472595215, "learning_rate": 5.887445887445888e-06, "loss": 0.6633, "step": 37490 }, { "epoch": 7.06, "grad_norm": 13.691800117492676, "learning_rate": 5.883681535855449e-06, "loss": 0.5323, "step": 37500 }, { "epoch": 7.06, "grad_norm": 33.989376068115234, "learning_rate": 5.87991718426501e-06, "loss": 0.5766, "step": 37510 }, { "epoch": 7.06, "grad_norm": 6.813748359680176, "learning_rate": 5.876152832674572e-06, "loss": 0.4941, "step": 37520 }, { "epoch": 7.06, "grad_norm": 10.321836471557617, "learning_rate": 5.872388481084134e-06, "loss": 0.6905, "step": 37530 }, { "epoch": 7.07, "grad_norm": 0.11116129904985428, "learning_rate": 5.8686241294936955e-06, "loss": 0.2757, "step": 37540 }, { "epoch": 7.07, "grad_norm": 15.543323516845703, "learning_rate": 5.864859777903257e-06, "loss": 0.5171, "step": 37550 }, { "epoch": 7.07, "grad_norm": 10.380474090576172, "learning_rate": 5.861095426312818e-06, "loss": 0.6989, "step": 37560 }, { "epoch": 7.07, "grad_norm": 4.593369007110596, "learning_rate": 5.85733107472238e-06, "loss": 0.3715, "step": 37570 }, { "epoch": 7.07, "grad_norm": 0.048009395599365234, "learning_rate": 5.853566723131941e-06, "loss": 0.4439, "step": 37580 }, { "epoch": 7.08, "grad_norm": 0.29102352261543274, "learning_rate": 5.849802371541502e-06, "loss": 0.3896, "step": 37590 }, { "epoch": 7.08, "grad_norm": 31.79117774963379, "learning_rate": 5.846038019951063e-06, "loss": 0.4598, "step": 37600 }, { "epoch": 7.08, "grad_norm": 0.6519231796264648, "learning_rate": 5.842273668360625e-06, "loss": 0.4029, "step": 37610 }, { "epoch": 7.08, "grad_norm": 0.04763523116707802, "learning_rate": 5.838509316770186e-06, "loss": 0.4446, "step": 37620 }, { "epoch": 7.08, "grad_norm": 5.059743881225586, "learning_rate": 5.834744965179749e-06, "loss": 0.2375, "step": 37630 }, { "epoch": 7.08, "grad_norm": 18.59803581237793, "learning_rate": 5.83098061358931e-06, "loss": 0.2464, "step": 37640 }, { "epoch": 7.09, "grad_norm": 10.456582069396973, "learning_rate": 5.8272162619988715e-06, "loss": 0.3523, "step": 37650 }, { "epoch": 7.09, "grad_norm": 32.90705871582031, "learning_rate": 5.823451910408433e-06, "loss": 0.5078, "step": 37660 }, { "epoch": 7.09, "grad_norm": 3.806859254837036, "learning_rate": 5.8196875588179945e-06, "loss": 0.3444, "step": 37670 }, { "epoch": 7.09, "grad_norm": 0.30914196372032166, "learning_rate": 5.815923207227555e-06, "loss": 0.4412, "step": 37680 }, { "epoch": 7.09, "grad_norm": 1.1022928953170776, "learning_rate": 5.8121588556371165e-06, "loss": 0.4646, "step": 37690 }, { "epoch": 7.1, "grad_norm": 2.6765952110290527, "learning_rate": 5.808394504046678e-06, "loss": 0.4558, "step": 37700 }, { "epoch": 7.1, "grad_norm": 18.882295608520508, "learning_rate": 5.8046301524562395e-06, "loss": 0.3892, "step": 37710 }, { "epoch": 7.1, "grad_norm": 23.711225509643555, "learning_rate": 5.800865800865802e-06, "loss": 0.4363, "step": 37720 }, { "epoch": 7.1, "grad_norm": 15.44466781616211, "learning_rate": 5.797101449275363e-06, "loss": 0.3836, "step": 37730 }, { "epoch": 7.1, "grad_norm": 42.25343322753906, "learning_rate": 5.793337097684925e-06, "loss": 0.3638, "step": 37740 }, { "epoch": 7.11, "grad_norm": 5.034309387207031, "learning_rate": 5.789572746094486e-06, "loss": 0.4246, "step": 37750 }, { "epoch": 7.11, "grad_norm": 8.63538646697998, "learning_rate": 5.785808394504047e-06, "loss": 0.3743, "step": 37760 }, { "epoch": 7.11, "grad_norm": 7.230229377746582, "learning_rate": 5.782044042913608e-06, "loss": 0.3815, "step": 37770 }, { "epoch": 7.11, "grad_norm": 45.37762451171875, "learning_rate": 5.77827969132317e-06, "loss": 0.5341, "step": 37780 }, { "epoch": 7.11, "grad_norm": 13.632198333740234, "learning_rate": 5.774515339732731e-06, "loss": 0.5235, "step": 37790 }, { "epoch": 7.11, "grad_norm": 0.051364749670028687, "learning_rate": 5.770750988142293e-06, "loss": 0.2958, "step": 37800 }, { "epoch": 7.12, "grad_norm": 31.266891479492188, "learning_rate": 5.766986636551855e-06, "loss": 0.7418, "step": 37810 }, { "epoch": 7.12, "grad_norm": 21.40471649169922, "learning_rate": 5.763222284961416e-06, "loss": 0.5075, "step": 37820 }, { "epoch": 7.12, "grad_norm": 0.524456262588501, "learning_rate": 5.759457933370978e-06, "loss": 0.3901, "step": 37830 }, { "epoch": 7.12, "grad_norm": 4.217536926269531, "learning_rate": 5.755693581780539e-06, "loss": 0.5058, "step": 37840 }, { "epoch": 7.12, "grad_norm": 14.552117347717285, "learning_rate": 5.7519292301901e-06, "loss": 0.7943, "step": 37850 }, { "epoch": 7.13, "grad_norm": 8.193163871765137, "learning_rate": 5.748164878599661e-06, "loss": 0.4663, "step": 37860 }, { "epoch": 7.13, "grad_norm": 29.58894920349121, "learning_rate": 5.744400527009223e-06, "loss": 0.4782, "step": 37870 }, { "epoch": 7.13, "grad_norm": 1.9362828731536865, "learning_rate": 5.740636175418784e-06, "loss": 0.4437, "step": 37880 }, { "epoch": 7.13, "grad_norm": 11.760843276977539, "learning_rate": 5.736871823828346e-06, "loss": 0.6013, "step": 37890 }, { "epoch": 7.13, "grad_norm": 11.666337966918945, "learning_rate": 5.733107472237908e-06, "loss": 0.477, "step": 37900 }, { "epoch": 7.14, "grad_norm": 28.812885284423828, "learning_rate": 5.7293431206474695e-06, "loss": 0.5293, "step": 37910 }, { "epoch": 7.14, "grad_norm": 38.73482894897461, "learning_rate": 5.725578769057031e-06, "loss": 0.5247, "step": 37920 }, { "epoch": 7.14, "grad_norm": 14.522818565368652, "learning_rate": 5.7218144174665924e-06, "loss": 0.6026, "step": 37930 }, { "epoch": 7.14, "grad_norm": 5.249240875244141, "learning_rate": 5.718050065876153e-06, "loss": 0.3719, "step": 37940 }, { "epoch": 7.14, "grad_norm": 20.325544357299805, "learning_rate": 5.7142857142857145e-06, "loss": 0.4992, "step": 37950 }, { "epoch": 7.14, "grad_norm": 10.740631103515625, "learning_rate": 5.710521362695276e-06, "loss": 0.5879, "step": 37960 }, { "epoch": 7.15, "grad_norm": 12.676395416259766, "learning_rate": 5.7067570111048374e-06, "loss": 0.46, "step": 37970 }, { "epoch": 7.15, "grad_norm": 18.58064079284668, "learning_rate": 5.702992659514399e-06, "loss": 0.3504, "step": 37980 }, { "epoch": 7.15, "grad_norm": 4.198108196258545, "learning_rate": 5.69922830792396e-06, "loss": 0.5009, "step": 37990 }, { "epoch": 7.15, "grad_norm": 41.74122619628906, "learning_rate": 5.695463956333523e-06, "loss": 0.4566, "step": 38000 }, { "epoch": 7.15, "grad_norm": 4.8473219871521, "learning_rate": 5.691699604743084e-06, "loss": 0.437, "step": 38010 }, { "epoch": 7.16, "grad_norm": 0.8085455298423767, "learning_rate": 5.687935253152645e-06, "loss": 0.4446, "step": 38020 }, { "epoch": 7.16, "grad_norm": 12.186211585998535, "learning_rate": 5.684170901562206e-06, "loss": 0.3344, "step": 38030 }, { "epoch": 7.16, "grad_norm": 8.344589233398438, "learning_rate": 5.680406549971768e-06, "loss": 0.6024, "step": 38040 }, { "epoch": 7.16, "grad_norm": 16.539567947387695, "learning_rate": 5.676642198381329e-06, "loss": 0.4564, "step": 38050 }, { "epoch": 7.16, "grad_norm": 13.2172269821167, "learning_rate": 5.672877846790891e-06, "loss": 0.336, "step": 38060 }, { "epoch": 7.17, "grad_norm": 0.8715003728866577, "learning_rate": 5.669113495200452e-06, "loss": 0.5934, "step": 38070 }, { "epoch": 7.17, "grad_norm": 0.03967274725437164, "learning_rate": 5.6653491436100135e-06, "loss": 0.4955, "step": 38080 }, { "epoch": 7.17, "grad_norm": 42.34977722167969, "learning_rate": 5.661584792019576e-06, "loss": 0.6749, "step": 38090 }, { "epoch": 7.17, "grad_norm": 13.210389137268066, "learning_rate": 5.657820440429137e-06, "loss": 0.4131, "step": 38100 }, { "epoch": 7.17, "grad_norm": 6.826159477233887, "learning_rate": 5.654056088838698e-06, "loss": 0.3849, "step": 38110 }, { "epoch": 7.17, "grad_norm": 4.8396406173706055, "learning_rate": 5.650291737248259e-06, "loss": 0.4347, "step": 38120 }, { "epoch": 7.18, "grad_norm": 0.492663711309433, "learning_rate": 5.646527385657821e-06, "loss": 0.462, "step": 38130 }, { "epoch": 7.18, "grad_norm": 11.567475318908691, "learning_rate": 5.642763034067382e-06, "loss": 0.4399, "step": 38140 }, { "epoch": 7.18, "grad_norm": 15.863377571105957, "learning_rate": 5.638998682476944e-06, "loss": 0.8885, "step": 38150 }, { "epoch": 7.18, "grad_norm": 14.544013977050781, "learning_rate": 5.635234330886505e-06, "loss": 0.5326, "step": 38160 }, { "epoch": 7.18, "grad_norm": 10.913338661193848, "learning_rate": 5.631469979296067e-06, "loss": 0.4607, "step": 38170 }, { "epoch": 7.19, "grad_norm": 9.55777645111084, "learning_rate": 5.627705627705629e-06, "loss": 0.2982, "step": 38180 }, { "epoch": 7.19, "grad_norm": 32.77585983276367, "learning_rate": 5.62394127611519e-06, "loss": 0.5706, "step": 38190 }, { "epoch": 7.19, "grad_norm": 17.475671768188477, "learning_rate": 5.620176924524751e-06, "loss": 0.5987, "step": 38200 }, { "epoch": 7.19, "grad_norm": 20.316225051879883, "learning_rate": 5.6164125729343125e-06, "loss": 0.2997, "step": 38210 }, { "epoch": 7.19, "grad_norm": 24.66366958618164, "learning_rate": 5.612648221343874e-06, "loss": 0.5617, "step": 38220 }, { "epoch": 7.2, "grad_norm": 2.09023380279541, "learning_rate": 5.608883869753435e-06, "loss": 0.1539, "step": 38230 }, { "epoch": 7.2, "grad_norm": 15.220237731933594, "learning_rate": 5.605119518162997e-06, "loss": 0.426, "step": 38240 }, { "epoch": 7.2, "grad_norm": 2.988645076751709, "learning_rate": 5.601355166572558e-06, "loss": 0.4009, "step": 38250 }, { "epoch": 7.2, "grad_norm": 10.845462799072266, "learning_rate": 5.59759081498212e-06, "loss": 0.3647, "step": 38260 }, { "epoch": 7.2, "grad_norm": 24.533771514892578, "learning_rate": 5.593826463391681e-06, "loss": 0.6294, "step": 38270 }, { "epoch": 7.2, "grad_norm": 33.65419006347656, "learning_rate": 5.590062111801242e-06, "loss": 0.608, "step": 38280 }, { "epoch": 7.21, "grad_norm": 22.451557159423828, "learning_rate": 5.586297760210803e-06, "loss": 0.421, "step": 38290 }, { "epoch": 7.21, "grad_norm": 10.964974403381348, "learning_rate": 5.582533408620366e-06, "loss": 0.4638, "step": 38300 }, { "epoch": 7.21, "grad_norm": 0.03477161377668381, "learning_rate": 5.578769057029927e-06, "loss": 0.4434, "step": 38310 }, { "epoch": 7.21, "grad_norm": 0.03130391240119934, "learning_rate": 5.5750047054394886e-06, "loss": 0.65, "step": 38320 }, { "epoch": 7.21, "grad_norm": 7.138416290283203, "learning_rate": 5.57124035384905e-06, "loss": 0.4412, "step": 38330 }, { "epoch": 7.22, "grad_norm": 0.3959789574146271, "learning_rate": 5.5674760022586115e-06, "loss": 0.3357, "step": 38340 }, { "epoch": 7.22, "grad_norm": 14.614726066589355, "learning_rate": 5.563711650668173e-06, "loss": 0.6037, "step": 38350 }, { "epoch": 7.22, "grad_norm": 9.566722869873047, "learning_rate": 5.559947299077734e-06, "loss": 0.4153, "step": 38360 }, { "epoch": 7.22, "grad_norm": 24.345317840576172, "learning_rate": 5.556182947487295e-06, "loss": 0.3658, "step": 38370 }, { "epoch": 7.22, "grad_norm": 45.182655334472656, "learning_rate": 5.5524185958968565e-06, "loss": 0.3317, "step": 38380 }, { "epoch": 7.23, "grad_norm": 20.688343048095703, "learning_rate": 5.548654244306419e-06, "loss": 0.6442, "step": 38390 }, { "epoch": 7.23, "grad_norm": 0.04111315310001373, "learning_rate": 5.54488989271598e-06, "loss": 0.6456, "step": 38400 }, { "epoch": 7.23, "grad_norm": 25.130800247192383, "learning_rate": 5.541125541125542e-06, "loss": 0.4863, "step": 38410 }, { "epoch": 7.23, "grad_norm": 8.105908393859863, "learning_rate": 5.537361189535103e-06, "loss": 0.6085, "step": 38420 }, { "epoch": 7.23, "grad_norm": 20.2083740234375, "learning_rate": 5.533596837944665e-06, "loss": 0.5963, "step": 38430 }, { "epoch": 7.24, "grad_norm": 0.32264116406440735, "learning_rate": 5.529832486354226e-06, "loss": 0.2069, "step": 38440 }, { "epoch": 7.24, "grad_norm": 15.73507022857666, "learning_rate": 5.5260681347637875e-06, "loss": 0.3864, "step": 38450 }, { "epoch": 7.24, "grad_norm": 13.899942398071289, "learning_rate": 5.522303783173348e-06, "loss": 0.4887, "step": 38460 }, { "epoch": 7.24, "grad_norm": 29.511226654052734, "learning_rate": 5.51853943158291e-06, "loss": 0.4645, "step": 38470 }, { "epoch": 7.24, "grad_norm": 24.286212921142578, "learning_rate": 5.514775079992472e-06, "loss": 0.9881, "step": 38480 }, { "epoch": 7.24, "grad_norm": 7.7546491622924805, "learning_rate": 5.511010728402033e-06, "loss": 0.1187, "step": 38490 }, { "epoch": 7.25, "grad_norm": 2.176553726196289, "learning_rate": 5.507246376811595e-06, "loss": 0.2799, "step": 38500 }, { "epoch": 7.25, "grad_norm": 20.29352378845215, "learning_rate": 5.503482025221156e-06, "loss": 0.5774, "step": 38510 }, { "epoch": 7.25, "grad_norm": 11.283035278320312, "learning_rate": 5.499717673630718e-06, "loss": 0.6969, "step": 38520 }, { "epoch": 7.25, "grad_norm": 2.3551344871520996, "learning_rate": 5.495953322040279e-06, "loss": 0.6055, "step": 38530 }, { "epoch": 7.25, "grad_norm": 17.497970581054688, "learning_rate": 5.492188970449841e-06, "loss": 0.5596, "step": 38540 }, { "epoch": 7.26, "grad_norm": 0.6391315460205078, "learning_rate": 5.488424618859401e-06, "loss": 0.5974, "step": 38550 }, { "epoch": 7.26, "grad_norm": 10.734766960144043, "learning_rate": 5.484660267268963e-06, "loss": 0.5015, "step": 38560 }, { "epoch": 7.26, "grad_norm": 28.943450927734375, "learning_rate": 5.480895915678524e-06, "loss": 0.291, "step": 38570 }, { "epoch": 7.26, "grad_norm": 30.02581024169922, "learning_rate": 5.4771315640880865e-06, "loss": 0.5427, "step": 38580 }, { "epoch": 7.26, "grad_norm": 2.238271474838257, "learning_rate": 5.473367212497648e-06, "loss": 0.5335, "step": 38590 }, { "epoch": 7.27, "grad_norm": 23.164579391479492, "learning_rate": 5.4696028609072095e-06, "loss": 0.6102, "step": 38600 }, { "epoch": 7.27, "grad_norm": 15.776400566101074, "learning_rate": 5.465838509316771e-06, "loss": 0.5805, "step": 38610 }, { "epoch": 7.27, "grad_norm": 15.679583549499512, "learning_rate": 5.462074157726332e-06, "loss": 0.667, "step": 38620 }, { "epoch": 7.27, "grad_norm": 13.96922492980957, "learning_rate": 5.458309806135893e-06, "loss": 0.1757, "step": 38630 }, { "epoch": 7.27, "grad_norm": 20.9793643951416, "learning_rate": 5.4545454545454545e-06, "loss": 0.4583, "step": 38640 }, { "epoch": 7.27, "grad_norm": 13.186413764953613, "learning_rate": 5.450781102955016e-06, "loss": 0.3714, "step": 38650 }, { "epoch": 7.28, "grad_norm": 0.07995478063821793, "learning_rate": 5.447016751364577e-06, "loss": 0.3397, "step": 38660 }, { "epoch": 7.28, "grad_norm": 0.7243136167526245, "learning_rate": 5.44325239977414e-06, "loss": 0.4177, "step": 38670 }, { "epoch": 7.28, "grad_norm": 19.715696334838867, "learning_rate": 5.439488048183701e-06, "loss": 0.5963, "step": 38680 }, { "epoch": 7.28, "grad_norm": 13.43181037902832, "learning_rate": 5.435723696593263e-06, "loss": 0.4663, "step": 38690 }, { "epoch": 7.28, "grad_norm": 21.230018615722656, "learning_rate": 5.431959345002824e-06, "loss": 0.3512, "step": 38700 }, { "epoch": 7.29, "grad_norm": 20.25225067138672, "learning_rate": 5.4281949934123855e-06, "loss": 0.7707, "step": 38710 }, { "epoch": 7.29, "grad_norm": 0.15251778066158295, "learning_rate": 5.424430641821946e-06, "loss": 0.3512, "step": 38720 }, { "epoch": 7.29, "grad_norm": 25.118371963500977, "learning_rate": 5.420666290231508e-06, "loss": 0.5093, "step": 38730 }, { "epoch": 7.29, "grad_norm": 5.278887748718262, "learning_rate": 5.416901938641069e-06, "loss": 0.5096, "step": 38740 }, { "epoch": 7.29, "grad_norm": 2.8506057262420654, "learning_rate": 5.4131375870506305e-06, "loss": 0.4723, "step": 38750 }, { "epoch": 7.3, "grad_norm": 12.97459602355957, "learning_rate": 5.409373235460193e-06, "loss": 0.74, "step": 38760 }, { "epoch": 7.3, "grad_norm": 12.791807174682617, "learning_rate": 5.405608883869754e-06, "loss": 0.5725, "step": 38770 }, { "epoch": 7.3, "grad_norm": 0.048314113169908524, "learning_rate": 5.401844532279316e-06, "loss": 0.5425, "step": 38780 }, { "epoch": 7.3, "grad_norm": 2.543400287628174, "learning_rate": 5.398080180688877e-06, "loss": 0.5112, "step": 38790 }, { "epoch": 7.3, "grad_norm": 8.345157623291016, "learning_rate": 5.394315829098439e-06, "loss": 0.2803, "step": 38800 }, { "epoch": 7.3, "grad_norm": 37.12696838378906, "learning_rate": 5.390551477507999e-06, "loss": 0.609, "step": 38810 }, { "epoch": 7.31, "grad_norm": 15.115370750427246, "learning_rate": 5.386787125917561e-06, "loss": 0.4371, "step": 38820 }, { "epoch": 7.31, "grad_norm": 24.323984146118164, "learning_rate": 5.383022774327122e-06, "loss": 0.7161, "step": 38830 }, { "epoch": 7.31, "grad_norm": 27.755170822143555, "learning_rate": 5.379258422736684e-06, "loss": 0.559, "step": 38840 }, { "epoch": 7.31, "grad_norm": 14.916714668273926, "learning_rate": 5.375494071146246e-06, "loss": 0.3077, "step": 38850 }, { "epoch": 7.31, "grad_norm": 7.22851037979126, "learning_rate": 5.3717297195558074e-06, "loss": 0.4101, "step": 38860 }, { "epoch": 7.32, "grad_norm": 17.815229415893555, "learning_rate": 5.367965367965369e-06, "loss": 0.6718, "step": 38870 }, { "epoch": 7.32, "grad_norm": 43.300785064697266, "learning_rate": 5.36420101637493e-06, "loss": 0.2887, "step": 38880 }, { "epoch": 7.32, "grad_norm": 44.66895294189453, "learning_rate": 5.360436664784491e-06, "loss": 0.4488, "step": 38890 }, { "epoch": 7.32, "grad_norm": 13.247306823730469, "learning_rate": 5.3566723131940524e-06, "loss": 0.4967, "step": 38900 }, { "epoch": 7.32, "grad_norm": 9.896644592285156, "learning_rate": 5.352907961603614e-06, "loss": 0.5352, "step": 38910 }, { "epoch": 7.33, "grad_norm": 15.37808609008789, "learning_rate": 5.349143610013175e-06, "loss": 0.4616, "step": 38920 }, { "epoch": 7.33, "grad_norm": 21.590145111083984, "learning_rate": 5.345379258422737e-06, "loss": 0.8465, "step": 38930 }, { "epoch": 7.33, "grad_norm": 3.483672618865967, "learning_rate": 5.341614906832298e-06, "loss": 0.3051, "step": 38940 }, { "epoch": 7.33, "grad_norm": 24.26028060913086, "learning_rate": 5.337850555241861e-06, "loss": 0.3835, "step": 38950 }, { "epoch": 7.33, "grad_norm": 7.518918991088867, "learning_rate": 5.334086203651422e-06, "loss": 0.3848, "step": 38960 }, { "epoch": 7.33, "grad_norm": 25.557096481323242, "learning_rate": 5.3303218520609835e-06, "loss": 0.511, "step": 38970 }, { "epoch": 7.34, "grad_norm": 5.811315059661865, "learning_rate": 5.326557500470544e-06, "loss": 0.3622, "step": 38980 }, { "epoch": 7.34, "grad_norm": 0.346421480178833, "learning_rate": 5.322793148880106e-06, "loss": 0.4936, "step": 38990 }, { "epoch": 7.34, "grad_norm": 28.79204559326172, "learning_rate": 5.319028797289667e-06, "loss": 0.8451, "step": 39000 }, { "epoch": 7.34, "grad_norm": 44.75595474243164, "learning_rate": 5.3152644456992285e-06, "loss": 0.4559, "step": 39010 }, { "epoch": 7.34, "grad_norm": 10.665853500366211, "learning_rate": 5.31150009410879e-06, "loss": 0.5485, "step": 39020 }, { "epoch": 7.35, "grad_norm": 11.347060203552246, "learning_rate": 5.307735742518351e-06, "loss": 0.5688, "step": 39030 }, { "epoch": 7.35, "grad_norm": 0.026885811239480972, "learning_rate": 5.303971390927914e-06, "loss": 0.6052, "step": 39040 }, { "epoch": 7.35, "grad_norm": 17.71526336669922, "learning_rate": 5.300207039337475e-06, "loss": 0.2693, "step": 39050 }, { "epoch": 7.35, "grad_norm": 19.759750366210938, "learning_rate": 5.296442687747037e-06, "loss": 0.4032, "step": 39060 }, { "epoch": 7.35, "grad_norm": 7.999401569366455, "learning_rate": 5.292678336156597e-06, "loss": 0.4863, "step": 39070 }, { "epoch": 7.36, "grad_norm": 0.07535995543003082, "learning_rate": 5.288913984566159e-06, "loss": 0.3263, "step": 39080 }, { "epoch": 7.36, "grad_norm": 15.817834854125977, "learning_rate": 5.28514963297572e-06, "loss": 0.7172, "step": 39090 }, { "epoch": 7.36, "grad_norm": 7.389345169067383, "learning_rate": 5.281385281385282e-06, "loss": 0.737, "step": 39100 }, { "epoch": 7.36, "grad_norm": 1.000137209892273, "learning_rate": 5.277620929794843e-06, "loss": 0.5336, "step": 39110 }, { "epoch": 7.36, "grad_norm": 0.07644683867692947, "learning_rate": 5.2738565782044046e-06, "loss": 0.3762, "step": 39120 }, { "epoch": 7.36, "grad_norm": 34.44324493408203, "learning_rate": 5.270092226613967e-06, "loss": 0.434, "step": 39130 }, { "epoch": 7.37, "grad_norm": 15.976019859313965, "learning_rate": 5.266327875023528e-06, "loss": 0.3836, "step": 39140 }, { "epoch": 7.37, "grad_norm": 0.10559114813804626, "learning_rate": 5.26256352343309e-06, "loss": 0.3044, "step": 39150 }, { "epoch": 7.37, "grad_norm": 13.347066879272461, "learning_rate": 5.25879917184265e-06, "loss": 0.5778, "step": 39160 }, { "epoch": 7.37, "grad_norm": 49.38035583496094, "learning_rate": 5.255034820252212e-06, "loss": 0.3162, "step": 39170 }, { "epoch": 7.37, "grad_norm": 3.3886990547180176, "learning_rate": 5.251270468661773e-06, "loss": 0.3553, "step": 39180 }, { "epoch": 7.38, "grad_norm": 9.105963706970215, "learning_rate": 5.247506117071335e-06, "loss": 0.7682, "step": 39190 }, { "epoch": 7.38, "grad_norm": 14.80863094329834, "learning_rate": 5.243741765480896e-06, "loss": 0.9301, "step": 39200 }, { "epoch": 7.38, "grad_norm": 3.077606201171875, "learning_rate": 5.239977413890458e-06, "loss": 0.5738, "step": 39210 }, { "epoch": 7.38, "grad_norm": 9.167863845825195, "learning_rate": 5.236213062300019e-06, "loss": 0.6652, "step": 39220 }, { "epoch": 7.38, "grad_norm": 39.81913375854492, "learning_rate": 5.2324487107095815e-06, "loss": 0.6311, "step": 39230 }, { "epoch": 7.39, "grad_norm": 0.21679580211639404, "learning_rate": 5.228684359119141e-06, "loss": 0.4019, "step": 39240 }, { "epoch": 7.39, "grad_norm": 42.18350601196289, "learning_rate": 5.2249200075287036e-06, "loss": 0.4451, "step": 39250 }, { "epoch": 7.39, "grad_norm": 4.216889381408691, "learning_rate": 5.221155655938265e-06, "loss": 0.4791, "step": 39260 }, { "epoch": 7.39, "grad_norm": 27.158920288085938, "learning_rate": 5.2173913043478265e-06, "loss": 0.632, "step": 39270 }, { "epoch": 7.39, "grad_norm": 3.5005364418029785, "learning_rate": 5.213626952757388e-06, "loss": 0.3373, "step": 39280 }, { "epoch": 7.4, "grad_norm": 5.771264553070068, "learning_rate": 5.209862601166949e-06, "loss": 0.8069, "step": 39290 }, { "epoch": 7.4, "grad_norm": 21.01832389831543, "learning_rate": 5.206098249576511e-06, "loss": 0.6151, "step": 39300 }, { "epoch": 7.4, "grad_norm": 0.21687443554401398, "learning_rate": 5.202333897986072e-06, "loss": 0.4028, "step": 39310 }, { "epoch": 7.4, "grad_norm": 17.40138816833496, "learning_rate": 5.198569546395635e-06, "loss": 0.3488, "step": 39320 }, { "epoch": 7.4, "grad_norm": 18.54778289794922, "learning_rate": 5.194805194805194e-06, "loss": 0.4226, "step": 39330 }, { "epoch": 7.4, "grad_norm": 0.8926262259483337, "learning_rate": 5.191040843214757e-06, "loss": 0.4072, "step": 39340 }, { "epoch": 7.41, "grad_norm": 0.043081410229206085, "learning_rate": 5.187276491624318e-06, "loss": 0.4525, "step": 39350 }, { "epoch": 7.41, "grad_norm": 28.826021194458008, "learning_rate": 5.18351214003388e-06, "loss": 0.4743, "step": 39360 }, { "epoch": 7.41, "grad_norm": 19.19860076904297, "learning_rate": 5.179747788443441e-06, "loss": 0.6083, "step": 39370 }, { "epoch": 7.41, "grad_norm": 23.61128044128418, "learning_rate": 5.1759834368530025e-06, "loss": 0.5457, "step": 39380 }, { "epoch": 7.41, "grad_norm": 15.979894638061523, "learning_rate": 5.172219085262564e-06, "loss": 0.4737, "step": 39390 }, { "epoch": 7.42, "grad_norm": 16.330854415893555, "learning_rate": 5.1684547336721255e-06, "loss": 0.6512, "step": 39400 }, { "epoch": 7.42, "grad_norm": 3.0890164375305176, "learning_rate": 5.164690382081688e-06, "loss": 0.5336, "step": 39410 }, { "epoch": 7.42, "grad_norm": 24.518455505371094, "learning_rate": 5.1609260304912475e-06, "loss": 0.4603, "step": 39420 }, { "epoch": 7.42, "grad_norm": 0.07051704078912735, "learning_rate": 5.15716167890081e-06, "loss": 0.2793, "step": 39430 }, { "epoch": 7.42, "grad_norm": 9.986491203308105, "learning_rate": 5.153397327310371e-06, "loss": 0.5185, "step": 39440 }, { "epoch": 7.43, "grad_norm": 6.8465142250061035, "learning_rate": 5.149632975719933e-06, "loss": 0.6369, "step": 39450 }, { "epoch": 7.43, "grad_norm": 0.4283851683139801, "learning_rate": 5.145868624129494e-06, "loss": 0.5151, "step": 39460 }, { "epoch": 7.43, "grad_norm": 13.111185073852539, "learning_rate": 5.142104272539056e-06, "loss": 0.4777, "step": 39470 }, { "epoch": 7.43, "grad_norm": 18.829418182373047, "learning_rate": 5.138339920948617e-06, "loss": 0.5422, "step": 39480 }, { "epoch": 7.43, "grad_norm": 18.628862380981445, "learning_rate": 5.134575569358179e-06, "loss": 0.5727, "step": 39490 }, { "epoch": 7.43, "grad_norm": 8.623425483703613, "learning_rate": 5.130811217767739e-06, "loss": 0.5578, "step": 39500 }, { "epoch": 7.44, "grad_norm": 7.155189514160156, "learning_rate": 5.127046866177301e-06, "loss": 0.6192, "step": 39510 }, { "epoch": 7.44, "grad_norm": 2.426950216293335, "learning_rate": 5.123282514586862e-06, "loss": 0.5366, "step": 39520 }, { "epoch": 7.44, "grad_norm": 5.574533939361572, "learning_rate": 5.1195181629964245e-06, "loss": 0.4428, "step": 39530 }, { "epoch": 7.44, "grad_norm": 0.15088945627212524, "learning_rate": 5.115753811405986e-06, "loss": 0.4344, "step": 39540 }, { "epoch": 7.44, "grad_norm": 53.62294387817383, "learning_rate": 5.111989459815547e-06, "loss": 0.3419, "step": 39550 }, { "epoch": 7.45, "grad_norm": 19.655786514282227, "learning_rate": 5.108225108225109e-06, "loss": 0.4297, "step": 39560 }, { "epoch": 7.45, "grad_norm": 9.113370895385742, "learning_rate": 5.10446075663467e-06, "loss": 0.4997, "step": 39570 }, { "epoch": 7.45, "grad_norm": 8.7140474319458, "learning_rate": 5.100696405044232e-06, "loss": 0.5054, "step": 39580 }, { "epoch": 7.45, "grad_norm": 0.8119073510169983, "learning_rate": 5.096932053453792e-06, "loss": 0.4112, "step": 39590 }, { "epoch": 7.45, "grad_norm": 24.722349166870117, "learning_rate": 5.093167701863354e-06, "loss": 0.4957, "step": 39600 }, { "epoch": 7.46, "grad_norm": 23.276451110839844, "learning_rate": 5.089403350272915e-06, "loss": 0.0708, "step": 39610 }, { "epoch": 7.46, "grad_norm": 10.128479957580566, "learning_rate": 5.085638998682478e-06, "loss": 0.3646, "step": 39620 }, { "epoch": 7.46, "grad_norm": 79.91693878173828, "learning_rate": 5.081874647092039e-06, "loss": 0.567, "step": 39630 }, { "epoch": 7.46, "grad_norm": 33.41339111328125, "learning_rate": 5.0781102955016005e-06, "loss": 0.3338, "step": 39640 }, { "epoch": 7.46, "grad_norm": 21.802555084228516, "learning_rate": 5.074345943911162e-06, "loss": 0.3191, "step": 39650 }, { "epoch": 7.46, "grad_norm": 18.43767738342285, "learning_rate": 5.0705815923207234e-06, "loss": 0.5815, "step": 39660 }, { "epoch": 7.47, "grad_norm": 14.148484230041504, "learning_rate": 5.066817240730285e-06, "loss": 0.2298, "step": 39670 }, { "epoch": 7.47, "grad_norm": 20.37163734436035, "learning_rate": 5.0630528891398455e-06, "loss": 0.8356, "step": 39680 }, { "epoch": 7.47, "grad_norm": 6.101649284362793, "learning_rate": 5.059288537549407e-06, "loss": 0.4301, "step": 39690 }, { "epoch": 7.47, "grad_norm": 1.8522956371307373, "learning_rate": 5.0555241859589684e-06, "loss": 0.3962, "step": 39700 }, { "epoch": 7.47, "grad_norm": 37.072349548339844, "learning_rate": 5.051759834368531e-06, "loss": 0.4138, "step": 39710 }, { "epoch": 7.48, "grad_norm": 28.832242965698242, "learning_rate": 5.047995482778092e-06, "loss": 0.5587, "step": 39720 }, { "epoch": 7.48, "grad_norm": 29.438364028930664, "learning_rate": 5.044231131187654e-06, "loss": 0.3898, "step": 39730 }, { "epoch": 7.48, "grad_norm": 0.1980200558900833, "learning_rate": 5.040466779597215e-06, "loss": 0.1686, "step": 39740 }, { "epoch": 7.48, "grad_norm": 2.973400592803955, "learning_rate": 5.036702428006777e-06, "loss": 0.2205, "step": 39750 }, { "epoch": 7.48, "grad_norm": 41.77248001098633, "learning_rate": 5.032938076416338e-06, "loss": 0.5068, "step": 39760 }, { "epoch": 7.49, "grad_norm": 23.240005493164062, "learning_rate": 5.029173724825899e-06, "loss": 0.2396, "step": 39770 }, { "epoch": 7.49, "grad_norm": 29.466114044189453, "learning_rate": 5.02540937323546e-06, "loss": 0.7611, "step": 39780 }, { "epoch": 7.49, "grad_norm": 18.149154663085938, "learning_rate": 5.021645021645022e-06, "loss": 0.4035, "step": 39790 }, { "epoch": 7.49, "grad_norm": 15.7437105178833, "learning_rate": 5.017880670054583e-06, "loss": 0.5208, "step": 39800 }, { "epoch": 7.49, "grad_norm": 23.674476623535156, "learning_rate": 5.014116318464145e-06, "loss": 0.3757, "step": 39810 }, { "epoch": 7.49, "grad_norm": 2.990457057952881, "learning_rate": 5.010351966873707e-06, "loss": 0.3227, "step": 39820 }, { "epoch": 7.5, "grad_norm": 5.852288246154785, "learning_rate": 5.006587615283268e-06, "loss": 0.398, "step": 39830 }, { "epoch": 7.5, "grad_norm": 21.586061477661133, "learning_rate": 5.00282326369283e-06, "loss": 0.4684, "step": 39840 }, { "epoch": 7.5, "grad_norm": 18.58787727355957, "learning_rate": 4.999058912102391e-06, "loss": 0.4361, "step": 39850 }, { "epoch": 7.5, "grad_norm": 6.530487060546875, "learning_rate": 4.995294560511953e-06, "loss": 0.419, "step": 39860 }, { "epoch": 7.5, "grad_norm": 18.475072860717773, "learning_rate": 4.991530208921513e-06, "loss": 0.5548, "step": 39870 }, { "epoch": 7.51, "grad_norm": 14.466314315795898, "learning_rate": 4.987765857331075e-06, "loss": 0.5023, "step": 39880 }, { "epoch": 7.51, "grad_norm": 15.371254920959473, "learning_rate": 4.984001505740636e-06, "loss": 0.4835, "step": 39890 }, { "epoch": 7.51, "grad_norm": 3.507498264312744, "learning_rate": 4.9802371541501985e-06, "loss": 0.4585, "step": 39900 }, { "epoch": 7.51, "grad_norm": 30.916147232055664, "learning_rate": 4.97647280255976e-06, "loss": 0.5154, "step": 39910 }, { "epoch": 7.51, "grad_norm": 14.572548866271973, "learning_rate": 4.9727084509693206e-06, "loss": 0.2884, "step": 39920 }, { "epoch": 7.52, "grad_norm": 0.2854940891265869, "learning_rate": 4.968944099378882e-06, "loss": 0.4503, "step": 39930 }, { "epoch": 7.52, "grad_norm": 30.854312896728516, "learning_rate": 4.9651797477884435e-06, "loss": 0.5331, "step": 39940 }, { "epoch": 7.52, "grad_norm": 19.74198341369629, "learning_rate": 4.961415396198006e-06, "loss": 0.1224, "step": 39950 }, { "epoch": 7.52, "grad_norm": 25.249618530273438, "learning_rate": 4.957651044607566e-06, "loss": 0.4178, "step": 39960 }, { "epoch": 7.52, "grad_norm": 8.16231632232666, "learning_rate": 4.953886693017128e-06, "loss": 0.5486, "step": 39970 }, { "epoch": 7.52, "grad_norm": 3.6326353549957275, "learning_rate": 4.950122341426689e-06, "loss": 0.5058, "step": 39980 }, { "epoch": 7.53, "grad_norm": 20.812101364135742, "learning_rate": 4.946357989836252e-06, "loss": 0.5612, "step": 39990 }, { "epoch": 7.53, "grad_norm": 24.15587615966797, "learning_rate": 4.942593638245812e-06, "loss": 0.5528, "step": 40000 }, { "epoch": 7.53, "grad_norm": 0.3987553119659424, "learning_rate": 4.938829286655374e-06, "loss": 0.3515, "step": 40010 }, { "epoch": 7.53, "grad_norm": 0.9960697889328003, "learning_rate": 4.935064935064935e-06, "loss": 0.3475, "step": 40020 }, { "epoch": 7.53, "grad_norm": 11.244698524475098, "learning_rate": 4.931300583474497e-06, "loss": 0.7113, "step": 40030 }, { "epoch": 7.54, "grad_norm": 12.09398365020752, "learning_rate": 4.927536231884059e-06, "loss": 0.3169, "step": 40040 }, { "epoch": 7.54, "grad_norm": 17.471298217773438, "learning_rate": 4.9237718802936196e-06, "loss": 0.3766, "step": 40050 }, { "epoch": 7.54, "grad_norm": 20.0872802734375, "learning_rate": 4.920007528703181e-06, "loss": 0.4371, "step": 40060 }, { "epoch": 7.54, "grad_norm": 0.6726735830307007, "learning_rate": 4.9162431771127425e-06, "loss": 0.5611, "step": 40070 }, { "epoch": 7.54, "grad_norm": 0.362667977809906, "learning_rate": 4.912478825522305e-06, "loss": 0.4279, "step": 40080 }, { "epoch": 7.55, "grad_norm": 48.890018463134766, "learning_rate": 4.908714473931865e-06, "loss": 0.3399, "step": 40090 }, { "epoch": 7.55, "grad_norm": 2.0744881629943848, "learning_rate": 4.904950122341427e-06, "loss": 0.2226, "step": 40100 }, { "epoch": 7.55, "grad_norm": 4.881161212921143, "learning_rate": 4.901185770750988e-06, "loss": 0.6647, "step": 40110 }, { "epoch": 7.55, "grad_norm": 15.641790390014648, "learning_rate": 4.89742141916055e-06, "loss": 0.4884, "step": 40120 }, { "epoch": 7.55, "grad_norm": 27.704832077026367, "learning_rate": 4.893657067570112e-06, "loss": 0.6596, "step": 40130 }, { "epoch": 7.56, "grad_norm": 11.091574668884277, "learning_rate": 4.889892715979673e-06, "loss": 0.4945, "step": 40140 }, { "epoch": 7.56, "grad_norm": 23.857437133789062, "learning_rate": 4.886128364389234e-06, "loss": 0.4619, "step": 40150 }, { "epoch": 7.56, "grad_norm": 0.4577654004096985, "learning_rate": 4.882364012798796e-06, "loss": 0.2583, "step": 40160 }, { "epoch": 7.56, "grad_norm": 0.02999284863471985, "learning_rate": 4.878599661208357e-06, "loss": 0.4674, "step": 40170 }, { "epoch": 7.56, "grad_norm": 8.35001277923584, "learning_rate": 4.8748353096179186e-06, "loss": 0.4048, "step": 40180 }, { "epoch": 7.56, "grad_norm": 24.522512435913086, "learning_rate": 4.87107095802748e-06, "loss": 0.3845, "step": 40190 }, { "epoch": 7.57, "grad_norm": 11.478434562683105, "learning_rate": 4.8673066064370415e-06, "loss": 0.4528, "step": 40200 }, { "epoch": 7.57, "grad_norm": 34.55609130859375, "learning_rate": 4.863542254846603e-06, "loss": 0.3256, "step": 40210 }, { "epoch": 7.57, "grad_norm": 30.954259872436523, "learning_rate": 4.859777903256164e-06, "loss": 0.6146, "step": 40220 }, { "epoch": 7.57, "grad_norm": 17.295373916625977, "learning_rate": 4.856013551665726e-06, "loss": 0.3439, "step": 40230 }, { "epoch": 7.57, "grad_norm": 4.239554405212402, "learning_rate": 4.852249200075287e-06, "loss": 0.4998, "step": 40240 }, { "epoch": 7.58, "grad_norm": 9.876043319702148, "learning_rate": 4.848484848484849e-06, "loss": 0.4356, "step": 40250 }, { "epoch": 7.58, "grad_norm": 14.549151420593262, "learning_rate": 4.84472049689441e-06, "loss": 0.3502, "step": 40260 }, { "epoch": 7.58, "grad_norm": 51.860286712646484, "learning_rate": 4.840956145303972e-06, "loss": 0.6799, "step": 40270 }, { "epoch": 7.58, "grad_norm": 14.050992965698242, "learning_rate": 4.837191793713533e-06, "loss": 0.2468, "step": 40280 }, { "epoch": 7.58, "grad_norm": 43.28739547729492, "learning_rate": 4.833427442123095e-06, "loss": 0.4276, "step": 40290 }, { "epoch": 7.59, "grad_norm": 0.5742427706718445, "learning_rate": 4.829663090532656e-06, "loss": 0.3327, "step": 40300 }, { "epoch": 7.59, "grad_norm": 21.811113357543945, "learning_rate": 4.8258987389422175e-06, "loss": 0.4486, "step": 40310 }, { "epoch": 7.59, "grad_norm": 0.1784685254096985, "learning_rate": 4.822134387351779e-06, "loss": 0.2737, "step": 40320 }, { "epoch": 7.59, "grad_norm": 38.78385543823242, "learning_rate": 4.8183700357613405e-06, "loss": 0.4103, "step": 40330 }, { "epoch": 7.59, "grad_norm": 26.003643035888672, "learning_rate": 4.814605684170902e-06, "loss": 0.4167, "step": 40340 }, { "epoch": 7.59, "grad_norm": 0.23788726329803467, "learning_rate": 4.810841332580463e-06, "loss": 0.5304, "step": 40350 }, { "epoch": 7.6, "grad_norm": 18.81310272216797, "learning_rate": 4.807076980990025e-06, "loss": 0.5927, "step": 40360 }, { "epoch": 7.6, "grad_norm": 36.79855728149414, "learning_rate": 4.803312629399586e-06, "loss": 0.3944, "step": 40370 }, { "epoch": 7.6, "grad_norm": 11.548039436340332, "learning_rate": 4.799548277809148e-06, "loss": 0.7024, "step": 40380 }, { "epoch": 7.6, "grad_norm": 17.895483016967773, "learning_rate": 4.795783926218709e-06, "loss": 0.4029, "step": 40390 }, { "epoch": 7.6, "grad_norm": 25.039541244506836, "learning_rate": 4.792019574628271e-06, "loss": 0.4372, "step": 40400 }, { "epoch": 7.61, "grad_norm": 5.053126811981201, "learning_rate": 4.788255223037832e-06, "loss": 0.1909, "step": 40410 }, { "epoch": 7.61, "grad_norm": 19.565786361694336, "learning_rate": 4.784490871447394e-06, "loss": 0.6079, "step": 40420 }, { "epoch": 7.61, "grad_norm": 21.104755401611328, "learning_rate": 4.780726519856955e-06, "loss": 0.5151, "step": 40430 }, { "epoch": 7.61, "grad_norm": 20.68798828125, "learning_rate": 4.7769621682665165e-06, "loss": 0.4636, "step": 40440 }, { "epoch": 7.61, "grad_norm": 0.7437390685081482, "learning_rate": 4.773197816676078e-06, "loss": 0.4972, "step": 40450 }, { "epoch": 7.62, "grad_norm": 5.112244606018066, "learning_rate": 4.7694334650856395e-06, "loss": 0.4124, "step": 40460 }, { "epoch": 7.62, "grad_norm": 14.092161178588867, "learning_rate": 4.765669113495201e-06, "loss": 0.5287, "step": 40470 }, { "epoch": 7.62, "grad_norm": 30.17268180847168, "learning_rate": 4.761904761904762e-06, "loss": 0.5739, "step": 40480 }, { "epoch": 7.62, "grad_norm": 6.536983013153076, "learning_rate": 4.758140410314324e-06, "loss": 0.2285, "step": 40490 }, { "epoch": 7.62, "grad_norm": 0.2827499508857727, "learning_rate": 4.754376058723885e-06, "loss": 0.6735, "step": 40500 }, { "epoch": 7.62, "grad_norm": 6.0071563720703125, "learning_rate": 4.750611707133447e-06, "loss": 0.5643, "step": 40510 }, { "epoch": 7.63, "grad_norm": 8.479032516479492, "learning_rate": 4.746847355543008e-06, "loss": 0.3364, "step": 40520 }, { "epoch": 7.63, "grad_norm": 18.724328994750977, "learning_rate": 4.74308300395257e-06, "loss": 0.3524, "step": 40530 }, { "epoch": 7.63, "grad_norm": 9.712454795837402, "learning_rate": 4.739318652362131e-06, "loss": 0.4297, "step": 40540 }, { "epoch": 7.63, "grad_norm": 22.55899429321289, "learning_rate": 4.735554300771693e-06, "loss": 0.3444, "step": 40550 }, { "epoch": 7.63, "grad_norm": 1.9720405340194702, "learning_rate": 4.731789949181254e-06, "loss": 0.4414, "step": 40560 }, { "epoch": 7.64, "grad_norm": 0.06697694957256317, "learning_rate": 4.7280255975908155e-06, "loss": 0.4789, "step": 40570 }, { "epoch": 7.64, "grad_norm": 11.40739631652832, "learning_rate": 4.724261246000377e-06, "loss": 0.5276, "step": 40580 }, { "epoch": 7.64, "grad_norm": 17.688581466674805, "learning_rate": 4.7204968944099384e-06, "loss": 0.4745, "step": 40590 }, { "epoch": 7.64, "grad_norm": 21.510601043701172, "learning_rate": 4.7167325428195e-06, "loss": 0.3477, "step": 40600 }, { "epoch": 7.64, "grad_norm": 0.09768573194742203, "learning_rate": 4.7129681912290605e-06, "loss": 0.2761, "step": 40610 }, { "epoch": 7.65, "grad_norm": 19.56352424621582, "learning_rate": 4.709203839638623e-06, "loss": 0.4043, "step": 40620 }, { "epoch": 7.65, "grad_norm": 29.654693603515625, "learning_rate": 4.705439488048184e-06, "loss": 0.4509, "step": 40630 }, { "epoch": 7.65, "grad_norm": 3.091660737991333, "learning_rate": 4.701675136457746e-06, "loss": 0.6926, "step": 40640 }, { "epoch": 7.65, "grad_norm": 0.025056788697838783, "learning_rate": 4.697910784867307e-06, "loss": 0.4671, "step": 40650 }, { "epoch": 7.65, "grad_norm": 16.61036491394043, "learning_rate": 4.694146433276869e-06, "loss": 0.4399, "step": 40660 }, { "epoch": 7.65, "grad_norm": 14.356207847595215, "learning_rate": 4.69038208168643e-06, "loss": 0.4281, "step": 40670 }, { "epoch": 7.66, "grad_norm": 26.823406219482422, "learning_rate": 4.686617730095992e-06, "loss": 0.5089, "step": 40680 }, { "epoch": 7.66, "grad_norm": 2.588717460632324, "learning_rate": 4.682853378505553e-06, "loss": 0.4385, "step": 40690 }, { "epoch": 7.66, "grad_norm": 8.470640182495117, "learning_rate": 4.679089026915114e-06, "loss": 0.534, "step": 40700 }, { "epoch": 7.66, "grad_norm": 1.754682183265686, "learning_rate": 4.675324675324676e-06, "loss": 0.5254, "step": 40710 }, { "epoch": 7.66, "grad_norm": 37.55604934692383, "learning_rate": 4.6715603237342374e-06, "loss": 0.4852, "step": 40720 }, { "epoch": 7.67, "grad_norm": 4.329613208770752, "learning_rate": 4.667795972143799e-06, "loss": 0.3389, "step": 40730 }, { "epoch": 7.67, "grad_norm": 5.099806785583496, "learning_rate": 4.66403162055336e-06, "loss": 0.6732, "step": 40740 }, { "epoch": 7.67, "grad_norm": 8.931364059448242, "learning_rate": 4.660267268962921e-06, "loss": 0.4488, "step": 40750 }, { "epoch": 7.67, "grad_norm": 9.485333442687988, "learning_rate": 4.656502917372483e-06, "loss": 0.3706, "step": 40760 }, { "epoch": 7.67, "grad_norm": 10.791669845581055, "learning_rate": 4.652738565782045e-06, "loss": 0.5172, "step": 40770 }, { "epoch": 7.68, "grad_norm": 11.53160285949707, "learning_rate": 4.648974214191606e-06, "loss": 0.325, "step": 40780 }, { "epoch": 7.68, "grad_norm": 0.11603966355323792, "learning_rate": 4.645209862601167e-06, "loss": 0.5713, "step": 40790 }, { "epoch": 7.68, "grad_norm": 4.958864688873291, "learning_rate": 4.641445511010729e-06, "loss": 0.4887, "step": 40800 }, { "epoch": 7.68, "grad_norm": 6.0979814529418945, "learning_rate": 4.637681159420291e-06, "loss": 0.4514, "step": 40810 }, { "epoch": 7.68, "grad_norm": 37.97774124145508, "learning_rate": 4.633916807829852e-06, "loss": 0.5397, "step": 40820 }, { "epoch": 7.68, "grad_norm": 6.384561061859131, "learning_rate": 4.630152456239413e-06, "loss": 0.2419, "step": 40830 }, { "epoch": 7.69, "grad_norm": 32.47441101074219, "learning_rate": 4.626388104648974e-06, "loss": 0.7723, "step": 40840 }, { "epoch": 7.69, "grad_norm": 58.51771545410156, "learning_rate": 4.622623753058536e-06, "loss": 0.4001, "step": 40850 }, { "epoch": 7.69, "grad_norm": 1.2339733839035034, "learning_rate": 4.618859401468098e-06, "loss": 0.2751, "step": 40860 }, { "epoch": 7.69, "grad_norm": 1.42243492603302, "learning_rate": 4.615095049877659e-06, "loss": 0.4036, "step": 40870 }, { "epoch": 7.69, "grad_norm": 0.05874158814549446, "learning_rate": 4.61133069828722e-06, "loss": 0.2054, "step": 40880 }, { "epoch": 7.7, "grad_norm": 0.06559593230485916, "learning_rate": 4.607566346696781e-06, "loss": 0.3678, "step": 40890 }, { "epoch": 7.7, "grad_norm": 37.953216552734375, "learning_rate": 4.603801995106344e-06, "loss": 0.7121, "step": 40900 }, { "epoch": 7.7, "grad_norm": 28.273658752441406, "learning_rate": 4.600037643515905e-06, "loss": 0.7176, "step": 40910 }, { "epoch": 7.7, "grad_norm": 0.9303427934646606, "learning_rate": 4.596273291925466e-06, "loss": 0.4192, "step": 40920 }, { "epoch": 7.7, "grad_norm": 6.678520202636719, "learning_rate": 4.592508940335027e-06, "loss": 0.4673, "step": 40930 }, { "epoch": 7.71, "grad_norm": 10.7068510055542, "learning_rate": 4.5887445887445896e-06, "loss": 0.5474, "step": 40940 }, { "epoch": 7.71, "grad_norm": 24.268388748168945, "learning_rate": 4.584980237154151e-06, "loss": 0.4693, "step": 40950 }, { "epoch": 7.71, "grad_norm": 0.8732045292854309, "learning_rate": 4.581215885563712e-06, "loss": 0.3434, "step": 40960 }, { "epoch": 7.71, "grad_norm": 4.5128865242004395, "learning_rate": 4.577451533973273e-06, "loss": 0.5951, "step": 40970 }, { "epoch": 7.71, "grad_norm": 29.08911895751953, "learning_rate": 4.5736871823828346e-06, "loss": 0.8518, "step": 40980 }, { "epoch": 7.72, "grad_norm": 34.50959014892578, "learning_rate": 4.569922830792397e-06, "loss": 0.3967, "step": 40990 }, { "epoch": 7.72, "grad_norm": 12.16829776763916, "learning_rate": 4.566158479201958e-06, "loss": 0.568, "step": 41000 }, { "epoch": 7.72, "grad_norm": 8.815430641174316, "learning_rate": 4.562394127611519e-06, "loss": 0.6214, "step": 41010 }, { "epoch": 7.72, "grad_norm": 0.7041884064674377, "learning_rate": 4.55862977602108e-06, "loss": 0.3887, "step": 41020 }, { "epoch": 7.72, "grad_norm": 5.827664852142334, "learning_rate": 4.554865424430642e-06, "loss": 0.3346, "step": 41030 }, { "epoch": 7.72, "grad_norm": 13.621718406677246, "learning_rate": 4.551101072840204e-06, "loss": 0.5485, "step": 41040 }, { "epoch": 7.73, "grad_norm": 0.27952587604522705, "learning_rate": 4.547336721249765e-06, "loss": 0.4817, "step": 41050 }, { "epoch": 7.73, "grad_norm": 7.287542343139648, "learning_rate": 4.543572369659326e-06, "loss": 0.674, "step": 41060 }, { "epoch": 7.73, "grad_norm": 8.12995433807373, "learning_rate": 4.539808018068888e-06, "loss": 0.6428, "step": 41070 }, { "epoch": 7.73, "grad_norm": 20.937484741210938, "learning_rate": 4.53604366647845e-06, "loss": 0.5712, "step": 41080 }, { "epoch": 7.73, "grad_norm": 15.425496101379395, "learning_rate": 4.532279314888011e-06, "loss": 0.7567, "step": 41090 }, { "epoch": 7.74, "grad_norm": 14.051750183105469, "learning_rate": 4.528514963297572e-06, "loss": 0.5802, "step": 41100 }, { "epoch": 7.74, "grad_norm": 0.16733068227767944, "learning_rate": 4.5247506117071336e-06, "loss": 0.2608, "step": 41110 }, { "epoch": 7.74, "grad_norm": 17.37752914428711, "learning_rate": 4.520986260116695e-06, "loss": 0.5201, "step": 41120 }, { "epoch": 7.74, "grad_norm": 18.55728530883789, "learning_rate": 4.517221908526257e-06, "loss": 0.4982, "step": 41130 }, { "epoch": 7.74, "grad_norm": 15.515111923217773, "learning_rate": 4.513457556935818e-06, "loss": 0.5779, "step": 41140 }, { "epoch": 7.75, "grad_norm": 2.7760565280914307, "learning_rate": 4.509693205345379e-06, "loss": 0.2145, "step": 41150 }, { "epoch": 7.75, "grad_norm": 7.785154342651367, "learning_rate": 4.505928853754941e-06, "loss": 0.2332, "step": 41160 }, { "epoch": 7.75, "grad_norm": 27.99343490600586, "learning_rate": 4.502164502164502e-06, "loss": 0.4307, "step": 41170 }, { "epoch": 7.75, "grad_norm": 19.354997634887695, "learning_rate": 4.498400150574064e-06, "loss": 0.8148, "step": 41180 }, { "epoch": 7.75, "grad_norm": 2.94992995262146, "learning_rate": 4.494635798983625e-06, "loss": 0.1812, "step": 41190 }, { "epoch": 7.75, "grad_norm": 10.769566535949707, "learning_rate": 4.490871447393187e-06, "loss": 0.4019, "step": 41200 }, { "epoch": 7.76, "grad_norm": 15.878639221191406, "learning_rate": 4.487107095802748e-06, "loss": 0.4287, "step": 41210 }, { "epoch": 7.76, "grad_norm": 0.04310622438788414, "learning_rate": 4.48334274421231e-06, "loss": 0.333, "step": 41220 }, { "epoch": 7.76, "grad_norm": 5.737595081329346, "learning_rate": 4.479578392621871e-06, "loss": 0.4404, "step": 41230 }, { "epoch": 7.76, "grad_norm": 14.339284896850586, "learning_rate": 4.4758140410314325e-06, "loss": 0.4577, "step": 41240 }, { "epoch": 7.76, "grad_norm": 10.48139762878418, "learning_rate": 4.472049689440994e-06, "loss": 0.3453, "step": 41250 }, { "epoch": 7.77, "grad_norm": 4.904701232910156, "learning_rate": 4.4682853378505555e-06, "loss": 0.3275, "step": 41260 }, { "epoch": 7.77, "grad_norm": 0.7866021990776062, "learning_rate": 4.464520986260117e-06, "loss": 0.4991, "step": 41270 }, { "epoch": 7.77, "grad_norm": 13.277933120727539, "learning_rate": 4.460756634669678e-06, "loss": 0.4167, "step": 41280 }, { "epoch": 7.77, "grad_norm": 31.82375717163086, "learning_rate": 4.45699228307924e-06, "loss": 0.4556, "step": 41290 }, { "epoch": 7.77, "grad_norm": 36.45254898071289, "learning_rate": 4.453227931488801e-06, "loss": 0.4507, "step": 41300 }, { "epoch": 7.78, "grad_norm": 38.87356948852539, "learning_rate": 4.449463579898363e-06, "loss": 0.4436, "step": 41310 }, { "epoch": 7.78, "grad_norm": 0.09693789482116699, "learning_rate": 4.445699228307924e-06, "loss": 0.3846, "step": 41320 }, { "epoch": 7.78, "grad_norm": 17.44363784790039, "learning_rate": 4.441934876717486e-06, "loss": 0.3734, "step": 41330 }, { "epoch": 7.78, "grad_norm": 3.033371925354004, "learning_rate": 4.438170525127047e-06, "loss": 0.3748, "step": 41340 }, { "epoch": 7.78, "grad_norm": 14.98725414276123, "learning_rate": 4.434406173536609e-06, "loss": 0.8152, "step": 41350 }, { "epoch": 7.78, "grad_norm": 6.687844753265381, "learning_rate": 4.43064182194617e-06, "loss": 0.6698, "step": 41360 }, { "epoch": 7.79, "grad_norm": 0.14900249242782593, "learning_rate": 4.4268774703557315e-06, "loss": 0.3195, "step": 41370 }, { "epoch": 7.79, "grad_norm": 9.252775192260742, "learning_rate": 4.423113118765293e-06, "loss": 0.4316, "step": 41380 }, { "epoch": 7.79, "grad_norm": 7.680717945098877, "learning_rate": 4.4193487671748545e-06, "loss": 0.164, "step": 41390 }, { "epoch": 7.79, "grad_norm": 7.291797637939453, "learning_rate": 4.415584415584416e-06, "loss": 0.5361, "step": 41400 }, { "epoch": 7.79, "grad_norm": 3.572070360183716, "learning_rate": 4.411820063993977e-06, "loss": 0.4278, "step": 41410 }, { "epoch": 7.8, "grad_norm": 0.2998538911342621, "learning_rate": 4.408055712403539e-06, "loss": 0.3525, "step": 41420 }, { "epoch": 7.8, "grad_norm": 0.47384247183799744, "learning_rate": 4.4042913608131e-06, "loss": 0.2823, "step": 41430 }, { "epoch": 7.8, "grad_norm": 0.05279775708913803, "learning_rate": 4.400527009222662e-06, "loss": 0.4339, "step": 41440 }, { "epoch": 7.8, "grad_norm": 26.150938034057617, "learning_rate": 4.396762657632223e-06, "loss": 0.537, "step": 41450 }, { "epoch": 7.8, "grad_norm": 29.126239776611328, "learning_rate": 4.392998306041785e-06, "loss": 0.4351, "step": 41460 }, { "epoch": 7.81, "grad_norm": 13.385824203491211, "learning_rate": 4.389233954451346e-06, "loss": 0.2548, "step": 41470 }, { "epoch": 7.81, "grad_norm": 110.32833099365234, "learning_rate": 4.385469602860908e-06, "loss": 0.6887, "step": 41480 }, { "epoch": 7.81, "grad_norm": 2.3352975845336914, "learning_rate": 4.381705251270469e-06, "loss": 0.4305, "step": 41490 }, { "epoch": 7.81, "grad_norm": 18.318313598632812, "learning_rate": 4.3779408996800305e-06, "loss": 0.2915, "step": 41500 }, { "epoch": 7.81, "grad_norm": 14.995280265808105, "learning_rate": 4.374176548089592e-06, "loss": 0.4918, "step": 41510 }, { "epoch": 7.81, "grad_norm": 8.48552417755127, "learning_rate": 4.3704121964991534e-06, "loss": 0.502, "step": 41520 }, { "epoch": 7.82, "grad_norm": 6.212337017059326, "learning_rate": 4.366647844908715e-06, "loss": 0.5833, "step": 41530 }, { "epoch": 7.82, "grad_norm": 7.40939474105835, "learning_rate": 4.362883493318276e-06, "loss": 0.4578, "step": 41540 }, { "epoch": 7.82, "grad_norm": 7.82789945602417, "learning_rate": 4.359119141727838e-06, "loss": 0.4524, "step": 41550 }, { "epoch": 7.82, "grad_norm": 20.18194580078125, "learning_rate": 4.355354790137399e-06, "loss": 0.7412, "step": 41560 }, { "epoch": 7.82, "grad_norm": 1.9940485954284668, "learning_rate": 4.351590438546961e-06, "loss": 0.3181, "step": 41570 }, { "epoch": 7.83, "grad_norm": 4.829334259033203, "learning_rate": 4.347826086956522e-06, "loss": 0.19, "step": 41580 }, { "epoch": 7.83, "grad_norm": 0.15990076959133148, "learning_rate": 4.344061735366084e-06, "loss": 0.2216, "step": 41590 }, { "epoch": 7.83, "grad_norm": 6.221506595611572, "learning_rate": 4.340297383775645e-06, "loss": 0.389, "step": 41600 }, { "epoch": 7.83, "grad_norm": 8.496952056884766, "learning_rate": 4.336533032185207e-06, "loss": 0.5286, "step": 41610 }, { "epoch": 7.83, "grad_norm": 23.715831756591797, "learning_rate": 4.332768680594768e-06, "loss": 0.4569, "step": 41620 }, { "epoch": 7.84, "grad_norm": 5.149649143218994, "learning_rate": 4.3290043290043295e-06, "loss": 0.4774, "step": 41630 }, { "epoch": 7.84, "grad_norm": 17.749088287353516, "learning_rate": 4.325239977413891e-06, "loss": 0.514, "step": 41640 }, { "epoch": 7.84, "grad_norm": 1.3218231201171875, "learning_rate": 4.3214756258234524e-06, "loss": 0.2201, "step": 41650 }, { "epoch": 7.84, "grad_norm": 10.610811233520508, "learning_rate": 4.317711274233014e-06, "loss": 0.5015, "step": 41660 }, { "epoch": 7.84, "grad_norm": 30.114301681518555, "learning_rate": 4.313946922642575e-06, "loss": 0.456, "step": 41670 }, { "epoch": 7.84, "grad_norm": 6.920668601989746, "learning_rate": 4.310182571052137e-06, "loss": 0.4653, "step": 41680 }, { "epoch": 7.85, "grad_norm": 0.0986671969294548, "learning_rate": 4.306418219461698e-06, "loss": 0.4564, "step": 41690 }, { "epoch": 7.85, "grad_norm": 10.98770523071289, "learning_rate": 4.302653867871259e-06, "loss": 0.4449, "step": 41700 }, { "epoch": 7.85, "grad_norm": 7.872487545013428, "learning_rate": 4.298889516280821e-06, "loss": 0.3511, "step": 41710 }, { "epoch": 7.85, "grad_norm": 0.12211643904447556, "learning_rate": 4.295125164690383e-06, "loss": 0.3521, "step": 41720 }, { "epoch": 7.85, "grad_norm": 2.3334715366363525, "learning_rate": 4.291360813099944e-06, "loss": 0.4879, "step": 41730 }, { "epoch": 7.86, "grad_norm": 1.1513813734054565, "learning_rate": 4.287596461509506e-06, "loss": 0.2625, "step": 41740 }, { "epoch": 7.86, "grad_norm": 3.9860312938690186, "learning_rate": 4.283832109919067e-06, "loss": 0.3835, "step": 41750 }, { "epoch": 7.86, "grad_norm": 16.243440628051758, "learning_rate": 4.2800677583286285e-06, "loss": 0.4477, "step": 41760 }, { "epoch": 7.86, "grad_norm": 23.092260360717773, "learning_rate": 4.27630340673819e-06, "loss": 0.4379, "step": 41770 }, { "epoch": 7.86, "grad_norm": 0.4055626094341278, "learning_rate": 4.272539055147751e-06, "loss": 0.5019, "step": 41780 }, { "epoch": 7.87, "grad_norm": 15.21871566772461, "learning_rate": 4.268774703557312e-06, "loss": 0.4496, "step": 41790 }, { "epoch": 7.87, "grad_norm": 22.797069549560547, "learning_rate": 4.265010351966874e-06, "loss": 0.5968, "step": 41800 }, { "epoch": 7.87, "grad_norm": 0.028871312737464905, "learning_rate": 4.261246000376436e-06, "loss": 0.5744, "step": 41810 }, { "epoch": 7.87, "grad_norm": 14.592629432678223, "learning_rate": 4.257481648785997e-06, "loss": 0.2576, "step": 41820 }, { "epoch": 7.87, "grad_norm": 50.91252899169922, "learning_rate": 4.253717297195558e-06, "loss": 0.4805, "step": 41830 }, { "epoch": 7.88, "grad_norm": 0.8403691649436951, "learning_rate": 4.249952945605119e-06, "loss": 0.3705, "step": 41840 }, { "epoch": 7.88, "grad_norm": 0.6822836995124817, "learning_rate": 4.246188594014682e-06, "loss": 0.2433, "step": 41850 }, { "epoch": 7.88, "grad_norm": 4.622705936431885, "learning_rate": 4.242424242424243e-06, "loss": 0.6058, "step": 41860 }, { "epoch": 7.88, "grad_norm": 17.88176155090332, "learning_rate": 4.2386598908338046e-06, "loss": 0.5387, "step": 41870 }, { "epoch": 7.88, "grad_norm": 0.5473073124885559, "learning_rate": 4.234895539243365e-06, "loss": 0.6625, "step": 41880 }, { "epoch": 7.88, "grad_norm": 0.17052659392356873, "learning_rate": 4.2311311876529275e-06, "loss": 0.3257, "step": 41890 }, { "epoch": 7.89, "grad_norm": 0.7636673450469971, "learning_rate": 4.227366836062489e-06, "loss": 0.3624, "step": 41900 }, { "epoch": 7.89, "grad_norm": 20.62749481201172, "learning_rate": 4.22360248447205e-06, "loss": 0.4521, "step": 41910 }, { "epoch": 7.89, "grad_norm": 6.845425128936768, "learning_rate": 4.219838132881611e-06, "loss": 0.3049, "step": 41920 }, { "epoch": 7.89, "grad_norm": 26.27695655822754, "learning_rate": 4.2160737812911725e-06, "loss": 0.699, "step": 41930 }, { "epoch": 7.89, "grad_norm": 10.913443565368652, "learning_rate": 4.212309429700735e-06, "loss": 0.7243, "step": 41940 }, { "epoch": 7.9, "grad_norm": 29.593870162963867, "learning_rate": 4.208545078110296e-06, "loss": 0.5249, "step": 41950 }, { "epoch": 7.9, "grad_norm": 7.848869323730469, "learning_rate": 4.204780726519857e-06, "loss": 0.4757, "step": 41960 }, { "epoch": 7.9, "grad_norm": 10.693510055541992, "learning_rate": 4.201016374929418e-06, "loss": 0.4779, "step": 41970 }, { "epoch": 7.9, "grad_norm": 2.0626046657562256, "learning_rate": 4.19725202333898e-06, "loss": 0.412, "step": 41980 }, { "epoch": 7.9, "grad_norm": 7.025002479553223, "learning_rate": 4.193487671748542e-06, "loss": 0.2752, "step": 41990 }, { "epoch": 7.91, "grad_norm": 23.475351333618164, "learning_rate": 4.1897233201581036e-06, "loss": 0.4691, "step": 42000 }, { "epoch": 7.91, "grad_norm": 17.180540084838867, "learning_rate": 4.185958968567664e-06, "loss": 0.3662, "step": 42010 }, { "epoch": 7.91, "grad_norm": 30.194869995117188, "learning_rate": 4.182194616977226e-06, "loss": 0.3635, "step": 42020 }, { "epoch": 7.91, "grad_norm": 0.9849434494972229, "learning_rate": 4.178430265386788e-06, "loss": 0.5017, "step": 42030 }, { "epoch": 7.91, "grad_norm": 0.7494462132453918, "learning_rate": 4.174665913796349e-06, "loss": 0.3769, "step": 42040 }, { "epoch": 7.91, "grad_norm": 15.633011817932129, "learning_rate": 4.17090156220591e-06, "loss": 0.4248, "step": 42050 }, { "epoch": 7.92, "grad_norm": 0.03347090259194374, "learning_rate": 4.1671372106154715e-06, "loss": 0.6362, "step": 42060 }, { "epoch": 7.92, "grad_norm": 2.5611062049865723, "learning_rate": 4.163372859025033e-06, "loss": 0.5368, "step": 42070 }, { "epoch": 7.92, "grad_norm": 3.9073028564453125, "learning_rate": 4.159608507434595e-06, "loss": 0.6604, "step": 42080 }, { "epoch": 7.92, "grad_norm": 30.027442932128906, "learning_rate": 4.155844155844157e-06, "loss": 0.521, "step": 42090 }, { "epoch": 7.92, "grad_norm": 22.11733055114746, "learning_rate": 4.152079804253717e-06, "loss": 0.5561, "step": 42100 }, { "epoch": 7.93, "grad_norm": 13.414361000061035, "learning_rate": 4.148315452663279e-06, "loss": 0.5497, "step": 42110 }, { "epoch": 7.93, "grad_norm": 0.3089528977870941, "learning_rate": 4.14455110107284e-06, "loss": 0.3887, "step": 42120 }, { "epoch": 7.93, "grad_norm": 31.682811737060547, "learning_rate": 4.1407867494824025e-06, "loss": 0.7196, "step": 42130 }, { "epoch": 7.93, "grad_norm": 43.80996322631836, "learning_rate": 4.137022397891963e-06, "loss": 0.3509, "step": 42140 }, { "epoch": 7.93, "grad_norm": 3.049875020980835, "learning_rate": 4.133258046301525e-06, "loss": 0.4552, "step": 42150 }, { "epoch": 7.94, "grad_norm": 22.50640296936035, "learning_rate": 4.129493694711086e-06, "loss": 0.298, "step": 42160 }, { "epoch": 7.94, "grad_norm": 19.20767593383789, "learning_rate": 4.125729343120648e-06, "loss": 0.3963, "step": 42170 }, { "epoch": 7.94, "grad_norm": 15.141514778137207, "learning_rate": 4.121964991530209e-06, "loss": 0.4502, "step": 42180 }, { "epoch": 7.94, "grad_norm": 17.997087478637695, "learning_rate": 4.1182006399397705e-06, "loss": 0.2375, "step": 42190 }, { "epoch": 7.94, "grad_norm": 1.2107799053192139, "learning_rate": 4.114436288349332e-06, "loss": 0.4342, "step": 42200 }, { "epoch": 7.94, "grad_norm": 17.25642967224121, "learning_rate": 4.110671936758893e-06, "loss": 0.599, "step": 42210 }, { "epoch": 7.95, "grad_norm": 23.210981369018555, "learning_rate": 4.106907585168456e-06, "loss": 0.6262, "step": 42220 }, { "epoch": 7.95, "grad_norm": 18.647939682006836, "learning_rate": 4.103143233578016e-06, "loss": 0.6408, "step": 42230 }, { "epoch": 7.95, "grad_norm": 24.76263999938965, "learning_rate": 4.099378881987578e-06, "loss": 0.4443, "step": 42240 }, { "epoch": 7.95, "grad_norm": 6.9933013916015625, "learning_rate": 4.095614530397139e-06, "loss": 0.6158, "step": 42250 }, { "epoch": 7.95, "grad_norm": 0.15161870419979095, "learning_rate": 4.091850178806701e-06, "loss": 0.2637, "step": 42260 }, { "epoch": 7.96, "grad_norm": 5.794905185699463, "learning_rate": 4.088085827216262e-06, "loss": 1.0222, "step": 42270 }, { "epoch": 7.96, "grad_norm": 37.5270881652832, "learning_rate": 4.084321475625824e-06, "loss": 0.3388, "step": 42280 }, { "epoch": 7.96, "grad_norm": 20.47008514404297, "learning_rate": 4.080557124035385e-06, "loss": 0.4998, "step": 42290 }, { "epoch": 7.96, "grad_norm": 10.392271041870117, "learning_rate": 4.0767927724449465e-06, "loss": 0.2468, "step": 42300 }, { "epoch": 7.96, "grad_norm": 22.866073608398438, "learning_rate": 4.073028420854508e-06, "loss": 0.361, "step": 42310 }, { "epoch": 7.97, "grad_norm": 3.4224941730499268, "learning_rate": 4.0692640692640695e-06, "loss": 0.5056, "step": 42320 }, { "epoch": 7.97, "grad_norm": 15.538722038269043, "learning_rate": 4.065499717673631e-06, "loss": 0.5327, "step": 42330 }, { "epoch": 7.97, "grad_norm": 17.44469451904297, "learning_rate": 4.061735366083192e-06, "loss": 0.628, "step": 42340 }, { "epoch": 7.97, "grad_norm": 24.630922317504883, "learning_rate": 4.057971014492754e-06, "loss": 0.5497, "step": 42350 }, { "epoch": 7.97, "grad_norm": 2.710216760635376, "learning_rate": 4.054206662902315e-06, "loss": 0.2458, "step": 42360 }, { "epoch": 7.97, "grad_norm": 2.76000714302063, "learning_rate": 4.050442311311877e-06, "loss": 0.4532, "step": 42370 }, { "epoch": 7.98, "grad_norm": 20.97554588317871, "learning_rate": 4.046677959721438e-06, "loss": 0.559, "step": 42380 }, { "epoch": 7.98, "grad_norm": 53.717735290527344, "learning_rate": 4.042913608131e-06, "loss": 0.8006, "step": 42390 }, { "epoch": 7.98, "grad_norm": 11.499624252319336, "learning_rate": 4.039149256540561e-06, "loss": 0.593, "step": 42400 }, { "epoch": 7.98, "grad_norm": 1.2157829999923706, "learning_rate": 4.035384904950123e-06, "loss": 0.4126, "step": 42410 }, { "epoch": 7.98, "grad_norm": 6.117697715759277, "learning_rate": 4.031620553359684e-06, "loss": 0.4347, "step": 42420 }, { "epoch": 7.99, "grad_norm": 6.3184051513671875, "learning_rate": 4.0278562017692455e-06, "loss": 0.2998, "step": 42430 }, { "epoch": 7.99, "grad_norm": 20.870182037353516, "learning_rate": 4.024091850178807e-06, "loss": 0.6725, "step": 42440 }, { "epoch": 7.99, "grad_norm": 17.98334503173828, "learning_rate": 4.0203274985883684e-06, "loss": 0.3782, "step": 42450 }, { "epoch": 7.99, "grad_norm": 7.8092193603515625, "learning_rate": 4.01656314699793e-06, "loss": 0.3603, "step": 42460 }, { "epoch": 7.99, "grad_norm": 1.0869126319885254, "learning_rate": 4.012798795407491e-06, "loss": 0.4338, "step": 42470 }, { "epoch": 8.0, "grad_norm": 4.888917446136475, "learning_rate": 4.009034443817053e-06, "loss": 0.3939, "step": 42480 }, { "epoch": 8.0, "grad_norm": 0.40398913621902466, "learning_rate": 4.005270092226614e-06, "loss": 0.6347, "step": 42490 }, { "epoch": 8.0, "grad_norm": 17.665483474731445, "learning_rate": 4.001505740636176e-06, "loss": 0.6128, "step": 42500 }, { "epoch": 8.0, "eval_accuracy": 0.9261333333333334, "eval_loss": 0.2983257472515106, "eval_runtime": 51.1739, "eval_samples_per_second": 146.559, "eval_steps_per_second": 18.33, "step": 42504 }, { "epoch": 8.0, "grad_norm": 8.759658813476562, "learning_rate": 3.997741389045737e-06, "loss": 0.6272, "step": 42510 }, { "epoch": 8.0, "grad_norm": 1.319584846496582, "learning_rate": 3.993977037455299e-06, "loss": 0.4546, "step": 42520 }, { "epoch": 8.0, "grad_norm": 16.56199836730957, "learning_rate": 3.99021268586486e-06, "loss": 0.7, "step": 42530 }, { "epoch": 8.01, "grad_norm": 2.113156318664551, "learning_rate": 3.986448334274422e-06, "loss": 0.2512, "step": 42540 }, { "epoch": 8.01, "grad_norm": 5.923486709594727, "learning_rate": 3.982683982683983e-06, "loss": 0.4087, "step": 42550 }, { "epoch": 8.01, "grad_norm": 10.338817596435547, "learning_rate": 3.9789196310935445e-06, "loss": 0.4118, "step": 42560 }, { "epoch": 8.01, "grad_norm": 16.17352294921875, "learning_rate": 3.975155279503106e-06, "loss": 0.6563, "step": 42570 }, { "epoch": 8.01, "grad_norm": 12.729879379272461, "learning_rate": 3.9713909279126674e-06, "loss": 0.5468, "step": 42580 }, { "epoch": 8.02, "grad_norm": 40.52555847167969, "learning_rate": 3.967626576322229e-06, "loss": 0.3984, "step": 42590 }, { "epoch": 8.02, "grad_norm": 22.066959381103516, "learning_rate": 3.96386222473179e-06, "loss": 0.6228, "step": 42600 }, { "epoch": 8.02, "grad_norm": 10.428549766540527, "learning_rate": 3.960097873141352e-06, "loss": 0.5729, "step": 42610 }, { "epoch": 8.02, "grad_norm": 9.915628433227539, "learning_rate": 3.956333521550913e-06, "loss": 0.3231, "step": 42620 }, { "epoch": 8.02, "grad_norm": 33.11208724975586, "learning_rate": 3.952569169960475e-06, "loss": 0.3779, "step": 42630 }, { "epoch": 8.03, "grad_norm": 4.810709476470947, "learning_rate": 3.948804818370036e-06, "loss": 0.6932, "step": 42640 }, { "epoch": 8.03, "grad_norm": 21.556655883789062, "learning_rate": 3.945040466779598e-06, "loss": 0.3488, "step": 42650 }, { "epoch": 8.03, "grad_norm": 12.891266822814941, "learning_rate": 3.941276115189159e-06, "loss": 0.4167, "step": 42660 }, { "epoch": 8.03, "grad_norm": 0.5069437026977539, "learning_rate": 3.9375117635987206e-06, "loss": 0.2696, "step": 42670 }, { "epoch": 8.03, "grad_norm": 8.775614738464355, "learning_rate": 3.933747412008282e-06, "loss": 0.5591, "step": 42680 }, { "epoch": 8.04, "grad_norm": 6.012180805206299, "learning_rate": 3.9299830604178435e-06, "loss": 0.5263, "step": 42690 }, { "epoch": 8.04, "grad_norm": 7.999334812164307, "learning_rate": 3.926218708827404e-06, "loss": 0.3107, "step": 42700 }, { "epoch": 8.04, "grad_norm": 12.329937934875488, "learning_rate": 3.922454357236966e-06, "loss": 0.3968, "step": 42710 }, { "epoch": 8.04, "grad_norm": 16.49465560913086, "learning_rate": 3.918690005646528e-06, "loss": 0.6177, "step": 42720 }, { "epoch": 8.04, "grad_norm": 21.259458541870117, "learning_rate": 3.914925654056089e-06, "loss": 0.5283, "step": 42730 }, { "epoch": 8.04, "grad_norm": 17.6918888092041, "learning_rate": 3.911161302465651e-06, "loss": 0.2778, "step": 42740 }, { "epoch": 8.05, "grad_norm": 8.05215072631836, "learning_rate": 3.907396950875212e-06, "loss": 0.4152, "step": 42750 }, { "epoch": 8.05, "grad_norm": 17.829729080200195, "learning_rate": 3.903632599284774e-06, "loss": 0.2527, "step": 42760 }, { "epoch": 8.05, "grad_norm": 28.24970054626465, "learning_rate": 3.899868247694335e-06, "loss": 0.3636, "step": 42770 }, { "epoch": 8.05, "grad_norm": 13.935927391052246, "learning_rate": 3.896103896103897e-06, "loss": 0.2224, "step": 42780 }, { "epoch": 8.05, "grad_norm": 21.196792602539062, "learning_rate": 3.892339544513457e-06, "loss": 0.5058, "step": 42790 }, { "epoch": 8.06, "grad_norm": 18.93883514404297, "learning_rate": 3.8885751929230196e-06, "loss": 0.4206, "step": 42800 }, { "epoch": 8.06, "grad_norm": 0.547015368938446, "learning_rate": 3.884810841332581e-06, "loss": 0.6522, "step": 42810 }, { "epoch": 8.06, "grad_norm": 9.596744537353516, "learning_rate": 3.8810464897421425e-06, "loss": 0.6818, "step": 42820 }, { "epoch": 8.06, "grad_norm": 17.509187698364258, "learning_rate": 3.877282138151704e-06, "loss": 0.3802, "step": 42830 }, { "epoch": 8.06, "grad_norm": 25.433029174804688, "learning_rate": 3.8735177865612646e-06, "loss": 0.2126, "step": 42840 }, { "epoch": 8.07, "grad_norm": 41.067604064941406, "learning_rate": 3.869753434970827e-06, "loss": 0.4666, "step": 42850 }, { "epoch": 8.07, "grad_norm": 14.244887351989746, "learning_rate": 3.865989083380388e-06, "loss": 0.4267, "step": 42860 }, { "epoch": 8.07, "grad_norm": 19.82988929748535, "learning_rate": 3.86222473178995e-06, "loss": 0.6026, "step": 42870 }, { "epoch": 8.07, "grad_norm": 4.339291572570801, "learning_rate": 3.85846038019951e-06, "loss": 0.2404, "step": 42880 }, { "epoch": 8.07, "grad_norm": 17.403106689453125, "learning_rate": 3.854696028609073e-06, "loss": 0.6288, "step": 42890 }, { "epoch": 8.07, "grad_norm": 44.81716537475586, "learning_rate": 3.850931677018634e-06, "loss": 0.4931, "step": 42900 }, { "epoch": 8.08, "grad_norm": 6.8778510093688965, "learning_rate": 3.847167325428196e-06, "loss": 0.4437, "step": 42910 }, { "epoch": 8.08, "grad_norm": 0.9462854862213135, "learning_rate": 3.843402973837756e-06, "loss": 0.3791, "step": 42920 }, { "epoch": 8.08, "grad_norm": 13.075630187988281, "learning_rate": 3.839638622247318e-06, "loss": 0.4265, "step": 42930 }, { "epoch": 8.08, "grad_norm": 34.70516586303711, "learning_rate": 3.83587427065688e-06, "loss": 0.3808, "step": 42940 }, { "epoch": 8.08, "grad_norm": 0.2037682831287384, "learning_rate": 3.8321099190664415e-06, "loss": 0.3715, "step": 42950 }, { "epoch": 8.09, "grad_norm": 1.690575361251831, "learning_rate": 3.828345567476003e-06, "loss": 0.3384, "step": 42960 }, { "epoch": 8.09, "grad_norm": 4.726039886474609, "learning_rate": 3.8245812158855635e-06, "loss": 0.3805, "step": 42970 }, { "epoch": 8.09, "grad_norm": 10.409626007080078, "learning_rate": 3.820816864295126e-06, "loss": 0.2682, "step": 42980 }, { "epoch": 8.09, "grad_norm": 12.783897399902344, "learning_rate": 3.817052512704687e-06, "loss": 0.6112, "step": 42990 }, { "epoch": 8.09, "grad_norm": 0.31685903668403625, "learning_rate": 3.8132881611142488e-06, "loss": 0.4119, "step": 43000 }, { "epoch": 8.1, "grad_norm": 43.89032745361328, "learning_rate": 3.80952380952381e-06, "loss": 0.4113, "step": 43010 }, { "epoch": 8.1, "grad_norm": 3.8857526779174805, "learning_rate": 3.8057594579333713e-06, "loss": 0.6077, "step": 43020 }, { "epoch": 8.1, "grad_norm": 7.122863292694092, "learning_rate": 3.8019951063429327e-06, "loss": 0.1793, "step": 43030 }, { "epoch": 8.1, "grad_norm": 21.782251358032227, "learning_rate": 3.798230754752494e-06, "loss": 0.5502, "step": 43040 }, { "epoch": 8.1, "grad_norm": 12.728435516357422, "learning_rate": 3.7944664031620552e-06, "loss": 0.6658, "step": 43050 }, { "epoch": 8.1, "grad_norm": 20.162574768066406, "learning_rate": 3.790702051571617e-06, "loss": 0.5913, "step": 43060 }, { "epoch": 8.11, "grad_norm": 0.8549222946166992, "learning_rate": 3.7869376999811786e-06, "loss": 0.2046, "step": 43070 }, { "epoch": 8.11, "grad_norm": 19.518613815307617, "learning_rate": 3.78317334839074e-06, "loss": 0.4695, "step": 43080 }, { "epoch": 8.11, "grad_norm": 8.266684532165527, "learning_rate": 3.7794089968003015e-06, "loss": 0.4126, "step": 43090 }, { "epoch": 8.11, "grad_norm": 21.11467170715332, "learning_rate": 3.7756446452098625e-06, "loss": 0.6289, "step": 43100 }, { "epoch": 8.11, "grad_norm": 25.66094970703125, "learning_rate": 3.7718802936194244e-06, "loss": 0.2911, "step": 43110 }, { "epoch": 8.12, "grad_norm": 23.889019012451172, "learning_rate": 3.768115942028986e-06, "loss": 0.6902, "step": 43120 }, { "epoch": 8.12, "grad_norm": 11.190979957580566, "learning_rate": 3.7643515904385473e-06, "loss": 0.4301, "step": 43130 }, { "epoch": 8.12, "grad_norm": 1.0701913833618164, "learning_rate": 3.7605872388481084e-06, "loss": 0.356, "step": 43140 }, { "epoch": 8.12, "grad_norm": 9.612865447998047, "learning_rate": 3.7568228872576703e-06, "loss": 0.487, "step": 43150 }, { "epoch": 8.12, "grad_norm": 0.3060283064842224, "learning_rate": 3.7530585356672317e-06, "loss": 0.2213, "step": 43160 }, { "epoch": 8.13, "grad_norm": 1.2015522718429565, "learning_rate": 3.749294184076793e-06, "loss": 0.4213, "step": 43170 }, { "epoch": 8.13, "grad_norm": 9.978540420532227, "learning_rate": 3.7455298324863542e-06, "loss": 0.6853, "step": 43180 }, { "epoch": 8.13, "grad_norm": 10.406660079956055, "learning_rate": 3.7417654808959157e-06, "loss": 0.4558, "step": 43190 }, { "epoch": 8.13, "grad_norm": 0.04582936689257622, "learning_rate": 3.7380011293054776e-06, "loss": 0.468, "step": 43200 }, { "epoch": 8.13, "grad_norm": 4.475496292114258, "learning_rate": 3.734236777715039e-06, "loss": 0.5617, "step": 43210 }, { "epoch": 8.13, "grad_norm": 11.952781677246094, "learning_rate": 3.7304724261246005e-06, "loss": 0.4628, "step": 43220 }, { "epoch": 8.14, "grad_norm": 23.401905059814453, "learning_rate": 3.7267080745341615e-06, "loss": 0.3671, "step": 43230 }, { "epoch": 8.14, "grad_norm": 11.260015487670898, "learning_rate": 3.722943722943723e-06, "loss": 0.5275, "step": 43240 }, { "epoch": 8.14, "grad_norm": 7.514461994171143, "learning_rate": 3.719179371353285e-06, "loss": 0.6261, "step": 43250 }, { "epoch": 8.14, "grad_norm": 13.519734382629395, "learning_rate": 3.7154150197628463e-06, "loss": 0.5934, "step": 43260 }, { "epoch": 8.14, "grad_norm": 26.910083770751953, "learning_rate": 3.7116506681724074e-06, "loss": 0.7417, "step": 43270 }, { "epoch": 8.15, "grad_norm": 8.547293663024902, "learning_rate": 3.707886316581969e-06, "loss": 0.2699, "step": 43280 }, { "epoch": 8.15, "grad_norm": 0.09624161571264267, "learning_rate": 3.7041219649915307e-06, "loss": 0.5396, "step": 43290 }, { "epoch": 8.15, "grad_norm": 10.193184852600098, "learning_rate": 3.700357613401092e-06, "loss": 0.6458, "step": 43300 }, { "epoch": 8.15, "grad_norm": 4.950165271759033, "learning_rate": 3.6965932618106532e-06, "loss": 0.3573, "step": 43310 }, { "epoch": 8.15, "grad_norm": 20.236759185791016, "learning_rate": 3.6928289102202147e-06, "loss": 0.5585, "step": 43320 }, { "epoch": 8.16, "grad_norm": 7.94288969039917, "learning_rate": 3.689064558629776e-06, "loss": 0.5142, "step": 43330 }, { "epoch": 8.16, "grad_norm": 4.145201206207275, "learning_rate": 3.685300207039338e-06, "loss": 0.3554, "step": 43340 }, { "epoch": 8.16, "grad_norm": 14.267728805541992, "learning_rate": 3.6815358554488995e-06, "loss": 0.3611, "step": 43350 }, { "epoch": 8.16, "grad_norm": 0.9065011143684387, "learning_rate": 3.6777715038584605e-06, "loss": 0.2953, "step": 43360 }, { "epoch": 8.16, "grad_norm": 9.731773376464844, "learning_rate": 3.674007152268022e-06, "loss": 0.5442, "step": 43370 }, { "epoch": 8.16, "grad_norm": 23.468883514404297, "learning_rate": 3.670242800677584e-06, "loss": 0.6701, "step": 43380 }, { "epoch": 8.17, "grad_norm": 9.353164672851562, "learning_rate": 3.6664784490871453e-06, "loss": 0.4046, "step": 43390 }, { "epoch": 8.17, "grad_norm": 10.205562591552734, "learning_rate": 3.6627140974967064e-06, "loss": 0.3539, "step": 43400 }, { "epoch": 8.17, "grad_norm": 7.568580150604248, "learning_rate": 3.658949745906268e-06, "loss": 0.7413, "step": 43410 }, { "epoch": 8.17, "grad_norm": 0.29625949263572693, "learning_rate": 3.6551853943158293e-06, "loss": 0.3291, "step": 43420 }, { "epoch": 8.17, "grad_norm": 8.475521087646484, "learning_rate": 3.651421042725391e-06, "loss": 0.239, "step": 43430 }, { "epoch": 8.18, "grad_norm": 21.264280319213867, "learning_rate": 3.6476566911349526e-06, "loss": 0.6242, "step": 43440 }, { "epoch": 8.18, "grad_norm": 7.065421104431152, "learning_rate": 3.6438923395445137e-06, "loss": 0.4781, "step": 43450 }, { "epoch": 8.18, "grad_norm": 11.760064125061035, "learning_rate": 3.640127987954075e-06, "loss": 0.2676, "step": 43460 }, { "epoch": 8.18, "grad_norm": 0.0757002905011177, "learning_rate": 3.6363636363636366e-06, "loss": 0.4388, "step": 43470 }, { "epoch": 8.18, "grad_norm": 6.151671886444092, "learning_rate": 3.6325992847731985e-06, "loss": 0.5118, "step": 43480 }, { "epoch": 8.19, "grad_norm": 0.9541302919387817, "learning_rate": 3.6288349331827595e-06, "loss": 0.7873, "step": 43490 }, { "epoch": 8.19, "grad_norm": 0.43069586157798767, "learning_rate": 3.625070581592321e-06, "loss": 0.4353, "step": 43500 }, { "epoch": 8.19, "grad_norm": 20.052248001098633, "learning_rate": 3.6213062300018824e-06, "loss": 0.4903, "step": 43510 }, { "epoch": 8.19, "grad_norm": 8.033267974853516, "learning_rate": 3.6175418784114443e-06, "loss": 0.5027, "step": 43520 }, { "epoch": 8.19, "grad_norm": 12.000602722167969, "learning_rate": 3.613777526821005e-06, "loss": 0.4379, "step": 43530 }, { "epoch": 8.19, "grad_norm": 0.025615401566028595, "learning_rate": 3.610013175230567e-06, "loss": 0.362, "step": 43540 }, { "epoch": 8.2, "grad_norm": 0.06578720360994339, "learning_rate": 3.6062488236401283e-06, "loss": 0.4048, "step": 43550 }, { "epoch": 8.2, "grad_norm": 19.945615768432617, "learning_rate": 3.6024844720496897e-06, "loss": 0.4744, "step": 43560 }, { "epoch": 8.2, "grad_norm": 1.860599398612976, "learning_rate": 3.5987201204592516e-06, "loss": 0.3348, "step": 43570 }, { "epoch": 8.2, "grad_norm": 0.18426920473575592, "learning_rate": 3.5949557688688127e-06, "loss": 0.3012, "step": 43580 }, { "epoch": 8.2, "grad_norm": 18.49991798400879, "learning_rate": 3.591191417278374e-06, "loss": 0.5334, "step": 43590 }, { "epoch": 8.21, "grad_norm": 7.533271312713623, "learning_rate": 3.5874270656879356e-06, "loss": 0.269, "step": 43600 }, { "epoch": 8.21, "grad_norm": 31.758325576782227, "learning_rate": 3.583662714097497e-06, "loss": 0.6352, "step": 43610 }, { "epoch": 8.21, "grad_norm": 24.36396026611328, "learning_rate": 3.579898362507058e-06, "loss": 0.3615, "step": 43620 }, { "epoch": 8.21, "grad_norm": 0.032064277678728104, "learning_rate": 3.57613401091662e-06, "loss": 0.7085, "step": 43630 }, { "epoch": 8.21, "grad_norm": 10.80846881866455, "learning_rate": 3.5723696593261814e-06, "loss": 0.6805, "step": 43640 }, { "epoch": 8.22, "grad_norm": 7.788702964782715, "learning_rate": 3.568605307735743e-06, "loss": 0.6523, "step": 43650 }, { "epoch": 8.22, "grad_norm": 6.078163146972656, "learning_rate": 3.564840956145304e-06, "loss": 0.3491, "step": 43660 }, { "epoch": 8.22, "grad_norm": 34.41270065307617, "learning_rate": 3.561076604554866e-06, "loss": 0.3832, "step": 43670 }, { "epoch": 8.22, "grad_norm": 0.11715062707662582, "learning_rate": 3.5573122529644273e-06, "loss": 0.4516, "step": 43680 }, { "epoch": 8.22, "grad_norm": 25.467695236206055, "learning_rate": 3.5535479013739887e-06, "loss": 0.3377, "step": 43690 }, { "epoch": 8.23, "grad_norm": 0.11090195924043655, "learning_rate": 3.54978354978355e-06, "loss": 0.4038, "step": 43700 }, { "epoch": 8.23, "grad_norm": 12.662226676940918, "learning_rate": 3.5460191981931112e-06, "loss": 0.379, "step": 43710 }, { "epoch": 8.23, "grad_norm": 5.7085652351379395, "learning_rate": 3.542254846602673e-06, "loss": 0.3542, "step": 43720 }, { "epoch": 8.23, "grad_norm": 15.676560401916504, "learning_rate": 3.5384904950122346e-06, "loss": 0.4124, "step": 43730 }, { "epoch": 8.23, "grad_norm": 34.08810806274414, "learning_rate": 3.534726143421796e-06, "loss": 0.3937, "step": 43740 }, { "epoch": 8.23, "grad_norm": 9.684732437133789, "learning_rate": 3.530961791831357e-06, "loss": 0.2242, "step": 43750 }, { "epoch": 8.24, "grad_norm": 8.743865013122559, "learning_rate": 3.5271974402409185e-06, "loss": 0.3141, "step": 43760 }, { "epoch": 8.24, "grad_norm": 9.374190330505371, "learning_rate": 3.5234330886504804e-06, "loss": 0.3993, "step": 43770 }, { "epoch": 8.24, "grad_norm": 6.62257719039917, "learning_rate": 3.519668737060042e-06, "loss": 0.4389, "step": 43780 }, { "epoch": 8.24, "grad_norm": 126.70000457763672, "learning_rate": 3.515904385469603e-06, "loss": 0.3082, "step": 43790 }, { "epoch": 8.24, "grad_norm": 16.551645278930664, "learning_rate": 3.5121400338791644e-06, "loss": 0.3682, "step": 43800 }, { "epoch": 8.25, "grad_norm": 0.19622400403022766, "learning_rate": 3.5083756822887262e-06, "loss": 0.389, "step": 43810 }, { "epoch": 8.25, "grad_norm": 11.304600715637207, "learning_rate": 3.5046113306982877e-06, "loss": 0.4126, "step": 43820 }, { "epoch": 8.25, "grad_norm": 13.223969459533691, "learning_rate": 3.500846979107849e-06, "loss": 0.3159, "step": 43830 }, { "epoch": 8.25, "grad_norm": 0.057929929345846176, "learning_rate": 3.49708262751741e-06, "loss": 0.6223, "step": 43840 }, { "epoch": 8.25, "grad_norm": 28.158432006835938, "learning_rate": 3.4933182759269717e-06, "loss": 0.3654, "step": 43850 }, { "epoch": 8.26, "grad_norm": 0.9253088235855103, "learning_rate": 3.4895539243365336e-06, "loss": 0.5879, "step": 43860 }, { "epoch": 8.26, "grad_norm": 25.360929489135742, "learning_rate": 3.485789572746095e-06, "loss": 0.5482, "step": 43870 }, { "epoch": 8.26, "grad_norm": 25.015487670898438, "learning_rate": 3.482025221155656e-06, "loss": 0.5676, "step": 43880 }, { "epoch": 8.26, "grad_norm": 25.207012176513672, "learning_rate": 3.4782608695652175e-06, "loss": 0.5568, "step": 43890 }, { "epoch": 8.26, "grad_norm": 29.287954330444336, "learning_rate": 3.474496517974779e-06, "loss": 0.5971, "step": 43900 }, { "epoch": 8.26, "grad_norm": 0.47074535489082336, "learning_rate": 3.470732166384341e-06, "loss": 0.3605, "step": 43910 }, { "epoch": 8.27, "grad_norm": 7.249814510345459, "learning_rate": 3.466967814793902e-06, "loss": 0.4114, "step": 43920 }, { "epoch": 8.27, "grad_norm": 16.79249382019043, "learning_rate": 3.4632034632034634e-06, "loss": 0.4512, "step": 43930 }, { "epoch": 8.27, "grad_norm": 20.726879119873047, "learning_rate": 3.459439111613025e-06, "loss": 0.4038, "step": 43940 }, { "epoch": 8.27, "grad_norm": 46.845523834228516, "learning_rate": 3.4556747600225867e-06, "loss": 0.4775, "step": 43950 }, { "epoch": 8.27, "grad_norm": 18.067689895629883, "learning_rate": 3.451910408432148e-06, "loss": 0.4175, "step": 43960 }, { "epoch": 8.28, "grad_norm": 42.61848831176758, "learning_rate": 3.448146056841709e-06, "loss": 0.6115, "step": 43970 }, { "epoch": 8.28, "grad_norm": 6.6008687019348145, "learning_rate": 3.4443817052512707e-06, "loss": 0.3097, "step": 43980 }, { "epoch": 8.28, "grad_norm": 27.931385040283203, "learning_rate": 3.440617353660832e-06, "loss": 0.3939, "step": 43990 }, { "epoch": 8.28, "grad_norm": 17.77448844909668, "learning_rate": 3.436853002070394e-06, "loss": 0.299, "step": 44000 }, { "epoch": 8.28, "grad_norm": 8.479036331176758, "learning_rate": 3.433088650479955e-06, "loss": 0.4371, "step": 44010 }, { "epoch": 8.29, "grad_norm": 12.390946388244629, "learning_rate": 3.4293242988895165e-06, "loss": 0.2047, "step": 44020 }, { "epoch": 8.29, "grad_norm": 12.151750564575195, "learning_rate": 3.425559947299078e-06, "loss": 0.5079, "step": 44030 }, { "epoch": 8.29, "grad_norm": 17.92153549194336, "learning_rate": 3.4217955957086394e-06, "loss": 0.543, "step": 44040 }, { "epoch": 8.29, "grad_norm": 8.07863998413086, "learning_rate": 3.4180312441182005e-06, "loss": 0.5345, "step": 44050 }, { "epoch": 8.29, "grad_norm": 0.12737701833248138, "learning_rate": 3.4142668925277623e-06, "loss": 0.7064, "step": 44060 }, { "epoch": 8.29, "grad_norm": 10.987157821655273, "learning_rate": 3.410502540937324e-06, "loss": 0.6823, "step": 44070 }, { "epoch": 8.3, "grad_norm": 12.050490379333496, "learning_rate": 3.4067381893468853e-06, "loss": 0.5315, "step": 44080 }, { "epoch": 8.3, "grad_norm": 17.466691970825195, "learning_rate": 3.402973837756447e-06, "loss": 0.5675, "step": 44090 }, { "epoch": 8.3, "grad_norm": 19.85411262512207, "learning_rate": 3.399209486166008e-06, "loss": 0.3984, "step": 44100 }, { "epoch": 8.3, "grad_norm": 12.507843971252441, "learning_rate": 3.3954451345755696e-06, "loss": 0.413, "step": 44110 }, { "epoch": 8.3, "grad_norm": 3.8064775466918945, "learning_rate": 3.391680782985131e-06, "loss": 0.4874, "step": 44120 }, { "epoch": 8.31, "grad_norm": 20.31875228881836, "learning_rate": 3.3879164313946926e-06, "loss": 0.5857, "step": 44130 }, { "epoch": 8.31, "grad_norm": 0.29422613978385925, "learning_rate": 3.3841520798042536e-06, "loss": 0.3885, "step": 44140 }, { "epoch": 8.31, "grad_norm": 13.232914924621582, "learning_rate": 3.3803877282138155e-06, "loss": 0.2175, "step": 44150 }, { "epoch": 8.31, "grad_norm": 0.48562586307525635, "learning_rate": 3.376623376623377e-06, "loss": 0.2482, "step": 44160 }, { "epoch": 8.31, "grad_norm": 18.276710510253906, "learning_rate": 3.3728590250329384e-06, "loss": 0.5239, "step": 44170 }, { "epoch": 8.32, "grad_norm": 1.2684160470962524, "learning_rate": 3.3690946734425e-06, "loss": 0.2591, "step": 44180 }, { "epoch": 8.32, "grad_norm": 11.328255653381348, "learning_rate": 3.365330321852061e-06, "loss": 0.3205, "step": 44190 }, { "epoch": 8.32, "grad_norm": 8.913219451904297, "learning_rate": 3.361565970261623e-06, "loss": 0.4611, "step": 44200 }, { "epoch": 8.32, "grad_norm": 1.8168461322784424, "learning_rate": 3.3578016186711843e-06, "loss": 0.2763, "step": 44210 }, { "epoch": 8.32, "grad_norm": 9.876787185668945, "learning_rate": 3.3540372670807457e-06, "loss": 0.3158, "step": 44220 }, { "epoch": 8.32, "grad_norm": 26.141977310180664, "learning_rate": 3.3502729154903068e-06, "loss": 0.519, "step": 44230 }, { "epoch": 8.33, "grad_norm": 0.03895975649356842, "learning_rate": 3.3465085638998686e-06, "loss": 0.3647, "step": 44240 }, { "epoch": 8.33, "grad_norm": 28.49066162109375, "learning_rate": 3.34274421230943e-06, "loss": 0.5051, "step": 44250 }, { "epoch": 8.33, "grad_norm": 48.42610168457031, "learning_rate": 3.3389798607189916e-06, "loss": 0.5039, "step": 44260 }, { "epoch": 8.33, "grad_norm": 33.13661575317383, "learning_rate": 3.3352155091285526e-06, "loss": 0.6969, "step": 44270 }, { "epoch": 8.33, "grad_norm": 24.356779098510742, "learning_rate": 3.331451157538114e-06, "loss": 0.646, "step": 44280 }, { "epoch": 8.34, "grad_norm": 26.351469039916992, "learning_rate": 3.327686805947676e-06, "loss": 0.4839, "step": 44290 }, { "epoch": 8.34, "grad_norm": 3.9066474437713623, "learning_rate": 3.3239224543572374e-06, "loss": 0.4166, "step": 44300 }, { "epoch": 8.34, "grad_norm": 0.12449698895215988, "learning_rate": 3.320158102766799e-06, "loss": 0.213, "step": 44310 }, { "epoch": 8.34, "grad_norm": 6.33676290512085, "learning_rate": 3.31639375117636e-06, "loss": 0.3959, "step": 44320 }, { "epoch": 8.34, "grad_norm": 12.31187629699707, "learning_rate": 3.3126293995859214e-06, "loss": 0.6402, "step": 44330 }, { "epoch": 8.35, "grad_norm": 9.686624526977539, "learning_rate": 3.3088650479954832e-06, "loss": 0.4905, "step": 44340 }, { "epoch": 8.35, "grad_norm": 15.566709518432617, "learning_rate": 3.3051006964050447e-06, "loss": 0.3201, "step": 44350 }, { "epoch": 8.35, "grad_norm": 19.786592483520508, "learning_rate": 3.3013363448146057e-06, "loss": 0.4149, "step": 44360 }, { "epoch": 8.35, "grad_norm": 43.112449645996094, "learning_rate": 3.297571993224167e-06, "loss": 0.5619, "step": 44370 }, { "epoch": 8.35, "grad_norm": 32.787532806396484, "learning_rate": 3.293807641633729e-06, "loss": 0.4769, "step": 44380 }, { "epoch": 8.35, "grad_norm": 21.896869659423828, "learning_rate": 3.2900432900432905e-06, "loss": 0.3075, "step": 44390 }, { "epoch": 8.36, "grad_norm": 2.6966474056243896, "learning_rate": 3.2862789384528516e-06, "loss": 0.6221, "step": 44400 }, { "epoch": 8.36, "grad_norm": 0.19632263481616974, "learning_rate": 3.282514586862413e-06, "loss": 0.4079, "step": 44410 }, { "epoch": 8.36, "grad_norm": 23.635337829589844, "learning_rate": 3.2787502352719745e-06, "loss": 0.3567, "step": 44420 }, { "epoch": 8.36, "grad_norm": 13.236377716064453, "learning_rate": 3.2749858836815364e-06, "loss": 0.3691, "step": 44430 }, { "epoch": 8.36, "grad_norm": 19.794361114501953, "learning_rate": 3.271221532091098e-06, "loss": 0.5006, "step": 44440 }, { "epoch": 8.37, "grad_norm": 0.1621648520231247, "learning_rate": 3.267457180500659e-06, "loss": 0.6175, "step": 44450 }, { "epoch": 8.37, "grad_norm": 0.08422687649726868, "learning_rate": 3.2636928289102203e-06, "loss": 0.4767, "step": 44460 }, { "epoch": 8.37, "grad_norm": 6.153237819671631, "learning_rate": 3.259928477319782e-06, "loss": 0.3885, "step": 44470 }, { "epoch": 8.37, "grad_norm": 18.663501739501953, "learning_rate": 3.2561641257293437e-06, "loss": 0.3928, "step": 44480 }, { "epoch": 8.37, "grad_norm": 12.590583801269531, "learning_rate": 3.2523997741389047e-06, "loss": 0.6693, "step": 44490 }, { "epoch": 8.38, "grad_norm": 0.5464389324188232, "learning_rate": 3.248635422548466e-06, "loss": 0.4404, "step": 44500 }, { "epoch": 8.38, "grad_norm": 2.3248085975646973, "learning_rate": 3.2448710709580277e-06, "loss": 0.6251, "step": 44510 }, { "epoch": 8.38, "grad_norm": 27.805416107177734, "learning_rate": 3.2411067193675895e-06, "loss": 0.1487, "step": 44520 }, { "epoch": 8.38, "grad_norm": 31.039339065551758, "learning_rate": 3.2373423677771506e-06, "loss": 0.4554, "step": 44530 }, { "epoch": 8.38, "grad_norm": 20.5913143157959, "learning_rate": 3.233578016186712e-06, "loss": 0.3836, "step": 44540 }, { "epoch": 8.39, "grad_norm": 43.38515853881836, "learning_rate": 3.2298136645962735e-06, "loss": 0.4318, "step": 44550 }, { "epoch": 8.39, "grad_norm": 12.96084976196289, "learning_rate": 3.226049313005835e-06, "loss": 0.3663, "step": 44560 }, { "epoch": 8.39, "grad_norm": 8.591901779174805, "learning_rate": 3.222284961415397e-06, "loss": 0.4199, "step": 44570 }, { "epoch": 8.39, "grad_norm": 14.201598167419434, "learning_rate": 3.218520609824958e-06, "loss": 0.6438, "step": 44580 }, { "epoch": 8.39, "grad_norm": 20.398212432861328, "learning_rate": 3.2147562582345193e-06, "loss": 0.2309, "step": 44590 }, { "epoch": 8.39, "grad_norm": 4.721048355102539, "learning_rate": 3.210991906644081e-06, "loss": 0.5085, "step": 44600 }, { "epoch": 8.4, "grad_norm": 11.639357566833496, "learning_rate": 3.2072275550536427e-06, "loss": 0.5171, "step": 44610 }, { "epoch": 8.4, "grad_norm": 9.085111618041992, "learning_rate": 3.2034632034632033e-06, "loss": 0.4878, "step": 44620 }, { "epoch": 8.4, "grad_norm": 14.544848442077637, "learning_rate": 3.199698851872765e-06, "loss": 0.4514, "step": 44630 }, { "epoch": 8.4, "grad_norm": 3.9646859169006348, "learning_rate": 3.1959345002823266e-06, "loss": 0.4601, "step": 44640 }, { "epoch": 8.4, "grad_norm": 25.853208541870117, "learning_rate": 3.192170148691888e-06, "loss": 0.5069, "step": 44650 }, { "epoch": 8.41, "grad_norm": 0.13335028290748596, "learning_rate": 3.188405797101449e-06, "loss": 0.4518, "step": 44660 }, { "epoch": 8.41, "grad_norm": 4.05998420715332, "learning_rate": 3.184641445511011e-06, "loss": 0.2396, "step": 44670 }, { "epoch": 8.41, "grad_norm": 15.973206520080566, "learning_rate": 3.1808770939205725e-06, "loss": 0.3994, "step": 44680 }, { "epoch": 8.41, "grad_norm": 5.575290679931641, "learning_rate": 3.177112742330134e-06, "loss": 0.8244, "step": 44690 }, { "epoch": 8.41, "grad_norm": 78.56654357910156, "learning_rate": 3.1733483907396954e-06, "loss": 0.8038, "step": 44700 }, { "epoch": 8.42, "grad_norm": 0.16877728700637817, "learning_rate": 3.1695840391492564e-06, "loss": 0.3286, "step": 44710 }, { "epoch": 8.42, "grad_norm": 9.68952751159668, "learning_rate": 3.1658196875588183e-06, "loss": 0.5651, "step": 44720 }, { "epoch": 8.42, "grad_norm": 47.49652862548828, "learning_rate": 3.1620553359683798e-06, "loss": 0.3635, "step": 44730 }, { "epoch": 8.42, "grad_norm": 19.1754207611084, "learning_rate": 3.1582909843779412e-06, "loss": 0.5277, "step": 44740 }, { "epoch": 8.42, "grad_norm": 21.708391189575195, "learning_rate": 3.1545266327875023e-06, "loss": 0.4516, "step": 44750 }, { "epoch": 8.42, "grad_norm": 2.858497381210327, "learning_rate": 3.1507622811970637e-06, "loss": 0.5113, "step": 44760 }, { "epoch": 8.43, "grad_norm": 14.189351081848145, "learning_rate": 3.1469979296066256e-06, "loss": 0.3809, "step": 44770 }, { "epoch": 8.43, "grad_norm": 15.705812454223633, "learning_rate": 3.143233578016187e-06, "loss": 0.4126, "step": 44780 }, { "epoch": 8.43, "grad_norm": 66.25598907470703, "learning_rate": 3.1394692264257485e-06, "loss": 0.6253, "step": 44790 }, { "epoch": 8.43, "grad_norm": 7.330162048339844, "learning_rate": 3.1357048748353096e-06, "loss": 0.5187, "step": 44800 }, { "epoch": 8.43, "grad_norm": 5.743100166320801, "learning_rate": 3.1319405232448715e-06, "loss": 0.4984, "step": 44810 }, { "epoch": 8.44, "grad_norm": 8.980222702026367, "learning_rate": 3.128176171654433e-06, "loss": 0.4996, "step": 44820 }, { "epoch": 8.44, "grad_norm": 0.5218448042869568, "learning_rate": 3.1244118200639944e-06, "loss": 0.5344, "step": 44830 }, { "epoch": 8.44, "grad_norm": 1.2095797061920166, "learning_rate": 3.1206474684735554e-06, "loss": 0.2706, "step": 44840 }, { "epoch": 8.44, "grad_norm": 44.43700408935547, "learning_rate": 3.116883116883117e-06, "loss": 0.4895, "step": 44850 }, { "epoch": 8.44, "grad_norm": 5.458186626434326, "learning_rate": 3.1131187652926788e-06, "loss": 0.3371, "step": 44860 }, { "epoch": 8.45, "grad_norm": 6.718016624450684, "learning_rate": 3.1093544137022402e-06, "loss": 0.2024, "step": 44870 }, { "epoch": 8.45, "grad_norm": 0.143607959151268, "learning_rate": 3.1055900621118013e-06, "loss": 0.2257, "step": 44880 }, { "epoch": 8.45, "grad_norm": 10.796164512634277, "learning_rate": 3.1018257105213627e-06, "loss": 0.6509, "step": 44890 }, { "epoch": 8.45, "grad_norm": 9.629598617553711, "learning_rate": 3.0980613589309246e-06, "loss": 0.5034, "step": 44900 }, { "epoch": 8.45, "grad_norm": 0.05811280012130737, "learning_rate": 3.094297007340486e-06, "loss": 0.4932, "step": 44910 }, { "epoch": 8.45, "grad_norm": 9.126168251037598, "learning_rate": 3.0905326557500475e-06, "loss": 0.4464, "step": 44920 }, { "epoch": 8.46, "grad_norm": 15.563265800476074, "learning_rate": 3.0867683041596086e-06, "loss": 0.4157, "step": 44930 }, { "epoch": 8.46, "grad_norm": 14.518123626708984, "learning_rate": 3.08300395256917e-06, "loss": 0.3727, "step": 44940 }, { "epoch": 8.46, "grad_norm": 5.1271138191223145, "learning_rate": 3.079239600978732e-06, "loss": 0.6179, "step": 44950 }, { "epoch": 8.46, "grad_norm": 0.39322352409362793, "learning_rate": 3.0754752493882934e-06, "loss": 0.2365, "step": 44960 }, { "epoch": 8.46, "grad_norm": 3.110018014907837, "learning_rate": 3.0717108977978544e-06, "loss": 0.4637, "step": 44970 }, { "epoch": 8.47, "grad_norm": 0.4475475251674652, "learning_rate": 3.067946546207416e-06, "loss": 0.4693, "step": 44980 }, { "epoch": 8.47, "grad_norm": 31.466312408447266, "learning_rate": 3.0641821946169773e-06, "loss": 0.6135, "step": 44990 }, { "epoch": 8.47, "grad_norm": 8.0161714553833, "learning_rate": 3.0604178430265392e-06, "loss": 0.4308, "step": 45000 }, { "epoch": 8.47, "grad_norm": 0.21481114625930786, "learning_rate": 3.0566534914361003e-06, "loss": 0.3452, "step": 45010 }, { "epoch": 8.47, "grad_norm": 6.250048637390137, "learning_rate": 3.0528891398456617e-06, "loss": 0.3198, "step": 45020 }, { "epoch": 8.48, "grad_norm": 14.705049514770508, "learning_rate": 3.049124788255223e-06, "loss": 0.5674, "step": 45030 }, { "epoch": 8.48, "grad_norm": 13.137300491333008, "learning_rate": 3.045360436664785e-06, "loss": 0.4053, "step": 45040 }, { "epoch": 8.48, "grad_norm": 5.976210594177246, "learning_rate": 3.0415960850743465e-06, "loss": 0.5598, "step": 45050 }, { "epoch": 8.48, "grad_norm": 23.372941970825195, "learning_rate": 3.0378317334839076e-06, "loss": 0.9162, "step": 45060 }, { "epoch": 8.48, "grad_norm": 16.597759246826172, "learning_rate": 3.034067381893469e-06, "loss": 0.4671, "step": 45070 }, { "epoch": 8.48, "grad_norm": 21.51179313659668, "learning_rate": 3.0303030303030305e-06, "loss": 0.5754, "step": 45080 }, { "epoch": 8.49, "grad_norm": 1.9006558656692505, "learning_rate": 3.0265386787125924e-06, "loss": 0.4043, "step": 45090 }, { "epoch": 8.49, "grad_norm": 12.068243980407715, "learning_rate": 3.0227743271221534e-06, "loss": 0.3859, "step": 45100 }, { "epoch": 8.49, "grad_norm": 7.060889720916748, "learning_rate": 3.019009975531715e-06, "loss": 0.3461, "step": 45110 }, { "epoch": 8.49, "grad_norm": 8.987565040588379, "learning_rate": 3.0152456239412763e-06, "loss": 0.5619, "step": 45120 }, { "epoch": 8.49, "grad_norm": 7.495594501495361, "learning_rate": 3.011481272350838e-06, "loss": 0.2997, "step": 45130 }, { "epoch": 8.5, "grad_norm": 19.95897102355957, "learning_rate": 3.007716920760399e-06, "loss": 0.5481, "step": 45140 }, { "epoch": 8.5, "grad_norm": 13.875322341918945, "learning_rate": 3.0039525691699607e-06, "loss": 0.4323, "step": 45150 }, { "epoch": 8.5, "grad_norm": 13.79182243347168, "learning_rate": 3.000188217579522e-06, "loss": 0.2615, "step": 45160 }, { "epoch": 8.5, "grad_norm": 1.8631694316864014, "learning_rate": 2.9964238659890836e-06, "loss": 0.7032, "step": 45170 }, { "epoch": 8.5, "grad_norm": 5.367626190185547, "learning_rate": 2.9926595143986455e-06, "loss": 0.509, "step": 45180 }, { "epoch": 8.51, "grad_norm": 6.53688907623291, "learning_rate": 2.9888951628082066e-06, "loss": 0.4337, "step": 45190 }, { "epoch": 8.51, "grad_norm": 17.974353790283203, "learning_rate": 2.985130811217768e-06, "loss": 0.7414, "step": 45200 }, { "epoch": 8.51, "grad_norm": 17.594377517700195, "learning_rate": 2.9813664596273295e-06, "loss": 0.5211, "step": 45210 }, { "epoch": 8.51, "grad_norm": 29.985471725463867, "learning_rate": 2.977602108036891e-06, "loss": 0.6543, "step": 45220 }, { "epoch": 8.51, "grad_norm": 0.37220674753189087, "learning_rate": 2.973837756446452e-06, "loss": 0.183, "step": 45230 }, { "epoch": 8.51, "grad_norm": 51.8664436340332, "learning_rate": 2.970073404856014e-06, "loss": 0.4546, "step": 45240 }, { "epoch": 8.52, "grad_norm": 0.12566237151622772, "learning_rate": 2.9663090532655753e-06, "loss": 0.6928, "step": 45250 }, { "epoch": 8.52, "grad_norm": 27.250438690185547, "learning_rate": 2.9625447016751368e-06, "loss": 0.6348, "step": 45260 }, { "epoch": 8.52, "grad_norm": 1.0455703735351562, "learning_rate": 2.958780350084698e-06, "loss": 0.51, "step": 45270 }, { "epoch": 8.52, "grad_norm": 18.566070556640625, "learning_rate": 2.9550159984942593e-06, "loss": 0.4247, "step": 45280 }, { "epoch": 8.52, "grad_norm": 22.149303436279297, "learning_rate": 2.951251646903821e-06, "loss": 0.5396, "step": 45290 }, { "epoch": 8.53, "grad_norm": 7.4732561111450195, "learning_rate": 2.9474872953133826e-06, "loss": 0.6225, "step": 45300 }, { "epoch": 8.53, "grad_norm": 21.741275787353516, "learning_rate": 2.943722943722944e-06, "loss": 0.7131, "step": 45310 }, { "epoch": 8.53, "grad_norm": 3.925509214401245, "learning_rate": 2.939958592132505e-06, "loss": 0.3896, "step": 45320 }, { "epoch": 8.53, "grad_norm": 0.22670765221118927, "learning_rate": 2.936194240542067e-06, "loss": 0.2686, "step": 45330 }, { "epoch": 8.53, "grad_norm": 9.519122123718262, "learning_rate": 2.9324298889516285e-06, "loss": 0.5187, "step": 45340 }, { "epoch": 8.54, "grad_norm": 1.0992772579193115, "learning_rate": 2.92866553736119e-06, "loss": 0.2774, "step": 45350 }, { "epoch": 8.54, "grad_norm": 16.670631408691406, "learning_rate": 2.924901185770751e-06, "loss": 0.7732, "step": 45360 }, { "epoch": 8.54, "grad_norm": 0.2704927623271942, "learning_rate": 2.9211368341803124e-06, "loss": 0.3941, "step": 45370 }, { "epoch": 8.54, "grad_norm": 0.42437541484832764, "learning_rate": 2.9173724825898743e-06, "loss": 0.689, "step": 45380 }, { "epoch": 8.54, "grad_norm": 1.1275883913040161, "learning_rate": 2.9136081309994358e-06, "loss": 0.3139, "step": 45390 }, { "epoch": 8.55, "grad_norm": 19.983110427856445, "learning_rate": 2.9098437794089972e-06, "loss": 0.5787, "step": 45400 }, { "epoch": 8.55, "grad_norm": 17.412151336669922, "learning_rate": 2.9060794278185583e-06, "loss": 0.4265, "step": 45410 }, { "epoch": 8.55, "grad_norm": 1.204263687133789, "learning_rate": 2.9023150762281197e-06, "loss": 0.8414, "step": 45420 }, { "epoch": 8.55, "grad_norm": 18.92685890197754, "learning_rate": 2.8985507246376816e-06, "loss": 0.5895, "step": 45430 }, { "epoch": 8.55, "grad_norm": 20.907590866088867, "learning_rate": 2.894786373047243e-06, "loss": 0.6537, "step": 45440 }, { "epoch": 8.55, "grad_norm": 0.030812498182058334, "learning_rate": 2.891022021456804e-06, "loss": 0.8201, "step": 45450 }, { "epoch": 8.56, "grad_norm": 19.68631935119629, "learning_rate": 2.8872576698663656e-06, "loss": 0.27, "step": 45460 }, { "epoch": 8.56, "grad_norm": 0.13562491536140442, "learning_rate": 2.8834933182759275e-06, "loss": 0.7518, "step": 45470 }, { "epoch": 8.56, "grad_norm": 13.175758361816406, "learning_rate": 2.879728966685489e-06, "loss": 0.2638, "step": 45480 }, { "epoch": 8.56, "grad_norm": 38.72014617919922, "learning_rate": 2.87596461509505e-06, "loss": 0.6186, "step": 45490 }, { "epoch": 8.56, "grad_norm": 5.701416969299316, "learning_rate": 2.8722002635046114e-06, "loss": 0.4767, "step": 45500 }, { "epoch": 8.57, "grad_norm": 6.803305625915527, "learning_rate": 2.868435911914173e-06, "loss": 0.3493, "step": 45510 }, { "epoch": 8.57, "grad_norm": 34.136417388916016, "learning_rate": 2.8646715603237348e-06, "loss": 0.3706, "step": 45520 }, { "epoch": 8.57, "grad_norm": 15.489117622375488, "learning_rate": 2.8609072087332962e-06, "loss": 0.5064, "step": 45530 }, { "epoch": 8.57, "grad_norm": 5.791557788848877, "learning_rate": 2.8571428571428573e-06, "loss": 0.4265, "step": 45540 }, { "epoch": 8.57, "grad_norm": 5.909194469451904, "learning_rate": 2.8533785055524187e-06, "loss": 0.4563, "step": 45550 }, { "epoch": 8.58, "grad_norm": 21.65177345275879, "learning_rate": 2.84961415396198e-06, "loss": 0.3891, "step": 45560 }, { "epoch": 8.58, "grad_norm": 3.773228645324707, "learning_rate": 2.845849802371542e-06, "loss": 0.501, "step": 45570 }, { "epoch": 8.58, "grad_norm": 9.513245582580566, "learning_rate": 2.842085450781103e-06, "loss": 0.4636, "step": 45580 }, { "epoch": 8.58, "grad_norm": 8.289735794067383, "learning_rate": 2.8383210991906646e-06, "loss": 0.4985, "step": 45590 }, { "epoch": 8.58, "grad_norm": 0.9724116921424866, "learning_rate": 2.834556747600226e-06, "loss": 0.5116, "step": 45600 }, { "epoch": 8.58, "grad_norm": 21.775800704956055, "learning_rate": 2.830792396009788e-06, "loss": 0.5441, "step": 45610 }, { "epoch": 8.59, "grad_norm": 48.54513931274414, "learning_rate": 2.827028044419349e-06, "loss": 0.3705, "step": 45620 }, { "epoch": 8.59, "grad_norm": 0.5952854752540588, "learning_rate": 2.8232636928289104e-06, "loss": 0.3866, "step": 45630 }, { "epoch": 8.59, "grad_norm": 12.538860321044922, "learning_rate": 2.819499341238472e-06, "loss": 0.4496, "step": 45640 }, { "epoch": 8.59, "grad_norm": 46.39492416381836, "learning_rate": 2.8157349896480333e-06, "loss": 0.3195, "step": 45650 }, { "epoch": 8.59, "grad_norm": 13.821070671081543, "learning_rate": 2.811970638057595e-06, "loss": 0.2962, "step": 45660 }, { "epoch": 8.6, "grad_norm": 6.4973225593566895, "learning_rate": 2.8082062864671562e-06, "loss": 0.4642, "step": 45670 }, { "epoch": 8.6, "grad_norm": 9.997817039489746, "learning_rate": 2.8044419348767177e-06, "loss": 0.2811, "step": 45680 }, { "epoch": 8.6, "grad_norm": 25.398149490356445, "learning_rate": 2.800677583286279e-06, "loss": 0.5049, "step": 45690 }, { "epoch": 8.6, "grad_norm": 22.4417667388916, "learning_rate": 2.7969132316958406e-06, "loss": 0.5439, "step": 45700 }, { "epoch": 8.6, "grad_norm": 19.63978385925293, "learning_rate": 2.7931488801054017e-06, "loss": 0.5902, "step": 45710 }, { "epoch": 8.61, "grad_norm": 0.3071557581424713, "learning_rate": 2.7893845285149635e-06, "loss": 0.3403, "step": 45720 }, { "epoch": 8.61, "grad_norm": 35.829811096191406, "learning_rate": 2.785620176924525e-06, "loss": 0.3616, "step": 45730 }, { "epoch": 8.61, "grad_norm": 14.984370231628418, "learning_rate": 2.7818558253340865e-06, "loss": 0.7215, "step": 45740 }, { "epoch": 8.61, "grad_norm": 8.33390998840332, "learning_rate": 2.7780914737436475e-06, "loss": 0.6237, "step": 45750 }, { "epoch": 8.61, "grad_norm": 22.571699142456055, "learning_rate": 2.7743271221532094e-06, "loss": 0.6739, "step": 45760 }, { "epoch": 8.61, "grad_norm": 17.001461029052734, "learning_rate": 2.770562770562771e-06, "loss": 0.2547, "step": 45770 }, { "epoch": 8.62, "grad_norm": 19.975543975830078, "learning_rate": 2.7667984189723323e-06, "loss": 0.5098, "step": 45780 }, { "epoch": 8.62, "grad_norm": 8.409460067749023, "learning_rate": 2.7630340673818938e-06, "loss": 0.407, "step": 45790 }, { "epoch": 8.62, "grad_norm": 11.9586820602417, "learning_rate": 2.759269715791455e-06, "loss": 0.6268, "step": 45800 }, { "epoch": 8.62, "grad_norm": 25.05490493774414, "learning_rate": 2.7555053642010167e-06, "loss": 0.3221, "step": 45810 }, { "epoch": 8.62, "grad_norm": 16.877796173095703, "learning_rate": 2.751741012610578e-06, "loss": 0.3368, "step": 45820 }, { "epoch": 8.63, "grad_norm": 37.895652770996094, "learning_rate": 2.7479766610201396e-06, "loss": 0.5498, "step": 45830 }, { "epoch": 8.63, "grad_norm": 16.549964904785156, "learning_rate": 2.7442123094297007e-06, "loss": 0.6585, "step": 45840 }, { "epoch": 8.63, "grad_norm": 23.333738327026367, "learning_rate": 2.740447957839262e-06, "loss": 0.4209, "step": 45850 }, { "epoch": 8.63, "grad_norm": 17.801307678222656, "learning_rate": 2.736683606248824e-06, "loss": 0.3118, "step": 45860 }, { "epoch": 8.63, "grad_norm": 4.754730224609375, "learning_rate": 2.7329192546583855e-06, "loss": 0.3706, "step": 45870 }, { "epoch": 8.64, "grad_norm": 12.892595291137695, "learning_rate": 2.7291549030679465e-06, "loss": 0.5805, "step": 45880 }, { "epoch": 8.64, "grad_norm": 16.939796447753906, "learning_rate": 2.725390551477508e-06, "loss": 0.5106, "step": 45890 }, { "epoch": 8.64, "grad_norm": 16.29544448852539, "learning_rate": 2.72162619988707e-06, "loss": 0.4779, "step": 45900 }, { "epoch": 8.64, "grad_norm": 28.001670837402344, "learning_rate": 2.7178618482966313e-06, "loss": 0.1302, "step": 45910 }, { "epoch": 8.64, "grad_norm": 0.1437859982252121, "learning_rate": 2.7140974967061928e-06, "loss": 0.4312, "step": 45920 }, { "epoch": 8.64, "grad_norm": 18.679183959960938, "learning_rate": 2.710333145115754e-06, "loss": 0.2055, "step": 45930 }, { "epoch": 8.65, "grad_norm": 19.28763198852539, "learning_rate": 2.7065687935253153e-06, "loss": 0.828, "step": 45940 }, { "epoch": 8.65, "grad_norm": 77.62793731689453, "learning_rate": 2.702804441934877e-06, "loss": 0.5013, "step": 45950 }, { "epoch": 8.65, "grad_norm": 21.08930015563965, "learning_rate": 2.6990400903444386e-06, "loss": 0.5488, "step": 45960 }, { "epoch": 8.65, "grad_norm": 25.826997756958008, "learning_rate": 2.6952757387539996e-06, "loss": 0.3493, "step": 45970 }, { "epoch": 8.65, "grad_norm": 4.74186897277832, "learning_rate": 2.691511387163561e-06, "loss": 0.5234, "step": 45980 }, { "epoch": 8.66, "grad_norm": 8.202046394348145, "learning_rate": 2.687747035573123e-06, "loss": 0.5571, "step": 45990 }, { "epoch": 8.66, "grad_norm": 19.04949188232422, "learning_rate": 2.6839826839826844e-06, "loss": 0.4959, "step": 46000 }, { "epoch": 8.66, "grad_norm": 10.953725814819336, "learning_rate": 2.6802183323922455e-06, "loss": 0.5771, "step": 46010 }, { "epoch": 8.66, "grad_norm": 1.0045043230056763, "learning_rate": 2.676453980801807e-06, "loss": 0.3382, "step": 46020 }, { "epoch": 8.66, "grad_norm": 0.04298378899693489, "learning_rate": 2.6726896292113684e-06, "loss": 0.4584, "step": 46030 }, { "epoch": 8.67, "grad_norm": 0.0919141098856926, "learning_rate": 2.6689252776209303e-06, "loss": 0.5683, "step": 46040 }, { "epoch": 8.67, "grad_norm": 27.5792293548584, "learning_rate": 2.6651609260304918e-06, "loss": 0.5817, "step": 46050 }, { "epoch": 8.67, "grad_norm": 5.107207775115967, "learning_rate": 2.661396574440053e-06, "loss": 0.3344, "step": 46060 }, { "epoch": 8.67, "grad_norm": 0.08540444076061249, "learning_rate": 2.6576322228496142e-06, "loss": 0.4315, "step": 46070 }, { "epoch": 8.67, "grad_norm": 6.959580898284912, "learning_rate": 2.6538678712591757e-06, "loss": 0.7812, "step": 46080 }, { "epoch": 8.67, "grad_norm": 21.82856559753418, "learning_rate": 2.6501035196687376e-06, "loss": 0.4448, "step": 46090 }, { "epoch": 8.68, "grad_norm": 17.987428665161133, "learning_rate": 2.6463391680782986e-06, "loss": 0.5252, "step": 46100 }, { "epoch": 8.68, "grad_norm": 29.11836814880371, "learning_rate": 2.64257481648786e-06, "loss": 0.5055, "step": 46110 }, { "epoch": 8.68, "grad_norm": 10.755464553833008, "learning_rate": 2.6388104648974216e-06, "loss": 0.2324, "step": 46120 }, { "epoch": 8.68, "grad_norm": 20.402507781982422, "learning_rate": 2.6350461133069834e-06, "loss": 0.3029, "step": 46130 }, { "epoch": 8.68, "grad_norm": 4.016020774841309, "learning_rate": 2.631281761716545e-06, "loss": 0.3907, "step": 46140 }, { "epoch": 8.69, "grad_norm": 14.392875671386719, "learning_rate": 2.627517410126106e-06, "loss": 0.5084, "step": 46150 }, { "epoch": 8.69, "grad_norm": 1.0872794389724731, "learning_rate": 2.6237530585356674e-06, "loss": 0.658, "step": 46160 }, { "epoch": 8.69, "grad_norm": 5.119357109069824, "learning_rate": 2.619988706945229e-06, "loss": 0.3549, "step": 46170 }, { "epoch": 8.69, "grad_norm": 0.06370334327220917, "learning_rate": 2.6162243553547907e-06, "loss": 0.4486, "step": 46180 }, { "epoch": 8.69, "grad_norm": 8.075560569763184, "learning_rate": 2.6124600037643518e-06, "loss": 0.4195, "step": 46190 }, { "epoch": 8.7, "grad_norm": 27.81960678100586, "learning_rate": 2.6086956521739132e-06, "loss": 0.7454, "step": 46200 }, { "epoch": 8.7, "grad_norm": 16.909664154052734, "learning_rate": 2.6049313005834747e-06, "loss": 0.5945, "step": 46210 }, { "epoch": 8.7, "grad_norm": 1.0878101587295532, "learning_rate": 2.601166948993036e-06, "loss": 0.3299, "step": 46220 }, { "epoch": 8.7, "grad_norm": 19.037450790405273, "learning_rate": 2.597402597402597e-06, "loss": 0.338, "step": 46230 }, { "epoch": 8.7, "grad_norm": 14.639829635620117, "learning_rate": 2.593638245812159e-06, "loss": 0.425, "step": 46240 }, { "epoch": 8.71, "grad_norm": 0.7264255881309509, "learning_rate": 2.5898738942217205e-06, "loss": 0.3609, "step": 46250 }, { "epoch": 8.71, "grad_norm": 8.234590530395508, "learning_rate": 2.586109542631282e-06, "loss": 0.4549, "step": 46260 }, { "epoch": 8.71, "grad_norm": 9.959664344787598, "learning_rate": 2.582345191040844e-06, "loss": 0.3512, "step": 46270 }, { "epoch": 8.71, "grad_norm": 0.06254367530345917, "learning_rate": 2.578580839450405e-06, "loss": 0.2918, "step": 46280 }, { "epoch": 8.71, "grad_norm": 2.8327713012695312, "learning_rate": 2.5748164878599664e-06, "loss": 0.5665, "step": 46290 }, { "epoch": 8.71, "grad_norm": 6.886368751525879, "learning_rate": 2.571052136269528e-06, "loss": 0.4747, "step": 46300 }, { "epoch": 8.72, "grad_norm": 10.857604026794434, "learning_rate": 2.5672877846790893e-06, "loss": 0.7911, "step": 46310 }, { "epoch": 8.72, "grad_norm": 17.762399673461914, "learning_rate": 2.5635234330886503e-06, "loss": 0.2114, "step": 46320 }, { "epoch": 8.72, "grad_norm": 9.51750659942627, "learning_rate": 2.5597590814982122e-06, "loss": 0.2595, "step": 46330 }, { "epoch": 8.72, "grad_norm": 9.107878684997559, "learning_rate": 2.5559947299077737e-06, "loss": 0.514, "step": 46340 }, { "epoch": 8.72, "grad_norm": 0.17868681252002716, "learning_rate": 2.552230378317335e-06, "loss": 0.3083, "step": 46350 }, { "epoch": 8.73, "grad_norm": 0.14755873382091522, "learning_rate": 2.548466026726896e-06, "loss": 0.6239, "step": 46360 }, { "epoch": 8.73, "grad_norm": 4.853813648223877, "learning_rate": 2.5447016751364576e-06, "loss": 0.4185, "step": 46370 }, { "epoch": 8.73, "grad_norm": 20.324190139770508, "learning_rate": 2.5409373235460195e-06, "loss": 0.6337, "step": 46380 }, { "epoch": 8.73, "grad_norm": 0.38305485248565674, "learning_rate": 2.537172971955581e-06, "loss": 0.3451, "step": 46390 }, { "epoch": 8.73, "grad_norm": 15.589489936828613, "learning_rate": 2.5334086203651425e-06, "loss": 0.7042, "step": 46400 }, { "epoch": 8.74, "grad_norm": 35.21918869018555, "learning_rate": 2.5296442687747035e-06, "loss": 0.6307, "step": 46410 }, { "epoch": 8.74, "grad_norm": 0.9967852830886841, "learning_rate": 2.5258799171842654e-06, "loss": 0.457, "step": 46420 }, { "epoch": 8.74, "grad_norm": 18.615461349487305, "learning_rate": 2.522115565593827e-06, "loss": 0.4258, "step": 46430 }, { "epoch": 8.74, "grad_norm": 7.5972981452941895, "learning_rate": 2.5183512140033883e-06, "loss": 0.3654, "step": 46440 }, { "epoch": 8.74, "grad_norm": 0.10862822085618973, "learning_rate": 2.5145868624129493e-06, "loss": 0.386, "step": 46450 }, { "epoch": 8.74, "grad_norm": 13.601619720458984, "learning_rate": 2.510822510822511e-06, "loss": 0.4652, "step": 46460 }, { "epoch": 8.75, "grad_norm": 11.241927146911621, "learning_rate": 2.5070581592320727e-06, "loss": 0.2974, "step": 46470 }, { "epoch": 8.75, "grad_norm": 4.576162338256836, "learning_rate": 2.503293807641634e-06, "loss": 0.4565, "step": 46480 }, { "epoch": 8.75, "grad_norm": 1.5839916467666626, "learning_rate": 2.4995294560511956e-06, "loss": 0.3777, "step": 46490 }, { "epoch": 8.75, "grad_norm": 25.34798812866211, "learning_rate": 2.4957651044607566e-06, "loss": 0.4831, "step": 46500 }, { "epoch": 8.75, "grad_norm": 0.46405714750289917, "learning_rate": 2.492000752870318e-06, "loss": 0.3859, "step": 46510 }, { "epoch": 8.76, "grad_norm": 12.911664962768555, "learning_rate": 2.48823640127988e-06, "loss": 0.4369, "step": 46520 }, { "epoch": 8.76, "grad_norm": 10.62582778930664, "learning_rate": 2.484472049689441e-06, "loss": 0.2246, "step": 46530 }, { "epoch": 8.76, "grad_norm": 15.52188777923584, "learning_rate": 2.480707698099003e-06, "loss": 0.3334, "step": 46540 }, { "epoch": 8.76, "grad_norm": 8.674419403076172, "learning_rate": 2.476943346508564e-06, "loss": 0.4668, "step": 46550 }, { "epoch": 8.76, "grad_norm": 17.727657318115234, "learning_rate": 2.473178994918126e-06, "loss": 0.4731, "step": 46560 }, { "epoch": 8.77, "grad_norm": 11.835549354553223, "learning_rate": 2.469414643327687e-06, "loss": 0.4562, "step": 46570 }, { "epoch": 8.77, "grad_norm": 5.226273536682129, "learning_rate": 2.4656502917372483e-06, "loss": 0.4373, "step": 46580 }, { "epoch": 8.77, "grad_norm": 16.807971954345703, "learning_rate": 2.4618859401468098e-06, "loss": 0.5285, "step": 46590 }, { "epoch": 8.77, "grad_norm": 11.890029907226562, "learning_rate": 2.4581215885563712e-06, "loss": 0.3881, "step": 46600 }, { "epoch": 8.77, "grad_norm": 7.499889373779297, "learning_rate": 2.4543572369659327e-06, "loss": 0.3529, "step": 46610 }, { "epoch": 8.77, "grad_norm": 2.0104305744171143, "learning_rate": 2.450592885375494e-06, "loss": 0.5556, "step": 46620 }, { "epoch": 8.78, "grad_norm": 1.1118526458740234, "learning_rate": 2.446828533785056e-06, "loss": 0.3103, "step": 46630 }, { "epoch": 8.78, "grad_norm": 10.730484008789062, "learning_rate": 2.443064182194617e-06, "loss": 0.4211, "step": 46640 }, { "epoch": 8.78, "grad_norm": 0.4229322075843811, "learning_rate": 2.4392998306041785e-06, "loss": 0.7197, "step": 46650 }, { "epoch": 8.78, "grad_norm": 7.243539810180664, "learning_rate": 2.43553547901374e-06, "loss": 0.2981, "step": 46660 }, { "epoch": 8.78, "grad_norm": 0.10506740212440491, "learning_rate": 2.4317711274233015e-06, "loss": 0.4107, "step": 46670 }, { "epoch": 8.79, "grad_norm": 0.9739378690719604, "learning_rate": 2.428006775832863e-06, "loss": 0.4228, "step": 46680 }, { "epoch": 8.79, "grad_norm": 21.565500259399414, "learning_rate": 2.4242424242424244e-06, "loss": 0.554, "step": 46690 }, { "epoch": 8.79, "grad_norm": 8.53973388671875, "learning_rate": 2.420478072651986e-06, "loss": 0.7312, "step": 46700 }, { "epoch": 8.79, "grad_norm": 15.285102844238281, "learning_rate": 2.4167137210615473e-06, "loss": 0.4223, "step": 46710 }, { "epoch": 8.79, "grad_norm": 27.030920028686523, "learning_rate": 2.4129493694711088e-06, "loss": 0.3982, "step": 46720 }, { "epoch": 8.8, "grad_norm": 1.0703567266464233, "learning_rate": 2.4091850178806702e-06, "loss": 0.5266, "step": 46730 }, { "epoch": 8.8, "grad_norm": 1.5598318576812744, "learning_rate": 2.4054206662902317e-06, "loss": 0.4147, "step": 46740 }, { "epoch": 8.8, "grad_norm": 5.893131732940674, "learning_rate": 2.401656314699793e-06, "loss": 0.9074, "step": 46750 }, { "epoch": 8.8, "grad_norm": 0.11686406284570694, "learning_rate": 2.3978919631093546e-06, "loss": 0.3176, "step": 46760 }, { "epoch": 8.8, "grad_norm": 17.671430587768555, "learning_rate": 2.394127611518916e-06, "loss": 0.6666, "step": 46770 }, { "epoch": 8.8, "grad_norm": 6.053145408630371, "learning_rate": 2.3903632599284775e-06, "loss": 0.453, "step": 46780 }, { "epoch": 8.81, "grad_norm": 17.766340255737305, "learning_rate": 2.386598908338039e-06, "loss": 0.292, "step": 46790 }, { "epoch": 8.81, "grad_norm": 7.656116485595703, "learning_rate": 2.3828345567476005e-06, "loss": 0.4534, "step": 46800 }, { "epoch": 8.81, "grad_norm": 9.28414249420166, "learning_rate": 2.379070205157162e-06, "loss": 0.2725, "step": 46810 }, { "epoch": 8.81, "grad_norm": 0.04184451699256897, "learning_rate": 2.3753058535667234e-06, "loss": 0.4548, "step": 46820 }, { "epoch": 8.81, "grad_norm": 51.41383743286133, "learning_rate": 2.371541501976285e-06, "loss": 0.6366, "step": 46830 }, { "epoch": 8.82, "grad_norm": 0.9687474370002747, "learning_rate": 2.3677771503858463e-06, "loss": 0.1807, "step": 46840 }, { "epoch": 8.82, "grad_norm": 10.670173645019531, "learning_rate": 2.3640127987954078e-06, "loss": 0.2247, "step": 46850 }, { "epoch": 8.82, "grad_norm": 25.308748245239258, "learning_rate": 2.3602484472049692e-06, "loss": 0.4082, "step": 46860 }, { "epoch": 8.82, "grad_norm": 6.221033573150635, "learning_rate": 2.3564840956145303e-06, "loss": 0.4089, "step": 46870 }, { "epoch": 8.82, "grad_norm": 6.021878719329834, "learning_rate": 2.352719744024092e-06, "loss": 0.5282, "step": 46880 }, { "epoch": 8.83, "grad_norm": 4.055378437042236, "learning_rate": 2.3489553924336536e-06, "loss": 0.3669, "step": 46890 }, { "epoch": 8.83, "grad_norm": 2.8399837017059326, "learning_rate": 2.345191040843215e-06, "loss": 0.4622, "step": 46900 }, { "epoch": 8.83, "grad_norm": 13.155150413513184, "learning_rate": 2.3414266892527765e-06, "loss": 0.3145, "step": 46910 }, { "epoch": 8.83, "grad_norm": 6.751850605010986, "learning_rate": 2.337662337662338e-06, "loss": 0.6429, "step": 46920 }, { "epoch": 8.83, "grad_norm": 25.934751510620117, "learning_rate": 2.3338979860718994e-06, "loss": 0.3962, "step": 46930 }, { "epoch": 8.83, "grad_norm": 0.046511393040418625, "learning_rate": 2.3301336344814605e-06, "loss": 0.6207, "step": 46940 }, { "epoch": 8.84, "grad_norm": 26.9798641204834, "learning_rate": 2.3263692828910224e-06, "loss": 0.4355, "step": 46950 }, { "epoch": 8.84, "grad_norm": 9.000409126281738, "learning_rate": 2.3226049313005834e-06, "loss": 0.5159, "step": 46960 }, { "epoch": 8.84, "grad_norm": 26.408952713012695, "learning_rate": 2.3188405797101453e-06, "loss": 0.3787, "step": 46970 }, { "epoch": 8.84, "grad_norm": 53.70569610595703, "learning_rate": 2.3150762281197063e-06, "loss": 0.5017, "step": 46980 }, { "epoch": 8.84, "grad_norm": 24.432947158813477, "learning_rate": 2.311311876529268e-06, "loss": 0.2867, "step": 46990 }, { "epoch": 8.85, "grad_norm": 0.2684187591075897, "learning_rate": 2.3075475249388297e-06, "loss": 0.3527, "step": 47000 }, { "epoch": 8.85, "grad_norm": 7.198283672332764, "learning_rate": 2.3037831733483907e-06, "loss": 0.4833, "step": 47010 }, { "epoch": 8.85, "grad_norm": 15.57585334777832, "learning_rate": 2.3000188217579526e-06, "loss": 0.3947, "step": 47020 }, { "epoch": 8.85, "grad_norm": 34.16131591796875, "learning_rate": 2.2962544701675136e-06, "loss": 0.4851, "step": 47030 }, { "epoch": 8.85, "grad_norm": 1.565393090248108, "learning_rate": 2.2924901185770755e-06, "loss": 0.5857, "step": 47040 }, { "epoch": 8.86, "grad_norm": 43.33597946166992, "learning_rate": 2.2887257669866366e-06, "loss": 0.6679, "step": 47050 }, { "epoch": 8.86, "grad_norm": 11.061119079589844, "learning_rate": 2.2849614153961984e-06, "loss": 0.5468, "step": 47060 }, { "epoch": 8.86, "grad_norm": 0.20168103277683258, "learning_rate": 2.2811970638057595e-06, "loss": 0.5677, "step": 47070 }, { "epoch": 8.86, "grad_norm": 5.765427112579346, "learning_rate": 2.277432712215321e-06, "loss": 0.6151, "step": 47080 }, { "epoch": 8.86, "grad_norm": 14.285442352294922, "learning_rate": 2.2736683606248824e-06, "loss": 0.2915, "step": 47090 }, { "epoch": 8.87, "grad_norm": 11.015250205993652, "learning_rate": 2.269904009034444e-06, "loss": 0.4167, "step": 47100 }, { "epoch": 8.87, "grad_norm": 42.790672302246094, "learning_rate": 2.2661396574440053e-06, "loss": 0.632, "step": 47110 }, { "epoch": 8.87, "grad_norm": 23.58126449584961, "learning_rate": 2.2623753058535668e-06, "loss": 0.5522, "step": 47120 }, { "epoch": 8.87, "grad_norm": 0.10688672214746475, "learning_rate": 2.2586109542631287e-06, "loss": 0.5342, "step": 47130 }, { "epoch": 8.87, "grad_norm": 22.022897720336914, "learning_rate": 2.2548466026726897e-06, "loss": 0.5221, "step": 47140 }, { "epoch": 8.87, "grad_norm": 3.2762904167175293, "learning_rate": 2.251082251082251e-06, "loss": 0.508, "step": 47150 }, { "epoch": 8.88, "grad_norm": 8.347319602966309, "learning_rate": 2.2473178994918126e-06, "loss": 0.3269, "step": 47160 }, { "epoch": 8.88, "grad_norm": 0.2670442461967468, "learning_rate": 2.243553547901374e-06, "loss": 0.4172, "step": 47170 }, { "epoch": 8.88, "grad_norm": 22.307165145874023, "learning_rate": 2.2397891963109355e-06, "loss": 0.6191, "step": 47180 }, { "epoch": 8.88, "grad_norm": 12.482026100158691, "learning_rate": 2.236024844720497e-06, "loss": 0.4978, "step": 47190 }, { "epoch": 8.88, "grad_norm": 12.081169128417969, "learning_rate": 2.2322604931300585e-06, "loss": 0.4355, "step": 47200 }, { "epoch": 8.89, "grad_norm": 16.33040428161621, "learning_rate": 2.22849614153962e-06, "loss": 0.5954, "step": 47210 }, { "epoch": 8.89, "grad_norm": 15.850767135620117, "learning_rate": 2.2247317899491814e-06, "loss": 0.4529, "step": 47220 }, { "epoch": 8.89, "grad_norm": 39.77294158935547, "learning_rate": 2.220967438358743e-06, "loss": 0.2148, "step": 47230 }, { "epoch": 8.89, "grad_norm": 18.885601043701172, "learning_rate": 2.2172030867683043e-06, "loss": 0.4973, "step": 47240 }, { "epoch": 8.89, "grad_norm": 16.801286697387695, "learning_rate": 2.2134387351778658e-06, "loss": 0.4566, "step": 47250 }, { "epoch": 8.9, "grad_norm": 18.179574966430664, "learning_rate": 2.2096743835874272e-06, "loss": 0.7612, "step": 47260 }, { "epoch": 8.9, "grad_norm": 3.8040480613708496, "learning_rate": 2.2059100319969887e-06, "loss": 0.4395, "step": 47270 }, { "epoch": 8.9, "grad_norm": 26.792110443115234, "learning_rate": 2.20214568040655e-06, "loss": 0.6617, "step": 47280 }, { "epoch": 8.9, "grad_norm": 10.858789443969727, "learning_rate": 2.1983813288161116e-06, "loss": 0.4822, "step": 47290 }, { "epoch": 8.9, "grad_norm": 0.04407741501927376, "learning_rate": 2.194616977225673e-06, "loss": 0.6258, "step": 47300 }, { "epoch": 8.9, "grad_norm": 23.669815063476562, "learning_rate": 2.1908526256352345e-06, "loss": 0.4182, "step": 47310 }, { "epoch": 8.91, "grad_norm": 15.9193115234375, "learning_rate": 2.187088274044796e-06, "loss": 0.3994, "step": 47320 }, { "epoch": 8.91, "grad_norm": 35.726600646972656, "learning_rate": 2.1833239224543575e-06, "loss": 0.2649, "step": 47330 }, { "epoch": 8.91, "grad_norm": 23.043758392333984, "learning_rate": 2.179559570863919e-06, "loss": 0.3575, "step": 47340 }, { "epoch": 8.91, "grad_norm": 1.2536325454711914, "learning_rate": 2.1757952192734804e-06, "loss": 0.3955, "step": 47350 }, { "epoch": 8.91, "grad_norm": 19.012310028076172, "learning_rate": 2.172030867683042e-06, "loss": 0.5392, "step": 47360 }, { "epoch": 8.92, "grad_norm": 7.179901599884033, "learning_rate": 2.1682665160926033e-06, "loss": 0.5012, "step": 47370 }, { "epoch": 8.92, "grad_norm": 15.418367385864258, "learning_rate": 2.1645021645021648e-06, "loss": 0.5293, "step": 47380 }, { "epoch": 8.92, "grad_norm": 25.173019409179688, "learning_rate": 2.1607378129117262e-06, "loss": 0.4274, "step": 47390 }, { "epoch": 8.92, "grad_norm": 1.2999225854873657, "learning_rate": 2.1569734613212877e-06, "loss": 0.3006, "step": 47400 }, { "epoch": 8.92, "grad_norm": 0.06721743196249008, "learning_rate": 2.153209109730849e-06, "loss": 0.5072, "step": 47410 }, { "epoch": 8.93, "grad_norm": 43.51065444946289, "learning_rate": 2.1494447581404106e-06, "loss": 0.5687, "step": 47420 }, { "epoch": 8.93, "grad_norm": 1.2883092164993286, "learning_rate": 2.145680406549972e-06, "loss": 0.2178, "step": 47430 }, { "epoch": 8.93, "grad_norm": 3.8370001316070557, "learning_rate": 2.1419160549595335e-06, "loss": 0.4656, "step": 47440 }, { "epoch": 8.93, "grad_norm": 1.2734487056732178, "learning_rate": 2.138151703369095e-06, "loss": 0.3782, "step": 47450 }, { "epoch": 8.93, "grad_norm": 7.262458324432373, "learning_rate": 2.134387351778656e-06, "loss": 0.4573, "step": 47460 }, { "epoch": 8.93, "grad_norm": 33.35980987548828, "learning_rate": 2.130623000188218e-06, "loss": 0.3935, "step": 47470 }, { "epoch": 8.94, "grad_norm": 5.109274387359619, "learning_rate": 2.126858648597779e-06, "loss": 0.4141, "step": 47480 }, { "epoch": 8.94, "grad_norm": 5.794544219970703, "learning_rate": 2.123094297007341e-06, "loss": 0.4369, "step": 47490 }, { "epoch": 8.94, "grad_norm": 1.0066646337509155, "learning_rate": 2.1193299454169023e-06, "loss": 0.6685, "step": 47500 }, { "epoch": 8.94, "grad_norm": 11.47095012664795, "learning_rate": 2.1155655938264637e-06, "loss": 0.3829, "step": 47510 }, { "epoch": 8.94, "grad_norm": 14.297951698303223, "learning_rate": 2.111801242236025e-06, "loss": 0.4893, "step": 47520 }, { "epoch": 8.95, "grad_norm": 11.840413093566895, "learning_rate": 2.1080368906455862e-06, "loss": 0.7612, "step": 47530 }, { "epoch": 8.95, "grad_norm": 38.1298713684082, "learning_rate": 2.104272539055148e-06, "loss": 0.6616, "step": 47540 }, { "epoch": 8.95, "grad_norm": 25.78653907775879, "learning_rate": 2.100508187464709e-06, "loss": 0.2917, "step": 47550 }, { "epoch": 8.95, "grad_norm": 19.186622619628906, "learning_rate": 2.096743835874271e-06, "loss": 0.5563, "step": 47560 }, { "epoch": 8.95, "grad_norm": 6.531424522399902, "learning_rate": 2.092979484283832e-06, "loss": 0.3453, "step": 47570 }, { "epoch": 8.96, "grad_norm": 0.02262808382511139, "learning_rate": 2.089215132693394e-06, "loss": 0.4988, "step": 47580 }, { "epoch": 8.96, "grad_norm": 18.038076400756836, "learning_rate": 2.085450781102955e-06, "loss": 0.5999, "step": 47590 }, { "epoch": 8.96, "grad_norm": 12.460607528686523, "learning_rate": 2.0816864295125165e-06, "loss": 0.7356, "step": 47600 }, { "epoch": 8.96, "grad_norm": 45.871124267578125, "learning_rate": 2.0779220779220784e-06, "loss": 0.2332, "step": 47610 }, { "epoch": 8.96, "grad_norm": 6.987379550933838, "learning_rate": 2.0741577263316394e-06, "loss": 0.4788, "step": 47620 }, { "epoch": 8.96, "grad_norm": 27.808523178100586, "learning_rate": 2.0703933747412013e-06, "loss": 0.6439, "step": 47630 }, { "epoch": 8.97, "grad_norm": 15.022832870483398, "learning_rate": 2.0666290231507623e-06, "loss": 0.4583, "step": 47640 }, { "epoch": 8.97, "grad_norm": 1.9445959329605103, "learning_rate": 2.062864671560324e-06, "loss": 0.6531, "step": 47650 }, { "epoch": 8.97, "grad_norm": 19.67501449584961, "learning_rate": 2.0591003199698852e-06, "loss": 0.5846, "step": 47660 }, { "epoch": 8.97, "grad_norm": 4.834009647369385, "learning_rate": 2.0553359683794467e-06, "loss": 0.3116, "step": 47670 }, { "epoch": 8.97, "grad_norm": 41.817848205566406, "learning_rate": 2.051571616789008e-06, "loss": 0.5561, "step": 47680 }, { "epoch": 8.98, "grad_norm": 10.177002906799316, "learning_rate": 2.0478072651985696e-06, "loss": 0.4742, "step": 47690 }, { "epoch": 8.98, "grad_norm": 9.804389953613281, "learning_rate": 2.044042913608131e-06, "loss": 0.5149, "step": 47700 }, { "epoch": 8.98, "grad_norm": 0.5583279132843018, "learning_rate": 2.0402785620176925e-06, "loss": 0.1706, "step": 47710 }, { "epoch": 8.98, "grad_norm": 17.16972541809082, "learning_rate": 2.036514210427254e-06, "loss": 0.5996, "step": 47720 }, { "epoch": 8.98, "grad_norm": 12.238551139831543, "learning_rate": 2.0327498588368155e-06, "loss": 0.3095, "step": 47730 }, { "epoch": 8.99, "grad_norm": 23.57866859436035, "learning_rate": 2.028985507246377e-06, "loss": 0.332, "step": 47740 }, { "epoch": 8.99, "grad_norm": 40.92672348022461, "learning_rate": 2.0252211556559384e-06, "loss": 0.6045, "step": 47750 }, { "epoch": 8.99, "grad_norm": 0.26052126288414, "learning_rate": 2.0214568040655e-06, "loss": 0.5199, "step": 47760 }, { "epoch": 8.99, "grad_norm": 14.104440689086914, "learning_rate": 2.0176924524750613e-06, "loss": 0.3415, "step": 47770 }, { "epoch": 8.99, "grad_norm": 5.497945785522461, "learning_rate": 2.0139281008846228e-06, "loss": 0.3376, "step": 47780 }, { "epoch": 8.99, "grad_norm": 16.819019317626953, "learning_rate": 2.0101637492941842e-06, "loss": 0.2294, "step": 47790 }, { "epoch": 9.0, "grad_norm": 26.409992218017578, "learning_rate": 2.0063993977037457e-06, "loss": 0.4863, "step": 47800 }, { "epoch": 9.0, "grad_norm": 37.679100036621094, "learning_rate": 2.002635046113307e-06, "loss": 0.3794, "step": 47810 }, { "epoch": 9.0, "eval_accuracy": 0.926, "eval_loss": 0.29454880952835083, "eval_runtime": 51.0107, "eval_samples_per_second": 147.028, "eval_steps_per_second": 18.388, "step": 47817 }, { "epoch": 9.0, "grad_norm": 0.07574218511581421, "learning_rate": 1.9988706945228686e-06, "loss": 0.384, "step": 47820 }, { "epoch": 9.0, "grad_norm": 0.08334751427173615, "learning_rate": 1.99510634293243e-06, "loss": 0.3326, "step": 47830 }, { "epoch": 9.0, "grad_norm": 22.99428367614746, "learning_rate": 1.9913419913419915e-06, "loss": 0.3852, "step": 47840 }, { "epoch": 9.01, "grad_norm": 2.2758443355560303, "learning_rate": 1.987577639751553e-06, "loss": 0.3487, "step": 47850 }, { "epoch": 9.01, "grad_norm": 16.36058807373047, "learning_rate": 1.9838132881611144e-06, "loss": 0.732, "step": 47860 }, { "epoch": 9.01, "grad_norm": 18.320423126220703, "learning_rate": 1.980048936570676e-06, "loss": 0.5723, "step": 47870 }, { "epoch": 9.01, "grad_norm": 0.30943912267684937, "learning_rate": 1.9762845849802374e-06, "loss": 0.5259, "step": 47880 }, { "epoch": 9.01, "grad_norm": 10.592155456542969, "learning_rate": 1.972520233389799e-06, "loss": 0.357, "step": 47890 }, { "epoch": 9.02, "grad_norm": 20.643972396850586, "learning_rate": 1.9687558817993603e-06, "loss": 0.4431, "step": 47900 }, { "epoch": 9.02, "grad_norm": 3.8286397457122803, "learning_rate": 1.9649915302089217e-06, "loss": 0.4602, "step": 47910 }, { "epoch": 9.02, "grad_norm": 13.922072410583496, "learning_rate": 1.961227178618483e-06, "loss": 0.4626, "step": 47920 }, { "epoch": 9.02, "grad_norm": 14.62498950958252, "learning_rate": 1.9574628270280447e-06, "loss": 0.1669, "step": 47930 }, { "epoch": 9.02, "grad_norm": 13.237974166870117, "learning_rate": 1.953698475437606e-06, "loss": 0.3758, "step": 47940 }, { "epoch": 9.03, "grad_norm": 47.09726333618164, "learning_rate": 1.9499341238471676e-06, "loss": 0.6936, "step": 47950 }, { "epoch": 9.03, "grad_norm": 7.44305419921875, "learning_rate": 1.9461697722567286e-06, "loss": 0.5368, "step": 47960 }, { "epoch": 9.03, "grad_norm": 10.614571571350098, "learning_rate": 1.9424054206662905e-06, "loss": 0.3991, "step": 47970 }, { "epoch": 9.03, "grad_norm": 14.058085441589355, "learning_rate": 1.938641069075852e-06, "loss": 0.2912, "step": 47980 }, { "epoch": 9.03, "grad_norm": 10.969157218933105, "learning_rate": 1.9348767174854134e-06, "loss": 0.4087, "step": 47990 }, { "epoch": 9.03, "grad_norm": 10.787178039550781, "learning_rate": 1.931112365894975e-06, "loss": 0.4769, "step": 48000 }, { "epoch": 9.04, "grad_norm": 15.043909072875977, "learning_rate": 1.9273480143045364e-06, "loss": 0.1376, "step": 48010 }, { "epoch": 9.04, "grad_norm": 1.5361169576644897, "learning_rate": 1.923583662714098e-06, "loss": 0.4009, "step": 48020 }, { "epoch": 9.04, "grad_norm": 9.03320598602295, "learning_rate": 1.919819311123659e-06, "loss": 0.6094, "step": 48030 }, { "epoch": 9.04, "grad_norm": 0.36295774579048157, "learning_rate": 1.9160549595332207e-06, "loss": 0.282, "step": 48040 }, { "epoch": 9.04, "grad_norm": 5.759310722351074, "learning_rate": 1.9122906079427818e-06, "loss": 0.2708, "step": 48050 }, { "epoch": 9.05, "grad_norm": 0.02767598256468773, "learning_rate": 1.9085262563523437e-06, "loss": 0.5305, "step": 48060 }, { "epoch": 9.05, "grad_norm": 1.4058023691177368, "learning_rate": 1.904761904761905e-06, "loss": 0.4482, "step": 48070 }, { "epoch": 9.05, "grad_norm": 0.3036022186279297, "learning_rate": 1.9009975531714664e-06, "loss": 0.5453, "step": 48080 }, { "epoch": 9.05, "grad_norm": 4.839325904846191, "learning_rate": 1.8972332015810276e-06, "loss": 0.3741, "step": 48090 }, { "epoch": 9.05, "grad_norm": 14.845388412475586, "learning_rate": 1.8934688499905893e-06, "loss": 0.5632, "step": 48100 }, { "epoch": 9.06, "grad_norm": 16.649734497070312, "learning_rate": 1.8897044984001508e-06, "loss": 0.1853, "step": 48110 }, { "epoch": 9.06, "grad_norm": 25.03996467590332, "learning_rate": 1.8859401468097122e-06, "loss": 0.3492, "step": 48120 }, { "epoch": 9.06, "grad_norm": 30.69152069091797, "learning_rate": 1.8821757952192737e-06, "loss": 0.4833, "step": 48130 }, { "epoch": 9.06, "grad_norm": 13.534026145935059, "learning_rate": 1.8784114436288351e-06, "loss": 0.3608, "step": 48140 }, { "epoch": 9.06, "grad_norm": 23.936994552612305, "learning_rate": 1.8746470920383966e-06, "loss": 0.4873, "step": 48150 }, { "epoch": 9.06, "grad_norm": 41.1656608581543, "learning_rate": 1.8708827404479578e-06, "loss": 0.683, "step": 48160 }, { "epoch": 9.07, "grad_norm": 12.103029251098633, "learning_rate": 1.8671183888575195e-06, "loss": 0.3607, "step": 48170 }, { "epoch": 9.07, "grad_norm": 0.34992146492004395, "learning_rate": 1.8633540372670808e-06, "loss": 0.3565, "step": 48180 }, { "epoch": 9.07, "grad_norm": 5.494448661804199, "learning_rate": 1.8595896856766424e-06, "loss": 0.3514, "step": 48190 }, { "epoch": 9.07, "grad_norm": 19.909494400024414, "learning_rate": 1.8558253340862037e-06, "loss": 0.3663, "step": 48200 }, { "epoch": 9.07, "grad_norm": 23.300514221191406, "learning_rate": 1.8520609824957654e-06, "loss": 0.6781, "step": 48210 }, { "epoch": 9.08, "grad_norm": 10.3643798828125, "learning_rate": 1.8482966309053266e-06, "loss": 0.4268, "step": 48220 }, { "epoch": 9.08, "grad_norm": 23.494760513305664, "learning_rate": 1.844532279314888e-06, "loss": 0.4987, "step": 48230 }, { "epoch": 9.08, "grad_norm": 3.709768772125244, "learning_rate": 1.8407679277244497e-06, "loss": 0.3542, "step": 48240 }, { "epoch": 9.08, "grad_norm": 0.16881948709487915, "learning_rate": 1.837003576134011e-06, "loss": 0.5134, "step": 48250 }, { "epoch": 9.08, "grad_norm": 9.188733100891113, "learning_rate": 1.8332392245435727e-06, "loss": 0.6763, "step": 48260 }, { "epoch": 9.09, "grad_norm": 12.953929901123047, "learning_rate": 1.829474872953134e-06, "loss": 0.3074, "step": 48270 }, { "epoch": 9.09, "grad_norm": 6.657861232757568, "learning_rate": 1.8257105213626956e-06, "loss": 0.3391, "step": 48280 }, { "epoch": 9.09, "grad_norm": 22.9435977935791, "learning_rate": 1.8219461697722568e-06, "loss": 0.7275, "step": 48290 }, { "epoch": 9.09, "grad_norm": 25.13018035888672, "learning_rate": 1.8181818181818183e-06, "loss": 0.1599, "step": 48300 }, { "epoch": 9.09, "grad_norm": 16.501449584960938, "learning_rate": 1.8144174665913798e-06, "loss": 0.577, "step": 48310 }, { "epoch": 9.09, "grad_norm": 39.59345626831055, "learning_rate": 1.8106531150009412e-06, "loss": 0.7677, "step": 48320 }, { "epoch": 9.1, "grad_norm": 14.423561096191406, "learning_rate": 1.8068887634105025e-06, "loss": 0.5273, "step": 48330 }, { "epoch": 9.1, "grad_norm": 24.364534378051758, "learning_rate": 1.8031244118200641e-06, "loss": 0.2518, "step": 48340 }, { "epoch": 9.1, "grad_norm": 12.524968147277832, "learning_rate": 1.7993600602296258e-06, "loss": 0.6041, "step": 48350 }, { "epoch": 9.1, "grad_norm": 23.22210693359375, "learning_rate": 1.795595708639187e-06, "loss": 0.6273, "step": 48360 }, { "epoch": 9.1, "grad_norm": 9.35206413269043, "learning_rate": 1.7918313570487485e-06, "loss": 0.363, "step": 48370 }, { "epoch": 9.11, "grad_norm": 11.951454162597656, "learning_rate": 1.78806700545831e-06, "loss": 0.2939, "step": 48380 }, { "epoch": 9.11, "grad_norm": 8.72339153289795, "learning_rate": 1.7843026538678714e-06, "loss": 0.61, "step": 48390 }, { "epoch": 9.11, "grad_norm": 36.56248092651367, "learning_rate": 1.780538302277433e-06, "loss": 0.5281, "step": 48400 }, { "epoch": 9.11, "grad_norm": 13.832367897033691, "learning_rate": 1.7767739506869944e-06, "loss": 0.2781, "step": 48410 }, { "epoch": 9.11, "grad_norm": 24.075450897216797, "learning_rate": 1.7730095990965556e-06, "loss": 0.6961, "step": 48420 }, { "epoch": 9.12, "grad_norm": 2.9399142265319824, "learning_rate": 1.7692452475061173e-06, "loss": 0.2952, "step": 48430 }, { "epoch": 9.12, "grad_norm": 16.432899475097656, "learning_rate": 1.7654808959156785e-06, "loss": 0.4107, "step": 48440 }, { "epoch": 9.12, "grad_norm": 21.72768211364746, "learning_rate": 1.7617165443252402e-06, "loss": 0.7116, "step": 48450 }, { "epoch": 9.12, "grad_norm": 20.015758514404297, "learning_rate": 1.7579521927348015e-06, "loss": 0.8156, "step": 48460 }, { "epoch": 9.12, "grad_norm": 7.275087833404541, "learning_rate": 1.7541878411443631e-06, "loss": 0.7025, "step": 48470 }, { "epoch": 9.12, "grad_norm": 59.83738708496094, "learning_rate": 1.7504234895539246e-06, "loss": 0.363, "step": 48480 }, { "epoch": 9.13, "grad_norm": 19.102800369262695, "learning_rate": 1.7466591379634858e-06, "loss": 0.6994, "step": 48490 }, { "epoch": 9.13, "grad_norm": 37.69584274291992, "learning_rate": 1.7428947863730475e-06, "loss": 0.6699, "step": 48500 }, { "epoch": 9.13, "grad_norm": 7.24312686920166, "learning_rate": 1.7391304347826088e-06, "loss": 0.9191, "step": 48510 }, { "epoch": 9.13, "grad_norm": 28.769311904907227, "learning_rate": 1.7353660831921704e-06, "loss": 0.3994, "step": 48520 }, { "epoch": 9.13, "grad_norm": 0.12715600430965424, "learning_rate": 1.7316017316017317e-06, "loss": 0.3204, "step": 48530 }, { "epoch": 9.14, "grad_norm": 29.197851181030273, "learning_rate": 1.7278373800112933e-06, "loss": 0.6342, "step": 48540 }, { "epoch": 9.14, "grad_norm": 4.771313667297363, "learning_rate": 1.7240730284208546e-06, "loss": 0.4141, "step": 48550 }, { "epoch": 9.14, "grad_norm": 10.536064147949219, "learning_rate": 1.720308676830416e-06, "loss": 0.407, "step": 48560 }, { "epoch": 9.14, "grad_norm": 19.451534271240234, "learning_rate": 1.7165443252399775e-06, "loss": 0.4267, "step": 48570 }, { "epoch": 9.14, "grad_norm": 15.74153995513916, "learning_rate": 1.712779973649539e-06, "loss": 0.3457, "step": 48580 }, { "epoch": 9.15, "grad_norm": 8.152856826782227, "learning_rate": 1.7090156220591002e-06, "loss": 0.2276, "step": 48590 }, { "epoch": 9.15, "grad_norm": 6.447235584259033, "learning_rate": 1.705251270468662e-06, "loss": 0.2864, "step": 48600 }, { "epoch": 9.15, "grad_norm": 15.62842845916748, "learning_rate": 1.7014869188782236e-06, "loss": 0.5409, "step": 48610 }, { "epoch": 9.15, "grad_norm": 7.044529438018799, "learning_rate": 1.6977225672877848e-06, "loss": 0.6466, "step": 48620 }, { "epoch": 9.15, "grad_norm": 17.935876846313477, "learning_rate": 1.6939582156973463e-06, "loss": 0.4593, "step": 48630 }, { "epoch": 9.15, "grad_norm": 18.33387565612793, "learning_rate": 1.6901938641069077e-06, "loss": 0.5601, "step": 48640 }, { "epoch": 9.16, "grad_norm": 11.096664428710938, "learning_rate": 1.6864295125164692e-06, "loss": 0.7234, "step": 48650 }, { "epoch": 9.16, "grad_norm": 31.238327026367188, "learning_rate": 1.6826651609260305e-06, "loss": 0.496, "step": 48660 }, { "epoch": 9.16, "grad_norm": 16.450227737426758, "learning_rate": 1.6789008093355921e-06, "loss": 0.4201, "step": 48670 }, { "epoch": 9.16, "grad_norm": 7.500368118286133, "learning_rate": 1.6751364577451534e-06, "loss": 0.3319, "step": 48680 }, { "epoch": 9.16, "grad_norm": 3.6829004287719727, "learning_rate": 1.671372106154715e-06, "loss": 0.5277, "step": 48690 }, { "epoch": 9.17, "grad_norm": 5.226105213165283, "learning_rate": 1.6676077545642763e-06, "loss": 0.4659, "step": 48700 }, { "epoch": 9.17, "grad_norm": 17.099241256713867, "learning_rate": 1.663843402973838e-06, "loss": 0.1683, "step": 48710 }, { "epoch": 9.17, "grad_norm": 19.311145782470703, "learning_rate": 1.6600790513833994e-06, "loss": 0.3761, "step": 48720 }, { "epoch": 9.17, "grad_norm": 11.647774696350098, "learning_rate": 1.6563146997929607e-06, "loss": 0.4523, "step": 48730 }, { "epoch": 9.17, "grad_norm": 18.899429321289062, "learning_rate": 1.6525503482025224e-06, "loss": 0.4971, "step": 48740 }, { "epoch": 9.18, "grad_norm": 0.5511684417724609, "learning_rate": 1.6487859966120836e-06, "loss": 0.2433, "step": 48750 }, { "epoch": 9.18, "grad_norm": 7.991838455200195, "learning_rate": 1.6450216450216453e-06, "loss": 0.4022, "step": 48760 }, { "epoch": 9.18, "grad_norm": 25.697017669677734, "learning_rate": 1.6412572934312065e-06, "loss": 0.4417, "step": 48770 }, { "epoch": 9.18, "grad_norm": 12.145305633544922, "learning_rate": 1.6374929418407682e-06, "loss": 0.4612, "step": 48780 }, { "epoch": 9.18, "grad_norm": 5.5945892333984375, "learning_rate": 1.6337285902503294e-06, "loss": 0.3146, "step": 48790 }, { "epoch": 9.19, "grad_norm": 12.325715065002441, "learning_rate": 1.629964238659891e-06, "loss": 0.6039, "step": 48800 }, { "epoch": 9.19, "grad_norm": 5.670255184173584, "learning_rate": 1.6261998870694524e-06, "loss": 0.5651, "step": 48810 }, { "epoch": 9.19, "grad_norm": 80.41706848144531, "learning_rate": 1.6224355354790138e-06, "loss": 0.5711, "step": 48820 }, { "epoch": 9.19, "grad_norm": 8.065768241882324, "learning_rate": 1.6186711838885753e-06, "loss": 0.6124, "step": 48830 }, { "epoch": 9.19, "grad_norm": 0.024990417063236237, "learning_rate": 1.6149068322981367e-06, "loss": 0.269, "step": 48840 }, { "epoch": 9.19, "grad_norm": 44.296573638916016, "learning_rate": 1.6111424807076984e-06, "loss": 0.4364, "step": 48850 }, { "epoch": 9.2, "grad_norm": 6.221458435058594, "learning_rate": 1.6073781291172597e-06, "loss": 0.2955, "step": 48860 }, { "epoch": 9.2, "grad_norm": 17.889265060424805, "learning_rate": 1.6036137775268213e-06, "loss": 0.3683, "step": 48870 }, { "epoch": 9.2, "grad_norm": 9.34254264831543, "learning_rate": 1.5998494259363826e-06, "loss": 0.5364, "step": 48880 }, { "epoch": 9.2, "grad_norm": 0.1648782640695572, "learning_rate": 1.596085074345944e-06, "loss": 0.3695, "step": 48890 }, { "epoch": 9.2, "grad_norm": 1.0630319118499756, "learning_rate": 1.5923207227555055e-06, "loss": 0.3158, "step": 48900 }, { "epoch": 9.21, "grad_norm": 28.75983428955078, "learning_rate": 1.588556371165067e-06, "loss": 0.3781, "step": 48910 }, { "epoch": 9.21, "grad_norm": 4.615087509155273, "learning_rate": 1.5847920195746282e-06, "loss": 0.3918, "step": 48920 }, { "epoch": 9.21, "grad_norm": 15.744338035583496, "learning_rate": 1.5810276679841899e-06, "loss": 0.153, "step": 48930 }, { "epoch": 9.21, "grad_norm": 5.790966510772705, "learning_rate": 1.5772633163937511e-06, "loss": 0.3341, "step": 48940 }, { "epoch": 9.21, "grad_norm": 0.05763714388012886, "learning_rate": 1.5734989648033128e-06, "loss": 0.4415, "step": 48950 }, { "epoch": 9.22, "grad_norm": 15.090495109558105, "learning_rate": 1.5697346132128743e-06, "loss": 0.5551, "step": 48960 }, { "epoch": 9.22, "grad_norm": 14.567575454711914, "learning_rate": 1.5659702616224357e-06, "loss": 0.7791, "step": 48970 }, { "epoch": 9.22, "grad_norm": 5.712708473205566, "learning_rate": 1.5622059100319972e-06, "loss": 0.3734, "step": 48980 }, { "epoch": 9.22, "grad_norm": 64.14569091796875, "learning_rate": 1.5584415584415584e-06, "loss": 0.4575, "step": 48990 }, { "epoch": 9.22, "grad_norm": 20.8106632232666, "learning_rate": 1.5546772068511201e-06, "loss": 0.4518, "step": 49000 }, { "epoch": 9.22, "grad_norm": 9.944857597351074, "learning_rate": 1.5509128552606814e-06, "loss": 0.4911, "step": 49010 }, { "epoch": 9.23, "grad_norm": 8.021162986755371, "learning_rate": 1.547148503670243e-06, "loss": 0.6, "step": 49020 }, { "epoch": 9.23, "grad_norm": 13.243287086486816, "learning_rate": 1.5433841520798043e-06, "loss": 0.507, "step": 49030 }, { "epoch": 9.23, "grad_norm": 32.78733825683594, "learning_rate": 1.539619800489366e-06, "loss": 0.5617, "step": 49040 }, { "epoch": 9.23, "grad_norm": 3.1205811500549316, "learning_rate": 1.5358554488989272e-06, "loss": 0.4243, "step": 49050 }, { "epoch": 9.23, "grad_norm": 14.635000228881836, "learning_rate": 1.5320910973084887e-06, "loss": 0.5009, "step": 49060 }, { "epoch": 9.24, "grad_norm": 18.14510154724121, "learning_rate": 1.5283267457180501e-06, "loss": 0.3399, "step": 49070 }, { "epoch": 9.24, "grad_norm": 0.11989280581474304, "learning_rate": 1.5245623941276116e-06, "loss": 0.3495, "step": 49080 }, { "epoch": 9.24, "grad_norm": 24.832670211791992, "learning_rate": 1.5207980425371733e-06, "loss": 0.3462, "step": 49090 }, { "epoch": 9.24, "grad_norm": 12.079398155212402, "learning_rate": 1.5170336909467345e-06, "loss": 0.3772, "step": 49100 }, { "epoch": 9.24, "grad_norm": 0.3867708146572113, "learning_rate": 1.5132693393562962e-06, "loss": 0.4475, "step": 49110 }, { "epoch": 9.25, "grad_norm": 26.119632720947266, "learning_rate": 1.5095049877658574e-06, "loss": 0.6135, "step": 49120 }, { "epoch": 9.25, "grad_norm": 1.460374116897583, "learning_rate": 1.505740636175419e-06, "loss": 0.3183, "step": 49130 }, { "epoch": 9.25, "grad_norm": 5.709151268005371, "learning_rate": 1.5019762845849804e-06, "loss": 0.6727, "step": 49140 }, { "epoch": 9.25, "grad_norm": 3.2548608779907227, "learning_rate": 1.4982119329945418e-06, "loss": 0.3298, "step": 49150 }, { "epoch": 9.25, "grad_norm": 0.8151334524154663, "learning_rate": 1.4944475814041033e-06, "loss": 0.4324, "step": 49160 }, { "epoch": 9.25, "grad_norm": 8.766486167907715, "learning_rate": 1.4906832298136647e-06, "loss": 0.5849, "step": 49170 }, { "epoch": 9.26, "grad_norm": 12.971768379211426, "learning_rate": 1.486918878223226e-06, "loss": 0.5602, "step": 49180 }, { "epoch": 9.26, "grad_norm": 0.07424553483724594, "learning_rate": 1.4831545266327877e-06, "loss": 0.3006, "step": 49190 }, { "epoch": 9.26, "grad_norm": 40.09004592895508, "learning_rate": 1.479390175042349e-06, "loss": 0.4148, "step": 49200 }, { "epoch": 9.26, "grad_norm": 40.092464447021484, "learning_rate": 1.4756258234519106e-06, "loss": 0.312, "step": 49210 }, { "epoch": 9.26, "grad_norm": 28.163589477539062, "learning_rate": 1.471861471861472e-06, "loss": 0.281, "step": 49220 }, { "epoch": 9.27, "grad_norm": 12.769076347351074, "learning_rate": 1.4680971202710335e-06, "loss": 0.5183, "step": 49230 }, { "epoch": 9.27, "grad_norm": 2.987441301345825, "learning_rate": 1.464332768680595e-06, "loss": 0.5749, "step": 49240 }, { "epoch": 9.27, "grad_norm": 6.323493003845215, "learning_rate": 1.4605684170901562e-06, "loss": 0.4274, "step": 49250 }, { "epoch": 9.27, "grad_norm": 10.778115272521973, "learning_rate": 1.4568040654997179e-06, "loss": 0.5296, "step": 49260 }, { "epoch": 9.27, "grad_norm": 0.0840461477637291, "learning_rate": 1.4530397139092791e-06, "loss": 0.3179, "step": 49270 }, { "epoch": 9.28, "grad_norm": 0.9376565217971802, "learning_rate": 1.4492753623188408e-06, "loss": 0.2639, "step": 49280 }, { "epoch": 9.28, "grad_norm": 13.167097091674805, "learning_rate": 1.445511010728402e-06, "loss": 0.3673, "step": 49290 }, { "epoch": 9.28, "grad_norm": 0.04004925489425659, "learning_rate": 1.4417466591379637e-06, "loss": 0.349, "step": 49300 }, { "epoch": 9.28, "grad_norm": 21.248655319213867, "learning_rate": 1.437982307547525e-06, "loss": 0.6205, "step": 49310 }, { "epoch": 9.28, "grad_norm": 1.5469751358032227, "learning_rate": 1.4342179559570864e-06, "loss": 0.3818, "step": 49320 }, { "epoch": 9.28, "grad_norm": 0.027218230068683624, "learning_rate": 1.4304536043666481e-06, "loss": 0.3784, "step": 49330 }, { "epoch": 9.29, "grad_norm": 21.65924072265625, "learning_rate": 1.4266892527762094e-06, "loss": 0.4164, "step": 49340 }, { "epoch": 9.29, "grad_norm": 32.658966064453125, "learning_rate": 1.422924901185771e-06, "loss": 0.4811, "step": 49350 }, { "epoch": 9.29, "grad_norm": 0.4044090509414673, "learning_rate": 1.4191605495953323e-06, "loss": 0.3344, "step": 49360 }, { "epoch": 9.29, "grad_norm": 25.323366165161133, "learning_rate": 1.415396198004894e-06, "loss": 0.2475, "step": 49370 }, { "epoch": 9.29, "grad_norm": 10.626187324523926, "learning_rate": 1.4116318464144552e-06, "loss": 0.4904, "step": 49380 }, { "epoch": 9.3, "grad_norm": 11.54336166381836, "learning_rate": 1.4078674948240167e-06, "loss": 0.4524, "step": 49390 }, { "epoch": 9.3, "grad_norm": 26.418764114379883, "learning_rate": 1.4041031432335781e-06, "loss": 0.5615, "step": 49400 }, { "epoch": 9.3, "grad_norm": 3.740413188934326, "learning_rate": 1.4003387916431396e-06, "loss": 0.1174, "step": 49410 }, { "epoch": 9.3, "grad_norm": 15.656806945800781, "learning_rate": 1.3965744400527008e-06, "loss": 0.5396, "step": 49420 }, { "epoch": 9.3, "grad_norm": 0.4741308391094208, "learning_rate": 1.3928100884622625e-06, "loss": 0.5084, "step": 49430 }, { "epoch": 9.31, "grad_norm": 3.0225412845611572, "learning_rate": 1.3890457368718238e-06, "loss": 0.4074, "step": 49440 }, { "epoch": 9.31, "grad_norm": 0.20880745351314545, "learning_rate": 1.3852813852813854e-06, "loss": 0.2169, "step": 49450 }, { "epoch": 9.31, "grad_norm": 0.055619094520807266, "learning_rate": 1.3815170336909469e-06, "loss": 0.2984, "step": 49460 }, { "epoch": 9.31, "grad_norm": 0.15215404331684113, "learning_rate": 1.3777526821005083e-06, "loss": 0.2482, "step": 49470 }, { "epoch": 9.31, "grad_norm": 5.14688777923584, "learning_rate": 1.3739883305100698e-06, "loss": 0.285, "step": 49480 }, { "epoch": 9.31, "grad_norm": 19.21902084350586, "learning_rate": 1.370223978919631e-06, "loss": 0.4919, "step": 49490 }, { "epoch": 9.32, "grad_norm": 23.251541137695312, "learning_rate": 1.3664596273291927e-06, "loss": 0.4524, "step": 49500 }, { "epoch": 9.32, "grad_norm": 5.6620588302612305, "learning_rate": 1.362695275738754e-06, "loss": 0.3456, "step": 49510 }, { "epoch": 9.32, "grad_norm": 0.2966133654117584, "learning_rate": 1.3589309241483157e-06, "loss": 0.4254, "step": 49520 }, { "epoch": 9.32, "grad_norm": 1.9271348714828491, "learning_rate": 1.355166572557877e-06, "loss": 0.75, "step": 49530 }, { "epoch": 9.32, "grad_norm": 47.30406188964844, "learning_rate": 1.3514022209674386e-06, "loss": 0.2837, "step": 49540 }, { "epoch": 9.33, "grad_norm": 6.934784889221191, "learning_rate": 1.3476378693769998e-06, "loss": 0.1063, "step": 49550 }, { "epoch": 9.33, "grad_norm": 0.4049472212791443, "learning_rate": 1.3438735177865615e-06, "loss": 0.2103, "step": 49560 }, { "epoch": 9.33, "grad_norm": 8.255983352661133, "learning_rate": 1.3401091661961227e-06, "loss": 0.4509, "step": 49570 }, { "epoch": 9.33, "grad_norm": 14.6072416305542, "learning_rate": 1.3363448146056842e-06, "loss": 0.6148, "step": 49580 }, { "epoch": 9.33, "grad_norm": 0.24846920371055603, "learning_rate": 1.3325804630152459e-06, "loss": 0.2994, "step": 49590 }, { "epoch": 9.34, "grad_norm": 10.933160781860352, "learning_rate": 1.3288161114248071e-06, "loss": 0.3762, "step": 49600 }, { "epoch": 9.34, "grad_norm": 19.94293975830078, "learning_rate": 1.3250517598343688e-06, "loss": 0.6265, "step": 49610 }, { "epoch": 9.34, "grad_norm": 0.2474469691514969, "learning_rate": 1.32128740824393e-06, "loss": 0.5879, "step": 49620 }, { "epoch": 9.34, "grad_norm": 4.398937225341797, "learning_rate": 1.3175230566534917e-06, "loss": 0.4399, "step": 49630 }, { "epoch": 9.34, "grad_norm": 0.6015844941139221, "learning_rate": 1.313758705063053e-06, "loss": 0.2968, "step": 49640 }, { "epoch": 9.35, "grad_norm": 0.09437728673219681, "learning_rate": 1.3099943534726144e-06, "loss": 0.4135, "step": 49650 }, { "epoch": 9.35, "grad_norm": 26.453380584716797, "learning_rate": 1.3062300018821759e-06, "loss": 0.3049, "step": 49660 }, { "epoch": 9.35, "grad_norm": 1.22520911693573, "learning_rate": 1.3024656502917373e-06, "loss": 0.4276, "step": 49670 }, { "epoch": 9.35, "grad_norm": 26.06831169128418, "learning_rate": 1.2987012987012986e-06, "loss": 0.4042, "step": 49680 }, { "epoch": 9.35, "grad_norm": 34.11174392700195, "learning_rate": 1.2949369471108603e-06, "loss": 0.2479, "step": 49690 }, { "epoch": 9.35, "grad_norm": 6.261451721191406, "learning_rate": 1.291172595520422e-06, "loss": 0.4995, "step": 49700 }, { "epoch": 9.36, "grad_norm": 25.64111328125, "learning_rate": 1.2874082439299832e-06, "loss": 0.4061, "step": 49710 }, { "epoch": 9.36, "grad_norm": 12.963571548461914, "learning_rate": 1.2836438923395447e-06, "loss": 0.3339, "step": 49720 }, { "epoch": 9.36, "grad_norm": 22.063447952270508, "learning_rate": 1.2798795407491061e-06, "loss": 0.5355, "step": 49730 }, { "epoch": 9.36, "grad_norm": 5.219394207000732, "learning_rate": 1.2761151891586676e-06, "loss": 0.3802, "step": 49740 }, { "epoch": 9.36, "grad_norm": 1.1139286756515503, "learning_rate": 1.2723508375682288e-06, "loss": 0.1823, "step": 49750 }, { "epoch": 9.37, "grad_norm": 36.65229034423828, "learning_rate": 1.2685864859777905e-06, "loss": 0.4732, "step": 49760 }, { "epoch": 9.37, "grad_norm": 1.4162871837615967, "learning_rate": 1.2648221343873517e-06, "loss": 0.255, "step": 49770 }, { "epoch": 9.37, "grad_norm": 1.9202029705047607, "learning_rate": 1.2610577827969134e-06, "loss": 0.5858, "step": 49780 }, { "epoch": 9.37, "grad_norm": 0.02834530733525753, "learning_rate": 1.2572934312064747e-06, "loss": 0.5202, "step": 49790 }, { "epoch": 9.37, "grad_norm": 8.294178009033203, "learning_rate": 1.2535290796160363e-06, "loss": 0.1606, "step": 49800 }, { "epoch": 9.38, "grad_norm": 10.036473274230957, "learning_rate": 1.2497647280255978e-06, "loss": 0.2661, "step": 49810 }, { "epoch": 9.38, "grad_norm": 0.019691968336701393, "learning_rate": 1.246000376435159e-06, "loss": 0.4195, "step": 49820 }, { "epoch": 9.38, "grad_norm": 14.037164688110352, "learning_rate": 1.2422360248447205e-06, "loss": 0.3664, "step": 49830 }, { "epoch": 9.38, "grad_norm": 20.38290786743164, "learning_rate": 1.238471673254282e-06, "loss": 0.4841, "step": 49840 }, { "epoch": 9.38, "grad_norm": 6.970554351806641, "learning_rate": 1.2347073216638434e-06, "loss": 0.2273, "step": 49850 }, { "epoch": 9.38, "grad_norm": 22.626224517822266, "learning_rate": 1.2309429700734049e-06, "loss": 0.482, "step": 49860 }, { "epoch": 9.39, "grad_norm": 17.559032440185547, "learning_rate": 1.2271786184829664e-06, "loss": 0.4027, "step": 49870 }, { "epoch": 9.39, "grad_norm": 7.236875057220459, "learning_rate": 1.223414266892528e-06, "loss": 0.278, "step": 49880 }, { "epoch": 9.39, "grad_norm": 10.33447265625, "learning_rate": 1.2196499153020893e-06, "loss": 0.8987, "step": 49890 }, { "epoch": 9.39, "grad_norm": 15.287224769592285, "learning_rate": 1.2158855637116507e-06, "loss": 0.4645, "step": 49900 }, { "epoch": 9.39, "grad_norm": 0.09853526204824448, "learning_rate": 1.2121212121212122e-06, "loss": 0.7842, "step": 49910 }, { "epoch": 9.4, "grad_norm": 7.803390979766846, "learning_rate": 1.2083568605307737e-06, "loss": 0.5503, "step": 49920 }, { "epoch": 9.4, "grad_norm": 34.554622650146484, "learning_rate": 1.2045925089403351e-06, "loss": 0.3182, "step": 49930 }, { "epoch": 9.4, "grad_norm": 0.17311842739582062, "learning_rate": 1.2008281573498966e-06, "loss": 0.5634, "step": 49940 }, { "epoch": 9.4, "grad_norm": 0.07609983533620834, "learning_rate": 1.197063805759458e-06, "loss": 0.3777, "step": 49950 }, { "epoch": 9.4, "grad_norm": 0.10443427413702011, "learning_rate": 1.1932994541690195e-06, "loss": 0.3804, "step": 49960 }, { "epoch": 9.41, "grad_norm": 1.417898178100586, "learning_rate": 1.189535102578581e-06, "loss": 0.3145, "step": 49970 }, { "epoch": 9.41, "grad_norm": 0.049046795815229416, "learning_rate": 1.1857707509881424e-06, "loss": 0.5672, "step": 49980 }, { "epoch": 9.41, "grad_norm": 0.9617329835891724, "learning_rate": 1.1820063993977039e-06, "loss": 0.5691, "step": 49990 }, { "epoch": 9.41, "grad_norm": 10.128259658813477, "learning_rate": 1.1782420478072651e-06, "loss": 0.5053, "step": 50000 }, { "epoch": 9.41, "grad_norm": 5.14787483215332, "learning_rate": 1.1744776962168268e-06, "loss": 0.5338, "step": 50010 }, { "epoch": 9.41, "grad_norm": 0.5360021591186523, "learning_rate": 1.1707133446263883e-06, "loss": 0.3056, "step": 50020 }, { "epoch": 9.42, "grad_norm": 10.969987869262695, "learning_rate": 1.1669489930359497e-06, "loss": 0.4361, "step": 50030 }, { "epoch": 9.42, "grad_norm": 26.70008659362793, "learning_rate": 1.1631846414455112e-06, "loss": 0.3068, "step": 50040 }, { "epoch": 9.42, "grad_norm": 10.523760795593262, "learning_rate": 1.1594202898550726e-06, "loss": 0.3849, "step": 50050 }, { "epoch": 9.42, "grad_norm": 10.677010536193848, "learning_rate": 1.155655938264634e-06, "loss": 0.4415, "step": 50060 }, { "epoch": 9.42, "grad_norm": 7.661884307861328, "learning_rate": 1.1518915866741954e-06, "loss": 0.0457, "step": 50070 }, { "epoch": 9.43, "grad_norm": 15.285215377807617, "learning_rate": 1.1481272350837568e-06, "loss": 0.2535, "step": 50080 }, { "epoch": 9.43, "grad_norm": 42.20638656616211, "learning_rate": 1.1443628834933183e-06, "loss": 0.4552, "step": 50090 }, { "epoch": 9.43, "grad_norm": 35.964088439941406, "learning_rate": 1.1405985319028797e-06, "loss": 0.589, "step": 50100 }, { "epoch": 9.43, "grad_norm": 2.185014486312866, "learning_rate": 1.1368341803124412e-06, "loss": 0.6969, "step": 50110 }, { "epoch": 9.43, "grad_norm": 0.04032859951257706, "learning_rate": 1.1330698287220027e-06, "loss": 0.3934, "step": 50120 }, { "epoch": 9.44, "grad_norm": 62.85636520385742, "learning_rate": 1.1293054771315643e-06, "loss": 0.4421, "step": 50130 }, { "epoch": 9.44, "grad_norm": 11.45727252960205, "learning_rate": 1.1255411255411256e-06, "loss": 0.5068, "step": 50140 }, { "epoch": 9.44, "grad_norm": 0.05326225981116295, "learning_rate": 1.121776773950687e-06, "loss": 0.3979, "step": 50150 }, { "epoch": 9.44, "grad_norm": 16.516693115234375, "learning_rate": 1.1180124223602485e-06, "loss": 0.2904, "step": 50160 }, { "epoch": 9.44, "grad_norm": 20.180761337280273, "learning_rate": 1.11424807076981e-06, "loss": 0.3479, "step": 50170 }, { "epoch": 9.44, "grad_norm": 7.504915714263916, "learning_rate": 1.1104837191793714e-06, "loss": 0.4464, "step": 50180 }, { "epoch": 9.45, "grad_norm": 8.153180122375488, "learning_rate": 1.1067193675889329e-06, "loss": 0.4644, "step": 50190 }, { "epoch": 9.45, "grad_norm": 0.3960212171077728, "learning_rate": 1.1029550159984943e-06, "loss": 0.3971, "step": 50200 }, { "epoch": 9.45, "grad_norm": 20.968027114868164, "learning_rate": 1.0991906644080558e-06, "loss": 0.2904, "step": 50210 }, { "epoch": 9.45, "grad_norm": 28.1608943939209, "learning_rate": 1.0954263128176173e-06, "loss": 0.3588, "step": 50220 }, { "epoch": 9.45, "grad_norm": 15.617292404174805, "learning_rate": 1.0916619612271787e-06, "loss": 0.4479, "step": 50230 }, { "epoch": 9.46, "grad_norm": 0.6835996508598328, "learning_rate": 1.0878976096367402e-06, "loss": 0.3493, "step": 50240 }, { "epoch": 9.46, "grad_norm": 11.191716194152832, "learning_rate": 1.0841332580463016e-06, "loss": 0.3882, "step": 50250 }, { "epoch": 9.46, "grad_norm": 4.108872890472412, "learning_rate": 1.0803689064558631e-06, "loss": 0.4316, "step": 50260 }, { "epoch": 9.46, "grad_norm": 6.614520072937012, "learning_rate": 1.0766045548654246e-06, "loss": 0.7346, "step": 50270 }, { "epoch": 9.46, "grad_norm": 11.33967113494873, "learning_rate": 1.072840203274986e-06, "loss": 0.5321, "step": 50280 }, { "epoch": 9.47, "grad_norm": 4.6037492752075195, "learning_rate": 1.0690758516845475e-06, "loss": 0.2836, "step": 50290 }, { "epoch": 9.47, "grad_norm": 13.846996307373047, "learning_rate": 1.065311500094109e-06, "loss": 0.5464, "step": 50300 }, { "epoch": 9.47, "grad_norm": 0.2525990307331085, "learning_rate": 1.0615471485036704e-06, "loss": 0.3565, "step": 50310 }, { "epoch": 9.47, "grad_norm": 15.951204299926758, "learning_rate": 1.0577827969132319e-06, "loss": 0.4183, "step": 50320 }, { "epoch": 9.47, "grad_norm": 5.232489585876465, "learning_rate": 1.0540184453227931e-06, "loss": 0.3319, "step": 50330 }, { "epoch": 9.47, "grad_norm": 0.023767894133925438, "learning_rate": 1.0502540937323546e-06, "loss": 0.5855, "step": 50340 }, { "epoch": 9.48, "grad_norm": 0.39152446389198303, "learning_rate": 1.046489742141916e-06, "loss": 0.3209, "step": 50350 }, { "epoch": 9.48, "grad_norm": 15.136631965637207, "learning_rate": 1.0427253905514775e-06, "loss": 0.4202, "step": 50360 }, { "epoch": 9.48, "grad_norm": 5.339528560638428, "learning_rate": 1.0389610389610392e-06, "loss": 0.2623, "step": 50370 }, { "epoch": 9.48, "grad_norm": 5.4745097160339355, "learning_rate": 1.0351966873706006e-06, "loss": 0.317, "step": 50380 }, { "epoch": 9.48, "grad_norm": 6.607236385345459, "learning_rate": 1.031432335780162e-06, "loss": 0.4808, "step": 50390 }, { "epoch": 9.49, "grad_norm": 16.006132125854492, "learning_rate": 1.0276679841897233e-06, "loss": 0.195, "step": 50400 }, { "epoch": 9.49, "grad_norm": 3.6388096809387207, "learning_rate": 1.0239036325992848e-06, "loss": 0.3289, "step": 50410 }, { "epoch": 9.49, "grad_norm": 11.516218185424805, "learning_rate": 1.0201392810088463e-06, "loss": 0.4734, "step": 50420 }, { "epoch": 9.49, "grad_norm": 30.199953079223633, "learning_rate": 1.0163749294184077e-06, "loss": 0.5322, "step": 50430 }, { "epoch": 9.49, "grad_norm": 2.062737464904785, "learning_rate": 1.0126105778279692e-06, "loss": 0.2732, "step": 50440 }, { "epoch": 9.5, "grad_norm": 0.2747786343097687, "learning_rate": 1.0088462262375306e-06, "loss": 0.389, "step": 50450 }, { "epoch": 9.5, "grad_norm": 23.68992042541504, "learning_rate": 1.0050818746470921e-06, "loss": 0.7078, "step": 50460 }, { "epoch": 9.5, "grad_norm": 10.194795608520508, "learning_rate": 1.0013175230566536e-06, "loss": 0.3264, "step": 50470 }, { "epoch": 9.5, "grad_norm": 21.266271591186523, "learning_rate": 9.97553171466215e-07, "loss": 0.5062, "step": 50480 }, { "epoch": 9.5, "grad_norm": 19.471515655517578, "learning_rate": 9.937888198757765e-07, "loss": 0.5086, "step": 50490 }, { "epoch": 9.5, "grad_norm": 17.782821655273438, "learning_rate": 9.90024468285338e-07, "loss": 0.4353, "step": 50500 }, { "epoch": 9.51, "grad_norm": 12.496779441833496, "learning_rate": 9.862601166948994e-07, "loss": 0.4012, "step": 50510 }, { "epoch": 9.51, "grad_norm": 36.61666488647461, "learning_rate": 9.824957651044609e-07, "loss": 0.6408, "step": 50520 }, { "epoch": 9.51, "grad_norm": 21.555734634399414, "learning_rate": 9.787314135140223e-07, "loss": 0.3065, "step": 50530 }, { "epoch": 9.51, "grad_norm": 19.377365112304688, "learning_rate": 9.749670619235838e-07, "loss": 0.249, "step": 50540 }, { "epoch": 9.51, "grad_norm": 20.095693588256836, "learning_rate": 9.712027103331453e-07, "loss": 0.3249, "step": 50550 }, { "epoch": 9.52, "grad_norm": 18.395681381225586, "learning_rate": 9.674383587427067e-07, "loss": 0.3726, "step": 50560 }, { "epoch": 9.52, "grad_norm": 0.07585670053958893, "learning_rate": 9.636740071522682e-07, "loss": 0.5645, "step": 50570 }, { "epoch": 9.52, "grad_norm": 19.90555763244629, "learning_rate": 9.599096555618294e-07, "loss": 0.4348, "step": 50580 }, { "epoch": 9.52, "grad_norm": 12.60806655883789, "learning_rate": 9.561453039713909e-07, "loss": 0.5422, "step": 50590 }, { "epoch": 9.52, "grad_norm": 6.923961162567139, "learning_rate": 9.523809523809525e-07, "loss": 0.5583, "step": 50600 }, { "epoch": 9.53, "grad_norm": 27.019386291503906, "learning_rate": 9.486166007905138e-07, "loss": 0.4477, "step": 50610 }, { "epoch": 9.53, "grad_norm": 22.43336296081543, "learning_rate": 9.448522492000754e-07, "loss": 0.2844, "step": 50620 }, { "epoch": 9.53, "grad_norm": 22.106285095214844, "learning_rate": 9.410878976096368e-07, "loss": 0.4775, "step": 50630 }, { "epoch": 9.53, "grad_norm": 14.707420349121094, "learning_rate": 9.373235460191983e-07, "loss": 0.3398, "step": 50640 }, { "epoch": 9.53, "grad_norm": 3.7978923320770264, "learning_rate": 9.335591944287598e-07, "loss": 0.3592, "step": 50650 }, { "epoch": 9.54, "grad_norm": 16.250072479248047, "learning_rate": 9.297948428383212e-07, "loss": 0.5696, "step": 50660 }, { "epoch": 9.54, "grad_norm": 5.815196990966797, "learning_rate": 9.260304912478827e-07, "loss": 0.2838, "step": 50670 }, { "epoch": 9.54, "grad_norm": 0.15209926664829254, "learning_rate": 9.22266139657444e-07, "loss": 0.4138, "step": 50680 }, { "epoch": 9.54, "grad_norm": 1.1849756240844727, "learning_rate": 9.185017880670055e-07, "loss": 0.5292, "step": 50690 }, { "epoch": 9.54, "grad_norm": 22.73065948486328, "learning_rate": 9.14737436476567e-07, "loss": 0.367, "step": 50700 }, { "epoch": 9.54, "grad_norm": 5.767063140869141, "learning_rate": 9.109730848861284e-07, "loss": 0.2446, "step": 50710 }, { "epoch": 9.55, "grad_norm": 0.5526207089424133, "learning_rate": 9.072087332956899e-07, "loss": 0.3764, "step": 50720 }, { "epoch": 9.55, "grad_norm": 13.495482444763184, "learning_rate": 9.034443817052512e-07, "loss": 0.4672, "step": 50730 }, { "epoch": 9.55, "grad_norm": 3.9866769313812256, "learning_rate": 8.996800301148129e-07, "loss": 0.5576, "step": 50740 }, { "epoch": 9.55, "grad_norm": 25.318235397338867, "learning_rate": 8.959156785243743e-07, "loss": 0.3028, "step": 50750 }, { "epoch": 9.55, "grad_norm": 7.270644187927246, "learning_rate": 8.921513269339357e-07, "loss": 0.4609, "step": 50760 }, { "epoch": 9.56, "grad_norm": 1.1293132305145264, "learning_rate": 8.883869753434972e-07, "loss": 0.3103, "step": 50770 }, { "epoch": 9.56, "grad_norm": 2.1888303756713867, "learning_rate": 8.846226237530586e-07, "loss": 0.151, "step": 50780 }, { "epoch": 9.56, "grad_norm": 16.56170082092285, "learning_rate": 8.808582721626201e-07, "loss": 0.4869, "step": 50790 }, { "epoch": 9.56, "grad_norm": 15.505925178527832, "learning_rate": 8.770939205721816e-07, "loss": 0.6466, "step": 50800 }, { "epoch": 9.56, "grad_norm": 37.347965240478516, "learning_rate": 8.733295689817429e-07, "loss": 0.5309, "step": 50810 }, { "epoch": 9.57, "grad_norm": 9.028761863708496, "learning_rate": 8.695652173913044e-07, "loss": 0.4925, "step": 50820 }, { "epoch": 9.57, "grad_norm": 18.40884780883789, "learning_rate": 8.658008658008658e-07, "loss": 0.4293, "step": 50830 }, { "epoch": 9.57, "grad_norm": 20.18854331970215, "learning_rate": 8.620365142104273e-07, "loss": 0.4924, "step": 50840 }, { "epoch": 9.57, "grad_norm": 17.72684097290039, "learning_rate": 8.582721626199888e-07, "loss": 0.3837, "step": 50850 }, { "epoch": 9.57, "grad_norm": 8.206231117248535, "learning_rate": 8.545078110295501e-07, "loss": 0.463, "step": 50860 }, { "epoch": 9.57, "grad_norm": 0.24733124673366547, "learning_rate": 8.507434594391118e-07, "loss": 0.5556, "step": 50870 }, { "epoch": 9.58, "grad_norm": 21.102048873901367, "learning_rate": 8.469791078486731e-07, "loss": 0.3003, "step": 50880 }, { "epoch": 9.58, "grad_norm": 9.614502906799316, "learning_rate": 8.432147562582346e-07, "loss": 0.4534, "step": 50890 }, { "epoch": 9.58, "grad_norm": 0.05175158008933067, "learning_rate": 8.394504046677961e-07, "loss": 0.3368, "step": 50900 }, { "epoch": 9.58, "grad_norm": 18.473926544189453, "learning_rate": 8.356860530773575e-07, "loss": 0.6013, "step": 50910 }, { "epoch": 9.58, "grad_norm": 18.408864974975586, "learning_rate": 8.31921701486919e-07, "loss": 0.4173, "step": 50920 }, { "epoch": 9.59, "grad_norm": 27.239898681640625, "learning_rate": 8.281573498964803e-07, "loss": 0.4574, "step": 50930 }, { "epoch": 9.59, "grad_norm": 6.1605753898620605, "learning_rate": 8.243929983060418e-07, "loss": 0.6228, "step": 50940 }, { "epoch": 9.59, "grad_norm": 4.264019012451172, "learning_rate": 8.206286467156033e-07, "loss": 0.2077, "step": 50950 }, { "epoch": 9.59, "grad_norm": 9.811421394348145, "learning_rate": 8.168642951251647e-07, "loss": 0.4051, "step": 50960 }, { "epoch": 9.59, "grad_norm": 5.31566858291626, "learning_rate": 8.130999435347262e-07, "loss": 0.37, "step": 50970 }, { "epoch": 9.6, "grad_norm": 0.32057228684425354, "learning_rate": 8.093355919442876e-07, "loss": 0.6192, "step": 50980 }, { "epoch": 9.6, "grad_norm": 5.027352809906006, "learning_rate": 8.055712403538492e-07, "loss": 0.7235, "step": 50990 }, { "epoch": 9.6, "grad_norm": 0.16423563659191132, "learning_rate": 8.018068887634107e-07, "loss": 0.3614, "step": 51000 }, { "epoch": 9.6, "grad_norm": 7.00469970703125, "learning_rate": 7.98042537172972e-07, "loss": 0.2669, "step": 51010 }, { "epoch": 9.6, "grad_norm": 13.766214370727539, "learning_rate": 7.942781855825335e-07, "loss": 0.3001, "step": 51020 }, { "epoch": 9.6, "grad_norm": 15.029500007629395, "learning_rate": 7.905138339920949e-07, "loss": 0.5506, "step": 51030 }, { "epoch": 9.61, "grad_norm": 18.616168975830078, "learning_rate": 7.867494824016564e-07, "loss": 0.8327, "step": 51040 }, { "epoch": 9.61, "grad_norm": 1.1364197731018066, "learning_rate": 7.829851308112179e-07, "loss": 0.3129, "step": 51050 }, { "epoch": 9.61, "grad_norm": 29.443477630615234, "learning_rate": 7.792207792207792e-07, "loss": 0.5467, "step": 51060 }, { "epoch": 9.61, "grad_norm": 34.73600387573242, "learning_rate": 7.754564276303407e-07, "loss": 0.4792, "step": 51070 }, { "epoch": 9.61, "grad_norm": 0.39562925696372986, "learning_rate": 7.716920760399021e-07, "loss": 0.4972, "step": 51080 }, { "epoch": 9.62, "grad_norm": 21.628002166748047, "learning_rate": 7.679277244494636e-07, "loss": 0.6476, "step": 51090 }, { "epoch": 9.62, "grad_norm": 0.8809748888015747, "learning_rate": 7.641633728590251e-07, "loss": 0.5343, "step": 51100 }, { "epoch": 9.62, "grad_norm": 28.88115882873535, "learning_rate": 7.603990212685866e-07, "loss": 0.3758, "step": 51110 }, { "epoch": 9.62, "grad_norm": 8.242012977600098, "learning_rate": 7.566346696781481e-07, "loss": 0.3154, "step": 51120 }, { "epoch": 9.62, "grad_norm": 0.03372791409492493, "learning_rate": 7.528703180877094e-07, "loss": 0.4118, "step": 51130 }, { "epoch": 9.63, "grad_norm": 3.514137029647827, "learning_rate": 7.491059664972709e-07, "loss": 0.3894, "step": 51140 }, { "epoch": 9.63, "grad_norm": 0.49889636039733887, "learning_rate": 7.453416149068324e-07, "loss": 0.3594, "step": 51150 }, { "epoch": 9.63, "grad_norm": 0.3000124394893646, "learning_rate": 7.415772633163938e-07, "loss": 0.3072, "step": 51160 }, { "epoch": 9.63, "grad_norm": 0.06081683188676834, "learning_rate": 7.378129117259553e-07, "loss": 0.3226, "step": 51170 }, { "epoch": 9.63, "grad_norm": 5.96708345413208, "learning_rate": 7.340485601355168e-07, "loss": 0.4402, "step": 51180 }, { "epoch": 9.63, "grad_norm": 0.35737451910972595, "learning_rate": 7.302842085450781e-07, "loss": 0.7907, "step": 51190 }, { "epoch": 9.64, "grad_norm": 4.599313735961914, "learning_rate": 7.265198569546396e-07, "loss": 0.1227, "step": 51200 }, { "epoch": 9.64, "grad_norm": 4.307950973510742, "learning_rate": 7.22755505364201e-07, "loss": 0.2469, "step": 51210 }, { "epoch": 9.64, "grad_norm": 22.732894897460938, "learning_rate": 7.189911537737625e-07, "loss": 0.4584, "step": 51220 }, { "epoch": 9.64, "grad_norm": 21.445600509643555, "learning_rate": 7.152268021833241e-07, "loss": 0.355, "step": 51230 }, { "epoch": 9.64, "grad_norm": 7.818153381347656, "learning_rate": 7.114624505928855e-07, "loss": 0.3512, "step": 51240 }, { "epoch": 9.65, "grad_norm": 11.798916816711426, "learning_rate": 7.07698099002447e-07, "loss": 0.4811, "step": 51250 }, { "epoch": 9.65, "grad_norm": 21.753093719482422, "learning_rate": 7.039337474120083e-07, "loss": 0.3904, "step": 51260 }, { "epoch": 9.65, "grad_norm": 9.064457893371582, "learning_rate": 7.001693958215698e-07, "loss": 0.3594, "step": 51270 }, { "epoch": 9.65, "grad_norm": 0.17999929189682007, "learning_rate": 6.964050442311313e-07, "loss": 0.5093, "step": 51280 }, { "epoch": 9.65, "grad_norm": 27.64767837524414, "learning_rate": 6.926406926406927e-07, "loss": 0.4248, "step": 51290 }, { "epoch": 9.66, "grad_norm": 13.119711875915527, "learning_rate": 6.888763410502542e-07, "loss": 0.2889, "step": 51300 }, { "epoch": 9.66, "grad_norm": 17.546010971069336, "learning_rate": 6.851119894598155e-07, "loss": 0.402, "step": 51310 }, { "epoch": 9.66, "grad_norm": 3.10264253616333, "learning_rate": 6.81347637869377e-07, "loss": 0.3537, "step": 51320 }, { "epoch": 9.66, "grad_norm": 34.127784729003906, "learning_rate": 6.775832862789384e-07, "loss": 0.4739, "step": 51330 }, { "epoch": 9.66, "grad_norm": 20.95803451538086, "learning_rate": 6.738189346884999e-07, "loss": 0.2397, "step": 51340 }, { "epoch": 9.66, "grad_norm": 20.92197036743164, "learning_rate": 6.700545830980614e-07, "loss": 0.7235, "step": 51350 }, { "epoch": 9.67, "grad_norm": 0.09385684132575989, "learning_rate": 6.662902315076229e-07, "loss": 0.612, "step": 51360 }, { "epoch": 9.67, "grad_norm": 11.531614303588867, "learning_rate": 6.625258799171844e-07, "loss": 0.1948, "step": 51370 }, { "epoch": 9.67, "grad_norm": 15.86019515991211, "learning_rate": 6.587615283267459e-07, "loss": 0.2973, "step": 51380 }, { "epoch": 9.67, "grad_norm": 12.996696472167969, "learning_rate": 6.549971767363072e-07, "loss": 0.3127, "step": 51390 }, { "epoch": 9.67, "grad_norm": 12.861435890197754, "learning_rate": 6.512328251458687e-07, "loss": 0.5937, "step": 51400 }, { "epoch": 9.68, "grad_norm": 8.911890029907227, "learning_rate": 6.474684735554301e-07, "loss": 0.364, "step": 51410 }, { "epoch": 9.68, "grad_norm": 24.71923828125, "learning_rate": 6.437041219649916e-07, "loss": 0.6153, "step": 51420 }, { "epoch": 9.68, "grad_norm": 17.451587677001953, "learning_rate": 6.399397703745531e-07, "loss": 0.6571, "step": 51430 }, { "epoch": 9.68, "grad_norm": 20.86277198791504, "learning_rate": 6.361754187841144e-07, "loss": 0.3892, "step": 51440 }, { "epoch": 9.68, "grad_norm": 32.68037414550781, "learning_rate": 6.324110671936759e-07, "loss": 0.2615, "step": 51450 }, { "epoch": 9.69, "grad_norm": 25.63720703125, "learning_rate": 6.286467156032373e-07, "loss": 0.3267, "step": 51460 }, { "epoch": 9.69, "grad_norm": 5.5767645835876465, "learning_rate": 6.248823640127989e-07, "loss": 0.2016, "step": 51470 }, { "epoch": 9.69, "grad_norm": 27.157485961914062, "learning_rate": 6.211180124223603e-07, "loss": 0.6425, "step": 51480 }, { "epoch": 9.69, "grad_norm": 8.377914428710938, "learning_rate": 6.173536608319217e-07, "loss": 0.2027, "step": 51490 }, { "epoch": 9.69, "grad_norm": 21.33051300048828, "learning_rate": 6.135893092414832e-07, "loss": 0.2035, "step": 51500 }, { "epoch": 9.7, "grad_norm": 17.3638916015625, "learning_rate": 6.098249576510446e-07, "loss": 0.1943, "step": 51510 }, { "epoch": 9.7, "grad_norm": 0.3384857475757599, "learning_rate": 6.060606060606061e-07, "loss": 0.1849, "step": 51520 }, { "epoch": 9.7, "grad_norm": 1.3415716886520386, "learning_rate": 6.022962544701676e-07, "loss": 0.5632, "step": 51530 }, { "epoch": 9.7, "grad_norm": 30.25799560546875, "learning_rate": 5.98531902879729e-07, "loss": 0.43, "step": 51540 }, { "epoch": 9.7, "grad_norm": 32.7429084777832, "learning_rate": 5.947675512892905e-07, "loss": 0.4043, "step": 51550 }, { "epoch": 9.7, "grad_norm": 17.513532638549805, "learning_rate": 5.910031996988519e-07, "loss": 0.4679, "step": 51560 }, { "epoch": 9.71, "grad_norm": 25.66455841064453, "learning_rate": 5.872388481084134e-07, "loss": 0.3619, "step": 51570 }, { "epoch": 9.71, "grad_norm": 20.97486114501953, "learning_rate": 5.834744965179749e-07, "loss": 0.6867, "step": 51580 }, { "epoch": 9.71, "grad_norm": 7.469673156738281, "learning_rate": 5.797101449275363e-07, "loss": 0.3848, "step": 51590 }, { "epoch": 9.71, "grad_norm": 32.72283935546875, "learning_rate": 5.759457933370977e-07, "loss": 0.3702, "step": 51600 }, { "epoch": 9.71, "grad_norm": 7.483311176300049, "learning_rate": 5.721814417466591e-07, "loss": 0.29, "step": 51610 }, { "epoch": 9.72, "grad_norm": 12.17415714263916, "learning_rate": 5.684170901562206e-07, "loss": 0.3262, "step": 51620 }, { "epoch": 9.72, "grad_norm": 18.75547218322754, "learning_rate": 5.646527385657822e-07, "loss": 0.4394, "step": 51630 }, { "epoch": 9.72, "grad_norm": 12.477697372436523, "learning_rate": 5.608883869753435e-07, "loss": 0.4251, "step": 51640 }, { "epoch": 9.72, "grad_norm": 15.879676818847656, "learning_rate": 5.57124035384905e-07, "loss": 0.4689, "step": 51650 }, { "epoch": 9.72, "grad_norm": 31.482437133789062, "learning_rate": 5.533596837944664e-07, "loss": 0.525, "step": 51660 }, { "epoch": 9.73, "grad_norm": 9.857966423034668, "learning_rate": 5.495953322040279e-07, "loss": 0.313, "step": 51670 }, { "epoch": 9.73, "grad_norm": 0.783555269241333, "learning_rate": 5.458309806135894e-07, "loss": 0.5831, "step": 51680 }, { "epoch": 9.73, "grad_norm": 0.14815597236156464, "learning_rate": 5.420666290231508e-07, "loss": 0.5301, "step": 51690 }, { "epoch": 9.73, "grad_norm": 4.29640531539917, "learning_rate": 5.383022774327123e-07, "loss": 0.4009, "step": 51700 }, { "epoch": 9.73, "grad_norm": 14.452727317810059, "learning_rate": 5.345379258422737e-07, "loss": 0.3563, "step": 51710 }, { "epoch": 9.73, "grad_norm": 1.4909330606460571, "learning_rate": 5.307735742518352e-07, "loss": 0.3447, "step": 51720 }, { "epoch": 9.74, "grad_norm": 0.06365931034088135, "learning_rate": 5.270092226613966e-07, "loss": 0.1957, "step": 51730 }, { "epoch": 9.74, "grad_norm": 1.6421184539794922, "learning_rate": 5.23244871070958e-07, "loss": 0.5981, "step": 51740 }, { "epoch": 9.74, "grad_norm": 0.5028958320617676, "learning_rate": 5.194805194805196e-07, "loss": 0.3268, "step": 51750 }, { "epoch": 9.74, "grad_norm": 4.667960166931152, "learning_rate": 5.15716167890081e-07, "loss": 0.1761, "step": 51760 }, { "epoch": 9.74, "grad_norm": 1.6822726726531982, "learning_rate": 5.119518162996424e-07, "loss": 0.5087, "step": 51770 }, { "epoch": 9.75, "grad_norm": 0.5095930099487305, "learning_rate": 5.081874647092039e-07, "loss": 0.6654, "step": 51780 }, { "epoch": 9.75, "grad_norm": 0.034996677190065384, "learning_rate": 5.044231131187653e-07, "loss": 0.22, "step": 51790 }, { "epoch": 9.75, "grad_norm": 0.16648948192596436, "learning_rate": 5.006587615283268e-07, "loss": 0.1263, "step": 51800 }, { "epoch": 9.75, "grad_norm": 32.999053955078125, "learning_rate": 4.968944099378882e-07, "loss": 0.2778, "step": 51810 }, { "epoch": 9.75, "grad_norm": 54.918148040771484, "learning_rate": 4.931300583474497e-07, "loss": 0.5931, "step": 51820 }, { "epoch": 9.76, "grad_norm": 28.86396026611328, "learning_rate": 4.893657067570112e-07, "loss": 0.2479, "step": 51830 }, { "epoch": 9.76, "grad_norm": 33.95524597167969, "learning_rate": 4.856013551665726e-07, "loss": 0.2287, "step": 51840 }, { "epoch": 9.76, "grad_norm": 20.653039932250977, "learning_rate": 4.818370035761341e-07, "loss": 0.4423, "step": 51850 }, { "epoch": 9.76, "grad_norm": 13.929657936096191, "learning_rate": 4.780726519856954e-07, "loss": 0.5723, "step": 51860 }, { "epoch": 9.76, "grad_norm": 7.612604141235352, "learning_rate": 4.743083003952569e-07, "loss": 0.6222, "step": 51870 }, { "epoch": 9.76, "grad_norm": 14.222421646118164, "learning_rate": 4.705439488048184e-07, "loss": 0.3785, "step": 51880 }, { "epoch": 9.77, "grad_norm": 0.42615869641304016, "learning_rate": 4.667795972143799e-07, "loss": 0.4963, "step": 51890 }, { "epoch": 9.77, "grad_norm": 7.262185096740723, "learning_rate": 4.6301524562394134e-07, "loss": 0.6145, "step": 51900 }, { "epoch": 9.77, "grad_norm": 11.753568649291992, "learning_rate": 4.5925089403350275e-07, "loss": 0.5361, "step": 51910 }, { "epoch": 9.77, "grad_norm": 0.5254884362220764, "learning_rate": 4.554865424430642e-07, "loss": 0.6324, "step": 51920 }, { "epoch": 9.77, "grad_norm": 0.03477970510721207, "learning_rate": 4.517221908526256e-07, "loss": 0.401, "step": 51930 }, { "epoch": 9.78, "grad_norm": 15.322552680969238, "learning_rate": 4.4795783926218713e-07, "loss": 0.5914, "step": 51940 }, { "epoch": 9.78, "grad_norm": 8.4584379196167, "learning_rate": 4.441934876717486e-07, "loss": 0.4317, "step": 51950 }, { "epoch": 9.78, "grad_norm": 15.443913459777832, "learning_rate": 4.4042913608131005e-07, "loss": 0.4615, "step": 51960 }, { "epoch": 9.78, "grad_norm": 6.165141582489014, "learning_rate": 4.3666478449087146e-07, "loss": 0.3485, "step": 51970 }, { "epoch": 9.78, "grad_norm": 22.44362449645996, "learning_rate": 4.329004329004329e-07, "loss": 0.493, "step": 51980 }, { "epoch": 9.79, "grad_norm": 0.9760726690292358, "learning_rate": 4.291360813099944e-07, "loss": 0.2653, "step": 51990 }, { "epoch": 9.79, "grad_norm": 0.020068148151040077, "learning_rate": 4.253717297195559e-07, "loss": 0.2711, "step": 52000 }, { "epoch": 9.79, "grad_norm": 14.025554656982422, "learning_rate": 4.216073781291173e-07, "loss": 0.4435, "step": 52010 }, { "epoch": 9.79, "grad_norm": 0.10281354933977127, "learning_rate": 4.1784302653867876e-07, "loss": 0.2737, "step": 52020 }, { "epoch": 9.79, "grad_norm": 17.83829116821289, "learning_rate": 4.1407867494824017e-07, "loss": 0.5229, "step": 52030 }, { "epoch": 9.79, "grad_norm": 0.09223546832799911, "learning_rate": 4.1031432335780163e-07, "loss": 0.4226, "step": 52040 }, { "epoch": 9.8, "grad_norm": 0.02461303025484085, "learning_rate": 4.065499717673631e-07, "loss": 0.2307, "step": 52050 }, { "epoch": 9.8, "grad_norm": 12.551608085632324, "learning_rate": 4.027856201769246e-07, "loss": 0.1268, "step": 52060 }, { "epoch": 9.8, "grad_norm": 12.722687721252441, "learning_rate": 3.99021268586486e-07, "loss": 0.5292, "step": 52070 }, { "epoch": 9.8, "grad_norm": 7.628570556640625, "learning_rate": 3.9525691699604747e-07, "loss": 0.2919, "step": 52080 }, { "epoch": 9.8, "grad_norm": 7.446393013000488, "learning_rate": 3.9149256540560893e-07, "loss": 0.426, "step": 52090 }, { "epoch": 9.81, "grad_norm": 24.474695205688477, "learning_rate": 3.8772821381517034e-07, "loss": 0.4843, "step": 52100 }, { "epoch": 9.81, "grad_norm": 14.59511661529541, "learning_rate": 3.839638622247318e-07, "loss": 0.4363, "step": 52110 }, { "epoch": 9.81, "grad_norm": 0.08716907352209091, "learning_rate": 3.801995106342933e-07, "loss": 0.3595, "step": 52120 }, { "epoch": 9.81, "grad_norm": 32.84730529785156, "learning_rate": 3.764351590438547e-07, "loss": 0.5217, "step": 52130 }, { "epoch": 9.81, "grad_norm": 1.3554562330245972, "learning_rate": 3.726708074534162e-07, "loss": 0.2144, "step": 52140 }, { "epoch": 9.82, "grad_norm": 14.616629600524902, "learning_rate": 3.6890645586297765e-07, "loss": 0.4909, "step": 52150 }, { "epoch": 9.82, "grad_norm": 15.903406143188477, "learning_rate": 3.6514210427253905e-07, "loss": 0.6216, "step": 52160 }, { "epoch": 9.82, "grad_norm": 10.447539329528809, "learning_rate": 3.613777526821005e-07, "loss": 0.3794, "step": 52170 }, { "epoch": 9.82, "grad_norm": 5.395421981811523, "learning_rate": 3.5761340109166203e-07, "loss": 0.4563, "step": 52180 }, { "epoch": 9.82, "grad_norm": 2.901611328125, "learning_rate": 3.538490495012235e-07, "loss": 0.3466, "step": 52190 }, { "epoch": 9.82, "grad_norm": 7.655229568481445, "learning_rate": 3.500846979107849e-07, "loss": 0.5837, "step": 52200 }, { "epoch": 9.83, "grad_norm": 46.457275390625, "learning_rate": 3.4632034632034636e-07, "loss": 0.3432, "step": 52210 }, { "epoch": 9.83, "grad_norm": 0.17484161257743835, "learning_rate": 3.4255599472990776e-07, "loss": 0.3191, "step": 52220 }, { "epoch": 9.83, "grad_norm": 21.340837478637695, "learning_rate": 3.387916431394692e-07, "loss": 0.9333, "step": 52230 }, { "epoch": 9.83, "grad_norm": 9.154580116271973, "learning_rate": 3.350272915490307e-07, "loss": 0.2877, "step": 52240 }, { "epoch": 9.83, "grad_norm": 31.7108097076416, "learning_rate": 3.312629399585922e-07, "loss": 0.5454, "step": 52250 }, { "epoch": 9.84, "grad_norm": 0.7132735252380371, "learning_rate": 3.274985883681536e-07, "loss": 0.2906, "step": 52260 }, { "epoch": 9.84, "grad_norm": 9.988203048706055, "learning_rate": 3.2373423677771507e-07, "loss": 0.3561, "step": 52270 }, { "epoch": 9.84, "grad_norm": 20.169540405273438, "learning_rate": 3.1996988518727653e-07, "loss": 0.5719, "step": 52280 }, { "epoch": 9.84, "grad_norm": 2.338794708251953, "learning_rate": 3.1620553359683794e-07, "loss": 0.5348, "step": 52290 }, { "epoch": 9.84, "grad_norm": 9.757970809936523, "learning_rate": 3.1244118200639945e-07, "loss": 0.3251, "step": 52300 }, { "epoch": 9.85, "grad_norm": 16.40189552307129, "learning_rate": 3.0867683041596086e-07, "loss": 0.4938, "step": 52310 }, { "epoch": 9.85, "grad_norm": 0.104170061647892, "learning_rate": 3.049124788255223e-07, "loss": 0.473, "step": 52320 }, { "epoch": 9.85, "grad_norm": 16.414779663085938, "learning_rate": 3.011481272350838e-07, "loss": 0.4607, "step": 52330 }, { "epoch": 9.85, "grad_norm": 10.880690574645996, "learning_rate": 2.9738377564464524e-07, "loss": 0.5413, "step": 52340 }, { "epoch": 9.85, "grad_norm": 0.09918955713510513, "learning_rate": 2.936194240542067e-07, "loss": 0.3942, "step": 52350 }, { "epoch": 9.86, "grad_norm": 31.41875457763672, "learning_rate": 2.8985507246376816e-07, "loss": 0.5384, "step": 52360 }, { "epoch": 9.86, "grad_norm": 15.386962890625, "learning_rate": 2.8609072087332957e-07, "loss": 0.4931, "step": 52370 }, { "epoch": 9.86, "grad_norm": 6.390880584716797, "learning_rate": 2.823263692828911e-07, "loss": 0.3672, "step": 52380 }, { "epoch": 9.86, "grad_norm": 29.264728546142578, "learning_rate": 2.785620176924525e-07, "loss": 0.2607, "step": 52390 }, { "epoch": 9.86, "grad_norm": 10.851874351501465, "learning_rate": 2.7479766610201395e-07, "loss": 0.6167, "step": 52400 }, { "epoch": 9.86, "grad_norm": 23.39576530456543, "learning_rate": 2.710333145115754e-07, "loss": 0.4207, "step": 52410 }, { "epoch": 9.87, "grad_norm": 20.70049476623535, "learning_rate": 2.6726896292113687e-07, "loss": 0.502, "step": 52420 }, { "epoch": 9.87, "grad_norm": 12.70500373840332, "learning_rate": 2.635046113306983e-07, "loss": 0.4077, "step": 52430 }, { "epoch": 9.87, "grad_norm": 0.4478881359100342, "learning_rate": 2.597402597402598e-07, "loss": 0.184, "step": 52440 }, { "epoch": 9.87, "grad_norm": 8.121723175048828, "learning_rate": 2.559759081498212e-07, "loss": 0.2444, "step": 52450 }, { "epoch": 9.87, "grad_norm": 0.5330722332000732, "learning_rate": 2.5221155655938266e-07, "loss": 0.4361, "step": 52460 }, { "epoch": 9.88, "grad_norm": 31.016155242919922, "learning_rate": 2.484472049689441e-07, "loss": 0.5419, "step": 52470 }, { "epoch": 9.88, "grad_norm": 7.886919975280762, "learning_rate": 2.446828533785056e-07, "loss": 0.4408, "step": 52480 }, { "epoch": 9.88, "grad_norm": 29.892057418823242, "learning_rate": 2.4091850178806704e-07, "loss": 0.4313, "step": 52490 }, { "epoch": 9.88, "grad_norm": 0.040985073894262314, "learning_rate": 2.3715415019762845e-07, "loss": 0.3838, "step": 52500 }, { "epoch": 9.88, "grad_norm": 20.565820693969727, "learning_rate": 2.3338979860718994e-07, "loss": 0.5992, "step": 52510 }, { "epoch": 9.89, "grad_norm": 9.025721549987793, "learning_rate": 2.2962544701675137e-07, "loss": 0.3285, "step": 52520 }, { "epoch": 9.89, "grad_norm": 18.053125381469727, "learning_rate": 2.258610954263128e-07, "loss": 0.4098, "step": 52530 }, { "epoch": 9.89, "grad_norm": 20.12868309020996, "learning_rate": 2.220967438358743e-07, "loss": 0.5116, "step": 52540 }, { "epoch": 9.89, "grad_norm": 18.908733367919922, "learning_rate": 2.1833239224543573e-07, "loss": 0.5385, "step": 52550 }, { "epoch": 9.89, "grad_norm": 1.1563811302185059, "learning_rate": 2.145680406549972e-07, "loss": 0.5077, "step": 52560 }, { "epoch": 9.89, "grad_norm": 4.780964374542236, "learning_rate": 2.1080368906455865e-07, "loss": 0.4955, "step": 52570 }, { "epoch": 9.9, "grad_norm": 9.796252250671387, "learning_rate": 2.0703933747412008e-07, "loss": 0.4149, "step": 52580 }, { "epoch": 9.9, "grad_norm": 0.043842703104019165, "learning_rate": 2.0327498588368155e-07, "loss": 0.2674, "step": 52590 }, { "epoch": 9.9, "grad_norm": 17.572214126586914, "learning_rate": 1.99510634293243e-07, "loss": 0.3753, "step": 52600 }, { "epoch": 9.9, "grad_norm": 1.6240875720977783, "learning_rate": 1.9574628270280447e-07, "loss": 0.2265, "step": 52610 }, { "epoch": 9.9, "grad_norm": 23.174564361572266, "learning_rate": 1.919819311123659e-07, "loss": 0.7401, "step": 52620 }, { "epoch": 9.91, "grad_norm": 6.317526817321777, "learning_rate": 1.8821757952192736e-07, "loss": 0.5374, "step": 52630 }, { "epoch": 9.91, "grad_norm": 15.101905822753906, "learning_rate": 1.8445322793148882e-07, "loss": 0.3169, "step": 52640 }, { "epoch": 9.91, "grad_norm": 12.77408218383789, "learning_rate": 1.8068887634105026e-07, "loss": 0.6194, "step": 52650 }, { "epoch": 9.91, "grad_norm": 7.3459248542785645, "learning_rate": 1.7692452475061174e-07, "loss": 0.2904, "step": 52660 }, { "epoch": 9.91, "grad_norm": 12.190526962280273, "learning_rate": 1.7316017316017318e-07, "loss": 0.4855, "step": 52670 }, { "epoch": 9.92, "grad_norm": 7.406374931335449, "learning_rate": 1.693958215697346e-07, "loss": 0.2343, "step": 52680 }, { "epoch": 9.92, "grad_norm": 0.23926931619644165, "learning_rate": 1.656314699792961e-07, "loss": 0.2742, "step": 52690 }, { "epoch": 9.92, "grad_norm": 2.192960739135742, "learning_rate": 1.6186711838885753e-07, "loss": 0.3677, "step": 52700 }, { "epoch": 9.92, "grad_norm": 12.674443244934082, "learning_rate": 1.5810276679841897e-07, "loss": 0.4027, "step": 52710 }, { "epoch": 9.92, "grad_norm": 11.900289535522461, "learning_rate": 1.5433841520798043e-07, "loss": 0.3339, "step": 52720 }, { "epoch": 9.92, "grad_norm": 14.372079849243164, "learning_rate": 1.505740636175419e-07, "loss": 0.5075, "step": 52730 }, { "epoch": 9.93, "grad_norm": 2.849031925201416, "learning_rate": 1.4680971202710335e-07, "loss": 0.1418, "step": 52740 }, { "epoch": 9.93, "grad_norm": 5.773308753967285, "learning_rate": 1.4304536043666478e-07, "loss": 0.4116, "step": 52750 }, { "epoch": 9.93, "grad_norm": 10.172654151916504, "learning_rate": 1.3928100884622625e-07, "loss": 0.4328, "step": 52760 }, { "epoch": 9.93, "grad_norm": 13.555858612060547, "learning_rate": 1.355166572557877e-07, "loss": 0.4646, "step": 52770 }, { "epoch": 9.93, "grad_norm": 7.2199177742004395, "learning_rate": 1.3175230566534914e-07, "loss": 0.4224, "step": 52780 }, { "epoch": 9.94, "grad_norm": 21.8734130859375, "learning_rate": 1.279879540749106e-07, "loss": 0.3808, "step": 52790 }, { "epoch": 9.94, "grad_norm": 16.765634536743164, "learning_rate": 1.2422360248447206e-07, "loss": 0.615, "step": 52800 }, { "epoch": 9.94, "grad_norm": 13.97728157043457, "learning_rate": 1.2045925089403352e-07, "loss": 0.516, "step": 52810 }, { "epoch": 9.94, "grad_norm": 17.42995834350586, "learning_rate": 1.1669489930359497e-07, "loss": 0.3234, "step": 52820 }, { "epoch": 9.94, "grad_norm": 0.2923338711261749, "learning_rate": 1.129305477131564e-07, "loss": 0.2994, "step": 52830 }, { "epoch": 9.95, "grad_norm": 7.7803497314453125, "learning_rate": 1.0916619612271786e-07, "loss": 0.3944, "step": 52840 }, { "epoch": 9.95, "grad_norm": 27.952133178710938, "learning_rate": 1.0540184453227933e-07, "loss": 0.3718, "step": 52850 }, { "epoch": 9.95, "grad_norm": 34.63023376464844, "learning_rate": 1.0163749294184077e-07, "loss": 0.7521, "step": 52860 }, { "epoch": 9.95, "grad_norm": 24.050615310668945, "learning_rate": 9.787314135140223e-08, "loss": 0.4687, "step": 52870 }, { "epoch": 9.95, "grad_norm": 0.22404804825782776, "learning_rate": 9.410878976096368e-08, "loss": 0.3324, "step": 52880 }, { "epoch": 9.95, "grad_norm": 8.329122543334961, "learning_rate": 9.034443817052513e-08, "loss": 0.5473, "step": 52890 }, { "epoch": 9.96, "grad_norm": 2.2988033294677734, "learning_rate": 8.658008658008659e-08, "loss": 0.1834, "step": 52900 }, { "epoch": 9.96, "grad_norm": 16.856712341308594, "learning_rate": 8.281573498964805e-08, "loss": 0.3419, "step": 52910 }, { "epoch": 9.96, "grad_norm": 22.70846176147461, "learning_rate": 7.905138339920948e-08, "loss": 0.4666, "step": 52920 }, { "epoch": 9.96, "grad_norm": 8.477212905883789, "learning_rate": 7.528703180877094e-08, "loss": 0.1898, "step": 52930 }, { "epoch": 9.96, "grad_norm": 7.554108619689941, "learning_rate": 7.152268021833239e-08, "loss": 0.4636, "step": 52940 }, { "epoch": 9.97, "grad_norm": 35.77024841308594, "learning_rate": 6.775832862789385e-08, "loss": 0.5227, "step": 52950 }, { "epoch": 9.97, "grad_norm": 0.06595637649297714, "learning_rate": 6.39939770374553e-08, "loss": 0.3577, "step": 52960 }, { "epoch": 9.97, "grad_norm": 19.368412017822266, "learning_rate": 6.022962544701676e-08, "loss": 0.5107, "step": 52970 }, { "epoch": 9.97, "grad_norm": 0.05418463796377182, "learning_rate": 5.64652738565782e-08, "loss": 0.4543, "step": 52980 }, { "epoch": 9.97, "grad_norm": 10.112632751464844, "learning_rate": 5.270092226613966e-08, "loss": 0.6876, "step": 52990 }, { "epoch": 9.98, "grad_norm": 22.078157424926758, "learning_rate": 4.893657067570112e-08, "loss": 0.505, "step": 53000 }, { "epoch": 9.98, "grad_norm": 7.689653396606445, "learning_rate": 4.5172219085262564e-08, "loss": 0.4111, "step": 53010 }, { "epoch": 9.98, "grad_norm": 17.780681610107422, "learning_rate": 4.1407867494824025e-08, "loss": 0.6067, "step": 53020 }, { "epoch": 9.98, "grad_norm": 54.270999908447266, "learning_rate": 3.764351590438547e-08, "loss": 0.3095, "step": 53030 }, { "epoch": 9.98, "grad_norm": 12.391640663146973, "learning_rate": 3.3879164313946926e-08, "loss": 0.6079, "step": 53040 }, { "epoch": 9.98, "grad_norm": 15.050522804260254, "learning_rate": 3.011481272350838e-08, "loss": 0.6605, "step": 53050 }, { "epoch": 9.99, "grad_norm": 10.04753303527832, "learning_rate": 2.635046113306983e-08, "loss": 0.5057, "step": 53060 }, { "epoch": 9.99, "grad_norm": 14.447115898132324, "learning_rate": 2.2586109542631282e-08, "loss": 0.5889, "step": 53070 }, { "epoch": 9.99, "grad_norm": 15.600353240966797, "learning_rate": 1.8821757952192736e-08, "loss": 0.3044, "step": 53080 }, { "epoch": 9.99, "grad_norm": 41.5399284362793, "learning_rate": 1.505740636175419e-08, "loss": 0.3476, "step": 53090 }, { "epoch": 9.99, "grad_norm": 0.05904613435268402, "learning_rate": 1.1293054771315641e-08, "loss": 0.3915, "step": 53100 }, { "epoch": 10.0, "grad_norm": 0.6749778389930725, "learning_rate": 7.528703180877095e-09, "loss": 0.4269, "step": 53110 }, { "epoch": 10.0, "grad_norm": 0.3194694519042969, "learning_rate": 3.7643515904385476e-09, "loss": 0.2406, "step": 53120 }, { "epoch": 10.0, "grad_norm": 0.023369356989860535, "learning_rate": 0.0, "loss": 0.3274, "step": 53130 }, { "epoch": 10.0, "eval_accuracy": 0.9269333333333334, "eval_loss": 0.3031522035598755, "eval_runtime": 53.4027, "eval_samples_per_second": 140.442, "eval_steps_per_second": 17.565, "step": 53130 }, { "epoch": 10.0, "step": 53130, "total_flos": 3.29630230185984e+19, "train_loss": 0.8302146609071148, "train_runtime": 9599.828, "train_samples_per_second": 44.272, "train_steps_per_second": 5.534 } ], "logging_steps": 10, "max_steps": 53130, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 500, "total_flos": 3.29630230185984e+19, "train_batch_size": 8, "trial_name": null, "trial_params": null }