{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 500,
"global_step": 13480,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.000741839762611276,
"grad_norm": 18.7586669921875,
"learning_rate": 2.9999959263751826e-06,
"loss": 2.1609,
"step": 10
},
{
"epoch": 0.001483679525222552,
"grad_norm": 10.413708686828613,
"learning_rate": 2.999983705522856e-06,
"loss": 1.7246,
"step": 20
},
{
"epoch": 0.002225519287833828,
"grad_norm": 11.689846992492676,
"learning_rate": 2.9999633375093975e-06,
"loss": 1.703,
"step": 30
},
{
"epoch": 0.002967359050445104,
"grad_norm": 10.514595985412598,
"learning_rate": 2.9999348224454366e-06,
"loss": 1.5608,
"step": 40
},
{
"epoch": 0.00370919881305638,
"grad_norm": 9.605154037475586,
"learning_rate": 2.9998981604858526e-06,
"loss": 1.5354,
"step": 50
},
{
"epoch": 0.004451038575667656,
"grad_norm": 25.274913787841797,
"learning_rate": 2.999853351829775e-06,
"loss": 1.5925,
"step": 60
},
{
"epoch": 0.0051928783382789315,
"grad_norm": 11.746683120727539,
"learning_rate": 2.9998003967205817e-06,
"loss": 1.4979,
"step": 70
},
{
"epoch": 0.005934718100890208,
"grad_norm": 15.60824203491211,
"learning_rate": 2.9997392954458987e-06,
"loss": 1.4213,
"step": 80
},
{
"epoch": 0.0066765578635014835,
"grad_norm": 12.809992790222168,
"learning_rate": 2.9996700483375973e-06,
"loss": 1.604,
"step": 90
},
{
"epoch": 0.00741839762611276,
"grad_norm": 13.352578163146973,
"learning_rate": 2.9995926557717933e-06,
"loss": 1.5562,
"step": 100
},
{
"epoch": 0.008160237388724036,
"grad_norm": 11.533346176147461,
"learning_rate": 2.9995071181688438e-06,
"loss": 1.4658,
"step": 110
},
{
"epoch": 0.008902077151335312,
"grad_norm": 11.692753791809082,
"learning_rate": 2.9994134359933475e-06,
"loss": 1.4382,
"step": 120
},
{
"epoch": 0.009643916913946587,
"grad_norm": 15.99777889251709,
"learning_rate": 2.9993116097541383e-06,
"loss": 1.5165,
"step": 130
},
{
"epoch": 0.010385756676557863,
"grad_norm": 34.999664306640625,
"learning_rate": 2.999201640004285e-06,
"loss": 1.6118,
"step": 140
},
{
"epoch": 0.01112759643916914,
"grad_norm": 10.084653854370117,
"learning_rate": 2.99908352734109e-06,
"loss": 1.4602,
"step": 150
},
{
"epoch": 0.011869436201780416,
"grad_norm": 96.4203872680664,
"learning_rate": 2.99895727240608e-06,
"loss": 1.6046,
"step": 160
},
{
"epoch": 0.012611275964391691,
"grad_norm": 14.107004165649414,
"learning_rate": 2.9988228758850097e-06,
"loss": 1.6841,
"step": 170
},
{
"epoch": 0.013353115727002967,
"grad_norm": 11.504150390625,
"learning_rate": 2.9986803385078545e-06,
"loss": 1.4916,
"step": 180
},
{
"epoch": 0.014094955489614243,
"grad_norm": 8.963112831115723,
"learning_rate": 2.998529661048805e-06,
"loss": 1.4495,
"step": 190
},
{
"epoch": 0.01483679525222552,
"grad_norm": 11.037364959716797,
"learning_rate": 2.9983708443262657e-06,
"loss": 1.5727,
"step": 200
},
{
"epoch": 0.015578635014836795,
"grad_norm": 10.980456352233887,
"learning_rate": 2.99820388920285e-06,
"loss": 1.6488,
"step": 210
},
{
"epoch": 0.016320474777448073,
"grad_norm": 12.589357376098633,
"learning_rate": 2.9980287965853754e-06,
"loss": 1.4721,
"step": 220
},
{
"epoch": 0.017062314540059347,
"grad_norm": 12.314191818237305,
"learning_rate": 2.9978455674248558e-06,
"loss": 1.6205,
"step": 230
},
{
"epoch": 0.017804154302670624,
"grad_norm": 9.882691383361816,
"learning_rate": 2.9976542027165016e-06,
"loss": 1.5918,
"step": 240
},
{
"epoch": 0.018545994065281898,
"grad_norm": 11.461004257202148,
"learning_rate": 2.99745470349971e-06,
"loss": 1.639,
"step": 250
},
{
"epoch": 0.019287833827893175,
"grad_norm": 9.780576705932617,
"learning_rate": 2.99724707085806e-06,
"loss": 1.3886,
"step": 260
},
{
"epoch": 0.020029673590504452,
"grad_norm": 9.100162506103516,
"learning_rate": 2.9970313059193096e-06,
"loss": 1.2965,
"step": 270
},
{
"epoch": 0.020771513353115726,
"grad_norm": 10.991832733154297,
"learning_rate": 2.996807409855385e-06,
"loss": 1.556,
"step": 280
},
{
"epoch": 0.021513353115727003,
"grad_norm": 10.909322738647461,
"learning_rate": 2.9965753838823784e-06,
"loss": 1.4454,
"step": 290
},
{
"epoch": 0.02225519287833828,
"grad_norm": 9.893937110900879,
"learning_rate": 2.996335229260538e-06,
"loss": 1.5107,
"step": 300
},
{
"epoch": 0.022997032640949554,
"grad_norm": 11.460049629211426,
"learning_rate": 2.996086947294264e-06,
"loss": 1.5962,
"step": 310
},
{
"epoch": 0.02373887240356083,
"grad_norm": 11.341355323791504,
"learning_rate": 2.9958305393321e-06,
"loss": 1.4185,
"step": 320
},
{
"epoch": 0.024480712166172106,
"grad_norm": 8.831833839416504,
"learning_rate": 2.9955660067667256e-06,
"loss": 1.426,
"step": 330
},
{
"epoch": 0.025222551928783383,
"grad_norm": 46.39206314086914,
"learning_rate": 2.995293351034949e-06,
"loss": 1.6725,
"step": 340
},
{
"epoch": 0.02596439169139466,
"grad_norm": 9.161226272583008,
"learning_rate": 2.9950125736177004e-06,
"loss": 1.4317,
"step": 350
},
{
"epoch": 0.026706231454005934,
"grad_norm": 9.408476829528809,
"learning_rate": 2.9947236760400217e-06,
"loss": 1.6589,
"step": 360
},
{
"epoch": 0.02744807121661721,
"grad_norm": 10.739395141601562,
"learning_rate": 2.9944266598710606e-06,
"loss": 1.4851,
"step": 370
},
{
"epoch": 0.028189910979228485,
"grad_norm": 9.48901653289795,
"learning_rate": 2.99412152672406e-06,
"loss": 1.4584,
"step": 380
},
{
"epoch": 0.028931750741839762,
"grad_norm": 21.37883758544922,
"learning_rate": 2.9938082782563505e-06,
"loss": 1.438,
"step": 390
},
{
"epoch": 0.02967359050445104,
"grad_norm": 9.90542984008789,
"learning_rate": 2.993486916169341e-06,
"loss": 1.4416,
"step": 400
},
{
"epoch": 0.030415430267062313,
"grad_norm": 10.844427108764648,
"learning_rate": 2.99315744220851e-06,
"loss": 1.5955,
"step": 410
},
{
"epoch": 0.03115727002967359,
"grad_norm": 9.396254539489746,
"learning_rate": 2.9928198581633946e-06,
"loss": 1.3916,
"step": 420
},
{
"epoch": 0.031899109792284865,
"grad_norm": 9.14573860168457,
"learning_rate": 2.9924741658675827e-06,
"loss": 1.4467,
"step": 430
},
{
"epoch": 0.032640949554896145,
"grad_norm": 8.896514892578125,
"learning_rate": 2.9921203671987025e-06,
"loss": 1.4743,
"step": 440
},
{
"epoch": 0.03338278931750742,
"grad_norm": 10.342260360717773,
"learning_rate": 2.9917584640784107e-06,
"loss": 1.541,
"step": 450
},
{
"epoch": 0.03412462908011869,
"grad_norm": 9.386099815368652,
"learning_rate": 2.991388458472385e-06,
"loss": 1.27,
"step": 460
},
{
"epoch": 0.034866468842729974,
"grad_norm": 10.977550506591797,
"learning_rate": 2.9910103523903087e-06,
"loss": 1.4037,
"step": 470
},
{
"epoch": 0.03560830860534125,
"grad_norm": 9.735797882080078,
"learning_rate": 2.9906241478858667e-06,
"loss": 1.5082,
"step": 480
},
{
"epoch": 0.03635014836795252,
"grad_norm": 9.580273628234863,
"learning_rate": 2.9902298470567285e-06,
"loss": 1.4226,
"step": 490
},
{
"epoch": 0.037091988130563795,
"grad_norm": 12.713663101196289,
"learning_rate": 2.989827452044538e-06,
"loss": 1.5578,
"step": 500
},
{
"epoch": 0.037091988130563795,
"eval_loss": 1.465081810951233,
"eval_runtime": 23.4325,
"eval_samples_per_second": 19.033,
"eval_steps_per_second": 9.517,
"step": 500
},
{
"epoch": 0.037833827893175076,
"grad_norm": 9.113161087036133,
"learning_rate": 2.9894169650349047e-06,
"loss": 1.4684,
"step": 510
},
{
"epoch": 0.03857566765578635,
"grad_norm": 9.525900840759277,
"learning_rate": 2.988998388257388e-06,
"loss": 1.3998,
"step": 520
},
{
"epoch": 0.039317507418397624,
"grad_norm": 10.796713829040527,
"learning_rate": 2.988571723985488e-06,
"loss": 1.556,
"step": 530
},
{
"epoch": 0.040059347181008904,
"grad_norm": 10.529806137084961,
"learning_rate": 2.9881369745366313e-06,
"loss": 1.3819,
"step": 540
},
{
"epoch": 0.04080118694362018,
"grad_norm": 9.02527141571045,
"learning_rate": 2.9876941422721592e-06,
"loss": 1.4893,
"step": 550
},
{
"epoch": 0.04154302670623145,
"grad_norm": 8.577601432800293,
"learning_rate": 2.987243229597316e-06,
"loss": 1.5552,
"step": 560
},
{
"epoch": 0.04228486646884273,
"grad_norm": 10.954402923583984,
"learning_rate": 2.9867842389612326e-06,
"loss": 1.3512,
"step": 570
},
{
"epoch": 0.04302670623145401,
"grad_norm": 9.236324310302734,
"learning_rate": 2.9863171728569175e-06,
"loss": 1.5264,
"step": 580
},
{
"epoch": 0.04376854599406528,
"grad_norm": 9.721325874328613,
"learning_rate": 2.9858420338212393e-06,
"loss": 1.5841,
"step": 590
},
{
"epoch": 0.04451038575667656,
"grad_norm": 10.43162727355957,
"learning_rate": 2.985358824434916e-06,
"loss": 1.6017,
"step": 600
},
{
"epoch": 0.045252225519287835,
"grad_norm": 9.003376960754395,
"learning_rate": 2.984867547322499e-06,
"loss": 1.4716,
"step": 610
},
{
"epoch": 0.04599406528189911,
"grad_norm": 9.628597259521484,
"learning_rate": 2.9843682051523604e-06,
"loss": 1.5641,
"step": 620
},
{
"epoch": 0.04673590504451038,
"grad_norm": 14.442529678344727,
"learning_rate": 2.9838608006366766e-06,
"loss": 1.5637,
"step": 630
},
{
"epoch": 0.04747774480712166,
"grad_norm": 10.035704612731934,
"learning_rate": 2.983345336531415e-06,
"loss": 1.5762,
"step": 640
},
{
"epoch": 0.04821958456973294,
"grad_norm": 8.628552436828613,
"learning_rate": 2.9828218156363188e-06,
"loss": 1.4425,
"step": 650
},
{
"epoch": 0.04896142433234421,
"grad_norm": 10.285855293273926,
"learning_rate": 2.982290240794892e-06,
"loss": 1.4368,
"step": 660
},
{
"epoch": 0.04970326409495549,
"grad_norm": 9.917787551879883,
"learning_rate": 2.981750614894383e-06,
"loss": 1.547,
"step": 670
},
{
"epoch": 0.050445103857566766,
"grad_norm": 10.46651554107666,
"learning_rate": 2.9812029408657698e-06,
"loss": 1.4292,
"step": 680
},
{
"epoch": 0.05118694362017804,
"grad_norm": 8.533087730407715,
"learning_rate": 2.9806472216837436e-06,
"loss": 1.4962,
"step": 690
},
{
"epoch": 0.05192878338278932,
"grad_norm": 9.585909843444824,
"learning_rate": 2.9800834603666935e-06,
"loss": 1.471,
"step": 700
},
{
"epoch": 0.052670623145400594,
"grad_norm": 9.136356353759766,
"learning_rate": 2.9795116599766883e-06,
"loss": 1.31,
"step": 710
},
{
"epoch": 0.05341246290801187,
"grad_norm": 9.795812606811523,
"learning_rate": 2.9789318236194618e-06,
"loss": 1.5102,
"step": 720
},
{
"epoch": 0.05415430267062315,
"grad_norm": 9.728421211242676,
"learning_rate": 2.9783439544443953e-06,
"loss": 1.4569,
"step": 730
},
{
"epoch": 0.05489614243323442,
"grad_norm": 8.628436088562012,
"learning_rate": 2.9777480556444996e-06,
"loss": 1.5004,
"step": 740
},
{
"epoch": 0.055637982195845696,
"grad_norm": 9.77978229522705,
"learning_rate": 2.9771441304563996e-06,
"loss": 1.4255,
"step": 750
},
{
"epoch": 0.05637982195845697,
"grad_norm": 9.335463523864746,
"learning_rate": 2.9765321821603144e-06,
"loss": 1.5658,
"step": 760
},
{
"epoch": 0.05712166172106825,
"grad_norm": 12.877664566040039,
"learning_rate": 2.9759122140800406e-06,
"loss": 1.672,
"step": 770
},
{
"epoch": 0.057863501483679525,
"grad_norm": 15.497161865234375,
"learning_rate": 2.9752842295829357e-06,
"loss": 1.4453,
"step": 780
},
{
"epoch": 0.0586053412462908,
"grad_norm": 9.707072257995605,
"learning_rate": 2.9746482320798967e-06,
"loss": 1.4298,
"step": 790
},
{
"epoch": 0.05934718100890208,
"grad_norm": 9.096467018127441,
"learning_rate": 2.9740042250253443e-06,
"loss": 1.6281,
"step": 800
},
{
"epoch": 0.06008902077151335,
"grad_norm": 10.356392860412598,
"learning_rate": 2.973352211917202e-06,
"loss": 1.4703,
"step": 810
},
{
"epoch": 0.06083086053412463,
"grad_norm": 10.25114917755127,
"learning_rate": 2.972692196296879e-06,
"loss": 1.4442,
"step": 820
},
{
"epoch": 0.06157270029673591,
"grad_norm": 8.946527481079102,
"learning_rate": 2.9720241817492502e-06,
"loss": 1.3684,
"step": 830
},
{
"epoch": 0.06231454005934718,
"grad_norm": 10.437005043029785,
"learning_rate": 2.9713481719026366e-06,
"loss": 1.548,
"step": 840
},
{
"epoch": 0.06305637982195846,
"grad_norm": 9.255142211914062,
"learning_rate": 2.9706641704287855e-06,
"loss": 1.4895,
"step": 850
},
{
"epoch": 0.06379821958456973,
"grad_norm": 9.349931716918945,
"learning_rate": 2.9699721810428503e-06,
"loss": 1.4152,
"step": 860
},
{
"epoch": 0.064540059347181,
"grad_norm": 8.700305938720703,
"learning_rate": 2.9692722075033715e-06,
"loss": 1.4541,
"step": 870
},
{
"epoch": 0.06528189910979229,
"grad_norm": 10.963595390319824,
"learning_rate": 2.9685642536122545e-06,
"loss": 1.3894,
"step": 880
},
{
"epoch": 0.06602373887240356,
"grad_norm": 25.613452911376953,
"learning_rate": 2.967848323214752e-06,
"loss": 1.6023,
"step": 890
},
{
"epoch": 0.06676557863501484,
"grad_norm": 9.307974815368652,
"learning_rate": 2.967124420199439e-06,
"loss": 1.5659,
"step": 900
},
{
"epoch": 0.06750741839762611,
"grad_norm": 9.049477577209473,
"learning_rate": 2.966392548498195e-06,
"loss": 1.5969,
"step": 910
},
{
"epoch": 0.06824925816023739,
"grad_norm": 8.523443222045898,
"learning_rate": 2.9656527120861803e-06,
"loss": 1.491,
"step": 920
},
{
"epoch": 0.06899109792284866,
"grad_norm": 8.638110160827637,
"learning_rate": 2.9649049149818167e-06,
"loss": 1.4304,
"step": 930
},
{
"epoch": 0.06973293768545995,
"grad_norm": 10.084444999694824,
"learning_rate": 2.9641491612467636e-06,
"loss": 1.4847,
"step": 940
},
{
"epoch": 0.07047477744807122,
"grad_norm": 7.784031391143799,
"learning_rate": 2.9633854549858975e-06,
"loss": 1.3943,
"step": 950
},
{
"epoch": 0.0712166172106825,
"grad_norm": 8.431685447692871,
"learning_rate": 2.9626138003472885e-06,
"loss": 1.4669,
"step": 960
},
{
"epoch": 0.07195845697329377,
"grad_norm": 9.953826904296875,
"learning_rate": 2.9618342015221793e-06,
"loss": 1.3398,
"step": 970
},
{
"epoch": 0.07270029673590504,
"grad_norm": 8.906854629516602,
"learning_rate": 2.9610466627449597e-06,
"loss": 1.5057,
"step": 980
},
{
"epoch": 0.07344213649851632,
"grad_norm": 9.184341430664062,
"learning_rate": 2.9602511882931473e-06,
"loss": 1.476,
"step": 990
},
{
"epoch": 0.07418397626112759,
"grad_norm": 9.252667427062988,
"learning_rate": 2.959447782487361e-06,
"loss": 1.4645,
"step": 1000
},
{
"epoch": 0.07418397626112759,
"eval_loss": 1.4362387657165527,
"eval_runtime": 23.4866,
"eval_samples_per_second": 18.99,
"eval_steps_per_second": 9.495,
"step": 1000
},
{
"epoch": 0.07492581602373888,
"grad_norm": 9.09242057800293,
"learning_rate": 2.958636449691299e-06,
"loss": 1.561,
"step": 1010
},
{
"epoch": 0.07566765578635015,
"grad_norm": 8.406475067138672,
"learning_rate": 2.957817194311716e-06,
"loss": 1.4029,
"step": 1020
},
{
"epoch": 0.07640949554896143,
"grad_norm": 9.518254280090332,
"learning_rate": 2.956990020798396e-06,
"loss": 1.5051,
"step": 1030
},
{
"epoch": 0.0771513353115727,
"grad_norm": 9.93432903289795,
"learning_rate": 2.956154933644133e-06,
"loss": 1.2554,
"step": 1040
},
{
"epoch": 0.07789317507418397,
"grad_norm": 7.695739269256592,
"learning_rate": 2.955311937384702e-06,
"loss": 1.4648,
"step": 1050
},
{
"epoch": 0.07863501483679525,
"grad_norm": 9.189163208007812,
"learning_rate": 2.9544610365988374e-06,
"loss": 1.5584,
"step": 1060
},
{
"epoch": 0.07937685459940653,
"grad_norm": 8.053617477416992,
"learning_rate": 2.9536022359082062e-06,
"loss": 1.3786,
"step": 1070
},
{
"epoch": 0.08011869436201781,
"grad_norm": 9.746628761291504,
"learning_rate": 2.9527355399773845e-06,
"loss": 1.3726,
"step": 1080
},
{
"epoch": 0.08086053412462908,
"grad_norm": 8.845373153686523,
"learning_rate": 2.951860953513831e-06,
"loss": 1.3768,
"step": 1090
},
{
"epoch": 0.08160237388724036,
"grad_norm": 8.069707870483398,
"learning_rate": 2.950978481267862e-06,
"loss": 1.3207,
"step": 1100
},
{
"epoch": 0.08234421364985163,
"grad_norm": 9.178265571594238,
"learning_rate": 2.9500881280326244e-06,
"loss": 1.5972,
"step": 1110
},
{
"epoch": 0.0830860534124629,
"grad_norm": 8.713502883911133,
"learning_rate": 2.9491898986440725e-06,
"loss": 1.4182,
"step": 1120
},
{
"epoch": 0.08382789317507418,
"grad_norm": 9.482294082641602,
"learning_rate": 2.948283797980939e-06,
"loss": 1.5129,
"step": 1130
},
{
"epoch": 0.08456973293768547,
"grad_norm": 11.337164878845215,
"learning_rate": 2.947369830964709e-06,
"loss": 1.426,
"step": 1140
},
{
"epoch": 0.08531157270029674,
"grad_norm": 9.933257102966309,
"learning_rate": 2.9464480025595937e-06,
"loss": 1.4275,
"step": 1150
},
{
"epoch": 0.08605341246290801,
"grad_norm": 8.315671920776367,
"learning_rate": 2.9455183177725058e-06,
"loss": 1.4933,
"step": 1160
},
{
"epoch": 0.08679525222551929,
"grad_norm": 8.2044677734375,
"learning_rate": 2.9445807816530258e-06,
"loss": 1.4755,
"step": 1170
},
{
"epoch": 0.08753709198813056,
"grad_norm": 7.752995014190674,
"learning_rate": 2.9436353992933816e-06,
"loss": 1.207,
"step": 1180
},
{
"epoch": 0.08827893175074183,
"grad_norm": 8.823128700256348,
"learning_rate": 2.9426821758284173e-06,
"loss": 1.4338,
"step": 1190
},
{
"epoch": 0.08902077151335312,
"grad_norm": 7.024681091308594,
"learning_rate": 2.9417211164355664e-06,
"loss": 1.4365,
"step": 1200
},
{
"epoch": 0.0897626112759644,
"grad_norm": 8.412097930908203,
"learning_rate": 2.940752226334822e-06,
"loss": 1.1898,
"step": 1210
},
{
"epoch": 0.09050445103857567,
"grad_norm": 8.81240463256836,
"learning_rate": 2.9397755107887114e-06,
"loss": 1.5879,
"step": 1220
},
{
"epoch": 0.09124629080118694,
"grad_norm": 12.02270793914795,
"learning_rate": 2.938790975102264e-06,
"loss": 1.3401,
"step": 1230
},
{
"epoch": 0.09198813056379822,
"grad_norm": 9.22630500793457,
"learning_rate": 2.9377986246229853e-06,
"loss": 1.3431,
"step": 1240
},
{
"epoch": 0.09272997032640949,
"grad_norm": 8.395411491394043,
"learning_rate": 2.9367984647408272e-06,
"loss": 1.3423,
"step": 1250
},
{
"epoch": 0.09347181008902077,
"grad_norm": 9.383752822875977,
"learning_rate": 2.9357905008881574e-06,
"loss": 1.5453,
"step": 1260
},
{
"epoch": 0.09421364985163205,
"grad_norm": 13.686159133911133,
"learning_rate": 2.934774738539731e-06,
"loss": 1.5254,
"step": 1270
},
{
"epoch": 0.09495548961424333,
"grad_norm": 9.126317977905273,
"learning_rate": 2.9337511832126614e-06,
"loss": 1.3578,
"step": 1280
},
{
"epoch": 0.0956973293768546,
"grad_norm": 9.802062034606934,
"learning_rate": 2.9327198404663893e-06,
"loss": 1.3732,
"step": 1290
},
{
"epoch": 0.09643916913946587,
"grad_norm": 8.623979568481445,
"learning_rate": 2.931680715902652e-06,
"loss": 1.4103,
"step": 1300
},
{
"epoch": 0.09718100890207715,
"grad_norm": 9.61336612701416,
"learning_rate": 2.9306338151654547e-06,
"loss": 1.4382,
"step": 1310
},
{
"epoch": 0.09792284866468842,
"grad_norm": 8.745745658874512,
"learning_rate": 2.9295791439410385e-06,
"loss": 1.2856,
"step": 1320
},
{
"epoch": 0.09866468842729971,
"grad_norm": 8.679821968078613,
"learning_rate": 2.9285167079578504e-06,
"loss": 1.257,
"step": 1330
},
{
"epoch": 0.09940652818991098,
"grad_norm": 11.308154106140137,
"learning_rate": 2.92744651298651e-06,
"loss": 1.4787,
"step": 1340
},
{
"epoch": 0.10014836795252226,
"grad_norm": 8.959935188293457,
"learning_rate": 2.926368564839782e-06,
"loss": 1.2769,
"step": 1350
},
{
"epoch": 0.10089020771513353,
"grad_norm": 6.9831438064575195,
"learning_rate": 2.9252828693725405e-06,
"loss": 1.4526,
"step": 1360
},
{
"epoch": 0.1016320474777448,
"grad_norm": 8.822589874267578,
"learning_rate": 2.924189432481741e-06,
"loss": 1.3483,
"step": 1370
},
{
"epoch": 0.10237388724035608,
"grad_norm": 8.989341735839844,
"learning_rate": 2.923088260106386e-06,
"loss": 1.4483,
"step": 1380
},
{
"epoch": 0.10311572700296735,
"grad_norm": 9.763890266418457,
"learning_rate": 2.921979358227492e-06,
"loss": 1.3835,
"step": 1390
},
{
"epoch": 0.10385756676557864,
"grad_norm": 8.562960624694824,
"learning_rate": 2.92086273286806e-06,
"loss": 1.4348,
"step": 1400
},
{
"epoch": 0.10459940652818991,
"grad_norm": 10.014548301696777,
"learning_rate": 2.91973839009304e-06,
"loss": 1.2826,
"step": 1410
},
{
"epoch": 0.10534124629080119,
"grad_norm": 11.542120933532715,
"learning_rate": 2.9186063360093e-06,
"loss": 1.3613,
"step": 1420
},
{
"epoch": 0.10608308605341246,
"grad_norm": 8.246392250061035,
"learning_rate": 2.917466576765591e-06,
"loss": 1.4738,
"step": 1430
},
{
"epoch": 0.10682492581602374,
"grad_norm": 9.511324882507324,
"learning_rate": 2.916319118552515e-06,
"loss": 1.4706,
"step": 1440
},
{
"epoch": 0.10756676557863501,
"grad_norm": 8.671672821044922,
"learning_rate": 2.915163967602492e-06,
"loss": 1.3392,
"step": 1450
},
{
"epoch": 0.1083086053412463,
"grad_norm": 9.805370330810547,
"learning_rate": 2.914001130189722e-06,
"loss": 1.5192,
"step": 1460
},
{
"epoch": 0.10905044510385757,
"grad_norm": 8.378101348876953,
"learning_rate": 2.912830612630158e-06,
"loss": 1.3507,
"step": 1470
},
{
"epoch": 0.10979228486646884,
"grad_norm": 8.799610137939453,
"learning_rate": 2.9116524212814653e-06,
"loss": 1.4003,
"step": 1480
},
{
"epoch": 0.11053412462908012,
"grad_norm": 8.829014778137207,
"learning_rate": 2.91046656254299e-06,
"loss": 1.5949,
"step": 1490
},
{
"epoch": 0.11127596439169139,
"grad_norm": 8.634420394897461,
"learning_rate": 2.9092730428557236e-06,
"loss": 1.4198,
"step": 1500
},
{
"epoch": 0.11127596439169139,
"eval_loss": 1.4195871353149414,
"eval_runtime": 23.4693,
"eval_samples_per_second": 19.004,
"eval_steps_per_second": 9.502,
"step": 1500
},
{
"epoch": 0.11201780415430267,
"grad_norm": 7.636455059051514,
"learning_rate": 2.9080718687022676e-06,
"loss": 1.4234,
"step": 1510
},
{
"epoch": 0.11275964391691394,
"grad_norm": 8.863425254821777,
"learning_rate": 2.9068630466067996e-06,
"loss": 1.5965,
"step": 1520
},
{
"epoch": 0.11350148367952523,
"grad_norm": 8.970385551452637,
"learning_rate": 2.905646583135036e-06,
"loss": 1.4643,
"step": 1530
},
{
"epoch": 0.1142433234421365,
"grad_norm": 10.134622573852539,
"learning_rate": 2.904422484894198e-06,
"loss": 1.4593,
"step": 1540
},
{
"epoch": 0.11498516320474778,
"grad_norm": 8.219001770019531,
"learning_rate": 2.9031907585329753e-06,
"loss": 1.4802,
"step": 1550
},
{
"epoch": 0.11572700296735905,
"grad_norm": 9.880292892456055,
"learning_rate": 2.901951410741489e-06,
"loss": 1.4993,
"step": 1560
},
{
"epoch": 0.11646884272997032,
"grad_norm": 8.31434154510498,
"learning_rate": 2.9007044482512563e-06,
"loss": 1.5126,
"step": 1570
},
{
"epoch": 0.1172106824925816,
"grad_norm": 8.074999809265137,
"learning_rate": 2.899449877835154e-06,
"loss": 1.1785,
"step": 1580
},
{
"epoch": 0.11795252225519288,
"grad_norm": 9.222709655761719,
"learning_rate": 2.8981877063073808e-06,
"loss": 1.3661,
"step": 1590
},
{
"epoch": 0.11869436201780416,
"grad_norm": 9.243541717529297,
"learning_rate": 2.8969179405234202e-06,
"loss": 1.3965,
"step": 1600
},
{
"epoch": 0.11943620178041543,
"grad_norm": 8.484634399414062,
"learning_rate": 2.8956405873800063e-06,
"loss": 1.4526,
"step": 1610
},
{
"epoch": 0.1201780415430267,
"grad_norm": 7.875013828277588,
"learning_rate": 2.8943556538150813e-06,
"loss": 1.439,
"step": 1620
},
{
"epoch": 0.12091988130563798,
"grad_norm": 8.981459617614746,
"learning_rate": 2.893063146807762e-06,
"loss": 1.5325,
"step": 1630
},
{
"epoch": 0.12166172106824925,
"grad_norm": 8.40335464477539,
"learning_rate": 2.8917630733783004e-06,
"loss": 1.4615,
"step": 1640
},
{
"epoch": 0.12240356083086053,
"grad_norm": 8.828475952148438,
"learning_rate": 2.890455440588043e-06,
"loss": 1.5635,
"step": 1650
},
{
"epoch": 0.12314540059347182,
"grad_norm": 10.00554084777832,
"learning_rate": 2.8891402555393995e-06,
"loss": 1.4823,
"step": 1660
},
{
"epoch": 0.12388724035608309,
"grad_norm": 9.928216934204102,
"learning_rate": 2.8878175253757955e-06,
"loss": 1.3582,
"step": 1670
},
{
"epoch": 0.12462908011869436,
"grad_norm": 11.623834609985352,
"learning_rate": 2.8864872572816406e-06,
"loss": 1.4406,
"step": 1680
},
{
"epoch": 0.12537091988130564,
"grad_norm": 12.635778427124023,
"learning_rate": 2.885149458482285e-06,
"loss": 1.3821,
"step": 1690
},
{
"epoch": 0.1261127596439169,
"grad_norm": 10.610758781433105,
"learning_rate": 2.8838041362439823e-06,
"loss": 1.5266,
"step": 1700
},
{
"epoch": 0.12685459940652818,
"grad_norm": 8.499368667602539,
"learning_rate": 2.8824512978738506e-06,
"loss": 1.3015,
"step": 1710
},
{
"epoch": 0.12759643916913946,
"grad_norm": 7.7737507820129395,
"learning_rate": 2.881090950719831e-06,
"loss": 1.3831,
"step": 1720
},
{
"epoch": 0.12833827893175073,
"grad_norm": 9.742268562316895,
"learning_rate": 2.8797231021706486e-06,
"loss": 1.5125,
"step": 1730
},
{
"epoch": 0.129080118694362,
"grad_norm": 9.315298080444336,
"learning_rate": 2.8783477596557722e-06,
"loss": 1.5418,
"step": 1740
},
{
"epoch": 0.1298219584569733,
"grad_norm": 9.360373497009277,
"learning_rate": 2.8769649306453745e-06,
"loss": 1.4129,
"step": 1750
},
{
"epoch": 0.13056379821958458,
"grad_norm": 10.6887845993042,
"learning_rate": 2.8755746226502914e-06,
"loss": 1.3005,
"step": 1760
},
{
"epoch": 0.13130563798219586,
"grad_norm": 8.747626304626465,
"learning_rate": 2.87417684322198e-06,
"loss": 1.3693,
"step": 1770
},
{
"epoch": 0.13204747774480713,
"grad_norm": 10.2086820602417,
"learning_rate": 2.872771599952479e-06,
"loss": 1.3155,
"step": 1780
},
{
"epoch": 0.1327893175074184,
"grad_norm": 8.937162399291992,
"learning_rate": 2.871358900474367e-06,
"loss": 1.5346,
"step": 1790
},
{
"epoch": 0.13353115727002968,
"grad_norm": 8.907169342041016,
"learning_rate": 2.8699387524607205e-06,
"loss": 1.4442,
"step": 1800
},
{
"epoch": 0.13427299703264095,
"grad_norm": 8.316621780395508,
"learning_rate": 2.8685111636250736e-06,
"loss": 1.3703,
"step": 1810
},
{
"epoch": 0.13501483679525222,
"grad_norm": 8.593326568603516,
"learning_rate": 2.867076141721374e-06,
"loss": 1.2765,
"step": 1820
},
{
"epoch": 0.1357566765578635,
"grad_norm": 9.69709300994873,
"learning_rate": 2.865633694543944e-06,
"loss": 1.5247,
"step": 1830
},
{
"epoch": 0.13649851632047477,
"grad_norm": 8.481054306030273,
"learning_rate": 2.864183829927434e-06,
"loss": 1.437,
"step": 1840
},
{
"epoch": 0.13724035608308605,
"grad_norm": 7.5963335037231445,
"learning_rate": 2.8627265557467836e-06,
"loss": 1.3608,
"step": 1850
},
{
"epoch": 0.13798219584569732,
"grad_norm": 9.460357666015625,
"learning_rate": 2.861261879917177e-06,
"loss": 1.4096,
"step": 1860
},
{
"epoch": 0.1387240356083086,
"grad_norm": 8.779165267944336,
"learning_rate": 2.8597898103940014e-06,
"loss": 1.3327,
"step": 1870
},
{
"epoch": 0.1394658753709199,
"grad_norm": 8.048774719238281,
"learning_rate": 2.858310355172801e-06,
"loss": 1.3372,
"step": 1880
},
{
"epoch": 0.14020771513353117,
"grad_norm": 8.53365421295166,
"learning_rate": 2.8568235222892375e-06,
"loss": 1.4482,
"step": 1890
},
{
"epoch": 0.14094955489614244,
"grad_norm": 9.450532913208008,
"learning_rate": 2.8553293198190425e-06,
"loss": 1.3362,
"step": 1900
},
{
"epoch": 0.14169139465875372,
"grad_norm": 7.9473958015441895,
"learning_rate": 2.853827755877977e-06,
"loss": 1.3946,
"step": 1910
},
{
"epoch": 0.142433234421365,
"grad_norm": 10.09933090209961,
"learning_rate": 2.852318838621784e-06,
"loss": 1.5963,
"step": 1920
},
{
"epoch": 0.14317507418397626,
"grad_norm": 8.691498756408691,
"learning_rate": 2.850802576246149e-06,
"loss": 1.3957,
"step": 1930
},
{
"epoch": 0.14391691394658754,
"grad_norm": 9.597620010375977,
"learning_rate": 2.8492789769866493e-06,
"loss": 1.4577,
"step": 1940
},
{
"epoch": 0.1446587537091988,
"grad_norm": 9.706177711486816,
"learning_rate": 2.8477480491187146e-06,
"loss": 1.4256,
"step": 1950
},
{
"epoch": 0.14540059347181009,
"grad_norm": 9.215739250183105,
"learning_rate": 2.846209800957579e-06,
"loss": 1.4918,
"step": 1960
},
{
"epoch": 0.14614243323442136,
"grad_norm": 8.966597557067871,
"learning_rate": 2.8446642408582374e-06,
"loss": 1.435,
"step": 1970
},
{
"epoch": 0.14688427299703263,
"grad_norm": 8.87956428527832,
"learning_rate": 2.8431113772153984e-06,
"loss": 1.4318,
"step": 1980
},
{
"epoch": 0.1476261127596439,
"grad_norm": 9.43526840209961,
"learning_rate": 2.8415512184634413e-06,
"loss": 1.4226,
"step": 1990
},
{
"epoch": 0.14836795252225518,
"grad_norm": 7.335799694061279,
"learning_rate": 2.839983773076367e-06,
"loss": 1.3469,
"step": 2000
},
{
"epoch": 0.14836795252225518,
"eval_loss": 1.405104160308838,
"eval_runtime": 23.4479,
"eval_samples_per_second": 19.021,
"eval_steps_per_second": 9.51,
"step": 2000
},
{
"epoch": 0.14910979228486648,
"grad_norm": 8.135221481323242,
"learning_rate": 2.8384090495677555e-06,
"loss": 1.3779,
"step": 2010
},
{
"epoch": 0.14985163204747776,
"grad_norm": 8.584566116333008,
"learning_rate": 2.8368270564907167e-06,
"loss": 1.4178,
"step": 2020
},
{
"epoch": 0.15059347181008903,
"grad_norm": 9.192804336547852,
"learning_rate": 2.8352378024378462e-06,
"loss": 1.4223,
"step": 2030
},
{
"epoch": 0.1513353115727003,
"grad_norm": 10.986886024475098,
"learning_rate": 2.8336412960411765e-06,
"loss": 1.5351,
"step": 2040
},
{
"epoch": 0.15207715133531158,
"grad_norm": 8.154606819152832,
"learning_rate": 2.832037545972132e-06,
"loss": 1.3744,
"step": 2050
},
{
"epoch": 0.15281899109792285,
"grad_norm": 8.556278228759766,
"learning_rate": 2.8304265609414803e-06,
"loss": 1.3267,
"step": 2060
},
{
"epoch": 0.15356083086053413,
"grad_norm": 9.713357925415039,
"learning_rate": 2.8288083496992867e-06,
"loss": 1.3808,
"step": 2070
},
{
"epoch": 0.1543026706231454,
"grad_norm": 8.706491470336914,
"learning_rate": 2.8271829210348656e-06,
"loss": 1.297,
"step": 2080
},
{
"epoch": 0.15504451038575667,
"grad_norm": 8.89303970336914,
"learning_rate": 2.825550283776731e-06,
"loss": 1.2562,
"step": 2090
},
{
"epoch": 0.15578635014836795,
"grad_norm": 8.402449607849121,
"learning_rate": 2.8239104467925532e-06,
"loss": 1.4105,
"step": 2100
},
{
"epoch": 0.15652818991097922,
"grad_norm": 7.475712776184082,
"learning_rate": 2.8222634189891055e-06,
"loss": 1.3397,
"step": 2110
},
{
"epoch": 0.1572700296735905,
"grad_norm": 8.340933799743652,
"learning_rate": 2.8206092093122193e-06,
"loss": 1.2691,
"step": 2120
},
{
"epoch": 0.15801186943620177,
"grad_norm": 7.353670597076416,
"learning_rate": 2.8189478267467344e-06,
"loss": 1.408,
"step": 2130
},
{
"epoch": 0.15875370919881307,
"grad_norm": 8.455607414245605,
"learning_rate": 2.817279280316449e-06,
"loss": 1.5435,
"step": 2140
},
{
"epoch": 0.15949554896142434,
"grad_norm": 9.295350074768066,
"learning_rate": 2.8156035790840733e-06,
"loss": 1.5229,
"step": 2150
},
{
"epoch": 0.16023738872403562,
"grad_norm": 9.709535598754883,
"learning_rate": 2.8139207321511777e-06,
"loss": 1.5848,
"step": 2160
},
{
"epoch": 0.1609792284866469,
"grad_norm": 10.39367389678955,
"learning_rate": 2.8122307486581455e-06,
"loss": 1.4792,
"step": 2170
},
{
"epoch": 0.16172106824925817,
"grad_norm": 8.161094665527344,
"learning_rate": 2.8105336377841212e-06,
"loss": 1.4138,
"step": 2180
},
{
"epoch": 0.16246290801186944,
"grad_norm": 9.033132553100586,
"learning_rate": 2.808829408746962e-06,
"loss": 1.5123,
"step": 2190
},
{
"epoch": 0.1632047477744807,
"grad_norm": 8.76311206817627,
"learning_rate": 2.8071180708031874e-06,
"loss": 1.4737,
"step": 2200
},
{
"epoch": 0.163946587537092,
"grad_norm": 9.680130004882812,
"learning_rate": 2.8053996332479296e-06,
"loss": 1.3447,
"step": 2210
},
{
"epoch": 0.16468842729970326,
"grad_norm": 9.140039443969727,
"learning_rate": 2.8036741054148817e-06,
"loss": 1.479,
"step": 2220
},
{
"epoch": 0.16543026706231453,
"grad_norm": 7.610710144042969,
"learning_rate": 2.801941496676247e-06,
"loss": 1.3595,
"step": 2230
},
{
"epoch": 0.1661721068249258,
"grad_norm": 11.338227272033691,
"learning_rate": 2.8002018164426896e-06,
"loss": 1.4566,
"step": 2240
},
{
"epoch": 0.16691394658753708,
"grad_norm": 8.052413940429688,
"learning_rate": 2.7984550741632837e-06,
"loss": 1.3201,
"step": 2250
},
{
"epoch": 0.16765578635014836,
"grad_norm": 8.803062438964844,
"learning_rate": 2.7967012793254575e-06,
"loss": 1.3299,
"step": 2260
},
{
"epoch": 0.16839762611275966,
"grad_norm": 8.115534782409668,
"learning_rate": 2.7949404414549484e-06,
"loss": 1.4376,
"step": 2270
},
{
"epoch": 0.16913946587537093,
"grad_norm": 9.156294822692871,
"learning_rate": 2.7931725701157462e-06,
"loss": 1.4132,
"step": 2280
},
{
"epoch": 0.1698813056379822,
"grad_norm": 8.102431297302246,
"learning_rate": 2.7913976749100445e-06,
"loss": 1.4156,
"step": 2290
},
{
"epoch": 0.17062314540059348,
"grad_norm": 8.303695678710938,
"learning_rate": 2.789615765478186e-06,
"loss": 1.2913,
"step": 2300
},
{
"epoch": 0.17136498516320475,
"grad_norm": 7.867891311645508,
"learning_rate": 2.787826851498611e-06,
"loss": 1.2225,
"step": 2310
},
{
"epoch": 0.17210682492581603,
"grad_norm": 8.89625072479248,
"learning_rate": 2.786030942687805e-06,
"loss": 1.5093,
"step": 2320
},
{
"epoch": 0.1728486646884273,
"grad_norm": 8.792491912841797,
"learning_rate": 2.784228048800247e-06,
"loss": 1.3146,
"step": 2330
},
{
"epoch": 0.17359050445103857,
"grad_norm": 9.683384895324707,
"learning_rate": 2.7824181796283543e-06,
"loss": 1.4008,
"step": 2340
},
{
"epoch": 0.17433234421364985,
"grad_norm": 9.359085083007812,
"learning_rate": 2.780601345002431e-06,
"loss": 1.2744,
"step": 2350
},
{
"epoch": 0.17507418397626112,
"grad_norm": 7.971740245819092,
"learning_rate": 2.7787775547906143e-06,
"loss": 1.3748,
"step": 2360
},
{
"epoch": 0.1758160237388724,
"grad_norm": 9.259309768676758,
"learning_rate": 2.77694681889882e-06,
"loss": 1.3978,
"step": 2370
},
{
"epoch": 0.17655786350148367,
"grad_norm": 8.904669761657715,
"learning_rate": 2.7751091472706886e-06,
"loss": 1.3772,
"step": 2380
},
{
"epoch": 0.17729970326409494,
"grad_norm": 7.627325057983398,
"learning_rate": 2.773264549887535e-06,
"loss": 1.3509,
"step": 2390
},
{
"epoch": 0.17804154302670624,
"grad_norm": 9.28232479095459,
"learning_rate": 2.771413036768288e-06,
"loss": 1.4038,
"step": 2400
},
{
"epoch": 0.17878338278931752,
"grad_norm": 11.565908432006836,
"learning_rate": 2.7695546179694412e-06,
"loss": 1.4158,
"step": 2410
},
{
"epoch": 0.1795252225519288,
"grad_norm": 8.238388061523438,
"learning_rate": 2.767689303584996e-06,
"loss": 1.4911,
"step": 2420
},
{
"epoch": 0.18026706231454007,
"grad_norm": 8.432221412658691,
"learning_rate": 2.765817103746407e-06,
"loss": 1.5864,
"step": 2430
},
{
"epoch": 0.18100890207715134,
"grad_norm": 8.204069137573242,
"learning_rate": 2.7639380286225262e-06,
"loss": 1.3994,
"step": 2440
},
{
"epoch": 0.18175074183976261,
"grad_norm": 8.444053649902344,
"learning_rate": 2.762052088419551e-06,
"loss": 1.576,
"step": 2450
},
{
"epoch": 0.1824925816023739,
"grad_norm": 8.946913719177246,
"learning_rate": 2.760159293380965e-06,
"loss": 1.1678,
"step": 2460
},
{
"epoch": 0.18323442136498516,
"grad_norm": 8.895451545715332,
"learning_rate": 2.758259653787483e-06,
"loss": 1.3972,
"step": 2470
},
{
"epoch": 0.18397626112759644,
"grad_norm": 9.011785507202148,
"learning_rate": 2.7563531799569982e-06,
"loss": 1.2209,
"step": 2480
},
{
"epoch": 0.1847181008902077,
"grad_norm": 8.469378471374512,
"learning_rate": 2.754439882244522e-06,
"loss": 1.4777,
"step": 2490
},
{
"epoch": 0.18545994065281898,
"grad_norm": 8.05780029296875,
"learning_rate": 2.7525197710421303e-06,
"loss": 1.3816,
"step": 2500
},
{
"epoch": 0.18545994065281898,
"eval_loss": 1.391993522644043,
"eval_runtime": 23.4505,
"eval_samples_per_second": 19.019,
"eval_steps_per_second": 9.509,
"step": 2500
},
{
"epoch": 0.18620178041543026,
"grad_norm": 7.978991508483887,
"learning_rate": 2.7505928567789073e-06,
"loss": 1.4641,
"step": 2510
},
{
"epoch": 0.18694362017804153,
"grad_norm": 8.432256698608398,
"learning_rate": 2.7486591499208866e-06,
"loss": 1.4184,
"step": 2520
},
{
"epoch": 0.18768545994065283,
"grad_norm": 9.253658294677734,
"learning_rate": 2.7467186609709973e-06,
"loss": 1.4106,
"step": 2530
},
{
"epoch": 0.1884272997032641,
"grad_norm": 16.86107635498047,
"learning_rate": 2.7447714004690042e-06,
"loss": 1.4225,
"step": 2540
},
{
"epoch": 0.18916913946587538,
"grad_norm": 9.117183685302734,
"learning_rate": 2.7428173789914524e-06,
"loss": 1.3031,
"step": 2550
},
{
"epoch": 0.18991097922848665,
"grad_norm": 11.524558067321777,
"learning_rate": 2.740856607151609e-06,
"loss": 1.3394,
"step": 2560
},
{
"epoch": 0.19065281899109793,
"grad_norm": 9.210947036743164,
"learning_rate": 2.7388890955994055e-06,
"loss": 1.5357,
"step": 2570
},
{
"epoch": 0.1913946587537092,
"grad_norm": 10.00994873046875,
"learning_rate": 2.7369148550213806e-06,
"loss": 1.3765,
"step": 2580
},
{
"epoch": 0.19213649851632048,
"grad_norm": 7.468533992767334,
"learning_rate": 2.7349338961406223e-06,
"loss": 1.4192,
"step": 2590
},
{
"epoch": 0.19287833827893175,
"grad_norm": 8.357904434204102,
"learning_rate": 2.7329462297167068e-06,
"loss": 1.3348,
"step": 2600
},
{
"epoch": 0.19362017804154302,
"grad_norm": 9.04192066192627,
"learning_rate": 2.7309518665456454e-06,
"loss": 1.3598,
"step": 2610
},
{
"epoch": 0.1943620178041543,
"grad_norm": 9.699695587158203,
"learning_rate": 2.72895081745982e-06,
"loss": 1.5076,
"step": 2620
},
{
"epoch": 0.19510385756676557,
"grad_norm": 8.667801856994629,
"learning_rate": 2.7269430933279284e-06,
"loss": 1.2957,
"step": 2630
},
{
"epoch": 0.19584569732937684,
"grad_norm": 8.39424991607666,
"learning_rate": 2.724928705054924e-06,
"loss": 1.3713,
"step": 2640
},
{
"epoch": 0.19658753709198812,
"grad_norm": 8.892675399780273,
"learning_rate": 2.7229076635819563e-06,
"loss": 1.4559,
"step": 2650
},
{
"epoch": 0.19732937685459942,
"grad_norm": 10.235827445983887,
"learning_rate": 2.720879979886311e-06,
"loss": 1.3907,
"step": 2660
},
{
"epoch": 0.1980712166172107,
"grad_norm": 9.297379493713379,
"learning_rate": 2.7188456649813526e-06,
"loss": 1.4805,
"step": 2670
},
{
"epoch": 0.19881305637982197,
"grad_norm": 10.14811897277832,
"learning_rate": 2.7168047299164614e-06,
"loss": 1.4573,
"step": 2680
},
{
"epoch": 0.19955489614243324,
"grad_norm": 8.918148040771484,
"learning_rate": 2.7147571857769755e-06,
"loss": 1.3873,
"step": 2690
},
{
"epoch": 0.20029673590504452,
"grad_norm": 8.084507942199707,
"learning_rate": 2.7127030436841307e-06,
"loss": 1.2873,
"step": 2700
},
{
"epoch": 0.2010385756676558,
"grad_norm": 8.225303649902344,
"learning_rate": 2.710642314794999e-06,
"loss": 1.4675,
"step": 2710
},
{
"epoch": 0.20178041543026706,
"grad_norm": 8.811010360717773,
"learning_rate": 2.7085750103024297e-06,
"loss": 1.4683,
"step": 2720
},
{
"epoch": 0.20252225519287834,
"grad_norm": 8.835148811340332,
"learning_rate": 2.7065011414349858e-06,
"loss": 1.4257,
"step": 2730
},
{
"epoch": 0.2032640949554896,
"grad_norm": 15.418182373046875,
"learning_rate": 2.704420719456885e-06,
"loss": 1.4806,
"step": 2740
},
{
"epoch": 0.20400593471810088,
"grad_norm": 9.259235382080078,
"learning_rate": 2.7023337556679402e-06,
"loss": 1.6237,
"step": 2750
},
{
"epoch": 0.20474777448071216,
"grad_norm": 11.389565467834473,
"learning_rate": 2.7002402614034937e-06,
"loss": 1.3695,
"step": 2760
},
{
"epoch": 0.20548961424332343,
"grad_norm": 7.731765270233154,
"learning_rate": 2.69814024803436e-06,
"loss": 1.4801,
"step": 2770
},
{
"epoch": 0.2062314540059347,
"grad_norm": 8.97433853149414,
"learning_rate": 2.6960337269667605e-06,
"loss": 1.4708,
"step": 2780
},
{
"epoch": 0.206973293768546,
"grad_norm": 9.035865783691406,
"learning_rate": 2.6939207096422634e-06,
"loss": 1.4399,
"step": 2790
},
{
"epoch": 0.20771513353115728,
"grad_norm": 9.75682258605957,
"learning_rate": 2.6918012075377224e-06,
"loss": 1.3488,
"step": 2800
},
{
"epoch": 0.20845697329376855,
"grad_norm": 9.119101524353027,
"learning_rate": 2.689675232165213e-06,
"loss": 1.3,
"step": 2810
},
{
"epoch": 0.20919881305637983,
"grad_norm": 8.837667465209961,
"learning_rate": 2.68754279507197e-06,
"loss": 1.3659,
"step": 2820
},
{
"epoch": 0.2099406528189911,
"grad_norm": 8.174179077148438,
"learning_rate": 2.685403907840324e-06,
"loss": 1.3446,
"step": 2830
},
{
"epoch": 0.21068249258160238,
"grad_norm": 9.282876968383789,
"learning_rate": 2.6832585820876413e-06,
"loss": 1.4882,
"step": 2840
},
{
"epoch": 0.21142433234421365,
"grad_norm": 7.6600213050842285,
"learning_rate": 2.681106829466258e-06,
"loss": 1.1834,
"step": 2850
},
{
"epoch": 0.21216617210682492,
"grad_norm": 9.84327220916748,
"learning_rate": 2.678948661663417e-06,
"loss": 1.4927,
"step": 2860
},
{
"epoch": 0.2129080118694362,
"grad_norm": 9.372842788696289,
"learning_rate": 2.6767840904012078e-06,
"loss": 1.4625,
"step": 2870
},
{
"epoch": 0.21364985163204747,
"grad_norm": 7.723082542419434,
"learning_rate": 2.6746131274364977e-06,
"loss": 1.3829,
"step": 2880
},
{
"epoch": 0.21439169139465875,
"grad_norm": 8.692205429077148,
"learning_rate": 2.6724357845608716e-06,
"loss": 1.46,
"step": 2890
},
{
"epoch": 0.21513353115727002,
"grad_norm": 9.735092163085938,
"learning_rate": 2.6702520736005673e-06,
"loss": 1.3574,
"step": 2900
},
{
"epoch": 0.2158753709198813,
"grad_norm": 8.781496047973633,
"learning_rate": 2.6680620064164094e-06,
"loss": 1.421,
"step": 2910
},
{
"epoch": 0.2166172106824926,
"grad_norm": 8.708477020263672,
"learning_rate": 2.6658655949037482e-06,
"loss": 1.3353,
"step": 2920
},
{
"epoch": 0.21735905044510387,
"grad_norm": 9.43267822265625,
"learning_rate": 2.6636628509923924e-06,
"loss": 1.2779,
"step": 2930
},
{
"epoch": 0.21810089020771514,
"grad_norm": 9.485703468322754,
"learning_rate": 2.661453786646544e-06,
"loss": 1.4917,
"step": 2940
},
{
"epoch": 0.21884272997032642,
"grad_norm": 9.164180755615234,
"learning_rate": 2.659238413864736e-06,
"loss": 1.2931,
"step": 2950
},
{
"epoch": 0.2195845697329377,
"grad_norm": 8.09424114227295,
"learning_rate": 2.6570167446797654e-06,
"loss": 1.4717,
"step": 2960
},
{
"epoch": 0.22032640949554896,
"grad_norm": 8.689072608947754,
"learning_rate": 2.6547887911586278e-06,
"loss": 1.3389,
"step": 2970
},
{
"epoch": 0.22106824925816024,
"grad_norm": 7.4104838371276855,
"learning_rate": 2.6525545654024517e-06,
"loss": 1.2771,
"step": 2980
},
{
"epoch": 0.2218100890207715,
"grad_norm": 8.580281257629395,
"learning_rate": 2.650314079546434e-06,
"loss": 1.3574,
"step": 2990
},
{
"epoch": 0.22255192878338279,
"grad_norm": 6.826554298400879,
"learning_rate": 2.648067345759774e-06,
"loss": 1.3653,
"step": 3000
},
{
"epoch": 0.22255192878338279,
"eval_loss": 1.380942463874817,
"eval_runtime": 23.446,
"eval_samples_per_second": 19.022,
"eval_steps_per_second": 9.511,
"step": 3000
},
{
"epoch": 0.22329376854599406,
"grad_norm": 8.048758506774902,
"learning_rate": 2.6458143762456038e-06,
"loss": 1.4932,
"step": 3010
},
{
"epoch": 0.22403560830860533,
"grad_norm": 8.818073272705078,
"learning_rate": 2.643555183240928e-06,
"loss": 1.3055,
"step": 3020
},
{
"epoch": 0.2247774480712166,
"grad_norm": 7.931951999664307,
"learning_rate": 2.6412897790165526e-06,
"loss": 1.4524,
"step": 3030
},
{
"epoch": 0.22551928783382788,
"grad_norm": 7.983026504516602,
"learning_rate": 2.6390181758770205e-06,
"loss": 1.3969,
"step": 3040
},
{
"epoch": 0.22626112759643918,
"grad_norm": 9.100227355957031,
"learning_rate": 2.636740386160543e-06,
"loss": 1.3396,
"step": 3050
},
{
"epoch": 0.22700296735905046,
"grad_norm": 8.59542179107666,
"learning_rate": 2.6344564222389353e-06,
"loss": 1.3731,
"step": 3060
},
{
"epoch": 0.22774480712166173,
"grad_norm": 7.7173752784729,
"learning_rate": 2.6321662965175457e-06,
"loss": 1.2887,
"step": 3070
},
{
"epoch": 0.228486646884273,
"grad_norm": 9.884195327758789,
"learning_rate": 2.6298700214351924e-06,
"loss": 1.2001,
"step": 3080
},
{
"epoch": 0.22922848664688428,
"grad_norm": 8.387007713317871,
"learning_rate": 2.627567609464092e-06,
"loss": 1.4851,
"step": 3090
},
{
"epoch": 0.22997032640949555,
"grad_norm": 8.314335823059082,
"learning_rate": 2.6252590731097956e-06,
"loss": 1.3391,
"step": 3100
},
{
"epoch": 0.23071216617210683,
"grad_norm": 8.861979484558105,
"learning_rate": 2.6229444249111175e-06,
"loss": 1.3721,
"step": 3110
},
{
"epoch": 0.2314540059347181,
"grad_norm": 11.68078899383545,
"learning_rate": 2.6206236774400685e-06,
"loss": 1.5759,
"step": 3120
},
{
"epoch": 0.23219584569732937,
"grad_norm": 8.4688081741333,
"learning_rate": 2.618296843301788e-06,
"loss": 1.3431,
"step": 3130
},
{
"epoch": 0.23293768545994065,
"grad_norm": 8.566194534301758,
"learning_rate": 2.6159639351344755e-06,
"loss": 1.373,
"step": 3140
},
{
"epoch": 0.23367952522255192,
"grad_norm": 6.903346538543701,
"learning_rate": 2.6136249656093204e-06,
"loss": 1.2995,
"step": 3150
},
{
"epoch": 0.2344213649851632,
"grad_norm": 8.093761444091797,
"learning_rate": 2.611279947430436e-06,
"loss": 1.4552,
"step": 3160
},
{
"epoch": 0.23516320474777447,
"grad_norm": 9.532185554504395,
"learning_rate": 2.608928893334788e-06,
"loss": 1.359,
"step": 3170
},
{
"epoch": 0.23590504451038577,
"grad_norm": 10.045039176940918,
"learning_rate": 2.6065718160921246e-06,
"loss": 1.5474,
"step": 3180
},
{
"epoch": 0.23664688427299704,
"grad_norm": 9.059492111206055,
"learning_rate": 2.604208728504912e-06,
"loss": 1.2215,
"step": 3190
},
{
"epoch": 0.23738872403560832,
"grad_norm": 10.714762687683105,
"learning_rate": 2.601839643408259e-06,
"loss": 1.3327,
"step": 3200
},
{
"epoch": 0.2381305637982196,
"grad_norm": 8.981411933898926,
"learning_rate": 2.599464573669851e-06,
"loss": 1.3985,
"step": 3210
},
{
"epoch": 0.23887240356083086,
"grad_norm": 8.016975402832031,
"learning_rate": 2.597083532189879e-06,
"loss": 1.2672,
"step": 3220
},
{
"epoch": 0.23961424332344214,
"grad_norm": 9.3323335647583,
"learning_rate": 2.594696531900968e-06,
"loss": 1.2048,
"step": 3230
},
{
"epoch": 0.2403560830860534,
"grad_norm": 7.841317653656006,
"learning_rate": 2.592303585768111e-06,
"loss": 1.3764,
"step": 3240
},
{
"epoch": 0.2410979228486647,
"grad_norm": 9.452821731567383,
"learning_rate": 2.5899047067885935e-06,
"loss": 1.3729,
"step": 3250
},
{
"epoch": 0.24183976261127596,
"grad_norm": 11.088187217712402,
"learning_rate": 2.5874999079919264e-06,
"loss": 1.3502,
"step": 3260
},
{
"epoch": 0.24258160237388723,
"grad_norm": 9.076626777648926,
"learning_rate": 2.5850892024397736e-06,
"loss": 1.3962,
"step": 3270
},
{
"epoch": 0.2433234421364985,
"grad_norm": 9.371712684631348,
"learning_rate": 2.5826726032258818e-06,
"loss": 1.5036,
"step": 3280
},
{
"epoch": 0.24406528189910978,
"grad_norm": 8.981965065002441,
"learning_rate": 2.580250123476009e-06,
"loss": 1.3917,
"step": 3290
},
{
"epoch": 0.24480712166172106,
"grad_norm": 7.41351842880249,
"learning_rate": 2.577821776347853e-06,
"loss": 1.2765,
"step": 3300
},
{
"epoch": 0.24554896142433236,
"grad_norm": 7.898843765258789,
"learning_rate": 2.5753875750309814e-06,
"loss": 1.4827,
"step": 3310
},
{
"epoch": 0.24629080118694363,
"grad_norm": 8.024171829223633,
"learning_rate": 2.572947532746758e-06,
"loss": 1.4173,
"step": 3320
},
{
"epoch": 0.2470326409495549,
"grad_norm": 7.735332489013672,
"learning_rate": 2.570501662748271e-06,
"loss": 1.3901,
"step": 3330
},
{
"epoch": 0.24777448071216618,
"grad_norm": 8.987187385559082,
"learning_rate": 2.568049978320263e-06,
"loss": 1.4371,
"step": 3340
},
{
"epoch": 0.24851632047477745,
"grad_norm": 9.167318344116211,
"learning_rate": 2.5655924927790585e-06,
"loss": 1.3519,
"step": 3350
},
{
"epoch": 0.24925816023738873,
"grad_norm": 7.899603366851807,
"learning_rate": 2.5631292194724884e-06,
"loss": 1.31,
"step": 3360
},
{
"epoch": 0.25,
"grad_norm": 8.992423057556152,
"learning_rate": 2.5606601717798212e-06,
"loss": 1.3822,
"step": 3370
},
{
"epoch": 0.2507418397626113,
"grad_norm": 9.284130096435547,
"learning_rate": 2.558185363111689e-06,
"loss": 1.4068,
"step": 3380
},
{
"epoch": 0.25148367952522255,
"grad_norm": 9.180769920349121,
"learning_rate": 2.555704806910015e-06,
"loss": 1.377,
"step": 3390
},
{
"epoch": 0.2522255192878338,
"grad_norm": 9.335295677185059,
"learning_rate": 2.553218516647939e-06,
"loss": 1.3997,
"step": 3400
},
{
"epoch": 0.2529673590504451,
"grad_norm": 10.324609756469727,
"learning_rate": 2.550726505829746e-06,
"loss": 1.502,
"step": 3410
},
{
"epoch": 0.25370919881305637,
"grad_norm": 8.74648380279541,
"learning_rate": 2.5482287879907926e-06,
"loss": 1.3515,
"step": 3420
},
{
"epoch": 0.25445103857566764,
"grad_norm": 9.311241149902344,
"learning_rate": 2.5457253766974314e-06,
"loss": 1.3607,
"step": 3430
},
{
"epoch": 0.2551928783382789,
"grad_norm": 9.811213493347168,
"learning_rate": 2.543216285546942e-06,
"loss": 1.436,
"step": 3440
},
{
"epoch": 0.2559347181008902,
"grad_norm": 8.822476387023926,
"learning_rate": 2.5407015281674513e-06,
"loss": 1.582,
"step": 3450
},
{
"epoch": 0.25667655786350146,
"grad_norm": 7.025854110717773,
"learning_rate": 2.5381811182178632e-06,
"loss": 1.3498,
"step": 3460
},
{
"epoch": 0.25741839762611274,
"grad_norm": 8.49760627746582,
"learning_rate": 2.5356550693877845e-06,
"loss": 1.4426,
"step": 3470
},
{
"epoch": 0.258160237388724,
"grad_norm": 9.154727935791016,
"learning_rate": 2.5331233953974484e-06,
"loss": 1.2733,
"step": 3480
},
{
"epoch": 0.2589020771513353,
"grad_norm": 7.772784233093262,
"learning_rate": 2.5305861099976416e-06,
"loss": 1.2198,
"step": 3490
},
{
"epoch": 0.2596439169139466,
"grad_norm": 7.934385776519775,
"learning_rate": 2.5280432269696283e-06,
"loss": 1.4087,
"step": 3500
},
{
"epoch": 0.2596439169139466,
"eval_loss": 1.3714910745620728,
"eval_runtime": 23.4503,
"eval_samples_per_second": 19.019,
"eval_steps_per_second": 9.509,
"step": 3500
},
{
"epoch": 0.2603857566765579,
"grad_norm": 7.804587364196777,
"learning_rate": 2.5254947601250787e-06,
"loss": 1.2602,
"step": 3510
},
{
"epoch": 0.26112759643916916,
"grad_norm": 10.741705894470215,
"learning_rate": 2.5229407233059886e-06,
"loss": 1.5066,
"step": 3520
},
{
"epoch": 0.26186943620178044,
"grad_norm": 7.940061092376709,
"learning_rate": 2.5203811303846093e-06,
"loss": 1.3713,
"step": 3530
},
{
"epoch": 0.2626112759643917,
"grad_norm": 8.638043403625488,
"learning_rate": 2.5178159952633683e-06,
"loss": 1.4127,
"step": 3540
},
{
"epoch": 0.263353115727003,
"grad_norm": 7.808784008026123,
"learning_rate": 2.515245331874797e-06,
"loss": 1.3337,
"step": 3550
},
{
"epoch": 0.26409495548961426,
"grad_norm": 7.855457782745361,
"learning_rate": 2.5126691541814516e-06,
"loss": 1.4842,
"step": 3560
},
{
"epoch": 0.26483679525222553,
"grad_norm": 7.667708873748779,
"learning_rate": 2.5100874761758426e-06,
"loss": 1.2371,
"step": 3570
},
{
"epoch": 0.2655786350148368,
"grad_norm": 8.755106925964355,
"learning_rate": 2.5075003118803524e-06,
"loss": 1.4708,
"step": 3580
},
{
"epoch": 0.2663204747774481,
"grad_norm": 8.294569969177246,
"learning_rate": 2.504907675347163e-06,
"loss": 1.4162,
"step": 3590
},
{
"epoch": 0.26706231454005935,
"grad_norm": 8.485974311828613,
"learning_rate": 2.50230958065818e-06,
"loss": 1.4551,
"step": 3600
},
{
"epoch": 0.2678041543026706,
"grad_norm": 12.968074798583984,
"learning_rate": 2.4997060419249534e-06,
"loss": 1.4756,
"step": 3610
},
{
"epoch": 0.2685459940652819,
"grad_norm": 7.765286922454834,
"learning_rate": 2.4970970732886032e-06,
"loss": 1.2534,
"step": 3620
},
{
"epoch": 0.2692878338278932,
"grad_norm": 8.599440574645996,
"learning_rate": 2.494482688919742e-06,
"loss": 1.3371,
"step": 3630
},
{
"epoch": 0.27002967359050445,
"grad_norm": 8.294087409973145,
"learning_rate": 2.491862903018398e-06,
"loss": 1.4185,
"step": 3640
},
{
"epoch": 0.2707715133531157,
"grad_norm": 8.291155815124512,
"learning_rate": 2.489237729813938e-06,
"loss": 1.3793,
"step": 3650
},
{
"epoch": 0.271513353115727,
"grad_norm": 7.898152828216553,
"learning_rate": 2.4866071835649887e-06,
"loss": 1.3714,
"step": 3660
},
{
"epoch": 0.27225519287833827,
"grad_norm": 8.396595001220703,
"learning_rate": 2.483971278559362e-06,
"loss": 1.4737,
"step": 3670
},
{
"epoch": 0.27299703264094954,
"grad_norm": 7.634808540344238,
"learning_rate": 2.4813300291139753e-06,
"loss": 1.3822,
"step": 3680
},
{
"epoch": 0.2737388724035608,
"grad_norm": 8.787116050720215,
"learning_rate": 2.4786834495747738e-06,
"loss": 1.2784,
"step": 3690
},
{
"epoch": 0.2744807121661721,
"grad_norm": 10.124987602233887,
"learning_rate": 2.476031554316655e-06,
"loss": 1.4317,
"step": 3700
},
{
"epoch": 0.27522255192878337,
"grad_norm": 8.735859870910645,
"learning_rate": 2.4733743577433857e-06,
"loss": 1.2954,
"step": 3710
},
{
"epoch": 0.27596439169139464,
"grad_norm": 9.41859245300293,
"learning_rate": 2.470711874287529e-06,
"loss": 1.4109,
"step": 3720
},
{
"epoch": 0.2767062314540059,
"grad_norm": 7.346931457519531,
"learning_rate": 2.4680441184103642e-06,
"loss": 1.3118,
"step": 3730
},
{
"epoch": 0.2774480712166172,
"grad_norm": 8.223915100097656,
"learning_rate": 2.465371104601805e-06,
"loss": 1.451,
"step": 3740
},
{
"epoch": 0.27818991097922846,
"grad_norm": 8.05762004852295,
"learning_rate": 2.4626928473803264e-06,
"loss": 1.4075,
"step": 3750
},
{
"epoch": 0.2789317507418398,
"grad_norm": 10.53507137298584,
"learning_rate": 2.4600093612928813e-06,
"loss": 1.4301,
"step": 3760
},
{
"epoch": 0.27967359050445106,
"grad_norm": 7.951254367828369,
"learning_rate": 2.457320660914824e-06,
"loss": 1.4816,
"step": 3770
},
{
"epoch": 0.28041543026706234,
"grad_norm": 11.656047821044922,
"learning_rate": 2.45462676084983e-06,
"loss": 1.2551,
"step": 3780
},
{
"epoch": 0.2811572700296736,
"grad_norm": 9.22987174987793,
"learning_rate": 2.451927675729816e-06,
"loss": 1.4458,
"step": 3790
},
{
"epoch": 0.2818991097922849,
"grad_norm": 9.910201072692871,
"learning_rate": 2.4492234202148643e-06,
"loss": 1.428,
"step": 3800
},
{
"epoch": 0.28264094955489616,
"grad_norm": 8.999225616455078,
"learning_rate": 2.4465140089931357e-06,
"loss": 1.275,
"step": 3810
},
{
"epoch": 0.28338278931750743,
"grad_norm": 7.863303184509277,
"learning_rate": 2.443799456780798e-06,
"loss": 1.3344,
"step": 3820
},
{
"epoch": 0.2841246290801187,
"grad_norm": 8.949956893920898,
"learning_rate": 2.44107977832194e-06,
"loss": 1.3681,
"step": 3830
},
{
"epoch": 0.28486646884273,
"grad_norm": 10.083333015441895,
"learning_rate": 2.438354988388495e-06,
"loss": 1.2786,
"step": 3840
},
{
"epoch": 0.28560830860534125,
"grad_norm": 8.96097183227539,
"learning_rate": 2.4356251017801596e-06,
"loss": 1.3194,
"step": 3850
},
{
"epoch": 0.28635014836795253,
"grad_norm": 9.839349746704102,
"learning_rate": 2.432890133324311e-06,
"loss": 1.3521,
"step": 3860
},
{
"epoch": 0.2870919881305638,
"grad_norm": 7.604780197143555,
"learning_rate": 2.43015009787593e-06,
"loss": 1.3759,
"step": 3870
},
{
"epoch": 0.2878338278931751,
"grad_norm": 7.909048080444336,
"learning_rate": 2.427405010317519e-06,
"loss": 1.3872,
"step": 3880
},
{
"epoch": 0.28857566765578635,
"grad_norm": 8.023886680603027,
"learning_rate": 2.4246548855590206e-06,
"loss": 1.4451,
"step": 3890
},
{
"epoch": 0.2893175074183976,
"grad_norm": 8.603988647460938,
"learning_rate": 2.4218997385377356e-06,
"loss": 1.3554,
"step": 3900
},
{
"epoch": 0.2900593471810089,
"grad_norm": 8.416375160217285,
"learning_rate": 2.4191395842182455e-06,
"loss": 1.4591,
"step": 3910
},
{
"epoch": 0.29080118694362017,
"grad_norm": 8.673905372619629,
"learning_rate": 2.416374437592327e-06,
"loss": 1.3327,
"step": 3920
},
{
"epoch": 0.29154302670623145,
"grad_norm": 8.481094360351562,
"learning_rate": 2.413604313678874e-06,
"loss": 1.3097,
"step": 3930
},
{
"epoch": 0.2922848664688427,
"grad_norm": 8.51818561553955,
"learning_rate": 2.4108292275238133e-06,
"loss": 1.2288,
"step": 3940
},
{
"epoch": 0.293026706231454,
"grad_norm": 9.287731170654297,
"learning_rate": 2.4080491942000247e-06,
"loss": 1.3104,
"step": 3950
},
{
"epoch": 0.29376854599406527,
"grad_norm": 9.262923240661621,
"learning_rate": 2.4052642288072596e-06,
"loss": 1.5436,
"step": 3960
},
{
"epoch": 0.29451038575667654,
"grad_norm": 9.646564483642578,
"learning_rate": 2.4024743464720555e-06,
"loss": 1.3926,
"step": 3970
},
{
"epoch": 0.2952522255192878,
"grad_norm": 8.739798545837402,
"learning_rate": 2.3996795623476577e-06,
"loss": 1.4747,
"step": 3980
},
{
"epoch": 0.2959940652818991,
"grad_norm": 8.455376625061035,
"learning_rate": 2.396879891613936e-06,
"loss": 1.371,
"step": 3990
},
{
"epoch": 0.29673590504451036,
"grad_norm": 8.93728256225586,
"learning_rate": 2.394075349477302e-06,
"loss": 1.2973,
"step": 4000
},
{
"epoch": 0.29673590504451036,
"eval_loss": 1.36147141456604,
"eval_runtime": 23.4427,
"eval_samples_per_second": 19.025,
"eval_steps_per_second": 9.513,
"step": 4000
},
{
"epoch": 0.29747774480712164,
"grad_norm": 8.445281982421875,
"learning_rate": 2.3912659511706243e-06,
"loss": 1.4152,
"step": 4010
},
{
"epoch": 0.29821958456973297,
"grad_norm": 9.02658748626709,
"learning_rate": 2.3884517119531496e-06,
"loss": 1.4489,
"step": 4020
},
{
"epoch": 0.29896142433234424,
"grad_norm": 8.706474304199219,
"learning_rate": 2.385632647110418e-06,
"loss": 1.401,
"step": 4030
},
{
"epoch": 0.2997032640949555,
"grad_norm": 7.351003170013428,
"learning_rate": 2.382808771954179e-06,
"loss": 1.4131,
"step": 4040
},
{
"epoch": 0.3004451038575668,
"grad_norm": 8.288825988769531,
"learning_rate": 2.3799801018223095e-06,
"loss": 1.2643,
"step": 4050
},
{
"epoch": 0.30118694362017806,
"grad_norm": 8.027029991149902,
"learning_rate": 2.3771466520787316e-06,
"loss": 1.3642,
"step": 4060
},
{
"epoch": 0.30192878338278933,
"grad_norm": 9.516772270202637,
"learning_rate": 2.3743084381133264e-06,
"loss": 1.2057,
"step": 4070
},
{
"epoch": 0.3026706231454006,
"grad_norm": 8.332013130187988,
"learning_rate": 2.371465475341852e-06,
"loss": 1.347,
"step": 4080
},
{
"epoch": 0.3034124629080119,
"grad_norm": 7.586446762084961,
"learning_rate": 2.3686177792058606e-06,
"loss": 1.4661,
"step": 4090
},
{
"epoch": 0.30415430267062316,
"grad_norm": 9.531535148620605,
"learning_rate": 2.3657653651726125e-06,
"loss": 1.242,
"step": 4100
},
{
"epoch": 0.30489614243323443,
"grad_norm": 7.554753303527832,
"learning_rate": 2.362908248734994e-06,
"loss": 1.3381,
"step": 4110
},
{
"epoch": 0.3056379821958457,
"grad_norm": 9.01855754852295,
"learning_rate": 2.360046445411433e-06,
"loss": 1.5718,
"step": 4120
},
{
"epoch": 0.306379821958457,
"grad_norm": 8.020215034484863,
"learning_rate": 2.3571799707458125e-06,
"loss": 1.2917,
"step": 4130
},
{
"epoch": 0.30712166172106825,
"grad_norm": 8.08421802520752,
"learning_rate": 2.35430884030739e-06,
"loss": 1.4316,
"step": 4140
},
{
"epoch": 0.3078635014836795,
"grad_norm": 8.234532356262207,
"learning_rate": 2.351433069690709e-06,
"loss": 1.2778,
"step": 4150
},
{
"epoch": 0.3086053412462908,
"grad_norm": 7.486210823059082,
"learning_rate": 2.348552674515517e-06,
"loss": 1.3158,
"step": 4160
},
{
"epoch": 0.3093471810089021,
"grad_norm": 11.375346183776855,
"learning_rate": 2.34566767042668e-06,
"loss": 1.4065,
"step": 4170
},
{
"epoch": 0.31008902077151335,
"grad_norm": 8.795413970947266,
"learning_rate": 2.3427780730940967e-06,
"loss": 1.3817,
"step": 4180
},
{
"epoch": 0.3108308605341246,
"grad_norm": 8.96834659576416,
"learning_rate": 2.3398838982126147e-06,
"loss": 1.4102,
"step": 4190
},
{
"epoch": 0.3115727002967359,
"grad_norm": 6.874296188354492,
"learning_rate": 2.3369851615019433e-06,
"loss": 1.3764,
"step": 4200
},
{
"epoch": 0.31231454005934717,
"grad_norm": 7.878982067108154,
"learning_rate": 2.3340818787065715e-06,
"loss": 1.313,
"step": 4210
},
{
"epoch": 0.31305637982195844,
"grad_norm": 8.147690773010254,
"learning_rate": 2.3311740655956785e-06,
"loss": 1.4591,
"step": 4220
},
{
"epoch": 0.3137982195845697,
"grad_norm": 8.309657096862793,
"learning_rate": 2.32826173796305e-06,
"loss": 1.367,
"step": 4230
},
{
"epoch": 0.314540059347181,
"grad_norm": 9.30339241027832,
"learning_rate": 2.3253449116269937e-06,
"loss": 1.2814,
"step": 4240
},
{
"epoch": 0.31528189910979226,
"grad_norm": 9.000772476196289,
"learning_rate": 2.3224236024302502e-06,
"loss": 1.2713,
"step": 4250
},
{
"epoch": 0.31602373887240354,
"grad_norm": 8.01784610748291,
"learning_rate": 2.319497826239911e-06,
"loss": 1.3312,
"step": 4260
},
{
"epoch": 0.3167655786350148,
"grad_norm": 8.405533790588379,
"learning_rate": 2.316567598947327e-06,
"loss": 1.3651,
"step": 4270
},
{
"epoch": 0.31750741839762614,
"grad_norm": 8.148391723632812,
"learning_rate": 2.3136329364680287e-06,
"loss": 1.4414,
"step": 4280
},
{
"epoch": 0.3182492581602374,
"grad_norm": 36.44773864746094,
"learning_rate": 2.3106938547416338e-06,
"loss": 1.3181,
"step": 4290
},
{
"epoch": 0.3189910979228487,
"grad_norm": 7.259230613708496,
"learning_rate": 2.307750369731764e-06,
"loss": 1.3493,
"step": 4300
},
{
"epoch": 0.31973293768545996,
"grad_norm": 8.317214012145996,
"learning_rate": 2.304802497425958e-06,
"loss": 1.4059,
"step": 4310
},
{
"epoch": 0.32047477744807124,
"grad_norm": 8.004743576049805,
"learning_rate": 2.3018502538355825e-06,
"loss": 1.4011,
"step": 4320
},
{
"epoch": 0.3212166172106825,
"grad_norm": 9.351004600524902,
"learning_rate": 2.298893654995749e-06,
"loss": 1.5036,
"step": 4330
},
{
"epoch": 0.3219584569732938,
"grad_norm": 8.475602149963379,
"learning_rate": 2.295932716965222e-06,
"loss": 1.2183,
"step": 4340
},
{
"epoch": 0.32270029673590506,
"grad_norm": 7.471583366394043,
"learning_rate": 2.292967455826337e-06,
"loss": 1.3892,
"step": 4350
},
{
"epoch": 0.32344213649851633,
"grad_norm": 9.214890480041504,
"learning_rate": 2.2899978876849085e-06,
"loss": 1.472,
"step": 4360
},
{
"epoch": 0.3241839762611276,
"grad_norm": 8.986857414245605,
"learning_rate": 2.287024028670145e-06,
"loss": 1.2721,
"step": 4370
},
{
"epoch": 0.3249258160237389,
"grad_norm": 8.836446762084961,
"learning_rate": 2.284045894934562e-06,
"loss": 1.2329,
"step": 4380
},
{
"epoch": 0.32566765578635015,
"grad_norm": 8.13981819152832,
"learning_rate": 2.281063502653891e-06,
"loss": 1.2512,
"step": 4390
},
{
"epoch": 0.3264094955489614,
"grad_norm": 8.709846496582031,
"learning_rate": 2.278076868026995e-06,
"loss": 1.3859,
"step": 4400
},
{
"epoch": 0.3271513353115727,
"grad_norm": 9.3983154296875,
"learning_rate": 2.27508600727578e-06,
"loss": 1.4237,
"step": 4410
},
{
"epoch": 0.327893175074184,
"grad_norm": 8.226868629455566,
"learning_rate": 2.272090936645105e-06,
"loss": 1.3894,
"step": 4420
},
{
"epoch": 0.32863501483679525,
"grad_norm": 9.627702713012695,
"learning_rate": 2.2690916724026954e-06,
"loss": 1.3225,
"step": 4430
},
{
"epoch": 0.3293768545994065,
"grad_norm": 11.345617294311523,
"learning_rate": 2.266088230839055e-06,
"loss": 1.3649,
"step": 4440
},
{
"epoch": 0.3301186943620178,
"grad_norm": 7.237599849700928,
"learning_rate": 2.2630806282673744e-06,
"loss": 1.5589,
"step": 4450
},
{
"epoch": 0.33086053412462907,
"grad_norm": 8.742907524108887,
"learning_rate": 2.2600688810234474e-06,
"loss": 1.4584,
"step": 4460
},
{
"epoch": 0.33160237388724034,
"grad_norm": 9.190670013427734,
"learning_rate": 2.257053005465578e-06,
"loss": 1.4466,
"step": 4470
},
{
"epoch": 0.3323442136498516,
"grad_norm": 8.909046173095703,
"learning_rate": 2.2540330179744934e-06,
"loss": 1.3321,
"step": 4480
},
{
"epoch": 0.3330860534124629,
"grad_norm": 8.911348342895508,
"learning_rate": 2.2510089349532553e-06,
"loss": 1.4146,
"step": 4490
},
{
"epoch": 0.33382789317507416,
"grad_norm": 8.258678436279297,
"learning_rate": 2.2479807728271696e-06,
"loss": 1.348,
"step": 4500
},
{
"epoch": 0.33382789317507416,
"eval_loss": 1.3544670343399048,
"eval_runtime": 23.4388,
"eval_samples_per_second": 19.028,
"eval_steps_per_second": 9.514,
"step": 4500
},
{
"epoch": 0.33456973293768544,
"grad_norm": 8.755362510681152,
"learning_rate": 2.2449485480436982e-06,
"loss": 1.3788,
"step": 4510
},
{
"epoch": 0.3353115727002967,
"grad_norm": 8.534749031066895,
"learning_rate": 2.24191227707237e-06,
"loss": 1.2039,
"step": 4520
},
{
"epoch": 0.336053412462908,
"grad_norm": 7.606124401092529,
"learning_rate": 2.238871976404689e-06,
"loss": 1.4215,
"step": 4530
},
{
"epoch": 0.3367952522255193,
"grad_norm": 8.163749694824219,
"learning_rate": 2.235827662554048e-06,
"loss": 1.3814,
"step": 4540
},
{
"epoch": 0.3375370919881306,
"grad_norm": 7.764957427978516,
"learning_rate": 2.232779352055637e-06,
"loss": 1.2437,
"step": 4550
},
{
"epoch": 0.33827893175074186,
"grad_norm": 10.332768440246582,
"learning_rate": 2.2297270614663533e-06,
"loss": 1.4328,
"step": 4560
},
{
"epoch": 0.33902077151335314,
"grad_norm": 8.382997512817383,
"learning_rate": 2.2266708073647128e-06,
"loss": 1.4947,
"step": 4570
},
{
"epoch": 0.3397626112759644,
"grad_norm": 8.392914772033691,
"learning_rate": 2.2236106063507592e-06,
"loss": 1.3206,
"step": 4580
},
{
"epoch": 0.3405044510385757,
"grad_norm": 8.482207298278809,
"learning_rate": 2.220546475045973e-06,
"loss": 1.473,
"step": 4590
},
{
"epoch": 0.34124629080118696,
"grad_norm": 9.380014419555664,
"learning_rate": 2.2174784300931828e-06,
"loss": 1.5559,
"step": 4600
},
{
"epoch": 0.34198813056379823,
"grad_norm": 8.139824867248535,
"learning_rate": 2.2144064881564747e-06,
"loss": 1.5721,
"step": 4610
},
{
"epoch": 0.3427299703264095,
"grad_norm": 9.55907917022705,
"learning_rate": 2.2113306659210997e-06,
"loss": 1.3778,
"step": 4620
},
{
"epoch": 0.3434718100890208,
"grad_norm": 10.155835151672363,
"learning_rate": 2.208250980093386e-06,
"loss": 1.2517,
"step": 4630
},
{
"epoch": 0.34421364985163205,
"grad_norm": 8.608782768249512,
"learning_rate": 2.205167447400646e-06,
"loss": 1.3875,
"step": 4640
},
{
"epoch": 0.3449554896142433,
"grad_norm": 9.097238540649414,
"learning_rate": 2.202080084591087e-06,
"loss": 1.389,
"step": 4650
},
{
"epoch": 0.3456973293768546,
"grad_norm": 8.809340476989746,
"learning_rate": 2.1989889084337194e-06,
"loss": 1.2246,
"step": 4660
},
{
"epoch": 0.3464391691394659,
"grad_norm": 9.638260841369629,
"learning_rate": 2.195893935718266e-06,
"loss": 1.4718,
"step": 4670
},
{
"epoch": 0.34718100890207715,
"grad_norm": 7.2880730628967285,
"learning_rate": 2.19279518325507e-06,
"loss": 1.1473,
"step": 4680
},
{
"epoch": 0.3479228486646884,
"grad_norm": 9.370959281921387,
"learning_rate": 2.1896926678750043e-06,
"loss": 1.3126,
"step": 4690
},
{
"epoch": 0.3486646884272997,
"grad_norm": 7.85057258605957,
"learning_rate": 2.1865864064293813e-06,
"loss": 1.3338,
"step": 4700
},
{
"epoch": 0.34940652818991097,
"grad_norm": 8.449581146240234,
"learning_rate": 2.1834764157898587e-06,
"loss": 1.3948,
"step": 4710
},
{
"epoch": 0.35014836795252224,
"grad_norm": 10.200738906860352,
"learning_rate": 2.18036271284835e-06,
"loss": 1.4157,
"step": 4720
},
{
"epoch": 0.3508902077151335,
"grad_norm": 9.506202697753906,
"learning_rate": 2.177245314516932e-06,
"loss": 1.4382,
"step": 4730
},
{
"epoch": 0.3516320474777448,
"grad_norm": 9.932241439819336,
"learning_rate": 2.174124237727753e-06,
"loss": 1.408,
"step": 4740
},
{
"epoch": 0.35237388724035607,
"grad_norm": 10.123774528503418,
"learning_rate": 2.1709994994329406e-06,
"loss": 1.1708,
"step": 4750
},
{
"epoch": 0.35311572700296734,
"grad_norm": 7.982966899871826,
"learning_rate": 2.1678711166045108e-06,
"loss": 1.2625,
"step": 4760
},
{
"epoch": 0.3538575667655786,
"grad_norm": 9.418827056884766,
"learning_rate": 2.164739106234273e-06,
"loss": 1.3367,
"step": 4770
},
{
"epoch": 0.3545994065281899,
"grad_norm": 9.385802268981934,
"learning_rate": 2.161603485333742e-06,
"loss": 1.5404,
"step": 4780
},
{
"epoch": 0.35534124629080116,
"grad_norm": 8.353150367736816,
"learning_rate": 2.1584642709340414e-06,
"loss": 1.5455,
"step": 4790
},
{
"epoch": 0.3560830860534125,
"grad_norm": 7.22542667388916,
"learning_rate": 2.155321480085813e-06,
"loss": 1.4264,
"step": 4800
},
{
"epoch": 0.35682492581602376,
"grad_norm": 7.641038417816162,
"learning_rate": 2.152175129859125e-06,
"loss": 1.3006,
"step": 4810
},
{
"epoch": 0.35756676557863504,
"grad_norm": 7.675732135772705,
"learning_rate": 2.1490252373433783e-06,
"loss": 1.3992,
"step": 4820
},
{
"epoch": 0.3583086053412463,
"grad_norm": 7.769400119781494,
"learning_rate": 2.1458718196472124e-06,
"loss": 1.2344,
"step": 4830
},
{
"epoch": 0.3590504451038576,
"grad_norm": 8.751335144042969,
"learning_rate": 2.1427148938984156e-06,
"loss": 1.4056,
"step": 4840
},
{
"epoch": 0.35979228486646886,
"grad_norm": 10.821932792663574,
"learning_rate": 2.1395544772438288e-06,
"loss": 1.362,
"step": 4850
},
{
"epoch": 0.36053412462908013,
"grad_norm": 7.864255905151367,
"learning_rate": 2.136390586849255e-06,
"loss": 1.4346,
"step": 4860
},
{
"epoch": 0.3612759643916914,
"grad_norm": 10.004661560058594,
"learning_rate": 2.1332232398993634e-06,
"loss": 1.4811,
"step": 4870
},
{
"epoch": 0.3620178041543027,
"grad_norm": 8.67725944519043,
"learning_rate": 2.130052453597598e-06,
"loss": 1.3436,
"step": 4880
},
{
"epoch": 0.36275964391691395,
"grad_norm": 8.538166999816895,
"learning_rate": 2.126878245166084e-06,
"loss": 1.286,
"step": 4890
},
{
"epoch": 0.36350148367952523,
"grad_norm": 8.13525676727295,
"learning_rate": 2.1237006318455345e-06,
"loss": 1.3891,
"step": 4900
},
{
"epoch": 0.3642433234421365,
"grad_norm": 7.657358646392822,
"learning_rate": 2.1205196308951547e-06,
"loss": 1.4672,
"step": 4910
},
{
"epoch": 0.3649851632047478,
"grad_norm": 9.132546424865723,
"learning_rate": 2.1173352595925505e-06,
"loss": 1.2085,
"step": 4920
},
{
"epoch": 0.36572700296735905,
"grad_norm": 8.413400650024414,
"learning_rate": 2.1141475352336345e-06,
"loss": 1.2139,
"step": 4930
},
{
"epoch": 0.3664688427299703,
"grad_norm": 8.649598121643066,
"learning_rate": 2.1109564751325297e-06,
"loss": 1.5049,
"step": 4940
},
{
"epoch": 0.3672106824925816,
"grad_norm": 10.267006874084473,
"learning_rate": 2.107762096621479e-06,
"loss": 1.4108,
"step": 4950
},
{
"epoch": 0.36795252225519287,
"grad_norm": 8.94491195678711,
"learning_rate": 2.104564417050749e-06,
"loss": 1.3822,
"step": 4960
},
{
"epoch": 0.36869436201780414,
"grad_norm": 7.626391887664795,
"learning_rate": 2.101363453788534e-06,
"loss": 1.4081,
"step": 4970
},
{
"epoch": 0.3694362017804154,
"grad_norm": 9.207382202148438,
"learning_rate": 2.0981592242208664e-06,
"loss": 1.3541,
"step": 4980
},
{
"epoch": 0.3701780415430267,
"grad_norm": 7.966575622558594,
"learning_rate": 2.094951745751518e-06,
"loss": 1.5405,
"step": 4990
},
{
"epoch": 0.37091988130563797,
"grad_norm": 8.80086612701416,
"learning_rate": 2.0917410358019074e-06,
"loss": 1.4639,
"step": 5000
},
{
"epoch": 0.37091988130563797,
"eval_loss": 1.3480095863342285,
"eval_runtime": 23.4543,
"eval_samples_per_second": 19.016,
"eval_steps_per_second": 9.508,
"step": 5000
},
{
"epoch": 0.37166172106824924,
"grad_norm": 8.063216209411621,
"learning_rate": 2.0885271118110046e-06,
"loss": 1.3554,
"step": 5010
},
{
"epoch": 0.3724035608308605,
"grad_norm": 8.728006362915039,
"learning_rate": 2.0853099912352377e-06,
"loss": 1.2087,
"step": 5020
},
{
"epoch": 0.3731454005934718,
"grad_norm": 9.18012523651123,
"learning_rate": 2.0820896915483957e-06,
"loss": 1.3693,
"step": 5030
},
{
"epoch": 0.37388724035608306,
"grad_norm": 7.697686672210693,
"learning_rate": 2.0788662302415355e-06,
"loss": 1.3692,
"step": 5040
},
{
"epoch": 0.37462908011869434,
"grad_norm": 7.777410984039307,
"learning_rate": 2.075639624822886e-06,
"loss": 1.4546,
"step": 5050
},
{
"epoch": 0.37537091988130566,
"grad_norm": 8.502872467041016,
"learning_rate": 2.072409892817755e-06,
"loss": 1.3695,
"step": 5060
},
{
"epoch": 0.37611275964391694,
"grad_norm": 8.375325202941895,
"learning_rate": 2.0691770517684303e-06,
"loss": 1.3583,
"step": 5070
},
{
"epoch": 0.3768545994065282,
"grad_norm": 10.402475357055664,
"learning_rate": 2.0659411192340875e-06,
"loss": 1.4421,
"step": 5080
},
{
"epoch": 0.3775964391691395,
"grad_norm": 8.315070152282715,
"learning_rate": 2.0627021127906936e-06,
"loss": 1.3451,
"step": 5090
},
{
"epoch": 0.37833827893175076,
"grad_norm": 8.026792526245117,
"learning_rate": 2.05946005003091e-06,
"loss": 1.2854,
"step": 5100
},
{
"epoch": 0.37908011869436203,
"grad_norm": 8.60229778289795,
"learning_rate": 2.056214948564002e-06,
"loss": 1.3984,
"step": 5110
},
{
"epoch": 0.3798219584569733,
"grad_norm": 8.691934585571289,
"learning_rate": 2.0529668260157356e-06,
"loss": 1.4777,
"step": 5120
},
{
"epoch": 0.3805637982195846,
"grad_norm": 8.551725387573242,
"learning_rate": 2.049715700028288e-06,
"loss": 1.2376,
"step": 5130
},
{
"epoch": 0.38130563798219586,
"grad_norm": 7.708804130554199,
"learning_rate": 2.04646158826015e-06,
"loss": 1.253,
"step": 5140
},
{
"epoch": 0.38204747774480713,
"grad_norm": 9.0563325881958,
"learning_rate": 2.043204508386028e-06,
"loss": 1.3143,
"step": 5150
},
{
"epoch": 0.3827893175074184,
"grad_norm": 9.717677116394043,
"learning_rate": 2.0399444780967514e-06,
"loss": 1.389,
"step": 5160
},
{
"epoch": 0.3835311572700297,
"grad_norm": 10.435174942016602,
"learning_rate": 2.036681515099173e-06,
"loss": 1.3088,
"step": 5170
},
{
"epoch": 0.38427299703264095,
"grad_norm": 8.454843521118164,
"learning_rate": 2.0334156371160754e-06,
"loss": 1.3449,
"step": 5180
},
{
"epoch": 0.3850148367952522,
"grad_norm": 8.752850532531738,
"learning_rate": 2.030146861886075e-06,
"loss": 1.3281,
"step": 5190
},
{
"epoch": 0.3857566765578635,
"grad_norm": 7.73056173324585,
"learning_rate": 2.0268752071635235e-06,
"loss": 1.4503,
"step": 5200
},
{
"epoch": 0.38649851632047477,
"grad_norm": 8.349225044250488,
"learning_rate": 2.0236006907184124e-06,
"loss": 1.3468,
"step": 5210
},
{
"epoch": 0.38724035608308605,
"grad_norm": 9.541553497314453,
"learning_rate": 2.0203233303362773e-06,
"loss": 1.4216,
"step": 5220
},
{
"epoch": 0.3879821958456973,
"grad_norm": 7.54893159866333,
"learning_rate": 2.0170431438181e-06,
"loss": 1.4398,
"step": 5230
},
{
"epoch": 0.3887240356083086,
"grad_norm": 8.763372421264648,
"learning_rate": 2.0137601489802127e-06,
"loss": 1.5001,
"step": 5240
},
{
"epoch": 0.38946587537091987,
"grad_norm": 6.774653434753418,
"learning_rate": 2.010474363654201e-06,
"loss": 1.2526,
"step": 5250
},
{
"epoch": 0.39020771513353114,
"grad_norm": 7.963438510894775,
"learning_rate": 2.0071858056868074e-06,
"loss": 1.2569,
"step": 5260
},
{
"epoch": 0.3909495548961424,
"grad_norm": 10.730804443359375,
"learning_rate": 2.003894492939834e-06,
"loss": 1.3766,
"step": 5270
},
{
"epoch": 0.3916913946587537,
"grad_norm": 8.266863822937012,
"learning_rate": 2.0006004432900444e-06,
"loss": 1.4004,
"step": 5280
},
{
"epoch": 0.39243323442136496,
"grad_norm": 8.219123840332031,
"learning_rate": 1.997303674629069e-06,
"loss": 1.3371,
"step": 5290
},
{
"epoch": 0.39317507418397624,
"grad_norm": 7.95269250869751,
"learning_rate": 1.9940042048633056e-06,
"loss": 1.4416,
"step": 5300
},
{
"epoch": 0.3939169139465875,
"grad_norm": 7.302926063537598,
"learning_rate": 1.9907020519138247e-06,
"loss": 1.3352,
"step": 5310
},
{
"epoch": 0.39465875370919884,
"grad_norm": 8.411139488220215,
"learning_rate": 1.987397233716267e-06,
"loss": 1.29,
"step": 5320
},
{
"epoch": 0.3954005934718101,
"grad_norm": 7.670512676239014,
"learning_rate": 1.9840897682207537e-06,
"loss": 1.3194,
"step": 5330
},
{
"epoch": 0.3961424332344214,
"grad_norm": 11.99163818359375,
"learning_rate": 1.9807796733917815e-06,
"loss": 1.4642,
"step": 5340
},
{
"epoch": 0.39688427299703266,
"grad_norm": 8.448274612426758,
"learning_rate": 1.9774669672081307e-06,
"loss": 1.277,
"step": 5350
},
{
"epoch": 0.39762611275964393,
"grad_norm": 8.752152442932129,
"learning_rate": 1.9741516676627632e-06,
"loss": 1.3266,
"step": 5360
},
{
"epoch": 0.3983679525222552,
"grad_norm": 8.631105422973633,
"learning_rate": 1.970833792762729e-06,
"loss": 1.4025,
"step": 5370
},
{
"epoch": 0.3991097922848665,
"grad_norm": 8.437644004821777,
"learning_rate": 1.967513360529063e-06,
"loss": 1.4304,
"step": 5380
},
{
"epoch": 0.39985163204747776,
"grad_norm": 8.341066360473633,
"learning_rate": 1.964190388996694e-06,
"loss": 1.3816,
"step": 5390
},
{
"epoch": 0.40059347181008903,
"grad_norm": 7.804527282714844,
"learning_rate": 1.9608648962143394e-06,
"loss": 1.4099,
"step": 5400
},
{
"epoch": 0.4013353115727003,
"grad_norm": 8.778786659240723,
"learning_rate": 1.957536900244414e-06,
"loss": 1.2651,
"step": 5410
},
{
"epoch": 0.4020771513353116,
"grad_norm": 8.054415702819824,
"learning_rate": 1.954206419162925e-06,
"loss": 1.4155,
"step": 5420
},
{
"epoch": 0.40281899109792285,
"grad_norm": 7.543354511260986,
"learning_rate": 1.950873471059382e-06,
"loss": 1.412,
"step": 5430
},
{
"epoch": 0.4035608308605341,
"grad_norm": 9.169261932373047,
"learning_rate": 1.9475380740366903e-06,
"loss": 1.4265,
"step": 5440
},
{
"epoch": 0.4043026706231454,
"grad_norm": 8.047539710998535,
"learning_rate": 1.944200246211058e-06,
"loss": 1.4605,
"step": 5450
},
{
"epoch": 0.4050445103857567,
"grad_norm": 9.375300407409668,
"learning_rate": 1.940860005711897e-06,
"loss": 1.4745,
"step": 5460
},
{
"epoch": 0.40578635014836795,
"grad_norm": 8.199248313903809,
"learning_rate": 1.9375173706817215e-06,
"loss": 1.3614,
"step": 5470
},
{
"epoch": 0.4065281899109792,
"grad_norm": 9.075878143310547,
"learning_rate": 1.9341723592760542e-06,
"loss": 1.4263,
"step": 5480
},
{
"epoch": 0.4072700296735905,
"grad_norm": 7.4491472244262695,
"learning_rate": 1.930824989663323e-06,
"loss": 1.251,
"step": 5490
},
{
"epoch": 0.40801186943620177,
"grad_norm": 8.764143943786621,
"learning_rate": 1.9274752800247654e-06,
"loss": 1.4405,
"step": 5500
},
{
"epoch": 0.40801186943620177,
"eval_loss": 1.340783953666687,
"eval_runtime": 23.4462,
"eval_samples_per_second": 19.022,
"eval_steps_per_second": 9.511,
"step": 5500
},
{
"epoch": 0.40875370919881304,
"grad_norm": 8.902606964111328,
"learning_rate": 1.9241232485543284e-06,
"loss": 1.3789,
"step": 5510
},
{
"epoch": 0.4094955489614243,
"grad_norm": 7.769072532653809,
"learning_rate": 1.9207689134585698e-06,
"loss": 1.5089,
"step": 5520
},
{
"epoch": 0.4102373887240356,
"grad_norm": 9.30247974395752,
"learning_rate": 1.91741229295656e-06,
"loss": 1.2942,
"step": 5530
},
{
"epoch": 0.41097922848664686,
"grad_norm": 9.735326766967773,
"learning_rate": 1.914053405279783e-06,
"loss": 1.1792,
"step": 5540
},
{
"epoch": 0.41172106824925814,
"grad_norm": 8.925307273864746,
"learning_rate": 1.9106922686720356e-06,
"loss": 1.4032,
"step": 5550
},
{
"epoch": 0.4124629080118694,
"grad_norm": 8.152726173400879,
"learning_rate": 1.9073289013893313e-06,
"loss": 1.3349,
"step": 5560
},
{
"epoch": 0.4132047477744807,
"grad_norm": 8.074481964111328,
"learning_rate": 1.9039633216997978e-06,
"loss": 1.2687,
"step": 5570
},
{
"epoch": 0.413946587537092,
"grad_norm": 7.500307559967041,
"learning_rate": 1.900595547883581e-06,
"loss": 1.3318,
"step": 5580
},
{
"epoch": 0.4146884272997033,
"grad_norm": 8.518424987792969,
"learning_rate": 1.8972255982327432e-06,
"loss": 1.4255,
"step": 5590
},
{
"epoch": 0.41543026706231456,
"grad_norm": 9.059218406677246,
"learning_rate": 1.8938534910511652e-06,
"loss": 1.3451,
"step": 5600
},
{
"epoch": 0.41617210682492584,
"grad_norm": 8.822978973388672,
"learning_rate": 1.8904792446544467e-06,
"loss": 1.623,
"step": 5610
},
{
"epoch": 0.4169139465875371,
"grad_norm": 8.972715377807617,
"learning_rate": 1.8871028773698058e-06,
"loss": 1.447,
"step": 5620
},
{
"epoch": 0.4176557863501484,
"grad_norm": 7.2900519371032715,
"learning_rate": 1.8837244075359804e-06,
"loss": 1.3426,
"step": 5630
},
{
"epoch": 0.41839762611275966,
"grad_norm": 8.24610710144043,
"learning_rate": 1.880343853503129e-06,
"loss": 1.3507,
"step": 5640
},
{
"epoch": 0.41913946587537093,
"grad_norm": 10.137441635131836,
"learning_rate": 1.8769612336327294e-06,
"loss": 1.4335,
"step": 5650
},
{
"epoch": 0.4198813056379822,
"grad_norm": 10.343937873840332,
"learning_rate": 1.8735765662974818e-06,
"loss": 1.3133,
"step": 5660
},
{
"epoch": 0.4206231454005935,
"grad_norm": 8.10049057006836,
"learning_rate": 1.8701898698812047e-06,
"loss": 1.31,
"step": 5670
},
{
"epoch": 0.42136498516320475,
"grad_norm": 8.974928855895996,
"learning_rate": 1.86680116277874e-06,
"loss": 1.3522,
"step": 5680
},
{
"epoch": 0.422106824925816,
"grad_norm": 7.443127632141113,
"learning_rate": 1.8634104633958483e-06,
"loss": 1.2373,
"step": 5690
},
{
"epoch": 0.4228486646884273,
"grad_norm": 8.140283584594727,
"learning_rate": 1.8600177901491135e-06,
"loss": 1.2969,
"step": 5700
},
{
"epoch": 0.4235905044510386,
"grad_norm": 8.618755340576172,
"learning_rate": 1.8566231614658389e-06,
"loss": 1.185,
"step": 5710
},
{
"epoch": 0.42433234421364985,
"grad_norm": 8.221843719482422,
"learning_rate": 1.8532265957839497e-06,
"loss": 1.3558,
"step": 5720
},
{
"epoch": 0.4250741839762611,
"grad_norm": 12.334073066711426,
"learning_rate": 1.8498281115518912e-06,
"loss": 1.3281,
"step": 5730
},
{
"epoch": 0.4258160237388724,
"grad_norm": 7.851191997528076,
"learning_rate": 1.8464277272285305e-06,
"loss": 1.2885,
"step": 5740
},
{
"epoch": 0.42655786350148367,
"grad_norm": 8.391671180725098,
"learning_rate": 1.843025461283053e-06,
"loss": 1.3001,
"step": 5750
},
{
"epoch": 0.42729970326409494,
"grad_norm": 9.907540321350098,
"learning_rate": 1.839621332194866e-06,
"loss": 1.4639,
"step": 5760
},
{
"epoch": 0.4280415430267062,
"grad_norm": 8.890905380249023,
"learning_rate": 1.8362153584534963e-06,
"loss": 1.3371,
"step": 5770
},
{
"epoch": 0.4287833827893175,
"grad_norm": 8.191327095031738,
"learning_rate": 1.8328075585584888e-06,
"loss": 1.4174,
"step": 5780
},
{
"epoch": 0.42952522255192876,
"grad_norm": 7.765829563140869,
"learning_rate": 1.829397951019308e-06,
"loss": 1.3488,
"step": 5790
},
{
"epoch": 0.43026706231454004,
"grad_norm": 8.07245922088623,
"learning_rate": 1.8259865543552362e-06,
"loss": 1.1749,
"step": 5800
},
{
"epoch": 0.4310089020771513,
"grad_norm": 7.672754287719727,
"learning_rate": 1.8225733870952739e-06,
"loss": 1.3164,
"step": 5810
},
{
"epoch": 0.4317507418397626,
"grad_norm": 8.181532859802246,
"learning_rate": 1.819158467778038e-06,
"loss": 1.387,
"step": 5820
},
{
"epoch": 0.43249258160237386,
"grad_norm": 8.17938232421875,
"learning_rate": 1.8157418149516617e-06,
"loss": 1.2231,
"step": 5830
},
{
"epoch": 0.4332344213649852,
"grad_norm": 7.951348304748535,
"learning_rate": 1.8123234471736945e-06,
"loss": 1.4411,
"step": 5840
},
{
"epoch": 0.43397626112759646,
"grad_norm": 7.451209545135498,
"learning_rate": 1.8089033830110003e-06,
"loss": 1.3168,
"step": 5850
},
{
"epoch": 0.43471810089020774,
"grad_norm": 8.86732292175293,
"learning_rate": 1.805481641039656e-06,
"loss": 1.4272,
"step": 5860
},
{
"epoch": 0.435459940652819,
"grad_norm": 8.028582572937012,
"learning_rate": 1.8020582398448532e-06,
"loss": 1.2012,
"step": 5870
},
{
"epoch": 0.4362017804154303,
"grad_norm": 7.9948506355285645,
"learning_rate": 1.7986331980207942e-06,
"loss": 1.377,
"step": 5880
},
{
"epoch": 0.43694362017804156,
"grad_norm": 8.945382118225098,
"learning_rate": 1.7952065341705928e-06,
"loss": 1.285,
"step": 5890
},
{
"epoch": 0.43768545994065283,
"grad_norm": 8.703865051269531,
"learning_rate": 1.7917782669061727e-06,
"loss": 1.4814,
"step": 5900
},
{
"epoch": 0.4384272997032641,
"grad_norm": 8.220625877380371,
"learning_rate": 1.7883484148481669e-06,
"loss": 1.3047,
"step": 5910
},
{
"epoch": 0.4391691394658754,
"grad_norm": 8.814275741577148,
"learning_rate": 1.7849169966258158e-06,
"loss": 1.2686,
"step": 5920
},
{
"epoch": 0.43991097922848665,
"grad_norm": 8.656988143920898,
"learning_rate": 1.7814840308768672e-06,
"loss": 1.3689,
"step": 5930
},
{
"epoch": 0.4406528189910979,
"grad_norm": 7.942451000213623,
"learning_rate": 1.778049536247473e-06,
"loss": 1.4089,
"step": 5940
},
{
"epoch": 0.4413946587537092,
"grad_norm": 8.073698997497559,
"learning_rate": 1.7746135313920907e-06,
"loss": 1.3592,
"step": 5950
},
{
"epoch": 0.4421364985163205,
"grad_norm": 9.229683876037598,
"learning_rate": 1.7711760349733793e-06,
"loss": 1.2828,
"step": 5960
},
{
"epoch": 0.44287833827893175,
"grad_norm": 9.150603294372559,
"learning_rate": 1.7677370656620997e-06,
"loss": 1.2879,
"step": 5970
},
{
"epoch": 0.443620178041543,
"grad_norm": 8.25768756866455,
"learning_rate": 1.7642966421370136e-06,
"loss": 1.4304,
"step": 5980
},
{
"epoch": 0.4443620178041543,
"grad_norm": 9.358892440795898,
"learning_rate": 1.7608547830847795e-06,
"loss": 1.4317,
"step": 5990
},
{
"epoch": 0.44510385756676557,
"grad_norm": 8.074627876281738,
"learning_rate": 1.757411507199855e-06,
"loss": 1.2926,
"step": 6000
},
{
"epoch": 0.44510385756676557,
"eval_loss": 1.3348528146743774,
"eval_runtime": 23.4773,
"eval_samples_per_second": 18.997,
"eval_steps_per_second": 9.499,
"step": 6000
},
{
"epoch": 0.44584569732937684,
"grad_norm": 7.637718677520752,
"learning_rate": 1.7539668331843914e-06,
"loss": 1.3149,
"step": 6010
},
{
"epoch": 0.4465875370919881,
"grad_norm": 10.58519458770752,
"learning_rate": 1.7505207797481356e-06,
"loss": 1.3607,
"step": 6020
},
{
"epoch": 0.4473293768545994,
"grad_norm": 7.9096174240112305,
"learning_rate": 1.7470733656083253e-06,
"loss": 1.2627,
"step": 6030
},
{
"epoch": 0.44807121661721067,
"grad_norm": 7.344761848449707,
"learning_rate": 1.7436246094895896e-06,
"loss": 1.4465,
"step": 6040
},
{
"epoch": 0.44881305637982194,
"grad_norm": 8.851872444152832,
"learning_rate": 1.740174530123847e-06,
"loss": 1.3832,
"step": 6050
},
{
"epoch": 0.4495548961424332,
"grad_norm": 8.735071182250977,
"learning_rate": 1.7367231462502024e-06,
"loss": 1.4773,
"step": 6060
},
{
"epoch": 0.4502967359050445,
"grad_norm": 8.918268203735352,
"learning_rate": 1.7332704766148466e-06,
"loss": 1.3897,
"step": 6070
},
{
"epoch": 0.45103857566765576,
"grad_norm": 8.48647689819336,
"learning_rate": 1.729816539970954e-06,
"loss": 1.2423,
"step": 6080
},
{
"epoch": 0.45178041543026703,
"grad_norm": 6.995253562927246,
"learning_rate": 1.72636135507858e-06,
"loss": 1.2992,
"step": 6090
},
{
"epoch": 0.45252225519287836,
"grad_norm": 7.405545234680176,
"learning_rate": 1.7229049407045613e-06,
"loss": 1.3971,
"step": 6100
},
{
"epoch": 0.45326409495548964,
"grad_norm": 8.452637672424316,
"learning_rate": 1.7194473156224113e-06,
"loss": 1.3156,
"step": 6110
},
{
"epoch": 0.4540059347181009,
"grad_norm": 7.952899932861328,
"learning_rate": 1.7159884986122197e-06,
"loss": 1.3817,
"step": 6120
},
{
"epoch": 0.4547477744807122,
"grad_norm": 8.648924827575684,
"learning_rate": 1.7125285084605509e-06,
"loss": 1.3269,
"step": 6130
},
{
"epoch": 0.45548961424332346,
"grad_norm": 7.878424167633057,
"learning_rate": 1.7090673639603399e-06,
"loss": 1.3511,
"step": 6140
},
{
"epoch": 0.45623145400593473,
"grad_norm": 10.038208961486816,
"learning_rate": 1.7056050839107924e-06,
"loss": 1.3547,
"step": 6150
},
{
"epoch": 0.456973293768546,
"grad_norm": 11.209604263305664,
"learning_rate": 1.7021416871172816e-06,
"loss": 1.38,
"step": 6160
},
{
"epoch": 0.4577151335311573,
"grad_norm": 8.880349159240723,
"learning_rate": 1.6986771923912466e-06,
"loss": 1.3767,
"step": 6170
},
{
"epoch": 0.45845697329376855,
"grad_norm": 7.9594221115112305,
"learning_rate": 1.6952116185500891e-06,
"loss": 1.3401,
"step": 6180
},
{
"epoch": 0.45919881305637983,
"grad_norm": 9.231648445129395,
"learning_rate": 1.6917449844170733e-06,
"loss": 1.3873,
"step": 6190
},
{
"epoch": 0.4599406528189911,
"grad_norm": 8.900077819824219,
"learning_rate": 1.6882773088212214e-06,
"loss": 1.4,
"step": 6200
},
{
"epoch": 0.4606824925816024,
"grad_norm": 9.752120018005371,
"learning_rate": 1.6848086105972123e-06,
"loss": 1.3674,
"step": 6210
},
{
"epoch": 0.46142433234421365,
"grad_norm": 9.113099098205566,
"learning_rate": 1.6813389085852794e-06,
"loss": 1.454,
"step": 6220
},
{
"epoch": 0.4621661721068249,
"grad_norm": 8.19013500213623,
"learning_rate": 1.677868221631109e-06,
"loss": 1.3381,
"step": 6230
},
{
"epoch": 0.4629080118694362,
"grad_norm": 7.306256294250488,
"learning_rate": 1.674396568585736e-06,
"loss": 1.3912,
"step": 6240
},
{
"epoch": 0.46364985163204747,
"grad_norm": 8.432893753051758,
"learning_rate": 1.6709239683054433e-06,
"loss": 1.2639,
"step": 6250
},
{
"epoch": 0.46439169139465875,
"grad_norm": 9.081368446350098,
"learning_rate": 1.6674504396516583e-06,
"loss": 1.3728,
"step": 6260
},
{
"epoch": 0.46513353115727,
"grad_norm": 8.188736915588379,
"learning_rate": 1.663976001490851e-06,
"loss": 1.3573,
"step": 6270
},
{
"epoch": 0.4658753709198813,
"grad_norm": 8.223960876464844,
"learning_rate": 1.6605006726944314e-06,
"loss": 1.3602,
"step": 6280
},
{
"epoch": 0.46661721068249257,
"grad_norm": 7.188130855560303,
"learning_rate": 1.6570244721386472e-06,
"loss": 1.3091,
"step": 6290
},
{
"epoch": 0.46735905044510384,
"grad_norm": 8.153417587280273,
"learning_rate": 1.6535474187044809e-06,
"loss": 1.3743,
"step": 6300
},
{
"epoch": 0.4681008902077151,
"grad_norm": 7.9417290687561035,
"learning_rate": 1.650069531277547e-06,
"loss": 1.2242,
"step": 6310
},
{
"epoch": 0.4688427299703264,
"grad_norm": 10.858664512634277,
"learning_rate": 1.6465908287479907e-06,
"loss": 1.329,
"step": 6320
},
{
"epoch": 0.46958456973293766,
"grad_norm": 11.415666580200195,
"learning_rate": 1.6431113300103836e-06,
"loss": 1.3142,
"step": 6330
},
{
"epoch": 0.47032640949554894,
"grad_norm": 9.50818920135498,
"learning_rate": 1.6396310539636222e-06,
"loss": 1.335,
"step": 6340
},
{
"epoch": 0.4710682492581602,
"grad_norm": 8.820195198059082,
"learning_rate": 1.6361500195108256e-06,
"loss": 1.3818,
"step": 6350
},
{
"epoch": 0.47181008902077154,
"grad_norm": 8.231925964355469,
"learning_rate": 1.6326682455592306e-06,
"loss": 1.5702,
"step": 6360
},
{
"epoch": 0.4725519287833828,
"grad_norm": 8.553587913513184,
"learning_rate": 1.6291857510200926e-06,
"loss": 1.3378,
"step": 6370
},
{
"epoch": 0.4732937685459941,
"grad_norm": 8.568156242370605,
"learning_rate": 1.6257025548085788e-06,
"loss": 1.3023,
"step": 6380
},
{
"epoch": 0.47403560830860536,
"grad_norm": 8.378904342651367,
"learning_rate": 1.6222186758436698e-06,
"loss": 1.4306,
"step": 6390
},
{
"epoch": 0.47477744807121663,
"grad_norm": 8.451229095458984,
"learning_rate": 1.6187341330480523e-06,
"loss": 1.166,
"step": 6400
},
{
"epoch": 0.4755192878338279,
"grad_norm": 8.599996566772461,
"learning_rate": 1.6152489453480202e-06,
"loss": 1.365,
"step": 6410
},
{
"epoch": 0.4762611275964392,
"grad_norm": 8.459872245788574,
"learning_rate": 1.6117631316733698e-06,
"loss": 1.278,
"step": 6420
},
{
"epoch": 0.47700296735905046,
"grad_norm": 9.12617301940918,
"learning_rate": 1.6082767109572964e-06,
"loss": 1.2172,
"step": 6430
},
{
"epoch": 0.47774480712166173,
"grad_norm": 7.814152717590332,
"learning_rate": 1.6047897021362942e-06,
"loss": 1.2797,
"step": 6440
},
{
"epoch": 0.478486646884273,
"grad_norm": 9.098596572875977,
"learning_rate": 1.60130212415005e-06,
"loss": 1.3154,
"step": 6450
},
{
"epoch": 0.4792284866468843,
"grad_norm": 9.89655876159668,
"learning_rate": 1.597813995941343e-06,
"loss": 1.4306,
"step": 6460
},
{
"epoch": 0.47997032640949555,
"grad_norm": 8.791460037231445,
"learning_rate": 1.5943253364559412e-06,
"loss": 1.2269,
"step": 6470
},
{
"epoch": 0.4807121661721068,
"grad_norm": 8.997727394104004,
"learning_rate": 1.5908361646424973e-06,
"loss": 1.4215,
"step": 6480
},
{
"epoch": 0.4814540059347181,
"grad_norm": 7.6386284828186035,
"learning_rate": 1.5873464994524473e-06,
"loss": 1.2984,
"step": 6490
},
{
"epoch": 0.4821958456973294,
"grad_norm": 9.248114585876465,
"learning_rate": 1.5838563598399068e-06,
"loss": 1.3452,
"step": 6500
},
{
"epoch": 0.4821958456973294,
"eval_loss": 1.326774001121521,
"eval_runtime": 23.5945,
"eval_samples_per_second": 18.903,
"eval_steps_per_second": 9.451,
"step": 6500
},
{
"epoch": 0.48293768545994065,
"grad_norm": 7.455526828765869,
"learning_rate": 1.580365764761568e-06,
"loss": 1.2932,
"step": 6510
},
{
"epoch": 0.4836795252225519,
"grad_norm": 9.042367935180664,
"learning_rate": 1.5768747331765977e-06,
"loss": 1.358,
"step": 6520
},
{
"epoch": 0.4844213649851632,
"grad_norm": 7.080817222595215,
"learning_rate": 1.5733832840465328e-06,
"loss": 1.2915,
"step": 6530
},
{
"epoch": 0.48516320474777447,
"grad_norm": 11.231888771057129,
"learning_rate": 1.5698914363351784e-06,
"loss": 1.3181,
"step": 6540
},
{
"epoch": 0.48590504451038574,
"grad_norm": 7.5210347175598145,
"learning_rate": 1.5663992090085044e-06,
"loss": 1.3802,
"step": 6550
},
{
"epoch": 0.486646884272997,
"grad_norm": 8.118837356567383,
"learning_rate": 1.5629066210345432e-06,
"loss": 1.4856,
"step": 6560
},
{
"epoch": 0.4873887240356083,
"grad_norm": 7.752665996551514,
"learning_rate": 1.559413691383285e-06,
"loss": 1.3588,
"step": 6570
},
{
"epoch": 0.48813056379821956,
"grad_norm": 8.421116828918457,
"learning_rate": 1.5559204390265764e-06,
"loss": 1.4454,
"step": 6580
},
{
"epoch": 0.48887240356083084,
"grad_norm": 8.583824157714844,
"learning_rate": 1.5524268829380168e-06,
"loss": 1.4392,
"step": 6590
},
{
"epoch": 0.4896142433234421,
"grad_norm": 8.850062370300293,
"learning_rate": 1.5489330420928555e-06,
"loss": 1.3796,
"step": 6600
},
{
"epoch": 0.4903560830860534,
"grad_norm": 7.187986850738525,
"learning_rate": 1.5454389354678882e-06,
"loss": 1.1743,
"step": 6610
},
{
"epoch": 0.4910979228486647,
"grad_norm": 17.749059677124023,
"learning_rate": 1.541944582041353e-06,
"loss": 1.3122,
"step": 6620
},
{
"epoch": 0.491839762611276,
"grad_norm": 9.939379692077637,
"learning_rate": 1.5384500007928312e-06,
"loss": 1.1216,
"step": 6630
},
{
"epoch": 0.49258160237388726,
"grad_norm": 9.638907432556152,
"learning_rate": 1.53495521070314e-06,
"loss": 1.2621,
"step": 6640
},
{
"epoch": 0.49332344213649854,
"grad_norm": 9.247072219848633,
"learning_rate": 1.5314602307542297e-06,
"loss": 1.255,
"step": 6650
},
{
"epoch": 0.4940652818991098,
"grad_norm": 10.329320907592773,
"learning_rate": 1.5279650799290838e-06,
"loss": 1.3395,
"step": 6660
},
{
"epoch": 0.4948071216617211,
"grad_norm": 8.686713218688965,
"learning_rate": 1.5244697772116131e-06,
"loss": 1.1988,
"step": 6670
},
{
"epoch": 0.49554896142433236,
"grad_norm": 9.043136596679688,
"learning_rate": 1.5209743415865535e-06,
"loss": 1.3861,
"step": 6680
},
{
"epoch": 0.49629080118694363,
"grad_norm": 9.186018943786621,
"learning_rate": 1.5174787920393627e-06,
"loss": 1.2588,
"step": 6690
},
{
"epoch": 0.4970326409495549,
"grad_norm": 9.252155303955078,
"learning_rate": 1.5139831475561171e-06,
"loss": 1.554,
"step": 6700
},
{
"epoch": 0.4977744807121662,
"grad_norm": 9.65112018585205,
"learning_rate": 1.510487427123409e-06,
"loss": 1.3435,
"step": 6710
},
{
"epoch": 0.49851632047477745,
"grad_norm": 22.156383514404297,
"learning_rate": 1.5069916497282432e-06,
"loss": 1.178,
"step": 6720
},
{
"epoch": 0.4992581602373887,
"grad_norm": 8.21938419342041,
"learning_rate": 1.5034958343579333e-06,
"loss": 1.3944,
"step": 6730
},
{
"epoch": 0.5,
"grad_norm": 7.787656307220459,
"learning_rate": 1.5e-06,
"loss": 1.4009,
"step": 6740
},
{
"epoch": 0.5007418397626113,
"grad_norm": 8.978195190429688,
"learning_rate": 1.4965041656420666e-06,
"loss": 1.241,
"step": 6750
},
{
"epoch": 0.5014836795252225,
"grad_norm": 9.333284378051758,
"learning_rate": 1.4930083502717571e-06,
"loss": 1.5115,
"step": 6760
},
{
"epoch": 0.5022255192878339,
"grad_norm": 9.057726860046387,
"learning_rate": 1.489512572876591e-06,
"loss": 1.2611,
"step": 6770
},
{
"epoch": 0.5029673590504451,
"grad_norm": 9.008346557617188,
"learning_rate": 1.4860168524438831e-06,
"loss": 1.2435,
"step": 6780
},
{
"epoch": 0.5037091988130564,
"grad_norm": 7.9738640785217285,
"learning_rate": 1.4825212079606374e-06,
"loss": 1.2969,
"step": 6790
},
{
"epoch": 0.5044510385756676,
"grad_norm": 13.787586212158203,
"learning_rate": 1.4790256584134468e-06,
"loss": 1.4168,
"step": 6800
},
{
"epoch": 0.505192878338279,
"grad_norm": 8.508440971374512,
"learning_rate": 1.4755302227883868e-06,
"loss": 1.2758,
"step": 6810
},
{
"epoch": 0.5059347181008902,
"grad_norm": 9.42790699005127,
"learning_rate": 1.4720349200709164e-06,
"loss": 1.2748,
"step": 6820
},
{
"epoch": 0.5066765578635015,
"grad_norm": 9.03829288482666,
"learning_rate": 1.4685397692457704e-06,
"loss": 1.3407,
"step": 6830
},
{
"epoch": 0.5074183976261127,
"grad_norm": 9.19029712677002,
"learning_rate": 1.4650447892968606e-06,
"loss": 1.52,
"step": 6840
},
{
"epoch": 0.5081602373887241,
"grad_norm": 8.805255889892578,
"learning_rate": 1.4615499992071685e-06,
"loss": 1.3314,
"step": 6850
},
{
"epoch": 0.5089020771513353,
"grad_norm": 8.248116493225098,
"learning_rate": 1.4580554179586471e-06,
"loss": 1.4094,
"step": 6860
},
{
"epoch": 0.5096439169139466,
"grad_norm": 8.346354484558105,
"learning_rate": 1.4545610645321123e-06,
"loss": 1.4973,
"step": 6870
},
{
"epoch": 0.5103857566765578,
"grad_norm": 8.899476051330566,
"learning_rate": 1.451066957907145e-06,
"loss": 1.3733,
"step": 6880
},
{
"epoch": 0.5111275964391692,
"grad_norm": 7.146321773529053,
"learning_rate": 1.4475731170619835e-06,
"loss": 1.3282,
"step": 6890
},
{
"epoch": 0.5118694362017804,
"grad_norm": 9.217137336730957,
"learning_rate": 1.444079560973424e-06,
"loss": 1.5009,
"step": 6900
},
{
"epoch": 0.5126112759643917,
"grad_norm": 8.994102478027344,
"learning_rate": 1.4405863086167155e-06,
"loss": 1.3771,
"step": 6910
},
{
"epoch": 0.5133531157270029,
"grad_norm": 7.989219665527344,
"learning_rate": 1.4370933789654571e-06,
"loss": 1.385,
"step": 6920
},
{
"epoch": 0.5140949554896143,
"grad_norm": 8.614723205566406,
"learning_rate": 1.4336007909914957e-06,
"loss": 1.2987,
"step": 6930
},
{
"epoch": 0.5148367952522255,
"grad_norm": 7.992114543914795,
"learning_rate": 1.430108563664822e-06,
"loss": 1.1859,
"step": 6940
},
{
"epoch": 0.5155786350148368,
"grad_norm": 8.345887184143066,
"learning_rate": 1.4266167159534675e-06,
"loss": 1.4507,
"step": 6950
},
{
"epoch": 0.516320474777448,
"grad_norm": 8.506096839904785,
"learning_rate": 1.4231252668234026e-06,
"loss": 1.2592,
"step": 6960
},
{
"epoch": 0.5170623145400594,
"grad_norm": 7.255486011505127,
"learning_rate": 1.4196342352384323e-06,
"loss": 1.2013,
"step": 6970
},
{
"epoch": 0.5178041543026706,
"grad_norm": 7.925352573394775,
"learning_rate": 1.4161436401600939e-06,
"loss": 1.3405,
"step": 6980
},
{
"epoch": 0.5185459940652819,
"grad_norm": 7.987504482269287,
"learning_rate": 1.412653500547553e-06,
"loss": 1.3114,
"step": 6990
},
{
"epoch": 0.5192878338278932,
"grad_norm": 9.995888710021973,
"learning_rate": 1.4091638353575025e-06,
"loss": 1.3076,
"step": 7000
},
{
"epoch": 0.5192878338278932,
"eval_loss": 1.3201655149459839,
"eval_runtime": 23.6126,
"eval_samples_per_second": 18.888,
"eval_steps_per_second": 9.444,
"step": 7000
},
{
"epoch": 0.5200296735905044,
"grad_norm": 6.1546831130981445,
"learning_rate": 1.405674663544059e-06,
"loss": 1.4727,
"step": 7010
},
{
"epoch": 0.5207715133531158,
"grad_norm": 8.864068984985352,
"learning_rate": 1.4021860040586568e-06,
"loss": 1.2877,
"step": 7020
},
{
"epoch": 0.521513353115727,
"grad_norm": 9.57347297668457,
"learning_rate": 1.3986978758499504e-06,
"loss": 1.2283,
"step": 7030
},
{
"epoch": 0.5222551928783383,
"grad_norm": 8.824577331542969,
"learning_rate": 1.395210297863706e-06,
"loss": 1.4945,
"step": 7040
},
{
"epoch": 0.5229970326409495,
"grad_norm": 10.610620498657227,
"learning_rate": 1.3917232890427038e-06,
"loss": 1.4092,
"step": 7050
},
{
"epoch": 0.5237388724035609,
"grad_norm": 7.3669514656066895,
"learning_rate": 1.3882368683266303e-06,
"loss": 1.1762,
"step": 7060
},
{
"epoch": 0.5244807121661721,
"grad_norm": 8.22118091583252,
"learning_rate": 1.38475105465198e-06,
"loss": 1.3694,
"step": 7070
},
{
"epoch": 0.5252225519287834,
"grad_norm": 8.95012378692627,
"learning_rate": 1.3812658669519474e-06,
"loss": 1.3601,
"step": 7080
},
{
"epoch": 0.5259643916913946,
"grad_norm": 8.938467979431152,
"learning_rate": 1.3777813241563305e-06,
"loss": 1.4346,
"step": 7090
},
{
"epoch": 0.526706231454006,
"grad_norm": 8.244651794433594,
"learning_rate": 1.3742974451914208e-06,
"loss": 1.3497,
"step": 7100
},
{
"epoch": 0.5274480712166172,
"grad_norm": 9.305986404418945,
"learning_rate": 1.370814248979908e-06,
"loss": 1.5719,
"step": 7110
},
{
"epoch": 0.5281899109792285,
"grad_norm": 7.710730075836182,
"learning_rate": 1.3673317544407693e-06,
"loss": 1.191,
"step": 7120
},
{
"epoch": 0.5289317507418397,
"grad_norm": 9.58619499206543,
"learning_rate": 1.363849980489175e-06,
"loss": 1.3419,
"step": 7130
},
{
"epoch": 0.5296735905044511,
"grad_norm": 8.806848526000977,
"learning_rate": 1.3603689460363779e-06,
"loss": 1.3253,
"step": 7140
},
{
"epoch": 0.5304154302670623,
"grad_norm": 8.474712371826172,
"learning_rate": 1.3568886699896171e-06,
"loss": 1.2181,
"step": 7150
},
{
"epoch": 0.5311572700296736,
"grad_norm": 8.78541374206543,
"learning_rate": 1.3534091712520096e-06,
"loss": 1.3726,
"step": 7160
},
{
"epoch": 0.5318991097922848,
"grad_norm": 11.253677368164062,
"learning_rate": 1.3499304687224536e-06,
"loss": 1.2884,
"step": 7170
},
{
"epoch": 0.5326409495548962,
"grad_norm": 8.340043067932129,
"learning_rate": 1.3464525812955194e-06,
"loss": 1.3605,
"step": 7180
},
{
"epoch": 0.5333827893175074,
"grad_norm": 8.733418464660645,
"learning_rate": 1.3429755278613535e-06,
"loss": 1.2541,
"step": 7190
},
{
"epoch": 0.5341246290801187,
"grad_norm": 9.979363441467285,
"learning_rate": 1.3394993273055689e-06,
"loss": 1.3203,
"step": 7200
},
{
"epoch": 0.5348664688427299,
"grad_norm": 8.473489761352539,
"learning_rate": 1.3360239985091496e-06,
"loss": 1.3836,
"step": 7210
},
{
"epoch": 0.5356083086053413,
"grad_norm": 8.469969749450684,
"learning_rate": 1.3325495603483418e-06,
"loss": 1.3789,
"step": 7220
},
{
"epoch": 0.5363501483679525,
"grad_norm": 7.77994966506958,
"learning_rate": 1.3290760316945572e-06,
"loss": 1.2116,
"step": 7230
},
{
"epoch": 0.5370919881305638,
"grad_norm": 9.14150619506836,
"learning_rate": 1.325603431414264e-06,
"loss": 1.2778,
"step": 7240
},
{
"epoch": 0.537833827893175,
"grad_norm": 8.883842468261719,
"learning_rate": 1.3221317783688914e-06,
"loss": 1.2829,
"step": 7250
},
{
"epoch": 0.5385756676557863,
"grad_norm": 6.918141841888428,
"learning_rate": 1.3186610914147208e-06,
"loss": 1.2587,
"step": 7260
},
{
"epoch": 0.5393175074183977,
"grad_norm": 8.339578628540039,
"learning_rate": 1.3151913894027878e-06,
"loss": 1.3557,
"step": 7270
},
{
"epoch": 0.5400593471810089,
"grad_norm": 8.50107192993164,
"learning_rate": 1.3117226911787791e-06,
"loss": 1.2453,
"step": 7280
},
{
"epoch": 0.5408011869436202,
"grad_norm": 9.355497360229492,
"learning_rate": 1.3082550155829264e-06,
"loss": 1.4713,
"step": 7290
},
{
"epoch": 0.5415430267062314,
"grad_norm": 8.334994316101074,
"learning_rate": 1.304788381449911e-06,
"loss": 1.2284,
"step": 7300
},
{
"epoch": 0.5422848664688428,
"grad_norm": 9.552740097045898,
"learning_rate": 1.3013228076087534e-06,
"loss": 1.3224,
"step": 7310
},
{
"epoch": 0.543026706231454,
"grad_norm": 9.53915786743164,
"learning_rate": 1.2978583128827187e-06,
"loss": 1.3691,
"step": 7320
},
{
"epoch": 0.5437685459940653,
"grad_norm": 9.11638355255127,
"learning_rate": 1.2943949160892076e-06,
"loss": 1.3347,
"step": 7330
},
{
"epoch": 0.5445103857566765,
"grad_norm": 9.868489265441895,
"learning_rate": 1.2909326360396604e-06,
"loss": 1.5654,
"step": 7340
},
{
"epoch": 0.5452522255192879,
"grad_norm": 8.725732803344727,
"learning_rate": 1.287471491539449e-06,
"loss": 1.3145,
"step": 7350
},
{
"epoch": 0.5459940652818991,
"grad_norm": 8.547471046447754,
"learning_rate": 1.2840115013877804e-06,
"loss": 1.2752,
"step": 7360
},
{
"epoch": 0.5467359050445104,
"grad_norm": 11.938176155090332,
"learning_rate": 1.2805526843775888e-06,
"loss": 1.3646,
"step": 7370
},
{
"epoch": 0.5474777448071216,
"grad_norm": 11.184774398803711,
"learning_rate": 1.2770950592954392e-06,
"loss": 1.4144,
"step": 7380
},
{
"epoch": 0.548219584569733,
"grad_norm": 8.136163711547852,
"learning_rate": 1.27363864492142e-06,
"loss": 1.2555,
"step": 7390
},
{
"epoch": 0.5489614243323442,
"grad_norm": 8.048996925354004,
"learning_rate": 1.2701834600290465e-06,
"loss": 1.3139,
"step": 7400
},
{
"epoch": 0.5497032640949555,
"grad_norm": 8.8002347946167,
"learning_rate": 1.2667295233851534e-06,
"loss": 1.3354,
"step": 7410
},
{
"epoch": 0.5504451038575667,
"grad_norm": 8.829628944396973,
"learning_rate": 1.263276853749798e-06,
"loss": 1.519,
"step": 7420
},
{
"epoch": 0.5511869436201781,
"grad_norm": 8.89567756652832,
"learning_rate": 1.259825469876153e-06,
"loss": 1.4514,
"step": 7430
},
{
"epoch": 0.5519287833827893,
"grad_norm": 8.236814498901367,
"learning_rate": 1.2563753905104107e-06,
"loss": 1.2586,
"step": 7440
},
{
"epoch": 0.5526706231454006,
"grad_norm": 9.010204315185547,
"learning_rate": 1.252926634391675e-06,
"loss": 1.1963,
"step": 7450
},
{
"epoch": 0.5534124629080118,
"grad_norm": 8.456092834472656,
"learning_rate": 1.2494792202518651e-06,
"loss": 1.4698,
"step": 7460
},
{
"epoch": 0.5541543026706232,
"grad_norm": 7.783117294311523,
"learning_rate": 1.2460331668156087e-06,
"loss": 1.2172,
"step": 7470
},
{
"epoch": 0.5548961424332344,
"grad_norm": 8.84600830078125,
"learning_rate": 1.2425884928001456e-06,
"loss": 1.3524,
"step": 7480
},
{
"epoch": 0.5556379821958457,
"grad_norm": 13.498913764953613,
"learning_rate": 1.2391452169152206e-06,
"loss": 1.4842,
"step": 7490
},
{
"epoch": 0.5563798219584569,
"grad_norm": 7.4838433265686035,
"learning_rate": 1.2357033578629871e-06,
"loss": 1.2696,
"step": 7500
},
{
"epoch": 0.5563798219584569,
"eval_loss": 1.31540048122406,
"eval_runtime": 23.6078,
"eval_samples_per_second": 18.892,
"eval_steps_per_second": 9.446,
"step": 7500
},
{
"epoch": 0.5571216617210683,
"grad_norm": 8.919118881225586,
"learning_rate": 1.2322629343379003e-06,
"loss": 1.205,
"step": 7510
},
{
"epoch": 0.5578635014836796,
"grad_norm": 9.142733573913574,
"learning_rate": 1.2288239650266212e-06,
"loss": 1.1951,
"step": 7520
},
{
"epoch": 0.5586053412462908,
"grad_norm": 8.228799819946289,
"learning_rate": 1.2253864686079096e-06,
"loss": 1.2712,
"step": 7530
},
{
"epoch": 0.5593471810089021,
"grad_norm": 9.651594161987305,
"learning_rate": 1.2219504637525272e-06,
"loss": 1.3421,
"step": 7540
},
{
"epoch": 0.5600890207715133,
"grad_norm": 8.157588005065918,
"learning_rate": 1.2185159691231333e-06,
"loss": 1.3639,
"step": 7550
},
{
"epoch": 0.5608308605341247,
"grad_norm": 8.895820617675781,
"learning_rate": 1.2150830033741845e-06,
"loss": 1.4126,
"step": 7560
},
{
"epoch": 0.5615727002967359,
"grad_norm": 8.002445220947266,
"learning_rate": 1.2116515851518336e-06,
"loss": 1.5194,
"step": 7570
},
{
"epoch": 0.5623145400593472,
"grad_norm": 8.261061668395996,
"learning_rate": 1.2082217330938278e-06,
"loss": 1.341,
"step": 7580
},
{
"epoch": 0.5630563798219584,
"grad_norm": 8.519086837768555,
"learning_rate": 1.2047934658294077e-06,
"loss": 1.317,
"step": 7590
},
{
"epoch": 0.5637982195845698,
"grad_norm": 9.409530639648438,
"learning_rate": 1.2013668019792059e-06,
"loss": 1.3258,
"step": 7600
},
{
"epoch": 0.564540059347181,
"grad_norm": 8.627307891845703,
"learning_rate": 1.197941760155147e-06,
"loss": 1.3783,
"step": 7610
},
{
"epoch": 0.5652818991097923,
"grad_norm": 8.954816818237305,
"learning_rate": 1.1945183589603436e-06,
"loss": 1.3198,
"step": 7620
},
{
"epoch": 0.5660237388724035,
"grad_norm": 9.60593318939209,
"learning_rate": 1.191096616989e-06,
"loss": 1.4557,
"step": 7630
},
{
"epoch": 0.5667655786350149,
"grad_norm": 10.09070110321045,
"learning_rate": 1.1876765528263054e-06,
"loss": 1.2944,
"step": 7640
},
{
"epoch": 0.5675074183976261,
"grad_norm": 9.579095840454102,
"learning_rate": 1.1842581850483386e-06,
"loss": 1.1665,
"step": 7650
},
{
"epoch": 0.5682492581602374,
"grad_norm": 8.07282829284668,
"learning_rate": 1.1808415322219623e-06,
"loss": 1.2873,
"step": 7660
},
{
"epoch": 0.5689910979228486,
"grad_norm": 8.33482837677002,
"learning_rate": 1.1774266129047268e-06,
"loss": 1.3965,
"step": 7670
},
{
"epoch": 0.56973293768546,
"grad_norm": 7.368827819824219,
"learning_rate": 1.1740134456447643e-06,
"loss": 1.361,
"step": 7680
},
{
"epoch": 0.5704747774480712,
"grad_norm": 7.657955169677734,
"learning_rate": 1.1706020489806927e-06,
"loss": 1.2028,
"step": 7690
},
{
"epoch": 0.5712166172106825,
"grad_norm": 10.629265785217285,
"learning_rate": 1.1671924414415115e-06,
"loss": 1.4689,
"step": 7700
},
{
"epoch": 0.5719584569732937,
"grad_norm": 8.179962158203125,
"learning_rate": 1.1637846415465042e-06,
"loss": 1.2847,
"step": 7710
},
{
"epoch": 0.5727002967359051,
"grad_norm": 9.228793144226074,
"learning_rate": 1.160378667805134e-06,
"loss": 1.2259,
"step": 7720
},
{
"epoch": 0.5734421364985163,
"grad_norm": 9.42039966583252,
"learning_rate": 1.1569745387169476e-06,
"loss": 1.3845,
"step": 7730
},
{
"epoch": 0.5741839762611276,
"grad_norm": 7.016010284423828,
"learning_rate": 1.15357227277147e-06,
"loss": 1.2342,
"step": 7740
},
{
"epoch": 0.5749258160237388,
"grad_norm": 8.45107650756836,
"learning_rate": 1.1501718884481093e-06,
"loss": 1.2879,
"step": 7750
},
{
"epoch": 0.5756676557863502,
"grad_norm": 8.31155776977539,
"learning_rate": 1.1467734042160506e-06,
"loss": 1.1682,
"step": 7760
},
{
"epoch": 0.5764094955489614,
"grad_norm": 6.724592208862305,
"learning_rate": 1.1433768385341618e-06,
"loss": 1.279,
"step": 7770
},
{
"epoch": 0.5771513353115727,
"grad_norm": 7.946195602416992,
"learning_rate": 1.1399822098508868e-06,
"loss": 1.2484,
"step": 7780
},
{
"epoch": 0.577893175074184,
"grad_norm": 8.1701078414917,
"learning_rate": 1.1365895366041515e-06,
"loss": 1.4018,
"step": 7790
},
{
"epoch": 0.5786350148367952,
"grad_norm": 7.849374771118164,
"learning_rate": 1.1331988372212606e-06,
"loss": 1.3815,
"step": 7800
},
{
"epoch": 0.5793768545994066,
"grad_norm": 7.588284015655518,
"learning_rate": 1.129810130118795e-06,
"loss": 1.2523,
"step": 7810
},
{
"epoch": 0.5801186943620178,
"grad_norm": 7.635886192321777,
"learning_rate": 1.1264234337025184e-06,
"loss": 1.3134,
"step": 7820
},
{
"epoch": 0.5808605341246291,
"grad_norm": 8.270919799804688,
"learning_rate": 1.1230387663672702e-06,
"loss": 1.2948,
"step": 7830
},
{
"epoch": 0.5816023738872403,
"grad_norm": 8.233508110046387,
"learning_rate": 1.1196561464968714e-06,
"loss": 1.4182,
"step": 7840
},
{
"epoch": 0.5823442136498517,
"grad_norm": 7.905995845794678,
"learning_rate": 1.1162755924640197e-06,
"loss": 1.2159,
"step": 7850
},
{
"epoch": 0.5830860534124629,
"grad_norm": 8.946208953857422,
"learning_rate": 1.1128971226301945e-06,
"loss": 1.3037,
"step": 7860
},
{
"epoch": 0.5838278931750742,
"grad_norm": 7.928177833557129,
"learning_rate": 1.1095207553455534e-06,
"loss": 1.2651,
"step": 7870
},
{
"epoch": 0.5845697329376854,
"grad_norm": 8.850945472717285,
"learning_rate": 1.106146508948835e-06,
"loss": 1.2274,
"step": 7880
},
{
"epoch": 0.5853115727002968,
"grad_norm": 8.4835844039917,
"learning_rate": 1.1027744017672569e-06,
"loss": 1.3851,
"step": 7890
},
{
"epoch": 0.586053412462908,
"grad_norm": 9.85268783569336,
"learning_rate": 1.0994044521164195e-06,
"loss": 1.2782,
"step": 7900
},
{
"epoch": 0.5867952522255193,
"grad_norm": 9.465106964111328,
"learning_rate": 1.0960366783002025e-06,
"loss": 1.3173,
"step": 7910
},
{
"epoch": 0.5875370919881305,
"grad_norm": 8.64224624633789,
"learning_rate": 1.0926710986106692e-06,
"loss": 1.2422,
"step": 7920
},
{
"epoch": 0.5882789317507419,
"grad_norm": 7.39610481262207,
"learning_rate": 1.0893077313279645e-06,
"loss": 1.3971,
"step": 7930
},
{
"epoch": 0.5890207715133531,
"grad_norm": 10.103199005126953,
"learning_rate": 1.0859465947202174e-06,
"loss": 1.2907,
"step": 7940
},
{
"epoch": 0.5897626112759644,
"grad_norm": 8.195404052734375,
"learning_rate": 1.08258770704344e-06,
"loss": 1.3106,
"step": 7950
},
{
"epoch": 0.5905044510385756,
"grad_norm": 10.311722755432129,
"learning_rate": 1.0792310865414305e-06,
"loss": 1.482,
"step": 7960
},
{
"epoch": 0.591246290801187,
"grad_norm": 9.214127540588379,
"learning_rate": 1.075876751445672e-06,
"loss": 1.366,
"step": 7970
},
{
"epoch": 0.5919881305637982,
"grad_norm": 8.142541885375977,
"learning_rate": 1.0725247199752353e-06,
"loss": 1.2611,
"step": 7980
},
{
"epoch": 0.5927299703264095,
"grad_norm": 8.227744102478027,
"learning_rate": 1.0691750103366772e-06,
"loss": 1.3838,
"step": 7990
},
{
"epoch": 0.5934718100890207,
"grad_norm": 7.327287673950195,
"learning_rate": 1.0658276407239463e-06,
"loss": 1.3833,
"step": 8000
},
{
"epoch": 0.5934718100890207,
"eval_loss": 1.310362458229065,
"eval_runtime": 23.6142,
"eval_samples_per_second": 18.887,
"eval_steps_per_second": 9.443,
"step": 8000
},
{
"epoch": 0.594213649851632,
"grad_norm": 7.846217155456543,
"learning_rate": 1.0624826293182785e-06,
"loss": 1.3256,
"step": 8010
},
{
"epoch": 0.5949554896142433,
"grad_norm": 10.000598907470703,
"learning_rate": 1.0591399942881038e-06,
"loss": 1.2878,
"step": 8020
},
{
"epoch": 0.5956973293768546,
"grad_norm": 12.841207504272461,
"learning_rate": 1.0557997537889423e-06,
"loss": 1.506,
"step": 8030
},
{
"epoch": 0.5964391691394659,
"grad_norm": 12.134847640991211,
"learning_rate": 1.05246192596331e-06,
"loss": 1.2424,
"step": 8040
},
{
"epoch": 0.5971810089020771,
"grad_norm": 8.403639793395996,
"learning_rate": 1.0491265289406184e-06,
"loss": 1.2328,
"step": 8050
},
{
"epoch": 0.5979228486646885,
"grad_norm": 8.479205131530762,
"learning_rate": 1.0457935808370746e-06,
"loss": 1.3008,
"step": 8060
},
{
"epoch": 0.5986646884272997,
"grad_norm": 8.48107624053955,
"learning_rate": 1.0424630997555867e-06,
"loss": 1.3708,
"step": 8070
},
{
"epoch": 0.599406528189911,
"grad_norm": 7.639008045196533,
"learning_rate": 1.0391351037856604e-06,
"loss": 1.3698,
"step": 8080
},
{
"epoch": 0.6001483679525222,
"grad_norm": 8.282002449035645,
"learning_rate": 1.0358096110033063e-06,
"loss": 1.3946,
"step": 8090
},
{
"epoch": 0.6008902077151336,
"grad_norm": 7.605436325073242,
"learning_rate": 1.0324866394709365e-06,
"loss": 1.3852,
"step": 8100
},
{
"epoch": 0.6016320474777448,
"grad_norm": 7.768093585968018,
"learning_rate": 1.0291662072372715e-06,
"loss": 1.389,
"step": 8110
},
{
"epoch": 0.6023738872403561,
"grad_norm": 8.550724029541016,
"learning_rate": 1.0258483323372364e-06,
"loss": 1.3193,
"step": 8120
},
{
"epoch": 0.6031157270029673,
"grad_norm": 7.690985202789307,
"learning_rate": 1.0225330327918696e-06,
"loss": 1.2423,
"step": 8130
},
{
"epoch": 0.6038575667655787,
"grad_norm": 8.371797561645508,
"learning_rate": 1.0192203266082185e-06,
"loss": 1.4319,
"step": 8140
},
{
"epoch": 0.6045994065281899,
"grad_norm": 7.518775463104248,
"learning_rate": 1.0159102317792468e-06,
"loss": 1.3037,
"step": 8150
},
{
"epoch": 0.6053412462908012,
"grad_norm": 12.325798034667969,
"learning_rate": 1.012602766283733e-06,
"loss": 1.4,
"step": 8160
},
{
"epoch": 0.6060830860534124,
"grad_norm": 8.312942504882812,
"learning_rate": 1.0092979480861763e-06,
"loss": 1.4318,
"step": 8170
},
{
"epoch": 0.6068249258160238,
"grad_norm": 8.462662696838379,
"learning_rate": 1.0059957951366943e-06,
"loss": 1.2307,
"step": 8180
},
{
"epoch": 0.607566765578635,
"grad_norm": 8.316648483276367,
"learning_rate": 1.0026963253709315e-06,
"loss": 1.2333,
"step": 8190
},
{
"epoch": 0.6083086053412463,
"grad_norm": 10.025683403015137,
"learning_rate": 9.993995567099557e-07,
"loss": 1.3134,
"step": 8200
},
{
"epoch": 0.6090504451038575,
"grad_norm": 10.292147636413574,
"learning_rate": 9.961055070601667e-07,
"loss": 1.1875,
"step": 8210
},
{
"epoch": 0.6097922848664689,
"grad_norm": 7.682520389556885,
"learning_rate": 9.928141943131926e-07,
"loss": 1.2678,
"step": 8220
},
{
"epoch": 0.6105341246290801,
"grad_norm": 9.251666069030762,
"learning_rate": 9.895256363457996e-07,
"loss": 1.4774,
"step": 8230
},
{
"epoch": 0.6112759643916914,
"grad_norm": 9.813620567321777,
"learning_rate": 9.862398510197875e-07,
"loss": 1.4223,
"step": 8240
},
{
"epoch": 0.6120178041543026,
"grad_norm": 8.737075805664062,
"learning_rate": 9.829568561819005e-07,
"loss": 1.2286,
"step": 8250
},
{
"epoch": 0.612759643916914,
"grad_norm": 8.058504104614258,
"learning_rate": 9.796766696637232e-07,
"loss": 1.3313,
"step": 8260
},
{
"epoch": 0.6135014836795252,
"grad_norm": 8.038324356079102,
"learning_rate": 9.763993092815876e-07,
"loss": 1.3329,
"step": 8270
},
{
"epoch": 0.6142433234421365,
"grad_norm": 8.749686241149902,
"learning_rate": 9.731247928364766e-07,
"loss": 1.29,
"step": 8280
},
{
"epoch": 0.6149851632047477,
"grad_norm": 9.395393371582031,
"learning_rate": 9.69853138113925e-07,
"loss": 1.3178,
"step": 8290
},
{
"epoch": 0.615727002967359,
"grad_norm": 7.980838775634766,
"learning_rate": 9.665843628839246e-07,
"loss": 1.2876,
"step": 8300
},
{
"epoch": 0.6164688427299704,
"grad_norm": 7.525308609008789,
"learning_rate": 9.633184849008272e-07,
"loss": 1.4126,
"step": 8310
},
{
"epoch": 0.6172106824925816,
"grad_norm": 7.652345657348633,
"learning_rate": 9.600555219032493e-07,
"loss": 1.3087,
"step": 8320
},
{
"epoch": 0.6179525222551929,
"grad_norm": 8.231952667236328,
"learning_rate": 9.567954916139718e-07,
"loss": 1.3444,
"step": 8330
},
{
"epoch": 0.6186943620178041,
"grad_norm": 8.18375301361084,
"learning_rate": 9.535384117398501e-07,
"loss": 1.274,
"step": 8340
},
{
"epoch": 0.6194362017804155,
"grad_norm": 6.912817478179932,
"learning_rate": 9.502842999717117e-07,
"loss": 1.3022,
"step": 8350
},
{
"epoch": 0.6201780415430267,
"grad_norm": 8.46898078918457,
"learning_rate": 9.470331739842646e-07,
"loss": 1.4138,
"step": 8360
},
{
"epoch": 0.620919881305638,
"grad_norm": 8.513370513916016,
"learning_rate": 9.43785051435998e-07,
"loss": 1.1663,
"step": 8370
},
{
"epoch": 0.6216617210682492,
"grad_norm": 7.527527332305908,
"learning_rate": 9.405399499690899e-07,
"loss": 1.2236,
"step": 8380
},
{
"epoch": 0.6224035608308606,
"grad_norm": 8.23088264465332,
"learning_rate": 9.372978872093067e-07,
"loss": 1.3095,
"step": 8390
},
{
"epoch": 0.6231454005934718,
"grad_norm": 8.179079055786133,
"learning_rate": 9.340588807659127e-07,
"loss": 1.3372,
"step": 8400
},
{
"epoch": 0.6238872403560831,
"grad_norm": 8.004188537597656,
"learning_rate": 9.308229482315696e-07,
"loss": 1.401,
"step": 8410
},
{
"epoch": 0.6246290801186943,
"grad_norm": 8.311544418334961,
"learning_rate": 9.275901071822453e-07,
"loss": 1.2743,
"step": 8420
},
{
"epoch": 0.6253709198813057,
"grad_norm": 9.387375831604004,
"learning_rate": 9.243603751771139e-07,
"loss": 1.3015,
"step": 8430
},
{
"epoch": 0.6261127596439169,
"grad_norm": 7.231583118438721,
"learning_rate": 9.211337697584654e-07,
"loss": 1.3418,
"step": 8440
},
{
"epoch": 0.6268545994065282,
"grad_norm": 7.891911029815674,
"learning_rate": 9.179103084516049e-07,
"loss": 1.2991,
"step": 8450
},
{
"epoch": 0.6275964391691394,
"grad_norm": 8.7365083694458,
"learning_rate": 9.14690008764763e-07,
"loss": 1.4723,
"step": 8460
},
{
"epoch": 0.6283382789317508,
"grad_norm": 8.894607543945312,
"learning_rate": 9.114728881889955e-07,
"loss": 1.4044,
"step": 8470
},
{
"epoch": 0.629080118694362,
"grad_norm": 9.663780212402344,
"learning_rate": 9.082589641980931e-07,
"loss": 1.3265,
"step": 8480
},
{
"epoch": 0.6298219584569733,
"grad_norm": 9.393049240112305,
"learning_rate": 9.050482542484822e-07,
"loss": 1.3115,
"step": 8490
},
{
"epoch": 0.6305637982195845,
"grad_norm": 8.080445289611816,
"learning_rate": 9.018407757791341e-07,
"loss": 1.3217,
"step": 8500
},
{
"epoch": 0.6305637982195845,
"eval_loss": 1.3059756755828857,
"eval_runtime": 23.6134,
"eval_samples_per_second": 18.888,
"eval_steps_per_second": 9.444,
"step": 8500
},
{
"epoch": 0.6313056379821959,
"grad_norm": 7.894256114959717,
"learning_rate": 8.986365462114664e-07,
"loss": 1.2935,
"step": 8510
},
{
"epoch": 0.6320474777448071,
"grad_norm": 9.298333168029785,
"learning_rate": 8.954355829492521e-07,
"loss": 1.4362,
"step": 8520
},
{
"epoch": 0.6327893175074184,
"grad_norm": 7.979432106018066,
"learning_rate": 8.922379033785212e-07,
"loss": 1.5357,
"step": 8530
},
{
"epoch": 0.6335311572700296,
"grad_norm": 8.477805137634277,
"learning_rate": 8.890435248674709e-07,
"loss": 1.2728,
"step": 8540
},
{
"epoch": 0.634272997032641,
"grad_norm": 9.298026084899902,
"learning_rate": 8.858524647663661e-07,
"loss": 1.4405,
"step": 8550
},
{
"epoch": 0.6350148367952523,
"grad_norm": 8.434690475463867,
"learning_rate": 8.826647404074497e-07,
"loss": 1.2176,
"step": 8560
},
{
"epoch": 0.6357566765578635,
"grad_norm": 8.349011421203613,
"learning_rate": 8.794803691048457e-07,
"loss": 1.3891,
"step": 8570
},
{
"epoch": 0.6364985163204748,
"grad_norm": 7.770542144775391,
"learning_rate": 8.762993681544657e-07,
"loss": 1.2877,
"step": 8580
},
{
"epoch": 0.637240356083086,
"grad_norm": 9.452780723571777,
"learning_rate": 8.731217548339163e-07,
"loss": 1.4215,
"step": 8590
},
{
"epoch": 0.6379821958456974,
"grad_norm": 7.791207790374756,
"learning_rate": 8.699475464024022e-07,
"loss": 1.2664,
"step": 8600
},
{
"epoch": 0.6387240356083086,
"grad_norm": 8.289834976196289,
"learning_rate": 8.667767601006372e-07,
"loss": 1.2292,
"step": 8610
},
{
"epoch": 0.6394658753709199,
"grad_norm": 9.18075180053711,
"learning_rate": 8.63609413150745e-07,
"loss": 1.2276,
"step": 8620
},
{
"epoch": 0.6402077151335311,
"grad_norm": 7.8221635818481445,
"learning_rate": 8.604455227561712e-07,
"loss": 1.1693,
"step": 8630
},
{
"epoch": 0.6409495548961425,
"grad_norm": 8.45543384552002,
"learning_rate": 8.572851061015842e-07,
"loss": 1.3574,
"step": 8640
},
{
"epoch": 0.6416913946587537,
"grad_norm": 8.002989768981934,
"learning_rate": 8.541281803527875e-07,
"loss": 1.1484,
"step": 8650
},
{
"epoch": 0.642433234421365,
"grad_norm": 7.84604549407959,
"learning_rate": 8.509747626566218e-07,
"loss": 1.2894,
"step": 8660
},
{
"epoch": 0.6431750741839762,
"grad_norm": 8.202202796936035,
"learning_rate": 8.478248701408751e-07,
"loss": 1.2653,
"step": 8670
},
{
"epoch": 0.6439169139465876,
"grad_norm": 9.587785720825195,
"learning_rate": 8.44678519914187e-07,
"loss": 1.2584,
"step": 8680
},
{
"epoch": 0.6446587537091988,
"grad_norm": 7.761561870574951,
"learning_rate": 8.415357290659591e-07,
"loss": 1.2958,
"step": 8690
},
{
"epoch": 0.6454005934718101,
"grad_norm": 8.499533653259277,
"learning_rate": 8.383965146662582e-07,
"loss": 1.2073,
"step": 8700
},
{
"epoch": 0.6461424332344213,
"grad_norm": 9.094478607177734,
"learning_rate": 8.352608937657273e-07,
"loss": 1.4064,
"step": 8710
},
{
"epoch": 0.6468842729970327,
"grad_norm": 9.180924415588379,
"learning_rate": 8.321288833954896e-07,
"loss": 1.324,
"step": 8720
},
{
"epoch": 0.6476261127596439,
"grad_norm": 8.090041160583496,
"learning_rate": 8.290005005670598e-07,
"loss": 1.2272,
"step": 8730
},
{
"epoch": 0.6483679525222552,
"grad_norm": 8.494268417358398,
"learning_rate": 8.258757622722475e-07,
"loss": 1.298,
"step": 8740
},
{
"epoch": 0.6491097922848664,
"grad_norm": 8.722259521484375,
"learning_rate": 8.227546854830687e-07,
"loss": 1.2791,
"step": 8750
},
{
"epoch": 0.6498516320474778,
"grad_norm": 9.675090789794922,
"learning_rate": 8.196372871516503e-07,
"loss": 1.4562,
"step": 8760
},
{
"epoch": 0.650593471810089,
"grad_norm": 8.035630226135254,
"learning_rate": 8.165235842101421e-07,
"loss": 1.424,
"step": 8770
},
{
"epoch": 0.6513353115727003,
"grad_norm": 7.216797351837158,
"learning_rate": 8.134135935706192e-07,
"loss": 1.3999,
"step": 8780
},
{
"epoch": 0.6520771513353115,
"grad_norm": 9.409671783447266,
"learning_rate": 8.103073321249961e-07,
"loss": 1.213,
"step": 8790
},
{
"epoch": 0.6528189910979229,
"grad_norm": 8.56403923034668,
"learning_rate": 8.072048167449306e-07,
"loss": 1.2852,
"step": 8800
},
{
"epoch": 0.6535608308605341,
"grad_norm": 8.66519546508789,
"learning_rate": 8.041060642817348e-07,
"loss": 1.395,
"step": 8810
},
{
"epoch": 0.6543026706231454,
"grad_norm": 8.845466613769531,
"learning_rate": 8.010110915662808e-07,
"loss": 1.2783,
"step": 8820
},
{
"epoch": 0.6550445103857567,
"grad_norm": 7.585766792297363,
"learning_rate": 7.97919915408913e-07,
"loss": 1.2284,
"step": 8830
},
{
"epoch": 0.655786350148368,
"grad_norm": 9.476142883300781,
"learning_rate": 7.948325525993545e-07,
"loss": 1.3386,
"step": 8840
},
{
"epoch": 0.6565281899109793,
"grad_norm": 14.948787689208984,
"learning_rate": 7.917490199066141e-07,
"loss": 1.2518,
"step": 8850
},
{
"epoch": 0.6572700296735905,
"grad_norm": 8.437254905700684,
"learning_rate": 7.886693340789006e-07,
"loss": 1.342,
"step": 8860
},
{
"epoch": 0.6580118694362018,
"grad_norm": 7.908801555633545,
"learning_rate": 7.855935118435254e-07,
"loss": 1.4527,
"step": 8870
},
{
"epoch": 0.658753709198813,
"grad_norm": 8.332324981689453,
"learning_rate": 7.825215699068171e-07,
"loss": 1.5184,
"step": 8880
},
{
"epoch": 0.6594955489614244,
"grad_norm": 8.469289779663086,
"learning_rate": 7.794535249540267e-07,
"loss": 1.3789,
"step": 8890
},
{
"epoch": 0.6602373887240356,
"grad_norm": 6.876429557800293,
"learning_rate": 7.763893936492411e-07,
"loss": 1.3259,
"step": 8900
},
{
"epoch": 0.6609792284866469,
"grad_norm": 8.458147048950195,
"learning_rate": 7.733291926352871e-07,
"loss": 1.2604,
"step": 8910
},
{
"epoch": 0.6617210682492581,
"grad_norm": 9.535111427307129,
"learning_rate": 7.70272938533647e-07,
"loss": 1.2525,
"step": 8920
},
{
"epoch": 0.6624629080118695,
"grad_norm": 7.999839782714844,
"learning_rate": 7.67220647944363e-07,
"loss": 1.2877,
"step": 8930
},
{
"epoch": 0.6632047477744807,
"grad_norm": 8.702188491821289,
"learning_rate": 7.641723374459524e-07,
"loss": 1.2842,
"step": 8940
},
{
"epoch": 0.663946587537092,
"grad_norm": 9.051286697387695,
"learning_rate": 7.61128023595311e-07,
"loss": 1.4605,
"step": 8950
},
{
"epoch": 0.6646884272997032,
"grad_norm": 8.036483764648438,
"learning_rate": 7.580877229276303e-07,
"loss": 1.247,
"step": 8960
},
{
"epoch": 0.6654302670623146,
"grad_norm": 8.07690143585205,
"learning_rate": 7.550514519563013e-07,
"loss": 1.4113,
"step": 8970
},
{
"epoch": 0.6661721068249258,
"grad_norm": 8.693827629089355,
"learning_rate": 7.520192271728303e-07,
"loss": 1.2892,
"step": 8980
},
{
"epoch": 0.6669139465875371,
"grad_norm": 7.980526924133301,
"learning_rate": 7.489910650467445e-07,
"loss": 1.2029,
"step": 8990
},
{
"epoch": 0.6676557863501483,
"grad_norm": 8.663146018981934,
"learning_rate": 7.459669820255068e-07,
"loss": 1.2351,
"step": 9000
},
{
"epoch": 0.6676557863501483,
"eval_loss": 1.3026496171951294,
"eval_runtime": 23.6065,
"eval_samples_per_second": 18.893,
"eval_steps_per_second": 9.447,
"step": 9000
},
{
"epoch": 0.6683976261127597,
"grad_norm": 8.916913986206055,
"learning_rate": 7.42946994534422e-07,
"loss": 1.2413,
"step": 9010
},
{
"epoch": 0.6691394658753709,
"grad_norm": 8.399556159973145,
"learning_rate": 7.399311189765529e-07,
"loss": 1.1094,
"step": 9020
},
{
"epoch": 0.6698813056379822,
"grad_norm": 9.757269859313965,
"learning_rate": 7.369193717326254e-07,
"loss": 1.3129,
"step": 9030
},
{
"epoch": 0.6706231454005934,
"grad_norm": 8.055035591125488,
"learning_rate": 7.339117691609455e-07,
"loss": 1.2858,
"step": 9040
},
{
"epoch": 0.6713649851632048,
"grad_norm": 7.464066028594971,
"learning_rate": 7.309083275973042e-07,
"loss": 1.1974,
"step": 9050
},
{
"epoch": 0.672106824925816,
"grad_norm": 7.977897644042969,
"learning_rate": 7.27909063354895e-07,
"loss": 1.3727,
"step": 9060
},
{
"epoch": 0.6728486646884273,
"grad_norm": 8.40994930267334,
"learning_rate": 7.249139927242198e-07,
"loss": 1.3799,
"step": 9070
},
{
"epoch": 0.6735905044510386,
"grad_norm": 7.28301477432251,
"learning_rate": 7.21923131973005e-07,
"loss": 1.2326,
"step": 9080
},
{
"epoch": 0.6743323442136498,
"grad_norm": 8.845423698425293,
"learning_rate": 7.189364973461092e-07,
"loss": 1.349,
"step": 9090
},
{
"epoch": 0.6750741839762612,
"grad_norm": 8.522547721862793,
"learning_rate": 7.159541050654386e-07,
"loss": 1.3534,
"step": 9100
},
{
"epoch": 0.6758160237388724,
"grad_norm": 7.7692790031433105,
"learning_rate": 7.129759713298553e-07,
"loss": 1.2062,
"step": 9110
},
{
"epoch": 0.6765578635014837,
"grad_norm": 8.87850570678711,
"learning_rate": 7.100021123150917e-07,
"loss": 1.2687,
"step": 9120
},
{
"epoch": 0.6772997032640949,
"grad_norm": 11.794063568115234,
"learning_rate": 7.070325441736635e-07,
"loss": 1.3114,
"step": 9130
},
{
"epoch": 0.6780415430267063,
"grad_norm": 9.376462936401367,
"learning_rate": 7.040672830347781e-07,
"loss": 1.3112,
"step": 9140
},
{
"epoch": 0.6787833827893175,
"grad_norm": 9.135132789611816,
"learning_rate": 7.011063450042518e-07,
"loss": 1.3361,
"step": 9150
},
{
"epoch": 0.6795252225519288,
"grad_norm": 8.93464183807373,
"learning_rate": 6.981497461644176e-07,
"loss": 1.3685,
"step": 9160
},
{
"epoch": 0.68026706231454,
"grad_norm": 8.19428539276123,
"learning_rate": 6.951975025740427e-07,
"loss": 1.3093,
"step": 9170
},
{
"epoch": 0.6810089020771514,
"grad_norm": 8.19371509552002,
"learning_rate": 6.92249630268236e-07,
"loss": 1.426,
"step": 9180
},
{
"epoch": 0.6817507418397626,
"grad_norm": 7.4333014488220215,
"learning_rate": 6.893061452583667e-07,
"loss": 1.3935,
"step": 9190
},
{
"epoch": 0.6824925816023739,
"grad_norm": 9.02108383178711,
"learning_rate": 6.863670635319714e-07,
"loss": 1.3407,
"step": 9200
},
{
"epoch": 0.6832344213649851,
"grad_norm": 10.15578842163086,
"learning_rate": 6.834324010526733e-07,
"loss": 1.3954,
"step": 9210
},
{
"epoch": 0.6839762611275965,
"grad_norm": 8.848624229431152,
"learning_rate": 6.805021737600896e-07,
"loss": 1.2578,
"step": 9220
},
{
"epoch": 0.6847181008902077,
"grad_norm": 7.951557636260986,
"learning_rate": 6.775763975697501e-07,
"loss": 1.3615,
"step": 9230
},
{
"epoch": 0.685459940652819,
"grad_norm": 8.311724662780762,
"learning_rate": 6.746550883730067e-07,
"loss": 1.1818,
"step": 9240
},
{
"epoch": 0.6862017804154302,
"grad_norm": 7.773900508880615,
"learning_rate": 6.717382620369506e-07,
"loss": 1.3195,
"step": 9250
},
{
"epoch": 0.6869436201780416,
"grad_norm": 9.448432922363281,
"learning_rate": 6.688259344043221e-07,
"loss": 1.1781,
"step": 9260
},
{
"epoch": 0.6876854599406528,
"grad_norm": 7.867501258850098,
"learning_rate": 6.659181212934291e-07,
"loss": 1.2175,
"step": 9270
},
{
"epoch": 0.6884272997032641,
"grad_norm": 8.866848945617676,
"learning_rate": 6.630148384980567e-07,
"loss": 1.3159,
"step": 9280
},
{
"epoch": 0.6891691394658753,
"grad_norm": 6.734555244445801,
"learning_rate": 6.601161017873861e-07,
"loss": 1.291,
"step": 9290
},
{
"epoch": 0.6899109792284867,
"grad_norm": 7.454867362976074,
"learning_rate": 6.572219269059037e-07,
"loss": 1.2432,
"step": 9300
},
{
"epoch": 0.6906528189910979,
"grad_norm": 8.541337966918945,
"learning_rate": 6.543323295733207e-07,
"loss": 1.3534,
"step": 9310
},
{
"epoch": 0.6913946587537092,
"grad_norm": 9.236302375793457,
"learning_rate": 6.514473254844833e-07,
"loss": 1.188,
"step": 9320
},
{
"epoch": 0.6921364985163204,
"grad_norm": 8.086477279663086,
"learning_rate": 6.485669303092917e-07,
"loss": 1.3077,
"step": 9330
},
{
"epoch": 0.6928783382789317,
"grad_norm": 9.85937786102295,
"learning_rate": 6.456911596926104e-07,
"loss": 1.3409,
"step": 9340
},
{
"epoch": 0.6936201780415431,
"grad_norm": 8.914828300476074,
"learning_rate": 6.428200292541874e-07,
"loss": 1.4067,
"step": 9350
},
{
"epoch": 0.6943620178041543,
"grad_norm": 9.060097694396973,
"learning_rate": 6.399535545885673e-07,
"loss": 1.4621,
"step": 9360
},
{
"epoch": 0.6951038575667656,
"grad_norm": 8.829442977905273,
"learning_rate": 6.370917512650057e-07,
"loss": 1.0863,
"step": 9370
},
{
"epoch": 0.6958456973293768,
"grad_norm": 11.040599822998047,
"learning_rate": 6.342346348273879e-07,
"loss": 1.3622,
"step": 9380
},
{
"epoch": 0.6965875370919882,
"grad_norm": 8.519377708435059,
"learning_rate": 6.313822207941395e-07,
"loss": 1.374,
"step": 9390
},
{
"epoch": 0.6973293768545994,
"grad_norm": 7.66409969329834,
"learning_rate": 6.285345246581483e-07,
"loss": 1.2223,
"step": 9400
},
{
"epoch": 0.6980712166172107,
"grad_norm": 7.880566596984863,
"learning_rate": 6.256915618866739e-07,
"loss": 1.2694,
"step": 9410
},
{
"epoch": 0.6988130563798219,
"grad_norm": 8.899270057678223,
"learning_rate": 6.228533479212686e-07,
"loss": 1.4051,
"step": 9420
},
{
"epoch": 0.6995548961424333,
"grad_norm": 7.940200328826904,
"learning_rate": 6.200198981776902e-07,
"loss": 1.4107,
"step": 9430
},
{
"epoch": 0.7002967359050445,
"grad_norm": 8.319690704345703,
"learning_rate": 6.171912280458215e-07,
"loss": 1.246,
"step": 9440
},
{
"epoch": 0.7010385756676558,
"grad_norm": 9.996255874633789,
"learning_rate": 6.143673528895821e-07,
"loss": 1.2741,
"step": 9450
},
{
"epoch": 0.701780415430267,
"grad_norm": 7.97064733505249,
"learning_rate": 6.115482880468506e-07,
"loss": 1.2776,
"step": 9460
},
{
"epoch": 0.7025222551928784,
"grad_norm": 8.822321891784668,
"learning_rate": 6.087340488293757e-07,
"loss": 1.4845,
"step": 9470
},
{
"epoch": 0.7032640949554896,
"grad_norm": 7.667973041534424,
"learning_rate": 6.059246505226985e-07,
"loss": 1.3351,
"step": 9480
},
{
"epoch": 0.7040059347181009,
"grad_norm": 9.208893775939941,
"learning_rate": 6.031201083860636e-07,
"loss": 1.3834,
"step": 9490
},
{
"epoch": 0.7047477744807121,
"grad_norm": 11.381084442138672,
"learning_rate": 6.003204376523425e-07,
"loss": 1.5295,
"step": 9500
},
{
"epoch": 0.7047477744807121,
"eval_loss": 1.298993468284607,
"eval_runtime": 23.6145,
"eval_samples_per_second": 18.887,
"eval_steps_per_second": 9.443,
"step": 9500
},
{
"epoch": 0.7054896142433235,
"grad_norm": 8.421923637390137,
"learning_rate": 5.975256535279449e-07,
"loss": 1.3051,
"step": 9510
},
{
"epoch": 0.7062314540059347,
"grad_norm": 8.727239608764648,
"learning_rate": 5.94735771192741e-07,
"loss": 1.2386,
"step": 9520
},
{
"epoch": 0.706973293768546,
"grad_norm": 8.890826225280762,
"learning_rate": 5.919508057999751e-07,
"loss": 1.4653,
"step": 9530
},
{
"epoch": 0.7077151335311572,
"grad_norm": 8.684818267822266,
"learning_rate": 5.891707724761871e-07,
"loss": 1.3042,
"step": 9540
},
{
"epoch": 0.7084569732937686,
"grad_norm": 8.57326889038086,
"learning_rate": 5.863956863211263e-07,
"loss": 1.3526,
"step": 9550
},
{
"epoch": 0.7091988130563798,
"grad_norm": 9.280582427978516,
"learning_rate": 5.836255624076732e-07,
"loss": 1.3168,
"step": 9560
},
{
"epoch": 0.7099406528189911,
"grad_norm": 8.064082145690918,
"learning_rate": 5.808604157817548e-07,
"loss": 1.3998,
"step": 9570
},
{
"epoch": 0.7106824925816023,
"grad_norm": 8.513121604919434,
"learning_rate": 5.781002614622646e-07,
"loss": 1.1547,
"step": 9580
},
{
"epoch": 0.7114243323442137,
"grad_norm": 9.24774169921875,
"learning_rate": 5.753451144409796e-07,
"loss": 1.2401,
"step": 9590
},
{
"epoch": 0.712166172106825,
"grad_norm": 8.027381896972656,
"learning_rate": 5.725949896824806e-07,
"loss": 1.3028,
"step": 9600
},
{
"epoch": 0.7129080118694362,
"grad_norm": 7.955070495605469,
"learning_rate": 5.698499021240699e-07,
"loss": 1.2351,
"step": 9610
},
{
"epoch": 0.7136498516320475,
"grad_norm": 7.460748672485352,
"learning_rate": 5.671098666756888e-07,
"loss": 1.289,
"step": 9620
},
{
"epoch": 0.7143916913946587,
"grad_norm": 7.7787885665893555,
"learning_rate": 5.643748982198407e-07,
"loss": 1.3013,
"step": 9630
},
{
"epoch": 0.7151335311572701,
"grad_norm": 7.563982009887695,
"learning_rate": 5.616450116115045e-07,
"loss": 1.3116,
"step": 9640
},
{
"epoch": 0.7158753709198813,
"grad_norm": 10.66048812866211,
"learning_rate": 5.5892022167806e-07,
"loss": 1.4897,
"step": 9650
},
{
"epoch": 0.7166172106824926,
"grad_norm": 7.235440731048584,
"learning_rate": 5.56200543219202e-07,
"loss": 1.2029,
"step": 9660
},
{
"epoch": 0.7173590504451038,
"grad_norm": 9.794096946716309,
"learning_rate": 5.534859910068643e-07,
"loss": 1.1368,
"step": 9670
},
{
"epoch": 0.7181008902077152,
"grad_norm": 12.36744499206543,
"learning_rate": 5.507765797851356e-07,
"loss": 1.2889,
"step": 9680
},
{
"epoch": 0.7188427299703264,
"grad_norm": 8.576911926269531,
"learning_rate": 5.480723242701836e-07,
"loss": 1.2541,
"step": 9690
},
{
"epoch": 0.7195845697329377,
"grad_norm": 7.381803512573242,
"learning_rate": 5.4537323915017e-07,
"loss": 1.3102,
"step": 9700
},
{
"epoch": 0.7203264094955489,
"grad_norm": 8.61953353881836,
"learning_rate": 5.426793390851761e-07,
"loss": 1.2292,
"step": 9710
},
{
"epoch": 0.7210682492581603,
"grad_norm": 9.234679222106934,
"learning_rate": 5.399906387071186e-07,
"loss": 1.4074,
"step": 9720
},
{
"epoch": 0.7218100890207715,
"grad_norm": 7.804644584655762,
"learning_rate": 5.373071526196739e-07,
"loss": 1.1214,
"step": 9730
},
{
"epoch": 0.7225519287833828,
"grad_norm": 9.674349784851074,
"learning_rate": 5.346288953981949e-07,
"loss": 1.2788,
"step": 9740
},
{
"epoch": 0.723293768545994,
"grad_norm": 8.411057472229004,
"learning_rate": 5.319558815896363e-07,
"loss": 1.227,
"step": 9750
},
{
"epoch": 0.7240356083086054,
"grad_norm": 8.84724235534668,
"learning_rate": 5.29288125712471e-07,
"loss": 1.2271,
"step": 9760
},
{
"epoch": 0.7247774480712166,
"grad_norm": 8.127745628356934,
"learning_rate": 5.266256422566145e-07,
"loss": 1.2995,
"step": 9770
},
{
"epoch": 0.7255192878338279,
"grad_norm": 7.898895740509033,
"learning_rate": 5.239684456833457e-07,
"loss": 1.1288,
"step": 9780
},
{
"epoch": 0.7262611275964391,
"grad_norm": 8.459515571594238,
"learning_rate": 5.213165504252262e-07,
"loss": 1.373,
"step": 9790
},
{
"epoch": 0.7270029673590505,
"grad_norm": 9.448688507080078,
"learning_rate": 5.186699708860253e-07,
"loss": 1.2424,
"step": 9800
},
{
"epoch": 0.7277448071216617,
"grad_norm": 8.228900909423828,
"learning_rate": 5.160287214406383e-07,
"loss": 1.2119,
"step": 9810
},
{
"epoch": 0.728486646884273,
"grad_norm": 7.960751533508301,
"learning_rate": 5.133928164350119e-07,
"loss": 1.3451,
"step": 9820
},
{
"epoch": 0.7292284866468842,
"grad_norm": 7.8820414543151855,
"learning_rate": 5.107622701860624e-07,
"loss": 1.2296,
"step": 9830
},
{
"epoch": 0.7299703264094956,
"grad_norm": 8.707436561584473,
"learning_rate": 5.081370969816023e-07,
"loss": 1.2629,
"step": 9840
},
{
"epoch": 0.7307121661721068,
"grad_norm": 9.171490669250488,
"learning_rate": 5.055173110802586e-07,
"loss": 1.3124,
"step": 9850
},
{
"epoch": 0.7314540059347181,
"grad_norm": 7.622151851654053,
"learning_rate": 5.029029267113971e-07,
"loss": 1.2931,
"step": 9860
},
{
"epoch": 0.7321958456973294,
"grad_norm": 7.796103000640869,
"learning_rate": 5.002939580750467e-07,
"loss": 1.3467,
"step": 9870
},
{
"epoch": 0.7329376854599406,
"grad_norm": 8.309154510498047,
"learning_rate": 4.976904193418203e-07,
"loss": 1.3801,
"step": 9880
},
{
"epoch": 0.733679525222552,
"grad_norm": 8.498586654663086,
"learning_rate": 4.950923246528368e-07,
"loss": 1.2142,
"step": 9890
},
{
"epoch": 0.7344213649851632,
"grad_norm": 8.15847396850586,
"learning_rate": 4.92499688119648e-07,
"loss": 1.2417,
"step": 9900
},
{
"epoch": 0.7351632047477745,
"grad_norm": 8.350110054016113,
"learning_rate": 4.899125238241574e-07,
"loss": 1.3085,
"step": 9910
},
{
"epoch": 0.7359050445103857,
"grad_norm": 8.587996482849121,
"learning_rate": 4.873308458185486e-07,
"loss": 1.1625,
"step": 9920
},
{
"epoch": 0.7366468842729971,
"grad_norm": 6.703005313873291,
"learning_rate": 4.847546681252034e-07,
"loss": 1.2597,
"step": 9930
},
{
"epoch": 0.7373887240356083,
"grad_norm": 8.741930961608887,
"learning_rate": 4.821840047366322e-07,
"loss": 1.3137,
"step": 9940
},
{
"epoch": 0.7381305637982196,
"grad_norm": 9.368997573852539,
"learning_rate": 4.796188696153909e-07,
"loss": 1.4068,
"step": 9950
},
{
"epoch": 0.7388724035608308,
"grad_norm": 9.121284484863281,
"learning_rate": 4.770592766940116e-07,
"loss": 1.284,
"step": 9960
},
{
"epoch": 0.7396142433234422,
"grad_norm": 8.773377418518066,
"learning_rate": 4.745052398749213e-07,
"loss": 1.3025,
"step": 9970
},
{
"epoch": 0.7403560830860534,
"grad_norm": 8.985709190368652,
"learning_rate": 4.719567730303719e-07,
"loss": 1.276,
"step": 9980
},
{
"epoch": 0.7410979228486647,
"grad_norm": 7.726775169372559,
"learning_rate": 4.6941389000235893e-07,
"loss": 1.2906,
"step": 9990
},
{
"epoch": 0.7418397626112759,
"grad_norm": 8.630135536193848,
"learning_rate": 4.668766046025522e-07,
"loss": 1.293,
"step": 10000
},
{
"epoch": 0.7418397626112759,
"eval_loss": 1.2966691255569458,
"eval_runtime": 23.6049,
"eval_samples_per_second": 18.894,
"eval_steps_per_second": 9.447,
"step": 10000
},
{
"epoch": 0.7425816023738873,
"grad_norm": 9.198748588562012,
"learning_rate": 4.643449306122158e-07,
"loss": 1.2206,
"step": 10010
},
{
"epoch": 0.7433234421364985,
"grad_norm": 8.540892601013184,
"learning_rate": 4.618188817821371e-07,
"loss": 1.4011,
"step": 10020
},
{
"epoch": 0.7440652818991098,
"grad_norm": 8.046586990356445,
"learning_rate": 4.5929847183254916e-07,
"loss": 1.284,
"step": 10030
},
{
"epoch": 0.744807121661721,
"grad_norm": 8.560956954956055,
"learning_rate": 4.567837144530585e-07,
"loss": 1.2844,
"step": 10040
},
{
"epoch": 0.7455489614243324,
"grad_norm": 9.451622009277344,
"learning_rate": 4.542746233025685e-07,
"loss": 1.37,
"step": 10050
},
{
"epoch": 0.7462908011869436,
"grad_norm": 10.818734169006348,
"learning_rate": 4.51771212009208e-07,
"loss": 1.3427,
"step": 10060
},
{
"epoch": 0.7470326409495549,
"grad_norm": 8.870102882385254,
"learning_rate": 4.492734941702541e-07,
"loss": 1.3504,
"step": 10070
},
{
"epoch": 0.7477744807121661,
"grad_norm": 10.100753784179688,
"learning_rate": 4.467814833520613e-07,
"loss": 1.1713,
"step": 10080
},
{
"epoch": 0.7485163204747775,
"grad_norm": 8.58507251739502,
"learning_rate": 4.4429519308998503e-07,
"loss": 1.1272,
"step": 10090
},
{
"epoch": 0.7492581602373887,
"grad_norm": 7.8265299797058105,
"learning_rate": 4.41814636888311e-07,
"loss": 1.2065,
"step": 10100
},
{
"epoch": 0.75,
"grad_norm": 8.231193542480469,
"learning_rate": 4.3933982822017883e-07,
"loss": 1.2077,
"step": 10110
},
{
"epoch": 0.7507418397626113,
"grad_norm": 7.965888500213623,
"learning_rate": 4.368707805275116e-07,
"loss": 1.4395,
"step": 10120
},
{
"epoch": 0.7514836795252225,
"grad_norm": 9.374115943908691,
"learning_rate": 4.344075072209417e-07,
"loss": 1.2853,
"step": 10130
},
{
"epoch": 0.7522255192878339,
"grad_norm": 7.917102813720703,
"learning_rate": 4.3195002167973655e-07,
"loss": 1.3366,
"step": 10140
},
{
"epoch": 0.7529673590504451,
"grad_norm": 9.077959060668945,
"learning_rate": 4.294983372517293e-07,
"loss": 1.4383,
"step": 10150
},
{
"epoch": 0.7537091988130564,
"grad_norm": 9.32331657409668,
"learning_rate": 4.2705246725324216e-07,
"loss": 1.2742,
"step": 10160
},
{
"epoch": 0.7544510385756676,
"grad_norm": 8.539690971374512,
"learning_rate": 4.246124249690187e-07,
"loss": 1.2168,
"step": 10170
},
{
"epoch": 0.755192878338279,
"grad_norm": 8.285751342773438,
"learning_rate": 4.2217822365214686e-07,
"loss": 1.376,
"step": 10180
},
{
"epoch": 0.7559347181008902,
"grad_norm": 8.879798889160156,
"learning_rate": 4.197498765239913e-07,
"loss": 1.3534,
"step": 10190
},
{
"epoch": 0.7566765578635015,
"grad_norm": 8.319602012634277,
"learning_rate": 4.1732739677411836e-07,
"loss": 1.2968,
"step": 10200
},
{
"epoch": 0.7574183976261127,
"grad_norm": 7.641089916229248,
"learning_rate": 4.149107975602267e-07,
"loss": 1.2378,
"step": 10210
},
{
"epoch": 0.7581602373887241,
"grad_norm": 9.449283599853516,
"learning_rate": 4.1250009200807353e-07,
"loss": 1.0789,
"step": 10220
},
{
"epoch": 0.7589020771513353,
"grad_norm": 9.45445442199707,
"learning_rate": 4.100952932114066e-07,
"loss": 1.2849,
"step": 10230
},
{
"epoch": 0.7596439169139466,
"grad_norm": 7.804203987121582,
"learning_rate": 4.07696414231889e-07,
"loss": 1.2507,
"step": 10240
},
{
"epoch": 0.7603857566765578,
"grad_norm": 8.116350173950195,
"learning_rate": 4.0530346809903196e-07,
"loss": 1.2658,
"step": 10250
},
{
"epoch": 0.7611275964391692,
"grad_norm": 9.725852012634277,
"learning_rate": 4.029164678101213e-07,
"loss": 1.462,
"step": 10260
},
{
"epoch": 0.7618694362017804,
"grad_norm": 8.416056632995605,
"learning_rate": 4.0053542633014913e-07,
"loss": 1.3301,
"step": 10270
},
{
"epoch": 0.7626112759643917,
"grad_norm": 6.388516426086426,
"learning_rate": 3.98160356591741e-07,
"loss": 1.2121,
"step": 10280
},
{
"epoch": 0.7633531157270029,
"grad_norm": 7.303947925567627,
"learning_rate": 3.957912714950882e-07,
"loss": 1.2568,
"step": 10290
},
{
"epoch": 0.7640949554896143,
"grad_norm": 8.52409553527832,
"learning_rate": 3.9342818390787535e-07,
"loss": 1.435,
"step": 10300
},
{
"epoch": 0.7648367952522255,
"grad_norm": 9.281074523925781,
"learning_rate": 3.910711066652127e-07,
"loss": 1.3805,
"step": 10310
},
{
"epoch": 0.7655786350148368,
"grad_norm": 7.558801651000977,
"learning_rate": 3.8872005256956383e-07,
"loss": 1.2831,
"step": 10320
},
{
"epoch": 0.766320474777448,
"grad_norm": 9.506136894226074,
"learning_rate": 3.863750343906796e-07,
"loss": 1.396,
"step": 10330
},
{
"epoch": 0.7670623145400594,
"grad_norm": 9.334778785705566,
"learning_rate": 3.840360648655247e-07,
"loss": 1.374,
"step": 10340
},
{
"epoch": 0.7678041543026706,
"grad_norm": 8.17182445526123,
"learning_rate": 3.8170315669821227e-07,
"loss": 1.3962,
"step": 10350
},
{
"epoch": 0.7685459940652819,
"grad_norm": 8.254951477050781,
"learning_rate": 3.7937632255993176e-07,
"loss": 1.4787,
"step": 10360
},
{
"epoch": 0.7692878338278932,
"grad_norm": 7.743471622467041,
"learning_rate": 3.770555750888825e-07,
"loss": 1.3078,
"step": 10370
},
{
"epoch": 0.7700296735905044,
"grad_norm": 8.887690544128418,
"learning_rate": 3.747409268902046e-07,
"loss": 1.3241,
"step": 10380
},
{
"epoch": 0.7707715133531158,
"grad_norm": 9.078700065612793,
"learning_rate": 3.724323905359082e-07,
"loss": 1.4121,
"step": 10390
},
{
"epoch": 0.771513353115727,
"grad_norm": 8.609134674072266,
"learning_rate": 3.7012997856480794e-07,
"loss": 1.2956,
"step": 10400
},
{
"epoch": 0.7722551928783383,
"grad_norm": 8.539812088012695,
"learning_rate": 3.678337034824545e-07,
"loss": 1.2113,
"step": 10410
},
{
"epoch": 0.7729970326409495,
"grad_norm": 8.455937385559082,
"learning_rate": 3.655435777610649e-07,
"loss": 1.3018,
"step": 10420
},
{
"epoch": 0.7737388724035609,
"grad_norm": 8.811159133911133,
"learning_rate": 3.63259613839457e-07,
"loss": 1.3779,
"step": 10430
},
{
"epoch": 0.7744807121661721,
"grad_norm": 8.420944213867188,
"learning_rate": 3.6098182412297944e-07,
"loss": 1.2882,
"step": 10440
},
{
"epoch": 0.7752225519287834,
"grad_norm": 8.92984390258789,
"learning_rate": 3.587102209834474e-07,
"loss": 1.227,
"step": 10450
},
{
"epoch": 0.7759643916913946,
"grad_norm": 7.994571208953857,
"learning_rate": 3.564448167590721e-07,
"loss": 1.3261,
"step": 10460
},
{
"epoch": 0.776706231454006,
"grad_norm": 7.83929443359375,
"learning_rate": 3.541856237543967e-07,
"loss": 1.5789,
"step": 10470
},
{
"epoch": 0.7774480712166172,
"grad_norm": 8.824812889099121,
"learning_rate": 3.51932654240227e-07,
"loss": 1.2063,
"step": 10480
},
{
"epoch": 0.7781899109792285,
"grad_norm": 9.47778606414795,
"learning_rate": 3.4968592045356605e-07,
"loss": 1.2887,
"step": 10490
},
{
"epoch": 0.7789317507418397,
"grad_norm": 7.607693195343018,
"learning_rate": 3.474454345975488e-07,
"loss": 1.2231,
"step": 10500
},
{
"epoch": 0.7789317507418397,
"eval_loss": 1.294171690940857,
"eval_runtime": 23.6195,
"eval_samples_per_second": 18.883,
"eval_steps_per_second": 9.441,
"step": 10500
},
{
"epoch": 0.7796735905044511,
"grad_norm": 8.560502052307129,
"learning_rate": 3.4521120884137254e-07,
"loss": 1.2739,
"step": 10510
},
{
"epoch": 0.7804154302670623,
"grad_norm": 9.082324028015137,
"learning_rate": 3.4298325532023496e-07,
"loss": 1.2654,
"step": 10520
},
{
"epoch": 0.7811572700296736,
"grad_norm": 9.28708267211914,
"learning_rate": 3.40761586135264e-07,
"loss": 1.2823,
"step": 10530
},
{
"epoch": 0.7818991097922848,
"grad_norm": 8.582283973693848,
"learning_rate": 3.385462133534565e-07,
"loss": 1.2891,
"step": 10540
},
{
"epoch": 0.7826409495548962,
"grad_norm": 8.136933326721191,
"learning_rate": 3.3633714900760804e-07,
"loss": 1.2946,
"step": 10550
},
{
"epoch": 0.7833827893175074,
"grad_norm": 9.77947998046875,
"learning_rate": 3.34134405096252e-07,
"loss": 1.1834,
"step": 10560
},
{
"epoch": 0.7841246290801187,
"grad_norm": 8.430279731750488,
"learning_rate": 3.319379935835907e-07,
"loss": 1.3921,
"step": 10570
},
{
"epoch": 0.7848664688427299,
"grad_norm": 9.608940124511719,
"learning_rate": 3.297479263994334e-07,
"loss": 1.1624,
"step": 10580
},
{
"epoch": 0.7856083086053413,
"grad_norm": 8.380851745605469,
"learning_rate": 3.2756421543912855e-07,
"loss": 1.4092,
"step": 10590
},
{
"epoch": 0.7863501483679525,
"grad_norm": 8.177634239196777,
"learning_rate": 3.25386872563503e-07,
"loss": 1.1724,
"step": 10600
},
{
"epoch": 0.7870919881305638,
"grad_norm": 9.429834365844727,
"learning_rate": 3.232159095987926e-07,
"loss": 1.2285,
"step": 10610
},
{
"epoch": 0.787833827893175,
"grad_norm": 9.116068840026855,
"learning_rate": 3.2105133833658333e-07,
"loss": 1.27,
"step": 10620
},
{
"epoch": 0.7885756676557863,
"grad_norm": 7.366293430328369,
"learning_rate": 3.1889317053374265e-07,
"loss": 1.2879,
"step": 10630
},
{
"epoch": 0.7893175074183977,
"grad_norm": 8.340385437011719,
"learning_rate": 3.167414179123589e-07,
"loss": 1.4321,
"step": 10640
},
{
"epoch": 0.7900593471810089,
"grad_norm": 8.602953910827637,
"learning_rate": 3.145960921596762e-07,
"loss": 1.3273,
"step": 10650
},
{
"epoch": 0.7908011869436202,
"grad_norm": 8.914224624633789,
"learning_rate": 3.124572049280301e-07,
"loss": 1.2689,
"step": 10660
},
{
"epoch": 0.7915430267062314,
"grad_norm": 8.413691520690918,
"learning_rate": 3.1032476783478694e-07,
"loss": 1.226,
"step": 10670
},
{
"epoch": 0.7922848664688428,
"grad_norm": 8.446022033691406,
"learning_rate": 3.0819879246227737e-07,
"loss": 1.3581,
"step": 10680
},
{
"epoch": 0.793026706231454,
"grad_norm": 9.09033203125,
"learning_rate": 3.0607929035773686e-07,
"loss": 1.3828,
"step": 10690
},
{
"epoch": 0.7937685459940653,
"grad_norm": 7.936834812164307,
"learning_rate": 3.039662730332399e-07,
"loss": 1.3225,
"step": 10700
},
{
"epoch": 0.7945103857566765,
"grad_norm": 8.607504844665527,
"learning_rate": 3.018597519656404e-07,
"loss": 1.3454,
"step": 10710
},
{
"epoch": 0.7952522255192879,
"grad_norm": 9.906630516052246,
"learning_rate": 2.997597385965062e-07,
"loss": 1.3378,
"step": 10720
},
{
"epoch": 0.7959940652818991,
"grad_norm": 8.220865249633789,
"learning_rate": 2.9766624433206e-07,
"loss": 1.39,
"step": 10730
},
{
"epoch": 0.7967359050445104,
"grad_norm": 9.154244422912598,
"learning_rate": 2.955792805431149e-07,
"loss": 1.5,
"step": 10740
},
{
"epoch": 0.7974777448071216,
"grad_norm": 8.345115661621094,
"learning_rate": 2.93498858565015e-07,
"loss": 1.1509,
"step": 10750
},
{
"epoch": 0.798219584569733,
"grad_norm": 8.3400239944458,
"learning_rate": 2.914249896975705e-07,
"loss": 1.2945,
"step": 10760
},
{
"epoch": 0.7989614243323442,
"grad_norm": 8.932135581970215,
"learning_rate": 2.89357685205001e-07,
"loss": 1.3422,
"step": 10770
},
{
"epoch": 0.7997032640949555,
"grad_norm": 9.743860244750977,
"learning_rate": 2.872969563158693e-07,
"loss": 1.321,
"step": 10780
},
{
"epoch": 0.8004451038575667,
"grad_norm": 8.630380630493164,
"learning_rate": 2.852428142230246e-07,
"loss": 1.2738,
"step": 10790
},
{
"epoch": 0.8011869436201781,
"grad_norm": 10.236412048339844,
"learning_rate": 2.831952700835386e-07,
"loss": 1.3867,
"step": 10800
},
{
"epoch": 0.8019287833827893,
"grad_norm": 9.148995399475098,
"learning_rate": 2.811543350186474e-07,
"loss": 1.4451,
"step": 10810
},
{
"epoch": 0.8026706231454006,
"grad_norm": 8.069551467895508,
"learning_rate": 2.791200201136886e-07,
"loss": 1.2106,
"step": 10820
},
{
"epoch": 0.8034124629080118,
"grad_norm": 8.256972312927246,
"learning_rate": 2.7709233641804396e-07,
"loss": 1.279,
"step": 10830
},
{
"epoch": 0.8041543026706232,
"grad_norm": 9.485831260681152,
"learning_rate": 2.75071294945076e-07,
"loss": 1.299,
"step": 10840
},
{
"epoch": 0.8048961424332344,
"grad_norm": 8.086010932922363,
"learning_rate": 2.730569066720718e-07,
"loss": 1.3489,
"step": 10850
},
{
"epoch": 0.8056379821958457,
"grad_norm": 9.07319450378418,
"learning_rate": 2.710491825401803e-07,
"loss": 1.4439,
"step": 10860
},
{
"epoch": 0.8063798219584569,
"grad_norm": 8.647632598876953,
"learning_rate": 2.69048133454355e-07,
"loss": 1.3518,
"step": 10870
},
{
"epoch": 0.8071216617210683,
"grad_norm": 8.795075416564941,
"learning_rate": 2.6705377028329315e-07,
"loss": 1.3317,
"step": 10880
},
{
"epoch": 0.8078635014836796,
"grad_norm": 9.038534164428711,
"learning_rate": 2.650661038593778e-07,
"loss": 1.2905,
"step": 10890
},
{
"epoch": 0.8086053412462908,
"grad_norm": 8.689789772033691,
"learning_rate": 2.630851449786193e-07,
"loss": 1.1946,
"step": 10900
},
{
"epoch": 0.8093471810089021,
"grad_norm": 7.252419471740723,
"learning_rate": 2.6111090440059453e-07,
"loss": 1.3711,
"step": 10910
},
{
"epoch": 0.8100890207715133,
"grad_norm": 12.24842357635498,
"learning_rate": 2.5914339284839143e-07,
"loss": 1.1649,
"step": 10920
},
{
"epoch": 0.8108308605341247,
"grad_norm": 7.127127170562744,
"learning_rate": 2.5718262100854774e-07,
"loss": 1.1895,
"step": 10930
},
{
"epoch": 0.8115727002967359,
"grad_norm": 8.82343578338623,
"learning_rate": 2.55228599530996e-07,
"loss": 1.3349,
"step": 10940
},
{
"epoch": 0.8123145400593472,
"grad_norm": 8.194673538208008,
"learning_rate": 2.532813390290026e-07,
"loss": 1.1975,
"step": 10950
},
{
"epoch": 0.8130563798219584,
"grad_norm": 9.27203369140625,
"learning_rate": 2.513408500791135e-07,
"loss": 1.2848,
"step": 10960
},
{
"epoch": 0.8137982195845698,
"grad_norm": 8.359867095947266,
"learning_rate": 2.494071432210928e-07,
"loss": 1.295,
"step": 10970
},
{
"epoch": 0.814540059347181,
"grad_norm": 8.281832695007324,
"learning_rate": 2.4748022895786977e-07,
"loss": 1.3752,
"step": 10980
},
{
"epoch": 0.8152818991097923,
"grad_norm": 9.253495216369629,
"learning_rate": 2.4556011775547804e-07,
"loss": 1.3411,
"step": 10990
},
{
"epoch": 0.8160237388724035,
"grad_norm": 7.842648029327393,
"learning_rate": 2.4364682004300195e-07,
"loss": 1.2721,
"step": 11000
},
{
"epoch": 0.8160237388724035,
"eval_loss": 1.2925976514816284,
"eval_runtime": 23.6283,
"eval_samples_per_second": 18.876,
"eval_steps_per_second": 9.438,
"step": 11000
},
{
"epoch": 0.8167655786350149,
"grad_norm": 7.728657245635986,
"learning_rate": 2.417403462125166e-07,
"loss": 1.3394,
"step": 11010
},
{
"epoch": 0.8175074183976261,
"grad_norm": 12.809070587158203,
"learning_rate": 2.398407066190351e-07,
"loss": 1.4096,
"step": 11020
},
{
"epoch": 0.8182492581602374,
"grad_norm": 8.430037498474121,
"learning_rate": 2.3794791158044865e-07,
"loss": 1.1904,
"step": 11030
},
{
"epoch": 0.8189910979228486,
"grad_norm": 7.398181915283203,
"learning_rate": 2.3606197137747366e-07,
"loss": 1.1156,
"step": 11040
},
{
"epoch": 0.81973293768546,
"grad_norm": 7.863873481750488,
"learning_rate": 2.341828962535932e-07,
"loss": 1.3122,
"step": 11050
},
{
"epoch": 0.8204747774480712,
"grad_norm": 8.728108406066895,
"learning_rate": 2.3231069641500414e-07,
"loss": 1.2932,
"step": 11060
},
{
"epoch": 0.8212166172106825,
"grad_norm": 8.522738456726074,
"learning_rate": 2.3044538203055876e-07,
"loss": 1.3695,
"step": 11070
},
{
"epoch": 0.8219584569732937,
"grad_norm": 8.17654800415039,
"learning_rate": 2.2858696323171225e-07,
"loss": 1.2997,
"step": 11080
},
{
"epoch": 0.8227002967359051,
"grad_norm": 9.464920997619629,
"learning_rate": 2.267354501124652e-07,
"loss": 1.1666,
"step": 11090
},
{
"epoch": 0.8234421364985163,
"grad_norm": 8.210715293884277,
"learning_rate": 2.2489085272931132e-07,
"loss": 1.3267,
"step": 11100
},
{
"epoch": 0.8241839762611276,
"grad_norm": 8.052397727966309,
"learning_rate": 2.230531811011804e-07,
"loss": 1.1532,
"step": 11110
},
{
"epoch": 0.8249258160237388,
"grad_norm": 8.493069648742676,
"learning_rate": 2.212224452093859e-07,
"loss": 1.3931,
"step": 11120
},
{
"epoch": 0.8256676557863502,
"grad_norm": 9.07796573638916,
"learning_rate": 2.1939865499756905e-07,
"loss": 1.2122,
"step": 11130
},
{
"epoch": 0.8264094955489614,
"grad_norm": 8.513764381408691,
"learning_rate": 2.1758182037164564e-07,
"loss": 1.3071,
"step": 11140
},
{
"epoch": 0.8271513353115727,
"grad_norm": 8.735884666442871,
"learning_rate": 2.1577195119975328e-07,
"loss": 1.3478,
"step": 11150
},
{
"epoch": 0.827893175074184,
"grad_norm": 8.680800437927246,
"learning_rate": 2.1396905731219506e-07,
"loss": 1.3153,
"step": 11160
},
{
"epoch": 0.8286350148367952,
"grad_norm": 9.367341041564941,
"learning_rate": 2.1217314850138952e-07,
"loss": 1.4447,
"step": 11170
},
{
"epoch": 0.8293768545994066,
"grad_norm": 7.34644079208374,
"learning_rate": 2.103842345218142e-07,
"loss": 1.1235,
"step": 11180
},
{
"epoch": 0.8301186943620178,
"grad_norm": 8.427669525146484,
"learning_rate": 2.0860232508995558e-07,
"loss": 1.5228,
"step": 11190
},
{
"epoch": 0.8308605341246291,
"grad_norm": 8.698617935180664,
"learning_rate": 2.068274298842537e-07,
"loss": 1.3354,
"step": 11200
},
{
"epoch": 0.8316023738872403,
"grad_norm": 9.806768417358398,
"learning_rate": 2.050595585450522e-07,
"loss": 1.4197,
"step": 11210
},
{
"epoch": 0.8323442136498517,
"grad_norm": 8.465378761291504,
"learning_rate": 2.0329872067454286e-07,
"loss": 1.3557,
"step": 11220
},
{
"epoch": 0.8330860534124629,
"grad_norm": 9.589797973632812,
"learning_rate": 2.0154492583671708e-07,
"loss": 1.2585,
"step": 11230
},
{
"epoch": 0.8338278931750742,
"grad_norm": 7.806549072265625,
"learning_rate": 1.9979818355731023e-07,
"loss": 1.2215,
"step": 11240
},
{
"epoch": 0.8345697329376854,
"grad_norm": 9.690045356750488,
"learning_rate": 1.9805850332375347e-07,
"loss": 1.347,
"step": 11250
},
{
"epoch": 0.8353115727002968,
"grad_norm": 8.724777221679688,
"learning_rate": 1.9632589458511884e-07,
"loss": 1.236,
"step": 11260
},
{
"epoch": 0.836053412462908,
"grad_norm": 9.154813766479492,
"learning_rate": 1.9460036675207077e-07,
"loss": 1.2911,
"step": 11270
},
{
"epoch": 0.8367952522255193,
"grad_norm": 7.401409149169922,
"learning_rate": 1.9288192919681274e-07,
"loss": 1.2317,
"step": 11280
},
{
"epoch": 0.8375370919881305,
"grad_norm": 8.244491577148438,
"learning_rate": 1.9117059125303858e-07,
"loss": 1.3247,
"step": 11290
},
{
"epoch": 0.8382789317507419,
"grad_norm": 8.045402526855469,
"learning_rate": 1.8946636221587916e-07,
"loss": 1.1623,
"step": 11300
},
{
"epoch": 0.8390207715133531,
"grad_norm": 8.480494499206543,
"learning_rate": 1.8776925134185496e-07,
"loss": 1.0614,
"step": 11310
},
{
"epoch": 0.8397626112759644,
"grad_norm": 7.870119571685791,
"learning_rate": 1.8607926784882235e-07,
"loss": 1.2517,
"step": 11320
},
{
"epoch": 0.8405044510385756,
"grad_norm": 8.129040718078613,
"learning_rate": 1.8439642091592705e-07,
"loss": 1.2463,
"step": 11330
},
{
"epoch": 0.841246290801187,
"grad_norm": 9.212223052978516,
"learning_rate": 1.8272071968355125e-07,
"loss": 1.2069,
"step": 11340
},
{
"epoch": 0.8419881305637982,
"grad_norm": 8.223834991455078,
"learning_rate": 1.8105217325326607e-07,
"loss": 1.2398,
"step": 11350
},
{
"epoch": 0.8427299703264095,
"grad_norm": 8.517450332641602,
"learning_rate": 1.7939079068778075e-07,
"loss": 1.3121,
"step": 11360
},
{
"epoch": 0.8434718100890207,
"grad_norm": 9.853727340698242,
"learning_rate": 1.7773658101089484e-07,
"loss": 1.3609,
"step": 11370
},
{
"epoch": 0.844213649851632,
"grad_norm": 7.983924388885498,
"learning_rate": 1.7608955320744708e-07,
"loss": 1.1758,
"step": 11380
},
{
"epoch": 0.8449554896142433,
"grad_norm": 10.495153427124023,
"learning_rate": 1.7444971622326916e-07,
"loss": 1.4902,
"step": 11390
},
{
"epoch": 0.8456973293768546,
"grad_norm": 7.264878273010254,
"learning_rate": 1.7281707896513477e-07,
"loss": 1.3286,
"step": 11400
},
{
"epoch": 0.8464391691394659,
"grad_norm": 8.638553619384766,
"learning_rate": 1.71191650300713e-07,
"loss": 1.3583,
"step": 11410
},
{
"epoch": 0.8471810089020771,
"grad_norm": 8.893415451049805,
"learning_rate": 1.6957343905851974e-07,
"loss": 1.1841,
"step": 11420
},
{
"epoch": 0.8479228486646885,
"grad_norm": 8.31513500213623,
"learning_rate": 1.6796245402786814e-07,
"loss": 1.3391,
"step": 11430
},
{
"epoch": 0.8486646884272997,
"grad_norm": 8.640569686889648,
"learning_rate": 1.663587039588237e-07,
"loss": 1.3447,
"step": 11440
},
{
"epoch": 0.849406528189911,
"grad_norm": 8.092960357666016,
"learning_rate": 1.6476219756215383e-07,
"loss": 1.3939,
"step": 11450
},
{
"epoch": 0.8501483679525222,
"grad_norm": 8.167853355407715,
"learning_rate": 1.631729435092833e-07,
"loss": 1.3579,
"step": 11460
},
{
"epoch": 0.8508902077151336,
"grad_norm": 8.560916900634766,
"learning_rate": 1.6159095043224452e-07,
"loss": 1.3254,
"step": 11470
},
{
"epoch": 0.8516320474777448,
"grad_norm": 9.065086364746094,
"learning_rate": 1.6001622692363315e-07,
"loss": 1.4158,
"step": 11480
},
{
"epoch": 0.8523738872403561,
"grad_norm": 8.567241668701172,
"learning_rate": 1.584487815365589e-07,
"loss": 1.3156,
"step": 11490
},
{
"epoch": 0.8531157270029673,
"grad_norm": 8.35258960723877,
"learning_rate": 1.568886227846016e-07,
"loss": 1.3877,
"step": 11500
},
{
"epoch": 0.8531157270029673,
"eval_loss": 1.2913334369659424,
"eval_runtime": 23.6172,
"eval_samples_per_second": 18.885,
"eval_steps_per_second": 9.442,
"step": 11500
},
{
"epoch": 0.8538575667655787,
"grad_norm": 8.62309455871582,
"learning_rate": 1.5533575914176257e-07,
"loss": 1.4222,
"step": 11510
},
{
"epoch": 0.8545994065281899,
"grad_norm": 8.811359405517578,
"learning_rate": 1.5379019904242088e-07,
"loss": 1.2911,
"step": 11520
},
{
"epoch": 0.8553412462908012,
"grad_norm": 8.009239196777344,
"learning_rate": 1.5225195088128525e-07,
"loss": 1.2665,
"step": 11530
},
{
"epoch": 0.8560830860534124,
"grad_norm": 7.761435031890869,
"learning_rate": 1.5072102301335056e-07,
"loss": 1.1277,
"step": 11540
},
{
"epoch": 0.8568249258160238,
"grad_norm": 10.778253555297852,
"learning_rate": 1.49197423753851e-07,
"loss": 1.2821,
"step": 11550
},
{
"epoch": 0.857566765578635,
"grad_norm": 8.768142700195312,
"learning_rate": 1.4768116137821587e-07,
"loss": 1.4462,
"step": 11560
},
{
"epoch": 0.8583086053412463,
"grad_norm": 8.696138381958008,
"learning_rate": 1.461722441220234e-07,
"loss": 1.3719,
"step": 11570
},
{
"epoch": 0.8590504451038575,
"grad_norm": 9.442025184631348,
"learning_rate": 1.4467068018095775e-07,
"loss": 1.1023,
"step": 11580
},
{
"epoch": 0.8597922848664689,
"grad_norm": 8.020044326782227,
"learning_rate": 1.4317647771076265e-07,
"loss": 1.5093,
"step": 11590
},
{
"epoch": 0.8605341246290801,
"grad_norm": 7.807736396789551,
"learning_rate": 1.4168964482719914e-07,
"loss": 1.2425,
"step": 11600
},
{
"epoch": 0.8612759643916914,
"grad_norm": 8.01576042175293,
"learning_rate": 1.4021018960599885e-07,
"loss": 1.1915,
"step": 11610
},
{
"epoch": 0.8620178041543026,
"grad_norm": 8.013411521911621,
"learning_rate": 1.3873812008282306e-07,
"loss": 1.4305,
"step": 11620
},
{
"epoch": 0.862759643916914,
"grad_norm": 9.141283988952637,
"learning_rate": 1.3727344425321665e-07,
"loss": 1.4392,
"step": 11630
},
{
"epoch": 0.8635014836795252,
"grad_norm": 8.028132438659668,
"learning_rate": 1.3581617007256646e-07,
"loss": 1.2475,
"step": 11640
},
{
"epoch": 0.8642433234421365,
"grad_norm": 7.848435401916504,
"learning_rate": 1.3436630545605622e-07,
"loss": 1.3094,
"step": 11650
},
{
"epoch": 0.8649851632047477,
"grad_norm": 11.25391674041748,
"learning_rate": 1.3292385827862608e-07,
"loss": 1.2995,
"step": 11660
},
{
"epoch": 0.865727002967359,
"grad_norm": 9.802054405212402,
"learning_rate": 1.3148883637492665e-07,
"loss": 1.2417,
"step": 11670
},
{
"epoch": 0.8664688427299704,
"grad_norm": 8.343031883239746,
"learning_rate": 1.3006124753927945e-07,
"loss": 1.2874,
"step": 11680
},
{
"epoch": 0.8672106824925816,
"grad_norm": 8.474637985229492,
"learning_rate": 1.2864109952563313e-07,
"loss": 1.2236,
"step": 11690
},
{
"epoch": 0.8679525222551929,
"grad_norm": 8.79692268371582,
"learning_rate": 1.2722840004752085e-07,
"loss": 1.3287,
"step": 11700
},
{
"epoch": 0.8686943620178041,
"grad_norm": 8.213624954223633,
"learning_rate": 1.2582315677802008e-07,
"loss": 1.3982,
"step": 11710
},
{
"epoch": 0.8694362017804155,
"grad_norm": 7.986428260803223,
"learning_rate": 1.2442537734970843e-07,
"loss": 1.3435,
"step": 11720
},
{
"epoch": 0.8701780415430267,
"grad_norm": 8.277667045593262,
"learning_rate": 1.2303506935462538e-07,
"loss": 1.1284,
"step": 11730
},
{
"epoch": 0.870919881305638,
"grad_norm": 11.558180809020996,
"learning_rate": 1.2165224034422774e-07,
"loss": 1.3261,
"step": 11740
},
{
"epoch": 0.8716617210682492,
"grad_norm": 10.026036262512207,
"learning_rate": 1.202768978293516e-07,
"loss": 1.1863,
"step": 11750
},
{
"epoch": 0.8724035608308606,
"grad_norm": 7.991722106933594,
"learning_rate": 1.1890904928016927e-07,
"loss": 1.3425,
"step": 11760
},
{
"epoch": 0.8731454005934718,
"grad_norm": 9.529873847961426,
"learning_rate": 1.1754870212614933e-07,
"loss": 1.2864,
"step": 11770
},
{
"epoch": 0.8738872403560831,
"grad_norm": 7.1360883712768555,
"learning_rate": 1.161958637560177e-07,
"loss": 1.2753,
"step": 11780
},
{
"epoch": 0.8746290801186943,
"grad_norm": 7.879760265350342,
"learning_rate": 1.1485054151771518e-07,
"loss": 1.3217,
"step": 11790
},
{
"epoch": 0.8753709198813057,
"grad_norm": 9.130861282348633,
"learning_rate": 1.1351274271835948e-07,
"loss": 1.2331,
"step": 11800
},
{
"epoch": 0.8761127596439169,
"grad_norm": 8.248236656188965,
"learning_rate": 1.1218247462420422e-07,
"loss": 1.3204,
"step": 11810
},
{
"epoch": 0.8768545994065282,
"grad_norm": 6.826605796813965,
"learning_rate": 1.1085974446060054e-07,
"loss": 1.4165,
"step": 11820
},
{
"epoch": 0.8775964391691394,
"grad_norm": 9.05876350402832,
"learning_rate": 1.0954455941195668e-07,
"loss": 1.3007,
"step": 11830
},
{
"epoch": 0.8783382789317508,
"grad_norm": 8.325678825378418,
"learning_rate": 1.0823692662170015e-07,
"loss": 1.5846,
"step": 11840
},
{
"epoch": 0.879080118694362,
"grad_norm": 9.26690673828125,
"learning_rate": 1.0693685319223812e-07,
"loss": 1.3343,
"step": 11850
},
{
"epoch": 0.8798219584569733,
"grad_norm": 7.220630645751953,
"learning_rate": 1.0564434618491875e-07,
"loss": 1.3249,
"step": 11860
},
{
"epoch": 0.8805637982195845,
"grad_norm": 8.11390495300293,
"learning_rate": 1.0435941261999393e-07,
"loss": 1.1482,
"step": 11870
},
{
"epoch": 0.8813056379821959,
"grad_norm": 7.764613151550293,
"learning_rate": 1.0308205947657978e-07,
"loss": 1.3138,
"step": 11880
},
{
"epoch": 0.8820474777448071,
"grad_norm": 8.297335624694824,
"learning_rate": 1.0181229369261985e-07,
"loss": 1.1945,
"step": 11890
},
{
"epoch": 0.8827893175074184,
"grad_norm": 8.837085723876953,
"learning_rate": 1.0055012216484633e-07,
"loss": 1.2443,
"step": 11900
},
{
"epoch": 0.8835311572700296,
"grad_norm": 8.462185859680176,
"learning_rate": 9.929555174874388e-08,
"loss": 1.2518,
"step": 11910
},
{
"epoch": 0.884272997032641,
"grad_norm": 8.001595497131348,
"learning_rate": 9.804858925851124e-08,
"loss": 1.2265,
"step": 11920
},
{
"epoch": 0.8850148367952523,
"grad_norm": 8.231101989746094,
"learning_rate": 9.68092414670248e-08,
"loss": 1.3531,
"step": 11930
},
{
"epoch": 0.8857566765578635,
"grad_norm": 9.410528182983398,
"learning_rate": 9.557751510580209e-08,
"loss": 1.2193,
"step": 11940
},
{
"epoch": 0.8864985163204748,
"grad_norm": 7.80114221572876,
"learning_rate": 9.435341686496408e-08,
"loss": 1.3282,
"step": 11950
},
{
"epoch": 0.887240356083086,
"grad_norm": 7.797093868255615,
"learning_rate": 9.313695339320066e-08,
"loss": 1.2345,
"step": 11960
},
{
"epoch": 0.8879821958456974,
"grad_norm": 8.155489921569824,
"learning_rate": 9.192813129773248e-08,
"loss": 1.1327,
"step": 11970
},
{
"epoch": 0.8887240356083086,
"grad_norm": 7.540963649749756,
"learning_rate": 9.072695714427665e-08,
"loss": 1.2119,
"step": 11980
},
{
"epoch": 0.8894658753709199,
"grad_norm": 8.781906127929688,
"learning_rate": 8.953343745700987e-08,
"loss": 1.2566,
"step": 11990
},
{
"epoch": 0.8902077151335311,
"grad_norm": 8.42147445678711,
"learning_rate": 8.83475787185346e-08,
"loss": 1.2929,
"step": 12000
},
{
"epoch": 0.8902077151335311,
"eval_loss": 1.2903343439102173,
"eval_runtime": 23.6255,
"eval_samples_per_second": 18.878,
"eval_steps_per_second": 9.439,
"step": 12000
},
{
"epoch": 0.8909495548961425,
"grad_norm": 7.471776485443115,
"learning_rate": 8.716938736984192e-08,
"loss": 1.2483,
"step": 12010
},
{
"epoch": 0.8916913946587537,
"grad_norm": 8.134690284729004,
"learning_rate": 8.599886981027805e-08,
"loss": 1.2836,
"step": 12020
},
{
"epoch": 0.892433234421365,
"grad_norm": 7.840508460998535,
"learning_rate": 8.48360323975087e-08,
"loss": 1.2289,
"step": 12030
},
{
"epoch": 0.8931750741839762,
"grad_norm": 7.892512798309326,
"learning_rate": 8.368088144748515e-08,
"loss": 1.267,
"step": 12040
},
{
"epoch": 0.8939169139465876,
"grad_norm": 8.905203819274902,
"learning_rate": 8.253342323440921e-08,
"loss": 1.4043,
"step": 12050
},
{
"epoch": 0.8946587537091988,
"grad_norm": 7.420648574829102,
"learning_rate": 8.139366399070014e-08,
"loss": 1.3941,
"step": 12060
},
{
"epoch": 0.8954005934718101,
"grad_norm": 8.54706859588623,
"learning_rate": 8.026160990695996e-08,
"loss": 1.3438,
"step": 12070
},
{
"epoch": 0.8961424332344213,
"grad_norm": 7.239863872528076,
"learning_rate": 7.91372671319402e-08,
"loss": 1.3068,
"step": 12080
},
{
"epoch": 0.8968842729970327,
"grad_norm": 8.183691024780273,
"learning_rate": 7.8020641772508e-08,
"loss": 1.4976,
"step": 12090
},
{
"epoch": 0.8976261127596439,
"grad_norm": 7.157724380493164,
"learning_rate": 7.691173989361428e-08,
"loss": 1.4513,
"step": 12100
},
{
"epoch": 0.8983679525222552,
"grad_norm": 8.098734855651855,
"learning_rate": 7.581056751825893e-08,
"loss": 1.3248,
"step": 12110
},
{
"epoch": 0.8991097922848664,
"grad_norm": 7.755335807800293,
"learning_rate": 7.471713062745967e-08,
"loss": 1.3429,
"step": 12120
},
{
"epoch": 0.8998516320474778,
"grad_norm": 6.79167366027832,
"learning_rate": 7.363143516021858e-08,
"loss": 1.2159,
"step": 12130
},
{
"epoch": 0.900593471810089,
"grad_norm": 8.930359840393066,
"learning_rate": 7.255348701349029e-08,
"loss": 1.297,
"step": 12140
},
{
"epoch": 0.9013353115727003,
"grad_norm": 9.669726371765137,
"learning_rate": 7.148329204214987e-08,
"loss": 1.2854,
"step": 12150
},
{
"epoch": 0.9020771513353115,
"grad_norm": 8.40202522277832,
"learning_rate": 7.042085605896142e-08,
"loss": 1.0888,
"step": 12160
},
{
"epoch": 0.9028189910979229,
"grad_norm": 7.3866729736328125,
"learning_rate": 6.936618483454527e-08,
"loss": 1.4845,
"step": 12170
},
{
"epoch": 0.9035608308605341,
"grad_norm": 8.179498672485352,
"learning_rate": 6.831928409734811e-08,
"loss": 1.2014,
"step": 12180
},
{
"epoch": 0.9043026706231454,
"grad_norm": 8.435233116149902,
"learning_rate": 6.728015953361094e-08,
"loss": 1.248,
"step": 12190
},
{
"epoch": 0.9050445103857567,
"grad_norm": 7.416328430175781,
"learning_rate": 6.624881678733852e-08,
"loss": 1.2651,
"step": 12200
},
{
"epoch": 0.905786350148368,
"grad_norm": 8.049245834350586,
"learning_rate": 6.522526146026924e-08,
"loss": 1.1607,
"step": 12210
},
{
"epoch": 0.9065281899109793,
"grad_norm": 7.982175827026367,
"learning_rate": 6.420949911184288e-08,
"loss": 1.2755,
"step": 12220
},
{
"epoch": 0.9072700296735905,
"grad_norm": 9.001856803894043,
"learning_rate": 6.320153525917299e-08,
"loss": 1.1793,
"step": 12230
},
{
"epoch": 0.9080118694362018,
"grad_norm": 8.896450996398926,
"learning_rate": 6.220137537701459e-08,
"loss": 1.4263,
"step": 12240
},
{
"epoch": 0.908753709198813,
"grad_norm": 9.380216598510742,
"learning_rate": 6.120902489773606e-08,
"loss": 1.4032,
"step": 12250
},
{
"epoch": 0.9094955489614244,
"grad_norm": 7.810571193695068,
"learning_rate": 6.022448921128854e-08,
"loss": 1.3057,
"step": 12260
},
{
"epoch": 0.9102373887240356,
"grad_norm": 7.799693584442139,
"learning_rate": 5.9247773665177805e-08,
"loss": 1.3243,
"step": 12270
},
{
"epoch": 0.9109792284866469,
"grad_norm": 8.116616249084473,
"learning_rate": 5.8278883564433614e-08,
"loss": 1.4306,
"step": 12280
},
{
"epoch": 0.9117210682492581,
"grad_norm": 8.54800033569336,
"learning_rate": 5.731782417158271e-08,
"loss": 1.3961,
"step": 12290
},
{
"epoch": 0.9124629080118695,
"grad_norm": 8.728897094726562,
"learning_rate": 5.636460070661853e-08,
"loss": 1.3383,
"step": 12300
},
{
"epoch": 0.9132047477744807,
"grad_norm": 8.328527450561523,
"learning_rate": 5.5419218346974723e-08,
"loss": 1.2801,
"step": 12310
},
{
"epoch": 0.913946587537092,
"grad_norm": 7.380051612854004,
"learning_rate": 5.448168222749467e-08,
"loss": 1.3151,
"step": 12320
},
{
"epoch": 0.9146884272997032,
"grad_norm": 9.386639595031738,
"learning_rate": 5.355199744040601e-08,
"loss": 1.2813,
"step": 12330
},
{
"epoch": 0.9154302670623146,
"grad_norm": 8.962152481079102,
"learning_rate": 5.2630169035291164e-08,
"loss": 1.3694,
"step": 12340
},
{
"epoch": 0.9161721068249258,
"grad_norm": 8.48715877532959,
"learning_rate": 5.171620201906119e-08,
"loss": 1.2331,
"step": 12350
},
{
"epoch": 0.9169139465875371,
"grad_norm": 8.15807819366455,
"learning_rate": 5.081010135592745e-08,
"loss": 1.2515,
"step": 12360
},
{
"epoch": 0.9176557863501483,
"grad_norm": 7.585864543914795,
"learning_rate": 4.9911871967375675e-08,
"loss": 1.2352,
"step": 12370
},
{
"epoch": 0.9183976261127597,
"grad_norm": 7.90684700012207,
"learning_rate": 4.902151873213828e-08,
"loss": 1.2776,
"step": 12380
},
{
"epoch": 0.9191394658753709,
"grad_norm": 10.257676124572754,
"learning_rate": 4.813904648616907e-08,
"loss": 1.3307,
"step": 12390
},
{
"epoch": 0.9198813056379822,
"grad_norm": 8.50632095336914,
"learning_rate": 4.7264460022615416e-08,
"loss": 1.2977,
"step": 12400
},
{
"epoch": 0.9206231454005934,
"grad_norm": 7.718177318572998,
"learning_rate": 4.63977640917938e-08,
"loss": 1.1651,
"step": 12410
},
{
"epoch": 0.9213649851632048,
"grad_norm": 8.514959335327148,
"learning_rate": 4.5538963401162645e-08,
"loss": 1.197,
"step": 12420
},
{
"epoch": 0.922106824925816,
"grad_norm": 7.185023784637451,
"learning_rate": 4.468806261529801e-08,
"loss": 1.2111,
"step": 12430
},
{
"epoch": 0.9228486646884273,
"grad_norm": 10.854412078857422,
"learning_rate": 4.38450663558671e-08,
"loss": 1.2498,
"step": 12440
},
{
"epoch": 0.9235905044510386,
"grad_norm": 8.594488143920898,
"learning_rate": 4.3009979201604154e-08,
"loss": 1.2785,
"step": 12450
},
{
"epoch": 0.9243323442136498,
"grad_norm": 8.167387008666992,
"learning_rate": 4.218280568828442e-08,
"loss": 1.4823,
"step": 12460
},
{
"epoch": 0.9250741839762612,
"grad_norm": 9.161100387573242,
"learning_rate": 4.136355030870104e-08,
"loss": 1.2747,
"step": 12470
},
{
"epoch": 0.9258160237388724,
"grad_norm": 8.26723575592041,
"learning_rate": 4.0552217512639213e-08,
"loss": 1.3196,
"step": 12480
},
{
"epoch": 0.9265578635014837,
"grad_norm": 8.994638442993164,
"learning_rate": 3.974881170685274e-08,
"loss": 1.127,
"step": 12490
},
{
"epoch": 0.9272997032640949,
"grad_norm": 9.040610313415527,
"learning_rate": 3.895333725504035e-08,
"loss": 1.4017,
"step": 12500
},
{
"epoch": 0.9272997032640949,
"eval_loss": 1.289976954460144,
"eval_runtime": 23.649,
"eval_samples_per_second": 18.859,
"eval_steps_per_second": 9.43,
"step": 12500
},
{
"epoch": 0.9280415430267063,
"grad_norm": 7.480683326721191,
"learning_rate": 3.816579847782092e-08,
"loss": 1.3201,
"step": 12510
},
{
"epoch": 0.9287833827893175,
"grad_norm": 8.271261215209961,
"learning_rate": 3.738619965271145e-08,
"loss": 1.2206,
"step": 12520
},
{
"epoch": 0.9295252225519288,
"grad_norm": 8.740528106689453,
"learning_rate": 3.661454501410277e-08,
"loss": 1.3493,
"step": 12530
},
{
"epoch": 0.93026706231454,
"grad_norm": 11.37153434753418,
"learning_rate": 3.585083875323675e-08,
"loss": 1.2472,
"step": 12540
},
{
"epoch": 0.9310089020771514,
"grad_norm": 9.2501802444458,
"learning_rate": 3.5095085018183595e-08,
"loss": 1.3783,
"step": 12550
},
{
"epoch": 0.9317507418397626,
"grad_norm": 7.809544086456299,
"learning_rate": 3.434728791381991e-08,
"loss": 1.1981,
"step": 12560
},
{
"epoch": 0.9324925816023739,
"grad_norm": 9.464616775512695,
"learning_rate": 3.360745150180522e-08,
"loss": 1.4154,
"step": 12570
},
{
"epoch": 0.9332344213649851,
"grad_norm": 6.653102874755859,
"learning_rate": 3.2875579800561104e-08,
"loss": 1.1891,
"step": 12580
},
{
"epoch": 0.9339762611275965,
"grad_norm": 9.972185134887695,
"learning_rate": 3.215167678524794e-08,
"loss": 1.3693,
"step": 12590
},
{
"epoch": 0.9347181008902077,
"grad_norm": 7.9361419677734375,
"learning_rate": 3.143574638774555e-08,
"loss": 1.274,
"step": 12600
},
{
"epoch": 0.935459940652819,
"grad_norm": 10.938789367675781,
"learning_rate": 3.072779249662905e-08,
"loss": 1.5216,
"step": 12610
},
{
"epoch": 0.9362017804154302,
"grad_norm": 8.189596176147461,
"learning_rate": 3.002781895715023e-08,
"loss": 1.21,
"step": 12620
},
{
"epoch": 0.9369436201780416,
"grad_norm": 7.8240790367126465,
"learning_rate": 2.933582957121489e-08,
"loss": 1.3034,
"step": 12630
},
{
"epoch": 0.9376854599406528,
"grad_norm": 10.08263874053955,
"learning_rate": 2.8651828097363663e-08,
"loss": 1.3179,
"step": 12640
},
{
"epoch": 0.9384272997032641,
"grad_norm": 7.9933366775512695,
"learning_rate": 2.7975818250749906e-08,
"loss": 1.2449,
"step": 12650
},
{
"epoch": 0.9391691394658753,
"grad_norm": 10.46999740600586,
"learning_rate": 2.730780370312119e-08,
"loss": 1.2007,
"step": 12660
},
{
"epoch": 0.9399109792284867,
"grad_norm": 8.219291687011719,
"learning_rate": 2.664778808279833e-08,
"loss": 1.1914,
"step": 12670
},
{
"epoch": 0.9406528189910979,
"grad_norm": 7.833841800689697,
"learning_rate": 2.599577497465605e-08,
"loss": 1.257,
"step": 12680
},
{
"epoch": 0.9413946587537092,
"grad_norm": 8.004801750183105,
"learning_rate": 2.5351767920103187e-08,
"loss": 1.227,
"step": 12690
},
{
"epoch": 0.9421364985163204,
"grad_norm": 9.302260398864746,
"learning_rate": 2.4715770417064187e-08,
"loss": 1.5238,
"step": 12700
},
{
"epoch": 0.9428783382789317,
"grad_norm": 9.043020248413086,
"learning_rate": 2.4087785919959137e-08,
"loss": 1.3732,
"step": 12710
},
{
"epoch": 0.9436201780415431,
"grad_norm": 8.14455795288086,
"learning_rate": 2.3467817839685767e-08,
"loss": 1.3711,
"step": 12720
},
{
"epoch": 0.9443620178041543,
"grad_norm": 8.116730690002441,
"learning_rate": 2.285586954360047e-08,
"loss": 1.3635,
"step": 12730
},
{
"epoch": 0.9451038575667656,
"grad_norm": 8.274658203125,
"learning_rate": 2.225194435550032e-08,
"loss": 1.2573,
"step": 12740
},
{
"epoch": 0.9458456973293768,
"grad_norm": 9.78200912475586,
"learning_rate": 2.1656045555605074e-08,
"loss": 1.4526,
"step": 12750
},
{
"epoch": 0.9465875370919882,
"grad_norm": 9.067741394042969,
"learning_rate": 2.1068176380538373e-08,
"loss": 1.3403,
"step": 12760
},
{
"epoch": 0.9473293768545994,
"grad_norm": 13.13876724243164,
"learning_rate": 2.0488340023312068e-08,
"loss": 1.3285,
"step": 12770
},
{
"epoch": 0.9480712166172107,
"grad_norm": 9.662564277648926,
"learning_rate": 1.9916539633306753e-08,
"loss": 1.1916,
"step": 12780
},
{
"epoch": 0.9488130563798219,
"grad_norm": 8.476212501525879,
"learning_rate": 1.9352778316256258e-08,
"loss": 1.2045,
"step": 12790
},
{
"epoch": 0.9495548961424333,
"grad_norm": 8.053838729858398,
"learning_rate": 1.8797059134230186e-08,
"loss": 1.2306,
"step": 12800
},
{
"epoch": 0.9502967359050445,
"grad_norm": 9.246392250061035,
"learning_rate": 1.8249385105616913e-08,
"loss": 1.3062,
"step": 12810
},
{
"epoch": 0.9510385756676558,
"grad_norm": 8.371253967285156,
"learning_rate": 1.7709759205107923e-08,
"loss": 1.3793,
"step": 12820
},
{
"epoch": 0.951780415430267,
"grad_norm": 7.30432653427124,
"learning_rate": 1.7178184363681182e-08,
"loss": 1.2617,
"step": 12830
},
{
"epoch": 0.9525222551928784,
"grad_norm": 8.539069175720215,
"learning_rate": 1.6654663468585295e-08,
"loss": 1.4355,
"step": 12840
},
{
"epoch": 0.9532640949554896,
"grad_norm": 10.273870468139648,
"learning_rate": 1.6139199363323864e-08,
"loss": 1.4731,
"step": 12850
},
{
"epoch": 0.9540059347181009,
"grad_norm": 8.182214736938477,
"learning_rate": 1.5631794847639824e-08,
"loss": 1.3659,
"step": 12860
},
{
"epoch": 0.9547477744807121,
"grad_norm": 11.630437850952148,
"learning_rate": 1.513245267750113e-08,
"loss": 1.2748,
"step": 12870
},
{
"epoch": 0.9554896142433235,
"grad_norm": 10.194863319396973,
"learning_rate": 1.4641175565084265e-08,
"loss": 1.3644,
"step": 12880
},
{
"epoch": 0.9562314540059347,
"grad_norm": 7.40037727355957,
"learning_rate": 1.4157966178761083e-08,
"loss": 1.2678,
"step": 12890
},
{
"epoch": 0.956973293768546,
"grad_norm": 9.2279052734375,
"learning_rate": 1.3682827143082832e-08,
"loss": 1.4593,
"step": 12900
},
{
"epoch": 0.9577151335311572,
"grad_norm": 8.294316291809082,
"learning_rate": 1.3215761038767483e-08,
"loss": 1.2412,
"step": 12910
},
{
"epoch": 0.9584569732937686,
"grad_norm": 7.223811626434326,
"learning_rate": 1.2756770402684081e-08,
"loss": 1.2322,
"step": 12920
},
{
"epoch": 0.9591988130563798,
"grad_norm": 7.492358207702637,
"learning_rate": 1.2305857727840597e-08,
"loss": 1.3891,
"step": 12930
},
{
"epoch": 0.9599406528189911,
"grad_norm": 8.983826637268066,
"learning_rate": 1.186302546336876e-08,
"loss": 1.4126,
"step": 12940
},
{
"epoch": 0.9606824925816023,
"grad_norm": 8.535353660583496,
"learning_rate": 1.1428276014512073e-08,
"loss": 1.1881,
"step": 12950
},
{
"epoch": 0.9614243323442137,
"grad_norm": 7.106237888336182,
"learning_rate": 1.1001611742611827e-08,
"loss": 1.292,
"step": 12960
},
{
"epoch": 0.962166172106825,
"grad_norm": 8.555818557739258,
"learning_rate": 1.0583034965095274e-08,
"loss": 1.4155,
"step": 12970
},
{
"epoch": 0.9629080118694362,
"grad_norm": 12.074318885803223,
"learning_rate": 1.0172547955461798e-08,
"loss": 1.4455,
"step": 12980
},
{
"epoch": 0.9636498516320475,
"grad_norm": 8.598979949951172,
"learning_rate": 9.770152943271604e-09,
"loss": 1.3468,
"step": 12990
},
{
"epoch": 0.9643916913946587,
"grad_norm": 9.474443435668945,
"learning_rate": 9.375852114133221e-09,
"loss": 1.2126,
"step": 13000
},
{
"epoch": 0.9643916913946587,
"eval_loss": 1.2897428274154663,
"eval_runtime": 23.7046,
"eval_samples_per_second": 18.815,
"eval_steps_per_second": 9.407,
"step": 13000
},
{
"epoch": 0.9651335311572701,
"grad_norm": 7.944087982177734,
"learning_rate": 8.989647609691342e-09,
"loss": 1.2518,
"step": 13010
},
{
"epoch": 0.9658753709198813,
"grad_norm": 8.350529670715332,
"learning_rate": 8.611541527615508e-09,
"loss": 1.1986,
"step": 13020
},
{
"epoch": 0.9666172106824926,
"grad_norm": 8.835983276367188,
"learning_rate": 8.241535921589106e-09,
"loss": 1.4297,
"step": 13030
},
{
"epoch": 0.9673590504451038,
"grad_norm": 9.169357299804688,
"learning_rate": 7.879632801297387e-09,
"loss": 1.2199,
"step": 13040
},
{
"epoch": 0.9681008902077152,
"grad_norm": 11.830096244812012,
"learning_rate": 7.525834132416976e-09,
"loss": 1.3442,
"step": 13050
},
{
"epoch": 0.9688427299703264,
"grad_norm": 8.521605491638184,
"learning_rate": 7.180141836605536e-09,
"loss": 1.3721,
"step": 13060
},
{
"epoch": 0.9695845697329377,
"grad_norm": 8.605573654174805,
"learning_rate": 6.842557791490122e-09,
"loss": 1.3636,
"step": 13070
},
{
"epoch": 0.9703264094955489,
"grad_norm": 7.742245197296143,
"learning_rate": 6.513083830659017e-09,
"loss": 1.2094,
"step": 13080
},
{
"epoch": 0.9710682492581603,
"grad_norm": 9.237808227539062,
"learning_rate": 6.19172174364957e-09,
"loss": 1.2527,
"step": 13090
},
{
"epoch": 0.9718100890207715,
"grad_norm": 8.128382682800293,
"learning_rate": 5.878473275940044e-09,
"loss": 1.1039,
"step": 13100
},
{
"epoch": 0.9725519287833828,
"grad_norm": 9.049505233764648,
"learning_rate": 5.573340128939286e-09,
"loss": 1.3061,
"step": 13110
},
{
"epoch": 0.973293768545994,
"grad_norm": 8.908202171325684,
"learning_rate": 5.276323959978235e-09,
"loss": 1.3528,
"step": 13120
},
{
"epoch": 0.9740356083086054,
"grad_norm": 10.514373779296875,
"learning_rate": 4.987426382299598e-09,
"loss": 1.2388,
"step": 13130
},
{
"epoch": 0.9747774480712166,
"grad_norm": 8.527087211608887,
"learning_rate": 4.706648965051019e-09,
"loss": 1.249,
"step": 13140
},
{
"epoch": 0.9755192878338279,
"grad_norm": 8.145854949951172,
"learning_rate": 4.433993233274591e-09,
"loss": 1.1806,
"step": 13150
},
{
"epoch": 0.9762611275964391,
"grad_norm": 7.99056339263916,
"learning_rate": 4.169460667900027e-09,
"loss": 1.2609,
"step": 13160
},
{
"epoch": 0.9770029673590505,
"grad_norm": 8.545060157775879,
"learning_rate": 3.913052705735997e-09,
"loss": 1.3828,
"step": 13170
},
{
"epoch": 0.9777448071216617,
"grad_norm": 8.997559547424316,
"learning_rate": 3.6647707394619756e-09,
"loss": 1.2281,
"step": 13180
},
{
"epoch": 0.978486646884273,
"grad_norm": 8.990921974182129,
"learning_rate": 3.4246161176217372e-09,
"loss": 1.3476,
"step": 13190
},
{
"epoch": 0.9792284866468842,
"grad_norm": 7.651655673980713,
"learning_rate": 3.1925901446148707e-09,
"loss": 1.2981,
"step": 13200
},
{
"epoch": 0.9799703264094956,
"grad_norm": 8.907322883605957,
"learning_rate": 2.9686940806904485e-09,
"loss": 1.517,
"step": 13210
},
{
"epoch": 0.9807121661721068,
"grad_norm": 9.220152854919434,
"learning_rate": 2.752929141939864e-09,
"loss": 1.2845,
"step": 13220
},
{
"epoch": 0.9814540059347181,
"grad_norm": 7.591921329498291,
"learning_rate": 2.5452965002903396e-09,
"loss": 1.2068,
"step": 13230
},
{
"epoch": 0.9821958456973294,
"grad_norm": 8.76726245880127,
"learning_rate": 2.34579728349843e-09,
"loss": 1.2462,
"step": 13240
},
{
"epoch": 0.9829376854599406,
"grad_norm": 8.317231178283691,
"learning_rate": 2.154432575144194e-09,
"loss": 1.3612,
"step": 13250
},
{
"epoch": 0.983679525222552,
"grad_norm": 9.405437469482422,
"learning_rate": 1.9712034146250336e-09,
"loss": 1.2786,
"step": 13260
},
{
"epoch": 0.9844213649851632,
"grad_norm": 7.619749069213867,
"learning_rate": 1.7961107971498635e-09,
"loss": 1.2626,
"step": 13270
},
{
"epoch": 0.9851632047477745,
"grad_norm": 13.960756301879883,
"learning_rate": 1.6291556737344503e-09,
"loss": 1.432,
"step": 13280
},
{
"epoch": 0.9859050445103857,
"grad_norm": 9.093308448791504,
"learning_rate": 1.4703389511955822e-09,
"loss": 1.3687,
"step": 13290
},
{
"epoch": 0.9866468842729971,
"grad_norm": 7.0503458976745605,
"learning_rate": 1.319661492145907e-09,
"loss": 1.3628,
"step": 13300
},
{
"epoch": 0.9873887240356083,
"grad_norm": 9.696161270141602,
"learning_rate": 1.1771241149901024e-09,
"loss": 1.3019,
"step": 13310
},
{
"epoch": 0.9881305637982196,
"grad_norm": 9.714421272277832,
"learning_rate": 1.0427275939200453e-09,
"loss": 1.3525,
"step": 13320
},
{
"epoch": 0.9888724035608308,
"grad_norm": 9.047686576843262,
"learning_rate": 9.164726589103167e-10,
"loss": 1.208,
"step": 13330
},
{
"epoch": 0.9896142433234422,
"grad_norm": 10.157636642456055,
"learning_rate": 7.983599957147036e-10,
"loss": 1.393,
"step": 13340
},
{
"epoch": 0.9903560830860534,
"grad_norm": 8.367962837219238,
"learning_rate": 6.883902458618696e-10,
"loss": 1.2299,
"step": 13350
},
{
"epoch": 0.9910979228486647,
"grad_norm": 8.495455741882324,
"learning_rate": 5.865640066525235e-10,
"loss": 1.3572,
"step": 13360
},
{
"epoch": 0.9918397626112759,
"grad_norm": 7.387685298919678,
"learning_rate": 4.92881831156089e-10,
"loss": 1.2902,
"step": 13370
},
{
"epoch": 0.9925816023738873,
"grad_norm": 8.251172065734863,
"learning_rate": 4.073442282070405e-10,
"loss": 1.2775,
"step": 13380
},
{
"epoch": 0.9933234421364985,
"grad_norm": 7.994820594787598,
"learning_rate": 3.2995166240290533e-10,
"loss": 1.1886,
"step": 13390
},
{
"epoch": 0.9940652818991098,
"grad_norm": 9.602749824523926,
"learning_rate": 2.6070455410159843e-10,
"loss": 1.2908,
"step": 13400
},
{
"epoch": 0.994807121661721,
"grad_norm": 8.252080917358398,
"learning_rate": 1.996032794184255e-10,
"loss": 1.3749,
"step": 13410
},
{
"epoch": 0.9955489614243324,
"grad_norm": 8.841800689697266,
"learning_rate": 1.4664817022508326e-10,
"loss": 1.3027,
"step": 13420
},
{
"epoch": 0.9962908011869436,
"grad_norm": 7.5887370109558105,
"learning_rate": 1.0183951414732828e-10,
"loss": 1.3064,
"step": 13430
},
{
"epoch": 0.9970326409495549,
"grad_norm": 8.982939720153809,
"learning_rate": 6.517755456331153e-11,
"loss": 1.3691,
"step": 13440
},
{
"epoch": 0.9977744807121661,
"grad_norm": 8.62787914276123,
"learning_rate": 3.666249060241267e-11,
"loss": 1.2796,
"step": 13450
},
{
"epoch": 0.9985163204747775,
"grad_norm": 8.360151290893555,
"learning_rate": 1.6294477144074282e-11,
"loss": 1.2975,
"step": 13460
},
{
"epoch": 0.9992581602373887,
"grad_norm": 8.118022918701172,
"learning_rate": 4.073624817468868e-12,
"loss": 1.2711,
"step": 13470
},
{
"epoch": 1.0,
"grad_norm": 7.892242908477783,
"learning_rate": 0.0,
"loss": 1.2698,
"step": 13480
},
{
"epoch": 1.0,
"step": 13480,
"total_flos": 3.3534568071535e+17,
"train_loss": 1.3545556901116753,
"train_runtime": 10924.2621,
"train_samples_per_second": 2.468,
"train_steps_per_second": 1.234
}
],
"logging_steps": 10,
"max_steps": 13480,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": false,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 3.3534568071535e+17,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}