{ "best_metric": 0.8562657833099365, "best_model_checkpoint": "./pkr7098/imagenet2012-1k-subsampling-50_outputs/checkpoint-53130", "epoch": 10.0, "eval_steps": 500, "global_step": 53130, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 1.8585796356201172, "learning_rate": 1.9996235648409565e-05, "loss": 6.9253, "step": 10 }, { "epoch": 0.0, "grad_norm": 2.480604648590088, "learning_rate": 1.9992471296819125e-05, "loss": 6.9171, "step": 20 }, { "epoch": 0.01, "grad_norm": 2.0857093334198, "learning_rate": 1.9988706945228688e-05, "loss": 6.9099, "step": 30 }, { "epoch": 0.01, "grad_norm": 2.152116537094116, "learning_rate": 1.9984942593638247e-05, "loss": 6.9089, "step": 40 }, { "epoch": 0.01, "grad_norm": 2.038759708404541, "learning_rate": 1.998117824204781e-05, "loss": 6.9107, "step": 50 }, { "epoch": 0.01, "grad_norm": 2.1176657676696777, "learning_rate": 1.997741389045737e-05, "loss": 6.9032, "step": 60 }, { "epoch": 0.01, "grad_norm": 1.669021487236023, "learning_rate": 1.9973649538866934e-05, "loss": 6.9133, "step": 70 }, { "epoch": 0.02, "grad_norm": 2.055243968963623, "learning_rate": 1.9969885187276493e-05, "loss": 6.9086, "step": 80 }, { "epoch": 0.02, "grad_norm": 1.8620834350585938, "learning_rate": 1.9966120835686053e-05, "loss": 6.9038, "step": 90 }, { "epoch": 0.02, "grad_norm": 1.8310596942901611, "learning_rate": 1.9962356484095616e-05, "loss": 6.9013, "step": 100 }, { "epoch": 0.02, "grad_norm": 2.1785600185394287, "learning_rate": 1.9958592132505176e-05, "loss": 6.8939, "step": 110 }, { "epoch": 0.02, "grad_norm": 1.7701833248138428, "learning_rate": 1.995482778091474e-05, "loss": 6.8996, "step": 120 }, { "epoch": 0.02, "grad_norm": 2.139279842376709, "learning_rate": 1.99510634293243e-05, "loss": 6.8908, "step": 130 }, { "epoch": 0.03, "grad_norm": 2.171644687652588, "learning_rate": 1.9947299077733862e-05, "loss": 6.9144, "step": 140 }, { "epoch": 0.03, "grad_norm": 2.095906972885132, "learning_rate": 1.9943534726143422e-05, "loss": 6.8951, "step": 150 }, { "epoch": 0.03, "grad_norm": 2.044994831085205, "learning_rate": 1.9939770374552985e-05, "loss": 6.9095, "step": 160 }, { "epoch": 0.03, "grad_norm": 1.922256350517273, "learning_rate": 1.9936006022962545e-05, "loss": 6.8982, "step": 170 }, { "epoch": 0.03, "grad_norm": 2.382159948348999, "learning_rate": 1.9932241671372108e-05, "loss": 6.89, "step": 180 }, { "epoch": 0.04, "grad_norm": 2.047050714492798, "learning_rate": 1.992847731978167e-05, "loss": 6.8883, "step": 190 }, { "epoch": 0.04, "grad_norm": 2.0464096069335938, "learning_rate": 1.992471296819123e-05, "loss": 6.873, "step": 200 }, { "epoch": 0.04, "grad_norm": 2.008358955383301, "learning_rate": 1.9920948616600794e-05, "loss": 6.8818, "step": 210 }, { "epoch": 0.04, "grad_norm": 2.002315044403076, "learning_rate": 1.9917184265010354e-05, "loss": 6.9011, "step": 220 }, { "epoch": 0.04, "grad_norm": 2.103261947631836, "learning_rate": 1.9913419913419917e-05, "loss": 6.8859, "step": 230 }, { "epoch": 0.05, "grad_norm": 2.0483646392822266, "learning_rate": 1.9909655561829477e-05, "loss": 6.863, "step": 240 }, { "epoch": 0.05, "grad_norm": 2.216993808746338, "learning_rate": 1.990589121023904e-05, "loss": 6.888, "step": 250 }, { "epoch": 0.05, "grad_norm": 2.370058059692383, "learning_rate": 1.99021268586486e-05, "loss": 6.8774, "step": 260 }, { "epoch": 0.05, "grad_norm": 2.0837109088897705, "learning_rate": 1.989836250705816e-05, "loss": 6.8961, "step": 270 }, { 
"epoch": 0.05, "grad_norm": 2.42187237739563, "learning_rate": 1.9894598155467723e-05, "loss": 6.8963, "step": 280 }, { "epoch": 0.05, "grad_norm": 2.1696746349334717, "learning_rate": 1.9890833803877282e-05, "loss": 6.8862, "step": 290 }, { "epoch": 0.06, "grad_norm": 2.573646306991577, "learning_rate": 1.9887069452286845e-05, "loss": 6.8992, "step": 300 }, { "epoch": 0.06, "grad_norm": 2.024512529373169, "learning_rate": 1.9883305100696405e-05, "loss": 6.8682, "step": 310 }, { "epoch": 0.06, "grad_norm": 2.296330451965332, "learning_rate": 1.987954074910597e-05, "loss": 6.8696, "step": 320 }, { "epoch": 0.06, "grad_norm": 2.309410333633423, "learning_rate": 1.9875776397515528e-05, "loss": 6.8806, "step": 330 }, { "epoch": 0.06, "grad_norm": 2.832308292388916, "learning_rate": 1.987201204592509e-05, "loss": 6.8922, "step": 340 }, { "epoch": 0.07, "grad_norm": 2.251929759979248, "learning_rate": 1.986824769433465e-05, "loss": 6.8934, "step": 350 }, { "epoch": 0.07, "grad_norm": 2.0651419162750244, "learning_rate": 1.9864483342744214e-05, "loss": 6.8582, "step": 360 }, { "epoch": 0.07, "grad_norm": 2.233346939086914, "learning_rate": 1.9860718991153774e-05, "loss": 6.8853, "step": 370 }, { "epoch": 0.07, "grad_norm": 2.2311742305755615, "learning_rate": 1.9856954639563337e-05, "loss": 6.8671, "step": 380 }, { "epoch": 0.07, "grad_norm": 2.0983166694641113, "learning_rate": 1.98531902879729e-05, "loss": 6.8613, "step": 390 }, { "epoch": 0.08, "grad_norm": 2.14290714263916, "learning_rate": 1.984942593638246e-05, "loss": 6.8369, "step": 400 }, { "epoch": 0.08, "grad_norm": 2.194899082183838, "learning_rate": 1.9845661584792023e-05, "loss": 6.8728, "step": 410 }, { "epoch": 0.08, "grad_norm": 1.9904452562332153, "learning_rate": 1.9841897233201583e-05, "loss": 6.8779, "step": 420 }, { "epoch": 0.08, "grad_norm": 2.2429018020629883, "learning_rate": 1.9838132881611146e-05, "loss": 6.834, "step": 430 }, { "epoch": 0.08, "grad_norm": 2.2924535274505615, "learning_rate": 1.9834368530020706e-05, "loss": 6.8536, "step": 440 }, { "epoch": 0.08, "grad_norm": 2.234050750732422, "learning_rate": 1.9830604178430266e-05, "loss": 6.8649, "step": 450 }, { "epoch": 0.09, "grad_norm": 2.2850382328033447, "learning_rate": 1.982683982683983e-05, "loss": 6.8306, "step": 460 }, { "epoch": 0.09, "grad_norm": 2.9660110473632812, "learning_rate": 1.982307547524939e-05, "loss": 6.8365, "step": 470 }, { "epoch": 0.09, "grad_norm": 2.3017611503601074, "learning_rate": 1.9819311123658952e-05, "loss": 6.8603, "step": 480 }, { "epoch": 0.09, "grad_norm": 2.140291690826416, "learning_rate": 1.981554677206851e-05, "loss": 6.8539, "step": 490 }, { "epoch": 0.09, "grad_norm": 2.315230369567871, "learning_rate": 1.9811782420478075e-05, "loss": 6.8482, "step": 500 }, { "epoch": 0.1, "grad_norm": 2.797887086868286, "learning_rate": 1.9808018068887634e-05, "loss": 6.8395, "step": 510 }, { "epoch": 0.1, "grad_norm": 2.40307354927063, "learning_rate": 1.9804253717297198e-05, "loss": 6.8594, "step": 520 }, { "epoch": 0.1, "grad_norm": 2.569096326828003, "learning_rate": 1.9800489365706757e-05, "loss": 6.8484, "step": 530 }, { "epoch": 0.1, "grad_norm": 2.475616216659546, "learning_rate": 1.979672501411632e-05, "loss": 6.8645, "step": 540 }, { "epoch": 0.1, "grad_norm": 2.2893221378326416, "learning_rate": 1.979296066252588e-05, "loss": 6.8275, "step": 550 }, { "epoch": 0.11, "grad_norm": 2.6556358337402344, "learning_rate": 1.9789196310935443e-05, "loss": 6.8431, "step": 560 }, { "epoch": 0.11, "grad_norm": 2.5893912315368652, 
"learning_rate": 1.9785431959345007e-05, "loss": 6.8599, "step": 570 }, { "epoch": 0.11, "grad_norm": 2.4556918144226074, "learning_rate": 1.9781667607754566e-05, "loss": 6.8442, "step": 580 }, { "epoch": 0.11, "grad_norm": 2.2123830318450928, "learning_rate": 1.977790325616413e-05, "loss": 6.8394, "step": 590 }, { "epoch": 0.11, "grad_norm": 2.4330053329467773, "learning_rate": 1.977413890457369e-05, "loss": 6.8497, "step": 600 }, { "epoch": 0.11, "grad_norm": 2.333451509475708, "learning_rate": 1.977037455298325e-05, "loss": 6.8324, "step": 610 }, { "epoch": 0.12, "grad_norm": 2.747091770172119, "learning_rate": 1.9766610201392812e-05, "loss": 6.8411, "step": 620 }, { "epoch": 0.12, "grad_norm": 2.5731656551361084, "learning_rate": 1.9762845849802372e-05, "loss": 6.8368, "step": 630 }, { "epoch": 0.12, "grad_norm": 2.414581060409546, "learning_rate": 1.9759081498211935e-05, "loss": 6.8406, "step": 640 }, { "epoch": 0.12, "grad_norm": 2.3303568363189697, "learning_rate": 1.9755317146621495e-05, "loss": 6.8463, "step": 650 }, { "epoch": 0.12, "grad_norm": 2.7673823833465576, "learning_rate": 1.9751552795031058e-05, "loss": 6.8439, "step": 660 }, { "epoch": 0.13, "grad_norm": 2.352534294128418, "learning_rate": 1.9747788443440618e-05, "loss": 6.8342, "step": 670 }, { "epoch": 0.13, "grad_norm": 2.852058172225952, "learning_rate": 1.974402409185018e-05, "loss": 6.8184, "step": 680 }, { "epoch": 0.13, "grad_norm": 2.295675754547119, "learning_rate": 1.974025974025974e-05, "loss": 6.8254, "step": 690 }, { "epoch": 0.13, "grad_norm": 2.3109614849090576, "learning_rate": 1.9736495388669304e-05, "loss": 6.82, "step": 700 }, { "epoch": 0.13, "grad_norm": 2.4982097148895264, "learning_rate": 1.9732731037078864e-05, "loss": 6.8425, "step": 710 }, { "epoch": 0.14, "grad_norm": 2.374101161956787, "learning_rate": 1.9728966685488427e-05, "loss": 6.8255, "step": 720 }, { "epoch": 0.14, "grad_norm": 2.4136903285980225, "learning_rate": 1.9725202333897987e-05, "loss": 6.8206, "step": 730 }, { "epoch": 0.14, "grad_norm": 2.5289034843444824, "learning_rate": 1.972143798230755e-05, "loss": 6.7983, "step": 740 }, { "epoch": 0.14, "grad_norm": 2.6093173027038574, "learning_rate": 1.9717673630717113e-05, "loss": 6.7991, "step": 750 }, { "epoch": 0.14, "grad_norm": 2.9302408695220947, "learning_rate": 1.9713909279126673e-05, "loss": 6.7973, "step": 760 }, { "epoch": 0.14, "grad_norm": 2.3061861991882324, "learning_rate": 1.9710144927536236e-05, "loss": 6.8135, "step": 770 }, { "epoch": 0.15, "grad_norm": 2.4390909671783447, "learning_rate": 1.9706380575945796e-05, "loss": 6.832, "step": 780 }, { "epoch": 0.15, "grad_norm": 2.387230157852173, "learning_rate": 1.9702616224355355e-05, "loss": 6.8272, "step": 790 }, { "epoch": 0.15, "grad_norm": 2.4431333541870117, "learning_rate": 1.969885187276492e-05, "loss": 6.8185, "step": 800 }, { "epoch": 0.15, "grad_norm": 2.154442548751831, "learning_rate": 1.9695087521174478e-05, "loss": 6.8085, "step": 810 }, { "epoch": 0.15, "grad_norm": 2.32527494430542, "learning_rate": 1.969132316958404e-05, "loss": 6.8038, "step": 820 }, { "epoch": 0.16, "grad_norm": 3.167388677597046, "learning_rate": 1.96875588179936e-05, "loss": 6.8152, "step": 830 }, { "epoch": 0.16, "grad_norm": 2.665239095687866, "learning_rate": 1.9683794466403164e-05, "loss": 6.822, "step": 840 }, { "epoch": 0.16, "grad_norm": 2.9431662559509277, "learning_rate": 1.9680030114812724e-05, "loss": 6.7848, "step": 850 }, { "epoch": 0.16, "grad_norm": 2.5422050952911377, "learning_rate": 1.9676265763222287e-05, 
"loss": 6.8207, "step": 860 }, { "epoch": 0.16, "grad_norm": 2.7234206199645996, "learning_rate": 1.9672501411631847e-05, "loss": 6.7918, "step": 870 }, { "epoch": 0.17, "grad_norm": 2.460327386856079, "learning_rate": 1.966873706004141e-05, "loss": 6.7891, "step": 880 }, { "epoch": 0.17, "grad_norm": 2.356071949005127, "learning_rate": 1.966497270845097e-05, "loss": 6.7794, "step": 890 }, { "epoch": 0.17, "grad_norm": 2.804115056991577, "learning_rate": 1.9661208356860533e-05, "loss": 6.7735, "step": 900 }, { "epoch": 0.17, "grad_norm": 2.534862995147705, "learning_rate": 1.9657444005270093e-05, "loss": 6.812, "step": 910 }, { "epoch": 0.17, "grad_norm": 2.8011670112609863, "learning_rate": 1.9653679653679656e-05, "loss": 6.7792, "step": 920 }, { "epoch": 0.18, "grad_norm": 2.5520925521850586, "learning_rate": 1.964991530208922e-05, "loss": 6.7953, "step": 930 }, { "epoch": 0.18, "grad_norm": 2.651599645614624, "learning_rate": 1.964615095049878e-05, "loss": 6.7898, "step": 940 }, { "epoch": 0.18, "grad_norm": 2.8219456672668457, "learning_rate": 1.9642386598908342e-05, "loss": 6.8101, "step": 950 }, { "epoch": 0.18, "grad_norm": 2.178567886352539, "learning_rate": 1.96386222473179e-05, "loss": 6.7726, "step": 960 }, { "epoch": 0.18, "grad_norm": 2.447699546813965, "learning_rate": 1.963485789572746e-05, "loss": 6.7634, "step": 970 }, { "epoch": 0.18, "grad_norm": 2.359414577484131, "learning_rate": 1.9631093544137025e-05, "loss": 6.7843, "step": 980 }, { "epoch": 0.19, "grad_norm": 2.6178600788116455, "learning_rate": 1.9627329192546585e-05, "loss": 6.7829, "step": 990 }, { "epoch": 0.19, "grad_norm": 2.5228044986724854, "learning_rate": 1.9623564840956148e-05, "loss": 6.7711, "step": 1000 }, { "epoch": 0.19, "grad_norm": 2.6705472469329834, "learning_rate": 1.9619800489365707e-05, "loss": 6.7738, "step": 1010 }, { "epoch": 0.19, "grad_norm": 2.5253331661224365, "learning_rate": 1.961603613777527e-05, "loss": 6.7957, "step": 1020 }, { "epoch": 0.19, "grad_norm": 2.3723368644714355, "learning_rate": 1.961227178618483e-05, "loss": 6.7988, "step": 1030 }, { "epoch": 0.2, "grad_norm": 2.6050617694854736, "learning_rate": 1.9608507434594394e-05, "loss": 6.7645, "step": 1040 }, { "epoch": 0.2, "grad_norm": 2.593855619430542, "learning_rate": 1.9604743083003953e-05, "loss": 6.783, "step": 1050 }, { "epoch": 0.2, "grad_norm": 2.58815336227417, "learning_rate": 1.9600978731413516e-05, "loss": 6.7452, "step": 1060 }, { "epoch": 0.2, "grad_norm": 2.7308313846588135, "learning_rate": 1.9597214379823076e-05, "loss": 6.7848, "step": 1070 }, { "epoch": 0.2, "grad_norm": 2.237525701522827, "learning_rate": 1.959345002823264e-05, "loss": 6.734, "step": 1080 }, { "epoch": 0.21, "grad_norm": 2.3565104007720947, "learning_rate": 1.95896856766422e-05, "loss": 6.738, "step": 1090 }, { "epoch": 0.21, "grad_norm": 2.3083043098449707, "learning_rate": 1.9585921325051762e-05, "loss": 6.7453, "step": 1100 }, { "epoch": 0.21, "grad_norm": 2.53562331199646, "learning_rate": 1.9582156973461322e-05, "loss": 6.755, "step": 1110 }, { "epoch": 0.21, "grad_norm": 2.6216201782226562, "learning_rate": 1.9578392621870885e-05, "loss": 6.7424, "step": 1120 }, { "epoch": 0.21, "grad_norm": 2.346785545349121, "learning_rate": 1.957462827028045e-05, "loss": 6.7221, "step": 1130 }, { "epoch": 0.21, "grad_norm": 3.705143690109253, "learning_rate": 1.9570863918690005e-05, "loss": 6.7499, "step": 1140 }, { "epoch": 0.22, "grad_norm": 2.6007704734802246, "learning_rate": 1.9567099567099568e-05, "loss": 6.772, "step": 1150 }, { 
"epoch": 0.22, "grad_norm": 2.545687198638916, "learning_rate": 1.956333521550913e-05, "loss": 6.7402, "step": 1160 }, { "epoch": 0.22, "grad_norm": 2.387261152267456, "learning_rate": 1.955957086391869e-05, "loss": 6.7351, "step": 1170 }, { "epoch": 0.22, "grad_norm": 2.6859683990478516, "learning_rate": 1.9555806512328254e-05, "loss": 6.7419, "step": 1180 }, { "epoch": 0.22, "grad_norm": 2.2952544689178467, "learning_rate": 1.9552042160737814e-05, "loss": 6.7231, "step": 1190 }, { "epoch": 0.23, "grad_norm": 2.5317373275756836, "learning_rate": 1.9548277809147377e-05, "loss": 6.7292, "step": 1200 }, { "epoch": 0.23, "grad_norm": 2.49037766456604, "learning_rate": 1.9544513457556937e-05, "loss": 6.7487, "step": 1210 }, { "epoch": 0.23, "grad_norm": 2.3043088912963867, "learning_rate": 1.95407491059665e-05, "loss": 6.7242, "step": 1220 }, { "epoch": 0.23, "grad_norm": 2.4029009342193604, "learning_rate": 1.953698475437606e-05, "loss": 6.6883, "step": 1230 }, { "epoch": 0.23, "grad_norm": 2.429788589477539, "learning_rate": 1.9533220402785623e-05, "loss": 6.7333, "step": 1240 }, { "epoch": 0.24, "grad_norm": 2.5161609649658203, "learning_rate": 1.9529456051195183e-05, "loss": 6.7212, "step": 1250 }, { "epoch": 0.24, "grad_norm": 2.518301486968994, "learning_rate": 1.9525691699604746e-05, "loss": 6.7362, "step": 1260 }, { "epoch": 0.24, "grad_norm": 2.562810182571411, "learning_rate": 1.9521927348014305e-05, "loss": 6.7174, "step": 1270 }, { "epoch": 0.24, "grad_norm": 2.6715238094329834, "learning_rate": 1.951816299642387e-05, "loss": 6.6963, "step": 1280 }, { "epoch": 0.24, "grad_norm": 2.386201858520508, "learning_rate": 1.951439864483343e-05, "loss": 6.7451, "step": 1290 }, { "epoch": 0.24, "grad_norm": 2.566657543182373, "learning_rate": 1.951063429324299e-05, "loss": 6.7035, "step": 1300 }, { "epoch": 0.25, "grad_norm": 2.326904773712158, "learning_rate": 1.950686994165255e-05, "loss": 6.6927, "step": 1310 }, { "epoch": 0.25, "grad_norm": 4.709331512451172, "learning_rate": 1.950310559006211e-05, "loss": 6.7281, "step": 1320 }, { "epoch": 0.25, "grad_norm": 2.4039790630340576, "learning_rate": 1.9499341238471674e-05, "loss": 6.7296, "step": 1330 }, { "epoch": 0.25, "grad_norm": 2.68048095703125, "learning_rate": 1.9495576886881234e-05, "loss": 6.7344, "step": 1340 }, { "epoch": 0.25, "grad_norm": 2.5811767578125, "learning_rate": 1.9491812535290797e-05, "loss": 6.7038, "step": 1350 }, { "epoch": 0.26, "grad_norm": 2.633669853210449, "learning_rate": 1.948804818370036e-05, "loss": 6.7223, "step": 1360 }, { "epoch": 0.26, "grad_norm": 2.6536636352539062, "learning_rate": 1.948428383210992e-05, "loss": 6.723, "step": 1370 }, { "epoch": 0.26, "grad_norm": 2.6891911029815674, "learning_rate": 1.9480519480519483e-05, "loss": 6.6934, "step": 1380 }, { "epoch": 0.26, "grad_norm": 2.5096209049224854, "learning_rate": 1.9476755128929043e-05, "loss": 6.7022, "step": 1390 }, { "epoch": 0.26, "grad_norm": 2.4939661026000977, "learning_rate": 1.9472990777338606e-05, "loss": 6.6692, "step": 1400 }, { "epoch": 0.27, "grad_norm": 2.3654654026031494, "learning_rate": 1.9469226425748166e-05, "loss": 6.6535, "step": 1410 }, { "epoch": 0.27, "grad_norm": 2.514317274093628, "learning_rate": 1.946546207415773e-05, "loss": 6.6607, "step": 1420 }, { "epoch": 0.27, "grad_norm": 3.223936080932617, "learning_rate": 1.946169772256729e-05, "loss": 6.6851, "step": 1430 }, { "epoch": 0.27, "grad_norm": 2.544851541519165, "learning_rate": 1.9457933370976852e-05, "loss": 6.7033, "step": 1440 }, { "epoch": 0.27, 
"grad_norm": 2.4377310276031494, "learning_rate": 1.9454169019386412e-05, "loss": 6.673, "step": 1450 }, { "epoch": 0.27, "grad_norm": 2.367894172668457, "learning_rate": 1.9450404667795975e-05, "loss": 6.6645, "step": 1460 }, { "epoch": 0.28, "grad_norm": 2.536773443222046, "learning_rate": 1.9446640316205535e-05, "loss": 6.697, "step": 1470 }, { "epoch": 0.28, "grad_norm": 2.5109031200408936, "learning_rate": 1.9442875964615098e-05, "loss": 6.6996, "step": 1480 }, { "epoch": 0.28, "grad_norm": 2.505375623703003, "learning_rate": 1.9439111613024658e-05, "loss": 6.6544, "step": 1490 }, { "epoch": 0.28, "grad_norm": 2.6280813217163086, "learning_rate": 1.9435347261434217e-05, "loss": 6.6834, "step": 1500 }, { "epoch": 0.28, "grad_norm": 2.5706870555877686, "learning_rate": 1.943158290984378e-05, "loss": 6.6702, "step": 1510 }, { "epoch": 0.29, "grad_norm": 2.6631226539611816, "learning_rate": 1.942781855825334e-05, "loss": 6.6669, "step": 1520 }, { "epoch": 0.29, "grad_norm": 2.719485282897949, "learning_rate": 1.9424054206662903e-05, "loss": 6.6705, "step": 1530 }, { "epoch": 0.29, "grad_norm": 2.376312494277954, "learning_rate": 1.9420289855072467e-05, "loss": 6.6924, "step": 1540 }, { "epoch": 0.29, "grad_norm": 2.7295444011688232, "learning_rate": 1.9416525503482026e-05, "loss": 6.6478, "step": 1550 }, { "epoch": 0.29, "grad_norm": 2.433988332748413, "learning_rate": 1.941276115189159e-05, "loss": 6.6599, "step": 1560 }, { "epoch": 0.3, "grad_norm": 2.5745954513549805, "learning_rate": 1.940899680030115e-05, "loss": 6.6439, "step": 1570 }, { "epoch": 0.3, "grad_norm": 2.6781184673309326, "learning_rate": 1.9405232448710712e-05, "loss": 6.6772, "step": 1580 }, { "epoch": 0.3, "grad_norm": 2.530686855316162, "learning_rate": 1.9401468097120272e-05, "loss": 6.6569, "step": 1590 }, { "epoch": 0.3, "grad_norm": 2.4142239093780518, "learning_rate": 1.9397703745529835e-05, "loss": 6.6709, "step": 1600 }, { "epoch": 0.3, "grad_norm": 2.6212003231048584, "learning_rate": 1.9393939393939395e-05, "loss": 6.6436, "step": 1610 }, { "epoch": 0.3, "grad_norm": 2.9554197788238525, "learning_rate": 1.9390175042348958e-05, "loss": 6.6499, "step": 1620 }, { "epoch": 0.31, "grad_norm": 2.4507064819335938, "learning_rate": 1.9386410690758518e-05, "loss": 6.6978, "step": 1630 }, { "epoch": 0.31, "grad_norm": 2.445267677307129, "learning_rate": 1.938264633916808e-05, "loss": 6.6614, "step": 1640 }, { "epoch": 0.31, "grad_norm": 2.496021270751953, "learning_rate": 1.937888198757764e-05, "loss": 6.6897, "step": 1650 }, { "epoch": 0.31, "grad_norm": 2.96243953704834, "learning_rate": 1.93751176359872e-05, "loss": 6.6677, "step": 1660 }, { "epoch": 0.31, "grad_norm": 3.0991013050079346, "learning_rate": 1.9371353284396764e-05, "loss": 6.6609, "step": 1670 }, { "epoch": 0.32, "grad_norm": 2.4630982875823975, "learning_rate": 1.9367588932806324e-05, "loss": 6.6538, "step": 1680 }, { "epoch": 0.32, "grad_norm": 4.999861717224121, "learning_rate": 1.9363824581215887e-05, "loss": 6.6487, "step": 1690 }, { "epoch": 0.32, "grad_norm": 3.728114128112793, "learning_rate": 1.9360060229625447e-05, "loss": 6.6326, "step": 1700 }, { "epoch": 0.32, "grad_norm": 3.2675485610961914, "learning_rate": 1.935629587803501e-05, "loss": 6.6779, "step": 1710 }, { "epoch": 0.32, "grad_norm": 2.7716498374938965, "learning_rate": 1.9352531526444573e-05, "loss": 6.6232, "step": 1720 }, { "epoch": 0.33, "grad_norm": 2.6670401096343994, "learning_rate": 1.9348767174854133e-05, "loss": 6.6341, "step": 1730 }, { "epoch": 0.33, "grad_norm": 
2.6808712482452393, "learning_rate": 1.9345002823263696e-05, "loss": 6.6232, "step": 1740 }, { "epoch": 0.33, "grad_norm": 2.6918118000030518, "learning_rate": 1.9341238471673256e-05, "loss": 6.6069, "step": 1750 }, { "epoch": 0.33, "grad_norm": 2.7721352577209473, "learning_rate": 1.933747412008282e-05, "loss": 6.646, "step": 1760 }, { "epoch": 0.33, "grad_norm": 2.5393922328948975, "learning_rate": 1.933370976849238e-05, "loss": 6.6356, "step": 1770 }, { "epoch": 0.34, "grad_norm": 2.383399724960327, "learning_rate": 1.932994541690194e-05, "loss": 6.6562, "step": 1780 }, { "epoch": 0.34, "grad_norm": 2.316410779953003, "learning_rate": 1.93261810653115e-05, "loss": 6.6265, "step": 1790 }, { "epoch": 0.34, "grad_norm": 2.5583908557891846, "learning_rate": 1.9322416713721065e-05, "loss": 6.6207, "step": 1800 }, { "epoch": 0.34, "grad_norm": 2.384917974472046, "learning_rate": 1.9318652362130624e-05, "loss": 6.5896, "step": 1810 }, { "epoch": 0.34, "grad_norm": 2.775904655456543, "learning_rate": 1.9314888010540187e-05, "loss": 6.6297, "step": 1820 }, { "epoch": 0.34, "grad_norm": 2.6162569522857666, "learning_rate": 1.9311123658949747e-05, "loss": 6.6549, "step": 1830 }, { "epoch": 0.35, "grad_norm": 2.812328338623047, "learning_rate": 1.9307359307359307e-05, "loss": 6.6034, "step": 1840 }, { "epoch": 0.35, "grad_norm": 2.5816731452941895, "learning_rate": 1.930359495576887e-05, "loss": 6.5898, "step": 1850 }, { "epoch": 0.35, "grad_norm": 2.612111806869507, "learning_rate": 1.929983060417843e-05, "loss": 6.5873, "step": 1860 }, { "epoch": 0.35, "grad_norm": 2.857194185256958, "learning_rate": 1.9296066252587993e-05, "loss": 6.634, "step": 1870 }, { "epoch": 0.35, "grad_norm": 2.609227418899536, "learning_rate": 1.9292301900997553e-05, "loss": 6.6045, "step": 1880 }, { "epoch": 0.36, "grad_norm": 2.5522749423980713, "learning_rate": 1.9288537549407116e-05, "loss": 6.5958, "step": 1890 }, { "epoch": 0.36, "grad_norm": 2.7084991931915283, "learning_rate": 1.9284773197816676e-05, "loss": 6.6409, "step": 1900 }, { "epoch": 0.36, "grad_norm": 2.7430155277252197, "learning_rate": 1.928100884622624e-05, "loss": 6.6072, "step": 1910 }, { "epoch": 0.36, "grad_norm": 2.854513168334961, "learning_rate": 1.9277244494635802e-05, "loss": 6.6201, "step": 1920 }, { "epoch": 0.36, "grad_norm": 2.921213388442993, "learning_rate": 1.9273480143045362e-05, "loss": 6.569, "step": 1930 }, { "epoch": 0.37, "grad_norm": 2.3851304054260254, "learning_rate": 1.9269715791454925e-05, "loss": 6.5777, "step": 1940 }, { "epoch": 0.37, "grad_norm": 2.651808023452759, "learning_rate": 1.9265951439864485e-05, "loss": 6.6101, "step": 1950 }, { "epoch": 0.37, "grad_norm": 3.4343347549438477, "learning_rate": 1.9262187088274048e-05, "loss": 6.5711, "step": 1960 }, { "epoch": 0.37, "grad_norm": 3.198838472366333, "learning_rate": 1.9258422736683608e-05, "loss": 6.605, "step": 1970 }, { "epoch": 0.37, "grad_norm": 2.650223731994629, "learning_rate": 1.925465838509317e-05, "loss": 6.5864, "step": 1980 }, { "epoch": 0.37, "grad_norm": 2.8101279735565186, "learning_rate": 1.925089403350273e-05, "loss": 6.5997, "step": 1990 }, { "epoch": 0.38, "grad_norm": 2.7880499362945557, "learning_rate": 1.9247129681912294e-05, "loss": 6.5786, "step": 2000 }, { "epoch": 0.38, "grad_norm": 5.192387580871582, "learning_rate": 1.9243365330321854e-05, "loss": 6.5334, "step": 2010 }, { "epoch": 0.38, "grad_norm": 2.7987234592437744, "learning_rate": 1.9239600978731413e-05, "loss": 6.5991, "step": 2020 }, { "epoch": 0.38, "grad_norm": 
2.6219470500946045, "learning_rate": 1.9235836627140976e-05, "loss": 6.5655, "step": 2030 }, { "epoch": 0.38, "grad_norm": 2.54457950592041, "learning_rate": 1.9232072275550536e-05, "loss": 6.5291, "step": 2040 }, { "epoch": 0.39, "grad_norm": 3.2591652870178223, "learning_rate": 1.92283079239601e-05, "loss": 6.5531, "step": 2050 }, { "epoch": 0.39, "grad_norm": 2.890012502670288, "learning_rate": 1.922454357236966e-05, "loss": 6.5312, "step": 2060 }, { "epoch": 0.39, "grad_norm": 3.0817246437072754, "learning_rate": 1.9220779220779222e-05, "loss": 6.5529, "step": 2070 }, { "epoch": 0.39, "grad_norm": 2.7113771438598633, "learning_rate": 1.9217014869188782e-05, "loss": 6.5886, "step": 2080 }, { "epoch": 0.39, "grad_norm": 3.6894679069519043, "learning_rate": 1.9213250517598345e-05, "loss": 6.5491, "step": 2090 }, { "epoch": 0.4, "grad_norm": 2.8171944618225098, "learning_rate": 1.920948616600791e-05, "loss": 6.5447, "step": 2100 }, { "epoch": 0.4, "grad_norm": 2.6681368350982666, "learning_rate": 1.9205721814417468e-05, "loss": 6.5313, "step": 2110 }, { "epoch": 0.4, "grad_norm": 2.6559665203094482, "learning_rate": 1.920195746282703e-05, "loss": 6.5535, "step": 2120 }, { "epoch": 0.4, "grad_norm": 2.9616358280181885, "learning_rate": 1.919819311123659e-05, "loss": 6.5102, "step": 2130 }, { "epoch": 0.4, "grad_norm": 2.5124080181121826, "learning_rate": 1.9194428759646154e-05, "loss": 6.5601, "step": 2140 }, { "epoch": 0.4, "grad_norm": 2.563617467880249, "learning_rate": 1.9190664408055714e-05, "loss": 6.5269, "step": 2150 }, { "epoch": 0.41, "grad_norm": 2.6807825565338135, "learning_rate": 1.9186900056465277e-05, "loss": 6.5438, "step": 2160 }, { "epoch": 0.41, "grad_norm": 2.8440377712249756, "learning_rate": 1.9183135704874837e-05, "loss": 6.5231, "step": 2170 }, { "epoch": 0.41, "grad_norm": 2.6473491191864014, "learning_rate": 1.9179371353284397e-05, "loss": 6.5444, "step": 2180 }, { "epoch": 0.41, "grad_norm": 2.6953611373901367, "learning_rate": 1.917560700169396e-05, "loss": 6.5357, "step": 2190 }, { "epoch": 0.41, "grad_norm": 2.747603178024292, "learning_rate": 1.917184265010352e-05, "loss": 6.5937, "step": 2200 }, { "epoch": 0.42, "grad_norm": 2.5047335624694824, "learning_rate": 1.9168078298513083e-05, "loss": 6.5503, "step": 2210 }, { "epoch": 0.42, "grad_norm": 2.302786350250244, "learning_rate": 1.9164313946922643e-05, "loss": 6.5099, "step": 2220 }, { "epoch": 0.42, "grad_norm": 2.784923791885376, "learning_rate": 1.9160549595332206e-05, "loss": 6.5088, "step": 2230 }, { "epoch": 0.42, "grad_norm": 2.428138494491577, "learning_rate": 1.9156785243741765e-05, "loss": 6.5193, "step": 2240 }, { "epoch": 0.42, "grad_norm": 2.6376843452453613, "learning_rate": 1.915302089215133e-05, "loss": 6.5036, "step": 2250 }, { "epoch": 0.43, "grad_norm": 2.7962093353271484, "learning_rate": 1.914925654056089e-05, "loss": 6.5367, "step": 2260 }, { "epoch": 0.43, "grad_norm": 2.7732503414154053, "learning_rate": 1.914549218897045e-05, "loss": 6.5306, "step": 2270 }, { "epoch": 0.43, "grad_norm": 2.6896862983703613, "learning_rate": 1.9141727837380015e-05, "loss": 6.4973, "step": 2280 }, { "epoch": 0.43, "grad_norm": 2.9891698360443115, "learning_rate": 1.9137963485789574e-05, "loss": 6.5318, "step": 2290 }, { "epoch": 0.43, "grad_norm": 2.754216432571411, "learning_rate": 1.9134199134199138e-05, "loss": 6.5505, "step": 2300 }, { "epoch": 0.43, "grad_norm": 3.05325984954834, "learning_rate": 1.9130434782608697e-05, "loss": 6.5059, "step": 2310 }, { "epoch": 0.44, "grad_norm": 
2.4625449180603027, "learning_rate": 1.912667043101826e-05, "loss": 6.5033, "step": 2320 }, { "epoch": 0.44, "grad_norm": 2.476813316345215, "learning_rate": 1.912290607942782e-05, "loss": 6.518, "step": 2330 }, { "epoch": 0.44, "grad_norm": 2.5328636169433594, "learning_rate": 1.9119141727837383e-05, "loss": 6.5007, "step": 2340 }, { "epoch": 0.44, "grad_norm": 2.9487144947052, "learning_rate": 1.9115377376246943e-05, "loss": 6.4511, "step": 2350 }, { "epoch": 0.44, "grad_norm": 2.6691248416900635, "learning_rate": 1.9111613024656503e-05, "loss": 6.4731, "step": 2360 }, { "epoch": 0.45, "grad_norm": 7.700212478637695, "learning_rate": 1.9107848673066066e-05, "loss": 6.4794, "step": 2370 }, { "epoch": 0.45, "grad_norm": 2.5885720252990723, "learning_rate": 1.9104084321475626e-05, "loss": 6.4509, "step": 2380 }, { "epoch": 0.45, "grad_norm": 2.7361390590667725, "learning_rate": 1.910031996988519e-05, "loss": 6.4764, "step": 2390 }, { "epoch": 0.45, "grad_norm": 2.933286428451538, "learning_rate": 1.909655561829475e-05, "loss": 6.4793, "step": 2400 }, { "epoch": 0.45, "grad_norm": 2.748516798019409, "learning_rate": 1.9092791266704312e-05, "loss": 6.4787, "step": 2410 }, { "epoch": 0.46, "grad_norm": 3.037767171859741, "learning_rate": 1.9089026915113872e-05, "loss": 6.4818, "step": 2420 }, { "epoch": 0.46, "grad_norm": 2.5165228843688965, "learning_rate": 1.9085262563523435e-05, "loss": 6.4875, "step": 2430 }, { "epoch": 0.46, "grad_norm": 2.8557047843933105, "learning_rate": 1.9081498211932995e-05, "loss": 6.466, "step": 2440 }, { "epoch": 0.46, "grad_norm": 2.8542356491088867, "learning_rate": 1.9077733860342558e-05, "loss": 6.4936, "step": 2450 }, { "epoch": 0.46, "grad_norm": 3.3220980167388916, "learning_rate": 1.907396950875212e-05, "loss": 6.4984, "step": 2460 }, { "epoch": 0.46, "grad_norm": 2.9888689517974854, "learning_rate": 1.907020515716168e-05, "loss": 6.4775, "step": 2470 }, { "epoch": 0.47, "grad_norm": 2.6073288917541504, "learning_rate": 1.9066440805571244e-05, "loss": 6.4742, "step": 2480 }, { "epoch": 0.47, "grad_norm": 5.938607215881348, "learning_rate": 1.9062676453980804e-05, "loss": 6.4564, "step": 2490 }, { "epoch": 0.47, "grad_norm": 2.9256367683410645, "learning_rate": 1.9058912102390367e-05, "loss": 6.4945, "step": 2500 }, { "epoch": 0.47, "grad_norm": 2.7962679862976074, "learning_rate": 1.9055147750799927e-05, "loss": 6.4731, "step": 2510 }, { "epoch": 0.47, "grad_norm": 2.9641900062561035, "learning_rate": 1.905138339920949e-05, "loss": 6.4545, "step": 2520 }, { "epoch": 0.48, "grad_norm": 2.543147325515747, "learning_rate": 1.904761904761905e-05, "loss": 6.4805, "step": 2530 }, { "epoch": 0.48, "grad_norm": 2.8278234004974365, "learning_rate": 1.904385469602861e-05, "loss": 6.4229, "step": 2540 }, { "epoch": 0.48, "grad_norm": 2.62637996673584, "learning_rate": 1.9040090344438172e-05, "loss": 6.4725, "step": 2550 }, { "epoch": 0.48, "grad_norm": 3.3434314727783203, "learning_rate": 1.9036325992847732e-05, "loss": 6.4764, "step": 2560 }, { "epoch": 0.48, "grad_norm": 3.443331003189087, "learning_rate": 1.9032561641257295e-05, "loss": 6.4635, "step": 2570 }, { "epoch": 0.49, "grad_norm": 3.297346591949463, "learning_rate": 1.9028797289666855e-05, "loss": 6.4628, "step": 2580 }, { "epoch": 0.49, "grad_norm": 2.6333324909210205, "learning_rate": 1.9025032938076418e-05, "loss": 6.4329, "step": 2590 }, { "epoch": 0.49, "grad_norm": 2.846392869949341, "learning_rate": 1.9021268586485978e-05, "loss": 6.4663, "step": 2600 }, { "epoch": 0.49, "grad_norm": 
2.9317896366119385, "learning_rate": 1.901750423489554e-05, "loss": 6.4311, "step": 2610 }, { "epoch": 0.49, "grad_norm": 2.694610118865967, "learning_rate": 1.90137398833051e-05, "loss": 6.4356, "step": 2620 }, { "epoch": 0.5, "grad_norm": 2.8555214405059814, "learning_rate": 1.9009975531714664e-05, "loss": 6.4313, "step": 2630 }, { "epoch": 0.5, "grad_norm": 2.9706711769104004, "learning_rate": 1.9006211180124224e-05, "loss": 6.4394, "step": 2640 }, { "epoch": 0.5, "grad_norm": 2.759869337081909, "learning_rate": 1.9002446828533787e-05, "loss": 6.4559, "step": 2650 }, { "epoch": 0.5, "grad_norm": 2.997725486755371, "learning_rate": 1.899868247694335e-05, "loss": 6.432, "step": 2660 }, { "epoch": 0.5, "grad_norm": 2.6816163063049316, "learning_rate": 1.899491812535291e-05, "loss": 6.4287, "step": 2670 }, { "epoch": 0.5, "grad_norm": 2.724766254425049, "learning_rate": 1.8991153773762473e-05, "loss": 6.4197, "step": 2680 }, { "epoch": 0.51, "grad_norm": 2.632607936859131, "learning_rate": 1.8987389422172033e-05, "loss": 6.4284, "step": 2690 }, { "epoch": 0.51, "grad_norm": 3.4783926010131836, "learning_rate": 1.8983625070581596e-05, "loss": 6.388, "step": 2700 }, { "epoch": 0.51, "grad_norm": 2.8150532245635986, "learning_rate": 1.8979860718991156e-05, "loss": 6.484, "step": 2710 }, { "epoch": 0.51, "grad_norm": 2.8433399200439453, "learning_rate": 1.8976096367400716e-05, "loss": 6.4574, "step": 2720 }, { "epoch": 0.51, "grad_norm": 2.861065149307251, "learning_rate": 1.897233201581028e-05, "loss": 6.3698, "step": 2730 }, { "epoch": 0.52, "grad_norm": 2.9589295387268066, "learning_rate": 1.896856766421984e-05, "loss": 6.4128, "step": 2740 }, { "epoch": 0.52, "grad_norm": 2.7081398963928223, "learning_rate": 1.89648033126294e-05, "loss": 6.4447, "step": 2750 }, { "epoch": 0.52, "grad_norm": 2.9661500453948975, "learning_rate": 1.896103896103896e-05, "loss": 6.3831, "step": 2760 }, { "epoch": 0.52, "grad_norm": 2.954826593399048, "learning_rate": 1.8957274609448525e-05, "loss": 6.3772, "step": 2770 }, { "epoch": 0.52, "grad_norm": 2.599303722381592, "learning_rate": 1.8953510257858084e-05, "loss": 6.4018, "step": 2780 }, { "epoch": 0.53, "grad_norm": 2.9619429111480713, "learning_rate": 1.8949745906267647e-05, "loss": 6.3965, "step": 2790 }, { "epoch": 0.53, "grad_norm": 2.7658960819244385, "learning_rate": 1.8945981554677207e-05, "loss": 6.4177, "step": 2800 }, { "epoch": 0.53, "grad_norm": 2.748741388320923, "learning_rate": 1.894221720308677e-05, "loss": 6.4142, "step": 2810 }, { "epoch": 0.53, "grad_norm": 2.726864814758301, "learning_rate": 1.893845285149633e-05, "loss": 6.4, "step": 2820 }, { "epoch": 0.53, "grad_norm": 3.4584908485412598, "learning_rate": 1.8934688499905893e-05, "loss": 6.3808, "step": 2830 }, { "epoch": 0.53, "grad_norm": 3.009855270385742, "learning_rate": 1.8930924148315456e-05, "loss": 6.438, "step": 2840 }, { "epoch": 0.54, "grad_norm": 4.697122573852539, "learning_rate": 1.8927159796725016e-05, "loss": 6.3683, "step": 2850 }, { "epoch": 0.54, "grad_norm": 2.823795795440674, "learning_rate": 1.892339544513458e-05, "loss": 6.3994, "step": 2860 }, { "epoch": 0.54, "grad_norm": 2.7415943145751953, "learning_rate": 1.891963109354414e-05, "loss": 6.3479, "step": 2870 }, { "epoch": 0.54, "grad_norm": 2.8538568019866943, "learning_rate": 1.89158667419537e-05, "loss": 6.3714, "step": 2880 }, { "epoch": 0.54, "grad_norm": 2.636777639389038, "learning_rate": 1.8912102390363262e-05, "loss": 6.3828, "step": 2890 }, { "epoch": 0.55, "grad_norm": 2.5686187744140625, 
"learning_rate": 1.8908338038772822e-05, "loss": 6.3698, "step": 2900 }, { "epoch": 0.55, "grad_norm": 3.3633761405944824, "learning_rate": 1.8904573687182385e-05, "loss": 6.3656, "step": 2910 }, { "epoch": 0.55, "grad_norm": 2.680591106414795, "learning_rate": 1.8900809335591945e-05, "loss": 6.3156, "step": 2920 }, { "epoch": 0.55, "grad_norm": 4.2905802726745605, "learning_rate": 1.8897044984001508e-05, "loss": 6.3368, "step": 2930 }, { "epoch": 0.55, "grad_norm": 2.710627794265747, "learning_rate": 1.8893280632411068e-05, "loss": 6.3495, "step": 2940 }, { "epoch": 0.56, "grad_norm": 2.661747932434082, "learning_rate": 1.888951628082063e-05, "loss": 6.383, "step": 2950 }, { "epoch": 0.56, "grad_norm": 3.4706857204437256, "learning_rate": 1.888575192923019e-05, "loss": 6.3589, "step": 2960 }, { "epoch": 0.56, "grad_norm": 3.149345874786377, "learning_rate": 1.8881987577639754e-05, "loss": 6.3492, "step": 2970 }, { "epoch": 0.56, "grad_norm": 2.604423761367798, "learning_rate": 1.8878223226049314e-05, "loss": 6.3399, "step": 2980 }, { "epoch": 0.56, "grad_norm": 3.920194387435913, "learning_rate": 1.8874458874458877e-05, "loss": 6.4093, "step": 2990 }, { "epoch": 0.56, "grad_norm": 2.971879005432129, "learning_rate": 1.8870694522868436e-05, "loss": 6.4137, "step": 3000 }, { "epoch": 0.57, "grad_norm": 3.068359136581421, "learning_rate": 1.8866930171278e-05, "loss": 6.3586, "step": 3010 }, { "epoch": 0.57, "grad_norm": 3.062915563583374, "learning_rate": 1.8863165819687563e-05, "loss": 6.3375, "step": 3020 }, { "epoch": 0.57, "grad_norm": 2.8037359714508057, "learning_rate": 1.8859401468097123e-05, "loss": 6.2931, "step": 3030 }, { "epoch": 0.57, "grad_norm": 2.8892300128936768, "learning_rate": 1.8855637116506686e-05, "loss": 6.3763, "step": 3040 }, { "epoch": 0.57, "grad_norm": 3.103889226913452, "learning_rate": 1.8851872764916242e-05, "loss": 6.3472, "step": 3050 }, { "epoch": 0.58, "grad_norm": 2.9963624477386475, "learning_rate": 1.8848108413325805e-05, "loss": 6.3431, "step": 3060 }, { "epoch": 0.58, "grad_norm": 3.95845627784729, "learning_rate": 1.884434406173537e-05, "loss": 6.364, "step": 3070 }, { "epoch": 0.58, "grad_norm": 2.802729606628418, "learning_rate": 1.8840579710144928e-05, "loss": 6.3197, "step": 3080 }, { "epoch": 0.58, "grad_norm": 2.546173572540283, "learning_rate": 1.883681535855449e-05, "loss": 6.2756, "step": 3090 }, { "epoch": 0.58, "grad_norm": 2.989448308944702, "learning_rate": 1.883305100696405e-05, "loss": 6.3094, "step": 3100 }, { "epoch": 0.59, "grad_norm": 2.922934055328369, "learning_rate": 1.8829286655373614e-05, "loss": 6.3123, "step": 3110 }, { "epoch": 0.59, "grad_norm": 2.840175151824951, "learning_rate": 1.8825522303783174e-05, "loss": 6.2974, "step": 3120 }, { "epoch": 0.59, "grad_norm": 2.6850576400756836, "learning_rate": 1.8821757952192737e-05, "loss": 6.2945, "step": 3130 }, { "epoch": 0.59, "grad_norm": 2.828094959259033, "learning_rate": 1.8817993600602297e-05, "loss": 6.3545, "step": 3140 }, { "epoch": 0.59, "grad_norm": 3.4463613033294678, "learning_rate": 1.881422924901186e-05, "loss": 6.3237, "step": 3150 }, { "epoch": 0.59, "grad_norm": 2.932133674621582, "learning_rate": 1.881046489742142e-05, "loss": 6.3177, "step": 3160 }, { "epoch": 0.6, "grad_norm": 4.342816352844238, "learning_rate": 1.8806700545830983e-05, "loss": 6.3004, "step": 3170 }, { "epoch": 0.6, "grad_norm": 2.7983977794647217, "learning_rate": 1.8802936194240543e-05, "loss": 6.2909, "step": 3180 }, { "epoch": 0.6, "grad_norm": 2.860229015350342, "learning_rate": 
1.8799171842650106e-05, "loss": 6.2874, "step": 3190 }, { "epoch": 0.6, "grad_norm": 4.009525775909424, "learning_rate": 1.8795407491059666e-05, "loss": 6.2494, "step": 3200 }, { "epoch": 0.6, "grad_norm": 3.5227296352386475, "learning_rate": 1.879164313946923e-05, "loss": 6.2814, "step": 3210 }, { "epoch": 0.61, "grad_norm": 2.8381760120391846, "learning_rate": 1.8787878787878792e-05, "loss": 6.2733, "step": 3220 }, { "epoch": 0.61, "grad_norm": 2.767287015914917, "learning_rate": 1.878411443628835e-05, "loss": 6.3207, "step": 3230 }, { "epoch": 0.61, "grad_norm": 3.097081184387207, "learning_rate": 1.878035008469791e-05, "loss": 6.3215, "step": 3240 }, { "epoch": 0.61, "grad_norm": 2.896815061569214, "learning_rate": 1.8776585733107475e-05, "loss": 6.318, "step": 3250 }, { "epoch": 0.61, "grad_norm": 2.8316450119018555, "learning_rate": 1.8772821381517034e-05, "loss": 6.2586, "step": 3260 }, { "epoch": 0.62, "grad_norm": 3.387800693511963, "learning_rate": 1.8769057029926598e-05, "loss": 6.2708, "step": 3270 }, { "epoch": 0.62, "grad_norm": 2.5523407459259033, "learning_rate": 1.8765292678336157e-05, "loss": 6.3111, "step": 3280 }, { "epoch": 0.62, "grad_norm": 2.8760297298431396, "learning_rate": 1.876152832674572e-05, "loss": 6.2953, "step": 3290 }, { "epoch": 0.62, "grad_norm": 3.449699640274048, "learning_rate": 1.875776397515528e-05, "loss": 6.2473, "step": 3300 }, { "epoch": 0.62, "grad_norm": 2.9004993438720703, "learning_rate": 1.8753999623564843e-05, "loss": 6.2706, "step": 3310 }, { "epoch": 0.62, "grad_norm": 2.845015048980713, "learning_rate": 1.8750235271974403e-05, "loss": 6.2894, "step": 3320 }, { "epoch": 0.63, "grad_norm": 3.1126046180725098, "learning_rate": 1.8746470920383966e-05, "loss": 6.2885, "step": 3330 }, { "epoch": 0.63, "grad_norm": 3.0513312816619873, "learning_rate": 1.8742706568793526e-05, "loss": 6.321, "step": 3340 }, { "epoch": 0.63, "grad_norm": 2.940154790878296, "learning_rate": 1.873894221720309e-05, "loss": 6.2449, "step": 3350 }, { "epoch": 0.63, "grad_norm": 2.721733570098877, "learning_rate": 1.873517786561265e-05, "loss": 6.2729, "step": 3360 }, { "epoch": 0.63, "grad_norm": 2.845959186553955, "learning_rate": 1.8731413514022212e-05, "loss": 6.2728, "step": 3370 }, { "epoch": 0.64, "grad_norm": 3.2412092685699463, "learning_rate": 1.8727649162431772e-05, "loss": 6.2788, "step": 3380 }, { "epoch": 0.64, "grad_norm": 2.820462226867676, "learning_rate": 1.8723884810841335e-05, "loss": 6.2737, "step": 3390 }, { "epoch": 0.64, "grad_norm": 4.037824630737305, "learning_rate": 1.8720120459250895e-05, "loss": 6.3159, "step": 3400 }, { "epoch": 0.64, "grad_norm": 2.8224730491638184, "learning_rate": 1.8716356107660455e-05, "loss": 6.2822, "step": 3410 }, { "epoch": 0.64, "grad_norm": 3.047783374786377, "learning_rate": 1.8712591756070018e-05, "loss": 6.2788, "step": 3420 }, { "epoch": 0.65, "grad_norm": 4.59201717376709, "learning_rate": 1.8708827404479578e-05, "loss": 6.2627, "step": 3430 }, { "epoch": 0.65, "grad_norm": 3.200969934463501, "learning_rate": 1.870506305288914e-05, "loss": 6.2382, "step": 3440 }, { "epoch": 0.65, "grad_norm": 2.937462568283081, "learning_rate": 1.8701298701298704e-05, "loss": 6.2336, "step": 3450 }, { "epoch": 0.65, "grad_norm": 3.2794556617736816, "learning_rate": 1.8697534349708264e-05, "loss": 6.261, "step": 3460 }, { "epoch": 0.65, "grad_norm": 3.147082567214966, "learning_rate": 1.8693769998117827e-05, "loss": 6.2726, "step": 3470 }, { "epoch": 0.65, "grad_norm": 2.9619040489196777, "learning_rate": 
1.8690005646527387e-05, "loss": 6.2739, "step": 3480 }, { "epoch": 0.66, "grad_norm": 2.903965473175049, "learning_rate": 1.868624129493695e-05, "loss": 6.1975, "step": 3490 }, { "epoch": 0.66, "grad_norm": 2.762101173400879, "learning_rate": 1.868247694334651e-05, "loss": 6.2015, "step": 3500 }, { "epoch": 0.66, "grad_norm": 3.481088161468506, "learning_rate": 1.8678712591756073e-05, "loss": 6.2692, "step": 3510 }, { "epoch": 0.66, "grad_norm": 2.820317268371582, "learning_rate": 1.8674948240165632e-05, "loss": 6.2115, "step": 3520 }, { "epoch": 0.66, "grad_norm": 3.554399013519287, "learning_rate": 1.8671183888575196e-05, "loss": 6.2303, "step": 3530 }, { "epoch": 0.67, "grad_norm": 2.9928488731384277, "learning_rate": 1.8667419536984755e-05, "loss": 6.1882, "step": 3540 }, { "epoch": 0.67, "grad_norm": 2.827382802963257, "learning_rate": 1.866365518539432e-05, "loss": 6.2007, "step": 3550 }, { "epoch": 0.67, "grad_norm": 2.7333712577819824, "learning_rate": 1.8659890833803878e-05, "loss": 6.1857, "step": 3560 }, { "epoch": 0.67, "grad_norm": 3.43860125541687, "learning_rate": 1.865612648221344e-05, "loss": 6.2069, "step": 3570 }, { "epoch": 0.67, "grad_norm": 3.0302352905273438, "learning_rate": 1.8652362130623e-05, "loss": 6.2242, "step": 3580 }, { "epoch": 0.68, "grad_norm": 2.9700002670288086, "learning_rate": 1.864859777903256e-05, "loss": 6.1983, "step": 3590 }, { "epoch": 0.68, "grad_norm": 3.0191869735717773, "learning_rate": 1.8644833427442124e-05, "loss": 6.2173, "step": 3600 }, { "epoch": 0.68, "grad_norm": 2.7090609073638916, "learning_rate": 1.8641069075851684e-05, "loss": 6.1946, "step": 3610 }, { "epoch": 0.68, "grad_norm": 3.020233631134033, "learning_rate": 1.8637304724261247e-05, "loss": 6.2582, "step": 3620 }, { "epoch": 0.68, "grad_norm": 3.4682936668395996, "learning_rate": 1.863354037267081e-05, "loss": 6.2268, "step": 3630 }, { "epoch": 0.69, "grad_norm": 3.124009609222412, "learning_rate": 1.862977602108037e-05, "loss": 6.2227, "step": 3640 }, { "epoch": 0.69, "grad_norm": 3.1946659088134766, "learning_rate": 1.8626011669489933e-05, "loss": 6.2156, "step": 3650 }, { "epoch": 0.69, "grad_norm": 3.2658441066741943, "learning_rate": 1.8622247317899493e-05, "loss": 6.1946, "step": 3660 }, { "epoch": 0.69, "grad_norm": 3.424638032913208, "learning_rate": 1.8618482966309056e-05, "loss": 6.1919, "step": 3670 }, { "epoch": 0.69, "grad_norm": 3.3747997283935547, "learning_rate": 1.8614718614718616e-05, "loss": 6.2506, "step": 3680 }, { "epoch": 0.69, "grad_norm": 2.9038097858428955, "learning_rate": 1.861095426312818e-05, "loss": 6.1838, "step": 3690 }, { "epoch": 0.7, "grad_norm": 2.6961169242858887, "learning_rate": 1.860718991153774e-05, "loss": 6.1785, "step": 3700 }, { "epoch": 0.7, "grad_norm": 3.13710355758667, "learning_rate": 1.8603425559947302e-05, "loss": 6.1791, "step": 3710 }, { "epoch": 0.7, "grad_norm": 2.912348508834839, "learning_rate": 1.859966120835686e-05, "loss": 6.2164, "step": 3720 }, { "epoch": 0.7, "grad_norm": 2.874903917312622, "learning_rate": 1.8595896856766425e-05, "loss": 6.1655, "step": 3730 }, { "epoch": 0.7, "grad_norm": 5.829525947570801, "learning_rate": 1.8592132505175985e-05, "loss": 6.2047, "step": 3740 }, { "epoch": 0.71, "grad_norm": 3.880034923553467, "learning_rate": 1.8588368153585544e-05, "loss": 6.2151, "step": 3750 }, { "epoch": 0.71, "grad_norm": 3.412844181060791, "learning_rate": 1.8584603801995107e-05, "loss": 6.2034, "step": 3760 }, { "epoch": 0.71, "grad_norm": 2.9908251762390137, "learning_rate": 1.8580839450404667e-05, 
"loss": 6.1605, "step": 3770 }, { "epoch": 0.71, "grad_norm": 3.2586495876312256, "learning_rate": 1.857707509881423e-05, "loss": 6.145, "step": 3780 }, { "epoch": 0.71, "grad_norm": 2.735668659210205, "learning_rate": 1.857331074722379e-05, "loss": 6.1659, "step": 3790 }, { "epoch": 0.72, "grad_norm": 3.6149892807006836, "learning_rate": 1.8569546395633353e-05, "loss": 6.1618, "step": 3800 }, { "epoch": 0.72, "grad_norm": 3.5494861602783203, "learning_rate": 1.8565782044042916e-05, "loss": 6.1412, "step": 3810 }, { "epoch": 0.72, "grad_norm": 3.845383644104004, "learning_rate": 1.8562017692452476e-05, "loss": 6.1147, "step": 3820 }, { "epoch": 0.72, "grad_norm": 2.857085943222046, "learning_rate": 1.855825334086204e-05, "loss": 6.2246, "step": 3830 }, { "epoch": 0.72, "grad_norm": 3.4632909297943115, "learning_rate": 1.85544889892716e-05, "loss": 6.1731, "step": 3840 }, { "epoch": 0.72, "grad_norm": 3.133561611175537, "learning_rate": 1.8550724637681162e-05, "loss": 6.1988, "step": 3850 }, { "epoch": 0.73, "grad_norm": 3.460613250732422, "learning_rate": 1.8546960286090722e-05, "loss": 6.1348, "step": 3860 }, { "epoch": 0.73, "grad_norm": 3.0388598442077637, "learning_rate": 1.8543195934500285e-05, "loss": 6.1054, "step": 3870 }, { "epoch": 0.73, "grad_norm": 3.104234218597412, "learning_rate": 1.8539431582909845e-05, "loss": 6.1595, "step": 3880 }, { "epoch": 0.73, "grad_norm": 3.1962454319000244, "learning_rate": 1.8535667231319408e-05, "loss": 6.1496, "step": 3890 }, { "epoch": 0.73, "grad_norm": 3.1930150985717773, "learning_rate": 1.8531902879728968e-05, "loss": 6.2137, "step": 3900 }, { "epoch": 0.74, "grad_norm": 2.9409141540527344, "learning_rate": 1.852813852813853e-05, "loss": 6.1862, "step": 3910 }, { "epoch": 0.74, "grad_norm": 3.0037753582000732, "learning_rate": 1.852437417654809e-05, "loss": 6.2242, "step": 3920 }, { "epoch": 0.74, "grad_norm": 5.536525249481201, "learning_rate": 1.852060982495765e-05, "loss": 6.1609, "step": 3930 }, { "epoch": 0.74, "grad_norm": 2.939833164215088, "learning_rate": 1.8516845473367214e-05, "loss": 6.1345, "step": 3940 }, { "epoch": 0.74, "grad_norm": 4.231409549713135, "learning_rate": 1.8513081121776774e-05, "loss": 6.2041, "step": 3950 }, { "epoch": 0.75, "grad_norm": 2.816448211669922, "learning_rate": 1.8509316770186337e-05, "loss": 6.186, "step": 3960 }, { "epoch": 0.75, "grad_norm": 2.9809634685516357, "learning_rate": 1.8505552418595896e-05, "loss": 6.1059, "step": 3970 }, { "epoch": 0.75, "grad_norm": 3.1738669872283936, "learning_rate": 1.850178806700546e-05, "loss": 6.096, "step": 3980 }, { "epoch": 0.75, "grad_norm": 2.7718279361724854, "learning_rate": 1.849802371541502e-05, "loss": 6.1687, "step": 3990 }, { "epoch": 0.75, "grad_norm": 2.723320484161377, "learning_rate": 1.8494259363824583e-05, "loss": 6.0731, "step": 4000 }, { "epoch": 0.75, "grad_norm": 3.0385329723358154, "learning_rate": 1.8490495012234146e-05, "loss": 6.1163, "step": 4010 }, { "epoch": 0.76, "grad_norm": 2.9319238662719727, "learning_rate": 1.8486730660643705e-05, "loss": 6.1294, "step": 4020 }, { "epoch": 0.76, "grad_norm": 2.8837320804595947, "learning_rate": 1.848296630905327e-05, "loss": 6.1649, "step": 4030 }, { "epoch": 0.76, "grad_norm": 3.0900797843933105, "learning_rate": 1.847920195746283e-05, "loss": 6.0922, "step": 4040 }, { "epoch": 0.76, "grad_norm": 2.947920083999634, "learning_rate": 1.847543760587239e-05, "loss": 6.0779, "step": 4050 }, { "epoch": 0.76, "grad_norm": 3.1584599018096924, "learning_rate": 1.847167325428195e-05, "loss": 6.0398, 
"step": 4060 }, { "epoch": 0.77, "grad_norm": 3.380052328109741, "learning_rate": 1.8467908902691514e-05, "loss": 6.0577, "step": 4070 }, { "epoch": 0.77, "grad_norm": 3.2318527698516846, "learning_rate": 1.8464144551101074e-05, "loss": 6.1119, "step": 4080 }, { "epoch": 0.77, "grad_norm": 2.8217580318450928, "learning_rate": 1.8460380199510637e-05, "loss": 6.0211, "step": 4090 }, { "epoch": 0.77, "grad_norm": 3.678891181945801, "learning_rate": 1.8456615847920197e-05, "loss": 6.1347, "step": 4100 }, { "epoch": 0.77, "grad_norm": 4.879661560058594, "learning_rate": 1.8452851496329757e-05, "loss": 6.1032, "step": 4110 }, { "epoch": 0.78, "grad_norm": 3.0156853199005127, "learning_rate": 1.844908714473932e-05, "loss": 6.089, "step": 4120 }, { "epoch": 0.78, "grad_norm": 3.1185648441314697, "learning_rate": 1.844532279314888e-05, "loss": 6.1102, "step": 4130 }, { "epoch": 0.78, "grad_norm": 3.027703285217285, "learning_rate": 1.8441558441558443e-05, "loss": 6.0028, "step": 4140 }, { "epoch": 0.78, "grad_norm": 2.8133630752563477, "learning_rate": 1.8437794089968003e-05, "loss": 6.0709, "step": 4150 }, { "epoch": 0.78, "grad_norm": 3.2893717288970947, "learning_rate": 1.8434029738377566e-05, "loss": 6.0206, "step": 4160 }, { "epoch": 0.78, "grad_norm": 3.675004720687866, "learning_rate": 1.8430265386787126e-05, "loss": 6.0798, "step": 4170 }, { "epoch": 0.79, "grad_norm": 3.5279109477996826, "learning_rate": 1.842650103519669e-05, "loss": 6.1258, "step": 4180 }, { "epoch": 0.79, "grad_norm": 3.764820098876953, "learning_rate": 1.8422736683606252e-05, "loss": 6.121, "step": 4190 }, { "epoch": 0.79, "grad_norm": 2.7705323696136475, "learning_rate": 1.8418972332015812e-05, "loss": 6.0128, "step": 4200 }, { "epoch": 0.79, "grad_norm": 3.6026437282562256, "learning_rate": 1.8415207980425375e-05, "loss": 6.1128, "step": 4210 }, { "epoch": 0.79, "grad_norm": 2.9786674976348877, "learning_rate": 1.8411443628834935e-05, "loss": 6.089, "step": 4220 }, { "epoch": 0.8, "grad_norm": 2.854918956756592, "learning_rate": 1.8407679277244498e-05, "loss": 6.0533, "step": 4230 }, { "epoch": 0.8, "grad_norm": 4.0675368309021, "learning_rate": 1.8403914925654058e-05, "loss": 6.0496, "step": 4240 }, { "epoch": 0.8, "grad_norm": 2.8727331161499023, "learning_rate": 1.840015057406362e-05, "loss": 6.0167, "step": 4250 }, { "epoch": 0.8, "grad_norm": 3.4834461212158203, "learning_rate": 1.839638622247318e-05, "loss": 6.0888, "step": 4260 }, { "epoch": 0.8, "grad_norm": 3.8864407539367676, "learning_rate": 1.839262187088274e-05, "loss": 6.1174, "step": 4270 }, { "epoch": 0.81, "grad_norm": 3.9190967082977295, "learning_rate": 1.8388857519292303e-05, "loss": 6.0399, "step": 4280 }, { "epoch": 0.81, "grad_norm": 3.3072915077209473, "learning_rate": 1.8385093167701863e-05, "loss": 6.0305, "step": 4290 }, { "epoch": 0.81, "grad_norm": 3.320314884185791, "learning_rate": 1.8381328816111426e-05, "loss": 6.092, "step": 4300 }, { "epoch": 0.81, "grad_norm": 2.976619243621826, "learning_rate": 1.8377564464520986e-05, "loss": 6.0663, "step": 4310 }, { "epoch": 0.81, "grad_norm": 3.934274435043335, "learning_rate": 1.837380011293055e-05, "loss": 6.0635, "step": 4320 }, { "epoch": 0.81, "grad_norm": 3.0798866748809814, "learning_rate": 1.837003576134011e-05, "loss": 6.0374, "step": 4330 }, { "epoch": 0.82, "grad_norm": 2.866913080215454, "learning_rate": 1.8366271409749672e-05, "loss": 6.0333, "step": 4340 }, { "epoch": 0.82, "grad_norm": 3.7947776317596436, "learning_rate": 1.8362507058159232e-05, "loss": 5.9892, "step": 4350 }, { 
"epoch": 0.82, "grad_norm": 3.1060264110565186, "learning_rate": 1.8358742706568795e-05, "loss": 6.0328, "step": 4360 }, { "epoch": 0.82, "grad_norm": 2.983135461807251, "learning_rate": 1.8354978354978358e-05, "loss": 5.9976, "step": 4370 }, { "epoch": 0.82, "grad_norm": 3.6576428413391113, "learning_rate": 1.8351214003387918e-05, "loss": 5.9754, "step": 4380 }, { "epoch": 0.83, "grad_norm": 3.5233638286590576, "learning_rate": 1.834744965179748e-05, "loss": 6.0078, "step": 4390 }, { "epoch": 0.83, "grad_norm": 3.1347076892852783, "learning_rate": 1.834368530020704e-05, "loss": 6.073, "step": 4400 }, { "epoch": 0.83, "grad_norm": 3.4067604541778564, "learning_rate": 1.8339920948616604e-05, "loss": 6.0079, "step": 4410 }, { "epoch": 0.83, "grad_norm": 3.476513147354126, "learning_rate": 1.8336156597026164e-05, "loss": 6.0346, "step": 4420 }, { "epoch": 0.83, "grad_norm": 3.078284978866577, "learning_rate": 1.8332392245435727e-05, "loss": 5.9707, "step": 4430 }, { "epoch": 0.84, "grad_norm": 3.684180736541748, "learning_rate": 1.8328627893845287e-05, "loss": 6.0228, "step": 4440 }, { "epoch": 0.84, "grad_norm": 3.11917781829834, "learning_rate": 1.8324863542254847e-05, "loss": 5.9767, "step": 4450 }, { "epoch": 0.84, "grad_norm": 3.0451982021331787, "learning_rate": 1.832109919066441e-05, "loss": 5.9699, "step": 4460 }, { "epoch": 0.84, "grad_norm": 3.294008255004883, "learning_rate": 1.831733483907397e-05, "loss": 6.0202, "step": 4470 }, { "epoch": 0.84, "grad_norm": 5.766932010650635, "learning_rate": 1.8313570487483533e-05, "loss": 6.0621, "step": 4480 }, { "epoch": 0.85, "grad_norm": 4.833611011505127, "learning_rate": 1.8309806135893092e-05, "loss": 6.0654, "step": 4490 }, { "epoch": 0.85, "grad_norm": 3.194051504135132, "learning_rate": 1.8306041784302656e-05, "loss": 5.9708, "step": 4500 }, { "epoch": 0.85, "grad_norm": 3.262328624725342, "learning_rate": 1.8302277432712215e-05, "loss": 6.0895, "step": 4510 }, { "epoch": 0.85, "grad_norm": 3.44919753074646, "learning_rate": 1.829851308112178e-05, "loss": 5.9948, "step": 4520 }, { "epoch": 0.85, "grad_norm": 3.91009783744812, "learning_rate": 1.8294748729531338e-05, "loss": 6.025, "step": 4530 }, { "epoch": 0.85, "grad_norm": 3.513437509536743, "learning_rate": 1.82909843779409e-05, "loss": 5.9898, "step": 4540 }, { "epoch": 0.86, "grad_norm": 3.095289468765259, "learning_rate": 1.8287220026350465e-05, "loss": 6.0518, "step": 4550 }, { "epoch": 0.86, "grad_norm": 4.2038726806640625, "learning_rate": 1.8283455674760024e-05, "loss": 6.0684, "step": 4560 }, { "epoch": 0.86, "grad_norm": 3.7143218517303467, "learning_rate": 1.8279691323169587e-05, "loss": 6.0687, "step": 4570 }, { "epoch": 0.86, "grad_norm": 3.445624828338623, "learning_rate": 1.8275926971579147e-05, "loss": 5.9846, "step": 4580 }, { "epoch": 0.86, "grad_norm": 3.6637818813323975, "learning_rate": 1.827216261998871e-05, "loss": 6.0118, "step": 4590 }, { "epoch": 0.87, "grad_norm": 3.822211503982544, "learning_rate": 1.826839826839827e-05, "loss": 5.8824, "step": 4600 }, { "epoch": 0.87, "grad_norm": 3.0215353965759277, "learning_rate": 1.8264633916807833e-05, "loss": 5.9474, "step": 4610 }, { "epoch": 0.87, "grad_norm": 3.329132318496704, "learning_rate": 1.8260869565217393e-05, "loss": 5.9615, "step": 4620 }, { "epoch": 0.87, "grad_norm": 2.9768261909484863, "learning_rate": 1.8257105213626953e-05, "loss": 5.984, "step": 4630 }, { "epoch": 0.87, "grad_norm": 4.200778007507324, "learning_rate": 1.8253340862036516e-05, "loss": 6.0252, "step": 4640 }, { "epoch": 0.88, 
"grad_norm": 3.1100523471832275, "learning_rate": 1.8249576510446076e-05, "loss": 6.032, "step": 4650 }, { "epoch": 0.88, "grad_norm": 3.6637959480285645, "learning_rate": 1.824581215885564e-05, "loss": 5.8775, "step": 4660 }, { "epoch": 0.88, "grad_norm": 2.8674635887145996, "learning_rate": 1.82420478072652e-05, "loss": 5.967, "step": 4670 }, { "epoch": 0.88, "grad_norm": 5.125814914703369, "learning_rate": 1.8238283455674762e-05, "loss": 5.9742, "step": 4680 }, { "epoch": 0.88, "grad_norm": 3.5703861713409424, "learning_rate": 1.823451910408432e-05, "loss": 5.9827, "step": 4690 }, { "epoch": 0.88, "grad_norm": 3.398340940475464, "learning_rate": 1.8230754752493885e-05, "loss": 5.8908, "step": 4700 }, { "epoch": 0.89, "grad_norm": 3.368262767791748, "learning_rate": 1.8226990400903445e-05, "loss": 6.0251, "step": 4710 }, { "epoch": 0.89, "grad_norm": 3.1160783767700195, "learning_rate": 1.8223226049313008e-05, "loss": 6.0266, "step": 4720 }, { "epoch": 0.89, "grad_norm": 4.354055881500244, "learning_rate": 1.8219461697722567e-05, "loss": 5.9064, "step": 4730 }, { "epoch": 0.89, "grad_norm": 3.428316354751587, "learning_rate": 1.821569734613213e-05, "loss": 5.9613, "step": 4740 }, { "epoch": 0.89, "grad_norm": 5.322836875915527, "learning_rate": 1.8211932994541694e-05, "loss": 5.8915, "step": 4750 }, { "epoch": 0.9, "grad_norm": 3.1977086067199707, "learning_rate": 1.8208168642951254e-05, "loss": 5.9849, "step": 4760 }, { "epoch": 0.9, "grad_norm": 3.6655070781707764, "learning_rate": 1.8204404291360817e-05, "loss": 5.9071, "step": 4770 }, { "epoch": 0.9, "grad_norm": 3.88796329498291, "learning_rate": 1.8200639939770376e-05, "loss": 5.9307, "step": 4780 }, { "epoch": 0.9, "grad_norm": 2.8081860542297363, "learning_rate": 1.819687558817994e-05, "loss": 5.945, "step": 4790 }, { "epoch": 0.9, "grad_norm": 3.9588053226470947, "learning_rate": 1.81931112365895e-05, "loss": 5.9581, "step": 4800 }, { "epoch": 0.91, "grad_norm": 3.6405527591705322, "learning_rate": 1.818934688499906e-05, "loss": 5.9423, "step": 4810 }, { "epoch": 0.91, "grad_norm": 5.194989204406738, "learning_rate": 1.8185582533408622e-05, "loss": 5.9, "step": 4820 }, { "epoch": 0.91, "grad_norm": 3.2684292793273926, "learning_rate": 1.8181818181818182e-05, "loss": 5.9881, "step": 4830 }, { "epoch": 0.91, "grad_norm": 4.88789701461792, "learning_rate": 1.8178053830227745e-05, "loss": 5.9956, "step": 4840 }, { "epoch": 0.91, "grad_norm": 3.286144256591797, "learning_rate": 1.8174289478637305e-05, "loss": 6.0177, "step": 4850 }, { "epoch": 0.91, "grad_norm": 3.582435131072998, "learning_rate": 1.8170525127046868e-05, "loss": 5.9237, "step": 4860 }, { "epoch": 0.92, "grad_norm": 5.786302089691162, "learning_rate": 1.8166760775456428e-05, "loss": 5.9811, "step": 4870 }, { "epoch": 0.92, "grad_norm": 10.76257610321045, "learning_rate": 1.816299642386599e-05, "loss": 5.9277, "step": 4880 }, { "epoch": 0.92, "grad_norm": 3.059964895248413, "learning_rate": 1.815923207227555e-05, "loss": 5.8813, "step": 4890 }, { "epoch": 0.92, "grad_norm": 3.1282639503479004, "learning_rate": 1.8155467720685114e-05, "loss": 5.9414, "step": 4900 }, { "epoch": 0.92, "grad_norm": 3.5674808025360107, "learning_rate": 1.8151703369094674e-05, "loss": 5.9096, "step": 4910 }, { "epoch": 0.93, "grad_norm": 3.4338762760162354, "learning_rate": 1.8147939017504237e-05, "loss": 5.9282, "step": 4920 }, { "epoch": 0.93, "grad_norm": 3.970597743988037, "learning_rate": 1.81441746659138e-05, "loss": 5.9748, "step": 4930 }, { "epoch": 0.93, "grad_norm": 
3.5161378383636475, "learning_rate": 1.814041031432336e-05, "loss": 5.8948, "step": 4940 }, { "epoch": 0.93, "grad_norm": 3.004849433898926, "learning_rate": 1.8136645962732923e-05, "loss": 5.8901, "step": 4950 }, { "epoch": 0.93, "grad_norm": 3.364605665206909, "learning_rate": 1.8132881611142483e-05, "loss": 5.8469, "step": 4960 }, { "epoch": 0.94, "grad_norm": 4.161324501037598, "learning_rate": 1.8129117259552043e-05, "loss": 5.8441, "step": 4970 }, { "epoch": 0.94, "grad_norm": 3.2717819213867188, "learning_rate": 1.8125352907961606e-05, "loss": 5.8559, "step": 4980 }, { "epoch": 0.94, "grad_norm": 3.444769859313965, "learning_rate": 1.8121588556371165e-05, "loss": 5.9372, "step": 4990 }, { "epoch": 0.94, "grad_norm": 3.4469246864318848, "learning_rate": 1.811782420478073e-05, "loss": 5.9131, "step": 5000 }, { "epoch": 0.94, "grad_norm": 3.0224616527557373, "learning_rate": 1.811405985319029e-05, "loss": 5.8025, "step": 5010 }, { "epoch": 0.94, "grad_norm": 3.8289661407470703, "learning_rate": 1.811029550159985e-05, "loss": 5.8274, "step": 5020 }, { "epoch": 0.95, "grad_norm": 2.879352331161499, "learning_rate": 1.810653115000941e-05, "loss": 5.9744, "step": 5030 }, { "epoch": 0.95, "grad_norm": 3.5162127017974854, "learning_rate": 1.8102766798418974e-05, "loss": 5.8229, "step": 5040 }, { "epoch": 0.95, "grad_norm": 3.4429492950439453, "learning_rate": 1.8099002446828534e-05, "loss": 5.8649, "step": 5050 }, { "epoch": 0.95, "grad_norm": 6.2440690994262695, "learning_rate": 1.8095238095238097e-05, "loss": 5.906, "step": 5060 }, { "epoch": 0.95, "grad_norm": 3.901210069656372, "learning_rate": 1.8091473743647657e-05, "loss": 5.8485, "step": 5070 }, { "epoch": 0.96, "grad_norm": 2.9538042545318604, "learning_rate": 1.808770939205722e-05, "loss": 5.8528, "step": 5080 }, { "epoch": 0.96, "grad_norm": 3.5398683547973633, "learning_rate": 1.808394504046678e-05, "loss": 5.9094, "step": 5090 }, { "epoch": 0.96, "grad_norm": 3.463794469833374, "learning_rate": 1.8080180688876343e-05, "loss": 5.858, "step": 5100 }, { "epoch": 0.96, "grad_norm": 3.4160399436950684, "learning_rate": 1.8076416337285906e-05, "loss": 5.9121, "step": 5110 }, { "epoch": 0.96, "grad_norm": 4.299091339111328, "learning_rate": 1.8072651985695466e-05, "loss": 5.9132, "step": 5120 }, { "epoch": 0.97, "grad_norm": 4.033545970916748, "learning_rate": 1.806888763410503e-05, "loss": 5.9166, "step": 5130 }, { "epoch": 0.97, "grad_norm": 3.242450475692749, "learning_rate": 1.8065123282514586e-05, "loss": 5.8885, "step": 5140 }, { "epoch": 0.97, "grad_norm": 3.0054519176483154, "learning_rate": 1.806135893092415e-05, "loss": 5.8274, "step": 5150 }, { "epoch": 0.97, "grad_norm": 3.099708080291748, "learning_rate": 1.8057594579333712e-05, "loss": 5.8117, "step": 5160 }, { "epoch": 0.97, "grad_norm": 3.053912878036499, "learning_rate": 1.8053830227743272e-05, "loss": 5.8486, "step": 5170 }, { "epoch": 0.97, "grad_norm": 4.281755447387695, "learning_rate": 1.8050065876152835e-05, "loss": 5.9242, "step": 5180 }, { "epoch": 0.98, "grad_norm": 5.619093418121338, "learning_rate": 1.8046301524562395e-05, "loss": 5.8294, "step": 5190 }, { "epoch": 0.98, "grad_norm": 4.025234222412109, "learning_rate": 1.8042537172971958e-05, "loss": 5.8182, "step": 5200 }, { "epoch": 0.98, "grad_norm": 3.251429796218872, "learning_rate": 1.8038772821381518e-05, "loss": 5.8035, "step": 5210 }, { "epoch": 0.98, "grad_norm": 3.805734872817993, "learning_rate": 1.803500846979108e-05, "loss": 5.8319, "step": 5220 }, { "epoch": 0.98, "grad_norm": 
3.0786643028259277, "learning_rate": 1.803124411820064e-05, "loss": 5.9459, "step": 5230 }, { "epoch": 0.99, "grad_norm": 3.3231472969055176, "learning_rate": 1.8027479766610204e-05, "loss": 5.8058, "step": 5240 }, { "epoch": 0.99, "grad_norm": 3.3416686058044434, "learning_rate": 1.8023715415019763e-05, "loss": 5.8091, "step": 5250 }, { "epoch": 0.99, "grad_norm": 4.068899154663086, "learning_rate": 1.8019951063429327e-05, "loss": 5.9196, "step": 5260 }, { "epoch": 0.99, "grad_norm": 3.6459412574768066, "learning_rate": 1.8016186711838886e-05, "loss": 5.7817, "step": 5270 }, { "epoch": 0.99, "grad_norm": 6.448877811431885, "learning_rate": 1.801242236024845e-05, "loss": 5.8905, "step": 5280 }, { "epoch": 1.0, "grad_norm": 4.894053936004639, "learning_rate": 1.800865800865801e-05, "loss": 5.8216, "step": 5290 }, { "epoch": 1.0, "grad_norm": 3.516646146774292, "learning_rate": 1.8004893657067572e-05, "loss": 5.8255, "step": 5300 }, { "epoch": 1.0, "grad_norm": 4.483935832977295, "learning_rate": 1.8001129305477136e-05, "loss": 5.7852, "step": 5310 }, { "epoch": 1.0, "eval_accuracy": 0.6866666666666666, "eval_loss": 5.756490230560303, "eval_runtime": 31.1876, "eval_samples_per_second": 240.48, "eval_steps_per_second": 30.076, "step": 5313 }, { "epoch": 1.0, "grad_norm": 3.064215660095215, "learning_rate": 1.7997364953886692e-05, "loss": 5.7318, "step": 5320 }, { "epoch": 1.0, "grad_norm": 3.6211490631103516, "learning_rate": 1.7993600602296255e-05, "loss": 5.7207, "step": 5330 }, { "epoch": 1.01, "grad_norm": 3.7358758449554443, "learning_rate": 1.7989836250705818e-05, "loss": 5.7413, "step": 5340 }, { "epoch": 1.01, "grad_norm": 3.3474528789520264, "learning_rate": 1.7986071899115378e-05, "loss": 5.8081, "step": 5350 }, { "epoch": 1.01, "grad_norm": 3.723468542098999, "learning_rate": 1.798230754752494e-05, "loss": 5.7353, "step": 5360 }, { "epoch": 1.01, "grad_norm": 3.1955268383026123, "learning_rate": 1.79785431959345e-05, "loss": 5.7034, "step": 5370 }, { "epoch": 1.01, "grad_norm": 4.23120641708374, "learning_rate": 1.7974778844344064e-05, "loss": 5.6639, "step": 5380 }, { "epoch": 1.01, "grad_norm": 4.113842964172363, "learning_rate": 1.7971014492753624e-05, "loss": 5.7417, "step": 5390 }, { "epoch": 1.02, "grad_norm": 3.4102118015289307, "learning_rate": 1.7967250141163187e-05, "loss": 5.7073, "step": 5400 }, { "epoch": 1.02, "grad_norm": 3.219844102859497, "learning_rate": 1.7963485789572747e-05, "loss": 5.7843, "step": 5410 }, { "epoch": 1.02, "grad_norm": 3.3281567096710205, "learning_rate": 1.795972143798231e-05, "loss": 5.7379, "step": 5420 }, { "epoch": 1.02, "grad_norm": 4.145016193389893, "learning_rate": 1.795595708639187e-05, "loss": 5.7389, "step": 5430 }, { "epoch": 1.02, "grad_norm": 4.070563316345215, "learning_rate": 1.7952192734801433e-05, "loss": 5.7813, "step": 5440 }, { "epoch": 1.03, "grad_norm": 3.548051357269287, "learning_rate": 1.7948428383210993e-05, "loss": 5.6979, "step": 5450 }, { "epoch": 1.03, "grad_norm": 3.414898633956909, "learning_rate": 1.7944664031620556e-05, "loss": 5.7716, "step": 5460 }, { "epoch": 1.03, "grad_norm": 4.09160852432251, "learning_rate": 1.7940899680030116e-05, "loss": 5.7346, "step": 5470 }, { "epoch": 1.03, "grad_norm": 3.3175604343414307, "learning_rate": 1.793713532843968e-05, "loss": 5.7907, "step": 5480 }, { "epoch": 1.03, "grad_norm": 4.92573881149292, "learning_rate": 1.793337097684924e-05, "loss": 5.6612, "step": 5490 }, { "epoch": 1.04, "grad_norm": 3.908839702606201, "learning_rate": 1.7929606625258798e-05, "loss": 
5.6572, "step": 5500 }, { "epoch": 1.04, "grad_norm": 3.724313974380493, "learning_rate": 1.792584227366836e-05, "loss": 5.7813, "step": 5510 }, { "epoch": 1.04, "grad_norm": 3.3908185958862305, "learning_rate": 1.792207792207792e-05, "loss": 5.627, "step": 5520 }, { "epoch": 1.04, "grad_norm": 4.907402515411377, "learning_rate": 1.7918313570487484e-05, "loss": 5.6593, "step": 5530 }, { "epoch": 1.04, "grad_norm": 3.77998948097229, "learning_rate": 1.7914549218897047e-05, "loss": 5.7596, "step": 5540 }, { "epoch": 1.04, "grad_norm": 3.539862871170044, "learning_rate": 1.7910784867306607e-05, "loss": 5.6802, "step": 5550 }, { "epoch": 1.05, "grad_norm": 5.592168807983398, "learning_rate": 1.790702051571617e-05, "loss": 5.673, "step": 5560 }, { "epoch": 1.05, "grad_norm": 3.0869011878967285, "learning_rate": 1.790325616412573e-05, "loss": 5.6687, "step": 5570 }, { "epoch": 1.05, "grad_norm": 3.558314561843872, "learning_rate": 1.7899491812535293e-05, "loss": 5.685, "step": 5580 }, { "epoch": 1.05, "grad_norm": 2.9247989654541016, "learning_rate": 1.7895727460944853e-05, "loss": 5.6274, "step": 5590 }, { "epoch": 1.05, "grad_norm": 3.401524066925049, "learning_rate": 1.7891963109354416e-05, "loss": 5.6802, "step": 5600 }, { "epoch": 1.06, "grad_norm": 3.173633098602295, "learning_rate": 1.7888198757763976e-05, "loss": 5.5907, "step": 5610 }, { "epoch": 1.06, "grad_norm": 3.520982265472412, "learning_rate": 1.788443440617354e-05, "loss": 5.6875, "step": 5620 }, { "epoch": 1.06, "grad_norm": 3.8196980953216553, "learning_rate": 1.78806700545831e-05, "loss": 5.6967, "step": 5630 }, { "epoch": 1.06, "grad_norm": 3.640376329421997, "learning_rate": 1.7876905702992662e-05, "loss": 5.6778, "step": 5640 }, { "epoch": 1.06, "grad_norm": 3.5509583950042725, "learning_rate": 1.7873141351402222e-05, "loss": 5.6098, "step": 5650 }, { "epoch": 1.07, "grad_norm": 3.4623301029205322, "learning_rate": 1.7869376999811785e-05, "loss": 5.5988, "step": 5660 }, { "epoch": 1.07, "grad_norm": 3.2605643272399902, "learning_rate": 1.7865612648221345e-05, "loss": 5.6148, "step": 5670 }, { "epoch": 1.07, "grad_norm": 3.8433616161346436, "learning_rate": 1.7861848296630905e-05, "loss": 5.5344, "step": 5680 }, { "epoch": 1.07, "grad_norm": 4.449424743652344, "learning_rate": 1.7858083945040468e-05, "loss": 5.7056, "step": 5690 }, { "epoch": 1.07, "grad_norm": 3.4395947456359863, "learning_rate": 1.7854319593450027e-05, "loss": 5.6438, "step": 5700 }, { "epoch": 1.07, "grad_norm": 4.081213474273682, "learning_rate": 1.785055524185959e-05, "loss": 5.6064, "step": 5710 }, { "epoch": 1.08, "grad_norm": 3.6221301555633545, "learning_rate": 1.7846790890269154e-05, "loss": 5.5613, "step": 5720 }, { "epoch": 1.08, "grad_norm": 3.2759907245635986, "learning_rate": 1.7843026538678714e-05, "loss": 5.6219, "step": 5730 }, { "epoch": 1.08, "grad_norm": 3.353506088256836, "learning_rate": 1.7839262187088277e-05, "loss": 5.6426, "step": 5740 }, { "epoch": 1.08, "grad_norm": 5.253815650939941, "learning_rate": 1.7835497835497836e-05, "loss": 5.6352, "step": 5750 }, { "epoch": 1.08, "grad_norm": 3.6588995456695557, "learning_rate": 1.78317334839074e-05, "loss": 5.6508, "step": 5760 }, { "epoch": 1.09, "grad_norm": 5.337824821472168, "learning_rate": 1.782796913231696e-05, "loss": 5.6761, "step": 5770 }, { "epoch": 1.09, "grad_norm": 3.9401497840881348, "learning_rate": 1.7824204780726523e-05, "loss": 5.5415, "step": 5780 }, { "epoch": 1.09, "grad_norm": 3.819956064224243, "learning_rate": 1.7820440429136082e-05, "loss": 5.717, "step": 
5790 }, { "epoch": 1.09, "grad_norm": 3.598991632461548, "learning_rate": 1.7816676077545645e-05, "loss": 5.719, "step": 5800 }, { "epoch": 1.09, "grad_norm": 4.070620059967041, "learning_rate": 1.7812911725955205e-05, "loss": 5.5225, "step": 5810 }, { "epoch": 1.1, "grad_norm": 5.40057897567749, "learning_rate": 1.780914737436477e-05, "loss": 5.563, "step": 5820 }, { "epoch": 1.1, "grad_norm": 3.8934574127197266, "learning_rate": 1.7805383022774328e-05, "loss": 5.6571, "step": 5830 }, { "epoch": 1.1, "grad_norm": 3.8978958129882812, "learning_rate": 1.7801618671183888e-05, "loss": 5.5483, "step": 5840 }, { "epoch": 1.1, "grad_norm": 3.628629684448242, "learning_rate": 1.779785431959345e-05, "loss": 5.5746, "step": 5850 }, { "epoch": 1.1, "grad_norm": 4.06587553024292, "learning_rate": 1.779408996800301e-05, "loss": 5.6674, "step": 5860 }, { "epoch": 1.1, "grad_norm": 3.2706613540649414, "learning_rate": 1.7790325616412574e-05, "loss": 5.5582, "step": 5870 }, { "epoch": 1.11, "grad_norm": 4.300984859466553, "learning_rate": 1.7786561264822134e-05, "loss": 5.5943, "step": 5880 }, { "epoch": 1.11, "grad_norm": 3.4221644401550293, "learning_rate": 1.7782796913231697e-05, "loss": 5.5292, "step": 5890 }, { "epoch": 1.11, "grad_norm": 3.1121580600738525, "learning_rate": 1.777903256164126e-05, "loss": 5.6084, "step": 5900 }, { "epoch": 1.11, "grad_norm": 3.0184531211853027, "learning_rate": 1.777526821005082e-05, "loss": 5.5822, "step": 5910 }, { "epoch": 1.11, "grad_norm": 3.992727756500244, "learning_rate": 1.7771503858460383e-05, "loss": 5.6512, "step": 5920 }, { "epoch": 1.12, "grad_norm": 3.752830982208252, "learning_rate": 1.7767739506869943e-05, "loss": 5.5588, "step": 5930 }, { "epoch": 1.12, "grad_norm": 3.507953405380249, "learning_rate": 1.7763975155279506e-05, "loss": 5.7075, "step": 5940 }, { "epoch": 1.12, "grad_norm": 3.104038953781128, "learning_rate": 1.7760210803689066e-05, "loss": 5.5203, "step": 5950 }, { "epoch": 1.12, "grad_norm": 4.265203952789307, "learning_rate": 1.775644645209863e-05, "loss": 5.5882, "step": 5960 }, { "epoch": 1.12, "grad_norm": 3.1321351528167725, "learning_rate": 1.775268210050819e-05, "loss": 5.5488, "step": 5970 }, { "epoch": 1.13, "grad_norm": 5.637481212615967, "learning_rate": 1.7748917748917752e-05, "loss": 5.4386, "step": 5980 }, { "epoch": 1.13, "grad_norm": 3.919205665588379, "learning_rate": 1.774515339732731e-05, "loss": 5.5141, "step": 5990 }, { "epoch": 1.13, "grad_norm": 3.299159049987793, "learning_rate": 1.7741389045736875e-05, "loss": 5.5211, "step": 6000 }, { "epoch": 1.13, "grad_norm": 4.947606086730957, "learning_rate": 1.7737624694146434e-05, "loss": 5.58, "step": 6010 }, { "epoch": 1.13, "grad_norm": 4.38947057723999, "learning_rate": 1.7733860342555994e-05, "loss": 5.6413, "step": 6020 }, { "epoch": 1.13, "grad_norm": 4.6740403175354, "learning_rate": 1.7730095990965557e-05, "loss": 5.5519, "step": 6030 }, { "epoch": 1.14, "grad_norm": 3.973433494567871, "learning_rate": 1.7726331639375117e-05, "loss": 5.6413, "step": 6040 }, { "epoch": 1.14, "grad_norm": 4.425774574279785, "learning_rate": 1.772256728778468e-05, "loss": 5.464, "step": 6050 }, { "epoch": 1.14, "grad_norm": 4.747916221618652, "learning_rate": 1.771880293619424e-05, "loss": 5.6365, "step": 6060 }, { "epoch": 1.14, "grad_norm": 3.1735618114471436, "learning_rate": 1.7715038584603803e-05, "loss": 5.5448, "step": 6070 }, { "epoch": 1.14, "grad_norm": 3.5024526119232178, "learning_rate": 1.7711274233013366e-05, "loss": 5.5672, "step": 6080 }, { "epoch": 1.15, 
"grad_norm": 4.6181111335754395, "learning_rate": 1.7707509881422926e-05, "loss": 5.5938, "step": 6090 }, { "epoch": 1.15, "grad_norm": 4.128718852996826, "learning_rate": 1.770374552983249e-05, "loss": 5.57, "step": 6100 }, { "epoch": 1.15, "grad_norm": 3.691974401473999, "learning_rate": 1.769998117824205e-05, "loss": 5.4984, "step": 6110 }, { "epoch": 1.15, "grad_norm": 3.8929593563079834, "learning_rate": 1.7696216826651612e-05, "loss": 5.6067, "step": 6120 }, { "epoch": 1.15, "grad_norm": 3.4313833713531494, "learning_rate": 1.7692452475061172e-05, "loss": 5.444, "step": 6130 }, { "epoch": 1.16, "grad_norm": 4.30509090423584, "learning_rate": 1.7688688123470735e-05, "loss": 5.5532, "step": 6140 }, { "epoch": 1.16, "grad_norm": 4.268110275268555, "learning_rate": 1.7684923771880295e-05, "loss": 5.5397, "step": 6150 }, { "epoch": 1.16, "grad_norm": 3.366288423538208, "learning_rate": 1.7681159420289858e-05, "loss": 5.4952, "step": 6160 }, { "epoch": 1.16, "grad_norm": 4.403131008148193, "learning_rate": 1.7677395068699418e-05, "loss": 5.5013, "step": 6170 }, { "epoch": 1.16, "grad_norm": 4.42628288269043, "learning_rate": 1.767363071710898e-05, "loss": 5.5515, "step": 6180 }, { "epoch": 1.17, "grad_norm": 4.776844501495361, "learning_rate": 1.766986636551854e-05, "loss": 5.6145, "step": 6190 }, { "epoch": 1.17, "grad_norm": 3.4492292404174805, "learning_rate": 1.76661020139281e-05, "loss": 5.5603, "step": 6200 }, { "epoch": 1.17, "grad_norm": 4.259890079498291, "learning_rate": 1.7662337662337664e-05, "loss": 5.4978, "step": 6210 }, { "epoch": 1.17, "grad_norm": 3.320469379425049, "learning_rate": 1.7658573310747223e-05, "loss": 5.5047, "step": 6220 }, { "epoch": 1.17, "grad_norm": 3.31471848487854, "learning_rate": 1.7654808959156787e-05, "loss": 5.5088, "step": 6230 }, { "epoch": 1.17, "grad_norm": 3.601073741912842, "learning_rate": 1.7651044607566346e-05, "loss": 5.5044, "step": 6240 }, { "epoch": 1.18, "grad_norm": 4.011069297790527, "learning_rate": 1.764728025597591e-05, "loss": 5.5668, "step": 6250 }, { "epoch": 1.18, "grad_norm": 3.5717737674713135, "learning_rate": 1.764351590438547e-05, "loss": 5.4551, "step": 6260 }, { "epoch": 1.18, "grad_norm": 5.707601547241211, "learning_rate": 1.7639751552795032e-05, "loss": 5.503, "step": 6270 }, { "epoch": 1.18, "grad_norm": 3.5272598266601562, "learning_rate": 1.7635987201204596e-05, "loss": 5.5477, "step": 6280 }, { "epoch": 1.18, "grad_norm": 3.233694076538086, "learning_rate": 1.7632222849614155e-05, "loss": 5.5699, "step": 6290 }, { "epoch": 1.19, "grad_norm": 4.6197686195373535, "learning_rate": 1.762845849802372e-05, "loss": 5.4451, "step": 6300 }, { "epoch": 1.19, "grad_norm": 3.8022520542144775, "learning_rate": 1.7624694146433278e-05, "loss": 5.5963, "step": 6310 }, { "epoch": 1.19, "grad_norm": 3.849290609359741, "learning_rate": 1.762092979484284e-05, "loss": 5.5306, "step": 6320 }, { "epoch": 1.19, "grad_norm": 3.3373732566833496, "learning_rate": 1.76171654432524e-05, "loss": 5.458, "step": 6330 }, { "epoch": 1.19, "grad_norm": 3.255927562713623, "learning_rate": 1.7613401091661964e-05, "loss": 5.4804, "step": 6340 }, { "epoch": 1.2, "grad_norm": 4.038276672363281, "learning_rate": 1.7609636740071524e-05, "loss": 5.3793, "step": 6350 }, { "epoch": 1.2, "grad_norm": 3.273983955383301, "learning_rate": 1.7605872388481084e-05, "loss": 5.4838, "step": 6360 }, { "epoch": 1.2, "grad_norm": 3.583287477493286, "learning_rate": 1.7602108036890647e-05, "loss": 5.4342, "step": 6370 }, { "epoch": 1.2, "grad_norm": 
3.7486605644226074, "learning_rate": 1.7598343685300207e-05, "loss": 5.4461, "step": 6380 }, { "epoch": 1.2, "grad_norm": 3.960120677947998, "learning_rate": 1.759457933370977e-05, "loss": 5.4537, "step": 6390 }, { "epoch": 1.2, "grad_norm": 3.752458095550537, "learning_rate": 1.759081498211933e-05, "loss": 5.4096, "step": 6400 }, { "epoch": 1.21, "grad_norm": 5.6009907722473145, "learning_rate": 1.7587050630528893e-05, "loss": 5.4537, "step": 6410 }, { "epoch": 1.21, "grad_norm": 4.05803918838501, "learning_rate": 1.7583286278938453e-05, "loss": 5.4811, "step": 6420 }, { "epoch": 1.21, "grad_norm": 3.865929365158081, "learning_rate": 1.7579521927348016e-05, "loss": 5.4824, "step": 6430 }, { "epoch": 1.21, "grad_norm": 3.867224931716919, "learning_rate": 1.7575757575757576e-05, "loss": 5.4902, "step": 6440 }, { "epoch": 1.21, "grad_norm": 3.5504441261291504, "learning_rate": 1.757199322416714e-05, "loss": 5.3908, "step": 6450 }, { "epoch": 1.22, "grad_norm": 3.943458318710327, "learning_rate": 1.7568228872576702e-05, "loss": 5.4648, "step": 6460 }, { "epoch": 1.22, "grad_norm": 4.495426177978516, "learning_rate": 1.756446452098626e-05, "loss": 5.3964, "step": 6470 }, { "epoch": 1.22, "grad_norm": 5.023219585418701, "learning_rate": 1.7560700169395825e-05, "loss": 5.3873, "step": 6480 }, { "epoch": 1.22, "grad_norm": 3.970592975616455, "learning_rate": 1.7556935817805385e-05, "loss": 5.3492, "step": 6490 }, { "epoch": 1.22, "grad_norm": 4.181807041168213, "learning_rate": 1.7553171466214948e-05, "loss": 5.5617, "step": 6500 }, { "epoch": 1.23, "grad_norm": 5.51046085357666, "learning_rate": 1.7549407114624507e-05, "loss": 5.3935, "step": 6510 }, { "epoch": 1.23, "grad_norm": 7.997470855712891, "learning_rate": 1.754564276303407e-05, "loss": 5.3125, "step": 6520 }, { "epoch": 1.23, "grad_norm": 5.629729747772217, "learning_rate": 1.754187841144363e-05, "loss": 5.5043, "step": 6530 }, { "epoch": 1.23, "grad_norm": 3.1573050022125244, "learning_rate": 1.753811405985319e-05, "loss": 5.357, "step": 6540 }, { "epoch": 1.23, "grad_norm": 5.1155595779418945, "learning_rate": 1.7534349708262753e-05, "loss": 5.3955, "step": 6550 }, { "epoch": 1.23, "grad_norm": 3.6041131019592285, "learning_rate": 1.7530585356672313e-05, "loss": 5.4457, "step": 6560 }, { "epoch": 1.24, "grad_norm": 4.085677623748779, "learning_rate": 1.7526821005081876e-05, "loss": 5.5649, "step": 6570 }, { "epoch": 1.24, "grad_norm": 4.2146897315979, "learning_rate": 1.7523056653491436e-05, "loss": 5.3465, "step": 6580 }, { "epoch": 1.24, "grad_norm": 4.089937210083008, "learning_rate": 1.7519292301901e-05, "loss": 5.4061, "step": 6590 }, { "epoch": 1.24, "grad_norm": 3.6523382663726807, "learning_rate": 1.751552795031056e-05, "loss": 5.3226, "step": 6600 }, { "epoch": 1.24, "grad_norm": 3.106112003326416, "learning_rate": 1.7511763598720122e-05, "loss": 5.4166, "step": 6610 }, { "epoch": 1.25, "grad_norm": 3.4070067405700684, "learning_rate": 1.7507999247129682e-05, "loss": 5.4835, "step": 6620 }, { "epoch": 1.25, "grad_norm": 6.973111629486084, "learning_rate": 1.7504234895539245e-05, "loss": 5.4694, "step": 6630 }, { "epoch": 1.25, "grad_norm": 4.977043628692627, "learning_rate": 1.7500470543948808e-05, "loss": 5.3295, "step": 6640 }, { "epoch": 1.25, "grad_norm": 3.538287401199341, "learning_rate": 1.7496706192358368e-05, "loss": 5.3883, "step": 6650 }, { "epoch": 1.25, "grad_norm": 5.48239278793335, "learning_rate": 1.749294184076793e-05, "loss": 5.4582, "step": 6660 }, { "epoch": 1.26, "grad_norm": 5.290346145629883, 
"learning_rate": 1.748917748917749e-05, "loss": 5.4423, "step": 6670 }, { "epoch": 1.26, "grad_norm": 3.1395699977874756, "learning_rate": 1.7485413137587054e-05, "loss": 5.3962, "step": 6680 }, { "epoch": 1.26, "grad_norm": 4.081737995147705, "learning_rate": 1.7481648785996614e-05, "loss": 5.2637, "step": 6690 }, { "epoch": 1.26, "grad_norm": 3.454633951187134, "learning_rate": 1.7477884434406177e-05, "loss": 5.3793, "step": 6700 }, { "epoch": 1.26, "grad_norm": 6.608503818511963, "learning_rate": 1.7474120082815737e-05, "loss": 5.4873, "step": 6710 }, { "epoch": 1.26, "grad_norm": 3.6044631004333496, "learning_rate": 1.7470355731225296e-05, "loss": 5.4251, "step": 6720 }, { "epoch": 1.27, "grad_norm": 3.1667826175689697, "learning_rate": 1.746659137963486e-05, "loss": 5.3497, "step": 6730 }, { "epoch": 1.27, "grad_norm": 5.698710918426514, "learning_rate": 1.746282702804442e-05, "loss": 5.2494, "step": 6740 }, { "epoch": 1.27, "grad_norm": 4.814940452575684, "learning_rate": 1.7459062676453983e-05, "loss": 5.316, "step": 6750 }, { "epoch": 1.27, "grad_norm": 4.8874030113220215, "learning_rate": 1.7455298324863542e-05, "loss": 5.4121, "step": 6760 }, { "epoch": 1.27, "grad_norm": 3.1862032413482666, "learning_rate": 1.7451533973273105e-05, "loss": 5.2112, "step": 6770 }, { "epoch": 1.28, "grad_norm": 3.5409083366394043, "learning_rate": 1.7447769621682665e-05, "loss": 5.4351, "step": 6780 }, { "epoch": 1.28, "grad_norm": 3.867943286895752, "learning_rate": 1.744400527009223e-05, "loss": 5.3627, "step": 6790 }, { "epoch": 1.28, "grad_norm": 3.5638487339019775, "learning_rate": 1.7440240918501788e-05, "loss": 5.3735, "step": 6800 }, { "epoch": 1.28, "grad_norm": 3.859111785888672, "learning_rate": 1.743647656691135e-05, "loss": 5.3507, "step": 6810 }, { "epoch": 1.28, "grad_norm": 3.738166332244873, "learning_rate": 1.743271221532091e-05, "loss": 5.4537, "step": 6820 }, { "epoch": 1.29, "grad_norm": 3.6341073513031006, "learning_rate": 1.7428947863730474e-05, "loss": 5.3859, "step": 6830 }, { "epoch": 1.29, "grad_norm": 5.004108428955078, "learning_rate": 1.7425183512140037e-05, "loss": 5.3283, "step": 6840 }, { "epoch": 1.29, "grad_norm": 3.703941583633423, "learning_rate": 1.7421419160549597e-05, "loss": 5.3059, "step": 6850 }, { "epoch": 1.29, "grad_norm": 4.05626106262207, "learning_rate": 1.741765480895916e-05, "loss": 5.3823, "step": 6860 }, { "epoch": 1.29, "grad_norm": 5.02943229675293, "learning_rate": 1.741389045736872e-05, "loss": 5.2867, "step": 6870 }, { "epoch": 1.29, "grad_norm": 3.9839112758636475, "learning_rate": 1.7410126105778283e-05, "loss": 5.2647, "step": 6880 }, { "epoch": 1.3, "grad_norm": 4.86874532699585, "learning_rate": 1.7406361754187843e-05, "loss": 5.2764, "step": 6890 }, { "epoch": 1.3, "grad_norm": 5.171234130859375, "learning_rate": 1.7402597402597403e-05, "loss": 5.4265, "step": 6900 }, { "epoch": 1.3, "grad_norm": 3.988234519958496, "learning_rate": 1.7398833051006966e-05, "loss": 5.2659, "step": 6910 }, { "epoch": 1.3, "grad_norm": 3.766921043395996, "learning_rate": 1.7395068699416526e-05, "loss": 5.4643, "step": 6920 }, { "epoch": 1.3, "grad_norm": 3.932483673095703, "learning_rate": 1.739130434782609e-05, "loss": 5.3583, "step": 6930 }, { "epoch": 1.31, "grad_norm": 5.202116966247559, "learning_rate": 1.738753999623565e-05, "loss": 5.4499, "step": 6940 }, { "epoch": 1.31, "grad_norm": 5.583954334259033, "learning_rate": 1.7383775644645212e-05, "loss": 5.4157, "step": 6950 }, { "epoch": 1.31, "grad_norm": 8.147529602050781, "learning_rate": 
1.738001129305477e-05, "loss": 5.327, "step": 6960 }, { "epoch": 1.31, "grad_norm": 3.973139524459839, "learning_rate": 1.7376246941464335e-05, "loss": 5.3035, "step": 6970 }, { "epoch": 1.31, "grad_norm": 3.3507044315338135, "learning_rate": 1.7372482589873894e-05, "loss": 5.2054, "step": 6980 }, { "epoch": 1.32, "grad_norm": 6.835877895355225, "learning_rate": 1.7368718238283458e-05, "loss": 5.315, "step": 6990 }, { "epoch": 1.32, "grad_norm": 3.1898701190948486, "learning_rate": 1.7364953886693017e-05, "loss": 5.4237, "step": 7000 }, { "epoch": 1.32, "grad_norm": 4.374692440032959, "learning_rate": 1.736118953510258e-05, "loss": 5.2256, "step": 7010 }, { "epoch": 1.32, "grad_norm": 4.43062162399292, "learning_rate": 1.7357425183512144e-05, "loss": 5.1817, "step": 7020 }, { "epoch": 1.32, "grad_norm": 7.586085319519043, "learning_rate": 1.7353660831921703e-05, "loss": 5.3104, "step": 7030 }, { "epoch": 1.33, "grad_norm": 4.529648780822754, "learning_rate": 1.7349896480331267e-05, "loss": 5.272, "step": 7040 }, { "epoch": 1.33, "grad_norm": 3.117426872253418, "learning_rate": 1.7346132128740826e-05, "loss": 5.2938, "step": 7050 }, { "epoch": 1.33, "grad_norm": 5.401022434234619, "learning_rate": 1.7342367777150386e-05, "loss": 5.3078, "step": 7060 }, { "epoch": 1.33, "grad_norm": 4.479426383972168, "learning_rate": 1.733860342555995e-05, "loss": 5.3244, "step": 7070 }, { "epoch": 1.33, "grad_norm": 3.8002305030822754, "learning_rate": 1.733483907396951e-05, "loss": 5.1801, "step": 7080 }, { "epoch": 1.33, "grad_norm": 3.564082622528076, "learning_rate": 1.7331074722379072e-05, "loss": 5.3763, "step": 7090 }, { "epoch": 1.34, "grad_norm": 3.56687593460083, "learning_rate": 1.7327310370788632e-05, "loss": 5.1961, "step": 7100 }, { "epoch": 1.34, "grad_norm": 3.6049387454986572, "learning_rate": 1.7323546019198195e-05, "loss": 5.3018, "step": 7110 }, { "epoch": 1.34, "grad_norm": 3.306177854537964, "learning_rate": 1.7319781667607755e-05, "loss": 5.2295, "step": 7120 }, { "epoch": 1.34, "grad_norm": 5.129675388336182, "learning_rate": 1.7316017316017318e-05, "loss": 5.303, "step": 7130 }, { "epoch": 1.34, "grad_norm": 3.892099380493164, "learning_rate": 1.7312252964426878e-05, "loss": 5.2198, "step": 7140 }, { "epoch": 1.35, "grad_norm": 4.6624860763549805, "learning_rate": 1.730848861283644e-05, "loss": 5.265, "step": 7150 }, { "epoch": 1.35, "grad_norm": 3.942927122116089, "learning_rate": 1.7304724261246e-05, "loss": 5.3392, "step": 7160 }, { "epoch": 1.35, "grad_norm": 4.542496204376221, "learning_rate": 1.7300959909655564e-05, "loss": 5.2053, "step": 7170 }, { "epoch": 1.35, "grad_norm": 4.077686309814453, "learning_rate": 1.7297195558065124e-05, "loss": 5.3184, "step": 7180 }, { "epoch": 1.35, "grad_norm": 5.096768379211426, "learning_rate": 1.7293431206474687e-05, "loss": 5.3053, "step": 7190 }, { "epoch": 1.36, "grad_norm": 6.893802642822266, "learning_rate": 1.728966685488425e-05, "loss": 5.2908, "step": 7200 }, { "epoch": 1.36, "grad_norm": 3.9917964935302734, "learning_rate": 1.728590250329381e-05, "loss": 5.3373, "step": 7210 }, { "epoch": 1.36, "grad_norm": 3.5734989643096924, "learning_rate": 1.7282138151703373e-05, "loss": 5.229, "step": 7220 }, { "epoch": 1.36, "grad_norm": 3.814216375350952, "learning_rate": 1.727837380011293e-05, "loss": 5.1797, "step": 7230 }, { "epoch": 1.36, "grad_norm": 4.143984317779541, "learning_rate": 1.7274609448522492e-05, "loss": 5.2167, "step": 7240 }, { "epoch": 1.36, "grad_norm": 3.803403377532959, "learning_rate": 1.7270845096932056e-05, 
"loss": 5.2626, "step": 7250 }, { "epoch": 1.37, "grad_norm": 3.4533042907714844, "learning_rate": 1.7267080745341615e-05, "loss": 5.187, "step": 7260 }, { "epoch": 1.37, "grad_norm": 3.164524793624878, "learning_rate": 1.726331639375118e-05, "loss": 5.1956, "step": 7270 }, { "epoch": 1.37, "grad_norm": 4.071840763092041, "learning_rate": 1.7259552042160738e-05, "loss": 5.1184, "step": 7280 }, { "epoch": 1.37, "grad_norm": 7.326527118682861, "learning_rate": 1.72557876905703e-05, "loss": 5.253, "step": 7290 }, { "epoch": 1.37, "grad_norm": 3.4524407386779785, "learning_rate": 1.725202333897986e-05, "loss": 5.2274, "step": 7300 }, { "epoch": 1.38, "grad_norm": 7.469705104827881, "learning_rate": 1.7248258987389424e-05, "loss": 5.1864, "step": 7310 }, { "epoch": 1.38, "grad_norm": 3.6923930644989014, "learning_rate": 1.7244494635798984e-05, "loss": 5.169, "step": 7320 }, { "epoch": 1.38, "grad_norm": 3.7456209659576416, "learning_rate": 1.7240730284208547e-05, "loss": 5.2865, "step": 7330 }, { "epoch": 1.38, "grad_norm": 3.506336212158203, "learning_rate": 1.7236965932618107e-05, "loss": 5.2578, "step": 7340 }, { "epoch": 1.38, "grad_norm": 3.723317861557007, "learning_rate": 1.723320158102767e-05, "loss": 5.2458, "step": 7350 }, { "epoch": 1.39, "grad_norm": 4.727193355560303, "learning_rate": 1.722943722943723e-05, "loss": 5.1858, "step": 7360 }, { "epoch": 1.39, "grad_norm": 4.694411277770996, "learning_rate": 1.7225672877846793e-05, "loss": 5.157, "step": 7370 }, { "epoch": 1.39, "grad_norm": 10.395565032958984, "learning_rate": 1.7221908526256356e-05, "loss": 5.1838, "step": 7380 }, { "epoch": 1.39, "grad_norm": 3.999401330947876, "learning_rate": 1.7218144174665916e-05, "loss": 5.2449, "step": 7390 }, { "epoch": 1.39, "grad_norm": 5.770976543426514, "learning_rate": 1.721437982307548e-05, "loss": 5.2041, "step": 7400 }, { "epoch": 1.39, "grad_norm": 4.1684064865112305, "learning_rate": 1.7210615471485036e-05, "loss": 5.2769, "step": 7410 }, { "epoch": 1.4, "grad_norm": 3.4186148643493652, "learning_rate": 1.72068511198946e-05, "loss": 5.1023, "step": 7420 }, { "epoch": 1.4, "grad_norm": 3.8021910190582275, "learning_rate": 1.7203086768304162e-05, "loss": 5.1972, "step": 7430 }, { "epoch": 1.4, "grad_norm": 5.207361698150635, "learning_rate": 1.719932241671372e-05, "loss": 5.1311, "step": 7440 }, { "epoch": 1.4, "grad_norm": 3.8703479766845703, "learning_rate": 1.7195558065123285e-05, "loss": 5.1434, "step": 7450 }, { "epoch": 1.4, "grad_norm": 5.820541858673096, "learning_rate": 1.7191793713532845e-05, "loss": 5.2627, "step": 7460 }, { "epoch": 1.41, "grad_norm": 3.754873514175415, "learning_rate": 1.7188029361942408e-05, "loss": 5.1121, "step": 7470 }, { "epoch": 1.41, "grad_norm": 5.0720086097717285, "learning_rate": 1.7184265010351967e-05, "loss": 5.0704, "step": 7480 }, { "epoch": 1.41, "grad_norm": 4.0805768966674805, "learning_rate": 1.718050065876153e-05, "loss": 5.3178, "step": 7490 }, { "epoch": 1.41, "grad_norm": 5.068567276000977, "learning_rate": 1.717673630717109e-05, "loss": 5.1662, "step": 7500 }, { "epoch": 1.41, "grad_norm": 4.059872150421143, "learning_rate": 1.7172971955580654e-05, "loss": 5.0356, "step": 7510 }, { "epoch": 1.42, "grad_norm": 3.5050201416015625, "learning_rate": 1.7169207603990213e-05, "loss": 5.2463, "step": 7520 }, { "epoch": 1.42, "grad_norm": 4.099201202392578, "learning_rate": 1.7165443252399776e-05, "loss": 5.1921, "step": 7530 }, { "epoch": 1.42, "grad_norm": 5.133457660675049, "learning_rate": 1.7161678900809336e-05, "loss": 5.1114, "step": 
7540 }, { "epoch": 1.42, "grad_norm": 5.4331159591674805, "learning_rate": 1.71579145492189e-05, "loss": 5.1927, "step": 7550 }, { "epoch": 1.42, "grad_norm": 3.7023303508758545, "learning_rate": 1.715415019762846e-05, "loss": 5.0841, "step": 7560 }, { "epoch": 1.42, "grad_norm": 4.59948205947876, "learning_rate": 1.7150385846038022e-05, "loss": 5.233, "step": 7570 }, { "epoch": 1.43, "grad_norm": 3.8817737102508545, "learning_rate": 1.7146621494447582e-05, "loss": 5.1976, "step": 7580 }, { "epoch": 1.43, "grad_norm": 3.8560373783111572, "learning_rate": 1.7142857142857142e-05, "loss": 5.0753, "step": 7590 }, { "epoch": 1.43, "grad_norm": 5.3909149169921875, "learning_rate": 1.7139092791266705e-05, "loss": 5.1925, "step": 7600 }, { "epoch": 1.43, "grad_norm": 4.343448162078857, "learning_rate": 1.7135328439676268e-05, "loss": 5.196, "step": 7610 }, { "epoch": 1.43, "grad_norm": 6.405071258544922, "learning_rate": 1.7131564088085828e-05, "loss": 5.1406, "step": 7620 }, { "epoch": 1.44, "grad_norm": 5.151088237762451, "learning_rate": 1.712779973649539e-05, "loss": 5.1234, "step": 7630 }, { "epoch": 1.44, "grad_norm": 3.7828986644744873, "learning_rate": 1.712403538490495e-05, "loss": 5.1711, "step": 7640 }, { "epoch": 1.44, "grad_norm": 3.6769795417785645, "learning_rate": 1.7120271033314514e-05, "loss": 5.1384, "step": 7650 }, { "epoch": 1.44, "grad_norm": 4.800556182861328, "learning_rate": 1.7116506681724074e-05, "loss": 5.1938, "step": 7660 }, { "epoch": 1.44, "grad_norm": 4.440310955047607, "learning_rate": 1.7112742330133637e-05, "loss": 4.9629, "step": 7670 }, { "epoch": 1.45, "grad_norm": 4.44383430480957, "learning_rate": 1.7108977978543197e-05, "loss": 5.0992, "step": 7680 }, { "epoch": 1.45, "grad_norm": 3.9268581867218018, "learning_rate": 1.710521362695276e-05, "loss": 5.0436, "step": 7690 }, { "epoch": 1.45, "grad_norm": 3.9141645431518555, "learning_rate": 1.710144927536232e-05, "loss": 5.1705, "step": 7700 }, { "epoch": 1.45, "grad_norm": 4.492930889129639, "learning_rate": 1.7097684923771883e-05, "loss": 5.0687, "step": 7710 }, { "epoch": 1.45, "grad_norm": 5.466479778289795, "learning_rate": 1.7093920572181443e-05, "loss": 5.1591, "step": 7720 }, { "epoch": 1.45, "grad_norm": 3.5339152812957764, "learning_rate": 1.7090156220591006e-05, "loss": 4.9697, "step": 7730 }, { "epoch": 1.46, "grad_norm": 3.642507553100586, "learning_rate": 1.7086391869000565e-05, "loss": 5.1312, "step": 7740 }, { "epoch": 1.46, "grad_norm": 4.053368091583252, "learning_rate": 1.708262751741013e-05, "loss": 5.0886, "step": 7750 }, { "epoch": 1.46, "grad_norm": 6.340184688568115, "learning_rate": 1.707886316581969e-05, "loss": 5.023, "step": 7760 }, { "epoch": 1.46, "grad_norm": 4.180225372314453, "learning_rate": 1.7075098814229248e-05, "loss": 5.1253, "step": 7770 }, { "epoch": 1.46, "grad_norm": 3.8722784519195557, "learning_rate": 1.707133446263881e-05, "loss": 5.0701, "step": 7780 }, { "epoch": 1.47, "grad_norm": 7.559011936187744, "learning_rate": 1.706757011104837e-05, "loss": 5.1676, "step": 7790 }, { "epoch": 1.47, "grad_norm": 5.031394004821777, "learning_rate": 1.7063805759457934e-05, "loss": 5.0672, "step": 7800 }, { "epoch": 1.47, "grad_norm": 4.723738670349121, "learning_rate": 1.7060041407867497e-05, "loss": 5.1092, "step": 7810 }, { "epoch": 1.47, "grad_norm": 4.843240737915039, "learning_rate": 1.7056277056277057e-05, "loss": 5.0145, "step": 7820 }, { "epoch": 1.47, "grad_norm": 5.91166353225708, "learning_rate": 1.705251270468662e-05, "loss": 5.0017, "step": 7830 }, { "epoch": 
1.48, "grad_norm": 5.08294677734375, "learning_rate": 1.704874835309618e-05, "loss": 5.0576, "step": 7840 }, { "epoch": 1.48, "grad_norm": 29.38603401184082, "learning_rate": 1.7044984001505743e-05, "loss": 5.0467, "step": 7850 }, { "epoch": 1.48, "grad_norm": 3.345935583114624, "learning_rate": 1.7041219649915303e-05, "loss": 4.9942, "step": 7860 }, { "epoch": 1.48, "grad_norm": 3.858593225479126, "learning_rate": 1.7037455298324866e-05, "loss": 5.0611, "step": 7870 }, { "epoch": 1.48, "grad_norm": 12.626725196838379, "learning_rate": 1.7033690946734426e-05, "loss": 5.0529, "step": 7880 }, { "epoch": 1.49, "grad_norm": 4.119753360748291, "learning_rate": 1.702992659514399e-05, "loss": 5.023, "step": 7890 }, { "epoch": 1.49, "grad_norm": 8.894264221191406, "learning_rate": 1.702616224355355e-05, "loss": 5.1411, "step": 7900 }, { "epoch": 1.49, "grad_norm": 3.9443325996398926, "learning_rate": 1.7022397891963112e-05, "loss": 5.1197, "step": 7910 }, { "epoch": 1.49, "grad_norm": 4.1317973136901855, "learning_rate": 1.7018633540372672e-05, "loss": 5.0245, "step": 7920 }, { "epoch": 1.49, "grad_norm": 3.701643943786621, "learning_rate": 1.701486918878223e-05, "loss": 5.0405, "step": 7930 }, { "epoch": 1.49, "grad_norm": 3.322911500930786, "learning_rate": 1.7011104837191795e-05, "loss": 5.0015, "step": 7940 }, { "epoch": 1.5, "grad_norm": 5.470844745635986, "learning_rate": 1.7007340485601354e-05, "loss": 4.9907, "step": 7950 }, { "epoch": 1.5, "grad_norm": 5.827738285064697, "learning_rate": 1.7003576134010918e-05, "loss": 5.0219, "step": 7960 }, { "epoch": 1.5, "grad_norm": 4.909959316253662, "learning_rate": 1.6999811782420477e-05, "loss": 5.1492, "step": 7970 }, { "epoch": 1.5, "grad_norm": 4.0625433921813965, "learning_rate": 1.699604743083004e-05, "loss": 5.0034, "step": 7980 }, { "epoch": 1.5, "grad_norm": 8.92168140411377, "learning_rate": 1.6992283079239604e-05, "loss": 5.077, "step": 7990 }, { "epoch": 1.51, "grad_norm": 3.722633123397827, "learning_rate": 1.6988518727649163e-05, "loss": 5.0068, "step": 8000 }, { "epoch": 1.51, "grad_norm": 4.391185283660889, "learning_rate": 1.6984754376058727e-05, "loss": 5.1295, "step": 8010 }, { "epoch": 1.51, "grad_norm": 3.715665578842163, "learning_rate": 1.6980990024468286e-05, "loss": 5.008, "step": 8020 }, { "epoch": 1.51, "grad_norm": 5.831299781799316, "learning_rate": 1.697722567287785e-05, "loss": 4.966, "step": 8030 }, { "epoch": 1.51, "grad_norm": 4.479729652404785, "learning_rate": 1.697346132128741e-05, "loss": 5.0291, "step": 8040 }, { "epoch": 1.52, "grad_norm": 9.000201225280762, "learning_rate": 1.6969696969696972e-05, "loss": 4.9913, "step": 8050 }, { "epoch": 1.52, "grad_norm": 4.201887607574463, "learning_rate": 1.6965932618106532e-05, "loss": 5.0668, "step": 8060 }, { "epoch": 1.52, "grad_norm": 6.5595550537109375, "learning_rate": 1.6962168266516095e-05, "loss": 4.9879, "step": 8070 }, { "epoch": 1.52, "grad_norm": 5.78305721282959, "learning_rate": 1.6958403914925655e-05, "loss": 4.9436, "step": 8080 }, { "epoch": 1.52, "grad_norm": 3.9996652603149414, "learning_rate": 1.6954639563335218e-05, "loss": 4.8807, "step": 8090 }, { "epoch": 1.52, "grad_norm": 4.365677833557129, "learning_rate": 1.6950875211744778e-05, "loss": 5.0638, "step": 8100 }, { "epoch": 1.53, "grad_norm": 4.317311763763428, "learning_rate": 1.6947110860154338e-05, "loss": 5.0942, "step": 8110 }, { "epoch": 1.53, "grad_norm": 3.649364948272705, "learning_rate": 1.69433465085639e-05, "loss": 5.0228, "step": 8120 }, { "epoch": 1.53, "grad_norm": 
6.4775519371032715, "learning_rate": 1.693958215697346e-05, "loss": 5.0941, "step": 8130 }, { "epoch": 1.53, "grad_norm": 6.25124454498291, "learning_rate": 1.6935817805383024e-05, "loss": 4.9845, "step": 8140 }, { "epoch": 1.53, "grad_norm": 4.090969562530518, "learning_rate": 1.6932053453792584e-05, "loss": 4.9963, "step": 8150 }, { "epoch": 1.54, "grad_norm": 9.618407249450684, "learning_rate": 1.6928289102202147e-05, "loss": 4.9911, "step": 8160 }, { "epoch": 1.54, "grad_norm": 4.4269633293151855, "learning_rate": 1.692452475061171e-05, "loss": 4.9878, "step": 8170 }, { "epoch": 1.54, "grad_norm": 7.7458062171936035, "learning_rate": 1.692076039902127e-05, "loss": 5.0025, "step": 8180 }, { "epoch": 1.54, "grad_norm": 3.9767813682556152, "learning_rate": 1.6916996047430833e-05, "loss": 4.991, "step": 8190 }, { "epoch": 1.54, "grad_norm": 7.545022964477539, "learning_rate": 1.6913231695840393e-05, "loss": 4.9913, "step": 8200 }, { "epoch": 1.55, "grad_norm": 6.105132579803467, "learning_rate": 1.6909467344249956e-05, "loss": 4.9745, "step": 8210 }, { "epoch": 1.55, "grad_norm": 10.298683166503906, "learning_rate": 1.6905702992659516e-05, "loss": 5.0419, "step": 8220 }, { "epoch": 1.55, "grad_norm": 6.129251480102539, "learning_rate": 1.690193864106908e-05, "loss": 5.0936, "step": 8230 }, { "epoch": 1.55, "grad_norm": 3.5882744789123535, "learning_rate": 1.689817428947864e-05, "loss": 4.8374, "step": 8240 }, { "epoch": 1.55, "grad_norm": 3.834015369415283, "learning_rate": 1.68944099378882e-05, "loss": 5.1059, "step": 8250 }, { "epoch": 1.55, "grad_norm": 5.804141521453857, "learning_rate": 1.689064558629776e-05, "loss": 5.0226, "step": 8260 }, { "epoch": 1.56, "grad_norm": 4.296107769012451, "learning_rate": 1.6886881234707325e-05, "loss": 4.951, "step": 8270 }, { "epoch": 1.56, "grad_norm": 5.19776725769043, "learning_rate": 1.6883116883116884e-05, "loss": 4.9549, "step": 8280 }, { "epoch": 1.56, "grad_norm": 4.887362957000732, "learning_rate": 1.6879352531526444e-05, "loss": 4.9304, "step": 8290 }, { "epoch": 1.56, "grad_norm": 3.952012300491333, "learning_rate": 1.6875588179936007e-05, "loss": 5.0039, "step": 8300 }, { "epoch": 1.56, "grad_norm": 4.41887092590332, "learning_rate": 1.6871823828345567e-05, "loss": 4.7845, "step": 8310 }, { "epoch": 1.57, "grad_norm": 11.580818176269531, "learning_rate": 1.686805947675513e-05, "loss": 5.1581, "step": 8320 }, { "epoch": 1.57, "grad_norm": 4.609852313995361, "learning_rate": 1.686429512516469e-05, "loss": 4.9123, "step": 8330 }, { "epoch": 1.57, "grad_norm": 3.595414400100708, "learning_rate": 1.6860530773574253e-05, "loss": 4.875, "step": 8340 }, { "epoch": 1.57, "grad_norm": 5.0496931076049805, "learning_rate": 1.6856766421983813e-05, "loss": 5.0022, "step": 8350 }, { "epoch": 1.57, "grad_norm": 5.270349025726318, "learning_rate": 1.6853002070393376e-05, "loss": 4.8619, "step": 8360 }, { "epoch": 1.58, "grad_norm": 4.3038811683654785, "learning_rate": 1.684923771880294e-05, "loss": 4.8173, "step": 8370 }, { "epoch": 1.58, "grad_norm": 9.853190422058105, "learning_rate": 1.68454733672125e-05, "loss": 4.9342, "step": 8380 }, { "epoch": 1.58, "grad_norm": 4.847845077514648, "learning_rate": 1.6841709015622062e-05, "loss": 5.074, "step": 8390 }, { "epoch": 1.58, "grad_norm": 8.384522438049316, "learning_rate": 1.6837944664031622e-05, "loss": 4.8648, "step": 8400 }, { "epoch": 1.58, "grad_norm": 4.04774808883667, "learning_rate": 1.6834180312441185e-05, "loss": 5.0063, "step": 8410 }, { "epoch": 1.58, "grad_norm": 4.394888877868652, 
"learning_rate": 1.6830415960850745e-05, "loss": 4.8632, "step": 8420 }, { "epoch": 1.59, "grad_norm": 4.777827262878418, "learning_rate": 1.6826651609260308e-05, "loss": 4.9474, "step": 8430 }, { "epoch": 1.59, "grad_norm": 4.665745735168457, "learning_rate": 1.6822887257669868e-05, "loss": 5.0431, "step": 8440 }, { "epoch": 1.59, "grad_norm": 5.718853950500488, "learning_rate": 1.6819122906079427e-05, "loss": 4.8633, "step": 8450 }, { "epoch": 1.59, "grad_norm": 5.005476474761963, "learning_rate": 1.681535855448899e-05, "loss": 4.8521, "step": 8460 }, { "epoch": 1.59, "grad_norm": 3.3350417613983154, "learning_rate": 1.681159420289855e-05, "loss": 4.8333, "step": 8470 }, { "epoch": 1.6, "grad_norm": 5.856405258178711, "learning_rate": 1.6807829851308114e-05, "loss": 4.9934, "step": 8480 }, { "epoch": 1.6, "grad_norm": 4.821055889129639, "learning_rate": 1.6804065499717673e-05, "loss": 4.8683, "step": 8490 }, { "epoch": 1.6, "grad_norm": 6.559111595153809, "learning_rate": 1.6800301148127236e-05, "loss": 4.8924, "step": 8500 }, { "epoch": 1.6, "grad_norm": 5.451012134552002, "learning_rate": 1.6796536796536796e-05, "loss": 5.0124, "step": 8510 }, { "epoch": 1.6, "grad_norm": 4.38719367980957, "learning_rate": 1.679277244494636e-05, "loss": 4.8277, "step": 8520 }, { "epoch": 1.61, "grad_norm": 3.5944623947143555, "learning_rate": 1.678900809335592e-05, "loss": 4.9957, "step": 8530 }, { "epoch": 1.61, "grad_norm": 15.848986625671387, "learning_rate": 1.6785243741765482e-05, "loss": 4.9341, "step": 8540 }, { "epoch": 1.61, "grad_norm": 8.380572319030762, "learning_rate": 1.6781479390175045e-05, "loss": 4.936, "step": 8550 }, { "epoch": 1.61, "grad_norm": 6.376757621765137, "learning_rate": 1.6777715038584605e-05, "loss": 4.7159, "step": 8560 }, { "epoch": 1.61, "grad_norm": 5.770646572113037, "learning_rate": 1.677395068699417e-05, "loss": 4.8327, "step": 8570 }, { "epoch": 1.61, "grad_norm": 3.680101156234741, "learning_rate": 1.6770186335403728e-05, "loss": 4.9556, "step": 8580 }, { "epoch": 1.62, "grad_norm": 5.15360689163208, "learning_rate": 1.676642198381329e-05, "loss": 4.9723, "step": 8590 }, { "epoch": 1.62, "grad_norm": 3.9179909229278564, "learning_rate": 1.676265763222285e-05, "loss": 4.8428, "step": 8600 }, { "epoch": 1.62, "grad_norm": 3.9752557277679443, "learning_rate": 1.6758893280632414e-05, "loss": 4.6768, "step": 8610 }, { "epoch": 1.62, "grad_norm": 5.241940498352051, "learning_rate": 1.6755128929041974e-05, "loss": 4.8211, "step": 8620 }, { "epoch": 1.62, "grad_norm": 3.9683773517608643, "learning_rate": 1.6751364577451534e-05, "loss": 4.8608, "step": 8630 }, { "epoch": 1.63, "grad_norm": 3.7375926971435547, "learning_rate": 1.6747600225861097e-05, "loss": 4.8918, "step": 8640 }, { "epoch": 1.63, "grad_norm": 3.8603086471557617, "learning_rate": 1.6743835874270657e-05, "loss": 4.849, "step": 8650 }, { "epoch": 1.63, "grad_norm": 4.478582859039307, "learning_rate": 1.674007152268022e-05, "loss": 4.9671, "step": 8660 }, { "epoch": 1.63, "grad_norm": 8.064244270324707, "learning_rate": 1.673630717108978e-05, "loss": 4.8326, "step": 8670 }, { "epoch": 1.63, "grad_norm": 5.850414276123047, "learning_rate": 1.6732542819499343e-05, "loss": 4.7765, "step": 8680 }, { "epoch": 1.64, "grad_norm": 6.554373264312744, "learning_rate": 1.6728778467908903e-05, "loss": 4.9134, "step": 8690 }, { "epoch": 1.64, "grad_norm": 6.9197235107421875, "learning_rate": 1.6725014116318466e-05, "loss": 4.7998, "step": 8700 }, { "epoch": 1.64, "grad_norm": 4.355859756469727, "learning_rate": 
1.6721249764728025e-05, "loss": 4.8434, "step": 8710 }, { "epoch": 1.64, "grad_norm": 6.3713250160217285, "learning_rate": 1.671748541313759e-05, "loss": 4.9046, "step": 8720 }, { "epoch": 1.64, "grad_norm": 4.62460470199585, "learning_rate": 1.6713721061547152e-05, "loss": 4.7389, "step": 8730 }, { "epoch": 1.65, "grad_norm": 4.1460371017456055, "learning_rate": 1.670995670995671e-05, "loss": 4.9723, "step": 8740 }, { "epoch": 1.65, "grad_norm": 5.620097637176514, "learning_rate": 1.6706192358366275e-05, "loss": 4.8477, "step": 8750 }, { "epoch": 1.65, "grad_norm": 5.2599921226501465, "learning_rate": 1.6702428006775834e-05, "loss": 4.8072, "step": 8760 }, { "epoch": 1.65, "grad_norm": 5.580438613891602, "learning_rate": 1.6698663655185398e-05, "loss": 4.8149, "step": 8770 }, { "epoch": 1.65, "grad_norm": 12.685842514038086, "learning_rate": 1.6694899303594957e-05, "loss": 4.8802, "step": 8780 }, { "epoch": 1.65, "grad_norm": 8.145336151123047, "learning_rate": 1.669113495200452e-05, "loss": 4.8833, "step": 8790 }, { "epoch": 1.66, "grad_norm": 7.1701741218566895, "learning_rate": 1.668737060041408e-05, "loss": 5.0262, "step": 8800 }, { "epoch": 1.66, "grad_norm": 7.442540645599365, "learning_rate": 1.668360624882364e-05, "loss": 4.6509, "step": 8810 }, { "epoch": 1.66, "grad_norm": 4.74673318862915, "learning_rate": 1.6679841897233203e-05, "loss": 4.8674, "step": 8820 }, { "epoch": 1.66, "grad_norm": 4.696155071258545, "learning_rate": 1.6676077545642763e-05, "loss": 4.9311, "step": 8830 }, { "epoch": 1.66, "grad_norm": 3.6667866706848145, "learning_rate": 1.6672313194052326e-05, "loss": 4.7207, "step": 8840 }, { "epoch": 1.67, "grad_norm": 3.897566795349121, "learning_rate": 1.6668548842461886e-05, "loss": 4.6853, "step": 8850 }, { "epoch": 1.67, "grad_norm": 6.038201332092285, "learning_rate": 1.666478449087145e-05, "loss": 4.8375, "step": 8860 }, { "epoch": 1.67, "grad_norm": 5.887599945068359, "learning_rate": 1.666102013928101e-05, "loss": 4.7595, "step": 8870 }, { "epoch": 1.67, "grad_norm": 5.8411970138549805, "learning_rate": 1.6657255787690572e-05, "loss": 4.891, "step": 8880 }, { "epoch": 1.67, "grad_norm": 4.061485290527344, "learning_rate": 1.6653491436100132e-05, "loss": 4.6923, "step": 8890 }, { "epoch": 1.68, "grad_norm": 3.7704198360443115, "learning_rate": 1.6649727084509695e-05, "loss": 4.6174, "step": 8900 }, { "epoch": 1.68, "grad_norm": 4.734050273895264, "learning_rate": 1.6645962732919258e-05, "loss": 4.741, "step": 8910 }, { "epoch": 1.68, "grad_norm": 4.4391279220581055, "learning_rate": 1.6642198381328818e-05, "loss": 4.847, "step": 8920 }, { "epoch": 1.68, "grad_norm": 4.607929229736328, "learning_rate": 1.663843402973838e-05, "loss": 5.1521, "step": 8930 }, { "epoch": 1.68, "grad_norm": 3.34086275100708, "learning_rate": 1.663466967814794e-05, "loss": 4.7362, "step": 8940 }, { "epoch": 1.68, "grad_norm": 4.1590681076049805, "learning_rate": 1.6630905326557504e-05, "loss": 4.6424, "step": 8950 }, { "epoch": 1.69, "grad_norm": 8.469990730285645, "learning_rate": 1.6627140974967064e-05, "loss": 4.8489, "step": 8960 }, { "epoch": 1.69, "grad_norm": 8.145037651062012, "learning_rate": 1.6623376623376627e-05, "loss": 4.8331, "step": 8970 }, { "epoch": 1.69, "grad_norm": 4.614475727081299, "learning_rate": 1.6619612271786187e-05, "loss": 4.7718, "step": 8980 }, { "epoch": 1.69, "grad_norm": 8.78810977935791, "learning_rate": 1.6615847920195746e-05, "loss": 4.8933, "step": 8990 }, { "epoch": 1.69, "grad_norm": 6.89530611038208, "learning_rate": 1.661208356860531e-05, 
"loss": 4.7328, "step": 9000 }, { "epoch": 1.7, "grad_norm": 5.276695728302002, "learning_rate": 1.660831921701487e-05, "loss": 4.7656, "step": 9010 }, { "epoch": 1.7, "grad_norm": 4.822672367095947, "learning_rate": 1.6604554865424432e-05, "loss": 4.6905, "step": 9020 }, { "epoch": 1.7, "grad_norm": 7.6168084144592285, "learning_rate": 1.6600790513833992e-05, "loss": 4.8544, "step": 9030 }, { "epoch": 1.7, "grad_norm": 6.220376014709473, "learning_rate": 1.6597026162243555e-05, "loss": 4.7541, "step": 9040 }, { "epoch": 1.7, "grad_norm": 4.913834095001221, "learning_rate": 1.6593261810653115e-05, "loss": 4.8284, "step": 9050 }, { "epoch": 1.71, "grad_norm": 5.822172164916992, "learning_rate": 1.6589497459062678e-05, "loss": 4.765, "step": 9060 }, { "epoch": 1.71, "grad_norm": 4.819484710693359, "learning_rate": 1.6585733107472238e-05, "loss": 4.9274, "step": 9070 }, { "epoch": 1.71, "grad_norm": 4.6173996925354, "learning_rate": 1.65819687558818e-05, "loss": 4.6061, "step": 9080 }, { "epoch": 1.71, "grad_norm": 5.881588935852051, "learning_rate": 1.657820440429136e-05, "loss": 4.6803, "step": 9090 }, { "epoch": 1.71, "grad_norm": 5.687201023101807, "learning_rate": 1.6574440052700924e-05, "loss": 4.6524, "step": 9100 }, { "epoch": 1.71, "grad_norm": 10.45479679107666, "learning_rate": 1.6570675701110487e-05, "loss": 4.6798, "step": 9110 }, { "epoch": 1.72, "grad_norm": 4.342623710632324, "learning_rate": 1.6566911349520047e-05, "loss": 4.6135, "step": 9120 }, { "epoch": 1.72, "grad_norm": 7.318731307983398, "learning_rate": 1.656314699792961e-05, "loss": 4.779, "step": 9130 }, { "epoch": 1.72, "grad_norm": 6.194139003753662, "learning_rate": 1.655938264633917e-05, "loss": 4.6842, "step": 9140 }, { "epoch": 1.72, "grad_norm": 5.226294994354248, "learning_rate": 1.655561829474873e-05, "loss": 4.6383, "step": 9150 }, { "epoch": 1.72, "grad_norm": 5.978142738342285, "learning_rate": 1.6551853943158293e-05, "loss": 4.7826, "step": 9160 }, { "epoch": 1.73, "grad_norm": 3.7740187644958496, "learning_rate": 1.6548089591567853e-05, "loss": 4.7329, "step": 9170 }, { "epoch": 1.73, "grad_norm": 6.198735237121582, "learning_rate": 1.6544325239977416e-05, "loss": 4.8493, "step": 9180 }, { "epoch": 1.73, "grad_norm": 5.207520484924316, "learning_rate": 1.6540560888386976e-05, "loss": 4.794, "step": 9190 }, { "epoch": 1.73, "grad_norm": 4.9924798011779785, "learning_rate": 1.653679653679654e-05, "loss": 4.6006, "step": 9200 }, { "epoch": 1.73, "grad_norm": 7.021182060241699, "learning_rate": 1.65330321852061e-05, "loss": 4.6435, "step": 9210 }, { "epoch": 1.74, "grad_norm": 4.065104007720947, "learning_rate": 1.652926783361566e-05, "loss": 4.6089, "step": 9220 }, { "epoch": 1.74, "grad_norm": 4.948240756988525, "learning_rate": 1.652550348202522e-05, "loss": 4.8561, "step": 9230 }, { "epoch": 1.74, "grad_norm": 7.881089687347412, "learning_rate": 1.6521739130434785e-05, "loss": 4.7885, "step": 9240 }, { "epoch": 1.74, "grad_norm": 4.210996150970459, "learning_rate": 1.6517974778844344e-05, "loss": 4.8015, "step": 9250 }, { "epoch": 1.74, "grad_norm": 7.7194013595581055, "learning_rate": 1.6514210427253907e-05, "loss": 4.6426, "step": 9260 }, { "epoch": 1.74, "grad_norm": 5.098067283630371, "learning_rate": 1.6510446075663467e-05, "loss": 4.8345, "step": 9270 }, { "epoch": 1.75, "grad_norm": 11.078705787658691, "learning_rate": 1.650668172407303e-05, "loss": 4.7525, "step": 9280 }, { "epoch": 1.75, "grad_norm": 4.493772029876709, "learning_rate": 1.6502917372482594e-05, "loss": 4.8295, "step": 9290 }, { 
"epoch": 1.75, "grad_norm": 5.437169551849365, "learning_rate": 1.6499153020892153e-05, "loss": 4.4675, "step": 9300 }, { "epoch": 1.75, "grad_norm": 6.202925205230713, "learning_rate": 1.6495388669301716e-05, "loss": 4.7109, "step": 9310 }, { "epoch": 1.75, "grad_norm": 6.981991767883301, "learning_rate": 1.6491624317711273e-05, "loss": 4.6205, "step": 9320 }, { "epoch": 1.76, "grad_norm": 4.462118625640869, "learning_rate": 1.6487859966120836e-05, "loss": 4.5807, "step": 9330 }, { "epoch": 1.76, "grad_norm": 5.589308738708496, "learning_rate": 1.64840956145304e-05, "loss": 4.7013, "step": 9340 }, { "epoch": 1.76, "grad_norm": 3.8529932498931885, "learning_rate": 1.648033126293996e-05, "loss": 4.6595, "step": 9350 }, { "epoch": 1.76, "grad_norm": 4.4032721519470215, "learning_rate": 1.6476566911349522e-05, "loss": 4.6199, "step": 9360 }, { "epoch": 1.76, "grad_norm": 4.758117198944092, "learning_rate": 1.6472802559759082e-05, "loss": 4.7317, "step": 9370 }, { "epoch": 1.77, "grad_norm": 8.429734230041504, "learning_rate": 1.6469038208168645e-05, "loss": 4.688, "step": 9380 }, { "epoch": 1.77, "grad_norm": 3.6958677768707275, "learning_rate": 1.6465273856578205e-05, "loss": 4.6594, "step": 9390 }, { "epoch": 1.77, "grad_norm": 6.912812232971191, "learning_rate": 1.6461509504987768e-05, "loss": 4.6948, "step": 9400 }, { "epoch": 1.77, "grad_norm": 6.3366169929504395, "learning_rate": 1.6457745153397328e-05, "loss": 4.6253, "step": 9410 }, { "epoch": 1.77, "grad_norm": 5.728288173675537, "learning_rate": 1.645398080180689e-05, "loss": 4.7164, "step": 9420 }, { "epoch": 1.77, "grad_norm": 7.759985446929932, "learning_rate": 1.645021645021645e-05, "loss": 4.7024, "step": 9430 }, { "epoch": 1.78, "grad_norm": 6.019044876098633, "learning_rate": 1.6446452098626014e-05, "loss": 4.5483, "step": 9440 }, { "epoch": 1.78, "grad_norm": 9.748091697692871, "learning_rate": 1.6442687747035574e-05, "loss": 4.6254, "step": 9450 }, { "epoch": 1.78, "grad_norm": 3.768693685531616, "learning_rate": 1.6438923395445137e-05, "loss": 4.5522, "step": 9460 }, { "epoch": 1.78, "grad_norm": 5.204063892364502, "learning_rate": 1.64351590438547e-05, "loss": 4.6025, "step": 9470 }, { "epoch": 1.78, "grad_norm": 10.164170265197754, "learning_rate": 1.643139469226426e-05, "loss": 4.66, "step": 9480 }, { "epoch": 1.79, "grad_norm": 7.891330242156982, "learning_rate": 1.6427630340673823e-05, "loss": 4.585, "step": 9490 }, { "epoch": 1.79, "grad_norm": 4.016350746154785, "learning_rate": 1.642386598908338e-05, "loss": 4.5539, "step": 9500 }, { "epoch": 1.79, "grad_norm": 5.567968368530273, "learning_rate": 1.6420101637492942e-05, "loss": 4.612, "step": 9510 }, { "epoch": 1.79, "grad_norm": 5.928299903869629, "learning_rate": 1.6416337285902505e-05, "loss": 4.5307, "step": 9520 }, { "epoch": 1.79, "grad_norm": 4.476163387298584, "learning_rate": 1.6412572934312065e-05, "loss": 4.7564, "step": 9530 }, { "epoch": 1.8, "grad_norm": 5.472432613372803, "learning_rate": 1.640880858272163e-05, "loss": 4.4869, "step": 9540 }, { "epoch": 1.8, "grad_norm": 9.691415786743164, "learning_rate": 1.6405044231131188e-05, "loss": 4.6419, "step": 9550 }, { "epoch": 1.8, "grad_norm": 5.1662726402282715, "learning_rate": 1.640127987954075e-05, "loss": 4.5287, "step": 9560 }, { "epoch": 1.8, "grad_norm": 5.903685569763184, "learning_rate": 1.639751552795031e-05, "loss": 4.6224, "step": 9570 }, { "epoch": 1.8, "grad_norm": 5.737349987030029, "learning_rate": 1.6393751176359874e-05, "loss": 4.4734, "step": 9580 }, { "epoch": 1.81, "grad_norm": 
8.89427375793457, "learning_rate": 1.6389986824769434e-05, "loss": 4.7268, "step": 9590 }, { "epoch": 1.81, "grad_norm": 8.637029647827148, "learning_rate": 1.6386222473178997e-05, "loss": 4.7729, "step": 9600 }, { "epoch": 1.81, "grad_norm": 7.215028285980225, "learning_rate": 1.6382458121588557e-05, "loss": 4.6281, "step": 9610 }, { "epoch": 1.81, "grad_norm": 8.093809127807617, "learning_rate": 1.637869376999812e-05, "loss": 4.606, "step": 9620 }, { "epoch": 1.81, "grad_norm": 3.9345719814300537, "learning_rate": 1.637492941840768e-05, "loss": 4.6149, "step": 9630 }, { "epoch": 1.81, "grad_norm": 4.802034378051758, "learning_rate": 1.6371165066817243e-05, "loss": 4.6024, "step": 9640 }, { "epoch": 1.82, "grad_norm": 8.22869873046875, "learning_rate": 1.6367400715226803e-05, "loss": 4.6137, "step": 9650 }, { "epoch": 1.82, "grad_norm": 6.226015567779541, "learning_rate": 1.6363636363636366e-05, "loss": 4.6233, "step": 9660 }, { "epoch": 1.82, "grad_norm": 9.889335632324219, "learning_rate": 1.6359872012045926e-05, "loss": 4.6214, "step": 9670 }, { "epoch": 1.82, "grad_norm": 7.1177568435668945, "learning_rate": 1.6356107660455485e-05, "loss": 4.7114, "step": 9680 }, { "epoch": 1.82, "grad_norm": 5.070508003234863, "learning_rate": 1.635234330886505e-05, "loss": 4.681, "step": 9690 }, { "epoch": 1.83, "grad_norm": 4.495706081390381, "learning_rate": 1.6348578957274612e-05, "loss": 4.6281, "step": 9700 }, { "epoch": 1.83, "grad_norm": 9.562110900878906, "learning_rate": 1.634481460568417e-05, "loss": 4.5723, "step": 9710 }, { "epoch": 1.83, "grad_norm": 5.272618293762207, "learning_rate": 1.6341050254093735e-05, "loss": 4.7513, "step": 9720 }, { "epoch": 1.83, "grad_norm": 8.10490608215332, "learning_rate": 1.6337285902503294e-05, "loss": 4.6056, "step": 9730 }, { "epoch": 1.83, "grad_norm": 5.029525279998779, "learning_rate": 1.6333521550912858e-05, "loss": 4.6391, "step": 9740 }, { "epoch": 1.84, "grad_norm": 6.911810398101807, "learning_rate": 1.6329757199322417e-05, "loss": 4.5984, "step": 9750 }, { "epoch": 1.84, "grad_norm": 5.6609883308410645, "learning_rate": 1.632599284773198e-05, "loss": 4.6344, "step": 9760 }, { "epoch": 1.84, "grad_norm": 3.838470935821533, "learning_rate": 1.632222849614154e-05, "loss": 4.4365, "step": 9770 }, { "epoch": 1.84, "grad_norm": 3.9894893169403076, "learning_rate": 1.6318464144551103e-05, "loss": 4.4302, "step": 9780 }, { "epoch": 1.84, "grad_norm": 5.317237854003906, "learning_rate": 1.6314699792960663e-05, "loss": 4.5487, "step": 9790 }, { "epoch": 1.84, "grad_norm": 3.9029135704040527, "learning_rate": 1.6310935441370226e-05, "loss": 4.6154, "step": 9800 }, { "epoch": 1.85, "grad_norm": 5.0397186279296875, "learning_rate": 1.6307171089779786e-05, "loss": 4.5875, "step": 9810 }, { "epoch": 1.85, "grad_norm": 5.487625598907471, "learning_rate": 1.630340673818935e-05, "loss": 4.3151, "step": 9820 }, { "epoch": 1.85, "grad_norm": 9.342184066772461, "learning_rate": 1.629964238659891e-05, "loss": 4.5705, "step": 9830 }, { "epoch": 1.85, "grad_norm": 8.340239524841309, "learning_rate": 1.6295878035008472e-05, "loss": 4.6207, "step": 9840 }, { "epoch": 1.85, "grad_norm": 5.831496715545654, "learning_rate": 1.6292113683418032e-05, "loss": 4.6339, "step": 9850 }, { "epoch": 1.86, "grad_norm": 4.267518997192383, "learning_rate": 1.6288349331827592e-05, "loss": 4.3859, "step": 9860 }, { "epoch": 1.86, "grad_norm": 6.181102275848389, "learning_rate": 1.6284584980237155e-05, "loss": 4.6229, "step": 9870 }, { "epoch": 1.86, "grad_norm": 4.802160263061523, 
"learning_rate": 1.6280820628646715e-05, "loss": 4.4299, "step": 9880 }, { "epoch": 1.86, "grad_norm": 4.94935941696167, "learning_rate": 1.6277056277056278e-05, "loss": 4.5571, "step": 9890 }, { "epoch": 1.86, "grad_norm": 10.149169921875, "learning_rate": 1.627329192546584e-05, "loss": 4.5458, "step": 9900 }, { "epoch": 1.87, "grad_norm": 5.424384593963623, "learning_rate": 1.62695275738754e-05, "loss": 4.6236, "step": 9910 }, { "epoch": 1.87, "grad_norm": 5.745746612548828, "learning_rate": 1.6265763222284964e-05, "loss": 4.5403, "step": 9920 }, { "epoch": 1.87, "grad_norm": 5.986821174621582, "learning_rate": 1.6261998870694524e-05, "loss": 4.3354, "step": 9930 }, { "epoch": 1.87, "grad_norm": 6.1216020584106445, "learning_rate": 1.6258234519104087e-05, "loss": 4.5153, "step": 9940 }, { "epoch": 1.87, "grad_norm": 6.094784736633301, "learning_rate": 1.6254470167513647e-05, "loss": 4.7005, "step": 9950 }, { "epoch": 1.87, "grad_norm": 10.191990852355957, "learning_rate": 1.625070581592321e-05, "loss": 4.7124, "step": 9960 }, { "epoch": 1.88, "grad_norm": 4.917109489440918, "learning_rate": 1.624694146433277e-05, "loss": 4.765, "step": 9970 }, { "epoch": 1.88, "grad_norm": 5.449561595916748, "learning_rate": 1.6243177112742333e-05, "loss": 4.4194, "step": 9980 }, { "epoch": 1.88, "grad_norm": 4.597510814666748, "learning_rate": 1.6239412761151892e-05, "loss": 4.4183, "step": 9990 }, { "epoch": 1.88, "grad_norm": 4.581331253051758, "learning_rate": 1.6235648409561456e-05, "loss": 4.5327, "step": 10000 }, { "epoch": 1.88, "grad_norm": 4.181787967681885, "learning_rate": 1.6231884057971015e-05, "loss": 4.4171, "step": 10010 }, { "epoch": 1.89, "grad_norm": 4.417957782745361, "learning_rate": 1.6228119706380575e-05, "loss": 4.3575, "step": 10020 }, { "epoch": 1.89, "grad_norm": 6.901144504547119, "learning_rate": 1.6224355354790138e-05, "loss": 4.5871, "step": 10030 }, { "epoch": 1.89, "grad_norm": 7.759471893310547, "learning_rate": 1.6220591003199698e-05, "loss": 4.498, "step": 10040 }, { "epoch": 1.89, "grad_norm": 3.9077529907226562, "learning_rate": 1.621682665160926e-05, "loss": 4.5018, "step": 10050 }, { "epoch": 1.89, "grad_norm": 5.162599563598633, "learning_rate": 1.621306230001882e-05, "loss": 4.5336, "step": 10060 }, { "epoch": 1.9, "grad_norm": 5.279311656951904, "learning_rate": 1.6209297948428384e-05, "loss": 4.322, "step": 10070 }, { "epoch": 1.9, "grad_norm": 6.895501613616943, "learning_rate": 1.6205533596837947e-05, "loss": 4.4317, "step": 10080 }, { "epoch": 1.9, "grad_norm": 5.615440845489502, "learning_rate": 1.6201769245247507e-05, "loss": 4.4347, "step": 10090 }, { "epoch": 1.9, "grad_norm": 7.611416816711426, "learning_rate": 1.619800489365707e-05, "loss": 4.3973, "step": 10100 }, { "epoch": 1.9, "grad_norm": 6.3034796714782715, "learning_rate": 1.619424054206663e-05, "loss": 4.3546, "step": 10110 }, { "epoch": 1.9, "grad_norm": 5.475399017333984, "learning_rate": 1.6190476190476193e-05, "loss": 4.4942, "step": 10120 }, { "epoch": 1.91, "grad_norm": 5.7142815589904785, "learning_rate": 1.6186711838885753e-05, "loss": 4.2995, "step": 10130 }, { "epoch": 1.91, "grad_norm": 6.851151943206787, "learning_rate": 1.6182947487295316e-05, "loss": 4.1475, "step": 10140 }, { "epoch": 1.91, "grad_norm": 4.556617736816406, "learning_rate": 1.6179183135704876e-05, "loss": 4.2166, "step": 10150 }, { "epoch": 1.91, "grad_norm": 15.542479515075684, "learning_rate": 1.617541878411444e-05, "loss": 4.4629, "step": 10160 }, { "epoch": 1.91, "grad_norm": 6.260297775268555, 
"learning_rate": 1.6171654432524e-05, "loss": 4.269, "step": 10170 }, { "epoch": 1.92, "grad_norm": 3.8075075149536133, "learning_rate": 1.6167890080933562e-05, "loss": 4.4611, "step": 10180 }, { "epoch": 1.92, "grad_norm": 4.777757167816162, "learning_rate": 1.616412572934312e-05, "loss": 4.4366, "step": 10190 }, { "epoch": 1.92, "grad_norm": 7.765013217926025, "learning_rate": 1.616036137775268e-05, "loss": 4.4348, "step": 10200 }, { "epoch": 1.92, "grad_norm": 8.169259071350098, "learning_rate": 1.6156597026162245e-05, "loss": 4.4343, "step": 10210 }, { "epoch": 1.92, "grad_norm": 4.8991475105285645, "learning_rate": 1.6152832674571804e-05, "loss": 4.3347, "step": 10220 }, { "epoch": 1.93, "grad_norm": 6.67490291595459, "learning_rate": 1.6149068322981367e-05, "loss": 4.437, "step": 10230 }, { "epoch": 1.93, "grad_norm": 4.518898010253906, "learning_rate": 1.6145303971390927e-05, "loss": 4.4617, "step": 10240 }, { "epoch": 1.93, "grad_norm": 7.252262592315674, "learning_rate": 1.614153961980049e-05, "loss": 4.5347, "step": 10250 }, { "epoch": 1.93, "grad_norm": 5.818848609924316, "learning_rate": 1.6137775268210054e-05, "loss": 4.563, "step": 10260 }, { "epoch": 1.93, "grad_norm": 6.757011890411377, "learning_rate": 1.6134010916619613e-05, "loss": 4.5476, "step": 10270 }, { "epoch": 1.93, "grad_norm": 3.422853946685791, "learning_rate": 1.6130246565029176e-05, "loss": 4.4984, "step": 10280 }, { "epoch": 1.94, "grad_norm": 7.138296127319336, "learning_rate": 1.6126482213438736e-05, "loss": 4.5376, "step": 10290 }, { "epoch": 1.94, "grad_norm": 7.359692573547363, "learning_rate": 1.61227178618483e-05, "loss": 4.3073, "step": 10300 }, { "epoch": 1.94, "grad_norm": 6.206998825073242, "learning_rate": 1.611895351025786e-05, "loss": 4.4038, "step": 10310 }, { "epoch": 1.94, "grad_norm": 5.848674774169922, "learning_rate": 1.6115189158667422e-05, "loss": 4.4956, "step": 10320 }, { "epoch": 1.94, "grad_norm": 6.967643737792969, "learning_rate": 1.6111424807076982e-05, "loss": 4.3617, "step": 10330 }, { "epoch": 1.95, "grad_norm": 4.30903959274292, "learning_rate": 1.6107660455486545e-05, "loss": 4.3072, "step": 10340 }, { "epoch": 1.95, "grad_norm": 7.51648473739624, "learning_rate": 1.6103896103896105e-05, "loss": 4.4632, "step": 10350 }, { "epoch": 1.95, "grad_norm": 7.170875549316406, "learning_rate": 1.6100131752305668e-05, "loss": 4.2911, "step": 10360 }, { "epoch": 1.95, "grad_norm": 4.8094916343688965, "learning_rate": 1.6096367400715228e-05, "loss": 4.1807, "step": 10370 }, { "epoch": 1.95, "grad_norm": 5.280708312988281, "learning_rate": 1.6092603049124788e-05, "loss": 4.4612, "step": 10380 }, { "epoch": 1.96, "grad_norm": 3.9026429653167725, "learning_rate": 1.608883869753435e-05, "loss": 4.4509, "step": 10390 }, { "epoch": 1.96, "grad_norm": 7.888210296630859, "learning_rate": 1.608507434594391e-05, "loss": 4.1466, "step": 10400 }, { "epoch": 1.96, "grad_norm": 6.035211086273193, "learning_rate": 1.6081309994353474e-05, "loss": 4.3655, "step": 10410 }, { "epoch": 1.96, "grad_norm": 6.376752853393555, "learning_rate": 1.6077545642763034e-05, "loss": 4.2419, "step": 10420 }, { "epoch": 1.96, "grad_norm": 4.665284156799316, "learning_rate": 1.6073781291172597e-05, "loss": 4.3365, "step": 10430 }, { "epoch": 1.96, "grad_norm": 4.392868518829346, "learning_rate": 1.6070016939582156e-05, "loss": 4.3826, "step": 10440 }, { "epoch": 1.97, "grad_norm": 7.208968639373779, "learning_rate": 1.606625258799172e-05, "loss": 4.4768, "step": 10450 }, { "epoch": 1.97, "grad_norm": 5.607078552246094, 
"learning_rate": 1.6062488236401283e-05, "loss": 4.3916, "step": 10460 }, { "epoch": 1.97, "grad_norm": 4.433419704437256, "learning_rate": 1.6058723884810843e-05, "loss": 4.5029, "step": 10470 }, { "epoch": 1.97, "grad_norm": 4.120962142944336, "learning_rate": 1.6054959533220406e-05, "loss": 4.3394, "step": 10480 }, { "epoch": 1.97, "grad_norm": 6.94193172454834, "learning_rate": 1.6051195181629965e-05, "loss": 4.4265, "step": 10490 }, { "epoch": 1.98, "grad_norm": 7.037685871124268, "learning_rate": 1.604743083003953e-05, "loss": 4.3615, "step": 10500 }, { "epoch": 1.98, "grad_norm": 4.69428014755249, "learning_rate": 1.604366647844909e-05, "loss": 4.3038, "step": 10510 }, { "epoch": 1.98, "grad_norm": 10.49842357635498, "learning_rate": 1.603990212685865e-05, "loss": 4.2241, "step": 10520 }, { "epoch": 1.98, "grad_norm": 10.739736557006836, "learning_rate": 1.603613777526821e-05, "loss": 4.2324, "step": 10530 }, { "epoch": 1.98, "grad_norm": 7.016395568847656, "learning_rate": 1.603237342367777e-05, "loss": 4.287, "step": 10540 }, { "epoch": 1.99, "grad_norm": 13.377906799316406, "learning_rate": 1.6028609072087334e-05, "loss": 4.4657, "step": 10550 }, { "epoch": 1.99, "grad_norm": 4.685717582702637, "learning_rate": 1.6024844720496894e-05, "loss": 4.5813, "step": 10560 }, { "epoch": 1.99, "grad_norm": 6.391681671142578, "learning_rate": 1.6021080368906457e-05, "loss": 4.3325, "step": 10570 }, { "epoch": 1.99, "grad_norm": 4.717489719390869, "learning_rate": 1.6017316017316017e-05, "loss": 4.3122, "step": 10580 }, { "epoch": 1.99, "grad_norm": 10.848299980163574, "learning_rate": 1.601355166572558e-05, "loss": 4.2836, "step": 10590 }, { "epoch": 2.0, "grad_norm": 4.571922302246094, "learning_rate": 1.600978731413514e-05, "loss": 4.4368, "step": 10600 }, { "epoch": 2.0, "grad_norm": 4.63656759262085, "learning_rate": 1.6006022962544703e-05, "loss": 4.2516, "step": 10610 }, { "epoch": 2.0, "grad_norm": 5.74009370803833, "learning_rate": 1.6002258610954263e-05, "loss": 4.4299, "step": 10620 }, { "epoch": 2.0, "eval_accuracy": 0.7314666666666667, "eval_loss": 4.2552618980407715, "eval_runtime": 31.0721, "eval_samples_per_second": 241.374, "eval_steps_per_second": 30.188, "step": 10626 }, { "epoch": 2.0, "grad_norm": 5.579972267150879, "learning_rate": 1.5998494259363826e-05, "loss": 4.31, "step": 10630 }, { "epoch": 2.0, "grad_norm": 4.047071933746338, "learning_rate": 1.599472990777339e-05, "loss": 4.1163, "step": 10640 }, { "epoch": 2.0, "grad_norm": 4.846254825592041, "learning_rate": 1.599096555618295e-05, "loss": 4.1266, "step": 10650 }, { "epoch": 2.01, "grad_norm": 4.7568159103393555, "learning_rate": 1.5987201204592512e-05, "loss": 4.2925, "step": 10660 }, { "epoch": 2.01, "grad_norm": 5.211572647094727, "learning_rate": 1.5983436853002072e-05, "loss": 4.0036, "step": 10670 }, { "epoch": 2.01, "grad_norm": 18.539222717285156, "learning_rate": 1.5979672501411635e-05, "loss": 4.3448, "step": 10680 }, { "epoch": 2.01, "grad_norm": 3.799651861190796, "learning_rate": 1.5975908149821195e-05, "loss": 4.2838, "step": 10690 }, { "epoch": 2.01, "grad_norm": 4.080503463745117, "learning_rate": 1.5972143798230758e-05, "loss": 3.9882, "step": 10700 }, { "epoch": 2.02, "grad_norm": 9.573444366455078, "learning_rate": 1.5968379446640318e-05, "loss": 4.261, "step": 10710 }, { "epoch": 2.02, "grad_norm": 7.987376689910889, "learning_rate": 1.5964615095049877e-05, "loss": 4.0733, "step": 10720 }, { "epoch": 2.02, "grad_norm": 8.086185455322266, "learning_rate": 1.596085074345944e-05, "loss": 
4.2465, "step": 10730 }, { "epoch": 2.02, "grad_norm": 5.307953834533691, "learning_rate": 1.5957086391869e-05, "loss": 4.0953, "step": 10740 }, { "epoch": 2.02, "grad_norm": 4.630192279815674, "learning_rate": 1.5953322040278563e-05, "loss": 4.0914, "step": 10750 }, { "epoch": 2.03, "grad_norm": 6.072296619415283, "learning_rate": 1.5949557688688123e-05, "loss": 4.2974, "step": 10760 }, { "epoch": 2.03, "grad_norm": 6.996493816375732, "learning_rate": 1.5945793337097686e-05, "loss": 4.1912, "step": 10770 }, { "epoch": 2.03, "grad_norm": 4.269257068634033, "learning_rate": 1.5942028985507246e-05, "loss": 4.1206, "step": 10780 }, { "epoch": 2.03, "grad_norm": 5.2253098487854, "learning_rate": 1.593826463391681e-05, "loss": 4.0746, "step": 10790 }, { "epoch": 2.03, "grad_norm": 10.502296447753906, "learning_rate": 1.593450028232637e-05, "loss": 4.1572, "step": 10800 }, { "epoch": 2.03, "grad_norm": 5.908490180969238, "learning_rate": 1.5930735930735932e-05, "loss": 4.1969, "step": 10810 }, { "epoch": 2.04, "grad_norm": 7.656743049621582, "learning_rate": 1.5926971579145495e-05, "loss": 4.1952, "step": 10820 }, { "epoch": 2.04, "grad_norm": 5.4670939445495605, "learning_rate": 1.5923207227555055e-05, "loss": 4.0075, "step": 10830 }, { "epoch": 2.04, "grad_norm": 5.7026472091674805, "learning_rate": 1.5919442875964618e-05, "loss": 4.1768, "step": 10840 }, { "epoch": 2.04, "grad_norm": 5.495305061340332, "learning_rate": 1.5915678524374178e-05, "loss": 4.2635, "step": 10850 }, { "epoch": 2.04, "grad_norm": 4.939096927642822, "learning_rate": 1.591191417278374e-05, "loss": 4.0376, "step": 10860 }, { "epoch": 2.05, "grad_norm": 8.390722274780273, "learning_rate": 1.59081498211933e-05, "loss": 3.8992, "step": 10870 }, { "epoch": 2.05, "grad_norm": 4.46321439743042, "learning_rate": 1.5904385469602864e-05, "loss": 3.936, "step": 10880 }, { "epoch": 2.05, "grad_norm": 3.8374812602996826, "learning_rate": 1.5900621118012424e-05, "loss": 4.1501, "step": 10890 }, { "epoch": 2.05, "grad_norm": 4.85408878326416, "learning_rate": 1.5896856766421984e-05, "loss": 4.0827, "step": 10900 }, { "epoch": 2.05, "grad_norm": 5.240114688873291, "learning_rate": 1.5893092414831547e-05, "loss": 4.2292, "step": 10910 }, { "epoch": 2.06, "grad_norm": 6.479221820831299, "learning_rate": 1.5889328063241107e-05, "loss": 4.0247, "step": 10920 }, { "epoch": 2.06, "grad_norm": 4.75130558013916, "learning_rate": 1.588556371165067e-05, "loss": 4.1623, "step": 10930 }, { "epoch": 2.06, "grad_norm": 5.828037738800049, "learning_rate": 1.588179936006023e-05, "loss": 4.2303, "step": 10940 }, { "epoch": 2.06, "grad_norm": 4.340429782867432, "learning_rate": 1.5878035008469793e-05, "loss": 4.1326, "step": 10950 }, { "epoch": 2.06, "grad_norm": 4.118145942687988, "learning_rate": 1.5874270656879352e-05, "loss": 4.0783, "step": 10960 }, { "epoch": 2.06, "grad_norm": 9.45816707611084, "learning_rate": 1.5870506305288916e-05, "loss": 4.0551, "step": 10970 }, { "epoch": 2.07, "grad_norm": 4.479208946228027, "learning_rate": 1.5866741953698475e-05, "loss": 4.0872, "step": 10980 }, { "epoch": 2.07, "grad_norm": 14.254130363464355, "learning_rate": 1.586297760210804e-05, "loss": 4.0232, "step": 10990 }, { "epoch": 2.07, "grad_norm": 6.137701511383057, "learning_rate": 1.58592132505176e-05, "loss": 4.1143, "step": 11000 }, { "epoch": 2.07, "grad_norm": 5.000929355621338, "learning_rate": 1.585544889892716e-05, "loss": 4.0659, "step": 11010 }, { "epoch": 2.07, "grad_norm": 4.034130096435547, "learning_rate": 1.5851684547336725e-05, "loss": 
4.0946, "step": 11020 }, { "epoch": 2.08, "grad_norm": 6.8645501136779785, "learning_rate": 1.5847920195746284e-05, "loss": 4.0024, "step": 11030 }, { "epoch": 2.08, "grad_norm": 6.4136786460876465, "learning_rate": 1.5844155844155847e-05, "loss": 4.0736, "step": 11040 }, { "epoch": 2.08, "grad_norm": 4.403553485870361, "learning_rate": 1.5840391492565407e-05, "loss": 3.7255, "step": 11050 }, { "epoch": 2.08, "grad_norm": 5.576516628265381, "learning_rate": 1.583662714097497e-05, "loss": 3.9675, "step": 11060 }, { "epoch": 2.08, "grad_norm": 7.631837844848633, "learning_rate": 1.583286278938453e-05, "loss": 4.0973, "step": 11070 }, { "epoch": 2.09, "grad_norm": 6.222148418426514, "learning_rate": 1.582909843779409e-05, "loss": 4.025, "step": 11080 }, { "epoch": 2.09, "grad_norm": 9.054905891418457, "learning_rate": 1.5825334086203653e-05, "loss": 4.2141, "step": 11090 }, { "epoch": 2.09, "grad_norm": 4.380316734313965, "learning_rate": 1.5821569734613213e-05, "loss": 4.0489, "step": 11100 }, { "epoch": 2.09, "grad_norm": 5.465594291687012, "learning_rate": 1.5817805383022776e-05, "loss": 4.0556, "step": 11110 }, { "epoch": 2.09, "grad_norm": 4.7221832275390625, "learning_rate": 1.5814041031432336e-05, "loss": 3.9977, "step": 11120 }, { "epoch": 2.09, "grad_norm": 4.684112548828125, "learning_rate": 1.58102766798419e-05, "loss": 4.1111, "step": 11130 }, { "epoch": 2.1, "grad_norm": 7.422446250915527, "learning_rate": 1.580651232825146e-05, "loss": 4.0975, "step": 11140 }, { "epoch": 2.1, "grad_norm": 4.457563877105713, "learning_rate": 1.5802747976661022e-05, "loss": 4.0529, "step": 11150 }, { "epoch": 2.1, "grad_norm": 4.79970645904541, "learning_rate": 1.579898362507058e-05, "loss": 3.9219, "step": 11160 }, { "epoch": 2.1, "grad_norm": 5.312371730804443, "learning_rate": 1.5795219273480145e-05, "loss": 4.2685, "step": 11170 }, { "epoch": 2.1, "grad_norm": 5.733940124511719, "learning_rate": 1.5791454921889705e-05, "loss": 4.0419, "step": 11180 }, { "epoch": 2.11, "grad_norm": 4.6637139320373535, "learning_rate": 1.5787690570299268e-05, "loss": 3.983, "step": 11190 }, { "epoch": 2.11, "grad_norm": 8.839118003845215, "learning_rate": 1.578392621870883e-05, "loss": 4.0758, "step": 11200 }, { "epoch": 2.11, "grad_norm": 4.420551300048828, "learning_rate": 1.578016186711839e-05, "loss": 3.958, "step": 11210 }, { "epoch": 2.11, "grad_norm": 6.134811878204346, "learning_rate": 1.5776397515527954e-05, "loss": 4.172, "step": 11220 }, { "epoch": 2.11, "grad_norm": 6.6265869140625, "learning_rate": 1.5772633163937514e-05, "loss": 4.0555, "step": 11230 }, { "epoch": 2.12, "grad_norm": 7.439310073852539, "learning_rate": 1.5768868812347073e-05, "loss": 3.8814, "step": 11240 }, { "epoch": 2.12, "grad_norm": 8.763142585754395, "learning_rate": 1.5765104460756636e-05, "loss": 3.977, "step": 11250 }, { "epoch": 2.12, "grad_norm": 4.134701728820801, "learning_rate": 1.5761340109166196e-05, "loss": 3.9433, "step": 11260 }, { "epoch": 2.12, "grad_norm": 9.159955978393555, "learning_rate": 1.575757575757576e-05, "loss": 4.2028, "step": 11270 }, { "epoch": 2.12, "grad_norm": 6.593533515930176, "learning_rate": 1.575381140598532e-05, "loss": 3.931, "step": 11280 }, { "epoch": 2.12, "grad_norm": 5.5972580909729, "learning_rate": 1.5750047054394882e-05, "loss": 3.9788, "step": 11290 }, { "epoch": 2.13, "grad_norm": 11.934804916381836, "learning_rate": 1.5746282702804442e-05, "loss": 4.1172, "step": 11300 }, { "epoch": 2.13, "grad_norm": 8.514456748962402, "learning_rate": 1.5742518351214005e-05, "loss": 4.0525, 
"step": 11310 }, { "epoch": 2.13, "grad_norm": 7.898783206939697, "learning_rate": 1.5738753999623565e-05, "loss": 4.0748, "step": 11320 }, { "epoch": 2.13, "grad_norm": 5.486702919006348, "learning_rate": 1.5734989648033128e-05, "loss": 3.9374, "step": 11330 }, { "epoch": 2.13, "grad_norm": 7.614177227020264, "learning_rate": 1.5731225296442688e-05, "loss": 3.9349, "step": 11340 }, { "epoch": 2.14, "grad_norm": 4.120919227600098, "learning_rate": 1.572746094485225e-05, "loss": 3.9833, "step": 11350 }, { "epoch": 2.14, "grad_norm": 10.874237060546875, "learning_rate": 1.572369659326181e-05, "loss": 4.0419, "step": 11360 }, { "epoch": 2.14, "grad_norm": 6.191878318786621, "learning_rate": 1.5719932241671374e-05, "loss": 4.1145, "step": 11370 }, { "epoch": 2.14, "grad_norm": 5.437715530395508, "learning_rate": 1.5716167890080937e-05, "loss": 3.942, "step": 11380 }, { "epoch": 2.14, "grad_norm": 5.2392659187316895, "learning_rate": 1.5712403538490497e-05, "loss": 3.991, "step": 11390 }, { "epoch": 2.15, "grad_norm": 4.5837249755859375, "learning_rate": 1.570863918690006e-05, "loss": 3.799, "step": 11400 }, { "epoch": 2.15, "grad_norm": 5.173925399780273, "learning_rate": 1.5704874835309616e-05, "loss": 3.9474, "step": 11410 }, { "epoch": 2.15, "grad_norm": 3.99222731590271, "learning_rate": 1.570111048371918e-05, "loss": 3.8807, "step": 11420 }, { "epoch": 2.15, "grad_norm": 4.407945156097412, "learning_rate": 1.5697346132128743e-05, "loss": 3.6761, "step": 11430 }, { "epoch": 2.15, "grad_norm": 7.639770984649658, "learning_rate": 1.5693581780538303e-05, "loss": 4.0302, "step": 11440 }, { "epoch": 2.16, "grad_norm": 4.561074733734131, "learning_rate": 1.5689817428947866e-05, "loss": 3.9477, "step": 11450 }, { "epoch": 2.16, "grad_norm": 4.958947658538818, "learning_rate": 1.5686053077357425e-05, "loss": 3.8021, "step": 11460 }, { "epoch": 2.16, "grad_norm": 6.527427673339844, "learning_rate": 1.568228872576699e-05, "loss": 3.864, "step": 11470 }, { "epoch": 2.16, "grad_norm": 5.908439636230469, "learning_rate": 1.567852437417655e-05, "loss": 3.8795, "step": 11480 }, { "epoch": 2.16, "grad_norm": 6.4105329513549805, "learning_rate": 1.567476002258611e-05, "loss": 3.8993, "step": 11490 }, { "epoch": 2.16, "grad_norm": 4.569242000579834, "learning_rate": 1.567099567099567e-05, "loss": 3.9079, "step": 11500 }, { "epoch": 2.17, "grad_norm": 6.731176853179932, "learning_rate": 1.5667231319405234e-05, "loss": 3.9274, "step": 11510 }, { "epoch": 2.17, "grad_norm": 5.803924083709717, "learning_rate": 1.5663466967814794e-05, "loss": 3.8492, "step": 11520 }, { "epoch": 2.17, "grad_norm": 4.805103302001953, "learning_rate": 1.5659702616224357e-05, "loss": 3.8872, "step": 11530 }, { "epoch": 2.17, "grad_norm": 6.442079067230225, "learning_rate": 1.5655938264633917e-05, "loss": 3.6923, "step": 11540 }, { "epoch": 2.17, "grad_norm": 5.588798999786377, "learning_rate": 1.565217391304348e-05, "loss": 3.826, "step": 11550 }, { "epoch": 2.18, "grad_norm": 6.22387170791626, "learning_rate": 1.5648409561453043e-05, "loss": 3.8219, "step": 11560 }, { "epoch": 2.18, "grad_norm": 5.797980308532715, "learning_rate": 1.5644645209862603e-05, "loss": 3.8358, "step": 11570 }, { "epoch": 2.18, "grad_norm": 4.398248195648193, "learning_rate": 1.5640880858272166e-05, "loss": 3.8283, "step": 11580 }, { "epoch": 2.18, "grad_norm": 8.31688404083252, "learning_rate": 1.5637116506681723e-05, "loss": 4.112, "step": 11590 }, { "epoch": 2.18, "grad_norm": 6.595235347747803, "learning_rate": 1.5633352155091286e-05, "loss": 3.9741, 
"step": 11600 }, { "epoch": 2.19, "grad_norm": 6.283722400665283, "learning_rate": 1.562958780350085e-05, "loss": 3.8255, "step": 11610 }, { "epoch": 2.19, "grad_norm": 4.959485054016113, "learning_rate": 1.562582345191041e-05, "loss": 3.9458, "step": 11620 }, { "epoch": 2.19, "grad_norm": 14.348812103271484, "learning_rate": 1.5622059100319972e-05, "loss": 3.8888, "step": 11630 }, { "epoch": 2.19, "grad_norm": 4.12087869644165, "learning_rate": 1.5618294748729532e-05, "loss": 4.1594, "step": 11640 }, { "epoch": 2.19, "grad_norm": 12.193377494812012, "learning_rate": 1.5614530397139095e-05, "loss": 3.8591, "step": 11650 }, { "epoch": 2.19, "grad_norm": 7.308841705322266, "learning_rate": 1.5610766045548655e-05, "loss": 3.8784, "step": 11660 }, { "epoch": 2.2, "grad_norm": 6.743547439575195, "learning_rate": 1.5607001693958218e-05, "loss": 3.8188, "step": 11670 }, { "epoch": 2.2, "grad_norm": 5.421703815460205, "learning_rate": 1.5603237342367778e-05, "loss": 3.7861, "step": 11680 }, { "epoch": 2.2, "grad_norm": 4.1648454666137695, "learning_rate": 1.559947299077734e-05, "loss": 3.983, "step": 11690 }, { "epoch": 2.2, "grad_norm": 5.27364444732666, "learning_rate": 1.55957086391869e-05, "loss": 3.9612, "step": 11700 }, { "epoch": 2.2, "grad_norm": 4.577748775482178, "learning_rate": 1.5591944287596464e-05, "loss": 3.7457, "step": 11710 }, { "epoch": 2.21, "grad_norm": 4.922910690307617, "learning_rate": 1.5588179936006023e-05, "loss": 3.7754, "step": 11720 }, { "epoch": 2.21, "grad_norm": 7.118587493896484, "learning_rate": 1.5584415584415587e-05, "loss": 3.7868, "step": 11730 }, { "epoch": 2.21, "grad_norm": 4.804932594299316, "learning_rate": 1.5580651232825146e-05, "loss": 3.6839, "step": 11740 }, { "epoch": 2.21, "grad_norm": 4.892269134521484, "learning_rate": 1.557688688123471e-05, "loss": 3.7721, "step": 11750 }, { "epoch": 2.21, "grad_norm": 5.398070812225342, "learning_rate": 1.557312252964427e-05, "loss": 3.9635, "step": 11760 }, { "epoch": 2.22, "grad_norm": 7.159237384796143, "learning_rate": 1.556935817805383e-05, "loss": 3.8856, "step": 11770 }, { "epoch": 2.22, "grad_norm": 5.062245845794678, "learning_rate": 1.5565593826463392e-05, "loss": 3.8015, "step": 11780 }, { "epoch": 2.22, "grad_norm": 7.825982570648193, "learning_rate": 1.5561829474872955e-05, "loss": 3.7507, "step": 11790 }, { "epoch": 2.22, "grad_norm": 6.594232082366943, "learning_rate": 1.5558065123282515e-05, "loss": 3.8327, "step": 11800 }, { "epoch": 2.22, "grad_norm": 4.983365058898926, "learning_rate": 1.5554300771692078e-05, "loss": 3.9154, "step": 11810 }, { "epoch": 2.22, "grad_norm": 4.425704479217529, "learning_rate": 1.5550536420101638e-05, "loss": 3.8784, "step": 11820 }, { "epoch": 2.23, "grad_norm": 5.853933811187744, "learning_rate": 1.55467720685112e-05, "loss": 3.8025, "step": 11830 }, { "epoch": 2.23, "grad_norm": 5.639091968536377, "learning_rate": 1.554300771692076e-05, "loss": 3.9327, "step": 11840 }, { "epoch": 2.23, "grad_norm": 9.400067329406738, "learning_rate": 1.5539243365330324e-05, "loss": 3.7902, "step": 11850 }, { "epoch": 2.23, "grad_norm": 4.905858039855957, "learning_rate": 1.5535479013739884e-05, "loss": 3.8139, "step": 11860 }, { "epoch": 2.23, "grad_norm": 7.195109844207764, "learning_rate": 1.5531714662149447e-05, "loss": 3.8113, "step": 11870 }, { "epoch": 2.24, "grad_norm": 4.775333404541016, "learning_rate": 1.5527950310559007e-05, "loss": 3.7297, "step": 11880 }, { "epoch": 2.24, "grad_norm": 8.695398330688477, "learning_rate": 1.552418595896857e-05, "loss": 3.7862, 
"step": 11890 }, { "epoch": 2.24, "grad_norm": 3.9781153202056885, "learning_rate": 1.552042160737813e-05, "loss": 3.8104, "step": 11900 }, { "epoch": 2.24, "grad_norm": 5.6755475997924805, "learning_rate": 1.5516657255787693e-05, "loss": 3.8189, "step": 11910 }, { "epoch": 2.24, "grad_norm": 6.011416435241699, "learning_rate": 1.5512892904197253e-05, "loss": 3.9329, "step": 11920 }, { "epoch": 2.25, "grad_norm": 5.765122890472412, "learning_rate": 1.5509128552606816e-05, "loss": 3.8653, "step": 11930 }, { "epoch": 2.25, "grad_norm": 10.834667205810547, "learning_rate": 1.5505364201016376e-05, "loss": 3.8024, "step": 11940 }, { "epoch": 2.25, "grad_norm": 5.2431230545043945, "learning_rate": 1.5501599849425935e-05, "loss": 3.8745, "step": 11950 }, { "epoch": 2.25, "grad_norm": 5.293792247772217, "learning_rate": 1.54978354978355e-05, "loss": 3.776, "step": 11960 }, { "epoch": 2.25, "grad_norm": 9.133255004882812, "learning_rate": 1.5494071146245058e-05, "loss": 3.9118, "step": 11970 }, { "epoch": 2.25, "grad_norm": 4.385495185852051, "learning_rate": 1.549030679465462e-05, "loss": 3.7856, "step": 11980 }, { "epoch": 2.26, "grad_norm": 6.026075839996338, "learning_rate": 1.5486542443064185e-05, "loss": 3.7819, "step": 11990 }, { "epoch": 2.26, "grad_norm": 4.667020320892334, "learning_rate": 1.5482778091473744e-05, "loss": 3.8542, "step": 12000 }, { "epoch": 2.26, "grad_norm": 6.475196361541748, "learning_rate": 1.5479013739883307e-05, "loss": 3.6043, "step": 12010 }, { "epoch": 2.26, "grad_norm": 6.754886150360107, "learning_rate": 1.5475249388292867e-05, "loss": 3.8147, "step": 12020 }, { "epoch": 2.26, "grad_norm": 7.7133002281188965, "learning_rate": 1.547148503670243e-05, "loss": 3.7685, "step": 12030 }, { "epoch": 2.27, "grad_norm": 14.9595308303833, "learning_rate": 1.546772068511199e-05, "loss": 3.9689, "step": 12040 }, { "epoch": 2.27, "grad_norm": 5.2849812507629395, "learning_rate": 1.5463956333521553e-05, "loss": 3.8254, "step": 12050 }, { "epoch": 2.27, "grad_norm": 7.486852169036865, "learning_rate": 1.5460191981931113e-05, "loss": 3.8118, "step": 12060 }, { "epoch": 2.27, "grad_norm": 4.797131538391113, "learning_rate": 1.5456427630340676e-05, "loss": 3.8703, "step": 12070 }, { "epoch": 2.27, "grad_norm": 5.599274158477783, "learning_rate": 1.5452663278750236e-05, "loss": 3.8456, "step": 12080 }, { "epoch": 2.28, "grad_norm": 4.429075241088867, "learning_rate": 1.54488989271598e-05, "loss": 3.8886, "step": 12090 }, { "epoch": 2.28, "grad_norm": 6.5798563957214355, "learning_rate": 1.544513457556936e-05, "loss": 3.6841, "step": 12100 }, { "epoch": 2.28, "grad_norm": 4.577319145202637, "learning_rate": 1.544137022397892e-05, "loss": 3.6288, "step": 12110 }, { "epoch": 2.28, "grad_norm": 6.879914283752441, "learning_rate": 1.5437605872388482e-05, "loss": 3.8503, "step": 12120 }, { "epoch": 2.28, "grad_norm": 6.99090576171875, "learning_rate": 1.543384152079804e-05, "loss": 3.8953, "step": 12130 }, { "epoch": 2.28, "grad_norm": 5.0394368171691895, "learning_rate": 1.5430077169207605e-05, "loss": 3.82, "step": 12140 }, { "epoch": 2.29, "grad_norm": 13.344022750854492, "learning_rate": 1.5426312817617165e-05, "loss": 3.6878, "step": 12150 }, { "epoch": 2.29, "grad_norm": 16.516658782958984, "learning_rate": 1.5422548466026728e-05, "loss": 3.7845, "step": 12160 }, { "epoch": 2.29, "grad_norm": 6.867408752441406, "learning_rate": 1.541878411443629e-05, "loss": 3.7413, "step": 12170 }, { "epoch": 2.29, "grad_norm": 4.87980842590332, "learning_rate": 1.541501976284585e-05, "loss": 
3.8253, "step": 12180 }, { "epoch": 2.29, "grad_norm": 7.078337669372559, "learning_rate": 1.5411255411255414e-05, "loss": 3.8031, "step": 12190 }, { "epoch": 2.3, "grad_norm": 7.2710747718811035, "learning_rate": 1.5407491059664974e-05, "loss": 3.6578, "step": 12200 }, { "epoch": 2.3, "grad_norm": 7.008667945861816, "learning_rate": 1.5403726708074537e-05, "loss": 3.7918, "step": 12210 }, { "epoch": 2.3, "grad_norm": 6.130124568939209, "learning_rate": 1.5399962356484096e-05, "loss": 3.7822, "step": 12220 }, { "epoch": 2.3, "grad_norm": 6.417569160461426, "learning_rate": 1.539619800489366e-05, "loss": 3.6929, "step": 12230 }, { "epoch": 2.3, "grad_norm": 5.228917121887207, "learning_rate": 1.539243365330322e-05, "loss": 3.6124, "step": 12240 }, { "epoch": 2.31, "grad_norm": 11.106487274169922, "learning_rate": 1.5388669301712783e-05, "loss": 3.6337, "step": 12250 }, { "epoch": 2.31, "grad_norm": 6.127734661102295, "learning_rate": 1.5384904950122342e-05, "loss": 3.6257, "step": 12260 }, { "epoch": 2.31, "grad_norm": 10.052376747131348, "learning_rate": 1.5381140598531905e-05, "loss": 3.8573, "step": 12270 }, { "epoch": 2.31, "grad_norm": 5.995655536651611, "learning_rate": 1.5377376246941465e-05, "loss": 3.6298, "step": 12280 }, { "epoch": 2.31, "grad_norm": 8.142227172851562, "learning_rate": 1.5373611895351025e-05, "loss": 3.7154, "step": 12290 }, { "epoch": 2.32, "grad_norm": 7.869065284729004, "learning_rate": 1.5369847543760588e-05, "loss": 3.8026, "step": 12300 }, { "epoch": 2.32, "grad_norm": 6.950026512145996, "learning_rate": 1.5366083192170148e-05, "loss": 3.7222, "step": 12310 }, { "epoch": 2.32, "grad_norm": 7.188232421875, "learning_rate": 1.536231884057971e-05, "loss": 3.8239, "step": 12320 }, { "epoch": 2.32, "grad_norm": 10.76546859741211, "learning_rate": 1.535855448898927e-05, "loss": 3.6733, "step": 12330 }, { "epoch": 2.32, "grad_norm": 7.061885356903076, "learning_rate": 1.5354790137398834e-05, "loss": 3.8812, "step": 12340 }, { "epoch": 2.32, "grad_norm": 9.965605735778809, "learning_rate": 1.5351025785808397e-05, "loss": 3.5866, "step": 12350 }, { "epoch": 2.33, "grad_norm": 11.400392532348633, "learning_rate": 1.5347261434217957e-05, "loss": 3.7223, "step": 12360 }, { "epoch": 2.33, "grad_norm": 8.316784858703613, "learning_rate": 1.534349708262752e-05, "loss": 3.8265, "step": 12370 }, { "epoch": 2.33, "grad_norm": 5.970481872558594, "learning_rate": 1.533973273103708e-05, "loss": 3.7859, "step": 12380 }, { "epoch": 2.33, "grad_norm": 14.493403434753418, "learning_rate": 1.5335968379446643e-05, "loss": 3.4946, "step": 12390 }, { "epoch": 2.33, "grad_norm": 4.9313435554504395, "learning_rate": 1.5332204027856203e-05, "loss": 3.4934, "step": 12400 }, { "epoch": 2.34, "grad_norm": 8.869562149047852, "learning_rate": 1.5328439676265766e-05, "loss": 3.6183, "step": 12410 }, { "epoch": 2.34, "grad_norm": 6.632718563079834, "learning_rate": 1.5324675324675326e-05, "loss": 3.6429, "step": 12420 }, { "epoch": 2.34, "grad_norm": 5.525375843048096, "learning_rate": 1.532091097308489e-05, "loss": 3.7848, "step": 12430 }, { "epoch": 2.34, "grad_norm": 6.012401580810547, "learning_rate": 1.531714662149445e-05, "loss": 3.5354, "step": 12440 }, { "epoch": 2.34, "grad_norm": 5.694854736328125, "learning_rate": 1.5313382269904012e-05, "loss": 3.7598, "step": 12450 }, { "epoch": 2.35, "grad_norm": 5.683794021606445, "learning_rate": 1.530961791831357e-05, "loss": 3.6107, "step": 12460 }, { "epoch": 2.35, "grad_norm": 6.75642204284668, "learning_rate": 1.530585356672313e-05, "loss": 
3.6036, "step": 12470 }, { "epoch": 2.35, "grad_norm": 5.987377166748047, "learning_rate": 1.5302089215132694e-05, "loss": 3.6919, "step": 12480 }, { "epoch": 2.35, "grad_norm": 6.574718475341797, "learning_rate": 1.5298324863542254e-05, "loss": 3.5969, "step": 12490 }, { "epoch": 2.35, "grad_norm": 9.196330070495605, "learning_rate": 1.5294560511951817e-05, "loss": 3.6928, "step": 12500 }, { "epoch": 2.35, "grad_norm": 4.448946952819824, "learning_rate": 1.5290796160361377e-05, "loss": 3.6838, "step": 12510 }, { "epoch": 2.36, "grad_norm": 6.322027206420898, "learning_rate": 1.528703180877094e-05, "loss": 3.7446, "step": 12520 }, { "epoch": 2.36, "grad_norm": 5.5545654296875, "learning_rate": 1.5283267457180503e-05, "loss": 3.9824, "step": 12530 }, { "epoch": 2.36, "grad_norm": 5.138641834259033, "learning_rate": 1.5279503105590063e-05, "loss": 3.6081, "step": 12540 }, { "epoch": 2.36, "grad_norm": 8.211402893066406, "learning_rate": 1.5275738753999626e-05, "loss": 3.5954, "step": 12550 }, { "epoch": 2.36, "grad_norm": 7.986123085021973, "learning_rate": 1.5271974402409186e-05, "loss": 3.5452, "step": 12560 }, { "epoch": 2.37, "grad_norm": 5.923511505126953, "learning_rate": 1.526821005081875e-05, "loss": 3.5639, "step": 12570 }, { "epoch": 2.37, "grad_norm": 8.377513885498047, "learning_rate": 1.526444569922831e-05, "loss": 3.8294, "step": 12580 }, { "epoch": 2.37, "grad_norm": 5.032309055328369, "learning_rate": 1.5260681347637872e-05, "loss": 3.4091, "step": 12590 }, { "epoch": 2.37, "grad_norm": 9.507458686828613, "learning_rate": 1.5256916996047434e-05, "loss": 3.6918, "step": 12600 }, { "epoch": 2.37, "grad_norm": 5.7776875495910645, "learning_rate": 1.5253152644456995e-05, "loss": 3.7447, "step": 12610 }, { "epoch": 2.38, "grad_norm": 7.698153495788574, "learning_rate": 1.5249388292866557e-05, "loss": 3.5927, "step": 12620 }, { "epoch": 2.38, "grad_norm": 11.579968452453613, "learning_rate": 1.5245623941276115e-05, "loss": 3.8513, "step": 12630 }, { "epoch": 2.38, "grad_norm": 7.120565414428711, "learning_rate": 1.5241859589685676e-05, "loss": 3.7179, "step": 12640 }, { "epoch": 2.38, "grad_norm": 5.725169658660889, "learning_rate": 1.523809523809524e-05, "loss": 3.4282, "step": 12650 }, { "epoch": 2.38, "grad_norm": 10.428451538085938, "learning_rate": 1.52343308865048e-05, "loss": 3.5137, "step": 12660 }, { "epoch": 2.38, "grad_norm": 5.810141563415527, "learning_rate": 1.5230566534914362e-05, "loss": 3.4959, "step": 12670 }, { "epoch": 2.39, "grad_norm": 9.445448875427246, "learning_rate": 1.5226802183323924e-05, "loss": 3.467, "step": 12680 }, { "epoch": 2.39, "grad_norm": 4.816313743591309, "learning_rate": 1.5223037831733485e-05, "loss": 3.6695, "step": 12690 }, { "epoch": 2.39, "grad_norm": 12.151119232177734, "learning_rate": 1.5219273480143047e-05, "loss": 3.6408, "step": 12700 }, { "epoch": 2.39, "grad_norm": 4.754378318786621, "learning_rate": 1.5215509128552608e-05, "loss": 3.7007, "step": 12710 }, { "epoch": 2.39, "grad_norm": 8.035955429077148, "learning_rate": 1.521174477696217e-05, "loss": 3.5082, "step": 12720 }, { "epoch": 2.4, "grad_norm": 8.561524391174316, "learning_rate": 1.5207980425371731e-05, "loss": 3.4118, "step": 12730 }, { "epoch": 2.4, "grad_norm": 4.571209907531738, "learning_rate": 1.5204216073781292e-05, "loss": 3.5132, "step": 12740 }, { "epoch": 2.4, "grad_norm": 4.464623928070068, "learning_rate": 1.5200451722190854e-05, "loss": 3.5513, "step": 12750 }, { "epoch": 2.4, "grad_norm": 8.340402603149414, "learning_rate": 1.5196687370600415e-05, 
"loss": 3.5391, "step": 12760 }, { "epoch": 2.4, "grad_norm": 4.715097427368164, "learning_rate": 1.5192923019009977e-05, "loss": 3.682, "step": 12770 }, { "epoch": 2.41, "grad_norm": 4.435693740844727, "learning_rate": 1.5189158667419538e-05, "loss": 3.4653, "step": 12780 }, { "epoch": 2.41, "grad_norm": 8.597689628601074, "learning_rate": 1.5185394315829101e-05, "loss": 3.702, "step": 12790 }, { "epoch": 2.41, "grad_norm": 7.53239631652832, "learning_rate": 1.5181629964238663e-05, "loss": 3.7139, "step": 12800 }, { "epoch": 2.41, "grad_norm": 4.363271713256836, "learning_rate": 1.5177865612648221e-05, "loss": 3.8615, "step": 12810 }, { "epoch": 2.41, "grad_norm": 5.71921443939209, "learning_rate": 1.5174101261057782e-05, "loss": 3.5089, "step": 12820 }, { "epoch": 2.41, "grad_norm": 6.192845344543457, "learning_rate": 1.5170336909467346e-05, "loss": 3.7281, "step": 12830 }, { "epoch": 2.42, "grad_norm": 4.4803900718688965, "learning_rate": 1.5166572557876907e-05, "loss": 3.7433, "step": 12840 }, { "epoch": 2.42, "grad_norm": 5.934178352355957, "learning_rate": 1.5162808206286468e-05, "loss": 3.3846, "step": 12850 }, { "epoch": 2.42, "grad_norm": 9.16152572631836, "learning_rate": 1.515904385469603e-05, "loss": 3.4953, "step": 12860 }, { "epoch": 2.42, "grad_norm": 5.70888090133667, "learning_rate": 1.5155279503105591e-05, "loss": 3.8664, "step": 12870 }, { "epoch": 2.42, "grad_norm": 5.00731897354126, "learning_rate": 1.5151515151515153e-05, "loss": 3.4824, "step": 12880 }, { "epoch": 2.43, "grad_norm": 4.6556525230407715, "learning_rate": 1.5147750799924714e-05, "loss": 3.3344, "step": 12890 }, { "epoch": 2.43, "grad_norm": 6.8424882888793945, "learning_rate": 1.5143986448334276e-05, "loss": 3.5199, "step": 12900 }, { "epoch": 2.43, "grad_norm": 9.179112434387207, "learning_rate": 1.5140222096743837e-05, "loss": 3.4551, "step": 12910 }, { "epoch": 2.43, "grad_norm": 8.610604286193848, "learning_rate": 1.5136457745153399e-05, "loss": 3.3387, "step": 12920 }, { "epoch": 2.43, "grad_norm": 5.980412483215332, "learning_rate": 1.513269339356296e-05, "loss": 3.6896, "step": 12930 }, { "epoch": 2.44, "grad_norm": 15.157827377319336, "learning_rate": 1.5128929041972522e-05, "loss": 3.4617, "step": 12940 }, { "epoch": 2.44, "grad_norm": 5.296204090118408, "learning_rate": 1.5125164690382083e-05, "loss": 3.484, "step": 12950 }, { "epoch": 2.44, "grad_norm": 5.0488200187683105, "learning_rate": 1.5121400338791645e-05, "loss": 3.5605, "step": 12960 }, { "epoch": 2.44, "grad_norm": 9.137130737304688, "learning_rate": 1.5117635987201206e-05, "loss": 3.4415, "step": 12970 }, { "epoch": 2.44, "grad_norm": 11.090530395507812, "learning_rate": 1.5113871635610766e-05, "loss": 3.4077, "step": 12980 }, { "epoch": 2.44, "grad_norm": 4.750095367431641, "learning_rate": 1.5110107284020327e-05, "loss": 3.445, "step": 12990 }, { "epoch": 2.45, "grad_norm": 4.5548906326293945, "learning_rate": 1.5106342932429889e-05, "loss": 3.5281, "step": 13000 }, { "epoch": 2.45, "grad_norm": 4.27346134185791, "learning_rate": 1.510257858083945e-05, "loss": 3.3352, "step": 13010 }, { "epoch": 2.45, "grad_norm": 6.587227821350098, "learning_rate": 1.5098814229249013e-05, "loss": 3.4918, "step": 13020 }, { "epoch": 2.45, "grad_norm": 7.768476486206055, "learning_rate": 1.5095049877658575e-05, "loss": 3.6064, "step": 13030 }, { "epoch": 2.45, "grad_norm": 5.7046895027160645, "learning_rate": 1.5091285526068136e-05, "loss": 3.5623, "step": 13040 }, { "epoch": 2.46, "grad_norm": 7.536666393280029, "learning_rate": 
1.5087521174477698e-05, "loss": 3.5042, "step": 13050 }, { "epoch": 2.46, "grad_norm": 4.389326095581055, "learning_rate": 1.5083756822887259e-05, "loss": 3.4515, "step": 13060 }, { "epoch": 2.46, "grad_norm": 6.69420862197876, "learning_rate": 1.507999247129682e-05, "loss": 3.5076, "step": 13070 }, { "epoch": 2.46, "grad_norm": 12.379227638244629, "learning_rate": 1.5076228119706382e-05, "loss": 3.3666, "step": 13080 }, { "epoch": 2.46, "grad_norm": 4.6272077560424805, "learning_rate": 1.5072463768115944e-05, "loss": 3.4486, "step": 13090 }, { "epoch": 2.47, "grad_norm": 4.071839332580566, "learning_rate": 1.5068699416525505e-05, "loss": 3.3911, "step": 13100 }, { "epoch": 2.47, "grad_norm": 7.712924480438232, "learning_rate": 1.5064935064935066e-05, "loss": 3.364, "step": 13110 }, { "epoch": 2.47, "grad_norm": 5.018918991088867, "learning_rate": 1.5061170713344628e-05, "loss": 3.5704, "step": 13120 }, { "epoch": 2.47, "grad_norm": 8.211872100830078, "learning_rate": 1.505740636175419e-05, "loss": 3.53, "step": 13130 }, { "epoch": 2.47, "grad_norm": 9.747666358947754, "learning_rate": 1.505364201016375e-05, "loss": 3.3557, "step": 13140 }, { "epoch": 2.48, "grad_norm": 10.278982162475586, "learning_rate": 1.5049877658573312e-05, "loss": 3.5097, "step": 13150 }, { "epoch": 2.48, "grad_norm": 4.201531887054443, "learning_rate": 1.5046113306982872e-05, "loss": 3.5552, "step": 13160 }, { "epoch": 2.48, "grad_norm": 8.231130599975586, "learning_rate": 1.5042348955392434e-05, "loss": 3.3803, "step": 13170 }, { "epoch": 2.48, "grad_norm": 9.909188270568848, "learning_rate": 1.5038584603801995e-05, "loss": 3.5295, "step": 13180 }, { "epoch": 2.48, "grad_norm": 7.392763137817383, "learning_rate": 1.5034820252211556e-05, "loss": 3.2394, "step": 13190 }, { "epoch": 2.48, "grad_norm": 4.667118072509766, "learning_rate": 1.5031055900621118e-05, "loss": 3.3898, "step": 13200 }, { "epoch": 2.49, "grad_norm": 7.509998321533203, "learning_rate": 1.5027291549030681e-05, "loss": 3.6384, "step": 13210 }, { "epoch": 2.49, "grad_norm": 7.3728251457214355, "learning_rate": 1.5023527197440243e-05, "loss": 3.2083, "step": 13220 }, { "epoch": 2.49, "grad_norm": 4.930960178375244, "learning_rate": 1.5019762845849804e-05, "loss": 3.4842, "step": 13230 }, { "epoch": 2.49, "grad_norm": 4.692440032958984, "learning_rate": 1.5015998494259365e-05, "loss": 3.3589, "step": 13240 }, { "epoch": 2.49, "grad_norm": 11.564428329467773, "learning_rate": 1.5012234142668927e-05, "loss": 3.4433, "step": 13250 }, { "epoch": 2.5, "grad_norm": 7.1563029289245605, "learning_rate": 1.5008469791078488e-05, "loss": 3.5608, "step": 13260 }, { "epoch": 2.5, "grad_norm": 5.789453029632568, "learning_rate": 1.500470543948805e-05, "loss": 3.4389, "step": 13270 }, { "epoch": 2.5, "grad_norm": 7.700305938720703, "learning_rate": 1.5000941087897611e-05, "loss": 3.679, "step": 13280 }, { "epoch": 2.5, "grad_norm": 28.372114181518555, "learning_rate": 1.4997176736307173e-05, "loss": 3.3249, "step": 13290 }, { "epoch": 2.5, "grad_norm": 4.614748001098633, "learning_rate": 1.4993412384716734e-05, "loss": 3.3231, "step": 13300 }, { "epoch": 2.51, "grad_norm": 10.271411895751953, "learning_rate": 1.4989648033126296e-05, "loss": 3.4839, "step": 13310 }, { "epoch": 2.51, "grad_norm": 7.566690444946289, "learning_rate": 1.4985883681535857e-05, "loss": 3.5218, "step": 13320 }, { "epoch": 2.51, "grad_norm": 6.336673736572266, "learning_rate": 1.4982119329945417e-05, "loss": 3.5025, "step": 13330 }, { "epoch": 2.51, "grad_norm": 6.501099109649658, 
"learning_rate": 1.4978354978354978e-05, "loss": 3.3349, "step": 13340 }, { "epoch": 2.51, "grad_norm": 6.898789882659912, "learning_rate": 1.497459062676454e-05, "loss": 3.298, "step": 13350 }, { "epoch": 2.51, "grad_norm": 7.725273609161377, "learning_rate": 1.4970826275174101e-05, "loss": 3.3298, "step": 13360 }, { "epoch": 2.52, "grad_norm": 12.708728790283203, "learning_rate": 1.4967061923583663e-05, "loss": 3.3188, "step": 13370 }, { "epoch": 2.52, "grad_norm": 6.650354862213135, "learning_rate": 1.4963297571993224e-05, "loss": 3.2392, "step": 13380 }, { "epoch": 2.52, "grad_norm": 5.478582859039307, "learning_rate": 1.4959533220402787e-05, "loss": 3.2194, "step": 13390 }, { "epoch": 2.52, "grad_norm": 6.394359588623047, "learning_rate": 1.4955768868812349e-05, "loss": 3.3009, "step": 13400 }, { "epoch": 2.52, "grad_norm": 7.721007823944092, "learning_rate": 1.495200451722191e-05, "loss": 3.3658, "step": 13410 }, { "epoch": 2.53, "grad_norm": 8.919482231140137, "learning_rate": 1.4948240165631472e-05, "loss": 3.5119, "step": 13420 }, { "epoch": 2.53, "grad_norm": 4.5763163566589355, "learning_rate": 1.4944475814041033e-05, "loss": 3.3035, "step": 13430 }, { "epoch": 2.53, "grad_norm": 9.980001449584961, "learning_rate": 1.4940711462450595e-05, "loss": 3.2571, "step": 13440 }, { "epoch": 2.53, "grad_norm": 16.004104614257812, "learning_rate": 1.4936947110860156e-05, "loss": 3.4498, "step": 13450 }, { "epoch": 2.53, "grad_norm": 6.2009148597717285, "learning_rate": 1.4933182759269718e-05, "loss": 3.2206, "step": 13460 }, { "epoch": 2.54, "grad_norm": 8.402820587158203, "learning_rate": 1.4929418407679279e-05, "loss": 3.595, "step": 13470 }, { "epoch": 2.54, "grad_norm": 4.65133810043335, "learning_rate": 1.492565405608884e-05, "loss": 3.3736, "step": 13480 }, { "epoch": 2.54, "grad_norm": 7.042107582092285, "learning_rate": 1.4921889704498402e-05, "loss": 3.4574, "step": 13490 }, { "epoch": 2.54, "grad_norm": 4.084888458251953, "learning_rate": 1.4918125352907963e-05, "loss": 3.1454, "step": 13500 }, { "epoch": 2.54, "grad_norm": 4.526319980621338, "learning_rate": 1.4914361001317523e-05, "loss": 3.3052, "step": 13510 }, { "epoch": 2.54, "grad_norm": 5.464129447937012, "learning_rate": 1.4910596649727085e-05, "loss": 3.5497, "step": 13520 }, { "epoch": 2.55, "grad_norm": 15.77133846282959, "learning_rate": 1.4906832298136646e-05, "loss": 3.5207, "step": 13530 }, { "epoch": 2.55, "grad_norm": 4.927525520324707, "learning_rate": 1.4903067946546208e-05, "loss": 3.105, "step": 13540 }, { "epoch": 2.55, "grad_norm": 5.723185062408447, "learning_rate": 1.4899303594955769e-05, "loss": 3.4134, "step": 13550 }, { "epoch": 2.55, "grad_norm": 9.735562324523926, "learning_rate": 1.489553924336533e-05, "loss": 3.3815, "step": 13560 }, { "epoch": 2.55, "grad_norm": 6.268448352813721, "learning_rate": 1.4891774891774892e-05, "loss": 3.3659, "step": 13570 }, { "epoch": 2.56, "grad_norm": 4.358490467071533, "learning_rate": 1.4888010540184455e-05, "loss": 3.397, "step": 13580 }, { "epoch": 2.56, "grad_norm": 4.054864883422852, "learning_rate": 1.4884246188594017e-05, "loss": 3.3671, "step": 13590 }, { "epoch": 2.56, "grad_norm": 6.499452114105225, "learning_rate": 1.4880481837003578e-05, "loss": 3.2552, "step": 13600 }, { "epoch": 2.56, "grad_norm": 4.717024326324463, "learning_rate": 1.487671748541314e-05, "loss": 3.3538, "step": 13610 }, { "epoch": 2.56, "grad_norm": 9.698598861694336, "learning_rate": 1.4872953133822701e-05, "loss": 3.3423, "step": 13620 }, { "epoch": 2.57, "grad_norm": 
4.548202037811279, "learning_rate": 1.4869188782232262e-05, "loss": 3.4741, "step": 13630 }, { "epoch": 2.57, "grad_norm": 13.491065979003906, "learning_rate": 1.4865424430641824e-05, "loss": 3.346, "step": 13640 }, { "epoch": 2.57, "grad_norm": 5.557788372039795, "learning_rate": 1.4861660079051385e-05, "loss": 3.2162, "step": 13650 }, { "epoch": 2.57, "grad_norm": 5.344019412994385, "learning_rate": 1.4857895727460947e-05, "loss": 3.1721, "step": 13660 }, { "epoch": 2.57, "grad_norm": 6.586699485778809, "learning_rate": 1.4854131375870508e-05, "loss": 3.1908, "step": 13670 }, { "epoch": 2.57, "grad_norm": 8.110856056213379, "learning_rate": 1.4850367024280068e-05, "loss": 3.3915, "step": 13680 }, { "epoch": 2.58, "grad_norm": 4.365354537963867, "learning_rate": 1.484660267268963e-05, "loss": 3.1176, "step": 13690 }, { "epoch": 2.58, "grad_norm": 6.799899101257324, "learning_rate": 1.4842838321099191e-05, "loss": 3.4083, "step": 13700 }, { "epoch": 2.58, "grad_norm": 8.260652542114258, "learning_rate": 1.4839073969508752e-05, "loss": 3.1945, "step": 13710 }, { "epoch": 2.58, "grad_norm": 5.603668212890625, "learning_rate": 1.4835309617918314e-05, "loss": 3.2319, "step": 13720 }, { "epoch": 2.58, "grad_norm": 6.699263572692871, "learning_rate": 1.4831545266327875e-05, "loss": 3.4166, "step": 13730 }, { "epoch": 2.59, "grad_norm": 4.538029670715332, "learning_rate": 1.4827780914737437e-05, "loss": 3.0809, "step": 13740 }, { "epoch": 2.59, "grad_norm": 14.52987003326416, "learning_rate": 1.4824016563146998e-05, "loss": 3.591, "step": 13750 }, { "epoch": 2.59, "grad_norm": 6.2880988121032715, "learning_rate": 1.4820252211556561e-05, "loss": 3.2251, "step": 13760 }, { "epoch": 2.59, "grad_norm": 6.8407769203186035, "learning_rate": 1.4816487859966123e-05, "loss": 3.2624, "step": 13770 }, { "epoch": 2.59, "grad_norm": 7.827079772949219, "learning_rate": 1.4812723508375684e-05, "loss": 3.2746, "step": 13780 }, { "epoch": 2.6, "grad_norm": 7.865041732788086, "learning_rate": 1.4808959156785246e-05, "loss": 3.5742, "step": 13790 }, { "epoch": 2.6, "grad_norm": 7.023594856262207, "learning_rate": 1.4805194805194807e-05, "loss": 3.0071, "step": 13800 }, { "epoch": 2.6, "grad_norm": 6.307139873504639, "learning_rate": 1.4801430453604369e-05, "loss": 3.2134, "step": 13810 }, { "epoch": 2.6, "grad_norm": 7.879902362823486, "learning_rate": 1.479766610201393e-05, "loss": 3.3856, "step": 13820 }, { "epoch": 2.6, "grad_norm": 6.012749195098877, "learning_rate": 1.4793901750423492e-05, "loss": 3.3122, "step": 13830 }, { "epoch": 2.6, "grad_norm": 8.595688819885254, "learning_rate": 1.4790137398833053e-05, "loss": 3.6094, "step": 13840 }, { "epoch": 2.61, "grad_norm": 6.35960578918457, "learning_rate": 1.4786373047242613e-05, "loss": 3.2108, "step": 13850 }, { "epoch": 2.61, "grad_norm": 15.971559524536133, "learning_rate": 1.4782608695652174e-05, "loss": 3.2781, "step": 13860 }, { "epoch": 2.61, "grad_norm": 4.078514099121094, "learning_rate": 1.4778844344061736e-05, "loss": 3.2752, "step": 13870 }, { "epoch": 2.61, "grad_norm": 6.117066383361816, "learning_rate": 1.4775079992471297e-05, "loss": 3.1935, "step": 13880 }, { "epoch": 2.61, "grad_norm": 21.44966697692871, "learning_rate": 1.4771315640880859e-05, "loss": 3.3219, "step": 13890 }, { "epoch": 2.62, "grad_norm": 7.699004650115967, "learning_rate": 1.476755128929042e-05, "loss": 3.0218, "step": 13900 }, { "epoch": 2.62, "grad_norm": 6.727810382843018, "learning_rate": 1.4763786937699982e-05, "loss": 2.9525, "step": 13910 }, { "epoch": 2.62, 
"grad_norm": 10.634733200073242, "learning_rate": 1.4760022586109543e-05, "loss": 3.3507, "step": 13920 }, { "epoch": 2.62, "grad_norm": 8.29336166381836, "learning_rate": 1.4756258234519105e-05, "loss": 3.2863, "step": 13930 }, { "epoch": 2.62, "grad_norm": 7.364121913909912, "learning_rate": 1.4752493882928666e-05, "loss": 3.0521, "step": 13940 }, { "epoch": 2.63, "grad_norm": 7.224334239959717, "learning_rate": 1.4748729531338229e-05, "loss": 3.2566, "step": 13950 }, { "epoch": 2.63, "grad_norm": 7.779704570770264, "learning_rate": 1.474496517974779e-05, "loss": 3.1183, "step": 13960 }, { "epoch": 2.63, "grad_norm": 13.521882057189941, "learning_rate": 1.4741200828157352e-05, "loss": 3.1717, "step": 13970 }, { "epoch": 2.63, "grad_norm": 8.769793510437012, "learning_rate": 1.4737436476566914e-05, "loss": 3.356, "step": 13980 }, { "epoch": 2.63, "grad_norm": 7.209610462188721, "learning_rate": 1.4733672124976475e-05, "loss": 2.9737, "step": 13990 }, { "epoch": 2.64, "grad_norm": 5.223250865936279, "learning_rate": 1.4729907773386036e-05, "loss": 3.0005, "step": 14000 }, { "epoch": 2.64, "grad_norm": 11.20239543914795, "learning_rate": 1.4726143421795598e-05, "loss": 3.1289, "step": 14010 }, { "epoch": 2.64, "grad_norm": 6.293657302856445, "learning_rate": 1.472237907020516e-05, "loss": 3.3502, "step": 14020 }, { "epoch": 2.64, "grad_norm": 6.549447536468506, "learning_rate": 1.4718614718614719e-05, "loss": 3.221, "step": 14030 }, { "epoch": 2.64, "grad_norm": 5.635914325714111, "learning_rate": 1.471485036702428e-05, "loss": 3.3486, "step": 14040 }, { "epoch": 2.64, "grad_norm": 7.579744815826416, "learning_rate": 1.4711086015433842e-05, "loss": 3.2114, "step": 14050 }, { "epoch": 2.65, "grad_norm": 7.292599678039551, "learning_rate": 1.4707321663843404e-05, "loss": 3.1704, "step": 14060 }, { "epoch": 2.65, "grad_norm": 7.889623165130615, "learning_rate": 1.4703557312252965e-05, "loss": 3.4114, "step": 14070 }, { "epoch": 2.65, "grad_norm": 6.4268035888671875, "learning_rate": 1.4699792960662526e-05, "loss": 3.0388, "step": 14080 }, { "epoch": 2.65, "grad_norm": 6.44148063659668, "learning_rate": 1.4696028609072088e-05, "loss": 3.2445, "step": 14090 }, { "epoch": 2.65, "grad_norm": 8.12196159362793, "learning_rate": 1.469226425748165e-05, "loss": 3.1226, "step": 14100 }, { "epoch": 2.66, "grad_norm": 8.50309944152832, "learning_rate": 1.468849990589121e-05, "loss": 3.3051, "step": 14110 }, { "epoch": 2.66, "grad_norm": 7.487829208374023, "learning_rate": 1.4684735554300772e-05, "loss": 3.3664, "step": 14120 }, { "epoch": 2.66, "grad_norm": 6.021106719970703, "learning_rate": 1.4680971202710335e-05, "loss": 3.3301, "step": 14130 }, { "epoch": 2.66, "grad_norm": 10.409062385559082, "learning_rate": 1.4677206851119897e-05, "loss": 3.1057, "step": 14140 }, { "epoch": 2.66, "grad_norm": 8.8533296585083, "learning_rate": 1.4673442499529458e-05, "loss": 3.0008, "step": 14150 }, { "epoch": 2.67, "grad_norm": 6.513985633850098, "learning_rate": 1.466967814793902e-05, "loss": 3.0532, "step": 14160 }, { "epoch": 2.67, "grad_norm": 12.81251335144043, "learning_rate": 1.4665913796348581e-05, "loss": 2.9559, "step": 14170 }, { "epoch": 2.67, "grad_norm": 8.799267768859863, "learning_rate": 1.4662149444758143e-05, "loss": 3.2398, "step": 14180 }, { "epoch": 2.67, "grad_norm": 9.084511756896973, "learning_rate": 1.4658385093167704e-05, "loss": 3.1442, "step": 14190 }, { "epoch": 2.67, "grad_norm": 5.756289958953857, "learning_rate": 1.4654620741577264e-05, "loss": 3.4141, "step": 14200 }, { "epoch": 
2.67, "grad_norm": 8.267768859863281, "learning_rate": 1.4650856389986825e-05, "loss": 3.12, "step": 14210 }, { "epoch": 2.68, "grad_norm": 5.865368366241455, "learning_rate": 1.4647092038396387e-05, "loss": 3.1831, "step": 14220 }, { "epoch": 2.68, "grad_norm": 11.692936897277832, "learning_rate": 1.4643327686805948e-05, "loss": 2.993, "step": 14230 }, { "epoch": 2.68, "grad_norm": 8.556349754333496, "learning_rate": 1.463956333521551e-05, "loss": 3.2896, "step": 14240 }, { "epoch": 2.68, "grad_norm": 12.899626731872559, "learning_rate": 1.4635798983625071e-05, "loss": 3.2019, "step": 14250 }, { "epoch": 2.68, "grad_norm": 5.565244674682617, "learning_rate": 1.4632034632034633e-05, "loss": 3.3158, "step": 14260 }, { "epoch": 2.69, "grad_norm": 5.408461570739746, "learning_rate": 1.4628270280444194e-05, "loss": 3.0275, "step": 14270 }, { "epoch": 2.69, "grad_norm": 3.6086740493774414, "learning_rate": 1.4624505928853756e-05, "loss": 3.1816, "step": 14280 }, { "epoch": 2.69, "grad_norm": 5.622535228729248, "learning_rate": 1.4620741577263317e-05, "loss": 3.1545, "step": 14290 }, { "epoch": 2.69, "grad_norm": 7.087208271026611, "learning_rate": 1.4616977225672879e-05, "loss": 3.2504, "step": 14300 }, { "epoch": 2.69, "grad_norm": 7.327866077423096, "learning_rate": 1.461321287408244e-05, "loss": 3.0673, "step": 14310 }, { "epoch": 2.7, "grad_norm": 12.090078353881836, "learning_rate": 1.4609448522492003e-05, "loss": 3.4291, "step": 14320 }, { "epoch": 2.7, "grad_norm": 4.398726940155029, "learning_rate": 1.4605684170901565e-05, "loss": 3.1558, "step": 14330 }, { "epoch": 2.7, "grad_norm": 6.0207085609436035, "learning_rate": 1.4601919819311126e-05, "loss": 2.9898, "step": 14340 }, { "epoch": 2.7, "grad_norm": 5.215182304382324, "learning_rate": 1.4598155467720688e-05, "loss": 3.2014, "step": 14350 }, { "epoch": 2.7, "grad_norm": 9.138909339904785, "learning_rate": 1.4594391116130249e-05, "loss": 3.3812, "step": 14360 }, { "epoch": 2.7, "grad_norm": 7.6451897621154785, "learning_rate": 1.459062676453981e-05, "loss": 2.9652, "step": 14370 }, { "epoch": 2.71, "grad_norm": 6.538483619689941, "learning_rate": 1.458686241294937e-05, "loss": 3.1883, "step": 14380 }, { "epoch": 2.71, "grad_norm": 6.956658363342285, "learning_rate": 1.4583098061358932e-05, "loss": 2.9613, "step": 14390 }, { "epoch": 2.71, "grad_norm": 10.763396263122559, "learning_rate": 1.4579333709768493e-05, "loss": 3.3564, "step": 14400 }, { "epoch": 2.71, "grad_norm": 7.146010398864746, "learning_rate": 1.4575569358178055e-05, "loss": 3.1327, "step": 14410 }, { "epoch": 2.71, "grad_norm": 8.088001251220703, "learning_rate": 1.4571805006587616e-05, "loss": 3.1404, "step": 14420 }, { "epoch": 2.72, "grad_norm": 9.142199516296387, "learning_rate": 1.4568040654997178e-05, "loss": 3.2205, "step": 14430 }, { "epoch": 2.72, "grad_norm": 6.004544258117676, "learning_rate": 1.4564276303406739e-05, "loss": 3.0221, "step": 14440 }, { "epoch": 2.72, "grad_norm": 12.181873321533203, "learning_rate": 1.45605119518163e-05, "loss": 3.1707, "step": 14450 }, { "epoch": 2.72, "grad_norm": 7.000960826873779, "learning_rate": 1.4556747600225862e-05, "loss": 3.1649, "step": 14460 }, { "epoch": 2.72, "grad_norm": 5.739673614501953, "learning_rate": 1.4552983248635423e-05, "loss": 3.1857, "step": 14470 }, { "epoch": 2.73, "grad_norm": 34.14403533935547, "learning_rate": 1.4549218897044985e-05, "loss": 2.9342, "step": 14480 }, { "epoch": 2.73, "grad_norm": 6.0822296142578125, "learning_rate": 1.4545454545454546e-05, "loss": 2.8416, "step": 14490 }, { 
"epoch": 2.73, "grad_norm": 4.558340549468994, "learning_rate": 1.4541690193864108e-05, "loss": 3.12, "step": 14500 }, { "epoch": 2.73, "grad_norm": 4.308550834655762, "learning_rate": 1.4537925842273671e-05, "loss": 2.9276, "step": 14510 }, { "epoch": 2.73, "grad_norm": 12.403753280639648, "learning_rate": 1.4534161490683232e-05, "loss": 3.0913, "step": 14520 }, { "epoch": 2.73, "grad_norm": 5.992092132568359, "learning_rate": 1.4530397139092794e-05, "loss": 3.0075, "step": 14530 }, { "epoch": 2.74, "grad_norm": 10.336134910583496, "learning_rate": 1.4526632787502355e-05, "loss": 3.1626, "step": 14540 }, { "epoch": 2.74, "grad_norm": 6.779484272003174, "learning_rate": 1.4522868435911915e-05, "loss": 3.1663, "step": 14550 }, { "epoch": 2.74, "grad_norm": 5.025636672973633, "learning_rate": 1.4519104084321477e-05, "loss": 3.096, "step": 14560 }, { "epoch": 2.74, "grad_norm": 10.301048278808594, "learning_rate": 1.4515339732731038e-05, "loss": 2.8479, "step": 14570 }, { "epoch": 2.74, "grad_norm": 5.159264087677002, "learning_rate": 1.45115753811406e-05, "loss": 3.146, "step": 14580 }, { "epoch": 2.75, "grad_norm": 6.6029438972473145, "learning_rate": 1.4507811029550161e-05, "loss": 3.0231, "step": 14590 }, { "epoch": 2.75, "grad_norm": 7.830244064331055, "learning_rate": 1.4504046677959722e-05, "loss": 2.9005, "step": 14600 }, { "epoch": 2.75, "grad_norm": 10.885473251342773, "learning_rate": 1.4500282326369284e-05, "loss": 2.9078, "step": 14610 }, { "epoch": 2.75, "grad_norm": 9.61038589477539, "learning_rate": 1.4496517974778845e-05, "loss": 2.8704, "step": 14620 }, { "epoch": 2.75, "grad_norm": 9.649160385131836, "learning_rate": 1.4492753623188407e-05, "loss": 3.0497, "step": 14630 }, { "epoch": 2.76, "grad_norm": 8.296353340148926, "learning_rate": 1.4488989271597968e-05, "loss": 3.5055, "step": 14640 }, { "epoch": 2.76, "grad_norm": 8.328307151794434, "learning_rate": 1.448522492000753e-05, "loss": 2.9726, "step": 14650 }, { "epoch": 2.76, "grad_norm": 7.673588752746582, "learning_rate": 1.4481460568417091e-05, "loss": 3.2788, "step": 14660 }, { "epoch": 2.76, "grad_norm": 15.670825004577637, "learning_rate": 1.4477696216826653e-05, "loss": 3.0347, "step": 14670 }, { "epoch": 2.76, "grad_norm": 6.944177627563477, "learning_rate": 1.4473931865236214e-05, "loss": 3.1781, "step": 14680 }, { "epoch": 2.76, "grad_norm": 9.000872611999512, "learning_rate": 1.4470167513645777e-05, "loss": 3.0623, "step": 14690 }, { "epoch": 2.77, "grad_norm": 5.5438971519470215, "learning_rate": 1.4466403162055339e-05, "loss": 2.9686, "step": 14700 }, { "epoch": 2.77, "grad_norm": 7.427661418914795, "learning_rate": 1.44626388104649e-05, "loss": 2.8588, "step": 14710 }, { "epoch": 2.77, "grad_norm": 5.584495544433594, "learning_rate": 1.4458874458874458e-05, "loss": 3.0223, "step": 14720 }, { "epoch": 2.77, "grad_norm": 14.704045295715332, "learning_rate": 1.445511010728402e-05, "loss": 2.5928, "step": 14730 }, { "epoch": 2.77, "grad_norm": 5.034955978393555, "learning_rate": 1.4451345755693583e-05, "loss": 2.9967, "step": 14740 }, { "epoch": 2.78, "grad_norm": 14.075299263000488, "learning_rate": 1.4447581404103144e-05, "loss": 3.2656, "step": 14750 }, { "epoch": 2.78, "grad_norm": 8.729633331298828, "learning_rate": 1.4443817052512706e-05, "loss": 3.0153, "step": 14760 }, { "epoch": 2.78, "grad_norm": 8.24337100982666, "learning_rate": 1.4440052700922267e-05, "loss": 2.9384, "step": 14770 }, { "epoch": 2.78, "grad_norm": 6.17709493637085, "learning_rate": 1.4436288349331829e-05, "loss": 2.7959, "step": 
14780 }, { "epoch": 2.78, "grad_norm": 6.298855304718018, "learning_rate": 1.443252399774139e-05, "loss": 2.8526, "step": 14790 }, { "epoch": 2.79, "grad_norm": 4.327752590179443, "learning_rate": 1.4428759646150952e-05, "loss": 2.895, "step": 14800 }, { "epoch": 2.79, "grad_norm": 7.335789203643799, "learning_rate": 1.4424995294560513e-05, "loss": 3.2239, "step": 14810 }, { "epoch": 2.79, "grad_norm": 5.736459255218506, "learning_rate": 1.4421230942970075e-05, "loss": 2.6913, "step": 14820 }, { "epoch": 2.79, "grad_norm": 3.824171543121338, "learning_rate": 1.4417466591379636e-05, "loss": 2.838, "step": 14830 }, { "epoch": 2.79, "grad_norm": 11.12816333770752, "learning_rate": 1.4413702239789197e-05, "loss": 2.6673, "step": 14840 }, { "epoch": 2.8, "grad_norm": 11.29539966583252, "learning_rate": 1.4409937888198759e-05, "loss": 3.0861, "step": 14850 }, { "epoch": 2.8, "grad_norm": 5.828672885894775, "learning_rate": 1.440617353660832e-05, "loss": 2.8925, "step": 14860 }, { "epoch": 2.8, "grad_norm": 15.908937454223633, "learning_rate": 1.4402409185017882e-05, "loss": 2.7879, "step": 14870 }, { "epoch": 2.8, "grad_norm": 29.583024978637695, "learning_rate": 1.4398644833427445e-05, "loss": 2.9696, "step": 14880 }, { "epoch": 2.8, "grad_norm": 5.898332595825195, "learning_rate": 1.4394880481837006e-05, "loss": 2.9647, "step": 14890 }, { "epoch": 2.8, "grad_norm": 5.554769039154053, "learning_rate": 1.4391116130246565e-05, "loss": 2.9692, "step": 14900 }, { "epoch": 2.81, "grad_norm": 5.3026299476623535, "learning_rate": 1.4387351778656126e-05, "loss": 3.1038, "step": 14910 }, { "epoch": 2.81, "grad_norm": 6.050397872924805, "learning_rate": 1.4383587427065689e-05, "loss": 3.1184, "step": 14920 }, { "epoch": 2.81, "grad_norm": 7.660538673400879, "learning_rate": 1.437982307547525e-05, "loss": 2.9626, "step": 14930 }, { "epoch": 2.81, "grad_norm": 39.97035217285156, "learning_rate": 1.4376058723884812e-05, "loss": 3.1921, "step": 14940 }, { "epoch": 2.81, "grad_norm": 4.277101039886475, "learning_rate": 1.4372294372294374e-05, "loss": 2.973, "step": 14950 }, { "epoch": 2.82, "grad_norm": 6.54320764541626, "learning_rate": 1.4368530020703935e-05, "loss": 3.0208, "step": 14960 }, { "epoch": 2.82, "grad_norm": 8.211244583129883, "learning_rate": 1.4364765669113496e-05, "loss": 2.7794, "step": 14970 }, { "epoch": 2.82, "grad_norm": 8.613863945007324, "learning_rate": 1.4361001317523058e-05, "loss": 2.8716, "step": 14980 }, { "epoch": 2.82, "grad_norm": 7.8873114585876465, "learning_rate": 1.435723696593262e-05, "loss": 2.8692, "step": 14990 }, { "epoch": 2.82, "grad_norm": 7.216277599334717, "learning_rate": 1.435347261434218e-05, "loss": 2.9463, "step": 15000 }, { "epoch": 2.83, "grad_norm": 4.840063095092773, "learning_rate": 1.4349708262751742e-05, "loss": 3.0113, "step": 15010 }, { "epoch": 2.83, "grad_norm": 5.008612632751465, "learning_rate": 1.4345943911161304e-05, "loss": 3.1231, "step": 15020 }, { "epoch": 2.83, "grad_norm": 4.250117301940918, "learning_rate": 1.4342179559570865e-05, "loss": 3.0163, "step": 15030 }, { "epoch": 2.83, "grad_norm": 21.30902671813965, "learning_rate": 1.4338415207980427e-05, "loss": 2.9939, "step": 15040 }, { "epoch": 2.83, "grad_norm": 16.79435157775879, "learning_rate": 1.4334650856389988e-05, "loss": 2.8389, "step": 15050 }, { "epoch": 2.83, "grad_norm": 4.240272045135498, "learning_rate": 1.4330886504799551e-05, "loss": 2.8553, "step": 15060 }, { "epoch": 2.84, "grad_norm": 7.734553813934326, "learning_rate": 1.432712215320911e-05, "loss": 2.6589, "step": 
15070 }, { "epoch": 2.84, "grad_norm": 4.341592788696289, "learning_rate": 1.432335780161867e-05, "loss": 2.8405, "step": 15080 }, { "epoch": 2.84, "grad_norm": 8.253460884094238, "learning_rate": 1.4319593450028232e-05, "loss": 2.9612, "step": 15090 }, { "epoch": 2.84, "grad_norm": 7.628543376922607, "learning_rate": 1.4315829098437794e-05, "loss": 2.9989, "step": 15100 }, { "epoch": 2.84, "grad_norm": 18.457294464111328, "learning_rate": 1.4312064746847357e-05, "loss": 2.9198, "step": 15110 }, { "epoch": 2.85, "grad_norm": 13.895566940307617, "learning_rate": 1.4308300395256918e-05, "loss": 2.7745, "step": 15120 }, { "epoch": 2.85, "grad_norm": 15.653528213500977, "learning_rate": 1.430453604366648e-05, "loss": 2.8793, "step": 15130 }, { "epoch": 2.85, "grad_norm": 10.075485229492188, "learning_rate": 1.4300771692076041e-05, "loss": 3.062, "step": 15140 }, { "epoch": 2.85, "grad_norm": 9.107342720031738, "learning_rate": 1.4297007340485603e-05, "loss": 2.8955, "step": 15150 }, { "epoch": 2.85, "grad_norm": 14.34937572479248, "learning_rate": 1.4293242988895164e-05, "loss": 2.9611, "step": 15160 }, { "epoch": 2.86, "grad_norm": 8.95149040222168, "learning_rate": 1.4289478637304726e-05, "loss": 2.8848, "step": 15170 }, { "epoch": 2.86, "grad_norm": 5.236386299133301, "learning_rate": 1.4285714285714287e-05, "loss": 2.6948, "step": 15180 }, { "epoch": 2.86, "grad_norm": 8.983771324157715, "learning_rate": 1.4281949934123849e-05, "loss": 2.8674, "step": 15190 }, { "epoch": 2.86, "grad_norm": 11.37382984161377, "learning_rate": 1.427818558253341e-05, "loss": 3.1205, "step": 15200 }, { "epoch": 2.86, "grad_norm": 18.400815963745117, "learning_rate": 1.4274421230942972e-05, "loss": 3.0421, "step": 15210 }, { "epoch": 2.86, "grad_norm": 7.874523639678955, "learning_rate": 1.4270656879352533e-05, "loss": 2.7789, "step": 15220 }, { "epoch": 2.87, "grad_norm": 11.480690956115723, "learning_rate": 1.4266892527762094e-05, "loss": 2.7324, "step": 15230 }, { "epoch": 2.87, "grad_norm": 6.786672115325928, "learning_rate": 1.4263128176171656e-05, "loss": 2.8793, "step": 15240 }, { "epoch": 2.87, "grad_norm": 23.710582733154297, "learning_rate": 1.4259363824581216e-05, "loss": 3.0834, "step": 15250 }, { "epoch": 2.87, "grad_norm": 8.687037467956543, "learning_rate": 1.4255599472990777e-05, "loss": 2.8051, "step": 15260 }, { "epoch": 2.87, "grad_norm": 8.003458023071289, "learning_rate": 1.4251835121400339e-05, "loss": 2.7242, "step": 15270 }, { "epoch": 2.88, "grad_norm": 4.3183183670043945, "learning_rate": 1.42480707698099e-05, "loss": 3.154, "step": 15280 }, { "epoch": 2.88, "grad_norm": 9.064364433288574, "learning_rate": 1.4244306418219463e-05, "loss": 2.7328, "step": 15290 }, { "epoch": 2.88, "grad_norm": 6.07478666305542, "learning_rate": 1.4240542066629025e-05, "loss": 3.1124, "step": 15300 }, { "epoch": 2.88, "grad_norm": 7.217052459716797, "learning_rate": 1.4236777715038586e-05, "loss": 2.7969, "step": 15310 }, { "epoch": 2.88, "grad_norm": 5.144783020019531, "learning_rate": 1.4233013363448148e-05, "loss": 2.7514, "step": 15320 }, { "epoch": 2.89, "grad_norm": 4.329220771789551, "learning_rate": 1.4229249011857709e-05, "loss": 2.7564, "step": 15330 }, { "epoch": 2.89, "grad_norm": 6.250062942504883, "learning_rate": 1.422548466026727e-05, "loss": 2.8276, "step": 15340 }, { "epoch": 2.89, "grad_norm": 6.071444034576416, "learning_rate": 1.4221720308676832e-05, "loss": 2.6574, "step": 15350 }, { "epoch": 2.89, "grad_norm": 6.243869781494141, "learning_rate": 1.4217955957086393e-05, "loss": 
2.7303, "step": 15360 }, { "epoch": 2.89, "grad_norm": 16.598417282104492, "learning_rate": 1.4214191605495955e-05, "loss": 2.8046, "step": 15370 }, { "epoch": 2.89, "grad_norm": 9.353248596191406, "learning_rate": 1.4210427253905516e-05, "loss": 2.8063, "step": 15380 }, { "epoch": 2.9, "grad_norm": 8.12759017944336, "learning_rate": 1.4206662902315078e-05, "loss": 2.8651, "step": 15390 }, { "epoch": 2.9, "grad_norm": 14.176252365112305, "learning_rate": 1.420289855072464e-05, "loss": 3.1051, "step": 15400 }, { "epoch": 2.9, "grad_norm": 6.259965896606445, "learning_rate": 1.41991341991342e-05, "loss": 3.1067, "step": 15410 }, { "epoch": 2.9, "grad_norm": 6.678144454956055, "learning_rate": 1.419536984754376e-05, "loss": 2.8616, "step": 15420 }, { "epoch": 2.9, "grad_norm": 8.792501449584961, "learning_rate": 1.4191605495953322e-05, "loss": 2.9977, "step": 15430 }, { "epoch": 2.91, "grad_norm": 10.569917678833008, "learning_rate": 1.4187841144362883e-05, "loss": 2.7409, "step": 15440 }, { "epoch": 2.91, "grad_norm": 11.452315330505371, "learning_rate": 1.4184076792772445e-05, "loss": 2.6558, "step": 15450 }, { "epoch": 2.91, "grad_norm": 11.816713333129883, "learning_rate": 1.4180312441182006e-05, "loss": 2.8886, "step": 15460 }, { "epoch": 2.91, "grad_norm": 6.726597309112549, "learning_rate": 1.4176548089591568e-05, "loss": 2.7877, "step": 15470 }, { "epoch": 2.91, "grad_norm": 5.927799701690674, "learning_rate": 1.4172783738001131e-05, "loss": 2.3958, "step": 15480 }, { "epoch": 2.92, "grad_norm": 12.70663833618164, "learning_rate": 1.4169019386410692e-05, "loss": 2.8458, "step": 15490 }, { "epoch": 2.92, "grad_norm": 10.396106719970703, "learning_rate": 1.4165255034820254e-05, "loss": 2.7142, "step": 15500 }, { "epoch": 2.92, "grad_norm": 7.531083106994629, "learning_rate": 1.4161490683229815e-05, "loss": 2.7203, "step": 15510 }, { "epoch": 2.92, "grad_norm": 5.195558071136475, "learning_rate": 1.4157726331639377e-05, "loss": 2.7773, "step": 15520 }, { "epoch": 2.92, "grad_norm": 19.651317596435547, "learning_rate": 1.4153961980048938e-05, "loss": 2.8192, "step": 15530 }, { "epoch": 2.92, "grad_norm": 7.312119483947754, "learning_rate": 1.41501976284585e-05, "loss": 2.619, "step": 15540 }, { "epoch": 2.93, "grad_norm": 5.6834869384765625, "learning_rate": 1.4146433276868061e-05, "loss": 2.7052, "step": 15550 }, { "epoch": 2.93, "grad_norm": 6.562124252319336, "learning_rate": 1.4142668925277623e-05, "loss": 2.8663, "step": 15560 }, { "epoch": 2.93, "grad_norm": 7.813902378082275, "learning_rate": 1.4138904573687184e-05, "loss": 2.663, "step": 15570 }, { "epoch": 2.93, "grad_norm": 14.589634895324707, "learning_rate": 1.4135140222096746e-05, "loss": 2.8536, "step": 15580 }, { "epoch": 2.93, "grad_norm": 8.651175498962402, "learning_rate": 1.4131375870506307e-05, "loss": 3.0621, "step": 15590 }, { "epoch": 2.94, "grad_norm": 9.971288681030273, "learning_rate": 1.4127611518915867e-05, "loss": 2.9412, "step": 15600 }, { "epoch": 2.94, "grad_norm": 11.991320610046387, "learning_rate": 1.4123847167325428e-05, "loss": 2.9279, "step": 15610 }, { "epoch": 2.94, "grad_norm": 4.10844612121582, "learning_rate": 1.412008281573499e-05, "loss": 2.6703, "step": 15620 }, { "epoch": 2.94, "grad_norm": 5.146695137023926, "learning_rate": 1.4116318464144551e-05, "loss": 2.6734, "step": 15630 }, { "epoch": 2.94, "grad_norm": 10.159703254699707, "learning_rate": 1.4112554112554113e-05, "loss": 2.5359, "step": 15640 }, { "epoch": 2.95, "grad_norm": 8.903599739074707, "learning_rate": 1.4108789760963674e-05, 
"loss": 2.8425, "step": 15650 }, { "epoch": 2.95, "grad_norm": 5.412868022918701, "learning_rate": 1.4105025409373236e-05, "loss": 2.8371, "step": 15660 }, { "epoch": 2.95, "grad_norm": 4.370077133178711, "learning_rate": 1.4101261057782799e-05, "loss": 2.9035, "step": 15670 }, { "epoch": 2.95, "grad_norm": 5.650407314300537, "learning_rate": 1.409749670619236e-05, "loss": 2.7723, "step": 15680 }, { "epoch": 2.95, "grad_norm": 11.712204933166504, "learning_rate": 1.4093732354601922e-05, "loss": 2.6682, "step": 15690 }, { "epoch": 2.96, "grad_norm": 8.759886741638184, "learning_rate": 1.4089968003011483e-05, "loss": 2.7851, "step": 15700 }, { "epoch": 2.96, "grad_norm": 4.271284580230713, "learning_rate": 1.4086203651421045e-05, "loss": 2.6168, "step": 15710 }, { "epoch": 2.96, "grad_norm": 11.771697044372559, "learning_rate": 1.4082439299830606e-05, "loss": 2.9702, "step": 15720 }, { "epoch": 2.96, "grad_norm": 6.036469459533691, "learning_rate": 1.4078674948240167e-05, "loss": 2.8804, "step": 15730 }, { "epoch": 2.96, "grad_norm": 8.083833694458008, "learning_rate": 1.4074910596649729e-05, "loss": 2.8447, "step": 15740 }, { "epoch": 2.96, "grad_norm": 7.468766212463379, "learning_rate": 1.407114624505929e-05, "loss": 2.8192, "step": 15750 }, { "epoch": 2.97, "grad_norm": 7.825921535491943, "learning_rate": 1.4067381893468852e-05, "loss": 2.7133, "step": 15760 }, { "epoch": 2.97, "grad_norm": 9.92990779876709, "learning_rate": 1.4063617541878412e-05, "loss": 2.8893, "step": 15770 }, { "epoch": 2.97, "grad_norm": 12.215529441833496, "learning_rate": 1.4059853190287973e-05, "loss": 2.8216, "step": 15780 }, { "epoch": 2.97, "grad_norm": 16.796615600585938, "learning_rate": 1.4056088838697535e-05, "loss": 2.4362, "step": 15790 }, { "epoch": 2.97, "grad_norm": 7.565396308898926, "learning_rate": 1.4052324487107096e-05, "loss": 2.9049, "step": 15800 }, { "epoch": 2.98, "grad_norm": 6.777116775512695, "learning_rate": 1.4048560135516657e-05, "loss": 2.5979, "step": 15810 }, { "epoch": 2.98, "grad_norm": 4.635298252105713, "learning_rate": 1.4044795783926219e-05, "loss": 2.969, "step": 15820 }, { "epoch": 2.98, "grad_norm": 9.559572219848633, "learning_rate": 1.404103143233578e-05, "loss": 2.7821, "step": 15830 }, { "epoch": 2.98, "grad_norm": 10.147965431213379, "learning_rate": 1.4037267080745342e-05, "loss": 2.7185, "step": 15840 }, { "epoch": 2.98, "grad_norm": 5.011512279510498, "learning_rate": 1.4033502729154905e-05, "loss": 2.5612, "step": 15850 }, { "epoch": 2.99, "grad_norm": 10.797514915466309, "learning_rate": 1.4029738377564466e-05, "loss": 3.0979, "step": 15860 }, { "epoch": 2.99, "grad_norm": 7.495589733123779, "learning_rate": 1.4025974025974028e-05, "loss": 2.5408, "step": 15870 }, { "epoch": 2.99, "grad_norm": 6.633500099182129, "learning_rate": 1.402220967438359e-05, "loss": 2.751, "step": 15880 }, { "epoch": 2.99, "grad_norm": 8.996492385864258, "learning_rate": 1.401844532279315e-05, "loss": 2.6868, "step": 15890 }, { "epoch": 2.99, "grad_norm": 8.618696212768555, "learning_rate": 1.4014680971202712e-05, "loss": 2.4467, "step": 15900 }, { "epoch": 2.99, "grad_norm": 8.029672622680664, "learning_rate": 1.4010916619612274e-05, "loss": 2.6699, "step": 15910 }, { "epoch": 3.0, "grad_norm": 12.381640434265137, "learning_rate": 1.4007152268021835e-05, "loss": 2.5614, "step": 15920 }, { "epoch": 3.0, "grad_norm": 7.882593631744385, "learning_rate": 1.4003387916431397e-05, "loss": 2.7934, "step": 15930 }, { "epoch": 3.0, "eval_accuracy": 0.7546666666666667, "eval_loss": 
2.7094385623931885, "eval_runtime": 31.1719, "eval_samples_per_second": 240.601, "eval_steps_per_second": 30.091, "step": 15939 }, { "epoch": 3.0, "grad_norm": 5.72823429107666, "learning_rate": 1.3999623564840956e-05, "loss": 2.8048, "step": 15940 }, { "epoch": 3.0, "grad_norm": 10.13255500793457, "learning_rate": 1.3995859213250518e-05, "loss": 2.5921, "step": 15950 }, { "epoch": 3.0, "grad_norm": 7.569526672363281, "learning_rate": 1.399209486166008e-05, "loss": 2.6581, "step": 15960 }, { "epoch": 3.01, "grad_norm": 7.485115051269531, "learning_rate": 1.398833051006964e-05, "loss": 2.6147, "step": 15970 }, { "epoch": 3.01, "grad_norm": 6.6184587478637695, "learning_rate": 1.3984566158479202e-05, "loss": 2.6776, "step": 15980 }, { "epoch": 3.01, "grad_norm": 7.744344711303711, "learning_rate": 1.3980801806888764e-05, "loss": 2.7159, "step": 15990 }, { "epoch": 3.01, "grad_norm": 6.304228782653809, "learning_rate": 1.3977037455298325e-05, "loss": 2.2979, "step": 16000 }, { "epoch": 3.01, "grad_norm": 11.377362251281738, "learning_rate": 1.3973273103707887e-05, "loss": 2.6107, "step": 16010 }, { "epoch": 3.02, "grad_norm": 7.546968936920166, "learning_rate": 1.3969508752117448e-05, "loss": 2.6796, "step": 16020 }, { "epoch": 3.02, "grad_norm": 8.799281120300293, "learning_rate": 1.396574440052701e-05, "loss": 2.6833, "step": 16030 }, { "epoch": 3.02, "grad_norm": 6.029132843017578, "learning_rate": 1.3961980048936573e-05, "loss": 2.658, "step": 16040 }, { "epoch": 3.02, "grad_norm": 17.01116180419922, "learning_rate": 1.3958215697346134e-05, "loss": 2.6828, "step": 16050 }, { "epoch": 3.02, "grad_norm": 7.601544380187988, "learning_rate": 1.3954451345755696e-05, "loss": 2.5622, "step": 16060 }, { "epoch": 3.02, "grad_norm": 19.160001754760742, "learning_rate": 1.3950686994165257e-05, "loss": 2.4935, "step": 16070 }, { "epoch": 3.03, "grad_norm": 4.74172830581665, "learning_rate": 1.3946922642574819e-05, "loss": 2.6513, "step": 16080 }, { "epoch": 3.03, "grad_norm": 4.864106178283691, "learning_rate": 1.394315829098438e-05, "loss": 2.6307, "step": 16090 }, { "epoch": 3.03, "grad_norm": 7.812783241271973, "learning_rate": 1.3939393939393942e-05, "loss": 2.7918, "step": 16100 }, { "epoch": 3.03, "grad_norm": 4.93768310546875, "learning_rate": 1.3935629587803503e-05, "loss": 2.7099, "step": 16110 }, { "epoch": 3.03, "grad_norm": 21.922788619995117, "learning_rate": 1.3931865236213063e-05, "loss": 2.9016, "step": 16120 }, { "epoch": 3.04, "grad_norm": 5.226830005645752, "learning_rate": 1.3928100884622624e-05, "loss": 2.8047, "step": 16130 }, { "epoch": 3.04, "grad_norm": 15.841909408569336, "learning_rate": 1.3924336533032186e-05, "loss": 2.2892, "step": 16140 }, { "epoch": 3.04, "grad_norm": 14.991548538208008, "learning_rate": 1.3920572181441747e-05, "loss": 2.4517, "step": 16150 }, { "epoch": 3.04, "grad_norm": 4.272622108459473, "learning_rate": 1.3916807829851309e-05, "loss": 2.524, "step": 16160 }, { "epoch": 3.04, "grad_norm": 8.906754493713379, "learning_rate": 1.391304347826087e-05, "loss": 2.7115, "step": 16170 }, { "epoch": 3.05, "grad_norm": 8.2300443649292, "learning_rate": 1.3909279126670432e-05, "loss": 2.4855, "step": 16180 }, { "epoch": 3.05, "grad_norm": 5.336661338806152, "learning_rate": 1.3905514775079993e-05, "loss": 2.6553, "step": 16190 }, { "epoch": 3.05, "grad_norm": 13.912590980529785, "learning_rate": 1.3901750423489554e-05, "loss": 2.666, "step": 16200 }, { "epoch": 3.05, "grad_norm": 9.392287254333496, "learning_rate": 1.3897986071899116e-05, "loss": 2.5157, 
"step": 16210 }, { "epoch": 3.05, "grad_norm": 4.785181045532227, "learning_rate": 1.3894221720308679e-05, "loss": 2.4368, "step": 16220 }, { "epoch": 3.05, "grad_norm": 5.415240287780762, "learning_rate": 1.389045736871824e-05, "loss": 2.4729, "step": 16230 }, { "epoch": 3.06, "grad_norm": 6.799906253814697, "learning_rate": 1.3886693017127802e-05, "loss": 2.5026, "step": 16240 }, { "epoch": 3.06, "grad_norm": 7.464561462402344, "learning_rate": 1.3882928665537363e-05, "loss": 2.3939, "step": 16250 }, { "epoch": 3.06, "grad_norm": 7.054912090301514, "learning_rate": 1.3879164313946925e-05, "loss": 2.3711, "step": 16260 }, { "epoch": 3.06, "grad_norm": 9.757431030273438, "learning_rate": 1.3875399962356486e-05, "loss": 2.4576, "step": 16270 }, { "epoch": 3.06, "grad_norm": 5.594775199890137, "learning_rate": 1.3871635610766048e-05, "loss": 2.6047, "step": 16280 }, { "epoch": 3.07, "grad_norm": 11.136175155639648, "learning_rate": 1.3867871259175608e-05, "loss": 2.5492, "step": 16290 }, { "epoch": 3.07, "grad_norm": 9.653315544128418, "learning_rate": 1.3864106907585169e-05, "loss": 2.7153, "step": 16300 }, { "epoch": 3.07, "grad_norm": 16.59368896484375, "learning_rate": 1.386034255599473e-05, "loss": 2.9371, "step": 16310 }, { "epoch": 3.07, "grad_norm": 6.285935878753662, "learning_rate": 1.3856578204404292e-05, "loss": 2.4203, "step": 16320 }, { "epoch": 3.07, "grad_norm": 12.642993927001953, "learning_rate": 1.3852813852813853e-05, "loss": 2.5848, "step": 16330 }, { "epoch": 3.08, "grad_norm": 6.345438480377197, "learning_rate": 1.3849049501223415e-05, "loss": 2.4782, "step": 16340 }, { "epoch": 3.08, "grad_norm": 7.475521564483643, "learning_rate": 1.3845285149632976e-05, "loss": 2.8802, "step": 16350 }, { "epoch": 3.08, "grad_norm": 7.40354061126709, "learning_rate": 1.3841520798042538e-05, "loss": 2.5257, "step": 16360 }, { "epoch": 3.08, "grad_norm": 5.398587226867676, "learning_rate": 1.38377564464521e-05, "loss": 2.8319, "step": 16370 }, { "epoch": 3.08, "grad_norm": 4.676400661468506, "learning_rate": 1.383399209486166e-05, "loss": 2.4131, "step": 16380 }, { "epoch": 3.08, "grad_norm": 7.838329315185547, "learning_rate": 1.3830227743271222e-05, "loss": 2.6379, "step": 16390 }, { "epoch": 3.09, "grad_norm": 6.008301258087158, "learning_rate": 1.3826463391680784e-05, "loss": 2.5417, "step": 16400 }, { "epoch": 3.09, "grad_norm": 7.391419410705566, "learning_rate": 1.3822699040090347e-05, "loss": 2.5861, "step": 16410 }, { "epoch": 3.09, "grad_norm": 5.241502285003662, "learning_rate": 1.3818934688499908e-05, "loss": 2.5454, "step": 16420 }, { "epoch": 3.09, "grad_norm": 6.2493367195129395, "learning_rate": 1.381517033690947e-05, "loss": 2.5099, "step": 16430 }, { "epoch": 3.09, "grad_norm": 10.405550956726074, "learning_rate": 1.3811405985319031e-05, "loss": 2.5737, "step": 16440 }, { "epoch": 3.1, "grad_norm": 10.087574005126953, "learning_rate": 1.3807641633728593e-05, "loss": 2.314, "step": 16450 }, { "epoch": 3.1, "grad_norm": 6.864453315734863, "learning_rate": 1.3803877282138154e-05, "loss": 2.5797, "step": 16460 }, { "epoch": 3.1, "grad_norm": 4.043096542358398, "learning_rate": 1.3800112930547714e-05, "loss": 2.3865, "step": 16470 }, { "epoch": 3.1, "grad_norm": 8.557513236999512, "learning_rate": 1.3796348578957275e-05, "loss": 2.459, "step": 16480 }, { "epoch": 3.1, "grad_norm": 25.291898727416992, "learning_rate": 1.3792584227366837e-05, "loss": 2.5729, "step": 16490 }, { "epoch": 3.11, "grad_norm": 29.470672607421875, "learning_rate": 1.3788819875776398e-05, "loss": 
2.6194, "step": 16500 }, { "epoch": 3.11, "grad_norm": 5.019947528839111, "learning_rate": 1.378505552418596e-05, "loss": 2.2763, "step": 16510 }, { "epoch": 3.11, "grad_norm": 14.667228698730469, "learning_rate": 1.3781291172595521e-05, "loss": 2.4079, "step": 16520 }, { "epoch": 3.11, "grad_norm": 16.10256576538086, "learning_rate": 1.3777526821005083e-05, "loss": 2.8521, "step": 16530 }, { "epoch": 3.11, "grad_norm": 5.297245025634766, "learning_rate": 1.3773762469414644e-05, "loss": 2.4195, "step": 16540 }, { "epoch": 3.12, "grad_norm": 6.964445114135742, "learning_rate": 1.3769998117824206e-05, "loss": 2.539, "step": 16550 }, { "epoch": 3.12, "grad_norm": 8.060262680053711, "learning_rate": 1.3766233766233767e-05, "loss": 2.6806, "step": 16560 }, { "epoch": 3.12, "grad_norm": 27.346101760864258, "learning_rate": 1.3762469414643328e-05, "loss": 2.5731, "step": 16570 }, { "epoch": 3.12, "grad_norm": 13.408673286437988, "learning_rate": 1.375870506305289e-05, "loss": 2.3761, "step": 16580 }, { "epoch": 3.12, "grad_norm": 8.398408889770508, "learning_rate": 1.3754940711462453e-05, "loss": 2.4591, "step": 16590 }, { "epoch": 3.12, "grad_norm": 9.513443946838379, "learning_rate": 1.3751176359872015e-05, "loss": 2.4043, "step": 16600 }, { "epoch": 3.13, "grad_norm": 5.014272689819336, "learning_rate": 1.3747412008281576e-05, "loss": 2.8311, "step": 16610 }, { "epoch": 3.13, "grad_norm": 13.03158950805664, "learning_rate": 1.3743647656691137e-05, "loss": 2.3034, "step": 16620 }, { "epoch": 3.13, "grad_norm": 25.35574722290039, "learning_rate": 1.3739883305100699e-05, "loss": 2.5601, "step": 16630 }, { "epoch": 3.13, "grad_norm": 7.676589488983154, "learning_rate": 1.3736118953510259e-05, "loss": 2.4727, "step": 16640 }, { "epoch": 3.13, "grad_norm": 6.241795063018799, "learning_rate": 1.373235460191982e-05, "loss": 2.344, "step": 16650 }, { "epoch": 3.14, "grad_norm": 8.19005298614502, "learning_rate": 1.3728590250329382e-05, "loss": 2.4407, "step": 16660 }, { "epoch": 3.14, "grad_norm": 9.252071380615234, "learning_rate": 1.3724825898738943e-05, "loss": 2.3879, "step": 16670 }, { "epoch": 3.14, "grad_norm": 4.64891242980957, "learning_rate": 1.3721061547148505e-05, "loss": 2.478, "step": 16680 }, { "epoch": 3.14, "grad_norm": 12.982640266418457, "learning_rate": 1.3717297195558066e-05, "loss": 2.7787, "step": 16690 }, { "epoch": 3.14, "grad_norm": 4.891650676727295, "learning_rate": 1.3713532843967627e-05, "loss": 2.2632, "step": 16700 }, { "epoch": 3.15, "grad_norm": 16.587858200073242, "learning_rate": 1.3709768492377189e-05, "loss": 2.2535, "step": 16710 }, { "epoch": 3.15, "grad_norm": 8.104990005493164, "learning_rate": 1.370600414078675e-05, "loss": 2.3733, "step": 16720 }, { "epoch": 3.15, "grad_norm": 7.4662394523620605, "learning_rate": 1.3702239789196312e-05, "loss": 2.115, "step": 16730 }, { "epoch": 3.15, "grad_norm": 21.421432495117188, "learning_rate": 1.3698475437605873e-05, "loss": 2.3562, "step": 16740 }, { "epoch": 3.15, "grad_norm": 5.357096195220947, "learning_rate": 1.3694711086015435e-05, "loss": 2.2325, "step": 16750 }, { "epoch": 3.15, "grad_norm": 6.147331714630127, "learning_rate": 1.3690946734424996e-05, "loss": 2.4206, "step": 16760 }, { "epoch": 3.16, "grad_norm": 11.714056015014648, "learning_rate": 1.3687182382834558e-05, "loss": 2.4481, "step": 16770 }, { "epoch": 3.16, "grad_norm": 9.148975372314453, "learning_rate": 1.368341803124412e-05, "loss": 2.3104, "step": 16780 }, { "epoch": 3.16, "grad_norm": 10.10844898223877, "learning_rate": 
1.3679653679653682e-05, "loss": 2.3505, "step": 16790 }, { "epoch": 3.16, "grad_norm": 4.908918857574463, "learning_rate": 1.3675889328063244e-05, "loss": 2.4205, "step": 16800 }, { "epoch": 3.16, "grad_norm": 9.376786231994629, "learning_rate": 1.3672124976472802e-05, "loss": 2.6518, "step": 16810 }, { "epoch": 3.17, "grad_norm": 15.973650932312012, "learning_rate": 1.3668360624882365e-05, "loss": 2.3571, "step": 16820 }, { "epoch": 3.17, "grad_norm": 6.914224624633789, "learning_rate": 1.3664596273291926e-05, "loss": 2.1983, "step": 16830 }, { "epoch": 3.17, "grad_norm": 5.047109127044678, "learning_rate": 1.3660831921701488e-05, "loss": 2.1406, "step": 16840 }, { "epoch": 3.17, "grad_norm": 7.190529823303223, "learning_rate": 1.365706757011105e-05, "loss": 2.3257, "step": 16850 }, { "epoch": 3.17, "grad_norm": 17.389127731323242, "learning_rate": 1.365330321852061e-05, "loss": 2.4099, "step": 16860 }, { "epoch": 3.18, "grad_norm": 8.829944610595703, "learning_rate": 1.3649538866930172e-05, "loss": 2.1792, "step": 16870 }, { "epoch": 3.18, "grad_norm": 7.4745988845825195, "learning_rate": 1.3645774515339734e-05, "loss": 2.4042, "step": 16880 }, { "epoch": 3.18, "grad_norm": 7.47590446472168, "learning_rate": 1.3642010163749295e-05, "loss": 2.2335, "step": 16890 }, { "epoch": 3.18, "grad_norm": 8.67519760131836, "learning_rate": 1.3638245812158857e-05, "loss": 2.4963, "step": 16900 }, { "epoch": 3.18, "grad_norm": 13.422600746154785, "learning_rate": 1.3634481460568418e-05, "loss": 2.4068, "step": 16910 }, { "epoch": 3.18, "grad_norm": 5.132624626159668, "learning_rate": 1.363071710897798e-05, "loss": 2.0553, "step": 16920 }, { "epoch": 3.19, "grad_norm": 3.634127378463745, "learning_rate": 1.3626952757387541e-05, "loss": 2.3048, "step": 16930 }, { "epoch": 3.19, "grad_norm": 12.553937911987305, "learning_rate": 1.3623188405797103e-05, "loss": 2.2707, "step": 16940 }, { "epoch": 3.19, "grad_norm": 8.07034969329834, "learning_rate": 1.3619424054206664e-05, "loss": 2.3505, "step": 16950 }, { "epoch": 3.19, "grad_norm": 7.7102580070495605, "learning_rate": 1.3615659702616225e-05, "loss": 2.2881, "step": 16960 }, { "epoch": 3.19, "grad_norm": 6.294566631317139, "learning_rate": 1.3611895351025789e-05, "loss": 2.2425, "step": 16970 }, { "epoch": 3.2, "grad_norm": 21.3561954498291, "learning_rate": 1.360813099943535e-05, "loss": 2.5824, "step": 16980 }, { "epoch": 3.2, "grad_norm": 23.01136016845703, "learning_rate": 1.3604366647844908e-05, "loss": 2.1891, "step": 16990 }, { "epoch": 3.2, "grad_norm": 13.740280151367188, "learning_rate": 1.360060229625447e-05, "loss": 2.1156, "step": 17000 }, { "epoch": 3.2, "grad_norm": 6.194286823272705, "learning_rate": 1.3596837944664033e-05, "loss": 2.1015, "step": 17010 }, { "epoch": 3.2, "grad_norm": 6.71209192276001, "learning_rate": 1.3593073593073594e-05, "loss": 2.0999, "step": 17020 }, { "epoch": 3.21, "grad_norm": 7.006505966186523, "learning_rate": 1.3589309241483156e-05, "loss": 2.1711, "step": 17030 }, { "epoch": 3.21, "grad_norm": 6.750763416290283, "learning_rate": 1.3585544889892717e-05, "loss": 2.6033, "step": 17040 }, { "epoch": 3.21, "grad_norm": 4.699491500854492, "learning_rate": 1.3581780538302279e-05, "loss": 2.075, "step": 17050 }, { "epoch": 3.21, "grad_norm": 9.14957046508789, "learning_rate": 1.357801618671184e-05, "loss": 2.4523, "step": 17060 }, { "epoch": 3.21, "grad_norm": 4.098272323608398, "learning_rate": 1.3574251835121402e-05, "loss": 1.8619, "step": 17070 }, { "epoch": 3.21, "grad_norm": 20.661664962768555, 
"learning_rate": 1.3570487483530963e-05, "loss": 2.3277, "step": 17080 }, { "epoch": 3.22, "grad_norm": 11.551385879516602, "learning_rate": 1.3566723131940524e-05, "loss": 2.4337, "step": 17090 }, { "epoch": 3.22, "grad_norm": 6.284670829772949, "learning_rate": 1.3562958780350086e-05, "loss": 2.696, "step": 17100 }, { "epoch": 3.22, "grad_norm": 8.457011222839355, "learning_rate": 1.3559194428759647e-05, "loss": 2.4836, "step": 17110 }, { "epoch": 3.22, "grad_norm": 8.318930625915527, "learning_rate": 1.3555430077169209e-05, "loss": 2.2709, "step": 17120 }, { "epoch": 3.22, "grad_norm": 4.008598327636719, "learning_rate": 1.355166572557877e-05, "loss": 2.2146, "step": 17130 }, { "epoch": 3.23, "grad_norm": 5.7014946937561035, "learning_rate": 1.3547901373988332e-05, "loss": 2.4608, "step": 17140 }, { "epoch": 3.23, "grad_norm": 6.472574710845947, "learning_rate": 1.3544137022397895e-05, "loss": 2.2396, "step": 17150 }, { "epoch": 3.23, "grad_norm": 7.037259101867676, "learning_rate": 1.3540372670807453e-05, "loss": 2.4347, "step": 17160 }, { "epoch": 3.23, "grad_norm": 14.940933227539062, "learning_rate": 1.3536608319217014e-05, "loss": 2.2869, "step": 17170 }, { "epoch": 3.23, "grad_norm": 5.644647121429443, "learning_rate": 1.3532843967626576e-05, "loss": 2.4104, "step": 17180 }, { "epoch": 3.24, "grad_norm": 8.700958251953125, "learning_rate": 1.3529079616036137e-05, "loss": 2.275, "step": 17190 }, { "epoch": 3.24, "grad_norm": 10.173393249511719, "learning_rate": 1.35253152644457e-05, "loss": 2.2258, "step": 17200 }, { "epoch": 3.24, "grad_norm": 9.289419174194336, "learning_rate": 1.3521550912855262e-05, "loss": 2.1307, "step": 17210 }, { "epoch": 3.24, "grad_norm": 10.507315635681152, "learning_rate": 1.3517786561264823e-05, "loss": 2.2228, "step": 17220 }, { "epoch": 3.24, "grad_norm": 17.90699005126953, "learning_rate": 1.3514022209674385e-05, "loss": 2.1789, "step": 17230 }, { "epoch": 3.24, "grad_norm": 17.78692626953125, "learning_rate": 1.3510257858083946e-05, "loss": 2.286, "step": 17240 }, { "epoch": 3.25, "grad_norm": 20.160356521606445, "learning_rate": 1.3506493506493508e-05, "loss": 2.4036, "step": 17250 }, { "epoch": 3.25, "grad_norm": 14.273446083068848, "learning_rate": 1.350272915490307e-05, "loss": 2.1708, "step": 17260 }, { "epoch": 3.25, "grad_norm": 5.817999839782715, "learning_rate": 1.349896480331263e-05, "loss": 2.437, "step": 17270 }, { "epoch": 3.25, "grad_norm": 8.525969505310059, "learning_rate": 1.3495200451722192e-05, "loss": 2.1232, "step": 17280 }, { "epoch": 3.25, "grad_norm": 10.203720092773438, "learning_rate": 1.3491436100131754e-05, "loss": 2.2521, "step": 17290 }, { "epoch": 3.26, "grad_norm": 7.757316589355469, "learning_rate": 1.3487671748541315e-05, "loss": 2.3627, "step": 17300 }, { "epoch": 3.26, "grad_norm": 4.353641986846924, "learning_rate": 1.3483907396950877e-05, "loss": 2.2512, "step": 17310 }, { "epoch": 3.26, "grad_norm": 10.248074531555176, "learning_rate": 1.3480143045360438e-05, "loss": 2.5071, "step": 17320 }, { "epoch": 3.26, "grad_norm": 13.115521430969238, "learning_rate": 1.347637869377e-05, "loss": 2.3108, "step": 17330 }, { "epoch": 3.26, "grad_norm": 5.469527721405029, "learning_rate": 1.347261434217956e-05, "loss": 1.7027, "step": 17340 }, { "epoch": 3.27, "grad_norm": 5.834651947021484, "learning_rate": 1.346884999058912e-05, "loss": 2.2962, "step": 17350 }, { "epoch": 3.27, "grad_norm": 8.019462585449219, "learning_rate": 1.3465085638998682e-05, "loss": 2.649, "step": 17360 }, { "epoch": 3.27, "grad_norm": 
10.119734764099121, "learning_rate": 1.3461321287408244e-05, "loss": 2.1964, "step": 17370 }, { "epoch": 3.27, "grad_norm": 7.287962913513184, "learning_rate": 1.3457556935817807e-05, "loss": 2.5475, "step": 17380 }, { "epoch": 3.27, "grad_norm": 7.343551158905029, "learning_rate": 1.3453792584227368e-05, "loss": 2.273, "step": 17390 }, { "epoch": 3.27, "grad_norm": 9.913813591003418, "learning_rate": 1.345002823263693e-05, "loss": 2.267, "step": 17400 }, { "epoch": 3.28, "grad_norm": 9.015138626098633, "learning_rate": 1.3446263881046491e-05, "loss": 2.1588, "step": 17410 }, { "epoch": 3.28, "grad_norm": 17.901674270629883, "learning_rate": 1.3442499529456053e-05, "loss": 2.1719, "step": 17420 }, { "epoch": 3.28, "grad_norm": 24.501487731933594, "learning_rate": 1.3438735177865614e-05, "loss": 2.2731, "step": 17430 }, { "epoch": 3.28, "grad_norm": 6.028817653656006, "learning_rate": 1.3434970826275176e-05, "loss": 2.1774, "step": 17440 }, { "epoch": 3.28, "grad_norm": 9.6994047164917, "learning_rate": 1.3431206474684737e-05, "loss": 2.5506, "step": 17450 }, { "epoch": 3.29, "grad_norm": 6.670959949493408, "learning_rate": 1.3427442123094298e-05, "loss": 2.2378, "step": 17460 }, { "epoch": 3.29, "grad_norm": 20.83564567565918, "learning_rate": 1.342367777150386e-05, "loss": 2.1959, "step": 17470 }, { "epoch": 3.29, "grad_norm": 11.419940948486328, "learning_rate": 1.3419913419913421e-05, "loss": 2.041, "step": 17480 }, { "epoch": 3.29, "grad_norm": 7.676941394805908, "learning_rate": 1.3416149068322983e-05, "loss": 2.1222, "step": 17490 }, { "epoch": 3.29, "grad_norm": 16.266496658325195, "learning_rate": 1.3412384716732544e-05, "loss": 2.3506, "step": 17500 }, { "epoch": 3.3, "grad_norm": 9.446209907531738, "learning_rate": 1.3408620365142104e-05, "loss": 2.5076, "step": 17510 }, { "epoch": 3.3, "grad_norm": 4.858938217163086, "learning_rate": 1.3404856013551666e-05, "loss": 1.9202, "step": 17520 }, { "epoch": 3.3, "grad_norm": 3.9233055114746094, "learning_rate": 1.3401091661961227e-05, "loss": 2.0582, "step": 17530 }, { "epoch": 3.3, "grad_norm": 6.448343753814697, "learning_rate": 1.3397327310370788e-05, "loss": 2.3783, "step": 17540 }, { "epoch": 3.3, "grad_norm": 22.5777587890625, "learning_rate": 1.339356295878035e-05, "loss": 2.2551, "step": 17550 }, { "epoch": 3.31, "grad_norm": 5.1327104568481445, "learning_rate": 1.3389798607189911e-05, "loss": 2.1806, "step": 17560 }, { "epoch": 3.31, "grad_norm": 9.523948669433594, "learning_rate": 1.3386034255599475e-05, "loss": 2.4007, "step": 17570 }, { "epoch": 3.31, "grad_norm": 6.770203590393066, "learning_rate": 1.3382269904009036e-05, "loss": 2.5531, "step": 17580 }, { "epoch": 3.31, "grad_norm": 5.728158473968506, "learning_rate": 1.3378505552418597e-05, "loss": 2.116, "step": 17590 }, { "epoch": 3.31, "grad_norm": 15.523874282836914, "learning_rate": 1.3374741200828159e-05, "loss": 2.323, "step": 17600 }, { "epoch": 3.31, "grad_norm": 27.21449851989746, "learning_rate": 1.337097684923772e-05, "loss": 2.0659, "step": 17610 }, { "epoch": 3.32, "grad_norm": 5.338441848754883, "learning_rate": 1.3367212497647282e-05, "loss": 2.003, "step": 17620 }, { "epoch": 3.32, "grad_norm": 6.023446559906006, "learning_rate": 1.3363448146056843e-05, "loss": 2.1356, "step": 17630 }, { "epoch": 3.32, "grad_norm": 6.757903575897217, "learning_rate": 1.3359683794466405e-05, "loss": 2.4006, "step": 17640 }, { "epoch": 3.32, "grad_norm": 12.635032653808594, "learning_rate": 1.3355919442875966e-05, "loss": 2.0722, "step": 17650 }, { "epoch": 3.32, 
"grad_norm": 9.285216331481934, "learning_rate": 1.3352155091285528e-05, "loss": 2.3259, "step": 17660 }, { "epoch": 3.33, "grad_norm": 12.814033508300781, "learning_rate": 1.3348390739695089e-05, "loss": 2.2402, "step": 17670 }, { "epoch": 3.33, "grad_norm": 8.399075508117676, "learning_rate": 1.334462638810465e-05, "loss": 2.1736, "step": 17680 }, { "epoch": 3.33, "grad_norm": 6.659306049346924, "learning_rate": 1.334086203651421e-05, "loss": 2.0956, "step": 17690 }, { "epoch": 3.33, "grad_norm": 8.907916069030762, "learning_rate": 1.3337097684923772e-05, "loss": 1.9927, "step": 17700 }, { "epoch": 3.33, "grad_norm": 6.215513229370117, "learning_rate": 1.3333333333333333e-05, "loss": 2.0337, "step": 17710 }, { "epoch": 3.34, "grad_norm": 4.003261089324951, "learning_rate": 1.3329568981742895e-05, "loss": 2.1053, "step": 17720 }, { "epoch": 3.34, "grad_norm": 23.838529586791992, "learning_rate": 1.3325804630152456e-05, "loss": 2.3904, "step": 17730 }, { "epoch": 3.34, "grad_norm": 6.266791343688965, "learning_rate": 1.3322040278562018e-05, "loss": 2.2668, "step": 17740 }, { "epoch": 3.34, "grad_norm": 14.216041564941406, "learning_rate": 1.331827592697158e-05, "loss": 2.0331, "step": 17750 }, { "epoch": 3.34, "grad_norm": 12.211094856262207, "learning_rate": 1.3314511575381142e-05, "loss": 2.2239, "step": 17760 }, { "epoch": 3.34, "grad_norm": 6.941567897796631, "learning_rate": 1.3310747223790704e-05, "loss": 2.1712, "step": 17770 }, { "epoch": 3.35, "grad_norm": 6.195743083953857, "learning_rate": 1.3306982872200265e-05, "loss": 2.0734, "step": 17780 }, { "epoch": 3.35, "grad_norm": 6.717380046844482, "learning_rate": 1.3303218520609827e-05, "loss": 2.3201, "step": 17790 }, { "epoch": 3.35, "grad_norm": 12.03931999206543, "learning_rate": 1.3299454169019388e-05, "loss": 2.2702, "step": 17800 }, { "epoch": 3.35, "grad_norm": 11.816142082214355, "learning_rate": 1.329568981742895e-05, "loss": 2.1566, "step": 17810 }, { "epoch": 3.35, "grad_norm": 17.85408592224121, "learning_rate": 1.3291925465838511e-05, "loss": 2.1189, "step": 17820 }, { "epoch": 3.36, "grad_norm": 7.314423084259033, "learning_rate": 1.3288161114248073e-05, "loss": 2.0172, "step": 17830 }, { "epoch": 3.36, "grad_norm": 12.552395820617676, "learning_rate": 1.3284396762657634e-05, "loss": 2.2597, "step": 17840 }, { "epoch": 3.36, "grad_norm": 9.304374694824219, "learning_rate": 1.3280632411067195e-05, "loss": 2.1937, "step": 17850 }, { "epoch": 3.36, "grad_norm": 5.545548915863037, "learning_rate": 1.3276868059476755e-05, "loss": 2.2196, "step": 17860 }, { "epoch": 3.36, "grad_norm": 10.859105110168457, "learning_rate": 1.3273103707886317e-05, "loss": 2.1488, "step": 17870 }, { "epoch": 3.37, "grad_norm": 7.488467693328857, "learning_rate": 1.3269339356295878e-05, "loss": 2.1601, "step": 17880 }, { "epoch": 3.37, "grad_norm": 7.243104934692383, "learning_rate": 1.326557500470544e-05, "loss": 2.1451, "step": 17890 }, { "epoch": 3.37, "grad_norm": 7.471163272857666, "learning_rate": 1.3261810653115001e-05, "loss": 2.065, "step": 17900 }, { "epoch": 3.37, "grad_norm": 9.878214836120605, "learning_rate": 1.3258046301524563e-05, "loss": 2.0346, "step": 17910 }, { "epoch": 3.37, "grad_norm": 11.709124565124512, "learning_rate": 1.3254281949934124e-05, "loss": 2.2273, "step": 17920 }, { "epoch": 3.37, "grad_norm": 22.54524803161621, "learning_rate": 1.3250517598343685e-05, "loss": 2.037, "step": 17930 }, { "epoch": 3.38, "grad_norm": 3.9867968559265137, "learning_rate": 1.3246753246753249e-05, "loss": 2.2627, "step": 17940 }, { 
"epoch": 3.38, "grad_norm": 7.225203514099121, "learning_rate": 1.324298889516281e-05, "loss": 2.1492, "step": 17950 }, { "epoch": 3.38, "grad_norm": 7.8236775398254395, "learning_rate": 1.3239224543572372e-05, "loss": 2.2366, "step": 17960 }, { "epoch": 3.38, "grad_norm": 4.443457126617432, "learning_rate": 1.3235460191981933e-05, "loss": 1.9925, "step": 17970 }, { "epoch": 3.38, "grad_norm": 14.248533248901367, "learning_rate": 1.3231695840391494e-05, "loss": 2.187, "step": 17980 }, { "epoch": 3.39, "grad_norm": 5.991246700286865, "learning_rate": 1.3227931488801056e-05, "loss": 2.0112, "step": 17990 }, { "epoch": 3.39, "grad_norm": 10.76357650756836, "learning_rate": 1.3224167137210617e-05, "loss": 2.4213, "step": 18000 }, { "epoch": 3.39, "grad_norm": 5.766618251800537, "learning_rate": 1.3220402785620179e-05, "loss": 1.7307, "step": 18010 }, { "epoch": 3.39, "grad_norm": 6.714885234832764, "learning_rate": 1.321663843402974e-05, "loss": 2.1758, "step": 18020 }, { "epoch": 3.39, "grad_norm": 11.681251525878906, "learning_rate": 1.32128740824393e-05, "loss": 2.3254, "step": 18030 }, { "epoch": 3.4, "grad_norm": 15.801177024841309, "learning_rate": 1.3209109730848861e-05, "loss": 2.1412, "step": 18040 }, { "epoch": 3.4, "grad_norm": 9.477062225341797, "learning_rate": 1.3205345379258423e-05, "loss": 1.9209, "step": 18050 }, { "epoch": 3.4, "grad_norm": 4.855134963989258, "learning_rate": 1.3201581027667984e-05, "loss": 2.0294, "step": 18060 }, { "epoch": 3.4, "grad_norm": 6.468574047088623, "learning_rate": 1.3197816676077546e-05, "loss": 2.0421, "step": 18070 }, { "epoch": 3.4, "grad_norm": 8.422307014465332, "learning_rate": 1.3194052324487107e-05, "loss": 1.9623, "step": 18080 }, { "epoch": 3.4, "grad_norm": 15.541171073913574, "learning_rate": 1.3190287972896669e-05, "loss": 1.9774, "step": 18090 }, { "epoch": 3.41, "grad_norm": 3.8373525142669678, "learning_rate": 1.318652362130623e-05, "loss": 1.8363, "step": 18100 }, { "epoch": 3.41, "grad_norm": 4.823386192321777, "learning_rate": 1.3182759269715792e-05, "loss": 2.2143, "step": 18110 }, { "epoch": 3.41, "grad_norm": 5.459898471832275, "learning_rate": 1.3178994918125355e-05, "loss": 1.9873, "step": 18120 }, { "epoch": 3.41, "grad_norm": 7.368117809295654, "learning_rate": 1.3175230566534916e-05, "loss": 1.9806, "step": 18130 }, { "epoch": 3.41, "grad_norm": 6.924953937530518, "learning_rate": 1.3171466214944478e-05, "loss": 2.1295, "step": 18140 }, { "epoch": 3.42, "grad_norm": 11.13085651397705, "learning_rate": 1.316770186335404e-05, "loss": 2.1773, "step": 18150 }, { "epoch": 3.42, "grad_norm": 43.57018280029297, "learning_rate": 1.31639375117636e-05, "loss": 1.9276, "step": 18160 }, { "epoch": 3.42, "grad_norm": 6.509583950042725, "learning_rate": 1.3160173160173162e-05, "loss": 2.0282, "step": 18170 }, { "epoch": 3.42, "grad_norm": 28.11988639831543, "learning_rate": 1.3156408808582724e-05, "loss": 1.9403, "step": 18180 }, { "epoch": 3.42, "grad_norm": 4.456322193145752, "learning_rate": 1.3152644456992285e-05, "loss": 1.9777, "step": 18190 }, { "epoch": 3.43, "grad_norm": 10.495478630065918, "learning_rate": 1.3148880105401847e-05, "loss": 1.9311, "step": 18200 }, { "epoch": 3.43, "grad_norm": 9.403477668762207, "learning_rate": 1.3145115753811406e-05, "loss": 1.9901, "step": 18210 }, { "epoch": 3.43, "grad_norm": 12.529974937438965, "learning_rate": 1.3141351402220968e-05, "loss": 2.2266, "step": 18220 }, { "epoch": 3.43, "grad_norm": 8.205238342285156, "learning_rate": 1.313758705063053e-05, "loss": 1.8873, "step": 18230 
}, { "epoch": 3.43, "grad_norm": 8.789932250976562, "learning_rate": 1.313382269904009e-05, "loss": 2.3908, "step": 18240 }, { "epoch": 3.43, "grad_norm": 7.4011383056640625, "learning_rate": 1.3130058347449652e-05, "loss": 2.1811, "step": 18250 }, { "epoch": 3.44, "grad_norm": 7.565342903137207, "learning_rate": 1.3126293995859214e-05, "loss": 1.906, "step": 18260 }, { "epoch": 3.44, "grad_norm": 4.4078192710876465, "learning_rate": 1.3122529644268775e-05, "loss": 1.7022, "step": 18270 }, { "epoch": 3.44, "grad_norm": 29.880828857421875, "learning_rate": 1.3118765292678337e-05, "loss": 2.0516, "step": 18280 }, { "epoch": 3.44, "grad_norm": 10.716898918151855, "learning_rate": 1.3115000941087898e-05, "loss": 1.8212, "step": 18290 }, { "epoch": 3.44, "grad_norm": 15.047762870788574, "learning_rate": 1.311123658949746e-05, "loss": 1.8639, "step": 18300 }, { "epoch": 3.45, "grad_norm": 8.001484870910645, "learning_rate": 1.3107472237907023e-05, "loss": 2.097, "step": 18310 }, { "epoch": 3.45, "grad_norm": 8.17911434173584, "learning_rate": 1.3103707886316584e-05, "loss": 1.998, "step": 18320 }, { "epoch": 3.45, "grad_norm": 7.078904151916504, "learning_rate": 1.3099943534726146e-05, "loss": 2.0324, "step": 18330 }, { "epoch": 3.45, "grad_norm": 19.999042510986328, "learning_rate": 1.3096179183135707e-05, "loss": 2.2589, "step": 18340 }, { "epoch": 3.45, "grad_norm": 6.821465492248535, "learning_rate": 1.3092414831545268e-05, "loss": 2.0181, "step": 18350 }, { "epoch": 3.46, "grad_norm": 16.801586151123047, "learning_rate": 1.308865047995483e-05, "loss": 2.1832, "step": 18360 }, { "epoch": 3.46, "grad_norm": 8.644526481628418, "learning_rate": 1.3084886128364391e-05, "loss": 2.1414, "step": 18370 }, { "epoch": 3.46, "grad_norm": 7.96954870223999, "learning_rate": 1.3081121776773951e-05, "loss": 1.913, "step": 18380 }, { "epoch": 3.46, "grad_norm": 6.405788421630859, "learning_rate": 1.3077357425183513e-05, "loss": 1.7991, "step": 18390 }, { "epoch": 3.46, "grad_norm": 5.472543716430664, "learning_rate": 1.3073593073593074e-05, "loss": 1.8373, "step": 18400 }, { "epoch": 3.47, "grad_norm": 11.308146476745605, "learning_rate": 1.3069828722002636e-05, "loss": 2.0228, "step": 18410 }, { "epoch": 3.47, "grad_norm": 15.845481872558594, "learning_rate": 1.3066064370412197e-05, "loss": 1.9928, "step": 18420 }, { "epoch": 3.47, "grad_norm": 8.689587593078613, "learning_rate": 1.3062300018821758e-05, "loss": 2.0818, "step": 18430 }, { "epoch": 3.47, "grad_norm": 27.186220169067383, "learning_rate": 1.305853566723132e-05, "loss": 1.9161, "step": 18440 }, { "epoch": 3.47, "grad_norm": 6.466693878173828, "learning_rate": 1.3054771315640881e-05, "loss": 1.9085, "step": 18450 }, { "epoch": 3.47, "grad_norm": 11.024419784545898, "learning_rate": 1.3051006964050443e-05, "loss": 1.9601, "step": 18460 }, { "epoch": 3.48, "grad_norm": 5.090084075927734, "learning_rate": 1.3047242612460004e-05, "loss": 2.143, "step": 18470 }, { "epoch": 3.48, "grad_norm": 12.95552921295166, "learning_rate": 1.3043478260869566e-05, "loss": 2.4482, "step": 18480 }, { "epoch": 3.48, "grad_norm": 9.394156455993652, "learning_rate": 1.3039713909279127e-05, "loss": 1.914, "step": 18490 }, { "epoch": 3.48, "grad_norm": 5.978391647338867, "learning_rate": 1.303594955768869e-05, "loss": 2.002, "step": 18500 }, { "epoch": 3.48, "grad_norm": 5.844174861907959, "learning_rate": 1.3032185206098252e-05, "loss": 2.3795, "step": 18510 }, { "epoch": 3.49, "grad_norm": 24.756702423095703, "learning_rate": 1.3028420854507813e-05, "loss": 1.9738, 
"step": 18520 }, { "epoch": 3.49, "grad_norm": 6.950799465179443, "learning_rate": 1.3024656502917375e-05, "loss": 1.9241, "step": 18530 }, { "epoch": 3.49, "grad_norm": 11.90694808959961, "learning_rate": 1.3020892151326936e-05, "loss": 2.2924, "step": 18540 }, { "epoch": 3.49, "grad_norm": 11.800655364990234, "learning_rate": 1.3017127799736498e-05, "loss": 2.1392, "step": 18550 }, { "epoch": 3.49, "grad_norm": 6.180018424987793, "learning_rate": 1.3013363448146057e-05, "loss": 1.8976, "step": 18560 }, { "epoch": 3.5, "grad_norm": 5.06319522857666, "learning_rate": 1.3009599096555619e-05, "loss": 2.0315, "step": 18570 }, { "epoch": 3.5, "grad_norm": 16.519315719604492, "learning_rate": 1.300583474496518e-05, "loss": 2.2712, "step": 18580 }, { "epoch": 3.5, "grad_norm": 7.123774528503418, "learning_rate": 1.3002070393374742e-05, "loss": 2.0342, "step": 18590 }, { "epoch": 3.5, "grad_norm": 18.311893463134766, "learning_rate": 1.2998306041784303e-05, "loss": 1.9826, "step": 18600 }, { "epoch": 3.5, "grad_norm": 12.739026069641113, "learning_rate": 1.2994541690193865e-05, "loss": 1.9456, "step": 18610 }, { "epoch": 3.5, "grad_norm": 8.55923080444336, "learning_rate": 1.2990777338603426e-05, "loss": 1.8039, "step": 18620 }, { "epoch": 3.51, "grad_norm": 13.18472957611084, "learning_rate": 1.2987012987012988e-05, "loss": 2.1318, "step": 18630 }, { "epoch": 3.51, "grad_norm": 6.163987159729004, "learning_rate": 1.2983248635422549e-05, "loss": 2.0817, "step": 18640 }, { "epoch": 3.51, "grad_norm": 13.204118728637695, "learning_rate": 1.297948428383211e-05, "loss": 2.0898, "step": 18650 }, { "epoch": 3.51, "grad_norm": 7.00177526473999, "learning_rate": 1.2975719932241672e-05, "loss": 2.0013, "step": 18660 }, { "epoch": 3.51, "grad_norm": 24.042531967163086, "learning_rate": 1.2971955580651234e-05, "loss": 1.884, "step": 18670 }, { "epoch": 3.52, "grad_norm": 19.49479103088379, "learning_rate": 1.2968191229060797e-05, "loss": 2.2212, "step": 18680 }, { "epoch": 3.52, "grad_norm": 9.255216598510742, "learning_rate": 1.2964426877470358e-05, "loss": 2.0055, "step": 18690 }, { "epoch": 3.52, "grad_norm": 9.654414176940918, "learning_rate": 1.296066252587992e-05, "loss": 1.972, "step": 18700 }, { "epoch": 3.52, "grad_norm": 3.0793545246124268, "learning_rate": 1.2956898174289481e-05, "loss": 1.6832, "step": 18710 }, { "epoch": 3.52, "grad_norm": 9.09790325164795, "learning_rate": 1.2953133822699043e-05, "loss": 1.833, "step": 18720 }, { "epoch": 3.53, "grad_norm": 4.778370380401611, "learning_rate": 1.2949369471108602e-05, "loss": 1.8353, "step": 18730 }, { "epoch": 3.53, "grad_norm": 8.280155181884766, "learning_rate": 1.2945605119518164e-05, "loss": 1.9609, "step": 18740 }, { "epoch": 3.53, "grad_norm": 16.770931243896484, "learning_rate": 1.2941840767927725e-05, "loss": 1.989, "step": 18750 }, { "epoch": 3.53, "grad_norm": 13.008971214294434, "learning_rate": 1.2938076416337287e-05, "loss": 1.9608, "step": 18760 }, { "epoch": 3.53, "grad_norm": 10.46919059753418, "learning_rate": 1.2934312064746848e-05, "loss": 1.9793, "step": 18770 }, { "epoch": 3.53, "grad_norm": 8.881004333496094, "learning_rate": 1.293054771315641e-05, "loss": 1.8922, "step": 18780 }, { "epoch": 3.54, "grad_norm": 5.150662422180176, "learning_rate": 1.2926783361565971e-05, "loss": 2.0686, "step": 18790 }, { "epoch": 3.54, "grad_norm": 7.772060871124268, "learning_rate": 1.2923019009975533e-05, "loss": 1.963, "step": 18800 }, { "epoch": 3.54, "grad_norm": 8.1159086227417, "learning_rate": 1.2919254658385094e-05, "loss": 2.0333, 
"step": 18810 }, { "epoch": 3.54, "grad_norm": 12.533989906311035, "learning_rate": 1.2915490306794655e-05, "loss": 2.088, "step": 18820 }, { "epoch": 3.54, "grad_norm": 21.347049713134766, "learning_rate": 1.2911725955204217e-05, "loss": 2.1555, "step": 18830 }, { "epoch": 3.55, "grad_norm": 7.12397575378418, "learning_rate": 1.2907961603613778e-05, "loss": 2.0623, "step": 18840 }, { "epoch": 3.55, "grad_norm": 10.791213989257812, "learning_rate": 1.290419725202334e-05, "loss": 2.1219, "step": 18850 }, { "epoch": 3.55, "grad_norm": 8.120030403137207, "learning_rate": 1.2900432900432901e-05, "loss": 1.8217, "step": 18860 }, { "epoch": 3.55, "grad_norm": 5.759142875671387, "learning_rate": 1.2896668548842464e-05, "loss": 1.6623, "step": 18870 }, { "epoch": 3.55, "grad_norm": 10.546258926391602, "learning_rate": 1.2892904197252026e-05, "loss": 2.2074, "step": 18880 }, { "epoch": 3.56, "grad_norm": 5.4837965965271, "learning_rate": 1.2889139845661587e-05, "loss": 1.8465, "step": 18890 }, { "epoch": 3.56, "grad_norm": 6.776173114776611, "learning_rate": 1.2885375494071149e-05, "loss": 2.0048, "step": 18900 }, { "epoch": 3.56, "grad_norm": 11.400514602661133, "learning_rate": 1.2881611142480709e-05, "loss": 1.8364, "step": 18910 }, { "epoch": 3.56, "grad_norm": 6.094997406005859, "learning_rate": 1.287784679089027e-05, "loss": 1.9519, "step": 18920 }, { "epoch": 3.56, "grad_norm": 9.738938331604004, "learning_rate": 1.2874082439299832e-05, "loss": 2.2354, "step": 18930 }, { "epoch": 3.56, "grad_norm": 7.144976615905762, "learning_rate": 1.2870318087709393e-05, "loss": 2.1213, "step": 18940 }, { "epoch": 3.57, "grad_norm": 4.868966102600098, "learning_rate": 1.2866553736118954e-05, "loss": 1.7377, "step": 18950 }, { "epoch": 3.57, "grad_norm": 17.455867767333984, "learning_rate": 1.2862789384528516e-05, "loss": 2.0036, "step": 18960 }, { "epoch": 3.57, "grad_norm": 6.96973991394043, "learning_rate": 1.2859025032938077e-05, "loss": 2.075, "step": 18970 }, { "epoch": 3.57, "grad_norm": 9.145732879638672, "learning_rate": 1.2855260681347639e-05, "loss": 1.7093, "step": 18980 }, { "epoch": 3.57, "grad_norm": 6.906736850738525, "learning_rate": 1.28514963297572e-05, "loss": 2.2193, "step": 18990 }, { "epoch": 3.58, "grad_norm": 28.209924697875977, "learning_rate": 1.2847731978166762e-05, "loss": 1.723, "step": 19000 }, { "epoch": 3.58, "grad_norm": 9.18234634399414, "learning_rate": 1.2843967626576323e-05, "loss": 1.8148, "step": 19010 }, { "epoch": 3.58, "grad_norm": 7.6682586669921875, "learning_rate": 1.2840203274985885e-05, "loss": 1.8239, "step": 19020 }, { "epoch": 3.58, "grad_norm": 5.16909646987915, "learning_rate": 1.2836438923395446e-05, "loss": 1.7717, "step": 19030 }, { "epoch": 3.58, "grad_norm": 9.808969497680664, "learning_rate": 1.2832674571805008e-05, "loss": 1.7606, "step": 19040 }, { "epoch": 3.59, "grad_norm": 5.6245832443237305, "learning_rate": 1.282891022021457e-05, "loss": 2.1062, "step": 19050 }, { "epoch": 3.59, "grad_norm": 8.315461158752441, "learning_rate": 1.2825145868624132e-05, "loss": 1.9931, "step": 19060 }, { "epoch": 3.59, "grad_norm": 6.356577396392822, "learning_rate": 1.2821381517033694e-05, "loss": 1.6805, "step": 19070 }, { "epoch": 3.59, "grad_norm": 6.560335159301758, "learning_rate": 1.2817617165443252e-05, "loss": 1.8092, "step": 19080 }, { "epoch": 3.59, "grad_norm": 12.969362258911133, "learning_rate": 1.2813852813852813e-05, "loss": 2.1968, "step": 19090 }, { "epoch": 3.59, "grad_norm": 14.180224418640137, "learning_rate": 1.2810088462262376e-05, 
"loss": 2.1959, "step": 19100 }, { "epoch": 3.6, "grad_norm": 296.3688049316406, "learning_rate": 1.2806324110671938e-05, "loss": 2.0864, "step": 19110 }, { "epoch": 3.6, "grad_norm": 26.327489852905273, "learning_rate": 1.28025597590815e-05, "loss": 1.9344, "step": 19120 }, { "epoch": 3.6, "grad_norm": 3.5424439907073975, "learning_rate": 1.279879540749106e-05, "loss": 1.5707, "step": 19130 }, { "epoch": 3.6, "grad_norm": 11.900901794433594, "learning_rate": 1.2795031055900622e-05, "loss": 1.8648, "step": 19140 }, { "epoch": 3.6, "grad_norm": 9.746707916259766, "learning_rate": 1.2791266704310184e-05, "loss": 1.9923, "step": 19150 }, { "epoch": 3.61, "grad_norm": 6.923778533935547, "learning_rate": 1.2787502352719745e-05, "loss": 2.4349, "step": 19160 }, { "epoch": 3.61, "grad_norm": 11.637565612792969, "learning_rate": 1.2783738001129307e-05, "loss": 2.152, "step": 19170 }, { "epoch": 3.61, "grad_norm": 3.7857089042663574, "learning_rate": 1.2779973649538868e-05, "loss": 1.6385, "step": 19180 }, { "epoch": 3.61, "grad_norm": 16.268362045288086, "learning_rate": 1.277620929794843e-05, "loss": 1.8265, "step": 19190 }, { "epoch": 3.61, "grad_norm": 10.922266960144043, "learning_rate": 1.2772444946357991e-05, "loss": 2.0278, "step": 19200 }, { "epoch": 3.62, "grad_norm": 11.14235782623291, "learning_rate": 1.2768680594767552e-05, "loss": 1.9257, "step": 19210 }, { "epoch": 3.62, "grad_norm": 15.673467636108398, "learning_rate": 1.2764916243177114e-05, "loss": 1.782, "step": 19220 }, { "epoch": 3.62, "grad_norm": 4.575836658477783, "learning_rate": 1.2761151891586675e-05, "loss": 1.8066, "step": 19230 }, { "epoch": 3.62, "grad_norm": 4.547247886657715, "learning_rate": 1.2757387539996238e-05, "loss": 1.9082, "step": 19240 }, { "epoch": 3.62, "grad_norm": 13.23827075958252, "learning_rate": 1.2753623188405797e-05, "loss": 2.0453, "step": 19250 }, { "epoch": 3.63, "grad_norm": 23.971220016479492, "learning_rate": 1.2749858836815358e-05, "loss": 1.7168, "step": 19260 }, { "epoch": 3.63, "grad_norm": 4.372469902038574, "learning_rate": 1.274609448522492e-05, "loss": 1.9974, "step": 19270 }, { "epoch": 3.63, "grad_norm": 10.855060577392578, "learning_rate": 1.2742330133634483e-05, "loss": 1.9985, "step": 19280 }, { "epoch": 3.63, "grad_norm": 6.776280879974365, "learning_rate": 1.2738565782044044e-05, "loss": 1.7341, "step": 19290 }, { "epoch": 3.63, "grad_norm": 10.902573585510254, "learning_rate": 1.2734801430453606e-05, "loss": 1.6878, "step": 19300 }, { "epoch": 3.63, "grad_norm": 17.29878044128418, "learning_rate": 1.2731037078863167e-05, "loss": 2.0207, "step": 19310 }, { "epoch": 3.64, "grad_norm": 6.516051769256592, "learning_rate": 1.2727272727272728e-05, "loss": 1.8968, "step": 19320 }, { "epoch": 3.64, "grad_norm": 6.049952030181885, "learning_rate": 1.272350837568229e-05, "loss": 2.3054, "step": 19330 }, { "epoch": 3.64, "grad_norm": 7.836690425872803, "learning_rate": 1.2719744024091851e-05, "loss": 1.9668, "step": 19340 }, { "epoch": 3.64, "grad_norm": 11.173535346984863, "learning_rate": 1.2715979672501413e-05, "loss": 2.0691, "step": 19350 }, { "epoch": 3.64, "grad_norm": 8.31632137298584, "learning_rate": 1.2712215320910974e-05, "loss": 1.7111, "step": 19360 }, { "epoch": 3.65, "grad_norm": 7.979506492614746, "learning_rate": 1.2708450969320536e-05, "loss": 1.9995, "step": 19370 }, { "epoch": 3.65, "grad_norm": 5.2390313148498535, "learning_rate": 1.2704686617730097e-05, "loss": 1.9141, "step": 19380 }, { "epoch": 3.65, "grad_norm": 4.747333526611328, "learning_rate": 
1.2700922266139659e-05, "loss": 1.7319, "step": 19390 }, { "epoch": 3.65, "grad_norm": 8.991707801818848, "learning_rate": 1.269715791454922e-05, "loss": 1.9547, "step": 19400 }, { "epoch": 3.65, "grad_norm": 8.409485816955566, "learning_rate": 1.2693393562958782e-05, "loss": 1.9256, "step": 19410 }, { "epoch": 3.66, "grad_norm": 10.419615745544434, "learning_rate": 1.2689629211368343e-05, "loss": 2.0997, "step": 19420 }, { "epoch": 3.66, "grad_norm": 8.572612762451172, "learning_rate": 1.2685864859777903e-05, "loss": 1.7939, "step": 19430 }, { "epoch": 3.66, "grad_norm": 8.453900337219238, "learning_rate": 1.2682100508187464e-05, "loss": 2.0243, "step": 19440 }, { "epoch": 3.66, "grad_norm": 6.323369979858398, "learning_rate": 1.2678336156597026e-05, "loss": 1.7606, "step": 19450 }, { "epoch": 3.66, "grad_norm": 14.150957107543945, "learning_rate": 1.2674571805006587e-05, "loss": 1.8716, "step": 19460 }, { "epoch": 3.66, "grad_norm": 8.149356842041016, "learning_rate": 1.267080745341615e-05, "loss": 1.9497, "step": 19470 }, { "epoch": 3.67, "grad_norm": 14.092212677001953, "learning_rate": 1.2667043101825712e-05, "loss": 2.101, "step": 19480 }, { "epoch": 3.67, "grad_norm": 3.973227024078369, "learning_rate": 1.2663278750235273e-05, "loss": 1.6496, "step": 19490 }, { "epoch": 3.67, "grad_norm": 8.301676750183105, "learning_rate": 1.2659514398644835e-05, "loss": 2.0054, "step": 19500 }, { "epoch": 3.67, "grad_norm": 11.822549819946289, "learning_rate": 1.2655750047054396e-05, "loss": 1.8979, "step": 19510 }, { "epoch": 3.67, "grad_norm": 23.156982421875, "learning_rate": 1.2651985695463958e-05, "loss": 1.6513, "step": 19520 }, { "epoch": 3.68, "grad_norm": 15.396169662475586, "learning_rate": 1.2648221343873519e-05, "loss": 1.9873, "step": 19530 }, { "epoch": 3.68, "grad_norm": 8.215550422668457, "learning_rate": 1.264445699228308e-05, "loss": 1.9063, "step": 19540 }, { "epoch": 3.68, "grad_norm": 11.786815643310547, "learning_rate": 1.2640692640692642e-05, "loss": 2.0262, "step": 19550 }, { "epoch": 3.68, "grad_norm": 5.900434970855713, "learning_rate": 1.2636928289102204e-05, "loss": 1.7201, "step": 19560 }, { "epoch": 3.68, "grad_norm": 7.299560546875, "learning_rate": 1.2633163937511765e-05, "loss": 2.1173, "step": 19570 }, { "epoch": 3.69, "grad_norm": 9.341535568237305, "learning_rate": 1.2629399585921326e-05, "loss": 2.0, "step": 19580 }, { "epoch": 3.69, "grad_norm": 15.608997344970703, "learning_rate": 1.2625635234330888e-05, "loss": 1.9367, "step": 19590 }, { "epoch": 3.69, "grad_norm": 5.606038570404053, "learning_rate": 1.2621870882740448e-05, "loss": 1.8725, "step": 19600 }, { "epoch": 3.69, "grad_norm": 8.813735008239746, "learning_rate": 1.2618106531150009e-05, "loss": 2.0252, "step": 19610 }, { "epoch": 3.69, "grad_norm": 5.909628868103027, "learning_rate": 1.261434217955957e-05, "loss": 1.7257, "step": 19620 }, { "epoch": 3.69, "grad_norm": 21.48015785217285, "learning_rate": 1.2610577827969132e-05, "loss": 1.8378, "step": 19630 }, { "epoch": 3.7, "grad_norm": 18.355018615722656, "learning_rate": 1.2606813476378694e-05, "loss": 1.7872, "step": 19640 }, { "epoch": 3.7, "grad_norm": 10.858268737792969, "learning_rate": 1.2603049124788255e-05, "loss": 1.7914, "step": 19650 }, { "epoch": 3.7, "grad_norm": 13.561539649963379, "learning_rate": 1.2599284773197818e-05, "loss": 1.7903, "step": 19660 }, { "epoch": 3.7, "grad_norm": 8.76860237121582, "learning_rate": 1.259552042160738e-05, "loss": 2.0807, "step": 19670 }, { "epoch": 3.7, "grad_norm": 6.225655555725098, 
"learning_rate": 1.2591756070016941e-05, "loss": 1.7881, "step": 19680 }, { "epoch": 3.71, "grad_norm": 7.95245361328125, "learning_rate": 1.2587991718426503e-05, "loss": 1.5251, "step": 19690 }, { "epoch": 3.71, "grad_norm": 9.99848747253418, "learning_rate": 1.2584227366836064e-05, "loss": 1.8557, "step": 19700 }, { "epoch": 3.71, "grad_norm": 7.841402530670166, "learning_rate": 1.2580463015245625e-05, "loss": 2.0181, "step": 19710 }, { "epoch": 3.71, "grad_norm": 7.321385860443115, "learning_rate": 1.2576698663655187e-05, "loss": 1.7769, "step": 19720 }, { "epoch": 3.71, "grad_norm": 4.356576919555664, "learning_rate": 1.2572934312064748e-05, "loss": 1.9278, "step": 19730 }, { "epoch": 3.72, "grad_norm": 5.340190410614014, "learning_rate": 1.256916996047431e-05, "loss": 1.7542, "step": 19740 }, { "epoch": 3.72, "grad_norm": 8.743389129638672, "learning_rate": 1.2565405608883871e-05, "loss": 1.8108, "step": 19750 }, { "epoch": 3.72, "grad_norm": 5.315398216247559, "learning_rate": 1.2561641257293433e-05, "loss": 1.5683, "step": 19760 }, { "epoch": 3.72, "grad_norm": 7.530415058135986, "learning_rate": 1.2557876905702994e-05, "loss": 1.7046, "step": 19770 }, { "epoch": 3.72, "grad_norm": 5.031317710876465, "learning_rate": 1.2554112554112554e-05, "loss": 1.5701, "step": 19780 }, { "epoch": 3.72, "grad_norm": 11.126982688903809, "learning_rate": 1.2550348202522115e-05, "loss": 1.7465, "step": 19790 }, { "epoch": 3.73, "grad_norm": 6.674161911010742, "learning_rate": 1.2546583850931677e-05, "loss": 1.977, "step": 19800 }, { "epoch": 3.73, "grad_norm": 4.962024211883545, "learning_rate": 1.2542819499341238e-05, "loss": 1.7228, "step": 19810 }, { "epoch": 3.73, "grad_norm": 12.763315200805664, "learning_rate": 1.25390551477508e-05, "loss": 1.6865, "step": 19820 }, { "epoch": 3.73, "grad_norm": 7.546834468841553, "learning_rate": 1.2535290796160361e-05, "loss": 1.8932, "step": 19830 }, { "epoch": 3.73, "grad_norm": 24.9157657623291, "learning_rate": 1.2531526444569924e-05, "loss": 1.9807, "step": 19840 }, { "epoch": 3.74, "grad_norm": 7.698818206787109, "learning_rate": 1.2527762092979486e-05, "loss": 1.8081, "step": 19850 }, { "epoch": 3.74, "grad_norm": 10.226046562194824, "learning_rate": 1.2523997741389047e-05, "loss": 2.035, "step": 19860 }, { "epoch": 3.74, "grad_norm": 5.331039905548096, "learning_rate": 1.2520233389798609e-05, "loss": 1.8511, "step": 19870 }, { "epoch": 3.74, "grad_norm": 8.29790210723877, "learning_rate": 1.251646903820817e-05, "loss": 1.8644, "step": 19880 }, { "epoch": 3.74, "grad_norm": 8.588957786560059, "learning_rate": 1.2512704686617732e-05, "loss": 2.2266, "step": 19890 }, { "epoch": 3.75, "grad_norm": 9.230169296264648, "learning_rate": 1.2508940335027293e-05, "loss": 2.0573, "step": 19900 }, { "epoch": 3.75, "grad_norm": 4.643959045410156, "learning_rate": 1.2505175983436855e-05, "loss": 1.8614, "step": 19910 }, { "epoch": 3.75, "grad_norm": 4.462587833404541, "learning_rate": 1.2501411631846416e-05, "loss": 1.5318, "step": 19920 }, { "epoch": 3.75, "grad_norm": 11.233470916748047, "learning_rate": 1.2497647280255978e-05, "loss": 1.9023, "step": 19930 }, { "epoch": 3.75, "grad_norm": 7.3314595222473145, "learning_rate": 1.2493882928665539e-05, "loss": 1.7305, "step": 19940 }, { "epoch": 3.75, "grad_norm": 7.167730331420898, "learning_rate": 1.2490118577075099e-05, "loss": 1.3817, "step": 19950 }, { "epoch": 3.76, "grad_norm": 12.81592082977295, "learning_rate": 1.248635422548466e-05, "loss": 1.5118, "step": 19960 }, { "epoch": 3.76, "grad_norm": 
19.16656494140625, "learning_rate": 1.2482589873894222e-05, "loss": 1.877, "step": 19970 }, { "epoch": 3.76, "grad_norm": 6.895003318786621, "learning_rate": 1.2478825522303783e-05, "loss": 1.6514, "step": 19980 }, { "epoch": 3.76, "grad_norm": 6.305500507354736, "learning_rate": 1.2475061170713345e-05, "loss": 1.9477, "step": 19990 }, { "epoch": 3.76, "grad_norm": 9.86363697052002, "learning_rate": 1.2471296819122906e-05, "loss": 2.1602, "step": 20000 }, { "epoch": 3.77, "grad_norm": 6.933320999145508, "learning_rate": 1.2467532467532468e-05, "loss": 1.4926, "step": 20010 }, { "epoch": 3.77, "grad_norm": 7.698347568511963, "learning_rate": 1.2463768115942029e-05, "loss": 1.7934, "step": 20020 }, { "epoch": 3.77, "grad_norm": 14.325718879699707, "learning_rate": 1.2460003764351592e-05, "loss": 1.7177, "step": 20030 }, { "epoch": 3.77, "grad_norm": 5.607003211975098, "learning_rate": 1.2456239412761154e-05, "loss": 1.9676, "step": 20040 }, { "epoch": 3.77, "grad_norm": 12.610427856445312, "learning_rate": 1.2452475061170715e-05, "loss": 2.1668, "step": 20050 }, { "epoch": 3.78, "grad_norm": 9.83594799041748, "learning_rate": 1.2448710709580277e-05, "loss": 1.7706, "step": 20060 }, { "epoch": 3.78, "grad_norm": 10.55615520477295, "learning_rate": 1.2444946357989838e-05, "loss": 1.6667, "step": 20070 }, { "epoch": 3.78, "grad_norm": 14.41985034942627, "learning_rate": 1.24411820063994e-05, "loss": 1.808, "step": 20080 }, { "epoch": 3.78, "grad_norm": 5.358947277069092, "learning_rate": 1.2437417654808961e-05, "loss": 1.5994, "step": 20090 }, { "epoch": 3.78, "grad_norm": 24.91019058227539, "learning_rate": 1.2433653303218522e-05, "loss": 1.9046, "step": 20100 }, { "epoch": 3.79, "grad_norm": 12.984135627746582, "learning_rate": 1.2429888951628084e-05, "loss": 1.5971, "step": 20110 }, { "epoch": 3.79, "grad_norm": 11.962932586669922, "learning_rate": 1.2426124600037644e-05, "loss": 1.527, "step": 20120 }, { "epoch": 3.79, "grad_norm": 8.879301071166992, "learning_rate": 1.2422360248447205e-05, "loss": 1.6746, "step": 20130 }, { "epoch": 3.79, "grad_norm": 9.046487808227539, "learning_rate": 1.2418595896856767e-05, "loss": 1.6807, "step": 20140 }, { "epoch": 3.79, "grad_norm": 4.956076622009277, "learning_rate": 1.2414831545266328e-05, "loss": 1.9525, "step": 20150 }, { "epoch": 3.79, "grad_norm": 4.489163398742676, "learning_rate": 1.241106719367589e-05, "loss": 2.0301, "step": 20160 }, { "epoch": 3.8, "grad_norm": 9.663702011108398, "learning_rate": 1.2407302842085451e-05, "loss": 1.8285, "step": 20170 }, { "epoch": 3.8, "grad_norm": 5.1517863273620605, "learning_rate": 1.2403538490495012e-05, "loss": 1.9547, "step": 20180 }, { "epoch": 3.8, "grad_norm": 31.635330200195312, "learning_rate": 1.2399774138904574e-05, "loss": 2.1514, "step": 20190 }, { "epoch": 3.8, "grad_norm": 21.929651260375977, "learning_rate": 1.2396009787314135e-05, "loss": 1.6646, "step": 20200 }, { "epoch": 3.8, "grad_norm": 13.490052223205566, "learning_rate": 1.2392245435723698e-05, "loss": 2.1783, "step": 20210 }, { "epoch": 3.81, "grad_norm": 19.86699867248535, "learning_rate": 1.238848108413326e-05, "loss": 1.8168, "step": 20220 }, { "epoch": 3.81, "grad_norm": 6.852078437805176, "learning_rate": 1.2384716732542821e-05, "loss": 1.6172, "step": 20230 }, { "epoch": 3.81, "grad_norm": 21.677940368652344, "learning_rate": 1.2380952380952383e-05, "loss": 1.7033, "step": 20240 }, { "epoch": 3.81, "grad_norm": 3.1723427772521973, "learning_rate": 1.2377188029361944e-05, "loss": 1.9684, "step": 20250 }, { "epoch": 3.81, 
"grad_norm": 12.557634353637695, "learning_rate": 1.2373423677771506e-05, "loss": 1.9921, "step": 20260 }, { "epoch": 3.82, "grad_norm": 13.770596504211426, "learning_rate": 1.2369659326181067e-05, "loss": 1.6452, "step": 20270 }, { "epoch": 3.82, "grad_norm": 9.587441444396973, "learning_rate": 1.2365894974590629e-05, "loss": 2.1738, "step": 20280 }, { "epoch": 3.82, "grad_norm": 5.746249675750732, "learning_rate": 1.236213062300019e-05, "loss": 1.788, "step": 20290 }, { "epoch": 3.82, "grad_norm": 8.05097770690918, "learning_rate": 1.235836627140975e-05, "loss": 1.5581, "step": 20300 }, { "epoch": 3.82, "grad_norm": 7.153928756713867, "learning_rate": 1.2354601919819311e-05, "loss": 1.5682, "step": 20310 }, { "epoch": 3.82, "grad_norm": 13.329123497009277, "learning_rate": 1.2350837568228873e-05, "loss": 1.6781, "step": 20320 }, { "epoch": 3.83, "grad_norm": 12.08292007446289, "learning_rate": 1.2347073216638434e-05, "loss": 1.5742, "step": 20330 }, { "epoch": 3.83, "grad_norm": 6.530152797698975, "learning_rate": 1.2343308865047996e-05, "loss": 1.8222, "step": 20340 }, { "epoch": 3.83, "grad_norm": 5.176646709442139, "learning_rate": 1.2339544513457557e-05, "loss": 1.6133, "step": 20350 }, { "epoch": 3.83, "grad_norm": 9.452203750610352, "learning_rate": 1.2335780161867119e-05, "loss": 1.8646, "step": 20360 }, { "epoch": 3.83, "grad_norm": 18.210819244384766, "learning_rate": 1.233201581027668e-05, "loss": 1.4498, "step": 20370 }, { "epoch": 3.84, "grad_norm": 19.53200340270996, "learning_rate": 1.2328251458686242e-05, "loss": 1.5822, "step": 20380 }, { "epoch": 3.84, "grad_norm": 13.156296730041504, "learning_rate": 1.2324487107095803e-05, "loss": 1.6047, "step": 20390 }, { "epoch": 3.84, "grad_norm": 10.787355422973633, "learning_rate": 1.2320722755505366e-05, "loss": 1.6954, "step": 20400 }, { "epoch": 3.84, "grad_norm": 3.5052146911621094, "learning_rate": 1.2316958403914928e-05, "loss": 1.8135, "step": 20410 }, { "epoch": 3.84, "grad_norm": 9.211674690246582, "learning_rate": 1.2313194052324489e-05, "loss": 1.9276, "step": 20420 }, { "epoch": 3.85, "grad_norm": 8.048405647277832, "learning_rate": 1.230942970073405e-05, "loss": 1.876, "step": 20430 }, { "epoch": 3.85, "grad_norm": 10.061442375183105, "learning_rate": 1.2305665349143612e-05, "loss": 1.8561, "step": 20440 }, { "epoch": 3.85, "grad_norm": 9.974609375, "learning_rate": 1.2301900997553174e-05, "loss": 1.4144, "step": 20450 }, { "epoch": 3.85, "grad_norm": 35.64834213256836, "learning_rate": 1.2298136645962735e-05, "loss": 1.5163, "step": 20460 }, { "epoch": 3.85, "grad_norm": 6.793826103210449, "learning_rate": 1.2294372294372295e-05, "loss": 1.5621, "step": 20470 }, { "epoch": 3.85, "grad_norm": 3.043668508529663, "learning_rate": 1.2290607942781856e-05, "loss": 1.3533, "step": 20480 }, { "epoch": 3.86, "grad_norm": 19.3832950592041, "learning_rate": 1.2286843591191418e-05, "loss": 1.6316, "step": 20490 }, { "epoch": 3.86, "grad_norm": 5.867909908294678, "learning_rate": 1.2283079239600979e-05, "loss": 1.6189, "step": 20500 }, { "epoch": 3.86, "grad_norm": 20.958799362182617, "learning_rate": 1.227931488801054e-05, "loss": 1.9867, "step": 20510 }, { "epoch": 3.86, "grad_norm": 11.672524452209473, "learning_rate": 1.2275550536420102e-05, "loss": 2.2281, "step": 20520 }, { "epoch": 3.86, "grad_norm": 18.717376708984375, "learning_rate": 1.2271786184829664e-05, "loss": 1.6601, "step": 20530 }, { "epoch": 3.87, "grad_norm": 14.034415245056152, "learning_rate": 1.2268021833239225e-05, "loss": 1.7476, "step": 20540 }, { 
"epoch": 3.87, "grad_norm": 8.459630966186523, "learning_rate": 1.2264257481648786e-05, "loss": 1.4688, "step": 20550 }, { "epoch": 3.87, "grad_norm": 6.649875640869141, "learning_rate": 1.2260493130058348e-05, "loss": 1.4558, "step": 20560 }, { "epoch": 3.87, "grad_norm": 3.7194368839263916, "learning_rate": 1.225672877846791e-05, "loss": 1.6436, "step": 20570 }, { "epoch": 3.87, "grad_norm": 7.5452985763549805, "learning_rate": 1.2252964426877473e-05, "loss": 1.7242, "step": 20580 }, { "epoch": 3.88, "grad_norm": 19.26695442199707, "learning_rate": 1.2249200075287034e-05, "loss": 1.5373, "step": 20590 }, { "epoch": 3.88, "grad_norm": 7.555977821350098, "learning_rate": 1.2245435723696595e-05, "loss": 1.6922, "step": 20600 }, { "epoch": 3.88, "grad_norm": 7.906007766723633, "learning_rate": 1.2241671372106157e-05, "loss": 1.3989, "step": 20610 }, { "epoch": 3.88, "grad_norm": 7.017991542816162, "learning_rate": 1.2237907020515718e-05, "loss": 1.4261, "step": 20620 }, { "epoch": 3.88, "grad_norm": 6.061066627502441, "learning_rate": 1.223414266892528e-05, "loss": 1.6243, "step": 20630 }, { "epoch": 3.88, "grad_norm": 7.750338554382324, "learning_rate": 1.2230378317334841e-05, "loss": 1.6446, "step": 20640 }, { "epoch": 3.89, "grad_norm": 177.62649536132812, "learning_rate": 1.2226613965744401e-05, "loss": 1.9348, "step": 20650 }, { "epoch": 3.89, "grad_norm": 7.241508960723877, "learning_rate": 1.2222849614153963e-05, "loss": 1.8252, "step": 20660 }, { "epoch": 3.89, "grad_norm": 4.4552435874938965, "learning_rate": 1.2219085262563524e-05, "loss": 1.8172, "step": 20670 }, { "epoch": 3.89, "grad_norm": 19.10430908203125, "learning_rate": 1.2215320910973085e-05, "loss": 1.9146, "step": 20680 }, { "epoch": 3.89, "grad_norm": 18.19588279724121, "learning_rate": 1.2211556559382647e-05, "loss": 1.7513, "step": 20690 }, { "epoch": 3.9, "grad_norm": 15.914993286132812, "learning_rate": 1.2207792207792208e-05, "loss": 1.7209, "step": 20700 }, { "epoch": 3.9, "grad_norm": 9.63773250579834, "learning_rate": 1.220402785620177e-05, "loss": 1.5009, "step": 20710 }, { "epoch": 3.9, "grad_norm": 13.111783027648926, "learning_rate": 1.2200263504611331e-05, "loss": 1.7857, "step": 20720 }, { "epoch": 3.9, "grad_norm": 14.780935287475586, "learning_rate": 1.2196499153020893e-05, "loss": 1.5576, "step": 20730 }, { "epoch": 3.9, "grad_norm": 2.8121819496154785, "learning_rate": 1.2192734801430454e-05, "loss": 1.4629, "step": 20740 }, { "epoch": 3.91, "grad_norm": 11.858397483825684, "learning_rate": 1.2188970449840016e-05, "loss": 1.9453, "step": 20750 }, { "epoch": 3.91, "grad_norm": 8.156011581420898, "learning_rate": 1.2185206098249577e-05, "loss": 1.6928, "step": 20760 }, { "epoch": 3.91, "grad_norm": 26.640304565429688, "learning_rate": 1.218144174665914e-05, "loss": 1.7904, "step": 20770 }, { "epoch": 3.91, "grad_norm": 5.228221893310547, "learning_rate": 1.2177677395068702e-05, "loss": 1.3564, "step": 20780 }, { "epoch": 3.91, "grad_norm": 4.930618762969971, "learning_rate": 1.2173913043478263e-05, "loss": 1.5976, "step": 20790 }, { "epoch": 3.91, "grad_norm": 10.263113021850586, "learning_rate": 1.2170148691887825e-05, "loss": 1.727, "step": 20800 }, { "epoch": 3.92, "grad_norm": 8.315753936767578, "learning_rate": 1.2166384340297386e-05, "loss": 1.5199, "step": 20810 }, { "epoch": 3.92, "grad_norm": 4.06594705581665, "learning_rate": 1.2162619988706946e-05, "loss": 1.6104, "step": 20820 }, { "epoch": 3.92, "grad_norm": 16.011816024780273, "learning_rate": 1.2158855637116507e-05, "loss": 1.404, "step": 
20830 }, { "epoch": 3.92, "grad_norm": 12.612791061401367, "learning_rate": 1.2155091285526069e-05, "loss": 1.6556, "step": 20840 }, { "epoch": 3.92, "grad_norm": 4.609894752502441, "learning_rate": 1.215132693393563e-05, "loss": 1.4054, "step": 20850 }, { "epoch": 3.93, "grad_norm": 9.120388984680176, "learning_rate": 1.2147562582345192e-05, "loss": 1.7159, "step": 20860 }, { "epoch": 3.93, "grad_norm": 27.49857521057129, "learning_rate": 1.2143798230754753e-05, "loss": 1.9204, "step": 20870 }, { "epoch": 3.93, "grad_norm": 6.894259929656982, "learning_rate": 1.2140033879164315e-05, "loss": 1.8133, "step": 20880 }, { "epoch": 3.93, "grad_norm": 4.275078296661377, "learning_rate": 1.2136269527573876e-05, "loss": 1.5878, "step": 20890 }, { "epoch": 3.93, "grad_norm": 9.416297912597656, "learning_rate": 1.2132505175983438e-05, "loss": 1.8399, "step": 20900 }, { "epoch": 3.94, "grad_norm": 5.509664535522461, "learning_rate": 1.2128740824392999e-05, "loss": 1.8207, "step": 20910 }, { "epoch": 3.94, "grad_norm": 11.878488540649414, "learning_rate": 1.212497647280256e-05, "loss": 1.6334, "step": 20920 }, { "epoch": 3.94, "grad_norm": 15.206461906433105, "learning_rate": 1.2121212121212122e-05, "loss": 1.5632, "step": 20930 }, { "epoch": 3.94, "grad_norm": 17.411251068115234, "learning_rate": 1.2117447769621683e-05, "loss": 1.5116, "step": 20940 }, { "epoch": 3.94, "grad_norm": 10.719283103942871, "learning_rate": 1.2113683418031245e-05, "loss": 1.718, "step": 20950 }, { "epoch": 3.95, "grad_norm": 10.647701263427734, "learning_rate": 1.2109919066440808e-05, "loss": 1.6246, "step": 20960 }, { "epoch": 3.95, "grad_norm": 5.002242565155029, "learning_rate": 1.210615471485037e-05, "loss": 1.5317, "step": 20970 }, { "epoch": 3.95, "grad_norm": 4.119180679321289, "learning_rate": 1.2102390363259931e-05, "loss": 1.358, "step": 20980 }, { "epoch": 3.95, "grad_norm": 6.45958137512207, "learning_rate": 1.2098626011669492e-05, "loss": 1.512, "step": 20990 }, { "epoch": 3.95, "grad_norm": 8.774272918701172, "learning_rate": 1.2094861660079052e-05, "loss": 1.9346, "step": 21000 }, { "epoch": 3.95, "grad_norm": 17.34573745727539, "learning_rate": 1.2091097308488614e-05, "loss": 1.7243, "step": 21010 }, { "epoch": 3.96, "grad_norm": 12.577427864074707, "learning_rate": 1.2087332956898175e-05, "loss": 2.2543, "step": 21020 }, { "epoch": 3.96, "grad_norm": 9.050633430480957, "learning_rate": 1.2083568605307737e-05, "loss": 1.5313, "step": 21030 }, { "epoch": 3.96, "grad_norm": 5.1217498779296875, "learning_rate": 1.2079804253717298e-05, "loss": 1.807, "step": 21040 }, { "epoch": 3.96, "grad_norm": 7.456243515014648, "learning_rate": 1.207603990212686e-05, "loss": 1.7452, "step": 21050 }, { "epoch": 3.96, "grad_norm": 5.49287223815918, "learning_rate": 1.2072275550536421e-05, "loss": 1.5841, "step": 21060 }, { "epoch": 3.97, "grad_norm": 11.715792655944824, "learning_rate": 1.2068511198945982e-05, "loss": 2.1894, "step": 21070 }, { "epoch": 3.97, "grad_norm": 10.759514808654785, "learning_rate": 1.2064746847355544e-05, "loss": 1.6397, "step": 21080 }, { "epoch": 3.97, "grad_norm": 9.116241455078125, "learning_rate": 1.2060982495765105e-05, "loss": 1.5875, "step": 21090 }, { "epoch": 3.97, "grad_norm": 6.298516273498535, "learning_rate": 1.2057218144174667e-05, "loss": 1.6174, "step": 21100 }, { "epoch": 3.97, "grad_norm": 16.303861618041992, "learning_rate": 1.2053453792584228e-05, "loss": 1.6687, "step": 21110 }, { "epoch": 3.98, "grad_norm": 9.316900253295898, "learning_rate": 1.204968944099379e-05, "loss": 
1.5464, "step": 21120 }, { "epoch": 3.98, "grad_norm": 18.653125762939453, "learning_rate": 1.2045925089403351e-05, "loss": 1.8929, "step": 21130 }, { "epoch": 3.98, "grad_norm": 8.84122371673584, "learning_rate": 1.2042160737812914e-05, "loss": 1.3914, "step": 21140 }, { "epoch": 3.98, "grad_norm": 9.7447509765625, "learning_rate": 1.2038396386222476e-05, "loss": 1.6378, "step": 21150 }, { "epoch": 3.98, "grad_norm": 12.237557411193848, "learning_rate": 1.2034632034632037e-05, "loss": 1.4588, "step": 21160 }, { "epoch": 3.98, "grad_norm": 7.375302791595459, "learning_rate": 1.2030867683041595e-05, "loss": 1.822, "step": 21170 }, { "epoch": 3.99, "grad_norm": 7.69105339050293, "learning_rate": 1.2027103331451157e-05, "loss": 1.427, "step": 21180 }, { "epoch": 3.99, "grad_norm": 2.8148481845855713, "learning_rate": 1.202333897986072e-05, "loss": 1.2122, "step": 21190 }, { "epoch": 3.99, "grad_norm": 10.6585054397583, "learning_rate": 1.2019574628270281e-05, "loss": 1.1317, "step": 21200 }, { "epoch": 3.99, "grad_norm": 9.885071754455566, "learning_rate": 1.2015810276679843e-05, "loss": 1.8243, "step": 21210 }, { "epoch": 3.99, "grad_norm": 26.595077514648438, "learning_rate": 1.2012045925089404e-05, "loss": 1.4937, "step": 21220 }, { "epoch": 4.0, "grad_norm": 5.562558174133301, "learning_rate": 1.2008281573498966e-05, "loss": 1.6933, "step": 21230 }, { "epoch": 4.0, "grad_norm": 14.402726173400879, "learning_rate": 1.2004517221908527e-05, "loss": 1.5855, "step": 21240 }, { "epoch": 4.0, "grad_norm": 7.88834285736084, "learning_rate": 1.2000752870318089e-05, "loss": 1.5784, "step": 21250 }, { "epoch": 4.0, "eval_accuracy": 0.7728, "eval_loss": 1.655354380607605, "eval_runtime": 31.2895, "eval_samples_per_second": 239.697, "eval_steps_per_second": 29.978, "step": 21252 }, { "epoch": 4.0, "grad_norm": 13.500533103942871, "learning_rate": 1.199698851872765e-05, "loss": 1.3578, "step": 21260 }, { "epoch": 4.0, "grad_norm": 5.576572895050049, "learning_rate": 1.1993224167137212e-05, "loss": 1.5854, "step": 21270 }, { "epoch": 4.01, "grad_norm": 13.157581329345703, "learning_rate": 1.1989459815546773e-05, "loss": 1.6571, "step": 21280 }, { "epoch": 4.01, "grad_norm": 9.378302574157715, "learning_rate": 1.1985695463956335e-05, "loss": 1.7689, "step": 21290 }, { "epoch": 4.01, "grad_norm": 9.518418312072754, "learning_rate": 1.1981931112365896e-05, "loss": 1.3633, "step": 21300 }, { "epoch": 4.01, "grad_norm": 5.089882850646973, "learning_rate": 1.1978166760775457e-05, "loss": 1.3026, "step": 21310 }, { "epoch": 4.01, "grad_norm": 6.972430229187012, "learning_rate": 1.1974402409185019e-05, "loss": 1.4042, "step": 21320 }, { "epoch": 4.01, "grad_norm": 11.402575492858887, "learning_rate": 1.1970638057594582e-05, "loss": 1.3424, "step": 21330 }, { "epoch": 4.02, "grad_norm": 9.427912712097168, "learning_rate": 1.196687370600414e-05, "loss": 1.916, "step": 21340 }, { "epoch": 4.02, "grad_norm": 5.273439884185791, "learning_rate": 1.1963109354413702e-05, "loss": 1.5751, "step": 21350 }, { "epoch": 4.02, "grad_norm": 5.529270648956299, "learning_rate": 1.1959345002823263e-05, "loss": 1.4115, "step": 21360 }, { "epoch": 4.02, "grad_norm": 13.274507522583008, "learning_rate": 1.1955580651232826e-05, "loss": 1.395, "step": 21370 }, { "epoch": 4.02, "grad_norm": 9.113694190979004, "learning_rate": 1.1951816299642388e-05, "loss": 1.5151, "step": 21380 }, { "epoch": 4.03, "grad_norm": 3.7147464752197266, "learning_rate": 1.1948051948051949e-05, "loss": 1.5031, "step": 21390 }, { "epoch": 4.03, "grad_norm": 
12.78921127319336, "learning_rate": 1.194428759646151e-05, "loss": 1.7242, "step": 21400 }, { "epoch": 4.03, "grad_norm": 6.96553897857666, "learning_rate": 1.1940523244871072e-05, "loss": 1.4721, "step": 21410 }, { "epoch": 4.03, "grad_norm": 3.179124593734741, "learning_rate": 1.1936758893280634e-05, "loss": 1.4808, "step": 21420 }, { "epoch": 4.03, "grad_norm": 6.812026023864746, "learning_rate": 1.1932994541690195e-05, "loss": 1.5064, "step": 21430 }, { "epoch": 4.04, "grad_norm": 38.644432067871094, "learning_rate": 1.1929230190099756e-05, "loss": 1.518, "step": 21440 }, { "epoch": 4.04, "grad_norm": 12.068611145019531, "learning_rate": 1.1925465838509318e-05, "loss": 1.6051, "step": 21450 }, { "epoch": 4.04, "grad_norm": 37.011905670166016, "learning_rate": 1.192170148691888e-05, "loss": 1.291, "step": 21460 }, { "epoch": 4.04, "grad_norm": 14.111103057861328, "learning_rate": 1.191793713532844e-05, "loss": 1.4895, "step": 21470 }, { "epoch": 4.04, "grad_norm": 16.625423431396484, "learning_rate": 1.1914172783738002e-05, "loss": 1.3755, "step": 21480 }, { "epoch": 4.04, "grad_norm": 3.837176561355591, "learning_rate": 1.1910408432147564e-05, "loss": 1.5626, "step": 21490 }, { "epoch": 4.05, "grad_norm": 14.177433967590332, "learning_rate": 1.1906644080557125e-05, "loss": 1.3549, "step": 21500 }, { "epoch": 4.05, "grad_norm": 3.6623570919036865, "learning_rate": 1.1902879728966688e-05, "loss": 1.3145, "step": 21510 }, { "epoch": 4.05, "grad_norm": 8.533249855041504, "learning_rate": 1.1899115377376246e-05, "loss": 1.4884, "step": 21520 }, { "epoch": 4.05, "grad_norm": 6.940463542938232, "learning_rate": 1.1895351025785808e-05, "loss": 1.6619, "step": 21530 }, { "epoch": 4.05, "grad_norm": 4.584464073181152, "learning_rate": 1.189158667419537e-05, "loss": 1.2284, "step": 21540 }, { "epoch": 4.06, "grad_norm": 27.909671783447266, "learning_rate": 1.188782232260493e-05, "loss": 1.4996, "step": 21550 }, { "epoch": 4.06, "grad_norm": 9.329991340637207, "learning_rate": 1.1884057971014494e-05, "loss": 1.2834, "step": 21560 }, { "epoch": 4.06, "grad_norm": 6.2425456047058105, "learning_rate": 1.1880293619424055e-05, "loss": 1.4415, "step": 21570 }, { "epoch": 4.06, "grad_norm": 4.8549652099609375, "learning_rate": 1.1876529267833617e-05, "loss": 1.3848, "step": 21580 }, { "epoch": 4.06, "grad_norm": 11.966907501220703, "learning_rate": 1.1872764916243178e-05, "loss": 1.3374, "step": 21590 }, { "epoch": 4.07, "grad_norm": 26.499244689941406, "learning_rate": 1.186900056465274e-05, "loss": 1.3371, "step": 21600 }, { "epoch": 4.07, "grad_norm": 4.754690647125244, "learning_rate": 1.1865236213062301e-05, "loss": 1.1069, "step": 21610 }, { "epoch": 4.07, "grad_norm": 23.109878540039062, "learning_rate": 1.1861471861471863e-05, "loss": 1.4847, "step": 21620 }, { "epoch": 4.07, "grad_norm": 4.450960636138916, "learning_rate": 1.1857707509881424e-05, "loss": 1.4774, "step": 21630 }, { "epoch": 4.07, "grad_norm": 13.6156644821167, "learning_rate": 1.1853943158290986e-05, "loss": 1.6072, "step": 21640 }, { "epoch": 4.07, "grad_norm": 25.844648361206055, "learning_rate": 1.1850178806700547e-05, "loss": 1.2931, "step": 21650 }, { "epoch": 4.08, "grad_norm": 11.8696870803833, "learning_rate": 1.1846414455110109e-05, "loss": 1.4433, "step": 21660 }, { "epoch": 4.08, "grad_norm": 15.156259536743164, "learning_rate": 1.184265010351967e-05, "loss": 1.6806, "step": 21670 }, { "epoch": 4.08, "grad_norm": 8.796675682067871, "learning_rate": 1.1838885751929231e-05, "loss": 1.7617, "step": 21680 }, { "epoch": 
4.08, "grad_norm": 5.091752529144287, "learning_rate": 1.1835121400338791e-05, "loss": 1.6385, "step": 21690 }, { "epoch": 4.08, "grad_norm": 10.241667747497559, "learning_rate": 1.1831357048748353e-05, "loss": 1.232, "step": 21700 }, { "epoch": 4.09, "grad_norm": 3.2507410049438477, "learning_rate": 1.1827592697157914e-05, "loss": 1.4852, "step": 21710 }, { "epoch": 4.09, "grad_norm": 4.350616931915283, "learning_rate": 1.1823828345567476e-05, "loss": 1.3438, "step": 21720 }, { "epoch": 4.09, "grad_norm": 9.896570205688477, "learning_rate": 1.1820063993977037e-05, "loss": 1.4833, "step": 21730 }, { "epoch": 4.09, "grad_norm": 3.9963760375976562, "learning_rate": 1.18162996423866e-05, "loss": 1.5684, "step": 21740 }, { "epoch": 4.09, "grad_norm": 9.535852432250977, "learning_rate": 1.1812535290796162e-05, "loss": 1.5145, "step": 21750 }, { "epoch": 4.1, "grad_norm": 3.9326932430267334, "learning_rate": 1.1808770939205723e-05, "loss": 1.4651, "step": 21760 }, { "epoch": 4.1, "grad_norm": 3.1489434242248535, "learning_rate": 1.1805006587615285e-05, "loss": 1.6563, "step": 21770 }, { "epoch": 4.1, "grad_norm": 15.185462951660156, "learning_rate": 1.1801242236024846e-05, "loss": 1.4423, "step": 21780 }, { "epoch": 4.1, "grad_norm": 11.134682655334473, "learning_rate": 1.1797477884434408e-05, "loss": 1.2798, "step": 21790 }, { "epoch": 4.1, "grad_norm": 6.549045562744141, "learning_rate": 1.1793713532843969e-05, "loss": 1.6316, "step": 21800 }, { "epoch": 4.11, "grad_norm": 7.56115198135376, "learning_rate": 1.178994918125353e-05, "loss": 1.2007, "step": 21810 }, { "epoch": 4.11, "grad_norm": 6.839396953582764, "learning_rate": 1.1786184829663092e-05, "loss": 1.5856, "step": 21820 }, { "epoch": 4.11, "grad_norm": 4.112015247344971, "learning_rate": 1.1782420478072653e-05, "loss": 1.1376, "step": 21830 }, { "epoch": 4.11, "grad_norm": 5.951756477355957, "learning_rate": 1.1778656126482215e-05, "loss": 1.6442, "step": 21840 }, { "epoch": 4.11, "grad_norm": 11.753628730773926, "learning_rate": 1.1774891774891776e-05, "loss": 1.3215, "step": 21850 }, { "epoch": 4.11, "grad_norm": 9.691905975341797, "learning_rate": 1.1771127423301338e-05, "loss": 1.5696, "step": 21860 }, { "epoch": 4.12, "grad_norm": 4.926630020141602, "learning_rate": 1.1767363071710898e-05, "loss": 1.2839, "step": 21870 }, { "epoch": 4.12, "grad_norm": 8.45942497253418, "learning_rate": 1.1763598720120459e-05, "loss": 1.4239, "step": 21880 }, { "epoch": 4.12, "grad_norm": 10.799348831176758, "learning_rate": 1.175983436853002e-05, "loss": 1.3624, "step": 21890 }, { "epoch": 4.12, "grad_norm": 14.17685317993164, "learning_rate": 1.1756070016939582e-05, "loss": 1.6068, "step": 21900 }, { "epoch": 4.12, "grad_norm": 9.250771522521973, "learning_rate": 1.1752305665349143e-05, "loss": 1.5541, "step": 21910 }, { "epoch": 4.13, "grad_norm": 16.8367919921875, "learning_rate": 1.1748541313758705e-05, "loss": 1.331, "step": 21920 }, { "epoch": 4.13, "grad_norm": 35.04539489746094, "learning_rate": 1.1744776962168268e-05, "loss": 1.8105, "step": 21930 }, { "epoch": 4.13, "grad_norm": 6.2122626304626465, "learning_rate": 1.174101261057783e-05, "loss": 1.2929, "step": 21940 }, { "epoch": 4.13, "grad_norm": 4.204780101776123, "learning_rate": 1.1737248258987391e-05, "loss": 1.333, "step": 21950 }, { "epoch": 4.13, "grad_norm": 28.819149017333984, "learning_rate": 1.1733483907396952e-05, "loss": 1.6011, "step": 21960 }, { "epoch": 4.14, "grad_norm": 11.026989936828613, "learning_rate": 1.1729719555806514e-05, "loss": 1.3074, "step": 21970 }, { 
"epoch": 4.14, "grad_norm": 6.614445686340332, "learning_rate": 1.1725955204216075e-05, "loss": 1.628, "step": 21980 }, { "epoch": 4.14, "grad_norm": 9.508096694946289, "learning_rate": 1.1722190852625637e-05, "loss": 1.1223, "step": 21990 }, { "epoch": 4.14, "grad_norm": 11.38542652130127, "learning_rate": 1.1718426501035198e-05, "loss": 1.6808, "step": 22000 }, { "epoch": 4.14, "grad_norm": 7.099781036376953, "learning_rate": 1.171466214944476e-05, "loss": 1.8893, "step": 22010 }, { "epoch": 4.14, "grad_norm": 27.0643310546875, "learning_rate": 1.1710897797854321e-05, "loss": 1.541, "step": 22020 }, { "epoch": 4.15, "grad_norm": 3.0341477394104004, "learning_rate": 1.1707133446263883e-05, "loss": 1.0653, "step": 22030 }, { "epoch": 4.15, "grad_norm": 21.59052085876465, "learning_rate": 1.1703369094673442e-05, "loss": 1.3339, "step": 22040 }, { "epoch": 4.15, "grad_norm": 8.72144889831543, "learning_rate": 1.1699604743083004e-05, "loss": 1.0812, "step": 22050 }, { "epoch": 4.15, "grad_norm": 15.87111759185791, "learning_rate": 1.1695840391492565e-05, "loss": 1.2401, "step": 22060 }, { "epoch": 4.15, "grad_norm": 4.0406341552734375, "learning_rate": 1.1692076039902127e-05, "loss": 1.077, "step": 22070 }, { "epoch": 4.16, "grad_norm": 7.93537712097168, "learning_rate": 1.1688311688311688e-05, "loss": 1.4247, "step": 22080 }, { "epoch": 4.16, "grad_norm": 6.6882710456848145, "learning_rate": 1.168454733672125e-05, "loss": 1.2155, "step": 22090 }, { "epoch": 4.16, "grad_norm": 21.735994338989258, "learning_rate": 1.1680782985130811e-05, "loss": 1.7847, "step": 22100 }, { "epoch": 4.16, "grad_norm": 35.96470642089844, "learning_rate": 1.1677018633540373e-05, "loss": 1.2616, "step": 22110 }, { "epoch": 4.16, "grad_norm": 3.301687717437744, "learning_rate": 1.1673254281949936e-05, "loss": 1.2178, "step": 22120 }, { "epoch": 4.17, "grad_norm": 11.633638381958008, "learning_rate": 1.1669489930359497e-05, "loss": 1.6943, "step": 22130 }, { "epoch": 4.17, "grad_norm": 8.26904296875, "learning_rate": 1.1665725578769059e-05, "loss": 1.4489, "step": 22140 }, { "epoch": 4.17, "grad_norm": 20.59292221069336, "learning_rate": 1.166196122717862e-05, "loss": 1.2942, "step": 22150 }, { "epoch": 4.17, "grad_norm": 49.7509880065918, "learning_rate": 1.1658196875588182e-05, "loss": 1.5922, "step": 22160 }, { "epoch": 4.17, "grad_norm": 11.030691146850586, "learning_rate": 1.1654432523997743e-05, "loss": 1.409, "step": 22170 }, { "epoch": 4.17, "grad_norm": 7.64301061630249, "learning_rate": 1.1650668172407305e-05, "loss": 1.24, "step": 22180 }, { "epoch": 4.18, "grad_norm": 12.712337493896484, "learning_rate": 1.1646903820816866e-05, "loss": 1.4544, "step": 22190 }, { "epoch": 4.18, "grad_norm": 16.438764572143555, "learning_rate": 1.1643139469226427e-05, "loss": 1.5123, "step": 22200 }, { "epoch": 4.18, "grad_norm": 5.241950035095215, "learning_rate": 1.1639375117635989e-05, "loss": 1.2658, "step": 22210 }, { "epoch": 4.18, "grad_norm": 11.820525169372559, "learning_rate": 1.1635610766045549e-05, "loss": 1.3581, "step": 22220 }, { "epoch": 4.18, "grad_norm": 19.10674476623535, "learning_rate": 1.163184641445511e-05, "loss": 1.2019, "step": 22230 }, { "epoch": 4.19, "grad_norm": 9.48658275604248, "learning_rate": 1.1628082062864672e-05, "loss": 1.6962, "step": 22240 }, { "epoch": 4.19, "grad_norm": 27.171833038330078, "learning_rate": 1.1624317711274233e-05, "loss": 1.3881, "step": 22250 }, { "epoch": 4.19, "grad_norm": 7.700462341308594, "learning_rate": 1.1620553359683795e-05, "loss": 1.4526, "step": 22260 }, 
{ "epoch": 4.19, "grad_norm": 4.551231384277344, "learning_rate": 1.1616789008093356e-05, "loss": 1.2841, "step": 22270 }, { "epoch": 4.19, "grad_norm": 18.879850387573242, "learning_rate": 1.1613024656502917e-05, "loss": 1.3801, "step": 22280 }, { "epoch": 4.2, "grad_norm": 7.231531143188477, "learning_rate": 1.1609260304912479e-05, "loss": 1.2568, "step": 22290 }, { "epoch": 4.2, "grad_norm": 12.560738563537598, "learning_rate": 1.1605495953322042e-05, "loss": 1.4661, "step": 22300 }, { "epoch": 4.2, "grad_norm": 3.6045310497283936, "learning_rate": 1.1601731601731604e-05, "loss": 1.2758, "step": 22310 }, { "epoch": 4.2, "grad_norm": 5.100409984588623, "learning_rate": 1.1597967250141165e-05, "loss": 1.1652, "step": 22320 }, { "epoch": 4.2, "grad_norm": 12.520657539367676, "learning_rate": 1.1594202898550726e-05, "loss": 1.4419, "step": 22330 }, { "epoch": 4.2, "grad_norm": 8.522043228149414, "learning_rate": 1.1590438546960288e-05, "loss": 1.475, "step": 22340 }, { "epoch": 4.21, "grad_norm": 4.298609256744385, "learning_rate": 1.158667419536985e-05, "loss": 1.6008, "step": 22350 }, { "epoch": 4.21, "grad_norm": 10.020773887634277, "learning_rate": 1.158290984377941e-05, "loss": 1.3986, "step": 22360 }, { "epoch": 4.21, "grad_norm": 5.5365095138549805, "learning_rate": 1.1579145492188972e-05, "loss": 1.1475, "step": 22370 }, { "epoch": 4.21, "grad_norm": 26.848464965820312, "learning_rate": 1.1575381140598534e-05, "loss": 1.5816, "step": 22380 }, { "epoch": 4.21, "grad_norm": 16.19618797302246, "learning_rate": 1.1571616789008094e-05, "loss": 1.4038, "step": 22390 }, { "epoch": 4.22, "grad_norm": 26.881031036376953, "learning_rate": 1.1567852437417655e-05, "loss": 1.6944, "step": 22400 }, { "epoch": 4.22, "grad_norm": 3.12028431892395, "learning_rate": 1.1564088085827216e-05, "loss": 1.294, "step": 22410 }, { "epoch": 4.22, "grad_norm": 3.7232069969177246, "learning_rate": 1.1560323734236778e-05, "loss": 1.2836, "step": 22420 }, { "epoch": 4.22, "grad_norm": 12.73692798614502, "learning_rate": 1.155655938264634e-05, "loss": 1.3533, "step": 22430 }, { "epoch": 4.22, "grad_norm": 5.52551794052124, "learning_rate": 1.15527950310559e-05, "loss": 1.5526, "step": 22440 }, { "epoch": 4.23, "grad_norm": 11.75933837890625, "learning_rate": 1.1549030679465462e-05, "loss": 1.8025, "step": 22450 }, { "epoch": 4.23, "grad_norm": 24.30883026123047, "learning_rate": 1.1545266327875024e-05, "loss": 1.6105, "step": 22460 }, { "epoch": 4.23, "grad_norm": 6.007153511047363, "learning_rate": 1.1541501976284585e-05, "loss": 1.3387, "step": 22470 }, { "epoch": 4.23, "grad_norm": 9.185994148254395, "learning_rate": 1.1537737624694147e-05, "loss": 1.3984, "step": 22480 }, { "epoch": 4.23, "grad_norm": 3.4163880348205566, "learning_rate": 1.153397327310371e-05, "loss": 1.4932, "step": 22490 }, { "epoch": 4.23, "grad_norm": 13.919078826904297, "learning_rate": 1.1530208921513271e-05, "loss": 1.3808, "step": 22500 }, { "epoch": 4.24, "grad_norm": 9.715858459472656, "learning_rate": 1.1526444569922833e-05, "loss": 1.5708, "step": 22510 }, { "epoch": 4.24, "grad_norm": 5.465107440948486, "learning_rate": 1.1522680218332394e-05, "loss": 1.1772, "step": 22520 }, { "epoch": 4.24, "grad_norm": 6.381343364715576, "learning_rate": 1.1518915866741956e-05, "loss": 1.3303, "step": 22530 }, { "epoch": 4.24, "grad_norm": 19.810794830322266, "learning_rate": 1.1515151515151517e-05, "loss": 1.3471, "step": 22540 }, { "epoch": 4.24, "grad_norm": 7.413094997406006, "learning_rate": 1.1511387163561079e-05, "loss": 1.2949, "step": 
22550 }, { "epoch": 4.25, "grad_norm": 5.6388444900512695, "learning_rate": 1.1507622811970638e-05, "loss": 1.3835, "step": 22560 }, { "epoch": 4.25, "grad_norm": 25.473546981811523, "learning_rate": 1.15038584603802e-05, "loss": 1.2957, "step": 22570 }, { "epoch": 4.25, "grad_norm": 18.242752075195312, "learning_rate": 1.1500094108789761e-05, "loss": 1.1755, "step": 22580 }, { "epoch": 4.25, "grad_norm": 8.87878704071045, "learning_rate": 1.1496329757199323e-05, "loss": 1.7272, "step": 22590 }, { "epoch": 4.25, "grad_norm": 6.038020610809326, "learning_rate": 1.1492565405608884e-05, "loss": 1.3643, "step": 22600 }, { "epoch": 4.26, "grad_norm": 7.321763515472412, "learning_rate": 1.1488801054018446e-05, "loss": 1.1277, "step": 22610 }, { "epoch": 4.26, "grad_norm": 6.387785911560059, "learning_rate": 1.1485036702428007e-05, "loss": 1.4277, "step": 22620 }, { "epoch": 4.26, "grad_norm": 4.22401237487793, "learning_rate": 1.1481272350837569e-05, "loss": 1.0967, "step": 22630 }, { "epoch": 4.26, "grad_norm": 18.102201461791992, "learning_rate": 1.147750799924713e-05, "loss": 1.5889, "step": 22640 }, { "epoch": 4.26, "grad_norm": 4.212001800537109, "learning_rate": 1.1473743647656691e-05, "loss": 1.1097, "step": 22650 }, { "epoch": 4.27, "grad_norm": 3.618703603744507, "learning_rate": 1.1469979296066253e-05, "loss": 1.41, "step": 22660 }, { "epoch": 4.27, "grad_norm": 18.89423942565918, "learning_rate": 1.1466214944475816e-05, "loss": 1.2869, "step": 22670 }, { "epoch": 4.27, "grad_norm": 13.292141914367676, "learning_rate": 1.1462450592885378e-05, "loss": 1.4402, "step": 22680 }, { "epoch": 4.27, "grad_norm": 9.685138702392578, "learning_rate": 1.1458686241294939e-05, "loss": 1.2504, "step": 22690 }, { "epoch": 4.27, "grad_norm": 9.967771530151367, "learning_rate": 1.14549218897045e-05, "loss": 1.3169, "step": 22700 }, { "epoch": 4.27, "grad_norm": 5.263427257537842, "learning_rate": 1.1451157538114062e-05, "loss": 1.2336, "step": 22710 }, { "epoch": 4.28, "grad_norm": 5.904111862182617, "learning_rate": 1.1447393186523623e-05, "loss": 1.5214, "step": 22720 }, { "epoch": 4.28, "grad_norm": 3.490976095199585, "learning_rate": 1.1443628834933185e-05, "loss": 1.2407, "step": 22730 }, { "epoch": 4.28, "grad_norm": 7.685748100280762, "learning_rate": 1.1439864483342745e-05, "loss": 1.4259, "step": 22740 }, { "epoch": 4.28, "grad_norm": 37.501548767089844, "learning_rate": 1.1436100131752306e-05, "loss": 1.4619, "step": 22750 }, { "epoch": 4.28, "grad_norm": 11.093833923339844, "learning_rate": 1.1432335780161868e-05, "loss": 1.3905, "step": 22760 }, { "epoch": 4.29, "grad_norm": 16.976572036743164, "learning_rate": 1.1428571428571429e-05, "loss": 1.2383, "step": 22770 }, { "epoch": 4.29, "grad_norm": 4.594071388244629, "learning_rate": 1.142480707698099e-05, "loss": 1.4833, "step": 22780 }, { "epoch": 4.29, "grad_norm": 5.4996256828308105, "learning_rate": 1.1421042725390552e-05, "loss": 1.3097, "step": 22790 }, { "epoch": 4.29, "grad_norm": 4.83232307434082, "learning_rate": 1.1417278373800113e-05, "loss": 1.2455, "step": 22800 }, { "epoch": 4.29, "grad_norm": 6.72515344619751, "learning_rate": 1.1413514022209675e-05, "loss": 1.3426, "step": 22810 }, { "epoch": 4.3, "grad_norm": 8.427131652832031, "learning_rate": 1.1409749670619236e-05, "loss": 1.7302, "step": 22820 }, { "epoch": 4.3, "grad_norm": 7.042243003845215, "learning_rate": 1.1405985319028798e-05, "loss": 1.3222, "step": 22830 }, { "epoch": 4.3, "grad_norm": 11.9269437789917, "learning_rate": 1.140222096743836e-05, "loss": 1.4412, 
"step": 22840 }, { "epoch": 4.3, "grad_norm": 10.321571350097656, "learning_rate": 1.139845661584792e-05, "loss": 1.2085, "step": 22850 }, { "epoch": 4.3, "grad_norm": 6.500455379486084, "learning_rate": 1.1394692264257484e-05, "loss": 1.6461, "step": 22860 }, { "epoch": 4.3, "grad_norm": 5.898437976837158, "learning_rate": 1.1390927912667045e-05, "loss": 1.0046, "step": 22870 }, { "epoch": 4.31, "grad_norm": 8.894137382507324, "learning_rate": 1.1387163561076607e-05, "loss": 1.2981, "step": 22880 }, { "epoch": 4.31, "grad_norm": 4.706697940826416, "learning_rate": 1.1383399209486168e-05, "loss": 1.2524, "step": 22890 }, { "epoch": 4.31, "grad_norm": 4.421281814575195, "learning_rate": 1.137963485789573e-05, "loss": 1.0571, "step": 22900 }, { "epoch": 4.31, "grad_norm": 17.163183212280273, "learning_rate": 1.137587050630529e-05, "loss": 1.7138, "step": 22910 }, { "epoch": 4.31, "grad_norm": 6.0863518714904785, "learning_rate": 1.1372106154714851e-05, "loss": 1.2281, "step": 22920 }, { "epoch": 4.32, "grad_norm": 5.828840255737305, "learning_rate": 1.1368341803124412e-05, "loss": 1.41, "step": 22930 }, { "epoch": 4.32, "grad_norm": 13.186480522155762, "learning_rate": 1.1364577451533974e-05, "loss": 1.2603, "step": 22940 }, { "epoch": 4.32, "grad_norm": 5.384606838226318, "learning_rate": 1.1360813099943535e-05, "loss": 1.4722, "step": 22950 }, { "epoch": 4.32, "grad_norm": 9.919597625732422, "learning_rate": 1.1357048748353097e-05, "loss": 1.2787, "step": 22960 }, { "epoch": 4.32, "grad_norm": 18.809741973876953, "learning_rate": 1.1353284396762658e-05, "loss": 1.291, "step": 22970 }, { "epoch": 4.33, "grad_norm": 6.522648811340332, "learning_rate": 1.134952004517222e-05, "loss": 1.2689, "step": 22980 }, { "epoch": 4.33, "grad_norm": 8.54554557800293, "learning_rate": 1.1345755693581781e-05, "loss": 1.6911, "step": 22990 }, { "epoch": 4.33, "grad_norm": 3.135051965713501, "learning_rate": 1.1341991341991343e-05, "loss": 1.4583, "step": 23000 }, { "epoch": 4.33, "grad_norm": 4.633485317230225, "learning_rate": 1.1338226990400904e-05, "loss": 1.236, "step": 23010 }, { "epoch": 4.33, "grad_norm": 8.277584075927734, "learning_rate": 1.1334462638810466e-05, "loss": 1.2733, "step": 23020 }, { "epoch": 4.33, "grad_norm": 11.313704490661621, "learning_rate": 1.1330698287220027e-05, "loss": 1.5, "step": 23030 }, { "epoch": 4.34, "grad_norm": 10.552010536193848, "learning_rate": 1.132693393562959e-05, "loss": 1.4042, "step": 23040 }, { "epoch": 4.34, "grad_norm": 6.136496067047119, "learning_rate": 1.1323169584039152e-05, "loss": 1.2188, "step": 23050 }, { "epoch": 4.34, "grad_norm": 2.9745607376098633, "learning_rate": 1.1319405232448713e-05, "loss": 1.4024, "step": 23060 }, { "epoch": 4.34, "grad_norm": 10.06482219696045, "learning_rate": 1.1315640880858275e-05, "loss": 1.474, "step": 23070 }, { "epoch": 4.34, "grad_norm": 14.603937149047852, "learning_rate": 1.1311876529267836e-05, "loss": 1.4731, "step": 23080 }, { "epoch": 4.35, "grad_norm": 7.338199615478516, "learning_rate": 1.1308112177677396e-05, "loss": 1.2424, "step": 23090 }, { "epoch": 4.35, "grad_norm": 45.47178649902344, "learning_rate": 1.1304347826086957e-05, "loss": 1.335, "step": 23100 }, { "epoch": 4.35, "grad_norm": 9.846171379089355, "learning_rate": 1.1300583474496519e-05, "loss": 1.256, "step": 23110 }, { "epoch": 4.35, "grad_norm": 25.826841354370117, "learning_rate": 1.129681912290608e-05, "loss": 1.28, "step": 23120 }, { "epoch": 4.35, "grad_norm": 23.70148468017578, "learning_rate": 1.1293054771315642e-05, "loss": 1.47, 
"step": 23130 }, { "epoch": 4.36, "grad_norm": 13.747662544250488, "learning_rate": 1.1289290419725203e-05, "loss": 1.3473, "step": 23140 }, { "epoch": 4.36, "grad_norm": 14.677962303161621, "learning_rate": 1.1285526068134765e-05, "loss": 0.9575, "step": 23150 }, { "epoch": 4.36, "grad_norm": 20.239076614379883, "learning_rate": 1.1281761716544326e-05, "loss": 1.2453, "step": 23160 }, { "epoch": 4.36, "grad_norm": 10.252973556518555, "learning_rate": 1.1277997364953887e-05, "loss": 1.149, "step": 23170 }, { "epoch": 4.36, "grad_norm": 2.7143025398254395, "learning_rate": 1.1274233013363449e-05, "loss": 1.4149, "step": 23180 }, { "epoch": 4.36, "grad_norm": 7.062023162841797, "learning_rate": 1.127046866177301e-05, "loss": 1.3138, "step": 23190 }, { "epoch": 4.37, "grad_norm": 11.918550491333008, "learning_rate": 1.1266704310182572e-05, "loss": 1.5192, "step": 23200 }, { "epoch": 4.37, "grad_norm": 12.523595809936523, "learning_rate": 1.1262939958592133e-05, "loss": 1.1179, "step": 23210 }, { "epoch": 4.37, "grad_norm": 8.70029067993164, "learning_rate": 1.1259175607001695e-05, "loss": 1.0639, "step": 23220 }, { "epoch": 4.37, "grad_norm": 13.865007400512695, "learning_rate": 1.1255411255411258e-05, "loss": 1.3118, "step": 23230 }, { "epoch": 4.37, "grad_norm": 25.616748809814453, "learning_rate": 1.125164690382082e-05, "loss": 1.3557, "step": 23240 }, { "epoch": 4.38, "grad_norm": 5.623507499694824, "learning_rate": 1.124788255223038e-05, "loss": 1.5767, "step": 23250 }, { "epoch": 4.38, "grad_norm": 4.884276866912842, "learning_rate": 1.1244118200639939e-05, "loss": 1.3989, "step": 23260 }, { "epoch": 4.38, "grad_norm": 10.648892402648926, "learning_rate": 1.1240353849049502e-05, "loss": 1.2401, "step": 23270 }, { "epoch": 4.38, "grad_norm": 10.275334358215332, "learning_rate": 1.1236589497459064e-05, "loss": 1.3642, "step": 23280 }, { "epoch": 4.38, "grad_norm": 10.291203498840332, "learning_rate": 1.1232825145868625e-05, "loss": 1.4468, "step": 23290 }, { "epoch": 4.39, "grad_norm": 4.797783851623535, "learning_rate": 1.1229060794278186e-05, "loss": 1.2413, "step": 23300 }, { "epoch": 4.39, "grad_norm": 11.461807250976562, "learning_rate": 1.1225296442687748e-05, "loss": 1.2817, "step": 23310 }, { "epoch": 4.39, "grad_norm": 13.239301681518555, "learning_rate": 1.122153209109731e-05, "loss": 1.5378, "step": 23320 }, { "epoch": 4.39, "grad_norm": 19.613981246948242, "learning_rate": 1.121776773950687e-05, "loss": 1.0354, "step": 23330 }, { "epoch": 4.39, "grad_norm": 22.205846786499023, "learning_rate": 1.1214003387916432e-05, "loss": 1.3645, "step": 23340 }, { "epoch": 4.39, "grad_norm": 8.093974113464355, "learning_rate": 1.1210239036325994e-05, "loss": 1.2558, "step": 23350 }, { "epoch": 4.4, "grad_norm": 20.63254737854004, "learning_rate": 1.1206474684735555e-05, "loss": 1.2494, "step": 23360 }, { "epoch": 4.4, "grad_norm": 16.344945907592773, "learning_rate": 1.1202710333145117e-05, "loss": 1.2554, "step": 23370 }, { "epoch": 4.4, "grad_norm": 2.9398090839385986, "learning_rate": 1.1198945981554678e-05, "loss": 1.3689, "step": 23380 }, { "epoch": 4.4, "grad_norm": 12.727755546569824, "learning_rate": 1.119518162996424e-05, "loss": 1.4485, "step": 23390 }, { "epoch": 4.4, "grad_norm": 18.862890243530273, "learning_rate": 1.1191417278373801e-05, "loss": 1.1004, "step": 23400 }, { "epoch": 4.41, "grad_norm": 27.26294708251953, "learning_rate": 1.1187652926783363e-05, "loss": 1.1208, "step": 23410 }, { "epoch": 4.41, "grad_norm": 7.961979389190674, "learning_rate": 
1.1183888575192926e-05, "loss": 1.3701, "step": 23420 }, { "epoch": 4.41, "grad_norm": 9.444908142089844, "learning_rate": 1.1180124223602484e-05, "loss": 1.3856, "step": 23430 }, { "epoch": 4.41, "grad_norm": 7.100154876708984, "learning_rate": 1.1176359872012045e-05, "loss": 1.2454, "step": 23440 }, { "epoch": 4.41, "grad_norm": 21.100200653076172, "learning_rate": 1.1172595520421607e-05, "loss": 1.2524, "step": 23450 }, { "epoch": 4.42, "grad_norm": 5.258886814117432, "learning_rate": 1.116883116883117e-05, "loss": 1.5572, "step": 23460 }, { "epoch": 4.42, "grad_norm": 27.240352630615234, "learning_rate": 1.1165066817240731e-05, "loss": 1.6241, "step": 23470 }, { "epoch": 4.42, "grad_norm": 16.0458927154541, "learning_rate": 1.1161302465650293e-05, "loss": 1.3404, "step": 23480 }, { "epoch": 4.42, "grad_norm": 18.282493591308594, "learning_rate": 1.1157538114059854e-05, "loss": 1.1821, "step": 23490 }, { "epoch": 4.42, "grad_norm": 13.274044036865234, "learning_rate": 1.1153773762469416e-05, "loss": 1.4195, "step": 23500 }, { "epoch": 4.42, "grad_norm": 14.76251220703125, "learning_rate": 1.1150009410878977e-05, "loss": 1.4065, "step": 23510 }, { "epoch": 4.43, "grad_norm": 8.709878921508789, "learning_rate": 1.1146245059288539e-05, "loss": 1.6023, "step": 23520 }, { "epoch": 4.43, "grad_norm": 5.92330265045166, "learning_rate": 1.11424807076981e-05, "loss": 1.3209, "step": 23530 }, { "epoch": 4.43, "grad_norm": 17.7413387298584, "learning_rate": 1.1138716356107661e-05, "loss": 1.1393, "step": 23540 }, { "epoch": 4.43, "grad_norm": 13.982818603515625, "learning_rate": 1.1134952004517223e-05, "loss": 1.0704, "step": 23550 }, { "epoch": 4.43, "grad_norm": 5.319547176361084, "learning_rate": 1.1131187652926784e-05, "loss": 1.2218, "step": 23560 }, { "epoch": 4.44, "grad_norm": 9.939594268798828, "learning_rate": 1.1127423301336346e-05, "loss": 1.2286, "step": 23570 }, { "epoch": 4.44, "grad_norm": 14.662912368774414, "learning_rate": 1.1123658949745907e-05, "loss": 1.1226, "step": 23580 }, { "epoch": 4.44, "grad_norm": 7.673614501953125, "learning_rate": 1.1119894598155469e-05, "loss": 1.3084, "step": 23590 }, { "epoch": 4.44, "grad_norm": 16.663188934326172, "learning_rate": 1.1116130246565032e-05, "loss": 1.124, "step": 23600 }, { "epoch": 4.44, "grad_norm": 11.63786506652832, "learning_rate": 1.111236589497459e-05, "loss": 1.5997, "step": 23610 }, { "epoch": 4.45, "grad_norm": 34.37097930908203, "learning_rate": 1.1108601543384151e-05, "loss": 1.1965, "step": 23620 }, { "epoch": 4.45, "grad_norm": 14.486160278320312, "learning_rate": 1.1104837191793713e-05, "loss": 1.0156, "step": 23630 }, { "epoch": 4.45, "grad_norm": 13.651799201965332, "learning_rate": 1.1101072840203274e-05, "loss": 1.2019, "step": 23640 }, { "epoch": 4.45, "grad_norm": 10.198738098144531, "learning_rate": 1.1097308488612838e-05, "loss": 1.313, "step": 23650 }, { "epoch": 4.45, "grad_norm": 9.802396774291992, "learning_rate": 1.1093544137022399e-05, "loss": 1.3188, "step": 23660 }, { "epoch": 4.46, "grad_norm": 7.22983455657959, "learning_rate": 1.108977978543196e-05, "loss": 1.0307, "step": 23670 }, { "epoch": 4.46, "grad_norm": 6.274710655212402, "learning_rate": 1.1086015433841522e-05, "loss": 1.148, "step": 23680 }, { "epoch": 4.46, "grad_norm": 7.444991111755371, "learning_rate": 1.1082251082251083e-05, "loss": 1.2741, "step": 23690 }, { "epoch": 4.46, "grad_norm": 13.223048210144043, "learning_rate": 1.1078486730660645e-05, "loss": 1.2085, "step": 23700 }, { "epoch": 4.46, "grad_norm": 11.560144424438477, 
"learning_rate": 1.1074722379070206e-05, "loss": 1.3778, "step": 23710 }, { "epoch": 4.46, "grad_norm": 2.2751550674438477, "learning_rate": 1.1070958027479768e-05, "loss": 1.3936, "step": 23720 }, { "epoch": 4.47, "grad_norm": 3.871035099029541, "learning_rate": 1.106719367588933e-05, "loss": 1.295, "step": 23730 }, { "epoch": 4.47, "grad_norm": 9.869433403015137, "learning_rate": 1.106342932429889e-05, "loss": 1.4146, "step": 23740 }, { "epoch": 4.47, "grad_norm": 25.56659698486328, "learning_rate": 1.1059664972708452e-05, "loss": 1.2192, "step": 23750 }, { "epoch": 4.47, "grad_norm": 12.907247543334961, "learning_rate": 1.1055900621118014e-05, "loss": 1.4451, "step": 23760 }, { "epoch": 4.47, "grad_norm": 21.056577682495117, "learning_rate": 1.1052136269527575e-05, "loss": 1.3403, "step": 23770 }, { "epoch": 4.48, "grad_norm": 8.884479522705078, "learning_rate": 1.1048371917937135e-05, "loss": 1.0312, "step": 23780 }, { "epoch": 4.48, "grad_norm": 6.852158069610596, "learning_rate": 1.1044607566346696e-05, "loss": 1.1433, "step": 23790 }, { "epoch": 4.48, "grad_norm": 9.584796905517578, "learning_rate": 1.1040843214756258e-05, "loss": 1.5018, "step": 23800 }, { "epoch": 4.48, "grad_norm": 6.116985321044922, "learning_rate": 1.103707886316582e-05, "loss": 1.1242, "step": 23810 }, { "epoch": 4.48, "grad_norm": 17.78133773803711, "learning_rate": 1.103331451157538e-05, "loss": 1.0608, "step": 23820 }, { "epoch": 4.49, "grad_norm": 6.157369613647461, "learning_rate": 1.1029550159984944e-05, "loss": 1.3893, "step": 23830 }, { "epoch": 4.49, "grad_norm": 8.917243957519531, "learning_rate": 1.1025785808394505e-05, "loss": 1.2112, "step": 23840 }, { "epoch": 4.49, "grad_norm": 20.34598731994629, "learning_rate": 1.1022021456804067e-05, "loss": 1.2109, "step": 23850 }, { "epoch": 4.49, "grad_norm": 10.729361534118652, "learning_rate": 1.1018257105213628e-05, "loss": 1.4396, "step": 23860 }, { "epoch": 4.49, "grad_norm": 12.037083625793457, "learning_rate": 1.101449275362319e-05, "loss": 1.0855, "step": 23870 }, { "epoch": 4.49, "grad_norm": 5.508408546447754, "learning_rate": 1.1010728402032751e-05, "loss": 1.4513, "step": 23880 }, { "epoch": 4.5, "grad_norm": 5.416233062744141, "learning_rate": 1.1006964050442313e-05, "loss": 1.051, "step": 23890 }, { "epoch": 4.5, "grad_norm": 2.6513330936431885, "learning_rate": 1.1003199698851874e-05, "loss": 1.0868, "step": 23900 }, { "epoch": 4.5, "grad_norm": 7.040049076080322, "learning_rate": 1.0999435347261436e-05, "loss": 1.1051, "step": 23910 }, { "epoch": 4.5, "grad_norm": 6.521361827850342, "learning_rate": 1.0995670995670997e-05, "loss": 1.1796, "step": 23920 }, { "epoch": 4.5, "grad_norm": 9.672309875488281, "learning_rate": 1.0991906644080558e-05, "loss": 1.2215, "step": 23930 }, { "epoch": 4.51, "grad_norm": 20.263023376464844, "learning_rate": 1.098814229249012e-05, "loss": 1.4568, "step": 23940 }, { "epoch": 4.51, "grad_norm": 9.718255043029785, "learning_rate": 1.0984377940899681e-05, "loss": 1.186, "step": 23950 }, { "epoch": 4.51, "grad_norm": 13.852045059204102, "learning_rate": 1.0980613589309241e-05, "loss": 1.0503, "step": 23960 }, { "epoch": 4.51, "grad_norm": 8.95326042175293, "learning_rate": 1.0976849237718803e-05, "loss": 1.4088, "step": 23970 }, { "epoch": 4.51, "grad_norm": 11.894559860229492, "learning_rate": 1.0973084886128364e-05, "loss": 1.2216, "step": 23980 }, { "epoch": 4.52, "grad_norm": 5.613511085510254, "learning_rate": 1.0969320534537926e-05, "loss": 1.3457, "step": 23990 }, { "epoch": 4.52, "grad_norm": 
13.062862396240234, "learning_rate": 1.0965556182947487e-05, "loss": 1.3163, "step": 24000 }, { "epoch": 4.52, "grad_norm": 13.388591766357422, "learning_rate": 1.0961791831357048e-05, "loss": 1.2049, "step": 24010 }, { "epoch": 4.52, "grad_norm": 8.315409660339355, "learning_rate": 1.0958027479766612e-05, "loss": 1.2314, "step": 24020 }, { "epoch": 4.52, "grad_norm": 15.521514892578125, "learning_rate": 1.0954263128176173e-05, "loss": 1.3388, "step": 24030 }, { "epoch": 4.52, "grad_norm": 4.734891891479492, "learning_rate": 1.0950498776585735e-05, "loss": 0.8585, "step": 24040 }, { "epoch": 4.53, "grad_norm": 14.920530319213867, "learning_rate": 1.0946734424995296e-05, "loss": 1.0535, "step": 24050 }, { "epoch": 4.53, "grad_norm": 23.997804641723633, "learning_rate": 1.0942970073404857e-05, "loss": 1.2312, "step": 24060 }, { "epoch": 4.53, "grad_norm": 12.957062721252441, "learning_rate": 1.0939205721814419e-05, "loss": 1.4538, "step": 24070 }, { "epoch": 4.53, "grad_norm": 6.7665486335754395, "learning_rate": 1.093544137022398e-05, "loss": 1.2526, "step": 24080 }, { "epoch": 4.53, "grad_norm": 5.023631572723389, "learning_rate": 1.0931677018633542e-05, "loss": 1.2847, "step": 24090 }, { "epoch": 4.54, "grad_norm": 9.190400123596191, "learning_rate": 1.0927912667043103e-05, "loss": 1.3253, "step": 24100 }, { "epoch": 4.54, "grad_norm": 14.648627281188965, "learning_rate": 1.0924148315452665e-05, "loss": 1.1851, "step": 24110 }, { "epoch": 4.54, "grad_norm": 5.2912139892578125, "learning_rate": 1.0920383963862226e-05, "loss": 1.2338, "step": 24120 }, { "epoch": 4.54, "grad_norm": 5.818458557128906, "learning_rate": 1.0916619612271786e-05, "loss": 1.0387, "step": 24130 }, { "epoch": 4.54, "grad_norm": 5.1177592277526855, "learning_rate": 1.0912855260681347e-05, "loss": 1.1707, "step": 24140 }, { "epoch": 4.55, "grad_norm": 27.66385269165039, "learning_rate": 1.0909090909090909e-05, "loss": 1.0448, "step": 24150 }, { "epoch": 4.55, "grad_norm": 4.1212286949157715, "learning_rate": 1.090532655750047e-05, "loss": 1.1793, "step": 24160 }, { "epoch": 4.55, "grad_norm": 5.9400506019592285, "learning_rate": 1.0901562205910032e-05, "loss": 1.3043, "step": 24170 }, { "epoch": 4.55, "grad_norm": 17.84995460510254, "learning_rate": 1.0897797854319593e-05, "loss": 1.4627, "step": 24180 }, { "epoch": 4.55, "grad_norm": 9.543108940124512, "learning_rate": 1.0894033502729155e-05, "loss": 1.3457, "step": 24190 }, { "epoch": 4.55, "grad_norm": 7.9750590324401855, "learning_rate": 1.0890269151138718e-05, "loss": 1.3724, "step": 24200 }, { "epoch": 4.56, "grad_norm": 24.82449722290039, "learning_rate": 1.088650479954828e-05, "loss": 1.3627, "step": 24210 }, { "epoch": 4.56, "grad_norm": 5.002338409423828, "learning_rate": 1.088274044795784e-05, "loss": 1.0731, "step": 24220 }, { "epoch": 4.56, "grad_norm": 8.53024673461914, "learning_rate": 1.0878976096367402e-05, "loss": 1.6133, "step": 24230 }, { "epoch": 4.56, "grad_norm": 21.515546798706055, "learning_rate": 1.0875211744776964e-05, "loss": 1.333, "step": 24240 }, { "epoch": 4.56, "grad_norm": 7.449126720428467, "learning_rate": 1.0871447393186525e-05, "loss": 1.3455, "step": 24250 }, { "epoch": 4.57, "grad_norm": 5.602290630340576, "learning_rate": 1.0867683041596087e-05, "loss": 1.0319, "step": 24260 }, { "epoch": 4.57, "grad_norm": 15.280950546264648, "learning_rate": 1.0863918690005648e-05, "loss": 1.1313, "step": 24270 }, { "epoch": 4.57, "grad_norm": 24.31390953063965, "learning_rate": 1.086015433841521e-05, "loss": 1.0942, "step": 24280 }, { 
"epoch": 4.57, "grad_norm": 9.724617004394531, "learning_rate": 1.0856389986824771e-05, "loss": 1.0912, "step": 24290 }, { "epoch": 4.57, "grad_norm": 15.971935272216797, "learning_rate": 1.0852625635234333e-05, "loss": 1.3329, "step": 24300 }, { "epoch": 4.58, "grad_norm": 12.32314682006836, "learning_rate": 1.0848861283643892e-05, "loss": 1.081, "step": 24310 }, { "epoch": 4.58, "grad_norm": 4.493342876434326, "learning_rate": 1.0845096932053454e-05, "loss": 1.0494, "step": 24320 }, { "epoch": 4.58, "grad_norm": 11.808741569519043, "learning_rate": 1.0841332580463015e-05, "loss": 0.8922, "step": 24330 }, { "epoch": 4.58, "grad_norm": 35.05922317504883, "learning_rate": 1.0837568228872577e-05, "loss": 1.0724, "step": 24340 }, { "epoch": 4.58, "grad_norm": 15.500190734863281, "learning_rate": 1.0833803877282138e-05, "loss": 1.3189, "step": 24350 }, { "epoch": 4.58, "grad_norm": 12.789752006530762, "learning_rate": 1.08300395256917e-05, "loss": 1.3746, "step": 24360 }, { "epoch": 4.59, "grad_norm": 3.314342498779297, "learning_rate": 1.0826275174101261e-05, "loss": 1.0576, "step": 24370 }, { "epoch": 4.59, "grad_norm": 6.866835594177246, "learning_rate": 1.0822510822510823e-05, "loss": 1.1703, "step": 24380 }, { "epoch": 4.59, "grad_norm": 9.35311222076416, "learning_rate": 1.0818746470920386e-05, "loss": 1.1326, "step": 24390 }, { "epoch": 4.59, "grad_norm": 8.053922653198242, "learning_rate": 1.0814982119329947e-05, "loss": 1.3983, "step": 24400 }, { "epoch": 4.59, "grad_norm": 13.07441520690918, "learning_rate": 1.0811217767739509e-05, "loss": 0.9565, "step": 24410 }, { "epoch": 4.6, "grad_norm": 10.292523384094238, "learning_rate": 1.080745341614907e-05, "loss": 1.2338, "step": 24420 }, { "epoch": 4.6, "grad_norm": 9.829605102539062, "learning_rate": 1.0803689064558631e-05, "loss": 1.1613, "step": 24430 }, { "epoch": 4.6, "grad_norm": 9.334866523742676, "learning_rate": 1.0799924712968193e-05, "loss": 1.0959, "step": 24440 }, { "epoch": 4.6, "grad_norm": 6.089194297790527, "learning_rate": 1.0796160361377754e-05, "loss": 1.3693, "step": 24450 }, { "epoch": 4.6, "grad_norm": 18.241588592529297, "learning_rate": 1.0792396009787316e-05, "loss": 1.1812, "step": 24460 }, { "epoch": 4.61, "grad_norm": 17.4996337890625, "learning_rate": 1.0788631658196877e-05, "loss": 1.4491, "step": 24470 }, { "epoch": 4.61, "grad_norm": 15.6777982711792, "learning_rate": 1.0784867306606437e-05, "loss": 1.347, "step": 24480 }, { "epoch": 4.61, "grad_norm": 8.229899406433105, "learning_rate": 1.0781102955015999e-05, "loss": 0.9794, "step": 24490 }, { "epoch": 4.61, "grad_norm": 17.689180374145508, "learning_rate": 1.077733860342556e-05, "loss": 1.3727, "step": 24500 }, { "epoch": 4.61, "grad_norm": 13.020503997802734, "learning_rate": 1.0773574251835121e-05, "loss": 1.7272, "step": 24510 }, { "epoch": 4.62, "grad_norm": 2.5165858268737793, "learning_rate": 1.0769809900244683e-05, "loss": 1.0694, "step": 24520 }, { "epoch": 4.62, "grad_norm": 8.94255256652832, "learning_rate": 1.0766045548654244e-05, "loss": 1.2834, "step": 24530 }, { "epoch": 4.62, "grad_norm": 10.90611457824707, "learning_rate": 1.0762281197063806e-05, "loss": 1.0317, "step": 24540 }, { "epoch": 4.62, "grad_norm": 18.603296279907227, "learning_rate": 1.0758516845473367e-05, "loss": 1.2318, "step": 24550 }, { "epoch": 4.62, "grad_norm": 11.504902839660645, "learning_rate": 1.0754752493882929e-05, "loss": 1.3683, "step": 24560 }, { "epoch": 4.62, "grad_norm": 2.577777147293091, "learning_rate": 1.0750988142292492e-05, "loss": 1.1869, "step": 
24570 }, { "epoch": 4.63, "grad_norm": 8.789922714233398, "learning_rate": 1.0747223790702053e-05, "loss": 1.0967, "step": 24580 }, { "epoch": 4.63, "grad_norm": 4.969944000244141, "learning_rate": 1.0743459439111615e-05, "loss": 1.4564, "step": 24590 }, { "epoch": 4.63, "grad_norm": 5.494117736816406, "learning_rate": 1.0739695087521176e-05, "loss": 1.1966, "step": 24600 }, { "epoch": 4.63, "grad_norm": 16.573198318481445, "learning_rate": 1.0735930735930738e-05, "loss": 0.9869, "step": 24610 }, { "epoch": 4.63, "grad_norm": 6.444282531738281, "learning_rate": 1.07321663843403e-05, "loss": 1.0829, "step": 24620 }, { "epoch": 4.64, "grad_norm": 16.71677017211914, "learning_rate": 1.072840203274986e-05, "loss": 0.927, "step": 24630 }, { "epoch": 4.64, "grad_norm": 17.701824188232422, "learning_rate": 1.0724637681159422e-05, "loss": 1.3496, "step": 24640 }, { "epoch": 4.64, "grad_norm": 6.14141845703125, "learning_rate": 1.0720873329568982e-05, "loss": 1.1632, "step": 24650 }, { "epoch": 4.64, "grad_norm": 14.471163749694824, "learning_rate": 1.0717108977978543e-05, "loss": 1.2186, "step": 24660 }, { "epoch": 4.64, "grad_norm": 25.959949493408203, "learning_rate": 1.0713344626388105e-05, "loss": 1.0911, "step": 24670 }, { "epoch": 4.65, "grad_norm": 10.902137756347656, "learning_rate": 1.0709580274797666e-05, "loss": 1.2402, "step": 24680 }, { "epoch": 4.65, "grad_norm": 10.041330337524414, "learning_rate": 1.0705815923207228e-05, "loss": 1.0697, "step": 24690 }, { "epoch": 4.65, "grad_norm": 28.599468231201172, "learning_rate": 1.070205157161679e-05, "loss": 0.9849, "step": 24700 }, { "epoch": 4.65, "grad_norm": 18.14686393737793, "learning_rate": 1.069828722002635e-05, "loss": 1.3968, "step": 24710 }, { "epoch": 4.65, "grad_norm": 14.962003707885742, "learning_rate": 1.0694522868435912e-05, "loss": 1.0589, "step": 24720 }, { "epoch": 4.65, "grad_norm": 5.887497425079346, "learning_rate": 1.0690758516845474e-05, "loss": 1.3455, "step": 24730 }, { "epoch": 4.66, "grad_norm": 21.044198989868164, "learning_rate": 1.0686994165255035e-05, "loss": 0.9045, "step": 24740 }, { "epoch": 4.66, "grad_norm": 7.076939582824707, "learning_rate": 1.0683229813664597e-05, "loss": 0.974, "step": 24750 }, { "epoch": 4.66, "grad_norm": 8.052274703979492, "learning_rate": 1.067946546207416e-05, "loss": 1.132, "step": 24760 }, { "epoch": 4.66, "grad_norm": 3.0002081394195557, "learning_rate": 1.0675701110483721e-05, "loss": 1.4371, "step": 24770 }, { "epoch": 4.66, "grad_norm": 6.199221134185791, "learning_rate": 1.0671936758893283e-05, "loss": 1.278, "step": 24780 }, { "epoch": 4.67, "grad_norm": 9.897551536560059, "learning_rate": 1.0668172407302844e-05, "loss": 1.4287, "step": 24790 }, { "epoch": 4.67, "grad_norm": 9.521723747253418, "learning_rate": 1.0664408055712406e-05, "loss": 0.994, "step": 24800 }, { "epoch": 4.67, "grad_norm": 6.78341817855835, "learning_rate": 1.0660643704121967e-05, "loss": 1.3535, "step": 24810 }, { "epoch": 4.67, "grad_norm": 2.9654018878936768, "learning_rate": 1.0656879352531528e-05, "loss": 1.054, "step": 24820 }, { "epoch": 4.67, "grad_norm": 33.28799057006836, "learning_rate": 1.0653115000941088e-05, "loss": 1.3857, "step": 24830 }, { "epoch": 4.68, "grad_norm": 5.354284763336182, "learning_rate": 1.064935064935065e-05, "loss": 1.1971, "step": 24840 }, { "epoch": 4.68, "grad_norm": 8.065067291259766, "learning_rate": 1.0645586297760211e-05, "loss": 1.3653, "step": 24850 }, { "epoch": 4.68, "grad_norm": 12.445756912231445, "learning_rate": 1.0641821946169773e-05, "loss": 
1.3074, "step": 24860 }, { "epoch": 4.68, "grad_norm": 22.858789443969727, "learning_rate": 1.0638057594579334e-05, "loss": 0.9663, "step": 24870 }, { "epoch": 4.68, "grad_norm": 11.2050199508667, "learning_rate": 1.0634293242988896e-05, "loss": 1.3536, "step": 24880 }, { "epoch": 4.68, "grad_norm": 9.776514053344727, "learning_rate": 1.0630528891398457e-05, "loss": 1.0433, "step": 24890 }, { "epoch": 4.69, "grad_norm": 21.889034271240234, "learning_rate": 1.0626764539808018e-05, "loss": 1.1049, "step": 24900 }, { "epoch": 4.69, "grad_norm": 27.761518478393555, "learning_rate": 1.062300018821758e-05, "loss": 1.3584, "step": 24910 }, { "epoch": 4.69, "grad_norm": 11.25415325164795, "learning_rate": 1.0619235836627141e-05, "loss": 1.2275, "step": 24920 }, { "epoch": 4.69, "grad_norm": 9.840291023254395, "learning_rate": 1.0615471485036703e-05, "loss": 1.174, "step": 24930 }, { "epoch": 4.69, "grad_norm": 13.991315841674805, "learning_rate": 1.0611707133446264e-05, "loss": 1.2037, "step": 24940 }, { "epoch": 4.7, "grad_norm": 4.1368889808654785, "learning_rate": 1.0607942781855827e-05, "loss": 1.2517, "step": 24950 }, { "epoch": 4.7, "grad_norm": 9.214469909667969, "learning_rate": 1.0604178430265389e-05, "loss": 1.4734, "step": 24960 }, { "epoch": 4.7, "grad_norm": 13.74550724029541, "learning_rate": 1.060041407867495e-05, "loss": 1.294, "step": 24970 }, { "epoch": 4.7, "grad_norm": 9.992650032043457, "learning_rate": 1.0596649727084512e-05, "loss": 1.0656, "step": 24980 }, { "epoch": 4.7, "grad_norm": 16.074861526489258, "learning_rate": 1.0592885375494073e-05, "loss": 1.1022, "step": 24990 }, { "epoch": 4.71, "grad_norm": 66.05573272705078, "learning_rate": 1.0589121023903633e-05, "loss": 0.9666, "step": 25000 }, { "epoch": 4.71, "grad_norm": 11.355215072631836, "learning_rate": 1.0585356672313195e-05, "loss": 1.3685, "step": 25010 }, { "epoch": 4.71, "grad_norm": 10.772954940795898, "learning_rate": 1.0581592320722756e-05, "loss": 1.046, "step": 25020 }, { "epoch": 4.71, "grad_norm": 15.41829776763916, "learning_rate": 1.0577827969132317e-05, "loss": 1.0158, "step": 25030 }, { "epoch": 4.71, "grad_norm": 14.398574829101562, "learning_rate": 1.0574063617541879e-05, "loss": 1.2134, "step": 25040 }, { "epoch": 4.71, "grad_norm": 4.490827560424805, "learning_rate": 1.057029926595144e-05, "loss": 1.1743, "step": 25050 }, { "epoch": 4.72, "grad_norm": 33.1912956237793, "learning_rate": 1.0566534914361002e-05, "loss": 1.0984, "step": 25060 }, { "epoch": 4.72, "grad_norm": 11.077810287475586, "learning_rate": 1.0562770562770563e-05, "loss": 1.1323, "step": 25070 }, { "epoch": 4.72, "grad_norm": 15.652287483215332, "learning_rate": 1.0559006211180125e-05, "loss": 1.091, "step": 25080 }, { "epoch": 4.72, "grad_norm": 31.493244171142578, "learning_rate": 1.0555241859589686e-05, "loss": 1.6215, "step": 25090 }, { "epoch": 4.72, "grad_norm": 10.600312232971191, "learning_rate": 1.0551477507999248e-05, "loss": 1.2373, "step": 25100 }, { "epoch": 4.73, "grad_norm": 8.50595760345459, "learning_rate": 1.0547713156408809e-05, "loss": 1.0581, "step": 25110 }, { "epoch": 4.73, "grad_norm": 16.157825469970703, "learning_rate": 1.054394880481837e-05, "loss": 1.1691, "step": 25120 }, { "epoch": 4.73, "grad_norm": 29.103710174560547, "learning_rate": 1.0540184453227934e-05, "loss": 1.0663, "step": 25130 }, { "epoch": 4.73, "grad_norm": 21.067293167114258, "learning_rate": 1.0536420101637495e-05, "loss": 1.3879, "step": 25140 }, { "epoch": 4.73, "grad_norm": 18.49663543701172, "learning_rate": 
1.0532655750047057e-05, "loss": 1.0306, "step": 25150 }, { "epoch": 4.74, "grad_norm": 17.198123931884766, "learning_rate": 1.0528891398456618e-05, "loss": 1.1495, "step": 25160 }, { "epoch": 4.74, "grad_norm": 5.602969646453857, "learning_rate": 1.052512704686618e-05, "loss": 1.2334, "step": 25170 }, { "epoch": 4.74, "grad_norm": 13.30167293548584, "learning_rate": 1.052136269527574e-05, "loss": 1.189, "step": 25180 }, { "epoch": 4.74, "grad_norm": 16.90226173400879, "learning_rate": 1.05175983436853e-05, "loss": 1.1234, "step": 25190 }, { "epoch": 4.74, "grad_norm": 3.7400076389312744, "learning_rate": 1.0513833992094862e-05, "loss": 1.1189, "step": 25200 }, { "epoch": 4.74, "grad_norm": 14.868270874023438, "learning_rate": 1.0510069640504424e-05, "loss": 1.2246, "step": 25210 }, { "epoch": 4.75, "grad_norm": 4.341249942779541, "learning_rate": 1.0506305288913985e-05, "loss": 1.29, "step": 25220 }, { "epoch": 4.75, "grad_norm": 12.49927806854248, "learning_rate": 1.0502540937323547e-05, "loss": 1.0265, "step": 25230 }, { "epoch": 4.75, "grad_norm": 22.81476402282715, "learning_rate": 1.0498776585733108e-05, "loss": 1.3817, "step": 25240 }, { "epoch": 4.75, "grad_norm": 6.98078727722168, "learning_rate": 1.049501223414267e-05, "loss": 1.2146, "step": 25250 }, { "epoch": 4.75, "grad_norm": 8.071756362915039, "learning_rate": 1.0491247882552231e-05, "loss": 1.131, "step": 25260 }, { "epoch": 4.76, "grad_norm": 8.3782320022583, "learning_rate": 1.0487483530961793e-05, "loss": 1.0954, "step": 25270 }, { "epoch": 4.76, "grad_norm": 12.565004348754883, "learning_rate": 1.0483719179371354e-05, "loss": 1.4605, "step": 25280 }, { "epoch": 4.76, "grad_norm": 10.430145263671875, "learning_rate": 1.0479954827780915e-05, "loss": 1.106, "step": 25290 }, { "epoch": 4.76, "grad_norm": 9.142749786376953, "learning_rate": 1.0476190476190477e-05, "loss": 1.1452, "step": 25300 }, { "epoch": 4.76, "grad_norm": 14.01291275024414, "learning_rate": 1.0472426124600038e-05, "loss": 0.9915, "step": 25310 }, { "epoch": 4.77, "grad_norm": 3.822159767150879, "learning_rate": 1.0468661773009602e-05, "loss": 1.6682, "step": 25320 }, { "epoch": 4.77, "grad_norm": 30.380680084228516, "learning_rate": 1.0464897421419163e-05, "loss": 1.1571, "step": 25330 }, { "epoch": 4.77, "grad_norm": 12.578876495361328, "learning_rate": 1.0461133069828724e-05, "loss": 1.3349, "step": 25340 }, { "epoch": 4.77, "grad_norm": 7.98133659362793, "learning_rate": 1.0457368718238282e-05, "loss": 1.2712, "step": 25350 }, { "epoch": 4.77, "grad_norm": 15.005038261413574, "learning_rate": 1.0453604366647846e-05, "loss": 1.2653, "step": 25360 }, { "epoch": 4.78, "grad_norm": 10.36233901977539, "learning_rate": 1.0449840015057407e-05, "loss": 0.9858, "step": 25370 }, { "epoch": 4.78, "grad_norm": 5.5333380699157715, "learning_rate": 1.0446075663466969e-05, "loss": 1.2215, "step": 25380 }, { "epoch": 4.78, "grad_norm": 9.379959106445312, "learning_rate": 1.044231131187653e-05, "loss": 1.1904, "step": 25390 }, { "epoch": 4.78, "grad_norm": 9.702463150024414, "learning_rate": 1.0438546960286091e-05, "loss": 1.1191, "step": 25400 }, { "epoch": 4.78, "grad_norm": 4.509952068328857, "learning_rate": 1.0434782608695653e-05, "loss": 0.9224, "step": 25410 }, { "epoch": 4.78, "grad_norm": 4.689672470092773, "learning_rate": 1.0431018257105214e-05, "loss": 1.2617, "step": 25420 }, { "epoch": 4.79, "grad_norm": 12.805371284484863, "learning_rate": 1.0427253905514776e-05, "loss": 0.845, "step": 25430 }, { "epoch": 4.79, "grad_norm": 4.872035026550293, 
"learning_rate": 1.0423489553924337e-05, "loss": 1.0583, "step": 25440 }, { "epoch": 4.79, "grad_norm": 8.656426429748535, "learning_rate": 1.0419725202333899e-05, "loss": 1.2976, "step": 25450 }, { "epoch": 4.79, "grad_norm": 7.93969202041626, "learning_rate": 1.041596085074346e-05, "loss": 1.2692, "step": 25460 }, { "epoch": 4.79, "grad_norm": 7.077612400054932, "learning_rate": 1.0412196499153022e-05, "loss": 1.1235, "step": 25470 }, { "epoch": 4.8, "grad_norm": 16.626768112182617, "learning_rate": 1.0408432147562583e-05, "loss": 1.2643, "step": 25480 }, { "epoch": 4.8, "grad_norm": 3.812723398208618, "learning_rate": 1.0404667795972145e-05, "loss": 1.3406, "step": 25490 }, { "epoch": 4.8, "grad_norm": 9.23488998413086, "learning_rate": 1.0400903444381708e-05, "loss": 1.1035, "step": 25500 }, { "epoch": 4.8, "grad_norm": 6.040359973907471, "learning_rate": 1.039713909279127e-05, "loss": 1.105, "step": 25510 }, { "epoch": 4.8, "grad_norm": 12.046161651611328, "learning_rate": 1.0393374741200827e-05, "loss": 1.2153, "step": 25520 }, { "epoch": 4.81, "grad_norm": 11.164216041564941, "learning_rate": 1.0389610389610389e-05, "loss": 1.2855, "step": 25530 }, { "epoch": 4.81, "grad_norm": 19.752851486206055, "learning_rate": 1.038584603801995e-05, "loss": 1.1226, "step": 25540 }, { "epoch": 4.81, "grad_norm": 8.377117156982422, "learning_rate": 1.0382081686429513e-05, "loss": 1.1627, "step": 25550 }, { "epoch": 4.81, "grad_norm": 6.385437488555908, "learning_rate": 1.0378317334839075e-05, "loss": 0.988, "step": 25560 }, { "epoch": 4.81, "grad_norm": 5.609097003936768, "learning_rate": 1.0374552983248636e-05, "loss": 1.0536, "step": 25570 }, { "epoch": 4.81, "grad_norm": 6.156047344207764, "learning_rate": 1.0370788631658198e-05, "loss": 0.9748, "step": 25580 }, { "epoch": 4.82, "grad_norm": 10.199936866760254, "learning_rate": 1.036702428006776e-05, "loss": 1.2601, "step": 25590 }, { "epoch": 4.82, "grad_norm": 3.5951240062713623, "learning_rate": 1.036325992847732e-05, "loss": 1.1225, "step": 25600 }, { "epoch": 4.82, "grad_norm": 16.65151596069336, "learning_rate": 1.0359495576886882e-05, "loss": 1.0543, "step": 25610 }, { "epoch": 4.82, "grad_norm": 25.264877319335938, "learning_rate": 1.0355731225296444e-05, "loss": 1.21, "step": 25620 }, { "epoch": 4.82, "grad_norm": 9.493725776672363, "learning_rate": 1.0351966873706005e-05, "loss": 1.3152, "step": 25630 }, { "epoch": 4.83, "grad_norm": 7.403919696807861, "learning_rate": 1.0348202522115567e-05, "loss": 1.0935, "step": 25640 }, { "epoch": 4.83, "grad_norm": 12.59331226348877, "learning_rate": 1.0344438170525128e-05, "loss": 1.3015, "step": 25650 }, { "epoch": 4.83, "grad_norm": 17.20966911315918, "learning_rate": 1.034067381893469e-05, "loss": 1.3596, "step": 25660 }, { "epoch": 4.83, "grad_norm": 9.469931602478027, "learning_rate": 1.0336909467344251e-05, "loss": 0.9711, "step": 25670 }, { "epoch": 4.83, "grad_norm": 2.1328306198120117, "learning_rate": 1.0333145115753812e-05, "loss": 1.0223, "step": 25680 }, { "epoch": 4.84, "grad_norm": 5.9799346923828125, "learning_rate": 1.0329380764163376e-05, "loss": 1.0078, "step": 25690 }, { "epoch": 4.84, "grad_norm": 28.39556312561035, "learning_rate": 1.0325616412572934e-05, "loss": 1.3057, "step": 25700 }, { "epoch": 4.84, "grad_norm": 29.23651123046875, "learning_rate": 1.0321852060982495e-05, "loss": 1.0624, "step": 25710 }, { "epoch": 4.84, "grad_norm": 6.022303104400635, "learning_rate": 1.0318087709392057e-05, "loss": 1.0922, "step": 25720 }, { "epoch": 4.84, "grad_norm": 
34.845245361328125, "learning_rate": 1.031432335780162e-05, "loss": 1.3464, "step": 25730 }, { "epoch": 4.84, "grad_norm": 13.598247528076172, "learning_rate": 1.0310559006211181e-05, "loss": 0.9758, "step": 25740 }, { "epoch": 4.85, "grad_norm": 10.721945762634277, "learning_rate": 1.0306794654620743e-05, "loss": 0.9556, "step": 25750 }, { "epoch": 4.85, "grad_norm": 2.767383098602295, "learning_rate": 1.0303030303030304e-05, "loss": 1.0067, "step": 25760 }, { "epoch": 4.85, "grad_norm": 21.182327270507812, "learning_rate": 1.0299265951439866e-05, "loss": 1.1026, "step": 25770 }, { "epoch": 4.85, "grad_norm": 13.29002857208252, "learning_rate": 1.0295501599849427e-05, "loss": 1.0089, "step": 25780 }, { "epoch": 4.85, "grad_norm": 12.47888469696045, "learning_rate": 1.0291737248258988e-05, "loss": 1.23, "step": 25790 }, { "epoch": 4.86, "grad_norm": 15.18270206451416, "learning_rate": 1.028797289666855e-05, "loss": 1.3749, "step": 25800 }, { "epoch": 4.86, "grad_norm": 17.31894874572754, "learning_rate": 1.0284208545078111e-05, "loss": 1.4101, "step": 25810 }, { "epoch": 4.86, "grad_norm": 21.306396484375, "learning_rate": 1.0280444193487673e-05, "loss": 1.0165, "step": 25820 }, { "epoch": 4.86, "grad_norm": 10.692023277282715, "learning_rate": 1.0276679841897234e-05, "loss": 1.1215, "step": 25830 }, { "epoch": 4.86, "grad_norm": 2.4061758518218994, "learning_rate": 1.0272915490306796e-05, "loss": 1.0926, "step": 25840 }, { "epoch": 4.87, "grad_norm": 17.752622604370117, "learning_rate": 1.0269151138716357e-05, "loss": 1.2268, "step": 25850 }, { "epoch": 4.87, "grad_norm": 22.09778594970703, "learning_rate": 1.0265386787125919e-05, "loss": 0.9715, "step": 25860 }, { "epoch": 4.87, "grad_norm": 4.150321006774902, "learning_rate": 1.0261622435535478e-05, "loss": 1.2232, "step": 25870 }, { "epoch": 4.87, "grad_norm": 4.821023464202881, "learning_rate": 1.025785808394504e-05, "loss": 1.3189, "step": 25880 }, { "epoch": 4.87, "grad_norm": 8.190055847167969, "learning_rate": 1.0254093732354601e-05, "loss": 0.9114, "step": 25890 }, { "epoch": 4.87, "grad_norm": 12.706771850585938, "learning_rate": 1.0250329380764163e-05, "loss": 0.9136, "step": 25900 }, { "epoch": 4.88, "grad_norm": 4.07108211517334, "learning_rate": 1.0246565029173724e-05, "loss": 0.9614, "step": 25910 }, { "epoch": 4.88, "grad_norm": 6.869511127471924, "learning_rate": 1.0242800677583287e-05, "loss": 0.9797, "step": 25920 }, { "epoch": 4.88, "grad_norm": 7.895568370819092, "learning_rate": 1.0239036325992849e-05, "loss": 0.9795, "step": 25930 }, { "epoch": 4.88, "grad_norm": 14.05517578125, "learning_rate": 1.023527197440241e-05, "loss": 0.9762, "step": 25940 }, { "epoch": 4.88, "grad_norm": 12.612595558166504, "learning_rate": 1.0231507622811972e-05, "loss": 1.2767, "step": 25950 }, { "epoch": 4.89, "grad_norm": 9.433365821838379, "learning_rate": 1.0227743271221533e-05, "loss": 1.0211, "step": 25960 }, { "epoch": 4.89, "grad_norm": 17.517715454101562, "learning_rate": 1.0223978919631095e-05, "loss": 1.0772, "step": 25970 }, { "epoch": 4.89, "grad_norm": 7.24169921875, "learning_rate": 1.0220214568040656e-05, "loss": 1.2914, "step": 25980 }, { "epoch": 4.89, "grad_norm": 19.72892951965332, "learning_rate": 1.0216450216450218e-05, "loss": 1.2819, "step": 25990 }, { "epoch": 4.89, "grad_norm": 9.213044166564941, "learning_rate": 1.0212685864859779e-05, "loss": 1.0973, "step": 26000 }, { "epoch": 4.9, "grad_norm": 5.074709415435791, "learning_rate": 1.020892151326934e-05, "loss": 0.8679, "step": 26010 }, { "epoch": 4.9, 
"grad_norm": 2.2960987091064453, "learning_rate": 1.0205157161678902e-05, "loss": 0.8621, "step": 26020 }, { "epoch": 4.9, "grad_norm": 18.922508239746094, "learning_rate": 1.0201392810088464e-05, "loss": 0.9828, "step": 26030 }, { "epoch": 4.9, "grad_norm": 18.598535537719727, "learning_rate": 1.0197628458498025e-05, "loss": 0.9248, "step": 26040 }, { "epoch": 4.9, "grad_norm": 7.504372596740723, "learning_rate": 1.0193864106907585e-05, "loss": 1.1654, "step": 26050 }, { "epoch": 4.9, "grad_norm": 20.26690101623535, "learning_rate": 1.0190099755317146e-05, "loss": 1.2221, "step": 26060 }, { "epoch": 4.91, "grad_norm": 2.8796322345733643, "learning_rate": 1.0186335403726708e-05, "loss": 0.9816, "step": 26070 }, { "epoch": 4.91, "grad_norm": 14.184268951416016, "learning_rate": 1.0182571052136269e-05, "loss": 1.1677, "step": 26080 }, { "epoch": 4.91, "grad_norm": 29.624011993408203, "learning_rate": 1.017880670054583e-05, "loss": 1.1847, "step": 26090 }, { "epoch": 4.91, "grad_norm": 5.696228504180908, "learning_rate": 1.0175042348955392e-05, "loss": 1.0953, "step": 26100 }, { "epoch": 4.91, "grad_norm": 5.240353584289551, "learning_rate": 1.0171277997364955e-05, "loss": 1.1718, "step": 26110 }, { "epoch": 4.92, "grad_norm": 7.874598503112793, "learning_rate": 1.0167513645774517e-05, "loss": 1.2185, "step": 26120 }, { "epoch": 4.92, "grad_norm": 10.357918739318848, "learning_rate": 1.0163749294184078e-05, "loss": 1.1531, "step": 26130 }, { "epoch": 4.92, "grad_norm": 10.745437622070312, "learning_rate": 1.015998494259364e-05, "loss": 1.2208, "step": 26140 }, { "epoch": 4.92, "grad_norm": 8.142765998840332, "learning_rate": 1.0156220591003201e-05, "loss": 1.0742, "step": 26150 }, { "epoch": 4.92, "grad_norm": 34.80218505859375, "learning_rate": 1.0152456239412763e-05, "loss": 1.1466, "step": 26160 }, { "epoch": 4.93, "grad_norm": 5.531490802764893, "learning_rate": 1.0148691887822324e-05, "loss": 0.921, "step": 26170 }, { "epoch": 4.93, "grad_norm": 11.76372241973877, "learning_rate": 1.0144927536231885e-05, "loss": 1.138, "step": 26180 }, { "epoch": 4.93, "grad_norm": 2.1722989082336426, "learning_rate": 1.0141163184641447e-05, "loss": 0.9231, "step": 26190 }, { "epoch": 4.93, "grad_norm": 4.814737319946289, "learning_rate": 1.0137398833051008e-05, "loss": 1.2029, "step": 26200 }, { "epoch": 4.93, "grad_norm": 8.89008903503418, "learning_rate": 1.013363448146057e-05, "loss": 1.4168, "step": 26210 }, { "epoch": 4.94, "grad_norm": 24.199127197265625, "learning_rate": 1.012987012987013e-05, "loss": 1.0795, "step": 26220 }, { "epoch": 4.94, "grad_norm": 13.183247566223145, "learning_rate": 1.0126105778279691e-05, "loss": 1.3494, "step": 26230 }, { "epoch": 4.94, "grad_norm": 8.631226539611816, "learning_rate": 1.0122341426689252e-05, "loss": 1.2947, "step": 26240 }, { "epoch": 4.94, "grad_norm": 13.727885246276855, "learning_rate": 1.0118577075098814e-05, "loss": 1.4218, "step": 26250 }, { "epoch": 4.94, "grad_norm": 16.077251434326172, "learning_rate": 1.0114812723508375e-05, "loss": 1.1375, "step": 26260 }, { "epoch": 4.94, "grad_norm": 28.120006561279297, "learning_rate": 1.0111048371917937e-05, "loss": 0.8345, "step": 26270 }, { "epoch": 4.95, "grad_norm": 32.448673248291016, "learning_rate": 1.0107284020327498e-05, "loss": 1.5321, "step": 26280 }, { "epoch": 4.95, "grad_norm": 3.6854100227355957, "learning_rate": 1.0103519668737061e-05, "loss": 1.0194, "step": 26290 }, { "epoch": 4.95, "grad_norm": 25.11188316345215, "learning_rate": 1.0099755317146623e-05, "loss": 1.1785, "step": 26300 }, 
{ "epoch": 4.95, "grad_norm": 10.218350410461426, "learning_rate": 1.0095990965556184e-05, "loss": 1.2161, "step": 26310 }, { "epoch": 4.95, "grad_norm": 10.047581672668457, "learning_rate": 1.0092226613965746e-05, "loss": 1.1624, "step": 26320 }, { "epoch": 4.96, "grad_norm": 7.268515586853027, "learning_rate": 1.0088462262375307e-05, "loss": 1.0929, "step": 26330 }, { "epoch": 4.96, "grad_norm": 4.340163707733154, "learning_rate": 1.0084697910784869e-05, "loss": 0.8543, "step": 26340 }, { "epoch": 4.96, "grad_norm": 4.949464797973633, "learning_rate": 1.008093355919443e-05, "loss": 1.1856, "step": 26350 }, { "epoch": 4.96, "grad_norm": 5.003396034240723, "learning_rate": 1.0077169207603992e-05, "loss": 0.8742, "step": 26360 }, { "epoch": 4.96, "grad_norm": 5.929224491119385, "learning_rate": 1.0073404856013553e-05, "loss": 0.9812, "step": 26370 }, { "epoch": 4.97, "grad_norm": 15.164633750915527, "learning_rate": 1.0069640504423115e-05, "loss": 0.9665, "step": 26380 }, { "epoch": 4.97, "grad_norm": 11.110221862792969, "learning_rate": 1.0065876152832676e-05, "loss": 1.1016, "step": 26390 }, { "epoch": 4.97, "grad_norm": 10.297097206115723, "learning_rate": 1.0062111801242236e-05, "loss": 1.0971, "step": 26400 }, { "epoch": 4.97, "grad_norm": 12.229903221130371, "learning_rate": 1.0058347449651797e-05, "loss": 1.0627, "step": 26410 }, { "epoch": 4.97, "grad_norm": 14.45139217376709, "learning_rate": 1.0054583098061359e-05, "loss": 1.3832, "step": 26420 }, { "epoch": 4.97, "grad_norm": 20.653587341308594, "learning_rate": 1.005081874647092e-05, "loss": 1.0634, "step": 26430 }, { "epoch": 4.98, "grad_norm": 18.79515838623047, "learning_rate": 1.0047054394880482e-05, "loss": 1.3115, "step": 26440 }, { "epoch": 4.98, "grad_norm": 29.438232421875, "learning_rate": 1.0043290043290043e-05, "loss": 1.4086, "step": 26450 }, { "epoch": 4.98, "grad_norm": 8.170830726623535, "learning_rate": 1.0039525691699605e-05, "loss": 0.7941, "step": 26460 }, { "epoch": 4.98, "grad_norm": 4.754985332489014, "learning_rate": 1.0035761340109166e-05, "loss": 0.9629, "step": 26470 }, { "epoch": 4.98, "grad_norm": 8.939909934997559, "learning_rate": 1.003199698851873e-05, "loss": 1.0287, "step": 26480 }, { "epoch": 4.99, "grad_norm": 6.884005546569824, "learning_rate": 1.002823263692829e-05, "loss": 1.0801, "step": 26490 }, { "epoch": 4.99, "grad_norm": 4.694358825683594, "learning_rate": 1.0024468285337852e-05, "loss": 0.8783, "step": 26500 }, { "epoch": 4.99, "grad_norm": 9.406129837036133, "learning_rate": 1.0020703933747414e-05, "loss": 1.0807, "step": 26510 }, { "epoch": 4.99, "grad_norm": 8.150083541870117, "learning_rate": 1.0016939582156975e-05, "loss": 1.3124, "step": 26520 }, { "epoch": 4.99, "grad_norm": 10.473052978515625, "learning_rate": 1.0013175230566537e-05, "loss": 1.3617, "step": 26530 }, { "epoch": 5.0, "grad_norm": 9.322982788085938, "learning_rate": 1.0009410878976098e-05, "loss": 0.9887, "step": 26540 }, { "epoch": 5.0, "grad_norm": 13.791378021240234, "learning_rate": 1.000564652738566e-05, "loss": 1.1296, "step": 26550 }, { "epoch": 5.0, "grad_norm": 4.172353744506836, "learning_rate": 1.0001882175795221e-05, "loss": 0.7426, "step": 26560 }, { "epoch": 5.0, "eval_accuracy": 0.7896, "eval_loss": 1.1835823059082031, "eval_runtime": 31.2345, "eval_samples_per_second": 240.119, "eval_steps_per_second": 30.031, "step": 26565 }, { "epoch": 5.0, "grad_norm": 2.916236400604248, "learning_rate": 9.998117824204782e-06, "loss": 0.8302, "step": 26570 }, { "epoch": 5.0, "grad_norm": 5.750648021697998, 
"learning_rate": 9.994353472614344e-06, "loss": 0.8985, "step": 26580 }, { "epoch": 5.0, "grad_norm": 13.874232292175293, "learning_rate": 9.990589121023905e-06, "loss": 0.9801, "step": 26590 }, { "epoch": 5.01, "grad_norm": 4.6329169273376465, "learning_rate": 9.986824769433467e-06, "loss": 0.8527, "step": 26600 }, { "epoch": 5.01, "grad_norm": 13.857070922851562, "learning_rate": 9.983060417843027e-06, "loss": 1.1713, "step": 26610 }, { "epoch": 5.01, "grad_norm": 24.842588424682617, "learning_rate": 9.979296066252588e-06, "loss": 1.2598, "step": 26620 }, { "epoch": 5.01, "grad_norm": 8.714927673339844, "learning_rate": 9.97553171466215e-06, "loss": 1.4008, "step": 26630 }, { "epoch": 5.01, "grad_norm": 22.71858024597168, "learning_rate": 9.971767363071711e-06, "loss": 1.1521, "step": 26640 }, { "epoch": 5.02, "grad_norm": 6.438852787017822, "learning_rate": 9.968003011481272e-06, "loss": 0.8392, "step": 26650 }, { "epoch": 5.02, "grad_norm": 3.9260830879211426, "learning_rate": 9.964238659890836e-06, "loss": 1.1356, "step": 26660 }, { "epoch": 5.02, "grad_norm": 13.564977645874023, "learning_rate": 9.960474308300397e-06, "loss": 1.0648, "step": 26670 }, { "epoch": 5.02, "grad_norm": 11.386746406555176, "learning_rate": 9.956709956709958e-06, "loss": 1.1249, "step": 26680 }, { "epoch": 5.02, "grad_norm": 29.780128479003906, "learning_rate": 9.95294560511952e-06, "loss": 1.3452, "step": 26690 }, { "epoch": 5.03, "grad_norm": 6.506429195404053, "learning_rate": 9.94918125352908e-06, "loss": 0.9004, "step": 26700 }, { "epoch": 5.03, "grad_norm": 10.421928405761719, "learning_rate": 9.945416901938641e-06, "loss": 1.0872, "step": 26710 }, { "epoch": 5.03, "grad_norm": 5.279590606689453, "learning_rate": 9.941652550348203e-06, "loss": 0.9468, "step": 26720 }, { "epoch": 5.03, "grad_norm": 9.596915245056152, "learning_rate": 9.937888198757764e-06, "loss": 1.1274, "step": 26730 }, { "epoch": 5.03, "grad_norm": 20.254804611206055, "learning_rate": 9.934123847167326e-06, "loss": 0.6749, "step": 26740 }, { "epoch": 5.03, "grad_norm": 6.77964448928833, "learning_rate": 9.930359495576887e-06, "loss": 0.8153, "step": 26750 }, { "epoch": 5.04, "grad_norm": 13.672237396240234, "learning_rate": 9.92659514398645e-06, "loss": 0.8883, "step": 26760 }, { "epoch": 5.04, "grad_norm": 8.901549339294434, "learning_rate": 9.922830792396012e-06, "loss": 0.9805, "step": 26770 }, { "epoch": 5.04, "grad_norm": 7.224909782409668, "learning_rate": 9.919066440805573e-06, "loss": 1.165, "step": 26780 }, { "epoch": 5.04, "grad_norm": 20.98736572265625, "learning_rate": 9.915302089215133e-06, "loss": 1.2478, "step": 26790 }, { "epoch": 5.04, "grad_norm": 13.662611961364746, "learning_rate": 9.911537737624694e-06, "loss": 0.7839, "step": 26800 }, { "epoch": 5.05, "grad_norm": 5.005545616149902, "learning_rate": 9.907773386034256e-06, "loss": 0.9965, "step": 26810 }, { "epoch": 5.05, "grad_norm": 12.271440505981445, "learning_rate": 9.904009034443817e-06, "loss": 0.742, "step": 26820 }, { "epoch": 5.05, "grad_norm": 13.787073135375977, "learning_rate": 9.900244682853379e-06, "loss": 1.0941, "step": 26830 }, { "epoch": 5.05, "grad_norm": 10.770931243896484, "learning_rate": 9.89648033126294e-06, "loss": 0.704, "step": 26840 }, { "epoch": 5.05, "grad_norm": 25.081623077392578, "learning_rate": 9.892715979672503e-06, "loss": 1.1205, "step": 26850 }, { "epoch": 5.06, "grad_norm": 9.152400970458984, "learning_rate": 9.888951628082065e-06, "loss": 0.9555, "step": 26860 }, { "epoch": 5.06, "grad_norm": 8.149613380432129, 
"learning_rate": 9.885187276491625e-06, "loss": 0.8373, "step": 26870 }, { "epoch": 5.06, "grad_norm": 4.785212993621826, "learning_rate": 9.881422924901186e-06, "loss": 0.7638, "step": 26880 }, { "epoch": 5.06, "grad_norm": 14.316230773925781, "learning_rate": 9.877658573310747e-06, "loss": 0.918, "step": 26890 }, { "epoch": 5.06, "grad_norm": 6.017427444458008, "learning_rate": 9.873894221720309e-06, "loss": 0.9361, "step": 26900 }, { "epoch": 5.06, "grad_norm": 8.026313781738281, "learning_rate": 9.87012987012987e-06, "loss": 1.0931, "step": 26910 }, { "epoch": 5.07, "grad_norm": 19.828094482421875, "learning_rate": 9.866365518539432e-06, "loss": 1.2641, "step": 26920 }, { "epoch": 5.07, "grad_norm": 12.181910514831543, "learning_rate": 9.862601166948993e-06, "loss": 1.1161, "step": 26930 }, { "epoch": 5.07, "grad_norm": 42.590702056884766, "learning_rate": 9.858836815358556e-06, "loss": 1.3991, "step": 26940 }, { "epoch": 5.07, "grad_norm": 14.43920612335205, "learning_rate": 9.855072463768118e-06, "loss": 1.2055, "step": 26950 }, { "epoch": 5.07, "grad_norm": 7.366767406463623, "learning_rate": 9.851308112177678e-06, "loss": 1.003, "step": 26960 }, { "epoch": 5.08, "grad_norm": 21.354427337646484, "learning_rate": 9.847543760587239e-06, "loss": 0.8959, "step": 26970 }, { "epoch": 5.08, "grad_norm": 11.245001792907715, "learning_rate": 9.8437794089968e-06, "loss": 0.7587, "step": 26980 }, { "epoch": 5.08, "grad_norm": 8.552119255065918, "learning_rate": 9.840015057406362e-06, "loss": 1.2923, "step": 26990 }, { "epoch": 5.08, "grad_norm": 12.783400535583496, "learning_rate": 9.836250705815924e-06, "loss": 1.1782, "step": 27000 }, { "epoch": 5.08, "grad_norm": 5.661661148071289, "learning_rate": 9.832486354225485e-06, "loss": 1.2305, "step": 27010 }, { "epoch": 5.09, "grad_norm": 43.36799621582031, "learning_rate": 9.828722002635046e-06, "loss": 1.2792, "step": 27020 }, { "epoch": 5.09, "grad_norm": 14.231168746948242, "learning_rate": 9.82495765104461e-06, "loss": 1.137, "step": 27030 }, { "epoch": 5.09, "grad_norm": 15.047614097595215, "learning_rate": 9.821193299454171e-06, "loss": 1.0722, "step": 27040 }, { "epoch": 5.09, "grad_norm": 3.712689161300659, "learning_rate": 9.81742894786373e-06, "loss": 0.9203, "step": 27050 }, { "epoch": 5.09, "grad_norm": 3.286311149597168, "learning_rate": 9.813664596273292e-06, "loss": 0.8649, "step": 27060 }, { "epoch": 5.1, "grad_norm": 2.5953259468078613, "learning_rate": 9.809900244682854e-06, "loss": 1.2937, "step": 27070 }, { "epoch": 5.1, "grad_norm": 10.919048309326172, "learning_rate": 9.806135893092415e-06, "loss": 0.9678, "step": 27080 }, { "epoch": 5.1, "grad_norm": 41.31084442138672, "learning_rate": 9.802371541501977e-06, "loss": 1.158, "step": 27090 }, { "epoch": 5.1, "grad_norm": 16.500913619995117, "learning_rate": 9.798607189911538e-06, "loss": 0.9315, "step": 27100 }, { "epoch": 5.1, "grad_norm": 18.413644790649414, "learning_rate": 9.7948428383211e-06, "loss": 0.9244, "step": 27110 }, { "epoch": 5.1, "grad_norm": 9.511786460876465, "learning_rate": 9.791078486730661e-06, "loss": 0.9637, "step": 27120 }, { "epoch": 5.11, "grad_norm": 5.775086879730225, "learning_rate": 9.787314135140224e-06, "loss": 0.8531, "step": 27130 }, { "epoch": 5.11, "grad_norm": 17.341480255126953, "learning_rate": 9.783549783549784e-06, "loss": 0.9959, "step": 27140 }, { "epoch": 5.11, "grad_norm": 8.823723793029785, "learning_rate": 9.779785431959345e-06, "loss": 1.0582, "step": 27150 }, { "epoch": 5.11, "grad_norm": 15.410893440246582, "learning_rate": 
9.776021080368907e-06, "loss": 1.3696, "step": 27160 }, { "epoch": 5.11, "grad_norm": 5.488454341888428, "learning_rate": 9.772256728778468e-06, "loss": 1.1351, "step": 27170 }, { "epoch": 5.12, "grad_norm": 14.9708833694458, "learning_rate": 9.76849237718803e-06, "loss": 0.9706, "step": 27180 }, { "epoch": 5.12, "grad_norm": 9.437253952026367, "learning_rate": 9.764728025597591e-06, "loss": 0.9786, "step": 27190 }, { "epoch": 5.12, "grad_norm": 7.0545973777771, "learning_rate": 9.760963674007153e-06, "loss": 0.8359, "step": 27200 }, { "epoch": 5.12, "grad_norm": 10.452978134155273, "learning_rate": 9.757199322416714e-06, "loss": 1.125, "step": 27210 }, { "epoch": 5.12, "grad_norm": 7.698655605316162, "learning_rate": 9.753434970826276e-06, "loss": 0.9692, "step": 27220 }, { "epoch": 5.13, "grad_norm": 12.78397274017334, "learning_rate": 9.749670619235837e-06, "loss": 0.9195, "step": 27230 }, { "epoch": 5.13, "grad_norm": 5.4084601402282715, "learning_rate": 9.745906267645399e-06, "loss": 1.0743, "step": 27240 }, { "epoch": 5.13, "grad_norm": 11.61865234375, "learning_rate": 9.74214191605496e-06, "loss": 0.7497, "step": 27250 }, { "epoch": 5.13, "grad_norm": 3.124312400817871, "learning_rate": 9.738377564464521e-06, "loss": 0.8332, "step": 27260 }, { "epoch": 5.13, "grad_norm": 11.031091690063477, "learning_rate": 9.734613212874083e-06, "loss": 0.9798, "step": 27270 }, { "epoch": 5.13, "grad_norm": 7.196647644042969, "learning_rate": 9.730848861283644e-06, "loss": 0.8928, "step": 27280 }, { "epoch": 5.14, "grad_norm": 4.007391929626465, "learning_rate": 9.727084509693206e-06, "loss": 0.9578, "step": 27290 }, { "epoch": 5.14, "grad_norm": 7.186996936798096, "learning_rate": 9.723320158102767e-06, "loss": 0.9437, "step": 27300 }, { "epoch": 5.14, "grad_norm": 14.832673072814941, "learning_rate": 9.719555806512329e-06, "loss": 0.7745, "step": 27310 }, { "epoch": 5.14, "grad_norm": 20.813167572021484, "learning_rate": 9.71579145492189e-06, "loss": 1.0704, "step": 27320 }, { "epoch": 5.14, "grad_norm": 10.628039360046387, "learning_rate": 9.712027103331452e-06, "loss": 1.0283, "step": 27330 }, { "epoch": 5.15, "grad_norm": 11.506697654724121, "learning_rate": 9.708262751741013e-06, "loss": 1.0715, "step": 27340 }, { "epoch": 5.15, "grad_norm": 8.832376480102539, "learning_rate": 9.704498400150575e-06, "loss": 1.0275, "step": 27350 }, { "epoch": 5.15, "grad_norm": 17.504180908203125, "learning_rate": 9.700734048560136e-06, "loss": 1.1846, "step": 27360 }, { "epoch": 5.15, "grad_norm": 22.317054748535156, "learning_rate": 9.696969696969698e-06, "loss": 1.1642, "step": 27370 }, { "epoch": 5.15, "grad_norm": 4.779489040374756, "learning_rate": 9.693205345379259e-06, "loss": 0.8401, "step": 27380 }, { "epoch": 5.16, "grad_norm": 3.6331608295440674, "learning_rate": 9.68944099378882e-06, "loss": 0.8401, "step": 27390 }, { "epoch": 5.16, "grad_norm": 7.78971529006958, "learning_rate": 9.685676642198382e-06, "loss": 0.9365, "step": 27400 }, { "epoch": 5.16, "grad_norm": 11.525785446166992, "learning_rate": 9.681912290607943e-06, "loss": 0.9498, "step": 27410 }, { "epoch": 5.16, "grad_norm": 8.604715347290039, "learning_rate": 9.678147939017505e-06, "loss": 1.0219, "step": 27420 }, { "epoch": 5.16, "grad_norm": 3.892197370529175, "learning_rate": 9.674383587427066e-06, "loss": 0.6814, "step": 27430 }, { "epoch": 5.16, "grad_norm": 10.428958892822266, "learning_rate": 9.670619235836628e-06, "loss": 1.3971, "step": 27440 }, { "epoch": 5.17, "grad_norm": 22.1682071685791, "learning_rate": 
9.66685488424619e-06, "loss": 0.9847, "step": 27450 }, { "epoch": 5.17, "grad_norm": 11.644514083862305, "learning_rate": 9.66309053265575e-06, "loss": 1.1442, "step": 27460 }, { "epoch": 5.17, "grad_norm": 4.7382588386535645, "learning_rate": 9.659326181065312e-06, "loss": 0.9062, "step": 27470 }, { "epoch": 5.17, "grad_norm": 25.0002498626709, "learning_rate": 9.655561829474874e-06, "loss": 0.7817, "step": 27480 }, { "epoch": 5.17, "grad_norm": 17.368316650390625, "learning_rate": 9.651797477884435e-06, "loss": 1.0312, "step": 27490 }, { "epoch": 5.18, "grad_norm": 7.87490701675415, "learning_rate": 9.648033126293997e-06, "loss": 0.7836, "step": 27500 }, { "epoch": 5.18, "grad_norm": 3.262343168258667, "learning_rate": 9.644268774703558e-06, "loss": 0.6991, "step": 27510 }, { "epoch": 5.18, "grad_norm": 9.775717735290527, "learning_rate": 9.64050442311312e-06, "loss": 1.1062, "step": 27520 }, { "epoch": 5.18, "grad_norm": 22.61628532409668, "learning_rate": 9.636740071522681e-06, "loss": 0.8994, "step": 27530 }, { "epoch": 5.18, "grad_norm": 48.14933776855469, "learning_rate": 9.632975719932242e-06, "loss": 0.8694, "step": 27540 }, { "epoch": 5.19, "grad_norm": 12.035213470458984, "learning_rate": 9.629211368341804e-06, "loss": 1.1458, "step": 27550 }, { "epoch": 5.19, "grad_norm": 2.048144578933716, "learning_rate": 9.625447016751365e-06, "loss": 0.6777, "step": 27560 }, { "epoch": 5.19, "grad_norm": 14.882506370544434, "learning_rate": 9.621682665160927e-06, "loss": 0.926, "step": 27570 }, { "epoch": 5.19, "grad_norm": 9.695802688598633, "learning_rate": 9.617918313570488e-06, "loss": 0.8995, "step": 27580 }, { "epoch": 5.19, "grad_norm": 6.620717525482178, "learning_rate": 9.61415396198005e-06, "loss": 1.0442, "step": 27590 }, { "epoch": 5.19, "grad_norm": 19.235530853271484, "learning_rate": 9.610389610389611e-06, "loss": 1.1022, "step": 27600 }, { "epoch": 5.2, "grad_norm": 3.552525758743286, "learning_rate": 9.606625258799173e-06, "loss": 0.773, "step": 27610 }, { "epoch": 5.2, "grad_norm": 22.866758346557617, "learning_rate": 9.602860907208734e-06, "loss": 0.981, "step": 27620 }, { "epoch": 5.2, "grad_norm": 1.8297398090362549, "learning_rate": 9.599096555618296e-06, "loss": 0.9407, "step": 27630 }, { "epoch": 5.2, "grad_norm": 7.152495861053467, "learning_rate": 9.595332204027857e-06, "loss": 0.7969, "step": 27640 }, { "epoch": 5.2, "grad_norm": 22.57699203491211, "learning_rate": 9.591567852437418e-06, "loss": 0.751, "step": 27650 }, { "epoch": 5.21, "grad_norm": 11.715229034423828, "learning_rate": 9.58780350084698e-06, "loss": 0.6957, "step": 27660 }, { "epoch": 5.21, "grad_norm": 18.593791961669922, "learning_rate": 9.584039149256541e-06, "loss": 1.0855, "step": 27670 }, { "epoch": 5.21, "grad_norm": 3.3888206481933594, "learning_rate": 9.580274797666103e-06, "loss": 0.8838, "step": 27680 }, { "epoch": 5.21, "grad_norm": 7.081830978393555, "learning_rate": 9.576510446075664e-06, "loss": 0.7919, "step": 27690 }, { "epoch": 5.21, "grad_norm": 5.471766948699951, "learning_rate": 9.572746094485226e-06, "loss": 0.791, "step": 27700 }, { "epoch": 5.22, "grad_norm": 9.526850700378418, "learning_rate": 9.568981742894787e-06, "loss": 0.8642, "step": 27710 }, { "epoch": 5.22, "grad_norm": 20.183530807495117, "learning_rate": 9.565217391304349e-06, "loss": 0.947, "step": 27720 }, { "epoch": 5.22, "grad_norm": 14.78543758392334, "learning_rate": 9.56145303971391e-06, "loss": 1.0679, "step": 27730 }, { "epoch": 5.22, "grad_norm": 4.317257881164551, "learning_rate": 9.557688688123472e-06, 
"loss": 1.3319, "step": 27740 }, { "epoch": 5.22, "grad_norm": 4.291439056396484, "learning_rate": 9.553924336533033e-06, "loss": 0.7692, "step": 27750 }, { "epoch": 5.22, "grad_norm": 3.341729164123535, "learning_rate": 9.550159984942595e-06, "loss": 0.748, "step": 27760 }, { "epoch": 5.23, "grad_norm": 65.21473693847656, "learning_rate": 9.546395633352156e-06, "loss": 0.7597, "step": 27770 }, { "epoch": 5.23, "grad_norm": 11.0003662109375, "learning_rate": 9.542631281761717e-06, "loss": 0.8743, "step": 27780 }, { "epoch": 5.23, "grad_norm": 6.101351261138916, "learning_rate": 9.538866930171279e-06, "loss": 1.0004, "step": 27790 }, { "epoch": 5.23, "grad_norm": 6.741501808166504, "learning_rate": 9.53510257858084e-06, "loss": 0.6571, "step": 27800 }, { "epoch": 5.23, "grad_norm": 14.496847152709961, "learning_rate": 9.531338226990402e-06, "loss": 0.9468, "step": 27810 }, { "epoch": 5.24, "grad_norm": 5.572566986083984, "learning_rate": 9.527573875399963e-06, "loss": 1.1785, "step": 27820 }, { "epoch": 5.24, "grad_norm": 8.359467506408691, "learning_rate": 9.523809523809525e-06, "loss": 1.1988, "step": 27830 }, { "epoch": 5.24, "grad_norm": 3.1812174320220947, "learning_rate": 9.520045172219086e-06, "loss": 1.0454, "step": 27840 }, { "epoch": 5.24, "grad_norm": 3.727541208267212, "learning_rate": 9.516280820628648e-06, "loss": 1.1708, "step": 27850 }, { "epoch": 5.24, "grad_norm": 4.738743305206299, "learning_rate": 9.512516469038209e-06, "loss": 0.7484, "step": 27860 }, { "epoch": 5.25, "grad_norm": 2.8919122219085693, "learning_rate": 9.50875211744777e-06, "loss": 1.2647, "step": 27870 }, { "epoch": 5.25, "grad_norm": 10.391885757446289, "learning_rate": 9.504987765857332e-06, "loss": 0.8355, "step": 27880 }, { "epoch": 5.25, "grad_norm": 6.072714328765869, "learning_rate": 9.501223414266894e-06, "loss": 1.0003, "step": 27890 }, { "epoch": 5.25, "grad_norm": 18.254798889160156, "learning_rate": 9.497459062676455e-06, "loss": 1.1736, "step": 27900 }, { "epoch": 5.25, "grad_norm": 7.245409965515137, "learning_rate": 9.493694711086016e-06, "loss": 1.1235, "step": 27910 }, { "epoch": 5.26, "grad_norm": 3.408019542694092, "learning_rate": 9.489930359495578e-06, "loss": 0.8228, "step": 27920 }, { "epoch": 5.26, "grad_norm": 4.944334030151367, "learning_rate": 9.48616600790514e-06, "loss": 0.8352, "step": 27930 }, { "epoch": 5.26, "grad_norm": 11.32558536529541, "learning_rate": 9.4824016563147e-06, "loss": 0.838, "step": 27940 }, { "epoch": 5.26, "grad_norm": 8.638864517211914, "learning_rate": 9.478637304724262e-06, "loss": 0.8634, "step": 27950 }, { "epoch": 5.26, "grad_norm": 16.10377311706543, "learning_rate": 9.474872953133824e-06, "loss": 1.0847, "step": 27960 }, { "epoch": 5.26, "grad_norm": 4.015634059906006, "learning_rate": 9.471108601543385e-06, "loss": 0.8252, "step": 27970 }, { "epoch": 5.27, "grad_norm": 8.575149536132812, "learning_rate": 9.467344249952947e-06, "loss": 0.9009, "step": 27980 }, { "epoch": 5.27, "grad_norm": 3.329954147338867, "learning_rate": 9.463579898362508e-06, "loss": 0.6134, "step": 27990 }, { "epoch": 5.27, "grad_norm": 7.170989513397217, "learning_rate": 9.45981554677207e-06, "loss": 0.7319, "step": 28000 }, { "epoch": 5.27, "grad_norm": 14.477974891662598, "learning_rate": 9.456051195181631e-06, "loss": 1.1545, "step": 28010 }, { "epoch": 5.27, "grad_norm": 12.156949996948242, "learning_rate": 9.452286843591193e-06, "loss": 1.0464, "step": 28020 }, { "epoch": 5.28, "grad_norm": 9.985879898071289, "learning_rate": 9.448522492000754e-06, "loss": 1.3909, 
"step": 28030 }, { "epoch": 5.28, "grad_norm": 10.521660804748535, "learning_rate": 9.444758140410315e-06, "loss": 1.0714, "step": 28040 }, { "epoch": 5.28, "grad_norm": 3.53287410736084, "learning_rate": 9.440993788819877e-06, "loss": 0.9531, "step": 28050 }, { "epoch": 5.28, "grad_norm": 34.304256439208984, "learning_rate": 9.437229437229438e-06, "loss": 0.8852, "step": 28060 }, { "epoch": 5.28, "grad_norm": 6.207040309906006, "learning_rate": 9.433465085639e-06, "loss": 0.9303, "step": 28070 }, { "epoch": 5.29, "grad_norm": 16.0903377532959, "learning_rate": 9.429700734048561e-06, "loss": 1.0293, "step": 28080 }, { "epoch": 5.29, "grad_norm": 6.0098795890808105, "learning_rate": 9.425936382458121e-06, "loss": 1.2863, "step": 28090 }, { "epoch": 5.29, "grad_norm": 6.091872215270996, "learning_rate": 9.422172030867684e-06, "loss": 0.9243, "step": 28100 }, { "epoch": 5.29, "grad_norm": 12.428032875061035, "learning_rate": 9.418407679277246e-06, "loss": 1.3243, "step": 28110 }, { "epoch": 5.29, "grad_norm": 4.0772318840026855, "learning_rate": 9.414643327686807e-06, "loss": 0.6689, "step": 28120 }, { "epoch": 5.29, "grad_norm": 4.86169958114624, "learning_rate": 9.410878976096369e-06, "loss": 0.81, "step": 28130 }, { "epoch": 5.3, "grad_norm": 1.3692225217819214, "learning_rate": 9.40711462450593e-06, "loss": 0.8834, "step": 28140 }, { "epoch": 5.3, "grad_norm": 6.199678421020508, "learning_rate": 9.403350272915491e-06, "loss": 0.8518, "step": 28150 }, { "epoch": 5.3, "grad_norm": 16.723379135131836, "learning_rate": 9.399585921325053e-06, "loss": 1.2079, "step": 28160 }, { "epoch": 5.3, "grad_norm": 8.276581764221191, "learning_rate": 9.395821569734614e-06, "loss": 1.061, "step": 28170 }, { "epoch": 5.3, "grad_norm": 2.572300434112549, "learning_rate": 9.392057218144174e-06, "loss": 0.9677, "step": 28180 }, { "epoch": 5.31, "grad_norm": 5.367160320281982, "learning_rate": 9.388292866553737e-06, "loss": 0.769, "step": 28190 }, { "epoch": 5.31, "grad_norm": 5.248952388763428, "learning_rate": 9.384528514963299e-06, "loss": 0.8837, "step": 28200 }, { "epoch": 5.31, "grad_norm": 31.504255294799805, "learning_rate": 9.38076416337286e-06, "loss": 0.8916, "step": 28210 }, { "epoch": 5.31, "grad_norm": 7.04111909866333, "learning_rate": 9.376999811782422e-06, "loss": 0.919, "step": 28220 }, { "epoch": 5.31, "grad_norm": 37.66887283325195, "learning_rate": 9.373235460191983e-06, "loss": 0.8285, "step": 28230 }, { "epoch": 5.32, "grad_norm": 6.5409417152404785, "learning_rate": 9.369471108601545e-06, "loss": 0.9307, "step": 28240 }, { "epoch": 5.32, "grad_norm": 4.117661476135254, "learning_rate": 9.365706757011106e-06, "loss": 0.6511, "step": 28250 }, { "epoch": 5.32, "grad_norm": 21.092880249023438, "learning_rate": 9.361942405420668e-06, "loss": 1.5117, "step": 28260 }, { "epoch": 5.32, "grad_norm": 4.0568318367004395, "learning_rate": 9.358178053830227e-06, "loss": 0.7792, "step": 28270 }, { "epoch": 5.32, "grad_norm": 7.6530842781066895, "learning_rate": 9.354413702239789e-06, "loss": 0.7052, "step": 28280 }, { "epoch": 5.32, "grad_norm": 13.412463188171387, "learning_rate": 9.350649350649352e-06, "loss": 1.2288, "step": 28290 }, { "epoch": 5.33, "grad_norm": 7.002659797668457, "learning_rate": 9.346884999058913e-06, "loss": 0.9251, "step": 28300 }, { "epoch": 5.33, "grad_norm": 4.83797025680542, "learning_rate": 9.343120647468475e-06, "loss": 0.9053, "step": 28310 }, { "epoch": 5.33, "grad_norm": 16.18877601623535, "learning_rate": 9.339356295878036e-06, "loss": 1.2876, "step": 28320 }, { 
"epoch": 5.33, "grad_norm": 9.63300895690918, "learning_rate": 9.335591944287598e-06, "loss": 1.1485, "step": 28330 }, { "epoch": 5.33, "grad_norm": 8.405498504638672, "learning_rate": 9.33182759269716e-06, "loss": 0.858, "step": 28340 }, { "epoch": 5.34, "grad_norm": 13.015806198120117, "learning_rate": 9.32806324110672e-06, "loss": 0.8942, "step": 28350 }, { "epoch": 5.34, "grad_norm": 13.439905166625977, "learning_rate": 9.32429888951628e-06, "loss": 0.7757, "step": 28360 }, { "epoch": 5.34, "grad_norm": 46.8524055480957, "learning_rate": 9.320534537925842e-06, "loss": 0.9142, "step": 28370 }, { "epoch": 5.34, "grad_norm": 37.03493118286133, "learning_rate": 9.316770186335405e-06, "loss": 1.241, "step": 28380 }, { "epoch": 5.34, "grad_norm": 10.489031791687012, "learning_rate": 9.313005834744967e-06, "loss": 0.9837, "step": 28390 }, { "epoch": 5.35, "grad_norm": 7.557834148406982, "learning_rate": 9.309241483154528e-06, "loss": 1.0916, "step": 28400 }, { "epoch": 5.35, "grad_norm": 8.398204803466797, "learning_rate": 9.30547713156409e-06, "loss": 0.674, "step": 28410 }, { "epoch": 5.35, "grad_norm": 5.196258544921875, "learning_rate": 9.301712779973651e-06, "loss": 1.0452, "step": 28420 }, { "epoch": 5.35, "grad_norm": 25.11329460144043, "learning_rate": 9.297948428383212e-06, "loss": 1.1052, "step": 28430 }, { "epoch": 5.35, "grad_norm": 3.0304675102233887, "learning_rate": 9.294184076792772e-06, "loss": 0.8896, "step": 28440 }, { "epoch": 5.35, "grad_norm": 12.971097946166992, "learning_rate": 9.290419725202334e-06, "loss": 1.1059, "step": 28450 }, { "epoch": 5.36, "grad_norm": 7.9853668212890625, "learning_rate": 9.286655373611895e-06, "loss": 0.6155, "step": 28460 }, { "epoch": 5.36, "grad_norm": 11.388359069824219, "learning_rate": 9.282891022021458e-06, "loss": 0.9596, "step": 28470 }, { "epoch": 5.36, "grad_norm": 8.400618553161621, "learning_rate": 9.27912667043102e-06, "loss": 0.7879, "step": 28480 }, { "epoch": 5.36, "grad_norm": 8.280097961425781, "learning_rate": 9.275362318840581e-06, "loss": 0.9666, "step": 28490 }, { "epoch": 5.36, "grad_norm": 4.299639701843262, "learning_rate": 9.271597967250143e-06, "loss": 0.6447, "step": 28500 }, { "epoch": 5.37, "grad_norm": 5.530027389526367, "learning_rate": 9.267833615659704e-06, "loss": 0.7924, "step": 28510 }, { "epoch": 5.37, "grad_norm": 5.538214683532715, "learning_rate": 9.264069264069266e-06, "loss": 0.9461, "step": 28520 }, { "epoch": 5.37, "grad_norm": 18.258651733398438, "learning_rate": 9.260304912478825e-06, "loss": 1.0281, "step": 28530 }, { "epoch": 5.37, "grad_norm": 19.745840072631836, "learning_rate": 9.256540560888387e-06, "loss": 0.8826, "step": 28540 }, { "epoch": 5.37, "grad_norm": 13.694552421569824, "learning_rate": 9.252776209297948e-06, "loss": 1.1994, "step": 28550 }, { "epoch": 5.38, "grad_norm": 23.569223403930664, "learning_rate": 9.24901185770751e-06, "loss": 1.0469, "step": 28560 }, { "epoch": 5.38, "grad_norm": 5.491553783416748, "learning_rate": 9.245247506117073e-06, "loss": 1.0271, "step": 28570 }, { "epoch": 5.38, "grad_norm": 6.416265964508057, "learning_rate": 9.241483154526634e-06, "loss": 1.0173, "step": 28580 }, { "epoch": 5.38, "grad_norm": 9.00337028503418, "learning_rate": 9.237718802936196e-06, "loss": 0.8453, "step": 28590 }, { "epoch": 5.38, "grad_norm": 33.706539154052734, "learning_rate": 9.233954451345757e-06, "loss": 0.9671, "step": 28600 }, { "epoch": 5.38, "grad_norm": 11.606428146362305, "learning_rate": 9.230190099755319e-06, "loss": 0.8074, "step": 28610 }, { "epoch": 5.39, 
"grad_norm": 33.36208724975586, "learning_rate": 9.226425748164878e-06, "loss": 0.9912, "step": 28620 }, { "epoch": 5.39, "grad_norm": 14.858409881591797, "learning_rate": 9.22266139657444e-06, "loss": 1.1494, "step": 28630 }, { "epoch": 5.39, "grad_norm": 4.3761796951293945, "learning_rate": 9.218897044984001e-06, "loss": 0.6095, "step": 28640 }, { "epoch": 5.39, "grad_norm": 15.37989616394043, "learning_rate": 9.215132693393563e-06, "loss": 0.7637, "step": 28650 }, { "epoch": 5.39, "grad_norm": 22.02336883544922, "learning_rate": 9.211368341803126e-06, "loss": 0.9147, "step": 28660 }, { "epoch": 5.4, "grad_norm": 2.1736600399017334, "learning_rate": 9.207603990212687e-06, "loss": 0.7403, "step": 28670 }, { "epoch": 5.4, "grad_norm": 4.165881156921387, "learning_rate": 9.203839638622249e-06, "loss": 1.0021, "step": 28680 }, { "epoch": 5.4, "grad_norm": 8.381041526794434, "learning_rate": 9.20007528703181e-06, "loss": 0.986, "step": 28690 }, { "epoch": 5.4, "grad_norm": 8.75325870513916, "learning_rate": 9.19631093544137e-06, "loss": 0.7465, "step": 28700 }, { "epoch": 5.4, "grad_norm": 12.10628890991211, "learning_rate": 9.192546583850932e-06, "loss": 1.1348, "step": 28710 }, { "epoch": 5.41, "grad_norm": 5.588778495788574, "learning_rate": 9.188782232260493e-06, "loss": 0.9982, "step": 28720 }, { "epoch": 5.41, "grad_norm": 20.60218620300293, "learning_rate": 9.185017880670055e-06, "loss": 0.8728, "step": 28730 }, { "epoch": 5.41, "grad_norm": 4.909438133239746, "learning_rate": 9.181253529079616e-06, "loss": 0.9869, "step": 28740 }, { "epoch": 5.41, "grad_norm": 6.493514537811279, "learning_rate": 9.177489177489179e-06, "loss": 1.1888, "step": 28750 }, { "epoch": 5.41, "grad_norm": 12.462530136108398, "learning_rate": 9.17372482589874e-06, "loss": 0.8986, "step": 28760 }, { "epoch": 5.42, "grad_norm": 11.790112495422363, "learning_rate": 9.169960474308302e-06, "loss": 0.9239, "step": 28770 }, { "epoch": 5.42, "grad_norm": 21.728673934936523, "learning_rate": 9.166196122717864e-06, "loss": 1.0881, "step": 28780 }, { "epoch": 5.42, "grad_norm": 12.574946403503418, "learning_rate": 9.162431771127423e-06, "loss": 0.8977, "step": 28790 }, { "epoch": 5.42, "grad_norm": 1.9553234577178955, "learning_rate": 9.158667419536985e-06, "loss": 1.3669, "step": 28800 }, { "epoch": 5.42, "grad_norm": 13.493619918823242, "learning_rate": 9.154903067946546e-06, "loss": 1.1484, "step": 28810 }, { "epoch": 5.42, "grad_norm": 5.195600509643555, "learning_rate": 9.151138716356108e-06, "loss": 1.0762, "step": 28820 }, { "epoch": 5.43, "grad_norm": 12.32522964477539, "learning_rate": 9.147374364765669e-06, "loss": 0.9495, "step": 28830 }, { "epoch": 5.43, "grad_norm": 6.158605575561523, "learning_rate": 9.143610013175232e-06, "loss": 0.8816, "step": 28840 }, { "epoch": 5.43, "grad_norm": 12.90102767944336, "learning_rate": 9.139845661584794e-06, "loss": 0.7207, "step": 28850 }, { "epoch": 5.43, "grad_norm": 23.62394905090332, "learning_rate": 9.136081309994355e-06, "loss": 0.9825, "step": 28860 }, { "epoch": 5.43, "grad_norm": 17.158891677856445, "learning_rate": 9.132316958403917e-06, "loss": 1.015, "step": 28870 }, { "epoch": 5.44, "grad_norm": 18.659269332885742, "learning_rate": 9.128552606813476e-06, "loss": 0.8932, "step": 28880 }, { "epoch": 5.44, "grad_norm": 13.33729362487793, "learning_rate": 9.124788255223038e-06, "loss": 0.7327, "step": 28890 }, { "epoch": 5.44, "grad_norm": 3.1400554180145264, "learning_rate": 9.1210239036326e-06, "loss": 0.7296, "step": 28900 }, { "epoch": 5.44, "grad_norm": 
39.30720138549805, "learning_rate": 9.11725955204216e-06, "loss": 0.9596, "step": 28910 }, { "epoch": 5.44, "grad_norm": 11.591464042663574, "learning_rate": 9.113495200451722e-06, "loss": 1.3182, "step": 28920 }, { "epoch": 5.45, "grad_norm": 15.750186920166016, "learning_rate": 9.109730848861284e-06, "loss": 1.0812, "step": 28930 }, { "epoch": 5.45, "grad_norm": 22.184999465942383, "learning_rate": 9.105966497270847e-06, "loss": 1.1149, "step": 28940 }, { "epoch": 5.45, "grad_norm": 3.8057732582092285, "learning_rate": 9.102202145680408e-06, "loss": 1.0327, "step": 28950 }, { "epoch": 5.45, "grad_norm": 10.918521881103516, "learning_rate": 9.09843779408997e-06, "loss": 1.0136, "step": 28960 }, { "epoch": 5.45, "grad_norm": 8.937349319458008, "learning_rate": 9.09467344249953e-06, "loss": 1.0213, "step": 28970 }, { "epoch": 5.45, "grad_norm": 41.649662017822266, "learning_rate": 9.090909090909091e-06, "loss": 1.0811, "step": 28980 }, { "epoch": 5.46, "grad_norm": 37.86155700683594, "learning_rate": 9.087144739318652e-06, "loss": 1.1534, "step": 28990 }, { "epoch": 5.46, "grad_norm": 20.072498321533203, "learning_rate": 9.083380387728214e-06, "loss": 1.0236, "step": 29000 }, { "epoch": 5.46, "grad_norm": 20.86501121520996, "learning_rate": 9.079616036137775e-06, "loss": 0.8638, "step": 29010 }, { "epoch": 5.46, "grad_norm": 6.2412896156311035, "learning_rate": 9.075851684547337e-06, "loss": 0.7937, "step": 29020 }, { "epoch": 5.46, "grad_norm": 4.902469158172607, "learning_rate": 9.0720873329569e-06, "loss": 1.0052, "step": 29030 }, { "epoch": 5.47, "grad_norm": 17.87940788269043, "learning_rate": 9.068322981366461e-06, "loss": 0.7675, "step": 29040 }, { "epoch": 5.47, "grad_norm": 6.655585765838623, "learning_rate": 9.064558629776021e-06, "loss": 1.1088, "step": 29050 }, { "epoch": 5.47, "grad_norm": 9.94439697265625, "learning_rate": 9.060794278185583e-06, "loss": 0.9594, "step": 29060 }, { "epoch": 5.47, "grad_norm": 47.68941116333008, "learning_rate": 9.057029926595144e-06, "loss": 1.0776, "step": 29070 }, { "epoch": 5.47, "grad_norm": 16.939481735229492, "learning_rate": 9.053265575004706e-06, "loss": 0.9547, "step": 29080 }, { "epoch": 5.48, "grad_norm": 25.923072814941406, "learning_rate": 9.049501223414267e-06, "loss": 0.9966, "step": 29090 }, { "epoch": 5.48, "grad_norm": 13.021749496459961, "learning_rate": 9.045736871823829e-06, "loss": 0.7119, "step": 29100 }, { "epoch": 5.48, "grad_norm": 14.501338958740234, "learning_rate": 9.04197252023339e-06, "loss": 1.1393, "step": 29110 }, { "epoch": 5.48, "grad_norm": 92.26004791259766, "learning_rate": 9.038208168642953e-06, "loss": 1.1267, "step": 29120 }, { "epoch": 5.48, "grad_norm": 17.94881820678711, "learning_rate": 9.034443817052515e-06, "loss": 0.7753, "step": 29130 }, { "epoch": 5.48, "grad_norm": 17.52510643005371, "learning_rate": 9.030679465462074e-06, "loss": 0.9959, "step": 29140 }, { "epoch": 5.49, "grad_norm": 24.444133758544922, "learning_rate": 9.026915113871636e-06, "loss": 1.1808, "step": 29150 }, { "epoch": 5.49, "grad_norm": 9.542744636535645, "learning_rate": 9.023150762281197e-06, "loss": 0.9329, "step": 29160 }, { "epoch": 5.49, "grad_norm": 9.04818058013916, "learning_rate": 9.019386410690759e-06, "loss": 0.8142, "step": 29170 }, { "epoch": 5.49, "grad_norm": 2.120548725128174, "learning_rate": 9.01562205910032e-06, "loss": 0.5434, "step": 29180 }, { "epoch": 5.49, "grad_norm": 1.9490175247192383, "learning_rate": 9.011857707509882e-06, "loss": 0.7956, "step": 29190 }, { "epoch": 5.5, "grad_norm": 
39.55332946777344, "learning_rate": 9.008093355919443e-06, "loss": 1.1267, "step": 29200 }, { "epoch": 5.5, "grad_norm": 14.2628755569458, "learning_rate": 9.004329004329005e-06, "loss": 1.0506, "step": 29210 }, { "epoch": 5.5, "grad_norm": 6.581046104431152, "learning_rate": 9.000564652738568e-06, "loss": 1.0391, "step": 29220 }, { "epoch": 5.5, "grad_norm": 14.000588417053223, "learning_rate": 8.996800301148128e-06, "loss": 1.0122, "step": 29230 }, { "epoch": 5.5, "grad_norm": 11.082819938659668, "learning_rate": 8.993035949557689e-06, "loss": 0.8397, "step": 29240 }, { "epoch": 5.51, "grad_norm": 4.468993186950684, "learning_rate": 8.98927159796725e-06, "loss": 1.1224, "step": 29250 }, { "epoch": 5.51, "grad_norm": 3.9217376708984375, "learning_rate": 8.985507246376812e-06, "loss": 1.1921, "step": 29260 }, { "epoch": 5.51, "grad_norm": 10.662483215332031, "learning_rate": 8.981742894786373e-06, "loss": 0.8061, "step": 29270 }, { "epoch": 5.51, "grad_norm": 6.659120559692383, "learning_rate": 8.977978543195935e-06, "loss": 1.2479, "step": 29280 }, { "epoch": 5.51, "grad_norm": 21.427820205688477, "learning_rate": 8.974214191605496e-06, "loss": 0.8032, "step": 29290 }, { "epoch": 5.51, "grad_norm": 13.474923133850098, "learning_rate": 8.970449840015058e-06, "loss": 1.1119, "step": 29300 }, { "epoch": 5.52, "grad_norm": 12.508622169494629, "learning_rate": 8.96668548842462e-06, "loss": 0.6608, "step": 29310 }, { "epoch": 5.52, "grad_norm": 13.279670715332031, "learning_rate": 8.96292113683418e-06, "loss": 0.7437, "step": 29320 }, { "epoch": 5.52, "grad_norm": 9.320384979248047, "learning_rate": 8.959156785243742e-06, "loss": 1.0422, "step": 29330 }, { "epoch": 5.52, "grad_norm": 7.070023536682129, "learning_rate": 8.955392433653304e-06, "loss": 0.7838, "step": 29340 }, { "epoch": 5.52, "grad_norm": 3.0125250816345215, "learning_rate": 8.951628082062865e-06, "loss": 0.9059, "step": 29350 }, { "epoch": 5.53, "grad_norm": 9.702225685119629, "learning_rate": 8.947863730472427e-06, "loss": 0.9163, "step": 29360 }, { "epoch": 5.53, "grad_norm": 9.385030746459961, "learning_rate": 8.944099378881988e-06, "loss": 0.8988, "step": 29370 }, { "epoch": 5.53, "grad_norm": 5.907294750213623, "learning_rate": 8.94033502729155e-06, "loss": 0.6597, "step": 29380 }, { "epoch": 5.53, "grad_norm": 20.72801399230957, "learning_rate": 8.936570675701111e-06, "loss": 0.79, "step": 29390 }, { "epoch": 5.53, "grad_norm": 134.38417053222656, "learning_rate": 8.932806324110672e-06, "loss": 0.9157, "step": 29400 }, { "epoch": 5.54, "grad_norm": 4.289487838745117, "learning_rate": 8.929041972520234e-06, "loss": 1.021, "step": 29410 }, { "epoch": 5.54, "grad_norm": 15.529023170471191, "learning_rate": 8.925277620929795e-06, "loss": 0.7296, "step": 29420 }, { "epoch": 5.54, "grad_norm": 23.799545288085938, "learning_rate": 8.921513269339357e-06, "loss": 0.7523, "step": 29430 }, { "epoch": 5.54, "grad_norm": 7.799114227294922, "learning_rate": 8.917748917748918e-06, "loss": 0.9548, "step": 29440 }, { "epoch": 5.54, "grad_norm": 6.250319004058838, "learning_rate": 8.91398456615848e-06, "loss": 1.0914, "step": 29450 }, { "epoch": 5.54, "grad_norm": 4.887936115264893, "learning_rate": 8.910220214568041e-06, "loss": 1.0589, "step": 29460 }, { "epoch": 5.55, "grad_norm": 15.556137084960938, "learning_rate": 8.906455862977603e-06, "loss": 0.7501, "step": 29470 }, { "epoch": 5.55, "grad_norm": 4.8936448097229, "learning_rate": 8.902691511387164e-06, "loss": 0.9409, "step": 29480 }, { "epoch": 5.55, "grad_norm": 4.803304672241211, 
"learning_rate": 8.898927159796726e-06, "loss": 0.8649, "step": 29490 }, { "epoch": 5.55, "grad_norm": 1.804856538772583, "learning_rate": 8.895162808206287e-06, "loss": 0.7748, "step": 29500 }, { "epoch": 5.55, "grad_norm": 8.4596529006958, "learning_rate": 8.891398456615848e-06, "loss": 1.2872, "step": 29510 }, { "epoch": 5.56, "grad_norm": 17.373252868652344, "learning_rate": 8.88763410502541e-06, "loss": 1.1505, "step": 29520 }, { "epoch": 5.56, "grad_norm": 16.646570205688477, "learning_rate": 8.883869753434971e-06, "loss": 0.8761, "step": 29530 }, { "epoch": 5.56, "grad_norm": 11.048295974731445, "learning_rate": 8.880105401844533e-06, "loss": 0.8933, "step": 29540 }, { "epoch": 5.56, "grad_norm": 20.154624938964844, "learning_rate": 8.876341050254094e-06, "loss": 1.2233, "step": 29550 }, { "epoch": 5.56, "grad_norm": 15.49629020690918, "learning_rate": 8.872576698663656e-06, "loss": 0.9148, "step": 29560 }, { "epoch": 5.57, "grad_norm": 18.930164337158203, "learning_rate": 8.868812347073217e-06, "loss": 1.1451, "step": 29570 }, { "epoch": 5.57, "grad_norm": 21.803237915039062, "learning_rate": 8.865047995482779e-06, "loss": 1.1853, "step": 29580 }, { "epoch": 5.57, "grad_norm": 3.2622554302215576, "learning_rate": 8.86128364389234e-06, "loss": 0.5586, "step": 29590 }, { "epoch": 5.57, "grad_norm": 28.86388397216797, "learning_rate": 8.857519292301902e-06, "loss": 1.1107, "step": 29600 }, { "epoch": 5.57, "grad_norm": 2.6599478721618652, "learning_rate": 8.853754940711463e-06, "loss": 1.1612, "step": 29610 }, { "epoch": 5.58, "grad_norm": 8.663665771484375, "learning_rate": 8.849990589121025e-06, "loss": 0.9016, "step": 29620 }, { "epoch": 5.58, "grad_norm": 5.800599575042725, "learning_rate": 8.846226237530586e-06, "loss": 1.0564, "step": 29630 }, { "epoch": 5.58, "grad_norm": 33.331382751464844, "learning_rate": 8.842461885940147e-06, "loss": 1.0554, "step": 29640 }, { "epoch": 5.58, "grad_norm": 28.608020782470703, "learning_rate": 8.838697534349709e-06, "loss": 0.952, "step": 29650 }, { "epoch": 5.58, "grad_norm": 2.404825448989868, "learning_rate": 8.83493318275927e-06, "loss": 0.7676, "step": 29660 }, { "epoch": 5.58, "grad_norm": 21.339494705200195, "learning_rate": 8.831168831168832e-06, "loss": 0.686, "step": 29670 }, { "epoch": 5.59, "grad_norm": 8.47734260559082, "learning_rate": 8.827404479578393e-06, "loss": 1.087, "step": 29680 }, { "epoch": 5.59, "grad_norm": 2.428666830062866, "learning_rate": 8.823640127987955e-06, "loss": 0.7628, "step": 29690 }, { "epoch": 5.59, "grad_norm": 22.068326950073242, "learning_rate": 8.819875776397516e-06, "loss": 0.966, "step": 29700 }, { "epoch": 5.59, "grad_norm": 9.214179992675781, "learning_rate": 8.816111424807078e-06, "loss": 1.2109, "step": 29710 }, { "epoch": 5.59, "grad_norm": 54.18630599975586, "learning_rate": 8.812347073216639e-06, "loss": 0.9417, "step": 29720 }, { "epoch": 5.6, "grad_norm": 4.5401177406311035, "learning_rate": 8.8085827216262e-06, "loss": 1.062, "step": 29730 }, { "epoch": 5.6, "grad_norm": 18.38913345336914, "learning_rate": 8.804818370035762e-06, "loss": 1.0399, "step": 29740 }, { "epoch": 5.6, "grad_norm": 4.836206436157227, "learning_rate": 8.801054018445324e-06, "loss": 0.8188, "step": 29750 }, { "epoch": 5.6, "grad_norm": 4.160147666931152, "learning_rate": 8.797289666854885e-06, "loss": 1.0734, "step": 29760 }, { "epoch": 5.6, "grad_norm": 48.155029296875, "learning_rate": 8.793525315264446e-06, "loss": 0.906, "step": 29770 }, { "epoch": 5.61, "grad_norm": 9.932600975036621, "learning_rate": 
8.789760963674008e-06, "loss": 1.4372, "step": 29780 }, { "epoch": 5.61, "grad_norm": 16.927183151245117, "learning_rate": 8.78599661208357e-06, "loss": 0.8624, "step": 29790 }, { "epoch": 5.61, "grad_norm": 14.760272026062012, "learning_rate": 8.78223226049313e-06, "loss": 0.6661, "step": 29800 }, { "epoch": 5.61, "grad_norm": 17.276628494262695, "learning_rate": 8.778467908902692e-06, "loss": 0.9024, "step": 29810 }, { "epoch": 5.61, "grad_norm": 5.607086181640625, "learning_rate": 8.774703557312254e-06, "loss": 1.0903, "step": 29820 }, { "epoch": 5.61, "grad_norm": 39.1153564453125, "learning_rate": 8.770939205721815e-06, "loss": 0.9129, "step": 29830 }, { "epoch": 5.62, "grad_norm": 13.988188743591309, "learning_rate": 8.767174854131377e-06, "loss": 0.7199, "step": 29840 }, { "epoch": 5.62, "grad_norm": 4.17264461517334, "learning_rate": 8.763410502540938e-06, "loss": 0.9442, "step": 29850 }, { "epoch": 5.62, "grad_norm": 18.37892723083496, "learning_rate": 8.7596461509505e-06, "loss": 1.0808, "step": 29860 }, { "epoch": 5.62, "grad_norm": 12.660880088806152, "learning_rate": 8.755881799360061e-06, "loss": 0.8276, "step": 29870 }, { "epoch": 5.62, "grad_norm": 3.057593584060669, "learning_rate": 8.752117447769623e-06, "loss": 0.9181, "step": 29880 }, { "epoch": 5.63, "grad_norm": 1.3751906156539917, "learning_rate": 8.748353096179184e-06, "loss": 1.1312, "step": 29890 }, { "epoch": 5.63, "grad_norm": 12.612813949584961, "learning_rate": 8.744588744588745e-06, "loss": 0.9828, "step": 29900 }, { "epoch": 5.63, "grad_norm": 2.8939971923828125, "learning_rate": 8.740824392998307e-06, "loss": 0.908, "step": 29910 }, { "epoch": 5.63, "grad_norm": 1.7106744050979614, "learning_rate": 8.737060041407868e-06, "loss": 0.7676, "step": 29920 }, { "epoch": 5.63, "grad_norm": 12.564702987670898, "learning_rate": 8.73329568981743e-06, "loss": 0.7571, "step": 29930 }, { "epoch": 5.64, "grad_norm": 14.032854080200195, "learning_rate": 8.729531338226991e-06, "loss": 0.8814, "step": 29940 }, { "epoch": 5.64, "grad_norm": 11.228653907775879, "learning_rate": 8.725766986636553e-06, "loss": 1.1324, "step": 29950 }, { "epoch": 5.64, "grad_norm": 12.96795654296875, "learning_rate": 8.722002635046114e-06, "loss": 0.6425, "step": 29960 }, { "epoch": 5.64, "grad_norm": 4.7573089599609375, "learning_rate": 8.718238283455676e-06, "loss": 0.8978, "step": 29970 }, { "epoch": 5.64, "grad_norm": 26.95293617248535, "learning_rate": 8.714473931865237e-06, "loss": 1.0579, "step": 29980 }, { "epoch": 5.64, "grad_norm": 41.74863052368164, "learning_rate": 8.710709580274799e-06, "loss": 1.2386, "step": 29990 }, { "epoch": 5.65, "grad_norm": 4.445674896240234, "learning_rate": 8.70694522868436e-06, "loss": 0.7049, "step": 30000 }, { "epoch": 5.65, "grad_norm": 4.920813083648682, "learning_rate": 8.703180877093921e-06, "loss": 0.9574, "step": 30010 }, { "epoch": 5.65, "grad_norm": 2.760878562927246, "learning_rate": 8.699416525503483e-06, "loss": 0.4799, "step": 30020 }, { "epoch": 5.65, "grad_norm": 13.369872093200684, "learning_rate": 8.695652173913044e-06, "loss": 0.6481, "step": 30030 }, { "epoch": 5.65, "grad_norm": 6.600404739379883, "learning_rate": 8.691887822322606e-06, "loss": 0.9652, "step": 30040 }, { "epoch": 5.66, "grad_norm": 15.891260147094727, "learning_rate": 8.688123470732167e-06, "loss": 0.982, "step": 30050 }, { "epoch": 5.66, "grad_norm": 32.19712829589844, "learning_rate": 8.684359119141729e-06, "loss": 0.9189, "step": 30060 }, { "epoch": 5.66, "grad_norm": 43.56324005126953, "learning_rate": 
8.68059476755129e-06, "loss": 0.8433, "step": 30070 }, { "epoch": 5.66, "grad_norm": 32.96376037597656, "learning_rate": 8.676830415960852e-06, "loss": 1.1444, "step": 30080 }, { "epoch": 5.66, "grad_norm": 8.312973976135254, "learning_rate": 8.673066064370413e-06, "loss": 0.7938, "step": 30090 }, { "epoch": 5.67, "grad_norm": 6.109252452850342, "learning_rate": 8.669301712779975e-06, "loss": 0.9439, "step": 30100 }, { "epoch": 5.67, "grad_norm": 6.114505290985107, "learning_rate": 8.665537361189536e-06, "loss": 0.7573, "step": 30110 }, { "epoch": 5.67, "grad_norm": 13.92830753326416, "learning_rate": 8.661773009599098e-06, "loss": 0.7924, "step": 30120 }, { "epoch": 5.67, "grad_norm": 13.27717399597168, "learning_rate": 8.658008658008659e-06, "loss": 0.9396, "step": 30130 }, { "epoch": 5.67, "grad_norm": 10.957916259765625, "learning_rate": 8.65424430641822e-06, "loss": 0.5955, "step": 30140 }, { "epoch": 5.67, "grad_norm": 1.7774187326431274, "learning_rate": 8.650479954827782e-06, "loss": 0.5516, "step": 30150 }, { "epoch": 5.68, "grad_norm": 7.166529655456543, "learning_rate": 8.646715603237343e-06, "loss": 0.9568, "step": 30160 }, { "epoch": 5.68, "grad_norm": 7.256234645843506, "learning_rate": 8.642951251646905e-06, "loss": 0.8236, "step": 30170 }, { "epoch": 5.68, "grad_norm": 18.996091842651367, "learning_rate": 8.639186900056465e-06, "loss": 0.7431, "step": 30180 }, { "epoch": 5.68, "grad_norm": 16.654666900634766, "learning_rate": 8.635422548466028e-06, "loss": 0.8785, "step": 30190 }, { "epoch": 5.68, "grad_norm": 20.598628997802734, "learning_rate": 8.63165819687559e-06, "loss": 0.9685, "step": 30200 }, { "epoch": 5.69, "grad_norm": 42.501468658447266, "learning_rate": 8.62789384528515e-06, "loss": 0.7086, "step": 30210 }, { "epoch": 5.69, "grad_norm": 3.89968204498291, "learning_rate": 8.624129493694712e-06, "loss": 0.7695, "step": 30220 }, { "epoch": 5.69, "grad_norm": 9.280635833740234, "learning_rate": 8.620365142104274e-06, "loss": 0.6718, "step": 30230 }, { "epoch": 5.69, "grad_norm": 5.70449686050415, "learning_rate": 8.616600790513835e-06, "loss": 0.8694, "step": 30240 }, { "epoch": 5.69, "grad_norm": 24.63044548034668, "learning_rate": 8.612836438923397e-06, "loss": 0.8387, "step": 30250 }, { "epoch": 5.7, "grad_norm": 5.008859634399414, "learning_rate": 8.609072087332958e-06, "loss": 0.5237, "step": 30260 }, { "epoch": 5.7, "grad_norm": 10.31961441040039, "learning_rate": 8.605307735742518e-06, "loss": 0.7631, "step": 30270 }, { "epoch": 5.7, "grad_norm": 8.60684871673584, "learning_rate": 8.601543384152081e-06, "loss": 0.6888, "step": 30280 }, { "epoch": 5.7, "grad_norm": 13.582413673400879, "learning_rate": 8.597779032561642e-06, "loss": 0.8553, "step": 30290 }, { "epoch": 5.7, "grad_norm": 23.668649673461914, "learning_rate": 8.594014680971204e-06, "loss": 0.969, "step": 30300 }, { "epoch": 5.7, "grad_norm": 12.337814331054688, "learning_rate": 8.590250329380765e-06, "loss": 1.0902, "step": 30310 }, { "epoch": 5.71, "grad_norm": 5.430622100830078, "learning_rate": 8.586485977790327e-06, "loss": 0.8464, "step": 30320 }, { "epoch": 5.71, "grad_norm": 13.253424644470215, "learning_rate": 8.582721626199888e-06, "loss": 0.6974, "step": 30330 }, { "epoch": 5.71, "grad_norm": 6.752406597137451, "learning_rate": 8.57895727460945e-06, "loss": 0.6867, "step": 30340 }, { "epoch": 5.71, "grad_norm": 2.9843058586120605, "learning_rate": 8.575192923019011e-06, "loss": 0.747, "step": 30350 }, { "epoch": 5.71, "grad_norm": 6.5608062744140625, "learning_rate": 
8.571428571428571e-06, "loss": 0.5911, "step": 30360 }, { "epoch": 5.72, "grad_norm": 8.627592086791992, "learning_rate": 8.567664219838134e-06, "loss": 0.8262, "step": 30370 }, { "epoch": 5.72, "grad_norm": 15.797553062438965, "learning_rate": 8.563899868247696e-06, "loss": 0.8481, "step": 30380 }, { "epoch": 5.72, "grad_norm": 19.08051109313965, "learning_rate": 8.560135516657257e-06, "loss": 0.7966, "step": 30390 }, { "epoch": 5.72, "grad_norm": 2.340876340866089, "learning_rate": 8.556371165066818e-06, "loss": 1.0239, "step": 30400 }, { "epoch": 5.72, "grad_norm": 6.641857147216797, "learning_rate": 8.55260681347638e-06, "loss": 1.0529, "step": 30410 }, { "epoch": 5.73, "grad_norm": 7.574660301208496, "learning_rate": 8.548842461885941e-06, "loss": 1.0188, "step": 30420 }, { "epoch": 5.73, "grad_norm": 7.297388076782227, "learning_rate": 8.545078110295503e-06, "loss": 0.7809, "step": 30430 }, { "epoch": 5.73, "grad_norm": 8.178194046020508, "learning_rate": 8.541313758705064e-06, "loss": 0.7911, "step": 30440 }, { "epoch": 5.73, "grad_norm": 6.48317289352417, "learning_rate": 8.537549407114624e-06, "loss": 1.1025, "step": 30450 }, { "epoch": 5.73, "grad_norm": 12.021096229553223, "learning_rate": 8.533785055524186e-06, "loss": 1.1284, "step": 30460 }, { "epoch": 5.73, "grad_norm": 1.0553628206253052, "learning_rate": 8.530020703933749e-06, "loss": 0.815, "step": 30470 }, { "epoch": 5.74, "grad_norm": 18.57106590270996, "learning_rate": 8.52625635234331e-06, "loss": 0.9251, "step": 30480 }, { "epoch": 5.74, "grad_norm": 3.429990768432617, "learning_rate": 8.522492000752872e-06, "loss": 0.8475, "step": 30490 }, { "epoch": 5.74, "grad_norm": 24.884265899658203, "learning_rate": 8.518727649162433e-06, "loss": 0.8757, "step": 30500 }, { "epoch": 5.74, "grad_norm": 2.049884796142578, "learning_rate": 8.514963297571995e-06, "loss": 0.8337, "step": 30510 }, { "epoch": 5.74, "grad_norm": 19.036104202270508, "learning_rate": 8.511198945981556e-06, "loss": 0.8514, "step": 30520 }, { "epoch": 5.75, "grad_norm": 10.064069747924805, "learning_rate": 8.507434594391116e-06, "loss": 0.9903, "step": 30530 }, { "epoch": 5.75, "grad_norm": 2.785097599029541, "learning_rate": 8.503670242800677e-06, "loss": 0.7679, "step": 30540 }, { "epoch": 5.75, "grad_norm": 14.600317001342773, "learning_rate": 8.499905891210239e-06, "loss": 0.6578, "step": 30550 }, { "epoch": 5.75, "grad_norm": 35.62032699584961, "learning_rate": 8.496141539619802e-06, "loss": 0.925, "step": 30560 }, { "epoch": 5.75, "grad_norm": 13.970808029174805, "learning_rate": 8.492377188029363e-06, "loss": 0.7008, "step": 30570 }, { "epoch": 5.76, "grad_norm": 6.60103702545166, "learning_rate": 8.488612836438925e-06, "loss": 1.2066, "step": 30580 }, { "epoch": 5.76, "grad_norm": 16.42170524597168, "learning_rate": 8.484848484848486e-06, "loss": 0.7334, "step": 30590 }, { "epoch": 5.76, "grad_norm": 19.058320999145508, "learning_rate": 8.481084133258048e-06, "loss": 0.8098, "step": 30600 }, { "epoch": 5.76, "grad_norm": 6.231168270111084, "learning_rate": 8.477319781667609e-06, "loss": 1.0256, "step": 30610 }, { "epoch": 5.76, "grad_norm": 5.102482318878174, "learning_rate": 8.473555430077169e-06, "loss": 0.8856, "step": 30620 }, { "epoch": 5.77, "grad_norm": 15.688319206237793, "learning_rate": 8.46979107848673e-06, "loss": 1.1421, "step": 30630 }, { "epoch": 5.77, "grad_norm": 13.8787202835083, "learning_rate": 8.466026726896292e-06, "loss": 0.8882, "step": 30640 }, { "epoch": 5.77, "grad_norm": 17.177953720092773, "learning_rate": 
8.462262375305855e-06, "loss": 0.7688, "step": 30650 }, { "epoch": 5.77, "grad_norm": 20.290695190429688, "learning_rate": 8.458498023715416e-06, "loss": 0.8638, "step": 30660 }, { "epoch": 5.77, "grad_norm": 4.208349227905273, "learning_rate": 8.454733672124978e-06, "loss": 1.0099, "step": 30670 }, { "epoch": 5.77, "grad_norm": 24.063852310180664, "learning_rate": 8.45096932053454e-06, "loss": 0.5681, "step": 30680 }, { "epoch": 5.78, "grad_norm": 4.072475910186768, "learning_rate": 8.4472049689441e-06, "loss": 1.0082, "step": 30690 }, { "epoch": 5.78, "grad_norm": 3.7660579681396484, "learning_rate": 8.443440617353662e-06, "loss": 0.9331, "step": 30700 }, { "epoch": 5.78, "grad_norm": 3.538877487182617, "learning_rate": 8.439676265763222e-06, "loss": 0.7324, "step": 30710 }, { "epoch": 5.78, "grad_norm": 18.799739837646484, "learning_rate": 8.435911914172784e-06, "loss": 0.6764, "step": 30720 }, { "epoch": 5.78, "grad_norm": 14.212458610534668, "learning_rate": 8.432147562582345e-06, "loss": 0.9995, "step": 30730 }, { "epoch": 5.79, "grad_norm": 6.933103084564209, "learning_rate": 8.428383210991906e-06, "loss": 0.7807, "step": 30740 }, { "epoch": 5.79, "grad_norm": 9.192700386047363, "learning_rate": 8.42461885940147e-06, "loss": 0.916, "step": 30750 }, { "epoch": 5.79, "grad_norm": 32.47039794921875, "learning_rate": 8.420854507811031e-06, "loss": 0.9782, "step": 30760 }, { "epoch": 5.79, "grad_norm": 6.952606678009033, "learning_rate": 8.417090156220593e-06, "loss": 1.1702, "step": 30770 }, { "epoch": 5.79, "grad_norm": 15.746705055236816, "learning_rate": 8.413325804630154e-06, "loss": 0.7794, "step": 30780 }, { "epoch": 5.8, "grad_norm": 18.23080062866211, "learning_rate": 8.409561453039714e-06, "loss": 1.1291, "step": 30790 }, { "epoch": 5.8, "grad_norm": 2.102573871612549, "learning_rate": 8.405797101449275e-06, "loss": 0.5794, "step": 30800 }, { "epoch": 5.8, "grad_norm": 11.986711502075195, "learning_rate": 8.402032749858837e-06, "loss": 1.0847, "step": 30810 }, { "epoch": 5.8, "grad_norm": 5.476190090179443, "learning_rate": 8.398268398268398e-06, "loss": 0.8104, "step": 30820 }, { "epoch": 5.8, "grad_norm": 1.4963953495025635, "learning_rate": 8.39450404667796e-06, "loss": 1.2224, "step": 30830 }, { "epoch": 5.8, "grad_norm": 9.224945068359375, "learning_rate": 8.390739695087523e-06, "loss": 0.824, "step": 30840 }, { "epoch": 5.81, "grad_norm": 7.392148017883301, "learning_rate": 8.386975343497084e-06, "loss": 1.1233, "step": 30850 }, { "epoch": 5.81, "grad_norm": 3.570284128189087, "learning_rate": 8.383210991906646e-06, "loss": 1.1112, "step": 30860 }, { "epoch": 5.81, "grad_norm": 5.803598403930664, "learning_rate": 8.379446640316207e-06, "loss": 0.9001, "step": 30870 }, { "epoch": 5.81, "grad_norm": 25.192916870117188, "learning_rate": 8.375682288725767e-06, "loss": 0.688, "step": 30880 }, { "epoch": 5.81, "grad_norm": 5.229773044586182, "learning_rate": 8.371917937135328e-06, "loss": 0.6231, "step": 30890 }, { "epoch": 5.82, "grad_norm": 5.515371322631836, "learning_rate": 8.36815358554489e-06, "loss": 0.9919, "step": 30900 }, { "epoch": 5.82, "grad_norm": 20.2115535736084, "learning_rate": 8.364389233954451e-06, "loss": 0.5739, "step": 30910 }, { "epoch": 5.82, "grad_norm": 14.953691482543945, "learning_rate": 8.360624882364013e-06, "loss": 1.1375, "step": 30920 }, { "epoch": 5.82, "grad_norm": 20.632360458374023, "learning_rate": 8.356860530773576e-06, "loss": 1.0599, "step": 30930 }, { "epoch": 5.82, "grad_norm": 22.558162689208984, "learning_rate": 
8.353096179183137e-06, "loss": 0.9153, "step": 30940 }, { "epoch": 5.83, "grad_norm": 21.550914764404297, "learning_rate": 8.349331827592699e-06, "loss": 0.7017, "step": 30950 }, { "epoch": 5.83, "grad_norm": 36.31262969970703, "learning_rate": 8.34556747600226e-06, "loss": 0.8309, "step": 30960 }, { "epoch": 5.83, "grad_norm": 17.182716369628906, "learning_rate": 8.34180312441182e-06, "loss": 0.9958, "step": 30970 }, { "epoch": 5.83, "grad_norm": 4.573057651519775, "learning_rate": 8.338038772821381e-06, "loss": 0.9777, "step": 30980 }, { "epoch": 5.83, "grad_norm": 11.109490394592285, "learning_rate": 8.334274421230943e-06, "loss": 0.8027, "step": 30990 }, { "epoch": 5.83, "grad_norm": 8.800436019897461, "learning_rate": 8.330510069640504e-06, "loss": 1.0635, "step": 31000 }, { "epoch": 5.84, "grad_norm": 7.208930492401123, "learning_rate": 8.326745718050066e-06, "loss": 1.3517, "step": 31010 }, { "epoch": 5.84, "grad_norm": 37.75522994995117, "learning_rate": 8.322981366459629e-06, "loss": 0.8122, "step": 31020 }, { "epoch": 5.84, "grad_norm": 35.847415924072266, "learning_rate": 8.31921701486919e-06, "loss": 0.818, "step": 31030 }, { "epoch": 5.84, "grad_norm": 12.321288108825684, "learning_rate": 8.315452663278752e-06, "loss": 0.6911, "step": 31040 }, { "epoch": 5.84, "grad_norm": 11.846648216247559, "learning_rate": 8.311688311688313e-06, "loss": 0.7366, "step": 31050 }, { "epoch": 5.85, "grad_norm": 17.003767013549805, "learning_rate": 8.307923960097873e-06, "loss": 0.8506, "step": 31060 }, { "epoch": 5.85, "grad_norm": 17.710678100585938, "learning_rate": 8.304159608507435e-06, "loss": 0.9313, "step": 31070 }, { "epoch": 5.85, "grad_norm": 23.007640838623047, "learning_rate": 8.300395256916996e-06, "loss": 0.9082, "step": 31080 }, { "epoch": 5.85, "grad_norm": 9.039473533630371, "learning_rate": 8.296630905326558e-06, "loss": 0.7494, "step": 31090 }, { "epoch": 5.85, "grad_norm": 22.964630126953125, "learning_rate": 8.292866553736119e-06, "loss": 0.9792, "step": 31100 }, { "epoch": 5.86, "grad_norm": 10.491424560546875, "learning_rate": 8.28910220214568e-06, "loss": 1.113, "step": 31110 }, { "epoch": 5.86, "grad_norm": 2.0476365089416504, "learning_rate": 8.285337850555244e-06, "loss": 0.7404, "step": 31120 }, { "epoch": 5.86, "grad_norm": 8.222275733947754, "learning_rate": 8.281573498964805e-06, "loss": 1.0106, "step": 31130 }, { "epoch": 5.86, "grad_norm": 1.0441361665725708, "learning_rate": 8.277809147374365e-06, "loss": 1.0561, "step": 31140 }, { "epoch": 5.86, "grad_norm": 4.987740993499756, "learning_rate": 8.274044795783926e-06, "loss": 0.8179, "step": 31150 }, { "epoch": 5.86, "grad_norm": 10.642935752868652, "learning_rate": 8.270280444193488e-06, "loss": 0.5978, "step": 31160 }, { "epoch": 5.87, "grad_norm": 2.8146939277648926, "learning_rate": 8.26651609260305e-06, "loss": 0.8504, "step": 31170 }, { "epoch": 5.87, "grad_norm": 10.670562744140625, "learning_rate": 8.26275174101261e-06, "loss": 0.7798, "step": 31180 }, { "epoch": 5.87, "grad_norm": 3.2183420658111572, "learning_rate": 8.258987389422172e-06, "loss": 0.8998, "step": 31190 }, { "epoch": 5.87, "grad_norm": 17.901538848876953, "learning_rate": 8.255223037831734e-06, "loss": 0.6935, "step": 31200 }, { "epoch": 5.87, "grad_norm": 24.882827758789062, "learning_rate": 8.251458686241297e-06, "loss": 1.1362, "step": 31210 }, { "epoch": 5.88, "grad_norm": 1.8489365577697754, "learning_rate": 8.247694334650858e-06, "loss": 0.8552, "step": 31220 }, { "epoch": 5.88, "grad_norm": 26.055313110351562, "learning_rate": 
8.243929983060418e-06, "loss": 0.8264, "step": 31230 }, { "epoch": 5.88, "grad_norm": 30.86322021484375, "learning_rate": 8.24016563146998e-06, "loss": 1.0128, "step": 31240 }, { "epoch": 5.88, "grad_norm": 25.930686950683594, "learning_rate": 8.236401279879541e-06, "loss": 0.9956, "step": 31250 }, { "epoch": 5.88, "grad_norm": 9.143621444702148, "learning_rate": 8.232636928289102e-06, "loss": 0.6222, "step": 31260 }, { "epoch": 5.89, "grad_norm": 6.3026123046875, "learning_rate": 8.228872576698664e-06, "loss": 0.8651, "step": 31270 }, { "epoch": 5.89, "grad_norm": 16.093923568725586, "learning_rate": 8.225108225108225e-06, "loss": 0.9715, "step": 31280 }, { "epoch": 5.89, "grad_norm": 5.815495014190674, "learning_rate": 8.221343873517787e-06, "loss": 0.8773, "step": 31290 }, { "epoch": 5.89, "grad_norm": 1.4614043235778809, "learning_rate": 8.21757952192735e-06, "loss": 0.6845, "step": 31300 }, { "epoch": 5.89, "grad_norm": 9.334432601928711, "learning_rate": 8.213815170336911e-06, "loss": 0.6569, "step": 31310 }, { "epoch": 5.89, "grad_norm": 5.495462894439697, "learning_rate": 8.210050818746471e-06, "loss": 0.9539, "step": 31320 }, { "epoch": 5.9, "grad_norm": 7.815459728240967, "learning_rate": 8.206286467156033e-06, "loss": 0.8634, "step": 31330 }, { "epoch": 5.9, "grad_norm": 10.316014289855957, "learning_rate": 8.202522115565594e-06, "loss": 1.0669, "step": 31340 }, { "epoch": 5.9, "grad_norm": 9.539694786071777, "learning_rate": 8.198757763975156e-06, "loss": 0.894, "step": 31350 }, { "epoch": 5.9, "grad_norm": 17.459341049194336, "learning_rate": 8.194993412384717e-06, "loss": 0.8356, "step": 31360 }, { "epoch": 5.9, "grad_norm": 10.63845443725586, "learning_rate": 8.191229060794278e-06, "loss": 0.833, "step": 31370 }, { "epoch": 5.91, "grad_norm": 9.418939590454102, "learning_rate": 8.18746470920384e-06, "loss": 0.8877, "step": 31380 }, { "epoch": 5.91, "grad_norm": 6.853357791900635, "learning_rate": 8.183700357613401e-06, "loss": 1.0733, "step": 31390 }, { "epoch": 5.91, "grad_norm": 14.216197967529297, "learning_rate": 8.179936006022963e-06, "loss": 1.029, "step": 31400 }, { "epoch": 5.91, "grad_norm": 21.108821868896484, "learning_rate": 8.176171654432524e-06, "loss": 0.9707, "step": 31410 }, { "epoch": 5.91, "grad_norm": 6.926586151123047, "learning_rate": 8.172407302842086e-06, "loss": 0.9501, "step": 31420 }, { "epoch": 5.92, "grad_norm": 13.97275161743164, "learning_rate": 8.168642951251647e-06, "loss": 0.9879, "step": 31430 }, { "epoch": 5.92, "grad_norm": 13.5280179977417, "learning_rate": 8.164878599661209e-06, "loss": 0.7511, "step": 31440 }, { "epoch": 5.92, "grad_norm": 18.628765106201172, "learning_rate": 8.16111424807077e-06, "loss": 0.8676, "step": 31450 }, { "epoch": 5.92, "grad_norm": 15.630091667175293, "learning_rate": 8.157349896480332e-06, "loss": 1.2567, "step": 31460 }, { "epoch": 5.92, "grad_norm": 13.568726539611816, "learning_rate": 8.153585544889893e-06, "loss": 0.7142, "step": 31470 }, { "epoch": 5.93, "grad_norm": 4.635666847229004, "learning_rate": 8.149821193299455e-06, "loss": 1.0116, "step": 31480 }, { "epoch": 5.93, "grad_norm": 9.294036865234375, "learning_rate": 8.146056841709016e-06, "loss": 0.706, "step": 31490 }, { "epoch": 5.93, "grad_norm": 9.127870559692383, "learning_rate": 8.142292490118577e-06, "loss": 0.9532, "step": 31500 }, { "epoch": 5.93, "grad_norm": 20.695833206176758, "learning_rate": 8.138528138528139e-06, "loss": 0.7443, "step": 31510 }, { "epoch": 5.93, "grad_norm": 12.937740325927734, "learning_rate": 8.1347637869377e-06, 
"loss": 0.8935, "step": 31520 }, { "epoch": 5.93, "grad_norm": 6.708775043487549, "learning_rate": 8.130999435347262e-06, "loss": 0.5707, "step": 31530 }, { "epoch": 5.94, "grad_norm": 13.287261962890625, "learning_rate": 8.127235083756823e-06, "loss": 0.9112, "step": 31540 }, { "epoch": 5.94, "grad_norm": 13.261035919189453, "learning_rate": 8.123470732166385e-06, "loss": 0.7466, "step": 31550 }, { "epoch": 5.94, "grad_norm": 12.0918550491333, "learning_rate": 8.119706380575946e-06, "loss": 0.9712, "step": 31560 }, { "epoch": 5.94, "grad_norm": 23.26054573059082, "learning_rate": 8.115942028985508e-06, "loss": 0.5801, "step": 31570 }, { "epoch": 5.94, "grad_norm": 7.467716217041016, "learning_rate": 8.112177677395069e-06, "loss": 0.5842, "step": 31580 }, { "epoch": 5.95, "grad_norm": 22.294055938720703, "learning_rate": 8.10841332580463e-06, "loss": 0.8699, "step": 31590 }, { "epoch": 5.95, "grad_norm": 11.526472091674805, "learning_rate": 8.104648974214192e-06, "loss": 0.7589, "step": 31600 }, { "epoch": 5.95, "grad_norm": 31.875795364379883, "learning_rate": 8.100884622623754e-06, "loss": 0.89, "step": 31610 }, { "epoch": 5.95, "grad_norm": 3.5950570106506348, "learning_rate": 8.097120271033315e-06, "loss": 0.6014, "step": 31620 }, { "epoch": 5.95, "grad_norm": 59.06367874145508, "learning_rate": 8.093355919442876e-06, "loss": 0.8202, "step": 31630 }, { "epoch": 5.96, "grad_norm": 25.463605880737305, "learning_rate": 8.089591567852438e-06, "loss": 1.0135, "step": 31640 }, { "epoch": 5.96, "grad_norm": 21.07535743713379, "learning_rate": 8.085827216262e-06, "loss": 0.6141, "step": 31650 }, { "epoch": 5.96, "grad_norm": 18.520116806030273, "learning_rate": 8.08206286467156e-06, "loss": 0.7665, "step": 31660 }, { "epoch": 5.96, "grad_norm": 4.14178466796875, "learning_rate": 8.078298513081122e-06, "loss": 0.8915, "step": 31670 }, { "epoch": 5.96, "grad_norm": 8.283469200134277, "learning_rate": 8.074534161490684e-06, "loss": 0.7185, "step": 31680 }, { "epoch": 5.96, "grad_norm": 34.855308532714844, "learning_rate": 8.070769809900245e-06, "loss": 0.8499, "step": 31690 }, { "epoch": 5.97, "grad_norm": 6.6889848709106445, "learning_rate": 8.067005458309807e-06, "loss": 0.7434, "step": 31700 }, { "epoch": 5.97, "grad_norm": 38.26859664916992, "learning_rate": 8.063241106719368e-06, "loss": 0.774, "step": 31710 }, { "epoch": 5.97, "grad_norm": 12.017799377441406, "learning_rate": 8.05947675512893e-06, "loss": 0.8339, "step": 31720 }, { "epoch": 5.97, "grad_norm": 35.268775939941406, "learning_rate": 8.055712403538491e-06, "loss": 0.7681, "step": 31730 }, { "epoch": 5.97, "grad_norm": 10.519451141357422, "learning_rate": 8.051948051948052e-06, "loss": 1.15, "step": 31740 }, { "epoch": 5.98, "grad_norm": 12.947084426879883, "learning_rate": 8.048183700357614e-06, "loss": 0.9001, "step": 31750 }, { "epoch": 5.98, "grad_norm": 4.643189907073975, "learning_rate": 8.044419348767175e-06, "loss": 0.7547, "step": 31760 }, { "epoch": 5.98, "grad_norm": 14.288684844970703, "learning_rate": 8.040654997176737e-06, "loss": 0.7049, "step": 31770 }, { "epoch": 5.98, "grad_norm": 20.607770919799805, "learning_rate": 8.036890645586298e-06, "loss": 1.0844, "step": 31780 }, { "epoch": 5.98, "grad_norm": 5.297301292419434, "learning_rate": 8.03312629399586e-06, "loss": 0.9687, "step": 31790 }, { "epoch": 5.99, "grad_norm": 4.064742565155029, "learning_rate": 8.029361942405421e-06, "loss": 0.8523, "step": 31800 }, { "epoch": 5.99, "grad_norm": 19.675914764404297, "learning_rate": 8.025597590814983e-06, "loss": 
1.0783, "step": 31810 }, { "epoch": 5.99, "grad_norm": 14.964875221252441, "learning_rate": 8.021833239224544e-06, "loss": 0.8001, "step": 31820 }, { "epoch": 5.99, "grad_norm": 2.3197381496429443, "learning_rate": 8.018068887634106e-06, "loss": 0.6272, "step": 31830 }, { "epoch": 5.99, "grad_norm": 14.406858444213867, "learning_rate": 8.014304536043667e-06, "loss": 0.8508, "step": 31840 }, { "epoch": 5.99, "grad_norm": 8.911713600158691, "learning_rate": 8.010540184453229e-06, "loss": 0.9284, "step": 31850 }, { "epoch": 6.0, "grad_norm": 6.384344577789307, "learning_rate": 8.00677583286279e-06, "loss": 0.9696, "step": 31860 }, { "epoch": 6.0, "grad_norm": 12.764810562133789, "learning_rate": 8.003011481272351e-06, "loss": 0.8495, "step": 31870 }, { "epoch": 6.0, "eval_accuracy": 0.8013333333333333, "eval_loss": 0.9912445545196533, "eval_runtime": 31.4388, "eval_samples_per_second": 238.558, "eval_steps_per_second": 29.836, "step": 31878 }, { "epoch": 6.0, "grad_norm": 6.118091583251953, "learning_rate": 7.999247129681913e-06, "loss": 0.6397, "step": 31880 }, { "epoch": 6.0, "grad_norm": 12.596017837524414, "learning_rate": 7.995482778091474e-06, "loss": 0.6942, "step": 31890 }, { "epoch": 6.0, "grad_norm": 17.227420806884766, "learning_rate": 7.991718426501036e-06, "loss": 0.8776, "step": 31900 }, { "epoch": 6.01, "grad_norm": 6.788822174072266, "learning_rate": 7.987954074910597e-06, "loss": 0.6966, "step": 31910 }, { "epoch": 6.01, "grad_norm": 5.093673229217529, "learning_rate": 7.984189723320159e-06, "loss": 0.5451, "step": 31920 }, { "epoch": 6.01, "grad_norm": 3.4659838676452637, "learning_rate": 7.98042537172972e-06, "loss": 1.015, "step": 31930 }, { "epoch": 6.01, "grad_norm": 5.289698600769043, "learning_rate": 7.976661020139282e-06, "loss": 0.8071, "step": 31940 }, { "epoch": 6.01, "grad_norm": 6.656088829040527, "learning_rate": 7.972896668548843e-06, "loss": 0.8808, "step": 31950 }, { "epoch": 6.02, "grad_norm": 13.209814071655273, "learning_rate": 7.969132316958405e-06, "loss": 0.7275, "step": 31960 }, { "epoch": 6.02, "grad_norm": 13.236181259155273, "learning_rate": 7.965367965367966e-06, "loss": 0.5999, "step": 31970 }, { "epoch": 6.02, "grad_norm": 1.4408715963363647, "learning_rate": 7.961603613777528e-06, "loss": 0.5384, "step": 31980 }, { "epoch": 6.02, "grad_norm": 2.034662961959839, "learning_rate": 7.957839262187089e-06, "loss": 0.7545, "step": 31990 }, { "epoch": 6.02, "grad_norm": 2.5053718090057373, "learning_rate": 7.95407491059665e-06, "loss": 0.5524, "step": 32000 }, { "epoch": 6.02, "grad_norm": 40.72920608520508, "learning_rate": 7.950310559006212e-06, "loss": 0.747, "step": 32010 }, { "epoch": 6.03, "grad_norm": 14.14037036895752, "learning_rate": 7.946546207415773e-06, "loss": 0.8803, "step": 32020 }, { "epoch": 6.03, "grad_norm": 1.51949942111969, "learning_rate": 7.942781855825335e-06, "loss": 0.5928, "step": 32030 }, { "epoch": 6.03, "grad_norm": 12.499205589294434, "learning_rate": 7.939017504234896e-06, "loss": 0.9736, "step": 32040 }, { "epoch": 6.03, "grad_norm": 5.166198253631592, "learning_rate": 7.935253152644458e-06, "loss": 0.8084, "step": 32050 }, { "epoch": 6.03, "grad_norm": 14.663975715637207, "learning_rate": 7.93148880105402e-06, "loss": 0.8102, "step": 32060 }, { "epoch": 6.04, "grad_norm": 11.02549934387207, "learning_rate": 7.92772444946358e-06, "loss": 0.821, "step": 32070 }, { "epoch": 6.04, "grad_norm": 31.12770652770996, "learning_rate": 7.923960097873142e-06, "loss": 0.7475, "step": 32080 }, { "epoch": 6.04, "grad_norm": 
9.413935661315918, "learning_rate": 7.920195746282704e-06, "loss": 0.817, "step": 32090 }, { "epoch": 6.04, "grad_norm": 2.7146778106689453, "learning_rate": 7.916431394692265e-06, "loss": 0.6634, "step": 32100 }, { "epoch": 6.04, "grad_norm": 2.7076473236083984, "learning_rate": 7.912667043101827e-06, "loss": 0.3906, "step": 32110 }, { "epoch": 6.05, "grad_norm": 2.5345466136932373, "learning_rate": 7.908902691511388e-06, "loss": 0.6356, "step": 32120 }, { "epoch": 6.05, "grad_norm": 8.628089904785156, "learning_rate": 7.90513833992095e-06, "loss": 0.9719, "step": 32130 }, { "epoch": 6.05, "grad_norm": 7.373205661773682, "learning_rate": 7.901373988330511e-06, "loss": 0.7137, "step": 32140 }, { "epoch": 6.05, "grad_norm": 13.761917114257812, "learning_rate": 7.897609636740072e-06, "loss": 0.7898, "step": 32150 }, { "epoch": 6.05, "grad_norm": 3.045355796813965, "learning_rate": 7.893845285149634e-06, "loss": 0.8793, "step": 32160 }, { "epoch": 6.05, "grad_norm": 2.679121971130371, "learning_rate": 7.890080933559195e-06, "loss": 0.7518, "step": 32170 }, { "epoch": 6.06, "grad_norm": 2.645170211791992, "learning_rate": 7.886316581968757e-06, "loss": 0.7894, "step": 32180 }, { "epoch": 6.06, "grad_norm": 8.539621353149414, "learning_rate": 7.882552230378318e-06, "loss": 0.6749, "step": 32190 }, { "epoch": 6.06, "grad_norm": 12.0453519821167, "learning_rate": 7.87878787878788e-06, "loss": 0.7961, "step": 32200 }, { "epoch": 6.06, "grad_norm": 13.905264854431152, "learning_rate": 7.875023527197441e-06, "loss": 0.6052, "step": 32210 }, { "epoch": 6.06, "grad_norm": 12.50597858428955, "learning_rate": 7.871259175607003e-06, "loss": 0.8935, "step": 32220 }, { "epoch": 6.07, "grad_norm": 3.413496494293213, "learning_rate": 7.867494824016564e-06, "loss": 0.8768, "step": 32230 }, { "epoch": 6.07, "grad_norm": 6.290199279785156, "learning_rate": 7.863730472426126e-06, "loss": 0.8751, "step": 32240 }, { "epoch": 6.07, "grad_norm": 46.341224670410156, "learning_rate": 7.859966120835687e-06, "loss": 0.8089, "step": 32250 }, { "epoch": 6.07, "grad_norm": 5.2493157386779785, "learning_rate": 7.856201769245248e-06, "loss": 0.7939, "step": 32260 }, { "epoch": 6.07, "grad_norm": 4.613568305969238, "learning_rate": 7.852437417654808e-06, "loss": 0.8498, "step": 32270 }, { "epoch": 6.08, "grad_norm": 9.366703033447266, "learning_rate": 7.848673066064371e-06, "loss": 1.1275, "step": 32280 }, { "epoch": 6.08, "grad_norm": 9.067091941833496, "learning_rate": 7.844908714473933e-06, "loss": 0.8313, "step": 32290 }, { "epoch": 6.08, "grad_norm": 5.4753336906433105, "learning_rate": 7.841144362883494e-06, "loss": 0.8506, "step": 32300 }, { "epoch": 6.08, "grad_norm": 7.397542953491211, "learning_rate": 7.837380011293056e-06, "loss": 0.895, "step": 32310 }, { "epoch": 6.08, "grad_norm": 3.176846742630005, "learning_rate": 7.833615659702617e-06, "loss": 0.7508, "step": 32320 }, { "epoch": 6.09, "grad_norm": 21.120325088500977, "learning_rate": 7.829851308112179e-06, "loss": 0.7373, "step": 32330 }, { "epoch": 6.09, "grad_norm": 17.073131561279297, "learning_rate": 7.82608695652174e-06, "loss": 0.6827, "step": 32340 }, { "epoch": 6.09, "grad_norm": 4.612349033355713, "learning_rate": 7.822322604931302e-06, "loss": 0.81, "step": 32350 }, { "epoch": 6.09, "grad_norm": 17.579679489135742, "learning_rate": 7.818558253340861e-06, "loss": 0.668, "step": 32360 }, { "epoch": 6.09, "grad_norm": 1.1515507698059082, "learning_rate": 7.814793901750425e-06, "loss": 0.7238, "step": 32370 }, { "epoch": 6.09, "grad_norm": 
53.772796630859375, "learning_rate": 7.811029550159986e-06, "loss": 0.8021, "step": 32380 }, { "epoch": 6.1, "grad_norm": 17.886808395385742, "learning_rate": 7.807265198569547e-06, "loss": 1.393, "step": 32390 }, { "epoch": 6.1, "grad_norm": 8.951563835144043, "learning_rate": 7.803500846979109e-06, "loss": 0.9395, "step": 32400 }, { "epoch": 6.1, "grad_norm": 33.0211067199707, "learning_rate": 7.79973649538867e-06, "loss": 1.1258, "step": 32410 }, { "epoch": 6.1, "grad_norm": 29.95659065246582, "learning_rate": 7.795972143798232e-06, "loss": 0.8725, "step": 32420 }, { "epoch": 6.1, "grad_norm": 2.5250978469848633, "learning_rate": 7.792207792207793e-06, "loss": 0.7194, "step": 32430 }, { "epoch": 6.11, "grad_norm": 5.767477035522461, "learning_rate": 7.788443440617355e-06, "loss": 0.8909, "step": 32440 }, { "epoch": 6.11, "grad_norm": 15.346549987792969, "learning_rate": 7.784679089026915e-06, "loss": 0.876, "step": 32450 }, { "epoch": 6.11, "grad_norm": 14.86963939666748, "learning_rate": 7.780914737436478e-06, "loss": 1.1234, "step": 32460 }, { "epoch": 6.11, "grad_norm": 3.0311222076416016, "learning_rate": 7.777150385846039e-06, "loss": 0.7522, "step": 32470 }, { "epoch": 6.11, "grad_norm": 7.837195873260498, "learning_rate": 7.7733860342556e-06, "loss": 1.0877, "step": 32480 }, { "epoch": 6.12, "grad_norm": 9.32373046875, "learning_rate": 7.769621682665162e-06, "loss": 0.9013, "step": 32490 }, { "epoch": 6.12, "grad_norm": 9.522368431091309, "learning_rate": 7.765857331074724e-06, "loss": 1.1081, "step": 32500 }, { "epoch": 6.12, "grad_norm": 2.0274200439453125, "learning_rate": 7.762092979484285e-06, "loss": 0.5695, "step": 32510 }, { "epoch": 6.12, "grad_norm": 8.976987838745117, "learning_rate": 7.758328627893846e-06, "loss": 0.744, "step": 32520 }, { "epoch": 6.12, "grad_norm": 2.851728677749634, "learning_rate": 7.754564276303408e-06, "loss": 0.6272, "step": 32530 }, { "epoch": 6.12, "grad_norm": 11.374859809875488, "learning_rate": 7.750799924712968e-06, "loss": 1.0324, "step": 32540 }, { "epoch": 6.13, "grad_norm": 15.698226928710938, "learning_rate": 7.747035573122529e-06, "loss": 0.8921, "step": 32550 }, { "epoch": 6.13, "grad_norm": 6.426815032958984, "learning_rate": 7.743271221532092e-06, "loss": 0.5966, "step": 32560 }, { "epoch": 6.13, "grad_norm": 10.463001251220703, "learning_rate": 7.739506869941654e-06, "loss": 0.8489, "step": 32570 }, { "epoch": 6.13, "grad_norm": 4.617766857147217, "learning_rate": 7.735742518351215e-06, "loss": 1.0442, "step": 32580 }, { "epoch": 6.13, "grad_norm": 22.950586318969727, "learning_rate": 7.731978166760777e-06, "loss": 0.7456, "step": 32590 }, { "epoch": 6.14, "grad_norm": 11.816163063049316, "learning_rate": 7.728213815170338e-06, "loss": 0.7198, "step": 32600 }, { "epoch": 6.14, "grad_norm": 14.70573616027832, "learning_rate": 7.7244494635799e-06, "loss": 0.7867, "step": 32610 }, { "epoch": 6.14, "grad_norm": 11.66057300567627, "learning_rate": 7.72068511198946e-06, "loss": 0.957, "step": 32620 }, { "epoch": 6.14, "grad_norm": 6.881016731262207, "learning_rate": 7.71692076039902e-06, "loss": 0.9179, "step": 32630 }, { "epoch": 6.14, "grad_norm": 19.991317749023438, "learning_rate": 7.713156408808582e-06, "loss": 0.9691, "step": 32640 }, { "epoch": 6.15, "grad_norm": 8.383686065673828, "learning_rate": 7.709392057218145e-06, "loss": 0.8217, "step": 32650 }, { "epoch": 6.15, "grad_norm": 5.7311530113220215, "learning_rate": 7.705627705627707e-06, "loss": 0.7062, "step": 32660 }, { "epoch": 6.15, "grad_norm": 3.8115508556365967, 
"learning_rate": 7.701863354037268e-06, "loss": 0.4817, "step": 32670 }, { "epoch": 6.15, "grad_norm": 3.9426708221435547, "learning_rate": 7.69809900244683e-06, "loss": 0.8462, "step": 32680 }, { "epoch": 6.15, "grad_norm": 33.78925323486328, "learning_rate": 7.694334650856391e-06, "loss": 0.5349, "step": 32690 }, { "epoch": 6.15, "grad_norm": 7.819505214691162, "learning_rate": 7.690570299265953e-06, "loss": 0.5434, "step": 32700 }, { "epoch": 6.16, "grad_norm": 8.800677299499512, "learning_rate": 7.686805947675512e-06, "loss": 0.7228, "step": 32710 }, { "epoch": 6.16, "grad_norm": 5.073202610015869, "learning_rate": 7.683041596085074e-06, "loss": 0.612, "step": 32720 }, { "epoch": 6.16, "grad_norm": 6.545333385467529, "learning_rate": 7.679277244494635e-06, "loss": 1.0068, "step": 32730 }, { "epoch": 6.16, "grad_norm": 53.91558837890625, "learning_rate": 7.675512892904199e-06, "loss": 0.9128, "step": 32740 }, { "epoch": 6.16, "grad_norm": 7.556985378265381, "learning_rate": 7.67174854131376e-06, "loss": 0.5871, "step": 32750 }, { "epoch": 6.17, "grad_norm": 1.9080955982208252, "learning_rate": 7.667984189723321e-06, "loss": 0.9009, "step": 32760 }, { "epoch": 6.17, "grad_norm": 12.842206954956055, "learning_rate": 7.664219838132883e-06, "loss": 1.0738, "step": 32770 }, { "epoch": 6.17, "grad_norm": 2.137928009033203, "learning_rate": 7.660455486542444e-06, "loss": 0.8939, "step": 32780 }, { "epoch": 6.17, "grad_norm": 8.372478485107422, "learning_rate": 7.656691134952006e-06, "loss": 0.8707, "step": 32790 }, { "epoch": 6.17, "grad_norm": 10.824322700500488, "learning_rate": 7.652926783361566e-06, "loss": 0.6836, "step": 32800 }, { "epoch": 6.18, "grad_norm": 8.920574188232422, "learning_rate": 7.649162431771127e-06, "loss": 0.6873, "step": 32810 }, { "epoch": 6.18, "grad_norm": 4.61783504486084, "learning_rate": 7.645398080180689e-06, "loss": 0.5019, "step": 32820 }, { "epoch": 6.18, "grad_norm": 16.41558074951172, "learning_rate": 7.641633728590252e-06, "loss": 0.7826, "step": 32830 }, { "epoch": 6.18, "grad_norm": 9.693421363830566, "learning_rate": 7.637869376999813e-06, "loss": 0.6326, "step": 32840 }, { "epoch": 6.18, "grad_norm": 7.832780838012695, "learning_rate": 7.634105025409375e-06, "loss": 0.84, "step": 32850 }, { "epoch": 6.18, "grad_norm": 7.532310485839844, "learning_rate": 7.630340673818936e-06, "loss": 0.7853, "step": 32860 }, { "epoch": 6.19, "grad_norm": 6.998384952545166, "learning_rate": 7.6265763222284976e-06, "loss": 0.9399, "step": 32870 }, { "epoch": 6.19, "grad_norm": 1.538866400718689, "learning_rate": 7.622811970638057e-06, "loss": 0.8232, "step": 32880 }, { "epoch": 6.19, "grad_norm": 16.18455696105957, "learning_rate": 7.61904761904762e-06, "loss": 1.0424, "step": 32890 }, { "epoch": 6.19, "grad_norm": 8.764405250549316, "learning_rate": 7.615283267457181e-06, "loss": 0.6476, "step": 32900 }, { "epoch": 6.19, "grad_norm": 1.879504680633545, "learning_rate": 7.6115189158667426e-06, "loss": 0.542, "step": 32910 }, { "epoch": 6.2, "grad_norm": 9.971755027770996, "learning_rate": 7.607754564276304e-06, "loss": 0.6875, "step": 32920 }, { "epoch": 6.2, "grad_norm": 26.730527877807617, "learning_rate": 7.6039902126858655e-06, "loss": 0.5063, "step": 32930 }, { "epoch": 6.2, "grad_norm": 5.536253929138184, "learning_rate": 7.600225861095427e-06, "loss": 0.7509, "step": 32940 }, { "epoch": 6.2, "grad_norm": 8.032835960388184, "learning_rate": 7.596461509504988e-06, "loss": 0.4974, "step": 32950 }, { "epoch": 6.2, "grad_norm": 27.142648696899414, "learning_rate": 
7.592697157914551e-06, "loss": 0.6667, "step": 32960 }, { "epoch": 6.21, "grad_norm": 5.252595901489258, "learning_rate": 7.5889328063241105e-06, "loss": 0.8755, "step": 32970 }, { "epoch": 6.21, "grad_norm": 8.61584758758545, "learning_rate": 7.585168454733673e-06, "loss": 0.9335, "step": 32980 }, { "epoch": 6.21, "grad_norm": 6.747732162475586, "learning_rate": 7.581404103143234e-06, "loss": 0.8599, "step": 32990 }, { "epoch": 6.21, "grad_norm": 16.04298210144043, "learning_rate": 7.577639751552796e-06, "loss": 0.6968, "step": 33000 }, { "epoch": 6.21, "grad_norm": 14.462750434875488, "learning_rate": 7.573875399962357e-06, "loss": 0.6962, "step": 33010 }, { "epoch": 6.21, "grad_norm": 25.075611114501953, "learning_rate": 7.570111048371919e-06, "loss": 0.9758, "step": 33020 }, { "epoch": 6.22, "grad_norm": 3.5379230976104736, "learning_rate": 7.56634669678148e-06, "loss": 0.7615, "step": 33030 }, { "epoch": 6.22, "grad_norm": 15.697576522827148, "learning_rate": 7.5625823451910415e-06, "loss": 0.9249, "step": 33040 }, { "epoch": 6.22, "grad_norm": 11.27122974395752, "learning_rate": 7.558817993600603e-06, "loss": 0.9837, "step": 33050 }, { "epoch": 6.22, "grad_norm": 12.916740417480469, "learning_rate": 7.555053642010164e-06, "loss": 0.953, "step": 33060 }, { "epoch": 6.22, "grad_norm": 3.3558743000030518, "learning_rate": 7.551289290419725e-06, "loss": 0.6457, "step": 33070 }, { "epoch": 6.23, "grad_norm": 9.100872039794922, "learning_rate": 7.547524938829287e-06, "loss": 0.9039, "step": 33080 }, { "epoch": 6.23, "grad_norm": 10.188567161560059, "learning_rate": 7.543760587238849e-06, "loss": 0.8944, "step": 33090 }, { "epoch": 6.23, "grad_norm": 6.688163757324219, "learning_rate": 7.53999623564841e-06, "loss": 1.4012, "step": 33100 }, { "epoch": 6.23, "grad_norm": 4.41121768951416, "learning_rate": 7.536231884057972e-06, "loss": 0.5033, "step": 33110 }, { "epoch": 6.23, "grad_norm": 29.808561325073242, "learning_rate": 7.532467532467533e-06, "loss": 0.8158, "step": 33120 }, { "epoch": 6.24, "grad_norm": 23.9070987701416, "learning_rate": 7.528703180877095e-06, "loss": 0.8762, "step": 33130 }, { "epoch": 6.24, "grad_norm": 4.423242092132568, "learning_rate": 7.524938829286656e-06, "loss": 0.6123, "step": 33140 }, { "epoch": 6.24, "grad_norm": 25.49211311340332, "learning_rate": 7.521174477696217e-06, "loss": 1.1418, "step": 33150 }, { "epoch": 6.24, "grad_norm": 3.092045307159424, "learning_rate": 7.517410126105778e-06, "loss": 0.7526, "step": 33160 }, { "epoch": 6.24, "grad_norm": 1.7638049125671387, "learning_rate": 7.5136457745153405e-06, "loss": 0.5793, "step": 33170 }, { "epoch": 6.25, "grad_norm": 17.533737182617188, "learning_rate": 7.509881422924902e-06, "loss": 0.8243, "step": 33180 }, { "epoch": 6.25, "grad_norm": 7.42354154586792, "learning_rate": 7.5061170713344635e-06, "loss": 0.7998, "step": 33190 }, { "epoch": 6.25, "grad_norm": 15.324810028076172, "learning_rate": 7.502352719744025e-06, "loss": 0.9309, "step": 33200 }, { "epoch": 6.25, "grad_norm": 5.703739166259766, "learning_rate": 7.498588368153586e-06, "loss": 1.0881, "step": 33210 }, { "epoch": 6.25, "grad_norm": 22.95489501953125, "learning_rate": 7.494824016563148e-06, "loss": 0.7584, "step": 33220 }, { "epoch": 6.25, "grad_norm": 12.995726585388184, "learning_rate": 7.4910596649727084e-06, "loss": 0.8978, "step": 33230 }, { "epoch": 6.26, "grad_norm": 26.64305877685547, "learning_rate": 7.48729531338227e-06, "loss": 0.9737, "step": 33240 }, { "epoch": 6.26, "grad_norm": 8.07895565032959, "learning_rate": 
7.483530961791831e-06, "loss": 0.9985, "step": 33250 }, { "epoch": 6.26, "grad_norm": 16.623964309692383, "learning_rate": 7.479766610201394e-06, "loss": 0.9786, "step": 33260 }, { "epoch": 6.26, "grad_norm": 6.8113298416137695, "learning_rate": 7.476002258610955e-06, "loss": 0.8197, "step": 33270 }, { "epoch": 6.26, "grad_norm": 13.354057312011719, "learning_rate": 7.472237907020517e-06, "loss": 0.6549, "step": 33280 }, { "epoch": 6.27, "grad_norm": 5.6670050621032715, "learning_rate": 7.468473555430078e-06, "loss": 0.7754, "step": 33290 }, { "epoch": 6.27, "grad_norm": 14.521883964538574, "learning_rate": 7.4647092038396395e-06, "loss": 0.8289, "step": 33300 }, { "epoch": 6.27, "grad_norm": 7.521157264709473, "learning_rate": 7.460944852249201e-06, "loss": 1.1892, "step": 33310 }, { "epoch": 6.27, "grad_norm": 13.898011207580566, "learning_rate": 7.457180500658762e-06, "loss": 0.5729, "step": 33320 }, { "epoch": 6.27, "grad_norm": 3.4292047023773193, "learning_rate": 7.453416149068323e-06, "loss": 0.9698, "step": 33330 }, { "epoch": 6.28, "grad_norm": 35.013946533203125, "learning_rate": 7.4496517974778845e-06, "loss": 0.7972, "step": 33340 }, { "epoch": 6.28, "grad_norm": 29.406295776367188, "learning_rate": 7.445887445887446e-06, "loss": 0.9631, "step": 33350 }, { "epoch": 6.28, "grad_norm": 11.900074005126953, "learning_rate": 7.442123094297008e-06, "loss": 0.6521, "step": 33360 }, { "epoch": 6.28, "grad_norm": 4.234182357788086, "learning_rate": 7.43835874270657e-06, "loss": 0.9445, "step": 33370 }, { "epoch": 6.28, "grad_norm": 16.268115997314453, "learning_rate": 7.434594391116131e-06, "loss": 0.5853, "step": 33380 }, { "epoch": 6.28, "grad_norm": 19.451705932617188, "learning_rate": 7.430830039525693e-06, "loss": 0.6399, "step": 33390 }, { "epoch": 6.29, "grad_norm": 11.32101058959961, "learning_rate": 7.427065687935254e-06, "loss": 0.4453, "step": 33400 }, { "epoch": 6.29, "grad_norm": 17.87284278869629, "learning_rate": 7.423301336344815e-06, "loss": 0.8561, "step": 33410 }, { "epoch": 6.29, "grad_norm": 21.079120635986328, "learning_rate": 7.419536984754376e-06, "loss": 0.6528, "step": 33420 }, { "epoch": 6.29, "grad_norm": 44.592472076416016, "learning_rate": 7.415772633163938e-06, "loss": 0.8556, "step": 33430 }, { "epoch": 6.29, "grad_norm": 15.032529830932617, "learning_rate": 7.412008281573499e-06, "loss": 0.6889, "step": 33440 }, { "epoch": 6.3, "grad_norm": 17.71792984008789, "learning_rate": 7.4082439299830614e-06, "loss": 0.7922, "step": 33450 }, { "epoch": 6.3, "grad_norm": 22.400226593017578, "learning_rate": 7.404479578392623e-06, "loss": 1.04, "step": 33460 }, { "epoch": 6.3, "grad_norm": 5.065873622894287, "learning_rate": 7.400715226802184e-06, "loss": 0.8546, "step": 33470 }, { "epoch": 6.3, "grad_norm": 10.704484939575195, "learning_rate": 7.396950875211746e-06, "loss": 1.2714, "step": 33480 }, { "epoch": 6.3, "grad_norm": 11.929192543029785, "learning_rate": 7.3931865236213064e-06, "loss": 0.725, "step": 33490 }, { "epoch": 6.31, "grad_norm": 2.1099143028259277, "learning_rate": 7.389422172030868e-06, "loss": 0.598, "step": 33500 }, { "epoch": 6.31, "grad_norm": 19.941347122192383, "learning_rate": 7.385657820440429e-06, "loss": 0.5998, "step": 33510 }, { "epoch": 6.31, "grad_norm": 6.242794990539551, "learning_rate": 7.381893468849991e-06, "loss": 0.8276, "step": 33520 }, { "epoch": 6.31, "grad_norm": 4.363724708557129, "learning_rate": 7.378129117259552e-06, "loss": 0.7789, "step": 33530 }, { "epoch": 6.31, "grad_norm": 3.405797004699707, "learning_rate": 
7.3743647656691146e-06, "loss": 0.7004, "step": 33540 }, { "epoch": 6.31, "grad_norm": 2.3336541652679443, "learning_rate": 7.370600414078676e-06, "loss": 0.4551, "step": 33550 }, { "epoch": 6.32, "grad_norm": 11.329400062561035, "learning_rate": 7.3668360624882375e-06, "loss": 0.8562, "step": 33560 }, { "epoch": 6.32, "grad_norm": 12.661884307861328, "learning_rate": 7.363071710897799e-06, "loss": 0.6823, "step": 33570 }, { "epoch": 6.32, "grad_norm": 6.4658203125, "learning_rate": 7.3593073593073596e-06, "loss": 0.6475, "step": 33580 }, { "epoch": 6.32, "grad_norm": 25.622526168823242, "learning_rate": 7.355543007716921e-06, "loss": 0.8339, "step": 33590 }, { "epoch": 6.32, "grad_norm": 11.46175765991211, "learning_rate": 7.3517786561264825e-06, "loss": 0.6584, "step": 33600 }, { "epoch": 6.33, "grad_norm": 8.618928909301758, "learning_rate": 7.348014304536044e-06, "loss": 0.6289, "step": 33610 }, { "epoch": 6.33, "grad_norm": 11.549042701721191, "learning_rate": 7.344249952945605e-06, "loss": 0.7908, "step": 33620 }, { "epoch": 6.33, "grad_norm": 7.479549407958984, "learning_rate": 7.340485601355168e-06, "loss": 1.3735, "step": 33630 }, { "epoch": 6.33, "grad_norm": 7.9505157470703125, "learning_rate": 7.336721249764729e-06, "loss": 0.7044, "step": 33640 }, { "epoch": 6.33, "grad_norm": 9.027472496032715, "learning_rate": 7.332956898174291e-06, "loss": 0.632, "step": 33650 }, { "epoch": 6.34, "grad_norm": 37.35455322265625, "learning_rate": 7.329192546583852e-06, "loss": 0.8402, "step": 33660 }, { "epoch": 6.34, "grad_norm": 15.627532005310059, "learning_rate": 7.325428194993413e-06, "loss": 0.9261, "step": 33670 }, { "epoch": 6.34, "grad_norm": 13.872254371643066, "learning_rate": 7.321663843402974e-06, "loss": 1.0513, "step": 33680 }, { "epoch": 6.34, "grad_norm": 4.954541206359863, "learning_rate": 7.317899491812536e-06, "loss": 0.6506, "step": 33690 }, { "epoch": 6.34, "grad_norm": 7.372968673706055, "learning_rate": 7.314135140222097e-06, "loss": 0.6775, "step": 33700 }, { "epoch": 6.34, "grad_norm": 14.645179748535156, "learning_rate": 7.3103707886316586e-06, "loss": 0.9196, "step": 33710 }, { "epoch": 6.35, "grad_norm": 12.736454010009766, "learning_rate": 7.30660643704122e-06, "loss": 0.948, "step": 33720 }, { "epoch": 6.35, "grad_norm": 11.034810066223145, "learning_rate": 7.302842085450782e-06, "loss": 0.8617, "step": 33730 }, { "epoch": 6.35, "grad_norm": 9.58061695098877, "learning_rate": 7.299077733860344e-06, "loss": 0.8462, "step": 33740 }, { "epoch": 6.35, "grad_norm": 24.281789779663086, "learning_rate": 7.295313382269905e-06, "loss": 0.6847, "step": 33750 }, { "epoch": 6.35, "grad_norm": 23.29878807067871, "learning_rate": 7.291549030679466e-06, "loss": 0.8824, "step": 33760 }, { "epoch": 6.36, "grad_norm": 3.8827896118164062, "learning_rate": 7.287784679089027e-06, "loss": 0.7027, "step": 33770 }, { "epoch": 6.36, "grad_norm": 10.058822631835938, "learning_rate": 7.284020327498589e-06, "loss": 0.7138, "step": 33780 }, { "epoch": 6.36, "grad_norm": 12.02177619934082, "learning_rate": 7.28025597590815e-06, "loss": 0.8802, "step": 33790 }, { "epoch": 6.36, "grad_norm": 0.6440919041633606, "learning_rate": 7.276491624317712e-06, "loss": 0.5824, "step": 33800 }, { "epoch": 6.36, "grad_norm": 2.162614583969116, "learning_rate": 7.272727272727273e-06, "loss": 0.7499, "step": 33810 }, { "epoch": 6.37, "grad_norm": 4.987521648406982, "learning_rate": 7.2689629211368355e-06, "loss": 0.4733, "step": 33820 }, { "epoch": 6.37, "grad_norm": 23.41518211364746, "learning_rate": 
7.265198569546397e-06, "loss": 0.9376, "step": 33830 }, { "epoch": 6.37, "grad_norm": 5.6797661781311035, "learning_rate": 7.2614342179559576e-06, "loss": 0.8508, "step": 33840 }, { "epoch": 6.37, "grad_norm": 7.098048210144043, "learning_rate": 7.257669866365519e-06, "loss": 0.9689, "step": 33850 }, { "epoch": 6.37, "grad_norm": 9.661799430847168, "learning_rate": 7.2539055147750805e-06, "loss": 0.7559, "step": 33860 }, { "epoch": 6.37, "grad_norm": 3.4965991973876953, "learning_rate": 7.250141163184642e-06, "loss": 0.6866, "step": 33870 }, { "epoch": 6.38, "grad_norm": 4.093890190124512, "learning_rate": 7.246376811594203e-06, "loss": 0.6322, "step": 33880 }, { "epoch": 6.38, "grad_norm": 7.37570333480835, "learning_rate": 7.242612460003765e-06, "loss": 0.582, "step": 33890 }, { "epoch": 6.38, "grad_norm": 24.41130828857422, "learning_rate": 7.238848108413326e-06, "loss": 1.0237, "step": 33900 }, { "epoch": 6.38, "grad_norm": 5.686967372894287, "learning_rate": 7.235083756822889e-06, "loss": 1.1212, "step": 33910 }, { "epoch": 6.38, "grad_norm": 2.334634780883789, "learning_rate": 7.23131940523245e-06, "loss": 0.6719, "step": 33920 }, { "epoch": 6.39, "grad_norm": 0.6370408535003662, "learning_rate": 7.22755505364201e-06, "loss": 0.8992, "step": 33930 }, { "epoch": 6.39, "grad_norm": 11.492918968200684, "learning_rate": 7.223790702051572e-06, "loss": 0.8724, "step": 33940 }, { "epoch": 6.39, "grad_norm": 13.137578010559082, "learning_rate": 7.220026350461134e-06, "loss": 0.5183, "step": 33950 }, { "epoch": 6.39, "grad_norm": 28.319067001342773, "learning_rate": 7.216261998870695e-06, "loss": 0.6859, "step": 33960 }, { "epoch": 6.39, "grad_norm": 22.97620964050293, "learning_rate": 7.2124976472802565e-06, "loss": 0.8225, "step": 33970 }, { "epoch": 6.4, "grad_norm": 8.105890274047852, "learning_rate": 7.208733295689818e-06, "loss": 1.1563, "step": 33980 }, { "epoch": 6.4, "grad_norm": 3.732881784439087, "learning_rate": 7.2049689440993795e-06, "loss": 0.9299, "step": 33990 }, { "epoch": 6.4, "grad_norm": 18.762136459350586, "learning_rate": 7.201204592508941e-06, "loss": 0.8705, "step": 34000 }, { "epoch": 6.4, "grad_norm": 8.987242698669434, "learning_rate": 7.197440240918503e-06, "loss": 0.7163, "step": 34010 }, { "epoch": 6.4, "grad_norm": 6.1951584815979, "learning_rate": 7.193675889328063e-06, "loss": 0.7687, "step": 34020 }, { "epoch": 6.41, "grad_norm": 16.723796844482422, "learning_rate": 7.189911537737625e-06, "loss": 0.6585, "step": 34030 }, { "epoch": 6.41, "grad_norm": 5.307747840881348, "learning_rate": 7.186147186147187e-06, "loss": 1.1221, "step": 34040 }, { "epoch": 6.41, "grad_norm": 11.021411895751953, "learning_rate": 7.182382834556748e-06, "loss": 0.8796, "step": 34050 }, { "epoch": 6.41, "grad_norm": 14.764350891113281, "learning_rate": 7.17861848296631e-06, "loss": 0.668, "step": 34060 }, { "epoch": 6.41, "grad_norm": 15.550354957580566, "learning_rate": 7.174854131375871e-06, "loss": 0.7307, "step": 34070 }, { "epoch": 6.41, "grad_norm": 2.5523645877838135, "learning_rate": 7.171089779785433e-06, "loss": 0.8271, "step": 34080 }, { "epoch": 6.42, "grad_norm": 16.490161895751953, "learning_rate": 7.167325428194994e-06, "loss": 0.8716, "step": 34090 }, { "epoch": 6.42, "grad_norm": 20.62291717529297, "learning_rate": 7.163561076604555e-06, "loss": 0.6183, "step": 34100 }, { "epoch": 6.42, "grad_norm": 11.556937217712402, "learning_rate": 7.159796725014116e-06, "loss": 1.0226, "step": 34110 }, { "epoch": 6.42, "grad_norm": 57.205543518066406, "learning_rate": 
7.1560323734236784e-06, "loss": 0.8058, "step": 34120 }, { "epoch": 6.42, "grad_norm": 10.213240623474121, "learning_rate": 7.15226802183324e-06, "loss": 0.8245, "step": 34130 }, { "epoch": 6.43, "grad_norm": 25.320945739746094, "learning_rate": 7.148503670242801e-06, "loss": 0.863, "step": 34140 }, { "epoch": 6.43, "grad_norm": 13.521397590637207, "learning_rate": 7.144739318652363e-06, "loss": 0.6966, "step": 34150 }, { "epoch": 6.43, "grad_norm": 8.726988792419434, "learning_rate": 7.140974967061924e-06, "loss": 0.9455, "step": 34160 }, { "epoch": 6.43, "grad_norm": 15.485198020935059, "learning_rate": 7.137210615471486e-06, "loss": 1.0589, "step": 34170 }, { "epoch": 6.43, "grad_norm": 34.06157684326172, "learning_rate": 7.133446263881047e-06, "loss": 0.72, "step": 34180 }, { "epoch": 6.44, "grad_norm": 12.81176471710205, "learning_rate": 7.129681912290608e-06, "loss": 0.7719, "step": 34190 }, { "epoch": 6.44, "grad_norm": 6.955991744995117, "learning_rate": 7.125917560700169e-06, "loss": 0.8225, "step": 34200 }, { "epoch": 6.44, "grad_norm": 18.789396286010742, "learning_rate": 7.122153209109732e-06, "loss": 0.7484, "step": 34210 }, { "epoch": 6.44, "grad_norm": 9.216533660888672, "learning_rate": 7.118388857519293e-06, "loss": 0.7314, "step": 34220 }, { "epoch": 6.44, "grad_norm": 43.313621520996094, "learning_rate": 7.1146245059288545e-06, "loss": 0.783, "step": 34230 }, { "epoch": 6.44, "grad_norm": 9.710837364196777, "learning_rate": 7.110860154338416e-06, "loss": 0.7583, "step": 34240 }, { "epoch": 6.45, "grad_norm": 5.346651077270508, "learning_rate": 7.1070958027479774e-06, "loss": 0.7073, "step": 34250 }, { "epoch": 6.45, "grad_norm": 6.974880695343018, "learning_rate": 7.103331451157539e-06, "loss": 0.7958, "step": 34260 }, { "epoch": 6.45, "grad_norm": 2.4056460857391357, "learning_rate": 7.0995670995671e-06, "loss": 0.792, "step": 34270 }, { "epoch": 6.45, "grad_norm": 22.695493698120117, "learning_rate": 7.095802747976661e-06, "loss": 1.192, "step": 34280 }, { "epoch": 6.45, "grad_norm": 4.072471618652344, "learning_rate": 7.0920383963862224e-06, "loss": 0.6443, "step": 34290 }, { "epoch": 6.46, "grad_norm": 1.699610710144043, "learning_rate": 7.088274044795784e-06, "loss": 0.5296, "step": 34300 }, { "epoch": 6.46, "grad_norm": 1.1842429637908936, "learning_rate": 7.084509693205346e-06, "loss": 0.8826, "step": 34310 }, { "epoch": 6.46, "grad_norm": 1.5796277523040771, "learning_rate": 7.080745341614908e-06, "loss": 0.7551, "step": 34320 }, { "epoch": 6.46, "grad_norm": 16.923315048217773, "learning_rate": 7.076980990024469e-06, "loss": 0.7083, "step": 34330 }, { "epoch": 6.46, "grad_norm": 12.883339881896973, "learning_rate": 7.073216638434031e-06, "loss": 1.0438, "step": 34340 }, { "epoch": 6.47, "grad_norm": 28.366357803344727, "learning_rate": 7.069452286843592e-06, "loss": 0.8644, "step": 34350 }, { "epoch": 6.47, "grad_norm": 12.738807678222656, "learning_rate": 7.0656879352531535e-06, "loss": 0.498, "step": 34360 }, { "epoch": 6.47, "grad_norm": 15.73867130279541, "learning_rate": 7.061923583662714e-06, "loss": 1.1679, "step": 34370 }, { "epoch": 6.47, "grad_norm": 1.2987840175628662, "learning_rate": 7.058159232072276e-06, "loss": 0.6273, "step": 34380 }, { "epoch": 6.47, "grad_norm": 18.019638061523438, "learning_rate": 7.054394880481837e-06, "loss": 0.8369, "step": 34390 }, { "epoch": 6.47, "grad_norm": 9.604355812072754, "learning_rate": 7.050630528891399e-06, "loss": 0.8712, "step": 34400 }, { "epoch": 6.48, "grad_norm": 5.3548665046691895, "learning_rate": 
7.046866177300961e-06, "loss": 0.7213, "step": 34410 }, { "epoch": 6.48, "grad_norm": 16.364185333251953, "learning_rate": 7.043101825710522e-06, "loss": 0.5826, "step": 34420 }, { "epoch": 6.48, "grad_norm": 13.313765525817871, "learning_rate": 7.039337474120084e-06, "loss": 0.8858, "step": 34430 }, { "epoch": 6.48, "grad_norm": 2.5230929851531982, "learning_rate": 7.035573122529645e-06, "loss": 0.4465, "step": 34440 }, { "epoch": 6.48, "grad_norm": 10.118013381958008, "learning_rate": 7.031808770939206e-06, "loss": 0.7316, "step": 34450 }, { "epoch": 6.49, "grad_norm": 2.5970287322998047, "learning_rate": 7.028044419348767e-06, "loss": 0.4302, "step": 34460 }, { "epoch": 6.49, "grad_norm": 6.057499408721924, "learning_rate": 7.024280067758329e-06, "loss": 0.5125, "step": 34470 }, { "epoch": 6.49, "grad_norm": 3.107283115386963, "learning_rate": 7.02051571616789e-06, "loss": 0.9787, "step": 34480 }, { "epoch": 6.49, "grad_norm": 4.885765552520752, "learning_rate": 7.0167513645774525e-06, "loss": 0.723, "step": 34490 }, { "epoch": 6.49, "grad_norm": 21.82842254638672, "learning_rate": 7.012987012987014e-06, "loss": 0.8655, "step": 34500 }, { "epoch": 6.5, "grad_norm": 11.863201141357422, "learning_rate": 7.009222661396575e-06, "loss": 0.5622, "step": 34510 }, { "epoch": 6.5, "grad_norm": 0.7491320967674255, "learning_rate": 7.005458309806137e-06, "loss": 0.8844, "step": 34520 }, { "epoch": 6.5, "grad_norm": 31.913429260253906, "learning_rate": 7.001693958215698e-06, "loss": 0.8962, "step": 34530 }, { "epoch": 6.5, "grad_norm": 32.60471725463867, "learning_rate": 6.997929606625259e-06, "loss": 0.9156, "step": 34540 }, { "epoch": 6.5, "grad_norm": 4.1954450607299805, "learning_rate": 6.99416525503482e-06, "loss": 0.7518, "step": 34550 }, { "epoch": 6.5, "grad_norm": 12.784632682800293, "learning_rate": 6.990400903444382e-06, "loss": 0.6618, "step": 34560 }, { "epoch": 6.51, "grad_norm": 4.339486122131348, "learning_rate": 6.986636551853943e-06, "loss": 0.7702, "step": 34570 }, { "epoch": 6.51, "grad_norm": 8.27316951751709, "learning_rate": 6.982872200263505e-06, "loss": 0.4823, "step": 34580 }, { "epoch": 6.51, "grad_norm": 4.726656436920166, "learning_rate": 6.979107848673067e-06, "loss": 0.8156, "step": 34590 }, { "epoch": 6.51, "grad_norm": 12.390050888061523, "learning_rate": 6.9753434970826286e-06, "loss": 0.6449, "step": 34600 }, { "epoch": 6.51, "grad_norm": 9.194417953491211, "learning_rate": 6.97157914549219e-06, "loss": 0.5351, "step": 34610 }, { "epoch": 6.52, "grad_norm": 7.054128170013428, "learning_rate": 6.9678147939017515e-06, "loss": 0.9092, "step": 34620 }, { "epoch": 6.52, "grad_norm": 36.081966400146484, "learning_rate": 6.964050442311312e-06, "loss": 0.6922, "step": 34630 }, { "epoch": 6.52, "grad_norm": 4.445969104766846, "learning_rate": 6.9602860907208736e-06, "loss": 0.6247, "step": 34640 }, { "epoch": 6.52, "grad_norm": 22.030195236206055, "learning_rate": 6.956521739130435e-06, "loss": 0.6839, "step": 34650 }, { "epoch": 6.52, "grad_norm": 12.601126670837402, "learning_rate": 6.9527573875399965e-06, "loss": 0.5062, "step": 34660 }, { "epoch": 6.53, "grad_norm": 14.045123100280762, "learning_rate": 6.948993035949558e-06, "loss": 0.4852, "step": 34670 }, { "epoch": 6.53, "grad_norm": 5.942188739776611, "learning_rate": 6.94522868435912e-06, "loss": 0.6854, "step": 34680 }, { "epoch": 6.53, "grad_norm": 5.113363265991211, "learning_rate": 6.941464332768682e-06, "loss": 0.8664, "step": 34690 }, { "epoch": 6.53, "grad_norm": 5.751442909240723, "learning_rate": 
6.937699981178243e-06, "loss": 0.7293, "step": 34700 }, { "epoch": 6.53, "grad_norm": 9.576197624206543, "learning_rate": 6.933935629587804e-06, "loss": 0.7572, "step": 34710 }, { "epoch": 6.53, "grad_norm": 2.0982346534729004, "learning_rate": 6.930171277997365e-06, "loss": 0.7848, "step": 34720 }, { "epoch": 6.54, "grad_norm": 8.333172798156738, "learning_rate": 6.926406926406927e-06, "loss": 0.8755, "step": 34730 }, { "epoch": 6.54, "grad_norm": 7.454107761383057, "learning_rate": 6.922642574816488e-06, "loss": 0.8471, "step": 34740 }, { "epoch": 6.54, "grad_norm": 3.082706928253174, "learning_rate": 6.91887822322605e-06, "loss": 0.4181, "step": 34750 }, { "epoch": 6.54, "grad_norm": 7.8948445320129395, "learning_rate": 6.915113871635611e-06, "loss": 0.567, "step": 34760 }, { "epoch": 6.54, "grad_norm": 6.417537689208984, "learning_rate": 6.911349520045173e-06, "loss": 1.2104, "step": 34770 }, { "epoch": 6.55, "grad_norm": 4.900731563568115, "learning_rate": 6.907585168454735e-06, "loss": 0.6585, "step": 34780 }, { "epoch": 6.55, "grad_norm": 8.585679054260254, "learning_rate": 6.903820816864296e-06, "loss": 0.9375, "step": 34790 }, { "epoch": 6.55, "grad_norm": 9.635528564453125, "learning_rate": 6.900056465273857e-06, "loss": 0.5427, "step": 34800 }, { "epoch": 6.55, "grad_norm": 8.551118850708008, "learning_rate": 6.896292113683418e-06, "loss": 0.9001, "step": 34810 }, { "epoch": 6.55, "grad_norm": 3.3109116554260254, "learning_rate": 6.89252776209298e-06, "loss": 0.563, "step": 34820 }, { "epoch": 6.56, "grad_norm": 3.9126782417297363, "learning_rate": 6.888763410502541e-06, "loss": 0.9215, "step": 34830 }, { "epoch": 6.56, "grad_norm": 17.485057830810547, "learning_rate": 6.884999058912103e-06, "loss": 0.862, "step": 34840 }, { "epoch": 6.56, "grad_norm": 21.24209213256836, "learning_rate": 6.881234707321664e-06, "loss": 0.7256, "step": 34850 }, { "epoch": 6.56, "grad_norm": 9.794988632202148, "learning_rate": 6.8774703557312265e-06, "loss": 0.734, "step": 34860 }, { "epoch": 6.56, "grad_norm": 13.132662773132324, "learning_rate": 6.873706004140788e-06, "loss": 0.6238, "step": 34870 }, { "epoch": 6.57, "grad_norm": 49.42293930053711, "learning_rate": 6.8699416525503495e-06, "loss": 1.0804, "step": 34880 }, { "epoch": 6.57, "grad_norm": 22.587448120117188, "learning_rate": 6.86617730095991e-06, "loss": 0.719, "step": 34890 }, { "epoch": 6.57, "grad_norm": 7.82258415222168, "learning_rate": 6.8624129493694715e-06, "loss": 0.6484, "step": 34900 }, { "epoch": 6.57, "grad_norm": 24.351261138916016, "learning_rate": 6.858648597779033e-06, "loss": 0.8178, "step": 34910 }, { "epoch": 6.57, "grad_norm": 14.992230415344238, "learning_rate": 6.8548842461885945e-06, "loss": 0.7871, "step": 34920 }, { "epoch": 6.57, "grad_norm": 9.008991241455078, "learning_rate": 6.851119894598156e-06, "loss": 1.0296, "step": 34930 }, { "epoch": 6.58, "grad_norm": 1.3581798076629639, "learning_rate": 6.847355543007717e-06, "loss": 0.7365, "step": 34940 }, { "epoch": 6.58, "grad_norm": 22.918758392333984, "learning_rate": 6.843591191417279e-06, "loss": 1.1838, "step": 34950 }, { "epoch": 6.58, "grad_norm": 3.588327407836914, "learning_rate": 6.839826839826841e-06, "loss": 0.4586, "step": 34960 }, { "epoch": 6.58, "grad_norm": 13.771443367004395, "learning_rate": 6.836062488236401e-06, "loss": 0.5614, "step": 34970 }, { "epoch": 6.58, "grad_norm": 6.478953838348389, "learning_rate": 6.832298136645963e-06, "loss": 0.762, "step": 34980 }, { "epoch": 6.59, "grad_norm": 11.303642272949219, "learning_rate": 
6.828533785055525e-06, "loss": 0.8532, "step": 34990 }, { "epoch": 6.59, "grad_norm": 10.56287956237793, "learning_rate": 6.824769433465086e-06, "loss": 0.8184, "step": 35000 }, { "epoch": 6.59, "grad_norm": 1.879166841506958, "learning_rate": 6.821005081874648e-06, "loss": 0.6475, "step": 35010 }, { "epoch": 6.59, "grad_norm": 8.607254028320312, "learning_rate": 6.817240730284209e-06, "loss": 0.8735, "step": 35020 }, { "epoch": 6.59, "grad_norm": 4.0795392990112305, "learning_rate": 6.8134763786937705e-06, "loss": 0.7916, "step": 35030 }, { "epoch": 6.6, "grad_norm": 9.261628150939941, "learning_rate": 6.809712027103332e-06, "loss": 0.7597, "step": 35040 }, { "epoch": 6.6, "grad_norm": 10.331379890441895, "learning_rate": 6.805947675512894e-06, "loss": 0.7151, "step": 35050 }, { "epoch": 6.6, "grad_norm": 10.072612762451172, "learning_rate": 6.802183323922454e-06, "loss": 0.8081, "step": 35060 }, { "epoch": 6.6, "grad_norm": 4.7918009757995605, "learning_rate": 6.798418972332016e-06, "loss": 0.9026, "step": 35070 }, { "epoch": 6.6, "grad_norm": 2.9943368434906006, "learning_rate": 6.794654620741578e-06, "loss": 0.7136, "step": 35080 }, { "epoch": 6.6, "grad_norm": 20.50057601928711, "learning_rate": 6.790890269151139e-06, "loss": 1.0426, "step": 35090 }, { "epoch": 6.61, "grad_norm": 4.121964931488037, "learning_rate": 6.787125917560701e-06, "loss": 1.0226, "step": 35100 }, { "epoch": 6.61, "grad_norm": 12.788354873657227, "learning_rate": 6.783361565970262e-06, "loss": 0.8316, "step": 35110 }, { "epoch": 6.61, "grad_norm": 21.179628372192383, "learning_rate": 6.779597214379824e-06, "loss": 0.8299, "step": 35120 }, { "epoch": 6.61, "grad_norm": 9.664793014526367, "learning_rate": 6.775832862789385e-06, "loss": 0.8336, "step": 35130 }, { "epoch": 6.61, "grad_norm": 4.931981563568115, "learning_rate": 6.7720685111989474e-06, "loss": 0.8239, "step": 35140 }, { "epoch": 6.62, "grad_norm": 13.75429916381836, "learning_rate": 6.768304159608507e-06, "loss": 0.6484, "step": 35150 }, { "epoch": 6.62, "grad_norm": 4.511131286621094, "learning_rate": 6.764539808018069e-06, "loss": 0.8187, "step": 35160 }, { "epoch": 6.62, "grad_norm": 11.52469253540039, "learning_rate": 6.760775456427631e-06, "loss": 0.5486, "step": 35170 }, { "epoch": 6.62, "grad_norm": 10.465510368347168, "learning_rate": 6.7570111048371924e-06, "loss": 0.7262, "step": 35180 }, { "epoch": 6.62, "grad_norm": 4.710264205932617, "learning_rate": 6.753246753246754e-06, "loss": 0.708, "step": 35190 }, { "epoch": 6.63, "grad_norm": 18.16333770751953, "learning_rate": 6.749482401656315e-06, "loss": 0.7951, "step": 35200 }, { "epoch": 6.63, "grad_norm": 2.079245090484619, "learning_rate": 6.745718050065877e-06, "loss": 0.6829, "step": 35210 }, { "epoch": 6.63, "grad_norm": 8.413666725158691, "learning_rate": 6.741953698475438e-06, "loss": 0.6065, "step": 35220 }, { "epoch": 6.63, "grad_norm": 3.6985278129577637, "learning_rate": 6.738189346885e-06, "loss": 0.6545, "step": 35230 }, { "epoch": 6.63, "grad_norm": 14.51396369934082, "learning_rate": 6.73442499529456e-06, "loss": 0.941, "step": 35240 }, { "epoch": 6.63, "grad_norm": 15.321940422058105, "learning_rate": 6.730660643704122e-06, "loss": 0.6985, "step": 35250 }, { "epoch": 6.64, "grad_norm": 19.311006546020508, "learning_rate": 6.726896292113684e-06, "loss": 0.7731, "step": 35260 }, { "epoch": 6.64, "grad_norm": 2.8298330307006836, "learning_rate": 6.723131940523246e-06, "loss": 0.6832, "step": 35270 }, { "epoch": 6.64, "grad_norm": 8.229240417480469, "learning_rate": 
6.719367588932807e-06, "loss": 0.6111, "step": 35280 }, { "epoch": 6.64, "grad_norm": 10.28249740600586, "learning_rate": 6.7156032373423685e-06, "loss": 0.8709, "step": 35290 }, { "epoch": 6.64, "grad_norm": 17.110694885253906, "learning_rate": 6.71183888575193e-06, "loss": 0.7293, "step": 35300 }, { "epoch": 6.65, "grad_norm": 24.578279495239258, "learning_rate": 6.7080745341614914e-06, "loss": 0.7752, "step": 35310 }, { "epoch": 6.65, "grad_norm": 6.030777931213379, "learning_rate": 6.704310182571052e-06, "loss": 0.8103, "step": 35320 }, { "epoch": 6.65, "grad_norm": 13.232614517211914, "learning_rate": 6.7005458309806135e-06, "loss": 1.1151, "step": 35330 }, { "epoch": 6.65, "grad_norm": 7.952554702758789, "learning_rate": 6.696781479390175e-06, "loss": 0.5796, "step": 35340 }, { "epoch": 6.65, "grad_norm": 14.536341667175293, "learning_rate": 6.693017127799737e-06, "loss": 0.5807, "step": 35350 }, { "epoch": 6.66, "grad_norm": 10.163509368896484, "learning_rate": 6.689252776209299e-06, "loss": 1.2468, "step": 35360 }, { "epoch": 6.66, "grad_norm": 13.38232135772705, "learning_rate": 6.68548842461886e-06, "loss": 0.6923, "step": 35370 }, { "epoch": 6.66, "grad_norm": 25.283550262451172, "learning_rate": 6.681724073028422e-06, "loss": 0.9228, "step": 35380 }, { "epoch": 6.66, "grad_norm": 8.699653625488281, "learning_rate": 6.677959721437983e-06, "loss": 0.7367, "step": 35390 }, { "epoch": 6.66, "grad_norm": 6.232950210571289, "learning_rate": 6.6741953698475446e-06, "loss": 0.8762, "step": 35400 }, { "epoch": 6.66, "grad_norm": 10.002278327941895, "learning_rate": 6.670431018257105e-06, "loss": 0.6779, "step": 35410 }, { "epoch": 6.67, "grad_norm": 30.474475860595703, "learning_rate": 6.666666666666667e-06, "loss": 0.679, "step": 35420 }, { "epoch": 6.67, "grad_norm": 28.179275512695312, "learning_rate": 6.662902315076228e-06, "loss": 0.7934, "step": 35430 }, { "epoch": 6.67, "grad_norm": 4.137737274169922, "learning_rate": 6.65913796348579e-06, "loss": 0.4844, "step": 35440 }, { "epoch": 6.67, "grad_norm": 59.58742141723633, "learning_rate": 6.655373611895352e-06, "loss": 0.7615, "step": 35450 }, { "epoch": 6.67, "grad_norm": 11.73046588897705, "learning_rate": 6.651609260304913e-06, "loss": 0.4967, "step": 35460 }, { "epoch": 6.68, "grad_norm": 17.879043579101562, "learning_rate": 6.647844908714475e-06, "loss": 0.5556, "step": 35470 }, { "epoch": 6.68, "grad_norm": 7.54836893081665, "learning_rate": 6.644080557124036e-06, "loss": 1.0532, "step": 35480 }, { "epoch": 6.68, "grad_norm": 21.32975196838379, "learning_rate": 6.640316205533598e-06, "loss": 0.6974, "step": 35490 }, { "epoch": 6.68, "grad_norm": 6.983117580413818, "learning_rate": 6.636551853943158e-06, "loss": 0.6742, "step": 35500 }, { "epoch": 6.68, "grad_norm": 14.633478164672852, "learning_rate": 6.63278750235272e-06, "loss": 0.6519, "step": 35510 }, { "epoch": 6.69, "grad_norm": 12.44120979309082, "learning_rate": 6.629023150762281e-06, "loss": 0.7762, "step": 35520 }, { "epoch": 6.69, "grad_norm": 7.6053948402404785, "learning_rate": 6.625258799171843e-06, "loss": 0.6277, "step": 35530 }, { "epoch": 6.69, "grad_norm": 15.892565727233887, "learning_rate": 6.621494447581405e-06, "loss": 0.5841, "step": 35540 }, { "epoch": 6.69, "grad_norm": 20.586889266967773, "learning_rate": 6.6177300959909665e-06, "loss": 0.7142, "step": 35550 }, { "epoch": 6.69, "grad_norm": 4.947535037994385, "learning_rate": 6.613965744400528e-06, "loss": 0.6928, "step": 35560 }, { "epoch": 6.69, "grad_norm": 6.129070281982422, "learning_rate": 
6.610201392810089e-06, "loss": 0.6268, "step": 35570 }, { "epoch": 6.7, "grad_norm": 33.46188735961914, "learning_rate": 6.60643704121965e-06, "loss": 0.6635, "step": 35580 }, { "epoch": 6.7, "grad_norm": 12.149188995361328, "learning_rate": 6.6026726896292115e-06, "loss": 0.8669, "step": 35590 }, { "epoch": 6.7, "grad_norm": 2.522722005844116, "learning_rate": 6.598908338038773e-06, "loss": 0.639, "step": 35600 }, { "epoch": 6.7, "grad_norm": 22.79615020751953, "learning_rate": 6.595143986448334e-06, "loss": 0.5354, "step": 35610 }, { "epoch": 6.7, "grad_norm": 21.13848114013672, "learning_rate": 6.591379634857896e-06, "loss": 0.8527, "step": 35620 }, { "epoch": 6.71, "grad_norm": 4.89055871963501, "learning_rate": 6.587615283267458e-06, "loss": 0.7434, "step": 35630 }, { "epoch": 6.71, "grad_norm": 10.959356307983398, "learning_rate": 6.58385093167702e-06, "loss": 0.9216, "step": 35640 }, { "epoch": 6.71, "grad_norm": 3.981358051300049, "learning_rate": 6.580086580086581e-06, "loss": 0.6788, "step": 35650 }, { "epoch": 6.71, "grad_norm": 35.3505973815918, "learning_rate": 6.5763222284961426e-06, "loss": 0.6128, "step": 35660 }, { "epoch": 6.71, "grad_norm": 8.54912281036377, "learning_rate": 6.572557876905703e-06, "loss": 1.1145, "step": 35670 }, { "epoch": 6.72, "grad_norm": 25.22374153137207, "learning_rate": 6.568793525315265e-06, "loss": 0.9503, "step": 35680 }, { "epoch": 6.72, "grad_norm": 19.567724227905273, "learning_rate": 6.565029173724826e-06, "loss": 0.4849, "step": 35690 }, { "epoch": 6.72, "grad_norm": 15.255121231079102, "learning_rate": 6.5612648221343875e-06, "loss": 0.8696, "step": 35700 }, { "epoch": 6.72, "grad_norm": 2.967883825302124, "learning_rate": 6.557500470543949e-06, "loss": 1.0976, "step": 35710 }, { "epoch": 6.72, "grad_norm": 13.99577522277832, "learning_rate": 6.553736118953511e-06, "loss": 0.8763, "step": 35720 }, { "epoch": 6.73, "grad_norm": 19.421720504760742, "learning_rate": 6.549971767363073e-06, "loss": 0.9559, "step": 35730 }, { "epoch": 6.73, "grad_norm": 12.905183792114258, "learning_rate": 6.546207415772634e-06, "loss": 0.5399, "step": 35740 }, { "epoch": 6.73, "grad_norm": 8.856494903564453, "learning_rate": 6.542443064182196e-06, "loss": 0.6662, "step": 35750 }, { "epoch": 6.73, "grad_norm": 4.250787734985352, "learning_rate": 6.538678712591756e-06, "loss": 0.5793, "step": 35760 }, { "epoch": 6.73, "grad_norm": 2.038532257080078, "learning_rate": 6.534914361001318e-06, "loss": 0.8522, "step": 35770 }, { "epoch": 6.73, "grad_norm": 0.8922250270843506, "learning_rate": 6.531150009410879e-06, "loss": 0.4993, "step": 35780 }, { "epoch": 6.74, "grad_norm": 12.76684284210205, "learning_rate": 6.527385657820441e-06, "loss": 0.5944, "step": 35790 }, { "epoch": 6.74, "grad_norm": 12.685073852539062, "learning_rate": 6.523621306230002e-06, "loss": 0.7024, "step": 35800 }, { "epoch": 6.74, "grad_norm": 21.057527542114258, "learning_rate": 6.519856954639564e-06, "loss": 0.657, "step": 35810 }, { "epoch": 6.74, "grad_norm": 28.576303482055664, "learning_rate": 6.516092603049126e-06, "loss": 0.9238, "step": 35820 }, { "epoch": 6.74, "grad_norm": 11.514458656311035, "learning_rate": 6.512328251458687e-06, "loss": 0.7787, "step": 35830 }, { "epoch": 6.75, "grad_norm": 15.331823348999023, "learning_rate": 6.508563899868249e-06, "loss": 1.2059, "step": 35840 }, { "epoch": 6.75, "grad_norm": 36.174739837646484, "learning_rate": 6.5047995482778095e-06, "loss": 0.9894, "step": 35850 }, { "epoch": 6.75, "grad_norm": 4.677796840667725, "learning_rate": 
6.501035196687371e-06, "loss": 0.9216, "step": 35860 }, { "epoch": 6.75, "grad_norm": 16.240278244018555, "learning_rate": 6.497270845096932e-06, "loss": 0.9013, "step": 35870 }, { "epoch": 6.75, "grad_norm": 1.8353596925735474, "learning_rate": 6.493506493506494e-06, "loss": 0.7645, "step": 35880 }, { "epoch": 6.76, "grad_norm": 6.023941993713379, "learning_rate": 6.489742141916055e-06, "loss": 1.0826, "step": 35890 }, { "epoch": 6.76, "grad_norm": 5.827603816986084, "learning_rate": 6.485977790325617e-06, "loss": 0.5308, "step": 35900 }, { "epoch": 6.76, "grad_norm": 5.486311435699463, "learning_rate": 6.482213438735179e-06, "loss": 0.6392, "step": 35910 }, { "epoch": 6.76, "grad_norm": 1.1436667442321777, "learning_rate": 6.4784490871447405e-06, "loss": 0.6408, "step": 35920 }, { "epoch": 6.76, "grad_norm": 46.18919372558594, "learning_rate": 6.474684735554301e-06, "loss": 0.8983, "step": 35930 }, { "epoch": 6.76, "grad_norm": 38.4013671875, "learning_rate": 6.470920383963863e-06, "loss": 0.719, "step": 35940 }, { "epoch": 6.77, "grad_norm": 14.659102439880371, "learning_rate": 6.467156032373424e-06, "loss": 0.9764, "step": 35950 }, { "epoch": 6.77, "grad_norm": 11.168418884277344, "learning_rate": 6.4633916807829855e-06, "loss": 0.5664, "step": 35960 }, { "epoch": 6.77, "grad_norm": 4.3088202476501465, "learning_rate": 6.459627329192547e-06, "loss": 0.6944, "step": 35970 }, { "epoch": 6.77, "grad_norm": 19.49970245361328, "learning_rate": 6.4558629776021084e-06, "loss": 0.771, "step": 35980 }, { "epoch": 6.77, "grad_norm": 11.030245780944824, "learning_rate": 6.45209862601167e-06, "loss": 0.9515, "step": 35990 }, { "epoch": 6.78, "grad_norm": 4.944016933441162, "learning_rate": 6.448334274421232e-06, "loss": 0.7361, "step": 36000 }, { "epoch": 6.78, "grad_norm": 11.580300331115723, "learning_rate": 6.444569922830794e-06, "loss": 0.8828, "step": 36010 }, { "epoch": 6.78, "grad_norm": 13.989238739013672, "learning_rate": 6.440805571240354e-06, "loss": 0.6253, "step": 36020 }, { "epoch": 6.78, "grad_norm": 8.27987289428711, "learning_rate": 6.437041219649916e-06, "loss": 0.5709, "step": 36030 }, { "epoch": 6.78, "grad_norm": 14.307635307312012, "learning_rate": 6.433276868059477e-06, "loss": 0.9116, "step": 36040 }, { "epoch": 6.79, "grad_norm": 8.25582218170166, "learning_rate": 6.429512516469039e-06, "loss": 0.5959, "step": 36050 }, { "epoch": 6.79, "grad_norm": 0.7994733452796936, "learning_rate": 6.4257481648786e-06, "loss": 0.8514, "step": 36060 }, { "epoch": 6.79, "grad_norm": 9.397730827331543, "learning_rate": 6.421983813288162e-06, "loss": 0.6784, "step": 36070 }, { "epoch": 6.79, "grad_norm": 37.672359466552734, "learning_rate": 6.418219461697723e-06, "loss": 0.6439, "step": 36080 }, { "epoch": 6.79, "grad_norm": 3.6397805213928223, "learning_rate": 6.414455110107285e-06, "loss": 0.6154, "step": 36090 }, { "epoch": 6.79, "grad_norm": 7.400353908538818, "learning_rate": 6.410690758516847e-06, "loss": 0.9822, "step": 36100 }, { "epoch": 6.8, "grad_norm": 21.65017318725586, "learning_rate": 6.406926406926407e-06, "loss": 0.7074, "step": 36110 }, { "epoch": 6.8, "grad_norm": 10.7244291305542, "learning_rate": 6.403162055335969e-06, "loss": 0.5725, "step": 36120 }, { "epoch": 6.8, "grad_norm": 7.76882266998291, "learning_rate": 6.39939770374553e-06, "loss": 0.5699, "step": 36130 }, { "epoch": 6.8, "grad_norm": 3.691624641418457, "learning_rate": 6.395633352155092e-06, "loss": 1.1213, "step": 36140 }, { "epoch": 6.8, "grad_norm": 19.521129608154297, "learning_rate": 
6.391869000564653e-06, "loss": 0.6333, "step": 36150 }, { "epoch": 6.81, "grad_norm": 27.294422149658203, "learning_rate": 6.388104648974215e-06, "loss": 0.9252, "step": 36160 }, { "epoch": 6.81, "grad_norm": 3.199697971343994, "learning_rate": 6.384340297383776e-06, "loss": 0.4566, "step": 36170 }, { "epoch": 6.81, "grad_norm": 11.948770523071289, "learning_rate": 6.380575945793338e-06, "loss": 0.6871, "step": 36180 }, { "epoch": 6.81, "grad_norm": 6.9421162605285645, "learning_rate": 6.376811594202898e-06, "loss": 0.6361, "step": 36190 }, { "epoch": 6.81, "grad_norm": 3.3813252449035645, "learning_rate": 6.37304724261246e-06, "loss": 0.8625, "step": 36200 }, { "epoch": 6.82, "grad_norm": 25.103116989135742, "learning_rate": 6.369282891022022e-06, "loss": 0.5277, "step": 36210 }, { "epoch": 6.82, "grad_norm": 34.72182083129883, "learning_rate": 6.3655185394315835e-06, "loss": 0.8646, "step": 36220 }, { "epoch": 6.82, "grad_norm": 3.7862021923065186, "learning_rate": 6.361754187841145e-06, "loss": 0.8415, "step": 36230 }, { "epoch": 6.82, "grad_norm": 18.372072219848633, "learning_rate": 6.3579898362507064e-06, "loss": 0.6119, "step": 36240 }, { "epoch": 6.82, "grad_norm": 10.229610443115234, "learning_rate": 6.354225484660268e-06, "loss": 0.8766, "step": 36250 }, { "epoch": 6.82, "grad_norm": 28.582839965820312, "learning_rate": 6.350461133069829e-06, "loss": 0.8249, "step": 36260 }, { "epoch": 6.83, "grad_norm": 3.4164187908172607, "learning_rate": 6.346696781479391e-06, "loss": 0.6617, "step": 36270 }, { "epoch": 6.83, "grad_norm": 27.359926223754883, "learning_rate": 6.342932429888951e-06, "loss": 0.668, "step": 36280 }, { "epoch": 6.83, "grad_norm": 13.791479110717773, "learning_rate": 6.339168078298513e-06, "loss": 0.7939, "step": 36290 }, { "epoch": 6.83, "grad_norm": 6.339425086975098, "learning_rate": 6.335403726708075e-06, "loss": 0.7561, "step": 36300 }, { "epoch": 6.83, "grad_norm": 10.153292655944824, "learning_rate": 6.331639375117637e-06, "loss": 1.1677, "step": 36310 }, { "epoch": 6.84, "grad_norm": 6.044154644012451, "learning_rate": 6.327875023527198e-06, "loss": 0.7294, "step": 36320 }, { "epoch": 6.84, "grad_norm": 9.357995986938477, "learning_rate": 6.3241106719367596e-06, "loss": 0.9811, "step": 36330 }, { "epoch": 6.84, "grad_norm": 11.374494552612305, "learning_rate": 6.320346320346321e-06, "loss": 0.3763, "step": 36340 }, { "epoch": 6.84, "grad_norm": 59.56272506713867, "learning_rate": 6.3165819687558825e-06, "loss": 0.6146, "step": 36350 }, { "epoch": 6.84, "grad_norm": 16.82117462158203, "learning_rate": 6.312817617165444e-06, "loss": 0.6274, "step": 36360 }, { "epoch": 6.85, "grad_norm": 19.121362686157227, "learning_rate": 6.3090532655750046e-06, "loss": 0.8071, "step": 36370 }, { "epoch": 6.85, "grad_norm": 7.842607021331787, "learning_rate": 6.305288913984566e-06, "loss": 1.0871, "step": 36380 }, { "epoch": 6.85, "grad_norm": 16.14730453491211, "learning_rate": 6.3015245623941275e-06, "loss": 0.5043, "step": 36390 }, { "epoch": 6.85, "grad_norm": 5.9936628341674805, "learning_rate": 6.29776021080369e-06, "loss": 0.7736, "step": 36400 }, { "epoch": 6.85, "grad_norm": 10.883078575134277, "learning_rate": 6.293995859213251e-06, "loss": 0.6143, "step": 36410 }, { "epoch": 6.85, "grad_norm": 9.11673641204834, "learning_rate": 6.290231507622813e-06, "loss": 0.869, "step": 36420 }, { "epoch": 6.86, "grad_norm": 13.581634521484375, "learning_rate": 6.286467156032374e-06, "loss": 0.9004, "step": 36430 }, { "epoch": 6.86, "grad_norm": 17.854829788208008, 
"learning_rate": 6.282702804441936e-06, "loss": 0.5848, "step": 36440 }, { "epoch": 6.86, "grad_norm": 30.23310089111328, "learning_rate": 6.278938452851497e-06, "loss": 0.7937, "step": 36450 }, { "epoch": 6.86, "grad_norm": 7.0351433753967285, "learning_rate": 6.275174101261058e-06, "loss": 0.5366, "step": 36460 }, { "epoch": 6.86, "grad_norm": 6.2937912940979, "learning_rate": 6.271409749670619e-06, "loss": 0.6942, "step": 36470 }, { "epoch": 6.87, "grad_norm": 8.93720817565918, "learning_rate": 6.267645398080181e-06, "loss": 0.8762, "step": 36480 }, { "epoch": 6.87, "grad_norm": 14.043828010559082, "learning_rate": 6.263881046489743e-06, "loss": 0.6268, "step": 36490 }, { "epoch": 6.87, "grad_norm": 16.131328582763672, "learning_rate": 6.260116694899304e-06, "loss": 0.602, "step": 36500 }, { "epoch": 6.87, "grad_norm": 18.67616844177246, "learning_rate": 6.256352343308866e-06, "loss": 0.8061, "step": 36510 }, { "epoch": 6.87, "grad_norm": 4.114846229553223, "learning_rate": 6.252587991718427e-06, "loss": 0.6575, "step": 36520 }, { "epoch": 6.88, "grad_norm": 32.33007049560547, "learning_rate": 6.248823640127989e-06, "loss": 1.0047, "step": 36530 }, { "epoch": 6.88, "grad_norm": 26.43556785583496, "learning_rate": 6.245059288537549e-06, "loss": 0.8084, "step": 36540 }, { "epoch": 6.88, "grad_norm": 1.7522974014282227, "learning_rate": 6.241294936947111e-06, "loss": 0.6309, "step": 36550 }, { "epoch": 6.88, "grad_norm": 6.107260227203369, "learning_rate": 6.237530585356672e-06, "loss": 0.6974, "step": 36560 }, { "epoch": 6.88, "grad_norm": 13.829848289489746, "learning_rate": 6.233766233766234e-06, "loss": 0.5152, "step": 36570 }, { "epoch": 6.88, "grad_norm": 36.040138244628906, "learning_rate": 6.230001882175796e-06, "loss": 0.9514, "step": 36580 }, { "epoch": 6.89, "grad_norm": 39.36509323120117, "learning_rate": 6.2262375305853575e-06, "loss": 0.8235, "step": 36590 }, { "epoch": 6.89, "grad_norm": 10.279460906982422, "learning_rate": 6.222473178994919e-06, "loss": 0.843, "step": 36600 }, { "epoch": 6.89, "grad_norm": 20.82657241821289, "learning_rate": 6.2187088274044805e-06, "loss": 0.9005, "step": 36610 }, { "epoch": 6.89, "grad_norm": 10.53868579864502, "learning_rate": 6.214944475814042e-06, "loss": 0.7821, "step": 36620 }, { "epoch": 6.89, "grad_norm": 11.066123962402344, "learning_rate": 6.2111801242236025e-06, "loss": 1.1019, "step": 36630 }, { "epoch": 6.9, "grad_norm": 3.152423858642578, "learning_rate": 6.207415772633164e-06, "loss": 0.6843, "step": 36640 }, { "epoch": 6.9, "grad_norm": 13.62727165222168, "learning_rate": 6.2036514210427255e-06, "loss": 0.5294, "step": 36650 }, { "epoch": 6.9, "grad_norm": 20.977489471435547, "learning_rate": 6.199887069452287e-06, "loss": 0.8121, "step": 36660 }, { "epoch": 6.9, "grad_norm": 24.3504581451416, "learning_rate": 6.196122717861849e-06, "loss": 0.6978, "step": 36670 }, { "epoch": 6.9, "grad_norm": 23.525861740112305, "learning_rate": 6.192358366271411e-06, "loss": 1.0265, "step": 36680 }, { "epoch": 6.91, "grad_norm": 2.487999677658081, "learning_rate": 6.188594014680972e-06, "loss": 0.5934, "step": 36690 }, { "epoch": 6.91, "grad_norm": 1.4788461923599243, "learning_rate": 6.184829663090534e-06, "loss": 0.858, "step": 36700 }, { "epoch": 6.91, "grad_norm": 11.04887866973877, "learning_rate": 6.181065311500095e-06, "loss": 0.5535, "step": 36710 }, { "epoch": 6.91, "grad_norm": 16.101572036743164, "learning_rate": 6.177300959909656e-06, "loss": 0.5919, "step": 36720 }, { "epoch": 6.91, "grad_norm": 3.5266387462615967, 
"learning_rate": 6.173536608319217e-06, "loss": 0.6896, "step": 36730 }, { "epoch": 6.92, "grad_norm": 13.87791919708252, "learning_rate": 6.169772256728779e-06, "loss": 0.665, "step": 36740 }, { "epoch": 6.92, "grad_norm": 4.168028354644775, "learning_rate": 6.16600790513834e-06, "loss": 0.603, "step": 36750 }, { "epoch": 6.92, "grad_norm": 6.283482551574707, "learning_rate": 6.1622435535479015e-06, "loss": 0.6333, "step": 36760 }, { "epoch": 6.92, "grad_norm": 20.833709716796875, "learning_rate": 6.158479201957464e-06, "loss": 0.6808, "step": 36770 }, { "epoch": 6.92, "grad_norm": 28.429628372192383, "learning_rate": 6.154714850367025e-06, "loss": 0.9145, "step": 36780 }, { "epoch": 6.92, "grad_norm": 24.9104061126709, "learning_rate": 6.150950498776587e-06, "loss": 0.7179, "step": 36790 }, { "epoch": 6.93, "grad_norm": 4.804409980773926, "learning_rate": 6.147186147186147e-06, "loss": 1.0192, "step": 36800 }, { "epoch": 6.93, "grad_norm": 5.507547378540039, "learning_rate": 6.143421795595709e-06, "loss": 0.6125, "step": 36810 }, { "epoch": 6.93, "grad_norm": 15.59933853149414, "learning_rate": 6.13965744400527e-06, "loss": 0.6585, "step": 36820 }, { "epoch": 6.93, "grad_norm": 5.931190490722656, "learning_rate": 6.135893092414832e-06, "loss": 0.8712, "step": 36830 }, { "epoch": 6.93, "grad_norm": 6.401504039764404, "learning_rate": 6.132128740824393e-06, "loss": 0.8517, "step": 36840 }, { "epoch": 6.94, "grad_norm": 8.744296073913574, "learning_rate": 6.128364389233955e-06, "loss": 1.1649, "step": 36850 }, { "epoch": 6.94, "grad_norm": 7.068543910980225, "learning_rate": 6.124600037643517e-06, "loss": 1.134, "step": 36860 }, { "epoch": 6.94, "grad_norm": 5.190436363220215, "learning_rate": 6.1208356860530784e-06, "loss": 0.6299, "step": 36870 }, { "epoch": 6.94, "grad_norm": 12.264849662780762, "learning_rate": 6.11707133446264e-06, "loss": 0.6952, "step": 36880 }, { "epoch": 6.94, "grad_norm": 21.30219078063965, "learning_rate": 6.1133069828722005e-06, "loss": 0.7631, "step": 36890 }, { "epoch": 6.95, "grad_norm": 18.149635314941406, "learning_rate": 6.109542631281762e-06, "loss": 0.3841, "step": 36900 }, { "epoch": 6.95, "grad_norm": 9.514642715454102, "learning_rate": 6.1057782796913234e-06, "loss": 0.7385, "step": 36910 }, { "epoch": 6.95, "grad_norm": 17.133453369140625, "learning_rate": 6.102013928100885e-06, "loss": 0.5014, "step": 36920 }, { "epoch": 6.95, "grad_norm": 4.99509859085083, "learning_rate": 6.098249576510446e-06, "loss": 0.6163, "step": 36930 }, { "epoch": 6.95, "grad_norm": 9.139423370361328, "learning_rate": 6.094485224920008e-06, "loss": 0.7414, "step": 36940 }, { "epoch": 6.95, "grad_norm": 2.838682174682617, "learning_rate": 6.09072087332957e-06, "loss": 0.5059, "step": 36950 }, { "epoch": 6.96, "grad_norm": 7.605196952819824, "learning_rate": 6.086956521739132e-06, "loss": 0.7718, "step": 36960 }, { "epoch": 6.96, "grad_norm": 8.679461479187012, "learning_rate": 6.083192170148693e-06, "loss": 0.8547, "step": 36970 }, { "epoch": 6.96, "grad_norm": 30.403396606445312, "learning_rate": 6.079427818558254e-06, "loss": 0.8414, "step": 36980 }, { "epoch": 6.96, "grad_norm": 14.012473106384277, "learning_rate": 6.075663466967815e-06, "loss": 0.8916, "step": 36990 }, { "epoch": 6.96, "grad_norm": 21.694454193115234, "learning_rate": 6.071899115377377e-06, "loss": 1.3339, "step": 37000 }, { "epoch": 6.97, "grad_norm": 19.346229553222656, "learning_rate": 6.068134763786938e-06, "loss": 0.6939, "step": 37010 }, { "epoch": 6.97, "grad_norm": 33.79501724243164, 
"learning_rate": 6.0643704121964995e-06, "loss": 0.8437, "step": 37020 }, { "epoch": 6.97, "grad_norm": 3.9713692665100098, "learning_rate": 6.060606060606061e-06, "loss": 0.7122, "step": 37030 }, { "epoch": 6.97, "grad_norm": 11.297821998596191, "learning_rate": 6.0568417090156224e-06, "loss": 0.774, "step": 37040 }, { "epoch": 6.97, "grad_norm": 8.851127624511719, "learning_rate": 6.053077357425185e-06, "loss": 0.5609, "step": 37050 }, { "epoch": 6.98, "grad_norm": 2.1288793087005615, "learning_rate": 6.049313005834746e-06, "loss": 0.6655, "step": 37060 }, { "epoch": 6.98, "grad_norm": 18.886585235595703, "learning_rate": 6.045548654244307e-06, "loss": 0.8246, "step": 37070 }, { "epoch": 6.98, "grad_norm": 12.303566932678223, "learning_rate": 6.041784302653868e-06, "loss": 0.7336, "step": 37080 }, { "epoch": 6.98, "grad_norm": 6.029139041900635, "learning_rate": 6.03801995106343e-06, "loss": 0.7183, "step": 37090 }, { "epoch": 6.98, "grad_norm": 11.80978775024414, "learning_rate": 6.034255599472991e-06, "loss": 0.5428, "step": 37100 }, { "epoch": 6.98, "grad_norm": 12.277514457702637, "learning_rate": 6.030491247882553e-06, "loss": 0.7973, "step": 37110 }, { "epoch": 6.99, "grad_norm": 6.897933006286621, "learning_rate": 6.026726896292114e-06, "loss": 0.7292, "step": 37120 }, { "epoch": 6.99, "grad_norm": 27.606576919555664, "learning_rate": 6.022962544701676e-06, "loss": 0.7597, "step": 37130 }, { "epoch": 6.99, "grad_norm": 5.399590492248535, "learning_rate": 6.019198193111238e-06, "loss": 0.8286, "step": 37140 }, { "epoch": 6.99, "grad_norm": 11.461363792419434, "learning_rate": 6.015433841520798e-06, "loss": 0.7537, "step": 37150 }, { "epoch": 6.99, "grad_norm": 28.35698127746582, "learning_rate": 6.01166948993036e-06, "loss": 0.5075, "step": 37160 }, { "epoch": 7.0, "grad_norm": 32.98536682128906, "learning_rate": 6.007905138339921e-06, "loss": 0.6517, "step": 37170 }, { "epoch": 7.0, "grad_norm": 4.059322834014893, "learning_rate": 6.004140786749483e-06, "loss": 0.6264, "step": 37180 }, { "epoch": 7.0, "grad_norm": 20.37855339050293, "learning_rate": 6.000376435159044e-06, "loss": 0.575, "step": 37190 }, { "epoch": 7.0, "eval_accuracy": 0.8041333333333334, "eval_loss": 0.9111883640289307, "eval_runtime": 31.1272, "eval_samples_per_second": 240.947, "eval_steps_per_second": 30.134, "step": 37191 }, { "epoch": 7.0, "grad_norm": 30.558164596557617, "learning_rate": 5.996612083568606e-06, "loss": 0.7091, "step": 37200 }, { "epoch": 7.0, "grad_norm": 15.844109535217285, "learning_rate": 5.992847731978167e-06, "loss": 0.8354, "step": 37210 }, { "epoch": 7.01, "grad_norm": 3.1490488052368164, "learning_rate": 5.989083380387729e-06, "loss": 0.8395, "step": 37220 }, { "epoch": 7.01, "grad_norm": 5.403294086456299, "learning_rate": 5.985319028797291e-06, "loss": 0.7746, "step": 37230 }, { "epoch": 7.01, "grad_norm": 24.43394660949707, "learning_rate": 5.981554677206851e-06, "loss": 0.8899, "step": 37240 }, { "epoch": 7.01, "grad_norm": 10.031603813171387, "learning_rate": 5.977790325616413e-06, "loss": 0.7433, "step": 37250 }, { "epoch": 7.01, "grad_norm": 32.59792709350586, "learning_rate": 5.9740259740259746e-06, "loss": 0.5721, "step": 37260 }, { "epoch": 7.01, "grad_norm": 17.758329391479492, "learning_rate": 5.970261622435536e-06, "loss": 0.841, "step": 37270 }, { "epoch": 7.02, "grad_norm": 6.865734100341797, "learning_rate": 5.9664972708450975e-06, "loss": 0.554, "step": 37280 }, { "epoch": 7.02, "grad_norm": 13.139066696166992, "learning_rate": 5.962732919254659e-06, "loss": 0.7729, 
"step": 37290 }, { "epoch": 7.02, "grad_norm": 8.752161026000977, "learning_rate": 5.95896856766422e-06, "loss": 0.4381, "step": 37300 }, { "epoch": 7.02, "grad_norm": 11.000343322753906, "learning_rate": 5.955204216073782e-06, "loss": 0.9294, "step": 37310 }, { "epoch": 7.02, "grad_norm": 22.186254501342773, "learning_rate": 5.951439864483344e-06, "loss": 1.0764, "step": 37320 }, { "epoch": 7.03, "grad_norm": 21.000696182250977, "learning_rate": 5.947675512892904e-06, "loss": 0.895, "step": 37330 }, { "epoch": 7.03, "grad_norm": 49.79924774169922, "learning_rate": 5.943911161302465e-06, "loss": 0.5077, "step": 37340 }, { "epoch": 7.03, "grad_norm": 1.5146905183792114, "learning_rate": 5.940146809712028e-06, "loss": 0.4758, "step": 37350 }, { "epoch": 7.03, "grad_norm": 22.319744110107422, "learning_rate": 5.936382458121589e-06, "loss": 1.0744, "step": 37360 }, { "epoch": 7.03, "grad_norm": 3.766484498977661, "learning_rate": 5.932618106531151e-06, "loss": 0.4468, "step": 37370 }, { "epoch": 7.04, "grad_norm": 29.184850692749023, "learning_rate": 5.928853754940712e-06, "loss": 0.9438, "step": 37380 }, { "epoch": 7.04, "grad_norm": 18.49837303161621, "learning_rate": 5.9250894033502736e-06, "loss": 0.8739, "step": 37390 }, { "epoch": 7.04, "grad_norm": 3.6087453365325928, "learning_rate": 5.921325051759835e-06, "loss": 0.5554, "step": 37400 }, { "epoch": 7.04, "grad_norm": 3.1408095359802246, "learning_rate": 5.917560700169396e-06, "loss": 0.912, "step": 37410 }, { "epoch": 7.04, "grad_norm": 4.070140838623047, "learning_rate": 5.913796348578957e-06, "loss": 0.9112, "step": 37420 }, { "epoch": 7.04, "grad_norm": 15.710841178894043, "learning_rate": 5.9100319969885186e-06, "loss": 0.8179, "step": 37430 }, { "epoch": 7.05, "grad_norm": 21.413297653198242, "learning_rate": 5.906267645398081e-06, "loss": 0.5749, "step": 37440 }, { "epoch": 7.05, "grad_norm": 24.930105209350586, "learning_rate": 5.902503293807642e-06, "loss": 0.6813, "step": 37450 }, { "epoch": 7.05, "grad_norm": 9.391387939453125, "learning_rate": 5.898738942217204e-06, "loss": 0.5036, "step": 37460 }, { "epoch": 7.05, "grad_norm": 15.781744003295898, "learning_rate": 5.894974590626765e-06, "loss": 0.6075, "step": 37470 }, { "epoch": 7.05, "grad_norm": 5.5781707763671875, "learning_rate": 5.891210239036327e-06, "loss": 0.4049, "step": 37480 }, { "epoch": 7.06, "grad_norm": 3.9295620918273926, "learning_rate": 5.887445887445888e-06, "loss": 0.517, "step": 37490 }, { "epoch": 7.06, "grad_norm": 0.7665086984634399, "learning_rate": 5.883681535855449e-06, "loss": 0.6456, "step": 37500 }, { "epoch": 7.06, "grad_norm": 9.343612670898438, "learning_rate": 5.87991718426501e-06, "loss": 0.6331, "step": 37510 }, { "epoch": 7.06, "grad_norm": 9.377315521240234, "learning_rate": 5.876152832674572e-06, "loss": 0.8224, "step": 37520 }, { "epoch": 7.06, "grad_norm": 9.508060455322266, "learning_rate": 5.872388481084134e-06, "loss": 0.5394, "step": 37530 }, { "epoch": 7.07, "grad_norm": 46.93809509277344, "learning_rate": 5.8686241294936955e-06, "loss": 0.711, "step": 37540 }, { "epoch": 7.07, "grad_norm": 2.8255438804626465, "learning_rate": 5.864859777903257e-06, "loss": 0.5816, "step": 37550 }, { "epoch": 7.07, "grad_norm": 4.194993495941162, "learning_rate": 5.861095426312818e-06, "loss": 0.6035, "step": 37560 }, { "epoch": 7.07, "grad_norm": 21.790729522705078, "learning_rate": 5.85733107472238e-06, "loss": 0.6296, "step": 37570 }, { "epoch": 7.07, "grad_norm": 5.633280277252197, "learning_rate": 5.853566723131941e-06, "loss": 0.6071, 
"step": 37580 }, { "epoch": 7.08, "grad_norm": 4.44071626663208, "learning_rate": 5.849802371541502e-06, "loss": 0.8148, "step": 37590 }, { "epoch": 7.08, "grad_norm": 26.890914916992188, "learning_rate": 5.846038019951063e-06, "loss": 0.5454, "step": 37600 }, { "epoch": 7.08, "grad_norm": 4.100680828094482, "learning_rate": 5.842273668360625e-06, "loss": 0.6321, "step": 37610 }, { "epoch": 7.08, "grad_norm": 36.70380783081055, "learning_rate": 5.838509316770186e-06, "loss": 0.7428, "step": 37620 }, { "epoch": 7.08, "grad_norm": 1.9733837842941284, "learning_rate": 5.834744965179749e-06, "loss": 0.5637, "step": 37630 }, { "epoch": 7.08, "grad_norm": 19.125102996826172, "learning_rate": 5.83098061358931e-06, "loss": 0.8517, "step": 37640 }, { "epoch": 7.09, "grad_norm": 4.017688751220703, "learning_rate": 5.8272162619988715e-06, "loss": 0.7267, "step": 37650 }, { "epoch": 7.09, "grad_norm": 6.149882793426514, "learning_rate": 5.823451910408433e-06, "loss": 0.6231, "step": 37660 }, { "epoch": 7.09, "grad_norm": 27.88104248046875, "learning_rate": 5.8196875588179945e-06, "loss": 0.5032, "step": 37670 }, { "epoch": 7.09, "grad_norm": 10.182868957519531, "learning_rate": 5.815923207227555e-06, "loss": 0.594, "step": 37680 }, { "epoch": 7.09, "grad_norm": 19.947778701782227, "learning_rate": 5.8121588556371165e-06, "loss": 0.6697, "step": 37690 }, { "epoch": 7.1, "grad_norm": 14.21074104309082, "learning_rate": 5.808394504046678e-06, "loss": 0.6462, "step": 37700 }, { "epoch": 7.1, "grad_norm": 17.754087448120117, "learning_rate": 5.8046301524562395e-06, "loss": 0.7615, "step": 37710 }, { "epoch": 7.1, "grad_norm": 6.829981327056885, "learning_rate": 5.800865800865802e-06, "loss": 0.7823, "step": 37720 }, { "epoch": 7.1, "grad_norm": 10.880169868469238, "learning_rate": 5.797101449275363e-06, "loss": 0.6782, "step": 37730 }, { "epoch": 7.1, "grad_norm": 4.750525951385498, "learning_rate": 5.793337097684925e-06, "loss": 0.6291, "step": 37740 }, { "epoch": 7.11, "grad_norm": 16.698471069335938, "learning_rate": 5.789572746094486e-06, "loss": 1.0194, "step": 37750 }, { "epoch": 7.11, "grad_norm": 11.902446746826172, "learning_rate": 5.785808394504047e-06, "loss": 0.6339, "step": 37760 }, { "epoch": 7.11, "grad_norm": 15.144806861877441, "learning_rate": 5.782044042913608e-06, "loss": 0.7048, "step": 37770 }, { "epoch": 7.11, "grad_norm": 51.336021423339844, "learning_rate": 5.77827969132317e-06, "loss": 0.7162, "step": 37780 }, { "epoch": 7.11, "grad_norm": 16.857934951782227, "learning_rate": 5.774515339732731e-06, "loss": 1.1811, "step": 37790 }, { "epoch": 7.11, "grad_norm": 14.80729866027832, "learning_rate": 5.770750988142293e-06, "loss": 0.9633, "step": 37800 }, { "epoch": 7.12, "grad_norm": 2.4304816722869873, "learning_rate": 5.766986636551855e-06, "loss": 0.689, "step": 37810 }, { "epoch": 7.12, "grad_norm": 1.3563061952590942, "learning_rate": 5.763222284961416e-06, "loss": 0.7598, "step": 37820 }, { "epoch": 7.12, "grad_norm": 13.738055229187012, "learning_rate": 5.759457933370978e-06, "loss": 0.8376, "step": 37830 }, { "epoch": 7.12, "grad_norm": 17.411348342895508, "learning_rate": 5.755693581780539e-06, "loss": 0.6507, "step": 37840 }, { "epoch": 7.12, "grad_norm": 54.60939025878906, "learning_rate": 5.7519292301901e-06, "loss": 0.7383, "step": 37850 }, { "epoch": 7.13, "grad_norm": 3.097688674926758, "learning_rate": 5.748164878599661e-06, "loss": 0.6319, "step": 37860 }, { "epoch": 7.13, "grad_norm": 20.68455696105957, "learning_rate": 5.744400527009223e-06, "loss": 1.248, "step": 
37870 }, { "epoch": 7.13, "grad_norm": 3.750034809112549, "learning_rate": 5.740636175418784e-06, "loss": 0.5183, "step": 37880 }, { "epoch": 7.13, "grad_norm": 30.149545669555664, "learning_rate": 5.736871823828346e-06, "loss": 0.7653, "step": 37890 }, { "epoch": 7.13, "grad_norm": 29.93433952331543, "learning_rate": 5.733107472237908e-06, "loss": 0.6765, "step": 37900 }, { "epoch": 7.14, "grad_norm": 5.382635593414307, "learning_rate": 5.7293431206474695e-06, "loss": 0.5773, "step": 37910 }, { "epoch": 7.14, "grad_norm": 6.096846103668213, "learning_rate": 5.725578769057031e-06, "loss": 1.0427, "step": 37920 }, { "epoch": 7.14, "grad_norm": 13.132981300354004, "learning_rate": 5.7218144174665924e-06, "loss": 0.7288, "step": 37930 }, { "epoch": 7.14, "grad_norm": 1.4267334938049316, "learning_rate": 5.718050065876153e-06, "loss": 0.3888, "step": 37940 }, { "epoch": 7.14, "grad_norm": 6.637333393096924, "learning_rate": 5.7142857142857145e-06, "loss": 0.511, "step": 37950 }, { "epoch": 7.14, "grad_norm": 3.020350933074951, "learning_rate": 5.710521362695276e-06, "loss": 0.4792, "step": 37960 }, { "epoch": 7.15, "grad_norm": 1.084544062614441, "learning_rate": 5.7067570111048374e-06, "loss": 0.8395, "step": 37970 }, { "epoch": 7.15, "grad_norm": 1.648380160331726, "learning_rate": 5.702992659514399e-06, "loss": 0.5763, "step": 37980 }, { "epoch": 7.15, "grad_norm": 3.797377347946167, "learning_rate": 5.69922830792396e-06, "loss": 0.4637, "step": 37990 }, { "epoch": 7.15, "grad_norm": 11.527572631835938, "learning_rate": 5.695463956333523e-06, "loss": 0.5408, "step": 38000 }, { "epoch": 7.15, "grad_norm": 22.012502670288086, "learning_rate": 5.691699604743084e-06, "loss": 0.7001, "step": 38010 }, { "epoch": 7.16, "grad_norm": 3.207542896270752, "learning_rate": 5.687935253152645e-06, "loss": 0.6922, "step": 38020 }, { "epoch": 7.16, "grad_norm": 37.179073333740234, "learning_rate": 5.684170901562206e-06, "loss": 0.7452, "step": 38030 }, { "epoch": 7.16, "grad_norm": 24.80609703063965, "learning_rate": 5.680406549971768e-06, "loss": 0.4167, "step": 38040 }, { "epoch": 7.16, "grad_norm": 20.975614547729492, "learning_rate": 5.676642198381329e-06, "loss": 0.8115, "step": 38050 }, { "epoch": 7.16, "grad_norm": 28.004859924316406, "learning_rate": 5.672877846790891e-06, "loss": 0.8096, "step": 38060 }, { "epoch": 7.17, "grad_norm": 10.752762794494629, "learning_rate": 5.669113495200452e-06, "loss": 0.7656, "step": 38070 }, { "epoch": 7.17, "grad_norm": 7.093476295471191, "learning_rate": 5.6653491436100135e-06, "loss": 0.7268, "step": 38080 }, { "epoch": 7.17, "grad_norm": 1.752172589302063, "learning_rate": 5.661584792019576e-06, "loss": 0.6596, "step": 38090 }, { "epoch": 7.17, "grad_norm": 11.936467170715332, "learning_rate": 5.657820440429137e-06, "loss": 0.4564, "step": 38100 }, { "epoch": 7.17, "grad_norm": 18.661617279052734, "learning_rate": 5.654056088838698e-06, "loss": 1.1641, "step": 38110 }, { "epoch": 7.17, "grad_norm": 5.597770690917969, "learning_rate": 5.650291737248259e-06, "loss": 0.5775, "step": 38120 }, { "epoch": 7.18, "grad_norm": 2.4199938774108887, "learning_rate": 5.646527385657821e-06, "loss": 0.7135, "step": 38130 }, { "epoch": 7.18, "grad_norm": 11.700885772705078, "learning_rate": 5.642763034067382e-06, "loss": 0.8787, "step": 38140 }, { "epoch": 7.18, "grad_norm": 8.688162803649902, "learning_rate": 5.638998682476944e-06, "loss": 0.9519, "step": 38150 }, { "epoch": 7.18, "grad_norm": 25.277379989624023, "learning_rate": 5.635234330886505e-06, "loss": 0.6528, "step": 
38160 }, { "epoch": 7.18, "grad_norm": 2.77447509765625, "learning_rate": 5.631469979296067e-06, "loss": 0.3564, "step": 38170 }, { "epoch": 7.19, "grad_norm": 10.325285911560059, "learning_rate": 5.627705627705629e-06, "loss": 0.7461, "step": 38180 }, { "epoch": 7.19, "grad_norm": 27.334566116333008, "learning_rate": 5.62394127611519e-06, "loss": 0.9761, "step": 38190 }, { "epoch": 7.19, "grad_norm": 6.265417575836182, "learning_rate": 5.620176924524751e-06, "loss": 0.8741, "step": 38200 }, { "epoch": 7.19, "grad_norm": 4.774230003356934, "learning_rate": 5.6164125729343125e-06, "loss": 0.8001, "step": 38210 }, { "epoch": 7.19, "grad_norm": 12.588274002075195, "learning_rate": 5.612648221343874e-06, "loss": 0.8632, "step": 38220 }, { "epoch": 7.2, "grad_norm": 10.674922943115234, "learning_rate": 5.608883869753435e-06, "loss": 0.8803, "step": 38230 }, { "epoch": 7.2, "grad_norm": 9.436485290527344, "learning_rate": 5.605119518162997e-06, "loss": 0.9266, "step": 38240 }, { "epoch": 7.2, "grad_norm": 15.605271339416504, "learning_rate": 5.601355166572558e-06, "loss": 0.6588, "step": 38250 }, { "epoch": 7.2, "grad_norm": 11.084684371948242, "learning_rate": 5.59759081498212e-06, "loss": 0.8687, "step": 38260 }, { "epoch": 7.2, "grad_norm": 2.5850093364715576, "learning_rate": 5.593826463391681e-06, "loss": 0.7161, "step": 38270 }, { "epoch": 7.2, "grad_norm": 4.913285255432129, "learning_rate": 5.590062111801242e-06, "loss": 0.6563, "step": 38280 }, { "epoch": 7.21, "grad_norm": 7.006165504455566, "learning_rate": 5.586297760210803e-06, "loss": 0.5538, "step": 38290 }, { "epoch": 7.21, "grad_norm": 5.03699254989624, "learning_rate": 5.582533408620366e-06, "loss": 0.6205, "step": 38300 }, { "epoch": 7.21, "grad_norm": 7.754453659057617, "learning_rate": 5.578769057029927e-06, "loss": 0.6308, "step": 38310 }, { "epoch": 7.21, "grad_norm": 12.910945892333984, "learning_rate": 5.5750047054394886e-06, "loss": 0.8673, "step": 38320 }, { "epoch": 7.21, "grad_norm": 21.789987564086914, "learning_rate": 5.57124035384905e-06, "loss": 0.8006, "step": 38330 }, { "epoch": 7.22, "grad_norm": 9.305120468139648, "learning_rate": 5.5674760022586115e-06, "loss": 0.4462, "step": 38340 }, { "epoch": 7.22, "grad_norm": 2.792790651321411, "learning_rate": 5.563711650668173e-06, "loss": 0.7472, "step": 38350 }, { "epoch": 7.22, "grad_norm": 7.392334938049316, "learning_rate": 5.559947299077734e-06, "loss": 0.6038, "step": 38360 }, { "epoch": 7.22, "grad_norm": 5.9524431228637695, "learning_rate": 5.556182947487295e-06, "loss": 0.536, "step": 38370 }, { "epoch": 7.22, "grad_norm": 58.191078186035156, "learning_rate": 5.5524185958968565e-06, "loss": 0.6872, "step": 38380 }, { "epoch": 7.23, "grad_norm": 15.748824119567871, "learning_rate": 5.548654244306419e-06, "loss": 0.3699, "step": 38390 }, { "epoch": 7.23, "grad_norm": 11.440925598144531, "learning_rate": 5.54488989271598e-06, "loss": 0.6539, "step": 38400 }, { "epoch": 7.23, "grad_norm": 20.08065414428711, "learning_rate": 5.541125541125542e-06, "loss": 0.6942, "step": 38410 }, { "epoch": 7.23, "grad_norm": 15.032546997070312, "learning_rate": 5.537361189535103e-06, "loss": 0.8726, "step": 38420 }, { "epoch": 7.23, "grad_norm": 15.498872756958008, "learning_rate": 5.533596837944665e-06, "loss": 0.6084, "step": 38430 }, { "epoch": 7.24, "grad_norm": 20.899972915649414, "learning_rate": 5.529832486354226e-06, "loss": 0.7703, "step": 38440 }, { "epoch": 7.24, "grad_norm": 1.408137559890747, "learning_rate": 5.5260681347637875e-06, "loss": 0.9571, "step": 38450 }, 
{ "epoch": 7.24, "grad_norm": 72.17916107177734, "learning_rate": 5.522303783173348e-06, "loss": 0.9278, "step": 38460 }, { "epoch": 7.24, "grad_norm": 8.472760200500488, "learning_rate": 5.51853943158291e-06, "loss": 0.8991, "step": 38470 }, { "epoch": 7.24, "grad_norm": 5.227403163909912, "learning_rate": 5.514775079992472e-06, "loss": 0.8767, "step": 38480 }, { "epoch": 7.24, "grad_norm": 6.581358432769775, "learning_rate": 5.511010728402033e-06, "loss": 0.6647, "step": 38490 }, { "epoch": 7.25, "grad_norm": 15.244298934936523, "learning_rate": 5.507246376811595e-06, "loss": 0.739, "step": 38500 }, { "epoch": 7.25, "grad_norm": 19.761615753173828, "learning_rate": 5.503482025221156e-06, "loss": 0.5769, "step": 38510 }, { "epoch": 7.25, "grad_norm": 3.388542890548706, "learning_rate": 5.499717673630718e-06, "loss": 0.6245, "step": 38520 }, { "epoch": 7.25, "grad_norm": 7.93393087387085, "learning_rate": 5.495953322040279e-06, "loss": 0.5916, "step": 38530 }, { "epoch": 7.25, "grad_norm": 4.061150074005127, "learning_rate": 5.492188970449841e-06, "loss": 0.6758, "step": 38540 }, { "epoch": 7.26, "grad_norm": 4.659008026123047, "learning_rate": 5.488424618859401e-06, "loss": 0.6164, "step": 38550 }, { "epoch": 7.26, "grad_norm": 15.835151672363281, "learning_rate": 5.484660267268963e-06, "loss": 0.6429, "step": 38560 }, { "epoch": 7.26, "grad_norm": 1.9797465801239014, "learning_rate": 5.480895915678524e-06, "loss": 0.8774, "step": 38570 }, { "epoch": 7.26, "grad_norm": 31.371129989624023, "learning_rate": 5.4771315640880865e-06, "loss": 0.5355, "step": 38580 }, { "epoch": 7.26, "grad_norm": 5.428137302398682, "learning_rate": 5.473367212497648e-06, "loss": 0.4431, "step": 38590 }, { "epoch": 7.27, "grad_norm": 10.931573867797852, "learning_rate": 5.4696028609072095e-06, "loss": 0.7355, "step": 38600 }, { "epoch": 7.27, "grad_norm": 1.8622735738754272, "learning_rate": 5.465838509316771e-06, "loss": 0.6643, "step": 38610 }, { "epoch": 7.27, "grad_norm": 1.4323937892913818, "learning_rate": 5.462074157726332e-06, "loss": 0.6274, "step": 38620 }, { "epoch": 7.27, "grad_norm": 3.692552328109741, "learning_rate": 5.458309806135893e-06, "loss": 0.5755, "step": 38630 }, { "epoch": 7.27, "grad_norm": 8.578022956848145, "learning_rate": 5.4545454545454545e-06, "loss": 0.4808, "step": 38640 }, { "epoch": 7.27, "grad_norm": 13.024136543273926, "learning_rate": 5.450781102955016e-06, "loss": 0.6768, "step": 38650 }, { "epoch": 7.28, "grad_norm": 21.61022186279297, "learning_rate": 5.447016751364577e-06, "loss": 0.519, "step": 38660 }, { "epoch": 7.28, "grad_norm": 44.52021789550781, "learning_rate": 5.44325239977414e-06, "loss": 0.3268, "step": 38670 }, { "epoch": 7.28, "grad_norm": 15.391091346740723, "learning_rate": 5.439488048183701e-06, "loss": 0.7366, "step": 38680 }, { "epoch": 7.28, "grad_norm": 23.698593139648438, "learning_rate": 5.435723696593263e-06, "loss": 0.8, "step": 38690 }, { "epoch": 7.28, "grad_norm": 7.603209972381592, "learning_rate": 5.431959345002824e-06, "loss": 0.8654, "step": 38700 }, { "epoch": 7.29, "grad_norm": 15.161066055297852, "learning_rate": 5.4281949934123855e-06, "loss": 0.7727, "step": 38710 }, { "epoch": 7.29, "grad_norm": 15.844622611999512, "learning_rate": 5.424430641821946e-06, "loss": 0.3999, "step": 38720 }, { "epoch": 7.29, "grad_norm": 10.869279861450195, "learning_rate": 5.420666290231508e-06, "loss": 0.3255, "step": 38730 }, { "epoch": 7.29, "grad_norm": 7.97806453704834, "learning_rate": 5.416901938641069e-06, "loss": 0.6407, "step": 38740 }, { 
"epoch": 7.29, "grad_norm": 11.535964012145996, "learning_rate": 5.4131375870506305e-06, "loss": 0.8045, "step": 38750 }, { "epoch": 7.3, "grad_norm": 50.705875396728516, "learning_rate": 5.409373235460193e-06, "loss": 0.7047, "step": 38760 }, { "epoch": 7.3, "grad_norm": 8.88627815246582, "learning_rate": 5.405608883869754e-06, "loss": 0.7027, "step": 38770 }, { "epoch": 7.3, "grad_norm": 6.255423545837402, "learning_rate": 5.401844532279316e-06, "loss": 0.754, "step": 38780 }, { "epoch": 7.3, "grad_norm": 7.3560895919799805, "learning_rate": 5.398080180688877e-06, "loss": 0.77, "step": 38790 }, { "epoch": 7.3, "grad_norm": 24.02309799194336, "learning_rate": 5.394315829098439e-06, "loss": 0.6988, "step": 38800 }, { "epoch": 7.3, "grad_norm": 9.047237396240234, "learning_rate": 5.390551477507999e-06, "loss": 0.6286, "step": 38810 }, { "epoch": 7.31, "grad_norm": 8.932085037231445, "learning_rate": 5.386787125917561e-06, "loss": 0.6704, "step": 38820 }, { "epoch": 7.31, "grad_norm": 13.834983825683594, "learning_rate": 5.383022774327122e-06, "loss": 0.4488, "step": 38830 }, { "epoch": 7.31, "grad_norm": 0.9087099432945251, "learning_rate": 5.379258422736684e-06, "loss": 0.4888, "step": 38840 }, { "epoch": 7.31, "grad_norm": 5.027251720428467, "learning_rate": 5.375494071146246e-06, "loss": 0.9245, "step": 38850 }, { "epoch": 7.31, "grad_norm": 16.71733856201172, "learning_rate": 5.3717297195558074e-06, "loss": 0.9299, "step": 38860 }, { "epoch": 7.32, "grad_norm": 6.607717990875244, "learning_rate": 5.367965367965369e-06, "loss": 1.0644, "step": 38870 }, { "epoch": 7.32, "grad_norm": 4.350011348724365, "learning_rate": 5.36420101637493e-06, "loss": 0.8894, "step": 38880 }, { "epoch": 7.32, "grad_norm": 8.624796867370605, "learning_rate": 5.360436664784491e-06, "loss": 0.6719, "step": 38890 }, { "epoch": 7.32, "grad_norm": 6.820374965667725, "learning_rate": 5.3566723131940524e-06, "loss": 0.545, "step": 38900 }, { "epoch": 7.32, "grad_norm": 8.066164016723633, "learning_rate": 5.352907961603614e-06, "loss": 0.7082, "step": 38910 }, { "epoch": 7.33, "grad_norm": 9.551775932312012, "learning_rate": 5.349143610013175e-06, "loss": 1.0106, "step": 38920 }, { "epoch": 7.33, "grad_norm": 2.4853885173797607, "learning_rate": 5.345379258422737e-06, "loss": 0.5258, "step": 38930 }, { "epoch": 7.33, "grad_norm": 0.25415197014808655, "learning_rate": 5.341614906832298e-06, "loss": 0.9771, "step": 38940 }, { "epoch": 7.33, "grad_norm": 23.860332489013672, "learning_rate": 5.337850555241861e-06, "loss": 0.8284, "step": 38950 }, { "epoch": 7.33, "grad_norm": 27.231725692749023, "learning_rate": 5.334086203651422e-06, "loss": 0.5045, "step": 38960 }, { "epoch": 7.33, "grad_norm": 12.43143081665039, "learning_rate": 5.3303218520609835e-06, "loss": 0.9507, "step": 38970 }, { "epoch": 7.34, "grad_norm": 3.9006149768829346, "learning_rate": 5.326557500470544e-06, "loss": 0.4446, "step": 38980 }, { "epoch": 7.34, "grad_norm": 26.314966201782227, "learning_rate": 5.322793148880106e-06, "loss": 0.6039, "step": 38990 }, { "epoch": 7.34, "grad_norm": 6.220264911651611, "learning_rate": 5.319028797289667e-06, "loss": 0.5351, "step": 39000 }, { "epoch": 7.34, "grad_norm": 4.4015045166015625, "learning_rate": 5.3152644456992285e-06, "loss": 0.2832, "step": 39010 }, { "epoch": 7.34, "grad_norm": 24.468585968017578, "learning_rate": 5.31150009410879e-06, "loss": 0.6182, "step": 39020 }, { "epoch": 7.35, "grad_norm": 4.152024269104004, "learning_rate": 5.307735742518351e-06, "loss": 0.8199, "step": 39030 }, { "epoch": 
7.35, "grad_norm": 10.165727615356445, "learning_rate": 5.303971390927914e-06, "loss": 0.6839, "step": 39040 }, { "epoch": 7.35, "grad_norm": 13.230927467346191, "learning_rate": 5.300207039337475e-06, "loss": 0.6527, "step": 39050 }, { "epoch": 7.35, "grad_norm": 21.92257308959961, "learning_rate": 5.296442687747037e-06, "loss": 0.7811, "step": 39060 }, { "epoch": 7.35, "grad_norm": 11.228647232055664, "learning_rate": 5.292678336156597e-06, "loss": 1.0498, "step": 39070 }, { "epoch": 7.36, "grad_norm": 8.592232704162598, "learning_rate": 5.288913984566159e-06, "loss": 0.7423, "step": 39080 }, { "epoch": 7.36, "grad_norm": 16.812410354614258, "learning_rate": 5.28514963297572e-06, "loss": 0.8431, "step": 39090 }, { "epoch": 7.36, "grad_norm": 13.05716609954834, "learning_rate": 5.281385281385282e-06, "loss": 0.775, "step": 39100 }, { "epoch": 7.36, "grad_norm": 4.007587432861328, "learning_rate": 5.277620929794843e-06, "loss": 0.3927, "step": 39110 }, { "epoch": 7.36, "grad_norm": 15.15101432800293, "learning_rate": 5.2738565782044046e-06, "loss": 0.7852, "step": 39120 }, { "epoch": 7.36, "grad_norm": 34.69432830810547, "learning_rate": 5.270092226613967e-06, "loss": 0.9307, "step": 39130 }, { "epoch": 7.37, "grad_norm": 17.020305633544922, "learning_rate": 5.266327875023528e-06, "loss": 0.6162, "step": 39140 }, { "epoch": 7.37, "grad_norm": 2.787398338317871, "learning_rate": 5.26256352343309e-06, "loss": 0.633, "step": 39150 }, { "epoch": 7.37, "grad_norm": 23.109773635864258, "learning_rate": 5.25879917184265e-06, "loss": 0.5961, "step": 39160 }, { "epoch": 7.37, "grad_norm": 9.522040367126465, "learning_rate": 5.255034820252212e-06, "loss": 0.9519, "step": 39170 }, { "epoch": 7.37, "grad_norm": 5.839687824249268, "learning_rate": 5.251270468661773e-06, "loss": 0.7439, "step": 39180 }, { "epoch": 7.38, "grad_norm": 3.285266399383545, "learning_rate": 5.247506117071335e-06, "loss": 0.3935, "step": 39190 }, { "epoch": 7.38, "grad_norm": 25.472768783569336, "learning_rate": 5.243741765480896e-06, "loss": 0.6537, "step": 39200 }, { "epoch": 7.38, "grad_norm": 8.704800605773926, "learning_rate": 5.239977413890458e-06, "loss": 0.7204, "step": 39210 }, { "epoch": 7.38, "grad_norm": 3.25404953956604, "learning_rate": 5.236213062300019e-06, "loss": 0.5959, "step": 39220 }, { "epoch": 7.38, "grad_norm": 9.147116661071777, "learning_rate": 5.2324487107095815e-06, "loss": 0.507, "step": 39230 }, { "epoch": 7.39, "grad_norm": 35.31050109863281, "learning_rate": 5.228684359119141e-06, "loss": 0.9105, "step": 39240 }, { "epoch": 7.39, "grad_norm": 2.140300989151001, "learning_rate": 5.2249200075287036e-06, "loss": 0.7936, "step": 39250 }, { "epoch": 7.39, "grad_norm": 9.016438484191895, "learning_rate": 5.221155655938265e-06, "loss": 0.762, "step": 39260 }, { "epoch": 7.39, "grad_norm": 9.740910530090332, "learning_rate": 5.2173913043478265e-06, "loss": 0.6386, "step": 39270 }, { "epoch": 7.39, "grad_norm": 6.554019451141357, "learning_rate": 5.213626952757388e-06, "loss": 1.1374, "step": 39280 }, { "epoch": 7.4, "grad_norm": 11.774321556091309, "learning_rate": 5.209862601166949e-06, "loss": 0.5878, "step": 39290 }, { "epoch": 7.4, "grad_norm": 9.32991886138916, "learning_rate": 5.206098249576511e-06, "loss": 0.659, "step": 39300 }, { "epoch": 7.4, "grad_norm": 4.6589460372924805, "learning_rate": 5.202333897986072e-06, "loss": 0.5913, "step": 39310 }, { "epoch": 7.4, "grad_norm": 7.4741716384887695, "learning_rate": 5.198569546395635e-06, "loss": 0.9489, "step": 39320 }, { "epoch": 7.4, 
"grad_norm": 17.66104507446289, "learning_rate": 5.194805194805194e-06, "loss": 0.6698, "step": 39330 }, { "epoch": 7.4, "grad_norm": 3.7738263607025146, "learning_rate": 5.191040843214757e-06, "loss": 0.621, "step": 39340 }, { "epoch": 7.41, "grad_norm": 9.111323356628418, "learning_rate": 5.187276491624318e-06, "loss": 0.7562, "step": 39350 }, { "epoch": 7.41, "grad_norm": 3.694535255432129, "learning_rate": 5.18351214003388e-06, "loss": 0.5704, "step": 39360 }, { "epoch": 7.41, "grad_norm": 13.665548324584961, "learning_rate": 5.179747788443441e-06, "loss": 0.7916, "step": 39370 }, { "epoch": 7.41, "grad_norm": 6.363194942474365, "learning_rate": 5.1759834368530025e-06, "loss": 0.6792, "step": 39380 }, { "epoch": 7.41, "grad_norm": 18.37224578857422, "learning_rate": 5.172219085262564e-06, "loss": 0.6757, "step": 39390 }, { "epoch": 7.42, "grad_norm": 11.329370498657227, "learning_rate": 5.1684547336721255e-06, "loss": 0.6348, "step": 39400 }, { "epoch": 7.42, "grad_norm": 8.472770690917969, "learning_rate": 5.164690382081688e-06, "loss": 0.4816, "step": 39410 }, { "epoch": 7.42, "grad_norm": 8.334856986999512, "learning_rate": 5.1609260304912475e-06, "loss": 0.5467, "step": 39420 }, { "epoch": 7.42, "grad_norm": 57.72010040283203, "learning_rate": 5.15716167890081e-06, "loss": 0.5458, "step": 39430 }, { "epoch": 7.42, "grad_norm": 7.276113510131836, "learning_rate": 5.153397327310371e-06, "loss": 0.6807, "step": 39440 }, { "epoch": 7.43, "grad_norm": 5.335689067840576, "learning_rate": 5.149632975719933e-06, "loss": 0.739, "step": 39450 }, { "epoch": 7.43, "grad_norm": 4.246729850769043, "learning_rate": 5.145868624129494e-06, "loss": 0.6396, "step": 39460 }, { "epoch": 7.43, "grad_norm": 26.549583435058594, "learning_rate": 5.142104272539056e-06, "loss": 0.734, "step": 39470 }, { "epoch": 7.43, "grad_norm": 21.118213653564453, "learning_rate": 5.138339920948617e-06, "loss": 0.7594, "step": 39480 }, { "epoch": 7.43, "grad_norm": 20.760250091552734, "learning_rate": 5.134575569358179e-06, "loss": 0.4326, "step": 39490 }, { "epoch": 7.43, "grad_norm": 1.7645206451416016, "learning_rate": 5.130811217767739e-06, "loss": 0.8093, "step": 39500 }, { "epoch": 7.44, "grad_norm": 8.050736427307129, "learning_rate": 5.127046866177301e-06, "loss": 0.552, "step": 39510 }, { "epoch": 7.44, "grad_norm": 4.024702072143555, "learning_rate": 5.123282514586862e-06, "loss": 0.4428, "step": 39520 }, { "epoch": 7.44, "grad_norm": 24.897802352905273, "learning_rate": 5.1195181629964245e-06, "loss": 0.9756, "step": 39530 }, { "epoch": 7.44, "grad_norm": 7.027749061584473, "learning_rate": 5.115753811405986e-06, "loss": 0.8593, "step": 39540 }, { "epoch": 7.44, "grad_norm": 5.2921247482299805, "learning_rate": 5.111989459815547e-06, "loss": 0.5476, "step": 39550 }, { "epoch": 7.45, "grad_norm": 12.500511169433594, "learning_rate": 5.108225108225109e-06, "loss": 0.5872, "step": 39560 }, { "epoch": 7.45, "grad_norm": 16.93218994140625, "learning_rate": 5.10446075663467e-06, "loss": 0.7882, "step": 39570 }, { "epoch": 7.45, "grad_norm": 50.98332977294922, "learning_rate": 5.100696405044232e-06, "loss": 0.4918, "step": 39580 }, { "epoch": 7.45, "grad_norm": 1.9770981073379517, "learning_rate": 5.096932053453792e-06, "loss": 0.6862, "step": 39590 }, { "epoch": 7.45, "grad_norm": 3.6896109580993652, "learning_rate": 5.093167701863354e-06, "loss": 0.7951, "step": 39600 }, { "epoch": 7.46, "grad_norm": 22.211389541625977, "learning_rate": 5.089403350272915e-06, "loss": 0.6522, "step": 39610 }, { "epoch": 7.46, 
"grad_norm": 15.687029838562012, "learning_rate": 5.085638998682478e-06, "loss": 0.7902, "step": 39620 }, { "epoch": 7.46, "grad_norm": 10.344719886779785, "learning_rate": 5.081874647092039e-06, "loss": 0.851, "step": 39630 }, { "epoch": 7.46, "grad_norm": 12.113409996032715, "learning_rate": 5.0781102955016005e-06, "loss": 0.5317, "step": 39640 }, { "epoch": 7.46, "grad_norm": 4.6147918701171875, "learning_rate": 5.074345943911162e-06, "loss": 0.5423, "step": 39650 }, { "epoch": 7.46, "grad_norm": 8.96312141418457, "learning_rate": 5.0705815923207234e-06, "loss": 1.1695, "step": 39660 }, { "epoch": 7.47, "grad_norm": 21.657495498657227, "learning_rate": 5.066817240730285e-06, "loss": 0.6037, "step": 39670 }, { "epoch": 7.47, "grad_norm": 8.477386474609375, "learning_rate": 5.0630528891398455e-06, "loss": 0.7301, "step": 39680 }, { "epoch": 7.47, "grad_norm": 23.125459671020508, "learning_rate": 5.059288537549407e-06, "loss": 0.67, "step": 39690 }, { "epoch": 7.47, "grad_norm": 12.788286209106445, "learning_rate": 5.0555241859589684e-06, "loss": 0.8835, "step": 39700 }, { "epoch": 7.47, "grad_norm": 21.63763999938965, "learning_rate": 5.051759834368531e-06, "loss": 0.9294, "step": 39710 }, { "epoch": 7.48, "grad_norm": 23.287736892700195, "learning_rate": 5.047995482778092e-06, "loss": 0.3314, "step": 39720 }, { "epoch": 7.48, "grad_norm": 6.8206467628479, "learning_rate": 5.044231131187654e-06, "loss": 0.6189, "step": 39730 }, { "epoch": 7.48, "grad_norm": 19.416696548461914, "learning_rate": 5.040466779597215e-06, "loss": 0.6216, "step": 39740 }, { "epoch": 7.48, "grad_norm": 15.730079650878906, "learning_rate": 5.036702428006777e-06, "loss": 0.4919, "step": 39750 }, { "epoch": 7.48, "grad_norm": 22.439964294433594, "learning_rate": 5.032938076416338e-06, "loss": 0.6435, "step": 39760 }, { "epoch": 7.49, "grad_norm": 5.978825569152832, "learning_rate": 5.029173724825899e-06, "loss": 0.5197, "step": 39770 }, { "epoch": 7.49, "grad_norm": 6.658477306365967, "learning_rate": 5.02540937323546e-06, "loss": 0.7042, "step": 39780 }, { "epoch": 7.49, "grad_norm": 2.1028757095336914, "learning_rate": 5.021645021645022e-06, "loss": 0.6909, "step": 39790 }, { "epoch": 7.49, "grad_norm": 26.77193260192871, "learning_rate": 5.017880670054583e-06, "loss": 0.8228, "step": 39800 }, { "epoch": 7.49, "grad_norm": 38.920623779296875, "learning_rate": 5.014116318464145e-06, "loss": 0.5459, "step": 39810 }, { "epoch": 7.49, "grad_norm": 8.325672149658203, "learning_rate": 5.010351966873707e-06, "loss": 0.6531, "step": 39820 }, { "epoch": 7.5, "grad_norm": 5.312551498413086, "learning_rate": 5.006587615283268e-06, "loss": 0.5324, "step": 39830 }, { "epoch": 7.5, "grad_norm": 15.660419464111328, "learning_rate": 5.00282326369283e-06, "loss": 0.5293, "step": 39840 }, { "epoch": 7.5, "grad_norm": 7.468656539916992, "learning_rate": 4.999058912102391e-06, "loss": 0.755, "step": 39850 }, { "epoch": 7.5, "grad_norm": 4.447936534881592, "learning_rate": 4.995294560511953e-06, "loss": 0.5967, "step": 39860 }, { "epoch": 7.5, "grad_norm": 6.53826904296875, "learning_rate": 4.991530208921513e-06, "loss": 0.4897, "step": 39870 }, { "epoch": 7.51, "grad_norm": 26.313703536987305, "learning_rate": 4.987765857331075e-06, "loss": 0.7384, "step": 39880 }, { "epoch": 7.51, "grad_norm": 9.259058952331543, "learning_rate": 4.984001505740636e-06, "loss": 0.5892, "step": 39890 }, { "epoch": 7.51, "grad_norm": 11.453961372375488, "learning_rate": 4.9802371541501985e-06, "loss": 0.709, "step": 39900 }, { "epoch": 7.51, 
"grad_norm": 6.938704967498779, "learning_rate": 4.97647280255976e-06, "loss": 0.8085, "step": 39910 }, { "epoch": 7.51, "grad_norm": 9.102197647094727, "learning_rate": 4.9727084509693206e-06, "loss": 0.7154, "step": 39920 }, { "epoch": 7.52, "grad_norm": 2.2153682708740234, "learning_rate": 4.968944099378882e-06, "loss": 0.2806, "step": 39930 }, { "epoch": 7.52, "grad_norm": 7.025110721588135, "learning_rate": 4.9651797477884435e-06, "loss": 0.521, "step": 39940 }, { "epoch": 7.52, "grad_norm": 7.886729717254639, "learning_rate": 4.961415396198006e-06, "loss": 1.0426, "step": 39950 }, { "epoch": 7.52, "grad_norm": 8.98823356628418, "learning_rate": 4.957651044607566e-06, "loss": 0.6756, "step": 39960 }, { "epoch": 7.52, "grad_norm": 4.276978015899658, "learning_rate": 4.953886693017128e-06, "loss": 0.7381, "step": 39970 }, { "epoch": 7.52, "grad_norm": 30.311508178710938, "learning_rate": 4.950122341426689e-06, "loss": 0.6359, "step": 39980 }, { "epoch": 7.53, "grad_norm": 1.7316584587097168, "learning_rate": 4.946357989836252e-06, "loss": 0.2928, "step": 39990 }, { "epoch": 7.53, "grad_norm": 10.383225440979004, "learning_rate": 4.942593638245812e-06, "loss": 0.9519, "step": 40000 }, { "epoch": 7.53, "grad_norm": 6.726780414581299, "learning_rate": 4.938829286655374e-06, "loss": 0.7285, "step": 40010 }, { "epoch": 7.53, "grad_norm": 14.731858253479004, "learning_rate": 4.935064935064935e-06, "loss": 0.7152, "step": 40020 }, { "epoch": 7.53, "grad_norm": 4.943718433380127, "learning_rate": 4.931300583474497e-06, "loss": 0.5914, "step": 40030 }, { "epoch": 7.54, "grad_norm": 23.996057510375977, "learning_rate": 4.927536231884059e-06, "loss": 0.6292, "step": 40040 }, { "epoch": 7.54, "grad_norm": 3.7692580223083496, "learning_rate": 4.9237718802936196e-06, "loss": 0.5808, "step": 40050 }, { "epoch": 7.54, "grad_norm": 9.744156837463379, "learning_rate": 4.920007528703181e-06, "loss": 0.4698, "step": 40060 }, { "epoch": 7.54, "grad_norm": 9.300414085388184, "learning_rate": 4.9162431771127425e-06, "loss": 0.8142, "step": 40070 }, { "epoch": 7.54, "grad_norm": 39.78608322143555, "learning_rate": 4.912478825522305e-06, "loss": 0.6068, "step": 40080 }, { "epoch": 7.55, "grad_norm": 2.285921573638916, "learning_rate": 4.908714473931865e-06, "loss": 0.495, "step": 40090 }, { "epoch": 7.55, "grad_norm": 1.753082275390625, "learning_rate": 4.904950122341427e-06, "loss": 0.4074, "step": 40100 }, { "epoch": 7.55, "grad_norm": 4.920131206512451, "learning_rate": 4.901185770750988e-06, "loss": 0.5714, "step": 40110 }, { "epoch": 7.55, "grad_norm": 2.5696611404418945, "learning_rate": 4.89742141916055e-06, "loss": 0.6695, "step": 40120 }, { "epoch": 7.55, "grad_norm": 10.178996086120605, "learning_rate": 4.893657067570112e-06, "loss": 0.6704, "step": 40130 }, { "epoch": 7.56, "grad_norm": 46.131134033203125, "learning_rate": 4.889892715979673e-06, "loss": 0.8476, "step": 40140 }, { "epoch": 7.56, "grad_norm": 7.363309860229492, "learning_rate": 4.886128364389234e-06, "loss": 0.7517, "step": 40150 }, { "epoch": 7.56, "grad_norm": 85.74574279785156, "learning_rate": 4.882364012798796e-06, "loss": 0.7572, "step": 40160 }, { "epoch": 7.56, "grad_norm": 26.763887405395508, "learning_rate": 4.878599661208357e-06, "loss": 1.0535, "step": 40170 }, { "epoch": 7.56, "grad_norm": 2.287273406982422, "learning_rate": 4.8748353096179186e-06, "loss": 0.7071, "step": 40180 }, { "epoch": 7.56, "grad_norm": 9.184791564941406, "learning_rate": 4.87107095802748e-06, "loss": 0.5359, "step": 40190 }, { "epoch": 7.57, 
"grad_norm": 2.4340195655822754, "learning_rate": 4.8673066064370415e-06, "loss": 0.7074, "step": 40200 }, { "epoch": 7.57, "grad_norm": 5.408249378204346, "learning_rate": 4.863542254846603e-06, "loss": 0.6437, "step": 40210 }, { "epoch": 7.57, "grad_norm": 24.10207748413086, "learning_rate": 4.859777903256164e-06, "loss": 0.7965, "step": 40220 }, { "epoch": 7.57, "grad_norm": 7.04209566116333, "learning_rate": 4.856013551665726e-06, "loss": 0.8191, "step": 40230 }, { "epoch": 7.57, "grad_norm": 12.735274314880371, "learning_rate": 4.852249200075287e-06, "loss": 0.6039, "step": 40240 }, { "epoch": 7.58, "grad_norm": 1.7954115867614746, "learning_rate": 4.848484848484849e-06, "loss": 0.7864, "step": 40250 }, { "epoch": 7.58, "grad_norm": 29.247159957885742, "learning_rate": 4.84472049689441e-06, "loss": 0.4797, "step": 40260 }, { "epoch": 7.58, "grad_norm": 5.017600059509277, "learning_rate": 4.840956145303972e-06, "loss": 0.8047, "step": 40270 }, { "epoch": 7.58, "grad_norm": 8.275792121887207, "learning_rate": 4.837191793713533e-06, "loss": 0.5685, "step": 40280 }, { "epoch": 7.58, "grad_norm": 10.087935447692871, "learning_rate": 4.833427442123095e-06, "loss": 0.6682, "step": 40290 }, { "epoch": 7.59, "grad_norm": 8.961991310119629, "learning_rate": 4.829663090532656e-06, "loss": 0.8516, "step": 40300 }, { "epoch": 7.59, "grad_norm": 13.71435832977295, "learning_rate": 4.8258987389422175e-06, "loss": 0.3818, "step": 40310 }, { "epoch": 7.59, "grad_norm": 8.743640899658203, "learning_rate": 4.822134387351779e-06, "loss": 0.585, "step": 40320 }, { "epoch": 7.59, "grad_norm": 20.268667221069336, "learning_rate": 4.8183700357613405e-06, "loss": 0.7012, "step": 40330 }, { "epoch": 7.59, "grad_norm": 3.453274726867676, "learning_rate": 4.814605684170902e-06, "loss": 0.7444, "step": 40340 }, { "epoch": 7.59, "grad_norm": 1.207269549369812, "learning_rate": 4.810841332580463e-06, "loss": 0.7123, "step": 40350 }, { "epoch": 7.6, "grad_norm": 1.340839147567749, "learning_rate": 4.807076980990025e-06, "loss": 0.5855, "step": 40360 }, { "epoch": 7.6, "grad_norm": 13.67527961730957, "learning_rate": 4.803312629399586e-06, "loss": 0.6018, "step": 40370 }, { "epoch": 7.6, "grad_norm": 15.73237133026123, "learning_rate": 4.799548277809148e-06, "loss": 0.5691, "step": 40380 }, { "epoch": 7.6, "grad_norm": 9.629433631896973, "learning_rate": 4.795783926218709e-06, "loss": 0.5859, "step": 40390 }, { "epoch": 7.6, "grad_norm": 31.861783981323242, "learning_rate": 4.792019574628271e-06, "loss": 0.6733, "step": 40400 }, { "epoch": 7.61, "grad_norm": 18.899389266967773, "learning_rate": 4.788255223037832e-06, "loss": 0.6046, "step": 40410 }, { "epoch": 7.61, "grad_norm": 14.72265338897705, "learning_rate": 4.784490871447394e-06, "loss": 0.7938, "step": 40420 }, { "epoch": 7.61, "grad_norm": 3.226560354232788, "learning_rate": 4.780726519856955e-06, "loss": 0.7408, "step": 40430 }, { "epoch": 7.61, "grad_norm": 18.94156265258789, "learning_rate": 4.7769621682665165e-06, "loss": 0.5939, "step": 40440 }, { "epoch": 7.61, "grad_norm": 13.244141578674316, "learning_rate": 4.773197816676078e-06, "loss": 0.5905, "step": 40450 }, { "epoch": 7.62, "grad_norm": 3.4479150772094727, "learning_rate": 4.7694334650856395e-06, "loss": 0.5858, "step": 40460 }, { "epoch": 7.62, "grad_norm": 4.739234924316406, "learning_rate": 4.765669113495201e-06, "loss": 0.4601, "step": 40470 }, { "epoch": 7.62, "grad_norm": 17.5033016204834, "learning_rate": 4.761904761904762e-06, "loss": 0.6048, "step": 40480 }, { "epoch": 7.62, 
"grad_norm": 8.003280639648438, "learning_rate": 4.758140410314324e-06, "loss": 0.7102, "step": 40490 }, { "epoch": 7.62, "grad_norm": 29.26943588256836, "learning_rate": 4.754376058723885e-06, "loss": 0.5325, "step": 40500 }, { "epoch": 7.62, "grad_norm": 10.125970840454102, "learning_rate": 4.750611707133447e-06, "loss": 0.7656, "step": 40510 }, { "epoch": 7.63, "grad_norm": 5.632318019866943, "learning_rate": 4.746847355543008e-06, "loss": 0.6063, "step": 40520 }, { "epoch": 7.63, "grad_norm": 4.835878372192383, "learning_rate": 4.74308300395257e-06, "loss": 0.5283, "step": 40530 }, { "epoch": 7.63, "grad_norm": 17.783977508544922, "learning_rate": 4.739318652362131e-06, "loss": 0.6452, "step": 40540 }, { "epoch": 7.63, "grad_norm": 10.88222885131836, "learning_rate": 4.735554300771693e-06, "loss": 0.6625, "step": 40550 }, { "epoch": 7.63, "grad_norm": 14.537688255310059, "learning_rate": 4.731789949181254e-06, "loss": 0.8266, "step": 40560 }, { "epoch": 7.64, "grad_norm": 7.025276184082031, "learning_rate": 4.7280255975908155e-06, "loss": 0.7873, "step": 40570 }, { "epoch": 7.64, "grad_norm": 22.656702041625977, "learning_rate": 4.724261246000377e-06, "loss": 0.7008, "step": 40580 }, { "epoch": 7.64, "grad_norm": 20.864770889282227, "learning_rate": 4.7204968944099384e-06, "loss": 0.4904, "step": 40590 }, { "epoch": 7.64, "grad_norm": 13.416523933410645, "learning_rate": 4.7167325428195e-06, "loss": 0.6536, "step": 40600 }, { "epoch": 7.64, "grad_norm": 8.85934066772461, "learning_rate": 4.7129681912290605e-06, "loss": 0.6849, "step": 40610 }, { "epoch": 7.65, "grad_norm": 31.11150360107422, "learning_rate": 4.709203839638623e-06, "loss": 0.4983, "step": 40620 }, { "epoch": 7.65, "grad_norm": 16.5590763092041, "learning_rate": 4.705439488048184e-06, "loss": 0.7671, "step": 40630 }, { "epoch": 7.65, "grad_norm": 1.8591314554214478, "learning_rate": 4.701675136457746e-06, "loss": 0.6079, "step": 40640 }, { "epoch": 7.65, "grad_norm": 8.708986282348633, "learning_rate": 4.697910784867307e-06, "loss": 0.8783, "step": 40650 }, { "epoch": 7.65, "grad_norm": 9.244623184204102, "learning_rate": 4.694146433276869e-06, "loss": 0.9057, "step": 40660 }, { "epoch": 7.65, "grad_norm": 2.42488956451416, "learning_rate": 4.69038208168643e-06, "loss": 0.6583, "step": 40670 }, { "epoch": 7.66, "grad_norm": 12.842972755432129, "learning_rate": 4.686617730095992e-06, "loss": 1.0509, "step": 40680 }, { "epoch": 7.66, "grad_norm": 12.182962417602539, "learning_rate": 4.682853378505553e-06, "loss": 0.527, "step": 40690 }, { "epoch": 7.66, "grad_norm": 12.226886749267578, "learning_rate": 4.679089026915114e-06, "loss": 0.5301, "step": 40700 }, { "epoch": 7.66, "grad_norm": 26.817169189453125, "learning_rate": 4.675324675324676e-06, "loss": 0.7724, "step": 40710 }, { "epoch": 7.66, "grad_norm": 6.593476295471191, "learning_rate": 4.6715603237342374e-06, "loss": 0.6253, "step": 40720 }, { "epoch": 7.67, "grad_norm": 27.855947494506836, "learning_rate": 4.667795972143799e-06, "loss": 0.6286, "step": 40730 }, { "epoch": 7.67, "grad_norm": 60.77217102050781, "learning_rate": 4.66403162055336e-06, "loss": 0.7077, "step": 40740 }, { "epoch": 7.67, "grad_norm": 24.09757423400879, "learning_rate": 4.660267268962921e-06, "loss": 1.0211, "step": 40750 }, { "epoch": 7.67, "grad_norm": 0.618398129940033, "learning_rate": 4.656502917372483e-06, "loss": 0.5325, "step": 40760 }, { "epoch": 7.67, "grad_norm": 3.6310360431671143, "learning_rate": 4.652738565782045e-06, "loss": 0.5259, "step": 40770 }, { "epoch": 7.68, 
"grad_norm": 5.603189945220947, "learning_rate": 4.648974214191606e-06, "loss": 0.7276, "step": 40780 }, { "epoch": 7.68, "grad_norm": 26.887163162231445, "learning_rate": 4.645209862601167e-06, "loss": 0.7147, "step": 40790 }, { "epoch": 7.68, "grad_norm": 15.721969604492188, "learning_rate": 4.641445511010729e-06, "loss": 0.791, "step": 40800 }, { "epoch": 7.68, "grad_norm": 9.369284629821777, "learning_rate": 4.637681159420291e-06, "loss": 0.6822, "step": 40810 }, { "epoch": 7.68, "grad_norm": 22.81471061706543, "learning_rate": 4.633916807829852e-06, "loss": 1.0583, "step": 40820 }, { "epoch": 7.68, "grad_norm": 10.889772415161133, "learning_rate": 4.630152456239413e-06, "loss": 0.847, "step": 40830 }, { "epoch": 7.69, "grad_norm": 9.222798347473145, "learning_rate": 4.626388104648974e-06, "loss": 0.7577, "step": 40840 }, { "epoch": 7.69, "grad_norm": 17.067047119140625, "learning_rate": 4.622623753058536e-06, "loss": 0.5523, "step": 40850 }, { "epoch": 7.69, "grad_norm": 25.14735984802246, "learning_rate": 4.618859401468098e-06, "loss": 0.6698, "step": 40860 }, { "epoch": 7.69, "grad_norm": 7.742861270904541, "learning_rate": 4.615095049877659e-06, "loss": 0.6274, "step": 40870 }, { "epoch": 7.69, "grad_norm": 0.9540747404098511, "learning_rate": 4.61133069828722e-06, "loss": 0.4707, "step": 40880 }, { "epoch": 7.7, "grad_norm": 7.411771297454834, "learning_rate": 4.607566346696781e-06, "loss": 0.6528, "step": 40890 }, { "epoch": 7.7, "grad_norm": 11.34437370300293, "learning_rate": 4.603801995106344e-06, "loss": 0.5187, "step": 40900 }, { "epoch": 7.7, "grad_norm": 17.062744140625, "learning_rate": 4.600037643515905e-06, "loss": 0.7695, "step": 40910 }, { "epoch": 7.7, "grad_norm": 14.723018646240234, "learning_rate": 4.596273291925466e-06, "loss": 0.5603, "step": 40920 }, { "epoch": 7.7, "grad_norm": 3.046761989593506, "learning_rate": 4.592508940335027e-06, "loss": 0.4964, "step": 40930 }, { "epoch": 7.71, "grad_norm": 5.844112873077393, "learning_rate": 4.5887445887445896e-06, "loss": 0.5202, "step": 40940 }, { "epoch": 7.71, "grad_norm": 25.061861038208008, "learning_rate": 4.584980237154151e-06, "loss": 0.8229, "step": 40950 }, { "epoch": 7.71, "grad_norm": 1.9810789823532104, "learning_rate": 4.581215885563712e-06, "loss": 0.5581, "step": 40960 }, { "epoch": 7.71, "grad_norm": 5.052197456359863, "learning_rate": 4.577451533973273e-06, "loss": 0.7015, "step": 40970 }, { "epoch": 7.71, "grad_norm": 8.623268127441406, "learning_rate": 4.5736871823828346e-06, "loss": 0.554, "step": 40980 }, { "epoch": 7.72, "grad_norm": 2.725163459777832, "learning_rate": 4.569922830792397e-06, "loss": 0.8269, "step": 40990 }, { "epoch": 7.72, "grad_norm": 3.4321749210357666, "learning_rate": 4.566158479201958e-06, "loss": 0.7675, "step": 41000 }, { "epoch": 7.72, "grad_norm": 4.25792932510376, "learning_rate": 4.562394127611519e-06, "loss": 0.535, "step": 41010 }, { "epoch": 7.72, "grad_norm": 7.7395219802856445, "learning_rate": 4.55862977602108e-06, "loss": 0.7916, "step": 41020 }, { "epoch": 7.72, "grad_norm": 1.3611125946044922, "learning_rate": 4.554865424430642e-06, "loss": 0.5845, "step": 41030 }, { "epoch": 7.72, "grad_norm": 5.006902694702148, "learning_rate": 4.551101072840204e-06, "loss": 0.793, "step": 41040 }, { "epoch": 7.73, "grad_norm": 20.101076126098633, "learning_rate": 4.547336721249765e-06, "loss": 0.4905, "step": 41050 }, { "epoch": 7.73, "grad_norm": 3.724898338317871, "learning_rate": 4.543572369659326e-06, "loss": 0.4192, "step": 41060 }, { "epoch": 7.73, "grad_norm": 
19.539445877075195, "learning_rate": 4.539808018068888e-06, "loss": 0.8283, "step": 41070 }, { "epoch": 7.73, "grad_norm": 7.611993789672852, "learning_rate": 4.53604366647845e-06, "loss": 0.7414, "step": 41080 }, { "epoch": 7.73, "grad_norm": 4.557059288024902, "learning_rate": 4.532279314888011e-06, "loss": 0.6797, "step": 41090 }, { "epoch": 7.74, "grad_norm": 1.2346349954605103, "learning_rate": 4.528514963297572e-06, "loss": 0.4999, "step": 41100 }, { "epoch": 7.74, "grad_norm": 12.652429580688477, "learning_rate": 4.5247506117071336e-06, "loss": 0.7825, "step": 41110 }, { "epoch": 7.74, "grad_norm": 28.280677795410156, "learning_rate": 4.520986260116695e-06, "loss": 0.6687, "step": 41120 }, { "epoch": 7.74, "grad_norm": 4.621511459350586, "learning_rate": 4.517221908526257e-06, "loss": 1.0286, "step": 41130 }, { "epoch": 7.74, "grad_norm": 1.5836962461471558, "learning_rate": 4.513457556935818e-06, "loss": 0.5081, "step": 41140 }, { "epoch": 7.75, "grad_norm": 11.651044845581055, "learning_rate": 4.509693205345379e-06, "loss": 0.5183, "step": 41150 }, { "epoch": 7.75, "grad_norm": 0.9869891405105591, "learning_rate": 4.505928853754941e-06, "loss": 0.3698, "step": 41160 }, { "epoch": 7.75, "grad_norm": 20.051952362060547, "learning_rate": 4.502164502164502e-06, "loss": 0.7959, "step": 41170 }, { "epoch": 7.75, "grad_norm": 5.264538288116455, "learning_rate": 4.498400150574064e-06, "loss": 0.4474, "step": 41180 }, { "epoch": 7.75, "grad_norm": 2.0304129123687744, "learning_rate": 4.494635798983625e-06, "loss": 0.749, "step": 41190 }, { "epoch": 7.75, "grad_norm": 29.883962631225586, "learning_rate": 4.490871447393187e-06, "loss": 0.9788, "step": 41200 }, { "epoch": 7.76, "grad_norm": 2.509295701980591, "learning_rate": 4.487107095802748e-06, "loss": 0.6449, "step": 41210 }, { "epoch": 7.76, "grad_norm": 34.072452545166016, "learning_rate": 4.48334274421231e-06, "loss": 0.5776, "step": 41220 }, { "epoch": 7.76, "grad_norm": 26.158466339111328, "learning_rate": 4.479578392621871e-06, "loss": 0.646, "step": 41230 }, { "epoch": 7.76, "grad_norm": 13.538796424865723, "learning_rate": 4.4758140410314325e-06, "loss": 0.5987, "step": 41240 }, { "epoch": 7.76, "grad_norm": 7.280695915222168, "learning_rate": 4.472049689440994e-06, "loss": 0.5659, "step": 41250 }, { "epoch": 7.77, "grad_norm": 9.619302749633789, "learning_rate": 4.4682853378505555e-06, "loss": 0.7988, "step": 41260 }, { "epoch": 7.77, "grad_norm": 15.12780475616455, "learning_rate": 4.464520986260117e-06, "loss": 0.6142, "step": 41270 }, { "epoch": 7.77, "grad_norm": 1.9630608558654785, "learning_rate": 4.460756634669678e-06, "loss": 0.4835, "step": 41280 }, { "epoch": 7.77, "grad_norm": 9.1524019241333, "learning_rate": 4.45699228307924e-06, "loss": 0.9363, "step": 41290 }, { "epoch": 7.77, "grad_norm": 5.156355381011963, "learning_rate": 4.453227931488801e-06, "loss": 0.6907, "step": 41300 }, { "epoch": 7.78, "grad_norm": 14.706850051879883, "learning_rate": 4.449463579898363e-06, "loss": 0.8253, "step": 41310 }, { "epoch": 7.78, "grad_norm": 15.74466609954834, "learning_rate": 4.445699228307924e-06, "loss": 0.7335, "step": 41320 }, { "epoch": 7.78, "grad_norm": 17.57891082763672, "learning_rate": 4.441934876717486e-06, "loss": 0.814, "step": 41330 }, { "epoch": 7.78, "grad_norm": 9.973849296569824, "learning_rate": 4.438170525127047e-06, "loss": 0.8614, "step": 41340 }, { "epoch": 7.78, "grad_norm": 10.409669876098633, "learning_rate": 4.434406173536609e-06, "loss": 0.7168, "step": 41350 }, { "epoch": 7.78, "grad_norm": 
6.882500648498535, "learning_rate": 4.43064182194617e-06, "loss": 0.541, "step": 41360 }, { "epoch": 7.79, "grad_norm": 11.961504936218262, "learning_rate": 4.4268774703557315e-06, "loss": 0.9712, "step": 41370 }, { "epoch": 7.79, "grad_norm": 15.884038925170898, "learning_rate": 4.423113118765293e-06, "loss": 0.584, "step": 41380 }, { "epoch": 7.79, "grad_norm": 4.495667457580566, "learning_rate": 4.4193487671748545e-06, "loss": 0.7261, "step": 41390 }, { "epoch": 7.79, "grad_norm": 3.5504605770111084, "learning_rate": 4.415584415584416e-06, "loss": 0.5894, "step": 41400 }, { "epoch": 7.79, "grad_norm": 1.8022191524505615, "learning_rate": 4.411820063993977e-06, "loss": 0.6235, "step": 41410 }, { "epoch": 7.8, "grad_norm": 38.18122482299805, "learning_rate": 4.408055712403539e-06, "loss": 0.6536, "step": 41420 }, { "epoch": 7.8, "grad_norm": 28.786279678344727, "learning_rate": 4.4042913608131e-06, "loss": 0.7857, "step": 41430 }, { "epoch": 7.8, "grad_norm": 28.222558975219727, "learning_rate": 4.400527009222662e-06, "loss": 0.6352, "step": 41440 }, { "epoch": 7.8, "grad_norm": 15.584514617919922, "learning_rate": 4.396762657632223e-06, "loss": 0.6327, "step": 41450 }, { "epoch": 7.8, "grad_norm": 22.753358840942383, "learning_rate": 4.392998306041785e-06, "loss": 0.5531, "step": 41460 }, { "epoch": 7.81, "grad_norm": 6.467126369476318, "learning_rate": 4.389233954451346e-06, "loss": 0.9273, "step": 41470 }, { "epoch": 7.81, "grad_norm": 6.077983379364014, "learning_rate": 4.385469602860908e-06, "loss": 0.7311, "step": 41480 }, { "epoch": 7.81, "grad_norm": 10.727202415466309, "learning_rate": 4.381705251270469e-06, "loss": 0.5045, "step": 41490 }, { "epoch": 7.81, "grad_norm": 49.991512298583984, "learning_rate": 4.3779408996800305e-06, "loss": 0.539, "step": 41500 }, { "epoch": 7.81, "grad_norm": 40.65004348754883, "learning_rate": 4.374176548089592e-06, "loss": 0.86, "step": 41510 }, { "epoch": 7.81, "grad_norm": 1.1146597862243652, "learning_rate": 4.3704121964991534e-06, "loss": 0.6157, "step": 41520 }, { "epoch": 7.82, "grad_norm": 5.897725582122803, "learning_rate": 4.366647844908715e-06, "loss": 0.4644, "step": 41530 }, { "epoch": 7.82, "grad_norm": 5.34705924987793, "learning_rate": 4.362883493318276e-06, "loss": 0.571, "step": 41540 }, { "epoch": 7.82, "grad_norm": 10.167000770568848, "learning_rate": 4.359119141727838e-06, "loss": 0.9904, "step": 41550 }, { "epoch": 7.82, "grad_norm": 33.4565544128418, "learning_rate": 4.355354790137399e-06, "loss": 0.7784, "step": 41560 }, { "epoch": 7.82, "grad_norm": 8.830489158630371, "learning_rate": 4.351590438546961e-06, "loss": 0.6165, "step": 41570 }, { "epoch": 7.83, "grad_norm": 16.392597198486328, "learning_rate": 4.347826086956522e-06, "loss": 0.5931, "step": 41580 }, { "epoch": 7.83, "grad_norm": 2.084258794784546, "learning_rate": 4.344061735366084e-06, "loss": 0.5454, "step": 41590 }, { "epoch": 7.83, "grad_norm": 26.511987686157227, "learning_rate": 4.340297383775645e-06, "loss": 0.6571, "step": 41600 }, { "epoch": 7.83, "grad_norm": 12.932252883911133, "learning_rate": 4.336533032185207e-06, "loss": 0.4992, "step": 41610 }, { "epoch": 7.83, "grad_norm": 4.868186950683594, "learning_rate": 4.332768680594768e-06, "loss": 0.452, "step": 41620 }, { "epoch": 7.84, "grad_norm": 3.3700175285339355, "learning_rate": 4.3290043290043295e-06, "loss": 0.7087, "step": 41630 }, { "epoch": 7.84, "grad_norm": 4.603378772735596, "learning_rate": 4.325239977413891e-06, "loss": 0.6563, "step": 41640 }, { "epoch": 7.84, "grad_norm": 
27.490306854248047, "learning_rate": 4.3214756258234524e-06, "loss": 0.9937, "step": 41650 }, { "epoch": 7.84, "grad_norm": 2.3835196495056152, "learning_rate": 4.317711274233014e-06, "loss": 0.6545, "step": 41660 }, { "epoch": 7.84, "grad_norm": 10.318363189697266, "learning_rate": 4.313946922642575e-06, "loss": 0.621, "step": 41670 }, { "epoch": 7.84, "grad_norm": 18.3858642578125, "learning_rate": 4.310182571052137e-06, "loss": 0.4197, "step": 41680 }, { "epoch": 7.85, "grad_norm": 22.93217658996582, "learning_rate": 4.306418219461698e-06, "loss": 0.5328, "step": 41690 }, { "epoch": 7.85, "grad_norm": 18.80039405822754, "learning_rate": 4.302653867871259e-06, "loss": 0.8886, "step": 41700 }, { "epoch": 7.85, "grad_norm": 4.552122592926025, "learning_rate": 4.298889516280821e-06, "loss": 0.3871, "step": 41710 }, { "epoch": 7.85, "grad_norm": 1.8649321794509888, "learning_rate": 4.295125164690383e-06, "loss": 0.7137, "step": 41720 }, { "epoch": 7.85, "grad_norm": 40.410770416259766, "learning_rate": 4.291360813099944e-06, "loss": 0.56, "step": 41730 }, { "epoch": 7.86, "grad_norm": 18.530065536499023, "learning_rate": 4.287596461509506e-06, "loss": 0.9473, "step": 41740 }, { "epoch": 7.86, "grad_norm": 5.517451763153076, "learning_rate": 4.283832109919067e-06, "loss": 0.4106, "step": 41750 }, { "epoch": 7.86, "grad_norm": 25.537195205688477, "learning_rate": 4.2800677583286285e-06, "loss": 0.6905, "step": 41760 }, { "epoch": 7.86, "grad_norm": 14.931816101074219, "learning_rate": 4.27630340673819e-06, "loss": 0.7276, "step": 41770 }, { "epoch": 7.86, "grad_norm": 10.676124572753906, "learning_rate": 4.272539055147751e-06, "loss": 0.8282, "step": 41780 }, { "epoch": 7.87, "grad_norm": 17.386003494262695, "learning_rate": 4.268774703557312e-06, "loss": 0.6738, "step": 41790 }, { "epoch": 7.87, "grad_norm": 9.615264892578125, "learning_rate": 4.265010351966874e-06, "loss": 1.0862, "step": 41800 }, { "epoch": 7.87, "grad_norm": 7.778825759887695, "learning_rate": 4.261246000376436e-06, "loss": 0.796, "step": 41810 }, { "epoch": 7.87, "grad_norm": 12.907247543334961, "learning_rate": 4.257481648785997e-06, "loss": 0.9511, "step": 41820 }, { "epoch": 7.87, "grad_norm": 38.01078414916992, "learning_rate": 4.253717297195558e-06, "loss": 0.8119, "step": 41830 }, { "epoch": 7.88, "grad_norm": 5.836081504821777, "learning_rate": 4.249952945605119e-06, "loss": 0.2945, "step": 41840 }, { "epoch": 7.88, "grad_norm": 31.802064895629883, "learning_rate": 4.246188594014682e-06, "loss": 0.6734, "step": 41850 }, { "epoch": 7.88, "grad_norm": 10.85340690612793, "learning_rate": 4.242424242424243e-06, "loss": 0.8804, "step": 41860 }, { "epoch": 7.88, "grad_norm": 7.636331081390381, "learning_rate": 4.2386598908338046e-06, "loss": 0.7127, "step": 41870 }, { "epoch": 7.88, "grad_norm": 7.453187465667725, "learning_rate": 4.234895539243365e-06, "loss": 0.3514, "step": 41880 }, { "epoch": 7.88, "grad_norm": 28.02394676208496, "learning_rate": 4.2311311876529275e-06, "loss": 0.5254, "step": 41890 }, { "epoch": 7.89, "grad_norm": 7.167729377746582, "learning_rate": 4.227366836062489e-06, "loss": 0.6939, "step": 41900 }, { "epoch": 7.89, "grad_norm": 46.680320739746094, "learning_rate": 4.22360248447205e-06, "loss": 0.787, "step": 41910 }, { "epoch": 7.89, "grad_norm": 4.237600326538086, "learning_rate": 4.219838132881611e-06, "loss": 0.7735, "step": 41920 }, { "epoch": 7.89, "grad_norm": 13.3323392868042, "learning_rate": 4.2160737812911725e-06, "loss": 0.5789, "step": 41930 }, { "epoch": 7.89, "grad_norm": 
21.19347381591797, "learning_rate": 4.212309429700735e-06, "loss": 0.6088, "step": 41940 }, { "epoch": 7.9, "grad_norm": 7.943591117858887, "learning_rate": 4.208545078110296e-06, "loss": 0.6542, "step": 41950 }, { "epoch": 7.9, "grad_norm": 10.774558067321777, "learning_rate": 4.204780726519857e-06, "loss": 0.9625, "step": 41960 }, { "epoch": 7.9, "grad_norm": 3.5156683921813965, "learning_rate": 4.201016374929418e-06, "loss": 0.7473, "step": 41970 }, { "epoch": 7.9, "grad_norm": 33.20036697387695, "learning_rate": 4.19725202333898e-06, "loss": 0.7641, "step": 41980 }, { "epoch": 7.9, "grad_norm": 9.791431427001953, "learning_rate": 4.193487671748542e-06, "loss": 0.3821, "step": 41990 }, { "epoch": 7.91, "grad_norm": 19.5290470123291, "learning_rate": 4.1897233201581036e-06, "loss": 0.7977, "step": 42000 }, { "epoch": 7.91, "grad_norm": 4.693203926086426, "learning_rate": 4.185958968567664e-06, "loss": 0.6561, "step": 42010 }, { "epoch": 7.91, "grad_norm": 3.058779716491699, "learning_rate": 4.182194616977226e-06, "loss": 0.5354, "step": 42020 }, { "epoch": 7.91, "grad_norm": 0.7531399726867676, "learning_rate": 4.178430265386788e-06, "loss": 0.781, "step": 42030 }, { "epoch": 7.91, "grad_norm": 8.725363731384277, "learning_rate": 4.174665913796349e-06, "loss": 0.7572, "step": 42040 }, { "epoch": 7.91, "grad_norm": 7.027099609375, "learning_rate": 4.17090156220591e-06, "loss": 0.4036, "step": 42050 }, { "epoch": 7.92, "grad_norm": 27.52947998046875, "learning_rate": 4.1671372106154715e-06, "loss": 0.7578, "step": 42060 }, { "epoch": 7.92, "grad_norm": 18.265785217285156, "learning_rate": 4.163372859025033e-06, "loss": 0.7248, "step": 42070 }, { "epoch": 7.92, "grad_norm": 4.392576217651367, "learning_rate": 4.159608507434595e-06, "loss": 0.8648, "step": 42080 }, { "epoch": 7.92, "grad_norm": 19.828096389770508, "learning_rate": 4.155844155844157e-06, "loss": 0.6475, "step": 42090 }, { "epoch": 7.92, "grad_norm": 8.763219833374023, "learning_rate": 4.152079804253717e-06, "loss": 0.5425, "step": 42100 }, { "epoch": 7.93, "grad_norm": 1.6160296201705933, "learning_rate": 4.148315452663279e-06, "loss": 0.6573, "step": 42110 }, { "epoch": 7.93, "grad_norm": 15.69921588897705, "learning_rate": 4.14455110107284e-06, "loss": 0.969, "step": 42120 }, { "epoch": 7.93, "grad_norm": 2.965888500213623, "learning_rate": 4.1407867494824025e-06, "loss": 0.4124, "step": 42130 }, { "epoch": 7.93, "grad_norm": 9.641205787658691, "learning_rate": 4.137022397891963e-06, "loss": 0.6181, "step": 42140 }, { "epoch": 7.93, "grad_norm": 22.382549285888672, "learning_rate": 4.133258046301525e-06, "loss": 0.8662, "step": 42150 }, { "epoch": 7.94, "grad_norm": 3.958751916885376, "learning_rate": 4.129493694711086e-06, "loss": 0.6877, "step": 42160 }, { "epoch": 7.94, "grad_norm": 13.550714492797852, "learning_rate": 4.125729343120648e-06, "loss": 0.6759, "step": 42170 }, { "epoch": 7.94, "grad_norm": 4.865496635437012, "learning_rate": 4.121964991530209e-06, "loss": 0.7914, "step": 42180 }, { "epoch": 7.94, "grad_norm": 23.548952102661133, "learning_rate": 4.1182006399397705e-06, "loss": 0.5538, "step": 42190 }, { "epoch": 7.94, "grad_norm": 28.779630661010742, "learning_rate": 4.114436288349332e-06, "loss": 0.5862, "step": 42200 }, { "epoch": 7.94, "grad_norm": 2.35088849067688, "learning_rate": 4.110671936758893e-06, "loss": 0.5629, "step": 42210 }, { "epoch": 7.95, "grad_norm": 4.026741027832031, "learning_rate": 4.106907585168456e-06, "loss": 0.4453, "step": 42220 }, { "epoch": 7.95, "grad_norm": 
6.401082992553711, "learning_rate": 4.103143233578016e-06, "loss": 0.8043, "step": 42230 }, { "epoch": 7.95, "grad_norm": 8.926095008850098, "learning_rate": 4.099378881987578e-06, "loss": 0.9143, "step": 42240 }, { "epoch": 7.95, "grad_norm": 7.036243438720703, "learning_rate": 4.095614530397139e-06, "loss": 0.7105, "step": 42250 }, { "epoch": 7.95, "grad_norm": 3.915670871734619, "learning_rate": 4.091850178806701e-06, "loss": 1.1143, "step": 42260 }, { "epoch": 7.96, "grad_norm": 18.60647201538086, "learning_rate": 4.088085827216262e-06, "loss": 0.6352, "step": 42270 }, { "epoch": 7.96, "grad_norm": 2.800506114959717, "learning_rate": 4.084321475625824e-06, "loss": 0.4785, "step": 42280 }, { "epoch": 7.96, "grad_norm": 6.2158660888671875, "learning_rate": 4.080557124035385e-06, "loss": 0.6005, "step": 42290 }, { "epoch": 7.96, "grad_norm": 8.474069595336914, "learning_rate": 4.0767927724449465e-06, "loss": 0.7124, "step": 42300 }, { "epoch": 7.96, "grad_norm": 19.390552520751953, "learning_rate": 4.073028420854508e-06, "loss": 0.6404, "step": 42310 }, { "epoch": 7.97, "grad_norm": 7.515751838684082, "learning_rate": 4.0692640692640695e-06, "loss": 0.64, "step": 42320 }, { "epoch": 7.97, "grad_norm": 4.034000396728516, "learning_rate": 4.065499717673631e-06, "loss": 0.8088, "step": 42330 }, { "epoch": 7.97, "grad_norm": 17.853302001953125, "learning_rate": 4.061735366083192e-06, "loss": 0.6692, "step": 42340 }, { "epoch": 7.97, "grad_norm": 14.024540901184082, "learning_rate": 4.057971014492754e-06, "loss": 0.7118, "step": 42350 }, { "epoch": 7.97, "grad_norm": 18.116090774536133, "learning_rate": 4.054206662902315e-06, "loss": 1.0517, "step": 42360 }, { "epoch": 7.97, "grad_norm": 12.715302467346191, "learning_rate": 4.050442311311877e-06, "loss": 0.8456, "step": 42370 }, { "epoch": 7.98, "grad_norm": 28.6717472076416, "learning_rate": 4.046677959721438e-06, "loss": 0.7263, "step": 42380 }, { "epoch": 7.98, "grad_norm": 2.929394006729126, "learning_rate": 4.042913608131e-06, "loss": 0.6057, "step": 42390 }, { "epoch": 7.98, "grad_norm": 13.099629402160645, "learning_rate": 4.039149256540561e-06, "loss": 0.7885, "step": 42400 }, { "epoch": 7.98, "grad_norm": 5.010390281677246, "learning_rate": 4.035384904950123e-06, "loss": 0.6682, "step": 42410 }, { "epoch": 7.98, "grad_norm": 11.586647033691406, "learning_rate": 4.031620553359684e-06, "loss": 1.2064, "step": 42420 }, { "epoch": 7.99, "grad_norm": 4.218161106109619, "learning_rate": 4.0278562017692455e-06, "loss": 0.7478, "step": 42430 }, { "epoch": 7.99, "grad_norm": 16.22096061706543, "learning_rate": 4.024091850178807e-06, "loss": 0.6102, "step": 42440 }, { "epoch": 7.99, "grad_norm": 1.6383742094039917, "learning_rate": 4.0203274985883684e-06, "loss": 0.4183, "step": 42450 }, { "epoch": 7.99, "grad_norm": 21.848058700561523, "learning_rate": 4.01656314699793e-06, "loss": 0.6437, "step": 42460 }, { "epoch": 7.99, "grad_norm": 12.690462112426758, "learning_rate": 4.012798795407491e-06, "loss": 0.6778, "step": 42470 }, { "epoch": 8.0, "grad_norm": 11.503471374511719, "learning_rate": 4.009034443817053e-06, "loss": 0.5668, "step": 42480 }, { "epoch": 8.0, "grad_norm": 11.852116584777832, "learning_rate": 4.005270092226614e-06, "loss": 0.5817, "step": 42490 }, { "epoch": 8.0, "grad_norm": 22.544607162475586, "learning_rate": 4.001505740636176e-06, "loss": 0.7981, "step": 42500 }, { "epoch": 8.0, "eval_accuracy": 0.8052, "eval_loss": 0.885317862033844, "eval_runtime": 31.1151, "eval_samples_per_second": 241.041, "eval_steps_per_second": 
30.146, "step": 42504 }, { "epoch": 8.0, "grad_norm": 27.474084854125977, "learning_rate": 3.997741389045737e-06, "loss": 0.6596, "step": 42510 }, { "epoch": 8.0, "grad_norm": 28.633947372436523, "learning_rate": 3.993977037455299e-06, "loss": 0.9212, "step": 42520 }, { "epoch": 8.0, "grad_norm": 7.0895819664001465, "learning_rate": 3.99021268586486e-06, "loss": 0.3263, "step": 42530 }, { "epoch": 8.01, "grad_norm": 18.773303985595703, "learning_rate": 3.986448334274422e-06, "loss": 0.6925, "step": 42540 }, { "epoch": 8.01, "grad_norm": 30.71114730834961, "learning_rate": 3.982683982683983e-06, "loss": 0.4323, "step": 42550 }, { "epoch": 8.01, "grad_norm": 20.15796661376953, "learning_rate": 3.9789196310935445e-06, "loss": 0.5718, "step": 42560 }, { "epoch": 8.01, "grad_norm": 1.0080842971801758, "learning_rate": 3.975155279503106e-06, "loss": 0.5326, "step": 42570 }, { "epoch": 8.01, "grad_norm": 11.660303115844727, "learning_rate": 3.9713909279126674e-06, "loss": 0.846, "step": 42580 }, { "epoch": 8.02, "grad_norm": 10.309700965881348, "learning_rate": 3.967626576322229e-06, "loss": 0.5906, "step": 42590 }, { "epoch": 8.02, "grad_norm": 16.09084701538086, "learning_rate": 3.96386222473179e-06, "loss": 0.596, "step": 42600 }, { "epoch": 8.02, "grad_norm": 28.103029251098633, "learning_rate": 3.960097873141352e-06, "loss": 0.4713, "step": 42610 }, { "epoch": 8.02, "grad_norm": 8.648486137390137, "learning_rate": 3.956333521550913e-06, "loss": 0.7244, "step": 42620 }, { "epoch": 8.02, "grad_norm": 21.880413055419922, "learning_rate": 3.952569169960475e-06, "loss": 0.6089, "step": 42630 }, { "epoch": 8.03, "grad_norm": 4.405519008636475, "learning_rate": 3.948804818370036e-06, "loss": 0.6037, "step": 42640 }, { "epoch": 8.03, "grad_norm": 8.18945598602295, "learning_rate": 3.945040466779598e-06, "loss": 0.6882, "step": 42650 }, { "epoch": 8.03, "grad_norm": 15.145816802978516, "learning_rate": 3.941276115189159e-06, "loss": 0.8617, "step": 42660 }, { "epoch": 8.03, "grad_norm": 11.022823333740234, "learning_rate": 3.9375117635987206e-06, "loss": 0.486, "step": 42670 }, { "epoch": 8.03, "grad_norm": 9.756415367126465, "learning_rate": 3.933747412008282e-06, "loss": 0.8968, "step": 42680 }, { "epoch": 8.04, "grad_norm": 9.885300636291504, "learning_rate": 3.9299830604178435e-06, "loss": 0.5924, "step": 42690 }, { "epoch": 8.04, "grad_norm": 14.079621315002441, "learning_rate": 3.926218708827404e-06, "loss": 0.6747, "step": 42700 }, { "epoch": 8.04, "grad_norm": 38.73008728027344, "learning_rate": 3.922454357236966e-06, "loss": 0.7407, "step": 42710 }, { "epoch": 8.04, "grad_norm": 9.643011093139648, "learning_rate": 3.918690005646528e-06, "loss": 0.6615, "step": 42720 }, { "epoch": 8.04, "grad_norm": 2.2830774784088135, "learning_rate": 3.914925654056089e-06, "loss": 0.6084, "step": 42730 }, { "epoch": 8.04, "grad_norm": 7.782507419586182, "learning_rate": 3.911161302465651e-06, "loss": 0.8502, "step": 42740 }, { "epoch": 8.05, "grad_norm": 24.85226821899414, "learning_rate": 3.907396950875212e-06, "loss": 0.5706, "step": 42750 }, { "epoch": 8.05, "grad_norm": 10.673163414001465, "learning_rate": 3.903632599284774e-06, "loss": 0.6315, "step": 42760 }, { "epoch": 8.05, "grad_norm": 6.851144313812256, "learning_rate": 3.899868247694335e-06, "loss": 0.6369, "step": 42770 }, { "epoch": 8.05, "grad_norm": 13.164300918579102, "learning_rate": 3.896103896103897e-06, "loss": 0.8078, "step": 42780 }, { "epoch": 8.05, "grad_norm": 11.8223295211792, "learning_rate": 3.892339544513457e-06, "loss": 0.9385, 
"step": 42790 }, { "epoch": 8.06, "grad_norm": 7.310250282287598, "learning_rate": 3.8885751929230196e-06, "loss": 0.4419, "step": 42800 }, { "epoch": 8.06, "grad_norm": 19.002811431884766, "learning_rate": 3.884810841332581e-06, "loss": 0.6196, "step": 42810 }, { "epoch": 8.06, "grad_norm": 25.556903839111328, "learning_rate": 3.8810464897421425e-06, "loss": 0.4839, "step": 42820 }, { "epoch": 8.06, "grad_norm": 22.775390625, "learning_rate": 3.877282138151704e-06, "loss": 0.6981, "step": 42830 }, { "epoch": 8.06, "grad_norm": 35.40963363647461, "learning_rate": 3.8735177865612646e-06, "loss": 0.4257, "step": 42840 }, { "epoch": 8.07, "grad_norm": 21.315773010253906, "learning_rate": 3.869753434970827e-06, "loss": 0.4542, "step": 42850 }, { "epoch": 8.07, "grad_norm": 9.800973892211914, "learning_rate": 3.865989083380388e-06, "loss": 0.6485, "step": 42860 }, { "epoch": 8.07, "grad_norm": 6.812503814697266, "learning_rate": 3.86222473178995e-06, "loss": 0.6213, "step": 42870 }, { "epoch": 8.07, "grad_norm": 4.366781234741211, "learning_rate": 3.85846038019951e-06, "loss": 0.448, "step": 42880 }, { "epoch": 8.07, "grad_norm": 1.6783299446105957, "learning_rate": 3.854696028609073e-06, "loss": 0.5184, "step": 42890 }, { "epoch": 8.07, "grad_norm": 5.302606105804443, "learning_rate": 3.850931677018634e-06, "loss": 0.656, "step": 42900 }, { "epoch": 8.08, "grad_norm": 14.876054763793945, "learning_rate": 3.847167325428196e-06, "loss": 0.7031, "step": 42910 }, { "epoch": 8.08, "grad_norm": 5.262720108032227, "learning_rate": 3.843402973837756e-06, "loss": 0.375, "step": 42920 }, { "epoch": 8.08, "grad_norm": 4.300555229187012, "learning_rate": 3.839638622247318e-06, "loss": 0.7145, "step": 42930 }, { "epoch": 8.08, "grad_norm": 12.29123306274414, "learning_rate": 3.83587427065688e-06, "loss": 0.6992, "step": 42940 }, { "epoch": 8.08, "grad_norm": 4.698318958282471, "learning_rate": 3.8321099190664415e-06, "loss": 0.4023, "step": 42950 }, { "epoch": 8.09, "grad_norm": 2.2966959476470947, "learning_rate": 3.828345567476003e-06, "loss": 0.6641, "step": 42960 }, { "epoch": 8.09, "grad_norm": 0.6551181674003601, "learning_rate": 3.8245812158855635e-06, "loss": 0.7118, "step": 42970 }, { "epoch": 8.09, "grad_norm": 3.318326711654663, "learning_rate": 3.820816864295126e-06, "loss": 0.8686, "step": 42980 }, { "epoch": 8.09, "grad_norm": 9.104196548461914, "learning_rate": 3.817052512704687e-06, "loss": 0.7262, "step": 42990 }, { "epoch": 8.09, "grad_norm": 12.957881927490234, "learning_rate": 3.8132881611142488e-06, "loss": 0.6844, "step": 43000 }, { "epoch": 8.1, "grad_norm": 1.567747712135315, "learning_rate": 3.80952380952381e-06, "loss": 0.8228, "step": 43010 }, { "epoch": 8.1, "grad_norm": 18.69003677368164, "learning_rate": 3.8057594579333713e-06, "loss": 0.7219, "step": 43020 }, { "epoch": 8.1, "grad_norm": 2.4941070079803467, "learning_rate": 3.8019951063429327e-06, "loss": 0.3485, "step": 43030 }, { "epoch": 8.1, "grad_norm": 1.889604926109314, "learning_rate": 3.798230754752494e-06, "loss": 0.6792, "step": 43040 }, { "epoch": 8.1, "grad_norm": 3.0734691619873047, "learning_rate": 3.7944664031620552e-06, "loss": 0.4855, "step": 43050 }, { "epoch": 8.1, "grad_norm": 17.109647750854492, "learning_rate": 3.790702051571617e-06, "loss": 0.5067, "step": 43060 }, { "epoch": 8.11, "grad_norm": 12.241493225097656, "learning_rate": 3.7869376999811786e-06, "loss": 0.7333, "step": 43070 }, { "epoch": 8.11, "grad_norm": 12.447990417480469, "learning_rate": 3.78317334839074e-06, "loss": 0.6839, "step": 
43080 }, { "epoch": 8.11, "grad_norm": 8.43791389465332, "learning_rate": 3.7794089968003015e-06, "loss": 0.5034, "step": 43090 }, { "epoch": 8.11, "grad_norm": 16.71822738647461, "learning_rate": 3.7756446452098625e-06, "loss": 0.593, "step": 43100 }, { "epoch": 8.11, "grad_norm": 25.01463508605957, "learning_rate": 3.7718802936194244e-06, "loss": 0.7501, "step": 43110 }, { "epoch": 8.12, "grad_norm": 26.904882431030273, "learning_rate": 3.768115942028986e-06, "loss": 0.7831, "step": 43120 }, { "epoch": 8.12, "grad_norm": 8.051025390625, "learning_rate": 3.7643515904385473e-06, "loss": 0.7806, "step": 43130 }, { "epoch": 8.12, "grad_norm": 2.785883903503418, "learning_rate": 3.7605872388481084e-06, "loss": 0.5128, "step": 43140 }, { "epoch": 8.12, "grad_norm": 6.323906898498535, "learning_rate": 3.7568228872576703e-06, "loss": 0.5295, "step": 43150 }, { "epoch": 8.12, "grad_norm": 3.9023892879486084, "learning_rate": 3.7530585356672317e-06, "loss": 0.5746, "step": 43160 }, { "epoch": 8.13, "grad_norm": 4.318655967712402, "learning_rate": 3.749294184076793e-06, "loss": 0.6759, "step": 43170 }, { "epoch": 8.13, "grad_norm": 8.113146781921387, "learning_rate": 3.7455298324863542e-06, "loss": 0.4526, "step": 43180 }, { "epoch": 8.13, "grad_norm": 1.0072318315505981, "learning_rate": 3.7417654808959157e-06, "loss": 0.367, "step": 43190 }, { "epoch": 8.13, "grad_norm": 2.841911554336548, "learning_rate": 3.7380011293054776e-06, "loss": 0.8503, "step": 43200 }, { "epoch": 8.13, "grad_norm": 16.94948959350586, "learning_rate": 3.734236777715039e-06, "loss": 0.5503, "step": 43210 }, { "epoch": 8.13, "grad_norm": 1.882973551750183, "learning_rate": 3.7304724261246005e-06, "loss": 0.784, "step": 43220 }, { "epoch": 8.14, "grad_norm": 4.787994861602783, "learning_rate": 3.7267080745341615e-06, "loss": 0.5506, "step": 43230 }, { "epoch": 8.14, "grad_norm": 10.229573249816895, "learning_rate": 3.722943722943723e-06, "loss": 0.7662, "step": 43240 }, { "epoch": 8.14, "grad_norm": 29.38533592224121, "learning_rate": 3.719179371353285e-06, "loss": 0.8054, "step": 43250 }, { "epoch": 8.14, "grad_norm": 1.2236417531967163, "learning_rate": 3.7154150197628463e-06, "loss": 0.6526, "step": 43260 }, { "epoch": 8.14, "grad_norm": 3.5998024940490723, "learning_rate": 3.7116506681724074e-06, "loss": 0.497, "step": 43270 }, { "epoch": 8.15, "grad_norm": 7.608570575714111, "learning_rate": 3.707886316581969e-06, "loss": 0.567, "step": 43280 }, { "epoch": 8.15, "grad_norm": 1.5260602235794067, "learning_rate": 3.7041219649915307e-06, "loss": 0.7219, "step": 43290 }, { "epoch": 8.15, "grad_norm": 2.997941017150879, "learning_rate": 3.700357613401092e-06, "loss": 0.5059, "step": 43300 }, { "epoch": 8.15, "grad_norm": 20.200481414794922, "learning_rate": 3.6965932618106532e-06, "loss": 0.5741, "step": 43310 }, { "epoch": 8.15, "grad_norm": 24.715145111083984, "learning_rate": 3.6928289102202147e-06, "loss": 0.6357, "step": 43320 }, { "epoch": 8.16, "grad_norm": 1.9063732624053955, "learning_rate": 3.689064558629776e-06, "loss": 0.5052, "step": 43330 }, { "epoch": 8.16, "grad_norm": 12.693863868713379, "learning_rate": 3.685300207039338e-06, "loss": 0.9772, "step": 43340 }, { "epoch": 8.16, "grad_norm": 4.797914028167725, "learning_rate": 3.6815358554488995e-06, "loss": 0.7304, "step": 43350 }, { "epoch": 8.16, "grad_norm": 9.199864387512207, "learning_rate": 3.6777715038584605e-06, "loss": 0.6844, "step": 43360 }, { "epoch": 8.16, "grad_norm": 16.02825927734375, "learning_rate": 3.674007152268022e-06, "loss": 0.639, 
"step": 43370 }, { "epoch": 8.16, "grad_norm": 3.1297593116760254, "learning_rate": 3.670242800677584e-06, "loss": 0.6463, "step": 43380 }, { "epoch": 8.17, "grad_norm": 22.230676651000977, "learning_rate": 3.6664784490871453e-06, "loss": 1.0809, "step": 43390 }, { "epoch": 8.17, "grad_norm": 17.3179931640625, "learning_rate": 3.6627140974967064e-06, "loss": 0.8053, "step": 43400 }, { "epoch": 8.17, "grad_norm": 27.16463279724121, "learning_rate": 3.658949745906268e-06, "loss": 1.0252, "step": 43410 }, { "epoch": 8.17, "grad_norm": 7.530394554138184, "learning_rate": 3.6551853943158293e-06, "loss": 1.0477, "step": 43420 }, { "epoch": 8.17, "grad_norm": 3.3225035667419434, "learning_rate": 3.651421042725391e-06, "loss": 0.4073, "step": 43430 }, { "epoch": 8.18, "grad_norm": 7.571877956390381, "learning_rate": 3.6476566911349526e-06, "loss": 0.8084, "step": 43440 }, { "epoch": 8.18, "grad_norm": 1.640356421470642, "learning_rate": 3.6438923395445137e-06, "loss": 0.6394, "step": 43450 }, { "epoch": 8.18, "grad_norm": 2.479665756225586, "learning_rate": 3.640127987954075e-06, "loss": 0.8718, "step": 43460 }, { "epoch": 8.18, "grad_norm": 3.027372360229492, "learning_rate": 3.6363636363636366e-06, "loss": 0.5582, "step": 43470 }, { "epoch": 8.18, "grad_norm": 19.327043533325195, "learning_rate": 3.6325992847731985e-06, "loss": 0.7859, "step": 43480 }, { "epoch": 8.19, "grad_norm": 14.668938636779785, "learning_rate": 3.6288349331827595e-06, "loss": 0.5379, "step": 43490 }, { "epoch": 8.19, "grad_norm": 12.993864059448242, "learning_rate": 3.625070581592321e-06, "loss": 0.656, "step": 43500 }, { "epoch": 8.19, "grad_norm": 4.96496057510376, "learning_rate": 3.6213062300018824e-06, "loss": 0.5854, "step": 43510 }, { "epoch": 8.19, "grad_norm": 2.8552846908569336, "learning_rate": 3.6175418784114443e-06, "loss": 0.6436, "step": 43520 }, { "epoch": 8.19, "grad_norm": 30.623044967651367, "learning_rate": 3.613777526821005e-06, "loss": 0.8594, "step": 43530 }, { "epoch": 8.19, "grad_norm": 13.185827255249023, "learning_rate": 3.610013175230567e-06, "loss": 0.5234, "step": 43540 }, { "epoch": 8.2, "grad_norm": 56.314544677734375, "learning_rate": 3.6062488236401283e-06, "loss": 0.9389, "step": 43550 }, { "epoch": 8.2, "grad_norm": 1.8497740030288696, "learning_rate": 3.6024844720496897e-06, "loss": 0.2609, "step": 43560 }, { "epoch": 8.2, "grad_norm": 8.1415433883667, "learning_rate": 3.5987201204592516e-06, "loss": 0.6107, "step": 43570 }, { "epoch": 8.2, "grad_norm": 28.436689376831055, "learning_rate": 3.5949557688688127e-06, "loss": 0.6883, "step": 43580 }, { "epoch": 8.2, "grad_norm": 40.17943572998047, "learning_rate": 3.591191417278374e-06, "loss": 0.6466, "step": 43590 }, { "epoch": 8.21, "grad_norm": 24.928728103637695, "learning_rate": 3.5874270656879356e-06, "loss": 0.6656, "step": 43600 }, { "epoch": 8.21, "grad_norm": 3.2267372608184814, "learning_rate": 3.583662714097497e-06, "loss": 0.7166, "step": 43610 }, { "epoch": 8.21, "grad_norm": 14.755294799804688, "learning_rate": 3.579898362507058e-06, "loss": 0.5982, "step": 43620 }, { "epoch": 8.21, "grad_norm": 11.700262069702148, "learning_rate": 3.57613401091662e-06, "loss": 0.3995, "step": 43630 }, { "epoch": 8.21, "grad_norm": 7.365218639373779, "learning_rate": 3.5723696593261814e-06, "loss": 0.5556, "step": 43640 }, { "epoch": 8.22, "grad_norm": 17.917104721069336, "learning_rate": 3.568605307735743e-06, "loss": 0.576, "step": 43650 }, { "epoch": 8.22, "grad_norm": 5.979570388793945, "learning_rate": 3.564840956145304e-06, "loss": 
0.6453, "step": 43660 }, { "epoch": 8.22, "grad_norm": 15.784065246582031, "learning_rate": 3.561076604554866e-06, "loss": 0.8655, "step": 43670 }, { "epoch": 8.22, "grad_norm": 7.728758811950684, "learning_rate": 3.5573122529644273e-06, "loss": 0.6367, "step": 43680 }, { "epoch": 8.22, "grad_norm": 24.66337013244629, "learning_rate": 3.5535479013739887e-06, "loss": 0.6268, "step": 43690 }, { "epoch": 8.23, "grad_norm": 19.224403381347656, "learning_rate": 3.54978354978355e-06, "loss": 0.7219, "step": 43700 }, { "epoch": 8.23, "grad_norm": 10.079967498779297, "learning_rate": 3.5460191981931112e-06, "loss": 0.7525, "step": 43710 }, { "epoch": 8.23, "grad_norm": 8.401063919067383, "learning_rate": 3.542254846602673e-06, "loss": 0.7502, "step": 43720 }, { "epoch": 8.23, "grad_norm": 5.570341110229492, "learning_rate": 3.5384904950122346e-06, "loss": 0.6572, "step": 43730 }, { "epoch": 8.23, "grad_norm": 5.7887725830078125, "learning_rate": 3.534726143421796e-06, "loss": 0.4368, "step": 43740 }, { "epoch": 8.23, "grad_norm": 12.364099502563477, "learning_rate": 3.530961791831357e-06, "loss": 0.5267, "step": 43750 }, { "epoch": 8.24, "grad_norm": 26.368518829345703, "learning_rate": 3.5271974402409185e-06, "loss": 0.5043, "step": 43760 }, { "epoch": 8.24, "grad_norm": 5.576831340789795, "learning_rate": 3.5234330886504804e-06, "loss": 0.6201, "step": 43770 }, { "epoch": 8.24, "grad_norm": 8.283651351928711, "learning_rate": 3.519668737060042e-06, "loss": 0.6271, "step": 43780 }, { "epoch": 8.24, "grad_norm": 9.156807899475098, "learning_rate": 3.515904385469603e-06, "loss": 0.5056, "step": 43790 }, { "epoch": 8.24, "grad_norm": 4.121074199676514, "learning_rate": 3.5121400338791644e-06, "loss": 0.4618, "step": 43800 }, { "epoch": 8.25, "grad_norm": 9.835829734802246, "learning_rate": 3.5083756822887262e-06, "loss": 0.4881, "step": 43810 }, { "epoch": 8.25, "grad_norm": 12.066664695739746, "learning_rate": 3.5046113306982877e-06, "loss": 0.4989, "step": 43820 }, { "epoch": 8.25, "grad_norm": 10.607754707336426, "learning_rate": 3.500846979107849e-06, "loss": 0.6481, "step": 43830 }, { "epoch": 8.25, "grad_norm": 13.655089378356934, "learning_rate": 3.49708262751741e-06, "loss": 0.36, "step": 43840 }, { "epoch": 8.25, "grad_norm": 18.223438262939453, "learning_rate": 3.4933182759269717e-06, "loss": 0.4792, "step": 43850 }, { "epoch": 8.26, "grad_norm": 33.74332046508789, "learning_rate": 3.4895539243365336e-06, "loss": 0.8607, "step": 43860 }, { "epoch": 8.26, "grad_norm": 49.610050201416016, "learning_rate": 3.485789572746095e-06, "loss": 0.7213, "step": 43870 }, { "epoch": 8.26, "grad_norm": 7.361185073852539, "learning_rate": 3.482025221155656e-06, "loss": 0.4172, "step": 43880 }, { "epoch": 8.26, "grad_norm": 17.406097412109375, "learning_rate": 3.4782608695652175e-06, "loss": 0.676, "step": 43890 }, { "epoch": 8.26, "grad_norm": 12.898189544677734, "learning_rate": 3.474496517974779e-06, "loss": 0.3558, "step": 43900 }, { "epoch": 8.26, "grad_norm": 1.7593990564346313, "learning_rate": 3.470732166384341e-06, "loss": 0.4208, "step": 43910 }, { "epoch": 8.27, "grad_norm": 8.412177085876465, "learning_rate": 3.466967814793902e-06, "loss": 0.2631, "step": 43920 }, { "epoch": 8.27, "grad_norm": 18.444435119628906, "learning_rate": 3.4632034632034634e-06, "loss": 0.8142, "step": 43930 }, { "epoch": 8.27, "grad_norm": 8.673301696777344, "learning_rate": 3.459439111613025e-06, "loss": 0.6842, "step": 43940 }, { "epoch": 8.27, "grad_norm": 10.125624656677246, "learning_rate": 3.4556747600225867e-06, 
"loss": 0.4371, "step": 43950 }, { "epoch": 8.27, "grad_norm": 8.081964492797852, "learning_rate": 3.451910408432148e-06, "loss": 0.4815, "step": 43960 }, { "epoch": 8.28, "grad_norm": 12.483230590820312, "learning_rate": 3.448146056841709e-06, "loss": 0.4306, "step": 43970 }, { "epoch": 8.28, "grad_norm": 2.5073659420013428, "learning_rate": 3.4443817052512707e-06, "loss": 0.3978, "step": 43980 }, { "epoch": 8.28, "grad_norm": 3.006073236465454, "learning_rate": 3.440617353660832e-06, "loss": 0.6286, "step": 43990 }, { "epoch": 8.28, "grad_norm": 36.17524719238281, "learning_rate": 3.436853002070394e-06, "loss": 0.5726, "step": 44000 }, { "epoch": 8.28, "grad_norm": 14.961207389831543, "learning_rate": 3.433088650479955e-06, "loss": 0.5, "step": 44010 }, { "epoch": 8.29, "grad_norm": 5.547390460968018, "learning_rate": 3.4293242988895165e-06, "loss": 0.6341, "step": 44020 }, { "epoch": 8.29, "grad_norm": 13.21618366241455, "learning_rate": 3.425559947299078e-06, "loss": 0.7677, "step": 44030 }, { "epoch": 8.29, "grad_norm": 1.0086277723312378, "learning_rate": 3.4217955957086394e-06, "loss": 0.3811, "step": 44040 }, { "epoch": 8.29, "grad_norm": 0.8586221933364868, "learning_rate": 3.4180312441182005e-06, "loss": 0.555, "step": 44050 }, { "epoch": 8.29, "grad_norm": 0.7436699867248535, "learning_rate": 3.4142668925277623e-06, "loss": 0.6834, "step": 44060 }, { "epoch": 8.29, "grad_norm": 22.060810089111328, "learning_rate": 3.410502540937324e-06, "loss": 0.8258, "step": 44070 }, { "epoch": 8.3, "grad_norm": 7.581924915313721, "learning_rate": 3.4067381893468853e-06, "loss": 0.4419, "step": 44080 }, { "epoch": 8.3, "grad_norm": 18.971668243408203, "learning_rate": 3.402973837756447e-06, "loss": 0.6672, "step": 44090 }, { "epoch": 8.3, "grad_norm": 7.166525363922119, "learning_rate": 3.399209486166008e-06, "loss": 0.5142, "step": 44100 }, { "epoch": 8.3, "grad_norm": 13.554433822631836, "learning_rate": 3.3954451345755696e-06, "loss": 0.805, "step": 44110 }, { "epoch": 8.3, "grad_norm": 6.124617576599121, "learning_rate": 3.391680782985131e-06, "loss": 0.478, "step": 44120 }, { "epoch": 8.31, "grad_norm": 0.709112286567688, "learning_rate": 3.3879164313946926e-06, "loss": 0.6927, "step": 44130 }, { "epoch": 8.31, "grad_norm": 39.29637145996094, "learning_rate": 3.3841520798042536e-06, "loss": 0.5979, "step": 44140 }, { "epoch": 8.31, "grad_norm": 30.300195693969727, "learning_rate": 3.3803877282138155e-06, "loss": 0.7315, "step": 44150 }, { "epoch": 8.31, "grad_norm": 12.570667266845703, "learning_rate": 3.376623376623377e-06, "loss": 0.5411, "step": 44160 }, { "epoch": 8.31, "grad_norm": 20.061830520629883, "learning_rate": 3.3728590250329384e-06, "loss": 0.6905, "step": 44170 }, { "epoch": 8.32, "grad_norm": 4.5251851081848145, "learning_rate": 3.3690946734425e-06, "loss": 0.487, "step": 44180 }, { "epoch": 8.32, "grad_norm": 14.062969207763672, "learning_rate": 3.365330321852061e-06, "loss": 0.4724, "step": 44190 }, { "epoch": 8.32, "grad_norm": 4.117557525634766, "learning_rate": 3.361565970261623e-06, "loss": 0.298, "step": 44200 }, { "epoch": 8.32, "grad_norm": 3.304168701171875, "learning_rate": 3.3578016186711843e-06, "loss": 0.5095, "step": 44210 }, { "epoch": 8.32, "grad_norm": 15.786203384399414, "learning_rate": 3.3540372670807457e-06, "loss": 0.8977, "step": 44220 }, { "epoch": 8.32, "grad_norm": 15.957511901855469, "learning_rate": 3.3502729154903068e-06, "loss": 0.7905, "step": 44230 }, { "epoch": 8.33, "grad_norm": 14.398000717163086, "learning_rate": 3.3465085638998686e-06, 
"loss": 0.654, "step": 44240 }, { "epoch": 8.33, "grad_norm": 6.240730285644531, "learning_rate": 3.34274421230943e-06, "loss": 0.4968, "step": 44250 }, { "epoch": 8.33, "grad_norm": 6.868484973907471, "learning_rate": 3.3389798607189916e-06, "loss": 0.6426, "step": 44260 }, { "epoch": 8.33, "grad_norm": 9.236321449279785, "learning_rate": 3.3352155091285526e-06, "loss": 0.4985, "step": 44270 }, { "epoch": 8.33, "grad_norm": 11.34466552734375, "learning_rate": 3.331451157538114e-06, "loss": 0.5533, "step": 44280 }, { "epoch": 8.34, "grad_norm": 4.790088176727295, "learning_rate": 3.327686805947676e-06, "loss": 0.5617, "step": 44290 }, { "epoch": 8.34, "grad_norm": 22.32793617248535, "learning_rate": 3.3239224543572374e-06, "loss": 0.7385, "step": 44300 }, { "epoch": 8.34, "grad_norm": 22.453144073486328, "learning_rate": 3.320158102766799e-06, "loss": 0.7092, "step": 44310 }, { "epoch": 8.34, "grad_norm": 9.739965438842773, "learning_rate": 3.31639375117636e-06, "loss": 0.6099, "step": 44320 }, { "epoch": 8.34, "grad_norm": 11.814325332641602, "learning_rate": 3.3126293995859214e-06, "loss": 0.3585, "step": 44330 }, { "epoch": 8.35, "grad_norm": 6.5929856300354, "learning_rate": 3.3088650479954832e-06, "loss": 0.4831, "step": 44340 }, { "epoch": 8.35, "grad_norm": 8.252095222473145, "learning_rate": 3.3051006964050447e-06, "loss": 0.5399, "step": 44350 }, { "epoch": 8.35, "grad_norm": 2.29144024848938, "learning_rate": 3.3013363448146057e-06, "loss": 0.4328, "step": 44360 }, { "epoch": 8.35, "grad_norm": 11.538058280944824, "learning_rate": 3.297571993224167e-06, "loss": 0.5311, "step": 44370 }, { "epoch": 8.35, "grad_norm": 35.04473114013672, "learning_rate": 3.293807641633729e-06, "loss": 0.8645, "step": 44380 }, { "epoch": 8.35, "grad_norm": 9.149798393249512, "learning_rate": 3.2900432900432905e-06, "loss": 0.7334, "step": 44390 }, { "epoch": 8.36, "grad_norm": 1.0100746154785156, "learning_rate": 3.2862789384528516e-06, "loss": 0.6551, "step": 44400 }, { "epoch": 8.36, "grad_norm": 6.005456447601318, "learning_rate": 3.282514586862413e-06, "loss": 0.632, "step": 44410 }, { "epoch": 8.36, "grad_norm": 2.054805278778076, "learning_rate": 3.2787502352719745e-06, "loss": 0.5814, "step": 44420 }, { "epoch": 8.36, "grad_norm": 21.52336311340332, "learning_rate": 3.2749858836815364e-06, "loss": 1.0416, "step": 44430 }, { "epoch": 8.36, "grad_norm": 7.227096080780029, "learning_rate": 3.271221532091098e-06, "loss": 0.5654, "step": 44440 }, { "epoch": 8.37, "grad_norm": 3.3243463039398193, "learning_rate": 3.267457180500659e-06, "loss": 0.4803, "step": 44450 }, { "epoch": 8.37, "grad_norm": 32.86634063720703, "learning_rate": 3.2636928289102203e-06, "loss": 0.5485, "step": 44460 }, { "epoch": 8.37, "grad_norm": 2.2668581008911133, "learning_rate": 3.259928477319782e-06, "loss": 0.7858, "step": 44470 }, { "epoch": 8.37, "grad_norm": 16.1653995513916, "learning_rate": 3.2561641257293437e-06, "loss": 0.722, "step": 44480 }, { "epoch": 8.37, "grad_norm": 18.26914405822754, "learning_rate": 3.2523997741389047e-06, "loss": 0.5671, "step": 44490 }, { "epoch": 8.38, "grad_norm": 20.45002555847168, "learning_rate": 3.248635422548466e-06, "loss": 0.5654, "step": 44500 }, { "epoch": 8.38, "grad_norm": 17.940828323364258, "learning_rate": 3.2448710709580277e-06, "loss": 0.976, "step": 44510 }, { "epoch": 8.38, "grad_norm": 18.439937591552734, "learning_rate": 3.2411067193675895e-06, "loss": 0.6151, "step": 44520 }, { "epoch": 8.38, "grad_norm": 94.43441009521484, "learning_rate": 3.2373423677771506e-06, 
"loss": 0.5959, "step": 44530 }, { "epoch": 8.38, "grad_norm": 20.579090118408203, "learning_rate": 3.233578016186712e-06, "loss": 0.546, "step": 44540 }, { "epoch": 8.39, "grad_norm": 14.981857299804688, "learning_rate": 3.2298136645962735e-06, "loss": 0.4734, "step": 44550 }, { "epoch": 8.39, "grad_norm": 2.0256664752960205, "learning_rate": 3.226049313005835e-06, "loss": 0.6634, "step": 44560 }, { "epoch": 8.39, "grad_norm": 11.0910062789917, "learning_rate": 3.222284961415397e-06, "loss": 0.465, "step": 44570 }, { "epoch": 8.39, "grad_norm": 4.853362083435059, "learning_rate": 3.218520609824958e-06, "loss": 0.6532, "step": 44580 }, { "epoch": 8.39, "grad_norm": 30.577682495117188, "learning_rate": 3.2147562582345193e-06, "loss": 0.4277, "step": 44590 }, { "epoch": 8.39, "grad_norm": 12.4761381149292, "learning_rate": 3.210991906644081e-06, "loss": 0.8246, "step": 44600 }, { "epoch": 8.4, "grad_norm": 11.036797523498535, "learning_rate": 3.2072275550536427e-06, "loss": 0.4634, "step": 44610 }, { "epoch": 8.4, "grad_norm": 4.647388935089111, "learning_rate": 3.2034632034632033e-06, "loss": 0.5062, "step": 44620 }, { "epoch": 8.4, "grad_norm": 8.75901985168457, "learning_rate": 3.199698851872765e-06, "loss": 0.4288, "step": 44630 }, { "epoch": 8.4, "grad_norm": 1.3946189880371094, "learning_rate": 3.1959345002823266e-06, "loss": 0.4226, "step": 44640 }, { "epoch": 8.4, "grad_norm": 28.2685489654541, "learning_rate": 3.192170148691888e-06, "loss": 0.668, "step": 44650 }, { "epoch": 8.41, "grad_norm": 8.326583862304688, "learning_rate": 3.188405797101449e-06, "loss": 0.7653, "step": 44660 }, { "epoch": 8.41, "grad_norm": 30.164844512939453, "learning_rate": 3.184641445511011e-06, "loss": 0.5945, "step": 44670 }, { "epoch": 8.41, "grad_norm": 3.70147442817688, "learning_rate": 3.1808770939205725e-06, "loss": 0.4499, "step": 44680 }, { "epoch": 8.41, "grad_norm": 17.750473022460938, "learning_rate": 3.177112742330134e-06, "loss": 0.889, "step": 44690 }, { "epoch": 8.41, "grad_norm": 29.010417938232422, "learning_rate": 3.1733483907396954e-06, "loss": 0.6681, "step": 44700 }, { "epoch": 8.42, "grad_norm": 8.533760070800781, "learning_rate": 3.1695840391492564e-06, "loss": 0.7376, "step": 44710 }, { "epoch": 8.42, "grad_norm": 3.58298659324646, "learning_rate": 3.1658196875588183e-06, "loss": 0.5152, "step": 44720 }, { "epoch": 8.42, "grad_norm": 5.7278218269348145, "learning_rate": 3.1620553359683798e-06, "loss": 0.7319, "step": 44730 }, { "epoch": 8.42, "grad_norm": 13.810081481933594, "learning_rate": 3.1582909843779412e-06, "loss": 0.8944, "step": 44740 }, { "epoch": 8.42, "grad_norm": 9.698067665100098, "learning_rate": 3.1545266327875023e-06, "loss": 0.7519, "step": 44750 }, { "epoch": 8.42, "grad_norm": 6.224327087402344, "learning_rate": 3.1507622811970637e-06, "loss": 0.8011, "step": 44760 }, { "epoch": 8.43, "grad_norm": 19.914257049560547, "learning_rate": 3.1469979296066256e-06, "loss": 0.7994, "step": 44770 }, { "epoch": 8.43, "grad_norm": 2.012037992477417, "learning_rate": 3.143233578016187e-06, "loss": 0.6648, "step": 44780 }, { "epoch": 8.43, "grad_norm": 0.8676754236221313, "learning_rate": 3.1394692264257485e-06, "loss": 0.6629, "step": 44790 }, { "epoch": 8.43, "grad_norm": 64.43790435791016, "learning_rate": 3.1357048748353096e-06, "loss": 0.5267, "step": 44800 }, { "epoch": 8.43, "grad_norm": 0.6340997815132141, "learning_rate": 3.1319405232448715e-06, "loss": 0.4891, "step": 44810 }, { "epoch": 8.44, "grad_norm": 9.094552993774414, "learning_rate": 3.128176171654433e-06, 
"loss": 0.7169, "step": 44820 }, { "epoch": 8.44, "grad_norm": 2.686659574508667, "learning_rate": 3.1244118200639944e-06, "loss": 0.4826, "step": 44830 }, { "epoch": 8.44, "grad_norm": 32.29351806640625, "learning_rate": 3.1206474684735554e-06, "loss": 0.5936, "step": 44840 }, { "epoch": 8.44, "grad_norm": 3.7064695358276367, "learning_rate": 3.116883116883117e-06, "loss": 0.5565, "step": 44850 }, { "epoch": 8.44, "grad_norm": 6.751142501831055, "learning_rate": 3.1131187652926788e-06, "loss": 0.7285, "step": 44860 }, { "epoch": 8.45, "grad_norm": 36.527488708496094, "learning_rate": 3.1093544137022402e-06, "loss": 0.793, "step": 44870 }, { "epoch": 8.45, "grad_norm": 5.650635242462158, "learning_rate": 3.1055900621118013e-06, "loss": 0.4796, "step": 44880 }, { "epoch": 8.45, "grad_norm": 9.59321403503418, "learning_rate": 3.1018257105213627e-06, "loss": 0.4723, "step": 44890 }, { "epoch": 8.45, "grad_norm": 20.056053161621094, "learning_rate": 3.0980613589309246e-06, "loss": 0.7005, "step": 44900 }, { "epoch": 8.45, "grad_norm": 9.86202335357666, "learning_rate": 3.094297007340486e-06, "loss": 0.7074, "step": 44910 }, { "epoch": 8.45, "grad_norm": 11.277523040771484, "learning_rate": 3.0905326557500475e-06, "loss": 0.5845, "step": 44920 }, { "epoch": 8.46, "grad_norm": 1.237337350845337, "learning_rate": 3.0867683041596086e-06, "loss": 0.6578, "step": 44930 }, { "epoch": 8.46, "grad_norm": 6.283239841461182, "learning_rate": 3.08300395256917e-06, "loss": 0.5129, "step": 44940 }, { "epoch": 8.46, "grad_norm": 4.76901912689209, "learning_rate": 3.079239600978732e-06, "loss": 0.6016, "step": 44950 }, { "epoch": 8.46, "grad_norm": 44.452430725097656, "learning_rate": 3.0754752493882934e-06, "loss": 0.4908, "step": 44960 }, { "epoch": 8.46, "grad_norm": 9.22043514251709, "learning_rate": 3.0717108977978544e-06, "loss": 0.9249, "step": 44970 }, { "epoch": 8.47, "grad_norm": 6.483752727508545, "learning_rate": 3.067946546207416e-06, "loss": 0.8278, "step": 44980 }, { "epoch": 8.47, "grad_norm": 5.587104320526123, "learning_rate": 3.0641821946169773e-06, "loss": 0.7677, "step": 44990 }, { "epoch": 8.47, "grad_norm": 10.787910461425781, "learning_rate": 3.0604178430265392e-06, "loss": 0.7267, "step": 45000 }, { "epoch": 8.47, "grad_norm": 10.05722713470459, "learning_rate": 3.0566534914361003e-06, "loss": 0.857, "step": 45010 }, { "epoch": 8.47, "grad_norm": 18.300647735595703, "learning_rate": 3.0528891398456617e-06, "loss": 0.4189, "step": 45020 }, { "epoch": 8.48, "grad_norm": 24.512313842773438, "learning_rate": 3.049124788255223e-06, "loss": 0.5907, "step": 45030 }, { "epoch": 8.48, "grad_norm": 0.4789193868637085, "learning_rate": 3.045360436664785e-06, "loss": 0.6094, "step": 45040 }, { "epoch": 8.48, "grad_norm": 8.276256561279297, "learning_rate": 3.0415960850743465e-06, "loss": 0.5725, "step": 45050 }, { "epoch": 8.48, "grad_norm": 24.884714126586914, "learning_rate": 3.0378317334839076e-06, "loss": 0.6064, "step": 45060 }, { "epoch": 8.48, "grad_norm": 13.073172569274902, "learning_rate": 3.034067381893469e-06, "loss": 0.6484, "step": 45070 }, { "epoch": 8.48, "grad_norm": 7.134273052215576, "learning_rate": 3.0303030303030305e-06, "loss": 0.7184, "step": 45080 }, { "epoch": 8.49, "grad_norm": 2.833738088607788, "learning_rate": 3.0265386787125924e-06, "loss": 0.4723, "step": 45090 }, { "epoch": 8.49, "grad_norm": 1.3132004737854004, "learning_rate": 3.0227743271221534e-06, "loss": 0.7449, "step": 45100 }, { "epoch": 8.49, "grad_norm": 5.53575325012207, "learning_rate": 
3.019009975531715e-06, "loss": 0.461, "step": 45110 }, { "epoch": 8.49, "grad_norm": 0.9196773171424866, "learning_rate": 3.0152456239412763e-06, "loss": 0.5558, "step": 45120 }, { "epoch": 8.49, "grad_norm": 15.09958267211914, "learning_rate": 3.011481272350838e-06, "loss": 0.4448, "step": 45130 }, { "epoch": 8.5, "grad_norm": 36.49333953857422, "learning_rate": 3.007716920760399e-06, "loss": 0.6308, "step": 45140 }, { "epoch": 8.5, "grad_norm": 12.150959014892578, "learning_rate": 3.0039525691699607e-06, "loss": 0.451, "step": 45150 }, { "epoch": 8.5, "grad_norm": 5.882155418395996, "learning_rate": 3.000188217579522e-06, "loss": 0.4076, "step": 45160 }, { "epoch": 8.5, "grad_norm": 16.017332077026367, "learning_rate": 2.9964238659890836e-06, "loss": 0.3654, "step": 45170 }, { "epoch": 8.5, "grad_norm": 1.194792628288269, "learning_rate": 2.9926595143986455e-06, "loss": 0.5463, "step": 45180 }, { "epoch": 8.51, "grad_norm": 4.398236274719238, "learning_rate": 2.9888951628082066e-06, "loss": 0.318, "step": 45190 }, { "epoch": 8.51, "grad_norm": 12.98875904083252, "learning_rate": 2.985130811217768e-06, "loss": 0.6161, "step": 45200 }, { "epoch": 8.51, "grad_norm": 33.51036834716797, "learning_rate": 2.9813664596273295e-06, "loss": 1.0006, "step": 45210 }, { "epoch": 8.51, "grad_norm": 1.3355485200881958, "learning_rate": 2.977602108036891e-06, "loss": 0.5115, "step": 45220 }, { "epoch": 8.51, "grad_norm": 1.9838895797729492, "learning_rate": 2.973837756446452e-06, "loss": 0.3853, "step": 45230 }, { "epoch": 8.51, "grad_norm": 1.1442241668701172, "learning_rate": 2.970073404856014e-06, "loss": 0.3258, "step": 45240 }, { "epoch": 8.52, "grad_norm": 5.214044570922852, "learning_rate": 2.9663090532655753e-06, "loss": 0.7165, "step": 45250 }, { "epoch": 8.52, "grad_norm": 6.380605697631836, "learning_rate": 2.9625447016751368e-06, "loss": 0.3101, "step": 45260 }, { "epoch": 8.52, "grad_norm": 27.787433624267578, "learning_rate": 2.958780350084698e-06, "loss": 0.9776, "step": 45270 }, { "epoch": 8.52, "grad_norm": 6.463216781616211, "learning_rate": 2.9550159984942593e-06, "loss": 0.8232, "step": 45280 }, { "epoch": 8.52, "grad_norm": 24.21323013305664, "learning_rate": 2.951251646903821e-06, "loss": 1.0015, "step": 45290 }, { "epoch": 8.53, "grad_norm": 22.82830810546875, "learning_rate": 2.9474872953133826e-06, "loss": 0.7101, "step": 45300 }, { "epoch": 8.53, "grad_norm": 2.0714211463928223, "learning_rate": 2.943722943722944e-06, "loss": 0.3811, "step": 45310 }, { "epoch": 8.53, "grad_norm": 5.328965663909912, "learning_rate": 2.939958592132505e-06, "loss": 0.5708, "step": 45320 }, { "epoch": 8.53, "grad_norm": 21.409069061279297, "learning_rate": 2.936194240542067e-06, "loss": 0.9812, "step": 45330 }, { "epoch": 8.53, "grad_norm": 22.913482666015625, "learning_rate": 2.9324298889516285e-06, "loss": 0.7348, "step": 45340 }, { "epoch": 8.54, "grad_norm": 3.093852996826172, "learning_rate": 2.92866553736119e-06, "loss": 0.3073, "step": 45350 }, { "epoch": 8.54, "grad_norm": 2.0163822174072266, "learning_rate": 2.924901185770751e-06, "loss": 0.4732, "step": 45360 }, { "epoch": 8.54, "grad_norm": 10.924235343933105, "learning_rate": 2.9211368341803124e-06, "loss": 0.6043, "step": 45370 }, { "epoch": 8.54, "grad_norm": 10.99343204498291, "learning_rate": 2.9173724825898743e-06, "loss": 0.6364, "step": 45380 }, { "epoch": 8.54, "grad_norm": 2.671330451965332, "learning_rate": 2.9136081309994358e-06, "loss": 0.5883, "step": 45390 }, { "epoch": 8.55, "grad_norm": 31.984107971191406, 
"learning_rate": 2.9098437794089972e-06, "loss": 0.8114, "step": 45400 }, { "epoch": 8.55, "grad_norm": 3.859494209289551, "learning_rate": 2.9060794278185583e-06, "loss": 0.6513, "step": 45410 }, { "epoch": 8.55, "grad_norm": 8.56103801727295, "learning_rate": 2.9023150762281197e-06, "loss": 0.6994, "step": 45420 }, { "epoch": 8.55, "grad_norm": 10.559017181396484, "learning_rate": 2.8985507246376816e-06, "loss": 1.0319, "step": 45430 }, { "epoch": 8.55, "grad_norm": 8.480578422546387, "learning_rate": 2.894786373047243e-06, "loss": 0.9953, "step": 45440 }, { "epoch": 8.55, "grad_norm": 10.064159393310547, "learning_rate": 2.891022021456804e-06, "loss": 0.3891, "step": 45450 }, { "epoch": 8.56, "grad_norm": 3.896289110183716, "learning_rate": 2.8872576698663656e-06, "loss": 0.7005, "step": 45460 }, { "epoch": 8.56, "grad_norm": 14.289514541625977, "learning_rate": 2.8834933182759275e-06, "loss": 0.6693, "step": 45470 }, { "epoch": 8.56, "grad_norm": 11.35452651977539, "learning_rate": 2.879728966685489e-06, "loss": 0.6499, "step": 45480 }, { "epoch": 8.56, "grad_norm": 5.67701530456543, "learning_rate": 2.87596461509505e-06, "loss": 0.6569, "step": 45490 }, { "epoch": 8.56, "grad_norm": 17.74590492248535, "learning_rate": 2.8722002635046114e-06, "loss": 0.7351, "step": 45500 }, { "epoch": 8.57, "grad_norm": 0.909513533115387, "learning_rate": 2.868435911914173e-06, "loss": 0.5471, "step": 45510 }, { "epoch": 8.57, "grad_norm": 12.866127014160156, "learning_rate": 2.8646715603237348e-06, "loss": 0.5417, "step": 45520 }, { "epoch": 8.57, "grad_norm": 8.781469345092773, "learning_rate": 2.8609072087332962e-06, "loss": 0.3157, "step": 45530 }, { "epoch": 8.57, "grad_norm": 8.589152336120605, "learning_rate": 2.8571428571428573e-06, "loss": 0.5927, "step": 45540 }, { "epoch": 8.57, "grad_norm": 1.6003247499465942, "learning_rate": 2.8533785055524187e-06, "loss": 0.2446, "step": 45550 }, { "epoch": 8.58, "grad_norm": 48.97837448120117, "learning_rate": 2.84961415396198e-06, "loss": 0.5445, "step": 45560 }, { "epoch": 8.58, "grad_norm": 42.185699462890625, "learning_rate": 2.845849802371542e-06, "loss": 0.8836, "step": 45570 }, { "epoch": 8.58, "grad_norm": 69.58699798583984, "learning_rate": 2.842085450781103e-06, "loss": 0.4583, "step": 45580 }, { "epoch": 8.58, "grad_norm": 13.63576602935791, "learning_rate": 2.8383210991906646e-06, "loss": 0.5409, "step": 45590 }, { "epoch": 8.58, "grad_norm": 12.525568008422852, "learning_rate": 2.834556747600226e-06, "loss": 0.5649, "step": 45600 }, { "epoch": 8.58, "grad_norm": 27.21977424621582, "learning_rate": 2.830792396009788e-06, "loss": 0.7195, "step": 45610 }, { "epoch": 8.59, "grad_norm": 11.311186790466309, "learning_rate": 2.827028044419349e-06, "loss": 0.8666, "step": 45620 }, { "epoch": 8.59, "grad_norm": 28.109294891357422, "learning_rate": 2.8232636928289104e-06, "loss": 0.8059, "step": 45630 }, { "epoch": 8.59, "grad_norm": 8.775447845458984, "learning_rate": 2.819499341238472e-06, "loss": 0.6226, "step": 45640 }, { "epoch": 8.59, "grad_norm": 4.7530317306518555, "learning_rate": 2.8157349896480333e-06, "loss": 0.6427, "step": 45650 }, { "epoch": 8.59, "grad_norm": 4.78458833694458, "learning_rate": 2.811970638057595e-06, "loss": 0.5867, "step": 45660 }, { "epoch": 8.6, "grad_norm": 17.084861755371094, "learning_rate": 2.8082062864671562e-06, "loss": 0.919, "step": 45670 }, { "epoch": 8.6, "grad_norm": 31.368091583251953, "learning_rate": 2.8044419348767177e-06, "loss": 0.6141, "step": 45680 }, { "epoch": 8.6, "grad_norm": 
4.767568111419678, "learning_rate": 2.800677583286279e-06, "loss": 0.3971, "step": 45690 }, { "epoch": 8.6, "grad_norm": 11.753438949584961, "learning_rate": 2.7969132316958406e-06, "loss": 0.6078, "step": 45700 }, { "epoch": 8.6, "grad_norm": 10.127785682678223, "learning_rate": 2.7931488801054017e-06, "loss": 0.3882, "step": 45710 }, { "epoch": 8.61, "grad_norm": 28.73605728149414, "learning_rate": 2.7893845285149635e-06, "loss": 0.4531, "step": 45720 }, { "epoch": 8.61, "grad_norm": 4.72650146484375, "learning_rate": 2.785620176924525e-06, "loss": 0.4625, "step": 45730 }, { "epoch": 8.61, "grad_norm": 6.095025539398193, "learning_rate": 2.7818558253340865e-06, "loss": 0.6562, "step": 45740 }, { "epoch": 8.61, "grad_norm": 8.800366401672363, "learning_rate": 2.7780914737436475e-06, "loss": 0.7205, "step": 45750 }, { "epoch": 8.61, "grad_norm": 8.902318954467773, "learning_rate": 2.7743271221532094e-06, "loss": 0.7416, "step": 45760 }, { "epoch": 8.61, "grad_norm": 8.020096778869629, "learning_rate": 2.770562770562771e-06, "loss": 0.6651, "step": 45770 }, { "epoch": 8.62, "grad_norm": 2.996055841445923, "learning_rate": 2.7667984189723323e-06, "loss": 0.414, "step": 45780 }, { "epoch": 8.62, "grad_norm": 15.51314640045166, "learning_rate": 2.7630340673818938e-06, "loss": 0.4894, "step": 45790 }, { "epoch": 8.62, "grad_norm": 16.64989471435547, "learning_rate": 2.759269715791455e-06, "loss": 0.7171, "step": 45800 }, { "epoch": 8.62, "grad_norm": 19.164337158203125, "learning_rate": 2.7555053642010167e-06, "loss": 0.7626, "step": 45810 }, { "epoch": 8.62, "grad_norm": 8.781245231628418, "learning_rate": 2.751741012610578e-06, "loss": 0.523, "step": 45820 }, { "epoch": 8.63, "grad_norm": 2.4866180419921875, "learning_rate": 2.7479766610201396e-06, "loss": 0.6642, "step": 45830 }, { "epoch": 8.63, "grad_norm": 13.035019874572754, "learning_rate": 2.7442123094297007e-06, "loss": 0.9329, "step": 45840 }, { "epoch": 8.63, "grad_norm": 20.42259979248047, "learning_rate": 2.740447957839262e-06, "loss": 0.7711, "step": 45850 }, { "epoch": 8.63, "grad_norm": 25.4084415435791, "learning_rate": 2.736683606248824e-06, "loss": 0.5054, "step": 45860 }, { "epoch": 8.63, "grad_norm": 16.9029541015625, "learning_rate": 2.7329192546583855e-06, "loss": 0.7729, "step": 45870 }, { "epoch": 8.64, "grad_norm": 8.263433456420898, "learning_rate": 2.7291549030679465e-06, "loss": 0.4819, "step": 45880 }, { "epoch": 8.64, "grad_norm": 15.39215087890625, "learning_rate": 2.725390551477508e-06, "loss": 0.9842, "step": 45890 }, { "epoch": 8.64, "grad_norm": 10.729267120361328, "learning_rate": 2.72162619988707e-06, "loss": 0.6373, "step": 45900 }, { "epoch": 8.64, "grad_norm": 9.779509544372559, "learning_rate": 2.7178618482966313e-06, "loss": 1.1067, "step": 45910 }, { "epoch": 8.64, "grad_norm": 17.492794036865234, "learning_rate": 2.7140974967061928e-06, "loss": 0.8578, "step": 45920 }, { "epoch": 8.64, "grad_norm": 2.0420212745666504, "learning_rate": 2.710333145115754e-06, "loss": 0.5105, "step": 45930 }, { "epoch": 8.65, "grad_norm": 3.9706718921661377, "learning_rate": 2.7065687935253153e-06, "loss": 0.4528, "step": 45940 }, { "epoch": 8.65, "grad_norm": 3.535522937774658, "learning_rate": 2.702804441934877e-06, "loss": 0.4152, "step": 45950 }, { "epoch": 8.65, "grad_norm": 1.2473362684249878, "learning_rate": 2.6990400903444386e-06, "loss": 0.7017, "step": 45960 }, { "epoch": 8.65, "grad_norm": 6.834277629852295, "learning_rate": 2.6952757387539996e-06, "loss": 0.5385, "step": 45970 }, { "epoch": 8.65, 
"grad_norm": 6.239208221435547, "learning_rate": 2.691511387163561e-06, "loss": 0.597, "step": 45980 }, { "epoch": 8.66, "grad_norm": 12.432613372802734, "learning_rate": 2.687747035573123e-06, "loss": 0.5221, "step": 45990 }, { "epoch": 8.66, "grad_norm": 31.524188995361328, "learning_rate": 2.6839826839826844e-06, "loss": 0.7989, "step": 46000 }, { "epoch": 8.66, "grad_norm": 4.926718235015869, "learning_rate": 2.6802183323922455e-06, "loss": 0.4908, "step": 46010 }, { "epoch": 8.66, "grad_norm": 11.201255798339844, "learning_rate": 2.676453980801807e-06, "loss": 0.647, "step": 46020 }, { "epoch": 8.66, "grad_norm": 18.536455154418945, "learning_rate": 2.6726896292113684e-06, "loss": 0.687, "step": 46030 }, { "epoch": 8.67, "grad_norm": 19.367650985717773, "learning_rate": 2.6689252776209303e-06, "loss": 0.6917, "step": 46040 }, { "epoch": 8.67, "grad_norm": 12.64576530456543, "learning_rate": 2.6651609260304918e-06, "loss": 0.9019, "step": 46050 }, { "epoch": 8.67, "grad_norm": 11.735465049743652, "learning_rate": 2.661396574440053e-06, "loss": 0.8247, "step": 46060 }, { "epoch": 8.67, "grad_norm": 13.32355785369873, "learning_rate": 2.6576322228496142e-06, "loss": 0.3505, "step": 46070 }, { "epoch": 8.67, "grad_norm": 6.692026138305664, "learning_rate": 2.6538678712591757e-06, "loss": 0.8277, "step": 46080 }, { "epoch": 8.67, "grad_norm": 12.325249671936035, "learning_rate": 2.6501035196687376e-06, "loss": 0.6951, "step": 46090 }, { "epoch": 8.68, "grad_norm": 7.590768337249756, "learning_rate": 2.6463391680782986e-06, "loss": 0.5754, "step": 46100 }, { "epoch": 8.68, "grad_norm": 13.319286346435547, "learning_rate": 2.64257481648786e-06, "loss": 0.625, "step": 46110 }, { "epoch": 8.68, "grad_norm": 3.969967842102051, "learning_rate": 2.6388104648974216e-06, "loss": 0.4351, "step": 46120 }, { "epoch": 8.68, "grad_norm": 9.775538444519043, "learning_rate": 2.6350461133069834e-06, "loss": 0.7453, "step": 46130 }, { "epoch": 8.68, "grad_norm": 16.052352905273438, "learning_rate": 2.631281761716545e-06, "loss": 0.5751, "step": 46140 }, { "epoch": 8.69, "grad_norm": 5.246091365814209, "learning_rate": 2.627517410126106e-06, "loss": 1.0137, "step": 46150 }, { "epoch": 8.69, "grad_norm": 3.481466293334961, "learning_rate": 2.6237530585356674e-06, "loss": 0.5092, "step": 46160 }, { "epoch": 8.69, "grad_norm": 27.15146255493164, "learning_rate": 2.619988706945229e-06, "loss": 0.724, "step": 46170 }, { "epoch": 8.69, "grad_norm": 28.39906883239746, "learning_rate": 2.6162243553547907e-06, "loss": 0.4788, "step": 46180 }, { "epoch": 8.69, "grad_norm": 16.960796356201172, "learning_rate": 2.6124600037643518e-06, "loss": 0.8636, "step": 46190 }, { "epoch": 8.7, "grad_norm": 53.47764205932617, "learning_rate": 2.6086956521739132e-06, "loss": 0.968, "step": 46200 }, { "epoch": 8.7, "grad_norm": 6.928495407104492, "learning_rate": 2.6049313005834747e-06, "loss": 0.5824, "step": 46210 }, { "epoch": 8.7, "grad_norm": 7.357982158660889, "learning_rate": 2.601166948993036e-06, "loss": 0.2472, "step": 46220 }, { "epoch": 8.7, "grad_norm": 15.295299530029297, "learning_rate": 2.597402597402597e-06, "loss": 0.6717, "step": 46230 }, { "epoch": 8.7, "grad_norm": 20.448102951049805, "learning_rate": 2.593638245812159e-06, "loss": 0.7133, "step": 46240 }, { "epoch": 8.71, "grad_norm": 11.149190902709961, "learning_rate": 2.5898738942217205e-06, "loss": 0.755, "step": 46250 }, { "epoch": 8.71, "grad_norm": 10.200156211853027, "learning_rate": 2.586109542631282e-06, "loss": 0.5183, "step": 46260 }, { "epoch": 
8.71, "grad_norm": 2.1006104946136475, "learning_rate": 2.582345191040844e-06, "loss": 0.7534, "step": 46270 }, { "epoch": 8.71, "grad_norm": 10.232656478881836, "learning_rate": 2.578580839450405e-06, "loss": 0.4567, "step": 46280 }, { "epoch": 8.71, "grad_norm": 49.30923843383789, "learning_rate": 2.5748164878599664e-06, "loss": 0.6916, "step": 46290 }, { "epoch": 8.71, "grad_norm": 5.7721848487854, "learning_rate": 2.571052136269528e-06, "loss": 0.3579, "step": 46300 }, { "epoch": 8.72, "grad_norm": 14.022491455078125, "learning_rate": 2.5672877846790893e-06, "loss": 0.7266, "step": 46310 }, { "epoch": 8.72, "grad_norm": 28.376468658447266, "learning_rate": 2.5635234330886503e-06, "loss": 0.5354, "step": 46320 }, { "epoch": 8.72, "grad_norm": 15.814912796020508, "learning_rate": 2.5597590814982122e-06, "loss": 0.6255, "step": 46330 }, { "epoch": 8.72, "grad_norm": 22.56021499633789, "learning_rate": 2.5559947299077737e-06, "loss": 0.5291, "step": 46340 }, { "epoch": 8.72, "grad_norm": 13.840317726135254, "learning_rate": 2.552230378317335e-06, "loss": 0.4975, "step": 46350 }, { "epoch": 8.73, "grad_norm": 50.23695373535156, "learning_rate": 2.548466026726896e-06, "loss": 0.4736, "step": 46360 }, { "epoch": 8.73, "grad_norm": 1.2952226400375366, "learning_rate": 2.5447016751364576e-06, "loss": 0.5642, "step": 46370 }, { "epoch": 8.73, "grad_norm": 4.189306259155273, "learning_rate": 2.5409373235460195e-06, "loss": 1.1035, "step": 46380 }, { "epoch": 8.73, "grad_norm": 8.517165184020996, "learning_rate": 2.537172971955581e-06, "loss": 0.5256, "step": 46390 }, { "epoch": 8.73, "grad_norm": 24.298686981201172, "learning_rate": 2.5334086203651425e-06, "loss": 0.483, "step": 46400 }, { "epoch": 8.74, "grad_norm": 12.247828483581543, "learning_rate": 2.5296442687747035e-06, "loss": 0.6574, "step": 46410 }, { "epoch": 8.74, "grad_norm": 21.78733253479004, "learning_rate": 2.5258799171842654e-06, "loss": 0.5269, "step": 46420 }, { "epoch": 8.74, "grad_norm": 3.4735021591186523, "learning_rate": 2.522115565593827e-06, "loss": 0.4066, "step": 46430 }, { "epoch": 8.74, "grad_norm": 2.901937484741211, "learning_rate": 2.5183512140033883e-06, "loss": 0.8333, "step": 46440 }, { "epoch": 8.74, "grad_norm": 14.172199249267578, "learning_rate": 2.5145868624129493e-06, "loss": 0.739, "step": 46450 }, { "epoch": 8.74, "grad_norm": 4.983508586883545, "learning_rate": 2.510822510822511e-06, "loss": 0.4818, "step": 46460 }, { "epoch": 8.75, "grad_norm": 50.233070373535156, "learning_rate": 2.5070581592320727e-06, "loss": 1.1101, "step": 46470 }, { "epoch": 8.75, "grad_norm": 4.838659286499023, "learning_rate": 2.503293807641634e-06, "loss": 0.3756, "step": 46480 }, { "epoch": 8.75, "grad_norm": 14.943388938903809, "learning_rate": 2.4995294560511956e-06, "loss": 0.7611, "step": 46490 }, { "epoch": 8.75, "grad_norm": 13.47839641571045, "learning_rate": 2.4957651044607566e-06, "loss": 0.5197, "step": 46500 }, { "epoch": 8.75, "grad_norm": 2.747945785522461, "learning_rate": 2.492000752870318e-06, "loss": 0.6973, "step": 46510 }, { "epoch": 8.76, "grad_norm": 16.336193084716797, "learning_rate": 2.48823640127988e-06, "loss": 0.6534, "step": 46520 }, { "epoch": 8.76, "grad_norm": 5.624683856964111, "learning_rate": 2.484472049689441e-06, "loss": 0.8038, "step": 46530 }, { "epoch": 8.76, "grad_norm": 13.724801063537598, "learning_rate": 2.480707698099003e-06, "loss": 0.6525, "step": 46540 }, { "epoch": 8.76, "grad_norm": 11.737406730651855, "learning_rate": 2.476943346508564e-06, "loss": 0.5863, "step": 46550 }, { 
"epoch": 8.76, "grad_norm": 16.39737319946289, "learning_rate": 2.473178994918126e-06, "loss": 0.6921, "step": 46560 }, { "epoch": 8.77, "grad_norm": 3.6673481464385986, "learning_rate": 2.469414643327687e-06, "loss": 0.4921, "step": 46570 }, { "epoch": 8.77, "grad_norm": 9.15060043334961, "learning_rate": 2.4656502917372483e-06, "loss": 0.5008, "step": 46580 }, { "epoch": 8.77, "grad_norm": 4.507288455963135, "learning_rate": 2.4618859401468098e-06, "loss": 0.5448, "step": 46590 }, { "epoch": 8.77, "grad_norm": 18.566953659057617, "learning_rate": 2.4581215885563712e-06, "loss": 0.7864, "step": 46600 }, { "epoch": 8.77, "grad_norm": 24.314119338989258, "learning_rate": 2.4543572369659327e-06, "loss": 0.3278, "step": 46610 }, { "epoch": 8.77, "grad_norm": 14.54391098022461, "learning_rate": 2.450592885375494e-06, "loss": 0.7656, "step": 46620 }, { "epoch": 8.78, "grad_norm": 11.312484741210938, "learning_rate": 2.446828533785056e-06, "loss": 0.6239, "step": 46630 }, { "epoch": 8.78, "grad_norm": 36.48824691772461, "learning_rate": 2.443064182194617e-06, "loss": 0.5056, "step": 46640 }, { "epoch": 8.78, "grad_norm": 50.45350646972656, "learning_rate": 2.4392998306041785e-06, "loss": 0.3117, "step": 46650 }, { "epoch": 8.78, "grad_norm": 11.23302936553955, "learning_rate": 2.43553547901374e-06, "loss": 0.798, "step": 46660 }, { "epoch": 8.78, "grad_norm": 6.4975504875183105, "learning_rate": 2.4317711274233015e-06, "loss": 0.7973, "step": 46670 }, { "epoch": 8.79, "grad_norm": 2.676708459854126, "learning_rate": 2.428006775832863e-06, "loss": 0.7583, "step": 46680 }, { "epoch": 8.79, "grad_norm": 6.580551624298096, "learning_rate": 2.4242424242424244e-06, "loss": 0.7596, "step": 46690 }, { "epoch": 8.79, "grad_norm": 29.067611694335938, "learning_rate": 2.420478072651986e-06, "loss": 0.6931, "step": 46700 }, { "epoch": 8.79, "grad_norm": 6.944410800933838, "learning_rate": 2.4167137210615473e-06, "loss": 0.4802, "step": 46710 }, { "epoch": 8.79, "grad_norm": 4.51786994934082, "learning_rate": 2.4129493694711088e-06, "loss": 0.6491, "step": 46720 }, { "epoch": 8.8, "grad_norm": 3.9452826976776123, "learning_rate": 2.4091850178806702e-06, "loss": 1.1887, "step": 46730 }, { "epoch": 8.8, "grad_norm": 12.151463508605957, "learning_rate": 2.4054206662902317e-06, "loss": 0.9153, "step": 46740 }, { "epoch": 8.8, "grad_norm": 3.3934149742126465, "learning_rate": 2.401656314699793e-06, "loss": 0.5624, "step": 46750 }, { "epoch": 8.8, "grad_norm": 3.353159189224243, "learning_rate": 2.3978919631093546e-06, "loss": 0.3828, "step": 46760 }, { "epoch": 8.8, "grad_norm": 41.59762954711914, "learning_rate": 2.394127611518916e-06, "loss": 0.3982, "step": 46770 }, { "epoch": 8.8, "grad_norm": 9.582560539245605, "learning_rate": 2.3903632599284775e-06, "loss": 0.9732, "step": 46780 }, { "epoch": 8.81, "grad_norm": 13.329916954040527, "learning_rate": 2.386598908338039e-06, "loss": 0.5949, "step": 46790 }, { "epoch": 8.81, "grad_norm": 29.90341567993164, "learning_rate": 2.3828345567476005e-06, "loss": 0.4844, "step": 46800 }, { "epoch": 8.81, "grad_norm": 6.58671236038208, "learning_rate": 2.379070205157162e-06, "loss": 0.7588, "step": 46810 }, { "epoch": 8.81, "grad_norm": 13.950329780578613, "learning_rate": 2.3753058535667234e-06, "loss": 0.3936, "step": 46820 }, { "epoch": 8.81, "grad_norm": 3.745379686355591, "learning_rate": 2.371541501976285e-06, "loss": 0.5207, "step": 46830 }, { "epoch": 8.82, "grad_norm": 12.819116592407227, "learning_rate": 2.3677771503858463e-06, "loss": 0.749, "step": 46840 }, { 
"epoch": 8.82, "grad_norm": 12.26696491241455, "learning_rate": 2.3640127987954078e-06, "loss": 0.5802, "step": 46850 }, { "epoch": 8.82, "grad_norm": 16.143400192260742, "learning_rate": 2.3602484472049692e-06, "loss": 0.4706, "step": 46860 }, { "epoch": 8.82, "grad_norm": 10.1019287109375, "learning_rate": 2.3564840956145303e-06, "loss": 0.8, "step": 46870 }, { "epoch": 8.82, "grad_norm": 6.047752380371094, "learning_rate": 2.352719744024092e-06, "loss": 0.4465, "step": 46880 }, { "epoch": 8.83, "grad_norm": 21.247766494750977, "learning_rate": 2.3489553924336536e-06, "loss": 0.7566, "step": 46890 }, { "epoch": 8.83, "grad_norm": 2.1402547359466553, "learning_rate": 2.345191040843215e-06, "loss": 0.365, "step": 46900 }, { "epoch": 8.83, "grad_norm": 3.070004940032959, "learning_rate": 2.3414266892527765e-06, "loss": 0.5624, "step": 46910 }, { "epoch": 8.83, "grad_norm": 3.346886396408081, "learning_rate": 2.337662337662338e-06, "loss": 0.6329, "step": 46920 }, { "epoch": 8.83, "grad_norm": 11.768649101257324, "learning_rate": 2.3338979860718994e-06, "loss": 0.7147, "step": 46930 }, { "epoch": 8.83, "grad_norm": 0.8839296102523804, "learning_rate": 2.3301336344814605e-06, "loss": 0.7955, "step": 46940 }, { "epoch": 8.84, "grad_norm": 24.871585845947266, "learning_rate": 2.3263692828910224e-06, "loss": 0.7116, "step": 46950 }, { "epoch": 8.84, "grad_norm": 11.107851028442383, "learning_rate": 2.3226049313005834e-06, "loss": 0.7345, "step": 46960 }, { "epoch": 8.84, "grad_norm": 29.10956573486328, "learning_rate": 2.3188405797101453e-06, "loss": 0.6547, "step": 46970 }, { "epoch": 8.84, "grad_norm": 38.27964782714844, "learning_rate": 2.3150762281197063e-06, "loss": 0.3859, "step": 46980 }, { "epoch": 8.84, "grad_norm": 6.225564002990723, "learning_rate": 2.311311876529268e-06, "loss": 0.9074, "step": 46990 }, { "epoch": 8.85, "grad_norm": 15.716607093811035, "learning_rate": 2.3075475249388297e-06, "loss": 0.9341, "step": 47000 }, { "epoch": 8.85, "grad_norm": 0.18570950627326965, "learning_rate": 2.3037831733483907e-06, "loss": 0.4346, "step": 47010 }, { "epoch": 8.85, "grad_norm": 6.636110782623291, "learning_rate": 2.3000188217579526e-06, "loss": 0.6184, "step": 47020 }, { "epoch": 8.85, "grad_norm": 14.311277389526367, "learning_rate": 2.2962544701675136e-06, "loss": 0.6068, "step": 47030 }, { "epoch": 8.85, "grad_norm": 3.578314781188965, "learning_rate": 2.2924901185770755e-06, "loss": 0.4473, "step": 47040 }, { "epoch": 8.86, "grad_norm": 15.608357429504395, "learning_rate": 2.2887257669866366e-06, "loss": 0.5235, "step": 47050 }, { "epoch": 8.86, "grad_norm": 5.3571343421936035, "learning_rate": 2.2849614153961984e-06, "loss": 0.3188, "step": 47060 }, { "epoch": 8.86, "grad_norm": 6.710114479064941, "learning_rate": 2.2811970638057595e-06, "loss": 0.835, "step": 47070 }, { "epoch": 8.86, "grad_norm": 23.842775344848633, "learning_rate": 2.277432712215321e-06, "loss": 0.7257, "step": 47080 }, { "epoch": 8.86, "grad_norm": 31.147573471069336, "learning_rate": 2.2736683606248824e-06, "loss": 0.8778, "step": 47090 }, { "epoch": 8.87, "grad_norm": 6.439455509185791, "learning_rate": 2.269904009034444e-06, "loss": 0.4632, "step": 47100 }, { "epoch": 8.87, "grad_norm": 11.4172945022583, "learning_rate": 2.2661396574440053e-06, "loss": 0.8312, "step": 47110 }, { "epoch": 8.87, "grad_norm": 3.4869778156280518, "learning_rate": 2.2623753058535668e-06, "loss": 0.5454, "step": 47120 }, { "epoch": 8.87, "grad_norm": 10.910449981689453, "learning_rate": 2.2586109542631287e-06, "loss": 0.4681, 
"step": 47130 }, { "epoch": 8.87, "grad_norm": 0.8648974299430847, "learning_rate": 2.2548466026726897e-06, "loss": 0.4555, "step": 47140 }, { "epoch": 8.87, "grad_norm": 6.891892910003662, "learning_rate": 2.251082251082251e-06, "loss": 0.7042, "step": 47150 }, { "epoch": 8.88, "grad_norm": 14.134632110595703, "learning_rate": 2.2473178994918126e-06, "loss": 0.5195, "step": 47160 }, { "epoch": 8.88, "grad_norm": 17.393081665039062, "learning_rate": 2.243553547901374e-06, "loss": 0.6792, "step": 47170 }, { "epoch": 8.88, "grad_norm": 11.700157165527344, "learning_rate": 2.2397891963109355e-06, "loss": 0.7832, "step": 47180 }, { "epoch": 8.88, "grad_norm": 5.2067365646362305, "learning_rate": 2.236024844720497e-06, "loss": 0.6385, "step": 47190 }, { "epoch": 8.88, "grad_norm": 11.731846809387207, "learning_rate": 2.2322604931300585e-06, "loss": 0.7581, "step": 47200 }, { "epoch": 8.89, "grad_norm": 7.072956085205078, "learning_rate": 2.22849614153962e-06, "loss": 0.4065, "step": 47210 }, { "epoch": 8.89, "grad_norm": 17.5797061920166, "learning_rate": 2.2247317899491814e-06, "loss": 0.6431, "step": 47220 }, { "epoch": 8.89, "grad_norm": 23.6998348236084, "learning_rate": 2.220967438358743e-06, "loss": 0.6196, "step": 47230 }, { "epoch": 8.89, "grad_norm": 9.595467567443848, "learning_rate": 2.2172030867683043e-06, "loss": 0.5062, "step": 47240 }, { "epoch": 8.89, "grad_norm": 20.422664642333984, "learning_rate": 2.2134387351778658e-06, "loss": 0.8374, "step": 47250 }, { "epoch": 8.9, "grad_norm": 1.3820518255233765, "learning_rate": 2.2096743835874272e-06, "loss": 0.578, "step": 47260 }, { "epoch": 8.9, "grad_norm": 13.483983039855957, "learning_rate": 2.2059100319969887e-06, "loss": 0.712, "step": 47270 }, { "epoch": 8.9, "grad_norm": 1.5068944692611694, "learning_rate": 2.20214568040655e-06, "loss": 0.6054, "step": 47280 }, { "epoch": 8.9, "grad_norm": 7.836757183074951, "learning_rate": 2.1983813288161116e-06, "loss": 0.5263, "step": 47290 }, { "epoch": 8.9, "grad_norm": 1.4360402822494507, "learning_rate": 2.194616977225673e-06, "loss": 0.3449, "step": 47300 }, { "epoch": 8.9, "grad_norm": 1.4591114521026611, "learning_rate": 2.1908526256352345e-06, "loss": 0.504, "step": 47310 }, { "epoch": 8.91, "grad_norm": 21.37377166748047, "learning_rate": 2.187088274044796e-06, "loss": 0.9963, "step": 47320 }, { "epoch": 8.91, "grad_norm": 10.65417766571045, "learning_rate": 2.1833239224543575e-06, "loss": 0.6325, "step": 47330 }, { "epoch": 8.91, "grad_norm": 4.807913780212402, "learning_rate": 2.179559570863919e-06, "loss": 0.4671, "step": 47340 }, { "epoch": 8.91, "grad_norm": 22.458324432373047, "learning_rate": 2.1757952192734804e-06, "loss": 0.5971, "step": 47350 }, { "epoch": 8.91, "grad_norm": 5.206461429595947, "learning_rate": 2.172030867683042e-06, "loss": 0.7577, "step": 47360 }, { "epoch": 8.92, "grad_norm": 30.846031188964844, "learning_rate": 2.1682665160926033e-06, "loss": 0.5392, "step": 47370 }, { "epoch": 8.92, "grad_norm": 6.488540172576904, "learning_rate": 2.1645021645021648e-06, "loss": 0.7281, "step": 47380 }, { "epoch": 8.92, "grad_norm": 11.6080904006958, "learning_rate": 2.1607378129117262e-06, "loss": 0.574, "step": 47390 }, { "epoch": 8.92, "grad_norm": 2.8382701873779297, "learning_rate": 2.1569734613212877e-06, "loss": 0.3675, "step": 47400 }, { "epoch": 8.92, "grad_norm": 31.578075408935547, "learning_rate": 2.153209109730849e-06, "loss": 0.4778, "step": 47410 }, { "epoch": 8.93, "grad_norm": 14.79430866241455, "learning_rate": 2.1494447581404106e-06, "loss": 
0.6087, "step": 47420 }, { "epoch": 8.93, "grad_norm": 42.76433181762695, "learning_rate": 2.145680406549972e-06, "loss": 0.9845, "step": 47430 }, { "epoch": 8.93, "grad_norm": 4.474719524383545, "learning_rate": 2.1419160549595335e-06, "loss": 0.6086, "step": 47440 }, { "epoch": 8.93, "grad_norm": 13.402969360351562, "learning_rate": 2.138151703369095e-06, "loss": 0.8404, "step": 47450 }, { "epoch": 8.93, "grad_norm": 5.0676703453063965, "learning_rate": 2.134387351778656e-06, "loss": 0.6742, "step": 47460 }, { "epoch": 8.93, "grad_norm": 36.38506317138672, "learning_rate": 2.130623000188218e-06, "loss": 0.5551, "step": 47470 }, { "epoch": 8.94, "grad_norm": 28.374101638793945, "learning_rate": 2.126858648597779e-06, "loss": 0.5895, "step": 47480 }, { "epoch": 8.94, "grad_norm": 6.351111888885498, "learning_rate": 2.123094297007341e-06, "loss": 0.5389, "step": 47490 }, { "epoch": 8.94, "grad_norm": 27.408905029296875, "learning_rate": 2.1193299454169023e-06, "loss": 0.4278, "step": 47500 }, { "epoch": 8.94, "grad_norm": 3.6693081855773926, "learning_rate": 2.1155655938264637e-06, "loss": 0.7052, "step": 47510 }, { "epoch": 8.94, "grad_norm": 1.8343833684921265, "learning_rate": 2.111801242236025e-06, "loss": 0.7768, "step": 47520 }, { "epoch": 8.95, "grad_norm": 22.67610740661621, "learning_rate": 2.1080368906455862e-06, "loss": 0.6645, "step": 47530 }, { "epoch": 8.95, "grad_norm": 1.9589619636535645, "learning_rate": 2.104272539055148e-06, "loss": 0.5998, "step": 47540 }, { "epoch": 8.95, "grad_norm": 2.940026044845581, "learning_rate": 2.100508187464709e-06, "loss": 0.762, "step": 47550 }, { "epoch": 8.95, "grad_norm": 24.99663734436035, "learning_rate": 2.096743835874271e-06, "loss": 0.7016, "step": 47560 }, { "epoch": 8.95, "grad_norm": 21.34873390197754, "learning_rate": 2.092979484283832e-06, "loss": 0.5024, "step": 47570 }, { "epoch": 8.96, "grad_norm": 11.582036972045898, "learning_rate": 2.089215132693394e-06, "loss": 0.5082, "step": 47580 }, { "epoch": 8.96, "grad_norm": 10.478972434997559, "learning_rate": 2.085450781102955e-06, "loss": 1.0764, "step": 47590 }, { "epoch": 8.96, "grad_norm": 4.743780136108398, "learning_rate": 2.0816864295125165e-06, "loss": 0.3936, "step": 47600 }, { "epoch": 8.96, "grad_norm": 7.1367669105529785, "learning_rate": 2.0779220779220784e-06, "loss": 0.8078, "step": 47610 }, { "epoch": 8.96, "grad_norm": 14.184408187866211, "learning_rate": 2.0741577263316394e-06, "loss": 0.5983, "step": 47620 }, { "epoch": 8.96, "grad_norm": 14.514617919921875, "learning_rate": 2.0703933747412013e-06, "loss": 0.6006, "step": 47630 }, { "epoch": 8.97, "grad_norm": 21.567785263061523, "learning_rate": 2.0666290231507623e-06, "loss": 0.5477, "step": 47640 }, { "epoch": 8.97, "grad_norm": 11.7056303024292, "learning_rate": 2.062864671560324e-06, "loss": 0.603, "step": 47650 }, { "epoch": 8.97, "grad_norm": 5.441671371459961, "learning_rate": 2.0591003199698852e-06, "loss": 0.6352, "step": 47660 }, { "epoch": 8.97, "grad_norm": 30.076702117919922, "learning_rate": 2.0553359683794467e-06, "loss": 0.4807, "step": 47670 }, { "epoch": 8.97, "grad_norm": 6.434941291809082, "learning_rate": 2.051571616789008e-06, "loss": 0.2713, "step": 47680 }, { "epoch": 8.98, "grad_norm": 5.448329925537109, "learning_rate": 2.0478072651985696e-06, "loss": 0.3884, "step": 47690 }, { "epoch": 8.98, "grad_norm": 15.881041526794434, "learning_rate": 2.044042913608131e-06, "loss": 0.7185, "step": 47700 }, { "epoch": 8.98, "grad_norm": 16.072519302368164, "learning_rate": 
2.0402785620176925e-06, "loss": 0.7682, "step": 47710 }, { "epoch": 8.98, "grad_norm": 2.7786433696746826, "learning_rate": 2.036514210427254e-06, "loss": 0.7186, "step": 47720 }, { "epoch": 8.98, "grad_norm": 10.622347831726074, "learning_rate": 2.0327498588368155e-06, "loss": 0.4433, "step": 47730 }, { "epoch": 8.99, "grad_norm": 11.719313621520996, "learning_rate": 2.028985507246377e-06, "loss": 0.5505, "step": 47740 }, { "epoch": 8.99, "grad_norm": 11.617704391479492, "learning_rate": 2.0252211556559384e-06, "loss": 0.4553, "step": 47750 }, { "epoch": 8.99, "grad_norm": 16.727815628051758, "learning_rate": 2.0214568040655e-06, "loss": 0.6204, "step": 47760 }, { "epoch": 8.99, "grad_norm": 9.667970657348633, "learning_rate": 2.0176924524750613e-06, "loss": 0.5313, "step": 47770 }, { "epoch": 8.99, "grad_norm": 9.410597801208496, "learning_rate": 2.0139281008846228e-06, "loss": 0.9936, "step": 47780 }, { "epoch": 8.99, "grad_norm": 4.794157981872559, "learning_rate": 2.0101637492941842e-06, "loss": 0.674, "step": 47790 }, { "epoch": 9.0, "grad_norm": 20.21316909790039, "learning_rate": 2.0063993977037457e-06, "loss": 0.5472, "step": 47800 }, { "epoch": 9.0, "grad_norm": 29.136646270751953, "learning_rate": 2.002635046113307e-06, "loss": 0.7448, "step": 47810 }, { "epoch": 9.0, "eval_accuracy": 0.8113333333333334, "eval_loss": 0.8612932562828064, "eval_runtime": 31.1046, "eval_samples_per_second": 241.122, "eval_steps_per_second": 30.156, "step": 47817 }, { "epoch": 9.0, "grad_norm": 1.5622543096542358, "learning_rate": 1.9988706945228686e-06, "loss": 0.9429, "step": 47820 }, { "epoch": 9.0, "grad_norm": 25.724414825439453, "learning_rate": 1.99510634293243e-06, "loss": 0.7602, "step": 47830 }, { "epoch": 9.0, "grad_norm": 8.290772438049316, "learning_rate": 1.9913419913419915e-06, "loss": 0.5284, "step": 47840 }, { "epoch": 9.01, "grad_norm": 16.59626007080078, "learning_rate": 1.987577639751553e-06, "loss": 0.7048, "step": 47850 }, { "epoch": 9.01, "grad_norm": 4.591279983520508, "learning_rate": 1.9838132881611144e-06, "loss": 0.6146, "step": 47860 }, { "epoch": 9.01, "grad_norm": 7.34530782699585, "learning_rate": 1.980048936570676e-06, "loss": 0.6408, "step": 47870 }, { "epoch": 9.01, "grad_norm": 18.267234802246094, "learning_rate": 1.9762845849802374e-06, "loss": 0.6059, "step": 47880 }, { "epoch": 9.01, "grad_norm": 11.07114315032959, "learning_rate": 1.972520233389799e-06, "loss": 0.4136, "step": 47890 }, { "epoch": 9.02, "grad_norm": 75.70472717285156, "learning_rate": 1.9687558817993603e-06, "loss": 0.6932, "step": 47900 }, { "epoch": 9.02, "grad_norm": 15.39134407043457, "learning_rate": 1.9649915302089217e-06, "loss": 0.5755, "step": 47910 }, { "epoch": 9.02, "grad_norm": 2.378514528274536, "learning_rate": 1.961227178618483e-06, "loss": 0.4398, "step": 47920 }, { "epoch": 9.02, "grad_norm": 15.460169792175293, "learning_rate": 1.9574628270280447e-06, "loss": 0.5572, "step": 47930 }, { "epoch": 9.02, "grad_norm": 1.4817324876785278, "learning_rate": 1.953698475437606e-06, "loss": 0.798, "step": 47940 }, { "epoch": 9.03, "grad_norm": 2.4670777320861816, "learning_rate": 1.9499341238471676e-06, "loss": 0.2853, "step": 47950 }, { "epoch": 9.03, "grad_norm": 12.047622680664062, "learning_rate": 1.9461697722567286e-06, "loss": 0.6567, "step": 47960 }, { "epoch": 9.03, "grad_norm": 4.017693519592285, "learning_rate": 1.9424054206662905e-06, "loss": 0.715, "step": 47970 }, { "epoch": 9.03, "grad_norm": 9.473812103271484, "learning_rate": 1.938641069075852e-06, "loss": 0.9742, 
"step": 47980 }, { "epoch": 9.03, "grad_norm": 21.250436782836914, "learning_rate": 1.9348767174854134e-06, "loss": 0.7318, "step": 47990 }, { "epoch": 9.03, "grad_norm": 6.7031731605529785, "learning_rate": 1.931112365894975e-06, "loss": 0.5279, "step": 48000 }, { "epoch": 9.04, "grad_norm": 15.282720565795898, "learning_rate": 1.9273480143045364e-06, "loss": 0.6562, "step": 48010 }, { "epoch": 9.04, "grad_norm": 3.4590566158294678, "learning_rate": 1.923583662714098e-06, "loss": 0.5608, "step": 48020 }, { "epoch": 9.04, "grad_norm": 2.538116693496704, "learning_rate": 1.919819311123659e-06, "loss": 0.4783, "step": 48030 }, { "epoch": 9.04, "grad_norm": 20.771568298339844, "learning_rate": 1.9160549595332207e-06, "loss": 0.4992, "step": 48040 }, { "epoch": 9.04, "grad_norm": 6.520461082458496, "learning_rate": 1.9122906079427818e-06, "loss": 0.5313, "step": 48050 }, { "epoch": 9.05, "grad_norm": 6.486697196960449, "learning_rate": 1.9085262563523437e-06, "loss": 0.4209, "step": 48060 }, { "epoch": 9.05, "grad_norm": 8.662752151489258, "learning_rate": 1.904761904761905e-06, "loss": 0.8395, "step": 48070 }, { "epoch": 9.05, "grad_norm": 1.8475117683410645, "learning_rate": 1.9009975531714664e-06, "loss": 0.4631, "step": 48080 }, { "epoch": 9.05, "grad_norm": 19.98015594482422, "learning_rate": 1.8972332015810276e-06, "loss": 0.4317, "step": 48090 }, { "epoch": 9.05, "grad_norm": 7.7947492599487305, "learning_rate": 1.8934688499905893e-06, "loss": 0.4172, "step": 48100 }, { "epoch": 9.06, "grad_norm": 21.189083099365234, "learning_rate": 1.8897044984001508e-06, "loss": 0.446, "step": 48110 }, { "epoch": 9.06, "grad_norm": 4.611180305480957, "learning_rate": 1.8859401468097122e-06, "loss": 0.3936, "step": 48120 }, { "epoch": 9.06, "grad_norm": 4.97189474105835, "learning_rate": 1.8821757952192737e-06, "loss": 0.4853, "step": 48130 }, { "epoch": 9.06, "grad_norm": 6.571353912353516, "learning_rate": 1.8784114436288351e-06, "loss": 0.2693, "step": 48140 }, { "epoch": 9.06, "grad_norm": 17.947668075561523, "learning_rate": 1.8746470920383966e-06, "loss": 0.6615, "step": 48150 }, { "epoch": 9.06, "grad_norm": 5.229343414306641, "learning_rate": 1.8708827404479578e-06, "loss": 0.8147, "step": 48160 }, { "epoch": 9.07, "grad_norm": 3.2106659412384033, "learning_rate": 1.8671183888575195e-06, "loss": 0.8488, "step": 48170 }, { "epoch": 9.07, "grad_norm": 8.611183166503906, "learning_rate": 1.8633540372670808e-06, "loss": 0.5314, "step": 48180 }, { "epoch": 9.07, "grad_norm": 5.848559856414795, "learning_rate": 1.8595896856766424e-06, "loss": 0.6859, "step": 48190 }, { "epoch": 9.07, "grad_norm": 7.425753593444824, "learning_rate": 1.8558253340862037e-06, "loss": 0.4937, "step": 48200 }, { "epoch": 9.07, "grad_norm": 22.03529930114746, "learning_rate": 1.8520609824957654e-06, "loss": 0.4312, "step": 48210 }, { "epoch": 9.08, "grad_norm": 15.623173713684082, "learning_rate": 1.8482966309053266e-06, "loss": 0.6556, "step": 48220 }, { "epoch": 9.08, "grad_norm": 16.413898468017578, "learning_rate": 1.844532279314888e-06, "loss": 0.7305, "step": 48230 }, { "epoch": 9.08, "grad_norm": 4.784519672393799, "learning_rate": 1.8407679277244497e-06, "loss": 0.42, "step": 48240 }, { "epoch": 9.08, "grad_norm": 45.84784698486328, "learning_rate": 1.837003576134011e-06, "loss": 0.9506, "step": 48250 }, { "epoch": 9.08, "grad_norm": 17.076520919799805, "learning_rate": 1.8332392245435727e-06, "loss": 0.3164, "step": 48260 }, { "epoch": 9.09, "grad_norm": 25.98237419128418, "learning_rate": 1.829474872953134e-06, 
"loss": 0.6438, "step": 48270 }, { "epoch": 9.09, "grad_norm": 2.230210781097412, "learning_rate": 1.8257105213626956e-06, "loss": 0.3658, "step": 48280 }, { "epoch": 9.09, "grad_norm": 1.854834794998169, "learning_rate": 1.8219461697722568e-06, "loss": 0.4779, "step": 48290 }, { "epoch": 9.09, "grad_norm": 9.681106567382812, "learning_rate": 1.8181818181818183e-06, "loss": 0.5547, "step": 48300 }, { "epoch": 9.09, "grad_norm": 16.961469650268555, "learning_rate": 1.8144174665913798e-06, "loss": 0.532, "step": 48310 }, { "epoch": 9.09, "grad_norm": 11.956188201904297, "learning_rate": 1.8106531150009412e-06, "loss": 0.5572, "step": 48320 }, { "epoch": 9.1, "grad_norm": 5.145517349243164, "learning_rate": 1.8068887634105025e-06, "loss": 0.7962, "step": 48330 }, { "epoch": 9.1, "grad_norm": 13.83715534210205, "learning_rate": 1.8031244118200641e-06, "loss": 0.6924, "step": 48340 }, { "epoch": 9.1, "grad_norm": 27.427486419677734, "learning_rate": 1.7993600602296258e-06, "loss": 0.6151, "step": 48350 }, { "epoch": 9.1, "grad_norm": 1.7132238149642944, "learning_rate": 1.795595708639187e-06, "loss": 0.412, "step": 48360 }, { "epoch": 9.1, "grad_norm": 6.342550754547119, "learning_rate": 1.7918313570487485e-06, "loss": 0.6804, "step": 48370 }, { "epoch": 9.11, "grad_norm": 14.876754760742188, "learning_rate": 1.78806700545831e-06, "loss": 0.7302, "step": 48380 }, { "epoch": 9.11, "grad_norm": 8.127811431884766, "learning_rate": 1.7843026538678714e-06, "loss": 0.6995, "step": 48390 }, { "epoch": 9.11, "grad_norm": 11.656697273254395, "learning_rate": 1.780538302277433e-06, "loss": 0.7639, "step": 48400 }, { "epoch": 9.11, "grad_norm": 8.754948616027832, "learning_rate": 1.7767739506869944e-06, "loss": 0.5654, "step": 48410 }, { "epoch": 9.11, "grad_norm": 14.82595157623291, "learning_rate": 1.7730095990965556e-06, "loss": 0.6221, "step": 48420 }, { "epoch": 9.12, "grad_norm": 8.703484535217285, "learning_rate": 1.7692452475061173e-06, "loss": 0.4574, "step": 48430 }, { "epoch": 9.12, "grad_norm": 16.365890502929688, "learning_rate": 1.7654808959156785e-06, "loss": 0.8169, "step": 48440 }, { "epoch": 9.12, "grad_norm": 33.57789993286133, "learning_rate": 1.7617165443252402e-06, "loss": 0.4659, "step": 48450 }, { "epoch": 9.12, "grad_norm": 18.249784469604492, "learning_rate": 1.7579521927348015e-06, "loss": 0.8234, "step": 48460 }, { "epoch": 9.12, "grad_norm": 6.25030517578125, "learning_rate": 1.7541878411443631e-06, "loss": 0.3762, "step": 48470 }, { "epoch": 9.12, "grad_norm": 21.136138916015625, "learning_rate": 1.7504234895539246e-06, "loss": 0.5552, "step": 48480 }, { "epoch": 9.13, "grad_norm": 12.224783897399902, "learning_rate": 1.7466591379634858e-06, "loss": 0.3561, "step": 48490 }, { "epoch": 9.13, "grad_norm": 11.935855865478516, "learning_rate": 1.7428947863730475e-06, "loss": 0.842, "step": 48500 }, { "epoch": 9.13, "grad_norm": 30.98661231994629, "learning_rate": 1.7391304347826088e-06, "loss": 0.214, "step": 48510 }, { "epoch": 9.13, "grad_norm": 14.18700122833252, "learning_rate": 1.7353660831921704e-06, "loss": 0.5252, "step": 48520 }, { "epoch": 9.13, "grad_norm": 1.2159621715545654, "learning_rate": 1.7316017316017317e-06, "loss": 0.5819, "step": 48530 }, { "epoch": 9.14, "grad_norm": 7.028285503387451, "learning_rate": 1.7278373800112933e-06, "loss": 0.6709, "step": 48540 }, { "epoch": 9.14, "grad_norm": 33.46006393432617, "learning_rate": 1.7240730284208546e-06, "loss": 0.4328, "step": 48550 }, { "epoch": 9.14, "grad_norm": 23.722309112548828, "learning_rate": 
1.720308676830416e-06, "loss": 0.7447, "step": 48560 }, { "epoch": 9.14, "grad_norm": 37.20194625854492, "learning_rate": 1.7165443252399775e-06, "loss": 0.5995, "step": 48570 }, { "epoch": 9.14, "grad_norm": 19.940744400024414, "learning_rate": 1.712779973649539e-06, "loss": 0.5126, "step": 48580 }, { "epoch": 9.15, "grad_norm": 12.266576766967773, "learning_rate": 1.7090156220591002e-06, "loss": 0.6911, "step": 48590 }, { "epoch": 9.15, "grad_norm": 28.25867462158203, "learning_rate": 1.705251270468662e-06, "loss": 0.8396, "step": 48600 }, { "epoch": 9.15, "grad_norm": 11.73324203491211, "learning_rate": 1.7014869188782236e-06, "loss": 0.624, "step": 48610 }, { "epoch": 9.15, "grad_norm": 3.341336727142334, "learning_rate": 1.6977225672877848e-06, "loss": 0.83, "step": 48620 }, { "epoch": 9.15, "grad_norm": 8.438216209411621, "learning_rate": 1.6939582156973463e-06, "loss": 0.4398, "step": 48630 }, { "epoch": 9.15, "grad_norm": 11.439131736755371, "learning_rate": 1.6901938641069077e-06, "loss": 0.6164, "step": 48640 }, { "epoch": 9.16, "grad_norm": 1.03106689453125, "learning_rate": 1.6864295125164692e-06, "loss": 0.5839, "step": 48650 }, { "epoch": 9.16, "grad_norm": 7.770140647888184, "learning_rate": 1.6826651609260305e-06, "loss": 0.5815, "step": 48660 }, { "epoch": 9.16, "grad_norm": 25.574764251708984, "learning_rate": 1.6789008093355921e-06, "loss": 0.757, "step": 48670 }, { "epoch": 9.16, "grad_norm": 21.2239933013916, "learning_rate": 1.6751364577451534e-06, "loss": 0.5761, "step": 48680 }, { "epoch": 9.16, "grad_norm": 6.24124002456665, "learning_rate": 1.671372106154715e-06, "loss": 0.6327, "step": 48690 }, { "epoch": 9.17, "grad_norm": 18.178970336914062, "learning_rate": 1.6676077545642763e-06, "loss": 0.7177, "step": 48700 }, { "epoch": 9.17, "grad_norm": 11.819021224975586, "learning_rate": 1.663843402973838e-06, "loss": 0.4652, "step": 48710 }, { "epoch": 9.17, "grad_norm": 38.98137664794922, "learning_rate": 1.6600790513833994e-06, "loss": 0.5361, "step": 48720 }, { "epoch": 9.17, "grad_norm": 6.611007213592529, "learning_rate": 1.6563146997929607e-06, "loss": 0.7811, "step": 48730 }, { "epoch": 9.17, "grad_norm": 4.096015453338623, "learning_rate": 1.6525503482025224e-06, "loss": 0.8429, "step": 48740 }, { "epoch": 9.18, "grad_norm": 2.697685956954956, "learning_rate": 1.6487859966120836e-06, "loss": 0.4988, "step": 48750 }, { "epoch": 9.18, "grad_norm": 16.236778259277344, "learning_rate": 1.6450216450216453e-06, "loss": 0.8066, "step": 48760 }, { "epoch": 9.18, "grad_norm": 6.6401238441467285, "learning_rate": 1.6412572934312065e-06, "loss": 0.7341, "step": 48770 }, { "epoch": 9.18, "grad_norm": 11.671562194824219, "learning_rate": 1.6374929418407682e-06, "loss": 0.7202, "step": 48780 }, { "epoch": 9.18, "grad_norm": 25.716503143310547, "learning_rate": 1.6337285902503294e-06, "loss": 0.3903, "step": 48790 }, { "epoch": 9.19, "grad_norm": 35.046539306640625, "learning_rate": 1.629964238659891e-06, "loss": 0.5261, "step": 48800 }, { "epoch": 9.19, "grad_norm": 5.636472225189209, "learning_rate": 1.6261998870694524e-06, "loss": 0.3858, "step": 48810 }, { "epoch": 9.19, "grad_norm": 25.642667770385742, "learning_rate": 1.6224355354790138e-06, "loss": 0.5336, "step": 48820 }, { "epoch": 9.19, "grad_norm": 2.0963945388793945, "learning_rate": 1.6186711838885753e-06, "loss": 0.4974, "step": 48830 }, { "epoch": 9.19, "grad_norm": 1.623321294784546, "learning_rate": 1.6149068322981367e-06, "loss": 0.3653, "step": 48840 }, { "epoch": 9.19, "grad_norm": 9.711484909057617, 
"learning_rate": 1.6111424807076984e-06, "loss": 0.4682, "step": 48850 }, { "epoch": 9.2, "grad_norm": 3.289696216583252, "learning_rate": 1.6073781291172597e-06, "loss": 0.6343, "step": 48860 }, { "epoch": 9.2, "grad_norm": 14.545419692993164, "learning_rate": 1.6036137775268213e-06, "loss": 0.7178, "step": 48870 }, { "epoch": 9.2, "grad_norm": 6.24910831451416, "learning_rate": 1.5998494259363826e-06, "loss": 0.5711, "step": 48880 }, { "epoch": 9.2, "grad_norm": 2.173504590988159, "learning_rate": 1.596085074345944e-06, "loss": 0.4992, "step": 48890 }, { "epoch": 9.2, "grad_norm": 25.810836791992188, "learning_rate": 1.5923207227555055e-06, "loss": 0.6941, "step": 48900 }, { "epoch": 9.21, "grad_norm": 8.503079414367676, "learning_rate": 1.588556371165067e-06, "loss": 0.5236, "step": 48910 }, { "epoch": 9.21, "grad_norm": 2.8896260261535645, "learning_rate": 1.5847920195746282e-06, "loss": 0.9549, "step": 48920 }, { "epoch": 9.21, "grad_norm": 7.186822891235352, "learning_rate": 1.5810276679841899e-06, "loss": 0.8493, "step": 48930 }, { "epoch": 9.21, "grad_norm": 1.0297064781188965, "learning_rate": 1.5772633163937511e-06, "loss": 0.5096, "step": 48940 }, { "epoch": 9.21, "grad_norm": 9.921859741210938, "learning_rate": 1.5734989648033128e-06, "loss": 0.5277, "step": 48950 }, { "epoch": 9.22, "grad_norm": 18.231931686401367, "learning_rate": 1.5697346132128743e-06, "loss": 0.6129, "step": 48960 }, { "epoch": 9.22, "grad_norm": 22.623863220214844, "learning_rate": 1.5659702616224357e-06, "loss": 0.5031, "step": 48970 }, { "epoch": 9.22, "grad_norm": 15.760795593261719, "learning_rate": 1.5622059100319972e-06, "loss": 0.4334, "step": 48980 }, { "epoch": 9.22, "grad_norm": 4.488766670227051, "learning_rate": 1.5584415584415584e-06, "loss": 0.3044, "step": 48990 }, { "epoch": 9.22, "grad_norm": 18.082365036010742, "learning_rate": 1.5546772068511201e-06, "loss": 0.9271, "step": 49000 }, { "epoch": 9.22, "grad_norm": 16.371353149414062, "learning_rate": 1.5509128552606814e-06, "loss": 0.5857, "step": 49010 }, { "epoch": 9.23, "grad_norm": 1.1179338693618774, "learning_rate": 1.547148503670243e-06, "loss": 0.4814, "step": 49020 }, { "epoch": 9.23, "grad_norm": 2.686447858810425, "learning_rate": 1.5433841520798043e-06, "loss": 0.6662, "step": 49030 }, { "epoch": 9.23, "grad_norm": 12.188923835754395, "learning_rate": 1.539619800489366e-06, "loss": 0.6355, "step": 49040 }, { "epoch": 9.23, "grad_norm": 8.136433601379395, "learning_rate": 1.5358554488989272e-06, "loss": 0.7515, "step": 49050 }, { "epoch": 9.23, "grad_norm": 20.931377410888672, "learning_rate": 1.5320910973084887e-06, "loss": 0.6187, "step": 49060 }, { "epoch": 9.24, "grad_norm": 3.9280941486358643, "learning_rate": 1.5283267457180501e-06, "loss": 0.4498, "step": 49070 }, { "epoch": 9.24, "grad_norm": 0.3402310311794281, "learning_rate": 1.5245623941276116e-06, "loss": 0.5595, "step": 49080 }, { "epoch": 9.24, "grad_norm": 9.078661918640137, "learning_rate": 1.5207980425371733e-06, "loss": 0.7489, "step": 49090 }, { "epoch": 9.24, "grad_norm": 30.37358856201172, "learning_rate": 1.5170336909467345e-06, "loss": 0.3654, "step": 49100 }, { "epoch": 9.24, "grad_norm": 10.291831016540527, "learning_rate": 1.5132693393562962e-06, "loss": 0.5645, "step": 49110 }, { "epoch": 9.25, "grad_norm": 7.896694660186768, "learning_rate": 1.5095049877658574e-06, "loss": 0.2895, "step": 49120 }, { "epoch": 9.25, "grad_norm": 0.40163493156433105, "learning_rate": 1.505740636175419e-06, "loss": 0.5064, "step": 49130 }, { "epoch": 9.25, "grad_norm": 
20.281349182128906, "learning_rate": 1.5019762845849804e-06, "loss": 0.6789, "step": 49140 }, { "epoch": 9.25, "grad_norm": 9.524188041687012, "learning_rate": 1.4982119329945418e-06, "loss": 0.3622, "step": 49150 }, { "epoch": 9.25, "grad_norm": 16.541772842407227, "learning_rate": 1.4944475814041033e-06, "loss": 0.3041, "step": 49160 }, { "epoch": 9.25, "grad_norm": 35.032470703125, "learning_rate": 1.4906832298136647e-06, "loss": 0.454, "step": 49170 }, { "epoch": 9.26, "grad_norm": 21.02794075012207, "learning_rate": 1.486918878223226e-06, "loss": 0.694, "step": 49180 }, { "epoch": 9.26, "grad_norm": 15.566442489624023, "learning_rate": 1.4831545266327877e-06, "loss": 0.8165, "step": 49190 }, { "epoch": 9.26, "grad_norm": 4.007313251495361, "learning_rate": 1.479390175042349e-06, "loss": 0.4152, "step": 49200 }, { "epoch": 9.26, "grad_norm": 18.187461853027344, "learning_rate": 1.4756258234519106e-06, "loss": 0.386, "step": 49210 }, { "epoch": 9.26, "grad_norm": 4.756071090698242, "learning_rate": 1.471861471861472e-06, "loss": 0.7949, "step": 49220 }, { "epoch": 9.27, "grad_norm": 14.6629056930542, "learning_rate": 1.4680971202710335e-06, "loss": 0.313, "step": 49230 }, { "epoch": 9.27, "grad_norm": 1.6356908082962036, "learning_rate": 1.464332768680595e-06, "loss": 0.5506, "step": 49240 }, { "epoch": 9.27, "grad_norm": 0.9106524586677551, "learning_rate": 1.4605684170901562e-06, "loss": 0.6186, "step": 49250 }, { "epoch": 9.27, "grad_norm": 4.561417579650879, "learning_rate": 1.4568040654997179e-06, "loss": 0.4145, "step": 49260 }, { "epoch": 9.27, "grad_norm": 13.317676544189453, "learning_rate": 1.4530397139092791e-06, "loss": 1.0367, "step": 49270 }, { "epoch": 9.28, "grad_norm": 6.988114833831787, "learning_rate": 1.4492753623188408e-06, "loss": 0.4514, "step": 49280 }, { "epoch": 9.28, "grad_norm": 31.31980323791504, "learning_rate": 1.445511010728402e-06, "loss": 0.7261, "step": 49290 }, { "epoch": 9.28, "grad_norm": 2.1789939403533936, "learning_rate": 1.4417466591379637e-06, "loss": 0.6407, "step": 49300 }, { "epoch": 9.28, "grad_norm": 24.293264389038086, "learning_rate": 1.437982307547525e-06, "loss": 0.4835, "step": 49310 }, { "epoch": 9.28, "grad_norm": 16.02501678466797, "learning_rate": 1.4342179559570864e-06, "loss": 0.5772, "step": 49320 }, { "epoch": 9.28, "grad_norm": 33.829254150390625, "learning_rate": 1.4304536043666481e-06, "loss": 0.6539, "step": 49330 }, { "epoch": 9.29, "grad_norm": 9.65316390991211, "learning_rate": 1.4266892527762094e-06, "loss": 0.6423, "step": 49340 }, { "epoch": 9.29, "grad_norm": 1.9328742027282715, "learning_rate": 1.422924901185771e-06, "loss": 0.7493, "step": 49350 }, { "epoch": 9.29, "grad_norm": 20.31842041015625, "learning_rate": 1.4191605495953323e-06, "loss": 0.6325, "step": 49360 }, { "epoch": 9.29, "grad_norm": 17.110515594482422, "learning_rate": 1.415396198004894e-06, "loss": 0.4475, "step": 49370 }, { "epoch": 9.29, "grad_norm": 7.140813827514648, "learning_rate": 1.4116318464144552e-06, "loss": 0.6138, "step": 49380 }, { "epoch": 9.3, "grad_norm": 62.30890655517578, "learning_rate": 1.4078674948240167e-06, "loss": 0.3909, "step": 49390 }, { "epoch": 9.3, "grad_norm": 1.9558814764022827, "learning_rate": 1.4041031432335781e-06, "loss": 0.5326, "step": 49400 }, { "epoch": 9.3, "grad_norm": 29.794363021850586, "learning_rate": 1.4003387916431396e-06, "loss": 0.7016, "step": 49410 }, { "epoch": 9.3, "grad_norm": 14.142224311828613, "learning_rate": 1.3965744400527008e-06, "loss": 0.6672, "step": 49420 }, { "epoch": 9.3, 
"grad_norm": 1.3822832107543945, "learning_rate": 1.3928100884622625e-06, "loss": 0.5274, "step": 49430 }, { "epoch": 9.31, "grad_norm": 5.6710286140441895, "learning_rate": 1.3890457368718238e-06, "loss": 0.4605, "step": 49440 }, { "epoch": 9.31, "grad_norm": 15.574535369873047, "learning_rate": 1.3852813852813854e-06, "loss": 0.5296, "step": 49450 }, { "epoch": 9.31, "grad_norm": 1.1573097705841064, "learning_rate": 1.3815170336909469e-06, "loss": 0.5312, "step": 49460 }, { "epoch": 9.31, "grad_norm": 12.77651596069336, "learning_rate": 1.3777526821005083e-06, "loss": 0.345, "step": 49470 }, { "epoch": 9.31, "grad_norm": 11.803595542907715, "learning_rate": 1.3739883305100698e-06, "loss": 0.3616, "step": 49480 }, { "epoch": 9.31, "grad_norm": 13.414774894714355, "learning_rate": 1.370223978919631e-06, "loss": 0.5606, "step": 49490 }, { "epoch": 9.32, "grad_norm": 1.8225449323654175, "learning_rate": 1.3664596273291927e-06, "loss": 0.4418, "step": 49500 }, { "epoch": 9.32, "grad_norm": 15.913640975952148, "learning_rate": 1.362695275738754e-06, "loss": 0.6582, "step": 49510 }, { "epoch": 9.32, "grad_norm": 4.247087478637695, "learning_rate": 1.3589309241483157e-06, "loss": 0.9087, "step": 49520 }, { "epoch": 9.32, "grad_norm": 10.692419052124023, "learning_rate": 1.355166572557877e-06, "loss": 0.4397, "step": 49530 }, { "epoch": 9.32, "grad_norm": 1.3395839929580688, "learning_rate": 1.3514022209674386e-06, "loss": 0.2051, "step": 49540 }, { "epoch": 9.33, "grad_norm": 4.697491645812988, "learning_rate": 1.3476378693769998e-06, "loss": 0.5434, "step": 49550 }, { "epoch": 9.33, "grad_norm": 5.93233060836792, "learning_rate": 1.3438735177865615e-06, "loss": 0.5759, "step": 49560 }, { "epoch": 9.33, "grad_norm": 31.52614974975586, "learning_rate": 1.3401091661961227e-06, "loss": 0.6128, "step": 49570 }, { "epoch": 9.33, "grad_norm": 17.150054931640625, "learning_rate": 1.3363448146056842e-06, "loss": 0.7897, "step": 49580 }, { "epoch": 9.33, "grad_norm": 5.239444255828857, "learning_rate": 1.3325804630152459e-06, "loss": 0.538, "step": 49590 }, { "epoch": 9.34, "grad_norm": 10.952408790588379, "learning_rate": 1.3288161114248071e-06, "loss": 0.4829, "step": 49600 }, { "epoch": 9.34, "grad_norm": 7.637601375579834, "learning_rate": 1.3250517598343688e-06, "loss": 0.709, "step": 49610 }, { "epoch": 9.34, "grad_norm": 11.83415699005127, "learning_rate": 1.32128740824393e-06, "loss": 0.4149, "step": 49620 }, { "epoch": 9.34, "grad_norm": 9.83401107788086, "learning_rate": 1.3175230566534917e-06, "loss": 0.8603, "step": 49630 }, { "epoch": 9.34, "grad_norm": 3.275660514831543, "learning_rate": 1.313758705063053e-06, "loss": 0.6182, "step": 49640 }, { "epoch": 9.35, "grad_norm": 7.85551118850708, "learning_rate": 1.3099943534726144e-06, "loss": 0.5872, "step": 49650 }, { "epoch": 9.35, "grad_norm": 29.82291603088379, "learning_rate": 1.3062300018821759e-06, "loss": 0.5029, "step": 49660 }, { "epoch": 9.35, "grad_norm": 25.748849868774414, "learning_rate": 1.3024656502917373e-06, "loss": 0.6461, "step": 49670 }, { "epoch": 9.35, "grad_norm": 12.761697769165039, "learning_rate": 1.2987012987012986e-06, "loss": 0.7729, "step": 49680 }, { "epoch": 9.35, "grad_norm": 18.305538177490234, "learning_rate": 1.2949369471108603e-06, "loss": 0.5715, "step": 49690 }, { "epoch": 9.35, "grad_norm": 43.01716613769531, "learning_rate": 1.291172595520422e-06, "loss": 0.8107, "step": 49700 }, { "epoch": 9.36, "grad_norm": 1.7720367908477783, "learning_rate": 1.2874082439299832e-06, "loss": 0.2419, "step": 49710 }, { 
"epoch": 9.36, "grad_norm": 23.167449951171875, "learning_rate": 1.2836438923395447e-06, "loss": 0.7143, "step": 49720 }, { "epoch": 9.36, "grad_norm": 19.44156265258789, "learning_rate": 1.2798795407491061e-06, "loss": 0.538, "step": 49730 }, { "epoch": 9.36, "grad_norm": 23.96051788330078, "learning_rate": 1.2761151891586676e-06, "loss": 0.7893, "step": 49740 }, { "epoch": 9.36, "grad_norm": 18.885520935058594, "learning_rate": 1.2723508375682288e-06, "loss": 0.6589, "step": 49750 }, { "epoch": 9.37, "grad_norm": 16.42365264892578, "learning_rate": 1.2685864859777905e-06, "loss": 0.6178, "step": 49760 }, { "epoch": 9.37, "grad_norm": 18.40915298461914, "learning_rate": 1.2648221343873517e-06, "loss": 0.4403, "step": 49770 }, { "epoch": 9.37, "grad_norm": 17.522653579711914, "learning_rate": 1.2610577827969134e-06, "loss": 0.5484, "step": 49780 }, { "epoch": 9.37, "grad_norm": 28.844148635864258, "learning_rate": 1.2572934312064747e-06, "loss": 0.8541, "step": 49790 }, { "epoch": 9.37, "grad_norm": 20.508644104003906, "learning_rate": 1.2535290796160363e-06, "loss": 0.7103, "step": 49800 }, { "epoch": 9.38, "grad_norm": 1.3447939157485962, "learning_rate": 1.2497647280255978e-06, "loss": 0.3501, "step": 49810 }, { "epoch": 9.38, "grad_norm": 19.830303192138672, "learning_rate": 1.246000376435159e-06, "loss": 0.9489, "step": 49820 }, { "epoch": 9.38, "grad_norm": 8.828612327575684, "learning_rate": 1.2422360248447205e-06, "loss": 0.5627, "step": 49830 }, { "epoch": 9.38, "grad_norm": 6.055992126464844, "learning_rate": 1.238471673254282e-06, "loss": 0.4459, "step": 49840 }, { "epoch": 9.38, "grad_norm": 2.648444890975952, "learning_rate": 1.2347073216638434e-06, "loss": 0.6248, "step": 49850 }, { "epoch": 9.38, "grad_norm": 1.98809015750885, "learning_rate": 1.2309429700734049e-06, "loss": 0.4276, "step": 49860 }, { "epoch": 9.39, "grad_norm": 4.863277912139893, "learning_rate": 1.2271786184829664e-06, "loss": 0.7212, "step": 49870 }, { "epoch": 9.39, "grad_norm": 15.093059539794922, "learning_rate": 1.223414266892528e-06, "loss": 0.7187, "step": 49880 }, { "epoch": 9.39, "grad_norm": 1.527540683746338, "learning_rate": 1.2196499153020893e-06, "loss": 0.394, "step": 49890 }, { "epoch": 9.39, "grad_norm": 12.30584716796875, "learning_rate": 1.2158855637116507e-06, "loss": 0.5784, "step": 49900 }, { "epoch": 9.39, "grad_norm": 15.094637870788574, "learning_rate": 1.2121212121212122e-06, "loss": 0.7876, "step": 49910 }, { "epoch": 9.4, "grad_norm": 4.709288120269775, "learning_rate": 1.2083568605307737e-06, "loss": 0.446, "step": 49920 }, { "epoch": 9.4, "grad_norm": 4.510318756103516, "learning_rate": 1.2045925089403351e-06, "loss": 0.3298, "step": 49930 }, { "epoch": 9.4, "grad_norm": 12.671858787536621, "learning_rate": 1.2008281573498966e-06, "loss": 0.4997, "step": 49940 }, { "epoch": 9.4, "grad_norm": 9.532928466796875, "learning_rate": 1.197063805759458e-06, "loss": 0.5138, "step": 49950 }, { "epoch": 9.4, "grad_norm": 13.062521934509277, "learning_rate": 1.1932994541690195e-06, "loss": 0.6847, "step": 49960 }, { "epoch": 9.41, "grad_norm": 15.044008255004883, "learning_rate": 1.189535102578581e-06, "loss": 0.7019, "step": 49970 }, { "epoch": 9.41, "grad_norm": 7.6698102951049805, "learning_rate": 1.1857707509881424e-06, "loss": 0.5531, "step": 49980 }, { "epoch": 9.41, "grad_norm": 27.95477294921875, "learning_rate": 1.1820063993977039e-06, "loss": 0.5766, "step": 49990 }, { "epoch": 9.41, "grad_norm": 14.472332954406738, "learning_rate": 1.1782420478072651e-06, "loss": 0.5564, 
"step": 50000 }, { "epoch": 9.41, "grad_norm": 2.641383171081543, "learning_rate": 1.1744776962168268e-06, "loss": 0.5283, "step": 50010 }, { "epoch": 9.41, "grad_norm": 3.7167069911956787, "learning_rate": 1.1707133446263883e-06, "loss": 0.3921, "step": 50020 }, { "epoch": 9.42, "grad_norm": 3.721633195877075, "learning_rate": 1.1669489930359497e-06, "loss": 0.6279, "step": 50030 }, { "epoch": 9.42, "grad_norm": 31.65423583984375, "learning_rate": 1.1631846414455112e-06, "loss": 0.7653, "step": 50040 }, { "epoch": 9.42, "grad_norm": 24.756643295288086, "learning_rate": 1.1594202898550726e-06, "loss": 0.5439, "step": 50050 }, { "epoch": 9.42, "grad_norm": 21.029985427856445, "learning_rate": 1.155655938264634e-06, "loss": 0.6155, "step": 50060 }, { "epoch": 9.42, "grad_norm": 26.169784545898438, "learning_rate": 1.1518915866741954e-06, "loss": 0.7292, "step": 50070 }, { "epoch": 9.43, "grad_norm": 13.06205940246582, "learning_rate": 1.1481272350837568e-06, "loss": 0.7705, "step": 50080 }, { "epoch": 9.43, "grad_norm": 8.912203788757324, "learning_rate": 1.1443628834933183e-06, "loss": 0.6972, "step": 50090 }, { "epoch": 9.43, "grad_norm": 9.975449562072754, "learning_rate": 1.1405985319028797e-06, "loss": 0.6996, "step": 50100 }, { "epoch": 9.43, "grad_norm": 1.276980996131897, "learning_rate": 1.1368341803124412e-06, "loss": 0.7684, "step": 50110 }, { "epoch": 9.43, "grad_norm": 19.19355583190918, "learning_rate": 1.1330698287220027e-06, "loss": 0.4533, "step": 50120 }, { "epoch": 9.44, "grad_norm": 11.982575416564941, "learning_rate": 1.1293054771315643e-06, "loss": 0.462, "step": 50130 }, { "epoch": 9.44, "grad_norm": 11.03071403503418, "learning_rate": 1.1255411255411256e-06, "loss": 0.8484, "step": 50140 }, { "epoch": 9.44, "grad_norm": 5.758443355560303, "learning_rate": 1.121776773950687e-06, "loss": 0.6782, "step": 50150 }, { "epoch": 9.44, "grad_norm": 8.203763961791992, "learning_rate": 1.1180124223602485e-06, "loss": 0.821, "step": 50160 }, { "epoch": 9.44, "grad_norm": 29.303573608398438, "learning_rate": 1.11424807076981e-06, "loss": 0.469, "step": 50170 }, { "epoch": 9.44, "grad_norm": 7.694312572479248, "learning_rate": 1.1104837191793714e-06, "loss": 0.7798, "step": 50180 }, { "epoch": 9.45, "grad_norm": 21.332275390625, "learning_rate": 1.1067193675889329e-06, "loss": 0.5166, "step": 50190 }, { "epoch": 9.45, "grad_norm": 6.681392192840576, "learning_rate": 1.1029550159984943e-06, "loss": 0.4061, "step": 50200 }, { "epoch": 9.45, "grad_norm": 13.169896125793457, "learning_rate": 1.0991906644080558e-06, "loss": 0.6083, "step": 50210 }, { "epoch": 9.45, "grad_norm": 5.735914707183838, "learning_rate": 1.0954263128176173e-06, "loss": 0.6267, "step": 50220 }, { "epoch": 9.45, "grad_norm": 5.603200435638428, "learning_rate": 1.0916619612271787e-06, "loss": 0.5674, "step": 50230 }, { "epoch": 9.46, "grad_norm": 19.91693687438965, "learning_rate": 1.0878976096367402e-06, "loss": 0.6007, "step": 50240 }, { "epoch": 9.46, "grad_norm": 11.66734790802002, "learning_rate": 1.0841332580463016e-06, "loss": 0.6519, "step": 50250 }, { "epoch": 9.46, "grad_norm": 1.6658735275268555, "learning_rate": 1.0803689064558631e-06, "loss": 0.6445, "step": 50260 }, { "epoch": 9.46, "grad_norm": 5.168521404266357, "learning_rate": 1.0766045548654246e-06, "loss": 0.6066, "step": 50270 }, { "epoch": 9.46, "grad_norm": 20.741867065429688, "learning_rate": 1.072840203274986e-06, "loss": 0.7539, "step": 50280 }, { "epoch": 9.47, "grad_norm": 2.8274893760681152, "learning_rate": 1.0690758516845475e-06, 
"loss": 0.6486, "step": 50290 }, { "epoch": 9.47, "grad_norm": 27.90814971923828, "learning_rate": 1.065311500094109e-06, "loss": 0.6388, "step": 50300 }, { "epoch": 9.47, "grad_norm": 35.642887115478516, "learning_rate": 1.0615471485036704e-06, "loss": 0.7001, "step": 50310 }, { "epoch": 9.47, "grad_norm": 19.224685668945312, "learning_rate": 1.0577827969132319e-06, "loss": 0.4743, "step": 50320 }, { "epoch": 9.47, "grad_norm": 1.2370526790618896, "learning_rate": 1.0540184453227931e-06, "loss": 0.2631, "step": 50330 }, { "epoch": 9.47, "grad_norm": 2.0130465030670166, "learning_rate": 1.0502540937323546e-06, "loss": 0.6753, "step": 50340 }, { "epoch": 9.48, "grad_norm": 6.638840198516846, "learning_rate": 1.046489742141916e-06, "loss": 0.4586, "step": 50350 }, { "epoch": 9.48, "grad_norm": 28.855546951293945, "learning_rate": 1.0427253905514775e-06, "loss": 0.452, "step": 50360 }, { "epoch": 9.48, "grad_norm": 11.827780723571777, "learning_rate": 1.0389610389610392e-06, "loss": 0.4487, "step": 50370 }, { "epoch": 9.48, "grad_norm": 2.8460874557495117, "learning_rate": 1.0351966873706006e-06, "loss": 0.5687, "step": 50380 }, { "epoch": 9.48, "grad_norm": 1.1678156852722168, "learning_rate": 1.031432335780162e-06, "loss": 0.5319, "step": 50390 }, { "epoch": 9.49, "grad_norm": 12.879281044006348, "learning_rate": 1.0276679841897233e-06, "loss": 0.6915, "step": 50400 }, { "epoch": 9.49, "grad_norm": 22.82890510559082, "learning_rate": 1.0239036325992848e-06, "loss": 0.7823, "step": 50410 }, { "epoch": 9.49, "grad_norm": 11.886734008789062, "learning_rate": 1.0201392810088463e-06, "loss": 0.5204, "step": 50420 }, { "epoch": 9.49, "grad_norm": 7.491224765777588, "learning_rate": 1.0163749294184077e-06, "loss": 0.7198, "step": 50430 }, { "epoch": 9.49, "grad_norm": 5.192986011505127, "learning_rate": 1.0126105778279692e-06, "loss": 0.851, "step": 50440 }, { "epoch": 9.5, "grad_norm": 7.826737880706787, "learning_rate": 1.0088462262375306e-06, "loss": 0.6206, "step": 50450 }, { "epoch": 9.5, "grad_norm": 3.9375627040863037, "learning_rate": 1.0050818746470921e-06, "loss": 0.6543, "step": 50460 }, { "epoch": 9.5, "grad_norm": 9.7042875289917, "learning_rate": 1.0013175230566536e-06, "loss": 1.0389, "step": 50470 }, { "epoch": 9.5, "grad_norm": 17.00861167907715, "learning_rate": 9.97553171466215e-07, "loss": 0.5467, "step": 50480 }, { "epoch": 9.5, "grad_norm": 3.9601008892059326, "learning_rate": 9.937888198757765e-07, "loss": 0.6016, "step": 50490 }, { "epoch": 9.5, "grad_norm": 2.3020453453063965, "learning_rate": 9.90024468285338e-07, "loss": 0.6172, "step": 50500 }, { "epoch": 9.51, "grad_norm": 21.45136260986328, "learning_rate": 9.862601166948994e-07, "loss": 0.658, "step": 50510 }, { "epoch": 9.51, "grad_norm": 4.374157905578613, "learning_rate": 9.824957651044609e-07, "loss": 0.5469, "step": 50520 }, { "epoch": 9.51, "grad_norm": 1.7671420574188232, "learning_rate": 9.787314135140223e-07, "loss": 0.4868, "step": 50530 }, { "epoch": 9.51, "grad_norm": 5.874146938323975, "learning_rate": 9.749670619235838e-07, "loss": 0.6663, "step": 50540 }, { "epoch": 9.51, "grad_norm": 10.903212547302246, "learning_rate": 9.712027103331453e-07, "loss": 0.6018, "step": 50550 }, { "epoch": 9.52, "grad_norm": 1.831722378730774, "learning_rate": 9.674383587427067e-07, "loss": 0.5616, "step": 50560 }, { "epoch": 9.52, "grad_norm": 7.64743185043335, "learning_rate": 9.636740071522682e-07, "loss": 0.6374, "step": 50570 }, { "epoch": 9.52, "grad_norm": 10.178704261779785, "learning_rate": 9.599096555618294e-07, 
"loss": 0.8358, "step": 50580 }, { "epoch": 9.52, "grad_norm": 15.528922080993652, "learning_rate": 9.561453039713909e-07, "loss": 0.7869, "step": 50590 }, { "epoch": 9.52, "grad_norm": 17.538108825683594, "learning_rate": 9.523809523809525e-07, "loss": 0.5824, "step": 50600 }, { "epoch": 9.53, "grad_norm": 9.513688087463379, "learning_rate": 9.486166007905138e-07, "loss": 0.6195, "step": 50610 }, { "epoch": 9.53, "grad_norm": 0.6273812651634216, "learning_rate": 9.448522492000754e-07, "loss": 0.5377, "step": 50620 }, { "epoch": 9.53, "grad_norm": 30.33049774169922, "learning_rate": 9.410878976096368e-07, "loss": 0.7211, "step": 50630 }, { "epoch": 9.53, "grad_norm": 16.3236083984375, "learning_rate": 9.373235460191983e-07, "loss": 0.554, "step": 50640 }, { "epoch": 9.53, "grad_norm": 31.731229782104492, "learning_rate": 9.335591944287598e-07, "loss": 0.5433, "step": 50650 }, { "epoch": 9.54, "grad_norm": 3.7947933673858643, "learning_rate": 9.297948428383212e-07, "loss": 0.5317, "step": 50660 }, { "epoch": 9.54, "grad_norm": 4.978206157684326, "learning_rate": 9.260304912478827e-07, "loss": 0.5423, "step": 50670 }, { "epoch": 9.54, "grad_norm": 25.835474014282227, "learning_rate": 9.22266139657444e-07, "loss": 1.0302, "step": 50680 }, { "epoch": 9.54, "grad_norm": 9.1592435836792, "learning_rate": 9.185017880670055e-07, "loss": 0.726, "step": 50690 }, { "epoch": 9.54, "grad_norm": 1.835302710533142, "learning_rate": 9.14737436476567e-07, "loss": 0.5563, "step": 50700 }, { "epoch": 9.54, "grad_norm": 3.317290782928467, "learning_rate": 9.109730848861284e-07, "loss": 0.2991, "step": 50710 }, { "epoch": 9.55, "grad_norm": 63.06484603881836, "learning_rate": 9.072087332956899e-07, "loss": 0.6732, "step": 50720 }, { "epoch": 9.55, "grad_norm": 9.328119277954102, "learning_rate": 9.034443817052512e-07, "loss": 0.5525, "step": 50730 }, { "epoch": 9.55, "grad_norm": 0.2526796758174896, "learning_rate": 8.996800301148129e-07, "loss": 0.7044, "step": 50740 }, { "epoch": 9.55, "grad_norm": 2.888564348220825, "learning_rate": 8.959156785243743e-07, "loss": 0.7198, "step": 50750 }, { "epoch": 9.55, "grad_norm": 10.724895477294922, "learning_rate": 8.921513269339357e-07, "loss": 0.6142, "step": 50760 }, { "epoch": 9.56, "grad_norm": 5.929544448852539, "learning_rate": 8.883869753434972e-07, "loss": 0.5525, "step": 50770 }, { "epoch": 9.56, "grad_norm": 19.98274803161621, "learning_rate": 8.846226237530586e-07, "loss": 0.6745, "step": 50780 }, { "epoch": 9.56, "grad_norm": 18.69937515258789, "learning_rate": 8.808582721626201e-07, "loss": 0.7878, "step": 50790 }, { "epoch": 9.56, "grad_norm": 2.0860795974731445, "learning_rate": 8.770939205721816e-07, "loss": 0.5067, "step": 50800 }, { "epoch": 9.56, "grad_norm": 35.415252685546875, "learning_rate": 8.733295689817429e-07, "loss": 0.6524, "step": 50810 }, { "epoch": 9.57, "grad_norm": 30.638639450073242, "learning_rate": 8.695652173913044e-07, "loss": 0.5543, "step": 50820 }, { "epoch": 9.57, "grad_norm": 13.439196586608887, "learning_rate": 8.658008658008658e-07, "loss": 0.8766, "step": 50830 }, { "epoch": 9.57, "grad_norm": 20.156808853149414, "learning_rate": 8.620365142104273e-07, "loss": 0.5709, "step": 50840 }, { "epoch": 9.57, "grad_norm": 9.987468719482422, "learning_rate": 8.582721626199888e-07, "loss": 0.5995, "step": 50850 }, { "epoch": 9.57, "grad_norm": 4.771295547485352, "learning_rate": 8.545078110295501e-07, "loss": 0.6246, "step": 50860 }, { "epoch": 9.57, "grad_norm": 4.836380958557129, "learning_rate": 8.507434594391118e-07, "loss": 
0.3697, "step": 50870 }, { "epoch": 9.58, "grad_norm": 53.96941375732422, "learning_rate": 8.469791078486731e-07, "loss": 0.6839, "step": 50880 }, { "epoch": 9.58, "grad_norm": 7.25943660736084, "learning_rate": 8.432147562582346e-07, "loss": 0.5902, "step": 50890 }, { "epoch": 9.58, "grad_norm": 7.461061000823975, "learning_rate": 8.394504046677961e-07, "loss": 0.3668, "step": 50900 }, { "epoch": 9.58, "grad_norm": 19.484573364257812, "learning_rate": 8.356860530773575e-07, "loss": 0.5896, "step": 50910 }, { "epoch": 9.58, "grad_norm": 3.453340530395508, "learning_rate": 8.31921701486919e-07, "loss": 0.5979, "step": 50920 }, { "epoch": 9.59, "grad_norm": 20.89385223388672, "learning_rate": 8.281573498964803e-07, "loss": 0.4996, "step": 50930 }, { "epoch": 9.59, "grad_norm": 9.047896385192871, "learning_rate": 8.243929983060418e-07, "loss": 0.3846, "step": 50940 }, { "epoch": 9.59, "grad_norm": 4.1045637130737305, "learning_rate": 8.206286467156033e-07, "loss": 0.7805, "step": 50950 }, { "epoch": 9.59, "grad_norm": 21.748098373413086, "learning_rate": 8.168642951251647e-07, "loss": 0.9516, "step": 50960 }, { "epoch": 9.59, "grad_norm": 1.549134373664856, "learning_rate": 8.130999435347262e-07, "loss": 0.6452, "step": 50970 }, { "epoch": 9.6, "grad_norm": 19.51275634765625, "learning_rate": 8.093355919442876e-07, "loss": 0.5384, "step": 50980 }, { "epoch": 9.6, "grad_norm": 19.296972274780273, "learning_rate": 8.055712403538492e-07, "loss": 0.6985, "step": 50990 }, { "epoch": 9.6, "grad_norm": 4.228772163391113, "learning_rate": 8.018068887634107e-07, "loss": 0.3677, "step": 51000 }, { "epoch": 9.6, "grad_norm": 17.887115478515625, "learning_rate": 7.98042537172972e-07, "loss": 0.5894, "step": 51010 }, { "epoch": 9.6, "grad_norm": 11.481246948242188, "learning_rate": 7.942781855825335e-07, "loss": 0.8832, "step": 51020 }, { "epoch": 9.6, "grad_norm": 83.25935363769531, "learning_rate": 7.905138339920949e-07, "loss": 0.8708, "step": 51030 }, { "epoch": 9.61, "grad_norm": 8.844846725463867, "learning_rate": 7.867494824016564e-07, "loss": 0.4787, "step": 51040 }, { "epoch": 9.61, "grad_norm": 7.551287651062012, "learning_rate": 7.829851308112179e-07, "loss": 0.6241, "step": 51050 }, { "epoch": 9.61, "grad_norm": 8.449108123779297, "learning_rate": 7.792207792207792e-07, "loss": 0.3055, "step": 51060 }, { "epoch": 9.61, "grad_norm": 17.781858444213867, "learning_rate": 7.754564276303407e-07, "loss": 0.6023, "step": 51070 }, { "epoch": 9.61, "grad_norm": 2.0274620056152344, "learning_rate": 7.716920760399021e-07, "loss": 0.5186, "step": 51080 }, { "epoch": 9.62, "grad_norm": 1.4594037532806396, "learning_rate": 7.679277244494636e-07, "loss": 0.6858, "step": 51090 }, { "epoch": 9.62, "grad_norm": 4.4524617195129395, "learning_rate": 7.641633728590251e-07, "loss": 0.599, "step": 51100 }, { "epoch": 9.62, "grad_norm": 2.6778459548950195, "learning_rate": 7.603990212685866e-07, "loss": 0.4101, "step": 51110 }, { "epoch": 9.62, "grad_norm": 12.10312271118164, "learning_rate": 7.566346696781481e-07, "loss": 0.582, "step": 51120 }, { "epoch": 9.62, "grad_norm": 6.429940700531006, "learning_rate": 7.528703180877094e-07, "loss": 0.6342, "step": 51130 }, { "epoch": 9.63, "grad_norm": 9.68521785736084, "learning_rate": 7.491059664972709e-07, "loss": 0.6853, "step": 51140 }, { "epoch": 9.63, "grad_norm": 23.555265426635742, "learning_rate": 7.453416149068324e-07, "loss": 0.8379, "step": 51150 }, { "epoch": 9.63, "grad_norm": 10.316865921020508, "learning_rate": 7.415772633163938e-07, "loss": 0.4812, "step": 
51160 }, { "epoch": 9.63, "grad_norm": 13.322529792785645, "learning_rate": 7.378129117259553e-07, "loss": 0.7392, "step": 51170 }, { "epoch": 9.63, "grad_norm": 4.720820903778076, "learning_rate": 7.340485601355168e-07, "loss": 0.6491, "step": 51180 }, { "epoch": 9.63, "grad_norm": 27.2785587310791, "learning_rate": 7.302842085450781e-07, "loss": 0.5141, "step": 51190 }, { "epoch": 9.64, "grad_norm": 9.413230895996094, "learning_rate": 7.265198569546396e-07, "loss": 0.7144, "step": 51200 }, { "epoch": 9.64, "grad_norm": 14.331764221191406, "learning_rate": 7.22755505364201e-07, "loss": 0.7457, "step": 51210 }, { "epoch": 9.64, "grad_norm": 5.420490741729736, "learning_rate": 7.189911537737625e-07, "loss": 0.7607, "step": 51220 }, { "epoch": 9.64, "grad_norm": 13.919527053833008, "learning_rate": 7.152268021833241e-07, "loss": 0.9176, "step": 51230 }, { "epoch": 9.64, "grad_norm": 11.064170837402344, "learning_rate": 7.114624505928855e-07, "loss": 0.486, "step": 51240 }, { "epoch": 9.65, "grad_norm": 12.979689598083496, "learning_rate": 7.07698099002447e-07, "loss": 0.6936, "step": 51250 }, { "epoch": 9.65, "grad_norm": 1.0243254899978638, "learning_rate": 7.039337474120083e-07, "loss": 0.8458, "step": 51260 }, { "epoch": 9.65, "grad_norm": 2.5675830841064453, "learning_rate": 7.001693958215698e-07, "loss": 0.5708, "step": 51270 }, { "epoch": 9.65, "grad_norm": 13.160748481750488, "learning_rate": 6.964050442311313e-07, "loss": 0.6766, "step": 51280 }, { "epoch": 9.65, "grad_norm": 5.163463115692139, "learning_rate": 6.926406926406927e-07, "loss": 0.5097, "step": 51290 }, { "epoch": 9.66, "grad_norm": 20.507312774658203, "learning_rate": 6.888763410502542e-07, "loss": 0.7875, "step": 51300 }, { "epoch": 9.66, "grad_norm": 13.377191543579102, "learning_rate": 6.851119894598155e-07, "loss": 0.575, "step": 51310 }, { "epoch": 9.66, "grad_norm": 10.720812797546387, "learning_rate": 6.81347637869377e-07, "loss": 0.7355, "step": 51320 }, { "epoch": 9.66, "grad_norm": 20.88014030456543, "learning_rate": 6.775832862789384e-07, "loss": 0.438, "step": 51330 }, { "epoch": 9.66, "grad_norm": 11.077818870544434, "learning_rate": 6.738189346884999e-07, "loss": 0.4916, "step": 51340 }, { "epoch": 9.66, "grad_norm": 1.9108494520187378, "learning_rate": 6.700545830980614e-07, "loss": 0.7721, "step": 51350 }, { "epoch": 9.67, "grad_norm": 14.092747688293457, "learning_rate": 6.662902315076229e-07, "loss": 0.7167, "step": 51360 }, { "epoch": 9.67, "grad_norm": 11.634759902954102, "learning_rate": 6.625258799171844e-07, "loss": 0.4792, "step": 51370 }, { "epoch": 9.67, "grad_norm": 17.04500389099121, "learning_rate": 6.587615283267459e-07, "loss": 0.6694, "step": 51380 }, { "epoch": 9.67, "grad_norm": 10.782588958740234, "learning_rate": 6.549971767363072e-07, "loss": 0.5467, "step": 51390 }, { "epoch": 9.67, "grad_norm": 15.915536880493164, "learning_rate": 6.512328251458687e-07, "loss": 0.7236, "step": 51400 }, { "epoch": 9.68, "grad_norm": 8.385047912597656, "learning_rate": 6.474684735554301e-07, "loss": 0.5677, "step": 51410 }, { "epoch": 9.68, "grad_norm": 6.425243854522705, "learning_rate": 6.437041219649916e-07, "loss": 0.6197, "step": 51420 }, { "epoch": 9.68, "grad_norm": 3.9518914222717285, "learning_rate": 6.399397703745531e-07, "loss": 0.4698, "step": 51430 }, { "epoch": 9.68, "grad_norm": 23.47572898864746, "learning_rate": 6.361754187841144e-07, "loss": 0.7296, "step": 51440 }, { "epoch": 9.68, "grad_norm": 6.047122478485107, "learning_rate": 6.324110671936759e-07, "loss": 0.407, "step": 51450 
}, { "epoch": 9.69, "grad_norm": 11.228981971740723, "learning_rate": 6.286467156032373e-07, "loss": 0.6502, "step": 51460 }, { "epoch": 9.69, "grad_norm": 6.409139633178711, "learning_rate": 6.248823640127989e-07, "loss": 0.7182, "step": 51470 }, { "epoch": 9.69, "grad_norm": 0.5493441820144653, "learning_rate": 6.211180124223603e-07, "loss": 0.6939, "step": 51480 }, { "epoch": 9.69, "grad_norm": 21.116708755493164, "learning_rate": 6.173536608319217e-07, "loss": 0.6814, "step": 51490 }, { "epoch": 9.69, "grad_norm": 5.822625637054443, "learning_rate": 6.135893092414832e-07, "loss": 0.6301, "step": 51500 }, { "epoch": 9.7, "grad_norm": 14.22628116607666, "learning_rate": 6.098249576510446e-07, "loss": 0.504, "step": 51510 }, { "epoch": 9.7, "grad_norm": 0.9555671811103821, "learning_rate": 6.060606060606061e-07, "loss": 0.7721, "step": 51520 }, { "epoch": 9.7, "grad_norm": 4.8472371101379395, "learning_rate": 6.022962544701676e-07, "loss": 0.8527, "step": 51530 }, { "epoch": 9.7, "grad_norm": 1.605295181274414, "learning_rate": 5.98531902879729e-07, "loss": 0.3902, "step": 51540 }, { "epoch": 9.7, "grad_norm": 7.616785526275635, "learning_rate": 5.947675512892905e-07, "loss": 0.6099, "step": 51550 }, { "epoch": 9.7, "grad_norm": 7.735783100128174, "learning_rate": 5.910031996988519e-07, "loss": 0.489, "step": 51560 }, { "epoch": 9.71, "grad_norm": 5.869561672210693, "learning_rate": 5.872388481084134e-07, "loss": 0.7795, "step": 51570 }, { "epoch": 9.71, "grad_norm": 32.06550979614258, "learning_rate": 5.834744965179749e-07, "loss": 0.8022, "step": 51580 }, { "epoch": 9.71, "grad_norm": 11.570901870727539, "learning_rate": 5.797101449275363e-07, "loss": 0.6053, "step": 51590 }, { "epoch": 9.71, "grad_norm": 6.074151039123535, "learning_rate": 5.759457933370977e-07, "loss": 0.8133, "step": 51600 }, { "epoch": 9.71, "grad_norm": 13.513484954833984, "learning_rate": 5.721814417466591e-07, "loss": 0.5188, "step": 51610 }, { "epoch": 9.72, "grad_norm": 9.265263557434082, "learning_rate": 5.684170901562206e-07, "loss": 0.3382, "step": 51620 }, { "epoch": 9.72, "grad_norm": 6.627038478851318, "learning_rate": 5.646527385657822e-07, "loss": 0.6288, "step": 51630 }, { "epoch": 9.72, "grad_norm": 26.184432983398438, "learning_rate": 5.608883869753435e-07, "loss": 0.4671, "step": 51640 }, { "epoch": 9.72, "grad_norm": 5.137016773223877, "learning_rate": 5.57124035384905e-07, "loss": 0.5872, "step": 51650 }, { "epoch": 9.72, "grad_norm": 2.2832343578338623, "learning_rate": 5.533596837944664e-07, "loss": 0.5235, "step": 51660 }, { "epoch": 9.73, "grad_norm": 4.629515647888184, "learning_rate": 5.495953322040279e-07, "loss": 0.5303, "step": 51670 }, { "epoch": 9.73, "grad_norm": 1.6002618074417114, "learning_rate": 5.458309806135894e-07, "loss": 0.7297, "step": 51680 }, { "epoch": 9.73, "grad_norm": 5.028347015380859, "learning_rate": 5.420666290231508e-07, "loss": 0.7791, "step": 51690 }, { "epoch": 9.73, "grad_norm": 6.394321441650391, "learning_rate": 5.383022774327123e-07, "loss": 0.4827, "step": 51700 }, { "epoch": 9.73, "grad_norm": 12.599891662597656, "learning_rate": 5.345379258422737e-07, "loss": 0.5966, "step": 51710 }, { "epoch": 9.73, "grad_norm": 4.757793426513672, "learning_rate": 5.307735742518352e-07, "loss": 0.5777, "step": 51720 }, { "epoch": 9.74, "grad_norm": 30.54682731628418, "learning_rate": 5.270092226613966e-07, "loss": 0.5297, "step": 51730 }, { "epoch": 9.74, "grad_norm": 21.74601173400879, "learning_rate": 5.23244871070958e-07, "loss": 0.7536, "step": 51740 }, { "epoch": 
9.74, "grad_norm": 1.1612796783447266, "learning_rate": 5.194805194805196e-07, "loss": 0.3204, "step": 51750 }, { "epoch": 9.74, "grad_norm": 23.62040901184082, "learning_rate": 5.15716167890081e-07, "loss": 0.643, "step": 51760 }, { "epoch": 9.74, "grad_norm": 16.90983009338379, "learning_rate": 5.119518162996424e-07, "loss": 0.6277, "step": 51770 }, { "epoch": 9.75, "grad_norm": 8.718644142150879, "learning_rate": 5.081874647092039e-07, "loss": 0.5494, "step": 51780 }, { "epoch": 9.75, "grad_norm": 0.733951985836029, "learning_rate": 5.044231131187653e-07, "loss": 0.408, "step": 51790 }, { "epoch": 9.75, "grad_norm": 2.202390670776367, "learning_rate": 5.006587615283268e-07, "loss": 0.281, "step": 51800 }, { "epoch": 9.75, "grad_norm": 12.960224151611328, "learning_rate": 4.968944099378882e-07, "loss": 0.6881, "step": 51810 }, { "epoch": 9.75, "grad_norm": 57.7561149597168, "learning_rate": 4.931300583474497e-07, "loss": 0.4864, "step": 51820 }, { "epoch": 9.76, "grad_norm": 4.927740097045898, "learning_rate": 4.893657067570112e-07, "loss": 0.5122, "step": 51830 }, { "epoch": 9.76, "grad_norm": 12.618846893310547, "learning_rate": 4.856013551665726e-07, "loss": 0.7205, "step": 51840 }, { "epoch": 9.76, "grad_norm": 14.155098915100098, "learning_rate": 4.818370035761341e-07, "loss": 0.5097, "step": 51850 }, { "epoch": 9.76, "grad_norm": 18.120792388916016, "learning_rate": 4.780726519856954e-07, "loss": 0.6503, "step": 51860 }, { "epoch": 9.76, "grad_norm": 13.224746704101562, "learning_rate": 4.743083003952569e-07, "loss": 0.4125, "step": 51870 }, { "epoch": 9.76, "grad_norm": 5.5424113273620605, "learning_rate": 4.705439488048184e-07, "loss": 0.6142, "step": 51880 }, { "epoch": 9.77, "grad_norm": 50.84480285644531, "learning_rate": 4.667795972143799e-07, "loss": 0.7898, "step": 51890 }, { "epoch": 9.77, "grad_norm": 2.5985782146453857, "learning_rate": 4.6301524562394134e-07, "loss": 0.4004, "step": 51900 }, { "epoch": 9.77, "grad_norm": 1.943764328956604, "learning_rate": 4.5925089403350275e-07, "loss": 0.4649, "step": 51910 }, { "epoch": 9.77, "grad_norm": 5.038739204406738, "learning_rate": 4.554865424430642e-07, "loss": 0.3788, "step": 51920 }, { "epoch": 9.77, "grad_norm": 3.727426290512085, "learning_rate": 4.517221908526256e-07, "loss": 0.6511, "step": 51930 }, { "epoch": 9.78, "grad_norm": 0.8934656381607056, "learning_rate": 4.4795783926218713e-07, "loss": 0.6402, "step": 51940 }, { "epoch": 9.78, "grad_norm": 1.4348187446594238, "learning_rate": 4.441934876717486e-07, "loss": 0.4634, "step": 51950 }, { "epoch": 9.78, "grad_norm": 11.006245613098145, "learning_rate": 4.4042913608131005e-07, "loss": 0.7996, "step": 51960 }, { "epoch": 9.78, "grad_norm": 16.705305099487305, "learning_rate": 4.3666478449087146e-07, "loss": 0.6679, "step": 51970 }, { "epoch": 9.78, "grad_norm": 5.821102619171143, "learning_rate": 4.329004329004329e-07, "loss": 0.4645, "step": 51980 }, { "epoch": 9.79, "grad_norm": 25.86487579345703, "learning_rate": 4.291360813099944e-07, "loss": 0.6766, "step": 51990 }, { "epoch": 9.79, "grad_norm": 7.9889678955078125, "learning_rate": 4.253717297195559e-07, "loss": 0.9122, "step": 52000 }, { "epoch": 9.79, "grad_norm": 22.770687103271484, "learning_rate": 4.216073781291173e-07, "loss": 0.6444, "step": 52010 }, { "epoch": 9.79, "grad_norm": 15.014555931091309, "learning_rate": 4.1784302653867876e-07, "loss": 0.6525, "step": 52020 }, { "epoch": 9.79, "grad_norm": 15.484350204467773, "learning_rate": 4.1407867494824017e-07, "loss": 0.8954, "step": 52030 }, { 
"epoch": 9.79, "grad_norm": 15.990805625915527, "learning_rate": 4.1031432335780163e-07, "loss": 1.0466, "step": 52040 }, { "epoch": 9.8, "grad_norm": 10.061239242553711, "learning_rate": 4.065499717673631e-07, "loss": 0.9088, "step": 52050 }, { "epoch": 9.8, "grad_norm": 14.985965728759766, "learning_rate": 4.027856201769246e-07, "loss": 0.6081, "step": 52060 }, { "epoch": 9.8, "grad_norm": 8.062920570373535, "learning_rate": 3.99021268586486e-07, "loss": 0.9532, "step": 52070 }, { "epoch": 9.8, "grad_norm": 6.515170574188232, "learning_rate": 3.9525691699604747e-07, "loss": 0.6892, "step": 52080 }, { "epoch": 9.8, "grad_norm": 3.16182017326355, "learning_rate": 3.9149256540560893e-07, "loss": 0.6659, "step": 52090 }, { "epoch": 9.81, "grad_norm": 6.038811683654785, "learning_rate": 3.8772821381517034e-07, "loss": 0.5384, "step": 52100 }, { "epoch": 9.81, "grad_norm": 9.420355796813965, "learning_rate": 3.839638622247318e-07, "loss": 0.712, "step": 52110 }, { "epoch": 9.81, "grad_norm": 16.933589935302734, "learning_rate": 3.801995106342933e-07, "loss": 0.5624, "step": 52120 }, { "epoch": 9.81, "grad_norm": 25.0205020904541, "learning_rate": 3.764351590438547e-07, "loss": 0.8724, "step": 52130 }, { "epoch": 9.81, "grad_norm": 1.1323323249816895, "learning_rate": 3.726708074534162e-07, "loss": 0.7676, "step": 52140 }, { "epoch": 9.82, "grad_norm": 3.2806289196014404, "learning_rate": 3.6890645586297765e-07, "loss": 0.5679, "step": 52150 }, { "epoch": 9.82, "grad_norm": 13.735160827636719, "learning_rate": 3.6514210427253905e-07, "loss": 0.6586, "step": 52160 }, { "epoch": 9.82, "grad_norm": 38.57212829589844, "learning_rate": 3.613777526821005e-07, "loss": 0.8254, "step": 52170 }, { "epoch": 9.82, "grad_norm": 5.336735725402832, "learning_rate": 3.5761340109166203e-07, "loss": 0.6502, "step": 52180 }, { "epoch": 9.82, "grad_norm": 2.667588233947754, "learning_rate": 3.538490495012235e-07, "loss": 0.5852, "step": 52190 }, { "epoch": 9.82, "grad_norm": 2.7714147567749023, "learning_rate": 3.500846979107849e-07, "loss": 0.3604, "step": 52200 }, { "epoch": 9.83, "grad_norm": 5.518807888031006, "learning_rate": 3.4632034632034636e-07, "loss": 0.5844, "step": 52210 }, { "epoch": 9.83, "grad_norm": 20.48015785217285, "learning_rate": 3.4255599472990776e-07, "loss": 0.5256, "step": 52220 }, { "epoch": 9.83, "grad_norm": 8.182001113891602, "learning_rate": 3.387916431394692e-07, "loss": 0.6589, "step": 52230 }, { "epoch": 9.83, "grad_norm": 30.78422737121582, "learning_rate": 3.350272915490307e-07, "loss": 0.7304, "step": 52240 }, { "epoch": 9.83, "grad_norm": 0.41738197207450867, "learning_rate": 3.312629399585922e-07, "loss": 0.7316, "step": 52250 }, { "epoch": 9.84, "grad_norm": 14.3366060256958, "learning_rate": 3.274985883681536e-07, "loss": 0.5822, "step": 52260 }, { "epoch": 9.84, "grad_norm": 40.16769790649414, "learning_rate": 3.2373423677771507e-07, "loss": 0.7786, "step": 52270 }, { "epoch": 9.84, "grad_norm": 2.436497211456299, "learning_rate": 3.1996988518727653e-07, "loss": 0.4918, "step": 52280 }, { "epoch": 9.84, "grad_norm": 4.028090000152588, "learning_rate": 3.1620553359683794e-07, "loss": 0.4449, "step": 52290 }, { "epoch": 9.84, "grad_norm": 0.5917661786079407, "learning_rate": 3.1244118200639945e-07, "loss": 0.3922, "step": 52300 }, { "epoch": 9.85, "grad_norm": 7.457550048828125, "learning_rate": 3.0867683041596086e-07, "loss": 0.7478, "step": 52310 }, { "epoch": 9.85, "grad_norm": 22.653587341308594, "learning_rate": 3.049124788255223e-07, "loss": 0.8436, "step": 52320 }, { 
"epoch": 9.85, "grad_norm": 2.306321144104004, "learning_rate": 3.011481272350838e-07, "loss": 0.3168, "step": 52330 }, { "epoch": 9.85, "grad_norm": 9.707208633422852, "learning_rate": 2.9738377564464524e-07, "loss": 0.7441, "step": 52340 }, { "epoch": 9.85, "grad_norm": 4.472634315490723, "learning_rate": 2.936194240542067e-07, "loss": 0.5522, "step": 52350 }, { "epoch": 9.86, "grad_norm": 0.7269446849822998, "learning_rate": 2.8985507246376816e-07, "loss": 0.3579, "step": 52360 }, { "epoch": 9.86, "grad_norm": 16.50381088256836, "learning_rate": 2.8609072087332957e-07, "loss": 0.5451, "step": 52370 }, { "epoch": 9.86, "grad_norm": 8.023009300231934, "learning_rate": 2.823263692828911e-07, "loss": 0.5021, "step": 52380 }, { "epoch": 9.86, "grad_norm": 9.023140907287598, "learning_rate": 2.785620176924525e-07, "loss": 0.4708, "step": 52390 }, { "epoch": 9.86, "grad_norm": 12.816524505615234, "learning_rate": 2.7479766610201395e-07, "loss": 0.3499, "step": 52400 }, { "epoch": 9.86, "grad_norm": 11.94821834564209, "learning_rate": 2.710333145115754e-07, "loss": 0.992, "step": 52410 }, { "epoch": 9.87, "grad_norm": 39.60972213745117, "learning_rate": 2.6726896292113687e-07, "loss": 0.6137, "step": 52420 }, { "epoch": 9.87, "grad_norm": 24.9156494140625, "learning_rate": 2.635046113306983e-07, "loss": 0.3473, "step": 52430 }, { "epoch": 9.87, "grad_norm": 49.8020133972168, "learning_rate": 2.597402597402598e-07, "loss": 0.4267, "step": 52440 }, { "epoch": 9.87, "grad_norm": 4.023902893066406, "learning_rate": 2.559759081498212e-07, "loss": 0.9566, "step": 52450 }, { "epoch": 9.87, "grad_norm": 2.0986695289611816, "learning_rate": 2.5221155655938266e-07, "loss": 0.5851, "step": 52460 }, { "epoch": 9.88, "grad_norm": 0.5418403148651123, "learning_rate": 2.484472049689441e-07, "loss": 0.279, "step": 52470 }, { "epoch": 9.88, "grad_norm": 4.7587103843688965, "learning_rate": 2.446828533785056e-07, "loss": 0.6662, "step": 52480 }, { "epoch": 9.88, "grad_norm": 2.692694664001465, "learning_rate": 2.4091850178806704e-07, "loss": 0.2345, "step": 52490 }, { "epoch": 9.88, "grad_norm": 14.710683822631836, "learning_rate": 2.3715415019762845e-07, "loss": 0.3466, "step": 52500 }, { "epoch": 9.88, "grad_norm": 30.450748443603516, "learning_rate": 2.3338979860718994e-07, "loss": 0.568, "step": 52510 }, { "epoch": 9.89, "grad_norm": 13.52367877960205, "learning_rate": 2.2962544701675137e-07, "loss": 1.2291, "step": 52520 }, { "epoch": 9.89, "grad_norm": 11.53637409210205, "learning_rate": 2.258610954263128e-07, "loss": 0.7912, "step": 52530 }, { "epoch": 9.89, "grad_norm": 5.5346174240112305, "learning_rate": 2.220967438358743e-07, "loss": 0.731, "step": 52540 }, { "epoch": 9.89, "grad_norm": 23.65789794921875, "learning_rate": 2.1833239224543573e-07, "loss": 0.7382, "step": 52550 }, { "epoch": 9.89, "grad_norm": 3.014709949493408, "learning_rate": 2.145680406549972e-07, "loss": 0.2181, "step": 52560 }, { "epoch": 9.89, "grad_norm": 4.752124309539795, "learning_rate": 2.1080368906455865e-07, "loss": 0.8964, "step": 52570 }, { "epoch": 9.9, "grad_norm": 7.932596206665039, "learning_rate": 2.0703933747412008e-07, "loss": 0.7741, "step": 52580 }, { "epoch": 9.9, "grad_norm": 12.318062782287598, "learning_rate": 2.0327498588368155e-07, "loss": 0.5168, "step": 52590 }, { "epoch": 9.9, "grad_norm": 34.704307556152344, "learning_rate": 1.99510634293243e-07, "loss": 0.3152, "step": 52600 }, { "epoch": 9.9, "grad_norm": 2.4121196269989014, "learning_rate": 1.9574628270280447e-07, "loss": 0.4516, "step": 52610 }, { 
"epoch": 9.9, "grad_norm": 15.037623405456543, "learning_rate": 1.919819311123659e-07, "loss": 1.0576, "step": 52620 }, { "epoch": 9.91, "grad_norm": 14.277022361755371, "learning_rate": 1.8821757952192736e-07, "loss": 0.6509, "step": 52630 }, { "epoch": 9.91, "grad_norm": 11.150029182434082, "learning_rate": 1.8445322793148882e-07, "loss": 0.401, "step": 52640 }, { "epoch": 9.91, "grad_norm": 10.912738800048828, "learning_rate": 1.8068887634105026e-07, "loss": 1.232, "step": 52650 }, { "epoch": 9.91, "grad_norm": 20.20965576171875, "learning_rate": 1.7692452475061174e-07, "loss": 0.5187, "step": 52660 }, { "epoch": 9.91, "grad_norm": 3.261793851852417, "learning_rate": 1.7316017316017318e-07, "loss": 0.6039, "step": 52670 }, { "epoch": 9.92, "grad_norm": 3.90350079536438, "learning_rate": 1.693958215697346e-07, "loss": 0.4078, "step": 52680 }, { "epoch": 9.92, "grad_norm": 16.29595947265625, "learning_rate": 1.656314699792961e-07, "loss": 0.7171, "step": 52690 }, { "epoch": 9.92, "grad_norm": 5.020286560058594, "learning_rate": 1.6186711838885753e-07, "loss": 0.4468, "step": 52700 }, { "epoch": 9.92, "grad_norm": 7.8975830078125, "learning_rate": 1.5810276679841897e-07, "loss": 0.5226, "step": 52710 }, { "epoch": 9.92, "grad_norm": 1.327446699142456, "learning_rate": 1.5433841520798043e-07, "loss": 0.3128, "step": 52720 }, { "epoch": 9.92, "grad_norm": 0.5548346042633057, "learning_rate": 1.505740636175419e-07, "loss": 0.8055, "step": 52730 }, { "epoch": 9.93, "grad_norm": 10.072444915771484, "learning_rate": 1.4680971202710335e-07, "loss": 0.5244, "step": 52740 }, { "epoch": 9.93, "grad_norm": 14.165583610534668, "learning_rate": 1.4304536043666478e-07, "loss": 0.4185, "step": 52750 }, { "epoch": 9.93, "grad_norm": 1.485308051109314, "learning_rate": 1.3928100884622625e-07, "loss": 0.3717, "step": 52760 }, { "epoch": 9.93, "grad_norm": 7.70651388168335, "learning_rate": 1.355166572557877e-07, "loss": 0.5782, "step": 52770 }, { "epoch": 9.93, "grad_norm": 4.456355094909668, "learning_rate": 1.3175230566534914e-07, "loss": 0.5974, "step": 52780 }, { "epoch": 9.94, "grad_norm": 24.572412490844727, "learning_rate": 1.279879540749106e-07, "loss": 0.4243, "step": 52790 }, { "epoch": 9.94, "grad_norm": 1.7075419425964355, "learning_rate": 1.2422360248447206e-07, "loss": 0.6152, "step": 52800 }, { "epoch": 9.94, "grad_norm": 0.5838226675987244, "learning_rate": 1.2045925089403352e-07, "loss": 0.3608, "step": 52810 }, { "epoch": 9.94, "grad_norm": 25.698505401611328, "learning_rate": 1.1669489930359497e-07, "loss": 0.467, "step": 52820 }, { "epoch": 9.94, "grad_norm": 13.838290214538574, "learning_rate": 1.129305477131564e-07, "loss": 0.9436, "step": 52830 }, { "epoch": 9.95, "grad_norm": 6.618202209472656, "learning_rate": 1.0916619612271786e-07, "loss": 0.5467, "step": 52840 }, { "epoch": 9.95, "grad_norm": 39.71380615234375, "learning_rate": 1.0540184453227933e-07, "loss": 0.7317, "step": 52850 }, { "epoch": 9.95, "grad_norm": 3.4273688793182373, "learning_rate": 1.0163749294184077e-07, "loss": 0.7483, "step": 52860 }, { "epoch": 9.95, "grad_norm": 23.52995491027832, "learning_rate": 9.787314135140223e-08, "loss": 0.8891, "step": 52870 }, { "epoch": 9.95, "grad_norm": 10.14623737335205, "learning_rate": 9.410878976096368e-08, "loss": 0.3956, "step": 52880 }, { "epoch": 9.95, "grad_norm": 25.550630569458008, "learning_rate": 9.034443817052513e-08, "loss": 0.7451, "step": 52890 }, { "epoch": 9.96, "grad_norm": 7.88820219039917, "learning_rate": 8.658008658008659e-08, "loss": 0.4586, "step": 
52900 }, { "epoch": 9.96, "grad_norm": 44.730712890625, "learning_rate": 8.281573498964805e-08, "loss": 0.6488, "step": 52910 }, { "epoch": 9.96, "grad_norm": 8.287548065185547, "learning_rate": 7.905138339920948e-08, "loss": 0.6168, "step": 52920 }, { "epoch": 9.96, "grad_norm": 24.090682983398438, "learning_rate": 7.528703180877094e-08, "loss": 0.6773, "step": 52930 }, { "epoch": 9.96, "grad_norm": 2.835190534591675, "learning_rate": 7.152268021833239e-08, "loss": 0.4813, "step": 52940 }, { "epoch": 9.97, "grad_norm": 4.38554573059082, "learning_rate": 6.775832862789385e-08, "loss": 0.6574, "step": 52950 }, { "epoch": 9.97, "grad_norm": 17.962499618530273, "learning_rate": 6.39939770374553e-08, "loss": 0.587, "step": 52960 }, { "epoch": 9.97, "grad_norm": 27.21987533569336, "learning_rate": 6.022962544701676e-08, "loss": 0.6546, "step": 52970 }, { "epoch": 9.97, "grad_norm": 10.5108060836792, "learning_rate": 5.64652738565782e-08, "loss": 0.5803, "step": 52980 }, { "epoch": 9.97, "grad_norm": 19.367639541625977, "learning_rate": 5.270092226613966e-08, "loss": 0.5467, "step": 52990 }, { "epoch": 9.98, "grad_norm": 12.922274589538574, "learning_rate": 4.893657067570112e-08, "loss": 0.7419, "step": 53000 }, { "epoch": 9.98, "grad_norm": 6.959540367126465, "learning_rate": 4.5172219085262564e-08, "loss": 0.7518, "step": 53010 }, { "epoch": 9.98, "grad_norm": 12.34788703918457, "learning_rate": 4.1407867494824025e-08, "loss": 0.5116, "step": 53020 }, { "epoch": 9.98, "grad_norm": 4.518441677093506, "learning_rate": 3.764351590438547e-08, "loss": 0.7443, "step": 53030 }, { "epoch": 9.98, "grad_norm": 11.56289291381836, "learning_rate": 3.3879164313946926e-08, "loss": 0.4388, "step": 53040 }, { "epoch": 9.98, "grad_norm": 8.461308479309082, "learning_rate": 3.011481272350838e-08, "loss": 0.5078, "step": 53050 }, { "epoch": 9.99, "grad_norm": 21.100744247436523, "learning_rate": 2.635046113306983e-08, "loss": 0.4579, "step": 53060 }, { "epoch": 9.99, "grad_norm": 7.55901575088501, "learning_rate": 2.2586109542631282e-08, "loss": 0.4985, "step": 53070 }, { "epoch": 9.99, "grad_norm": 4.260222434997559, "learning_rate": 1.8821757952192736e-08, "loss": 0.8377, "step": 53080 }, { "epoch": 9.99, "grad_norm": 21.4654483795166, "learning_rate": 1.505740636175419e-08, "loss": 0.5078, "step": 53090 }, { "epoch": 9.99, "grad_norm": 12.223139762878418, "learning_rate": 1.1293054771315641e-08, "loss": 0.6421, "step": 53100 }, { "epoch": 10.0, "grad_norm": 4.522390842437744, "learning_rate": 7.528703180877095e-09, "loss": 0.5871, "step": 53110 }, { "epoch": 10.0, "grad_norm": 36.91136169433594, "learning_rate": 3.7643515904385476e-09, "loss": 0.4395, "step": 53120 }, { "epoch": 10.0, "grad_norm": 13.879157066345215, "learning_rate": 0.0, "loss": 0.3953, "step": 53130 }, { "epoch": 10.0, "eval_accuracy": 0.8109333333333333, "eval_loss": 0.8562657833099365, "eval_runtime": 31.1579, "eval_samples_per_second": 240.709, "eval_steps_per_second": 30.105, "step": 53130 }, { "epoch": 10.0, "step": 53130, "total_flos": 3.32286831120384e+19, "train_loss": 2.1818104096597004, "train_runtime": 5783.4985, "train_samples_per_second": 73.485, "train_steps_per_second": 9.186 } ], "logging_steps": 10, "max_steps": 53130, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 500, "total_flos": 3.32286831120384e+19, "train_batch_size": 8, "trial_name": null, "trial_params": null }