diff --git "a/checkpoint-775/trainer_state.json" "b/checkpoint-775/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-775/trainer_state.json" @@ -0,0 +1,6233 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 4.98793242156074, + "eval_steps": 500, + "global_step": 775, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.006436041834271922, + "grad_norm": 181.50096130371094, + "learning_rate": 8.333333333333335e-09, + "loss": 8.4196, + "num_input_tokens_seen": 6848, + "step": 1 + }, + { + "epoch": 0.012872083668543845, + "grad_norm": 187.05642700195312, + "learning_rate": 1.666666666666667e-08, + "loss": 8.44, + "num_input_tokens_seen": 14000, + "step": 2 + }, + { + "epoch": 0.019308125502815767, + "grad_norm": 182.92320251464844, + "learning_rate": 2.5000000000000002e-08, + "loss": 8.3839, + "num_input_tokens_seen": 21152, + "step": 3 + }, + { + "epoch": 0.02574416733708769, + "grad_norm": 186.71311950683594, + "learning_rate": 3.333333333333334e-08, + "loss": 8.4024, + "num_input_tokens_seen": 28224, + "step": 4 + }, + { + "epoch": 0.032180209171359615, + "grad_norm": 180.32656860351562, + "learning_rate": 4.166666666666667e-08, + "loss": 8.4594, + "num_input_tokens_seen": 35360, + "step": 5 + }, + { + "epoch": 0.038616251005631534, + "grad_norm": 189.87557983398438, + "learning_rate": 5.0000000000000004e-08, + "loss": 8.4107, + "num_input_tokens_seen": 42192, + "step": 6 + }, + { + "epoch": 0.04505229283990346, + "grad_norm": 185.89984130859375, + "learning_rate": 5.833333333333334e-08, + "loss": 8.4551, + "num_input_tokens_seen": 49088, + "step": 7 + }, + { + "epoch": 0.05148833467417538, + "grad_norm": 188.8160400390625, + "learning_rate": 6.666666666666668e-08, + "loss": 8.4415, + "num_input_tokens_seen": 55856, + "step": 8 + }, + { + "epoch": 0.057924376508447305, + "grad_norm": 190.1417236328125, + "learning_rate": 7.500000000000001e-08, + "loss": 8.4965, + "num_input_tokens_seen": 63120, + "step": 9 + }, + { + "epoch": 0.06436041834271923, + "grad_norm": 185.3598175048828, + "learning_rate": 8.333333333333334e-08, + "loss": 8.4251, + "num_input_tokens_seen": 69968, + "step": 10 + }, + { + "epoch": 0.07079646017699115, + "grad_norm": 183.81944274902344, + "learning_rate": 9.166666666666668e-08, + "loss": 8.4291, + "num_input_tokens_seen": 77168, + "step": 11 + }, + { + "epoch": 0.07723250201126307, + "grad_norm": 196.39779663085938, + "learning_rate": 1.0000000000000001e-07, + "loss": 8.4463, + "num_input_tokens_seen": 84272, + "step": 12 + }, + { + "epoch": 0.083668543845535, + "grad_norm": 181.4925994873047, + "learning_rate": 1.0833333333333335e-07, + "loss": 8.5116, + "num_input_tokens_seen": 91232, + "step": 13 + }, + { + "epoch": 0.09010458567980692, + "grad_norm": 190.0314178466797, + "learning_rate": 1.1666666666666668e-07, + "loss": 8.4749, + "num_input_tokens_seen": 97968, + "step": 14 + }, + { + "epoch": 0.09654062751407884, + "grad_norm": 188.7615203857422, + "learning_rate": 1.2500000000000002e-07, + "loss": 8.3311, + "num_input_tokens_seen": 104864, + "step": 15 + }, + { + "epoch": 0.10297666934835076, + "grad_norm": 184.1820526123047, + "learning_rate": 1.3333333333333336e-07, + "loss": 8.3729, + "num_input_tokens_seen": 111488, + "step": 16 + }, + { + "epoch": 0.10941271118262269, + "grad_norm": 181.39308166503906, + "learning_rate": 1.4166666666666668e-07, + "loss": 8.4261, + "num_input_tokens_seen": 118384, + "step": 17 + }, + { + "epoch": 0.11584875301689461, + "grad_norm": 181.79583740234375, + "learning_rate": 1.5000000000000002e-07, + "loss": 8.3051, + "num_input_tokens_seen": 125360, + "step": 18 + }, + { + "epoch": 0.12228479485116653, + "grad_norm": 181.36965942382812, + "learning_rate": 1.5833333333333336e-07, + "loss": 8.2461, + "num_input_tokens_seen": 132320, + "step": 19 + }, + { + "epoch": 0.12872083668543846, + "grad_norm": 182.36839294433594, + "learning_rate": 1.6666666666666668e-07, + "loss": 8.2894, + "num_input_tokens_seen": 139376, + "step": 20 + }, + { + "epoch": 0.13515687851971037, + "grad_norm": 189.7889404296875, + "learning_rate": 1.7500000000000002e-07, + "loss": 8.2484, + "num_input_tokens_seen": 146544, + "step": 21 + }, + { + "epoch": 0.1415929203539823, + "grad_norm": 190.1185302734375, + "learning_rate": 1.8333333333333336e-07, + "loss": 8.3034, + "num_input_tokens_seen": 153472, + "step": 22 + }, + { + "epoch": 0.14802896218825423, + "grad_norm": 183.1331024169922, + "learning_rate": 1.9166666666666668e-07, + "loss": 8.054, + "num_input_tokens_seen": 159856, + "step": 23 + }, + { + "epoch": 0.15446500402252614, + "grad_norm": 168.13046264648438, + "learning_rate": 2.0000000000000002e-07, + "loss": 7.9583, + "num_input_tokens_seen": 166528, + "step": 24 + }, + { + "epoch": 0.16090104585679807, + "grad_norm": 167.57830810546875, + "learning_rate": 2.0833333333333333e-07, + "loss": 7.9626, + "num_input_tokens_seen": 173056, + "step": 25 + }, + { + "epoch": 0.16733708769107, + "grad_norm": 170.6557159423828, + "learning_rate": 2.166666666666667e-07, + "loss": 7.8761, + "num_input_tokens_seen": 179616, + "step": 26 + }, + { + "epoch": 0.1737731295253419, + "grad_norm": 179.7693328857422, + "learning_rate": 2.2500000000000002e-07, + "loss": 7.8896, + "num_input_tokens_seen": 186912, + "step": 27 + }, + { + "epoch": 0.18020917135961384, + "grad_norm": 180.4197998046875, + "learning_rate": 2.3333333333333336e-07, + "loss": 7.8352, + "num_input_tokens_seen": 193936, + "step": 28 + }, + { + "epoch": 0.18664521319388577, + "grad_norm": 164.2944793701172, + "learning_rate": 2.416666666666667e-07, + "loss": 7.691, + "num_input_tokens_seen": 200672, + "step": 29 + }, + { + "epoch": 0.19308125502815768, + "grad_norm": 167.71722412109375, + "learning_rate": 2.5000000000000004e-07, + "loss": 7.7851, + "num_input_tokens_seen": 207536, + "step": 30 + }, + { + "epoch": 0.1995172968624296, + "grad_norm": 169.2217254638672, + "learning_rate": 2.5833333333333333e-07, + "loss": 7.7249, + "num_input_tokens_seen": 214640, + "step": 31 + }, + { + "epoch": 0.20595333869670152, + "grad_norm": 155.74537658691406, + "learning_rate": 2.666666666666667e-07, + "loss": 6.8838, + "num_input_tokens_seen": 221744, + "step": 32 + }, + { + "epoch": 0.21238938053097345, + "grad_norm": 148.12120056152344, + "learning_rate": 2.75e-07, + "loss": 6.7173, + "num_input_tokens_seen": 228624, + "step": 33 + }, + { + "epoch": 0.21882542236524538, + "grad_norm": 150.97012329101562, + "learning_rate": 2.8333333333333336e-07, + "loss": 6.6793, + "num_input_tokens_seen": 235456, + "step": 34 + }, + { + "epoch": 0.2252614641995173, + "grad_norm": 149.623291015625, + "learning_rate": 2.916666666666667e-07, + "loss": 6.725, + "num_input_tokens_seen": 242768, + "step": 35 + }, + { + "epoch": 0.23169750603378922, + "grad_norm": 147.1656036376953, + "learning_rate": 3.0000000000000004e-07, + "loss": 6.6905, + "num_input_tokens_seen": 249552, + "step": 36 + }, + { + "epoch": 0.23813354786806115, + "grad_norm": 151.0162811279297, + "learning_rate": 3.083333333333334e-07, + "loss": 6.6179, + "num_input_tokens_seen": 256160, + "step": 37 + }, + { + "epoch": 0.24456958970233306, + "grad_norm": 150.03030395507812, + "learning_rate": 3.166666666666667e-07, + "loss": 6.501, + "num_input_tokens_seen": 262912, + "step": 38 + }, + { + "epoch": 0.251005631536605, + "grad_norm": 145.5784149169922, + "learning_rate": 3.25e-07, + "loss": 6.4588, + "num_input_tokens_seen": 269600, + "step": 39 + }, + { + "epoch": 0.2574416733708769, + "grad_norm": 143.5873565673828, + "learning_rate": 3.3333333333333335e-07, + "loss": 6.3614, + "num_input_tokens_seen": 276560, + "step": 40 + }, + { + "epoch": 0.26387771520514886, + "grad_norm": 144.9624481201172, + "learning_rate": 3.416666666666667e-07, + "loss": 6.2775, + "num_input_tokens_seen": 283696, + "step": 41 + }, + { + "epoch": 0.27031375703942073, + "grad_norm": 146.71554565429688, + "learning_rate": 3.5000000000000004e-07, + "loss": 5.9868, + "num_input_tokens_seen": 290832, + "step": 42 + }, + { + "epoch": 0.27674979887369267, + "grad_norm": 138.25450134277344, + "learning_rate": 3.583333333333334e-07, + "loss": 5.2286, + "num_input_tokens_seen": 298096, + "step": 43 + }, + { + "epoch": 0.2831858407079646, + "grad_norm": 156.28713989257812, + "learning_rate": 3.666666666666667e-07, + "loss": 4.5076, + "num_input_tokens_seen": 305120, + "step": 44 + }, + { + "epoch": 0.28962188254223653, + "grad_norm": 178.4820556640625, + "learning_rate": 3.75e-07, + "loss": 4.1167, + "num_input_tokens_seen": 312000, + "step": 45 + }, + { + "epoch": 0.29605792437650846, + "grad_norm": 317.7680358886719, + "learning_rate": 3.8333333333333335e-07, + "loss": 3.6585, + "num_input_tokens_seen": 319008, + "step": 46 + }, + { + "epoch": 0.3024939662107804, + "grad_norm": 282.17803955078125, + "learning_rate": 3.9166666666666675e-07, + "loss": 3.3613, + "num_input_tokens_seen": 326192, + "step": 47 + }, + { + "epoch": 0.3089300080450523, + "grad_norm": 257.7794494628906, + "learning_rate": 4.0000000000000003e-07, + "loss": 3.1068, + "num_input_tokens_seen": 333664, + "step": 48 + }, + { + "epoch": 0.3153660498793242, + "grad_norm": 255.1024169921875, + "learning_rate": 4.083333333333334e-07, + "loss": 2.9368, + "num_input_tokens_seen": 340912, + "step": 49 + }, + { + "epoch": 0.32180209171359614, + "grad_norm": 259.47015380859375, + "learning_rate": 4.1666666666666667e-07, + "loss": 2.3466, + "num_input_tokens_seen": 347712, + "step": 50 + }, + { + "epoch": 0.32823813354786807, + "grad_norm": 263.3533935546875, + "learning_rate": 4.2500000000000006e-07, + "loss": 2.0645, + "num_input_tokens_seen": 355232, + "step": 51 + }, + { + "epoch": 0.33467417538214, + "grad_norm": 239.1399688720703, + "learning_rate": 4.333333333333334e-07, + "loss": 1.7729, + "num_input_tokens_seen": 361968, + "step": 52 + }, + { + "epoch": 0.3411102172164119, + "grad_norm": 257.4410095214844, + "learning_rate": 4.416666666666667e-07, + "loss": 1.6199, + "num_input_tokens_seen": 369136, + "step": 53 + }, + { + "epoch": 0.3475462590506838, + "grad_norm": 169.56935119628906, + "learning_rate": 4.5000000000000003e-07, + "loss": 1.1593, + "num_input_tokens_seen": 375904, + "step": 54 + }, + { + "epoch": 0.35398230088495575, + "grad_norm": 95.25677490234375, + "learning_rate": 4.583333333333333e-07, + "loss": 0.7199, + "num_input_tokens_seen": 382848, + "step": 55 + }, + { + "epoch": 0.3604183427192277, + "grad_norm": 48.7137451171875, + "learning_rate": 4.666666666666667e-07, + "loss": 0.4394, + "num_input_tokens_seen": 389680, + "step": 56 + }, + { + "epoch": 0.3668543845534996, + "grad_norm": 62.34474563598633, + "learning_rate": 4.7500000000000006e-07, + "loss": 0.3806, + "num_input_tokens_seen": 396192, + "step": 57 + }, + { + "epoch": 0.37329042638777155, + "grad_norm": 30.711780548095703, + "learning_rate": 4.833333333333334e-07, + "loss": 0.3185, + "num_input_tokens_seen": 403104, + "step": 58 + }, + { + "epoch": 0.3797264682220434, + "grad_norm": 34.46913528442383, + "learning_rate": 4.916666666666667e-07, + "loss": 0.3056, + "num_input_tokens_seen": 410176, + "step": 59 + }, + { + "epoch": 0.38616251005631536, + "grad_norm": 25.92363166809082, + "learning_rate": 5.000000000000001e-07, + "loss": 0.2981, + "num_input_tokens_seen": 416928, + "step": 60 + }, + { + "epoch": 0.3925985518905873, + "grad_norm": 11.064619064331055, + "learning_rate": 5.083333333333334e-07, + "loss": 0.2473, + "num_input_tokens_seen": 424128, + "step": 61 + }, + { + "epoch": 0.3990345937248592, + "grad_norm": 55.367347717285156, + "learning_rate": 5.166666666666667e-07, + "loss": 0.2924, + "num_input_tokens_seen": 430864, + "step": 62 + }, + { + "epoch": 0.40547063555913115, + "grad_norm": 42.00873947143555, + "learning_rate": 5.250000000000001e-07, + "loss": 0.2656, + "num_input_tokens_seen": 437744, + "step": 63 + }, + { + "epoch": 0.41190667739340303, + "grad_norm": 13.313591003417969, + "learning_rate": 5.333333333333335e-07, + "loss": 0.2335, + "num_input_tokens_seen": 444624, + "step": 64 + }, + { + "epoch": 0.41834271922767496, + "grad_norm": 60.489715576171875, + "learning_rate": 5.416666666666667e-07, + "loss": 0.2647, + "num_input_tokens_seen": 451696, + "step": 65 + }, + { + "epoch": 0.4247787610619469, + "grad_norm": 77.01821899414062, + "learning_rate": 5.5e-07, + "loss": 0.3003, + "num_input_tokens_seen": 458784, + "step": 66 + }, + { + "epoch": 0.43121480289621883, + "grad_norm": 58.067596435546875, + "learning_rate": 5.583333333333333e-07, + "loss": 0.2656, + "num_input_tokens_seen": 465920, + "step": 67 + }, + { + "epoch": 0.43765084473049076, + "grad_norm": 12.40570068359375, + "learning_rate": 5.666666666666667e-07, + "loss": 0.2212, + "num_input_tokens_seen": 473152, + "step": 68 + }, + { + "epoch": 0.4440868865647627, + "grad_norm": 35.392276763916016, + "learning_rate": 5.750000000000001e-07, + "loss": 0.2532, + "num_input_tokens_seen": 480544, + "step": 69 + }, + { + "epoch": 0.4505229283990346, + "grad_norm": 51.42181396484375, + "learning_rate": 5.833333333333334e-07, + "loss": 0.2799, + "num_input_tokens_seen": 487552, + "step": 70 + }, + { + "epoch": 0.4569589702333065, + "grad_norm": 45.73934555053711, + "learning_rate": 5.916666666666667e-07, + "loss": 0.2876, + "num_input_tokens_seen": 494256, + "step": 71 + }, + { + "epoch": 0.46339501206757844, + "grad_norm": 20.654096603393555, + "learning_rate": 6.000000000000001e-07, + "loss": 0.2191, + "num_input_tokens_seen": 500768, + "step": 72 + }, + { + "epoch": 0.46983105390185037, + "grad_norm": 21.078027725219727, + "learning_rate": 6.083333333333334e-07, + "loss": 0.2344, + "num_input_tokens_seen": 507136, + "step": 73 + }, + { + "epoch": 0.4762670957361223, + "grad_norm": 36.7335205078125, + "learning_rate": 6.166666666666668e-07, + "loss": 0.2547, + "num_input_tokens_seen": 514208, + "step": 74 + }, + { + "epoch": 0.4827031375703942, + "grad_norm": 34.47271728515625, + "learning_rate": 6.25e-07, + "loss": 0.2349, + "num_input_tokens_seen": 521120, + "step": 75 + }, + { + "epoch": 0.4891391794046661, + "grad_norm": 5.103244781494141, + "learning_rate": 6.333333333333334e-07, + "loss": 0.2045, + "num_input_tokens_seen": 527824, + "step": 76 + }, + { + "epoch": 0.49557522123893805, + "grad_norm": 22.47526741027832, + "learning_rate": 6.416666666666667e-07, + "loss": 0.2262, + "num_input_tokens_seen": 534832, + "step": 77 + }, + { + "epoch": 0.50201126307321, + "grad_norm": 30.610803604125977, + "learning_rate": 6.5e-07, + "loss": 0.2393, + "num_input_tokens_seen": 541696, + "step": 78 + }, + { + "epoch": 0.5084473049074819, + "grad_norm": 10.922965049743652, + "learning_rate": 6.583333333333333e-07, + "loss": 0.2206, + "num_input_tokens_seen": 548608, + "step": 79 + }, + { + "epoch": 0.5148833467417538, + "grad_norm": 17.484182357788086, + "learning_rate": 6.666666666666667e-07, + "loss": 0.2029, + "num_input_tokens_seen": 555456, + "step": 80 + }, + { + "epoch": 0.5213193885760258, + "grad_norm": 16.49226188659668, + "learning_rate": 6.750000000000001e-07, + "loss": 0.2125, + "num_input_tokens_seen": 562768, + "step": 81 + }, + { + "epoch": 0.5277554304102977, + "grad_norm": 9.977084159851074, + "learning_rate": 6.833333333333334e-07, + "loss": 0.2023, + "num_input_tokens_seen": 569536, + "step": 82 + }, + { + "epoch": 0.5341914722445696, + "grad_norm": 17.79197120666504, + "learning_rate": 6.916666666666668e-07, + "loss": 0.2262, + "num_input_tokens_seen": 576096, + "step": 83 + }, + { + "epoch": 0.5406275140788415, + "grad_norm": 16.699260711669922, + "learning_rate": 7.000000000000001e-07, + "loss": 0.2003, + "num_input_tokens_seen": 583472, + "step": 84 + }, + { + "epoch": 0.5470635559131134, + "grad_norm": 25.02164077758789, + "learning_rate": 7.083333333333334e-07, + "loss": 0.2351, + "num_input_tokens_seen": 590304, + "step": 85 + }, + { + "epoch": 0.5534995977473853, + "grad_norm": 3.8612709045410156, + "learning_rate": 7.166666666666668e-07, + "loss": 0.1839, + "num_input_tokens_seen": 597152, + "step": 86 + }, + { + "epoch": 0.5599356395816573, + "grad_norm": 31.555482864379883, + "learning_rate": 7.25e-07, + "loss": 0.2315, + "num_input_tokens_seen": 604208, + "step": 87 + }, + { + "epoch": 0.5663716814159292, + "grad_norm": 54.94756317138672, + "learning_rate": 7.333333333333334e-07, + "loss": 0.2732, + "num_input_tokens_seen": 610896, + "step": 88 + }, + { + "epoch": 0.5728077232502011, + "grad_norm": 30.55241584777832, + "learning_rate": 7.416666666666668e-07, + "loss": 0.2405, + "num_input_tokens_seen": 618112, + "step": 89 + }, + { + "epoch": 0.5792437650844731, + "grad_norm": 16.687997817993164, + "learning_rate": 7.5e-07, + "loss": 0.2005, + "num_input_tokens_seen": 625040, + "step": 90 + }, + { + "epoch": 0.585679806918745, + "grad_norm": 10.350790977478027, + "learning_rate": 7.583333333333334e-07, + "loss": 0.2005, + "num_input_tokens_seen": 631840, + "step": 91 + }, + { + "epoch": 0.5921158487530169, + "grad_norm": 25.88368797302246, + "learning_rate": 7.666666666666667e-07, + "loss": 0.2115, + "num_input_tokens_seen": 638752, + "step": 92 + }, + { + "epoch": 0.5985518905872889, + "grad_norm": 17.11625099182129, + "learning_rate": 7.750000000000001e-07, + "loss": 0.2141, + "num_input_tokens_seen": 645968, + "step": 93 + }, + { + "epoch": 0.6049879324215608, + "grad_norm": 12.70864200592041, + "learning_rate": 7.833333333333335e-07, + "loss": 0.1898, + "num_input_tokens_seen": 652752, + "step": 94 + }, + { + "epoch": 0.6114239742558326, + "grad_norm": 3.674001455307007, + "learning_rate": 7.916666666666667e-07, + "loss": 0.2099, + "num_input_tokens_seen": 660048, + "step": 95 + }, + { + "epoch": 0.6178600160901045, + "grad_norm": 20.51032066345215, + "learning_rate": 8.000000000000001e-07, + "loss": 0.2014, + "num_input_tokens_seen": 666752, + "step": 96 + }, + { + "epoch": 0.6242960579243765, + "grad_norm": 47.562381744384766, + "learning_rate": 8.083333333333334e-07, + "loss": 0.2349, + "num_input_tokens_seen": 673856, + "step": 97 + }, + { + "epoch": 0.6307320997586484, + "grad_norm": 35.69169998168945, + "learning_rate": 8.166666666666668e-07, + "loss": 0.2205, + "num_input_tokens_seen": 681104, + "step": 98 + }, + { + "epoch": 0.6371681415929203, + "grad_norm": 10.080629348754883, + "learning_rate": 8.250000000000001e-07, + "loss": 0.199, + "num_input_tokens_seen": 688128, + "step": 99 + }, + { + "epoch": 0.6436041834271923, + "grad_norm": 26.242666244506836, + "learning_rate": 8.333333333333333e-07, + "loss": 0.236, + "num_input_tokens_seen": 695216, + "step": 100 + }, + { + "epoch": 0.6500402252614642, + "grad_norm": 22.0434627532959, + "learning_rate": 8.416666666666667e-07, + "loss": 0.2265, + "num_input_tokens_seen": 701968, + "step": 101 + }, + { + "epoch": 0.6564762670957361, + "grad_norm": 27.378408432006836, + "learning_rate": 8.500000000000001e-07, + "loss": 0.2443, + "num_input_tokens_seen": 708928, + "step": 102 + }, + { + "epoch": 0.6629123089300081, + "grad_norm": 11.929069519042969, + "learning_rate": 8.583333333333334e-07, + "loss": 0.2086, + "num_input_tokens_seen": 715952, + "step": 103 + }, + { + "epoch": 0.66934835076428, + "grad_norm": 6.677243232727051, + "learning_rate": 8.666666666666668e-07, + "loss": 0.1915, + "num_input_tokens_seen": 722928, + "step": 104 + }, + { + "epoch": 0.6757843925985519, + "grad_norm": 17.033658981323242, + "learning_rate": 8.75e-07, + "loss": 0.1967, + "num_input_tokens_seen": 730160, + "step": 105 + }, + { + "epoch": 0.6822204344328238, + "grad_norm": 6.806990146636963, + "learning_rate": 8.833333333333334e-07, + "loss": 0.188, + "num_input_tokens_seen": 737088, + "step": 106 + }, + { + "epoch": 0.6886564762670957, + "grad_norm": 4.871335506439209, + "learning_rate": 8.916666666666668e-07, + "loss": 0.1895, + "num_input_tokens_seen": 743744, + "step": 107 + }, + { + "epoch": 0.6950925181013676, + "grad_norm": 9.054122924804688, + "learning_rate": 9.000000000000001e-07, + "loss": 0.1667, + "num_input_tokens_seen": 750496, + "step": 108 + }, + { + "epoch": 0.7015285599356396, + "grad_norm": 15.78903579711914, + "learning_rate": 9.083333333333335e-07, + "loss": 0.1976, + "num_input_tokens_seen": 757792, + "step": 109 + }, + { + "epoch": 0.7079646017699115, + "grad_norm": 10.51429271697998, + "learning_rate": 9.166666666666666e-07, + "loss": 0.2057, + "num_input_tokens_seen": 764992, + "step": 110 + }, + { + "epoch": 0.7144006436041834, + "grad_norm": 24.346830368041992, + "learning_rate": 9.25e-07, + "loss": 0.2002, + "num_input_tokens_seen": 771648, + "step": 111 + }, + { + "epoch": 0.7208366854384554, + "grad_norm": 46.50392532348633, + "learning_rate": 9.333333333333334e-07, + "loss": 0.2173, + "num_input_tokens_seen": 778480, + "step": 112 + }, + { + "epoch": 0.7272727272727273, + "grad_norm": 22.505762100219727, + "learning_rate": 9.416666666666667e-07, + "loss": 0.1756, + "num_input_tokens_seen": 785328, + "step": 113 + }, + { + "epoch": 0.7337087691069992, + "grad_norm": 5.675211429595947, + "learning_rate": 9.500000000000001e-07, + "loss": 0.1786, + "num_input_tokens_seen": 792592, + "step": 114 + }, + { + "epoch": 0.7401448109412712, + "grad_norm": 14.814651489257812, + "learning_rate": 9.583333333333334e-07, + "loss": 0.1879, + "num_input_tokens_seen": 799808, + "step": 115 + }, + { + "epoch": 0.7465808527755431, + "grad_norm": 13.106173515319824, + "learning_rate": 9.666666666666668e-07, + "loss": 0.173, + "num_input_tokens_seen": 806896, + "step": 116 + }, + { + "epoch": 0.7530168946098149, + "grad_norm": 24.56918716430664, + "learning_rate": 9.750000000000002e-07, + "loss": 0.1714, + "num_input_tokens_seen": 813536, + "step": 117 + }, + { + "epoch": 0.7594529364440868, + "grad_norm": 27.256954193115234, + "learning_rate": 9.833333333333334e-07, + "loss": 0.2015, + "num_input_tokens_seen": 820608, + "step": 118 + }, + { + "epoch": 0.7658889782783588, + "grad_norm": 4.209413051605225, + "learning_rate": 9.916666666666668e-07, + "loss": 0.1847, + "num_input_tokens_seen": 827776, + "step": 119 + }, + { + "epoch": 0.7723250201126307, + "grad_norm": 18.684349060058594, + "learning_rate": 1.0000000000000002e-06, + "loss": 0.1876, + "num_input_tokens_seen": 834704, + "step": 120 + }, + { + "epoch": 0.7787610619469026, + "grad_norm": 19.470041275024414, + "learning_rate": 1.0083333333333333e-06, + "loss": 0.1937, + "num_input_tokens_seen": 841568, + "step": 121 + }, + { + "epoch": 0.7851971037811746, + "grad_norm": 11.242873191833496, + "learning_rate": 1.0166666666666667e-06, + "loss": 0.1974, + "num_input_tokens_seen": 848704, + "step": 122 + }, + { + "epoch": 0.7916331456154465, + "grad_norm": 26.72730255126953, + "learning_rate": 1.025e-06, + "loss": 0.2099, + "num_input_tokens_seen": 855664, + "step": 123 + }, + { + "epoch": 0.7980691874497184, + "grad_norm": 41.4288215637207, + "learning_rate": 1.0333333333333333e-06, + "loss": 0.2239, + "num_input_tokens_seen": 862464, + "step": 124 + }, + { + "epoch": 0.8045052292839904, + "grad_norm": 27.283327102661133, + "learning_rate": 1.0416666666666667e-06, + "loss": 0.1953, + "num_input_tokens_seen": 869376, + "step": 125 + }, + { + "epoch": 0.8109412711182623, + "grad_norm": 4.882501602172852, + "learning_rate": 1.0500000000000001e-06, + "loss": 0.1906, + "num_input_tokens_seen": 876848, + "step": 126 + }, + { + "epoch": 0.8173773129525342, + "grad_norm": 8.478296279907227, + "learning_rate": 1.0583333333333335e-06, + "loss": 0.1852, + "num_input_tokens_seen": 883664, + "step": 127 + }, + { + "epoch": 0.8238133547868061, + "grad_norm": 6.773479461669922, + "learning_rate": 1.066666666666667e-06, + "loss": 0.198, + "num_input_tokens_seen": 890592, + "step": 128 + }, + { + "epoch": 0.830249396621078, + "grad_norm": 21.877212524414062, + "learning_rate": 1.075e-06, + "loss": 0.2105, + "num_input_tokens_seen": 898048, + "step": 129 + }, + { + "epoch": 0.8366854384553499, + "grad_norm": 12.123941421508789, + "learning_rate": 1.0833333333333335e-06, + "loss": 0.1899, + "num_input_tokens_seen": 905040, + "step": 130 + }, + { + "epoch": 0.8431214802896219, + "grad_norm": 15.84151554107666, + "learning_rate": 1.0916666666666667e-06, + "loss": 0.1742, + "num_input_tokens_seen": 912080, + "step": 131 + }, + { + "epoch": 0.8495575221238938, + "grad_norm": 8.174356460571289, + "learning_rate": 1.1e-06, + "loss": 0.1585, + "num_input_tokens_seen": 919424, + "step": 132 + }, + { + "epoch": 0.8559935639581657, + "grad_norm": 14.87348461151123, + "learning_rate": 1.1083333333333335e-06, + "loss": 0.1878, + "num_input_tokens_seen": 926608, + "step": 133 + }, + { + "epoch": 0.8624296057924377, + "grad_norm": 11.989315032958984, + "learning_rate": 1.1166666666666666e-06, + "loss": 0.1748, + "num_input_tokens_seen": 933712, + "step": 134 + }, + { + "epoch": 0.8688656476267096, + "grad_norm": 9.659666061401367, + "learning_rate": 1.125e-06, + "loss": 0.1944, + "num_input_tokens_seen": 940304, + "step": 135 + }, + { + "epoch": 0.8753016894609815, + "grad_norm": 20.558237075805664, + "learning_rate": 1.1333333333333334e-06, + "loss": 0.1727, + "num_input_tokens_seen": 947008, + "step": 136 + }, + { + "epoch": 0.8817377312952535, + "grad_norm": 8.66232967376709, + "learning_rate": 1.1416666666666668e-06, + "loss": 0.1748, + "num_input_tokens_seen": 954112, + "step": 137 + }, + { + "epoch": 0.8881737731295254, + "grad_norm": 16.516559600830078, + "learning_rate": 1.1500000000000002e-06, + "loss": 0.1625, + "num_input_tokens_seen": 961120, + "step": 138 + }, + { + "epoch": 0.8946098149637972, + "grad_norm": 6.140871047973633, + "learning_rate": 1.1583333333333334e-06, + "loss": 0.1649, + "num_input_tokens_seen": 967792, + "step": 139 + }, + { + "epoch": 0.9010458567980691, + "grad_norm": 11.593804359436035, + "learning_rate": 1.1666666666666668e-06, + "loss": 0.1738, + "num_input_tokens_seen": 974496, + "step": 140 + }, + { + "epoch": 0.9074818986323411, + "grad_norm": 26.92620849609375, + "learning_rate": 1.175e-06, + "loss": 0.2221, + "num_input_tokens_seen": 981344, + "step": 141 + }, + { + "epoch": 0.913917940466613, + "grad_norm": 26.845230102539062, + "learning_rate": 1.1833333333333334e-06, + "loss": 0.1989, + "num_input_tokens_seen": 988224, + "step": 142 + }, + { + "epoch": 0.9203539823008849, + "grad_norm": 12.823030471801758, + "learning_rate": 1.1916666666666668e-06, + "loss": 0.1569, + "num_input_tokens_seen": 995552, + "step": 143 + }, + { + "epoch": 0.9267900241351569, + "grad_norm": 14.508877754211426, + "learning_rate": 1.2000000000000002e-06, + "loss": 0.1594, + "num_input_tokens_seen": 1002224, + "step": 144 + }, + { + "epoch": 0.9332260659694288, + "grad_norm": 13.097854614257812, + "learning_rate": 1.2083333333333333e-06, + "loss": 0.1609, + "num_input_tokens_seen": 1009312, + "step": 145 + }, + { + "epoch": 0.9396621078037007, + "grad_norm": 12.183431625366211, + "learning_rate": 1.2166666666666667e-06, + "loss": 0.1649, + "num_input_tokens_seen": 1016256, + "step": 146 + }, + { + "epoch": 0.9460981496379727, + "grad_norm": 10.628469467163086, + "learning_rate": 1.2250000000000001e-06, + "loss": 0.1412, + "num_input_tokens_seen": 1022880, + "step": 147 + }, + { + "epoch": 0.9525341914722446, + "grad_norm": 11.713327407836914, + "learning_rate": 1.2333333333333335e-06, + "loss": 0.165, + "num_input_tokens_seen": 1029856, + "step": 148 + }, + { + "epoch": 0.9589702333065165, + "grad_norm": 10.031126976013184, + "learning_rate": 1.2416666666666667e-06, + "loss": 0.1971, + "num_input_tokens_seen": 1036928, + "step": 149 + }, + { + "epoch": 0.9654062751407884, + "grad_norm": 34.122074127197266, + "learning_rate": 1.25e-06, + "loss": 0.1843, + "num_input_tokens_seen": 1044000, + "step": 150 + }, + { + "epoch": 0.9718423169750603, + "grad_norm": 13.707520484924316, + "learning_rate": 1.2583333333333333e-06, + "loss": 0.1628, + "num_input_tokens_seen": 1050928, + "step": 151 + }, + { + "epoch": 0.9782783588093322, + "grad_norm": 8.588343620300293, + "learning_rate": 1.2666666666666669e-06, + "loss": 0.1878, + "num_input_tokens_seen": 1057920, + "step": 152 + }, + { + "epoch": 0.9847144006436042, + "grad_norm": 4.411599159240723, + "learning_rate": 1.275e-06, + "loss": 0.1153, + "num_input_tokens_seen": 1064704, + "step": 153 + }, + { + "epoch": 0.9911504424778761, + "grad_norm": 13.095698356628418, + "learning_rate": 1.2833333333333335e-06, + "loss": 0.1622, + "num_input_tokens_seen": 1071760, + "step": 154 + }, + { + "epoch": 0.997586484312148, + "grad_norm": 14.093315124511719, + "learning_rate": 1.2916666666666669e-06, + "loss": 0.1549, + "num_input_tokens_seen": 1078912, + "step": 155 + }, + { + "epoch": 1.00402252614642, + "grad_norm": 17.082075119018555, + "learning_rate": 1.3e-06, + "loss": 0.1729, + "num_input_tokens_seen": 1086288, + "step": 156 + }, + { + "epoch": 1.010458567980692, + "grad_norm": 4.992012977600098, + "learning_rate": 1.3083333333333334e-06, + "loss": 0.1198, + "num_input_tokens_seen": 1093584, + "step": 157 + }, + { + "epoch": 1.0168946098149638, + "grad_norm": 5.45336389541626, + "learning_rate": 1.3166666666666666e-06, + "loss": 0.1723, + "num_input_tokens_seen": 1100432, + "step": 158 + }, + { + "epoch": 1.0233306516492358, + "grad_norm": 7.4880757331848145, + "learning_rate": 1.3250000000000002e-06, + "loss": 0.1485, + "num_input_tokens_seen": 1107280, + "step": 159 + }, + { + "epoch": 1.0297666934835077, + "grad_norm": 40.28890609741211, + "learning_rate": 1.3333333333333334e-06, + "loss": 0.1757, + "num_input_tokens_seen": 1113968, + "step": 160 + }, + { + "epoch": 1.0362027353177796, + "grad_norm": 39.24993896484375, + "learning_rate": 1.3416666666666666e-06, + "loss": 0.1907, + "num_input_tokens_seen": 1120752, + "step": 161 + }, + { + "epoch": 1.0426387771520516, + "grad_norm": 5.63855504989624, + "learning_rate": 1.3500000000000002e-06, + "loss": 0.1842, + "num_input_tokens_seen": 1127712, + "step": 162 + }, + { + "epoch": 1.0490748189863235, + "grad_norm": 5.1802754402160645, + "learning_rate": 1.3583333333333334e-06, + "loss": 0.1549, + "num_input_tokens_seen": 1134592, + "step": 163 + }, + { + "epoch": 1.0555108608205954, + "grad_norm": 4.200067043304443, + "learning_rate": 1.3666666666666668e-06, + "loss": 0.153, + "num_input_tokens_seen": 1141888, + "step": 164 + }, + { + "epoch": 1.0619469026548674, + "grad_norm": 6.892277240753174, + "learning_rate": 1.3750000000000002e-06, + "loss": 0.1532, + "num_input_tokens_seen": 1148688, + "step": 165 + }, + { + "epoch": 1.068382944489139, + "grad_norm": 11.852892875671387, + "learning_rate": 1.3833333333333336e-06, + "loss": 0.1629, + "num_input_tokens_seen": 1155552, + "step": 166 + }, + { + "epoch": 1.074818986323411, + "grad_norm": 8.346076011657715, + "learning_rate": 1.3916666666666668e-06, + "loss": 0.1708, + "num_input_tokens_seen": 1162624, + "step": 167 + }, + { + "epoch": 1.081255028157683, + "grad_norm": 7.836976528167725, + "learning_rate": 1.4000000000000001e-06, + "loss": 0.1461, + "num_input_tokens_seen": 1169904, + "step": 168 + }, + { + "epoch": 1.0876910699919549, + "grad_norm": 15.59913158416748, + "learning_rate": 1.4083333333333335e-06, + "loss": 0.1402, + "num_input_tokens_seen": 1176928, + "step": 169 + }, + { + "epoch": 1.0941271118262268, + "grad_norm": 8.46536636352539, + "learning_rate": 1.4166666666666667e-06, + "loss": 0.143, + "num_input_tokens_seen": 1184160, + "step": 170 + }, + { + "epoch": 1.1005631536604987, + "grad_norm": 7.491546154022217, + "learning_rate": 1.425e-06, + "loss": 0.1454, + "num_input_tokens_seen": 1191120, + "step": 171 + }, + { + "epoch": 1.1069991954947707, + "grad_norm": 16.70829200744629, + "learning_rate": 1.4333333333333335e-06, + "loss": 0.1286, + "num_input_tokens_seen": 1197920, + "step": 172 + }, + { + "epoch": 1.1134352373290426, + "grad_norm": 16.273927688598633, + "learning_rate": 1.4416666666666667e-06, + "loss": 0.1523, + "num_input_tokens_seen": 1204576, + "step": 173 + }, + { + "epoch": 1.1198712791633145, + "grad_norm": 8.122928619384766, + "learning_rate": 1.45e-06, + "loss": 0.1345, + "num_input_tokens_seen": 1211344, + "step": 174 + }, + { + "epoch": 1.1263073209975865, + "grad_norm": 27.850522994995117, + "learning_rate": 1.4583333333333335e-06, + "loss": 0.1749, + "num_input_tokens_seen": 1218432, + "step": 175 + }, + { + "epoch": 1.1327433628318584, + "grad_norm": 30.498666763305664, + "learning_rate": 1.4666666666666669e-06, + "loss": 0.166, + "num_input_tokens_seen": 1225728, + "step": 176 + }, + { + "epoch": 1.1391794046661303, + "grad_norm": 26.916791915893555, + "learning_rate": 1.475e-06, + "loss": 0.1708, + "num_input_tokens_seen": 1232784, + "step": 177 + }, + { + "epoch": 1.1456154465004023, + "grad_norm": 13.593954086303711, + "learning_rate": 1.4833333333333337e-06, + "loss": 0.1363, + "num_input_tokens_seen": 1239472, + "step": 178 + }, + { + "epoch": 1.1520514883346742, + "grad_norm": 17.63590431213379, + "learning_rate": 1.4916666666666669e-06, + "loss": 0.1369, + "num_input_tokens_seen": 1246864, + "step": 179 + }, + { + "epoch": 1.1584875301689461, + "grad_norm": 12.465302467346191, + "learning_rate": 1.5e-06, + "loss": 0.1632, + "num_input_tokens_seen": 1253936, + "step": 180 + }, + { + "epoch": 1.164923572003218, + "grad_norm": 18.099266052246094, + "learning_rate": 1.5083333333333336e-06, + "loss": 0.1734, + "num_input_tokens_seen": 1261120, + "step": 181 + }, + { + "epoch": 1.17135961383749, + "grad_norm": 12.134090423583984, + "learning_rate": 1.5166666666666668e-06, + "loss": 0.135, + "num_input_tokens_seen": 1268208, + "step": 182 + }, + { + "epoch": 1.177795655671762, + "grad_norm": 5.747508525848389, + "learning_rate": 1.525e-06, + "loss": 0.1355, + "num_input_tokens_seen": 1275296, + "step": 183 + }, + { + "epoch": 1.1842316975060339, + "grad_norm": 16.193449020385742, + "learning_rate": 1.5333333333333334e-06, + "loss": 0.1324, + "num_input_tokens_seen": 1282320, + "step": 184 + }, + { + "epoch": 1.1906677393403058, + "grad_norm": 23.576427459716797, + "learning_rate": 1.5416666666666668e-06, + "loss": 0.1754, + "num_input_tokens_seen": 1289008, + "step": 185 + }, + { + "epoch": 1.1971037811745777, + "grad_norm": 4.542221546173096, + "learning_rate": 1.5500000000000002e-06, + "loss": 0.1484, + "num_input_tokens_seen": 1296208, + "step": 186 + }, + { + "epoch": 1.2035398230088497, + "grad_norm": 6.084584712982178, + "learning_rate": 1.5583333333333334e-06, + "loss": 0.1315, + "num_input_tokens_seen": 1303072, + "step": 187 + }, + { + "epoch": 1.2099758648431216, + "grad_norm": 18.8467960357666, + "learning_rate": 1.566666666666667e-06, + "loss": 0.1665, + "num_input_tokens_seen": 1310320, + "step": 188 + }, + { + "epoch": 1.2164119066773935, + "grad_norm": 6.79512882232666, + "learning_rate": 1.5750000000000002e-06, + "loss": 0.1406, + "num_input_tokens_seen": 1317728, + "step": 189 + }, + { + "epoch": 1.2228479485116655, + "grad_norm": 11.130036354064941, + "learning_rate": 1.5833333333333333e-06, + "loss": 0.1391, + "num_input_tokens_seen": 1325216, + "step": 190 + }, + { + "epoch": 1.2292839903459372, + "grad_norm": 17.00998306274414, + "learning_rate": 1.591666666666667e-06, + "loss": 0.1339, + "num_input_tokens_seen": 1332272, + "step": 191 + }, + { + "epoch": 1.235720032180209, + "grad_norm": 16.623762130737305, + "learning_rate": 1.6000000000000001e-06, + "loss": 0.1613, + "num_input_tokens_seen": 1339008, + "step": 192 + }, + { + "epoch": 1.242156074014481, + "grad_norm": 15.660219192504883, + "learning_rate": 1.6083333333333333e-06, + "loss": 0.1274, + "num_input_tokens_seen": 1345664, + "step": 193 + }, + { + "epoch": 1.248592115848753, + "grad_norm": 21.379770278930664, + "learning_rate": 1.6166666666666667e-06, + "loss": 0.1882, + "num_input_tokens_seen": 1352720, + "step": 194 + }, + { + "epoch": 1.255028157683025, + "grad_norm": 8.196439743041992, + "learning_rate": 1.6250000000000001e-06, + "loss": 0.1106, + "num_input_tokens_seen": 1359616, + "step": 195 + }, + { + "epoch": 1.2614641995172968, + "grad_norm": 4.444194793701172, + "learning_rate": 1.6333333333333335e-06, + "loss": 0.1249, + "num_input_tokens_seen": 1366656, + "step": 196 + }, + { + "epoch": 1.2679002413515688, + "grad_norm": 10.585016250610352, + "learning_rate": 1.6416666666666667e-06, + "loss": 0.1499, + "num_input_tokens_seen": 1373904, + "step": 197 + }, + { + "epoch": 1.2743362831858407, + "grad_norm": 18.406293869018555, + "learning_rate": 1.6500000000000003e-06, + "loss": 0.1512, + "num_input_tokens_seen": 1380528, + "step": 198 + }, + { + "epoch": 1.2807723250201126, + "grad_norm": 5.323694229125977, + "learning_rate": 1.6583333333333335e-06, + "loss": 0.1166, + "num_input_tokens_seen": 1386912, + "step": 199 + }, + { + "epoch": 1.2872083668543846, + "grad_norm": 20.726289749145508, + "learning_rate": 1.6666666666666667e-06, + "loss": 0.21, + "num_input_tokens_seen": 1393648, + "step": 200 + }, + { + "epoch": 1.2936444086886565, + "grad_norm": 24.05786895751953, + "learning_rate": 1.6750000000000003e-06, + "loss": 0.1915, + "num_input_tokens_seen": 1400640, + "step": 201 + }, + { + "epoch": 1.3000804505229284, + "grad_norm": 19.30237579345703, + "learning_rate": 1.6833333333333335e-06, + "loss": 0.1911, + "num_input_tokens_seen": 1407984, + "step": 202 + }, + { + "epoch": 1.3065164923572004, + "grad_norm": 6.517977714538574, + "learning_rate": 1.6916666666666666e-06, + "loss": 0.1487, + "num_input_tokens_seen": 1414672, + "step": 203 + }, + { + "epoch": 1.3129525341914723, + "grad_norm": 30.81540870666504, + "learning_rate": 1.7000000000000002e-06, + "loss": 0.2154, + "num_input_tokens_seen": 1421872, + "step": 204 + }, + { + "epoch": 1.3193885760257442, + "grad_norm": 44.00107955932617, + "learning_rate": 1.7083333333333334e-06, + "loss": 0.2909, + "num_input_tokens_seen": 1428640, + "step": 205 + }, + { + "epoch": 1.3258246178600162, + "grad_norm": 41.464210510253906, + "learning_rate": 1.7166666666666668e-06, + "loss": 0.271, + "num_input_tokens_seen": 1435456, + "step": 206 + }, + { + "epoch": 1.332260659694288, + "grad_norm": 12.14904499053955, + "learning_rate": 1.725e-06, + "loss": 0.1616, + "num_input_tokens_seen": 1442592, + "step": 207 + }, + { + "epoch": 1.33869670152856, + "grad_norm": 8.393083572387695, + "learning_rate": 1.7333333333333336e-06, + "loss": 0.1427, + "num_input_tokens_seen": 1449200, + "step": 208 + }, + { + "epoch": 1.3451327433628317, + "grad_norm": 11.04562759399414, + "learning_rate": 1.7416666666666668e-06, + "loss": 0.1602, + "num_input_tokens_seen": 1455920, + "step": 209 + }, + { + "epoch": 1.3515687851971037, + "grad_norm": 12.494465827941895, + "learning_rate": 1.75e-06, + "loss": 0.169, + "num_input_tokens_seen": 1462624, + "step": 210 + }, + { + "epoch": 1.3580048270313756, + "grad_norm": 5.395782470703125, + "learning_rate": 1.7583333333333336e-06, + "loss": 0.1285, + "num_input_tokens_seen": 1469520, + "step": 211 + }, + { + "epoch": 1.3644408688656475, + "grad_norm": 19.773469924926758, + "learning_rate": 1.7666666666666668e-06, + "loss": 0.1636, + "num_input_tokens_seen": 1476592, + "step": 212 + }, + { + "epoch": 1.3708769106999195, + "grad_norm": 28.318584442138672, + "learning_rate": 1.7750000000000002e-06, + "loss": 0.1702, + "num_input_tokens_seen": 1483632, + "step": 213 + }, + { + "epoch": 1.3773129525341914, + "grad_norm": 20.225502014160156, + "learning_rate": 1.7833333333333336e-06, + "loss": 0.1562, + "num_input_tokens_seen": 1490528, + "step": 214 + }, + { + "epoch": 1.3837489943684633, + "grad_norm": 5.386298179626465, + "learning_rate": 1.7916666666666667e-06, + "loss": 0.1537, + "num_input_tokens_seen": 1497648, + "step": 215 + }, + { + "epoch": 1.3901850362027353, + "grad_norm": 6.181918144226074, + "learning_rate": 1.8000000000000001e-06, + "loss": 0.1114, + "num_input_tokens_seen": 1504800, + "step": 216 + }, + { + "epoch": 1.3966210780370072, + "grad_norm": 5.554294109344482, + "learning_rate": 1.8083333333333335e-06, + "loss": 0.1017, + "num_input_tokens_seen": 1512240, + "step": 217 + }, + { + "epoch": 1.4030571198712791, + "grad_norm": 5.2657880783081055, + "learning_rate": 1.816666666666667e-06, + "loss": 0.1184, + "num_input_tokens_seen": 1519200, + "step": 218 + }, + { + "epoch": 1.409493161705551, + "grad_norm": 8.627300262451172, + "learning_rate": 1.825e-06, + "loss": 0.1343, + "num_input_tokens_seen": 1526272, + "step": 219 + }, + { + "epoch": 1.415929203539823, + "grad_norm": 7.965896129608154, + "learning_rate": 1.8333333333333333e-06, + "loss": 0.1271, + "num_input_tokens_seen": 1533440, + "step": 220 + }, + { + "epoch": 1.422365245374095, + "grad_norm": 7.089397430419922, + "learning_rate": 1.8416666666666669e-06, + "loss": 0.1383, + "num_input_tokens_seen": 1540272, + "step": 221 + }, + { + "epoch": 1.4288012872083669, + "grad_norm": 4.354486465454102, + "learning_rate": 1.85e-06, + "loss": 0.1558, + "num_input_tokens_seen": 1547632, + "step": 222 + }, + { + "epoch": 1.4352373290426388, + "grad_norm": 7.841838836669922, + "learning_rate": 1.8583333333333335e-06, + "loss": 0.1312, + "num_input_tokens_seen": 1554608, + "step": 223 + }, + { + "epoch": 1.4416733708769107, + "grad_norm": 6.812905311584473, + "learning_rate": 1.8666666666666669e-06, + "loss": 0.1212, + "num_input_tokens_seen": 1561472, + "step": 224 + }, + { + "epoch": 1.4481094127111827, + "grad_norm": 5.038280963897705, + "learning_rate": 1.8750000000000003e-06, + "loss": 0.1342, + "num_input_tokens_seen": 1568496, + "step": 225 + }, + { + "epoch": 1.4545454545454546, + "grad_norm": 4.255394458770752, + "learning_rate": 1.8833333333333334e-06, + "loss": 0.096, + "num_input_tokens_seen": 1575184, + "step": 226 + }, + { + "epoch": 1.4609814963797265, + "grad_norm": 3.311915397644043, + "learning_rate": 1.8916666666666668e-06, + "loss": 0.0982, + "num_input_tokens_seen": 1582080, + "step": 227 + }, + { + "epoch": 1.4674175382139985, + "grad_norm": 4.303693771362305, + "learning_rate": 1.9000000000000002e-06, + "loss": 0.1099, + "num_input_tokens_seen": 1588688, + "step": 228 + }, + { + "epoch": 1.4738535800482704, + "grad_norm": 14.854019165039062, + "learning_rate": 1.9083333333333334e-06, + "loss": 0.1265, + "num_input_tokens_seen": 1595216, + "step": 229 + }, + { + "epoch": 1.4802896218825423, + "grad_norm": 10.509958267211914, + "learning_rate": 1.916666666666667e-06, + "loss": 0.1066, + "num_input_tokens_seen": 1602336, + "step": 230 + }, + { + "epoch": 1.4867256637168142, + "grad_norm": 9.096975326538086, + "learning_rate": 1.925e-06, + "loss": 0.1593, + "num_input_tokens_seen": 1609024, + "step": 231 + }, + { + "epoch": 1.4931617055510862, + "grad_norm": 18.944650650024414, + "learning_rate": 1.9333333333333336e-06, + "loss": 0.1891, + "num_input_tokens_seen": 1615712, + "step": 232 + }, + { + "epoch": 1.4995977473853581, + "grad_norm": 6.735738754272461, + "learning_rate": 1.9416666666666666e-06, + "loss": 0.0867, + "num_input_tokens_seen": 1622608, + "step": 233 + }, + { + "epoch": 1.50603378921963, + "grad_norm": 12.395522117614746, + "learning_rate": 1.9500000000000004e-06, + "loss": 0.1286, + "num_input_tokens_seen": 1629520, + "step": 234 + }, + { + "epoch": 1.512469831053902, + "grad_norm": 13.864114761352539, + "learning_rate": 1.9583333333333334e-06, + "loss": 0.1262, + "num_input_tokens_seen": 1636320, + "step": 235 + }, + { + "epoch": 1.518905872888174, + "grad_norm": 4.206810474395752, + "learning_rate": 1.9666666666666668e-06, + "loss": 0.0878, + "num_input_tokens_seen": 1643216, + "step": 236 + }, + { + "epoch": 1.5253419147224458, + "grad_norm": 9.294787406921387, + "learning_rate": 1.975e-06, + "loss": 0.1532, + "num_input_tokens_seen": 1650256, + "step": 237 + }, + { + "epoch": 1.5317779565567178, + "grad_norm": 5.397519111633301, + "learning_rate": 1.9833333333333335e-06, + "loss": 0.1232, + "num_input_tokens_seen": 1657328, + "step": 238 + }, + { + "epoch": 1.5382139983909895, + "grad_norm": 4.74614953994751, + "learning_rate": 1.991666666666667e-06, + "loss": 0.1119, + "num_input_tokens_seen": 1664192, + "step": 239 + }, + { + "epoch": 1.5446500402252614, + "grad_norm": 8.80385971069336, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.1334, + "num_input_tokens_seen": 1670944, + "step": 240 + }, + { + "epoch": 1.5510860820595334, + "grad_norm": 12.17174243927002, + "learning_rate": 2.0083333333333337e-06, + "loss": 0.1224, + "num_input_tokens_seen": 1677792, + "step": 241 + }, + { + "epoch": 1.5575221238938053, + "grad_norm": 6.9399800300598145, + "learning_rate": 2.0166666666666667e-06, + "loss": 0.106, + "num_input_tokens_seen": 1684640, + "step": 242 + }, + { + "epoch": 1.5639581657280772, + "grad_norm": 5.804976463317871, + "learning_rate": 2.025e-06, + "loss": 0.1237, + "num_input_tokens_seen": 1691664, + "step": 243 + }, + { + "epoch": 1.5703942075623492, + "grad_norm": 5.245293617248535, + "learning_rate": 2.0333333333333335e-06, + "loss": 0.095, + "num_input_tokens_seen": 1698528, + "step": 244 + }, + { + "epoch": 1.576830249396621, + "grad_norm": 2.9305763244628906, + "learning_rate": 2.041666666666667e-06, + "loss": 0.0741, + "num_input_tokens_seen": 1705600, + "step": 245 + }, + { + "epoch": 1.583266291230893, + "grad_norm": 10.269381523132324, + "learning_rate": 2.05e-06, + "loss": 0.1239, + "num_input_tokens_seen": 1712704, + "step": 246 + }, + { + "epoch": 1.589702333065165, + "grad_norm": 4.453558921813965, + "learning_rate": 2.0583333333333337e-06, + "loss": 0.091, + "num_input_tokens_seen": 1719568, + "step": 247 + }, + { + "epoch": 1.5961383748994369, + "grad_norm": 16.549911499023438, + "learning_rate": 2.0666666666666666e-06, + "loss": 0.1403, + "num_input_tokens_seen": 1726480, + "step": 248 + }, + { + "epoch": 1.6025744167337088, + "grad_norm": 17.650426864624023, + "learning_rate": 2.075e-06, + "loss": 0.1638, + "num_input_tokens_seen": 1733936, + "step": 249 + }, + { + "epoch": 1.6090104585679805, + "grad_norm": 5.322378158569336, + "learning_rate": 2.0833333333333334e-06, + "loss": 0.1343, + "num_input_tokens_seen": 1741008, + "step": 250 + }, + { + "epoch": 1.6154465004022525, + "grad_norm": 11.570721626281738, + "learning_rate": 2.091666666666667e-06, + "loss": 0.1558, + "num_input_tokens_seen": 1748240, + "step": 251 + }, + { + "epoch": 1.6218825422365244, + "grad_norm": 2.901578426361084, + "learning_rate": 2.1000000000000002e-06, + "loss": 0.0809, + "num_input_tokens_seen": 1755072, + "step": 252 + }, + { + "epoch": 1.6283185840707963, + "grad_norm": 8.972208023071289, + "learning_rate": 2.1083333333333336e-06, + "loss": 0.1435, + "num_input_tokens_seen": 1762048, + "step": 253 + }, + { + "epoch": 1.6347546259050683, + "grad_norm": 2.364783525466919, + "learning_rate": 2.116666666666667e-06, + "loss": 0.0887, + "num_input_tokens_seen": 1769200, + "step": 254 + }, + { + "epoch": 1.6411906677393402, + "grad_norm": 3.7692675590515137, + "learning_rate": 2.125e-06, + "loss": 0.1038, + "num_input_tokens_seen": 1776112, + "step": 255 + }, + { + "epoch": 1.6476267095736121, + "grad_norm": 3.0572264194488525, + "learning_rate": 2.133333333333334e-06, + "loss": 0.0889, + "num_input_tokens_seen": 1783664, + "step": 256 + }, + { + "epoch": 1.654062751407884, + "grad_norm": 3.8316140174865723, + "learning_rate": 2.1416666666666668e-06, + "loss": 0.0751, + "num_input_tokens_seen": 1790096, + "step": 257 + }, + { + "epoch": 1.660498793242156, + "grad_norm": 5.133974552154541, + "learning_rate": 2.15e-06, + "loss": 0.0921, + "num_input_tokens_seen": 1796912, + "step": 258 + }, + { + "epoch": 1.666934835076428, + "grad_norm": 5.002286911010742, + "learning_rate": 2.1583333333333336e-06, + "loss": 0.1102, + "num_input_tokens_seen": 1804144, + "step": 259 + }, + { + "epoch": 1.6733708769106999, + "grad_norm": 8.221644401550293, + "learning_rate": 2.166666666666667e-06, + "loss": 0.1036, + "num_input_tokens_seen": 1811040, + "step": 260 + }, + { + "epoch": 1.6798069187449718, + "grad_norm": 6.029963493347168, + "learning_rate": 2.1750000000000004e-06, + "loss": 0.1093, + "num_input_tokens_seen": 1818064, + "step": 261 + }, + { + "epoch": 1.6862429605792437, + "grad_norm": 6.715224742889404, + "learning_rate": 2.1833333333333333e-06, + "loss": 0.1714, + "num_input_tokens_seen": 1825056, + "step": 262 + }, + { + "epoch": 1.6926790024135157, + "grad_norm": 6.136181354522705, + "learning_rate": 2.191666666666667e-06, + "loss": 0.1007, + "num_input_tokens_seen": 1831968, + "step": 263 + }, + { + "epoch": 1.6991150442477876, + "grad_norm": 5.392821788787842, + "learning_rate": 2.2e-06, + "loss": 0.109, + "num_input_tokens_seen": 1838656, + "step": 264 + }, + { + "epoch": 1.7055510860820595, + "grad_norm": 3.0743072032928467, + "learning_rate": 2.2083333333333335e-06, + "loss": 0.0574, + "num_input_tokens_seen": 1845760, + "step": 265 + }, + { + "epoch": 1.7119871279163315, + "grad_norm": 4.986932277679443, + "learning_rate": 2.216666666666667e-06, + "loss": 0.0697, + "num_input_tokens_seen": 1852480, + "step": 266 + }, + { + "epoch": 1.7184231697506034, + "grad_norm": 3.588496685028076, + "learning_rate": 2.2250000000000003e-06, + "loss": 0.1188, + "num_input_tokens_seen": 1859312, + "step": 267 + }, + { + "epoch": 1.7248592115848753, + "grad_norm": 3.850637912750244, + "learning_rate": 2.2333333333333333e-06, + "loss": 0.0998, + "num_input_tokens_seen": 1866256, + "step": 268 + }, + { + "epoch": 1.7312952534191473, + "grad_norm": 10.427441596984863, + "learning_rate": 2.2416666666666667e-06, + "loss": 0.1083, + "num_input_tokens_seen": 1873104, + "step": 269 + }, + { + "epoch": 1.7377312952534192, + "grad_norm": 6.516834259033203, + "learning_rate": 2.25e-06, + "loss": 0.0749, + "num_input_tokens_seen": 1880192, + "step": 270 + }, + { + "epoch": 1.7441673370876911, + "grad_norm": 5.243050575256348, + "learning_rate": 2.2583333333333335e-06, + "loss": 0.0771, + "num_input_tokens_seen": 1887008, + "step": 271 + }, + { + "epoch": 1.750603378921963, + "grad_norm": 3.874545097351074, + "learning_rate": 2.266666666666667e-06, + "loss": 0.0646, + "num_input_tokens_seen": 1894096, + "step": 272 + }, + { + "epoch": 1.757039420756235, + "grad_norm": 4.2995476722717285, + "learning_rate": 2.2750000000000002e-06, + "loss": 0.1147, + "num_input_tokens_seen": 1901216, + "step": 273 + }, + { + "epoch": 1.763475462590507, + "grad_norm": 9.720036506652832, + "learning_rate": 2.2833333333333336e-06, + "loss": 0.0917, + "num_input_tokens_seen": 1908160, + "step": 274 + }, + { + "epoch": 1.7699115044247788, + "grad_norm": 7.985558986663818, + "learning_rate": 2.2916666666666666e-06, + "loss": 0.106, + "num_input_tokens_seen": 1915104, + "step": 275 + }, + { + "epoch": 1.7763475462590508, + "grad_norm": 4.0768327713012695, + "learning_rate": 2.3000000000000004e-06, + "loss": 0.0849, + "num_input_tokens_seen": 1922128, + "step": 276 + }, + { + "epoch": 1.7827835880933227, + "grad_norm": 5.870975017547607, + "learning_rate": 2.3083333333333334e-06, + "loss": 0.1074, + "num_input_tokens_seen": 1929200, + "step": 277 + }, + { + "epoch": 1.7892196299275946, + "grad_norm": 3.490455389022827, + "learning_rate": 2.316666666666667e-06, + "loss": 0.0981, + "num_input_tokens_seen": 1936144, + "step": 278 + }, + { + "epoch": 1.7956556717618666, + "grad_norm": 4.1171183586120605, + "learning_rate": 2.325e-06, + "loss": 0.1008, + "num_input_tokens_seen": 1943136, + "step": 279 + }, + { + "epoch": 1.8020917135961385, + "grad_norm": 7.664264678955078, + "learning_rate": 2.3333333333333336e-06, + "loss": 0.1032, + "num_input_tokens_seen": 1950208, + "step": 280 + }, + { + "epoch": 1.8085277554304104, + "grad_norm": 4.865798473358154, + "learning_rate": 2.341666666666667e-06, + "loss": 0.0711, + "num_input_tokens_seen": 1957056, + "step": 281 + }, + { + "epoch": 1.8149637972646824, + "grad_norm": 2.5436036586761475, + "learning_rate": 2.35e-06, + "loss": 0.0901, + "num_input_tokens_seen": 1964176, + "step": 282 + }, + { + "epoch": 1.8213998390989543, + "grad_norm": 6.305140972137451, + "learning_rate": 2.3583333333333338e-06, + "loss": 0.0847, + "num_input_tokens_seen": 1970736, + "step": 283 + }, + { + "epoch": 1.827835880933226, + "grad_norm": 2.6688449382781982, + "learning_rate": 2.3666666666666667e-06, + "loss": 0.0752, + "num_input_tokens_seen": 1977440, + "step": 284 + }, + { + "epoch": 1.834271922767498, + "grad_norm": 2.5124077796936035, + "learning_rate": 2.375e-06, + "loss": 0.068, + "num_input_tokens_seen": 1984464, + "step": 285 + }, + { + "epoch": 1.8407079646017699, + "grad_norm": 6.168980121612549, + "learning_rate": 2.3833333333333335e-06, + "loss": 0.1088, + "num_input_tokens_seen": 1991248, + "step": 286 + }, + { + "epoch": 1.8471440064360418, + "grad_norm": 5.883851051330566, + "learning_rate": 2.391666666666667e-06, + "loss": 0.1017, + "num_input_tokens_seen": 1998496, + "step": 287 + }, + { + "epoch": 1.8535800482703138, + "grad_norm": 9.373373985290527, + "learning_rate": 2.4000000000000003e-06, + "loss": 0.13, + "num_input_tokens_seen": 2005552, + "step": 288 + }, + { + "epoch": 1.8600160901045857, + "grad_norm": 9.111586570739746, + "learning_rate": 2.4083333333333337e-06, + "loss": 0.0998, + "num_input_tokens_seen": 2012272, + "step": 289 + }, + { + "epoch": 1.8664521319388576, + "grad_norm": 5.353252410888672, + "learning_rate": 2.4166666666666667e-06, + "loss": 0.0779, + "num_input_tokens_seen": 2019056, + "step": 290 + }, + { + "epoch": 1.8728881737731295, + "grad_norm": 6.586206436157227, + "learning_rate": 2.425e-06, + "loss": 0.0907, + "num_input_tokens_seen": 2025760, + "step": 291 + }, + { + "epoch": 1.8793242156074015, + "grad_norm": 5.485732555389404, + "learning_rate": 2.4333333333333335e-06, + "loss": 0.0911, + "num_input_tokens_seen": 2032928, + "step": 292 + }, + { + "epoch": 1.8857602574416734, + "grad_norm": 3.5151724815368652, + "learning_rate": 2.441666666666667e-06, + "loss": 0.0987, + "num_input_tokens_seen": 2039856, + "step": 293 + }, + { + "epoch": 1.8921962992759453, + "grad_norm": 3.680494546890259, + "learning_rate": 2.4500000000000003e-06, + "loss": 0.1254, + "num_input_tokens_seen": 2046896, + "step": 294 + }, + { + "epoch": 1.898632341110217, + "grad_norm": 3.302248001098633, + "learning_rate": 2.4583333333333332e-06, + "loss": 0.0494, + "num_input_tokens_seen": 2053600, + "step": 295 + }, + { + "epoch": 1.905068382944489, + "grad_norm": 3.605039119720459, + "learning_rate": 2.466666666666667e-06, + "loss": 0.1082, + "num_input_tokens_seen": 2060240, + "step": 296 + }, + { + "epoch": 1.911504424778761, + "grad_norm": 2.6599857807159424, + "learning_rate": 2.475e-06, + "loss": 0.0785, + "num_input_tokens_seen": 2067936, + "step": 297 + }, + { + "epoch": 1.9179404666130329, + "grad_norm": 7.149720191955566, + "learning_rate": 2.4833333333333334e-06, + "loss": 0.1026, + "num_input_tokens_seen": 2074656, + "step": 298 + }, + { + "epoch": 1.9243765084473048, + "grad_norm": 4.549108982086182, + "learning_rate": 2.491666666666667e-06, + "loss": 0.0617, + "num_input_tokens_seen": 2081568, + "step": 299 + }, + { + "epoch": 1.9308125502815767, + "grad_norm": 2.900601625442505, + "learning_rate": 2.5e-06, + "loss": 0.0659, + "num_input_tokens_seen": 2088368, + "step": 300 + }, + { + "epoch": 1.9372485921158487, + "grad_norm": 6.378200531005859, + "learning_rate": 2.5083333333333336e-06, + "loss": 0.088, + "num_input_tokens_seen": 2095728, + "step": 301 + }, + { + "epoch": 1.9436846339501206, + "grad_norm": 6.718885898590088, + "learning_rate": 2.5166666666666666e-06, + "loss": 0.0771, + "num_input_tokens_seen": 2103104, + "step": 302 + }, + { + "epoch": 1.9501206757843925, + "grad_norm": 3.587820291519165, + "learning_rate": 2.5250000000000004e-06, + "loss": 0.0642, + "num_input_tokens_seen": 2110032, + "step": 303 + }, + { + "epoch": 1.9565567176186645, + "grad_norm": 7.106460094451904, + "learning_rate": 2.5333333333333338e-06, + "loss": 0.0947, + "num_input_tokens_seen": 2117056, + "step": 304 + }, + { + "epoch": 1.9629927594529364, + "grad_norm": 3.480973243713379, + "learning_rate": 2.5416666666666668e-06, + "loss": 0.0975, + "num_input_tokens_seen": 2123552, + "step": 305 + }, + { + "epoch": 1.9694288012872083, + "grad_norm": 2.709892511367798, + "learning_rate": 2.55e-06, + "loss": 0.0527, + "num_input_tokens_seen": 2130128, + "step": 306 + }, + { + "epoch": 1.9758648431214803, + "grad_norm": 3.3756306171417236, + "learning_rate": 2.558333333333334e-06, + "loss": 0.0869, + "num_input_tokens_seen": 2137232, + "step": 307 + }, + { + "epoch": 1.9823008849557522, + "grad_norm": 6.785555839538574, + "learning_rate": 2.566666666666667e-06, + "loss": 0.0605, + "num_input_tokens_seen": 2143776, + "step": 308 + }, + { + "epoch": 1.9887369267900241, + "grad_norm": 3.4628372192382812, + "learning_rate": 2.5750000000000003e-06, + "loss": 0.0684, + "num_input_tokens_seen": 2150976, + "step": 309 + }, + { + "epoch": 1.995172968624296, + "grad_norm": 3.56925892829895, + "learning_rate": 2.5833333333333337e-06, + "loss": 0.0701, + "num_input_tokens_seen": 2158080, + "step": 310 + }, + { + "epoch": 2.001609010458568, + "grad_norm": 4.06324577331543, + "learning_rate": 2.5916666666666667e-06, + "loss": 0.0699, + "num_input_tokens_seen": 2164992, + "step": 311 + }, + { + "epoch": 2.00804505229284, + "grad_norm": 7.733395576477051, + "learning_rate": 2.6e-06, + "loss": 0.0949, + "num_input_tokens_seen": 2171952, + "step": 312 + }, + { + "epoch": 2.014481094127112, + "grad_norm": 7.6149139404296875, + "learning_rate": 2.608333333333333e-06, + "loss": 0.0911, + "num_input_tokens_seen": 2179072, + "step": 313 + }, + { + "epoch": 2.020917135961384, + "grad_norm": 2.538379192352295, + "learning_rate": 2.616666666666667e-06, + "loss": 0.0615, + "num_input_tokens_seen": 2185872, + "step": 314 + }, + { + "epoch": 2.0273531777956557, + "grad_norm": 2.5334603786468506, + "learning_rate": 2.6250000000000003e-06, + "loss": 0.0448, + "num_input_tokens_seen": 2192656, + "step": 315 + }, + { + "epoch": 2.0337892196299276, + "grad_norm": 4.8344340324401855, + "learning_rate": 2.6333333333333332e-06, + "loss": 0.0619, + "num_input_tokens_seen": 2199728, + "step": 316 + }, + { + "epoch": 2.0402252614641996, + "grad_norm": 4.393861770629883, + "learning_rate": 2.6416666666666666e-06, + "loss": 0.0475, + "num_input_tokens_seen": 2206608, + "step": 317 + }, + { + "epoch": 2.0466613032984715, + "grad_norm": 2.7922892570495605, + "learning_rate": 2.6500000000000005e-06, + "loss": 0.0438, + "num_input_tokens_seen": 2213856, + "step": 318 + }, + { + "epoch": 2.0530973451327434, + "grad_norm": 1.5408401489257812, + "learning_rate": 2.6583333333333334e-06, + "loss": 0.0245, + "num_input_tokens_seen": 2220528, + "step": 319 + }, + { + "epoch": 2.0595333869670154, + "grad_norm": 5.6088433265686035, + "learning_rate": 2.666666666666667e-06, + "loss": 0.0716, + "num_input_tokens_seen": 2227616, + "step": 320 + }, + { + "epoch": 2.0659694288012873, + "grad_norm": 9.311470985412598, + "learning_rate": 2.6750000000000002e-06, + "loss": 0.1015, + "num_input_tokens_seen": 2234304, + "step": 321 + }, + { + "epoch": 2.0724054706355592, + "grad_norm": 5.244096279144287, + "learning_rate": 2.683333333333333e-06, + "loss": 0.0753, + "num_input_tokens_seen": 2241088, + "step": 322 + }, + { + "epoch": 2.078841512469831, + "grad_norm": 3.443998098373413, + "learning_rate": 2.691666666666667e-06, + "loss": 0.0521, + "num_input_tokens_seen": 2247632, + "step": 323 + }, + { + "epoch": 2.085277554304103, + "grad_norm": 2.4997072219848633, + "learning_rate": 2.7000000000000004e-06, + "loss": 0.0287, + "num_input_tokens_seen": 2254448, + "step": 324 + }, + { + "epoch": 2.091713596138375, + "grad_norm": 4.817678928375244, + "learning_rate": 2.7083333333333334e-06, + "loss": 0.0471, + "num_input_tokens_seen": 2261424, + "step": 325 + }, + { + "epoch": 2.098149637972647, + "grad_norm": 6.326369285583496, + "learning_rate": 2.7166666666666668e-06, + "loss": 0.0697, + "num_input_tokens_seen": 2268528, + "step": 326 + }, + { + "epoch": 2.104585679806919, + "grad_norm": 3.599905490875244, + "learning_rate": 2.7250000000000006e-06, + "loss": 0.0438, + "num_input_tokens_seen": 2275328, + "step": 327 + }, + { + "epoch": 2.111021721641191, + "grad_norm": 2.8037264347076416, + "learning_rate": 2.7333333333333336e-06, + "loss": 0.0475, + "num_input_tokens_seen": 2282400, + "step": 328 + }, + { + "epoch": 2.1174577634754628, + "grad_norm": 2.7425622940063477, + "learning_rate": 2.741666666666667e-06, + "loss": 0.0601, + "num_input_tokens_seen": 2289312, + "step": 329 + }, + { + "epoch": 2.1238938053097347, + "grad_norm": 2.064824342727661, + "learning_rate": 2.7500000000000004e-06, + "loss": 0.0355, + "num_input_tokens_seen": 2295824, + "step": 330 + }, + { + "epoch": 2.1303298471440066, + "grad_norm": 3.695521593093872, + "learning_rate": 2.7583333333333333e-06, + "loss": 0.0515, + "num_input_tokens_seen": 2303024, + "step": 331 + }, + { + "epoch": 2.136765888978278, + "grad_norm": 3.3290112018585205, + "learning_rate": 2.766666666666667e-06, + "loss": 0.0601, + "num_input_tokens_seen": 2309904, + "step": 332 + }, + { + "epoch": 2.14320193081255, + "grad_norm": 2.751953363418579, + "learning_rate": 2.7750000000000005e-06, + "loss": 0.0288, + "num_input_tokens_seen": 2316416, + "step": 333 + }, + { + "epoch": 2.149637972646822, + "grad_norm": 4.679827690124512, + "learning_rate": 2.7833333333333335e-06, + "loss": 0.0563, + "num_input_tokens_seen": 2323088, + "step": 334 + }, + { + "epoch": 2.156074014481094, + "grad_norm": 9.301896095275879, + "learning_rate": 2.791666666666667e-06, + "loss": 0.1176, + "num_input_tokens_seen": 2329968, + "step": 335 + }, + { + "epoch": 2.162510056315366, + "grad_norm": 6.16165828704834, + "learning_rate": 2.8000000000000003e-06, + "loss": 0.0965, + "num_input_tokens_seen": 2336656, + "step": 336 + }, + { + "epoch": 2.168946098149638, + "grad_norm": 2.442518711090088, + "learning_rate": 2.8083333333333333e-06, + "loss": 0.0359, + "num_input_tokens_seen": 2343984, + "step": 337 + }, + { + "epoch": 2.1753821399839097, + "grad_norm": 3.537282943725586, + "learning_rate": 2.816666666666667e-06, + "loss": 0.0609, + "num_input_tokens_seen": 2350912, + "step": 338 + }, + { + "epoch": 2.1818181818181817, + "grad_norm": 5.1499223709106445, + "learning_rate": 2.825e-06, + "loss": 0.0768, + "num_input_tokens_seen": 2357680, + "step": 339 + }, + { + "epoch": 2.1882542236524536, + "grad_norm": 8.193970680236816, + "learning_rate": 2.8333333333333335e-06, + "loss": 0.0849, + "num_input_tokens_seen": 2364736, + "step": 340 + }, + { + "epoch": 2.1946902654867255, + "grad_norm": 2.2035670280456543, + "learning_rate": 2.841666666666667e-06, + "loss": 0.0581, + "num_input_tokens_seen": 2371568, + "step": 341 + }, + { + "epoch": 2.2011263073209975, + "grad_norm": 2.7924435138702393, + "learning_rate": 2.85e-06, + "loss": 0.046, + "num_input_tokens_seen": 2378384, + "step": 342 + }, + { + "epoch": 2.2075623491552694, + "grad_norm": 4.6174445152282715, + "learning_rate": 2.8583333333333336e-06, + "loss": 0.0674, + "num_input_tokens_seen": 2385584, + "step": 343 + }, + { + "epoch": 2.2139983909895413, + "grad_norm": 2.4459989070892334, + "learning_rate": 2.866666666666667e-06, + "loss": 0.0563, + "num_input_tokens_seen": 2392640, + "step": 344 + }, + { + "epoch": 2.2204344328238133, + "grad_norm": 2.3443846702575684, + "learning_rate": 2.875e-06, + "loss": 0.0621, + "num_input_tokens_seen": 2399936, + "step": 345 + }, + { + "epoch": 2.226870474658085, + "grad_norm": 2.865879774093628, + "learning_rate": 2.8833333333333334e-06, + "loss": 0.0659, + "num_input_tokens_seen": 2406928, + "step": 346 + }, + { + "epoch": 2.233306516492357, + "grad_norm": 4.03169059753418, + "learning_rate": 2.8916666666666672e-06, + "loss": 0.039, + "num_input_tokens_seen": 2413888, + "step": 347 + }, + { + "epoch": 2.239742558326629, + "grad_norm": 1.693605899810791, + "learning_rate": 2.9e-06, + "loss": 0.0239, + "num_input_tokens_seen": 2421104, + "step": 348 + }, + { + "epoch": 2.246178600160901, + "grad_norm": 2.7058444023132324, + "learning_rate": 2.9083333333333336e-06, + "loss": 0.0521, + "num_input_tokens_seen": 2428128, + "step": 349 + }, + { + "epoch": 2.252614641995173, + "grad_norm": 3.9503567218780518, + "learning_rate": 2.916666666666667e-06, + "loss": 0.0561, + "num_input_tokens_seen": 2434880, + "step": 350 + }, + { + "epoch": 2.259050683829445, + "grad_norm": 4.444098472595215, + "learning_rate": 2.925e-06, + "loss": 0.0622, + "num_input_tokens_seen": 2441824, + "step": 351 + }, + { + "epoch": 2.265486725663717, + "grad_norm": 3.7014055252075195, + "learning_rate": 2.9333333333333338e-06, + "loss": 0.0875, + "num_input_tokens_seen": 2448688, + "step": 352 + }, + { + "epoch": 2.2719227674979887, + "grad_norm": 4.078037261962891, + "learning_rate": 2.941666666666667e-06, + "loss": 0.0307, + "num_input_tokens_seen": 2455488, + "step": 353 + }, + { + "epoch": 2.2783588093322606, + "grad_norm": 3.753711700439453, + "learning_rate": 2.95e-06, + "loss": 0.063, + "num_input_tokens_seen": 2462240, + "step": 354 + }, + { + "epoch": 2.2847948511665326, + "grad_norm": 2.9653706550598145, + "learning_rate": 2.9583333333333335e-06, + "loss": 0.0404, + "num_input_tokens_seen": 2469408, + "step": 355 + }, + { + "epoch": 2.2912308930008045, + "grad_norm": 3.8090925216674805, + "learning_rate": 2.9666666666666673e-06, + "loss": 0.0759, + "num_input_tokens_seen": 2476240, + "step": 356 + }, + { + "epoch": 2.2976669348350764, + "grad_norm": 2.4684033393859863, + "learning_rate": 2.9750000000000003e-06, + "loss": 0.0488, + "num_input_tokens_seen": 2482864, + "step": 357 + }, + { + "epoch": 2.3041029766693484, + "grad_norm": 2.0687243938446045, + "learning_rate": 2.9833333333333337e-06, + "loss": 0.0499, + "num_input_tokens_seen": 2489664, + "step": 358 + }, + { + "epoch": 2.3105390185036203, + "grad_norm": 3.223965883255005, + "learning_rate": 2.991666666666667e-06, + "loss": 0.0441, + "num_input_tokens_seen": 2496704, + "step": 359 + }, + { + "epoch": 2.3169750603378922, + "grad_norm": 2.1407270431518555, + "learning_rate": 3e-06, + "loss": 0.0485, + "num_input_tokens_seen": 2503920, + "step": 360 + }, + { + "epoch": 2.323411102172164, + "grad_norm": 2.632885217666626, + "learning_rate": 3.0083333333333335e-06, + "loss": 0.0674, + "num_input_tokens_seen": 2510544, + "step": 361 + }, + { + "epoch": 2.329847144006436, + "grad_norm": 3.258030652999878, + "learning_rate": 3.0166666666666673e-06, + "loss": 0.0689, + "num_input_tokens_seen": 2517408, + "step": 362 + }, + { + "epoch": 2.336283185840708, + "grad_norm": 6.024159908294678, + "learning_rate": 3.0250000000000003e-06, + "loss": 0.0618, + "num_input_tokens_seen": 2524160, + "step": 363 + }, + { + "epoch": 2.34271922767498, + "grad_norm": 4.7281999588012695, + "learning_rate": 3.0333333333333337e-06, + "loss": 0.0629, + "num_input_tokens_seen": 2531072, + "step": 364 + }, + { + "epoch": 2.349155269509252, + "grad_norm": 4.178661823272705, + "learning_rate": 3.0416666666666666e-06, + "loss": 0.0499, + "num_input_tokens_seen": 2537920, + "step": 365 + }, + { + "epoch": 2.355591311343524, + "grad_norm": 1.5715197324752808, + "learning_rate": 3.05e-06, + "loss": 0.0361, + "num_input_tokens_seen": 2544736, + "step": 366 + }, + { + "epoch": 2.3620273531777958, + "grad_norm": 2.835855722427368, + "learning_rate": 3.058333333333334e-06, + "loss": 0.0471, + "num_input_tokens_seen": 2552016, + "step": 367 + }, + { + "epoch": 2.3684633950120677, + "grad_norm": 2.870889902114868, + "learning_rate": 3.066666666666667e-06, + "loss": 0.0622, + "num_input_tokens_seen": 2559616, + "step": 368 + }, + { + "epoch": 2.3748994368463396, + "grad_norm": 1.7411049604415894, + "learning_rate": 3.075e-06, + "loss": 0.0328, + "num_input_tokens_seen": 2566240, + "step": 369 + }, + { + "epoch": 2.3813354786806116, + "grad_norm": 3.0499918460845947, + "learning_rate": 3.0833333333333336e-06, + "loss": 0.0437, + "num_input_tokens_seen": 2573392, + "step": 370 + }, + { + "epoch": 2.3877715205148835, + "grad_norm": 4.242414474487305, + "learning_rate": 3.0916666666666666e-06, + "loss": 0.0644, + "num_input_tokens_seen": 2580544, + "step": 371 + }, + { + "epoch": 2.3942075623491554, + "grad_norm": 2.962906837463379, + "learning_rate": 3.1000000000000004e-06, + "loss": 0.0553, + "num_input_tokens_seen": 2587344, + "step": 372 + }, + { + "epoch": 2.4006436041834274, + "grad_norm": 4.431301116943359, + "learning_rate": 3.1083333333333338e-06, + "loss": 0.061, + "num_input_tokens_seen": 2594560, + "step": 373 + }, + { + "epoch": 2.4070796460176993, + "grad_norm": 5.075587272644043, + "learning_rate": 3.1166666666666668e-06, + "loss": 0.0866, + "num_input_tokens_seen": 2601408, + "step": 374 + }, + { + "epoch": 2.4135156878519712, + "grad_norm": 3.877520799636841, + "learning_rate": 3.125e-06, + "loss": 0.0632, + "num_input_tokens_seen": 2608624, + "step": 375 + }, + { + "epoch": 2.419951729686243, + "grad_norm": 2.9902503490448, + "learning_rate": 3.133333333333334e-06, + "loss": 0.0395, + "num_input_tokens_seen": 2615456, + "step": 376 + }, + { + "epoch": 2.426387771520515, + "grad_norm": 3.7800397872924805, + "learning_rate": 3.141666666666667e-06, + "loss": 0.0819, + "num_input_tokens_seen": 2622672, + "step": 377 + }, + { + "epoch": 2.432823813354787, + "grad_norm": 2.4674911499023438, + "learning_rate": 3.1500000000000003e-06, + "loss": 0.064, + "num_input_tokens_seen": 2629952, + "step": 378 + }, + { + "epoch": 2.439259855189059, + "grad_norm": 5.3331146240234375, + "learning_rate": 3.1583333333333337e-06, + "loss": 0.0803, + "num_input_tokens_seen": 2637168, + "step": 379 + }, + { + "epoch": 2.445695897023331, + "grad_norm": 9.950706481933594, + "learning_rate": 3.1666666666666667e-06, + "loss": 0.0798, + "num_input_tokens_seen": 2644144, + "step": 380 + }, + { + "epoch": 2.4521319388576024, + "grad_norm": 5.1734442710876465, + "learning_rate": 3.175e-06, + "loss": 0.0544, + "num_input_tokens_seen": 2651376, + "step": 381 + }, + { + "epoch": 2.4585679806918743, + "grad_norm": 2.5671188831329346, + "learning_rate": 3.183333333333334e-06, + "loss": 0.0629, + "num_input_tokens_seen": 2658336, + "step": 382 + }, + { + "epoch": 2.4650040225261463, + "grad_norm": 4.357182025909424, + "learning_rate": 3.191666666666667e-06, + "loss": 0.0471, + "num_input_tokens_seen": 2665360, + "step": 383 + }, + { + "epoch": 2.471440064360418, + "grad_norm": 4.694338321685791, + "learning_rate": 3.2000000000000003e-06, + "loss": 0.0533, + "num_input_tokens_seen": 2672704, + "step": 384 + }, + { + "epoch": 2.47787610619469, + "grad_norm": 2.391195774078369, + "learning_rate": 3.2083333333333337e-06, + "loss": 0.0542, + "num_input_tokens_seen": 2679872, + "step": 385 + }, + { + "epoch": 2.484312148028962, + "grad_norm": 3.859102249145508, + "learning_rate": 3.2166666666666666e-06, + "loss": 0.034, + "num_input_tokens_seen": 2686672, + "step": 386 + }, + { + "epoch": 2.490748189863234, + "grad_norm": 2.4710166454315186, + "learning_rate": 3.2250000000000005e-06, + "loss": 0.0517, + "num_input_tokens_seen": 2693520, + "step": 387 + }, + { + "epoch": 2.497184231697506, + "grad_norm": 3.309068202972412, + "learning_rate": 3.2333333333333334e-06, + "loss": 0.0698, + "num_input_tokens_seen": 2700432, + "step": 388 + }, + { + "epoch": 2.503620273531778, + "grad_norm": 4.21011209487915, + "learning_rate": 3.241666666666667e-06, + "loss": 0.0573, + "num_input_tokens_seen": 2707184, + "step": 389 + }, + { + "epoch": 2.51005631536605, + "grad_norm": 4.34623908996582, + "learning_rate": 3.2500000000000002e-06, + "loss": 0.0568, + "num_input_tokens_seen": 2713936, + "step": 390 + }, + { + "epoch": 2.5164923572003217, + "grad_norm": 3.361445188522339, + "learning_rate": 3.258333333333333e-06, + "loss": 0.0669, + "num_input_tokens_seen": 2721216, + "step": 391 + }, + { + "epoch": 2.5229283990345936, + "grad_norm": 2.091728925704956, + "learning_rate": 3.266666666666667e-06, + "loss": 0.027, + "num_input_tokens_seen": 2727968, + "step": 392 + }, + { + "epoch": 2.5293644408688656, + "grad_norm": 2.1977951526641846, + "learning_rate": 3.2750000000000004e-06, + "loss": 0.0303, + "num_input_tokens_seen": 2734816, + "step": 393 + }, + { + "epoch": 2.5358004827031375, + "grad_norm": 2.7409942150115967, + "learning_rate": 3.2833333333333334e-06, + "loss": 0.0392, + "num_input_tokens_seen": 2741744, + "step": 394 + }, + { + "epoch": 2.5422365245374094, + "grad_norm": 3.695770740509033, + "learning_rate": 3.2916666666666668e-06, + "loss": 0.0813, + "num_input_tokens_seen": 2748640, + "step": 395 + }, + { + "epoch": 2.5486725663716814, + "grad_norm": 3.674891471862793, + "learning_rate": 3.3000000000000006e-06, + "loss": 0.0403, + "num_input_tokens_seen": 2755888, + "step": 396 + }, + { + "epoch": 2.5551086082059533, + "grad_norm": 1.716131567955017, + "learning_rate": 3.3083333333333336e-06, + "loss": 0.0222, + "num_input_tokens_seen": 2762464, + "step": 397 + }, + { + "epoch": 2.5615446500402252, + "grad_norm": 2.5081095695495605, + "learning_rate": 3.316666666666667e-06, + "loss": 0.0611, + "num_input_tokens_seen": 2769712, + "step": 398 + }, + { + "epoch": 2.567980691874497, + "grad_norm": 1.9974850416183472, + "learning_rate": 3.3250000000000004e-06, + "loss": 0.035, + "num_input_tokens_seen": 2776736, + "step": 399 + }, + { + "epoch": 2.574416733708769, + "grad_norm": 4.233558177947998, + "learning_rate": 3.3333333333333333e-06, + "loss": 0.068, + "num_input_tokens_seen": 2783376, + "step": 400 + }, + { + "epoch": 2.580852775543041, + "grad_norm": 3.359081983566284, + "learning_rate": 3.341666666666667e-06, + "loss": 0.0543, + "num_input_tokens_seen": 2790528, + "step": 401 + }, + { + "epoch": 2.587288817377313, + "grad_norm": 2.669712543487549, + "learning_rate": 3.3500000000000005e-06, + "loss": 0.0466, + "num_input_tokens_seen": 2797312, + "step": 402 + }, + { + "epoch": 2.593724859211585, + "grad_norm": 3.1529603004455566, + "learning_rate": 3.3583333333333335e-06, + "loss": 0.0626, + "num_input_tokens_seen": 2804288, + "step": 403 + }, + { + "epoch": 2.600160901045857, + "grad_norm": 3.069842576980591, + "learning_rate": 3.366666666666667e-06, + "loss": 0.0589, + "num_input_tokens_seen": 2811456, + "step": 404 + }, + { + "epoch": 2.6065969428801288, + "grad_norm": 1.881988525390625, + "learning_rate": 3.3750000000000003e-06, + "loss": 0.0415, + "num_input_tokens_seen": 2818080, + "step": 405 + }, + { + "epoch": 2.6130329847144007, + "grad_norm": 1.862747073173523, + "learning_rate": 3.3833333333333333e-06, + "loss": 0.0344, + "num_input_tokens_seen": 2825136, + "step": 406 + }, + { + "epoch": 2.6194690265486726, + "grad_norm": 2.6847071647644043, + "learning_rate": 3.391666666666667e-06, + "loss": 0.0423, + "num_input_tokens_seen": 2832400, + "step": 407 + }, + { + "epoch": 2.6259050683829446, + "grad_norm": 3.631681203842163, + "learning_rate": 3.4000000000000005e-06, + "loss": 0.0838, + "num_input_tokens_seen": 2839712, + "step": 408 + }, + { + "epoch": 2.6323411102172165, + "grad_norm": 3.7878201007843018, + "learning_rate": 3.4083333333333335e-06, + "loss": 0.0732, + "num_input_tokens_seen": 2846160, + "step": 409 + }, + { + "epoch": 2.6387771520514884, + "grad_norm": 2.826582431793213, + "learning_rate": 3.416666666666667e-06, + "loss": 0.0464, + "num_input_tokens_seen": 2853520, + "step": 410 + }, + { + "epoch": 2.6452131938857604, + "grad_norm": 2.330638885498047, + "learning_rate": 3.4250000000000007e-06, + "loss": 0.0387, + "num_input_tokens_seen": 2860384, + "step": 411 + }, + { + "epoch": 2.6516492357200323, + "grad_norm": 2.330439567565918, + "learning_rate": 3.4333333333333336e-06, + "loss": 0.0507, + "num_input_tokens_seen": 2867360, + "step": 412 + }, + { + "epoch": 2.6580852775543042, + "grad_norm": 3.929145336151123, + "learning_rate": 3.441666666666667e-06, + "loss": 0.0549, + "num_input_tokens_seen": 2873648, + "step": 413 + }, + { + "epoch": 2.664521319388576, + "grad_norm": 3.001359224319458, + "learning_rate": 3.45e-06, + "loss": 0.0285, + "num_input_tokens_seen": 2880848, + "step": 414 + }, + { + "epoch": 2.670957361222848, + "grad_norm": 2.7936651706695557, + "learning_rate": 3.4583333333333334e-06, + "loss": 0.0668, + "num_input_tokens_seen": 2888256, + "step": 415 + }, + { + "epoch": 2.67739340305712, + "grad_norm": 4.050117015838623, + "learning_rate": 3.4666666666666672e-06, + "loss": 0.0691, + "num_input_tokens_seen": 2895040, + "step": 416 + }, + { + "epoch": 2.6838294448913915, + "grad_norm": 5.509685516357422, + "learning_rate": 3.475e-06, + "loss": 0.066, + "num_input_tokens_seen": 2902320, + "step": 417 + }, + { + "epoch": 2.6902654867256635, + "grad_norm": 3.968433380126953, + "learning_rate": 3.4833333333333336e-06, + "loss": 0.0495, + "num_input_tokens_seen": 2908960, + "step": 418 + }, + { + "epoch": 2.6967015285599354, + "grad_norm": 2.082157611846924, + "learning_rate": 3.491666666666667e-06, + "loss": 0.034, + "num_input_tokens_seen": 2915808, + "step": 419 + }, + { + "epoch": 2.7031375703942073, + "grad_norm": 2.403968334197998, + "learning_rate": 3.5e-06, + "loss": 0.0604, + "num_input_tokens_seen": 2922608, + "step": 420 + }, + { + "epoch": 2.7095736122284793, + "grad_norm": 4.667454719543457, + "learning_rate": 3.5083333333333338e-06, + "loss": 0.0535, + "num_input_tokens_seen": 2929728, + "step": 421 + }, + { + "epoch": 2.716009654062751, + "grad_norm": 2.5968987941741943, + "learning_rate": 3.516666666666667e-06, + "loss": 0.0369, + "num_input_tokens_seen": 2937024, + "step": 422 + }, + { + "epoch": 2.722445695897023, + "grad_norm": 3.4746780395507812, + "learning_rate": 3.525e-06, + "loss": 0.045, + "num_input_tokens_seen": 2943760, + "step": 423 + }, + { + "epoch": 2.728881737731295, + "grad_norm": 1.9599398374557495, + "learning_rate": 3.5333333333333335e-06, + "loss": 0.0314, + "num_input_tokens_seen": 2950848, + "step": 424 + }, + { + "epoch": 2.735317779565567, + "grad_norm": 2.971634864807129, + "learning_rate": 3.5416666666666673e-06, + "loss": 0.0611, + "num_input_tokens_seen": 2957408, + "step": 425 + }, + { + "epoch": 2.741753821399839, + "grad_norm": 3.1944162845611572, + "learning_rate": 3.5500000000000003e-06, + "loss": 0.0478, + "num_input_tokens_seen": 2964288, + "step": 426 + }, + { + "epoch": 2.748189863234111, + "grad_norm": 3.3659610748291016, + "learning_rate": 3.5583333333333337e-06, + "loss": 0.038, + "num_input_tokens_seen": 2970912, + "step": 427 + }, + { + "epoch": 2.754625905068383, + "grad_norm": 2.965097188949585, + "learning_rate": 3.566666666666667e-06, + "loss": 0.043, + "num_input_tokens_seen": 2978032, + "step": 428 + }, + { + "epoch": 2.7610619469026547, + "grad_norm": 2.4006049633026123, + "learning_rate": 3.575e-06, + "loss": 0.0478, + "num_input_tokens_seen": 2985232, + "step": 429 + }, + { + "epoch": 2.7674979887369267, + "grad_norm": 3.7348554134368896, + "learning_rate": 3.5833333333333335e-06, + "loss": 0.0977, + "num_input_tokens_seen": 2992240, + "step": 430 + }, + { + "epoch": 2.7739340305711986, + "grad_norm": 3.1373274326324463, + "learning_rate": 3.5916666666666673e-06, + "loss": 0.0835, + "num_input_tokens_seen": 2999008, + "step": 431 + }, + { + "epoch": 2.7803700724054705, + "grad_norm": 1.9444302320480347, + "learning_rate": 3.6000000000000003e-06, + "loss": 0.0406, + "num_input_tokens_seen": 3005648, + "step": 432 + }, + { + "epoch": 2.7868061142397424, + "grad_norm": 1.8665870428085327, + "learning_rate": 3.6083333333333337e-06, + "loss": 0.0661, + "num_input_tokens_seen": 3012224, + "step": 433 + }, + { + "epoch": 2.7932421560740144, + "grad_norm": 1.9893403053283691, + "learning_rate": 3.616666666666667e-06, + "loss": 0.0647, + "num_input_tokens_seen": 3019104, + "step": 434 + }, + { + "epoch": 2.7996781979082863, + "grad_norm": 2.656529426574707, + "learning_rate": 3.625e-06, + "loss": 0.0499, + "num_input_tokens_seen": 3026096, + "step": 435 + }, + { + "epoch": 2.8061142397425582, + "grad_norm": 1.7047683000564575, + "learning_rate": 3.633333333333334e-06, + "loss": 0.0422, + "num_input_tokens_seen": 3032784, + "step": 436 + }, + { + "epoch": 2.81255028157683, + "grad_norm": 1.6727882623672485, + "learning_rate": 3.6416666666666672e-06, + "loss": 0.048, + "num_input_tokens_seen": 3040096, + "step": 437 + }, + { + "epoch": 2.818986323411102, + "grad_norm": 4.0175251960754395, + "learning_rate": 3.65e-06, + "loss": 0.0474, + "num_input_tokens_seen": 3046720, + "step": 438 + }, + { + "epoch": 2.825422365245374, + "grad_norm": 8.139860153198242, + "learning_rate": 3.6583333333333336e-06, + "loss": 0.0801, + "num_input_tokens_seen": 3053712, + "step": 439 + }, + { + "epoch": 2.831858407079646, + "grad_norm": 3.832087278366089, + "learning_rate": 3.6666666666666666e-06, + "loss": 0.0528, + "num_input_tokens_seen": 3060528, + "step": 440 + }, + { + "epoch": 2.838294448913918, + "grad_norm": 2.881619930267334, + "learning_rate": 3.6750000000000004e-06, + "loss": 0.0461, + "num_input_tokens_seen": 3067440, + "step": 441 + }, + { + "epoch": 2.84473049074819, + "grad_norm": 4.456245422363281, + "learning_rate": 3.6833333333333338e-06, + "loss": 0.0646, + "num_input_tokens_seen": 3074208, + "step": 442 + }, + { + "epoch": 2.8511665325824618, + "grad_norm": 5.1570820808410645, + "learning_rate": 3.6916666666666668e-06, + "loss": 0.049, + "num_input_tokens_seen": 3081072, + "step": 443 + }, + { + "epoch": 2.8576025744167337, + "grad_norm": 2.944526433944702, + "learning_rate": 3.7e-06, + "loss": 0.0531, + "num_input_tokens_seen": 3088240, + "step": 444 + }, + { + "epoch": 2.8640386162510056, + "grad_norm": 2.021688222885132, + "learning_rate": 3.708333333333334e-06, + "loss": 0.0521, + "num_input_tokens_seen": 3095504, + "step": 445 + }, + { + "epoch": 2.8704746580852776, + "grad_norm": 6.054248809814453, + "learning_rate": 3.716666666666667e-06, + "loss": 0.0927, + "num_input_tokens_seen": 3102688, + "step": 446 + }, + { + "epoch": 2.8769106999195495, + "grad_norm": 3.5824503898620605, + "learning_rate": 3.7250000000000003e-06, + "loss": 0.0491, + "num_input_tokens_seen": 3109440, + "step": 447 + }, + { + "epoch": 2.8833467417538214, + "grad_norm": 2.0240774154663086, + "learning_rate": 3.7333333333333337e-06, + "loss": 0.0399, + "num_input_tokens_seen": 3116720, + "step": 448 + }, + { + "epoch": 2.8897827835880934, + "grad_norm": 4.0125579833984375, + "learning_rate": 3.7416666666666667e-06, + "loss": 0.0499, + "num_input_tokens_seen": 3123568, + "step": 449 + }, + { + "epoch": 2.8962188254223653, + "grad_norm": 3.733275890350342, + "learning_rate": 3.7500000000000005e-06, + "loss": 0.0569, + "num_input_tokens_seen": 3130768, + "step": 450 + }, + { + "epoch": 2.9026548672566372, + "grad_norm": 4.261077880859375, + "learning_rate": 3.758333333333334e-06, + "loss": 0.0608, + "num_input_tokens_seen": 3138128, + "step": 451 + }, + { + "epoch": 2.909090909090909, + "grad_norm": 1.4142907857894897, + "learning_rate": 3.766666666666667e-06, + "loss": 0.0325, + "num_input_tokens_seen": 3145008, + "step": 452 + }, + { + "epoch": 2.915526950925181, + "grad_norm": 2.610344171524048, + "learning_rate": 3.7750000000000003e-06, + "loss": 0.0643, + "num_input_tokens_seen": 3151792, + "step": 453 + }, + { + "epoch": 2.921962992759453, + "grad_norm": 2.9687604904174805, + "learning_rate": 3.7833333333333337e-06, + "loss": 0.0479, + "num_input_tokens_seen": 3158800, + "step": 454 + }, + { + "epoch": 2.928399034593725, + "grad_norm": 2.2706518173217773, + "learning_rate": 3.7916666666666666e-06, + "loss": 0.0549, + "num_input_tokens_seen": 3165744, + "step": 455 + }, + { + "epoch": 2.934835076427997, + "grad_norm": 3.606792449951172, + "learning_rate": 3.8000000000000005e-06, + "loss": 0.0789, + "num_input_tokens_seen": 3172896, + "step": 456 + }, + { + "epoch": 2.941271118262269, + "grad_norm": 1.8851637840270996, + "learning_rate": 3.808333333333334e-06, + "loss": 0.0319, + "num_input_tokens_seen": 3179888, + "step": 457 + }, + { + "epoch": 2.9477071600965408, + "grad_norm": 2.6292834281921387, + "learning_rate": 3.816666666666667e-06, + "loss": 0.05, + "num_input_tokens_seen": 3186960, + "step": 458 + }, + { + "epoch": 2.9541432019308127, + "grad_norm": 2.099109172821045, + "learning_rate": 3.825000000000001e-06, + "loss": 0.0677, + "num_input_tokens_seen": 3194208, + "step": 459 + }, + { + "epoch": 2.9605792437650846, + "grad_norm": 2.5214834213256836, + "learning_rate": 3.833333333333334e-06, + "loss": 0.0512, + "num_input_tokens_seen": 3201120, + "step": 460 + }, + { + "epoch": 2.9670152855993566, + "grad_norm": 6.318456649780273, + "learning_rate": 3.841666666666667e-06, + "loss": 0.0681, + "num_input_tokens_seen": 3208160, + "step": 461 + }, + { + "epoch": 2.9734513274336285, + "grad_norm": 4.119838714599609, + "learning_rate": 3.85e-06, + "loss": 0.0651, + "num_input_tokens_seen": 3214992, + "step": 462 + }, + { + "epoch": 2.9798873692679004, + "grad_norm": 3.248420238494873, + "learning_rate": 3.858333333333333e-06, + "loss": 0.0498, + "num_input_tokens_seen": 3222192, + "step": 463 + }, + { + "epoch": 2.9863234111021724, + "grad_norm": 1.6198488473892212, + "learning_rate": 3.866666666666667e-06, + "loss": 0.0496, + "num_input_tokens_seen": 3229504, + "step": 464 + }, + { + "epoch": 2.9927594529364443, + "grad_norm": 2.6008763313293457, + "learning_rate": 3.875e-06, + "loss": 0.0446, + "num_input_tokens_seen": 3236400, + "step": 465 + }, + { + "epoch": 2.9991954947707162, + "grad_norm": 2.349928379058838, + "learning_rate": 3.883333333333333e-06, + "loss": 0.0543, + "num_input_tokens_seen": 3243600, + "step": 466 + }, + { + "epoch": 3.0056315366049877, + "grad_norm": 0.8590204119682312, + "learning_rate": 3.891666666666667e-06, + "loss": 0.0137, + "num_input_tokens_seen": 3249808, + "step": 467 + }, + { + "epoch": 3.0120675784392597, + "grad_norm": 1.2689623832702637, + "learning_rate": 3.900000000000001e-06, + "loss": 0.0201, + "num_input_tokens_seen": 3257168, + "step": 468 + }, + { + "epoch": 3.0185036202735316, + "grad_norm": 1.329512596130371, + "learning_rate": 3.908333333333334e-06, + "loss": 0.0119, + "num_input_tokens_seen": 3264064, + "step": 469 + }, + { + "epoch": 3.0249396621078035, + "grad_norm": 2.423644781112671, + "learning_rate": 3.916666666666667e-06, + "loss": 0.0305, + "num_input_tokens_seen": 3270688, + "step": 470 + }, + { + "epoch": 3.0313757039420755, + "grad_norm": 3.6647322177886963, + "learning_rate": 3.9250000000000005e-06, + "loss": 0.0213, + "num_input_tokens_seen": 3277664, + "step": 471 + }, + { + "epoch": 3.0378117457763474, + "grad_norm": 3.736281156539917, + "learning_rate": 3.9333333333333335e-06, + "loss": 0.035, + "num_input_tokens_seen": 3284352, + "step": 472 + }, + { + "epoch": 3.0442477876106193, + "grad_norm": 2.274883270263672, + "learning_rate": 3.941666666666667e-06, + "loss": 0.0438, + "num_input_tokens_seen": 3290864, + "step": 473 + }, + { + "epoch": 3.0506838294448912, + "grad_norm": 3.032172203063965, + "learning_rate": 3.95e-06, + "loss": 0.0464, + "num_input_tokens_seen": 3297856, + "step": 474 + }, + { + "epoch": 3.057119871279163, + "grad_norm": 2.258751392364502, + "learning_rate": 3.958333333333333e-06, + "loss": 0.0172, + "num_input_tokens_seen": 3305120, + "step": 475 + }, + { + "epoch": 3.063555913113435, + "grad_norm": 2.925736427307129, + "learning_rate": 3.966666666666667e-06, + "loss": 0.0287, + "num_input_tokens_seen": 3312032, + "step": 476 + }, + { + "epoch": 3.069991954947707, + "grad_norm": 3.100857734680176, + "learning_rate": 3.975000000000001e-06, + "loss": 0.0579, + "num_input_tokens_seen": 3319424, + "step": 477 + }, + { + "epoch": 3.076427996781979, + "grad_norm": 1.753515601158142, + "learning_rate": 3.983333333333334e-06, + "loss": 0.0095, + "num_input_tokens_seen": 3326304, + "step": 478 + }, + { + "epoch": 3.082864038616251, + "grad_norm": 2.3217740058898926, + "learning_rate": 3.991666666666667e-06, + "loss": 0.0238, + "num_input_tokens_seen": 3333184, + "step": 479 + }, + { + "epoch": 3.089300080450523, + "grad_norm": 2.512751579284668, + "learning_rate": 4.000000000000001e-06, + "loss": 0.0313, + "num_input_tokens_seen": 3340384, + "step": 480 + }, + { + "epoch": 3.0957361222847948, + "grad_norm": 1.2185322046279907, + "learning_rate": 4.008333333333334e-06, + "loss": 0.0146, + "num_input_tokens_seen": 3347344, + "step": 481 + }, + { + "epoch": 3.1021721641190667, + "grad_norm": 1.1303057670593262, + "learning_rate": 4.0166666666666675e-06, + "loss": 0.0347, + "num_input_tokens_seen": 3354080, + "step": 482 + }, + { + "epoch": 3.1086082059533386, + "grad_norm": 2.4247186183929443, + "learning_rate": 4.0250000000000004e-06, + "loss": 0.024, + "num_input_tokens_seen": 3360848, + "step": 483 + }, + { + "epoch": 3.1150442477876106, + "grad_norm": 1.4767001867294312, + "learning_rate": 4.033333333333333e-06, + "loss": 0.0128, + "num_input_tokens_seen": 3367616, + "step": 484 + }, + { + "epoch": 3.1214802896218825, + "grad_norm": 2.458953857421875, + "learning_rate": 4.041666666666667e-06, + "loss": 0.0311, + "num_input_tokens_seen": 3374880, + "step": 485 + }, + { + "epoch": 3.1279163314561544, + "grad_norm": 0.5494964718818665, + "learning_rate": 4.05e-06, + "loss": 0.0178, + "num_input_tokens_seen": 3381696, + "step": 486 + }, + { + "epoch": 3.1343523732904264, + "grad_norm": 1.5969914197921753, + "learning_rate": 4.058333333333333e-06, + "loss": 0.0379, + "num_input_tokens_seen": 3388880, + "step": 487 + }, + { + "epoch": 3.1407884151246983, + "grad_norm": 1.7003910541534424, + "learning_rate": 4.066666666666667e-06, + "loss": 0.0299, + "num_input_tokens_seen": 3395984, + "step": 488 + }, + { + "epoch": 3.1472244569589702, + "grad_norm": 2.297182083129883, + "learning_rate": 4.075e-06, + "loss": 0.0261, + "num_input_tokens_seen": 3402896, + "step": 489 + }, + { + "epoch": 3.153660498793242, + "grad_norm": 2.3937814235687256, + "learning_rate": 4.083333333333334e-06, + "loss": 0.0347, + "num_input_tokens_seen": 3409888, + "step": 490 + }, + { + "epoch": 3.160096540627514, + "grad_norm": 1.349425196647644, + "learning_rate": 4.091666666666667e-06, + "loss": 0.011, + "num_input_tokens_seen": 3416928, + "step": 491 + }, + { + "epoch": 3.166532582461786, + "grad_norm": 3.0355069637298584, + "learning_rate": 4.1e-06, + "loss": 0.0541, + "num_input_tokens_seen": 3423968, + "step": 492 + }, + { + "epoch": 3.172968624296058, + "grad_norm": 2.680206537246704, + "learning_rate": 4.1083333333333335e-06, + "loss": 0.0465, + "num_input_tokens_seen": 3431120, + "step": 493 + }, + { + "epoch": 3.17940466613033, + "grad_norm": 1.5906095504760742, + "learning_rate": 4.116666666666667e-06, + "loss": 0.0187, + "num_input_tokens_seen": 3437776, + "step": 494 + }, + { + "epoch": 3.185840707964602, + "grad_norm": 0.8296425938606262, + "learning_rate": 4.125e-06, + "loss": 0.0089, + "num_input_tokens_seen": 3444480, + "step": 495 + }, + { + "epoch": 3.1922767497988738, + "grad_norm": 2.857689142227173, + "learning_rate": 4.133333333333333e-06, + "loss": 0.0289, + "num_input_tokens_seen": 3451232, + "step": 496 + }, + { + "epoch": 3.1987127916331457, + "grad_norm": 1.0910203456878662, + "learning_rate": 4.141666666666667e-06, + "loss": 0.0103, + "num_input_tokens_seen": 3457776, + "step": 497 + }, + { + "epoch": 3.2051488334674176, + "grad_norm": 1.3560919761657715, + "learning_rate": 4.15e-06, + "loss": 0.0132, + "num_input_tokens_seen": 3465056, + "step": 498 + }, + { + "epoch": 3.2115848753016896, + "grad_norm": 4.861215591430664, + "learning_rate": 4.158333333333334e-06, + "loss": 0.0375, + "num_input_tokens_seen": 3471968, + "step": 499 + }, + { + "epoch": 3.2180209171359615, + "grad_norm": 1.8714208602905273, + "learning_rate": 4.166666666666667e-06, + "loss": 0.0143, + "num_input_tokens_seen": 3479648, + "step": 500 + }, + { + "epoch": 3.2244569589702334, + "grad_norm": 1.6230028867721558, + "learning_rate": 4.175e-06, + "loss": 0.0159, + "num_input_tokens_seen": 3486272, + "step": 501 + }, + { + "epoch": 3.2308930008045054, + "grad_norm": 0.7852226495742798, + "learning_rate": 4.183333333333334e-06, + "loss": 0.0073, + "num_input_tokens_seen": 3493360, + "step": 502 + }, + { + "epoch": 3.2373290426387773, + "grad_norm": 2.3990976810455322, + "learning_rate": 4.1916666666666675e-06, + "loss": 0.0186, + "num_input_tokens_seen": 3500336, + "step": 503 + }, + { + "epoch": 3.2437650844730492, + "grad_norm": 0.796851634979248, + "learning_rate": 4.2000000000000004e-06, + "loss": 0.0035, + "num_input_tokens_seen": 3507232, + "step": 504 + }, + { + "epoch": 3.250201126307321, + "grad_norm": 2.7951748371124268, + "learning_rate": 4.208333333333333e-06, + "loss": 0.0416, + "num_input_tokens_seen": 3514144, + "step": 505 + }, + { + "epoch": 3.256637168141593, + "grad_norm": 2.40897274017334, + "learning_rate": 4.216666666666667e-06, + "loss": 0.0266, + "num_input_tokens_seen": 3520976, + "step": 506 + }, + { + "epoch": 3.263073209975865, + "grad_norm": 2.3974061012268066, + "learning_rate": 4.225e-06, + "loss": 0.0351, + "num_input_tokens_seen": 3527920, + "step": 507 + }, + { + "epoch": 3.2695092518101365, + "grad_norm": 2.30100154876709, + "learning_rate": 4.233333333333334e-06, + "loss": 0.0209, + "num_input_tokens_seen": 3534864, + "step": 508 + }, + { + "epoch": 3.2759452936444085, + "grad_norm": 2.1172518730163574, + "learning_rate": 4.241666666666667e-06, + "loss": 0.0434, + "num_input_tokens_seen": 3541872, + "step": 509 + }, + { + "epoch": 3.2823813354786804, + "grad_norm": 3.7030341625213623, + "learning_rate": 4.25e-06, + "loss": 0.0174, + "num_input_tokens_seen": 3548384, + "step": 510 + }, + { + "epoch": 3.2888173773129523, + "grad_norm": 2.152125597000122, + "learning_rate": 4.258333333333334e-06, + "loss": 0.0529, + "num_input_tokens_seen": 3555792, + "step": 511 + }, + { + "epoch": 3.2952534191472242, + "grad_norm": 0.6081152558326721, + "learning_rate": 4.266666666666668e-06, + "loss": 0.0033, + "num_input_tokens_seen": 3562608, + "step": 512 + }, + { + "epoch": 3.301689460981496, + "grad_norm": 1.7042624950408936, + "learning_rate": 4.2750000000000006e-06, + "loss": 0.0196, + "num_input_tokens_seen": 3569184, + "step": 513 + }, + { + "epoch": 3.308125502815768, + "grad_norm": 1.3502767086029053, + "learning_rate": 4.2833333333333335e-06, + "loss": 0.0242, + "num_input_tokens_seen": 3576224, + "step": 514 + }, + { + "epoch": 3.31456154465004, + "grad_norm": 4.480360984802246, + "learning_rate": 4.2916666666666665e-06, + "loss": 0.0316, + "num_input_tokens_seen": 3583328, + "step": 515 + }, + { + "epoch": 3.320997586484312, + "grad_norm": 2.2217299938201904, + "learning_rate": 4.3e-06, + "loss": 0.0268, + "num_input_tokens_seen": 3590256, + "step": 516 + }, + { + "epoch": 3.327433628318584, + "grad_norm": 1.5919010639190674, + "learning_rate": 4.308333333333334e-06, + "loss": 0.0248, + "num_input_tokens_seen": 3597328, + "step": 517 + }, + { + "epoch": 3.333869670152856, + "grad_norm": 2.425961971282959, + "learning_rate": 4.316666666666667e-06, + "loss": 0.032, + "num_input_tokens_seen": 3604576, + "step": 518 + }, + { + "epoch": 3.340305711987128, + "grad_norm": 2.987424612045288, + "learning_rate": 4.325e-06, + "loss": 0.0202, + "num_input_tokens_seen": 3611520, + "step": 519 + }, + { + "epoch": 3.3467417538213997, + "grad_norm": 2.633897304534912, + "learning_rate": 4.333333333333334e-06, + "loss": 0.0329, + "num_input_tokens_seen": 3618288, + "step": 520 + }, + { + "epoch": 3.3531777956556716, + "grad_norm": 1.0696384906768799, + "learning_rate": 4.341666666666667e-06, + "loss": 0.019, + "num_input_tokens_seen": 3625216, + "step": 521 + }, + { + "epoch": 3.3596138374899436, + "grad_norm": 2.400972604751587, + "learning_rate": 4.350000000000001e-06, + "loss": 0.0182, + "num_input_tokens_seen": 3631888, + "step": 522 + }, + { + "epoch": 3.3660498793242155, + "grad_norm": 1.3744821548461914, + "learning_rate": 4.358333333333334e-06, + "loss": 0.0124, + "num_input_tokens_seen": 3638848, + "step": 523 + }, + { + "epoch": 3.3724859211584874, + "grad_norm": 1.613145112991333, + "learning_rate": 4.366666666666667e-06, + "loss": 0.0122, + "num_input_tokens_seen": 3646112, + "step": 524 + }, + { + "epoch": 3.3789219629927594, + "grad_norm": 2.450824499130249, + "learning_rate": 4.3750000000000005e-06, + "loss": 0.0388, + "num_input_tokens_seen": 3652928, + "step": 525 + }, + { + "epoch": 3.3853580048270313, + "grad_norm": 1.6122058629989624, + "learning_rate": 4.383333333333334e-06, + "loss": 0.0106, + "num_input_tokens_seen": 3659632, + "step": 526 + }, + { + "epoch": 3.3917940466613032, + "grad_norm": 1.53513765335083, + "learning_rate": 4.391666666666667e-06, + "loss": 0.0305, + "num_input_tokens_seen": 3666480, + "step": 527 + }, + { + "epoch": 3.398230088495575, + "grad_norm": 2.103663444519043, + "learning_rate": 4.4e-06, + "loss": 0.0512, + "num_input_tokens_seen": 3673136, + "step": 528 + }, + { + "epoch": 3.404666130329847, + "grad_norm": 0.41373467445373535, + "learning_rate": 4.408333333333334e-06, + "loss": 0.0031, + "num_input_tokens_seen": 3679760, + "step": 529 + }, + { + "epoch": 3.411102172164119, + "grad_norm": 2.9610488414764404, + "learning_rate": 4.416666666666667e-06, + "loss": 0.0309, + "num_input_tokens_seen": 3686576, + "step": 530 + }, + { + "epoch": 3.417538213998391, + "grad_norm": 2.415531873703003, + "learning_rate": 4.425e-06, + "loss": 0.0472, + "num_input_tokens_seen": 3693312, + "step": 531 + }, + { + "epoch": 3.423974255832663, + "grad_norm": 2.175546407699585, + "learning_rate": 4.433333333333334e-06, + "loss": 0.0222, + "num_input_tokens_seen": 3700000, + "step": 532 + }, + { + "epoch": 3.430410297666935, + "grad_norm": 1.0903018712997437, + "learning_rate": 4.441666666666667e-06, + "loss": 0.0077, + "num_input_tokens_seen": 3706736, + "step": 533 + }, + { + "epoch": 3.4368463395012068, + "grad_norm": 0.8305991888046265, + "learning_rate": 4.450000000000001e-06, + "loss": 0.0064, + "num_input_tokens_seen": 3714192, + "step": 534 + }, + { + "epoch": 3.4432823813354787, + "grad_norm": 0.9347790479660034, + "learning_rate": 4.4583333333333336e-06, + "loss": 0.0104, + "num_input_tokens_seen": 3721408, + "step": 535 + }, + { + "epoch": 3.4497184231697506, + "grad_norm": 1.7669559717178345, + "learning_rate": 4.4666666666666665e-06, + "loss": 0.0121, + "num_input_tokens_seen": 3728144, + "step": 536 + }, + { + "epoch": 3.4561544650040226, + "grad_norm": 3.121467351913452, + "learning_rate": 4.475e-06, + "loss": 0.0386, + "num_input_tokens_seen": 3734960, + "step": 537 + }, + { + "epoch": 3.4625905068382945, + "grad_norm": 2.683410882949829, + "learning_rate": 4.483333333333333e-06, + "loss": 0.0319, + "num_input_tokens_seen": 3741728, + "step": 538 + }, + { + "epoch": 3.4690265486725664, + "grad_norm": 9.728205680847168, + "learning_rate": 4.491666666666667e-06, + "loss": 0.0579, + "num_input_tokens_seen": 3749200, + "step": 539 + }, + { + "epoch": 3.4754625905068384, + "grad_norm": 4.415483474731445, + "learning_rate": 4.5e-06, + "loss": 0.0255, + "num_input_tokens_seen": 3755856, + "step": 540 + }, + { + "epoch": 3.4818986323411103, + "grad_norm": 3.651423692703247, + "learning_rate": 4.508333333333333e-06, + "loss": 0.0301, + "num_input_tokens_seen": 3762528, + "step": 541 + }, + { + "epoch": 3.4883346741753822, + "grad_norm": 2.318000078201294, + "learning_rate": 4.516666666666667e-06, + "loss": 0.0589, + "num_input_tokens_seen": 3769632, + "step": 542 + }, + { + "epoch": 3.494770716009654, + "grad_norm": 4.982158660888672, + "learning_rate": 4.525000000000001e-06, + "loss": 0.0442, + "num_input_tokens_seen": 3776592, + "step": 543 + }, + { + "epoch": 3.501206757843926, + "grad_norm": 3.0872108936309814, + "learning_rate": 4.533333333333334e-06, + "loss": 0.0366, + "num_input_tokens_seen": 3783824, + "step": 544 + }, + { + "epoch": 3.507642799678198, + "grad_norm": 5.150477886199951, + "learning_rate": 4.541666666666667e-06, + "loss": 0.0643, + "num_input_tokens_seen": 3790864, + "step": 545 + }, + { + "epoch": 3.51407884151247, + "grad_norm": 3.0513834953308105, + "learning_rate": 4.5500000000000005e-06, + "loss": 0.0213, + "num_input_tokens_seen": 3797664, + "step": 546 + }, + { + "epoch": 3.520514883346742, + "grad_norm": 1.5530712604522705, + "learning_rate": 4.5583333333333335e-06, + "loss": 0.0154, + "num_input_tokens_seen": 3804576, + "step": 547 + }, + { + "epoch": 3.526950925181014, + "grad_norm": 2.6350319385528564, + "learning_rate": 4.566666666666667e-06, + "loss": 0.0252, + "num_input_tokens_seen": 3811440, + "step": 548 + }, + { + "epoch": 3.5333869670152858, + "grad_norm": 2.8993167877197266, + "learning_rate": 4.575e-06, + "loss": 0.038, + "num_input_tokens_seen": 3818352, + "step": 549 + }, + { + "epoch": 3.5398230088495577, + "grad_norm": 2.0168752670288086, + "learning_rate": 4.583333333333333e-06, + "loss": 0.0169, + "num_input_tokens_seen": 3825360, + "step": 550 + }, + { + "epoch": 3.5462590506838296, + "grad_norm": 2.4160525798797607, + "learning_rate": 4.591666666666667e-06, + "loss": 0.0253, + "num_input_tokens_seen": 3832416, + "step": 551 + }, + { + "epoch": 3.5526950925181016, + "grad_norm": 1.543545126914978, + "learning_rate": 4.600000000000001e-06, + "loss": 0.0164, + "num_input_tokens_seen": 3839344, + "step": 552 + }, + { + "epoch": 3.5591311343523735, + "grad_norm": 2.355316400527954, + "learning_rate": 4.608333333333334e-06, + "loss": 0.0269, + "num_input_tokens_seen": 3846688, + "step": 553 + }, + { + "epoch": 3.5655671761866454, + "grad_norm": 1.4751020669937134, + "learning_rate": 4.616666666666667e-06, + "loss": 0.0192, + "num_input_tokens_seen": 3853696, + "step": 554 + }, + { + "epoch": 3.5720032180209174, + "grad_norm": 0.9673195481300354, + "learning_rate": 4.625000000000001e-06, + "loss": 0.0132, + "num_input_tokens_seen": 3860832, + "step": 555 + }, + { + "epoch": 3.5784392598551893, + "grad_norm": 1.1592040061950684, + "learning_rate": 4.633333333333334e-06, + "loss": 0.0156, + "num_input_tokens_seen": 3868000, + "step": 556 + }, + { + "epoch": 3.5848753016894612, + "grad_norm": 1.01143217086792, + "learning_rate": 4.641666666666667e-06, + "loss": 0.0081, + "num_input_tokens_seen": 3874672, + "step": 557 + }, + { + "epoch": 3.591311343523733, + "grad_norm": 2.855041980743408, + "learning_rate": 4.65e-06, + "loss": 0.0351, + "num_input_tokens_seen": 3881744, + "step": 558 + }, + { + "epoch": 3.597747385358005, + "grad_norm": 2.0597968101501465, + "learning_rate": 4.658333333333333e-06, + "loss": 0.0288, + "num_input_tokens_seen": 3888256, + "step": 559 + }, + { + "epoch": 3.604183427192277, + "grad_norm": 2.9965226650238037, + "learning_rate": 4.666666666666667e-06, + "loss": 0.0335, + "num_input_tokens_seen": 3895104, + "step": 560 + }, + { + "epoch": 3.6106194690265485, + "grad_norm": 3.625206708908081, + "learning_rate": 4.675000000000001e-06, + "loss": 0.0492, + "num_input_tokens_seen": 3902208, + "step": 561 + }, + { + "epoch": 3.6170555108608204, + "grad_norm": 2.021160840988159, + "learning_rate": 4.683333333333334e-06, + "loss": 0.0082, + "num_input_tokens_seen": 3909040, + "step": 562 + }, + { + "epoch": 3.6234915526950924, + "grad_norm": 3.4565329551696777, + "learning_rate": 4.691666666666667e-06, + "loss": 0.0491, + "num_input_tokens_seen": 3916304, + "step": 563 + }, + { + "epoch": 3.6299275945293643, + "grad_norm": 3.2362654209136963, + "learning_rate": 4.7e-06, + "loss": 0.0568, + "num_input_tokens_seen": 3923216, + "step": 564 + }, + { + "epoch": 3.6363636363636362, + "grad_norm": 3.234666347503662, + "learning_rate": 4.708333333333334e-06, + "loss": 0.0414, + "num_input_tokens_seen": 3930448, + "step": 565 + }, + { + "epoch": 3.642799678197908, + "grad_norm": 2.1742103099823, + "learning_rate": 4.7166666666666675e-06, + "loss": 0.034, + "num_input_tokens_seen": 3937424, + "step": 566 + }, + { + "epoch": 3.64923572003218, + "grad_norm": 2.9156923294067383, + "learning_rate": 4.7250000000000005e-06, + "loss": 0.0392, + "num_input_tokens_seen": 3944112, + "step": 567 + }, + { + "epoch": 3.655671761866452, + "grad_norm": 4.092429161071777, + "learning_rate": 4.7333333333333335e-06, + "loss": 0.051, + "num_input_tokens_seen": 3951504, + "step": 568 + }, + { + "epoch": 3.662107803700724, + "grad_norm": 3.9395768642425537, + "learning_rate": 4.741666666666667e-06, + "loss": 0.034, + "num_input_tokens_seen": 3958352, + "step": 569 + }, + { + "epoch": 3.668543845534996, + "grad_norm": 1.9961844682693481, + "learning_rate": 4.75e-06, + "loss": 0.014, + "num_input_tokens_seen": 3965552, + "step": 570 + }, + { + "epoch": 3.674979887369268, + "grad_norm": 1.8078194856643677, + "learning_rate": 4.758333333333334e-06, + "loss": 0.0406, + "num_input_tokens_seen": 3972544, + "step": 571 + }, + { + "epoch": 3.6814159292035398, + "grad_norm": 2.048532485961914, + "learning_rate": 4.766666666666667e-06, + "loss": 0.0407, + "num_input_tokens_seen": 3979264, + "step": 572 + }, + { + "epoch": 3.6878519710378117, + "grad_norm": 1.9979974031448364, + "learning_rate": 4.775e-06, + "loss": 0.0282, + "num_input_tokens_seen": 3986240, + "step": 573 + }, + { + "epoch": 3.6942880128720836, + "grad_norm": 3.6126463413238525, + "learning_rate": 4.783333333333334e-06, + "loss": 0.0326, + "num_input_tokens_seen": 3993232, + "step": 574 + }, + { + "epoch": 3.7007240547063556, + "grad_norm": 3.131657838821411, + "learning_rate": 4.791666666666668e-06, + "loss": 0.0348, + "num_input_tokens_seen": 3999952, + "step": 575 + }, + { + "epoch": 3.7071600965406275, + "grad_norm": 2.2662060260772705, + "learning_rate": 4.800000000000001e-06, + "loss": 0.0256, + "num_input_tokens_seen": 4007456, + "step": 576 + }, + { + "epoch": 3.7135961383748994, + "grad_norm": 4.874523639678955, + "learning_rate": 4.808333333333334e-06, + "loss": 0.0765, + "num_input_tokens_seen": 4015024, + "step": 577 + }, + { + "epoch": 3.7200321802091714, + "grad_norm": 0.882166862487793, + "learning_rate": 4.816666666666667e-06, + "loss": 0.0099, + "num_input_tokens_seen": 4021920, + "step": 578 + }, + { + "epoch": 3.7264682220434433, + "grad_norm": 3.1239066123962402, + "learning_rate": 4.825e-06, + "loss": 0.0173, + "num_input_tokens_seen": 4028720, + "step": 579 + }, + { + "epoch": 3.7329042638777152, + "grad_norm": 1.5819370746612549, + "learning_rate": 4.833333333333333e-06, + "loss": 0.0084, + "num_input_tokens_seen": 4035584, + "step": 580 + }, + { + "epoch": 3.739340305711987, + "grad_norm": 2.6252429485321045, + "learning_rate": 4.841666666666667e-06, + "loss": 0.0251, + "num_input_tokens_seen": 4042464, + "step": 581 + }, + { + "epoch": 3.745776347546259, + "grad_norm": 2.0619590282440186, + "learning_rate": 4.85e-06, + "loss": 0.0909, + "num_input_tokens_seen": 4049600, + "step": 582 + }, + { + "epoch": 3.752212389380531, + "grad_norm": 2.547422409057617, + "learning_rate": 4.858333333333334e-06, + "loss": 0.039, + "num_input_tokens_seen": 4056320, + "step": 583 + }, + { + "epoch": 3.758648431214803, + "grad_norm": 1.3179091215133667, + "learning_rate": 4.866666666666667e-06, + "loss": 0.0079, + "num_input_tokens_seen": 4063200, + "step": 584 + }, + { + "epoch": 3.765084473049075, + "grad_norm": 3.090376377105713, + "learning_rate": 4.875e-06, + "loss": 0.0242, + "num_input_tokens_seen": 4070112, + "step": 585 + }, + { + "epoch": 3.771520514883347, + "grad_norm": 2.50468111038208, + "learning_rate": 4.883333333333334e-06, + "loss": 0.0138, + "num_input_tokens_seen": 4076928, + "step": 586 + }, + { + "epoch": 3.7779565567176188, + "grad_norm": 3.921415090560913, + "learning_rate": 4.8916666666666675e-06, + "loss": 0.0467, + "num_input_tokens_seen": 4083792, + "step": 587 + }, + { + "epoch": 3.7843925985518907, + "grad_norm": 1.2243348360061646, + "learning_rate": 4.9000000000000005e-06, + "loss": 0.0241, + "num_input_tokens_seen": 4090672, + "step": 588 + }, + { + "epoch": 3.7908286403861626, + "grad_norm": 1.4968576431274414, + "learning_rate": 4.9083333333333335e-06, + "loss": 0.0404, + "num_input_tokens_seen": 4097472, + "step": 589 + }, + { + "epoch": 3.7972646822204346, + "grad_norm": 1.235217809677124, + "learning_rate": 4.9166666666666665e-06, + "loss": 0.0094, + "num_input_tokens_seen": 4104016, + "step": 590 + }, + { + "epoch": 3.8037007240547065, + "grad_norm": 1.3862783908843994, + "learning_rate": 4.925e-06, + "loss": 0.0196, + "num_input_tokens_seen": 4110784, + "step": 591 + }, + { + "epoch": 3.8101367658889784, + "grad_norm": 3.560793399810791, + "learning_rate": 4.933333333333334e-06, + "loss": 0.0514, + "num_input_tokens_seen": 4117984, + "step": 592 + }, + { + "epoch": 3.8165728077232504, + "grad_norm": 2.008575677871704, + "learning_rate": 4.941666666666667e-06, + "loss": 0.0286, + "num_input_tokens_seen": 4125072, + "step": 593 + }, + { + "epoch": 3.823008849557522, + "grad_norm": 2.3213093280792236, + "learning_rate": 4.95e-06, + "loss": 0.0417, + "num_input_tokens_seen": 4132160, + "step": 594 + }, + { + "epoch": 3.829444891391794, + "grad_norm": 1.3540257215499878, + "learning_rate": 4.958333333333334e-06, + "loss": 0.0347, + "num_input_tokens_seen": 4139136, + "step": 595 + }, + { + "epoch": 3.8358809332260657, + "grad_norm": 1.289825677871704, + "learning_rate": 4.966666666666667e-06, + "loss": 0.0229, + "num_input_tokens_seen": 4146240, + "step": 596 + }, + { + "epoch": 3.8423169750603376, + "grad_norm": 2.4050135612487793, + "learning_rate": 4.975000000000001e-06, + "loss": 0.0176, + "num_input_tokens_seen": 4153152, + "step": 597 + }, + { + "epoch": 3.8487530168946096, + "grad_norm": 1.523977518081665, + "learning_rate": 4.983333333333334e-06, + "loss": 0.0274, + "num_input_tokens_seen": 4160080, + "step": 598 + }, + { + "epoch": 3.8551890587288815, + "grad_norm": 1.1898863315582275, + "learning_rate": 4.991666666666667e-06, + "loss": 0.0253, + "num_input_tokens_seen": 4167008, + "step": 599 + }, + { + "epoch": 3.8616251005631534, + "grad_norm": 1.992311954498291, + "learning_rate": 5e-06, + "loss": 0.0429, + "num_input_tokens_seen": 4174080, + "step": 600 + }, + { + "epoch": 3.8680611423974254, + "grad_norm": 0.9558950066566467, + "learning_rate": 4.999597169822646e-06, + "loss": 0.0142, + "num_input_tokens_seen": 4181104, + "step": 601 + }, + { + "epoch": 3.8744971842316973, + "grad_norm": 0.9275301694869995, + "learning_rate": 4.998388809108304e-06, + "loss": 0.0148, + "num_input_tokens_seen": 4188096, + "step": 602 + }, + { + "epoch": 3.8809332260659692, + "grad_norm": 1.6707432270050049, + "learning_rate": 4.996375307268303e-06, + "loss": 0.0166, + "num_input_tokens_seen": 4195152, + "step": 603 + }, + { + "epoch": 3.887369267900241, + "grad_norm": 5.857227325439453, + "learning_rate": 4.993557313182086e-06, + "loss": 0.0224, + "num_input_tokens_seen": 4201952, + "step": 604 + }, + { + "epoch": 3.893805309734513, + "grad_norm": 5.273613452911377, + "learning_rate": 4.989935734988098e-06, + "loss": 0.0227, + "num_input_tokens_seen": 4209104, + "step": 605 + }, + { + "epoch": 3.900241351568785, + "grad_norm": 6.268670082092285, + "learning_rate": 4.985511739791129e-06, + "loss": 0.0597, + "num_input_tokens_seen": 4216496, + "step": 606 + }, + { + "epoch": 3.906677393403057, + "grad_norm": 3.373368501663208, + "learning_rate": 4.980286753286196e-06, + "loss": 0.0339, + "num_input_tokens_seen": 4223840, + "step": 607 + }, + { + "epoch": 3.913113435237329, + "grad_norm": 1.3991198539733887, + "learning_rate": 4.974262459299088e-06, + "loss": 0.0192, + "num_input_tokens_seen": 4230752, + "step": 608 + }, + { + "epoch": 3.919549477071601, + "grad_norm": 0.7424534559249878, + "learning_rate": 4.967440799243739e-06, + "loss": 0.007, + "num_input_tokens_seen": 4237360, + "step": 609 + }, + { + "epoch": 3.9259855189058728, + "grad_norm": 3.0347440242767334, + "learning_rate": 4.959823971496575e-06, + "loss": 0.017, + "num_input_tokens_seen": 4244128, + "step": 610 + }, + { + "epoch": 3.9324215607401447, + "grad_norm": 2.929175853729248, + "learning_rate": 4.9514144306880506e-06, + "loss": 0.0296, + "num_input_tokens_seen": 4251264, + "step": 611 + }, + { + "epoch": 3.9388576025744166, + "grad_norm": 4.076401710510254, + "learning_rate": 4.942214886911619e-06, + "loss": 0.0429, + "num_input_tokens_seen": 4258256, + "step": 612 + }, + { + "epoch": 3.9452936444086886, + "grad_norm": 0.7720851302146912, + "learning_rate": 4.932228304850363e-06, + "loss": 0.0027, + "num_input_tokens_seen": 4265280, + "step": 613 + }, + { + "epoch": 3.9517296862429605, + "grad_norm": 1.500545859336853, + "learning_rate": 4.921457902821578e-06, + "loss": 0.0395, + "num_input_tokens_seen": 4271968, + "step": 614 + }, + { + "epoch": 3.9581657280772324, + "grad_norm": 3.0767860412597656, + "learning_rate": 4.909907151739634e-06, + "loss": 0.03, + "num_input_tokens_seen": 4278848, + "step": 615 + }, + { + "epoch": 3.9646017699115044, + "grad_norm": 1.5455620288848877, + "learning_rate": 4.897579773997415e-06, + "loss": 0.0178, + "num_input_tokens_seen": 4285808, + "step": 616 + }, + { + "epoch": 3.9710378117457763, + "grad_norm": 1.1472654342651367, + "learning_rate": 4.884479742266731e-06, + "loss": 0.0139, + "num_input_tokens_seen": 4292912, + "step": 617 + }, + { + "epoch": 3.9774738535800482, + "grad_norm": 1.3290921449661255, + "learning_rate": 4.870611278218066e-06, + "loss": 0.0076, + "num_input_tokens_seen": 4300176, + "step": 618 + }, + { + "epoch": 3.98390989541432, + "grad_norm": 4.543910026550293, + "learning_rate": 4.855978851160088e-06, + "loss": 0.0683, + "num_input_tokens_seen": 4307776, + "step": 619 + }, + { + "epoch": 3.990345937248592, + "grad_norm": 3.424959421157837, + "learning_rate": 4.8405871765993435e-06, + "loss": 0.0367, + "num_input_tokens_seen": 4314688, + "step": 620 + }, + { + "epoch": 3.996781979082864, + "grad_norm": 1.5345810651779175, + "learning_rate": 4.824441214720629e-06, + "loss": 0.0497, + "num_input_tokens_seen": 4321840, + "step": 621 + }, + { + "epoch": 4.003218020917136, + "grad_norm": 0.5405219793319702, + "learning_rate": 4.8075461687884935e-06, + "loss": 0.0054, + "num_input_tokens_seen": 4328736, + "step": 622 + }, + { + "epoch": 4.009654062751408, + "grad_norm": 2.3540198802948, + "learning_rate": 4.7899074834704165e-06, + "loss": 0.0259, + "num_input_tokens_seen": 4335952, + "step": 623 + }, + { + "epoch": 4.01609010458568, + "grad_norm": 0.7733599543571472, + "learning_rate": 4.771530843082187e-06, + "loss": 0.0082, + "num_input_tokens_seen": 4342816, + "step": 624 + }, + { + "epoch": 4.022526146419952, + "grad_norm": 3.051017999649048, + "learning_rate": 4.752422169756048e-06, + "loss": 0.0359, + "num_input_tokens_seen": 4349456, + "step": 625 + }, + { + "epoch": 4.028962188254224, + "grad_norm": 0.4645274579524994, + "learning_rate": 4.732587621532214e-06, + "loss": 0.0081, + "num_input_tokens_seen": 4356032, + "step": 626 + }, + { + "epoch": 4.035398230088496, + "grad_norm": 1.9294419288635254, + "learning_rate": 4.712033590374346e-06, + "loss": 0.0118, + "num_input_tokens_seen": 4362928, + "step": 627 + }, + { + "epoch": 4.041834271922768, + "grad_norm": 2.5432851314544678, + "learning_rate": 4.690766700109659e-06, + "loss": 0.0235, + "num_input_tokens_seen": 4369616, + "step": 628 + }, + { + "epoch": 4.0482703137570395, + "grad_norm": 1.8334590196609497, + "learning_rate": 4.668793804294294e-06, + "loss": 0.0145, + "num_input_tokens_seen": 4376656, + "step": 629 + }, + { + "epoch": 4.054706355591311, + "grad_norm": 0.6473208069801331, + "learning_rate": 4.646121984004666e-06, + "loss": 0.006, + "num_input_tokens_seen": 4383696, + "step": 630 + }, + { + "epoch": 4.061142397425583, + "grad_norm": 2.0988128185272217, + "learning_rate": 4.622758545555485e-06, + "loss": 0.0191, + "num_input_tokens_seen": 4390880, + "step": 631 + }, + { + "epoch": 4.067578439259855, + "grad_norm": 1.8957973718643188, + "learning_rate": 4.598711018145193e-06, + "loss": 0.0075, + "num_input_tokens_seen": 4398000, + "step": 632 + }, + { + "epoch": 4.074014481094127, + "grad_norm": 1.117255449295044, + "learning_rate": 4.573987151429579e-06, + "loss": 0.0253, + "num_input_tokens_seen": 4404640, + "step": 633 + }, + { + "epoch": 4.080450522928399, + "grad_norm": 2.326129198074341, + "learning_rate": 4.54859491302433e-06, + "loss": 0.0317, + "num_input_tokens_seen": 4411760, + "step": 634 + }, + { + "epoch": 4.086886564762671, + "grad_norm": 1.6843276023864746, + "learning_rate": 4.522542485937369e-06, + "loss": 0.0082, + "num_input_tokens_seen": 4418896, + "step": 635 + }, + { + "epoch": 4.093322606596943, + "grad_norm": 2.301496744155884, + "learning_rate": 4.495838265931754e-06, + "loss": 0.0101, + "num_input_tokens_seen": 4425776, + "step": 636 + }, + { + "epoch": 4.099758648431215, + "grad_norm": 1.434444546699524, + "learning_rate": 4.4684908588200305e-06, + "loss": 0.0112, + "num_input_tokens_seen": 4432656, + "step": 637 + }, + { + "epoch": 4.106194690265487, + "grad_norm": 1.3446779251098633, + "learning_rate": 4.440509077690883e-06, + "loss": 0.0034, + "num_input_tokens_seen": 4439424, + "step": 638 + }, + { + "epoch": 4.112630732099759, + "grad_norm": 0.6733867526054382, + "learning_rate": 4.411901940068997e-06, + "loss": 0.0037, + "num_input_tokens_seen": 4446160, + "step": 639 + }, + { + "epoch": 4.119066773934031, + "grad_norm": 1.339034080505371, + "learning_rate": 4.382678665009028e-06, + "loss": 0.0085, + "num_input_tokens_seen": 4453376, + "step": 640 + }, + { + "epoch": 4.125502815768303, + "grad_norm": 3.2036638259887695, + "learning_rate": 4.352848670124637e-06, + "loss": 0.0328, + "num_input_tokens_seen": 4459952, + "step": 641 + }, + { + "epoch": 4.131938857602575, + "grad_norm": 1.1791878938674927, + "learning_rate": 4.322421568553529e-06, + "loss": 0.0098, + "num_input_tokens_seen": 4466880, + "step": 642 + }, + { + "epoch": 4.1383748994368466, + "grad_norm": 1.8526674509048462, + "learning_rate": 4.291407165859481e-06, + "loss": 0.0051, + "num_input_tokens_seen": 4474064, + "step": 643 + }, + { + "epoch": 4.1448109412711185, + "grad_norm": 0.4795032739639282, + "learning_rate": 4.259815456872363e-06, + "loss": 0.0047, + "num_input_tokens_seen": 4480864, + "step": 644 + }, + { + "epoch": 4.15124698310539, + "grad_norm": 1.4392155408859253, + "learning_rate": 4.227656622467162e-06, + "loss": 0.0111, + "num_input_tokens_seen": 4487504, + "step": 645 + }, + { + "epoch": 4.157683024939662, + "grad_norm": 3.185128688812256, + "learning_rate": 4.194941026283053e-06, + "loss": 0.0334, + "num_input_tokens_seen": 4494512, + "step": 646 + }, + { + "epoch": 4.164119066773934, + "grad_norm": 1.7285927534103394, + "learning_rate": 4.161679211383565e-06, + "loss": 0.013, + "num_input_tokens_seen": 4501296, + "step": 647 + }, + { + "epoch": 4.170555108608206, + "grad_norm": 4.266958713531494, + "learning_rate": 4.127881896858934e-06, + "loss": 0.0305, + "num_input_tokens_seen": 4508128, + "step": 648 + }, + { + "epoch": 4.176991150442478, + "grad_norm": 1.000532627105713, + "learning_rate": 4.093559974371725e-06, + "loss": 0.0092, + "num_input_tokens_seen": 4515008, + "step": 649 + }, + { + "epoch": 4.18342719227675, + "grad_norm": 1.1824270486831665, + "learning_rate": 4.058724504646834e-06, + "loss": 0.0223, + "num_input_tokens_seen": 4521920, + "step": 650 + }, + { + "epoch": 4.189863234111022, + "grad_norm": 2.444427728652954, + "learning_rate": 4.023386713907021e-06, + "loss": 0.0234, + "num_input_tokens_seen": 4528912, + "step": 651 + }, + { + "epoch": 4.196299275945294, + "grad_norm": 1.421184778213501, + "learning_rate": 3.987557990255093e-06, + "loss": 0.0185, + "num_input_tokens_seen": 4535664, + "step": 652 + }, + { + "epoch": 4.202735317779566, + "grad_norm": 0.9019869565963745, + "learning_rate": 3.951249880003934e-06, + "loss": 0.0075, + "num_input_tokens_seen": 4542832, + "step": 653 + }, + { + "epoch": 4.209171359613838, + "grad_norm": 1.7373372316360474, + "learning_rate": 3.914474083955537e-06, + "loss": 0.0217, + "num_input_tokens_seen": 4549552, + "step": 654 + }, + { + "epoch": 4.21560740144811, + "grad_norm": 0.31386592984199524, + "learning_rate": 3.8772424536302565e-06, + "loss": 0.0027, + "num_input_tokens_seen": 4556192, + "step": 655 + }, + { + "epoch": 4.222043443282382, + "grad_norm": 1.8379613161087036, + "learning_rate": 3.839566987447492e-06, + "loss": 0.0153, + "num_input_tokens_seen": 4563168, + "step": 656 + }, + { + "epoch": 4.228479485116654, + "grad_norm": 1.221056342124939, + "learning_rate": 3.801459826859022e-06, + "loss": 0.0092, + "num_input_tokens_seen": 4570704, + "step": 657 + }, + { + "epoch": 4.2349155269509255, + "grad_norm": 0.7823006510734558, + "learning_rate": 3.7629332524362532e-06, + "loss": 0.0082, + "num_input_tokens_seen": 4578016, + "step": 658 + }, + { + "epoch": 4.2413515687851975, + "grad_norm": 1.149715781211853, + "learning_rate": 3.7239996799126315e-06, + "loss": 0.0163, + "num_input_tokens_seen": 4584896, + "step": 659 + }, + { + "epoch": 4.247787610619469, + "grad_norm": 0.6069539189338684, + "learning_rate": 3.684671656182497e-06, + "loss": 0.0099, + "num_input_tokens_seen": 4591984, + "step": 660 + }, + { + "epoch": 4.254223652453741, + "grad_norm": 2.427281141281128, + "learning_rate": 3.644961855257669e-06, + "loss": 0.0269, + "num_input_tokens_seen": 4598656, + "step": 661 + }, + { + "epoch": 4.260659694288013, + "grad_norm": 1.0770633220672607, + "learning_rate": 3.6048830741830678e-06, + "loss": 0.007, + "num_input_tokens_seen": 4606032, + "step": 662 + }, + { + "epoch": 4.267095736122285, + "grad_norm": 2.4310688972473145, + "learning_rate": 3.564448228912682e-06, + "loss": 0.0427, + "num_input_tokens_seen": 4613056, + "step": 663 + }, + { + "epoch": 4.273531777956556, + "grad_norm": 1.2328161001205444, + "learning_rate": 3.523670350147227e-06, + "loss": 0.0122, + "num_input_tokens_seen": 4619776, + "step": 664 + }, + { + "epoch": 4.279967819790828, + "grad_norm": 1.519998550415039, + "learning_rate": 3.4825625791348093e-06, + "loss": 0.0137, + "num_input_tokens_seen": 4626240, + "step": 665 + }, + { + "epoch": 4.2864038616251, + "grad_norm": 1.4114880561828613, + "learning_rate": 3.44113816343598e-06, + "loss": 0.02, + "num_input_tokens_seen": 4633216, + "step": 666 + }, + { + "epoch": 4.292839903459372, + "grad_norm": 1.4585809707641602, + "learning_rate": 3.399410452654518e-06, + "loss": 0.006, + "num_input_tokens_seen": 4639856, + "step": 667 + }, + { + "epoch": 4.299275945293644, + "grad_norm": 1.594936490058899, + "learning_rate": 3.357392894135329e-06, + "loss": 0.0085, + "num_input_tokens_seen": 4646832, + "step": 668 + }, + { + "epoch": 4.305711987127916, + "grad_norm": 2.5802690982818604, + "learning_rate": 3.315099028630855e-06, + "loss": 0.0112, + "num_input_tokens_seen": 4653648, + "step": 669 + }, + { + "epoch": 4.312148028962188, + "grad_norm": 1.3826483488082886, + "learning_rate": 3.272542485937369e-06, + "loss": 0.0131, + "num_input_tokens_seen": 4660672, + "step": 670 + }, + { + "epoch": 4.31858407079646, + "grad_norm": 2.1874148845672607, + "learning_rate": 3.229736980502584e-06, + "loss": 0.0124, + "num_input_tokens_seen": 4667888, + "step": 671 + }, + { + "epoch": 4.325020112630732, + "grad_norm": 1.61604642868042, + "learning_rate": 3.186696307005976e-06, + "loss": 0.0042, + "num_input_tokens_seen": 4675072, + "step": 672 + }, + { + "epoch": 4.331456154465004, + "grad_norm": 0.40999871492385864, + "learning_rate": 3.1434343359132565e-06, + "loss": 0.0011, + "num_input_tokens_seen": 4682016, + "step": 673 + }, + { + "epoch": 4.337892196299276, + "grad_norm": 0.1305094212293625, + "learning_rate": 3.099965009006415e-06, + "loss": 0.0008, + "num_input_tokens_seen": 4688912, + "step": 674 + }, + { + "epoch": 4.3443282381335475, + "grad_norm": 1.6623185873031616, + "learning_rate": 3.056302334890786e-06, + "loss": 0.0056, + "num_input_tokens_seen": 4695936, + "step": 675 + }, + { + "epoch": 4.3507642799678194, + "grad_norm": 1.034837007522583, + "learning_rate": 3.0124603844805767e-06, + "loss": 0.0079, + "num_input_tokens_seen": 4703184, + "step": 676 + }, + { + "epoch": 4.357200321802091, + "grad_norm": 2.2049107551574707, + "learning_rate": 2.9684532864643123e-06, + "loss": 0.0216, + "num_input_tokens_seen": 4710064, + "step": 677 + }, + { + "epoch": 4.363636363636363, + "grad_norm": 4.32258939743042, + "learning_rate": 2.9242952227516726e-06, + "loss": 0.0258, + "num_input_tokens_seen": 4716336, + "step": 678 + }, + { + "epoch": 4.370072405470635, + "grad_norm": 1.0949031114578247, + "learning_rate": 2.8800004239031687e-06, + "loss": 0.0049, + "num_input_tokens_seen": 4723360, + "step": 679 + }, + { + "epoch": 4.376508447304907, + "grad_norm": 1.563004493713379, + "learning_rate": 2.835583164544139e-06, + "loss": 0.0034, + "num_input_tokens_seen": 4730464, + "step": 680 + }, + { + "epoch": 4.382944489139179, + "grad_norm": 2.775270938873291, + "learning_rate": 2.791057758764557e-06, + "loss": 0.0341, + "num_input_tokens_seen": 4737056, + "step": 681 + }, + { + "epoch": 4.389380530973451, + "grad_norm": 3.1517560482025146, + "learning_rate": 2.7464385555061092e-06, + "loss": 0.0074, + "num_input_tokens_seen": 4743936, + "step": 682 + }, + { + "epoch": 4.395816572807723, + "grad_norm": 1.2521913051605225, + "learning_rate": 2.7017399339380435e-06, + "loss": 0.0272, + "num_input_tokens_seen": 4751024, + "step": 683 + }, + { + "epoch": 4.402252614641995, + "grad_norm": 3.4706435203552246, + "learning_rate": 2.6569762988232838e-06, + "loss": 0.0168, + "num_input_tokens_seen": 4758000, + "step": 684 + }, + { + "epoch": 4.408688656476267, + "grad_norm": 0.8021034598350525, + "learning_rate": 2.6121620758762877e-06, + "loss": 0.0047, + "num_input_tokens_seen": 4764816, + "step": 685 + }, + { + "epoch": 4.415124698310539, + "grad_norm": 4.709753036499023, + "learning_rate": 2.5673117071141574e-06, + "loss": 0.0198, + "num_input_tokens_seen": 4772144, + "step": 686 + }, + { + "epoch": 4.421560740144811, + "grad_norm": 0.40973323583602905, + "learning_rate": 2.522439646202495e-06, + "loss": 0.0012, + "num_input_tokens_seen": 4778960, + "step": 687 + }, + { + "epoch": 4.427996781979083, + "grad_norm": 3.179236888885498, + "learning_rate": 2.4775603537975055e-06, + "loss": 0.0256, + "num_input_tokens_seen": 4785952, + "step": 688 + }, + { + "epoch": 4.434432823813355, + "grad_norm": 2.5204341411590576, + "learning_rate": 2.4326882928858435e-06, + "loss": 0.0187, + "num_input_tokens_seen": 4792608, + "step": 689 + }, + { + "epoch": 4.4408688656476265, + "grad_norm": 3.6536998748779297, + "learning_rate": 2.3878379241237136e-06, + "loss": 0.0135, + "num_input_tokens_seen": 4799232, + "step": 690 + }, + { + "epoch": 4.447304907481898, + "grad_norm": 1.0689839124679565, + "learning_rate": 2.3430237011767166e-06, + "loss": 0.0036, + "num_input_tokens_seen": 4806080, + "step": 691 + }, + { + "epoch": 4.45374094931617, + "grad_norm": 2.071629762649536, + "learning_rate": 2.2982600660619574e-06, + "loss": 0.0135, + "num_input_tokens_seen": 4813728, + "step": 692 + }, + { + "epoch": 4.460176991150442, + "grad_norm": 3.4168224334716797, + "learning_rate": 2.253561444493891e-06, + "loss": 0.0046, + "num_input_tokens_seen": 4820608, + "step": 693 + }, + { + "epoch": 4.466613032984714, + "grad_norm": 0.3058677017688751, + "learning_rate": 2.2089422412354434e-06, + "loss": 0.0019, + "num_input_tokens_seen": 4827056, + "step": 694 + }, + { + "epoch": 4.473049074818986, + "grad_norm": 0.4175882935523987, + "learning_rate": 2.1644168354558623e-06, + "loss": 0.0022, + "num_input_tokens_seen": 4834080, + "step": 695 + }, + { + "epoch": 4.479485116653258, + "grad_norm": 0.7226863503456116, + "learning_rate": 2.119999576096832e-06, + "loss": 0.0093, + "num_input_tokens_seen": 4840912, + "step": 696 + }, + { + "epoch": 4.48592115848753, + "grad_norm": 0.1190720871090889, + "learning_rate": 2.0757047772483278e-06, + "loss": 0.0012, + "num_input_tokens_seen": 4848112, + "step": 697 + }, + { + "epoch": 4.492357200321802, + "grad_norm": 1.0061287879943848, + "learning_rate": 2.031546713535688e-06, + "loss": 0.0036, + "num_input_tokens_seen": 4855072, + "step": 698 + }, + { + "epoch": 4.498793242156074, + "grad_norm": 0.9472126364707947, + "learning_rate": 1.987539615519424e-06, + "loss": 0.0071, + "num_input_tokens_seen": 4862064, + "step": 699 + }, + { + "epoch": 4.505229283990346, + "grad_norm": 0.8338857889175415, + "learning_rate": 1.9436976651092143e-06, + "loss": 0.0055, + "num_input_tokens_seen": 4869104, + "step": 700 + }, + { + "epoch": 4.511665325824618, + "grad_norm": 3.2061474323272705, + "learning_rate": 1.9000349909935852e-06, + "loss": 0.0291, + "num_input_tokens_seen": 4876112, + "step": 701 + }, + { + "epoch": 4.51810136765889, + "grad_norm": 3.644125461578369, + "learning_rate": 1.8565656640867448e-06, + "loss": 0.0407, + "num_input_tokens_seen": 4883264, + "step": 702 + }, + { + "epoch": 4.524537409493162, + "grad_norm": 2.2370316982269287, + "learning_rate": 1.813303692994025e-06, + "loss": 0.0245, + "num_input_tokens_seen": 4890192, + "step": 703 + }, + { + "epoch": 4.530973451327434, + "grad_norm": 3.3120510578155518, + "learning_rate": 1.770263019497417e-06, + "loss": 0.0207, + "num_input_tokens_seen": 4897200, + "step": 704 + }, + { + "epoch": 4.5374094931617055, + "grad_norm": 1.256335973739624, + "learning_rate": 1.7274575140626318e-06, + "loss": 0.0269, + "num_input_tokens_seen": 4904016, + "step": 705 + }, + { + "epoch": 4.543845534995977, + "grad_norm": 0.10977872461080551, + "learning_rate": 1.6849009713691456e-06, + "loss": 0.001, + "num_input_tokens_seen": 4910944, + "step": 706 + }, + { + "epoch": 4.550281576830249, + "grad_norm": 1.9825077056884766, + "learning_rate": 1.6426071058646718e-06, + "loss": 0.0205, + "num_input_tokens_seen": 4917424, + "step": 707 + }, + { + "epoch": 4.556717618664521, + "grad_norm": 0.7529383897781372, + "learning_rate": 1.6005895473454836e-06, + "loss": 0.0148, + "num_input_tokens_seen": 4924288, + "step": 708 + }, + { + "epoch": 4.563153660498793, + "grad_norm": 2.29215145111084, + "learning_rate": 1.55886183656402e-06, + "loss": 0.0239, + "num_input_tokens_seen": 4931040, + "step": 709 + }, + { + "epoch": 4.569589702333065, + "grad_norm": 1.639636754989624, + "learning_rate": 1.5174374208651913e-06, + "loss": 0.0165, + "num_input_tokens_seen": 4937968, + "step": 710 + }, + { + "epoch": 4.576025744167337, + "grad_norm": 1.8043317794799805, + "learning_rate": 1.4763296498527744e-06, + "loss": 0.0079, + "num_input_tokens_seen": 4945456, + "step": 711 + }, + { + "epoch": 4.582461786001609, + "grad_norm": 1.8007737398147583, + "learning_rate": 1.4355517710873184e-06, + "loss": 0.0338, + "num_input_tokens_seen": 4952080, + "step": 712 + }, + { + "epoch": 4.588897827835881, + "grad_norm": 0.6810876131057739, + "learning_rate": 1.395116925816934e-06, + "loss": 0.0136, + "num_input_tokens_seen": 4958944, + "step": 713 + }, + { + "epoch": 4.595333869670153, + "grad_norm": 1.0080180168151855, + "learning_rate": 1.3550381447423317e-06, + "loss": 0.0126, + "num_input_tokens_seen": 4966320, + "step": 714 + }, + { + "epoch": 4.601769911504425, + "grad_norm": 1.1210750341415405, + "learning_rate": 1.3153283438175036e-06, + "loss": 0.0174, + "num_input_tokens_seen": 4973344, + "step": 715 + }, + { + "epoch": 4.608205953338697, + "grad_norm": 2.2793147563934326, + "learning_rate": 1.27600032008737e-06, + "loss": 0.0155, + "num_input_tokens_seen": 4980304, + "step": 716 + }, + { + "epoch": 4.614641995172969, + "grad_norm": 2.0746471881866455, + "learning_rate": 1.2370667475637474e-06, + "loss": 0.0349, + "num_input_tokens_seen": 4987616, + "step": 717 + }, + { + "epoch": 4.621078037007241, + "grad_norm": 1.9974377155303955, + "learning_rate": 1.1985401731409793e-06, + "loss": 0.0082, + "num_input_tokens_seen": 4994656, + "step": 718 + }, + { + "epoch": 4.627514078841513, + "grad_norm": 0.9225305914878845, + "learning_rate": 1.160433012552508e-06, + "loss": 0.0204, + "num_input_tokens_seen": 5001776, + "step": 719 + }, + { + "epoch": 4.6339501206757845, + "grad_norm": 0.6030845642089844, + "learning_rate": 1.122757546369744e-06, + "loss": 0.0074, + "num_input_tokens_seen": 5008688, + "step": 720 + }, + { + "epoch": 4.640386162510056, + "grad_norm": 1.1969950199127197, + "learning_rate": 1.085525916044464e-06, + "loss": 0.0154, + "num_input_tokens_seen": 5015680, + "step": 721 + }, + { + "epoch": 4.646822204344328, + "grad_norm": 1.7312675714492798, + "learning_rate": 1.048750119996066e-06, + "loss": 0.0101, + "num_input_tokens_seen": 5022336, + "step": 722 + }, + { + "epoch": 4.6532582461786, + "grad_norm": 0.9403418898582458, + "learning_rate": 1.0124420097449077e-06, + "loss": 0.0107, + "num_input_tokens_seen": 5029184, + "step": 723 + }, + { + "epoch": 4.659694288012872, + "grad_norm": 2.2545931339263916, + "learning_rate": 9.7661328609298e-07, + "loss": 0.0279, + "num_input_tokens_seen": 5036000, + "step": 724 + }, + { + "epoch": 4.666130329847144, + "grad_norm": 0.5637010931968689, + "learning_rate": 9.412754953531664e-07, + "loss": 0.0044, + "num_input_tokens_seen": 5042944, + "step": 725 + }, + { + "epoch": 4.672566371681416, + "grad_norm": 0.24136967957019806, + "learning_rate": 9.064400256282757e-07, + "loss": 0.0021, + "num_input_tokens_seen": 5049840, + "step": 726 + }, + { + "epoch": 4.679002413515688, + "grad_norm": 1.0340116024017334, + "learning_rate": 8.721181031410661e-07, + "loss": 0.0086, + "num_input_tokens_seen": 5057296, + "step": 727 + }, + { + "epoch": 4.68543845534996, + "grad_norm": 0.548861026763916, + "learning_rate": 8.383207886164366e-07, + "loss": 0.005, + "num_input_tokens_seen": 5064560, + "step": 728 + }, + { + "epoch": 4.691874497184232, + "grad_norm": 1.089135766029358, + "learning_rate": 8.050589737169485e-07, + "loss": 0.0096, + "num_input_tokens_seen": 5071472, + "step": 729 + }, + { + "epoch": 4.698310539018504, + "grad_norm": 0.3106631636619568, + "learning_rate": 7.723433775328385e-07, + "loss": 0.0029, + "num_input_tokens_seen": 5078512, + "step": 730 + }, + { + "epoch": 4.704746580852776, + "grad_norm": 1.3499066829681396, + "learning_rate": 7.401845431276378e-07, + "loss": 0.0082, + "num_input_tokens_seen": 5085248, + "step": 731 + }, + { + "epoch": 4.711182622687048, + "grad_norm": 0.30332618951797485, + "learning_rate": 7.085928341405193e-07, + "loss": 0.0033, + "num_input_tokens_seen": 5092160, + "step": 732 + }, + { + "epoch": 4.71761866452132, + "grad_norm": 0.7549375295639038, + "learning_rate": 6.775784314464717e-07, + "loss": 0.0253, + "num_input_tokens_seen": 5099360, + "step": 733 + }, + { + "epoch": 4.7240547063555915, + "grad_norm": 1.567395567893982, + "learning_rate": 6.471513298753634e-07, + "loss": 0.0117, + "num_input_tokens_seen": 5106160, + "step": 734 + }, + { + "epoch": 4.7304907481898635, + "grad_norm": 1.192610502243042, + "learning_rate": 6.17321334990973e-07, + "loss": 0.0052, + "num_input_tokens_seen": 5113264, + "step": 735 + }, + { + "epoch": 4.736926790024135, + "grad_norm": 3.9402077198028564, + "learning_rate": 5.880980599310041e-07, + "loss": 0.0305, + "num_input_tokens_seen": 5120032, + "step": 736 + }, + { + "epoch": 4.743362831858407, + "grad_norm": 0.3623356223106384, + "learning_rate": 5.59490922309118e-07, + "loss": 0.0018, + "num_input_tokens_seen": 5127280, + "step": 737 + }, + { + "epoch": 4.749798873692679, + "grad_norm": 0.815592885017395, + "learning_rate": 5.3150914117997e-07, + "loss": 0.0066, + "num_input_tokens_seen": 5134400, + "step": 738 + }, + { + "epoch": 4.756234915526951, + "grad_norm": 0.4423564076423645, + "learning_rate": 5.041617340682467e-07, + "loss": 0.0032, + "num_input_tokens_seen": 5141488, + "step": 739 + }, + { + "epoch": 4.762670957361223, + "grad_norm": 0.5768114924430847, + "learning_rate": 4.774575140626317e-07, + "loss": 0.0089, + "num_input_tokens_seen": 5148432, + "step": 740 + }, + { + "epoch": 4.769106999195495, + "grad_norm": 1.2286343574523926, + "learning_rate": 4.514050869756703e-07, + "loss": 0.0124, + "num_input_tokens_seen": 5155328, + "step": 741 + }, + { + "epoch": 4.775543041029767, + "grad_norm": 0.552872359752655, + "learning_rate": 4.2601284857042263e-07, + "loss": 0.0022, + "num_input_tokens_seen": 5163008, + "step": 742 + }, + { + "epoch": 4.781979082864039, + "grad_norm": 0.6165493726730347, + "learning_rate": 4.012889818548069e-07, + "loss": 0.0063, + "num_input_tokens_seen": 5170096, + "step": 743 + }, + { + "epoch": 4.788415124698311, + "grad_norm": 1.1403653621673584, + "learning_rate": 3.772414544445163e-07, + "loss": 0.0149, + "num_input_tokens_seen": 5177536, + "step": 744 + }, + { + "epoch": 4.794851166532583, + "grad_norm": 0.1795167326927185, + "learning_rate": 3.538780159953348e-07, + "loss": 0.0012, + "num_input_tokens_seen": 5184608, + "step": 745 + }, + { + "epoch": 4.801287208366855, + "grad_norm": 0.9326004981994629, + "learning_rate": 3.312061957057061e-07, + "loss": 0.0127, + "num_input_tokens_seen": 5191344, + "step": 746 + }, + { + "epoch": 4.807723250201127, + "grad_norm": 0.41363996267318726, + "learning_rate": 3.092332998903416e-07, + "loss": 0.0018, + "num_input_tokens_seen": 5198416, + "step": 747 + }, + { + "epoch": 4.814159292035399, + "grad_norm": 0.538027286529541, + "learning_rate": 2.8796640962565374e-07, + "loss": 0.0034, + "num_input_tokens_seen": 5205392, + "step": 748 + }, + { + "epoch": 4.8205953338696705, + "grad_norm": 1.531555414199829, + "learning_rate": 2.674123784677868e-07, + "loss": 0.0137, + "num_input_tokens_seen": 5213216, + "step": 749 + }, + { + "epoch": 4.8270313757039425, + "grad_norm": 1.671035647392273, + "learning_rate": 2.4757783024395244e-07, + "loss": 0.0219, + "num_input_tokens_seen": 5220032, + "step": 750 + }, + { + "epoch": 4.833467417538214, + "grad_norm": 0.30722492933273315, + "learning_rate": 2.284691569178138e-07, + "loss": 0.0014, + "num_input_tokens_seen": 5226816, + "step": 751 + }, + { + "epoch": 4.839903459372486, + "grad_norm": 1.3107943534851074, + "learning_rate": 2.100925165295839e-07, + "loss": 0.019, + "num_input_tokens_seen": 5233920, + "step": 752 + }, + { + "epoch": 4.846339501206758, + "grad_norm": 2.1163885593414307, + "learning_rate": 1.9245383121150678e-07, + "loss": 0.0075, + "num_input_tokens_seen": 5241344, + "step": 753 + }, + { + "epoch": 4.85277554304103, + "grad_norm": 1.2636387348175049, + "learning_rate": 1.7555878527937164e-07, + "loss": 0.0078, + "num_input_tokens_seen": 5248256, + "step": 754 + }, + { + "epoch": 4.859211584875302, + "grad_norm": 4.166254997253418, + "learning_rate": 1.59412823400657e-07, + "loss": 0.0244, + "num_input_tokens_seen": 5255248, + "step": 755 + }, + { + "epoch": 4.865647626709574, + "grad_norm": 1.078273892402649, + "learning_rate": 1.4402114883991318e-07, + "loss": 0.0218, + "num_input_tokens_seen": 5262048, + "step": 756 + }, + { + "epoch": 4.872083668543846, + "grad_norm": 2.091312885284424, + "learning_rate": 1.2938872178193395e-07, + "loss": 0.0044, + "num_input_tokens_seen": 5268848, + "step": 757 + }, + { + "epoch": 4.878519710378118, + "grad_norm": 1.7236751317977905, + "learning_rate": 1.1552025773327008e-07, + "loss": 0.0122, + "num_input_tokens_seen": 5275664, + "step": 758 + }, + { + "epoch": 4.88495575221239, + "grad_norm": 0.9874201416969299, + "learning_rate": 1.0242022600258611e-07, + "loss": 0.007, + "num_input_tokens_seen": 5282112, + "step": 759 + }, + { + "epoch": 4.891391794046662, + "grad_norm": 0.6303602457046509, + "learning_rate": 9.00928482603669e-08, + "loss": 0.0019, + "num_input_tokens_seen": 5288912, + "step": 760 + }, + { + "epoch": 4.897827835880933, + "grad_norm": 0.7971038818359375, + "learning_rate": 7.854209717842231e-08, + "loss": 0.0147, + "num_input_tokens_seen": 5295920, + "step": 761 + }, + { + "epoch": 4.904263877715205, + "grad_norm": 1.0757670402526855, + "learning_rate": 6.777169514963766e-08, + "loss": 0.0087, + "num_input_tokens_seen": 5302816, + "step": 762 + }, + { + "epoch": 4.910699919549477, + "grad_norm": 1.8044992685317993, + "learning_rate": 5.778511308838108e-08, + "loss": 0.0085, + "num_input_tokens_seen": 5309680, + "step": 763 + }, + { + "epoch": 4.917135961383749, + "grad_norm": 0.3801545202732086, + "learning_rate": 4.8585569311949966e-08, + "loss": 0.0026, + "num_input_tokens_seen": 5316848, + "step": 764 + }, + { + "epoch": 4.923572003218021, + "grad_norm": 0.20918627083301544, + "learning_rate": 4.017602850342584e-08, + "loss": 0.0018, + "num_input_tokens_seen": 5323760, + "step": 765 + }, + { + "epoch": 4.9300080450522925, + "grad_norm": 2.037950277328491, + "learning_rate": 3.2559200756260845e-08, + "loss": 0.0072, + "num_input_tokens_seen": 5330336, + "step": 766 + }, + { + "epoch": 4.936444086886564, + "grad_norm": 0.8903030753135681, + "learning_rate": 2.5737540700912777e-08, + "loss": 0.0079, + "num_input_tokens_seen": 5336816, + "step": 767 + }, + { + "epoch": 4.942880128720836, + "grad_norm": 1.0508862733840942, + "learning_rate": 1.9713246713805588e-08, + "loss": 0.0275, + "num_input_tokens_seen": 5344064, + "step": 768 + }, + { + "epoch": 4.949316170555108, + "grad_norm": 1.0068142414093018, + "learning_rate": 1.4488260208871397e-08, + "loss": 0.0036, + "num_input_tokens_seen": 5351328, + "step": 769 + }, + { + "epoch": 4.95575221238938, + "grad_norm": 1.5033273696899414, + "learning_rate": 1.006426501190233e-08, + "loss": 0.0501, + "num_input_tokens_seen": 5358672, + "step": 770 + }, + { + "epoch": 4.962188254223652, + "grad_norm": 0.667352557182312, + "learning_rate": 6.442686817914878e-09, + "loss": 0.0082, + "num_input_tokens_seen": 5365648, + "step": 771 + }, + { + "epoch": 4.968624296057924, + "grad_norm": 0.9037322998046875, + "learning_rate": 3.6246927316976875e-09, + "loss": 0.0032, + "num_input_tokens_seen": 5372432, + "step": 772 + }, + { + "epoch": 4.975060337892196, + "grad_norm": 0.3071233630180359, + "learning_rate": 1.6111908916965902e-09, + "loss": 0.0017, + "num_input_tokens_seen": 5379648, + "step": 773 + }, + { + "epoch": 4.981496379726468, + "grad_norm": 0.7171315550804138, + "learning_rate": 4.0283017735454066e-10, + "loss": 0.0042, + "num_input_tokens_seen": 5386864, + "step": 774 + }, + { + "epoch": 4.98793242156074, + "grad_norm": 2.855295181274414, + "learning_rate": 0.0, + "loss": 0.0176, + "num_input_tokens_seen": 5393616, + "step": 775 + } + ], + "logging_steps": 1, + "max_steps": 775, + "num_input_tokens_seen": 5393616, + "num_train_epochs": 5, + "save_steps": 1000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 2.1382484588285133e+17, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}