{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 108, "global_step": 431, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 0.44969096779823303, "learning_rate": 1e-05, "loss": 1.2567, "step": 1 }, { "epoch": 0.0, "eval_loss": 1.3469510078430176, "eval_runtime": 4.7982, "eval_samples_per_second": 20.841, "eval_steps_per_second": 20.841, "step": 1 }, { "epoch": 0.0, "grad_norm": 0.4098110496997833, "learning_rate": 2e-05, "loss": 1.3328, "step": 2 }, { "epoch": 0.01, "grad_norm": 0.5452544093132019, "learning_rate": 3e-05, "loss": 1.6567, "step": 3 }, { "epoch": 0.01, "grad_norm": 0.7305347323417664, "learning_rate": 4e-05, "loss": 1.5499, "step": 4 }, { "epoch": 0.01, "grad_norm": 0.34722596406936646, "learning_rate": 5e-05, "loss": 1.4343, "step": 5 }, { "epoch": 0.01, "grad_norm": 0.5768171548843384, "learning_rate": 6e-05, "loss": 1.2677, "step": 6 }, { "epoch": 0.02, "grad_norm": 0.49281951785087585, "learning_rate": 7e-05, "loss": 1.473, "step": 7 }, { "epoch": 0.02, "grad_norm": 0.42547014355659485, "learning_rate": 8e-05, "loss": 1.406, "step": 8 }, { "epoch": 0.02, "grad_norm": 0.40852880477905273, "learning_rate": 9e-05, "loss": 1.2842, "step": 9 }, { "epoch": 0.02, "grad_norm": 0.36900222301483154, "learning_rate": 0.0001, "loss": 1.1442, "step": 10 }, { "epoch": 0.03, "grad_norm": 0.40908315777778625, "learning_rate": 0.00011000000000000002, "loss": 1.4667, "step": 11 }, { "epoch": 0.03, "grad_norm": 0.4117198884487152, "learning_rate": 0.00012, "loss": 0.9759, "step": 12 }, { "epoch": 0.03, "grad_norm": 0.6714757084846497, "learning_rate": 0.00013000000000000002, "loss": 1.5162, "step": 13 }, { "epoch": 0.03, "grad_norm": 0.5178409218788147, "learning_rate": 0.00014, "loss": 1.1519, "step": 14 }, { "epoch": 0.03, "grad_norm": 1.1334081888198853, "learning_rate": 0.00015000000000000001, "loss": 1.2767, "step": 15 }, { "epoch": 0.04, "grad_norm": 0.6541109681129456, "learning_rate": 0.00016, "loss": 1.3227, "step": 16 }, { "epoch": 0.04, "grad_norm": 0.5143316984176636, "learning_rate": 0.00017, "loss": 1.2991, "step": 17 }, { "epoch": 0.04, "grad_norm": 0.5004872679710388, "learning_rate": 0.00018, "loss": 1.3201, "step": 18 }, { "epoch": 0.04, "grad_norm": 0.4225166141986847, "learning_rate": 0.00019, "loss": 1.2054, "step": 19 }, { "epoch": 0.05, "grad_norm": 0.8387117385864258, "learning_rate": 0.0002, "loss": 1.2026, "step": 20 }, { "epoch": 0.05, "grad_norm": 0.6876248717308044, "learning_rate": 0.00019999707864731247, "loss": 1.2311, "step": 21 }, { "epoch": 0.05, "grad_norm": 0.6449929475784302, "learning_rate": 0.00019998831475993593, "loss": 1.3162, "step": 22 }, { "epoch": 0.05, "grad_norm": 0.6817747950553894, "learning_rate": 0.00019997370884991842, "loss": 1.5836, "step": 23 }, { "epoch": 0.06, "grad_norm": 0.5388895273208618, "learning_rate": 0.0001999532617706403, "loss": 1.3085, "step": 24 }, { "epoch": 0.06, "grad_norm": 0.48557716608047485, "learning_rate": 0.00019992697471676413, "loss": 1.3159, "step": 25 }, { "epoch": 0.06, "grad_norm": 0.531157910823822, "learning_rate": 0.00019989484922416502, "loss": 1.2498, "step": 26 }, { "epoch": 0.06, "grad_norm": 0.47394225001335144, "learning_rate": 0.0001998568871698409, "loss": 1.1981, "step": 27 }, { "epoch": 0.06, "grad_norm": 0.48895126581192017, "learning_rate": 0.00019981309077180272, "loss": 1.0461, "step": 28 }, { "epoch": 0.07, "grad_norm": 0.4990120828151703, "learning_rate": 0.00019976346258894503, "loss": 1.3558, "step": 29 }, { "epoch": 0.07, "grad_norm": 0.6515373587608337, "learning_rate": 0.00019970800552089623, "loss": 1.2566, "step": 30 }, { "epoch": 0.07, "grad_norm": 0.4564642012119293, "learning_rate": 0.00019964672280784954, "loss": 1.1579, "step": 31 }, { "epoch": 0.07, "grad_norm": 0.6385357975959778, "learning_rate": 0.00019957961803037326, "loss": 1.1824, "step": 32 }, { "epoch": 0.08, "grad_norm": 0.5059694051742554, "learning_rate": 0.00019950669510920184, "loss": 1.0092, "step": 33 }, { "epoch": 0.08, "grad_norm": 0.49390971660614014, "learning_rate": 0.0001994279583050067, "loss": 1.1982, "step": 34 }, { "epoch": 0.08, "grad_norm": 0.6610875129699707, "learning_rate": 0.00019934341221814739, "loss": 1.5833, "step": 35 }, { "epoch": 0.08, "grad_norm": 0.5280133485794067, "learning_rate": 0.0001992530617884026, "loss": 0.9696, "step": 36 }, { "epoch": 0.09, "grad_norm": 0.49730122089385986, "learning_rate": 0.00019915691229468178, "loss": 1.1866, "step": 37 }, { "epoch": 0.09, "grad_norm": 0.8253684043884277, "learning_rate": 0.00019905496935471658, "loss": 1.7239, "step": 38 }, { "epoch": 0.09, "grad_norm": 0.4826942980289459, "learning_rate": 0.0001989472389247326, "loss": 1.5085, "step": 39 }, { "epoch": 0.09, "grad_norm": 0.6226486563682556, "learning_rate": 0.00019883372729910152, "loss": 1.1311, "step": 40 }, { "epoch": 0.1, "grad_norm": 0.4536150097846985, "learning_rate": 0.0001987144411099731, "loss": 1.243, "step": 41 }, { "epoch": 0.1, "grad_norm": 0.48488786816596985, "learning_rate": 0.000198589387326888, "loss": 1.2221, "step": 42 }, { "epoch": 0.1, "grad_norm": 0.4422035813331604, "learning_rate": 0.00019845857325637031, "loss": 1.5296, "step": 43 }, { "epoch": 0.1, "grad_norm": 0.47116392850875854, "learning_rate": 0.00019832200654150076, "loss": 1.1176, "step": 44 }, { "epoch": 0.1, "grad_norm": 0.4207026958465576, "learning_rate": 0.0001981796951614701, "loss": 1.0939, "step": 45 }, { "epoch": 0.11, "grad_norm": 0.5403062105178833, "learning_rate": 0.00019803164743111302, "loss": 1.2371, "step": 46 }, { "epoch": 0.11, "grad_norm": 0.6009058952331543, "learning_rate": 0.00019787787200042223, "loss": 1.3423, "step": 47 }, { "epoch": 0.11, "grad_norm": 0.421722412109375, "learning_rate": 0.00019771837785404305, "loss": 0.9633, "step": 48 }, { "epoch": 0.11, "grad_norm": 0.5040818452835083, "learning_rate": 0.00019755317431074859, "loss": 1.3888, "step": 49 }, { "epoch": 0.12, "grad_norm": 0.5417558550834656, "learning_rate": 0.0001973822710228951, "loss": 1.0181, "step": 50 }, { "epoch": 0.12, "grad_norm": 0.5744611024856567, "learning_rate": 0.00019720567797585817, "loss": 1.3067, "step": 51 }, { "epoch": 0.12, "grad_norm": 0.4889126121997833, "learning_rate": 0.0001970234054874493, "loss": 1.1691, "step": 52 }, { "epoch": 0.12, "grad_norm": 0.6123921275138855, "learning_rate": 0.0001968354642073129, "loss": 1.2479, "step": 53 }, { "epoch": 0.13, "grad_norm": 0.5839653611183167, "learning_rate": 0.00019664186511630433, "loss": 0.8839, "step": 54 }, { "epoch": 0.13, "grad_norm": 0.40772297978401184, "learning_rate": 0.000196442619525848, "loss": 1.2291, "step": 55 }, { "epoch": 0.13, "grad_norm": 0.5853259563446045, "learning_rate": 0.00019623773907727682, "loss": 1.4134, "step": 56 }, { "epoch": 0.13, "grad_norm": 0.6176974177360535, "learning_rate": 0.0001960272357411517, "loss": 1.3261, "step": 57 }, { "epoch": 0.13, "grad_norm": 0.4556988477706909, "learning_rate": 0.0001958111218165624, "loss": 1.1442, "step": 58 }, { "epoch": 0.14, "grad_norm": 0.46846720576286316, "learning_rate": 0.00019558940993040885, "loss": 1.0652, "step": 59 }, { "epoch": 0.14, "grad_norm": 0.4391572177410126, "learning_rate": 0.00019536211303666323, "loss": 1.3619, "step": 60 }, { "epoch": 0.14, "grad_norm": 0.4686979651451111, "learning_rate": 0.00019512924441561348, "loss": 0.5938, "step": 61 }, { "epoch": 0.14, "grad_norm": 0.430164635181427, "learning_rate": 0.00019489081767308698, "loss": 1.2571, "step": 62 }, { "epoch": 0.15, "grad_norm": 0.6731412410736084, "learning_rate": 0.00019464684673965583, "loss": 1.3784, "step": 63 }, { "epoch": 0.15, "grad_norm": 0.39849698543548584, "learning_rate": 0.0001943973458698229, "loss": 0.9475, "step": 64 }, { "epoch": 0.15, "grad_norm": 0.5062661170959473, "learning_rate": 0.00019414232964118892, "loss": 1.1145, "step": 65 }, { "epoch": 0.15, "grad_norm": 0.5099257230758667, "learning_rate": 0.00019388181295360078, "loss": 1.0426, "step": 66 }, { "epoch": 0.16, "grad_norm": 0.5760392546653748, "learning_rate": 0.00019361581102828095, "loss": 1.057, "step": 67 }, { "epoch": 0.16, "grad_norm": 0.41587546467781067, "learning_rate": 0.0001933443394069383, "loss": 1.181, "step": 68 }, { "epoch": 0.16, "grad_norm": 0.47386714816093445, "learning_rate": 0.00019306741395085976, "loss": 1.1613, "step": 69 }, { "epoch": 0.16, "grad_norm": 0.39666488766670227, "learning_rate": 0.0001927850508399839, "loss": 1.1075, "step": 70 }, { "epoch": 0.16, "grad_norm": 0.4820801019668579, "learning_rate": 0.00019249726657195532, "loss": 1.3065, "step": 71 }, { "epoch": 0.17, "grad_norm": 0.4606281518936157, "learning_rate": 0.00019220407796116098, "loss": 1.3073, "step": 72 }, { "epoch": 0.17, "grad_norm": 0.7897779941558838, "learning_rate": 0.00019190550213774756, "loss": 1.162, "step": 73 }, { "epoch": 0.17, "grad_norm": 0.42179742455482483, "learning_rate": 0.00019160155654662076, "loss": 0.7054, "step": 74 }, { "epoch": 0.17, "grad_norm": 0.5278844237327576, "learning_rate": 0.00019129225894642593, "loss": 1.0787, "step": 75 }, { "epoch": 0.18, "grad_norm": 0.42007845640182495, "learning_rate": 0.00019097762740851061, "loss": 1.0423, "step": 76 }, { "epoch": 0.18, "grad_norm": 0.5332473516464233, "learning_rate": 0.0001906576803158686, "loss": 1.1466, "step": 77 }, { "epoch": 0.18, "grad_norm": 0.5041698217391968, "learning_rate": 0.0001903324363620659, "loss": 1.0691, "step": 78 }, { "epoch": 0.18, "grad_norm": 0.2976062595844269, "learning_rate": 0.0001900019145501484, "loss": 0.6296, "step": 79 }, { "epoch": 0.19, "grad_norm": 0.4532124400138855, "learning_rate": 0.0001896661341915318, "loss": 1.4112, "step": 80 }, { "epoch": 0.19, "grad_norm": 0.4600175619125366, "learning_rate": 0.0001893251149048732, "loss": 1.0353, "step": 81 }, { "epoch": 0.19, "grad_norm": 0.5047218203544617, "learning_rate": 0.00018897887661492474, "loss": 1.1204, "step": 82 }, { "epoch": 0.19, "grad_norm": 0.5466210246086121, "learning_rate": 0.00018862743955136966, "loss": 1.4353, "step": 83 }, { "epoch": 0.19, "grad_norm": 0.3947051465511322, "learning_rate": 0.0001882708242476401, "loss": 1.0523, "step": 84 }, { "epoch": 0.2, "grad_norm": 0.4432850778102875, "learning_rate": 0.00018790905153971758, "loss": 0.8657, "step": 85 }, { "epoch": 0.2, "grad_norm": 0.4575612246990204, "learning_rate": 0.00018754214256491562, "loss": 1.0524, "step": 86 }, { "epoch": 0.2, "grad_norm": 0.3713025748729706, "learning_rate": 0.00018717011876064453, "loss": 1.0439, "step": 87 }, { "epoch": 0.2, "grad_norm": 0.4547780156135559, "learning_rate": 0.0001867930018631592, "loss": 1.01, "step": 88 }, { "epoch": 0.21, "grad_norm": 0.421790212392807, "learning_rate": 0.00018641081390628877, "loss": 1.1949, "step": 89 }, { "epoch": 0.21, "grad_norm": 0.4341322183609009, "learning_rate": 0.00018602357722014964, "loss": 1.2932, "step": 90 }, { "epoch": 0.21, "grad_norm": 0.3753775358200073, "learning_rate": 0.00018563131442984044, "loss": 1.0415, "step": 91 }, { "epoch": 0.21, "grad_norm": 0.5391212105751038, "learning_rate": 0.00018523404845412027, "loss": 1.1474, "step": 92 }, { "epoch": 0.22, "grad_norm": 0.7697898149490356, "learning_rate": 0.0001848318025040697, "loss": 1.3765, "step": 93 }, { "epoch": 0.22, "grad_norm": 0.39486926794052124, "learning_rate": 0.00018442460008173445, "loss": 1.0153, "step": 94 }, { "epoch": 0.22, "grad_norm": 0.4511030912399292, "learning_rate": 0.0001840124649787524, "loss": 1.0658, "step": 95 }, { "epoch": 0.22, "grad_norm": 0.5483732223510742, "learning_rate": 0.0001835954212749632, "loss": 1.1011, "step": 96 }, { "epoch": 0.23, "grad_norm": 0.3995431959629059, "learning_rate": 0.0001831734933370019, "loss": 0.8759, "step": 97 }, { "epoch": 0.23, "grad_norm": 0.4208924472332001, "learning_rate": 0.0001827467058168748, "loss": 0.8876, "step": 98 }, { "epoch": 0.23, "grad_norm": 0.568627119064331, "learning_rate": 0.00018231508365051922, "loss": 1.313, "step": 99 }, { "epoch": 0.23, "grad_norm": 0.4205207824707031, "learning_rate": 0.0001818786520563467, "loss": 1.3252, "step": 100 }, { "epoch": 0.23, "grad_norm": 0.5135630369186401, "learning_rate": 0.00018143743653376942, "loss": 1.093, "step": 101 }, { "epoch": 0.24, "grad_norm": 0.558722198009491, "learning_rate": 0.0001809914628617105, "loss": 1.5598, "step": 102 }, { "epoch": 0.24, "grad_norm": 0.364212304353714, "learning_rate": 0.00018054075709709756, "loss": 1.1574, "step": 103 }, { "epoch": 0.24, "grad_norm": 0.6438787579536438, "learning_rate": 0.00018008534557334064, "loss": 1.2912, "step": 104 }, { "epoch": 0.24, "grad_norm": 0.4466971755027771, "learning_rate": 0.00017962525489879325, "loss": 1.2194, "step": 105 }, { "epoch": 0.25, "grad_norm": 0.5050408244132996, "learning_rate": 0.00017916051195519797, "loss": 0.9488, "step": 106 }, { "epoch": 0.25, "grad_norm": 0.4414922893047333, "learning_rate": 0.00017869114389611575, "loss": 0.9977, "step": 107 }, { "epoch": 0.25, "grad_norm": 0.4354766011238098, "learning_rate": 0.0001782171781453394, "loss": 1.1738, "step": 108 }, { "epoch": 0.25, "eval_loss": 1.1372472047805786, "eval_runtime": 4.9728, "eval_samples_per_second": 20.109, "eval_steps_per_second": 20.109, "step": 108 }, { "epoch": 0.25, "grad_norm": 0.44711774587631226, "learning_rate": 0.00017773864239529132, "loss": 1.0631, "step": 109 }, { "epoch": 0.26, "grad_norm": 0.43031662702560425, "learning_rate": 0.0001772555646054055, "loss": 1.0922, "step": 110 }, { "epoch": 0.26, "grad_norm": 0.4053135812282562, "learning_rate": 0.00017676797300049393, "loss": 1.2369, "step": 111 }, { "epoch": 0.26, "grad_norm": 0.6041821241378784, "learning_rate": 0.00017627589606909755, "loss": 1.2172, "step": 112 }, { "epoch": 0.26, "grad_norm": 0.2745780050754547, "learning_rate": 0.00017577936256182167, "loss": 0.4539, "step": 113 }, { "epoch": 0.26, "grad_norm": 0.4951040744781494, "learning_rate": 0.0001752784014896562, "loss": 1.3143, "step": 114 }, { "epoch": 0.27, "grad_norm": 0.38725313544273376, "learning_rate": 0.00017477304212228057, "loss": 1.0367, "step": 115 }, { "epoch": 0.27, "grad_norm": 0.4349636435508728, "learning_rate": 0.0001742633139863538, "loss": 1.1372, "step": 116 }, { "epoch": 0.27, "grad_norm": 0.5229642391204834, "learning_rate": 0.00017374924686378905, "loss": 1.2274, "step": 117 }, { "epoch": 0.27, "grad_norm": 0.5884748101234436, "learning_rate": 0.0001732308707900137, "loss": 1.1546, "step": 118 }, { "epoch": 0.28, "grad_norm": 0.4116392433643341, "learning_rate": 0.0001727082160522145, "loss": 1.1144, "step": 119 }, { "epoch": 0.28, "grad_norm": 0.4359859228134155, "learning_rate": 0.0001721813131875679, "loss": 1.0402, "step": 120 }, { "epoch": 0.28, "grad_norm": 0.4674035906791687, "learning_rate": 0.00017165019298145585, "loss": 0.7442, "step": 121 }, { "epoch": 0.28, "grad_norm": 0.40642812848091125, "learning_rate": 0.00017111488646566727, "loss": 1.1118, "step": 122 }, { "epoch": 0.29, "grad_norm": 0.4694182872772217, "learning_rate": 0.00017057542491658468, "loss": 1.3227, "step": 123 }, { "epoch": 0.29, "grad_norm": 0.4917917251586914, "learning_rate": 0.000170031839853357, "loss": 1.3724, "step": 124 }, { "epoch": 0.29, "grad_norm": 0.47446152567863464, "learning_rate": 0.00016948416303605795, "loss": 1.406, "step": 125 }, { "epoch": 0.29, "grad_norm": 0.36553582549095154, "learning_rate": 0.0001689324264638304, "loss": 0.8186, "step": 126 }, { "epoch": 0.29, "grad_norm": 0.3034989535808563, "learning_rate": 0.00016837666237301663, "loss": 0.5963, "step": 127 }, { "epoch": 0.3, "grad_norm": 0.5044969320297241, "learning_rate": 0.00016781690323527511, "loss": 1.1356, "step": 128 }, { "epoch": 0.3, "grad_norm": 0.449246346950531, "learning_rate": 0.00016725318175568306, "loss": 1.015, "step": 129 }, { "epoch": 0.3, "grad_norm": 0.41430914402008057, "learning_rate": 0.00016668553087082567, "loss": 1.2896, "step": 130 }, { "epoch": 0.3, "grad_norm": 0.4527166783809662, "learning_rate": 0.0001661139837468717, "loss": 1.3077, "step": 131 }, { "epoch": 0.31, "grad_norm": 0.37175431847572327, "learning_rate": 0.00016553857377763566, "loss": 1.0345, "step": 132 }, { "epoch": 0.31, "grad_norm": 0.5722200870513916, "learning_rate": 0.0001649593345826268, "loss": 1.2999, "step": 133 }, { "epoch": 0.31, "grad_norm": 0.4093303978443146, "learning_rate": 0.00016437630000508464, "loss": 0.9219, "step": 134 }, { "epoch": 0.31, "grad_norm": 0.5196316242218018, "learning_rate": 0.00016378950411000183, "loss": 1.2376, "step": 135 }, { "epoch": 0.32, "grad_norm": 0.5437043905258179, "learning_rate": 0.00016319898118213365, "loss": 1.0182, "step": 136 }, { "epoch": 0.32, "grad_norm": 0.5286529660224915, "learning_rate": 0.00016260476572399496, "loss": 1.4231, "step": 137 }, { "epoch": 0.32, "grad_norm": 0.43187054991722107, "learning_rate": 0.00016200689245384424, "loss": 1.0627, "step": 138 }, { "epoch": 0.32, "grad_norm": 0.6503031849861145, "learning_rate": 0.00016140539630365522, "loss": 1.1164, "step": 139 }, { "epoch": 0.32, "grad_norm": 0.3417191505432129, "learning_rate": 0.00016080031241707578, "loss": 0.9242, "step": 140 }, { "epoch": 0.33, "grad_norm": 0.4407976567745209, "learning_rate": 0.0001601916761473747, "loss": 1.1057, "step": 141 }, { "epoch": 0.33, "grad_norm": 0.5504916310310364, "learning_rate": 0.00015957952305537597, "loss": 1.1505, "step": 142 }, { "epoch": 0.33, "grad_norm": 0.44324642419815063, "learning_rate": 0.00015896388890738127, "loss": 1.2143, "step": 143 }, { "epoch": 0.33, "grad_norm": 0.44402655959129333, "learning_rate": 0.00015834480967308003, "loss": 1.0909, "step": 144 }, { "epoch": 0.34, "grad_norm": 0.539313554763794, "learning_rate": 0.00015772232152344795, "loss": 1.2159, "step": 145 }, { "epoch": 0.34, "grad_norm": 0.5022640824317932, "learning_rate": 0.0001570964608286336, "loss": 1.0763, "step": 146 }, { "epoch": 0.34, "grad_norm": 0.4640175402164459, "learning_rate": 0.00015646726415583344, "loss": 0.8441, "step": 147 }, { "epoch": 0.34, "grad_norm": 0.47599032521247864, "learning_rate": 0.0001558347682671553, "loss": 1.4246, "step": 148 }, { "epoch": 0.35, "grad_norm": 0.6268883943557739, "learning_rate": 0.00015519901011747044, "loss": 1.0982, "step": 149 }, { "epoch": 0.35, "grad_norm": 0.5821301937103271, "learning_rate": 0.00015456002685225448, "loss": 1.3526, "step": 150 }, { "epoch": 0.35, "grad_norm": 0.6036801934242249, "learning_rate": 0.00015391785580541698, "loss": 1.3432, "step": 151 }, { "epoch": 0.35, "grad_norm": 0.44567015767097473, "learning_rate": 0.0001532725344971202, "loss": 1.6177, "step": 152 }, { "epoch": 0.35, "grad_norm": 0.5531861782073975, "learning_rate": 0.0001526241006315869, "loss": 1.4035, "step": 153 }, { "epoch": 0.36, "grad_norm": 0.40534013509750366, "learning_rate": 0.00015197259209489747, "loss": 1.4051, "step": 154 }, { "epoch": 0.36, "grad_norm": 0.49140480160713196, "learning_rate": 0.00015131804695277612, "loss": 1.1617, "step": 155 }, { "epoch": 0.36, "grad_norm": 0.5026464462280273, "learning_rate": 0.00015066050344836706, "loss": 1.188, "step": 156 }, { "epoch": 0.36, "grad_norm": 0.454348623752594, "learning_rate": 0.00015000000000000001, "loss": 1.2269, "step": 157 }, { "epoch": 0.37, "grad_norm": 0.3233983516693115, "learning_rate": 0.0001493365751989454, "loss": 0.839, "step": 158 }, { "epoch": 0.37, "grad_norm": 0.35600027441978455, "learning_rate": 0.0001486702678071598, "loss": 0.8139, "step": 159 }, { "epoch": 0.37, "grad_norm": 0.3790452480316162, "learning_rate": 0.00014800111675502094, "loss": 0.9154, "step": 160 }, { "epoch": 0.37, "grad_norm": 0.3982885777950287, "learning_rate": 0.00014732916113905335, "loss": 1.1825, "step": 161 }, { "epoch": 0.38, "grad_norm": 0.5747079253196716, "learning_rate": 0.0001466544402196439, "loss": 1.0599, "step": 162 }, { "epoch": 0.38, "grad_norm": 0.412662148475647, "learning_rate": 0.00014597699341874806, "loss": 0.7421, "step": 163 }, { "epoch": 0.38, "grad_norm": 0.4285506308078766, "learning_rate": 0.00014529686031758643, "loss": 0.8738, "step": 164 }, { "epoch": 0.38, "grad_norm": 0.5278518199920654, "learning_rate": 0.00014461408065433227, "loss": 1.3702, "step": 165 }, { "epoch": 0.39, "grad_norm": 0.3949754536151886, "learning_rate": 0.00014392869432178971, "loss": 1.005, "step": 166 }, { "epoch": 0.39, "grad_norm": 0.39798328280448914, "learning_rate": 0.00014324074136506284, "loss": 1.1153, "step": 167 }, { "epoch": 0.39, "grad_norm": 0.6939488053321838, "learning_rate": 0.00014255026197921596, "loss": 1.0646, "step": 168 }, { "epoch": 0.39, "grad_norm": 0.45957663655281067, "learning_rate": 0.00014185729650692533, "loss": 1.2686, "step": 169 }, { "epoch": 0.39, "grad_norm": 0.4837738871574402, "learning_rate": 0.0001411618854361218, "loss": 1.3417, "step": 170 }, { "epoch": 0.4, "grad_norm": 0.6829615831375122, "learning_rate": 0.00014046406939762545, "loss": 1.1174, "step": 171 }, { "epoch": 0.4, "grad_norm": 0.5104063153266907, "learning_rate": 0.0001397638891627714, "loss": 1.2003, "step": 172 }, { "epoch": 0.4, "grad_norm": 0.6368303894996643, "learning_rate": 0.00013906138564102793, "loss": 1.2944, "step": 173 }, { "epoch": 0.4, "grad_norm": 0.5132100582122803, "learning_rate": 0.00013835659987760605, "loss": 1.3055, "step": 174 }, { "epoch": 0.41, "grad_norm": 0.4685085415840149, "learning_rate": 0.0001376495730510614, "loss": 1.3442, "step": 175 }, { "epoch": 0.41, "grad_norm": 0.5321069955825806, "learning_rate": 0.0001369403464708884, "loss": 1.2254, "step": 176 }, { "epoch": 0.41, "grad_norm": 0.5504405498504639, "learning_rate": 0.00013622896157510658, "loss": 1.306, "step": 177 }, { "epoch": 0.41, "grad_norm": 0.4058278203010559, "learning_rate": 0.00013551545992783947, "loss": 0.8795, "step": 178 }, { "epoch": 0.42, "grad_norm": 0.4485948979854584, "learning_rate": 0.0001347998832168862, "loss": 1.0097, "step": 179 }, { "epoch": 0.42, "grad_norm": 0.5311691164970398, "learning_rate": 0.0001340822732512857, "loss": 1.4547, "step": 180 }, { "epoch": 0.42, "grad_norm": 0.4577678442001343, "learning_rate": 0.00013336267195887398, "loss": 1.1929, "step": 181 }, { "epoch": 0.42, "grad_norm": 0.45450282096862793, "learning_rate": 0.00013264112138383445, "loss": 0.9774, "step": 182 }, { "epoch": 0.42, "grad_norm": 0.39516186714172363, "learning_rate": 0.00013191766368424133, "loss": 1.1093, "step": 183 }, { "epoch": 0.43, "grad_norm": 0.5446043014526367, "learning_rate": 0.00013119234112959655, "loss": 0.9583, "step": 184 }, { "epoch": 0.43, "grad_norm": 0.4918731451034546, "learning_rate": 0.00013046519609836, "loss": 1.35, "step": 185 }, { "epoch": 0.43, "grad_norm": 0.37299469113349915, "learning_rate": 0.00012973627107547346, "loss": 1.2582, "step": 186 }, { "epoch": 0.43, "grad_norm": 0.5013623237609863, "learning_rate": 0.0001290056086498785, "loss": 1.2556, "step": 187 }, { "epoch": 0.44, "grad_norm": 0.4400407373905182, "learning_rate": 0.00012827325151202782, "loss": 1.2137, "step": 188 }, { "epoch": 0.44, "grad_norm": 0.46342340111732483, "learning_rate": 0.00012753924245139135, "loss": 1.1706, "step": 189 }, { "epoch": 0.44, "grad_norm": 0.417937308549881, "learning_rate": 0.00012680362435395595, "loss": 1.1111, "step": 190 }, { "epoch": 0.44, "grad_norm": 0.9197636246681213, "learning_rate": 0.00012606644019971968, "loss": 1.4709, "step": 191 }, { "epoch": 0.45, "grad_norm": 0.4040437936782837, "learning_rate": 0.00012532773306018076, "loss": 1.1701, "step": 192 }, { "epoch": 0.45, "grad_norm": 0.5382991433143616, "learning_rate": 0.00012458754609582097, "loss": 0.9994, "step": 193 }, { "epoch": 0.45, "grad_norm": 0.40422263741493225, "learning_rate": 0.00012384592255358385, "loss": 1.0439, "step": 194 }, { "epoch": 0.45, "grad_norm": 0.5063545107841492, "learning_rate": 0.00012310290576434795, "loss": 1.4452, "step": 195 }, { "epoch": 0.45, "grad_norm": 0.37706631422042847, "learning_rate": 0.00012235853914039515, "loss": 0.9206, "step": 196 }, { "epoch": 0.46, "grad_norm": 0.4789075553417206, "learning_rate": 0.00012161286617287419, "loss": 0.9392, "step": 197 }, { "epoch": 0.46, "grad_norm": 0.43901360034942627, "learning_rate": 0.00012086593042925964, "loss": 1.228, "step": 198 }, { "epoch": 0.46, "grad_norm": 0.34066489338874817, "learning_rate": 0.00012011777555080638, "loss": 0.7, "step": 199 }, { "epoch": 0.46, "grad_norm": 0.5998871326446533, "learning_rate": 0.00011936844524999966, "loss": 1.3413, "step": 200 }, { "epoch": 0.47, "grad_norm": 0.43001553416252136, "learning_rate": 0.00011861798330800125, "loss": 1.0357, "step": 201 }, { "epoch": 0.47, "grad_norm": 0.5246945023536682, "learning_rate": 0.00011786643357209136, "loss": 0.9392, "step": 202 }, { "epoch": 0.47, "grad_norm": 0.614080548286438, "learning_rate": 0.00011711383995310681, "loss": 1.1147, "step": 203 }, { "epoch": 0.47, "grad_norm": 0.45178118348121643, "learning_rate": 0.00011636024642287546, "loss": 1.1284, "step": 204 }, { "epoch": 0.48, "grad_norm": 0.4508957266807556, "learning_rate": 0.00011560569701164697, "loss": 1.3169, "step": 205 }, { "epoch": 0.48, "grad_norm": 1.084119439125061, "learning_rate": 0.00011485023580552039, "loss": 1.4725, "step": 206 }, { "epoch": 0.48, "grad_norm": 0.417682409286499, "learning_rate": 0.00011409390694386817, "loss": 1.1748, "step": 207 }, { "epoch": 0.48, "grad_norm": 0.4242180585861206, "learning_rate": 0.00011333675461675739, "loss": 1.0729, "step": 208 }, { "epoch": 0.48, "grad_norm": 0.4738271236419678, "learning_rate": 0.00011257882306236775, "loss": 1.2974, "step": 209 }, { "epoch": 0.49, "grad_norm": 0.3936362862586975, "learning_rate": 0.00011182015656440692, "loss": 1.0202, "step": 210 }, { "epoch": 0.49, "grad_norm": 0.4622405767440796, "learning_rate": 0.00011106079944952317, "loss": 1.2899, "step": 211 }, { "epoch": 0.49, "grad_norm": 0.5150789618492126, "learning_rate": 0.00011030079608471544, "loss": 0.8933, "step": 212 }, { "epoch": 0.49, "grad_norm": 0.5435701608657837, "learning_rate": 0.00010954019087474124, "loss": 1.4871, "step": 213 }, { "epoch": 0.5, "grad_norm": 0.42681682109832764, "learning_rate": 0.00010877902825952197, "loss": 1.0943, "step": 214 }, { "epoch": 0.5, "grad_norm": 0.46130529046058655, "learning_rate": 0.00010801735271154669, "loss": 1.1492, "step": 215 }, { "epoch": 0.5, "grad_norm": 0.502712607383728, "learning_rate": 0.00010725520873327361, "loss": 1.1175, "step": 216 }, { "epoch": 0.5, "eval_loss": 1.1232960224151611, "eval_runtime": 4.7032, "eval_samples_per_second": 21.262, "eval_steps_per_second": 21.262, "step": 216 }, { "epoch": 0.5, "grad_norm": 0.4459720849990845, "learning_rate": 0.00010649264085452988, "loss": 0.9861, "step": 217 }, { "epoch": 0.51, "grad_norm": 0.5748346447944641, "learning_rate": 0.00010572969362990998, "loss": 1.2841, "step": 218 }, { "epoch": 0.51, "grad_norm": 0.5053660273551941, "learning_rate": 0.0001049664116361724, "loss": 1.1955, "step": 219 }, { "epoch": 0.51, "grad_norm": 0.5145143270492554, "learning_rate": 0.0001042028394696352, "loss": 1.0322, "step": 220 }, { "epoch": 0.51, "grad_norm": 0.552711009979248, "learning_rate": 0.00010343902174357039, "loss": 1.1417, "step": 221 }, { "epoch": 0.52, "grad_norm": 0.4026980400085449, "learning_rate": 0.00010267500308559732, "loss": 1.1334, "step": 222 }, { "epoch": 0.52, "grad_norm": 0.5193754434585571, "learning_rate": 0.0001019108281350752, "loss": 1.0735, "step": 223 }, { "epoch": 0.52, "grad_norm": 0.4189368486404419, "learning_rate": 0.0001011465415404949, "loss": 1.2567, "step": 224 }, { "epoch": 0.52, "grad_norm": 0.45475542545318604, "learning_rate": 0.0001003821879568704, "loss": 0.9511, "step": 225 }, { "epoch": 0.52, "grad_norm": 0.46022626757621765, "learning_rate": 9.96178120431296e-05, "loss": 1.1802, "step": 226 }, { "epoch": 0.53, "grad_norm": 0.4458233416080475, "learning_rate": 9.88534584595051e-05, "loss": 1.0332, "step": 227 }, { "epoch": 0.53, "grad_norm": 0.3980939984321594, "learning_rate": 9.80891718649248e-05, "loss": 0.9659, "step": 228 }, { "epoch": 0.53, "grad_norm": 0.4233875274658203, "learning_rate": 9.732499691440266e-05, "loss": 1.3747, "step": 229 }, { "epoch": 0.53, "grad_norm": 0.4073673486709595, "learning_rate": 9.656097825642961e-05, "loss": 1.2297, "step": 230 }, { "epoch": 0.54, "grad_norm": 0.39706096053123474, "learning_rate": 9.579716053036479e-05, "loss": 0.973, "step": 231 }, { "epoch": 0.54, "grad_norm": 0.4343310296535492, "learning_rate": 9.503358836382761e-05, "loss": 1.238, "step": 232 }, { "epoch": 0.54, "grad_norm": 0.4828486144542694, "learning_rate": 9.427030637009003e-05, "loss": 1.1616, "step": 233 }, { "epoch": 0.54, "grad_norm": 0.56890869140625, "learning_rate": 9.35073591454701e-05, "loss": 0.8345, "step": 234 }, { "epoch": 0.55, "grad_norm": 0.4366269111633301, "learning_rate": 9.274479126672641e-05, "loss": 1.111, "step": 235 }, { "epoch": 0.55, "grad_norm": 0.4700612723827362, "learning_rate": 9.198264728845332e-05, "loss": 1.3553, "step": 236 }, { "epoch": 0.55, "grad_norm": 0.40685421228408813, "learning_rate": 9.122097174047805e-05, "loss": 1.1515, "step": 237 }, { "epoch": 0.55, "grad_norm": 0.37611886858940125, "learning_rate": 9.045980912525879e-05, "loss": 0.9226, "step": 238 }, { "epoch": 0.55, "grad_norm": 0.40409284830093384, "learning_rate": 8.969920391528458e-05, "loss": 0.8641, "step": 239 }, { "epoch": 0.56, "grad_norm": 0.39749234914779663, "learning_rate": 8.893920055047686e-05, "loss": 1.0498, "step": 240 }, { "epoch": 0.56, "grad_norm": 0.554320752620697, "learning_rate": 8.81798434355931e-05, "loss": 1.1806, "step": 241 }, { "epoch": 0.56, "grad_norm": 0.37640953063964844, "learning_rate": 8.742117693763227e-05, "loss": 1.03, "step": 242 }, { "epoch": 0.56, "grad_norm": 0.4188061058521271, "learning_rate": 8.666324538324264e-05, "loss": 1.0891, "step": 243 }, { "epoch": 0.57, "grad_norm": 0.40569260716438293, "learning_rate": 8.590609305613184e-05, "loss": 1.2237, "step": 244 }, { "epoch": 0.57, "grad_norm": 0.35996559262275696, "learning_rate": 8.514976419447964e-05, "loss": 1.1086, "step": 245 }, { "epoch": 0.57, "grad_norm": 0.3959590792655945, "learning_rate": 8.439430298835304e-05, "loss": 1.2445, "step": 246 }, { "epoch": 0.57, "grad_norm": 0.41735970973968506, "learning_rate": 8.363975357712457e-05, "loss": 1.2318, "step": 247 }, { "epoch": 0.58, "grad_norm": 0.40587231516838074, "learning_rate": 8.28861600468932e-05, "loss": 0.8108, "step": 248 }, { "epoch": 0.58, "grad_norm": 0.4381011128425598, "learning_rate": 8.213356642790867e-05, "loss": 0.8791, "step": 249 }, { "epoch": 0.58, "grad_norm": 0.455411821603775, "learning_rate": 8.138201669199879e-05, "loss": 1.1067, "step": 250 }, { "epoch": 0.58, "grad_norm": 0.486929714679718, "learning_rate": 8.063155475000037e-05, "loss": 1.0332, "step": 251 }, { "epoch": 0.58, "grad_norm": 0.3872472941875458, "learning_rate": 7.988222444919364e-05, "loss": 0.9779, "step": 252 }, { "epoch": 0.59, "grad_norm": 0.5021295547485352, "learning_rate": 7.913406957074037e-05, "loss": 1.2159, "step": 253 }, { "epoch": 0.59, "grad_norm": 0.3245134949684143, "learning_rate": 7.838713382712583e-05, "loss": 0.6035, "step": 254 }, { "epoch": 0.59, "grad_norm": 0.39470720291137695, "learning_rate": 7.76414608596049e-05, "loss": 1.0058, "step": 255 }, { "epoch": 0.59, "grad_norm": 0.36348241567611694, "learning_rate": 7.68970942356521e-05, "loss": 0.6888, "step": 256 }, { "epoch": 0.6, "grad_norm": 0.6455038189888, "learning_rate": 7.615407744641619e-05, "loss": 1.0407, "step": 257 }, { "epoch": 0.6, "grad_norm": 0.5500055551528931, "learning_rate": 7.541245390417906e-05, "loss": 1.1612, "step": 258 }, { "epoch": 0.6, "grad_norm": 0.3761384189128876, "learning_rate": 7.467226693981925e-05, "loss": 0.9731, "step": 259 }, { "epoch": 0.6, "grad_norm": 0.6317098736763, "learning_rate": 7.393355980028039e-05, "loss": 1.3037, "step": 260 }, { "epoch": 0.61, "grad_norm": 0.4043276607990265, "learning_rate": 7.319637564604412e-05, "loss": 1.323, "step": 261 }, { "epoch": 0.61, "grad_norm": 0.46140003204345703, "learning_rate": 7.246075754860868e-05, "loss": 0.8617, "step": 262 }, { "epoch": 0.61, "grad_norm": 0.4819968044757843, "learning_rate": 7.172674848797219e-05, "loss": 1.1345, "step": 263 }, { "epoch": 0.61, "grad_norm": 0.46252205967903137, "learning_rate": 7.099439135012153e-05, "loss": 1.4092, "step": 264 }, { "epoch": 0.61, "grad_norm": 0.5231882333755493, "learning_rate": 7.026372892452653e-05, "loss": 1.1937, "step": 265 }, { "epoch": 0.62, "grad_norm": 0.3481261432170868, "learning_rate": 6.953480390164e-05, "loss": 0.7882, "step": 266 }, { "epoch": 0.62, "grad_norm": 0.8361053466796875, "learning_rate": 6.880765887040343e-05, "loss": 1.4242, "step": 267 }, { "epoch": 0.62, "grad_norm": 0.3501080572605133, "learning_rate": 6.808233631575867e-05, "loss": 0.7428, "step": 268 }, { "epoch": 0.62, "grad_norm": 0.46989908814430237, "learning_rate": 6.735887861616556e-05, "loss": 1.2293, "step": 269 }, { "epoch": 0.63, "grad_norm": 0.44108134508132935, "learning_rate": 6.663732804112603e-05, "loss": 1.1302, "step": 270 }, { "epoch": 0.63, "grad_norm": 0.484967440366745, "learning_rate": 6.591772674871434e-05, "loss": 1.2551, "step": 271 }, { "epoch": 0.63, "grad_norm": 0.4039287865161896, "learning_rate": 6.520011678311382e-05, "loss": 1.2437, "step": 272 }, { "epoch": 0.63, "grad_norm": 0.5447697639465332, "learning_rate": 6.448454007216054e-05, "loss": 1.1141, "step": 273 }, { "epoch": 0.64, "grad_norm": 0.5546038150787354, "learning_rate": 6.377103842489343e-05, "loss": 0.8614, "step": 274 }, { "epoch": 0.64, "grad_norm": 0.4008563756942749, "learning_rate": 6.305965352911161e-05, "loss": 0.9837, "step": 275 }, { "epoch": 0.64, "grad_norm": 0.4528581202030182, "learning_rate": 6.235042694893862e-05, "loss": 1.3003, "step": 276 }, { "epoch": 0.64, "grad_norm": 0.5693272352218628, "learning_rate": 6.164340012239396e-05, "loss": 1.2459, "step": 277 }, { "epoch": 0.65, "grad_norm": 0.44974976778030396, "learning_rate": 6.093861435897208e-05, "loss": 1.3594, "step": 278 }, { "epoch": 0.65, "grad_norm": 0.42815354466438293, "learning_rate": 6.02361108372286e-05, "loss": 1.1973, "step": 279 }, { "epoch": 0.65, "grad_norm": 0.4000280797481537, "learning_rate": 5.953593060237457e-05, "loss": 1.1897, "step": 280 }, { "epoch": 0.65, "grad_norm": 0.4146158993244171, "learning_rate": 5.883811456387821e-05, "loss": 0.9858, "step": 281 }, { "epoch": 0.65, "grad_norm": 0.43319183588027954, "learning_rate": 5.8142703493074714e-05, "loss": 0.9849, "step": 282 }, { "epoch": 0.66, "grad_norm": 0.4292287528514862, "learning_rate": 5.7449738020784085e-05, "loss": 1.2853, "step": 283 }, { "epoch": 0.66, "grad_norm": 0.4380340576171875, "learning_rate": 5.675925863493721e-05, "loss": 1.1053, "step": 284 }, { "epoch": 0.66, "grad_norm": 0.47855687141418457, "learning_rate": 5.607130567821031e-05, "loss": 1.147, "step": 285 }, { "epoch": 0.66, "grad_norm": 0.47737643122673035, "learning_rate": 5.5385919345667715e-05, "loss": 1.2292, "step": 286 }, { "epoch": 0.67, "grad_norm": 0.532492995262146, "learning_rate": 5.4703139682413586e-05, "loss": 1.2948, "step": 287 }, { "epoch": 0.67, "grad_norm": 0.47192513942718506, "learning_rate": 5.402300658125197e-05, "loss": 1.0005, "step": 288 }, { "epoch": 0.67, "grad_norm": 0.3757496476173401, "learning_rate": 5.334555978035609e-05, "loss": 0.904, "step": 289 }, { "epoch": 0.67, "grad_norm": 0.5040110349655151, "learning_rate": 5.267083886094668e-05, "loss": 1.4786, "step": 290 }, { "epoch": 0.68, "grad_norm": 0.4644010663032532, "learning_rate": 5.199888324497907e-05, "loss": 1.0901, "step": 291 }, { "epoch": 0.68, "grad_norm": 0.4540674686431885, "learning_rate": 5.132973219284023e-05, "loss": 1.1761, "step": 292 }, { "epoch": 0.68, "grad_norm": 0.4109453558921814, "learning_rate": 5.0663424801054595e-05, "loss": 1.1454, "step": 293 }, { "epoch": 0.68, "grad_norm": 0.46687519550323486, "learning_rate": 5.000000000000002e-05, "loss": 1.0732, "step": 294 }, { "epoch": 0.68, "grad_norm": 0.5146364569664001, "learning_rate": 4.9339496551632944e-05, "loss": 1.45, "step": 295 }, { "epoch": 0.69, "grad_norm": 0.4732467532157898, "learning_rate": 4.8681953047223914e-05, "loss": 0.8685, "step": 296 }, { "epoch": 0.69, "grad_norm": 0.5239700675010681, "learning_rate": 4.8027407905102585e-05, "loss": 1.1003, "step": 297 }, { "epoch": 0.69, "grad_norm": 0.404216468334198, "learning_rate": 4.73758993684131e-05, "loss": 0.8534, "step": 298 }, { "epoch": 0.69, "grad_norm": 0.3570495545864105, "learning_rate": 4.672746550287985e-05, "loss": 0.9244, "step": 299 }, { "epoch": 0.7, "grad_norm": 0.535670816898346, "learning_rate": 4.6082144194583056e-05, "loss": 1.4359, "step": 300 }, { "epoch": 0.7, "grad_norm": 0.4560551047325134, "learning_rate": 4.543997314774553e-05, "loss": 1.0442, "step": 301 }, { "epoch": 0.7, "grad_norm": 0.4430754482746124, "learning_rate": 4.4800989882529574e-05, "loss": 1.1429, "step": 302 }, { "epoch": 0.7, "grad_norm": 0.48371657729148865, "learning_rate": 4.41652317328447e-05, "loss": 1.21, "step": 303 }, { "epoch": 0.71, "grad_norm": 0.462596595287323, "learning_rate": 4.3532735844166574e-05, "loss": 1.0128, "step": 304 }, { "epoch": 0.71, "grad_norm": 0.43516165018081665, "learning_rate": 4.2903539171366393e-05, "loss": 1.1847, "step": 305 }, { "epoch": 0.71, "grad_norm": 0.805388867855072, "learning_rate": 4.227767847655205e-05, "loss": 1.2468, "step": 306 }, { "epoch": 0.71, "grad_norm": 0.4382609724998474, "learning_rate": 4.165519032691998e-05, "loss": 1.2178, "step": 307 }, { "epoch": 0.71, "grad_norm": 0.3396044969558716, "learning_rate": 4.1036111092618725e-05, "loss": 0.9241, "step": 308 }, { "epoch": 0.72, "grad_norm": 0.4570239782333374, "learning_rate": 4.042047694462404e-05, "loss": 1.0592, "step": 309 }, { "epoch": 0.72, "grad_norm": 0.49074164032936096, "learning_rate": 3.9808323852625316e-05, "loss": 1.1067, "step": 310 }, { "epoch": 0.72, "grad_norm": 0.5517829656600952, "learning_rate": 3.919968758292425e-05, "loss": 1.3361, "step": 311 }, { "epoch": 0.72, "grad_norm": 0.4893704950809479, "learning_rate": 3.859460369634479e-05, "loss": 1.1157, "step": 312 }, { "epoch": 0.73, "grad_norm": 0.4687519967556, "learning_rate": 3.799310754615578e-05, "loss": 1.3605, "step": 313 }, { "epoch": 0.73, "grad_norm": 0.35046279430389404, "learning_rate": 3.7395234276005087e-05, "loss": 0.8558, "step": 314 }, { "epoch": 0.73, "grad_norm": 0.5013456344604492, "learning_rate": 3.6801018817866375e-05, "loss": 1.0554, "step": 315 }, { "epoch": 0.73, "grad_norm": 0.4760110378265381, "learning_rate": 3.62104958899982e-05, "loss": 1.1839, "step": 316 }, { "epoch": 0.74, "grad_norm": 0.5143394470214844, "learning_rate": 3.562369999491536e-05, "loss": 1.4146, "step": 317 }, { "epoch": 0.74, "grad_norm": 0.4014797508716583, "learning_rate": 3.504066541737323e-05, "loss": 1.0731, "step": 318 }, { "epoch": 0.74, "grad_norm": 0.40402811765670776, "learning_rate": 3.4461426222364336e-05, "loss": 1.0001, "step": 319 }, { "epoch": 0.74, "grad_norm": 0.43316271901130676, "learning_rate": 3.3886016253128326e-05, "loss": 1.1141, "step": 320 }, { "epoch": 0.74, "grad_norm": 0.49771907925605774, "learning_rate": 3.3314469129174364e-05, "loss": 1.3224, "step": 321 }, { "epoch": 0.75, "grad_norm": 0.4895000159740448, "learning_rate": 3.2746818244316956e-05, "loss": 1.04, "step": 322 }, { "epoch": 0.75, "grad_norm": 0.5450279116630554, "learning_rate": 3.2183096764724915e-05, "loss": 1.4016, "step": 323 }, { "epoch": 0.75, "grad_norm": 0.4866601526737213, "learning_rate": 3.16233376269834e-05, "loss": 1.4102, "step": 324 }, { "epoch": 0.75, "eval_loss": 1.1097332239151, "eval_runtime": 5.825, "eval_samples_per_second": 17.167, "eval_steps_per_second": 17.167, "step": 324 }, { "epoch": 0.75, "grad_norm": 0.40248391032218933, "learning_rate": 3.106757353616966e-05, "loss": 0.8404, "step": 325 }, { "epoch": 0.76, "grad_norm": 0.44263288378715515, "learning_rate": 3.0515836963942056e-05, "loss": 1.2992, "step": 326 }, { "epoch": 0.76, "grad_norm": 0.4113296866416931, "learning_rate": 2.9968160146643022e-05, "loss": 1.1058, "step": 327 }, { "epoch": 0.76, "grad_norm": 0.5293351411819458, "learning_rate": 2.9424575083415362e-05, "loss": 1.5505, "step": 328 }, { "epoch": 0.76, "grad_norm": 0.500862181186676, "learning_rate": 2.888511353433274e-05, "loss": 1.1173, "step": 329 }, { "epoch": 0.77, "grad_norm": 0.408054918050766, "learning_rate": 2.8349807018544174e-05, "loss": 1.1951, "step": 330 }, { "epoch": 0.77, "grad_norm": 0.39368724822998047, "learning_rate": 2.7818686812432136e-05, "loss": 1.1842, "step": 331 }, { "epoch": 0.77, "grad_norm": 0.5162099599838257, "learning_rate": 2.7291783947785543e-05, "loss": 1.5018, "step": 332 }, { "epoch": 0.77, "grad_norm": 0.49148476123809814, "learning_rate": 2.6769129209986322e-05, "loss": 1.4618, "step": 333 }, { "epoch": 0.77, "grad_norm": 0.4833587408065796, "learning_rate": 2.6250753136210983e-05, "loss": 0.8573, "step": 334 }, { "epoch": 0.78, "grad_norm": 0.40724459290504456, "learning_rate": 2.5736686013646228e-05, "loss": 1.1546, "step": 335 }, { "epoch": 0.78, "grad_norm": 0.683252215385437, "learning_rate": 2.5226957877719436e-05, "loss": 1.1321, "step": 336 }, { "epoch": 0.78, "grad_norm": 0.6411616802215576, "learning_rate": 2.4721598510343858e-05, "loss": 1.3481, "step": 337 }, { "epoch": 0.78, "grad_norm": 0.4726378917694092, "learning_rate": 2.4220637438178317e-05, "loss": 1.0497, "step": 338 }, { "epoch": 0.79, "grad_norm": 0.6027311682701111, "learning_rate": 2.372410393090243e-05, "loss": 1.1339, "step": 339 }, { "epoch": 0.79, "grad_norm": 0.4566133916378021, "learning_rate": 2.3232026999506062e-05, "loss": 1.2384, "step": 340 }, { "epoch": 0.79, "grad_norm": 0.4425651431083679, "learning_rate": 2.2744435394594497e-05, "loss": 1.2542, "step": 341 }, { "epoch": 0.79, "grad_norm": 0.3810960352420807, "learning_rate": 2.22613576047087e-05, "loss": 0.9794, "step": 342 }, { "epoch": 0.8, "grad_norm": 0.441679447889328, "learning_rate": 2.1782821854660606e-05, "loss": 1.0967, "step": 343 }, { "epoch": 0.8, "grad_norm": 0.5362725257873535, "learning_rate": 2.130885610388428e-05, "loss": 1.3365, "step": 344 }, { "epoch": 0.8, "grad_norm": 0.5472970604896545, "learning_rate": 2.0839488044802036e-05, "loss": 1.7679, "step": 345 }, { "epoch": 0.8, "grad_norm": 0.4027257561683655, "learning_rate": 2.037474510120676e-05, "loss": 1.1387, "step": 346 }, { "epoch": 0.81, "grad_norm": 0.5073655843734741, "learning_rate": 1.9914654426659374e-05, "loss": 1.0684, "step": 347 }, { "epoch": 0.81, "grad_norm": 0.4431632161140442, "learning_rate": 1.945924290290242e-05, "loss": 1.1118, "step": 348 }, { "epoch": 0.81, "grad_norm": 0.7360476851463318, "learning_rate": 1.9008537138289527e-05, "loss": 1.5517, "step": 349 }, { "epoch": 0.81, "grad_norm": 0.43251582980155945, "learning_rate": 1.8562563466230576e-05, "loss": 1.1982, "step": 350 }, { "epoch": 0.81, "grad_norm": 0.4459226429462433, "learning_rate": 1.8121347943653332e-05, "loss": 1.3095, "step": 351 }, { "epoch": 0.82, "grad_norm": 0.34792765974998474, "learning_rate": 1.7684916349480794e-05, "loss": 0.9415, "step": 352 }, { "epoch": 0.82, "grad_norm": 0.5468625426292419, "learning_rate": 1.7253294183125223e-05, "loss": 1.1621, "step": 353 }, { "epoch": 0.82, "grad_norm": 0.5048443675041199, "learning_rate": 1.6826506662998097e-05, "loss": 1.3502, "step": 354 }, { "epoch": 0.82, "grad_norm": 0.3652397692203522, "learning_rate": 1.64045787250368e-05, "loss": 1.1079, "step": 355 }, { "epoch": 0.83, "grad_norm": 0.45304709672927856, "learning_rate": 1.5987535021247667e-05, "loss": 1.0239, "step": 356 }, { "epoch": 0.83, "grad_norm": 0.38547682762145996, "learning_rate": 1.5575399918265542e-05, "loss": 1.1345, "step": 357 }, { "epoch": 0.83, "grad_norm": 0.47186747193336487, "learning_rate": 1.5168197495930315e-05, "loss": 1.4572, "step": 358 }, { "epoch": 0.83, "grad_norm": 0.4306280314922333, "learning_rate": 1.476595154587973e-05, "loss": 0.9894, "step": 359 }, { "epoch": 0.84, "grad_norm": 0.49191519618034363, "learning_rate": 1.436868557015959e-05, "loss": 1.3018, "step": 360 }, { "epoch": 0.84, "grad_norm": 0.548028826713562, "learning_rate": 1.3976422779850384e-05, "loss": 1.3219, "step": 361 }, { "epoch": 0.84, "grad_norm": 0.39407745003700256, "learning_rate": 1.3589186093711226e-05, "loss": 1.002, "step": 362 }, { "epoch": 0.84, "grad_norm": 0.47022268176078796, "learning_rate": 1.3206998136840831e-05, "loss": 1.2112, "step": 363 }, { "epoch": 0.84, "grad_norm": 0.7266699075698853, "learning_rate": 1.2829881239355468e-05, "loss": 1.3021, "step": 364 }, { "epoch": 0.85, "grad_norm": 0.6192635297775269, "learning_rate": 1.2457857435084408e-05, "loss": 2.7741, "step": 365 }, { "epoch": 0.85, "grad_norm": 0.39360511302948, "learning_rate": 1.2090948460282414e-05, "loss": 1.1131, "step": 366 }, { "epoch": 0.85, "grad_norm": 0.6498689651489258, "learning_rate": 1.1729175752359922e-05, "loss": 1.3666, "step": 367 }, { "epoch": 0.85, "grad_norm": 0.40655967593193054, "learning_rate": 1.1372560448630376e-05, "loss": 1.1841, "step": 368 }, { "epoch": 0.86, "grad_norm": 0.3894653022289276, "learning_rate": 1.102112338507526e-05, "loss": 1.1494, "step": 369 }, { "epoch": 0.86, "grad_norm": 0.5716227889060974, "learning_rate": 1.067488509512683e-05, "loss": 0.9971, "step": 370 }, { "epoch": 0.86, "grad_norm": 0.4679757356643677, "learning_rate": 1.0333865808468202e-05, "loss": 1.0505, "step": 371 }, { "epoch": 0.86, "grad_norm": 0.5550671219825745, "learning_rate": 9.998085449851635e-06, "loss": 1.0006, "step": 372 }, { "epoch": 0.87, "grad_norm": 0.4179516136646271, "learning_rate": 9.667563637934129e-06, "loss": 0.8996, "step": 373 }, { "epoch": 0.87, "grad_norm": 0.4545114040374756, "learning_rate": 9.342319684131395e-06, "loss": 1.1175, "step": 374 }, { "epoch": 0.87, "grad_norm": 0.5175038576126099, "learning_rate": 9.02237259148938e-06, "loss": 1.1729, "step": 375 }, { "epoch": 0.87, "grad_norm": 0.45041874051094055, "learning_rate": 8.70774105357407e-06, "loss": 1.2005, "step": 376 }, { "epoch": 0.87, "grad_norm": 0.47179684042930603, "learning_rate": 8.398443453379267e-06, "loss": 1.1232, "step": 377 }, { "epoch": 0.88, "grad_norm": 0.4453977942466736, "learning_rate": 8.094497862252471e-06, "loss": 1.3498, "step": 378 }, { "epoch": 0.88, "grad_norm": 0.545090913772583, "learning_rate": 7.795922038839032e-06, "loss": 1.3135, "step": 379 }, { "epoch": 0.88, "grad_norm": 0.41150522232055664, "learning_rate": 7.502733428044683e-06, "loss": 1.0358, "step": 380 }, { "epoch": 0.88, "grad_norm": 0.4423467218875885, "learning_rate": 7.214949160016115e-06, "loss": 1.1287, "step": 381 }, { "epoch": 0.89, "grad_norm": 0.4803576171398163, "learning_rate": 6.932586049140255e-06, "loss": 1.1248, "step": 382 }, { "epoch": 0.89, "grad_norm": 0.50335294008255, "learning_rate": 6.655660593061719e-06, "loss": 1.2563, "step": 383 }, { "epoch": 0.89, "grad_norm": 0.4255715310573578, "learning_rate": 6.384188971719052e-06, "loss": 1.1771, "step": 384 }, { "epoch": 0.89, "grad_norm": 0.5371767282485962, "learning_rate": 6.11818704639926e-06, "loss": 1.4661, "step": 385 }, { "epoch": 0.9, "grad_norm": 0.7055171132087708, "learning_rate": 5.857670358811096e-06, "loss": 1.2068, "step": 386 }, { "epoch": 0.9, "grad_norm": 0.6857877969741821, "learning_rate": 5.6026541301771095e-06, "loss": 1.025, "step": 387 }, { "epoch": 0.9, "grad_norm": 0.3706812858581543, "learning_rate": 5.353153260344179e-06, "loss": 0.4216, "step": 388 }, { "epoch": 0.9, "grad_norm": 0.3943740725517273, "learning_rate": 5.109182326913054e-06, "loss": 1.1772, "step": 389 }, { "epoch": 0.9, "grad_norm": 0.44216853380203247, "learning_rate": 4.870755584386544e-06, "loss": 1.1343, "step": 390 }, { "epoch": 0.91, "grad_norm": 0.5114327669143677, "learning_rate": 4.63788696333678e-06, "loss": 1.3096, "step": 391 }, { "epoch": 0.91, "grad_norm": 0.4400082230567932, "learning_rate": 4.410590069591192e-06, "loss": 1.0596, "step": 392 }, { "epoch": 0.91, "grad_norm": 0.5141562223434448, "learning_rate": 4.188878183437594e-06, "loss": 0.976, "step": 393 }, { "epoch": 0.91, "grad_norm": 0.47619250416755676, "learning_rate": 3.972764258848305e-06, "loss": 0.8015, "step": 394 }, { "epoch": 0.92, "grad_norm": 0.44292959570884705, "learning_rate": 3.7622609227231818e-06, "loss": 1.2856, "step": 395 }, { "epoch": 0.92, "grad_norm": 0.5242791175842285, "learning_rate": 3.5573804741519833e-06, "loss": 1.0283, "step": 396 }, { "epoch": 0.92, "grad_norm": 0.41017472743988037, "learning_rate": 3.3581348836956738e-06, "loss": 0.8387, "step": 397 }, { "epoch": 0.92, "grad_norm": 0.44496363401412964, "learning_rate": 3.1645357926870955e-06, "loss": 1.0878, "step": 398 }, { "epoch": 0.93, "grad_norm": 0.5821820497512817, "learning_rate": 2.9765945125507235e-06, "loss": 1.3477, "step": 399 }, { "epoch": 0.93, "grad_norm": 0.5164168477058411, "learning_rate": 2.7943220241418377e-06, "loss": 1.6242, "step": 400 }, { "epoch": 0.93, "grad_norm": 0.5002605319023132, "learning_rate": 2.6177289771049274e-06, "loss": 1.2107, "step": 401 }, { "epoch": 0.93, "grad_norm": 0.44040682911872864, "learning_rate": 2.4468256892514417e-06, "loss": 0.8678, "step": 402 }, { "epoch": 0.94, "grad_norm": 0.35371842980384827, "learning_rate": 2.281622145956952e-06, "loss": 0.7567, "step": 403 }, { "epoch": 0.94, "grad_norm": 0.4473396837711334, "learning_rate": 2.122127999577783e-06, "loss": 1.0541, "step": 404 }, { "epoch": 0.94, "grad_norm": 0.6327199339866638, "learning_rate": 1.9683525688869773e-06, "loss": 1.2364, "step": 405 }, { "epoch": 0.94, "grad_norm": 0.4338397681713104, "learning_rate": 1.8203048385299181e-06, "loss": 1.1542, "step": 406 }, { "epoch": 0.94, "grad_norm": 0.5743327140808105, "learning_rate": 1.6779934584992718e-06, "loss": 1.2498, "step": 407 }, { "epoch": 0.95, "grad_norm": 0.40991920232772827, "learning_rate": 1.5414267436297037e-06, "loss": 1.1019, "step": 408 }, { "epoch": 0.95, "grad_norm": 0.40616923570632935, "learning_rate": 1.4106126731119996e-06, "loss": 1.2456, "step": 409 }, { "epoch": 0.95, "grad_norm": 0.4829423725605011, "learning_rate": 1.2855588900269056e-06, "loss": 1.1966, "step": 410 }, { "epoch": 0.95, "grad_norm": 0.4857659339904785, "learning_rate": 1.1662727008984964e-06, "loss": 1.4712, "step": 411 }, { "epoch": 0.96, "grad_norm": 0.4441464841365814, "learning_rate": 1.0527610752673944e-06, "loss": 1.1358, "step": 412 }, { "epoch": 0.96, "grad_norm": 0.4043671488761902, "learning_rate": 9.450306452834179e-07, "loss": 1.1369, "step": 413 }, { "epoch": 0.96, "grad_norm": 0.5048549771308899, "learning_rate": 8.430877053182129e-07, "loss": 1.1279, "step": 414 }, { "epoch": 0.96, "grad_norm": 0.43177226185798645, "learning_rate": 7.469382115974032e-07, "loss": 1.269, "step": 415 }, { "epoch": 0.97, "grad_norm": 0.45105987787246704, "learning_rate": 6.565877818526245e-07, "loss": 1.6302, "step": 416 }, { "epoch": 0.97, "grad_norm": 0.4008980989456177, "learning_rate": 5.72041694993286e-07, "loss": 0.7278, "step": 417 }, { "epoch": 0.97, "grad_norm": 0.570662796497345, "learning_rate": 4.933048907981741e-07, "loss": 1.2836, "step": 418 }, { "epoch": 0.97, "grad_norm": 0.39943748712539673, "learning_rate": 4.203819696267486e-07, "loss": 1.1154, "step": 419 }, { "epoch": 0.97, "grad_norm": 0.6464762687683105, "learning_rate": 3.532771921504696e-07, "loss": 0.9646, "step": 420 }, { "epoch": 0.98, "grad_norm": 0.46713489294052124, "learning_rate": 2.919944791037632e-07, "loss": 1.1209, "step": 421 }, { "epoch": 0.98, "grad_norm": 0.45911917090415955, "learning_rate": 2.3653741105499338e-07, "loss": 1.2976, "step": 422 }, { "epoch": 0.98, "grad_norm": 0.4376751482486725, "learning_rate": 1.8690922819727398e-07, "loss": 1.2812, "step": 423 }, { "epoch": 0.98, "grad_norm": 0.461121529340744, "learning_rate": 1.4311283015910893e-07, "loss": 1.4073, "step": 424 }, { "epoch": 0.99, "grad_norm": 0.39589831233024597, "learning_rate": 1.0515077583498344e-07, "loss": 1.0048, "step": 425 }, { "epoch": 0.99, "grad_norm": 0.39896348118782043, "learning_rate": 7.302528323589464e-08, "loss": 0.8817, "step": 426 }, { "epoch": 0.99, "grad_norm": 0.43354111909866333, "learning_rate": 4.6738229359732935e-08, "loss": 0.9946, "step": 427 }, { "epoch": 0.99, "grad_norm": 0.3909148871898651, "learning_rate": 2.6291150081603212e-08, "loss": 1.1572, "step": 428 }, { "epoch": 1.0, "grad_norm": 0.5046055912971497, "learning_rate": 1.168524006410765e-08, "loss": 1.3007, "step": 429 }, { "epoch": 1.0, "grad_norm": 0.6245082020759583, "learning_rate": 2.921352687534906e-09, "loss": 0.9887, "step": 430 }, { "epoch": 1.0, "grad_norm": 0.45793771743774414, "learning_rate": 0.0, "loss": 1.1817, "step": 431 } ], "logging_steps": 1, "max_steps": 431, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "total_flos": 7928752835297280.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }