{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9989258861439313, "eval_steps": 100000, "global_step": 465, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.010741138560687433, "grad_norm": 94.5147817778946, "learning_rate": 8.51063829787234e-08, "logits/chosen": -10.583702087402344, "logits/rejected": -10.455877304077148, "logps/chosen": -0.9049979448318481, "logps/rejected": -0.8784100413322449, "loss": 6.1451, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -9.04997730255127, "rewards/margins": -0.2658771872520447, "rewards/rejected": -8.784101486206055, "step": 5 }, { "epoch": 0.021482277121374866, "grad_norm": 128.5515421485228, "learning_rate": 1.702127659574468e-07, "logits/chosen": -10.710015296936035, "logits/rejected": -10.85377311706543, "logps/chosen": -1.0046945810317993, "logps/rejected": -0.8850045204162598, "loss": 5.8491, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -10.04694652557373, "rewards/margins": -1.196901559829712, "rewards/rejected": -8.850046157836914, "step": 10 }, { "epoch": 0.0322234156820623, "grad_norm": 58.802331595913145, "learning_rate": 2.553191489361702e-07, "logits/chosen": -10.312850952148438, "logits/rejected": -10.239133834838867, "logps/chosen": -1.0889472961425781, "logps/rejected": -1.1543949842453003, "loss": 6.2505, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -10.889472961425781, "rewards/margins": 0.6544777154922485, "rewards/rejected": -11.543951034545898, "step": 15 }, { "epoch": 0.04296455424274973, "grad_norm": 151.21348799898024, "learning_rate": 3.404255319148936e-07, "logits/chosen": -9.954164505004883, "logits/rejected": -10.053568840026855, "logps/chosen": -0.9611791372299194, "logps/rejected": -1.1332345008850098, "loss": 5.5619, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -9.611791610717773, "rewards/margins": 1.7205528020858765, "rewards/rejected": -11.332345008850098, "step": 20 }, { "epoch": 0.05370569280343716, "grad_norm": 99.92865956439873, "learning_rate": 4.25531914893617e-07, "logits/chosen": -10.326103210449219, "logits/rejected": -10.055009841918945, "logps/chosen": -0.8260948061943054, "logps/rejected": -1.1549828052520752, "loss": 5.2635, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -8.260948181152344, "rewards/margins": 3.2888808250427246, "rewards/rejected": -11.54982852935791, "step": 25 }, { "epoch": 0.0644468313641246, "grad_norm": 81.6600880763309, "learning_rate": 5.106382978723404e-07, "logits/chosen": -9.319940567016602, "logits/rejected": -9.13192081451416, "logps/chosen": -0.6618553996086121, "logps/rejected": -0.6553267240524292, "loss": 5.0518, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -6.61855411529541, "rewards/margins": -0.06528709828853607, "rewards/rejected": -6.553267002105713, "step": 30 }, { "epoch": 0.07518796992481203, "grad_norm": 92.79131753018717, "learning_rate": 5.957446808510638e-07, "logits/chosen": -8.877812385559082, "logits/rejected": -8.929550170898438, "logps/chosen": -0.717892050743103, "logps/rejected": -0.6935927867889404, "loss": 5.224, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -7.178920745849609, "rewards/margins": -0.24299363791942596, "rewards/rejected": -6.935927391052246, "step": 35 }, { "epoch": 0.08592910848549946, "grad_norm": 89.40050001716304, "learning_rate": 6.808510638297872e-07, "logits/chosen": -7.988096714019775, "logits/rejected": -7.907191276550293, "logps/chosen": -0.7402302622795105, "logps/rejected": -0.7434382438659668, "loss": 4.8885, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -7.4023027420043945, "rewards/margins": 0.032080501317977905, "rewards/rejected": -7.434383392333984, "step": 40 }, { "epoch": 0.0966702470461869, "grad_norm": 59.92644904113339, "learning_rate": 7.659574468085107e-07, "logits/chosen": -8.71805477142334, "logits/rejected": -8.232014656066895, "logps/chosen": -0.5317873954772949, "logps/rejected": -0.6050616502761841, "loss": 4.5879, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -5.317873477935791, "rewards/margins": 0.7327424883842468, "rewards/rejected": -6.0506157875061035, "step": 45 }, { "epoch": 0.10741138560687433, "grad_norm": 60.25452206880678, "learning_rate": 7.998983280184396e-07, "logits/chosen": -8.83049488067627, "logits/rejected": -8.585375785827637, "logps/chosen": -0.5144228339195251, "logps/rejected": -0.5809676647186279, "loss": 4.6549, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -5.144228458404541, "rewards/margins": 0.6654484868049622, "rewards/rejected": -5.8096771240234375, "step": 50 }, { "epoch": 0.11815252416756176, "grad_norm": 46.997605934809734, "learning_rate": 7.992771864078597e-07, "logits/chosen": -8.163946151733398, "logits/rejected": -8.180994033813477, "logps/chosen": -0.5956984758377075, "logps/rejected": -0.7000880837440491, "loss": 4.6606, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -5.956984519958496, "rewards/margins": 1.0438958406448364, "rewards/rejected": -7.000881195068359, "step": 55 }, { "epoch": 0.1288936627282492, "grad_norm": 59.25357465703395, "learning_rate": 7.980922636120897e-07, "logits/chosen": -8.718216896057129, "logits/rejected": -8.35698127746582, "logps/chosen": -0.5706155896186829, "logps/rejected": -0.6969493627548218, "loss": 4.4885, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -5.706155776977539, "rewards/margins": 1.2633379697799683, "rewards/rejected": -6.969493865966797, "step": 60 }, { "epoch": 0.13963480128893663, "grad_norm": 50.79780612404402, "learning_rate": 7.963452327474534e-07, "logits/chosen": -9.234804153442383, "logits/rejected": -9.1095609664917, "logps/chosen": -0.6090906858444214, "logps/rejected": -0.7208055257797241, "loss": 4.666, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -6.090908050537109, "rewards/margins": 1.1171473264694214, "rewards/rejected": -7.208055019378662, "step": 65 }, { "epoch": 0.15037593984962405, "grad_norm": 56.36564641791114, "learning_rate": 7.940385606293987e-07, "logits/chosen": -8.946883201599121, "logits/rejected": -8.716778755187988, "logps/chosen": -0.6818052530288696, "logps/rejected": -0.7961267828941345, "loss": 4.577, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -6.818052768707275, "rewards/margins": 1.1432150602340698, "rewards/rejected": -7.961267948150635, "step": 70 }, { "epoch": 0.1611170784103115, "grad_norm": 61.76930510948969, "learning_rate": 7.911755042893434e-07, "logits/chosen": -9.067525863647461, "logits/rejected": -8.9346923828125, "logps/chosen": -0.6832990646362305, "logps/rejected": -0.7763570547103882, "loss": 4.4179, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -6.8329901695251465, "rewards/margins": 0.9305804371833801, "rewards/rejected": -7.763571262359619, "step": 75 }, { "epoch": 0.17185821697099893, "grad_norm": 63.602633074853536, "learning_rate": 7.877601063757321e-07, "logits/chosen": -9.461370468139648, "logits/rejected": -8.981520652770996, "logps/chosen": -0.6881433129310608, "logps/rejected": -0.8508684039115906, "loss": 4.3763, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -6.881433010101318, "rewards/margins": 1.6272509098052979, "rewards/rejected": -8.508684158325195, "step": 80 }, { "epoch": 0.18259935553168635, "grad_norm": 50.67613033462041, "learning_rate": 7.837971894457989e-07, "logits/chosen": -9.557887077331543, "logits/rejected": -9.17081069946289, "logps/chosen": -0.6830392479896545, "logps/rejected": -0.799291729927063, "loss": 4.6499, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -6.830392360687256, "rewards/margins": 1.1625245809555054, "rewards/rejected": -7.992917060852051, "step": 85 }, { "epoch": 0.1933404940923738, "grad_norm": 57.58777625940833, "learning_rate": 7.792923491560942e-07, "logits/chosen": -8.579484939575195, "logits/rejected": -8.546136856079102, "logps/chosen": -0.6667743921279907, "logps/rejected": -0.7407978177070618, "loss": 4.4492, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -6.667743682861328, "rewards/margins": 0.7402342557907104, "rewards/rejected": -7.407977104187012, "step": 90 }, { "epoch": 0.20408163265306123, "grad_norm": 68.91179001810359, "learning_rate": 7.742519463613926e-07, "logits/chosen": -9.336307525634766, "logits/rejected": -9.128133773803711, "logps/chosen": -0.706219494342804, "logps/rejected": -0.7757526636123657, "loss": 4.2763, "rewards/accuracies": 0.5, "rewards/chosen": -7.062193870544434, "rewards/margins": 0.6953321099281311, "rewards/rejected": -7.7575273513793945, "step": 95 }, { "epoch": 0.21482277121374865, "grad_norm": 140.43504047927686, "learning_rate": 7.68683098133138e-07, "logits/chosen": -8.939419746398926, "logits/rejected": -8.681028366088867, "logps/chosen": -0.7093919515609741, "logps/rejected": -0.8932901620864868, "loss": 4.4002, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -7.093919277191162, "rewards/margins": 1.838982343673706, "rewards/rejected": -8.932901382446289, "step": 100 }, { "epoch": 0.22556390977443608, "grad_norm": 105.52949175741638, "learning_rate": 7.625936677101051e-07, "logits/chosen": -8.601816177368164, "logits/rejected": -8.625459671020508, "logps/chosen": -0.8767679333686829, "logps/rejected": -0.8515819311141968, "loss": 4.4644, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -8.767679214477539, "rewards/margins": -0.25185948610305786, "rewards/rejected": -8.51581859588623, "step": 105 }, { "epoch": 0.23630504833512353, "grad_norm": 56.44051818244315, "learning_rate": 7.559922533954731e-07, "logits/chosen": -9.58240795135498, "logits/rejected": -9.501542091369629, "logps/chosen": -0.7655607461929321, "logps/rejected": -0.8916142582893372, "loss": 4.2797, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -7.6556077003479, "rewards/margins": 1.2605348825454712, "rewards/rejected": -8.916143417358398, "step": 110 }, { "epoch": 0.24704618689581095, "grad_norm": 68.9497539150874, "learning_rate": 7.488881764159808e-07, "logits/chosen": -9.756335258483887, "logits/rejected": -9.543218612670898, "logps/chosen": -0.7038711309432983, "logps/rejected": -0.7986757159233093, "loss": 4.154, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -7.0387115478515625, "rewards/margins": 0.9480463862419128, "rewards/rejected": -7.986758232116699, "step": 115 }, { "epoch": 0.2577873254564984, "grad_norm": 90.20504733911939, "learning_rate": 7.412914677603135e-07, "logits/chosen": -9.883420944213867, "logits/rejected": -9.735390663146973, "logps/chosen": -0.9017173647880554, "logps/rejected": -0.9996153116226196, "loss": 4.2168, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -9.017173767089844, "rewards/margins": 0.9789786338806152, "rewards/rejected": -9.996152877807617, "step": 120 }, { "epoch": 0.26852846401718583, "grad_norm": 63.31580116626401, "learning_rate": 7.332128540153017e-07, "logits/chosen": -10.71928596496582, "logits/rejected": -10.555776596069336, "logps/chosen": -0.788918673992157, "logps/rejected": -0.9437187910079956, "loss": 4.1085, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -7.889185905456543, "rewards/margins": 1.5480016469955444, "rewards/rejected": -9.437189102172852, "step": 125 }, { "epoch": 0.27926960257787325, "grad_norm": 81.18347953083857, "learning_rate": 7.246637422199322e-07, "logits/chosen": -10.676037788391113, "logits/rejected": -10.630210876464844, "logps/chosen": -0.8381876945495605, "logps/rejected": -1.0221493244171143, "loss": 4.0857, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -8.381875991821289, "rewards/margins": 1.8396151065826416, "rewards/rejected": -10.221491813659668, "step": 130 }, { "epoch": 0.2900107411385607, "grad_norm": 80.90445716094146, "learning_rate": 7.156562037585574e-07, "logits/chosen": -11.714326858520508, "logits/rejected": -11.1636323928833, "logps/chosen": -0.8452903628349304, "logps/rejected": -1.1077954769134521, "loss": 3.9406, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -8.452905654907227, "rewards/margins": 2.6250510215759277, "rewards/rejected": -11.07795524597168, "step": 135 }, { "epoch": 0.3007518796992481, "grad_norm": 88.8085364984583, "learning_rate": 7.062029573160467e-07, "logits/chosen": -11.935297012329102, "logits/rejected": -11.792046546936035, "logps/chosen": -0.9109989404678345, "logps/rejected": -1.1446388959884644, "loss": 3.6921, "rewards/accuracies": 0.8125, "rewards/chosen": -9.109989166259766, "rewards/margins": 2.3363993167877197, "rewards/rejected": -11.446390151977539, "step": 140 }, { "epoch": 0.31149301825993553, "grad_norm": 110.78828702618637, "learning_rate": 6.963173509189455e-07, "logits/chosen": -13.552042007446289, "logits/rejected": -13.324705123901367, "logps/chosen": -1.0502710342407227, "logps/rejected": -1.2693006992340088, "loss": 3.8236, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -10.502711296081543, "rewards/margins": 2.190295696258545, "rewards/rejected": -12.69300651550293, "step": 145 }, { "epoch": 0.322234156820623, "grad_norm": 111.15417892636452, "learning_rate": 6.860133430880024e-07, "logits/chosen": -14.586761474609375, "logits/rejected": -14.23077392578125, "logps/chosen": -1.2444875240325928, "logps/rejected": -1.4712624549865723, "loss": 3.7527, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -12.444875717163086, "rewards/margins": 2.2677478790283203, "rewards/rejected": -14.712623596191406, "step": 150 }, { "epoch": 0.33297529538131043, "grad_norm": 119.28100769496392, "learning_rate": 6.753054831286747e-07, "logits/chosen": -14.823234558105469, "logits/rejected": -14.770757675170898, "logps/chosen": -1.3944904804229736, "logps/rejected": -1.6155385971069336, "loss": 3.893, "rewards/accuracies": 0.6875, "rewards/chosen": -13.944903373718262, "rewards/margins": 2.2104804515838623, "rewards/rejected": -16.155384063720703, "step": 155 }, { "epoch": 0.34371643394199786, "grad_norm": 103.61310969210844, "learning_rate": 6.642088905874433e-07, "logits/chosen": -14.195696830749512, "logits/rejected": -14.07690143585205, "logps/chosen": -1.3757129907608032, "logps/rejected": -1.5891997814178467, "loss": 3.7363, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -13.75713062286377, "rewards/margins": 2.134868860244751, "rewards/rejected": -15.891998291015625, "step": 160 }, { "epoch": 0.3544575725026853, "grad_norm": 95.46872943421629, "learning_rate": 6.527392339029455e-07, "logits/chosen": -14.401777267456055, "logits/rejected": -14.322749137878418, "logps/chosen": -1.2530758380889893, "logps/rejected": -1.5336121320724487, "loss": 3.4763, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -12.530759811401367, "rewards/margins": 2.8053627014160156, "rewards/rejected": -15.336122512817383, "step": 165 }, { "epoch": 0.3651987110633727, "grad_norm": 119.23452912777815, "learning_rate": 6.409127082820689e-07, "logits/chosen": -14.566454887390137, "logits/rejected": -14.484842300415039, "logps/chosen": -1.531166434288025, "logps/rejected": -1.7996248006820679, "loss": 3.6554, "rewards/accuracies": 0.6875, "rewards/chosen": -15.311663627624512, "rewards/margins": 2.6845829486846924, "rewards/rejected": -17.996248245239258, "step": 170 }, { "epoch": 0.37593984962406013, "grad_norm": 123.93953756068933, "learning_rate": 6.287460128322457e-07, "logits/chosen": -14.157377243041992, "logits/rejected": -14.0371675491333, "logps/chosen": -1.5080561637878418, "logps/rejected": -1.9248558282852173, "loss": 3.4374, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -15.080561637878418, "rewards/margins": 4.167994022369385, "rewards/rejected": -19.24855613708496, "step": 175 }, { "epoch": 0.3866809881847476, "grad_norm": 223.710665367037, "learning_rate": 6.16256326982239e-07, "logits/chosen": -16.03777313232422, "logits/rejected": -16.100345611572266, "logps/chosen": -1.5300877094268799, "logps/rejected": -1.8575336933135986, "loss": 3.4606, "rewards/accuracies": 0.6875, "rewards/chosen": -15.300875663757324, "rewards/margins": 3.2744598388671875, "rewards/rejected": -18.575336456298828, "step": 180 }, { "epoch": 0.39742212674543503, "grad_norm": 294.4348098673441, "learning_rate": 6.034612862247114e-07, "logits/chosen": -14.142799377441406, "logits/rejected": -13.796422958374023, "logps/chosen": -1.5025275945663452, "logps/rejected": -1.7810484170913696, "loss": 3.1185, "rewards/accuracies": 0.6875, "rewards/chosen": -15.025274276733398, "rewards/margins": 2.7852089405059814, "rewards/rejected": -17.810483932495117, "step": 185 }, { "epoch": 0.40816326530612246, "grad_norm": 112.37773385751002, "learning_rate": 5.903789572148295e-07, "logits/chosen": -14.8009614944458, "logits/rejected": -14.250249862670898, "logps/chosen": -1.5931546688079834, "logps/rejected": -2.022378444671631, "loss": 3.3847, "rewards/accuracies": 0.75, "rewards/chosen": -15.931546211242676, "rewards/margins": 4.292238712310791, "rewards/rejected": -20.223783493041992, "step": 190 }, { "epoch": 0.4189044038668099, "grad_norm": 125.77761635765717, "learning_rate": 5.770278122600662e-07, "logits/chosen": -14.832977294921875, "logits/rejected": -14.608530044555664, "logps/chosen": -1.6177564859390259, "logps/rejected": -1.9727256298065186, "loss": 3.6009, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -16.177562713623047, "rewards/margins": 3.549692153930664, "rewards/rejected": -19.72725486755371, "step": 195 }, { "epoch": 0.4296455424274973, "grad_norm": 101.70153964682511, "learning_rate": 5.634267032372192e-07, "logits/chosen": -14.803668022155762, "logits/rejected": -14.786203384399414, "logps/chosen": -1.6423594951629639, "logps/rejected": -1.9881032705307007, "loss": 3.3904, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -16.423595428466797, "rewards/margins": 3.457437515258789, "rewards/rejected": -19.881032943725586, "step": 200 }, { "epoch": 0.44038668098818473, "grad_norm": 123.22102599092351, "learning_rate": 5.495948349734758e-07, "logits/chosen": -14.582061767578125, "logits/rejected": -14.51270580291748, "logps/chosen": -1.747982382774353, "logps/rejected": -2.051506757736206, "loss": 3.1521, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -17.47982406616211, "rewards/margins": 3.035243034362793, "rewards/rejected": -20.51506805419922, "step": 205 }, { "epoch": 0.45112781954887216, "grad_norm": 106.53206062736002, "learning_rate": 5.355517381291105e-07, "logits/chosen": -15.856142044067383, "logits/rejected": -15.048059463500977, "logps/chosen": -1.9086406230926514, "logps/rejected": -2.3940463066101074, "loss": 3.1856, "rewards/accuracies": 0.75, "rewards/chosen": -19.086406707763672, "rewards/margins": 4.854057312011719, "rewards/rejected": -23.940462112426758, "step": 210 }, { "epoch": 0.46186895810955964, "grad_norm": 135.01895835142213, "learning_rate": 5.21317241620105e-07, "logits/chosen": -17.223520278930664, "logits/rejected": -16.822795867919922, "logps/chosen": -1.9180253744125366, "logps/rejected": -2.2895708084106445, "loss": 3.4171, "rewards/accuracies": 0.75, "rewards/chosen": -19.180253982543945, "rewards/margins": 3.7154533863067627, "rewards/rejected": -22.895706176757812, "step": 215 }, { "epoch": 0.47261009667024706, "grad_norm": 116.88786548811646, "learning_rate": 5.069114446196291e-07, "logits/chosen": -14.405430793762207, "logits/rejected": -14.03125, "logps/chosen": -1.8195463418960571, "logps/rejected": -2.373373508453369, "loss": 3.1087, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -18.195463180541992, "rewards/margins": 5.538268566131592, "rewards/rejected": -23.73373031616211, "step": 220 }, { "epoch": 0.4833512352309345, "grad_norm": 117.33800643418842, "learning_rate": 4.923546881779183e-07, "logits/chosen": -15.34239387512207, "logits/rejected": -15.118896484375, "logps/chosen": -1.5427398681640625, "logps/rejected": -1.9821481704711914, "loss": 3.0885, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -15.427398681640625, "rewards/margins": 4.394083023071289, "rewards/rejected": -19.821481704711914, "step": 225 }, { "epoch": 0.4940923737916219, "grad_norm": 116.49865117484065, "learning_rate": 4.776675265006186e-07, "logits/chosen": -14.630195617675781, "logits/rejected": -14.59937858581543, "logps/chosen": -1.6563146114349365, "logps/rejected": -2.0928232669830322, "loss": 3.2032, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -16.56314468383789, "rewards/margins": 4.365086555480957, "rewards/rejected": -20.928232192993164, "step": 230 }, { "epoch": 0.5048335123523093, "grad_norm": 203.29377037587201, "learning_rate": 4.62870697926156e-07, "logits/chosen": -14.8600435256958, "logits/rejected": -15.338279724121094, "logps/chosen": -1.7831714153289795, "logps/rejected": -2.145383358001709, "loss": 3.2633, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -17.831714630126953, "rewards/margins": 3.622117519378662, "rewards/rejected": -21.45383071899414, "step": 235 }, { "epoch": 0.5155746509129968, "grad_norm": 116.16395762640381, "learning_rate": 4.479850956431092e-07, "logits/chosen": -14.476922988891602, "logits/rejected": -14.742956161499023, "logps/chosen": -1.6810489892959595, "logps/rejected": -2.0232789516448975, "loss": 3.0869, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -16.810489654541016, "rewards/margins": 3.4223015308380127, "rewards/rejected": -20.232791900634766, "step": 240 }, { "epoch": 0.5263157894736842, "grad_norm": 98.59837238595348, "learning_rate": 4.33031738188933e-07, "logits/chosen": -15.394210815429688, "logits/rejected": -14.864255905151367, "logps/chosen": -1.6588356494903564, "logps/rejected": -2.0340933799743652, "loss": 3.2303, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -16.58835792541504, "rewards/margins": 3.752579927444458, "rewards/rejected": -20.3409366607666, "step": 245 }, { "epoch": 0.5370569280343717, "grad_norm": 122.58746074195308, "learning_rate": 4.180317397716889e-07, "logits/chosen": -15.588345527648926, "logits/rejected": -15.222723007202148, "logps/chosen": -1.6328115463256836, "logps/rejected": -2.2505877017974854, "loss": 3.0874, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -16.328113555908203, "rewards/margins": 6.177763938903809, "rewards/rejected": -22.505878448486328, "step": 250 }, { "epoch": 0.547798066595059, "grad_norm": 120.49234048427142, "learning_rate": 4.030062804566888e-07, "logits/chosen": -15.462881088256836, "logits/rejected": -15.3878173828125, "logps/chosen": -1.6617505550384521, "logps/rejected": -1.9518375396728516, "loss": 3.0471, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -16.61750602722168, "rewards/margins": 2.9008688926696777, "rewards/rejected": -19.518375396728516, "step": 255 }, { "epoch": 0.5585392051557465, "grad_norm": 102.36282696415171, "learning_rate": 3.8797657626014614e-07, "logits/chosen": -15.693799018859863, "logits/rejected": -15.475125312805176, "logps/chosen": -1.7303409576416016, "logps/rejected": -2.111295223236084, "loss": 3.1486, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -17.303409576416016, "rewards/margins": 3.809544801712036, "rewards/rejected": -21.112953186035156, "step": 260 }, { "epoch": 0.569280343716434, "grad_norm": 120.30352416855824, "learning_rate": 3.729638491920669e-07, "logits/chosen": -14.018827438354492, "logits/rejected": -14.128240585327148, "logps/chosen": -1.5650581121444702, "logps/rejected": -1.8844165802001953, "loss": 3.0723, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -15.650581359863281, "rewards/margins": 3.193586826324463, "rewards/rejected": -18.84417152404785, "step": 265 }, { "epoch": 0.5800214822771214, "grad_norm": 91.11946128351651, "learning_rate": 3.5798929729067464e-07, "logits/chosen": -15.980966567993164, "logits/rejected": -15.59577465057373, "logps/chosen": -1.759478211402893, "logps/rejected": -2.1487960815429688, "loss": 2.7323, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -17.59478187561035, "rewards/margins": 3.8931777477264404, "rewards/rejected": -21.487960815429688, "step": 270 }, { "epoch": 0.5907626208378088, "grad_norm": 111.96074414088407, "learning_rate": 3.4307406469068595e-07, "logits/chosen": -15.691810607910156, "logits/rejected": -15.631698608398438, "logps/chosen": -1.7383606433868408, "logps/rejected": -2.2635440826416016, "loss": 2.9053, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -17.38360595703125, "rewards/margins": 5.251835823059082, "rewards/rejected": -22.635440826416016, "step": 275 }, { "epoch": 0.6015037593984962, "grad_norm": 122.95732420434838, "learning_rate": 3.282392117676968e-07, "logits/chosen": -15.389913558959961, "logits/rejected": -15.556841850280762, "logps/chosen": -1.9042613506317139, "logps/rejected": -2.476677417755127, "loss": 2.9112, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -19.04261589050293, "rewards/margins": 5.72415828704834, "rewards/rejected": -24.766775131225586, "step": 280 }, { "epoch": 0.6122448979591837, "grad_norm": 104.08928006990811, "learning_rate": 3.135056854008371e-07, "logits/chosen": -16.152729034423828, "logits/rejected": -16.002233505249023, "logps/chosen": -1.8163830041885376, "logps/rejected": -2.228738784790039, "loss": 2.8591, "rewards/accuracies": 0.75, "rewards/chosen": -18.163829803466797, "rewards/margins": 4.123559474945068, "rewards/rejected": -22.28738784790039, "step": 285 }, { "epoch": 0.6229860365198711, "grad_norm": 113.24003069429473, "learning_rate": 2.988942893956833e-07, "logits/chosen": -15.338768005371094, "logits/rejected": -15.249606132507324, "logps/chosen": -1.8918412923812866, "logps/rejected": -2.32609224319458, "loss": 3.0229, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -18.918415069580078, "rewards/margins": 4.342508792877197, "rewards/rejected": -23.260921478271484, "step": 290 }, { "epoch": 0.6337271750805585, "grad_norm": 151.8876206935079, "learning_rate": 2.844256551091911e-07, "logits/chosen": -16.8232421875, "logits/rejected": -16.842761993408203, "logps/chosen": -1.9518108367919922, "logps/rejected": -2.4939913749694824, "loss": 2.833, "rewards/accuracies": 0.8125, "rewards/chosen": -19.518108367919922, "rewards/margins": 5.421802997589111, "rewards/rejected": -24.939910888671875, "step": 295 }, { "epoch": 0.644468313641246, "grad_norm": 273.6603281637076, "learning_rate": 2.7012021231812664e-07, "logits/chosen": -16.766956329345703, "logits/rejected": -16.424999237060547, "logps/chosen": -1.969435453414917, "logps/rejected": -2.308202028274536, "loss": 3.2442, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -19.694355010986328, "rewards/margins": 3.387664318084717, "rewards/rejected": -23.082019805908203, "step": 300 }, { "epoch": 0.6552094522019334, "grad_norm": 187.01720889425104, "learning_rate": 2.5599816037212954e-07, "logits/chosen": -14.743069648742676, "logits/rejected": -14.623723983764648, "logps/chosen": -1.8649146556854248, "logps/rejected": -2.4030163288116455, "loss": 2.9136, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -18.64914894104004, "rewards/margins": 5.381015777587891, "rewards/rejected": -24.03016471862793, "step": 305 }, { "epoch": 0.6659505907626209, "grad_norm": 142.83830462137144, "learning_rate": 2.4207943967214064e-07, "logits/chosen": -16.09463119506836, "logits/rejected": -15.806689262390137, "logps/chosen": -2.033447742462158, "logps/rejected": -2.524364471435547, "loss": 3.1207, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -20.334476470947266, "rewards/margins": 4.909164905548096, "rewards/rejected": -25.24364471435547, "step": 310 }, { "epoch": 0.6766917293233082, "grad_norm": 106.6308691579763, "learning_rate": 2.2838370351446547e-07, "logits/chosen": -15.870585441589355, "logits/rejected": -15.55876350402832, "logps/chosen": -1.8023881912231445, "logps/rejected": -2.277968168258667, "loss": 2.7641, "rewards/accuracies": 0.6875, "rewards/chosen": -18.023881912231445, "rewards/margins": 4.755801200866699, "rewards/rejected": -22.779682159423828, "step": 315 }, { "epoch": 0.6874328678839957, "grad_norm": 108.7814729394269, "learning_rate": 2.1493029034023188e-07, "logits/chosen": -15.210580825805664, "logits/rejected": -15.057415962219238, "logps/chosen": -1.789072036743164, "logps/rejected": -2.2775633335113525, "loss": 2.8863, "rewards/accuracies": 0.75, "rewards/chosen": -17.89072036743164, "rewards/margins": 4.884912014007568, "rewards/rejected": -22.775630950927734, "step": 320 }, { "epoch": 0.6981740064446831, "grad_norm": 180.86259023584043, "learning_rate": 2.0173819642942376e-07, "logits/chosen": -14.378689765930176, "logits/rejected": -14.201916694641113, "logps/chosen": -1.9404065608978271, "logps/rejected": -2.5850563049316406, "loss": 2.9848, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -19.404064178466797, "rewards/margins": 6.446499824523926, "rewards/rejected": -25.85056495666504, "step": 325 }, { "epoch": 0.7089151450053706, "grad_norm": 132.13798361660807, "learning_rate": 1.888260490780485e-07, "logits/chosen": -14.281087875366211, "logits/rejected": -14.154438972473145, "logps/chosen": -1.7613548040390015, "logps/rejected": -2.238495349884033, "loss": 3.0495, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -17.61355209350586, "rewards/margins": 4.771404266357422, "rewards/rejected": -22.384952545166016, "step": 330 }, { "epoch": 0.719656283566058, "grad_norm": 127.44733826541191, "learning_rate": 1.7621208029631078e-07, "logits/chosen": -14.766406059265137, "logits/rejected": -14.667470932006836, "logps/chosen": -1.9043552875518799, "logps/rejected": -2.4970054626464844, "loss": 2.9109, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -19.04355239868164, "rewards/margins": 5.9265007972717285, "rewards/rejected": -24.970054626464844, "step": 335 }, { "epoch": 0.7303974221267454, "grad_norm": 114.16195914051926, "learning_rate": 1.6391410106493227e-07, "logits/chosen": -14.881872177124023, "logits/rejected": -14.627456665039062, "logps/chosen": -1.9786325693130493, "logps/rejected": -2.5424978733062744, "loss": 2.8817, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -19.786325454711914, "rewards/margins": 5.63865327835083, "rewards/rejected": -25.424976348876953, "step": 340 }, { "epoch": 0.7411385606874329, "grad_norm": 99.76652109445614, "learning_rate": 1.5194947618596673e-07, "logits/chosen": -15.026782035827637, "logits/rejected": -14.58587646484375, "logps/chosen": -1.9211170673370361, "logps/rejected": -2.394774913787842, "loss": 3.0269, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -19.211170196533203, "rewards/margins": 4.736577033996582, "rewards/rejected": -23.9477481842041, "step": 345 }, { "epoch": 0.7518796992481203, "grad_norm": 107.2380444923777, "learning_rate": 1.4033509976362083e-07, "logits/chosen": -15.670697212219238, "logits/rejected": -15.579752922058105, "logps/chosen": -1.9338979721069336, "logps/rejected": -2.3520257472991943, "loss": 2.9892, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -19.338979721069336, "rewards/margins": 4.181277751922607, "rewards/rejected": -23.52025604248047, "step": 350 }, { "epoch": 0.7626208378088077, "grad_norm": 108.07049767560719, "learning_rate": 1.2908737134970363e-07, "logits/chosen": -14.5513334274292, "logits/rejected": -14.516873359680176, "logps/chosen": -1.8658726215362549, "logps/rejected": -2.476576566696167, "loss": 3.1196, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -18.65872573852539, "rewards/margins": 6.1070404052734375, "rewards/rejected": -24.765766143798828, "step": 355 }, { "epoch": 0.7733619763694952, "grad_norm": 103.96338647941381, "learning_rate": 1.1822217278738515e-07, "logits/chosen": -15.559527397155762, "logits/rejected": -15.545167922973633, "logps/chosen": -1.9091074466705322, "logps/rejected": -2.4284536838531494, "loss": 3.0461, "rewards/accuracies": 0.75, "rewards/chosen": -19.091075897216797, "rewards/margins": 5.193462371826172, "rewards/rejected": -24.284536361694336, "step": 360 }, { "epoch": 0.7841031149301826, "grad_norm": 138.78303459839336, "learning_rate": 1.0775484578596241e-07, "logits/chosen": -15.669352531433105, "logits/rejected": -15.537897109985352, "logps/chosen": -1.94468092918396, "logps/rejected": -2.5436809062957764, "loss": 2.7974, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -19.44681167602539, "rewards/margins": 5.989997386932373, "rewards/rejected": -25.436809539794922, "step": 365 }, { "epoch": 0.7948442534908701, "grad_norm": 122.14734643501443, "learning_rate": 9.770017025829673e-08, "logits/chosen": -15.961019515991211, "logits/rejected": -15.95417308807373, "logps/chosen": -2.1812241077423096, "logps/rejected": -2.6908135414123535, "loss": 2.6374, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -21.812240600585938, "rewards/margins": 5.095890998840332, "rewards/rejected": -26.908132553100586, "step": 370 }, { "epoch": 0.8055853920515574, "grad_norm": 114.19485857094502, "learning_rate": 8.807234345151027e-08, "logits/chosen": -14.920249938964844, "logits/rejected": -14.890344619750977, "logps/chosen": -2.0413661003112793, "logps/rejected": -2.6543760299682617, "loss": 2.8891, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -20.41366195678711, "rewards/margins": 6.13009786605835, "rewards/rejected": -26.543758392333984, "step": 375 }, { "epoch": 0.8163265306122449, "grad_norm": 184.4289644042616, "learning_rate": 7.888495990040924e-08, "logits/chosen": -13.656982421875, "logits/rejected": -13.701431274414062, "logps/chosen": -2.011772871017456, "logps/rejected": -2.7216084003448486, "loss": 2.9801, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -20.117727279663086, "rewards/margins": 7.098354339599609, "rewards/rejected": -27.21608543395996, "step": 380 }, { "epoch": 0.8270676691729323, "grad_norm": 132.46593111411627, "learning_rate": 7.015099223193943e-08, "logits/chosen": -15.658330917358398, "logits/rejected": -15.693890571594238, "logps/chosen": -1.9581212997436523, "logps/rejected": -2.5001060962677, "loss": 2.8611, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -19.58121681213379, "rewards/margins": 5.419846534729004, "rewards/rejected": -25.001062393188477, "step": 385 }, { "epoch": 0.8378088077336198, "grad_norm": 109.73562851103766, "learning_rate": 6.188277284777857e-08, "logits/chosen": -14.48884391784668, "logits/rejected": -13.800481796264648, "logps/chosen": -1.9877732992172241, "logps/rejected": -2.5746169090270996, "loss": 2.8295, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -19.877731323242188, "rewards/margins": 5.868436813354492, "rewards/rejected": -25.746166229248047, "step": 390 }, { "epoch": 0.8485499462943072, "grad_norm": 119.04235166242496, "learning_rate": 5.409197651092965e-08, "logits/chosen": -15.729510307312012, "logits/rejected": -15.620780944824219, "logps/chosen": -2.1660141944885254, "logps/rejected": -2.667978286743164, "loss": 2.7298, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -21.660140991210938, "rewards/margins": 5.019640922546387, "rewards/rejected": -26.679784774780273, "step": 395 }, { "epoch": 0.8592910848549946, "grad_norm": 105.88618147362118, "learning_rate": 4.678960386090298e-08, "logits/chosen": -15.191770553588867, "logits/rejected": -15.158576965332031, "logps/chosen": -1.9015194177627563, "logps/rejected": -2.5145716667175293, "loss": 2.7402, "rewards/accuracies": 0.8125, "rewards/chosen": -19.015193939208984, "rewards/margins": 6.130521297454834, "rewards/rejected": -25.145715713500977, "step": 400 }, { "epoch": 0.8700322234156821, "grad_norm": 114.47737417347113, "learning_rate": 3.998596588076366e-08, "logits/chosen": -13.7559814453125, "logits/rejected": -13.483953475952148, "logps/chosen": -1.9659799337387085, "logps/rejected": -2.390488862991333, "loss": 3.1028, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -19.659801483154297, "rewards/margins": 4.245090484619141, "rewards/rejected": -23.904891967773438, "step": 405 }, { "epoch": 0.8807733619763695, "grad_norm": 124.82158209754382, "learning_rate": 3.3690669337976996e-08, "logits/chosen": -15.061103820800781, "logits/rejected": -14.891242980957031, "logps/chosen": -1.8727645874023438, "logps/rejected": -2.3343753814697266, "loss": 2.7051, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -18.727645874023438, "rewards/margins": 4.6161088943481445, "rewards/rejected": -23.343753814697266, "step": 410 }, { "epoch": 0.8915145005370569, "grad_norm": 127.84134494966216, "learning_rate": 2.7912603219609798e-08, "logits/chosen": -15.650156021118164, "logits/rejected": -15.531412124633789, "logps/chosen": -2.0558724403381348, "logps/rejected": -2.456268787384033, "loss": 2.7054, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -20.558725357055664, "rewards/margins": 4.003961563110352, "rewards/rejected": -24.562685012817383, "step": 415 }, { "epoch": 0.9022556390977443, "grad_norm": 121.68417066288369, "learning_rate": 2.265992618104029e-08, "logits/chosen": -15.883665084838867, "logits/rejected": -15.859227180480957, "logps/chosen": -2.1571857929229736, "logps/rejected": -2.7032248973846436, "loss": 2.7706, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -21.571857452392578, "rewards/margins": 5.460390090942383, "rewards/rejected": -27.03224754333496, "step": 420 }, { "epoch": 0.9129967776584318, "grad_norm": 205.5451817035243, "learning_rate": 1.7940055025900304e-08, "logits/chosen": -14.086555480957031, "logits/rejected": -13.882139205932617, "logps/chosen": -2.0481374263763428, "logps/rejected": -2.428682804107666, "loss": 3.0577, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -20.481372833251953, "rewards/margins": 3.8054566383361816, "rewards/rejected": -24.28683090209961, "step": 425 }, { "epoch": 0.9237379162191193, "grad_norm": 119.46095967048043, "learning_rate": 1.3759654233514817e-08, "logits/chosen": -14.88987922668457, "logits/rejected": -14.742823600769043, "logps/chosen": -1.941057562828064, "logps/rejected": -2.4432146549224854, "loss": 2.8075, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -19.41057586669922, "rewards/margins": 5.021571636199951, "rewards/rejected": -24.432147979736328, "step": 430 }, { "epoch": 0.9344790547798066, "grad_norm": 112.52416047730715, "learning_rate": 1.0124626548627402e-08, "logits/chosen": -15.55200481414795, "logits/rejected": -15.57677936553955, "logps/chosen": -2.0663094520568848, "logps/rejected": -2.723595380783081, "loss": 2.7934, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -20.663097381591797, "rewards/margins": 6.5728559494018555, "rewards/rejected": -27.235950469970703, "step": 435 }, { "epoch": 0.9452201933404941, "grad_norm": 95.43516446345191, "learning_rate": 7.040104646698042e-09, "logits/chosen": -14.149500846862793, "logits/rejected": -14.121160507202148, "logps/chosen": -2.1648197174072266, "logps/rejected": -2.803515911102295, "loss": 2.7258, "rewards/accuracies": 0.75, "rewards/chosen": -21.648197174072266, "rewards/margins": 6.386962890625, "rewards/rejected": -28.035160064697266, "step": 440 }, { "epoch": 0.9559613319011815, "grad_norm": 105.32519506523218, "learning_rate": 4.510443886542114e-09, "logits/chosen": -15.509679794311523, "logits/rejected": -15.587133407592773, "logps/chosen": -2.019160509109497, "logps/rejected": -2.5319721698760986, "loss": 2.8015, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -20.191600799560547, "rewards/margins": 5.128118991851807, "rewards/rejected": -25.319721221923828, "step": 445 }, { "epoch": 0.966702470461869, "grad_norm": 142.71634790119586, "learning_rate": 2.539216160544333e-09, "logits/chosen": -15.480878829956055, "logits/rejected": -15.102048873901367, "logps/chosen": -2.1352379322052, "logps/rejected": -2.589895486831665, "loss": 2.889, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -21.35237693786621, "rewards/margins": 4.546576023101807, "rewards/rejected": -25.898956298828125, "step": 450 }, { "epoch": 0.9774436090225563, "grad_norm": 123.94313579799326, "learning_rate": 1.1292048511303054e-09, "logits/chosen": -14.889852523803711, "logits/rejected": -15.104809761047363, "logps/chosen": -1.9709218740463257, "logps/rejected": -2.4727888107299805, "loss": 2.9813, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -19.709218978881836, "rewards/margins": 5.018665313720703, "rewards/rejected": -24.727886199951172, "step": 455 }, { "epoch": 0.9881847475832438, "grad_norm": 100.41632245481235, "learning_rate": 2.82400900618418e-10, "logits/chosen": -15.215599060058594, "logits/rejected": -15.19567584991455, "logps/chosen": -1.937787652015686, "logps/rejected": -2.593761920928955, "loss": 2.5524, "rewards/accuracies": 0.8125, "rewards/chosen": -19.377878189086914, "rewards/margins": 6.559741020202637, "rewards/rejected": -25.937618255615234, "step": 460 }, { "epoch": 0.9989258861439313, "grad_norm": 139.5358860215804, "learning_rate": 0.0, "logits/chosen": -15.462666511535645, "logits/rejected": -15.356382369995117, "logps/chosen": -2.155303716659546, "logps/rejected": -2.6153688430786133, "loss": 2.5621, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -21.55303955078125, "rewards/margins": 4.600649356842041, "rewards/rejected": -26.1536865234375, "step": 465 }, { "epoch": 0.9989258861439313, "step": 465, "total_flos": 0.0, "train_loss": 3.5518602760889197, "train_runtime": 6148.2375, "train_samples_per_second": 9.689, "train_steps_per_second": 0.076 } ], "logging_steps": 5, "max_steps": 465, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 1000000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }