{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.999630314232902, "eval_steps": 400, "global_step": 507, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.001971657424522489, "grad_norm": 4.641391669893979, "learning_rate": 9.803921568627451e-09, "logits/chosen": -1.8306132555007935, "logits/rejected": -1.2712628841400146, "logps/chosen": -217.9743194580078, "logps/rejected": -312.2440185546875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.009858287122612447, "grad_norm": 3.881553151172807, "learning_rate": 4.901960784313725e-08, "logits/chosen": -1.3956289291381836, "logits/rejected": -1.324476718902588, "logps/chosen": -213.20277404785156, "logps/rejected": -243.072509765625, "loss": 0.6932, "rewards/accuracies": 0.3125, "rewards/chosen": 9.495137783233076e-05, "rewards/margins": -0.00030715527827851474, "rewards/rejected": 0.0004021066124550998, "step": 5 }, { "epoch": 0.019716574245224893, "grad_norm": 4.055647051577517, "learning_rate": 9.80392156862745e-08, "logits/chosen": -1.464820146560669, "logits/rejected": -1.329075813293457, "logps/chosen": -216.189697265625, "logps/rejected": -249.85464477539062, "loss": 0.6933, "rewards/accuracies": 0.4124999940395355, "rewards/chosen": 0.0014236138667911291, "rewards/margins": -0.0008448967710137367, "rewards/rejected": 0.002268511103466153, "step": 10 }, { "epoch": 0.029574861367837338, "grad_norm": 3.8004259300545313, "learning_rate": 1.4705882352941175e-07, "logits/chosen": -1.590954065322876, "logits/rejected": -1.3920761346817017, "logps/chosen": -227.84024047851562, "logps/rejected": -267.3565368652344, "loss": 0.6928, "rewards/accuracies": 0.518750011920929, "rewards/chosen": 0.0014651073142886162, "rewards/margins": 0.0005061920965090394, "rewards/rejected": 0.0009589152177795768, "step": 15 }, { "epoch": 0.039433148490449786, "grad_norm": 4.362970881343374, "learning_rate": 1.96078431372549e-07, "logits/chosen": -1.4077281951904297, "logits/rejected": -1.438763976097107, "logps/chosen": -216.7683563232422, "logps/rejected": -241.71524047851562, "loss": 0.6921, "rewards/accuracies": 0.53125, "rewards/chosen": -0.0071268146857619286, "rewards/margins": 0.002329364651814103, "rewards/rejected": -0.009456178173422813, "step": 20 }, { "epoch": 0.04929143561306223, "grad_norm": 3.68850001761437, "learning_rate": 2.4509803921568627e-07, "logits/chosen": -1.368187427520752, "logits/rejected": -1.3394204378128052, "logps/chosen": -225.8297119140625, "logps/rejected": -254.41439819335938, "loss": 0.6899, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.015500446781516075, "rewards/margins": 0.005922852084040642, "rewards/rejected": -0.021423298865556717, "step": 25 }, { "epoch": 0.059149722735674676, "grad_norm": 4.847654340669893, "learning_rate": 2.941176470588235e-07, "logits/chosen": -1.4356650114059448, "logits/rejected": -1.2754924297332764, "logps/chosen": -221.5808563232422, "logps/rejected": -255.44918823242188, "loss": 0.6856, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.02648136578500271, "rewards/margins": 0.015200227499008179, "rewards/rejected": -0.04168159142136574, "step": 30 }, { "epoch": 0.06900800985828712, "grad_norm": 6.653348898638824, "learning_rate": 3.431372549019608e-07, "logits/chosen": -1.347893476486206, "logits/rejected": -1.2126632928848267, "logps/chosen": -217.4748992919922, "logps/rejected": -253.11001586914062, "loss": 0.6716, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.04387308284640312, "rewards/margins": 0.04525812342762947, "rewards/rejected": -0.08913120627403259, "step": 35 }, { "epoch": 0.07886629698089957, "grad_norm": 6.964114197906881, "learning_rate": 3.92156862745098e-07, "logits/chosen": -1.4753751754760742, "logits/rejected": -1.3836042881011963, "logps/chosen": -233.50979614257812, "logps/rejected": -270.6595458984375, "loss": 0.6487, "rewards/accuracies": 0.71875, "rewards/chosen": -0.09956349432468414, "rewards/margins": 0.10375545918941498, "rewards/rejected": -0.2033189833164215, "step": 40 }, { "epoch": 0.08872458410351201, "grad_norm": 15.546171706823465, "learning_rate": 4.4117647058823526e-07, "logits/chosen": -1.4908992052078247, "logits/rejected": -1.4922513961791992, "logps/chosen": -260.85107421875, "logps/rejected": -310.8064270019531, "loss": 0.6302, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.39361271262168884, "rewards/margins": 0.24461090564727783, "rewards/rejected": -0.6382235884666443, "step": 45 }, { "epoch": 0.09858287122612445, "grad_norm": 7.346421533742723, "learning_rate": 4.901960784313725e-07, "logits/chosen": -1.8035519123077393, "logits/rejected": -1.7488648891448975, "logps/chosen": -280.26544189453125, "logps/rejected": -384.37969970703125, "loss": 0.6188, "rewards/accuracies": 0.65625, "rewards/chosen": -0.6396178007125854, "rewards/margins": 0.6693423986434937, "rewards/rejected": -1.308960199356079, "step": 50 }, { "epoch": 0.10844115834873691, "grad_norm": 6.928842609814235, "learning_rate": 4.999050767562379e-07, "logits/chosen": -1.500614881515503, "logits/rejected": -1.514692783355713, "logps/chosen": -259.22607421875, "logps/rejected": -324.70147705078125, "loss": 0.5905, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.4987005591392517, "rewards/margins": 0.39352884888648987, "rewards/rejected": -0.8922293782234192, "step": 55 }, { "epoch": 0.11829944547134935, "grad_norm": 5.183419407454259, "learning_rate": 4.99519574616467e-07, "logits/chosen": -1.6389617919921875, "logits/rejected": -1.5824358463287354, "logps/chosen": -283.13287353515625, "logps/rejected": -382.1869201660156, "loss": 0.6371, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.7059253454208374, "rewards/margins": 0.6277474164962769, "rewards/rejected": -1.3336727619171143, "step": 60 }, { "epoch": 0.1281577325939618, "grad_norm": 9.230406347476531, "learning_rate": 4.988380179235842e-07, "logits/chosen": -1.6305882930755615, "logits/rejected": -1.6462520360946655, "logps/chosen": -256.4553527832031, "logps/rejected": -347.4143371582031, "loss": 0.5805, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -0.49059420824050903, "rewards/margins": 0.5759122371673584, "rewards/rejected": -1.0665065050125122, "step": 65 }, { "epoch": 0.13801601971657423, "grad_norm": 9.206165908014777, "learning_rate": 4.978612153434526e-07, "logits/chosen": -1.7708934545516968, "logits/rejected": -1.7579914331436157, "logps/chosen": -285.9685974121094, "logps/rejected": -370.2804260253906, "loss": 0.5983, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.7101233601570129, "rewards/margins": 0.5043641328811646, "rewards/rejected": -1.2144873142242432, "step": 70 }, { "epoch": 0.1478743068391867, "grad_norm": 7.723809446488398, "learning_rate": 4.965903258506806e-07, "logits/chosen": -1.8401196002960205, "logits/rejected": -1.7219253778457642, "logps/chosen": -289.95068359375, "logps/rejected": -389.30889892578125, "loss": 0.5573, "rewards/accuracies": 0.8125, "rewards/chosen": -0.7532116174697876, "rewards/margins": 0.6191812753677368, "rewards/rejected": -1.372393012046814, "step": 75 }, { "epoch": 0.15773259396179914, "grad_norm": 10.916878987391435, "learning_rate": 4.950268573535011e-07, "logits/chosen": -2.015733480453491, "logits/rejected": -1.8680551052093506, "logps/chosen": -325.2226257324219, "logps/rejected": -428.49066162109375, "loss": 0.5476, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.0503952503204346, "rewards/margins": 0.6346156597137451, "rewards/rejected": -1.6850106716156006, "step": 80 }, { "epoch": 0.16759088108441159, "grad_norm": 13.425184009136764, "learning_rate": 4.93172664904641e-07, "logits/chosen": -1.8795242309570312, "logits/rejected": -1.913556694984436, "logps/chosen": -317.763916015625, "logps/rejected": -424.55450439453125, "loss": 0.5141, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.1219675540924072, "rewards/margins": 0.7599529027938843, "rewards/rejected": -1.881920576095581, "step": 85 }, { "epoch": 0.17744916820702403, "grad_norm": 15.954473082571113, "learning_rate": 4.910299485003033e-07, "logits/chosen": -2.1529054641723633, "logits/rejected": -2.0844523906707764, "logps/chosen": -424.82891845703125, "logps/rejected": -543.7278442382812, "loss": 0.5199, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -2.0633959770202637, "rewards/margins": 0.886804461479187, "rewards/rejected": -2.950200319290161, "step": 90 }, { "epoch": 0.18730745532963647, "grad_norm": 17.544754679380226, "learning_rate": 4.886012504698769e-07, "logits/chosen": -1.882367730140686, "logits/rejected": -1.9553489685058594, "logps/chosen": -406.643310546875, "logps/rejected": -471.86553955078125, "loss": 0.5386, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.862217664718628, "rewards/margins": 0.4628971219062805, "rewards/rejected": -2.3251149654388428, "step": 95 }, { "epoch": 0.1971657424522489, "grad_norm": 13.476927825101471, "learning_rate": 4.858894524594652e-07, "logits/chosen": -2.1455252170562744, "logits/rejected": -2.0651824474334717, "logps/chosen": -392.774169921875, "logps/rejected": -530.4494018554688, "loss": 0.4917, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.8280452489852905, "rewards/margins": 0.902090847492218, "rewards/rejected": -2.7301361560821533, "step": 100 }, { "epoch": 0.20702402957486138, "grad_norm": 23.287769508042025, "learning_rate": 4.828977720128198e-07, "logits/chosen": -1.9681150913238525, "logits/rejected": -1.9559170007705688, "logps/chosen": -431.6632385253906, "logps/rejected": -570.6896362304688, "loss": 0.4834, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -2.2001523971557617, "rewards/margins": 1.0563952922821045, "rewards/rejected": -3.2565484046936035, "step": 105 }, { "epoch": 0.21688231669747382, "grad_norm": 23.293354005808915, "learning_rate": 4.796297587537285e-07, "logits/chosen": -2.096468448638916, "logits/rejected": -1.9595563411712646, "logps/chosen": -497.79400634765625, "logps/rejected": -643.2481689453125, "loss": 0.483, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.711547374725342, "rewards/margins": 1.0744675397872925, "rewards/rejected": -3.7860145568847656, "step": 110 }, { "epoch": 0.22674060382008626, "grad_norm": 16.815054431474035, "learning_rate": 4.760892901743944e-07, "logits/chosen": -2.1025643348693848, "logits/rejected": -2.091360092163086, "logps/chosen": -450.98028564453125, "logps/rejected": -575.75439453125, "loss": 0.4619, "rewards/accuracies": 0.75, "rewards/chosen": -2.345163106918335, "rewards/margins": 1.00288987159729, "rewards/rejected": -3.348052978515625, "step": 115 }, { "epoch": 0.2365988909426987, "grad_norm": 18.736725526597898, "learning_rate": 4.7228056703479626e-07, "logits/chosen": -1.9844331741333008, "logits/rejected": -2.1090264320373535, "logps/chosen": -505.28509521484375, "logps/rejected": -649.0353393554688, "loss": 0.4298, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.8585612773895264, "rewards/margins": 1.334934949874878, "rewards/rejected": -4.193496227264404, "step": 120 }, { "epoch": 0.24645717806531114, "grad_norm": 29.41530429769772, "learning_rate": 4.6820810837849535e-07, "logits/chosen": -1.9075158834457397, "logits/rejected": -1.952182412147522, "logps/chosen": -443.2312927246094, "logps/rejected": -584.6851196289062, "loss": 0.4647, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.3362534046173096, "rewards/margins": 1.1353000402450562, "rewards/rejected": -3.471553087234497, "step": 125 }, { "epoch": 0.2563154651879236, "grad_norm": 28.047847807749136, "learning_rate": 4.63876746170797e-07, "logits/chosen": -1.9407484531402588, "logits/rejected": -1.9303442239761353, "logps/chosen": -533.4217529296875, "logps/rejected": -713.83740234375, "loss": 0.4145, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -3.180513858795166, "rewards/margins": 1.4735915660858154, "rewards/rejected": -4.6541056632995605, "step": 130 }, { "epoch": 0.266173752310536, "grad_norm": 18.58702447039976, "learning_rate": 4.592916195656321e-07, "logits/chosen": -2.0613300800323486, "logits/rejected": -1.971636414527893, "logps/chosen": -469.5445251464844, "logps/rejected": -650.7494506835938, "loss": 0.4332, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.464566946029663, "rewards/margins": 1.3873087167739868, "rewards/rejected": -3.8518757820129395, "step": 135 }, { "epoch": 0.27603203943314847, "grad_norm": 43.43885248557689, "learning_rate": 4.544581688079602e-07, "logits/chosen": -1.8543685674667358, "logits/rejected": -1.960680365562439, "logps/chosen": -499.29150390625, "logps/rejected": -682.0525512695312, "loss": 0.4126, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -2.7792274951934814, "rewards/margins": 1.672224998474121, "rewards/rejected": -4.45145320892334, "step": 140 }, { "epoch": 0.2858903265557609, "grad_norm": 25.06136332684734, "learning_rate": 4.493821287789272e-07, "logits/chosen": -2.0097248554229736, "logits/rejected": -2.05975604057312, "logps/chosen": -622.1812744140625, "logps/rejected": -857.2575073242188, "loss": 0.4115, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -4.092832565307617, "rewards/margins": 1.9847408533096313, "rewards/rejected": -6.077573299407959, "step": 145 }, { "epoch": 0.2957486136783734, "grad_norm": 23.62970192824471, "learning_rate": 4.4406952219143934e-07, "logits/chosen": -1.9738140106201172, "logits/rejected": -1.8969192504882812, "logps/chosen": -505.8863220214844, "logps/rejected": -674.2682495117188, "loss": 0.4551, "rewards/accuracies": 0.8125, "rewards/chosen": -2.9365577697753906, "rewards/margins": 1.3609775304794312, "rewards/rejected": -4.297535419464111, "step": 150 }, { "epoch": 0.30560690080098585, "grad_norm": 36.928411112871835, "learning_rate": 4.38526652444224e-07, "logits/chosen": -1.9676620960235596, "logits/rejected": -1.9335002899169922, "logps/chosen": -526.3443603515625, "logps/rejected": -675.4140625, "loss": 0.4316, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -3.1301074028015137, "rewards/margins": 1.176997423171997, "rewards/rejected": -4.30710506439209, "step": 155 }, { "epoch": 0.3154651879235983, "grad_norm": 35.737507476172006, "learning_rate": 4.3276009614285824e-07, "logits/chosen": -2.08416748046875, "logits/rejected": -2.0275375843048096, "logps/chosen": -547.2161254882812, "logps/rejected": -734.8326416015625, "loss": 0.4361, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -3.296079635620117, "rewards/margins": 1.489527940750122, "rewards/rejected": -4.78560733795166, "step": 160 }, { "epoch": 0.32532347504621073, "grad_norm": 25.388193696092944, "learning_rate": 4.2677669529663686e-07, "logits/chosen": -1.8684972524642944, "logits/rejected": -1.98639714717865, "logps/chosen": -492.37518310546875, "logps/rejected": -663.5337524414062, "loss": 0.3908, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -2.736380100250244, "rewards/margins": 1.5142922401428223, "rewards/rejected": -4.250672340393066, "step": 165 }, { "epoch": 0.33518176216882317, "grad_norm": 26.163756341836816, "learning_rate": 4.2058354920054043e-07, "logits/chosen": -2.0008151531219482, "logits/rejected": -2.1545004844665527, "logps/chosen": -558.0103759765625, "logps/rejected": -783.8531494140625, "loss": 0.3635, "rewards/accuracies": 0.875, "rewards/chosen": -3.2563652992248535, "rewards/margins": 2.108079433441162, "rewards/rejected": -5.364445209503174, "step": 170 }, { "epoch": 0.3450400492914356, "grad_norm": 20.146161792615796, "learning_rate": 4.141880060119336e-07, "logits/chosen": -2.138545036315918, "logits/rejected": -2.1449027061462402, "logps/chosen": -580.8723754882812, "logps/rejected": -799.7882690429688, "loss": 0.4178, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -3.6856274604797363, "rewards/margins": 1.9810088872909546, "rewards/rejected": -5.6666364669799805, "step": 175 }, { "epoch": 0.35489833641404805, "grad_norm": 20.25459576341684, "learning_rate": 4.0759765403198877e-07, "logits/chosen": -1.9771722555160522, "logits/rejected": -1.9267823696136475, "logps/chosen": -448.6309509277344, "logps/rejected": -687.6984252929688, "loss": 0.3941, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -2.361887216567993, "rewards/margins": 1.849793791770935, "rewards/rejected": -4.211681365966797, "step": 180 }, { "epoch": 0.3647566235366605, "grad_norm": 23.732608340062967, "learning_rate": 4.008203127021797e-07, "logits/chosen": -2.0232439041137695, "logits/rejected": -2.0282373428344727, "logps/chosen": -536.0543212890625, "logps/rejected": -753.0247802734375, "loss": 0.3758, "rewards/accuracies": 0.8125, "rewards/chosen": -3.216007947921753, "rewards/margins": 1.8679723739624023, "rewards/rejected": -5.083980560302734, "step": 185 }, { "epoch": 0.37461491065927294, "grad_norm": 33.821388543016646, "learning_rate": 3.9386402332652754e-07, "logits/chosen": -2.0202414989471436, "logits/rejected": -1.956538200378418, "logps/chosen": -628.9379272460938, "logps/rejected": -831.4833984375, "loss": 0.4385, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -4.222132205963135, "rewards/margins": 1.7108278274536133, "rewards/rejected": -5.93295955657959, "step": 190 }, { "epoch": 0.3844731977818854, "grad_norm": 17.956228351745885, "learning_rate": 3.867370395306068e-07, "logits/chosen": -1.974908471107483, "logits/rejected": -1.9330415725708008, "logps/chosen": -509.0133361816406, "logps/rejected": -720.5633544921875, "loss": 0.3801, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.9117178916931152, "rewards/margins": 1.6813218593597412, "rewards/rejected": -4.593040466308594, "step": 195 }, { "epoch": 0.3943314849044978, "grad_norm": 24.48103397679138, "learning_rate": 3.794478174686328e-07, "logits/chosen": -1.9475266933441162, "logits/rejected": -1.9687010049819946, "logps/chosen": -549.758544921875, "logps/rejected": -740.8396606445312, "loss": 0.4111, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -3.3982017040252686, "rewards/margins": 1.5769809484481812, "rewards/rejected": -4.97518253326416, "step": 200 }, { "epoch": 0.4041897720271103, "grad_norm": 19.929793517914295, "learning_rate": 3.720050057902495e-07, "logits/chosen": -2.11773419380188, "logits/rejected": -2.0510640144348145, "logps/chosen": -678.2037353515625, "logps/rejected": -897.92822265625, "loss": 0.3989, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -4.697990894317627, "rewards/margins": 1.6948550939559937, "rewards/rejected": -6.39284610748291, "step": 205 }, { "epoch": 0.41404805914972276, "grad_norm": 19.138382009358025, "learning_rate": 3.644174353789204e-07, "logits/chosen": -1.96860671043396, "logits/rejected": -1.9445680379867554, "logps/chosen": -541.2803955078125, "logps/rejected": -714.5045776367188, "loss": 0.3758, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -3.2117228507995605, "rewards/margins": 1.4313344955444336, "rewards/rejected": -4.643057346343994, "step": 210 }, { "epoch": 0.4239063462723352, "grad_norm": 22.61062071667254, "learning_rate": 3.566941088741009e-07, "logits/chosen": -1.9290311336517334, "logits/rejected": -1.9250952005386353, "logps/chosen": -502.6095275878906, "logps/rejected": -698.4926147460938, "loss": 0.3967, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.9526402950286865, "rewards/margins": 1.6423494815826416, "rewards/rejected": -4.594989776611328, "step": 215 }, { "epoch": 0.43376463339494764, "grad_norm": 28.506261562704676, "learning_rate": 3.488441899896217e-07, "logits/chosen": -2.1637561321258545, "logits/rejected": -1.9638168811798096, "logps/chosen": -579.2008056640625, "logps/rejected": -836.2589111328125, "loss": 0.3974, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -3.5565972328186035, "rewards/margins": 2.140427589416504, "rewards/rejected": -5.697024345397949, "step": 220 }, { "epoch": 0.4436229205175601, "grad_norm": 22.104238159035294, "learning_rate": 3.408769926409574e-07, "logits/chosen": -1.9999799728393555, "logits/rejected": -1.9067310094833374, "logps/chosen": -533.4635009765625, "logps/rejected": -767.3900146484375, "loss": 0.3601, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -3.1343350410461426, "rewards/margins": 1.9703528881072998, "rewards/rejected": -5.104687690734863, "step": 225 }, { "epoch": 0.4534812076401725, "grad_norm": 21.86054071865173, "learning_rate": 3.3280196989428263e-07, "logits/chosen": -2.0549824237823486, "logits/rejected": -2.079737424850464, "logps/chosen": -571.4501342773438, "logps/rejected": -805.6971435546875, "loss": 0.3644, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -3.492208480834961, "rewards/margins": 2.0622007846832275, "rewards/rejected": -5.554409027099609, "step": 230 }, { "epoch": 0.46333949476278496, "grad_norm": 28.670025336805338, "learning_rate": 3.2462870275042367e-07, "logits/chosen": -2.086364269256592, "logits/rejected": -2.082109212875366, "logps/chosen": -627.2444458007812, "logps/rejected": -857.6990356445312, "loss": 0.3692, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -3.9953174591064453, "rewards/margins": 2.085484266281128, "rewards/rejected": -6.080801963806152, "step": 235 }, { "epoch": 0.4731977818853974, "grad_norm": 23.098928119258375, "learning_rate": 3.1636688877701806e-07, "logits/chosen": -1.9278815984725952, "logits/rejected": -2.008877992630005, "logps/chosen": -536.9634399414062, "logps/rejected": -782.7907104492188, "loss": 0.3307, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -3.128661632537842, "rewards/margins": 2.186957836151123, "rewards/rejected": -5.315619468688965, "step": 240 }, { "epoch": 0.48305606900800985, "grad_norm": 18.256316767301172, "learning_rate": 3.080263306023669e-07, "logits/chosen": -1.9272663593292236, "logits/rejected": -1.9132862091064453, "logps/chosen": -510.236328125, "logps/rejected": -714.0992431640625, "loss": 0.3866, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -3.079207420349121, "rewards/margins": 1.7918453216552734, "rewards/rejected": -4.8710527420043945, "step": 245 }, { "epoch": 0.4929143561306223, "grad_norm": 21.751680260746046, "learning_rate": 2.996169242846328e-07, "logits/chosen": -1.8919010162353516, "logits/rejected": -1.9492820501327515, "logps/chosen": -575.5780029296875, "logps/rejected": -819.9085693359375, "loss": 0.3276, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -3.592189311981201, "rewards/margins": 2.1720731258392334, "rewards/rejected": -5.764262676239014, "step": 250 }, { "epoch": 0.5027726432532348, "grad_norm": 22.663811321818965, "learning_rate": 2.911486475701835e-07, "logits/chosen": -1.8436260223388672, "logits/rejected": -1.8624000549316406, "logps/chosen": -532.0939331054688, "logps/rejected": -772.1865234375, "loss": 0.3646, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -3.234412670135498, "rewards/margins": 2.037332773208618, "rewards/rejected": -5.271745681762695, "step": 255 }, { "epoch": 0.5126309303758472, "grad_norm": 45.86539600331869, "learning_rate": 2.826315480550129e-07, "logits/chosen": -1.8276054859161377, "logits/rejected": -1.944835901260376, "logps/chosen": -522.5578002929688, "logps/rejected": -729.6175537109375, "loss": 0.3653, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -3.1025471687316895, "rewards/margins": 1.926390290260315, "rewards/rejected": -5.028937339782715, "step": 260 }, { "epoch": 0.5224892174984597, "grad_norm": 33.790931231853406, "learning_rate": 2.740757312632854e-07, "logits/chosen": -1.9260978698730469, "logits/rejected": -1.8717044591903687, "logps/chosen": -576.6935424804688, "logps/rejected": -834.5701904296875, "loss": 0.3316, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -3.7069427967071533, "rewards/margins": 2.2525296211242676, "rewards/rejected": -5.959472179412842, "step": 265 }, { "epoch": 0.532347504621072, "grad_norm": 32.72135751726444, "learning_rate": 2.654913486571487e-07, "logits/chosen": -1.928877830505371, "logits/rejected": -1.9832346439361572, "logps/chosen": -580.7061767578125, "logps/rejected": -821.77734375, "loss": 0.3773, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -3.5770630836486816, "rewards/margins": 2.1589841842651367, "rewards/rejected": -5.73604679107666, "step": 270 }, { "epoch": 0.5422057917436846, "grad_norm": 18.44880000765859, "learning_rate": 2.5688858559204053e-07, "logits/chosen": -1.8500230312347412, "logits/rejected": -1.8931682109832764, "logps/chosen": -484.74420166015625, "logps/rejected": -701.5289916992188, "loss": 0.3747, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -2.6471669673919678, "rewards/margins": 1.9642257690429688, "rewards/rejected": -4.611392974853516, "step": 275 }, { "epoch": 0.5520640788662969, "grad_norm": 23.335141498824942, "learning_rate": 2.4827764923178246e-07, "logits/chosen": -1.8331562280654907, "logits/rejected": -1.9513938426971436, "logps/chosen": -470.59405517578125, "logps/rejected": -660.6781005859375, "loss": 0.3683, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -2.504983901977539, "rewards/margins": 1.7337911128997803, "rewards/rejected": -4.23877477645874, "step": 280 }, { "epoch": 0.5619223659889094, "grad_norm": 26.351304197321983, "learning_rate": 2.3966875643779667e-07, "logits/chosen": -2.0291342735290527, "logits/rejected": -1.9187507629394531, "logps/chosen": -495.74639892578125, "logps/rejected": -760.9273681640625, "loss": 0.3506, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -2.894580125808716, "rewards/margins": 2.167811632156372, "rewards/rejected": -5.062391757965088, "step": 285 }, { "epoch": 0.5717806531115218, "grad_norm": 21.633547530781627, "learning_rate": 2.3107212164681774e-07, "logits/chosen": -1.871260643005371, "logits/rejected": -1.916135549545288, "logps/chosen": -529.8262939453125, "logps/rejected": -772.5482177734375, "loss": 0.36, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -3.0958144664764404, "rewards/margins": 2.1699347496032715, "rewards/rejected": -5.265749454498291, "step": 290 }, { "epoch": 0.5816389402341343, "grad_norm": 28.65131510288306, "learning_rate": 2.2249794475148019e-07, "logits/chosen": -2.063917636871338, "logits/rejected": -2.049710750579834, "logps/chosen": -510.1465759277344, "logps/rejected": -759.2296752929688, "loss": 0.3827, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -2.9606268405914307, "rewards/margins": 2.110110282897949, "rewards/rejected": -5.070736885070801, "step": 295 }, { "epoch": 0.5914972273567468, "grad_norm": 23.359544656067033, "learning_rate": 2.1395639899816332e-07, "logits/chosen": -2.2645859718322754, "logits/rejected": -1.9906375408172607, "logps/chosen": -541.4847412109375, "logps/rejected": -846.5947265625, "loss": 0.3488, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -3.3356003761291504, "rewards/margins": 2.4959442615509033, "rewards/rejected": -5.831544399261475, "step": 300 }, { "epoch": 0.6013555144793592, "grad_norm": 24.944829150573064, "learning_rate": 2.0545761891645177e-07, "logits/chosen": -2.0867130756378174, "logits/rejected": -2.074833393096924, "logps/chosen": -642.1096801757812, "logps/rejected": -906.7780151367188, "loss": 0.3502, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -4.175184726715088, "rewards/margins": 2.322237253189087, "rewards/rejected": -6.497422218322754, "step": 305 }, { "epoch": 0.6112138016019717, "grad_norm": 24.979816541182146, "learning_rate": 1.9701168829453305e-07, "logits/chosen": -1.932847023010254, "logits/rejected": -1.9259026050567627, "logps/chosen": -570.7978515625, "logps/rejected": -823.3259887695312, "loss": 0.3411, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -3.579385757446289, "rewards/margins": 2.1752305030822754, "rewards/rejected": -5.7546162605285645, "step": 310 }, { "epoch": 0.6210720887245841, "grad_norm": 24.585502500513254, "learning_rate": 1.886286282148002e-07, "logits/chosen": -2.069624662399292, "logits/rejected": -1.978257179260254, "logps/chosen": -547.39794921875, "logps/rejected": -787.85302734375, "loss": 0.331, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -3.470412015914917, "rewards/margins": 2.049595594406128, "rewards/rejected": -5.520008087158203, "step": 315 }, { "epoch": 0.6309303758471966, "grad_norm": 29.24520617120494, "learning_rate": 1.8031838516385422e-07, "logits/chosen": -2.089122772216797, "logits/rejected": -2.0376973152160645, "logps/chosen": -622.2824096679688, "logps/rejected": -920.2009887695312, "loss": 0.3733, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -4.0230793952941895, "rewards/margins": 2.4440813064575195, "rewards/rejected": -6.467160701751709, "step": 320 }, { "epoch": 0.640788662969809, "grad_norm": 20.742877534346576, "learning_rate": 1.7209081923101472e-07, "logits/chosen": -2.0211918354034424, "logits/rejected": -2.014601230621338, "logps/chosen": -589.6067504882812, "logps/rejected": -773.7950439453125, "loss": 0.3436, "rewards/accuracies": 0.8125, "rewards/chosen": -3.677738904953003, "rewards/margins": 1.7034008502960205, "rewards/rejected": -5.381140232086182, "step": 325 }, { "epoch": 0.6506469500924215, "grad_norm": 16.903001935618324, "learning_rate": 1.639556924093404e-07, "logits/chosen": -1.8897491693496704, "logits/rejected": -1.88128662109375, "logps/chosen": -517.2490844726562, "logps/rejected": -746.6140747070312, "loss": 0.3561, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -3.064331531524658, "rewards/margins": 1.9852135181427002, "rewards/rejected": -5.0495452880859375, "step": 330 }, { "epoch": 0.6605052372150338, "grad_norm": 19.809662336676986, "learning_rate": 1.5592265701304114e-07, "logits/chosen": -2.0255661010742188, "logits/rejected": -1.944502592086792, "logps/chosen": -566.5452270507812, "logps/rejected": -803.533203125, "loss": 0.3705, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -3.5436272621154785, "rewards/margins": 2.0192878246307373, "rewards/rejected": -5.562914848327637, "step": 335 }, { "epoch": 0.6703635243376463, "grad_norm": 26.202979422607854, "learning_rate": 1.4800124422502334e-07, "logits/chosen": -1.918569803237915, "logits/rejected": -2.0119967460632324, "logps/chosen": -601.0817260742188, "logps/rejected": -845.7435302734375, "loss": 0.3597, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -3.8363289833068848, "rewards/margins": 2.0620241165161133, "rewards/rejected": -5.89835262298584, "step": 340 }, { "epoch": 0.6802218114602587, "grad_norm": 26.077309548266044, "learning_rate": 1.4020085278815743e-07, "logits/chosen": -2.0037617683410645, "logits/rejected": -1.8837954998016357, "logps/chosen": -645.288818359375, "logps/rejected": -909.7770385742188, "loss": 0.3434, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -4.325263500213623, "rewards/margins": 2.183290719985962, "rewards/rejected": -6.508553981781006, "step": 345 }, { "epoch": 0.6900800985828712, "grad_norm": 20.206758195915803, "learning_rate": 1.3253073785368545e-07, "logits/chosen": -1.97844660282135, "logits/rejected": -1.9779163599014282, "logps/chosen": -656.6150512695312, "logps/rejected": -917.7893676757812, "loss": 0.3432, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -4.431666374206543, "rewards/margins": 2.2607076168060303, "rewards/rejected": -6.692374229431152, "step": 350 }, { "epoch": 0.6999383857054837, "grad_norm": 24.24143829005782, "learning_rate": 1.2500000000000005e-07, "logits/chosen": -2.066188335418701, "logits/rejected": -2.054232120513916, "logps/chosen": -643.3806762695312, "logps/rejected": -863.3739013671875, "loss": 0.3583, "rewards/accuracies": 0.78125, "rewards/chosen": -4.284465789794922, "rewards/margins": 1.9454777240753174, "rewards/rejected": -6.22994327545166, "step": 355 }, { "epoch": 0.7097966728280961, "grad_norm": 17.76728293699117, "learning_rate": 1.1761757443482285e-07, "logits/chosen": -1.8952592611312866, "logits/rejected": -1.8270065784454346, "logps/chosen": -567.1143798828125, "logps/rejected": -791.7889404296875, "loss": 0.3722, "rewards/accuracies": 0.8125, "rewards/chosen": -3.542332172393799, "rewards/margins": 1.8870967626571655, "rewards/rejected": -5.429428577423096, "step": 360 }, { "epoch": 0.7196549599507086, "grad_norm": 25.546590661527123, "learning_rate": 1.1039222039359644e-07, "logits/chosen": -1.9491792917251587, "logits/rejected": -1.8340580463409424, "logps/chosen": -522.3615112304688, "logps/rejected": -782.1358032226562, "loss": 0.3194, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -3.073195219039917, "rewards/margins": 2.195949077606201, "rewards/rejected": -5.269144535064697, "step": 365 }, { "epoch": 0.729513247073321, "grad_norm": 25.390221264918292, "learning_rate": 1.0333251074666608e-07, "logits/chosen": -1.8948665857315063, "logits/rejected": -1.8821592330932617, "logps/chosen": -578.3306884765625, "logps/rejected": -830.9544677734375, "loss": 0.3285, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -3.6443309783935547, "rewards/margins": 2.2878963947296143, "rewards/rejected": -5.932227611541748, "step": 370 }, { "epoch": 0.7393715341959335, "grad_norm": 23.32295380693496, "learning_rate": 9.644682182758304e-08, "logits/chosen": -1.8538382053375244, "logits/rejected": -1.8016763925552368, "logps/chosen": -604.1889038085938, "logps/rejected": -873.8849487304688, "loss": 0.3055, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -3.8995869159698486, "rewards/margins": 2.3217251300811768, "rewards/rejected": -6.221312046051025, "step": 375 }, { "epoch": 0.7492298213185459, "grad_norm": 23.34487045577994, "learning_rate": 8.974332349459992e-08, "logits/chosen": -1.913751244544983, "logits/rejected": -1.8759132623672485, "logps/chosen": -620.8341064453125, "logps/rejected": -879.5367431640625, "loss": 0.3497, "rewards/accuracies": 0.84375, "rewards/chosen": -3.9944968223571777, "rewards/margins": 2.277583360671997, "rewards/rejected": -6.272080421447754, "step": 380 }, { "epoch": 0.7590881084411584, "grad_norm": 32.80860271044305, "learning_rate": 8.322996943714672e-08, "logits/chosen": -1.9127395153045654, "logits/rejected": -1.716653823852539, "logps/chosen": -559.0478515625, "logps/rejected": -869.8646240234375, "loss": 0.3553, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -3.513237714767456, "rewards/margins": 2.5553982257843018, "rewards/rejected": -6.068636894226074, "step": 385 }, { "epoch": 0.7689463955637708, "grad_norm": 22.70407770691601, "learning_rate": 7.691448773879256e-08, "logits/chosen": -1.8521419763565063, "logits/rejected": -1.7435353994369507, "logps/chosen": -503.28369140625, "logps/rejected": -769.7913818359375, "loss": 0.3582, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -2.978849172592163, "rewards/margins": 2.2009291648864746, "rewards/rejected": -5.179778575897217, "step": 390 }, { "epoch": 0.7788046826863833, "grad_norm": 26.31642324943315, "learning_rate": 7.080437170788722e-08, "logits/chosen": -1.9601354598999023, "logits/rejected": -1.8902816772460938, "logps/chosen": -511.15478515625, "logps/rejected": -765.9757690429688, "loss": 0.3126, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -3.040039539337158, "rewards/margins": 2.2116754055023193, "rewards/rejected": -5.251715183258057, "step": 395 }, { "epoch": 0.7886629698089956, "grad_norm": 73.40724109949657, "learning_rate": 6.490687098676332e-08, "logits/chosen": -1.776098608970642, "logits/rejected": -1.7230415344238281, "logps/chosen": -568.650634765625, "logps/rejected": -813.4503173828125, "loss": 0.347, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -3.520684003829956, "rewards/margins": 2.105372190475464, "rewards/rejected": -5.626055717468262, "step": 400 }, { "epoch": 0.7886629698089956, "eval_logits/chosen": -2.5592944622039795, "eval_logits/rejected": -2.4283623695373535, "eval_logps/chosen": -432.9762878417969, "eval_logps/rejected": -481.0541076660156, "eval_loss": 0.5772423148155212, "eval_rewards/accuracies": 0.6794354915618896, "eval_rewards/chosen": -1.7008415460586548, "eval_rewards/margins": 0.37072598934173584, "eval_rewards/rejected": -2.0715677738189697, "eval_runtime": 324.936, "eval_samples_per_second": 6.081, "eval_steps_per_second": 0.382, "step": 400 }, { "epoch": 0.7985212569316081, "grad_norm": 33.809746730746596, "learning_rate": 5.9228982950048414e-08, "logits/chosen": -1.7156673669815063, "logits/rejected": -1.7448875904083252, "logps/chosen": -582.68603515625, "logps/rejected": -882.1572265625, "loss": 0.3578, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -3.6797871589660645, "rewards/margins": 2.6076254844665527, "rewards/rejected": -6.287413120269775, "step": 405 }, { "epoch": 0.8083795440542206, "grad_norm": 23.982039805708112, "learning_rate": 5.3777444402291345e-08, "logits/chosen": -1.9656894207000732, "logits/rejected": -1.7757899761199951, "logps/chosen": -602.1336059570312, "logps/rejected": -914.3304443359375, "loss": 0.2749, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -3.811291456222534, "rewards/margins": 2.5836830139160156, "rewards/rejected": -6.394974708557129, "step": 410 }, { "epoch": 0.818237831176833, "grad_norm": 21.999889032487328, "learning_rate": 4.855872358475546e-08, "logits/chosen": -1.883536696434021, "logits/rejected": -1.8990424871444702, "logps/chosen": -593.3975219726562, "logps/rejected": -852.3743896484375, "loss": 0.3421, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -3.749640941619873, "rewards/margins": 2.2763512134552, "rewards/rejected": -6.025992393493652, "step": 415 }, { "epoch": 0.8280961182994455, "grad_norm": 22.43509931864549, "learning_rate": 4.357901250086107e-08, "logits/chosen": -1.9897289276123047, "logits/rejected": -1.8019778728485107, "logps/chosen": -604.5925903320312, "logps/rejected": -907.0695190429688, "loss": 0.34, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -3.7366394996643066, "rewards/margins": 2.591937303543091, "rewards/rejected": -6.328576564788818, "step": 420 }, { "epoch": 0.8379544054220579, "grad_norm": 24.272876807226076, "learning_rate": 3.884421956938377e-08, "logits/chosen": -1.7035375833511353, "logits/rejected": -1.8067095279693604, "logps/chosen": -621.3763427734375, "logps/rejected": -809.0113525390625, "loss": 0.3274, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -3.943162441253662, "rewards/margins": 1.8734540939331055, "rewards/rejected": -5.816616535186768, "step": 425 }, { "epoch": 0.8478126925446704, "grad_norm": 20.673588966056126, "learning_rate": 3.435996261412591e-08, "logits/chosen": -1.7106269598007202, "logits/rejected": -1.7173693180084229, "logps/chosen": -582.3190307617188, "logps/rejected": -837.8707275390625, "loss": 0.3204, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -3.7369446754455566, "rewards/margins": 2.223895311355591, "rewards/rejected": -5.960839748382568, "step": 430 }, { "epoch": 0.8576709796672828, "grad_norm": 21.70614636700232, "learning_rate": 3.013156219837776e-08, "logits/chosen": -2.0358176231384277, "logits/rejected": -1.7434278726577759, "logps/chosen": -567.6253662109375, "logps/rejected": -890.8966674804688, "loss": 0.3264, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -3.516098737716675, "rewards/margins": 2.7037405967712402, "rewards/rejected": -6.219839096069336, "step": 435 }, { "epoch": 0.8675292667898953, "grad_norm": 32.0982872650184, "learning_rate": 2.6164035312078447e-08, "logits/chosen": -1.87311589717865, "logits/rejected": -1.8581056594848633, "logps/chosen": -588.0389404296875, "logps/rejected": -895.1696166992188, "loss": 0.3188, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -3.7721753120422363, "rewards/margins": 2.676305055618286, "rewards/rejected": -6.448480129241943, "step": 440 }, { "epoch": 0.8773875539125077, "grad_norm": 21.51066896519883, "learning_rate": 2.2462089419165776e-08, "logits/chosen": -1.8648655414581299, "logits/rejected": -1.7761850357055664, "logps/chosen": -582.1537475585938, "logps/rejected": -876.07080078125, "loss": 0.3584, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -3.821885347366333, "rewards/margins": 2.4609155654907227, "rewards/rejected": -6.282800197601318, "step": 445 }, { "epoch": 0.8872458410351202, "grad_norm": 23.61458187816769, "learning_rate": 1.9030116872178314e-08, "logits/chosen": -1.8204158544540405, "logits/rejected": -1.798825979232788, "logps/chosen": -608.7778930664062, "logps/rejected": -842.8968505859375, "loss": 0.355, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -3.7794837951660156, "rewards/margins": 2.1480660438537598, "rewards/rejected": -5.927549839019775, "step": 450 }, { "epoch": 0.8971041281577325, "grad_norm": 19.975596086165712, "learning_rate": 1.5872189700736337e-08, "logits/chosen": -1.7636759281158447, "logits/rejected": -1.8992855548858643, "logps/chosen": -585.3933715820312, "logps/rejected": -801.01025390625, "loss": 0.3525, "rewards/accuracies": 0.8125, "rewards/chosen": -3.633349657058716, "rewards/margins": 1.9284839630126953, "rewards/rejected": -5.561833381652832, "step": 455 }, { "epoch": 0.906962415280345, "grad_norm": 24.116575473235745, "learning_rate": 1.2992054780085692e-08, "logits/chosen": -1.6149314641952515, "logits/rejected": -1.6830947399139404, "logps/chosen": -552.21728515625, "logps/rejected": -793.0897216796875, "loss": 0.3263, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -3.4062886238098145, "rewards/margins": 2.0678482055664062, "rewards/rejected": -5.474137306213379, "step": 460 }, { "epoch": 0.9168207024029574, "grad_norm": 20.57194341940523, "learning_rate": 1.0393129385436823e-08, "logits/chosen": -1.9199676513671875, "logits/rejected": -1.8623239994049072, "logps/chosen": -570.3748779296875, "logps/rejected": -817.0477294921875, "loss": 0.3323, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -3.562505006790161, "rewards/margins": 2.1840949058532715, "rewards/rejected": -5.7465996742248535, "step": 465 }, { "epoch": 0.9266789895255699, "grad_norm": 23.30110640610616, "learning_rate": 8.078497137373242e-09, "logits/chosen": -1.7810325622558594, "logits/rejected": -1.7818634510040283, "logps/chosen": -555.9640502929688, "logps/rejected": -835.23876953125, "loss": 0.3237, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -3.4883415699005127, "rewards/margins": 2.179206609725952, "rewards/rejected": -5.667548179626465, "step": 470 }, { "epoch": 0.9365372766481824, "grad_norm": 21.422635902068766, "learning_rate": 6.0509043431410945e-09, "logits/chosen": -1.7087141275405884, "logits/rejected": -1.772657036781311, "logps/chosen": -568.8113403320312, "logps/rejected": -804.4452514648438, "loss": 0.3425, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -3.5404930114746094, "rewards/margins": 2.15417742729187, "rewards/rejected": -5.694670677185059, "step": 475 }, { "epoch": 0.9463955637707948, "grad_norm": 22.24576845817703, "learning_rate": 4.312756738160145e-09, "logits/chosen": -1.8130733966827393, "logits/rejected": -1.7939121723175049, "logps/chosen": -561.7185668945312, "logps/rejected": -826.4733276367188, "loss": 0.3187, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -3.5638492107391357, "rewards/margins": 2.2962565422058105, "rewards/rejected": -5.860105991363525, "step": 480 }, { "epoch": 0.9562538508934073, "grad_norm": 20.818504977861426, "learning_rate": 2.8661166316229223e-09, "logits/chosen": -1.7990143299102783, "logits/rejected": -1.7799808979034424, "logps/chosen": -545.7501220703125, "logps/rejected": -777.5648193359375, "loss": 0.338, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -3.4501044750213623, "rewards/margins": 1.9422149658203125, "rewards/rejected": -5.392319202423096, "step": 485 }, { "epoch": 0.9661121380160197, "grad_norm": 21.367843020001658, "learning_rate": 1.7127004595681727e-09, "logits/chosen": -1.8907989263534546, "logits/rejected": -1.803995132446289, "logps/chosen": -572.9863891601562, "logps/rejected": -869.6575317382812, "loss": 0.3514, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -3.549314498901367, "rewards/margins": 2.517167568206787, "rewards/rejected": -6.066482067108154, "step": 490 }, { "epoch": 0.9759704251386322, "grad_norm": 26.301841729679015, "learning_rate": 8.538767483325383e-10, "logits/chosen": -1.6898645162582397, "logits/rejected": -1.872666597366333, "logps/chosen": -564.5504760742188, "logps/rejected": -813.0301513671875, "loss": 0.3249, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -3.5329060554504395, "rewards/margins": 2.330728054046631, "rewards/rejected": -5.863633632659912, "step": 495 }, { "epoch": 0.9858287122612446, "grad_norm": 28.33067138539654, "learning_rate": 2.9066449079634404e-10, "logits/chosen": -1.81964910030365, "logits/rejected": -1.7677667140960693, "logps/chosen": -553.2039184570312, "logps/rejected": -806.8800048828125, "loss": 0.3026, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -3.421668291091919, "rewards/margins": 2.2354369163513184, "rewards/rejected": -5.657104969024658, "step": 500 }, { "epoch": 0.9956869993838571, "grad_norm": 23.713797105940532, "learning_rate": 2.3731937350224273e-11, "logits/chosen": -1.9265756607055664, "logits/rejected": -1.8447071313858032, "logps/chosen": -565.0730590820312, "logps/rejected": -841.3292236328125, "loss": 0.3122, "rewards/accuracies": 0.875, "rewards/chosen": -3.6411995887756348, "rewards/margins": 2.3857717514038086, "rewards/rejected": -6.026970863342285, "step": 505 }, { "epoch": 0.999630314232902, "step": 507, "total_flos": 0.0, "train_loss": 0.41502543125867375, "train_runtime": 18234.8908, "train_samples_per_second": 3.56, "train_steps_per_second": 0.028 } ], "logging_steps": 5, "max_steps": 507, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 1000000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }