{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.988679245283019, "eval_steps": 500, "global_step": 396, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01, "grad_norm": 6.326096763058934, "learning_rate": 1.25e-08, "logps/chosen": -36.02279281616211, "logps/rejected": -41.85474395751953, "loss": 0.6931, "losses/dpo": 0.6931471824645996, "losses/sft": 1.3949329853057861, "losses/total": 0.6931471824645996, "ref_logps/chosen": -36.02279281616211, "ref_logps/rejected": -41.85474395751953, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.02, "grad_norm": 5.576784855809719, "learning_rate": 2.5e-08, "logps/chosen": -33.77919387817383, "logps/rejected": -41.04405975341797, "loss": 0.6931, "losses/dpo": 0.6931471824645996, "losses/sft": 1.3951497077941895, "losses/total": 0.6931471824645996, "ref_logps/chosen": -33.77919387817383, "ref_logps/rejected": -41.04405975341797, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 2 }, { "epoch": 0.02, "grad_norm": 6.263050301271669, "learning_rate": 3.75e-08, "logps/chosen": -38.8697509765625, "logps/rejected": -48.85557556152344, "loss": 0.6931, "losses/dpo": 0.6860073804855347, "losses/sft": 1.6376307010650635, "losses/total": 0.6860073804855347, "ref_logps/chosen": -38.87074279785156, "ref_logps/rejected": -48.853511810302734, "rewards/accuracies": 0.515625, "rewards/chosen": 9.899254655465484e-05, "rewards/margins": 0.0003055855631828308, "rewards/rejected": -0.0002065933949779719, "step": 3 }, { "epoch": 0.03, "grad_norm": 5.738951829533344, "learning_rate": 5e-08, "logps/chosen": -36.64889144897461, "logps/rejected": -42.698097229003906, "loss": 0.6924, "losses/dpo": 0.6935421228408813, "losses/sft": 1.4897900819778442, "losses/total": 0.6935421228408813, "ref_logps/chosen": -36.668033599853516, "ref_logps/rejected": -42.70002746582031, "rewards/accuracies": 0.484375, "rewards/chosen": 0.0019142806995660067, "rewards/margins": 0.001721527660265565, "rewards/rejected": 0.00019275324302725494, "step": 4 }, { "epoch": 0.04, "grad_norm": 6.637504299884417, "learning_rate": 6.25e-08, "logps/chosen": -41.41233825683594, "logps/rejected": -47.04777145385742, "loss": 0.6939, "losses/dpo": 0.6956198215484619, "losses/sft": 1.1974728107452393, "losses/total": 0.6956198215484619, "ref_logps/chosen": -41.40231704711914, "ref_logps/rejected": -47.051856994628906, "rewards/accuracies": 0.46875, "rewards/chosen": -0.0010022701462730765, "rewards/margins": -0.0014111174969002604, "rewards/rejected": 0.00040884732152335346, "step": 5 }, { "epoch": 0.05, "grad_norm": 5.760428498194468, "learning_rate": 7.5e-08, "logps/chosen": -34.51856994628906, "logps/rejected": -41.675804138183594, "loss": 0.6946, "losses/dpo": 0.6942625641822815, "losses/sft": 1.3869932889938354, "losses/total": 0.6942625641822815, "ref_logps/chosen": -34.49778747558594, "ref_logps/rejected": -41.68275451660156, "rewards/accuracies": 0.421875, "rewards/chosen": -0.0020779455080628395, "rewards/margins": -0.002773313783109188, "rewards/rejected": 0.0006953685078769922, "step": 6 }, { "epoch": 0.05, "grad_norm": 5.773272972704967, "learning_rate": 8.75e-08, "logps/chosen": -36.6628303527832, "logps/rejected": -42.856834411621094, "loss": 0.6927, "losses/dpo": 0.6944370269775391, "losses/sft": 1.2695866823196411, "losses/total": 0.6944370269775391, "ref_logps/chosen": -36.669891357421875, "ref_logps/rejected": -42.8528938293457, "rewards/accuracies": 0.5, "rewards/chosen": 0.0007062811637297273, "rewards/margins": 0.0011000875383615494, "rewards/rejected": -0.00039380654925480485, "step": 7 }, { "epoch": 0.06, "grad_norm": 6.837371123256996, "learning_rate": 1e-07, "logps/chosen": -41.66258239746094, "logps/rejected": -43.34931182861328, "loss": 0.6943, "losses/dpo": 0.6919102668762207, "losses/sft": 1.317617654800415, "losses/total": 0.6919102668762207, "ref_logps/chosen": -41.65662384033203, "ref_logps/rejected": -43.36621856689453, "rewards/accuracies": 0.5, "rewards/chosen": -0.0005958047113381326, "rewards/margins": -0.0022863391786813736, "rewards/rejected": 0.0016905345255509019, "step": 8 }, { "epoch": 0.07, "grad_norm": 6.116952924505858, "learning_rate": 1.125e-07, "logps/chosen": -37.05712890625, "logps/rejected": -46.517696380615234, "loss": 0.6917, "losses/dpo": 0.692311704158783, "losses/sft": 1.112796664237976, "losses/total": 0.692311704158783, "ref_logps/chosen": -37.07765197753906, "ref_logps/rejected": -46.507904052734375, "rewards/accuracies": 0.6015625, "rewards/chosen": 0.0020526223815977573, "rewards/margins": 0.003031900618225336, "rewards/rejected": -0.0009792782366275787, "step": 9 }, { "epoch": 0.08, "grad_norm": 5.761856386512759, "learning_rate": 1.25e-07, "logps/chosen": -33.799774169921875, "logps/rejected": -41.23558044433594, "loss": 0.6924, "losses/dpo": 0.6941465139389038, "losses/sft": 1.1185486316680908, "losses/total": 0.6941465139389038, "ref_logps/chosen": -33.81248474121094, "ref_logps/rejected": -41.23206329345703, "rewards/accuracies": 0.5234375, "rewards/chosen": 0.0012711097951978445, "rewards/margins": 0.0016234376234933734, "rewards/rejected": -0.0003523279447108507, "step": 10 }, { "epoch": 0.08, "grad_norm": 5.743856364003174, "learning_rate": 1.375e-07, "logps/chosen": -36.227317810058594, "logps/rejected": -40.51737976074219, "loss": 0.6927, "losses/dpo": 0.6916883587837219, "losses/sft": 1.4357692003250122, "losses/total": 0.6916883587837219, "ref_logps/chosen": -36.23785400390625, "ref_logps/rejected": -40.51884078979492, "rewards/accuracies": 0.53125, "rewards/chosen": 0.0010535644832998514, "rewards/margins": 0.0009071138338185847, "rewards/rejected": 0.0001464505330659449, "step": 11 }, { "epoch": 0.09, "grad_norm": 5.9204140360711355, "learning_rate": 1.5e-07, "logps/chosen": -38.817134857177734, "logps/rejected": -42.217681884765625, "loss": 0.6925, "losses/dpo": 0.6985405683517456, "losses/sft": 1.4544084072113037, "losses/total": 0.6985405683517456, "ref_logps/chosen": -38.83327102661133, "ref_logps/rejected": -42.220176696777344, "rewards/accuracies": 0.4921875, "rewards/chosen": 0.0016135365003719926, "rewards/margins": 0.0013644276186823845, "rewards/rejected": 0.0002491088816896081, "step": 12 }, { "epoch": 0.1, "grad_norm": 6.190779724671626, "learning_rate": 1.625e-07, "logps/chosen": -37.33137512207031, "logps/rejected": -46.71794128417969, "loss": 0.6901, "losses/dpo": 0.6943342685699463, "losses/sft": 1.3390721082687378, "losses/total": 0.6943342685699463, "ref_logps/chosen": -37.34603500366211, "ref_logps/rejected": -46.670997619628906, "rewards/accuracies": 0.578125, "rewards/chosen": 0.0014661503955721855, "rewards/margins": 0.006160825490951538, "rewards/rejected": -0.004694675095379353, "step": 13 }, { "epoch": 0.11, "grad_norm": 5.535961166630566, "learning_rate": 1.75e-07, "logps/chosen": -34.35616683959961, "logps/rejected": -40.568878173828125, "loss": 0.6923, "losses/dpo": 0.6914072036743164, "losses/sft": 1.0790843963623047, "losses/total": 0.6914072036743164, "ref_logps/chosen": -34.35405731201172, "ref_logps/rejected": -40.548362731933594, "rewards/accuracies": 0.5390625, "rewards/chosen": -0.00021077337441965938, "rewards/margins": 0.001840681186877191, "rewards/rejected": -0.002051454270258546, "step": 14 }, { "epoch": 0.11, "grad_norm": 5.994324182587906, "learning_rate": 1.875e-07, "logps/chosen": -35.86518859863281, "logps/rejected": -41.03656005859375, "loss": 0.6932, "losses/dpo": 0.690587043762207, "losses/sft": 1.5208988189697266, "losses/total": 0.690587043762207, "ref_logps/chosen": -35.85986328125, "ref_logps/rejected": -41.031028747558594, "rewards/accuracies": 0.53125, "rewards/chosen": -0.0005322899669408798, "rewards/margins": 2.0701438188552856e-05, "rewards/rejected": -0.00055299187079072, "step": 15 }, { "epoch": 0.12, "grad_norm": 5.908538195984286, "learning_rate": 2e-07, "logps/chosen": -36.70806884765625, "logps/rejected": -39.733882904052734, "loss": 0.6926, "losses/dpo": 0.6891317963600159, "losses/sft": 1.1712957620620728, "losses/total": 0.6891317963600159, "ref_logps/chosen": -36.713016510009766, "ref_logps/rejected": -39.726341247558594, "rewards/accuracies": 0.515625, "rewards/chosen": 0.0004950040020048618, "rewards/margins": 0.0012491128873080015, "rewards/rejected": -0.0007541090017184615, "step": 16 }, { "epoch": 0.13, "grad_norm": 6.17892190890487, "learning_rate": 2.1249999999999998e-07, "logps/chosen": -38.60041046142578, "logps/rejected": -43.30579376220703, "loss": 0.694, "losses/dpo": 0.6916015148162842, "losses/sft": 1.3250274658203125, "losses/total": 0.6916015148162842, "ref_logps/chosen": -38.579288482666016, "ref_logps/rejected": -43.300140380859375, "rewards/accuracies": 0.4609375, "rewards/chosen": -0.0021122824400663376, "rewards/margins": -0.001547331572510302, "rewards/rejected": -0.0005649511003866792, "step": 17 }, { "epoch": 0.14, "grad_norm": 6.020104058580606, "learning_rate": 2.25e-07, "logps/chosen": -37.50771713256836, "logps/rejected": -41.76362609863281, "loss": 0.6919, "losses/dpo": 0.6925865411758423, "losses/sft": 1.3761729001998901, "losses/total": 0.6925865411758423, "ref_logps/chosen": -37.507423400878906, "ref_logps/rejected": -41.736366271972656, "rewards/accuracies": 0.5625, "rewards/chosen": -2.9282993637025356e-05, "rewards/margins": 0.0026969274040311575, "rewards/rejected": -0.002726210281252861, "step": 18 }, { "epoch": 0.14, "grad_norm": 5.765433605501229, "learning_rate": 2.3749999999999998e-07, "logps/chosen": -32.96650695800781, "logps/rejected": -42.942771911621094, "loss": 0.6917, "losses/dpo": 0.6924209594726562, "losses/sft": 1.1748046875, "losses/total": 0.6924209594726562, "ref_logps/chosen": -32.971160888671875, "ref_logps/rejected": -42.91703796386719, "rewards/accuracies": 0.5546875, "rewards/chosen": 0.00046532286796718836, "rewards/margins": 0.0030381008982658386, "rewards/rejected": -0.002572778146713972, "step": 19 }, { "epoch": 0.15, "grad_norm": 5.824641204085951, "learning_rate": 2.5e-07, "logps/chosen": -37.962188720703125, "logps/rejected": -43.213279724121094, "loss": 0.6932, "losses/dpo": 0.694710373878479, "losses/sft": 1.2030720710754395, "losses/total": 0.694710373878479, "ref_logps/chosen": -37.919189453125, "ref_logps/rejected": -43.171043395996094, "rewards/accuracies": 0.5078125, "rewards/chosen": -0.0043003251776099205, "rewards/margins": -7.65085278544575e-05, "rewards/rejected": -0.004223817028105259, "step": 20 }, { "epoch": 0.16, "grad_norm": 6.193915477797873, "learning_rate": 2.625e-07, "logps/chosen": -37.831146240234375, "logps/rejected": -47.295005798339844, "loss": 0.6954, "losses/dpo": 0.6994068622589111, "losses/sft": 1.2481111288070679, "losses/total": 0.6994068622589111, "ref_logps/chosen": -37.76181411743164, "ref_logps/rejected": -47.269264221191406, "rewards/accuracies": 0.4453125, "rewards/chosen": -0.006933108903467655, "rewards/margins": -0.004359052516520023, "rewards/rejected": -0.002574056386947632, "step": 21 }, { "epoch": 0.17, "grad_norm": 5.703247411276419, "learning_rate": 2.75e-07, "logps/chosen": -34.446189880371094, "logps/rejected": -42.82508850097656, "loss": 0.6939, "losses/dpo": 0.6885063648223877, "losses/sft": 1.2574893236160278, "losses/total": 0.6885063648223877, "ref_logps/chosen": -34.39026641845703, "ref_logps/rejected": -42.782203674316406, "rewards/accuracies": 0.4921875, "rewards/chosen": -0.005592696368694305, "rewards/margins": -0.0013045003870502114, "rewards/rejected": -0.004288196098059416, "step": 22 }, { "epoch": 0.17, "grad_norm": 5.586186575504996, "learning_rate": 2.8749999999999995e-07, "logps/chosen": -35.78218078613281, "logps/rejected": -46.140350341796875, "loss": 0.6929, "losses/dpo": 0.6936126351356506, "losses/sft": 1.4059488773345947, "losses/total": 0.6936126351356506, "ref_logps/chosen": -35.713783264160156, "ref_logps/rejected": -46.065738677978516, "rewards/accuracies": 0.53125, "rewards/chosen": -0.00683995708823204, "rewards/margins": 0.0006212468724697828, "rewards/rejected": -0.00746120372787118, "step": 23 }, { "epoch": 0.18, "grad_norm": 6.2807240194752, "learning_rate": 3e-07, "logps/chosen": -37.896644592285156, "logps/rejected": -43.448909759521484, "loss": 0.6914, "losses/dpo": 0.6850650310516357, "losses/sft": 1.4576250314712524, "losses/total": 0.6850650310516357, "ref_logps/chosen": -37.83003616333008, "ref_logps/rejected": -43.34458923339844, "rewards/accuracies": 0.5703125, "rewards/chosen": -0.00666093360632658, "rewards/margins": 0.00377137353643775, "rewards/rejected": -0.010432307608425617, "step": 24 }, { "epoch": 0.19, "grad_norm": 5.6714251513252485, "learning_rate": 3.1249999999999997e-07, "logps/chosen": -36.5435791015625, "logps/rejected": -41.46415710449219, "loss": 0.6923, "losses/dpo": 0.6902315020561218, "losses/sft": 1.3371169567108154, "losses/total": 0.6902315020561218, "ref_logps/chosen": -36.474647521972656, "ref_logps/rejected": -41.37662887573242, "rewards/accuracies": 0.5390625, "rewards/chosen": -0.006893564946949482, "rewards/margins": 0.001859544194303453, "rewards/rejected": -0.00875310879200697, "step": 25 }, { "epoch": 0.2, "grad_norm": 6.279222054280467, "learning_rate": 3.25e-07, "logps/chosen": -37.0484733581543, "logps/rejected": -44.5318603515625, "loss": 0.6919, "losses/dpo": 0.6916804313659668, "losses/sft": 1.2641081809997559, "losses/total": 0.6916804313659668, "ref_logps/chosen": -36.94280242919922, "ref_logps/rejected": -44.40016174316406, "rewards/accuracies": 0.5859375, "rewards/chosen": -0.010566946119070053, "rewards/margins": 0.0026026167906820774, "rewards/rejected": -0.013169562444090843, "step": 26 }, { "epoch": 0.2, "grad_norm": 5.569722658657549, "learning_rate": 3.375e-07, "logps/chosen": -33.24622344970703, "logps/rejected": -39.62266540527344, "loss": 0.6918, "losses/dpo": 0.6929441094398499, "losses/sft": 1.0789260864257812, "losses/total": 0.6929441094398499, "ref_logps/chosen": -33.152587890625, "ref_logps/rejected": -39.500396728515625, "rewards/accuracies": 0.53125, "rewards/chosen": -0.009363781660795212, "rewards/margins": 0.002862950786948204, "rewards/rejected": -0.012226731516420841, "step": 27 }, { "epoch": 0.21, "grad_norm": 6.178096448266021, "learning_rate": 3.5e-07, "logps/chosen": -40.909210205078125, "logps/rejected": -43.54678726196289, "loss": 0.6932, "losses/dpo": 0.6962201595306396, "losses/sft": 1.4941421747207642, "losses/total": 0.6962201595306396, "ref_logps/chosen": -40.74734878540039, "ref_logps/rejected": -43.38502883911133, "rewards/accuracies": 0.515625, "rewards/chosen": -0.016186244785785675, "rewards/margins": -1.0382413165643811e-05, "rewards/rejected": -0.01617586426436901, "step": 28 }, { "epoch": 0.22, "grad_norm": 6.351428202937179, "learning_rate": 3.6249999999999997e-07, "logps/chosen": -38.18675231933594, "logps/rejected": -45.641944885253906, "loss": 0.6918, "losses/dpo": 0.6936982870101929, "losses/sft": 1.5615143775939941, "losses/total": 0.6936982870101929, "ref_logps/chosen": -38.07172775268555, "ref_logps/rejected": -45.49729919433594, "rewards/accuracies": 0.546875, "rewards/chosen": -0.011502932757139206, "rewards/margins": 0.002961072139441967, "rewards/rejected": -0.014464004896581173, "step": 29 }, { "epoch": 0.23, "grad_norm": 6.197352455942284, "learning_rate": 3.75e-07, "logps/chosen": -38.10821533203125, "logps/rejected": -44.8267707824707, "loss": 0.6913, "losses/dpo": 0.6886826753616333, "losses/sft": 1.2551246881484985, "losses/total": 0.6886826753616333, "ref_logps/chosen": -37.95869827270508, "ref_logps/rejected": -44.637386322021484, "rewards/accuracies": 0.5546875, "rewards/chosen": -0.014951780438423157, "rewards/margins": 0.00398671068251133, "rewards/rejected": -0.018938491120934486, "step": 30 }, { "epoch": 0.23, "grad_norm": 6.109331160092928, "learning_rate": 3.875e-07, "logps/chosen": -37.73468017578125, "logps/rejected": -45.03502655029297, "loss": 0.6939, "losses/dpo": 0.6942582130432129, "losses/sft": 1.307703971862793, "losses/total": 0.6942582130432129, "ref_logps/chosen": -37.55635070800781, "ref_logps/rejected": -44.870731353759766, "rewards/accuracies": 0.4375, "rewards/chosen": -0.01783285290002823, "rewards/margins": -0.0014032268663868308, "rewards/rejected": -0.01642962545156479, "step": 31 }, { "epoch": 0.24, "grad_norm": 5.667604081077805, "learning_rate": 4e-07, "logps/chosen": -35.0442008972168, "logps/rejected": -43.61913299560547, "loss": 0.6936, "losses/dpo": 0.7014378309249878, "losses/sft": 1.3467621803283691, "losses/total": 0.7014378309249878, "ref_logps/chosen": -34.83974838256836, "ref_logps/rejected": -43.42043685913086, "rewards/accuracies": 0.5, "rewards/chosen": -0.02044512704014778, "rewards/margins": -0.0005755843594670296, "rewards/rejected": -0.019869543612003326, "step": 32 }, { "epoch": 0.25, "grad_norm": 5.678502382891613, "learning_rate": 4.1249999999999997e-07, "logps/chosen": -32.68036651611328, "logps/rejected": -37.5178337097168, "loss": 0.6907, "losses/dpo": 0.6907713413238525, "losses/sft": 1.0900930166244507, "losses/total": 0.6907713413238525, "ref_logps/chosen": -32.45442199707031, "ref_logps/rejected": -37.23982238769531, "rewards/accuracies": 0.5234375, "rewards/chosen": -0.0225942712277174, "rewards/margins": 0.005206821020692587, "rewards/rejected": -0.027801092714071274, "step": 33 }, { "epoch": 0.26, "grad_norm": 5.921573916861138, "learning_rate": 4.2499999999999995e-07, "logps/chosen": -38.70556640625, "logps/rejected": -44.084251403808594, "loss": 0.6917, "losses/dpo": 0.6954081058502197, "losses/sft": 1.3240528106689453, "losses/total": 0.6954081058502197, "ref_logps/chosen": -38.448394775390625, "ref_logps/rejected": -43.795318603515625, "rewards/accuracies": 0.5390625, "rewards/chosen": -0.025717251002788544, "rewards/margins": 0.0031764586456120014, "rewards/rejected": -0.02889370732009411, "step": 34 }, { "epoch": 0.26, "grad_norm": 6.262461804418144, "learning_rate": 4.375e-07, "logps/chosen": -38.28644561767578, "logps/rejected": -44.10706329345703, "loss": 0.6906, "losses/dpo": 0.6915950179100037, "losses/sft": 1.5137853622436523, "losses/total": 0.6915950179100037, "ref_logps/chosen": -37.993186950683594, "ref_logps/rejected": -43.75933837890625, "rewards/accuracies": 0.5703125, "rewards/chosen": -0.029325801879167557, "rewards/margins": 0.005446841474622488, "rewards/rejected": -0.03477264195680618, "step": 35 }, { "epoch": 0.27, "grad_norm": 6.51719678701152, "learning_rate": 4.5e-07, "logps/chosen": -38.827396392822266, "logps/rejected": -44.608299255371094, "loss": 0.6898, "losses/dpo": 0.6939886212348938, "losses/sft": 1.1804178953170776, "losses/total": 0.6939886212348938, "ref_logps/chosen": -38.524925231933594, "ref_logps/rejected": -44.23436737060547, "rewards/accuracies": 0.515625, "rewards/chosen": -0.03024711087346077, "rewards/margins": 0.007146051619201899, "rewards/rejected": -0.03739316016435623, "step": 36 }, { "epoch": 0.28, "grad_norm": 6.054369450491099, "learning_rate": 4.625e-07, "logps/chosen": -37.75469970703125, "logps/rejected": -40.86686706542969, "loss": 0.69, "losses/dpo": 0.6901004910469055, "losses/sft": 1.2039740085601807, "losses/total": 0.6901004910469055, "ref_logps/chosen": -37.38399124145508, "ref_logps/rejected": -40.42985534667969, "rewards/accuracies": 0.53125, "rewards/chosen": -0.03707098215818405, "rewards/margins": 0.006630584131926298, "rewards/rejected": -0.04370156675577164, "step": 37 }, { "epoch": 0.29, "grad_norm": 5.811128045517565, "learning_rate": 4.7499999999999995e-07, "logps/chosen": -37.347511291503906, "logps/rejected": -42.597320556640625, "loss": 0.6932, "losses/dpo": 0.6947627663612366, "losses/sft": 1.5172080993652344, "losses/total": 0.6947627663612366, "ref_logps/chosen": -36.92784881591797, "ref_logps/rejected": -42.17408752441406, "rewards/accuracies": 0.5078125, "rewards/chosen": -0.041966233402490616, "rewards/margins": 0.00035727641079574823, "rewards/rejected": -0.042323507368564606, "step": 38 }, { "epoch": 0.29, "grad_norm": 6.232125527781941, "learning_rate": 4.875e-07, "logps/chosen": -35.76224899291992, "logps/rejected": -40.480010986328125, "loss": 0.6877, "losses/dpo": 0.6863731741905212, "losses/sft": 1.403287410736084, "losses/total": 0.6863731741905212, "ref_logps/chosen": -35.296600341796875, "ref_logps/rejected": -39.899620056152344, "rewards/accuracies": 0.5390625, "rewards/chosen": -0.04656480997800827, "rewards/margins": 0.011474234983325005, "rewards/rejected": -0.058039046823978424, "step": 39 }, { "epoch": 0.3, "grad_norm": 5.919014140290479, "learning_rate": 5e-07, "logps/chosen": -33.405452728271484, "logps/rejected": -40.23749542236328, "loss": 0.6917, "losses/dpo": 0.7027544975280762, "losses/sft": 1.5135366916656494, "losses/total": 0.7027544975280762, "ref_logps/chosen": -32.92824935913086, "ref_logps/rejected": -39.72548294067383, "rewards/accuracies": 0.5703125, "rewards/chosen": -0.04772059991955757, "rewards/margins": 0.0034805855248123407, "rewards/rejected": -0.05120118334889412, "step": 40 }, { "epoch": 0.31, "grad_norm": 6.156564024356789, "learning_rate": 4.985955056179775e-07, "logps/chosen": -33.48844528198242, "logps/rejected": -40.55287551879883, "loss": 0.6886, "losses/dpo": 0.6869601011276245, "losses/sft": 1.2104542255401611, "losses/total": 0.6869601011276245, "ref_logps/chosen": -32.960693359375, "ref_logps/rejected": -39.92742919921875, "rewards/accuracies": 0.578125, "rewards/chosen": -0.052775099873542786, "rewards/margins": 0.009769486263394356, "rewards/rejected": -0.06254458427429199, "step": 41 }, { "epoch": 0.32, "grad_norm": 5.9874098679402445, "learning_rate": 4.97191011235955e-07, "logps/chosen": -37.491756439208984, "logps/rejected": -44.21824645996094, "loss": 0.6903, "losses/dpo": 0.6947405934333801, "losses/sft": 1.5526431798934937, "losses/total": 0.6947405934333801, "ref_logps/chosen": -36.944496154785156, "ref_logps/rejected": -43.608970642089844, "rewards/accuracies": 0.53125, "rewards/chosen": -0.05472607538104057, "rewards/margins": 0.006201753858476877, "rewards/rejected": -0.060927826911211014, "step": 42 }, { "epoch": 0.32, "grad_norm": 5.890690107516997, "learning_rate": 4.957865168539325e-07, "logps/chosen": -37.96784210205078, "logps/rejected": -44.18370056152344, "loss": 0.6911, "losses/dpo": 0.6906970143318176, "losses/sft": 1.5630677938461304, "losses/total": 0.6906970143318176, "ref_logps/chosen": -37.31348419189453, "ref_logps/rejected": -43.480613708496094, "rewards/accuracies": 0.5078125, "rewards/chosen": -0.06543563306331635, "rewards/margins": 0.0048727355897426605, "rewards/rejected": -0.07030836492776871, "step": 43 }, { "epoch": 0.33, "grad_norm": 5.6620046922389475, "learning_rate": 4.943820224719101e-07, "logps/chosen": -33.37147903442383, "logps/rejected": -40.975284576416016, "loss": 0.6876, "losses/dpo": 0.6959986090660095, "losses/sft": 1.4914252758026123, "losses/total": 0.6959986090660095, "ref_logps/chosen": -32.77292251586914, "ref_logps/rejected": -40.25656509399414, "rewards/accuracies": 0.5859375, "rewards/chosen": -0.05985547974705696, "rewards/margins": 0.012016610242426395, "rewards/rejected": -0.07187209278345108, "step": 44 }, { "epoch": 0.34, "grad_norm": 5.975010174130114, "learning_rate": 4.929775280898877e-07, "logps/chosen": -36.01771545410156, "logps/rejected": -40.14152145385742, "loss": 0.686, "losses/dpo": 0.6887790560722351, "losses/sft": 1.0922722816467285, "losses/total": 0.6887790560722351, "ref_logps/chosen": -35.33580017089844, "ref_logps/rejected": -39.30628204345703, "rewards/accuracies": 0.59375, "rewards/chosen": -0.06819140911102295, "rewards/margins": 0.015332860872149467, "rewards/rejected": -0.08352427184581757, "step": 45 }, { "epoch": 0.35, "grad_norm": 5.941321729150391, "learning_rate": 4.915730337078651e-07, "logps/chosen": -37.23257064819336, "logps/rejected": -43.88367462158203, "loss": 0.6872, "losses/dpo": 0.6943268775939941, "losses/sft": 1.3947020769119263, "losses/total": 0.6943268775939941, "ref_logps/chosen": -36.47919845581055, "ref_logps/rejected": -43.00157165527344, "rewards/accuracies": 0.59375, "rewards/chosen": -0.07533714175224304, "rewards/margins": 0.012872692197561264, "rewards/rejected": -0.08820983022451401, "step": 46 }, { "epoch": 0.35, "grad_norm": 6.015115529321941, "learning_rate": 4.901685393258427e-07, "logps/chosen": -40.33525085449219, "logps/rejected": -41.87712478637695, "loss": 0.6904, "losses/dpo": 0.6985194683074951, "losses/sft": 1.4163267612457275, "losses/total": 0.6985194683074951, "ref_logps/chosen": -39.42625045776367, "ref_logps/rejected": -40.897945404052734, "rewards/accuracies": 0.5234375, "rewards/chosen": -0.09090035408735275, "rewards/margins": 0.007017695810645819, "rewards/rejected": -0.09791804850101471, "step": 47 }, { "epoch": 0.36, "grad_norm": 5.733756968847646, "learning_rate": 4.887640449438202e-07, "logps/chosen": -36.97081756591797, "logps/rejected": -42.80936050415039, "loss": 0.6882, "losses/dpo": 0.696724534034729, "losses/sft": 1.2510000467300415, "losses/total": 0.696724534034729, "ref_logps/chosen": -36.11339569091797, "ref_logps/rejected": -41.84064483642578, "rewards/accuracies": 0.609375, "rewards/chosen": -0.08574248850345612, "rewards/margins": 0.011129248887300491, "rewards/rejected": -0.09687173366546631, "step": 48 }, { "epoch": 0.37, "grad_norm": 5.879327522594857, "learning_rate": 4.873595505617978e-07, "logps/chosen": -33.350772857666016, "logps/rejected": -41.509918212890625, "loss": 0.6838, "losses/dpo": 0.6769124269485474, "losses/sft": 1.218217372894287, "losses/total": 0.6769124269485474, "ref_logps/chosen": -32.432064056396484, "ref_logps/rejected": -40.38986587524414, "rewards/accuracies": 0.59375, "rewards/chosen": -0.09187072515487671, "rewards/margins": 0.020134272053837776, "rewards/rejected": -0.11200500279664993, "step": 49 }, { "epoch": 0.38, "grad_norm": 6.158412418626052, "learning_rate": 4.859550561797752e-07, "logps/chosen": -38.05060577392578, "logps/rejected": -45.274757385253906, "loss": 0.6828, "losses/dpo": 0.6935728788375854, "losses/sft": 1.33810293674469, "losses/total": 0.6935728788375854, "ref_logps/chosen": -36.988319396972656, "ref_logps/rejected": -43.98942565917969, "rewards/accuracies": 0.625, "rewards/chosen": -0.10622845590114594, "rewards/margins": 0.022304760292172432, "rewards/rejected": -0.12853321433067322, "step": 50 }, { "epoch": 0.38, "grad_norm": 6.285183993622672, "learning_rate": 4.845505617977528e-07, "logps/chosen": -38.64442443847656, "logps/rejected": -42.03549575805664, "loss": 0.6823, "losses/dpo": 0.6838083267211914, "losses/sft": 1.414647102355957, "losses/total": 0.6838083267211914, "ref_logps/chosen": -37.58879089355469, "ref_logps/rejected": -40.74079132080078, "rewards/accuracies": 0.5859375, "rewards/chosen": -0.1055637076497078, "rewards/margins": 0.023906776681542397, "rewards/rejected": -0.12947048246860504, "step": 51 }, { "epoch": 0.39, "grad_norm": 5.8676763170916155, "learning_rate": 4.831460674157303e-07, "logps/chosen": -35.28524398803711, "logps/rejected": -43.29574966430664, "loss": 0.6824, "losses/dpo": 0.6761789321899414, "losses/sft": 1.1140950918197632, "losses/total": 0.6761789321899414, "ref_logps/chosen": -34.05834197998047, "ref_logps/rejected": -41.828880310058594, "rewards/accuracies": 0.5234375, "rewards/chosen": -0.12269000709056854, "rewards/margins": 0.023997044190764427, "rewards/rejected": -0.14668706059455872, "step": 52 }, { "epoch": 0.4, "grad_norm": 6.153262147929733, "learning_rate": 4.817415730337078e-07, "logps/chosen": -34.02470016479492, "logps/rejected": -38.51059341430664, "loss": 0.6808, "losses/dpo": 0.6883823871612549, "losses/sft": 1.1792895793914795, "losses/total": 0.6883823871612549, "ref_logps/chosen": -32.95384216308594, "ref_logps/rejected": -37.16828155517578, "rewards/accuracies": 0.6640625, "rewards/chosen": -0.10708627104759216, "rewards/margins": 0.02714475244283676, "rewards/rejected": -0.13423103094100952, "step": 53 }, { "epoch": 0.41, "grad_norm": 5.930275284806721, "learning_rate": 4.803370786516854e-07, "logps/chosen": -39.90019989013672, "logps/rejected": -41.967960357666016, "loss": 0.6817, "losses/dpo": 0.6764520406723022, "losses/sft": 1.4552464485168457, "losses/total": 0.6764520406723022, "ref_logps/chosen": -38.590492248535156, "ref_logps/rejected": -40.406002044677734, "rewards/accuracies": 0.640625, "rewards/chosen": -0.1309707909822464, "rewards/margins": 0.025225069373846054, "rewards/rejected": -0.15619586408138275, "step": 54 }, { "epoch": 0.42, "grad_norm": 5.991852629106404, "learning_rate": 4.789325842696629e-07, "logps/chosen": -37.67607116699219, "logps/rejected": -42.05184555053711, "loss": 0.6906, "losses/dpo": 0.6933637261390686, "losses/sft": 1.3182023763656616, "losses/total": 0.6933637261390686, "ref_logps/chosen": -36.22807312011719, "ref_logps/rejected": -40.539833068847656, "rewards/accuracies": 0.515625, "rewards/chosen": -0.14479960501194, "rewards/margins": 0.006401616148650646, "rewards/rejected": -0.15120121836662292, "step": 55 }, { "epoch": 0.42, "grad_norm": 5.887970211048207, "learning_rate": 4.775280898876405e-07, "logps/chosen": -36.0313835144043, "logps/rejected": -41.34480285644531, "loss": 0.6806, "losses/dpo": 0.6776463985443115, "losses/sft": 1.3762413263320923, "losses/total": 0.6776463985443115, "ref_logps/chosen": -34.71417999267578, "ref_logps/rejected": -39.74456024169922, "rewards/accuracies": 0.6015625, "rewards/chosen": -0.131720170378685, "rewards/margins": 0.02830405905842781, "rewards/rejected": -0.1600242257118225, "step": 56 }, { "epoch": 0.43, "grad_norm": 6.202121890129415, "learning_rate": 4.7612359550561797e-07, "logps/chosen": -39.138973236083984, "logps/rejected": -44.62040710449219, "loss": 0.6868, "losses/dpo": 0.7154799699783325, "losses/sft": 1.4311680793762207, "losses/total": 0.7154799699783325, "ref_logps/chosen": -37.48638153076172, "ref_logps/rejected": -42.81147003173828, "rewards/accuracies": 0.5625, "rewards/chosen": -0.1652592122554779, "rewards/margins": 0.01563437283039093, "rewards/rejected": -0.18089357018470764, "step": 57 }, { "epoch": 0.44, "grad_norm": 6.251681349620222, "learning_rate": 4.747191011235955e-07, "logps/chosen": -35.74232864379883, "logps/rejected": -41.246910095214844, "loss": 0.6762, "losses/dpo": 0.6785226464271545, "losses/sft": 1.2456488609313965, "losses/total": 0.6785226464271545, "ref_logps/chosen": -34.32046890258789, "ref_logps/rejected": -39.45071792602539, "rewards/accuracies": 0.671875, "rewards/chosen": -0.14218537509441376, "rewards/margins": 0.037433870136737823, "rewards/rejected": -0.17961923778057098, "step": 58 }, { "epoch": 0.45, "grad_norm": 6.186489597098538, "learning_rate": 4.7331460674157303e-07, "logps/chosen": -38.993804931640625, "logps/rejected": -48.68840789794922, "loss": 0.6799, "losses/dpo": 0.6576354503631592, "losses/sft": 1.2577842473983765, "losses/total": 0.6576354503631592, "ref_logps/chosen": -37.452022552490234, "ref_logps/rejected": -46.83843994140625, "rewards/accuracies": 0.625, "rewards/chosen": -0.15417808294296265, "rewards/margins": 0.03081856295466423, "rewards/rejected": -0.18499664962291718, "step": 59 }, { "epoch": 0.45, "grad_norm": 6.176384076659114, "learning_rate": 4.7191011235955054e-07, "logps/chosen": -36.94293975830078, "logps/rejected": -43.75997543334961, "loss": 0.6818, "losses/dpo": 0.6777645349502563, "losses/sft": 1.4646830558776855, "losses/total": 0.6777645349502563, "ref_logps/chosen": -35.410682678222656, "ref_logps/rejected": -41.96450424194336, "rewards/accuracies": 0.609375, "rewards/chosen": -0.15322577953338623, "rewards/margins": 0.026321690529584885, "rewards/rejected": -0.17954745888710022, "step": 60 }, { "epoch": 0.46, "grad_norm": 6.222192772165732, "learning_rate": 4.705056179775281e-07, "logps/chosen": -38.04816436767578, "logps/rejected": -46.636329650878906, "loss": 0.6813, "losses/dpo": 0.6915292739868164, "losses/sft": 1.5139144659042358, "losses/total": 0.6915292739868164, "ref_logps/chosen": -36.346763610839844, "ref_logps/rejected": -44.65103530883789, "rewards/accuracies": 0.625, "rewards/chosen": -0.170139878988266, "rewards/margins": 0.028389303013682365, "rewards/rejected": -0.1985291838645935, "step": 61 }, { "epoch": 0.47, "grad_norm": 6.581489224854424, "learning_rate": 4.691011235955056e-07, "logps/chosen": -39.37269973754883, "logps/rejected": -42.562713623046875, "loss": 0.6729, "losses/dpo": 0.6604301333427429, "losses/sft": 1.2340155839920044, "losses/total": 0.6604301333427429, "ref_logps/chosen": -37.8352165222168, "ref_logps/rejected": -40.57086181640625, "rewards/accuracies": 0.6640625, "rewards/chosen": -0.15374788641929626, "rewards/margins": 0.04543708637356758, "rewards/rejected": -0.19918496906757355, "step": 62 }, { "epoch": 0.48, "grad_norm": 6.270954655004902, "learning_rate": 4.6769662921348315e-07, "logps/chosen": -35.57749938964844, "logps/rejected": -43.94036102294922, "loss": 0.6719, "losses/dpo": 0.6689096689224243, "losses/sft": 1.3980541229248047, "losses/total": 0.6689096689224243, "ref_logps/chosen": -33.870826721191406, "ref_logps/rejected": -41.761024475097656, "rewards/accuracies": 0.6953125, "rewards/chosen": -0.1706671416759491, "rewards/margins": 0.0472659207880497, "rewards/rejected": -0.2179330587387085, "step": 63 }, { "epoch": 0.48, "grad_norm": 5.825145606750587, "learning_rate": 4.662921348314606e-07, "logps/chosen": -36.180145263671875, "logps/rejected": -42.19972229003906, "loss": 0.6867, "losses/dpo": 0.6951602697372437, "losses/sft": 1.4974910020828247, "losses/total": 0.6951602697372437, "ref_logps/chosen": -34.28754425048828, "ref_logps/rejected": -40.13561248779297, "rewards/accuracies": 0.5234375, "rewards/chosen": -0.18926027417182922, "rewards/margins": 0.017150741070508957, "rewards/rejected": -0.20641100406646729, "step": 64 }, { "epoch": 0.49, "grad_norm": 5.954970035091233, "learning_rate": 4.6488764044943816e-07, "logps/chosen": -41.472923278808594, "logps/rejected": -45.73348617553711, "loss": 0.6781, "losses/dpo": 0.667303204536438, "losses/sft": 1.494096040725708, "losses/total": 0.667303204536438, "ref_logps/chosen": -39.698944091796875, "ref_logps/rejected": -43.6091423034668, "rewards/accuracies": 0.609375, "rewards/chosen": -0.177398219704628, "rewards/margins": 0.03503631800413132, "rewards/rejected": -0.2124345451593399, "step": 65 }, { "epoch": 0.5, "grad_norm": 6.488401314246342, "learning_rate": 4.634831460674157e-07, "logps/chosen": -39.84260177612305, "logps/rejected": -49.195159912109375, "loss": 0.6716, "losses/dpo": 0.6679590940475464, "losses/sft": 1.3698948621749878, "losses/total": 0.6679590940475464, "ref_logps/chosen": -37.98674774169922, "ref_logps/rejected": -46.86464309692383, "rewards/accuracies": 0.6953125, "rewards/chosen": -0.18558543920516968, "rewards/margins": 0.047466084361076355, "rewards/rejected": -0.23305150866508484, "step": 66 }, { "epoch": 0.51, "grad_norm": 6.139969478930884, "learning_rate": 4.620786516853932e-07, "logps/chosen": -36.54951858520508, "logps/rejected": -42.6442756652832, "loss": 0.6689, "losses/dpo": 0.650477409362793, "losses/sft": 1.350743055343628, "losses/total": 0.650477409362793, "ref_logps/chosen": -34.77081298828125, "ref_logps/rejected": -40.319297790527344, "rewards/accuracies": 0.625, "rewards/chosen": -0.17787054181098938, "rewards/margins": 0.05462724715471268, "rewards/rejected": -0.23249778151512146, "step": 67 }, { "epoch": 0.51, "grad_norm": 6.626542224506714, "learning_rate": 4.606741573033708e-07, "logps/chosen": -38.910194396972656, "logps/rejected": -44.71943664550781, "loss": 0.6588, "losses/dpo": 0.6475476622581482, "losses/sft": 1.0136208534240723, "losses/total": 0.6475476622581482, "ref_logps/chosen": -37.08655548095703, "ref_logps/rejected": -42.12825393676758, "rewards/accuracies": 0.7109375, "rewards/chosen": -0.18236377835273743, "rewards/margins": 0.0767548531293869, "rewards/rejected": -0.2591186463832855, "step": 68 }, { "epoch": 0.52, "grad_norm": 6.316526648907962, "learning_rate": 4.592696629213483e-07, "logps/chosen": -39.12900924682617, "logps/rejected": -47.94546890258789, "loss": 0.6741, "losses/dpo": 0.6746849417686462, "losses/sft": 1.3253227472305298, "losses/total": 0.6746849417686462, "ref_logps/chosen": -36.78126525878906, "ref_logps/rejected": -45.148887634277344, "rewards/accuracies": 0.65625, "rewards/chosen": -0.23477408289909363, "rewards/margins": 0.044883839786052704, "rewards/rejected": -0.27965790033340454, "step": 69 }, { "epoch": 0.53, "grad_norm": 6.029340644383451, "learning_rate": 4.5786516853932584e-07, "logps/chosen": -37.168025970458984, "logps/rejected": -43.3531494140625, "loss": 0.6776, "losses/dpo": 0.708085298538208, "losses/sft": 1.549338698387146, "losses/total": 0.708085298538208, "ref_logps/chosen": -34.82072067260742, "ref_logps/rejected": -40.60224151611328, "rewards/accuracies": 0.578125, "rewards/chosen": -0.2347305417060852, "rewards/margins": 0.04036000370979309, "rewards/rejected": -0.2750905454158783, "step": 70 }, { "epoch": 0.54, "grad_norm": 6.206457245959275, "learning_rate": 4.5646067415730334e-07, "logps/chosen": -37.381324768066406, "logps/rejected": -44.06721878051758, "loss": 0.667, "losses/dpo": 0.6923149228096008, "losses/sft": 1.499281883239746, "losses/total": 0.6923149228096008, "ref_logps/chosen": -35.004127502441406, "ref_logps/rejected": -41.090354919433594, "rewards/accuracies": 0.6328125, "rewards/chosen": -0.23771986365318298, "rewards/margins": 0.05996667221188545, "rewards/rejected": -0.29768651723861694, "step": 71 }, { "epoch": 0.54, "grad_norm": 6.238705763349497, "learning_rate": 4.550561797752809e-07, "logps/chosen": -38.5302734375, "logps/rejected": -48.384620666503906, "loss": 0.6669, "losses/dpo": 0.6794298887252808, "losses/sft": 1.3331537246704102, "losses/total": 0.6794298887252808, "ref_logps/chosen": -35.968894958496094, "ref_logps/rejected": -45.22618103027344, "rewards/accuracies": 0.671875, "rewards/chosen": -0.25613832473754883, "rewards/margins": 0.05970541387796402, "rewards/rejected": -0.31584370136260986, "step": 72 }, { "epoch": 0.55, "grad_norm": 6.369104792363388, "learning_rate": 4.536516853932584e-07, "logps/chosen": -38.55243682861328, "logps/rejected": -46.81627655029297, "loss": 0.6609, "losses/dpo": 0.6863117218017578, "losses/sft": 1.404316782951355, "losses/total": 0.6863117218017578, "ref_logps/chosen": -35.96749496459961, "ref_logps/rejected": -43.47722625732422, "rewards/accuracies": 0.6640625, "rewards/chosen": -0.25849413871765137, "rewards/margins": 0.07541059702634811, "rewards/rejected": -0.3339047431945801, "step": 73 }, { "epoch": 0.56, "grad_norm": 6.15339242747321, "learning_rate": 4.522471910112359e-07, "logps/chosen": -39.58115005493164, "logps/rejected": -44.1653938293457, "loss": 0.6782, "losses/dpo": 0.7283678650856018, "losses/sft": 1.3683419227600098, "losses/total": 0.7283678650856018, "ref_logps/chosen": -37.06593704223633, "ref_logps/rejected": -41.25994873046875, "rewards/accuracies": 0.5859375, "rewards/chosen": -0.25152140855789185, "rewards/margins": 0.03902304172515869, "rewards/rejected": -0.29054442048072815, "step": 74 }, { "epoch": 0.57, "grad_norm": 6.3073523395391105, "learning_rate": 4.5084269662921347e-07, "logps/chosen": -39.416324615478516, "logps/rejected": -45.20884323120117, "loss": 0.6702, "losses/dpo": 0.6884989738464355, "losses/sft": 1.2989376783370972, "losses/total": 0.6884989738464355, "ref_logps/chosen": -36.72044372558594, "ref_logps/rejected": -41.92823791503906, "rewards/accuracies": 0.59375, "rewards/chosen": -0.2695878744125366, "rewards/margins": 0.058472514152526855, "rewards/rejected": -0.3280603885650635, "step": 75 }, { "epoch": 0.57, "grad_norm": 6.353149418167083, "learning_rate": 4.4943820224719097e-07, "logps/chosen": -39.451934814453125, "logps/rejected": -45.58893585205078, "loss": 0.6758, "losses/dpo": 0.6700998544692993, "losses/sft": 1.423154354095459, "losses/total": 0.6700998544692993, "ref_logps/chosen": -36.6813850402832, "ref_logps/rejected": -42.35654830932617, "rewards/accuracies": 0.5546875, "rewards/chosen": -0.2770548164844513, "rewards/margins": 0.0461842380464077, "rewards/rejected": -0.3232390582561493, "step": 76 }, { "epoch": 0.58, "grad_norm": 6.243464205666771, "learning_rate": 4.4803370786516853e-07, "logps/chosen": -38.50192642211914, "logps/rejected": -44.94510269165039, "loss": 0.6654, "losses/dpo": 0.6559799909591675, "losses/sft": 1.3645029067993164, "losses/total": 0.6559799909591675, "ref_logps/chosen": -35.83762741088867, "ref_logps/rejected": -41.636741638183594, "rewards/accuracies": 0.625, "rewards/chosen": -0.2664298415184021, "rewards/margins": 0.06440602242946625, "rewards/rejected": -0.33083584904670715, "step": 77 }, { "epoch": 0.59, "grad_norm": 6.529746915602167, "learning_rate": 4.4662921348314603e-07, "logps/chosen": -38.366634368896484, "logps/rejected": -48.25501251220703, "loss": 0.6701, "losses/dpo": 0.6894055008888245, "losses/sft": 1.4073951244354248, "losses/total": 0.6894055008888245, "ref_logps/chosen": -35.34041976928711, "ref_logps/rejected": -44.6351318359375, "rewards/accuracies": 0.5859375, "rewards/chosen": -0.3026217818260193, "rewards/margins": 0.05936632677912712, "rewards/rejected": -0.3619880974292755, "step": 78 }, { "epoch": 0.6, "grad_norm": 6.186041404774562, "learning_rate": 4.452247191011236e-07, "logps/chosen": -37.969024658203125, "logps/rejected": -46.56663131713867, "loss": 0.6582, "losses/dpo": 0.6372844576835632, "losses/sft": 1.1740036010742188, "losses/total": 0.6372844576835632, "ref_logps/chosen": -35.09920883178711, "ref_logps/rejected": -42.874725341796875, "rewards/accuracies": 0.6484375, "rewards/chosen": -0.2869817614555359, "rewards/margins": 0.08220900595188141, "rewards/rejected": -0.3691907525062561, "step": 79 }, { "epoch": 0.6, "grad_norm": 6.41463598459145, "learning_rate": 4.438202247191011e-07, "logps/chosen": -43.05072021484375, "logps/rejected": -49.178314208984375, "loss": 0.656, "losses/dpo": 0.6740515232086182, "losses/sft": 1.4272187948226929, "losses/total": 0.6740515232086182, "ref_logps/chosen": -39.619014739990234, "ref_logps/rejected": -44.83480453491211, "rewards/accuracies": 0.6484375, "rewards/chosen": -0.3431706726551056, "rewards/margins": 0.09118058532476425, "rewards/rejected": -0.43435126543045044, "step": 80 }, { "epoch": 0.61, "grad_norm": 6.840871211971457, "learning_rate": 4.4241573033707865e-07, "logps/chosen": -43.888370513916016, "logps/rejected": -47.332916259765625, "loss": 0.6729, "losses/dpo": 0.666955828666687, "losses/sft": 1.6874582767486572, "losses/total": 0.666955828666687, "ref_logps/chosen": -40.38330841064453, "ref_logps/rejected": -43.3054084777832, "rewards/accuracies": 0.59375, "rewards/chosen": -0.350506067276001, "rewards/margins": 0.05224461108446121, "rewards/rejected": -0.4027506709098816, "step": 81 }, { "epoch": 0.62, "grad_norm": 6.882624120223548, "learning_rate": 4.410112359550562e-07, "logps/chosen": -40.2237548828125, "logps/rejected": -46.99496078491211, "loss": 0.6607, "losses/dpo": 0.6718687415122986, "losses/sft": 1.5186784267425537, "losses/total": 0.6718687415122986, "ref_logps/chosen": -36.70365905761719, "ref_logps/rejected": -42.63638687133789, "rewards/accuracies": 0.640625, "rewards/chosen": -0.3520098030567169, "rewards/margins": 0.08384796977043152, "rewards/rejected": -0.43585777282714844, "step": 82 }, { "epoch": 0.63, "grad_norm": 6.3488191331703385, "learning_rate": 4.3960674157303366e-07, "logps/chosen": -40.38496780395508, "logps/rejected": -46.7673454284668, "loss": 0.6535, "losses/dpo": 0.6566940546035767, "losses/sft": 1.3071130514144897, "losses/total": 0.6566940546035767, "ref_logps/chosen": -37.20966339111328, "ref_logps/rejected": -42.63634490966797, "rewards/accuracies": 0.703125, "rewards/chosen": -0.3175300061702728, "rewards/margins": 0.09556981176137924, "rewards/rejected": -0.4130997955799103, "step": 83 }, { "epoch": 0.63, "grad_norm": 6.624302993852389, "learning_rate": 4.382022471910112e-07, "logps/chosen": -42.17374801635742, "logps/rejected": -49.17514419555664, "loss": 0.6571, "losses/dpo": 0.6181658506393433, "losses/sft": 1.3204035758972168, "losses/total": 0.6181658506393433, "ref_logps/chosen": -38.54387664794922, "ref_logps/rejected": -44.659461975097656, "rewards/accuracies": 0.6328125, "rewards/chosen": -0.3629874587059021, "rewards/margins": 0.08858054131269455, "rewards/rejected": -0.45156803727149963, "step": 84 }, { "epoch": 0.64, "grad_norm": 6.569663592940448, "learning_rate": 4.367977528089887e-07, "logps/chosen": -39.99671936035156, "logps/rejected": -48.49413299560547, "loss": 0.6562, "losses/dpo": 0.6639370322227478, "losses/sft": 1.6048388481140137, "losses/total": 0.6639370322227478, "ref_logps/chosen": -36.648887634277344, "ref_logps/rejected": -44.24271774291992, "rewards/accuracies": 0.6796875, "rewards/chosen": -0.3347826600074768, "rewards/margins": 0.09035841375589371, "rewards/rejected": -0.4251410961151123, "step": 85 }, { "epoch": 0.65, "grad_norm": 6.597580499931281, "learning_rate": 4.353932584269663e-07, "logps/chosen": -41.4986572265625, "logps/rejected": -48.67082214355469, "loss": 0.6519, "losses/dpo": 0.6654509902000427, "losses/sft": 1.462377905845642, "losses/total": 0.6654509902000427, "ref_logps/chosen": -37.87129211425781, "ref_logps/rejected": -43.97351837158203, "rewards/accuracies": 0.6171875, "rewards/chosen": -0.3627370595932007, "rewards/margins": 0.10699345916509628, "rewards/rejected": -0.46973055601119995, "step": 86 }, { "epoch": 0.66, "grad_norm": 6.479183906632411, "learning_rate": 4.339887640449438e-07, "logps/chosen": -41.78961944580078, "logps/rejected": -47.387901306152344, "loss": 0.6791, "losses/dpo": 0.6730961799621582, "losses/sft": 1.1305738687515259, "losses/total": 0.6730961799621582, "ref_logps/chosen": -37.842323303222656, "ref_logps/rejected": -42.93647766113281, "rewards/accuracies": 0.515625, "rewards/chosen": -0.3947296738624573, "rewards/margins": 0.05041254311800003, "rewards/rejected": -0.4451422691345215, "step": 87 }, { "epoch": 0.66, "grad_norm": 6.926719176011086, "learning_rate": 4.3258426966292134e-07, "logps/chosen": -43.21299743652344, "logps/rejected": -47.084434509277344, "loss": 0.6673, "losses/dpo": 0.6536482572555542, "losses/sft": 1.2500860691070557, "losses/total": 0.6536482572555542, "ref_logps/chosen": -39.38795471191406, "ref_logps/rejected": -42.582969665527344, "rewards/accuracies": 0.625, "rewards/chosen": -0.38250401616096497, "rewards/margins": 0.06764230877161026, "rewards/rejected": -0.45014631748199463, "step": 88 }, { "epoch": 0.67, "grad_norm": 6.563435223333862, "learning_rate": 4.311797752808989e-07, "logps/chosen": -40.25920104980469, "logps/rejected": -49.489097595214844, "loss": 0.6508, "losses/dpo": 0.6591900587081909, "losses/sft": 1.3429124355316162, "losses/total": 0.6591900587081909, "ref_logps/chosen": -36.316349029541016, "ref_logps/rejected": -44.49497985839844, "rewards/accuracies": 0.625, "rewards/chosen": -0.3942852020263672, "rewards/margins": 0.1051262766122818, "rewards/rejected": -0.4994114637374878, "step": 89 }, { "epoch": 0.68, "grad_norm": 6.671599802331672, "learning_rate": 4.297752808988764e-07, "logps/chosen": -42.98493957519531, "logps/rejected": -45.29029846191406, "loss": 0.6775, "losses/dpo": 0.6989056468009949, "losses/sft": 1.7236398458480835, "losses/total": 0.6989056468009949, "ref_logps/chosen": -38.74510955810547, "ref_logps/rejected": -40.55065155029297, "rewards/accuracies": 0.59375, "rewards/chosen": -0.4239833354949951, "rewards/margins": 0.04998103156685829, "rewards/rejected": -0.4739643633365631, "step": 90 }, { "epoch": 0.69, "grad_norm": 6.921605107059482, "learning_rate": 4.2837078651685396e-07, "logps/chosen": -42.04779052734375, "logps/rejected": -47.75447463989258, "loss": 0.6577, "losses/dpo": 0.6272084712982178, "losses/sft": 1.5017703771591187, "losses/total": 0.6272084712982178, "ref_logps/chosen": -37.97248458862305, "ref_logps/rejected": -42.7065315246582, "rewards/accuracies": 0.609375, "rewards/chosen": -0.40753045678138733, "rewards/margins": 0.09726397693157196, "rewards/rejected": -0.5047944784164429, "step": 91 }, { "epoch": 0.69, "grad_norm": 6.759776662372792, "learning_rate": 4.269662921348314e-07, "logps/chosen": -44.38639831542969, "logps/rejected": -53.21236038208008, "loss": 0.6431, "losses/dpo": 0.7165791988372803, "losses/sft": 1.5609912872314453, "losses/total": 0.7165791988372803, "ref_logps/chosen": -39.95743942260742, "ref_logps/rejected": -47.561588287353516, "rewards/accuracies": 0.6171875, "rewards/chosen": -0.4428956210613251, "rewards/margins": 0.122181735932827, "rewards/rejected": -0.5650773644447327, "step": 92 }, { "epoch": 0.7, "grad_norm": 6.767540666000382, "learning_rate": 4.2556179775280896e-07, "logps/chosen": -39.6769905090332, "logps/rejected": -45.86317443847656, "loss": 0.6772, "losses/dpo": 0.6330491900444031, "losses/sft": 1.33146333694458, "losses/total": 0.6330491900444031, "ref_logps/chosen": -35.39988708496094, "ref_logps/rejected": -41.038116455078125, "rewards/accuracies": 0.5625, "rewards/chosen": -0.42771056294441223, "rewards/margins": 0.05479476973414421, "rewards/rejected": -0.4825053811073303, "step": 93 }, { "epoch": 0.71, "grad_norm": 6.521925535129618, "learning_rate": 4.2415730337078647e-07, "logps/chosen": -43.78227996826172, "logps/rejected": -47.82459259033203, "loss": 0.6607, "losses/dpo": 0.6985595226287842, "losses/sft": 1.530924677848816, "losses/total": 0.6985595226287842, "ref_logps/chosen": -39.28633499145508, "ref_logps/rejected": -42.36700439453125, "rewards/accuracies": 0.6015625, "rewards/chosen": -0.4495944082736969, "rewards/margins": 0.09616444259881973, "rewards/rejected": -0.545758843421936, "step": 94 }, { "epoch": 0.72, "grad_norm": 6.606025652021848, "learning_rate": 4.22752808988764e-07, "logps/chosen": -44.53917694091797, "logps/rejected": -49.32555389404297, "loss": 0.6783, "losses/dpo": 0.6152039766311646, "losses/sft": 1.5025076866149902, "losses/total": 0.6152039766311646, "ref_logps/chosen": -39.66822814941406, "ref_logps/rejected": -43.90290451049805, "rewards/accuracies": 0.6015625, "rewards/chosen": -0.4870951175689697, "rewards/margins": 0.05516959726810455, "rewards/rejected": -0.5422646999359131, "step": 95 }, { "epoch": 0.72, "grad_norm": 6.456362063653043, "learning_rate": 4.2134831460674153e-07, "logps/chosen": -40.66051483154297, "logps/rejected": -50.35266876220703, "loss": 0.6339, "losses/dpo": 0.5940225124359131, "losses/sft": 1.3329205513000488, "losses/total": 0.5940225124359131, "ref_logps/chosen": -36.583961486816406, "ref_logps/rejected": -44.78839874267578, "rewards/accuracies": 0.6796875, "rewards/chosen": -0.40765535831451416, "rewards/margins": 0.14877161383628845, "rewards/rejected": -0.556427001953125, "step": 96 }, { "epoch": 0.73, "grad_norm": 7.002732175851258, "learning_rate": 4.199438202247191e-07, "logps/chosen": -40.17961502075195, "logps/rejected": -50.040138244628906, "loss": 0.6253, "losses/dpo": 0.597855806350708, "losses/sft": 1.5503275394439697, "losses/total": 0.597855806350708, "ref_logps/chosen": -35.91657257080078, "ref_logps/rejected": -44.05973815917969, "rewards/accuracies": 0.7421875, "rewards/chosen": -0.426303893327713, "rewards/margins": 0.17173629999160767, "rewards/rejected": -0.5980401635169983, "step": 97 }, { "epoch": 0.74, "grad_norm": 6.7690281568226345, "learning_rate": 4.1853932584269664e-07, "logps/chosen": -43.26731872558594, "logps/rejected": -48.155426025390625, "loss": 0.6528, "losses/dpo": 0.6972070932388306, "losses/sft": 1.3802154064178467, "losses/total": 0.6972070932388306, "ref_logps/chosen": -38.819725036621094, "ref_logps/rejected": -42.62653350830078, "rewards/accuracies": 0.65625, "rewards/chosen": -0.4447590410709381, "rewards/margins": 0.10813023149967194, "rewards/rejected": -0.5528892278671265, "step": 98 }, { "epoch": 0.75, "grad_norm": 6.184786618255584, "learning_rate": 4.1713483146067415e-07, "logps/chosen": -39.052734375, "logps/rejected": -45.65272521972656, "loss": 0.6289, "losses/dpo": 0.5835955142974854, "losses/sft": 1.2479004859924316, "losses/total": 0.5835955142974854, "ref_logps/chosen": -35.05288314819336, "ref_logps/rejected": -40.07136154174805, "rewards/accuracies": 0.703125, "rewards/chosen": -0.39998501539230347, "rewards/margins": 0.15815110504627228, "rewards/rejected": -0.5581361055374146, "step": 99 }, { "epoch": 0.75, "grad_norm": 6.925410262368385, "learning_rate": 4.157303370786517e-07, "logps/chosen": -44.249752044677734, "logps/rejected": -44.935245513916016, "loss": 0.6711, "losses/dpo": 0.5667402744293213, "losses/sft": 1.424223780632019, "losses/total": 0.5667402744293213, "ref_logps/chosen": -38.972206115722656, "ref_logps/rejected": -38.912513732910156, "rewards/accuracies": 0.6484375, "rewards/chosen": -0.5277543067932129, "rewards/margins": 0.07451874017715454, "rewards/rejected": -0.6022731065750122, "step": 100 }, { "epoch": 0.76, "grad_norm": 6.586928303266985, "learning_rate": 4.1432584269662915e-07, "logps/chosen": -39.689693450927734, "logps/rejected": -48.46234130859375, "loss": 0.6509, "losses/dpo": 0.6659662127494812, "losses/sft": 1.3264880180358887, "losses/total": 0.6659662127494812, "ref_logps/chosen": -35.252235412597656, "ref_logps/rejected": -42.87641525268555, "rewards/accuracies": 0.6484375, "rewards/chosen": -0.4437457323074341, "rewards/margins": 0.11484652757644653, "rewards/rejected": -0.5585922598838806, "step": 101 }, { "epoch": 0.77, "grad_norm": 6.103700351208487, "learning_rate": 4.129213483146067e-07, "logps/chosen": -38.51823425292969, "logps/rejected": -43.52346420288086, "loss": 0.6509, "losses/dpo": 0.6539372205734253, "losses/sft": 1.3750677108764648, "losses/total": 0.6539372205734253, "ref_logps/chosen": -34.18145751953125, "ref_logps/rejected": -38.04938507080078, "rewards/accuracies": 0.6640625, "rewards/chosen": -0.4336775541305542, "rewards/margins": 0.11373014003038406, "rewards/rejected": -0.5474076867103577, "step": 102 }, { "epoch": 0.78, "grad_norm": 6.228970412657457, "learning_rate": 4.115168539325842e-07, "logps/chosen": -42.1187629699707, "logps/rejected": -47.93737030029297, "loss": 0.6451, "losses/dpo": 0.646047055721283, "losses/sft": 1.497565507888794, "losses/total": 0.646047055721283, "ref_logps/chosen": -37.47208786010742, "ref_logps/rejected": -42.03216552734375, "rewards/accuracies": 0.6328125, "rewards/chosen": -0.4646672010421753, "rewards/margins": 0.12585340440273285, "rewards/rejected": -0.5905206203460693, "step": 103 }, { "epoch": 0.78, "grad_norm": 6.892504983818195, "learning_rate": 4.1011235955056177e-07, "logps/chosen": -42.86591339111328, "logps/rejected": -48.31887435913086, "loss": 0.6494, "losses/dpo": 0.6817602515220642, "losses/sft": 1.5651347637176514, "losses/total": 0.6817602515220642, "ref_logps/chosen": -37.89812088012695, "ref_logps/rejected": -42.230018615722656, "rewards/accuracies": 0.6640625, "rewards/chosen": -0.4967789351940155, "rewards/margins": 0.11210669577121735, "rewards/rejected": -0.608885645866394, "step": 104 }, { "epoch": 0.79, "grad_norm": 6.344274461611194, "learning_rate": 4.0870786516853933e-07, "logps/chosen": -38.07393264770508, "logps/rejected": -46.695167541503906, "loss": 0.6485, "losses/dpo": 0.6693782806396484, "losses/sft": 1.4964573383331299, "losses/total": 0.6693782806396484, "ref_logps/chosen": -33.26963806152344, "ref_logps/rejected": -40.731658935546875, "rewards/accuracies": 0.6015625, "rewards/chosen": -0.48042935132980347, "rewards/margins": 0.11592163890600204, "rewards/rejected": -0.5963510274887085, "step": 105 }, { "epoch": 0.8, "grad_norm": 6.340224167086584, "learning_rate": 4.0730337078651683e-07, "logps/chosen": -34.64811706542969, "logps/rejected": -44.656005859375, "loss": 0.6748, "losses/dpo": 0.7200191020965576, "losses/sft": 1.2917957305908203, "losses/total": 0.7200191020965576, "ref_logps/chosen": -29.991100311279297, "ref_logps/rejected": -39.274078369140625, "rewards/accuracies": 0.609375, "rewards/chosen": -0.4657020568847656, "rewards/margins": 0.07249079644680023, "rewards/rejected": -0.5381928086280823, "step": 106 }, { "epoch": 0.81, "grad_norm": 6.810734673228144, "learning_rate": 4.058988764044944e-07, "logps/chosen": -43.92599868774414, "logps/rejected": -51.205841064453125, "loss": 0.6618, "losses/dpo": 0.7375708818435669, "losses/sft": 1.6257060766220093, "losses/total": 0.7375708818435669, "ref_logps/chosen": -38.43563461303711, "ref_logps/rejected": -44.785430908203125, "rewards/accuracies": 0.5859375, "rewards/chosen": -0.5490366220474243, "rewards/margins": 0.09300415217876434, "rewards/rejected": -0.6420407295227051, "step": 107 }, { "epoch": 0.82, "grad_norm": 6.646468325900178, "learning_rate": 4.044943820224719e-07, "logps/chosen": -41.907615661621094, "logps/rejected": -47.17523956298828, "loss": 0.6665, "losses/dpo": 0.6535848379135132, "losses/sft": 1.5487432479858398, "losses/total": 0.6535848379135132, "ref_logps/chosen": -36.636451721191406, "ref_logps/rejected": -41.005767822265625, "rewards/accuracies": 0.5625, "rewards/chosen": -0.5271163582801819, "rewards/margins": 0.08983068913221359, "rewards/rejected": -0.6169470548629761, "step": 108 }, { "epoch": 0.82, "grad_norm": 6.796059919133426, "learning_rate": 4.0308988764044945e-07, "logps/chosen": -43.95292663574219, "logps/rejected": -48.59518814086914, "loss": 0.6619, "losses/dpo": 0.6743461489677429, "losses/sft": 1.5721744298934937, "losses/total": 0.6743461489677429, "ref_logps/chosen": -38.58289337158203, "ref_logps/rejected": -42.209529876708984, "rewards/accuracies": 0.5859375, "rewards/chosen": -0.5370036363601685, "rewards/margins": 0.10156210511922836, "rewards/rejected": -0.6385657787322998, "step": 109 }, { "epoch": 0.83, "grad_norm": 6.472585584476915, "learning_rate": 4.0168539325842696e-07, "logps/chosen": -40.216651916503906, "logps/rejected": -45.985801696777344, "loss": 0.6793, "losses/dpo": 0.7015002965927124, "losses/sft": 1.661520004272461, "losses/total": 0.7015002965927124, "ref_logps/chosen": -34.707008361816406, "ref_logps/rejected": -39.90086364746094, "rewards/accuracies": 0.5859375, "rewards/chosen": -0.5509647130966187, "rewards/margins": 0.05752916634082794, "rewards/rejected": -0.6084938049316406, "step": 110 }, { "epoch": 0.84, "grad_norm": 6.441943056630329, "learning_rate": 4.0028089887640446e-07, "logps/chosen": -40.84614944458008, "logps/rejected": -49.78240966796875, "loss": 0.6495, "losses/dpo": 0.6826507449150085, "losses/sft": 1.6292600631713867, "losses/total": 0.6826507449150085, "ref_logps/chosen": -35.30500411987305, "ref_logps/rejected": -43.05198669433594, "rewards/accuracies": 0.625, "rewards/chosen": -0.554114580154419, "rewards/margins": 0.1189279854297638, "rewards/rejected": -0.6730425953865051, "step": 111 }, { "epoch": 0.85, "grad_norm": 6.739697461780844, "learning_rate": 3.9887640449438196e-07, "logps/chosen": -43.101287841796875, "logps/rejected": -51.60324478149414, "loss": 0.6365, "losses/dpo": 0.617262601852417, "losses/sft": 1.4229466915130615, "losses/total": 0.617262601852417, "ref_logps/chosen": -37.866512298583984, "ref_logps/rejected": -44.80317687988281, "rewards/accuracies": 0.6328125, "rewards/chosen": -0.52347731590271, "rewards/margins": 0.1565295159816742, "rewards/rejected": -0.6800068020820618, "step": 112 }, { "epoch": 0.85, "grad_norm": 7.023593025692052, "learning_rate": 3.974719101123595e-07, "logps/chosen": -41.23580551147461, "logps/rejected": -52.932403564453125, "loss": 0.621, "losses/dpo": 0.6078984141349792, "losses/sft": 1.4510893821716309, "losses/total": 0.6078984141349792, "ref_logps/chosen": -36.275489807128906, "ref_logps/rejected": -46.033939361572266, "rewards/accuracies": 0.6796875, "rewards/chosen": -0.49603164196014404, "rewards/margins": 0.19381484389305115, "rewards/rejected": -0.6898465156555176, "step": 113 }, { "epoch": 0.86, "grad_norm": 6.441040652132362, "learning_rate": 3.960674157303371e-07, "logps/chosen": -38.938751220703125, "logps/rejected": -47.65938186645508, "loss": 0.6436, "losses/dpo": 0.6575403809547424, "losses/sft": 1.4100581407546997, "losses/total": 0.6575403809547424, "ref_logps/chosen": -33.69282531738281, "ref_logps/rejected": -41.06393814086914, "rewards/accuracies": 0.671875, "rewards/chosen": -0.5245928764343262, "rewards/margins": 0.13495120406150818, "rewards/rejected": -0.6595441102981567, "step": 114 }, { "epoch": 0.87, "grad_norm": 6.831582112977574, "learning_rate": 3.946629213483146e-07, "logps/chosen": -41.802799224853516, "logps/rejected": -49.96432876586914, "loss": 0.6368, "losses/dpo": 0.6237789392471313, "losses/sft": 1.5177757740020752, "losses/total": 0.6237789392471313, "ref_logps/chosen": -36.240840911865234, "ref_logps/rejected": -42.76991653442383, "rewards/accuracies": 0.6484375, "rewards/chosen": -0.5561960935592651, "rewards/margins": 0.16324520111083984, "rewards/rejected": -0.719441294670105, "step": 115 }, { "epoch": 0.88, "grad_norm": 7.410431235906546, "learning_rate": 3.9325842696629214e-07, "logps/chosen": -45.350669860839844, "logps/rejected": -48.64668655395508, "loss": 0.6848, "losses/dpo": 0.746525228023529, "losses/sft": 1.8295865058898926, "losses/total": 0.746525228023529, "ref_logps/chosen": -39.40565490722656, "ref_logps/rejected": -42.114837646484375, "rewards/accuracies": 0.6015625, "rewards/chosen": -0.5945014357566833, "rewards/margins": 0.058684106916189194, "rewards/rejected": -0.6531856060028076, "step": 116 }, { "epoch": 0.88, "grad_norm": 6.7521875642568245, "learning_rate": 3.9185393258426964e-07, "logps/chosen": -42.558738708496094, "logps/rejected": -48.05232238769531, "loss": 0.6395, "losses/dpo": 0.6771230697631836, "losses/sft": 1.4978280067443848, "losses/total": 0.6771230697631836, "ref_logps/chosen": -37.11561584472656, "ref_logps/rejected": -41.03240203857422, "rewards/accuracies": 0.6171875, "rewards/chosen": -0.5443119406700134, "rewards/margins": 0.1576804369688034, "rewards/rejected": -0.701992392539978, "step": 117 }, { "epoch": 0.89, "grad_norm": 6.852926877450363, "learning_rate": 3.904494382022472e-07, "logps/chosen": -43.11158752441406, "logps/rejected": -50.49040985107422, "loss": 0.6294, "losses/dpo": 0.6048296689987183, "losses/sft": 1.4263670444488525, "losses/total": 0.6048296689987183, "ref_logps/chosen": -37.525428771972656, "ref_logps/rejected": -43.2536735534668, "rewards/accuracies": 0.6953125, "rewards/chosen": -0.5586156845092773, "rewards/margins": 0.16505761444568634, "rewards/rejected": -0.7236733436584473, "step": 118 }, { "epoch": 0.9, "grad_norm": 7.482804237101483, "learning_rate": 3.890449438202247e-07, "logps/chosen": -42.81207275390625, "logps/rejected": -48.305213928222656, "loss": 0.6723, "losses/dpo": 0.6438789367675781, "losses/sft": 1.3842287063598633, "losses/total": 0.6438789367675781, "ref_logps/chosen": -37.165802001953125, "ref_logps/rejected": -41.8455696105957, "rewards/accuracies": 0.5625, "rewards/chosen": -0.5646266937255859, "rewards/margins": 0.08133774995803833, "rewards/rejected": -0.6459644436836243, "step": 119 }, { "epoch": 0.91, "grad_norm": 6.612121590764, "learning_rate": 3.876404494382022e-07, "logps/chosen": -40.42414093017578, "logps/rejected": -49.36077880859375, "loss": 0.6449, "losses/dpo": 0.6813696622848511, "losses/sft": 1.6905653476715088, "losses/total": 0.6813696622848511, "ref_logps/chosen": -34.49721145629883, "ref_logps/rejected": -42.062259674072266, "rewards/accuracies": 0.640625, "rewards/chosen": -0.5926928520202637, "rewards/margins": 0.13715943694114685, "rewards/rejected": -0.7298523187637329, "step": 120 }, { "epoch": 0.91, "grad_norm": 6.529474424321076, "learning_rate": 3.8623595505617977e-07, "logps/chosen": -43.11798095703125, "logps/rejected": -51.801422119140625, "loss": 0.6396, "losses/dpo": 0.6661785840988159, "losses/sft": 1.4052226543426514, "losses/total": 0.6661785840988159, "ref_logps/chosen": -37.71819305419922, "ref_logps/rejected": -44.816795349121094, "rewards/accuracies": 0.640625, "rewards/chosen": -0.5399786233901978, "rewards/margins": 0.15848389267921448, "rewards/rejected": -0.6984626054763794, "step": 121 }, { "epoch": 0.92, "grad_norm": 6.759925517450978, "learning_rate": 3.8483146067415727e-07, "logps/chosen": -44.635955810546875, "logps/rejected": -53.689002990722656, "loss": 0.6172, "losses/dpo": 0.5884010195732117, "losses/sft": 1.7550606727600098, "losses/total": 0.5884010195732117, "ref_logps/chosen": -38.62323760986328, "ref_logps/rejected": -45.53942108154297, "rewards/accuracies": 0.6875, "rewards/chosen": -0.6012718677520752, "rewards/margins": 0.21368616819381714, "rewards/rejected": -0.8149580359458923, "step": 122 }, { "epoch": 0.93, "grad_norm": 6.5150883611431, "learning_rate": 3.834269662921348e-07, "logps/chosen": -41.34044647216797, "logps/rejected": -50.817466735839844, "loss": 0.6373, "losses/dpo": 0.6241766214370728, "losses/sft": 1.3161594867706299, "losses/total": 0.6241766214370728, "ref_logps/chosen": -35.719482421875, "ref_logps/rejected": -43.593727111816406, "rewards/accuracies": 0.6796875, "rewards/chosen": -0.562096118927002, "rewards/margins": 0.16027754545211792, "rewards/rejected": -0.7223736047744751, "step": 123 }, { "epoch": 0.94, "grad_norm": 7.117780641865591, "learning_rate": 3.8202247191011233e-07, "logps/chosen": -40.98164367675781, "logps/rejected": -46.440032958984375, "loss": 0.6259, "losses/dpo": 0.7275031805038452, "losses/sft": 1.3312557935714722, "losses/total": 0.7275031805038452, "ref_logps/chosen": -36.18989181518555, "ref_logps/rejected": -39.75829315185547, "rewards/accuracies": 0.6640625, "rewards/chosen": -0.4791754186153412, "rewards/margins": 0.1889985203742981, "rewards/rejected": -0.6681739091873169, "step": 124 }, { "epoch": 0.94, "grad_norm": 6.810891199884393, "learning_rate": 3.806179775280899e-07, "logps/chosen": -45.05056381225586, "logps/rejected": -51.01411819458008, "loss": 0.6422, "losses/dpo": 0.5984268188476562, "losses/sft": 1.6079349517822266, "losses/total": 0.5984268188476562, "ref_logps/chosen": -39.050132751464844, "ref_logps/rejected": -43.51178741455078, "rewards/accuracies": 0.7109375, "rewards/chosen": -0.6000430583953857, "rewards/margins": 0.15018987655639648, "rewards/rejected": -0.750232994556427, "step": 125 }, { "epoch": 0.95, "grad_norm": 7.000300157233934, "learning_rate": 3.792134831460674e-07, "logps/chosen": -45.488685607910156, "logps/rejected": -53.24082946777344, "loss": 0.6293, "losses/dpo": 0.6512585878372192, "losses/sft": 1.70095694065094, "losses/total": 0.6512585878372192, "ref_logps/chosen": -39.463134765625, "ref_logps/rejected": -45.5393180847168, "rewards/accuracies": 0.6640625, "rewards/chosen": -0.602555513381958, "rewards/margins": 0.16759565472602844, "rewards/rejected": -0.7701511383056641, "step": 126 }, { "epoch": 0.96, "grad_norm": 6.866553634275337, "learning_rate": 3.7780898876404495e-07, "logps/chosen": -46.352333068847656, "logps/rejected": -50.944129943847656, "loss": 0.6501, "losses/dpo": 0.7261393070220947, "losses/sft": 1.7794756889343262, "losses/total": 0.7261393070220947, "ref_logps/chosen": -40.64281463623047, "ref_logps/rejected": -43.911109924316406, "rewards/accuracies": 0.6640625, "rewards/chosen": -0.5709517002105713, "rewards/margins": 0.13234999775886536, "rewards/rejected": -0.7033016681671143, "step": 127 }, { "epoch": 0.97, "grad_norm": 7.1746623953368385, "learning_rate": 3.7640449438202245e-07, "logps/chosen": -42.81266403198242, "logps/rejected": -47.58647155761719, "loss": 0.6484, "losses/dpo": 0.6509548425674438, "losses/sft": 1.4893585443496704, "losses/total": 0.6509548425674438, "ref_logps/chosen": -36.92261505126953, "ref_logps/rejected": -40.24052429199219, "rewards/accuracies": 0.640625, "rewards/chosen": -0.5890049338340759, "rewards/margins": 0.14558979868888855, "rewards/rejected": -0.7345947623252869, "step": 128 }, { "epoch": 0.97, "grad_norm": 6.639733312989805, "learning_rate": 3.75e-07, "logps/chosen": -42.699241638183594, "logps/rejected": -49.38917922973633, "loss": 0.6384, "losses/dpo": 0.7398217916488647, "losses/sft": 1.8203296661376953, "losses/total": 0.7398217916488647, "ref_logps/chosen": -36.63134002685547, "ref_logps/rejected": -41.697608947753906, "rewards/accuracies": 0.6640625, "rewards/chosen": -0.6067899465560913, "rewards/margins": 0.16236720979213715, "rewards/rejected": -0.7691571712493896, "step": 129 }, { "epoch": 0.98, "grad_norm": 6.840872832592906, "learning_rate": 3.735955056179775e-07, "logps/chosen": -38.55268096923828, "logps/rejected": -46.57276153564453, "loss": 0.6455, "losses/dpo": 0.6873192191123962, "losses/sft": 1.3916716575622559, "losses/total": 0.6873192191123962, "ref_logps/chosen": -33.59415054321289, "ref_logps/rejected": -40.15166091918945, "rewards/accuracies": 0.6484375, "rewards/chosen": -0.4958529472351074, "rewards/margins": 0.14625714719295502, "rewards/rejected": -0.6421101093292236, "step": 130 }, { "epoch": 0.99, "grad_norm": 7.001016051342946, "learning_rate": 3.72191011235955e-07, "logps/chosen": -42.6187629699707, "logps/rejected": -47.61820983886719, "loss": 0.6507, "losses/dpo": 0.555785059928894, "losses/sft": 1.6289881467819214, "losses/total": 0.555785059928894, "ref_logps/chosen": -36.840423583984375, "ref_logps/rejected": -40.43097686767578, "rewards/accuracies": 0.6015625, "rewards/chosen": -0.5778340101242065, "rewards/margins": 0.14088886976242065, "rewards/rejected": -0.7187228202819824, "step": 131 }, { "epoch": 1.0, "grad_norm": 6.69674019742392, "learning_rate": 3.707865168539326e-07, "logps/chosen": -42.53615951538086, "logps/rejected": -50.24591064453125, "loss": 0.639, "losses/dpo": 0.658089280128479, "losses/sft": 1.6500622034072876, "losses/total": 0.658089280128479, "ref_logps/chosen": -36.53786087036133, "ref_logps/rejected": -42.35614776611328, "rewards/accuracies": 0.640625, "rewards/chosen": -0.599829912185669, "rewards/margins": 0.1891460418701172, "rewards/rejected": -0.7889760136604309, "step": 132 }, { "epoch": 1.0, "grad_norm": 6.641666012117393, "learning_rate": 3.693820224719101e-07, "logps/chosen": -40.87996292114258, "logps/rejected": -51.007049560546875, "loss": 0.6244, "losses/dpo": 0.6069691181182861, "losses/sft": 1.3164952993392944, "losses/total": 0.6069691181182861, "ref_logps/chosen": -35.29632568359375, "ref_logps/rejected": -43.411521911621094, "rewards/accuracies": 0.6640625, "rewards/chosen": -0.5583640933036804, "rewards/margins": 0.20118848979473114, "rewards/rejected": -0.7595525979995728, "step": 133 }, { "epoch": 1.01, "grad_norm": 6.588946335066842, "learning_rate": 3.6797752808988764e-07, "logps/chosen": -42.399169921875, "logps/rejected": -51.85491180419922, "loss": 0.6191, "losses/dpo": 0.5947903394699097, "losses/sft": 1.4862775802612305, "losses/total": 0.5947903394699097, "ref_logps/chosen": -36.425140380859375, "ref_logps/rejected": -43.687225341796875, "rewards/accuracies": 0.703125, "rewards/chosen": -0.5974029898643494, "rewards/margins": 0.2193659394979477, "rewards/rejected": -0.8167688846588135, "step": 134 }, { "epoch": 1.02, "grad_norm": 6.690035072129239, "learning_rate": 3.6657303370786514e-07, "logps/chosen": -41.394432067871094, "logps/rejected": -53.464168548583984, "loss": 0.6, "losses/dpo": 0.6503059267997742, "losses/sft": 1.5431207418441772, "losses/total": 0.6503059267997742, "ref_logps/chosen": -35.42436218261719, "ref_logps/rejected": -44.949180603027344, "rewards/accuracies": 0.6875, "rewards/chosen": -0.5970069766044617, "rewards/margins": 0.25449231266975403, "rewards/rejected": -0.8514993190765381, "step": 135 }, { "epoch": 1.03, "grad_norm": 7.159913988692313, "learning_rate": 3.651685393258427e-07, "logps/chosen": -47.11418533325195, "logps/rejected": -49.349937438964844, "loss": 0.6642, "losses/dpo": 0.70440673828125, "losses/sft": 1.6172586679458618, "losses/total": 0.70440673828125, "ref_logps/chosen": -40.66786193847656, "ref_logps/rejected": -41.70207977294922, "rewards/accuracies": 0.59375, "rewards/chosen": -0.6446323394775391, "rewards/margins": 0.12015305459499359, "rewards/rejected": -0.7647854685783386, "step": 136 }, { "epoch": 1.03, "grad_norm": 6.861296978626527, "learning_rate": 3.637640449438202e-07, "logps/chosen": -41.73448181152344, "logps/rejected": -49.953067779541016, "loss": 0.6114, "losses/dpo": 0.6396130323410034, "losses/sft": 1.41642427444458, "losses/total": 0.6396130323410034, "ref_logps/chosen": -36.14557647705078, "ref_logps/rejected": -42.22886657714844, "rewards/accuracies": 0.6796875, "rewards/chosen": -0.5588902235031128, "rewards/margins": 0.21353021264076233, "rewards/rejected": -0.7724204063415527, "step": 137 }, { "epoch": 1.04, "grad_norm": 6.8531162605456055, "learning_rate": 3.6235955056179776e-07, "logps/chosen": -40.27118682861328, "logps/rejected": -51.58380889892578, "loss": 0.6205, "losses/dpo": 0.6927103400230408, "losses/sft": 1.5446867942810059, "losses/total": 0.6927103400230408, "ref_logps/chosen": -34.374351501464844, "ref_logps/rejected": -43.53186798095703, "rewards/accuracies": 0.6875, "rewards/chosen": -0.5896837115287781, "rewards/margins": 0.2155105173587799, "rewards/rejected": -0.8051942586898804, "step": 138 }, { "epoch": 1.05, "grad_norm": 6.581792409395151, "learning_rate": 3.6095505617977526e-07, "logps/chosen": -40.182411193847656, "logps/rejected": -50.48395538330078, "loss": 0.6351, "losses/dpo": 0.5907813906669617, "losses/sft": 1.5051367282867432, "losses/total": 0.5907813906669617, "ref_logps/chosen": -34.705772399902344, "ref_logps/rejected": -43.20262908935547, "rewards/accuracies": 0.6796875, "rewards/chosen": -0.5476638078689575, "rewards/margins": 0.18046864867210388, "rewards/rejected": -0.728132426738739, "step": 139 }, { "epoch": 1.06, "grad_norm": 7.099683859328159, "learning_rate": 3.5955056179775277e-07, "logps/chosen": -47.343101501464844, "logps/rejected": -54.016380310058594, "loss": 0.5962, "losses/dpo": 0.5484156012535095, "losses/sft": 1.3340450525283813, "losses/total": 0.5484156012535095, "ref_logps/chosen": -40.93060302734375, "ref_logps/rejected": -45.07525634765625, "rewards/accuracies": 0.7265625, "rewards/chosen": -0.6412495374679565, "rewards/margins": 0.2528632581233978, "rewards/rejected": -0.8941128253936768, "step": 140 }, { "epoch": 1.06, "grad_norm": 6.6246858434595435, "learning_rate": 3.581460674157303e-07, "logps/chosen": -42.795745849609375, "logps/rejected": -50.99128341674805, "loss": 0.6105, "losses/dpo": 0.6333677768707275, "losses/sft": 1.7310549020767212, "losses/total": 0.6333677768707275, "ref_logps/chosen": -36.29172897338867, "ref_logps/rejected": -42.22053527832031, "rewards/accuracies": 0.703125, "rewards/chosen": -0.6504020690917969, "rewards/margins": 0.2266732007265091, "rewards/rejected": -0.8770751953125, "step": 141 }, { "epoch": 1.07, "grad_norm": 6.585988374877859, "learning_rate": 3.5674157303370783e-07, "logps/chosen": -39.767051696777344, "logps/rejected": -45.62493896484375, "loss": 0.6184, "losses/dpo": 0.6107473373413086, "losses/sft": 1.3934905529022217, "losses/total": 0.6107473373413086, "ref_logps/chosen": -34.30314636230469, "ref_logps/rejected": -38.085365295410156, "rewards/accuracies": 0.65625, "rewards/chosen": -0.5463899374008179, "rewards/margins": 0.20756718516349792, "rewards/rejected": -0.7539570927619934, "step": 142 }, { "epoch": 1.08, "grad_norm": 7.605347145021406, "learning_rate": 3.553370786516854e-07, "logps/chosen": -45.32689666748047, "logps/rejected": -53.28538513183594, "loss": 0.6396, "losses/dpo": 0.6139785051345825, "losses/sft": 1.4996390342712402, "losses/total": 0.6139785051345825, "ref_logps/chosen": -38.790870666503906, "ref_logps/rejected": -44.86207580566406, "rewards/accuracies": 0.6796875, "rewards/chosen": -0.6536027789115906, "rewards/margins": 0.18872803449630737, "rewards/rejected": -0.842330813407898, "step": 143 }, { "epoch": 1.09, "grad_norm": 6.375447848895096, "learning_rate": 3.539325842696629e-07, "logps/chosen": -39.950706481933594, "logps/rejected": -52.67605972290039, "loss": 0.6352, "losses/dpo": 0.562360405921936, "losses/sft": 1.5273569822311401, "losses/total": 0.562360405921936, "ref_logps/chosen": -33.4902458190918, "ref_logps/rejected": -44.185813903808594, "rewards/accuracies": 0.6953125, "rewards/chosen": -0.6460464000701904, "rewards/margins": 0.20297789573669434, "rewards/rejected": -0.8490242958068848, "step": 144 }, { "epoch": 1.09, "grad_norm": 6.570913856555609, "learning_rate": 3.5252808988764045e-07, "logps/chosen": -41.82993698120117, "logps/rejected": -49.303138732910156, "loss": 0.616, "losses/dpo": 0.6345305442810059, "losses/sft": 1.6690033674240112, "losses/total": 0.6345305442810059, "ref_logps/chosen": -36.14568328857422, "ref_logps/rejected": -41.49669647216797, "rewards/accuracies": 0.6640625, "rewards/chosen": -0.5684253573417664, "rewards/margins": 0.2122190296649933, "rewards/rejected": -0.7806443572044373, "step": 145 }, { "epoch": 1.1, "grad_norm": 6.513567190172644, "learning_rate": 3.51123595505618e-07, "logps/chosen": -41.52465057373047, "logps/rejected": -53.12078094482422, "loss": 0.6021, "losses/dpo": 0.6336867809295654, "losses/sft": 1.6118431091308594, "losses/total": 0.6336867809295654, "ref_logps/chosen": -35.433265686035156, "ref_logps/rejected": -44.33394241333008, "rewards/accuracies": 0.7109375, "rewards/chosen": -0.6091387271881104, "rewards/margins": 0.2695454955101013, "rewards/rejected": -0.8786842823028564, "step": 146 }, { "epoch": 1.11, "grad_norm": 7.159126675968495, "learning_rate": 3.497191011235955e-07, "logps/chosen": -45.985076904296875, "logps/rejected": -56.15589904785156, "loss": 0.6053, "losses/dpo": 0.6400711536407471, "losses/sft": 1.618746042251587, "losses/total": 0.6400711536407471, "ref_logps/chosen": -39.94523239135742, "ref_logps/rejected": -47.24679946899414, "rewards/accuracies": 0.65625, "rewards/chosen": -0.6039848327636719, "rewards/margins": 0.2869252562522888, "rewards/rejected": -0.8909100294113159, "step": 147 }, { "epoch": 1.12, "grad_norm": 6.31408614588678, "learning_rate": 3.48314606741573e-07, "logps/chosen": -36.47071075439453, "logps/rejected": -44.49042510986328, "loss": 0.6059, "losses/dpo": 0.6014193892478943, "losses/sft": 1.4461239576339722, "losses/total": 0.6014193892478943, "ref_logps/chosen": -31.124897003173828, "ref_logps/rejected": -36.84246063232422, "rewards/accuracies": 0.6640625, "rewards/chosen": -0.5345816016197205, "rewards/margins": 0.23021462559700012, "rewards/rejected": -0.7647961974143982, "step": 148 }, { "epoch": 1.12, "grad_norm": 6.681653921230616, "learning_rate": 3.469101123595505e-07, "logps/chosen": -47.83122634887695, "logps/rejected": -53.77499008178711, "loss": 0.5731, "losses/dpo": 0.5383450388908386, "losses/sft": 1.5721720457077026, "losses/total": 0.5383450388908386, "ref_logps/chosen": -41.766536712646484, "ref_logps/rejected": -44.522464752197266, "rewards/accuracies": 0.78125, "rewards/chosen": -0.6064690947532654, "rewards/margins": 0.3187834620475769, "rewards/rejected": -0.9252525568008423, "step": 149 }, { "epoch": 1.13, "grad_norm": 6.904604135283266, "learning_rate": 3.4550561797752807e-07, "logps/chosen": -44.048912048339844, "logps/rejected": -51.318206787109375, "loss": 0.6352, "losses/dpo": 0.7056742906570435, "losses/sft": 1.706668496131897, "losses/total": 0.7056742906570435, "ref_logps/chosen": -37.56008529663086, "ref_logps/rejected": -42.8933219909668, "rewards/accuracies": 0.6328125, "rewards/chosen": -0.6488831043243408, "rewards/margins": 0.19360551238059998, "rewards/rejected": -0.8424886465072632, "step": 150 }, { "epoch": 1.14, "grad_norm": 6.64022622762387, "learning_rate": 3.441011235955056e-07, "logps/chosen": -43.570884704589844, "logps/rejected": -52.240318298339844, "loss": 0.593, "losses/dpo": 0.6028671264648438, "losses/sft": 1.4086356163024902, "losses/total": 0.6028671264648438, "ref_logps/chosen": -38.0629768371582, "ref_logps/rejected": -43.7314453125, "rewards/accuracies": 0.734375, "rewards/chosen": -0.5507906079292297, "rewards/margins": 0.3000965714454651, "rewards/rejected": -0.8508871793746948, "step": 151 }, { "epoch": 1.15, "grad_norm": 6.233323532005839, "learning_rate": 3.4269662921348313e-07, "logps/chosen": -37.21058654785156, "logps/rejected": -48.0037841796875, "loss": 0.594, "losses/dpo": 0.614506721496582, "losses/sft": 1.60889732837677, "losses/total": 0.614506721496582, "ref_logps/chosen": -31.845245361328125, "ref_logps/rejected": -39.9993782043457, "rewards/accuracies": 0.71875, "rewards/chosen": -0.536533772945404, "rewards/margins": 0.2639070749282837, "rewards/rejected": -0.800440788269043, "step": 152 }, { "epoch": 1.15, "grad_norm": 6.3687654295239335, "learning_rate": 3.4129213483146064e-07, "logps/chosen": -38.31336975097656, "logps/rejected": -46.770484924316406, "loss": 0.6353, "losses/dpo": 0.67460036277771, "losses/sft": 1.5722901821136475, "losses/total": 0.67460036277771, "ref_logps/chosen": -32.139488220214844, "ref_logps/rejected": -38.692901611328125, "rewards/accuracies": 0.6484375, "rewards/chosen": -0.617388129234314, "rewards/margins": 0.1903703510761261, "rewards/rejected": -0.8077584505081177, "step": 153 }, { "epoch": 1.16, "grad_norm": 6.739574073940404, "learning_rate": 3.398876404494382e-07, "logps/chosen": -40.92176055908203, "logps/rejected": -55.60083770751953, "loss": 0.5768, "losses/dpo": 0.6374070644378662, "losses/sft": 1.6940921545028687, "losses/total": 0.6374070644378662, "ref_logps/chosen": -34.817481994628906, "ref_logps/rejected": -46.21774673461914, "rewards/accuracies": 0.765625, "rewards/chosen": -0.6104279160499573, "rewards/margins": 0.3278810977935791, "rewards/rejected": -0.9383090138435364, "step": 154 }, { "epoch": 1.17, "grad_norm": 6.879053261576145, "learning_rate": 3.3848314606741575e-07, "logps/chosen": -42.9251823425293, "logps/rejected": -51.702964782714844, "loss": 0.6139, "losses/dpo": 0.6116975545883179, "losses/sft": 1.40887451171875, "losses/total": 0.6116975545883179, "ref_logps/chosen": -37.03919982910156, "ref_logps/rejected": -43.293392181396484, "rewards/accuracies": 0.6796875, "rewards/chosen": -0.5885984301567078, "rewards/margins": 0.25235864520072937, "rewards/rejected": -0.8409571051597595, "step": 155 }, { "epoch": 1.18, "grad_norm": 6.694865071581937, "learning_rate": 3.3707865168539325e-07, "logps/chosen": -42.003990173339844, "logps/rejected": -50.280548095703125, "loss": 0.6314, "losses/dpo": 0.6589547991752625, "losses/sft": 1.475001573562622, "losses/total": 0.6589547991752625, "ref_logps/chosen": -35.952449798583984, "ref_logps/rejected": -42.320465087890625, "rewards/accuracies": 0.671875, "rewards/chosen": -0.6051540374755859, "rewards/margins": 0.1908540576696396, "rewards/rejected": -0.7960080504417419, "step": 156 }, { "epoch": 1.18, "grad_norm": 6.680369593277332, "learning_rate": 3.356741573033708e-07, "logps/chosen": -38.10074996948242, "logps/rejected": -47.0694694519043, "loss": 0.6165, "losses/dpo": 0.5133580565452576, "losses/sft": 1.357291340827942, "losses/total": 0.5133580565452576, "ref_logps/chosen": -32.30625915527344, "ref_logps/rejected": -38.88528060913086, "rewards/accuracies": 0.6640625, "rewards/chosen": -0.5794489979743958, "rewards/margins": 0.23896978795528412, "rewards/rejected": -0.8184187412261963, "step": 157 }, { "epoch": 1.19, "grad_norm": 7.163827933877828, "learning_rate": 3.3426966292134826e-07, "logps/chosen": -43.57667541503906, "logps/rejected": -53.30217742919922, "loss": 0.5908, "losses/dpo": 0.574418306350708, "losses/sft": 1.7125813961029053, "losses/total": 0.574418306350708, "ref_logps/chosen": -37.39282989501953, "ref_logps/rejected": -44.2191162109375, "rewards/accuracies": 0.7421875, "rewards/chosen": -0.6183844804763794, "rewards/margins": 0.2899210453033447, "rewards/rejected": -0.9083055257797241, "step": 158 }, { "epoch": 1.2, "grad_norm": 7.534492030316663, "learning_rate": 3.328651685393258e-07, "logps/chosen": -43.87846755981445, "logps/rejected": -50.815635681152344, "loss": 0.6314, "losses/dpo": 0.6066723465919495, "losses/sft": 1.625878930091858, "losses/total": 0.6066723465919495, "ref_logps/chosen": -37.40098190307617, "ref_logps/rejected": -42.1273193359375, "rewards/accuracies": 0.6796875, "rewards/chosen": -0.6477489471435547, "rewards/margins": 0.2210829257965088, "rewards/rejected": -0.8688318729400635, "step": 159 }, { "epoch": 1.21, "grad_norm": 6.461969936591063, "learning_rate": 3.314606741573033e-07, "logps/chosen": -41.52513122558594, "logps/rejected": -50.716064453125, "loss": 0.5689, "losses/dpo": 0.5582201480865479, "losses/sft": 1.5073952674865723, "losses/total": 0.5582201480865479, "ref_logps/chosen": -35.848846435546875, "ref_logps/rejected": -41.70889663696289, "rewards/accuracies": 0.78125, "rewards/chosen": -0.567628026008606, "rewards/margins": 0.333088219165802, "rewards/rejected": -0.9007163047790527, "step": 160 }, { "epoch": 1.22, "grad_norm": 6.942966081986398, "learning_rate": 3.300561797752809e-07, "logps/chosen": -45.35464859008789, "logps/rejected": -48.982810974121094, "loss": 0.6365, "losses/dpo": 0.5614318251609802, "losses/sft": 1.7859472036361694, "losses/total": 0.5614318251609802, "ref_logps/chosen": -39.001136779785156, "ref_logps/rejected": -40.674232482910156, "rewards/accuracies": 0.6953125, "rewards/chosen": -0.6353514790534973, "rewards/margins": 0.19550636410713196, "rewards/rejected": -0.8308578729629517, "step": 161 }, { "epoch": 1.22, "grad_norm": 7.180092377812206, "learning_rate": 3.2865168539325844e-07, "logps/chosen": -45.20994186401367, "logps/rejected": -52.836875915527344, "loss": 0.6224, "losses/dpo": 0.5493739247322083, "losses/sft": 1.547910213470459, "losses/total": 0.5493739247322083, "ref_logps/chosen": -38.36485290527344, "ref_logps/rejected": -43.65592575073242, "rewards/accuracies": 0.671875, "rewards/chosen": -0.684508740901947, "rewards/margins": 0.2335864156484604, "rewards/rejected": -0.9180951714515686, "step": 162 }, { "epoch": 1.23, "grad_norm": 6.839328465422381, "learning_rate": 3.2724719101123594e-07, "logps/chosen": -44.02252197265625, "logps/rejected": -48.20290756225586, "loss": 0.6261, "losses/dpo": 0.5617036819458008, "losses/sft": 1.5726804733276367, "losses/total": 0.5617036819458008, "ref_logps/chosen": -37.56252670288086, "ref_logps/rejected": -39.5811882019043, "rewards/accuracies": 0.6328125, "rewards/chosen": -0.6459991931915283, "rewards/margins": 0.21617242693901062, "rewards/rejected": -0.8621717095375061, "step": 163 }, { "epoch": 1.24, "grad_norm": 7.080895585848076, "learning_rate": 3.258426966292135e-07, "logps/chosen": -44.87802505493164, "logps/rejected": -53.4232292175293, "loss": 0.6444, "losses/dpo": 0.5438011884689331, "losses/sft": 1.6607606410980225, "losses/total": 0.5438011884689331, "ref_logps/chosen": -37.73242950439453, "ref_logps/rejected": -44.338134765625, "rewards/accuracies": 0.6328125, "rewards/chosen": -0.7145596146583557, "rewards/margins": 0.1939493715763092, "rewards/rejected": -0.9085089564323425, "step": 164 }, { "epoch": 1.25, "grad_norm": 6.817784306171125, "learning_rate": 3.24438202247191e-07, "logps/chosen": -41.74745559692383, "logps/rejected": -54.77409362792969, "loss": 0.5757, "losses/dpo": 0.5074477195739746, "losses/sft": 1.4384926557540894, "losses/total": 0.5074477195739746, "ref_logps/chosen": -35.96852493286133, "ref_logps/rejected": -45.66744613647461, "rewards/accuracies": 0.765625, "rewards/chosen": -0.5778931379318237, "rewards/margins": 0.3327715992927551, "rewards/rejected": -0.9106647372245789, "step": 165 }, { "epoch": 1.25, "grad_norm": 6.73402019836184, "learning_rate": 3.2303370786516856e-07, "logps/chosen": -42.45885467529297, "logps/rejected": -55.83628845214844, "loss": 0.5582, "losses/dpo": 0.5949134230613708, "losses/sft": 1.7396336793899536, "losses/total": 0.5949134230613708, "ref_logps/chosen": -35.91239547729492, "ref_logps/rejected": -45.41798400878906, "rewards/accuracies": 0.7578125, "rewards/chosen": -0.6546458601951599, "rewards/margins": 0.387184739112854, "rewards/rejected": -1.0418306589126587, "step": 166 }, { "epoch": 1.26, "grad_norm": 6.484969223325532, "learning_rate": 3.21629213483146e-07, "logps/chosen": -39.55500030517578, "logps/rejected": -53.75917053222656, "loss": 0.5553, "losses/dpo": 0.5379188060760498, "losses/sft": 1.6754546165466309, "losses/total": 0.5379188060760498, "ref_logps/chosen": -33.612388610839844, "ref_logps/rejected": -44.048587799072266, "rewards/accuracies": 0.8046875, "rewards/chosen": -0.5942604541778564, "rewards/margins": 0.3767976760864258, "rewards/rejected": -0.971058189868927, "step": 167 }, { "epoch": 1.27, "grad_norm": 7.553429925940112, "learning_rate": 3.2022471910112357e-07, "logps/chosen": -42.48908615112305, "logps/rejected": -53.703208923339844, "loss": 0.6156, "losses/dpo": 0.5968649983406067, "losses/sft": 1.4021852016448975, "losses/total": 0.5968649983406067, "ref_logps/chosen": -36.02775573730469, "ref_logps/rejected": -44.81937026977539, "rewards/accuracies": 0.671875, "rewards/chosen": -0.6461330056190491, "rewards/margins": 0.24225082993507385, "rewards/rejected": -0.8883838057518005, "step": 168 }, { "epoch": 1.28, "grad_norm": 6.938904519141351, "learning_rate": 3.1882022471910107e-07, "logps/chosen": -41.907588958740234, "logps/rejected": -51.34623718261719, "loss": 0.606, "losses/dpo": 0.5469992756843567, "losses/sft": 1.526263952255249, "losses/total": 0.5469992756843567, "ref_logps/chosen": -35.054046630859375, "ref_logps/rejected": -41.81439971923828, "rewards/accuracies": 0.6640625, "rewards/chosen": -0.6853541135787964, "rewards/margins": 0.26782965660095215, "rewards/rejected": -0.9531837701797485, "step": 169 }, { "epoch": 1.28, "grad_norm": 6.455751734226468, "learning_rate": 3.1741573033707863e-07, "logps/chosen": -43.997074127197266, "logps/rejected": -52.1095085144043, "loss": 0.5652, "losses/dpo": 0.5796064138412476, "losses/sft": 1.7088840007781982, "losses/total": 0.5796064138412476, "ref_logps/chosen": -37.47534942626953, "ref_logps/rejected": -42.05298614501953, "rewards/accuracies": 0.7265625, "rewards/chosen": -0.6521726846694946, "rewards/margins": 0.35347938537597656, "rewards/rejected": -1.0056521892547607, "step": 170 }, { "epoch": 1.29, "grad_norm": 6.925720662101029, "learning_rate": 3.160112359550562e-07, "logps/chosen": -42.9494743347168, "logps/rejected": -51.66035461425781, "loss": 0.5733, "losses/dpo": 0.5771209001541138, "losses/sft": 1.3783965110778809, "losses/total": 0.5771209001541138, "ref_logps/chosen": -36.78116989135742, "ref_logps/rejected": -41.843605041503906, "rewards/accuracies": 0.703125, "rewards/chosen": -0.6168303489685059, "rewards/margins": 0.36484503746032715, "rewards/rejected": -0.981675386428833, "step": 171 }, { "epoch": 1.3, "grad_norm": 7.01889024226675, "learning_rate": 3.146067415730337e-07, "logps/chosen": -45.94253158569336, "logps/rejected": -52.16916275024414, "loss": 0.6015, "losses/dpo": 0.5898208618164062, "losses/sft": 1.6218047142028809, "losses/total": 0.5898208618164062, "ref_logps/chosen": -39.25119400024414, "ref_logps/rejected": -42.79926300048828, "rewards/accuracies": 0.7109375, "rewards/chosen": -0.669133722782135, "rewards/margins": 0.2678561806678772, "rewards/rejected": -0.936989963054657, "step": 172 }, { "epoch": 1.31, "grad_norm": 7.495610381062607, "learning_rate": 3.1320224719101125e-07, "logps/chosen": -44.17959976196289, "logps/rejected": -53.05359649658203, "loss": 0.6002, "losses/dpo": 0.630514919757843, "losses/sft": 1.7863552570343018, "losses/total": 0.630514919757843, "ref_logps/chosen": -37.75782012939453, "ref_logps/rejected": -43.63001251220703, "rewards/accuracies": 0.6796875, "rewards/chosen": -0.642177939414978, "rewards/margins": 0.30018070340156555, "rewards/rejected": -0.9423586130142212, "step": 173 }, { "epoch": 1.31, "grad_norm": 7.4434324895050015, "learning_rate": 3.1179775280898875e-07, "logps/chosen": -47.16429901123047, "logps/rejected": -53.266780853271484, "loss": 0.6233, "losses/dpo": 0.5390438437461853, "losses/sft": 1.480837345123291, "losses/total": 0.5390438437461853, "ref_logps/chosen": -40.324798583984375, "ref_logps/rejected": -44.025787353515625, "rewards/accuracies": 0.65625, "rewards/chosen": -0.6839500069618225, "rewards/margins": 0.24014970660209656, "rewards/rejected": -0.9240997433662415, "step": 174 }, { "epoch": 1.32, "grad_norm": 7.0617610747390644, "learning_rate": 3.103932584269663e-07, "logps/chosen": -45.060882568359375, "logps/rejected": -54.979156494140625, "loss": 0.5891, "losses/dpo": 0.5568109750747681, "losses/sft": 1.6295528411865234, "losses/total": 0.5568109750747681, "ref_logps/chosen": -37.929840087890625, "ref_logps/rejected": -44.73991394042969, "rewards/accuracies": 0.6875, "rewards/chosen": -0.7131036520004272, "rewards/margins": 0.3108205795288086, "rewards/rejected": -1.0239241123199463, "step": 175 }, { "epoch": 1.33, "grad_norm": 6.663174112724157, "learning_rate": 3.0898876404494376e-07, "logps/chosen": -42.657920837402344, "logps/rejected": -47.67673110961914, "loss": 0.629, "losses/dpo": 0.7223004102706909, "losses/sft": 1.3237264156341553, "losses/total": 0.7223004102706909, "ref_logps/chosen": -36.358001708984375, "ref_logps/rejected": -38.938507080078125, "rewards/accuracies": 0.6640625, "rewards/chosen": -0.6299920082092285, "rewards/margins": 0.24383032321929932, "rewards/rejected": -0.8738222122192383, "step": 176 }, { "epoch": 1.34, "grad_norm": 7.940936921413792, "learning_rate": 3.075842696629213e-07, "logps/chosen": -48.067203521728516, "logps/rejected": -52.38322067260742, "loss": 0.6277, "losses/dpo": 0.6528229117393494, "losses/sft": 1.5663461685180664, "losses/total": 0.6528229117393494, "ref_logps/chosen": -40.93121337890625, "ref_logps/rejected": -42.879188537597656, "rewards/accuracies": 0.703125, "rewards/chosen": -0.7135992050170898, "rewards/margins": 0.23680387437343597, "rewards/rejected": -0.950403094291687, "step": 177 }, { "epoch": 1.34, "grad_norm": 6.9909555671219366, "learning_rate": 3.0617977528089887e-07, "logps/chosen": -43.74810791015625, "logps/rejected": -51.602622985839844, "loss": 0.6134, "losses/dpo": 0.6351133584976196, "losses/sft": 1.548452615737915, "losses/total": 0.6351133584976196, "ref_logps/chosen": -36.539649963378906, "ref_logps/rejected": -41.724647521972656, "rewards/accuracies": 0.6640625, "rewards/chosen": -0.7208462953567505, "rewards/margins": 0.26695096492767334, "rewards/rejected": -0.9877973794937134, "step": 178 }, { "epoch": 1.35, "grad_norm": 7.516487264439432, "learning_rate": 3.047752808988764e-07, "logps/chosen": -45.41664123535156, "logps/rejected": -52.19443130493164, "loss": 0.6325, "losses/dpo": 0.6936246752738953, "losses/sft": 1.418731451034546, "losses/total": 0.6936246752738953, "ref_logps/chosen": -37.89662170410156, "ref_logps/rejected": -42.46479797363281, "rewards/accuracies": 0.65625, "rewards/chosen": -0.7520017623901367, "rewards/margins": 0.22096163034439087, "rewards/rejected": -0.9729634523391724, "step": 179 }, { "epoch": 1.36, "grad_norm": 7.386682971240318, "learning_rate": 3.0337078651685393e-07, "logps/chosen": -44.32128143310547, "logps/rejected": -56.52943420410156, "loss": 0.589, "losses/dpo": 0.50272536277771, "losses/sft": 1.4175364971160889, "losses/total": 0.50272536277771, "ref_logps/chosen": -36.94242477416992, "ref_logps/rejected": -45.8451042175293, "rewards/accuracies": 0.6640625, "rewards/chosen": -0.7378860712051392, "rewards/margins": 0.3305472731590271, "rewards/rejected": -1.0684332847595215, "step": 180 }, { "epoch": 1.37, "grad_norm": 7.070520625555024, "learning_rate": 3.0196629213483144e-07, "logps/chosen": -42.875099182128906, "logps/rejected": -52.050384521484375, "loss": 0.5815, "losses/dpo": 0.6107900738716125, "losses/sft": 1.7456879615783691, "losses/total": 0.6107900738716125, "ref_logps/chosen": -36.82639694213867, "ref_logps/rejected": -42.541603088378906, "rewards/accuracies": 0.6953125, "rewards/chosen": -0.604870617389679, "rewards/margins": 0.34600716829299927, "rewards/rejected": -0.9508777856826782, "step": 181 }, { "epoch": 1.37, "grad_norm": 6.880606354027993, "learning_rate": 3.00561797752809e-07, "logps/chosen": -44.50994110107422, "logps/rejected": -50.33254623413086, "loss": 0.5976, "losses/dpo": 0.5528784990310669, "losses/sft": 1.7669578790664673, "losses/total": 0.5528784990310669, "ref_logps/chosen": -37.44862365722656, "ref_logps/rejected": -40.247039794921875, "rewards/accuracies": 0.7109375, "rewards/chosen": -0.7061322331428528, "rewards/margins": 0.30241847038269043, "rewards/rejected": -1.0085506439208984, "step": 182 }, { "epoch": 1.38, "grad_norm": 7.02791538850361, "learning_rate": 2.991573033707865e-07, "logps/chosen": -43.87753677368164, "logps/rejected": -49.34949493408203, "loss": 0.6184, "losses/dpo": 0.5282893180847168, "losses/sft": 1.5792714357376099, "losses/total": 0.5282893180847168, "ref_logps/chosen": -36.59724044799805, "ref_logps/rejected": -39.5660285949707, "rewards/accuracies": 0.59375, "rewards/chosen": -0.7280292510986328, "rewards/margins": 0.25031745433807373, "rewards/rejected": -0.9783467054367065, "step": 183 }, { "epoch": 1.39, "grad_norm": 7.145778710825677, "learning_rate": 2.9775280898876406e-07, "logps/chosen": -44.12153625488281, "logps/rejected": -50.03323745727539, "loss": 0.5994, "losses/dpo": 0.5445400476455688, "losses/sft": 1.3715200424194336, "losses/total": 0.5445400476455688, "ref_logps/chosen": -36.779212951660156, "ref_logps/rejected": -39.96929168701172, "rewards/accuracies": 0.703125, "rewards/chosen": -0.7342325448989868, "rewards/margins": 0.2721615731716156, "rewards/rejected": -1.0063941478729248, "step": 184 }, { "epoch": 1.4, "grad_norm": 7.033244427636761, "learning_rate": 2.9634831460674156e-07, "logps/chosen": -44.52336120605469, "logps/rejected": -52.357810974121094, "loss": 0.5954, "losses/dpo": 0.7007085084915161, "losses/sft": 1.6124120950698853, "losses/total": 0.7007085084915161, "ref_logps/chosen": -37.63973617553711, "ref_logps/rejected": -42.18433380126953, "rewards/accuracies": 0.6875, "rewards/chosen": -0.6883625984191895, "rewards/margins": 0.32898518443107605, "rewards/rejected": -1.017347812652588, "step": 185 }, { "epoch": 1.4, "grad_norm": 6.657232520968286, "learning_rate": 2.9494382022471906e-07, "logps/chosen": -41.616920471191406, "logps/rejected": -52.240867614746094, "loss": 0.5708, "losses/dpo": 0.6222548484802246, "losses/sft": 1.4851452112197876, "losses/total": 0.6222548484802246, "ref_logps/chosen": -35.22339630126953, "ref_logps/rejected": -42.063621520996094, "rewards/accuracies": 0.734375, "rewards/chosen": -0.6393523216247559, "rewards/margins": 0.37837234139442444, "rewards/rejected": -1.0177247524261475, "step": 186 }, { "epoch": 1.41, "grad_norm": 7.293022584639844, "learning_rate": 2.935393258426966e-07, "logps/chosen": -44.5795783996582, "logps/rejected": -51.8583869934082, "loss": 0.613, "losses/dpo": 0.5626444816589355, "losses/sft": 1.5801838636398315, "losses/total": 0.5626444816589355, "ref_logps/chosen": -37.548030853271484, "ref_logps/rejected": -41.92090606689453, "rewards/accuracies": 0.625, "rewards/chosen": -0.7031550407409668, "rewards/margins": 0.2905934154987335, "rewards/rejected": -0.9937484264373779, "step": 187 }, { "epoch": 1.42, "grad_norm": 6.7646034429768385, "learning_rate": 2.921348314606741e-07, "logps/chosen": -38.493064880371094, "logps/rejected": -52.74738693237305, "loss": 0.5673, "losses/dpo": 0.5454456210136414, "losses/sft": 1.4984629154205322, "losses/total": 0.5454456210136414, "ref_logps/chosen": -31.663148880004883, "ref_logps/rejected": -42.071197509765625, "rewards/accuracies": 0.6953125, "rewards/chosen": -0.6829913854598999, "rewards/margins": 0.38462772965431213, "rewards/rejected": -1.0676190853118896, "step": 188 }, { "epoch": 1.43, "grad_norm": 6.461217065826113, "learning_rate": 2.907303370786517e-07, "logps/chosen": -41.286556243896484, "logps/rejected": -50.088348388671875, "loss": 0.5702, "losses/dpo": 0.5214348435401917, "losses/sft": 1.4683022499084473, "losses/total": 0.5214348435401917, "ref_logps/chosen": -35.07402038574219, "ref_logps/rejected": -40.096168518066406, "rewards/accuracies": 0.75, "rewards/chosen": -0.6212539672851562, "rewards/margins": 0.377963662147522, "rewards/rejected": -0.9992176294326782, "step": 189 }, { "epoch": 1.43, "grad_norm": 7.005927567034465, "learning_rate": 2.893258426966292e-07, "logps/chosen": -40.85224533081055, "logps/rejected": -47.50454330444336, "loss": 0.6468, "losses/dpo": 0.5624043941497803, "losses/sft": 1.3726718425750732, "losses/total": 0.5624043941497803, "ref_logps/chosen": -34.436492919921875, "ref_logps/rejected": -39.054630279541016, "rewards/accuracies": 0.6328125, "rewards/chosen": -0.6415754556655884, "rewards/margins": 0.2034158706665039, "rewards/rejected": -0.8449913263320923, "step": 190 }, { "epoch": 1.44, "grad_norm": 7.483874035437063, "learning_rate": 2.8792134831460674e-07, "logps/chosen": -42.50736999511719, "logps/rejected": -58.21019744873047, "loss": 0.5523, "losses/dpo": 0.6698145270347595, "losses/sft": 1.5408368110656738, "losses/total": 0.6698145270347595, "ref_logps/chosen": -36.81276321411133, "ref_logps/rejected": -48.2528076171875, "rewards/accuracies": 0.703125, "rewards/chosen": -0.5694608688354492, "rewards/margins": 0.4262778162956238, "rewards/rejected": -0.995738685131073, "step": 191 }, { "epoch": 1.45, "grad_norm": 6.9825083587199, "learning_rate": 2.8651685393258425e-07, "logps/chosen": -45.57709503173828, "logps/rejected": -53.14373016357422, "loss": 0.5855, "losses/dpo": 0.557357132434845, "losses/sft": 1.6354026794433594, "losses/total": 0.557357132434845, "ref_logps/chosen": -39.346527099609375, "ref_logps/rejected": -43.652854919433594, "rewards/accuracies": 0.71875, "rewards/chosen": -0.6230565309524536, "rewards/margins": 0.32603132724761963, "rewards/rejected": -0.9490878582000732, "step": 192 }, { "epoch": 1.46, "grad_norm": 6.9248793596215314, "learning_rate": 2.851123595505618e-07, "logps/chosen": -41.4918098449707, "logps/rejected": -51.642330169677734, "loss": 0.614, "losses/dpo": 0.5539823770523071, "losses/sft": 1.4280143976211548, "losses/total": 0.5539823770523071, "ref_logps/chosen": -34.640289306640625, "ref_logps/rejected": -42.01956558227539, "rewards/accuracies": 0.6484375, "rewards/chosen": -0.6851522326469421, "rewards/margins": 0.2771243751049042, "rewards/rejected": -0.9622765779495239, "step": 193 }, { "epoch": 1.46, "grad_norm": 7.819580369685109, "learning_rate": 2.8370786516853936e-07, "logps/chosen": -45.17947769165039, "logps/rejected": -54.26673126220703, "loss": 0.5983, "losses/dpo": 0.5532131195068359, "losses/sft": 1.5786592960357666, "losses/total": 0.5532131195068359, "ref_logps/chosen": -38.547950744628906, "ref_logps/rejected": -44.515296936035156, "rewards/accuracies": 0.6875, "rewards/chosen": -0.6631526350975037, "rewards/margins": 0.3119913339614868, "rewards/rejected": -0.9751439094543457, "step": 194 }, { "epoch": 1.47, "grad_norm": 6.908089828532824, "learning_rate": 2.823033707865168e-07, "logps/chosen": -39.231468200683594, "logps/rejected": -55.21925735473633, "loss": 0.5648, "losses/dpo": 0.5973429083824158, "losses/sft": 1.6660652160644531, "losses/total": 0.5973429083824158, "ref_logps/chosen": -32.866737365722656, "ref_logps/rejected": -44.704856872558594, "rewards/accuracies": 0.7421875, "rewards/chosen": -0.6364729404449463, "rewards/margins": 0.4149664640426636, "rewards/rejected": -1.0514394044876099, "step": 195 }, { "epoch": 1.48, "grad_norm": 6.726944591334497, "learning_rate": 2.8089887640449437e-07, "logps/chosen": -40.06050109863281, "logps/rejected": -53.288673400878906, "loss": 0.5791, "losses/dpo": 0.5540711879730225, "losses/sft": 1.7805967330932617, "losses/total": 0.5540711879730225, "ref_logps/chosen": -33.56330490112305, "ref_logps/rejected": -42.997859954833984, "rewards/accuracies": 0.6875, "rewards/chosen": -0.649719774723053, "rewards/margins": 0.37936151027679443, "rewards/rejected": -1.0290813446044922, "step": 196 }, { "epoch": 1.49, "grad_norm": 7.150851904029176, "learning_rate": 2.794943820224719e-07, "logps/chosen": -47.1893424987793, "logps/rejected": -61.44281005859375, "loss": 0.5702, "losses/dpo": 0.7234626412391663, "losses/sft": 1.6843864917755127, "losses/total": 0.7234626412391663, "ref_logps/chosen": -39.28502655029297, "ref_logps/rejected": -49.627784729003906, "rewards/accuracies": 0.703125, "rewards/chosen": -0.7904319763183594, "rewards/margins": 0.39107024669647217, "rewards/rejected": -1.1815022230148315, "step": 197 }, { "epoch": 1.49, "grad_norm": 7.427853846385361, "learning_rate": 2.7808988764044943e-07, "logps/chosen": -43.90837097167969, "logps/rejected": -49.889678955078125, "loss": 0.6097, "losses/dpo": 0.5974606275558472, "losses/sft": 1.7023361921310425, "losses/total": 0.5974606275558472, "ref_logps/chosen": -36.68721008300781, "ref_logps/rejected": -39.80302047729492, "rewards/accuracies": 0.625, "rewards/chosen": -0.722116231918335, "rewards/margins": 0.2865493595600128, "rewards/rejected": -1.0086655616760254, "step": 198 }, { "epoch": 1.5, "grad_norm": 6.720052652716852, "learning_rate": 2.7668539325842694e-07, "logps/chosen": -40.47029495239258, "logps/rejected": -52.58824157714844, "loss": 0.5673, "losses/dpo": 0.5275993347167969, "losses/sft": 1.4116981029510498, "losses/total": 0.5275993347167969, "ref_logps/chosen": -33.68723678588867, "ref_logps/rejected": -42.008827209472656, "rewards/accuracies": 0.6875, "rewards/chosen": -0.6783058643341064, "rewards/margins": 0.3796355426311493, "rewards/rejected": -1.0579413175582886, "step": 199 }, { "epoch": 1.51, "grad_norm": 7.464608685292226, "learning_rate": 2.752808988764045e-07, "logps/chosen": -46.24801254272461, "logps/rejected": -54.933780670166016, "loss": 0.61, "losses/dpo": 0.6066948771476746, "losses/sft": 1.6309008598327637, "losses/total": 0.6066948771476746, "ref_logps/chosen": -38.81559753417969, "ref_logps/rejected": -44.58855438232422, "rewards/accuracies": 0.6953125, "rewards/chosen": -0.743241548538208, "rewards/margins": 0.2912812829017639, "rewards/rejected": -1.0345228910446167, "step": 200 }, { "epoch": 1.52, "grad_norm": 7.360337757301619, "learning_rate": 2.73876404494382e-07, "logps/chosen": -42.876792907714844, "logps/rejected": -50.461334228515625, "loss": 0.6213, "losses/dpo": 0.6310982704162598, "losses/sft": 1.441427230834961, "losses/total": 0.6310982704162598, "ref_logps/chosen": -36.28274917602539, "ref_logps/rejected": -41.36058044433594, "rewards/accuracies": 0.6875, "rewards/chosen": -0.6594043970108032, "rewards/margins": 0.2506704330444336, "rewards/rejected": -0.9100748300552368, "step": 201 }, { "epoch": 1.52, "grad_norm": 7.43302729298079, "learning_rate": 2.7247191011235955e-07, "logps/chosen": -43.45911407470703, "logps/rejected": -50.20298385620117, "loss": 0.5552, "losses/dpo": 0.5599596500396729, "losses/sft": 1.4739470481872559, "losses/total": 0.5599596500396729, "ref_logps/chosen": -37.075191497802734, "ref_logps/rejected": -39.420005798339844, "rewards/accuracies": 0.765625, "rewards/chosen": -0.6383919715881348, "rewards/margins": 0.4399053752422333, "rewards/rejected": -1.0782973766326904, "step": 202 }, { "epoch": 1.53, "grad_norm": 7.05544065339559, "learning_rate": 2.710674157303371e-07, "logps/chosen": -48.13520050048828, "logps/rejected": -55.488975524902344, "loss": 0.5683, "losses/dpo": 0.5512528419494629, "losses/sft": 1.421828269958496, "losses/total": 0.5512528419494629, "ref_logps/chosen": -40.806800842285156, "ref_logps/rejected": -44.204917907714844, "rewards/accuracies": 0.75, "rewards/chosen": -0.7328400611877441, "rewards/margins": 0.39556559920310974, "rewards/rejected": -1.1284055709838867, "step": 203 }, { "epoch": 1.54, "grad_norm": 7.072397811821575, "learning_rate": 2.6966292134831456e-07, "logps/chosen": -45.89094161987305, "logps/rejected": -56.12247085571289, "loss": 0.5687, "losses/dpo": 0.5942108035087585, "losses/sft": 1.671670913696289, "losses/total": 0.5942108035087585, "ref_logps/chosen": -38.66583251953125, "ref_logps/rejected": -45.235042572021484, "rewards/accuracies": 0.734375, "rewards/chosen": -0.7225111722946167, "rewards/margins": 0.3662317395210266, "rewards/rejected": -1.088742971420288, "step": 204 }, { "epoch": 1.55, "grad_norm": 7.135519070946246, "learning_rate": 2.682584269662921e-07, "logps/chosen": -44.51463317871094, "logps/rejected": -53.46598434448242, "loss": 0.5668, "losses/dpo": 0.5319070816040039, "losses/sft": 1.5628294944763184, "losses/total": 0.5319070816040039, "ref_logps/chosen": -37.279029846191406, "ref_logps/rejected": -42.57787322998047, "rewards/accuracies": 0.71875, "rewards/chosen": -0.7235599756240845, "rewards/margins": 0.36525097489356995, "rewards/rejected": -1.0888110399246216, "step": 205 }, { "epoch": 1.55, "grad_norm": 7.107636740157782, "learning_rate": 2.668539325842696e-07, "logps/chosen": -43.40117645263672, "logps/rejected": -54.69598388671875, "loss": 0.5524, "losses/dpo": 0.5264509320259094, "losses/sft": 1.5363452434539795, "losses/total": 0.5264509320259094, "ref_logps/chosen": -36.371551513671875, "ref_logps/rejected": -43.2625732421875, "rewards/accuracies": 0.78125, "rewards/chosen": -0.7029624581336975, "rewards/margins": 0.44037845730781555, "rewards/rejected": -1.1433409452438354, "step": 206 }, { "epoch": 1.56, "grad_norm": 7.289414925759057, "learning_rate": 2.654494382022472e-07, "logps/chosen": -42.475379943847656, "logps/rejected": -49.646728515625, "loss": 0.6046, "losses/dpo": 0.6572248935699463, "losses/sft": 1.6387099027633667, "losses/total": 0.6572248935699463, "ref_logps/chosen": -35.308807373046875, "ref_logps/rejected": -39.25323486328125, "rewards/accuracies": 0.671875, "rewards/chosen": -0.7166574597358704, "rewards/margins": 0.32269221544265747, "rewards/rejected": -1.0393496751785278, "step": 207 }, { "epoch": 1.57, "grad_norm": 7.273876435078599, "learning_rate": 2.640449438202247e-07, "logps/chosen": -42.280967712402344, "logps/rejected": -48.80766296386719, "loss": 0.6213, "losses/dpo": 0.5971169471740723, "losses/sft": 1.7042605876922607, "losses/total": 0.5971169471740723, "ref_logps/chosen": -35.188377380371094, "ref_logps/rejected": -39.048980712890625, "rewards/accuracies": 0.6015625, "rewards/chosen": -0.7092592716217041, "rewards/margins": 0.2666093707084656, "rewards/rejected": -0.9758686423301697, "step": 208 }, { "epoch": 1.58, "grad_norm": 7.389043621661051, "learning_rate": 2.6264044943820224e-07, "logps/chosen": -43.01720428466797, "logps/rejected": -52.86360549926758, "loss": 0.6004, "losses/dpo": 0.5311284065246582, "losses/sft": 1.673902988433838, "losses/total": 0.5311284065246582, "ref_logps/chosen": -35.34561538696289, "ref_logps/rejected": -41.90196228027344, "rewards/accuracies": 0.734375, "rewards/chosen": -0.7671589851379395, "rewards/margins": 0.3290054500102997, "rewards/rejected": -1.096164345741272, "step": 209 }, { "epoch": 1.58, "grad_norm": 7.051856361062949, "learning_rate": 2.612359550561798e-07, "logps/chosen": -43.410194396972656, "logps/rejected": -56.95100784301758, "loss": 0.5527, "losses/dpo": 0.494179904460907, "losses/sft": 1.3610440492630005, "losses/total": 0.494179904460907, "ref_logps/chosen": -36.352073669433594, "ref_logps/rejected": -45.52418518066406, "rewards/accuracies": 0.7109375, "rewards/chosen": -0.7058122754096985, "rewards/margins": 0.4368700683116913, "rewards/rejected": -1.1426823139190674, "step": 210 }, { "epoch": 1.59, "grad_norm": 7.075204680484654, "learning_rate": 2.598314606741573e-07, "logps/chosen": -44.7838249206543, "logps/rejected": -52.02484130859375, "loss": 0.6038, "losses/dpo": 0.5901740193367004, "losses/sft": 1.7182517051696777, "losses/total": 0.5901740193367004, "ref_logps/chosen": -37.16931915283203, "ref_logps/rejected": -41.17018127441406, "rewards/accuracies": 0.671875, "rewards/chosen": -0.7614503502845764, "rewards/margins": 0.32401591539382935, "rewards/rejected": -1.0854662656784058, "step": 211 }, { "epoch": 1.6, "grad_norm": 7.444039226905721, "learning_rate": 2.5842696629213486e-07, "logps/chosen": -41.19989776611328, "logps/rejected": -49.64472961425781, "loss": 0.5961, "losses/dpo": 0.5703378319740295, "losses/sft": 1.288915753364563, "losses/total": 0.5703378319740295, "ref_logps/chosen": -34.397457122802734, "ref_logps/rejected": -39.43400573730469, "rewards/accuracies": 0.71875, "rewards/chosen": -0.6802438497543335, "rewards/margins": 0.34082797169685364, "rewards/rejected": -1.0210717916488647, "step": 212 }, { "epoch": 1.61, "grad_norm": 7.3272234640712455, "learning_rate": 2.5702247191011236e-07, "logps/chosen": -49.161766052246094, "logps/rejected": -55.671295166015625, "loss": 0.5862, "losses/dpo": 0.7499480843544006, "losses/sft": 1.8793140649795532, "losses/total": 0.7499480843544006, "ref_logps/chosen": -41.297489166259766, "ref_logps/rejected": -44.11161422729492, "rewards/accuracies": 0.6875, "rewards/chosen": -0.7864278554916382, "rewards/margins": 0.3695400655269623, "rewards/rejected": -1.1559679508209229, "step": 213 }, { "epoch": 1.62, "grad_norm": 7.817780320058273, "learning_rate": 2.5561797752808987e-07, "logps/chosen": -46.763206481933594, "logps/rejected": -53.703033447265625, "loss": 0.6266, "losses/dpo": 0.48047423362731934, "losses/sft": 1.5680122375488281, "losses/total": 0.48047423362731934, "ref_logps/chosen": -39.49818801879883, "ref_logps/rejected": -43.53962326049805, "rewards/accuracies": 0.6640625, "rewards/chosen": -0.7265015840530396, "rewards/margins": 0.28983935713768005, "rewards/rejected": -1.016340970993042, "step": 214 }, { "epoch": 1.62, "grad_norm": 7.508782666466954, "learning_rate": 2.5421348314606737e-07, "logps/chosen": -47.78954315185547, "logps/rejected": -56.90927505493164, "loss": 0.5628, "losses/dpo": 0.537736177444458, "losses/sft": 1.6823458671569824, "losses/total": 0.537736177444458, "ref_logps/chosen": -40.28362274169922, "ref_logps/rejected": -45.37610626220703, "rewards/accuracies": 0.7265625, "rewards/chosen": -0.7505923509597778, "rewards/margins": 0.4027244448661804, "rewards/rejected": -1.1533167362213135, "step": 215 }, { "epoch": 1.63, "grad_norm": 7.806242305852612, "learning_rate": 2.5280898876404493e-07, "logps/chosen": -47.2044677734375, "logps/rejected": -58.619651794433594, "loss": 0.5899, "losses/dpo": 0.6399192214012146, "losses/sft": 1.363295316696167, "losses/total": 0.6399192214012146, "ref_logps/chosen": -39.20256423950195, "ref_logps/rejected": -46.85260772705078, "rewards/accuracies": 0.71875, "rewards/chosen": -0.8001901507377625, "rewards/margins": 0.3765140473842621, "rewards/rejected": -1.1767041683197021, "step": 216 }, { "epoch": 1.64, "grad_norm": 6.621098009181271, "learning_rate": 2.5140449438202243e-07, "logps/chosen": -36.010169982910156, "logps/rejected": -48.608699798583984, "loss": 0.5501, "losses/dpo": 0.5600734949111938, "losses/sft": 1.3302438259124756, "losses/total": 0.5600734949111938, "ref_logps/chosen": -29.427637100219727, "ref_logps/rejected": -37.72674560546875, "rewards/accuracies": 0.7421875, "rewards/chosen": -0.6582531929016113, "rewards/margins": 0.4299423098564148, "rewards/rejected": -1.088195562362671, "step": 217 }, { "epoch": 1.65, "grad_norm": 7.0657521689003735, "learning_rate": 2.5e-07, "logps/chosen": -42.20947265625, "logps/rejected": -53.40728759765625, "loss": 0.5808, "losses/dpo": 0.5706441402435303, "losses/sft": 1.390072226524353, "losses/total": 0.5706441402435303, "ref_logps/chosen": -34.44993591308594, "ref_logps/rejected": -41.737003326416016, "rewards/accuracies": 0.6796875, "rewards/chosen": -0.7759537696838379, "rewards/margins": 0.3910742402076721, "rewards/rejected": -1.1670279502868652, "step": 218 }, { "epoch": 1.65, "grad_norm": 7.920819614767415, "learning_rate": 2.485955056179775e-07, "logps/chosen": -46.45621109008789, "logps/rejected": -53.37653350830078, "loss": 0.6258, "losses/dpo": 0.5177885293960571, "losses/sft": 1.4505321979522705, "losses/total": 0.5177885293960571, "ref_logps/chosen": -38.40946960449219, "ref_logps/rejected": -42.48009490966797, "rewards/accuracies": 0.6328125, "rewards/chosen": -0.8046744465827942, "rewards/margins": 0.2849688231945038, "rewards/rejected": -1.0896432399749756, "step": 219 }, { "epoch": 1.66, "grad_norm": 6.775480921328623, "learning_rate": 2.4719101123595505e-07, "logps/chosen": -43.19866943359375, "logps/rejected": -51.142852783203125, "loss": 0.5708, "losses/dpo": 0.631821870803833, "losses/sft": 1.687159776687622, "losses/total": 0.631821870803833, "ref_logps/chosen": -35.93583297729492, "ref_logps/rejected": -40.11430358886719, "rewards/accuracies": 0.6953125, "rewards/chosen": -0.7262836694717407, "rewards/margins": 0.3765709102153778, "rewards/rejected": -1.102854609489441, "step": 220 }, { "epoch": 1.67, "grad_norm": 7.00534024427554, "learning_rate": 2.4578651685393255e-07, "logps/chosen": -43.32523727416992, "logps/rejected": -52.18841552734375, "loss": 0.56, "losses/dpo": 0.5959673523902893, "losses/sft": 1.5886725187301636, "losses/total": 0.5959673523902893, "ref_logps/chosen": -35.884517669677734, "ref_logps/rejected": -40.54563522338867, "rewards/accuracies": 0.6953125, "rewards/chosen": -0.7440718412399292, "rewards/margins": 0.42020630836486816, "rewards/rejected": -1.1642781496047974, "step": 221 }, { "epoch": 1.68, "grad_norm": 7.150852349968827, "learning_rate": 2.443820224719101e-07, "logps/chosen": -42.47400665283203, "logps/rejected": -53.8497314453125, "loss": 0.5456, "losses/dpo": 0.5008928775787354, "losses/sft": 1.4967145919799805, "losses/total": 0.5008928775787354, "ref_logps/chosen": -35.4063720703125, "ref_logps/rejected": -41.911190032958984, "rewards/accuracies": 0.78125, "rewards/chosen": -0.7067632675170898, "rewards/margins": 0.48709067702293396, "rewards/rejected": -1.1938539743423462, "step": 222 }, { "epoch": 1.68, "grad_norm": 7.009242529601585, "learning_rate": 2.429775280898876e-07, "logps/chosen": -42.480735778808594, "logps/rejected": -56.96538543701172, "loss": 0.5687, "losses/dpo": 0.5494006872177124, "losses/sft": 1.660073161125183, "losses/total": 0.5494006872177124, "ref_logps/chosen": -34.94923400878906, "ref_logps/rejected": -45.03327178955078, "rewards/accuracies": 0.6875, "rewards/chosen": -0.753150224685669, "rewards/margins": 0.44006073474884033, "rewards/rejected": -1.1932109594345093, "step": 223 }, { "epoch": 1.69, "grad_norm": 7.145198782494123, "learning_rate": 2.4157303370786517e-07, "logps/chosen": -46.52253341674805, "logps/rejected": -56.88560485839844, "loss": 0.5578, "losses/dpo": 0.6753450632095337, "losses/sft": 1.733784556388855, "losses/total": 0.6753450632095337, "ref_logps/chosen": -38.87651062011719, "ref_logps/rejected": -44.95452117919922, "rewards/accuracies": 0.6953125, "rewards/chosen": -0.7646023035049438, "rewards/margins": 0.428506076335907, "rewards/rejected": -1.193108320236206, "step": 224 }, { "epoch": 1.7, "grad_norm": 7.2140738897995895, "learning_rate": 2.401685393258427e-07, "logps/chosen": -44.606842041015625, "logps/rejected": -51.53977966308594, "loss": 0.5755, "losses/dpo": 0.6050464510917664, "losses/sft": 1.4844509363174438, "losses/total": 0.6050464510917664, "ref_logps/chosen": -37.366485595703125, "ref_logps/rejected": -40.17390441894531, "rewards/accuracies": 0.703125, "rewards/chosen": -0.7240351438522339, "rewards/margins": 0.41255253553390503, "rewards/rejected": -1.1365876197814941, "step": 225 }, { "epoch": 1.71, "grad_norm": 7.316056461598082, "learning_rate": 2.3876404494382023e-07, "logps/chosen": -43.40976333618164, "logps/rejected": -51.119468688964844, "loss": 0.5856, "losses/dpo": 0.6437182426452637, "losses/sft": 1.6879228353500366, "losses/total": 0.6437182426452637, "ref_logps/chosen": -35.9763298034668, "ref_logps/rejected": -39.941932678222656, "rewards/accuracies": 0.7734375, "rewards/chosen": -0.7433432936668396, "rewards/margins": 0.37441009283065796, "rewards/rejected": -1.1177533864974976, "step": 226 }, { "epoch": 1.71, "grad_norm": 7.078331857989057, "learning_rate": 2.3735955056179774e-07, "logps/chosen": -45.81120681762695, "logps/rejected": -50.85576629638672, "loss": 0.6076, "losses/dpo": 0.7625922560691833, "losses/sft": 1.5723658800125122, "losses/total": 0.7625922560691833, "ref_logps/chosen": -38.21784973144531, "ref_logps/rejected": -40.007240295410156, "rewards/accuracies": 0.6953125, "rewards/chosen": -0.7593356966972351, "rewards/margins": 0.32551684975624084, "rewards/rejected": -1.0848525762557983, "step": 227 }, { "epoch": 1.72, "grad_norm": 7.206138039626543, "learning_rate": 2.3595505617977527e-07, "logps/chosen": -43.7403450012207, "logps/rejected": -52.108604431152344, "loss": 0.5922, "losses/dpo": 0.5139514803886414, "losses/sft": 1.6670148372650146, "losses/total": 0.5139514803886414, "ref_logps/chosen": -35.650115966796875, "ref_logps/rejected": -40.582130432128906, "rewards/accuracies": 0.6796875, "rewards/chosen": -0.8090231418609619, "rewards/margins": 0.34362420439720154, "rewards/rejected": -1.1526473760604858, "step": 228 }, { "epoch": 1.73, "grad_norm": 7.61900579513634, "learning_rate": 2.345505617977528e-07, "logps/chosen": -42.43614959716797, "logps/rejected": -52.779483795166016, "loss": 0.5781, "losses/dpo": 0.4573014974594116, "losses/sft": 1.5003488063812256, "losses/total": 0.4573014974594116, "ref_logps/chosen": -34.90065002441406, "ref_logps/rejected": -41.4276237487793, "rewards/accuracies": 0.7109375, "rewards/chosen": -0.7535501718521118, "rewards/margins": 0.38163578510284424, "rewards/rejected": -1.135185956954956, "step": 229 }, { "epoch": 1.74, "grad_norm": 7.829509763007773, "learning_rate": 2.331460674157303e-07, "logps/chosen": -47.7276496887207, "logps/rejected": -56.36402893066406, "loss": 0.5302, "losses/dpo": 0.529563307762146, "losses/sft": 1.6256301403045654, "losses/total": 0.529563307762146, "ref_logps/chosen": -40.326351165771484, "ref_logps/rejected": -43.704612731933594, "rewards/accuracies": 0.796875, "rewards/chosen": -0.740129828453064, "rewards/margins": 0.5258119702339172, "rewards/rejected": -1.2659417390823364, "step": 230 }, { "epoch": 1.74, "grad_norm": 8.06687120109026, "learning_rate": 2.3174157303370786e-07, "logps/chosen": -44.74425506591797, "logps/rejected": -55.536312103271484, "loss": 0.561, "losses/dpo": 0.4758527874946594, "losses/sft": 1.3779159784317017, "losses/total": 0.4758527874946594, "ref_logps/chosen": -37.057281494140625, "ref_logps/rejected": -43.34575653076172, "rewards/accuracies": 0.6953125, "rewards/chosen": -0.7686972618103027, "rewards/margins": 0.45035821199417114, "rewards/rejected": -1.2190555334091187, "step": 231 }, { "epoch": 1.75, "grad_norm": 7.291686332994008, "learning_rate": 2.303370786516854e-07, "logps/chosen": -43.00548553466797, "logps/rejected": -54.49897003173828, "loss": 0.5834, "losses/dpo": 0.5421339273452759, "losses/sft": 1.4051011800765991, "losses/total": 0.5421339273452759, "ref_logps/chosen": -35.81233215332031, "ref_logps/rejected": -43.77638244628906, "rewards/accuracies": 0.703125, "rewards/chosen": -0.7193150520324707, "rewards/margins": 0.35294392704963684, "rewards/rejected": -1.0722589492797852, "step": 232 }, { "epoch": 1.76, "grad_norm": 6.669119014424567, "learning_rate": 2.2893258426966292e-07, "logps/chosen": -42.595909118652344, "logps/rejected": -50.517574310302734, "loss": 0.5942, "losses/dpo": 0.5890272855758667, "losses/sft": 1.3421604633331299, "losses/total": 0.5890272855758667, "ref_logps/chosen": -35.23419189453125, "ref_logps/rejected": -39.80196762084961, "rewards/accuracies": 0.640625, "rewards/chosen": -0.7361720204353333, "rewards/margins": 0.33538877964019775, "rewards/rejected": -1.0715608596801758, "step": 233 }, { "epoch": 1.77, "grad_norm": 7.634998140383259, "learning_rate": 2.2752808988764045e-07, "logps/chosen": -48.97822189331055, "logps/rejected": -55.01988983154297, "loss": 0.6041, "losses/dpo": 0.4961914122104645, "losses/sft": 1.6347143650054932, "losses/total": 0.4961914122104645, "ref_logps/chosen": -40.457027435302734, "ref_logps/rejected": -43.25347900390625, "rewards/accuracies": 0.65625, "rewards/chosen": -0.8521193265914917, "rewards/margins": 0.3245222866535187, "rewards/rejected": -1.1766417026519775, "step": 234 }, { "epoch": 1.77, "grad_norm": 7.773471295460603, "learning_rate": 2.2612359550561795e-07, "logps/chosen": -46.672576904296875, "logps/rejected": -54.91902542114258, "loss": 0.5883, "losses/dpo": 0.5112382173538208, "losses/sft": 1.6069546937942505, "losses/total": 0.5112382173538208, "ref_logps/chosen": -38.114097595214844, "ref_logps/rejected": -42.68096160888672, "rewards/accuracies": 0.7421875, "rewards/chosen": -0.8558481931686401, "rewards/margins": 0.36795809864997864, "rewards/rejected": -1.2238062620162964, "step": 235 }, { "epoch": 1.78, "grad_norm": 6.9224359595021925, "learning_rate": 2.2471910112359549e-07, "logps/chosen": -43.26789855957031, "logps/rejected": -49.846065521240234, "loss": 0.5976, "losses/dpo": 0.530718207359314, "losses/sft": 1.4825395345687866, "losses/total": 0.530718207359314, "ref_logps/chosen": -35.522216796875, "ref_logps/rejected": -38.95735168457031, "rewards/accuracies": 0.671875, "rewards/chosen": -0.7745683193206787, "rewards/margins": 0.3143025040626526, "rewards/rejected": -1.088870882987976, "step": 236 }, { "epoch": 1.79, "grad_norm": 7.5835387702946075, "learning_rate": 2.2331460674157302e-07, "logps/chosen": -44.358116149902344, "logps/rejected": -57.51253890991211, "loss": 0.5667, "losses/dpo": 0.4763038754463196, "losses/sft": 1.4994385242462158, "losses/total": 0.4763038754463196, "ref_logps/chosen": -36.64021682739258, "ref_logps/rejected": -45.455902099609375, "rewards/accuracies": 0.71875, "rewards/chosen": -0.7717897295951843, "rewards/margins": 0.4338740408420563, "rewards/rejected": -1.205663800239563, "step": 237 }, { "epoch": 1.8, "grad_norm": 6.860345248290717, "learning_rate": 2.2191011235955055e-07, "logps/chosen": -43.448211669921875, "logps/rejected": -52.67967224121094, "loss": 0.5841, "losses/dpo": 0.5856455564498901, "losses/sft": 1.5493735074996948, "losses/total": 0.5856455564498901, "ref_logps/chosen": -35.36868667602539, "ref_logps/rejected": -40.977325439453125, "rewards/accuracies": 0.65625, "rewards/chosen": -0.8079524040222168, "rewards/margins": 0.36228203773498535, "rewards/rejected": -1.1702344417572021, "step": 238 }, { "epoch": 1.8, "grad_norm": 6.877362645097382, "learning_rate": 2.205056179775281e-07, "logps/chosen": -43.96727752685547, "logps/rejected": -54.08544921875, "loss": 0.561, "losses/dpo": 0.7524189352989197, "losses/sft": 1.4943475723266602, "losses/total": 0.7524189352989197, "ref_logps/chosen": -36.482398986816406, "ref_logps/rejected": -42.11739730834961, "rewards/accuracies": 0.7109375, "rewards/chosen": -0.7484874725341797, "rewards/margins": 0.44831788539886475, "rewards/rejected": -1.1968053579330444, "step": 239 }, { "epoch": 1.81, "grad_norm": 8.176287850888796, "learning_rate": 2.191011235955056e-07, "logps/chosen": -44.187679290771484, "logps/rejected": -52.56245422363281, "loss": 0.6215, "losses/dpo": 0.5882298946380615, "losses/sft": 1.509756326675415, "losses/total": 0.5882298946380615, "ref_logps/chosen": -35.26359558105469, "ref_logps/rejected": -40.418216705322266, "rewards/accuracies": 0.6640625, "rewards/chosen": -0.8924084901809692, "rewards/margins": 0.32201528549194336, "rewards/rejected": -1.2144238948822021, "step": 240 }, { "epoch": 1.82, "grad_norm": 6.748177391589757, "learning_rate": 2.1769662921348314e-07, "logps/chosen": -44.592193603515625, "logps/rejected": -54.5892219543457, "loss": 0.5454, "losses/dpo": 0.5010501742362976, "losses/sft": 1.692970871925354, "losses/total": 0.5010501742362976, "ref_logps/chosen": -36.254737854003906, "ref_logps/rejected": -41.49382019042969, "rewards/accuracies": 0.7421875, "rewards/chosen": -0.8337457180023193, "rewards/margins": 0.47579440474510193, "rewards/rejected": -1.3095402717590332, "step": 241 }, { "epoch": 1.83, "grad_norm": 7.9524609339083385, "learning_rate": 2.1629213483146067e-07, "logps/chosen": -49.09219741821289, "logps/rejected": -55.76482391357422, "loss": 0.6033, "losses/dpo": 0.5440762042999268, "losses/sft": 1.7360166311264038, "losses/total": 0.5440762042999268, "ref_logps/chosen": -39.933494567871094, "ref_logps/rejected": -43.29194259643555, "rewards/accuracies": 0.6875, "rewards/chosen": -0.915870189666748, "rewards/margins": 0.33141782879829407, "rewards/rejected": -1.2472879886627197, "step": 242 }, { "epoch": 1.83, "grad_norm": 7.715847211838437, "learning_rate": 2.148876404494382e-07, "logps/chosen": -44.42055130004883, "logps/rejected": -50.31025314331055, "loss": 0.6419, "losses/dpo": 0.6423018574714661, "losses/sft": 1.8698339462280273, "losses/total": 0.6423018574714661, "ref_logps/chosen": -36.14445114135742, "ref_logps/rejected": -39.49055480957031, "rewards/accuracies": 0.640625, "rewards/chosen": -0.8276099562644958, "rewards/margins": 0.2543600797653198, "rewards/rejected": -1.081969976425171, "step": 243 }, { "epoch": 1.84, "grad_norm": 7.717852694753547, "learning_rate": 2.134831460674157e-07, "logps/chosen": -45.71333694458008, "logps/rejected": -56.663360595703125, "loss": 0.5667, "losses/dpo": 0.578036904335022, "losses/sft": 1.548266053199768, "losses/total": 0.578036904335022, "ref_logps/chosen": -37.3514404296875, "ref_logps/rejected": -44.125831604003906, "rewards/accuracies": 0.7109375, "rewards/chosen": -0.8361901044845581, "rewards/margins": 0.4175630807876587, "rewards/rejected": -1.2537531852722168, "step": 244 }, { "epoch": 1.85, "grad_norm": 7.217778469739938, "learning_rate": 2.1207865168539323e-07, "logps/chosen": -47.56121826171875, "logps/rejected": -55.15635299682617, "loss": 0.584, "losses/dpo": 0.6499341726303101, "losses/sft": 1.8146308660507202, "losses/total": 0.6499341726303101, "ref_logps/chosen": -39.167484283447266, "ref_logps/rejected": -43.06206130981445, "rewards/accuracies": 0.71875, "rewards/chosen": -0.8393731117248535, "rewards/margins": 0.37005579471588135, "rewards/rejected": -1.2094289064407349, "step": 245 }, { "epoch": 1.86, "grad_norm": 7.706531034729977, "learning_rate": 2.1067415730337076e-07, "logps/chosen": -45.61647415161133, "logps/rejected": -55.11760330200195, "loss": 0.616, "losses/dpo": 0.45806318521499634, "losses/sft": 1.4561158418655396, "losses/total": 0.45806318521499634, "ref_logps/chosen": -36.999488830566406, "ref_logps/rejected": -43.26171112060547, "rewards/accuracies": 0.6875, "rewards/chosen": -0.8616988062858582, "rewards/margins": 0.3238902986049652, "rewards/rejected": -1.185589075088501, "step": 246 }, { "epoch": 1.86, "grad_norm": 7.805010195142929, "learning_rate": 2.0926966292134832e-07, "logps/chosen": -43.88758087158203, "logps/rejected": -54.20425033569336, "loss": 0.5977, "losses/dpo": 0.6152101755142212, "losses/sft": 1.5027949810028076, "losses/total": 0.6152101755142212, "ref_logps/chosen": -35.95214080810547, "ref_logps/rejected": -42.84498977661133, "rewards/accuracies": 0.6875, "rewards/chosen": -0.7935442924499512, "rewards/margins": 0.3423812687397003, "rewards/rejected": -1.135925531387329, "step": 247 }, { "epoch": 1.87, "grad_norm": 7.406497027707841, "learning_rate": 2.0786516853932585e-07, "logps/chosen": -45.788818359375, "logps/rejected": -53.047203063964844, "loss": 0.5831, "losses/dpo": 0.4654901325702667, "losses/sft": 1.479446291923523, "losses/total": 0.4654901325702667, "ref_logps/chosen": -37.57151794433594, "ref_logps/rejected": -41.39836883544922, "rewards/accuracies": 0.6484375, "rewards/chosen": -0.8217304348945618, "rewards/margins": 0.34315240383148193, "rewards/rejected": -1.1648828983306885, "step": 248 }, { "epoch": 1.88, "grad_norm": 7.682816189246604, "learning_rate": 2.0646067415730336e-07, "logps/chosen": -45.08941650390625, "logps/rejected": -56.03681182861328, "loss": 0.6254, "losses/dpo": 0.6420303583145142, "losses/sft": 1.767283320426941, "losses/total": 0.6420303583145142, "ref_logps/chosen": -35.47539138793945, "ref_logps/rejected": -43.610809326171875, "rewards/accuracies": 0.7109375, "rewards/chosen": -0.9614025950431824, "rewards/margins": 0.281198114156723, "rewards/rejected": -1.242600679397583, "step": 249 }, { "epoch": 1.89, "grad_norm": 7.381909400967013, "learning_rate": 2.0505617977528089e-07, "logps/chosen": -44.02425765991211, "logps/rejected": -57.3465461730957, "loss": 0.552, "losses/dpo": 0.5316831469535828, "losses/sft": 1.4193787574768066, "losses/total": 0.5316831469535828, "ref_logps/chosen": -36.48744201660156, "ref_logps/rejected": -45.4715576171875, "rewards/accuracies": 0.75, "rewards/chosen": -0.7536818385124207, "rewards/margins": 0.43381738662719727, "rewards/rejected": -1.1874991655349731, "step": 250 }, { "epoch": 1.89, "grad_norm": 7.457988261963576, "learning_rate": 2.0365168539325842e-07, "logps/chosen": -44.05774688720703, "logps/rejected": -54.26824951171875, "loss": 0.5622, "losses/dpo": 0.6149340867996216, "losses/sft": 1.7144936323165894, "losses/total": 0.6149340867996216, "ref_logps/chosen": -35.81959533691406, "ref_logps/rejected": -41.808128356933594, "rewards/accuracies": 0.7421875, "rewards/chosen": -0.8238149881362915, "rewards/margins": 0.4221975803375244, "rewards/rejected": -1.246012568473816, "step": 251 }, { "epoch": 1.9, "grad_norm": 6.886174599694686, "learning_rate": 2.0224719101123595e-07, "logps/chosen": -42.96266174316406, "logps/rejected": -57.41224670410156, "loss": 0.5338, "losses/dpo": 0.5937738418579102, "losses/sft": 1.7894150018692017, "losses/total": 0.5937738418579102, "ref_logps/chosen": -35.350067138671875, "ref_logps/rejected": -44.520416259765625, "rewards/accuracies": 0.765625, "rewards/chosen": -0.7612596750259399, "rewards/margins": 0.5279234051704407, "rewards/rejected": -1.2891831398010254, "step": 252 }, { "epoch": 1.91, "grad_norm": 7.229890008822798, "learning_rate": 2.0084269662921348e-07, "logps/chosen": -40.15806198120117, "logps/rejected": -51.43259811401367, "loss": 0.5704, "losses/dpo": 0.6577882170677185, "losses/sft": 1.8345617055892944, "losses/total": 0.6577882170677185, "ref_logps/chosen": -32.859107971191406, "ref_logps/rejected": -40.261077880859375, "rewards/accuracies": 0.75, "rewards/chosen": -0.7298952341079712, "rewards/margins": 0.3872564733028412, "rewards/rejected": -1.1171517372131348, "step": 253 }, { "epoch": 1.92, "grad_norm": 7.4822334369379995, "learning_rate": 1.9943820224719098e-07, "logps/chosen": -47.686946868896484, "logps/rejected": -57.150779724121094, "loss": 0.5379, "losses/dpo": 0.5903155207633972, "losses/sft": 1.7529627084732056, "losses/total": 0.5903155207633972, "ref_logps/chosen": -39.64442443847656, "ref_logps/rejected": -44.537864685058594, "rewards/accuracies": 0.7734375, "rewards/chosen": -0.8042521476745605, "rewards/margins": 0.45703911781311035, "rewards/rejected": -1.2612911462783813, "step": 254 }, { "epoch": 1.92, "grad_norm": 8.026318217758316, "learning_rate": 1.9803370786516854e-07, "logps/chosen": -48.09050750732422, "logps/rejected": -55.54762268066406, "loss": 0.6168, "losses/dpo": 0.6026707887649536, "losses/sft": 1.538877248764038, "losses/total": 0.6026707887649536, "ref_logps/chosen": -39.93636703491211, "ref_logps/rejected": -44.63798522949219, "rewards/accuracies": 0.6640625, "rewards/chosen": -0.8154144287109375, "rewards/margins": 0.2755492627620697, "rewards/rejected": -1.0909637212753296, "step": 255 }, { "epoch": 1.93, "grad_norm": 7.789382763460605, "learning_rate": 1.9662921348314607e-07, "logps/chosen": -42.904396057128906, "logps/rejected": -52.95304489135742, "loss": 0.6004, "losses/dpo": 0.6533941626548767, "losses/sft": 1.7555681467056274, "losses/total": 0.6533941626548767, "ref_logps/chosen": -35.12152862548828, "ref_logps/rejected": -41.70171356201172, "rewards/accuracies": 0.6484375, "rewards/chosen": -0.7782862186431885, "rewards/margins": 0.34684672951698303, "rewards/rejected": -1.1251329183578491, "step": 256 }, { "epoch": 1.94, "grad_norm": 7.650755358628509, "learning_rate": 1.952247191011236e-07, "logps/chosen": -47.547119140625, "logps/rejected": -55.00044250488281, "loss": 0.5857, "losses/dpo": 0.533769965171814, "losses/sft": 1.518601655960083, "losses/total": 0.533769965171814, "ref_logps/chosen": -39.58103942871094, "ref_logps/rejected": -43.044471740722656, "rewards/accuracies": 0.71875, "rewards/chosen": -0.7966080904006958, "rewards/margins": 0.3989890217781067, "rewards/rejected": -1.1955971717834473, "step": 257 }, { "epoch": 1.95, "grad_norm": 7.218761250045399, "learning_rate": 1.938202247191011e-07, "logps/chosen": -45.877933502197266, "logps/rejected": -55.09804916381836, "loss": 0.5628, "losses/dpo": 0.5916406512260437, "losses/sft": 1.787639856338501, "losses/total": 0.5916406512260437, "ref_logps/chosen": -37.8803596496582, "ref_logps/rejected": -42.772945404052734, "rewards/accuracies": 0.765625, "rewards/chosen": -0.7997570633888245, "rewards/margins": 0.43275338411331177, "rewards/rejected": -1.2325104475021362, "step": 258 }, { "epoch": 1.95, "grad_norm": 6.746342603050737, "learning_rate": 1.9241573033707863e-07, "logps/chosen": -44.5426139831543, "logps/rejected": -52.97711944580078, "loss": 0.5404, "losses/dpo": 0.6297707557678223, "losses/sft": 1.9339282512664795, "losses/total": 0.6297707557678223, "ref_logps/chosen": -36.8856315612793, "ref_logps/rejected": -40.58320617675781, "rewards/accuracies": 0.75, "rewards/chosen": -0.7656983137130737, "rewards/margins": 0.47369277477264404, "rewards/rejected": -1.2393909692764282, "step": 259 }, { "epoch": 1.96, "grad_norm": 7.487018325482117, "learning_rate": 1.9101123595505617e-07, "logps/chosen": -42.5137825012207, "logps/rejected": -53.166908264160156, "loss": 0.5707, "losses/dpo": 0.7028491497039795, "losses/sft": 1.704848289489746, "losses/total": 0.7028491497039795, "ref_logps/chosen": -34.8709716796875, "ref_logps/rejected": -41.41071701049805, "rewards/accuracies": 0.71875, "rewards/chosen": -0.7642812728881836, "rewards/margins": 0.4113379120826721, "rewards/rejected": -1.1756192445755005, "step": 260 }, { "epoch": 1.97, "grad_norm": 6.897909731781275, "learning_rate": 1.896067415730337e-07, "logps/chosen": -42.64958190917969, "logps/rejected": -54.01194763183594, "loss": 0.5508, "losses/dpo": 0.6007636785507202, "losses/sft": 1.6722173690795898, "losses/total": 0.6007636785507202, "ref_logps/chosen": -34.81106948852539, "ref_logps/rejected": -41.885292053222656, "rewards/accuracies": 0.6875, "rewards/chosen": -0.7838513851165771, "rewards/margins": 0.428814560174942, "rewards/rejected": -1.2126659154891968, "step": 261 }, { "epoch": 1.98, "grad_norm": 6.802922485274152, "learning_rate": 1.8820224719101123e-07, "logps/chosen": -40.00798034667969, "logps/rejected": -54.30394744873047, "loss": 0.5499, "losses/dpo": 0.39324456453323364, "losses/sft": 1.4311751127243042, "losses/total": 0.39324456453323364, "ref_logps/chosen": -32.748207092285156, "ref_logps/rejected": -42.49259948730469, "rewards/accuracies": 0.8046875, "rewards/chosen": -0.7259770035743713, "rewards/margins": 0.4551584720611572, "rewards/rejected": -1.1811354160308838, "step": 262 }, { "epoch": 1.98, "grad_norm": 6.783836899709174, "learning_rate": 1.8679775280898876e-07, "logps/chosen": -39.83095932006836, "logps/rejected": -54.880165100097656, "loss": 0.5218, "losses/dpo": 0.5562885999679565, "losses/sft": 1.581786036491394, "losses/total": 0.5562885999679565, "ref_logps/chosen": -32.503440856933594, "ref_logps/rejected": -42.00752258300781, "rewards/accuracies": 0.796875, "rewards/chosen": -0.7327523231506348, "rewards/margins": 0.5545117259025574, "rewards/rejected": -1.287264108657837, "step": 263 }, { "epoch": 1.99, "grad_norm": 7.863916442503097, "learning_rate": 1.853932584269663e-07, "logps/chosen": -50.77809524536133, "logps/rejected": -57.57705307006836, "loss": 0.5746, "losses/dpo": 0.5502392053604126, "losses/sft": 1.671476125717163, "losses/total": 0.5502392053604126, "ref_logps/chosen": -42.0257568359375, "ref_logps/rejected": -44.61543273925781, "rewards/accuracies": 0.71875, "rewards/chosen": -0.8752338886260986, "rewards/margins": 0.42092812061309814, "rewards/rejected": -1.2961618900299072, "step": 264 }, { "epoch": 2.0, "grad_norm": 7.397057411594154, "learning_rate": 1.8398876404494382e-07, "logps/chosen": -45.31150817871094, "logps/rejected": -54.171669006347656, "loss": 0.5905, "losses/dpo": 0.5974111557006836, "losses/sft": 1.7264142036437988, "losses/total": 0.5974111557006836, "ref_logps/chosen": -36.78529357910156, "ref_logps/rejected": -41.80047607421875, "rewards/accuracies": 0.703125, "rewards/chosen": -0.8526214361190796, "rewards/margins": 0.3844982385635376, "rewards/rejected": -1.2371195554733276, "step": 265 }, { "epoch": 2.01, "grad_norm": 7.017591256124865, "learning_rate": 1.8258426966292135e-07, "logps/chosen": -43.99406433105469, "logps/rejected": -53.245262145996094, "loss": 0.5349, "losses/dpo": 0.5464926362037659, "losses/sft": 1.5807067155838013, "losses/total": 0.5464926362037659, "ref_logps/chosen": -36.38019561767578, "ref_logps/rejected": -40.543190002441406, "rewards/accuracies": 0.78125, "rewards/chosen": -0.761387288570404, "rewards/margins": 0.5088198184967041, "rewards/rejected": -1.2702070474624634, "step": 266 }, { "epoch": 2.02, "grad_norm": 6.985219378495969, "learning_rate": 1.8117977528089888e-07, "logps/chosen": -44.4912109375, "logps/rejected": -53.347251892089844, "loss": 0.5743, "losses/dpo": 0.5905557870864868, "losses/sft": 1.7700715065002441, "losses/total": 0.5905557870864868, "ref_logps/chosen": -36.70296859741211, "ref_logps/rejected": -41.48924255371094, "rewards/accuracies": 0.71875, "rewards/chosen": -0.7788243889808655, "rewards/margins": 0.40697669982910156, "rewards/rejected": -1.1858012676239014, "step": 267 }, { "epoch": 2.02, "grad_norm": 7.021871568299538, "learning_rate": 1.7977528089887638e-07, "logps/chosen": -41.247291564941406, "logps/rejected": -50.476539611816406, "loss": 0.5583, "losses/dpo": 0.5536283850669861, "losses/sft": 1.3929085731506348, "losses/total": 0.5536283850669861, "ref_logps/chosen": -34.45301055908203, "ref_logps/rejected": -39.05378723144531, "rewards/accuracies": 0.7421875, "rewards/chosen": -0.6794286370277405, "rewards/margins": 0.46284645795822144, "rewards/rejected": -1.142275094985962, "step": 268 }, { "epoch": 2.03, "grad_norm": 7.290879700745406, "learning_rate": 1.7837078651685391e-07, "logps/chosen": -44.69060516357422, "logps/rejected": -52.723419189453125, "loss": 0.5862, "losses/dpo": 0.5445826053619385, "losses/sft": 1.8489296436309814, "losses/total": 0.5445826053619385, "ref_logps/chosen": -36.05701446533203, "ref_logps/rejected": -40.36290740966797, "rewards/accuracies": 0.6796875, "rewards/chosen": -0.863358736038208, "rewards/margins": 0.37269291281700134, "rewards/rejected": -1.2360515594482422, "step": 269 }, { "epoch": 2.04, "grad_norm": 7.278112500918291, "learning_rate": 1.7696629213483144e-07, "logps/chosen": -47.17387771606445, "logps/rejected": -55.31304168701172, "loss": 0.5451, "losses/dpo": 0.4929129481315613, "losses/sft": 1.2738251686096191, "losses/total": 0.4929129481315613, "ref_logps/chosen": -38.97121810913086, "ref_logps/rejected": -42.48224639892578, "rewards/accuracies": 0.796875, "rewards/chosen": -0.8202658891677856, "rewards/margins": 0.4628136157989502, "rewards/rejected": -1.2830795049667358, "step": 270 }, { "epoch": 2.05, "grad_norm": 7.2790694246, "learning_rate": 1.75561797752809e-07, "logps/chosen": -39.39592742919922, "logps/rejected": -49.23228073120117, "loss": 0.5839, "losses/dpo": 0.5500213503837585, "losses/sft": 1.5326621532440186, "losses/total": 0.5500213503837585, "ref_logps/chosen": -31.87863540649414, "ref_logps/rejected": -37.98380661010742, "rewards/accuracies": 0.6953125, "rewards/chosen": -0.7517290115356445, "rewards/margins": 0.37311792373657227, "rewards/rejected": -1.1248469352722168, "step": 271 }, { "epoch": 2.05, "grad_norm": 6.952262857114201, "learning_rate": 1.741573033707865e-07, "logps/chosen": -41.979820251464844, "logps/rejected": -51.27606964111328, "loss": 0.5477, "losses/dpo": 0.5781035423278809, "losses/sft": 1.6893967390060425, "losses/total": 0.5781035423278809, "ref_logps/chosen": -34.47309875488281, "ref_logps/rejected": -38.76087951660156, "rewards/accuracies": 0.6875, "rewards/chosen": -0.7506722211837769, "rewards/margins": 0.5008465051651001, "rewards/rejected": -1.251518726348877, "step": 272 }, { "epoch": 2.06, "grad_norm": 7.323213695486467, "learning_rate": 1.7275280898876404e-07, "logps/chosen": -46.38153839111328, "logps/rejected": -57.915809631347656, "loss": 0.5002, "losses/dpo": 0.5204892754554749, "losses/sft": 1.5103009939193726, "losses/total": 0.5204892754554749, "ref_logps/chosen": -38.351890563964844, "ref_logps/rejected": -43.90599060058594, "rewards/accuracies": 0.765625, "rewards/chosen": -0.8029646873474121, "rewards/margins": 0.5980167388916016, "rewards/rejected": -1.4009814262390137, "step": 273 }, { "epoch": 2.07, "grad_norm": 7.764155916683402, "learning_rate": 1.7134831460674157e-07, "logps/chosen": -45.19919967651367, "logps/rejected": -51.13863754272461, "loss": 0.6283, "losses/dpo": 0.616185188293457, "losses/sft": 1.6811277866363525, "losses/total": 0.616185188293457, "ref_logps/chosen": -36.72953796386719, "ref_logps/rejected": -39.85737228393555, "rewards/accuracies": 0.6171875, "rewards/chosen": -0.8469663858413696, "rewards/margins": 0.2811599373817444, "rewards/rejected": -1.1281262636184692, "step": 274 }, { "epoch": 2.08, "grad_norm": 7.020433782892144, "learning_rate": 1.699438202247191e-07, "logps/chosen": -43.26371765136719, "logps/rejected": -52.273712158203125, "loss": 0.5707, "losses/dpo": 0.584823727607727, "losses/sft": 1.7780404090881348, "losses/total": 0.584823727607727, "ref_logps/chosen": -34.843936920166016, "ref_logps/rejected": -39.69860076904297, "rewards/accuracies": 0.7421875, "rewards/chosen": -0.8419777154922485, "rewards/margins": 0.4155334234237671, "rewards/rejected": -1.2575111389160156, "step": 275 }, { "epoch": 2.08, "grad_norm": 7.92264626489854, "learning_rate": 1.6853932584269663e-07, "logps/chosen": -47.98881912231445, "logps/rejected": -56.07038116455078, "loss": 0.5876, "losses/dpo": 0.4782869219779968, "losses/sft": 1.5796866416931152, "losses/total": 0.4782869219779968, "ref_logps/chosen": -39.32164764404297, "ref_logps/rejected": -43.3940315246582, "rewards/accuracies": 0.6796875, "rewards/chosen": -0.8667174577713013, "rewards/margins": 0.40091750025749207, "rewards/rejected": -1.2676348686218262, "step": 276 }, { "epoch": 2.09, "grad_norm": 6.857885259771192, "learning_rate": 1.6713483146067413e-07, "logps/chosen": -42.90391159057617, "logps/rejected": -53.66696548461914, "loss": 0.5666, "losses/dpo": 0.6092857122421265, "losses/sft": 1.6311126947402954, "losses/total": 0.6092857122421265, "ref_logps/chosen": -35.07619857788086, "ref_logps/rejected": -41.65775680541992, "rewards/accuracies": 0.6796875, "rewards/chosen": -0.7827714085578918, "rewards/margins": 0.4181497395038605, "rewards/rejected": -1.2009210586547852, "step": 277 }, { "epoch": 2.1, "grad_norm": 7.0081480343548215, "learning_rate": 1.6573033707865166e-07, "logps/chosen": -43.48851013183594, "logps/rejected": -56.560142517089844, "loss": 0.5552, "losses/dpo": 0.47724148631095886, "losses/sft": 1.4892723560333252, "losses/total": 0.47724148631095886, "ref_logps/chosen": -35.48023223876953, "ref_logps/rejected": -43.66110610961914, "rewards/accuracies": 0.6640625, "rewards/chosen": -0.8008283376693726, "rewards/margins": 0.4890754222869873, "rewards/rejected": -1.2899038791656494, "step": 278 }, { "epoch": 2.11, "grad_norm": 7.160682409155752, "learning_rate": 1.6432584269662922e-07, "logps/chosen": -44.21363830566406, "logps/rejected": -58.09941864013672, "loss": 0.5245, "losses/dpo": 0.4419279396533966, "losses/sft": 1.6377503871917725, "losses/total": 0.4419279396533966, "ref_logps/chosen": -36.17689514160156, "ref_logps/rejected": -44.57625198364258, "rewards/accuracies": 0.75, "rewards/chosen": -0.8036742210388184, "rewards/margins": 0.5486425757408142, "rewards/rejected": -1.3523168563842773, "step": 279 }, { "epoch": 2.11, "grad_norm": 7.212324015356224, "learning_rate": 1.6292134831460675e-07, "logps/chosen": -44.50836181640625, "logps/rejected": -51.84413528442383, "loss": 0.5731, "losses/dpo": 0.4934471547603607, "losses/sft": 1.4699177742004395, "losses/total": 0.4934471547603607, "ref_logps/chosen": -36.38746643066406, "ref_logps/rejected": -39.50672149658203, "rewards/accuracies": 0.671875, "rewards/chosen": -0.8120898604393005, "rewards/margins": 0.4216514825820923, "rewards/rejected": -1.233741283416748, "step": 280 }, { "epoch": 2.12, "grad_norm": 6.532989304583127, "learning_rate": 1.6151685393258428e-07, "logps/chosen": -40.479827880859375, "logps/rejected": -52.349693298339844, "loss": 0.5111, "losses/dpo": 0.5451053380966187, "losses/sft": 1.5731171369552612, "losses/total": 0.5451053380966187, "ref_logps/chosen": -33.727699279785156, "ref_logps/rejected": -40.0911865234375, "rewards/accuracies": 0.796875, "rewards/chosen": -0.6752126216888428, "rewards/margins": 0.5506378412246704, "rewards/rejected": -1.2258504629135132, "step": 281 }, { "epoch": 2.13, "grad_norm": 6.65644378559116, "learning_rate": 1.6011235955056178e-07, "logps/chosen": -42.37626266479492, "logps/rejected": -53.92717742919922, "loss": 0.5305, "losses/dpo": 0.523646354675293, "losses/sft": 1.5671043395996094, "losses/total": 0.523646354675293, "ref_logps/chosen": -35.16848373413086, "ref_logps/rejected": -41.62503433227539, "rewards/accuracies": 0.734375, "rewards/chosen": -0.7207781076431274, "rewards/margins": 0.509436309337616, "rewards/rejected": -1.2302143573760986, "step": 282 }, { "epoch": 2.14, "grad_norm": 7.652354022803428, "learning_rate": 1.5870786516853931e-07, "logps/chosen": -45.83367919921875, "logps/rejected": -57.50337219238281, "loss": 0.5511, "losses/dpo": 0.5657609105110168, "losses/sft": 1.511309266090393, "losses/total": 0.5657609105110168, "ref_logps/chosen": -37.572113037109375, "ref_logps/rejected": -44.38706970214844, "rewards/accuracies": 0.734375, "rewards/chosen": -0.8261568546295166, "rewards/margins": 0.4854734539985657, "rewards/rejected": -1.3116302490234375, "step": 283 }, { "epoch": 2.14, "grad_norm": 6.7860253446718, "learning_rate": 1.5730337078651685e-07, "logps/chosen": -41.25431823730469, "logps/rejected": -54.90302658081055, "loss": 0.5188, "losses/dpo": 0.5078562498092651, "losses/sft": 1.5500166416168213, "losses/total": 0.5078562498092651, "ref_logps/chosen": -33.7675666809082, "ref_logps/rejected": -41.983367919921875, "rewards/accuracies": 0.765625, "rewards/chosen": -0.7486748695373535, "rewards/margins": 0.5432910919189453, "rewards/rejected": -1.2919659614562988, "step": 284 }, { "epoch": 2.15, "grad_norm": 7.11903493041396, "learning_rate": 1.5589887640449438e-07, "logps/chosen": -43.894989013671875, "logps/rejected": -58.60367202758789, "loss": 0.4914, "losses/dpo": 0.5028943419456482, "losses/sft": 1.594357967376709, "losses/total": 0.5028943419456482, "ref_logps/chosen": -36.30883026123047, "ref_logps/rejected": -45.05226516723633, "rewards/accuracies": 0.8359375, "rewards/chosen": -0.7586159706115723, "rewards/margins": 0.5965246558189392, "rewards/rejected": -1.3551405668258667, "step": 285 }, { "epoch": 2.16, "grad_norm": 7.693954508671863, "learning_rate": 1.5449438202247188e-07, "logps/chosen": -48.08583068847656, "logps/rejected": -53.51144027709961, "loss": 0.5882, "losses/dpo": 0.8339239954948425, "losses/sft": 1.617476224899292, "losses/total": 0.8339239954948425, "ref_logps/chosen": -40.02094268798828, "ref_logps/rejected": -41.421348571777344, "rewards/accuracies": 0.671875, "rewards/chosen": -0.8064886331558228, "rewards/margins": 0.4025205969810486, "rewards/rejected": -1.2090092897415161, "step": 286 }, { "epoch": 2.17, "grad_norm": 6.936991994103028, "learning_rate": 1.5308988764044944e-07, "logps/chosen": -42.915550231933594, "logps/rejected": -54.570682525634766, "loss": 0.5427, "losses/dpo": 0.570111095905304, "losses/sft": 1.7627439498901367, "losses/total": 0.570111095905304, "ref_logps/chosen": -34.83842468261719, "ref_logps/rejected": -41.10365295410156, "rewards/accuracies": 0.71875, "rewards/chosen": -0.8077125549316406, "rewards/margins": 0.5389906167984009, "rewards/rejected": -1.346703052520752, "step": 287 }, { "epoch": 2.17, "grad_norm": 7.22270723761247, "learning_rate": 1.5168539325842697e-07, "logps/chosen": -42.23722457885742, "logps/rejected": -57.404205322265625, "loss": 0.529, "losses/dpo": 0.5674354434013367, "losses/sft": 1.5719692707061768, "losses/total": 0.5674354434013367, "ref_logps/chosen": -34.721920013427734, "ref_logps/rejected": -44.73695373535156, "rewards/accuracies": 0.796875, "rewards/chosen": -0.7515305280685425, "rewards/margins": 0.5151941180229187, "rewards/rejected": -1.266724705696106, "step": 288 }, { "epoch": 2.18, "grad_norm": 8.53784031336331, "learning_rate": 1.502808988764045e-07, "logps/chosen": -48.22527313232422, "logps/rejected": -58.22871398925781, "loss": 0.5799, "losses/dpo": 0.5890235900878906, "losses/sft": 1.6156002283096313, "losses/total": 0.5890235900878906, "ref_logps/chosen": -39.35710525512695, "ref_logps/rejected": -45.079322814941406, "rewards/accuracies": 0.6640625, "rewards/chosen": -0.8868170976638794, "rewards/margins": 0.4281224012374878, "rewards/rejected": -1.3149394989013672, "step": 289 }, { "epoch": 2.19, "grad_norm": 6.885607484250047, "learning_rate": 1.4887640449438203e-07, "logps/chosen": -42.00331115722656, "logps/rejected": -51.58038330078125, "loss": 0.5568, "losses/dpo": 0.602211058139801, "losses/sft": 1.4960790872573853, "losses/total": 0.602211058139801, "ref_logps/chosen": -33.715057373046875, "ref_logps/rejected": -39.065757751464844, "rewards/accuracies": 0.734375, "rewards/chosen": -0.8288247585296631, "rewards/margins": 0.422637403011322, "rewards/rejected": -1.2514622211456299, "step": 290 }, { "epoch": 2.2, "grad_norm": 6.966987508866502, "learning_rate": 1.4747191011235953e-07, "logps/chosen": -43.61931228637695, "logps/rejected": -58.451629638671875, "loss": 0.5594, "losses/dpo": 0.5309076309204102, "losses/sft": 1.636415958404541, "losses/total": 0.5309076309204102, "ref_logps/chosen": -34.87947463989258, "ref_logps/rejected": -44.885459899902344, "rewards/accuracies": 0.6953125, "rewards/chosen": -0.87398362159729, "rewards/margins": 0.48263317346572876, "rewards/rejected": -1.356616735458374, "step": 291 }, { "epoch": 2.2, "grad_norm": 6.8192638080381744, "learning_rate": 1.4606741573033706e-07, "logps/chosen": -43.92414855957031, "logps/rejected": -53.39807891845703, "loss": 0.5452, "losses/dpo": 0.6017537713050842, "losses/sft": 1.7611263990402222, "losses/total": 0.6017537713050842, "ref_logps/chosen": -35.702823638916016, "ref_logps/rejected": -40.34605407714844, "rewards/accuracies": 0.765625, "rewards/chosen": -0.8221321702003479, "rewards/margins": 0.4830705225467682, "rewards/rejected": -1.3052027225494385, "step": 292 }, { "epoch": 2.21, "grad_norm": 8.710229032299473, "learning_rate": 1.446629213483146e-07, "logps/chosen": -51.8635368347168, "logps/rejected": -58.34959030151367, "loss": 0.601, "losses/dpo": 0.5715082883834839, "losses/sft": 1.490638017654419, "losses/total": 0.5715082883834839, "ref_logps/chosen": -42.414031982421875, "ref_logps/rejected": -45.42848587036133, "rewards/accuracies": 0.7265625, "rewards/chosen": -0.9449502229690552, "rewards/margins": 0.3471601605415344, "rewards/rejected": -1.2921103239059448, "step": 293 }, { "epoch": 2.22, "grad_norm": 7.6355542087700226, "learning_rate": 1.4325842696629212e-07, "logps/chosen": -43.83769607543945, "logps/rejected": -58.36852264404297, "loss": 0.546, "losses/dpo": 0.4579807221889496, "losses/sft": 1.5301527976989746, "losses/total": 0.4579807221889496, "ref_logps/chosen": -35.81403350830078, "ref_logps/rejected": -44.5776252746582, "rewards/accuracies": 0.7265625, "rewards/chosen": -0.802366316318512, "rewards/margins": 0.5767236948013306, "rewards/rejected": -1.3790900707244873, "step": 294 }, { "epoch": 2.23, "grad_norm": 7.4022075570091195, "learning_rate": 1.4185393258426968e-07, "logps/chosen": -44.79059600830078, "logps/rejected": -59.63528060913086, "loss": 0.5251, "losses/dpo": 0.5625388622283936, "losses/sft": 1.5417966842651367, "losses/total": 0.5625388622283936, "ref_logps/chosen": -36.72273254394531, "ref_logps/rejected": -46.061744689941406, "rewards/accuracies": 0.765625, "rewards/chosen": -0.806786060333252, "rewards/margins": 0.5505677461624146, "rewards/rejected": -1.357353925704956, "step": 295 }, { "epoch": 2.23, "grad_norm": 7.092958234931924, "learning_rate": 1.4044943820224718e-07, "logps/chosen": -42.923343658447266, "logps/rejected": -52.593894958496094, "loss": 0.5582, "losses/dpo": 0.46777036786079407, "losses/sft": 1.5354235172271729, "losses/total": 0.46777036786079407, "ref_logps/chosen": -35.31480026245117, "ref_logps/rejected": -40.524574279785156, "rewards/accuracies": 0.71875, "rewards/chosen": -0.7608542442321777, "rewards/margins": 0.446077823638916, "rewards/rejected": -1.2069320678710938, "step": 296 }, { "epoch": 2.24, "grad_norm": 7.500648089064134, "learning_rate": 1.3904494382022472e-07, "logps/chosen": -43.400211334228516, "logps/rejected": -54.485557556152344, "loss": 0.5719, "losses/dpo": 0.43427377939224243, "losses/sft": 1.5346068143844604, "losses/total": 0.43427377939224243, "ref_logps/chosen": -35.220516204833984, "ref_logps/rejected": -41.69098663330078, "rewards/accuracies": 0.703125, "rewards/chosen": -0.8179699182510376, "rewards/margins": 0.4614875316619873, "rewards/rejected": -1.2794575691223145, "step": 297 }, { "epoch": 2.25, "grad_norm": 6.861133660639989, "learning_rate": 1.3764044943820225e-07, "logps/chosen": -40.74993896484375, "logps/rejected": -55.73876190185547, "loss": 0.5064, "losses/dpo": 0.5779513716697693, "losses/sft": 1.53359055519104, "losses/total": 0.5779513716697693, "ref_logps/chosen": -33.70279312133789, "ref_logps/rejected": -42.673423767089844, "rewards/accuracies": 0.8046875, "rewards/chosen": -0.7047147154808044, "rewards/margins": 0.6018195152282715, "rewards/rejected": -1.3065342903137207, "step": 298 }, { "epoch": 2.26, "grad_norm": 6.77015674340588, "learning_rate": 1.3623595505617978e-07, "logps/chosen": -41.57499694824219, "logps/rejected": -55.820674896240234, "loss": 0.5056, "losses/dpo": 0.5236800909042358, "losses/sft": 1.7500333786010742, "losses/total": 0.5236800909042358, "ref_logps/chosen": -34.20399475097656, "ref_logps/rejected": -42.717681884765625, "rewards/accuracies": 0.765625, "rewards/chosen": -0.7371004819869995, "rewards/margins": 0.5731986165046692, "rewards/rejected": -1.3102991580963135, "step": 299 }, { "epoch": 2.26, "grad_norm": 6.520455497747794, "learning_rate": 1.3483146067415728e-07, "logps/chosen": -40.784889221191406, "logps/rejected": -53.35670471191406, "loss": 0.5158, "losses/dpo": 0.39004355669021606, "losses/sft": 1.4663935899734497, "losses/total": 0.39004355669021606, "ref_logps/chosen": -33.147525787353516, "ref_logps/rejected": -39.950157165527344, "rewards/accuracies": 0.78125, "rewards/chosen": -0.7637366056442261, "rewards/margins": 0.5769186019897461, "rewards/rejected": -1.3406550884246826, "step": 300 }, { "epoch": 2.27, "grad_norm": 7.96122739673963, "learning_rate": 1.334269662921348e-07, "logps/chosen": -46.782169342041016, "logps/rejected": -52.76530456542969, "loss": 0.5923, "losses/dpo": 0.6052607297897339, "losses/sft": 1.6094651222229004, "losses/total": 0.6052607297897339, "ref_logps/chosen": -39.2327880859375, "ref_logps/rejected": -41.223548889160156, "rewards/accuracies": 0.7109375, "rewards/chosen": -0.754938006401062, "rewards/margins": 0.3992377817630768, "rewards/rejected": -1.1541757583618164, "step": 301 }, { "epoch": 2.28, "grad_norm": 7.751653637126333, "learning_rate": 1.3202247191011234e-07, "logps/chosen": -49.38646697998047, "logps/rejected": -61.543209075927734, "loss": 0.5327, "losses/dpo": 0.556348443031311, "losses/sft": 1.8087131977081299, "losses/total": 0.556348443031311, "ref_logps/chosen": -40.26612091064453, "ref_logps/rejected": -47.002288818359375, "rewards/accuracies": 0.765625, "rewards/chosen": -0.9120345115661621, "rewards/margins": 0.5420576930046082, "rewards/rejected": -1.454092264175415, "step": 302 }, { "epoch": 2.29, "grad_norm": 7.798526029135885, "learning_rate": 1.306179775280899e-07, "logps/chosen": -43.65242004394531, "logps/rejected": -57.450496673583984, "loss": 0.5818, "losses/dpo": 0.6229327321052551, "losses/sft": 1.691450834274292, "losses/total": 0.6229327321052551, "ref_logps/chosen": -35.82228088378906, "ref_logps/rejected": -45.80632781982422, "rewards/accuracies": 0.703125, "rewards/chosen": -0.7830138802528381, "rewards/margins": 0.3814033269882202, "rewards/rejected": -1.1644171476364136, "step": 303 }, { "epoch": 2.29, "grad_norm": 7.543077771323995, "learning_rate": 1.2921348314606743e-07, "logps/chosen": -44.790557861328125, "logps/rejected": -61.33608627319336, "loss": 0.5259, "losses/dpo": 0.648471474647522, "losses/sft": 1.673068881034851, "losses/total": 0.648471474647522, "ref_logps/chosen": -36.91156768798828, "ref_logps/rejected": -48.083534240722656, "rewards/accuracies": 0.7734375, "rewards/chosen": -0.7878991961479187, "rewards/margins": 0.5373560190200806, "rewards/rejected": -1.325255274772644, "step": 304 }, { "epoch": 2.3, "grad_norm": 7.321395556756757, "learning_rate": 1.2780898876404493e-07, "logps/chosen": -45.843082427978516, "logps/rejected": -57.27900695800781, "loss": 0.5602, "losses/dpo": 0.5154864192008972, "losses/sft": 1.5874884128570557, "losses/total": 0.5154864192008972, "ref_logps/chosen": -37.167877197265625, "ref_logps/rejected": -43.86834716796875, "rewards/accuracies": 0.6875, "rewards/chosen": -0.8675205707550049, "rewards/margins": 0.4735449552536011, "rewards/rejected": -1.341065526008606, "step": 305 }, { "epoch": 2.31, "grad_norm": 7.180779900116491, "learning_rate": 1.2640449438202246e-07, "logps/chosen": -45.17388153076172, "logps/rejected": -55.727230072021484, "loss": 0.5217, "losses/dpo": 0.4565548598766327, "losses/sft": 1.4454078674316406, "losses/total": 0.4565548598766327, "ref_logps/chosen": -37.53833770751953, "ref_logps/rejected": -42.80052185058594, "rewards/accuracies": 0.7890625, "rewards/chosen": -0.7635539770126343, "rewards/margins": 0.529117226600647, "rewards/rejected": -1.2926712036132812, "step": 306 }, { "epoch": 2.32, "grad_norm": 7.748539788981976, "learning_rate": 1.25e-07, "logps/chosen": -45.61334991455078, "logps/rejected": -49.54269790649414, "loss": 0.5743, "losses/dpo": 0.4475148916244507, "losses/sft": 1.3761274814605713, "losses/total": 0.4475148916244507, "ref_logps/chosen": -37.51769256591797, "ref_logps/rejected": -37.59453582763672, "rewards/accuracies": 0.6953125, "rewards/chosen": -0.8095651865005493, "rewards/margins": 0.38525110483169556, "rewards/rejected": -1.1948162317276, "step": 307 }, { "epoch": 2.32, "grad_norm": 7.42646045779241, "learning_rate": 1.2359550561797752e-07, "logps/chosen": -42.217491149902344, "logps/rejected": -57.62702941894531, "loss": 0.5103, "losses/dpo": 0.5864957571029663, "losses/sft": 1.526570439338684, "losses/total": 0.5864957571029663, "ref_logps/chosen": -34.16813659667969, "ref_logps/rejected": -43.83499526977539, "rewards/accuracies": 0.7734375, "rewards/chosen": -0.8049358129501343, "rewards/margins": 0.5742676258087158, "rewards/rejected": -1.37920343875885, "step": 308 }, { "epoch": 2.33, "grad_norm": 7.402951195988575, "learning_rate": 1.2219101123595506e-07, "logps/chosen": -43.753623962402344, "logps/rejected": -55.725196838378906, "loss": 0.5457, "losses/dpo": 0.4583805501461029, "losses/sft": 1.4125399589538574, "losses/total": 0.4583805501461029, "ref_logps/chosen": -35.22587585449219, "ref_logps/rejected": -42.14164733886719, "rewards/accuracies": 0.7109375, "rewards/chosen": -0.8527748584747314, "rewards/margins": 0.5055804252624512, "rewards/rejected": -1.3583552837371826, "step": 309 }, { "epoch": 2.34, "grad_norm": 8.606023903012021, "learning_rate": 1.2078651685393259e-07, "logps/chosen": -52.9200439453125, "logps/rejected": -61.587310791015625, "loss": 0.5909, "losses/dpo": 0.5809124708175659, "losses/sft": 1.585126280784607, "losses/total": 0.5809124708175659, "ref_logps/chosen": -43.860687255859375, "ref_logps/rejected": -48.097747802734375, "rewards/accuracies": 0.7109375, "rewards/chosen": -0.9059357047080994, "rewards/margins": 0.443020224571228, "rewards/rejected": -1.3489558696746826, "step": 310 }, { "epoch": 2.35, "grad_norm": 7.302220410372284, "learning_rate": 1.1938202247191012e-07, "logps/chosen": -43.49970245361328, "logps/rejected": -58.033485412597656, "loss": 0.5186, "losses/dpo": 0.48940473794937134, "losses/sft": 1.4836596250534058, "losses/total": 0.48940473794937134, "ref_logps/chosen": -36.129432678222656, "ref_logps/rejected": -44.55863952636719, "rewards/accuracies": 0.734375, "rewards/chosen": -0.7370268702507019, "rewards/margins": 0.610457181930542, "rewards/rejected": -1.3474839925765991, "step": 311 }, { "epoch": 2.35, "grad_norm": 7.715132554841409, "learning_rate": 1.1797752808988763e-07, "logps/chosen": -45.6818733215332, "logps/rejected": -57.750892639160156, "loss": 0.5446, "losses/dpo": 0.6576637625694275, "losses/sft": 1.6136798858642578, "losses/total": 0.6576637625694275, "ref_logps/chosen": -37.14127731323242, "ref_logps/rejected": -43.89785385131836, "rewards/accuracies": 0.6953125, "rewards/chosen": -0.8540595769882202, "rewards/margins": 0.5312445759773254, "rewards/rejected": -1.3853040933609009, "step": 312 }, { "epoch": 2.36, "grad_norm": 7.103504431576494, "learning_rate": 1.1657303370786515e-07, "logps/chosen": -43.971473693847656, "logps/rejected": -57.290443420410156, "loss": 0.5204, "losses/dpo": 0.5861748456954956, "losses/sft": 1.7284009456634521, "losses/total": 0.5861748456954956, "ref_logps/chosen": -35.535255432128906, "ref_logps/rejected": -43.220550537109375, "rewards/accuracies": 0.7890625, "rewards/chosen": -0.8436219096183777, "rewards/margins": 0.5633664727210999, "rewards/rejected": -1.4069883823394775, "step": 313 }, { "epoch": 2.37, "grad_norm": 7.74539036906925, "learning_rate": 1.151685393258427e-07, "logps/chosen": -45.654815673828125, "logps/rejected": -55.697998046875, "loss": 0.5716, "losses/dpo": 0.6496266722679138, "losses/sft": 1.7458603382110596, "losses/total": 0.6496266722679138, "ref_logps/chosen": -37.42692947387695, "ref_logps/rejected": -43.09944534301758, "rewards/accuracies": 0.7109375, "rewards/chosen": -0.8227887749671936, "rewards/margins": 0.43706685304641724, "rewards/rejected": -1.2598556280136108, "step": 314 }, { "epoch": 2.38, "grad_norm": 7.3016087270783965, "learning_rate": 1.1376404494382023e-07, "logps/chosen": -44.80577087402344, "logps/rejected": -58.83177185058594, "loss": 0.5616, "losses/dpo": 0.5281144380569458, "losses/sft": 1.5373191833496094, "losses/total": 0.5281144380569458, "ref_logps/chosen": -35.78607940673828, "ref_logps/rejected": -44.85388946533203, "rewards/accuracies": 0.7421875, "rewards/chosen": -0.9019690155982971, "rewards/margins": 0.49581989645957947, "rewards/rejected": -1.3977890014648438, "step": 315 }, { "epoch": 2.38, "grad_norm": 7.369973670185862, "learning_rate": 1.1235955056179774e-07, "logps/chosen": -44.48060607910156, "logps/rejected": -57.20940399169922, "loss": 0.5253, "losses/dpo": 0.5681818723678589, "losses/sft": 1.7861613035202026, "losses/total": 0.5681818723678589, "ref_logps/chosen": -35.98798751831055, "ref_logps/rejected": -42.8939094543457, "rewards/accuracies": 0.7734375, "rewards/chosen": -0.8492615818977356, "rewards/margins": 0.5822880268096924, "rewards/rejected": -1.4315495491027832, "step": 316 }, { "epoch": 2.39, "grad_norm": 7.531569588872603, "learning_rate": 1.1095505617977527e-07, "logps/chosen": -43.765594482421875, "logps/rejected": -55.92811965942383, "loss": 0.5324, "losses/dpo": 0.5917935371398926, "losses/sft": 1.6781896352767944, "losses/total": 0.5917935371398926, "ref_logps/chosen": -35.10169219970703, "ref_logps/rejected": -42.32771301269531, "rewards/accuracies": 0.7890625, "rewards/chosen": -0.8663901090621948, "rewards/margins": 0.4936509132385254, "rewards/rejected": -1.3600411415100098, "step": 317 }, { "epoch": 2.4, "grad_norm": 7.00021473521157, "learning_rate": 1.095505617977528e-07, "logps/chosen": -43.7101936340332, "logps/rejected": -55.512020111083984, "loss": 0.5587, "losses/dpo": 0.3991687297821045, "losses/sft": 1.6147840023040771, "losses/total": 0.3991687297821045, "ref_logps/chosen": -34.230926513671875, "ref_logps/rejected": -40.64677810668945, "rewards/accuracies": 0.7265625, "rewards/chosen": -0.9479266405105591, "rewards/margins": 0.5385974049568176, "rewards/rejected": -1.4865241050720215, "step": 318 }, { "epoch": 2.41, "grad_norm": 7.422365442434664, "learning_rate": 1.0814606741573033e-07, "logps/chosen": -44.69245910644531, "logps/rejected": -51.95305252075195, "loss": 0.5704, "losses/dpo": 0.5418112277984619, "losses/sft": 1.3795506954193115, "losses/total": 0.5418112277984619, "ref_logps/chosen": -35.97312545776367, "ref_logps/rejected": -39.2027702331543, "rewards/accuracies": 0.7265625, "rewards/chosen": -0.8719329833984375, "rewards/margins": 0.40309497714042664, "rewards/rejected": -1.2750279903411865, "step": 319 }, { "epoch": 2.42, "grad_norm": 7.787952340155071, "learning_rate": 1.0674157303370785e-07, "logps/chosen": -46.54815673828125, "logps/rejected": -55.3624153137207, "loss": 0.5672, "losses/dpo": 0.5258245468139648, "losses/sft": 1.7972207069396973, "losses/total": 0.5258245468139648, "ref_logps/chosen": -38.29623031616211, "ref_logps/rejected": -42.49595642089844, "rewards/accuracies": 0.734375, "rewards/chosen": -0.8251928687095642, "rewards/margins": 0.4614531695842743, "rewards/rejected": -1.2866460084915161, "step": 320 }, { "epoch": 2.42, "grad_norm": 7.716635082250954, "learning_rate": 1.0533707865168538e-07, "logps/chosen": -45.037723541259766, "logps/rejected": -53.17112350463867, "loss": 0.5781, "losses/dpo": 0.7091802358627319, "losses/sft": 1.6653159856796265, "losses/total": 0.7091802358627319, "ref_logps/chosen": -35.86992263793945, "ref_logps/rejected": -39.99578857421875, "rewards/accuracies": 0.765625, "rewards/chosen": -0.9167801141738892, "rewards/margins": 0.4007537364959717, "rewards/rejected": -1.3175339698791504, "step": 321 }, { "epoch": 2.43, "grad_norm": 6.913928315105185, "learning_rate": 1.0393258426966293e-07, "logps/chosen": -46.575584411621094, "logps/rejected": -59.92189407348633, "loss": 0.4903, "losses/dpo": 0.4884983003139496, "losses/sft": 1.5409932136535645, "losses/total": 0.4884983003139496, "ref_logps/chosen": -39.07024383544922, "ref_logps/rejected": -45.98902893066406, "rewards/accuracies": 0.8046875, "rewards/chosen": -0.750534176826477, "rewards/margins": 0.6427518725395203, "rewards/rejected": -1.3932859897613525, "step": 322 }, { "epoch": 2.44, "grad_norm": 7.359357410660962, "learning_rate": 1.0252808988764044e-07, "logps/chosen": -43.37224578857422, "logps/rejected": -57.634010314941406, "loss": 0.5156, "losses/dpo": 0.4852214455604553, "losses/sft": 1.7198714017868042, "losses/total": 0.4852214455604553, "ref_logps/chosen": -34.993377685546875, "ref_logps/rejected": -43.477684020996094, "rewards/accuracies": 0.796875, "rewards/chosen": -0.8378866910934448, "rewards/margins": 0.5777460336685181, "rewards/rejected": -1.415632724761963, "step": 323 }, { "epoch": 2.45, "grad_norm": 6.246986321807027, "learning_rate": 1.0112359550561797e-07, "logps/chosen": -39.65964889526367, "logps/rejected": -53.03920364379883, "loss": 0.4975, "losses/dpo": 0.4038864076137543, "losses/sft": 1.4372718334197998, "losses/total": 0.4038864076137543, "ref_logps/chosen": -32.10685348510742, "ref_logps/rejected": -39.401954650878906, "rewards/accuracies": 0.8046875, "rewards/chosen": -0.7552794218063354, "rewards/margins": 0.6084451675415039, "rewards/rejected": -1.3637245893478394, "step": 324 }, { "epoch": 2.45, "grad_norm": 7.331178402164894, "learning_rate": 9.971910112359549e-08, "logps/chosen": -44.56993865966797, "logps/rejected": -59.496734619140625, "loss": 0.5273, "losses/dpo": 0.4201672077178955, "losses/sft": 1.5359259843826294, "losses/total": 0.4201672077178955, "ref_logps/chosen": -36.124656677246094, "ref_logps/rejected": -45.465816497802734, "rewards/accuracies": 0.7734375, "rewards/chosen": -0.8445284962654114, "rewards/margins": 0.558563232421875, "rewards/rejected": -1.4030916690826416, "step": 325 }, { "epoch": 2.46, "grad_norm": 8.06984724769854, "learning_rate": 9.831460674157303e-08, "logps/chosen": -48.639495849609375, "logps/rejected": -56.76270294189453, "loss": 0.5197, "losses/dpo": 0.4748002588748932, "losses/sft": 1.489260196685791, "losses/total": 0.4748002588748932, "ref_logps/chosen": -40.237266540527344, "ref_logps/rejected": -42.580772399902344, "rewards/accuracies": 0.7734375, "rewards/chosen": -0.8402228355407715, "rewards/margins": 0.5779698491096497, "rewards/rejected": -1.418192744255066, "step": 326 }, { "epoch": 2.47, "grad_norm": 7.991421566802235, "learning_rate": 9.691011235955055e-08, "logps/chosen": -46.860626220703125, "logps/rejected": -58.88548278808594, "loss": 0.5235, "losses/dpo": 0.6238963603973389, "losses/sft": 1.7782843112945557, "losses/total": 0.6238963603973389, "ref_logps/chosen": -38.90311050415039, "ref_logps/rejected": -45.0700569152832, "rewards/accuracies": 0.7734375, "rewards/chosen": -0.7957516312599182, "rewards/margins": 0.5857904553413391, "rewards/rejected": -1.3815419673919678, "step": 327 }, { "epoch": 2.48, "grad_norm": 7.266548128858226, "learning_rate": 9.550561797752808e-08, "logps/chosen": -42.59172821044922, "logps/rejected": -52.10871505737305, "loss": 0.5667, "losses/dpo": 0.6280735731124878, "losses/sft": 1.5307084321975708, "losses/total": 0.6280735731124878, "ref_logps/chosen": -34.418861389160156, "ref_logps/rejected": -39.33207702636719, "rewards/accuracies": 0.6796875, "rewards/chosen": -0.8172866106033325, "rewards/margins": 0.46037718653678894, "rewards/rejected": -1.2776637077331543, "step": 328 }, { "epoch": 2.48, "grad_norm": 7.642236003437132, "learning_rate": 9.410112359550561e-08, "logps/chosen": -45.72086715698242, "logps/rejected": -52.531341552734375, "loss": 0.5663, "losses/dpo": 0.5168911814689636, "losses/sft": 1.7978273630142212, "losses/total": 0.5168911814689636, "ref_logps/chosen": -37.207481384277344, "ref_logps/rejected": -39.487640380859375, "rewards/accuracies": 0.6640625, "rewards/chosen": -0.851338267326355, "rewards/margins": 0.4530315101146698, "rewards/rejected": -1.3043696880340576, "step": 329 }, { "epoch": 2.49, "grad_norm": 7.854817727198668, "learning_rate": 9.269662921348314e-08, "logps/chosen": -46.91447448730469, "logps/rejected": -57.34621810913086, "loss": 0.5504, "losses/dpo": 0.5696989297866821, "losses/sft": 1.708069086074829, "losses/total": 0.5696989297866821, "ref_logps/chosen": -37.96686553955078, "ref_logps/rejected": -43.223960876464844, "rewards/accuracies": 0.7265625, "rewards/chosen": -0.894761323928833, "rewards/margins": 0.5174643993377686, "rewards/rejected": -1.4122257232666016, "step": 330 }, { "epoch": 2.5, "grad_norm": 7.286496155272333, "learning_rate": 9.129213483146067e-08, "logps/chosen": -44.248069763183594, "logps/rejected": -60.32553482055664, "loss": 0.5098, "losses/dpo": 0.6059004664421082, "losses/sft": 1.61500883102417, "losses/total": 0.6059004664421082, "ref_logps/chosen": -35.856903076171875, "ref_logps/rejected": -46.063690185546875, "rewards/accuracies": 0.7734375, "rewards/chosen": -0.839116632938385, "rewards/margins": 0.5870682001113892, "rewards/rejected": -1.426184892654419, "step": 331 }, { "epoch": 2.51, "grad_norm": 7.873630188834188, "learning_rate": 8.988764044943819e-08, "logps/chosen": -47.52843475341797, "logps/rejected": -58.742042541503906, "loss": 0.5465, "losses/dpo": 0.44472765922546387, "losses/sft": 1.8056182861328125, "losses/total": 0.44472765922546387, "ref_logps/chosen": -37.702247619628906, "ref_logps/rejected": -43.76355743408203, "rewards/accuracies": 0.6953125, "rewards/chosen": -0.982619047164917, "rewards/margins": 0.5152289867401123, "rewards/rejected": -1.4978480339050293, "step": 332 }, { "epoch": 2.51, "grad_norm": 7.654569968967968, "learning_rate": 8.848314606741572e-08, "logps/chosen": -45.35044860839844, "logps/rejected": -54.683128356933594, "loss": 0.5477, "losses/dpo": 0.6351585388183594, "losses/sft": 1.465951681137085, "losses/total": 0.6351585388183594, "ref_logps/chosen": -37.354312896728516, "ref_logps/rejected": -41.636627197265625, "rewards/accuracies": 0.765625, "rewards/chosen": -0.7996135354042053, "rewards/margins": 0.505035936832428, "rewards/rejected": -1.3046493530273438, "step": 333 }, { "epoch": 2.52, "grad_norm": 7.093489997148873, "learning_rate": 8.707865168539325e-08, "logps/chosen": -44.989524841308594, "logps/rejected": -56.54049301147461, "loss": 0.5235, "losses/dpo": 0.6136016845703125, "losses/sft": 1.876564860343933, "losses/total": 0.6136016845703125, "ref_logps/chosen": -36.00385284423828, "ref_logps/rejected": -41.88980484008789, "rewards/accuracies": 0.734375, "rewards/chosen": -0.8985673785209656, "rewards/margins": 0.566501259803772, "rewards/rejected": -1.4650685787200928, "step": 334 }, { "epoch": 2.53, "grad_norm": 7.6289229336667, "learning_rate": 8.567415730337078e-08, "logps/chosen": -45.615413665771484, "logps/rejected": -56.45619201660156, "loss": 0.5586, "losses/dpo": 0.5084734559059143, "losses/sft": 1.6048380136489868, "losses/total": 0.5084734559059143, "ref_logps/chosen": -36.61585998535156, "ref_logps/rejected": -42.79827880859375, "rewards/accuracies": 0.6953125, "rewards/chosen": -0.8999553322792053, "rewards/margins": 0.4658358097076416, "rewards/rejected": -1.3657910823822021, "step": 335 }, { "epoch": 2.54, "grad_norm": 8.275598752517682, "learning_rate": 8.426966292134831e-08, "logps/chosen": -47.839508056640625, "logps/rejected": -61.7794303894043, "loss": 0.5368, "losses/dpo": 0.5232934355735779, "losses/sft": 1.4998161792755127, "losses/total": 0.5232934355735779, "ref_logps/chosen": -38.512969970703125, "ref_logps/rejected": -46.98280334472656, "rewards/accuracies": 0.703125, "rewards/chosen": -0.9326539039611816, "rewards/margins": 0.5470089316368103, "rewards/rejected": -1.4796628952026367, "step": 336 }, { "epoch": 2.54, "grad_norm": 6.837098147362294, "learning_rate": 8.286516853932583e-08, "logps/chosen": -42.03435516357422, "logps/rejected": -58.38957214355469, "loss": 0.4918, "losses/dpo": 0.34719789028167725, "losses/sft": 1.4158234596252441, "losses/total": 0.34719789028167725, "ref_logps/chosen": -34.547210693359375, "ref_logps/rejected": -44.05992889404297, "rewards/accuracies": 0.7265625, "rewards/chosen": -0.7487142086029053, "rewards/margins": 0.6842500567436218, "rewards/rejected": -1.4329640865325928, "step": 337 }, { "epoch": 2.55, "grad_norm": 8.253144412756335, "learning_rate": 8.146067415730337e-08, "logps/chosen": -45.23094940185547, "logps/rejected": -53.472965240478516, "loss": 0.595, "losses/dpo": 0.5666919350624084, "losses/sft": 1.5198816061019897, "losses/total": 0.5666919350624084, "ref_logps/chosen": -36.06470489501953, "ref_logps/rejected": -40.16011047363281, "rewards/accuracies": 0.6796875, "rewards/chosen": -0.9166238903999329, "rewards/margins": 0.4146617650985718, "rewards/rejected": -1.3312857151031494, "step": 338 }, { "epoch": 2.56, "grad_norm": 8.149761017487126, "learning_rate": 8.005617977528089e-08, "logps/chosen": -45.32318115234375, "logps/rejected": -51.304725646972656, "loss": 0.6056, "losses/dpo": 0.466902494430542, "losses/sft": 1.4729348421096802, "losses/total": 0.466902494430542, "ref_logps/chosen": -35.85576248168945, "ref_logps/rejected": -37.858970642089844, "rewards/accuracies": 0.671875, "rewards/chosen": -0.9467417597770691, "rewards/margins": 0.39783352613449097, "rewards/rejected": -1.34457528591156, "step": 339 }, { "epoch": 2.57, "grad_norm": 7.054066859896987, "learning_rate": 7.865168539325842e-08, "logps/chosen": -45.38795471191406, "logps/rejected": -57.93950653076172, "loss": 0.5182, "losses/dpo": 0.4842032194137573, "losses/sft": 1.6942293643951416, "losses/total": 0.4842032194137573, "ref_logps/chosen": -36.484046936035156, "ref_logps/rejected": -43.67852783203125, "rewards/accuracies": 0.765625, "rewards/chosen": -0.8903906345367432, "rewards/margins": 0.5357075333595276, "rewards/rejected": -1.426098108291626, "step": 340 }, { "epoch": 2.57, "grad_norm": 7.258837050647915, "learning_rate": 7.724719101123594e-08, "logps/chosen": -46.00672149658203, "logps/rejected": -59.08924865722656, "loss": 0.5373, "losses/dpo": 0.5424889326095581, "losses/sft": 1.6475489139556885, "losses/total": 0.5424889326095581, "ref_logps/chosen": -37.73750305175781, "ref_logps/rejected": -45.23866271972656, "rewards/accuracies": 0.7421875, "rewards/chosen": -0.826921820640564, "rewards/margins": 0.5581368207931519, "rewards/rejected": -1.3850586414337158, "step": 341 }, { "epoch": 2.58, "grad_norm": 7.988126049073018, "learning_rate": 7.584269662921348e-08, "logps/chosen": -46.84196472167969, "logps/rejected": -55.757198333740234, "loss": 0.5662, "losses/dpo": 0.353384792804718, "losses/sft": 1.717570424079895, "losses/total": 0.353384792804718, "ref_logps/chosen": -37.82433319091797, "ref_logps/rejected": -42.26597213745117, "rewards/accuracies": 0.75, "rewards/chosen": -0.9017627835273743, "rewards/margins": 0.44735997915267944, "rewards/rejected": -1.3491227626800537, "step": 342 }, { "epoch": 2.59, "grad_norm": 7.618340962447428, "learning_rate": 7.443820224719101e-08, "logps/chosen": -43.753684997558594, "logps/rejected": -55.105316162109375, "loss": 0.546, "losses/dpo": 0.6419227123260498, "losses/sft": 1.6892149448394775, "losses/total": 0.6419227123260498, "ref_logps/chosen": -35.38850402832031, "ref_logps/rejected": -41.928646087646484, "rewards/accuracies": 0.7578125, "rewards/chosen": -0.8365182876586914, "rewards/margins": 0.4811485707759857, "rewards/rejected": -1.3176668882369995, "step": 343 }, { "epoch": 2.6, "grad_norm": 7.5691903171304915, "learning_rate": 7.303370786516853e-08, "logps/chosen": -44.20778274536133, "logps/rejected": -55.97998046875, "loss": 0.5407, "losses/dpo": 0.5625724196434021, "losses/sft": 1.5753792524337769, "losses/total": 0.5625724196434021, "ref_logps/chosen": -35.950294494628906, "ref_logps/rejected": -42.38732147216797, "rewards/accuracies": 0.71875, "rewards/chosen": -0.8257489204406738, "rewards/margins": 0.5335172414779663, "rewards/rejected": -1.3592660427093506, "step": 344 }, { "epoch": 2.6, "grad_norm": 7.013411401019271, "learning_rate": 7.162921348314606e-08, "logps/chosen": -48.60981750488281, "logps/rejected": -61.60570526123047, "loss": 0.4779, "losses/dpo": 0.5409685373306274, "losses/sft": 1.6795134544372559, "losses/total": 0.5409685373306274, "ref_logps/chosen": -39.66438293457031, "ref_logps/rejected": -45.88689422607422, "rewards/accuracies": 0.8046875, "rewards/chosen": -0.8945437073707581, "rewards/margins": 0.677337646484375, "rewards/rejected": -1.5718812942504883, "step": 345 }, { "epoch": 2.61, "grad_norm": 7.235335525204532, "learning_rate": 7.022471910112359e-08, "logps/chosen": -40.0158805847168, "logps/rejected": -53.02748107910156, "loss": 0.5321, "losses/dpo": 0.5608981847763062, "losses/sft": 1.2928898334503174, "losses/total": 0.5608981847763062, "ref_logps/chosen": -32.48070526123047, "ref_logps/rejected": -40.139122009277344, "rewards/accuracies": 0.7109375, "rewards/chosen": -0.7535171508789062, "rewards/margins": 0.5353185534477234, "rewards/rejected": -1.2888355255126953, "step": 346 }, { "epoch": 2.62, "grad_norm": 7.561289554463479, "learning_rate": 6.882022471910112e-08, "logps/chosen": -45.8831787109375, "logps/rejected": -52.74605178833008, "loss": 0.5634, "losses/dpo": 0.5234625935554504, "losses/sft": 1.5698529481887817, "losses/total": 0.5234625935554504, "ref_logps/chosen": -36.858985900878906, "ref_logps/rejected": -39.08251953125, "rewards/accuracies": 0.71875, "rewards/chosen": -0.9024193286895752, "rewards/margins": 0.4639340043067932, "rewards/rejected": -1.3663533926010132, "step": 347 }, { "epoch": 2.63, "grad_norm": 7.714313510104845, "learning_rate": 6.741573033707864e-08, "logps/chosen": -47.23927307128906, "logps/rejected": -56.10950469970703, "loss": 0.5513, "losses/dpo": 0.5834592580795288, "losses/sft": 1.8191860914230347, "losses/total": 0.5834592580795288, "ref_logps/chosen": -38.12981414794922, "ref_logps/rejected": -41.886940002441406, "rewards/accuracies": 0.6953125, "rewards/chosen": -0.9109456539154053, "rewards/margins": 0.5113106966018677, "rewards/rejected": -1.422256350517273, "step": 348 }, { "epoch": 2.63, "grad_norm": 7.158492594820948, "learning_rate": 6.601123595505617e-08, "logps/chosen": -48.13493347167969, "logps/rejected": -60.77044677734375, "loss": 0.4977, "losses/dpo": 0.36096107959747314, "losses/sft": 1.417677640914917, "losses/total": 0.36096107959747314, "ref_logps/chosen": -39.71112823486328, "ref_logps/rejected": -45.55910110473633, "rewards/accuracies": 0.8203125, "rewards/chosen": -0.8423808813095093, "rewards/margins": 0.6787533164024353, "rewards/rejected": -1.5211341381072998, "step": 349 }, { "epoch": 2.64, "grad_norm": 7.665270223156107, "learning_rate": 6.460674157303371e-08, "logps/chosen": -45.67338562011719, "logps/rejected": -52.81538391113281, "loss": 0.5593, "losses/dpo": 0.5008495450019836, "losses/sft": 1.4033509492874146, "losses/total": 0.5008495450019836, "ref_logps/chosen": -37.30530548095703, "ref_logps/rejected": -39.90568542480469, "rewards/accuracies": 0.7421875, "rewards/chosen": -0.8368085026741028, "rewards/margins": 0.45416122674942017, "rewards/rejected": -1.290969729423523, "step": 350 }, { "epoch": 2.65, "grad_norm": 7.472808082581494, "learning_rate": 6.320224719101123e-08, "logps/chosen": -43.985435485839844, "logps/rejected": -57.295692443847656, "loss": 0.5187, "losses/dpo": 0.5680770874023438, "losses/sft": 1.4148482084274292, "losses/total": 0.5680770874023438, "ref_logps/chosen": -36.4133186340332, "ref_logps/rejected": -43.849090576171875, "rewards/accuracies": 0.765625, "rewards/chosen": -0.757211446762085, "rewards/margins": 0.5874490737915039, "rewards/rejected": -1.3446605205535889, "step": 351 }, { "epoch": 2.66, "grad_norm": 8.189112257010201, "learning_rate": 6.179775280898876e-08, "logps/chosen": -47.502281188964844, "logps/rejected": -54.84540939331055, "loss": 0.583, "losses/dpo": 0.5579338073730469, "losses/sft": 1.615804672241211, "losses/total": 0.5579338073730469, "ref_logps/chosen": -38.33686065673828, "ref_logps/rejected": -41.62626647949219, "rewards/accuracies": 0.6953125, "rewards/chosen": -0.9165424108505249, "rewards/margins": 0.40537166595458984, "rewards/rejected": -1.3219139575958252, "step": 352 }, { "epoch": 2.66, "grad_norm": 7.805410708655585, "learning_rate": 6.039325842696629e-08, "logps/chosen": -44.361324310302734, "logps/rejected": -59.17631149291992, "loss": 0.5472, "losses/dpo": 0.6015689373016357, "losses/sft": 1.6676236391067505, "losses/total": 0.6015689373016357, "ref_logps/chosen": -35.77091979980469, "ref_logps/rejected": -45.55202865600586, "rewards/accuracies": 0.7421875, "rewards/chosen": -0.8590403199195862, "rewards/margins": 0.5033884048461914, "rewards/rejected": -1.362428903579712, "step": 353 }, { "epoch": 2.67, "grad_norm": 8.028184259224918, "learning_rate": 5.898876404494382e-08, "logps/chosen": -46.5517463684082, "logps/rejected": -56.04482650756836, "loss": 0.5542, "losses/dpo": 0.6546050310134888, "losses/sft": 1.504585862159729, "losses/total": 0.6546050310134888, "ref_logps/chosen": -37.785274505615234, "ref_logps/rejected": -42.51100540161133, "rewards/accuracies": 0.7265625, "rewards/chosen": -0.8766471147537231, "rewards/margins": 0.47673481702804565, "rewards/rejected": -1.3533821105957031, "step": 354 }, { "epoch": 2.68, "grad_norm": 7.260620206463691, "learning_rate": 5.758426966292135e-08, "logps/chosen": -48.80982971191406, "logps/rejected": -56.19672393798828, "loss": 0.5515, "losses/dpo": 0.46307122707366943, "losses/sft": 1.685928463935852, "losses/total": 0.46307122707366943, "ref_logps/chosen": -40.1851806640625, "ref_logps/rejected": -43.02751159667969, "rewards/accuracies": 0.7734375, "rewards/chosen": -0.8624651432037354, "rewards/margins": 0.4544559121131897, "rewards/rejected": -1.3169212341308594, "step": 355 }, { "epoch": 2.69, "grad_norm": 7.136349320311039, "learning_rate": 5.617977528089887e-08, "logps/chosen": -41.40632629394531, "logps/rejected": -54.12514114379883, "loss": 0.5343, "losses/dpo": 0.45047110319137573, "losses/sft": 1.3219261169433594, "losses/total": 0.45047110319137573, "ref_logps/chosen": -33.34068298339844, "ref_logps/rejected": -40.50140380859375, "rewards/accuracies": 0.71875, "rewards/chosen": -0.8065648078918457, "rewards/margins": 0.555808961391449, "rewards/rejected": -1.3623738288879395, "step": 356 }, { "epoch": 2.69, "grad_norm": 7.456298216594317, "learning_rate": 5.47752808988764e-08, "logps/chosen": -44.206722259521484, "logps/rejected": -55.71735382080078, "loss": 0.5494, "losses/dpo": 0.4734205901622772, "losses/sft": 1.4844837188720703, "losses/total": 0.4734205901622772, "ref_logps/chosen": -35.47336959838867, "ref_logps/rejected": -41.92726516723633, "rewards/accuracies": 0.71875, "rewards/chosen": -0.8733350038528442, "rewards/margins": 0.5056736469268799, "rewards/rejected": -1.3790085315704346, "step": 357 }, { "epoch": 2.7, "grad_norm": 7.41987426694341, "learning_rate": 5.3370786516853926e-08, "logps/chosen": -46.22618865966797, "logps/rejected": -56.47550964355469, "loss": 0.5003, "losses/dpo": 0.562317430973053, "losses/sft": 1.491492509841919, "losses/total": 0.562317430973053, "ref_logps/chosen": -37.904022216796875, "ref_logps/rejected": -41.78309631347656, "rewards/accuracies": 0.8046875, "rewards/chosen": -0.8322172164916992, "rewards/margins": 0.6370242834091187, "rewards/rejected": -1.4692414999008179, "step": 358 }, { "epoch": 2.71, "grad_norm": 6.765690296642083, "learning_rate": 5.196629213483146e-08, "logps/chosen": -41.32649612426758, "logps/rejected": -55.117488861083984, "loss": 0.4906, "losses/dpo": 0.45937132835388184, "losses/sft": 1.3386218547821045, "losses/total": 0.45937132835388184, "ref_logps/chosen": -33.87388610839844, "ref_logps/rejected": -41.34483337402344, "rewards/accuracies": 0.7734375, "rewards/chosen": -0.7452608942985535, "rewards/margins": 0.6320046782493591, "rewards/rejected": -1.377265453338623, "step": 359 }, { "epoch": 2.72, "grad_norm": 7.95832621655637, "learning_rate": 5.056179775280899e-08, "logps/chosen": -44.83673858642578, "logps/rejected": -54.593666076660156, "loss": 0.5529, "losses/dpo": 0.5646368861198425, "losses/sft": 1.3903212547302246, "losses/total": 0.5646368861198425, "ref_logps/chosen": -36.184173583984375, "ref_logps/rejected": -41.40753173828125, "rewards/accuracies": 0.75, "rewards/chosen": -0.865256667137146, "rewards/margins": 0.45335638523101807, "rewards/rejected": -1.318613052368164, "step": 360 }, { "epoch": 2.72, "grad_norm": 8.359120516602266, "learning_rate": 4.915730337078652e-08, "logps/chosen": -48.39961624145508, "logps/rejected": -54.2540397644043, "loss": 0.5967, "losses/dpo": 0.7173389196395874, "losses/sft": 1.989745020866394, "losses/total": 0.7173389196395874, "ref_logps/chosen": -39.236839294433594, "ref_logps/rejected": -40.71720504760742, "rewards/accuracies": 0.6796875, "rewards/chosen": -0.9162774682044983, "rewards/margins": 0.4374057650566101, "rewards/rejected": -1.3536832332611084, "step": 361 }, { "epoch": 2.73, "grad_norm": 7.8949116422203645, "learning_rate": 4.775280898876404e-08, "logps/chosen": -45.1904296875, "logps/rejected": -55.9586181640625, "loss": 0.5313, "losses/dpo": 0.6307837963104248, "losses/sft": 1.725508213043213, "losses/total": 0.6307837963104248, "ref_logps/chosen": -36.340023040771484, "ref_logps/rejected": -41.507598876953125, "rewards/accuracies": 0.7578125, "rewards/chosen": -0.8850406408309937, "rewards/margins": 0.5600608587265015, "rewards/rejected": -1.4451014995574951, "step": 362 }, { "epoch": 2.74, "grad_norm": 7.439921856307659, "learning_rate": 4.634831460674157e-08, "logps/chosen": -47.467689514160156, "logps/rejected": -55.250770568847656, "loss": 0.5418, "losses/dpo": 0.6079765558242798, "losses/sft": 1.8188178539276123, "losses/total": 0.6079765558242798, "ref_logps/chosen": -38.65827560424805, "ref_logps/rejected": -41.087459564208984, "rewards/accuracies": 0.6796875, "rewards/chosen": -0.8809411525726318, "rewards/margins": 0.53538978099823, "rewards/rejected": -1.4163308143615723, "step": 363 }, { "epoch": 2.75, "grad_norm": 7.250730071839225, "learning_rate": 4.4943820224719096e-08, "logps/chosen": -42.77532196044922, "logps/rejected": -59.660240173339844, "loss": 0.4573, "losses/dpo": 0.4384981393814087, "losses/sft": 1.4787318706512451, "losses/total": 0.4384981393814087, "ref_logps/chosen": -35.051734924316406, "ref_logps/rejected": -44.746734619140625, "rewards/accuracies": 0.8671875, "rewards/chosen": -0.7723584175109863, "rewards/margins": 0.7189919948577881, "rewards/rejected": -1.4913504123687744, "step": 364 }, { "epoch": 2.75, "grad_norm": 8.23044878029811, "learning_rate": 4.3539325842696626e-08, "logps/chosen": -49.66007995605469, "logps/rejected": -60.10342025756836, "loss": 0.5469, "losses/dpo": 0.5084363222122192, "losses/sft": 1.8791687488555908, "losses/total": 0.5084363222122192, "ref_logps/chosen": -40.346317291259766, "ref_logps/rejected": -45.41026306152344, "rewards/accuracies": 0.71875, "rewards/chosen": -0.9313763380050659, "rewards/margins": 0.5379395484924316, "rewards/rejected": -1.469315767288208, "step": 365 }, { "epoch": 2.76, "grad_norm": 7.298326331639276, "learning_rate": 4.213483146067416e-08, "logps/chosen": -48.57288360595703, "logps/rejected": -57.29835510253906, "loss": 0.5395, "losses/dpo": 0.44340649247169495, "losses/sft": 1.5243843793869019, "losses/total": 0.44340649247169495, "ref_logps/chosen": -39.81139373779297, "ref_logps/rejected": -43.033912658691406, "rewards/accuracies": 0.78125, "rewards/chosen": -0.8761484622955322, "rewards/margins": 0.5502957701683044, "rewards/rejected": -1.4264442920684814, "step": 366 }, { "epoch": 2.77, "grad_norm": 8.51804253270616, "learning_rate": 4.073033707865169e-08, "logps/chosen": -44.41961669921875, "logps/rejected": -53.89155578613281, "loss": 0.5723, "losses/dpo": 0.5301268100738525, "losses/sft": 1.8131489753723145, "losses/total": 0.5301268100738525, "ref_logps/chosen": -35.50189971923828, "ref_logps/rejected": -40.51585388183594, "rewards/accuracies": 0.75, "rewards/chosen": -0.8917717933654785, "rewards/margins": 0.44579851627349854, "rewards/rejected": -1.337570309638977, "step": 367 }, { "epoch": 2.78, "grad_norm": 7.54105725557247, "learning_rate": 3.932584269662921e-08, "logps/chosen": -41.12848663330078, "logps/rejected": -55.290313720703125, "loss": 0.54, "losses/dpo": 0.49226510524749756, "losses/sft": 1.37047278881073, "losses/total": 0.49226510524749756, "ref_logps/chosen": -32.751502990722656, "ref_logps/rejected": -41.476318359375, "rewards/accuracies": 0.75, "rewards/chosen": -0.8376982808113098, "rewards/margins": 0.5437013506889343, "rewards/rejected": -1.3813996315002441, "step": 368 }, { "epoch": 2.78, "grad_norm": 7.706900297271427, "learning_rate": 3.792134831460674e-08, "logps/chosen": -45.84465408325195, "logps/rejected": -56.17218780517578, "loss": 0.5373, "losses/dpo": 0.5797220468521118, "losses/sft": 1.6374412775039673, "losses/total": 0.5797220468521118, "ref_logps/chosen": -36.70783233642578, "ref_logps/rejected": -41.77714538574219, "rewards/accuracies": 0.734375, "rewards/chosen": -0.913682222366333, "rewards/margins": 0.5258220434188843, "rewards/rejected": -1.4395041465759277, "step": 369 }, { "epoch": 2.79, "grad_norm": 7.7547810769646555, "learning_rate": 3.6516853932584266e-08, "logps/chosen": -42.759098052978516, "logps/rejected": -52.87897491455078, "loss": 0.5836, "losses/dpo": 0.7289267778396606, "losses/sft": 1.7013481855392456, "losses/total": 0.7289267778396606, "ref_logps/chosen": -34.204769134521484, "ref_logps/rejected": -40.38142776489258, "rewards/accuracies": 0.7109375, "rewards/chosen": -0.85543292760849, "rewards/margins": 0.3943214416503906, "rewards/rejected": -1.2497543096542358, "step": 370 }, { "epoch": 2.8, "grad_norm": 7.07250671481464, "learning_rate": 3.5112359550561796e-08, "logps/chosen": -45.09293746948242, "logps/rejected": -56.41200256347656, "loss": 0.5025, "losses/dpo": 0.42966747283935547, "losses/sft": 1.5621216297149658, "losses/total": 0.42966747283935547, "ref_logps/chosen": -37.37934494018555, "ref_logps/rejected": -42.70643615722656, "rewards/accuracies": 0.78125, "rewards/chosen": -0.7713593244552612, "rewards/margins": 0.599197506904602, "rewards/rejected": -1.3705568313598633, "step": 371 }, { "epoch": 2.81, "grad_norm": 8.204140371424504, "learning_rate": 3.370786516853932e-08, "logps/chosen": -48.024269104003906, "logps/rejected": -57.5866584777832, "loss": 0.5389, "losses/dpo": 0.5919984579086304, "losses/sft": 1.4933536052703857, "losses/total": 0.5919984579086304, "ref_logps/chosen": -39.3967399597168, "ref_logps/rejected": -43.82619857788086, "rewards/accuracies": 0.7734375, "rewards/chosen": -0.8627532124519348, "rewards/margins": 0.5132932662963867, "rewards/rejected": -1.3760464191436768, "step": 372 }, { "epoch": 2.82, "grad_norm": 8.113226187403898, "learning_rate": 3.230337078651686e-08, "logps/chosen": -46.628257751464844, "logps/rejected": -62.4483642578125, "loss": 0.4997, "losses/dpo": 0.4237878918647766, "losses/sft": 1.5488381385803223, "losses/total": 0.4237878918647766, "ref_logps/chosen": -38.190887451171875, "ref_logps/rejected": -47.450843811035156, "rewards/accuracies": 0.7890625, "rewards/chosen": -0.8437370657920837, "rewards/margins": 0.6560153961181641, "rewards/rejected": -1.4997525215148926, "step": 373 }, { "epoch": 2.82, "grad_norm": 7.437009897432824, "learning_rate": 3.089887640449438e-08, "logps/chosen": -44.36549377441406, "logps/rejected": -59.504005432128906, "loss": 0.4967, "losses/dpo": 0.42525550723075867, "losses/sft": 1.5591559410095215, "losses/total": 0.42525550723075867, "ref_logps/chosen": -35.93299102783203, "ref_logps/rejected": -44.88318634033203, "rewards/accuracies": 0.7890625, "rewards/chosen": -0.8432497978210449, "rewards/margins": 0.6188317537307739, "rewards/rejected": -1.4620814323425293, "step": 374 }, { "epoch": 2.83, "grad_norm": 6.665463460188975, "learning_rate": 2.949438202247191e-08, "logps/chosen": -42.69816970825195, "logps/rejected": -59.00178909301758, "loss": 0.4858, "losses/dpo": 0.40338996052742004, "losses/sft": 1.7176090478897095, "losses/total": 0.40338996052742004, "ref_logps/chosen": -34.84870147705078, "ref_logps/rejected": -44.43999481201172, "rewards/accuracies": 0.8203125, "rewards/chosen": -0.7849469184875488, "rewards/margins": 0.6712321639060974, "rewards/rejected": -1.456179141998291, "step": 375 }, { "epoch": 2.84, "grad_norm": 7.288923139558504, "learning_rate": 2.8089887640449436e-08, "logps/chosen": -47.2071533203125, "logps/rejected": -58.098148345947266, "loss": 0.5076, "losses/dpo": 0.5357474088668823, "losses/sft": 1.654085636138916, "losses/total": 0.5357474088668823, "ref_logps/chosen": -38.56999588012695, "ref_logps/rejected": -43.37090301513672, "rewards/accuracies": 0.7734375, "rewards/chosen": -0.8637155890464783, "rewards/margins": 0.6090089678764343, "rewards/rejected": -1.4727245569229126, "step": 376 }, { "epoch": 2.85, "grad_norm": 7.551625004814956, "learning_rate": 2.6685393258426963e-08, "logps/chosen": -45.72412109375, "logps/rejected": -56.43421173095703, "loss": 0.5474, "losses/dpo": 0.544715404510498, "losses/sft": 1.5618551969528198, "losses/total": 0.544715404510498, "ref_logps/chosen": -37.27866744995117, "ref_logps/rejected": -42.84328079223633, "rewards/accuracies": 0.75, "rewards/chosen": -0.8445456027984619, "rewards/margins": 0.5145478248596191, "rewards/rejected": -1.359093427658081, "step": 377 }, { "epoch": 2.85, "grad_norm": 8.025864212794117, "learning_rate": 2.5280898876404493e-08, "logps/chosen": -45.621158599853516, "logps/rejected": -60.68471145629883, "loss": 0.5285, "losses/dpo": 0.5538164377212524, "losses/sft": 1.5718330144882202, "losses/total": 0.5538164377212524, "ref_logps/chosen": -36.82067108154297, "ref_logps/rejected": -46.61594009399414, "rewards/accuracies": 0.8125, "rewards/chosen": -0.8800492286682129, "rewards/margins": 0.5268282890319824, "rewards/rejected": -1.4068775177001953, "step": 378 }, { "epoch": 2.86, "grad_norm": 7.047129440366168, "learning_rate": 2.387640449438202e-08, "logps/chosen": -44.4951057434082, "logps/rejected": -50.4869499206543, "loss": 0.549, "losses/dpo": 0.490747332572937, "losses/sft": 1.6444151401519775, "losses/total": 0.490747332572937, "ref_logps/chosen": -36.696903228759766, "ref_logps/rejected": -37.55984878540039, "rewards/accuracies": 0.671875, "rewards/chosen": -0.7798205614089966, "rewards/margins": 0.5128894448280334, "rewards/rejected": -1.2927099466323853, "step": 379 }, { "epoch": 2.87, "grad_norm": 7.471899506757266, "learning_rate": 2.2471910112359548e-08, "logps/chosen": -47.25148391723633, "logps/rejected": -59.717864990234375, "loss": 0.5358, "losses/dpo": 0.6100134253501892, "losses/sft": 1.9196665287017822, "losses/total": 0.6100134253501892, "ref_logps/chosen": -37.96227264404297, "ref_logps/rejected": -44.814449310302734, "rewards/accuracies": 0.7109375, "rewards/chosen": -0.9289212226867676, "rewards/margins": 0.5614204406738281, "rewards/rejected": -1.4903416633605957, "step": 380 }, { "epoch": 2.88, "grad_norm": 8.004493640627455, "learning_rate": 2.106741573033708e-08, "logps/chosen": -44.516780853271484, "logps/rejected": -51.935089111328125, "loss": 0.6119, "losses/dpo": 0.556452751159668, "losses/sft": 1.4079639911651611, "losses/total": 0.556452751159668, "ref_logps/chosen": -35.908897399902344, "ref_logps/rejected": -39.84672546386719, "rewards/accuracies": 0.6640625, "rewards/chosen": -0.8607881665229797, "rewards/margins": 0.34804895520210266, "rewards/rejected": -1.2088370323181152, "step": 381 }, { "epoch": 2.88, "grad_norm": 7.393197706656567, "learning_rate": 1.9662921348314606e-08, "logps/chosen": -44.82762908935547, "logps/rejected": -59.140480041503906, "loss": 0.509, "losses/dpo": 0.420447438955307, "losses/sft": 1.7410156726837158, "losses/total": 0.420447438955307, "ref_logps/chosen": -36.08792495727539, "ref_logps/rejected": -43.82966613769531, "rewards/accuracies": 0.734375, "rewards/chosen": -0.8739705085754395, "rewards/margins": 0.6571108102798462, "rewards/rejected": -1.5310813188552856, "step": 382 }, { "epoch": 2.89, "grad_norm": 7.992863219139361, "learning_rate": 1.8258426966292133e-08, "logps/chosen": -45.79706573486328, "logps/rejected": -53.040687561035156, "loss": 0.5962, "losses/dpo": 0.6750953197479248, "losses/sft": 1.7228975296020508, "losses/total": 0.6750953197479248, "ref_logps/chosen": -37.67970657348633, "ref_logps/rejected": -40.953521728515625, "rewards/accuracies": 0.7109375, "rewards/chosen": -0.8117363452911377, "rewards/margins": 0.3969798684120178, "rewards/rejected": -1.2087161540985107, "step": 383 }, { "epoch": 2.9, "grad_norm": 7.487738517511007, "learning_rate": 1.685393258426966e-08, "logps/chosen": -45.35090637207031, "logps/rejected": -57.14335632324219, "loss": 0.5354, "losses/dpo": 0.5379496812820435, "losses/sft": 1.6705958843231201, "losses/total": 0.5379496812820435, "ref_logps/chosen": -37.31737518310547, "ref_logps/rejected": -43.805870056152344, "rewards/accuracies": 0.7265625, "rewards/chosen": -0.8033530712127686, "rewards/margins": 0.5303957462310791, "rewards/rejected": -1.3337488174438477, "step": 384 }, { "epoch": 2.91, "grad_norm": 8.22102430010328, "learning_rate": 1.544943820224719e-08, "logps/chosen": -47.1776237487793, "logps/rejected": -54.27086639404297, "loss": 0.5733, "losses/dpo": 0.5613248348236084, "losses/sft": 1.773917317390442, "losses/total": 0.5613248348236084, "ref_logps/chosen": -38.05524444580078, "ref_logps/rejected": -40.46519470214844, "rewards/accuracies": 0.6640625, "rewards/chosen": -0.9122380614280701, "rewards/margins": 0.4683291018009186, "rewards/rejected": -1.3805670738220215, "step": 385 }, { "epoch": 2.91, "grad_norm": 7.908569868218082, "learning_rate": 1.4044943820224718e-08, "logps/chosen": -44.001075744628906, "logps/rejected": -60.508758544921875, "loss": 0.5285, "losses/dpo": 0.5084520578384399, "losses/sft": 1.5907535552978516, "losses/total": 0.5084520578384399, "ref_logps/chosen": -34.90777587890625, "ref_logps/rejected": -45.287864685058594, "rewards/accuracies": 0.703125, "rewards/chosen": -0.9093303084373474, "rewards/margins": 0.6127593517303467, "rewards/rejected": -1.5220897197723389, "step": 386 }, { "epoch": 2.92, "grad_norm": 7.329445132361356, "learning_rate": 1.2640449438202247e-08, "logps/chosen": -46.82018280029297, "logps/rejected": -53.613643646240234, "loss": 0.521, "losses/dpo": 0.5018836259841919, "losses/sft": 1.6243071556091309, "losses/total": 0.5018836259841919, "ref_logps/chosen": -38.469993591308594, "ref_logps/rejected": -39.96007537841797, "rewards/accuracies": 0.7890625, "rewards/chosen": -0.8350194692611694, "rewards/margins": 0.5303376913070679, "rewards/rejected": -1.3653571605682373, "step": 387 }, { "epoch": 2.93, "grad_norm": 7.4822766342943225, "learning_rate": 1.1235955056179774e-08, "logps/chosen": -46.926666259765625, "logps/rejected": -55.5013427734375, "loss": 0.5439, "losses/dpo": 0.575495183467865, "losses/sft": 1.3514134883880615, "losses/total": 0.575495183467865, "ref_logps/chosen": -38.2935905456543, "ref_logps/rejected": -42.219791412353516, "rewards/accuracies": 0.78125, "rewards/chosen": -0.8633076548576355, "rewards/margins": 0.46484747529029846, "rewards/rejected": -1.3281550407409668, "step": 388 }, { "epoch": 2.94, "grad_norm": 7.481721913520452, "learning_rate": 9.831460674157303e-09, "logps/chosen": -46.69519805908203, "logps/rejected": -55.18059158325195, "loss": 0.5312, "losses/dpo": 0.5392994284629822, "losses/sft": 2.022167682647705, "losses/total": 0.5392994284629822, "ref_logps/chosen": -38.2025146484375, "ref_logps/rejected": -41.62324905395508, "rewards/accuracies": 0.7578125, "rewards/chosen": -0.8492681384086609, "rewards/margins": 0.506466269493103, "rewards/rejected": -1.3557343482971191, "step": 389 }, { "epoch": 2.94, "grad_norm": 8.860083156452712, "learning_rate": 8.42696629213483e-09, "logps/chosen": -47.0518798828125, "logps/rejected": -56.05253601074219, "loss": 0.6151, "losses/dpo": 0.8160465955734253, "losses/sft": 1.661864161491394, "losses/total": 0.8160465955734253, "ref_logps/chosen": -37.67930603027344, "ref_logps/rejected": -42.802921295166016, "rewards/accuracies": 0.6484375, "rewards/chosen": -0.9372565746307373, "rewards/margins": 0.3877047896385193, "rewards/rejected": -1.3249614238739014, "step": 390 }, { "epoch": 2.95, "grad_norm": 7.697893962559924, "learning_rate": 7.022471910112359e-09, "logps/chosen": -46.420570373535156, "logps/rejected": -56.345977783203125, "loss": 0.5154, "losses/dpo": 0.5586492419242859, "losses/sft": 1.621840476989746, "losses/total": 0.5586492419242859, "ref_logps/chosen": -38.219852447509766, "ref_logps/rejected": -42.38871765136719, "rewards/accuracies": 0.7890625, "rewards/chosen": -0.8200712203979492, "rewards/margins": 0.5756551027297974, "rewards/rejected": -1.395726203918457, "step": 391 }, { "epoch": 2.96, "grad_norm": 7.398815606595402, "learning_rate": 5.617977528089887e-09, "logps/chosen": -45.353294372558594, "logps/rejected": -56.47963333129883, "loss": 0.5242, "losses/dpo": 0.5106035470962524, "losses/sft": 1.4234966039657593, "losses/total": 0.5106035470962524, "ref_logps/chosen": -36.788330078125, "ref_logps/rejected": -42.60810852050781, "rewards/accuracies": 0.7421875, "rewards/chosen": -0.8564971089363098, "rewards/margins": 0.5306553244590759, "rewards/rejected": -1.3871524333953857, "step": 392 }, { "epoch": 2.97, "grad_norm": 7.0449225391612895, "learning_rate": 4.213483146067415e-09, "logps/chosen": -44.40395736694336, "logps/rejected": -53.697776794433594, "loss": 0.5379, "losses/dpo": 0.5200778841972351, "losses/sft": 1.9024913311004639, "losses/total": 0.5200778841972351, "ref_logps/chosen": -36.2678337097168, "ref_logps/rejected": -40.272247314453125, "rewards/accuracies": 0.71875, "rewards/chosen": -0.8136123418807983, "rewards/margins": 0.5289404392242432, "rewards/rejected": -1.3425527811050415, "step": 393 }, { "epoch": 2.97, "grad_norm": 7.8920504953670525, "learning_rate": 2.8089887640449435e-09, "logps/chosen": -45.78767776489258, "logps/rejected": -58.19701385498047, "loss": 0.5882, "losses/dpo": 0.5196930170059204, "losses/sft": 1.4936178922653198, "losses/total": 0.5196930170059204, "ref_logps/chosen": -36.467750549316406, "ref_logps/rejected": -44.57653045654297, "rewards/accuracies": 0.71875, "rewards/chosen": -0.9319925904273987, "rewards/margins": 0.43005576729774475, "rewards/rejected": -1.3620483875274658, "step": 394 }, { "epoch": 2.98, "grad_norm": 6.749020955821219, "learning_rate": 1.4044943820224717e-09, "logps/chosen": -43.89699935913086, "logps/rejected": -53.18260955810547, "loss": 0.5308, "losses/dpo": 0.7003037333488464, "losses/sft": 1.696626901626587, "losses/total": 0.7003037333488464, "ref_logps/chosen": -35.854217529296875, "ref_logps/rejected": -39.928733825683594, "rewards/accuracies": 0.765625, "rewards/chosen": -0.8042781949043274, "rewards/margins": 0.5211097002029419, "rewards/rejected": -1.3253878355026245, "step": 395 }, { "epoch": 2.99, "grad_norm": 8.026771636400738, "learning_rate": 0.0, "logps/chosen": -48.76679229736328, "logps/rejected": -59.84498596191406, "loss": 0.5292, "losses/dpo": 0.46332383155822754, "losses/sft": 1.643686056137085, "losses/total": 0.46332383155822754, "ref_logps/chosen": -39.41192626953125, "ref_logps/rejected": -44.815162658691406, "rewards/accuracies": 0.734375, "rewards/chosen": -0.935486912727356, "rewards/margins": 0.567494809627533, "rewards/rejected": -1.5029817819595337, "step": 396 }, { "epoch": 2.99, "step": 396, "total_flos": 0.0, "train_loss": 0.6025580500412469, "train_runtime": 11600.2001, "train_samples_per_second": 4.386, "train_steps_per_second": 0.034 } ], "logging_steps": 1.0, "max_steps": 396, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 70, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }