{ "best_metric": 0.8609164953231812, "best_model_checkpoint": "/home/coder/qwen2/checkpoint-15900", "epoch": 0.9999803783832573, "eval_steps": 300, "global_step": 15926, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.018836752073023807, "grad_norm": 0.2832883298397064, "learning_rate": 0.00019629420262546324, "loss": 1.1403, "step": 300 }, { "epoch": 0.018836752073023807, "eval_loss": 1.0671226978302002, "eval_runtime": 300.1436, "eval_samples_per_second": 17.155, "eval_steps_per_second": 2.146, "step": 300 }, { "epoch": 0.037673504146047615, "grad_norm": 0.33913707733154297, "learning_rate": 0.00019252559512593432, "loss": 1.0551, "step": 600 }, { "epoch": 0.037673504146047615, "eval_loss": 1.0309056043624878, "eval_runtime": 300.0852, "eval_samples_per_second": 17.158, "eval_steps_per_second": 2.146, "step": 600 }, { "epoch": 0.05651025621907143, "grad_norm": 0.30800801515579224, "learning_rate": 0.0001887569876264054, "loss": 1.0185, "step": 900 }, { "epoch": 0.05651025621907143, "eval_loss": 1.0096280574798584, "eval_runtime": 299.931, "eval_samples_per_second": 17.167, "eval_steps_per_second": 2.147, "step": 900 }, { "epoch": 0.07534700829209523, "grad_norm": 0.317748099565506, "learning_rate": 0.00018498838012687648, "loss": 0.9952, "step": 1200 }, { "epoch": 0.07534700829209523, "eval_loss": 0.9937332272529602, "eval_runtime": 299.9997, "eval_samples_per_second": 17.163, "eval_steps_per_second": 2.147, "step": 1200 }, { "epoch": 0.09418376036511904, "grad_norm": 0.36215266585350037, "learning_rate": 0.00018121977262734753, "loss": 0.9931, "step": 1500 }, { "epoch": 0.09418376036511904, "eval_loss": 0.9828254580497742, "eval_runtime": 299.9242, "eval_samples_per_second": 17.168, "eval_steps_per_second": 2.147, "step": 1500 }, { "epoch": 0.11302051243814286, "grad_norm": 0.3347627520561218, "learning_rate": 0.0001774511651278186, "loss": 0.9763, "step": 1800 }, { "epoch": 0.11302051243814286, "eval_loss": 0.972442626953125, "eval_runtime": 299.8563, "eval_samples_per_second": 17.172, "eval_steps_per_second": 2.148, "step": 1800 }, { "epoch": 0.13185726451116667, "grad_norm": 0.319148451089859, "learning_rate": 0.0001736825576282897, "loss": 0.9727, "step": 2100 }, { "epoch": 0.13185726451116667, "eval_loss": 0.965033769607544, "eval_runtime": 299.797, "eval_samples_per_second": 17.175, "eval_steps_per_second": 2.148, "step": 2100 }, { "epoch": 0.15069401658419046, "grad_norm": 0.2830144166946411, "learning_rate": 0.00016991395012876077, "loss": 0.9674, "step": 2400 }, { "epoch": 0.15069401658419046, "eval_loss": 0.9570498466491699, "eval_runtime": 300.0087, "eval_samples_per_second": 17.163, "eval_steps_per_second": 2.147, "step": 2400 }, { "epoch": 0.16953076865721428, "grad_norm": 0.30889859795570374, "learning_rate": 0.00016614534262923185, "loss": 0.9601, "step": 2700 }, { "epoch": 0.16953076865721428, "eval_loss": 0.9515209197998047, "eval_runtime": 299.9801, "eval_samples_per_second": 17.164, "eval_steps_per_second": 2.147, "step": 2700 }, { "epoch": 0.18836752073023807, "grad_norm": 0.33287131786346436, "learning_rate": 0.0001623767351297029, "loss": 0.944, "step": 3000 }, { "epoch": 0.18836752073023807, "eval_loss": 0.9454107284545898, "eval_runtime": 299.9554, "eval_samples_per_second": 17.166, "eval_steps_per_second": 2.147, "step": 3000 }, { "epoch": 0.2072042728032619, "grad_norm": 0.31519943475723267, "learning_rate": 0.00015860812763017398, "loss": 0.9462, "step": 3300 }, { "epoch": 0.2072042728032619, "eval_loss": 0.9404099583625793, "eval_runtime": 299.9447, "eval_samples_per_second": 17.166, "eval_steps_per_second": 2.147, "step": 3300 }, { "epoch": 0.22604102487628572, "grad_norm": 0.315909743309021, "learning_rate": 0.00015483952013064506, "loss": 0.9326, "step": 3600 }, { "epoch": 0.22604102487628572, "eval_loss": 0.9352145195007324, "eval_runtime": 299.9991, "eval_samples_per_second": 17.163, "eval_steps_per_second": 2.147, "step": 3600 }, { "epoch": 0.2448777769493095, "grad_norm": 0.2997918128967285, "learning_rate": 0.00015107091263111613, "loss": 0.9298, "step": 3900 }, { "epoch": 0.2448777769493095, "eval_loss": 0.9315630793571472, "eval_runtime": 299.95, "eval_samples_per_second": 17.166, "eval_steps_per_second": 2.147, "step": 3900 }, { "epoch": 0.26371452902233333, "grad_norm": 0.2730711102485657, "learning_rate": 0.0001473023051315872, "loss": 0.9345, "step": 4200 }, { "epoch": 0.26371452902233333, "eval_loss": 0.9274590015411377, "eval_runtime": 300.1027, "eval_samples_per_second": 17.157, "eval_steps_per_second": 2.146, "step": 4200 }, { "epoch": 0.28255128109535715, "grad_norm": 0.27338674664497375, "learning_rate": 0.0001435336976320583, "loss": 0.9378, "step": 4500 }, { "epoch": 0.28255128109535715, "eval_loss": 0.9236754775047302, "eval_runtime": 299.988, "eval_samples_per_second": 17.164, "eval_steps_per_second": 2.147, "step": 4500 }, { "epoch": 0.3013880331683809, "grad_norm": 0.3145460784435272, "learning_rate": 0.00013976509013252937, "loss": 0.9235, "step": 4800 }, { "epoch": 0.3013880331683809, "eval_loss": 0.9199886322021484, "eval_runtime": 299.9923, "eval_samples_per_second": 17.164, "eval_steps_per_second": 2.147, "step": 4800 }, { "epoch": 0.32022478524140474, "grad_norm": 0.2656671702861786, "learning_rate": 0.00013599648263300045, "loss": 0.9207, "step": 5100 }, { "epoch": 0.32022478524140474, "eval_loss": 0.9165565371513367, "eval_runtime": 299.8304, "eval_samples_per_second": 17.173, "eval_steps_per_second": 2.148, "step": 5100 }, { "epoch": 0.33906153731442856, "grad_norm": 0.2907351851463318, "learning_rate": 0.00013222787513347153, "loss": 0.9149, "step": 5400 }, { "epoch": 0.33906153731442856, "eval_loss": 0.9132035374641418, "eval_runtime": 299.8597, "eval_samples_per_second": 17.171, "eval_steps_per_second": 2.148, "step": 5400 }, { "epoch": 0.3578982893874524, "grad_norm": 0.39790818095207214, "learning_rate": 0.0001284592676339426, "loss": 0.9063, "step": 5700 }, { "epoch": 0.3578982893874524, "eval_loss": 0.9105966687202454, "eval_runtime": 300.0175, "eval_samples_per_second": 17.162, "eval_steps_per_second": 2.147, "step": 5700 }, { "epoch": 0.37673504146047615, "grad_norm": 0.3338871896266937, "learning_rate": 0.00012469066013441369, "loss": 0.9046, "step": 6000 }, { "epoch": 0.37673504146047615, "eval_loss": 0.9074862003326416, "eval_runtime": 299.8956, "eval_samples_per_second": 17.169, "eval_steps_per_second": 2.147, "step": 6000 }, { "epoch": 0.39557179353349997, "grad_norm": 0.2925800383090973, "learning_rate": 0.00012092205263488474, "loss": 0.907, "step": 6300 }, { "epoch": 0.39557179353349997, "eval_loss": 0.9044873118400574, "eval_runtime": 300.0697, "eval_samples_per_second": 17.159, "eval_steps_per_second": 2.146, "step": 6300 }, { "epoch": 0.4144085456065238, "grad_norm": 0.34801357984542847, "learning_rate": 0.00011715344513535582, "loss": 0.9042, "step": 6600 }, { "epoch": 0.4144085456065238, "eval_loss": 0.9019830822944641, "eval_runtime": 299.9421, "eval_samples_per_second": 17.167, "eval_steps_per_second": 2.147, "step": 6600 }, { "epoch": 0.4332452976795476, "grad_norm": 0.3444356918334961, "learning_rate": 0.0001133848376358269, "loss": 0.9019, "step": 6900 }, { "epoch": 0.4332452976795476, "eval_loss": 0.8995754718780518, "eval_runtime": 299.921, "eval_samples_per_second": 17.168, "eval_steps_per_second": 2.147, "step": 6900 }, { "epoch": 0.45208204975257144, "grad_norm": 0.3366526961326599, "learning_rate": 0.00010961623013629799, "loss": 0.9041, "step": 7200 }, { "epoch": 0.45208204975257144, "eval_loss": 0.8974488973617554, "eval_runtime": 299.9169, "eval_samples_per_second": 17.168, "eval_steps_per_second": 2.147, "step": 7200 }, { "epoch": 0.4709188018255952, "grad_norm": 0.34138697385787964, "learning_rate": 0.00010584762263676907, "loss": 0.9001, "step": 7500 }, { "epoch": 0.4709188018255952, "eval_loss": 0.8951303958892822, "eval_runtime": 299.8867, "eval_samples_per_second": 17.17, "eval_steps_per_second": 2.147, "step": 7500 }, { "epoch": 0.489755553898619, "grad_norm": 0.35338446497917175, "learning_rate": 0.00010207901513724012, "loss": 0.8962, "step": 7800 }, { "epoch": 0.489755553898619, "eval_loss": 0.8931267261505127, "eval_runtime": 300.0707, "eval_samples_per_second": 17.159, "eval_steps_per_second": 2.146, "step": 7800 }, { "epoch": 0.5085923059716428, "grad_norm": 0.33024904131889343, "learning_rate": 9.83104076377112e-05, "loss": 0.901, "step": 8100 }, { "epoch": 0.5085923059716428, "eval_loss": 0.8908406496047974, "eval_runtime": 300.0908, "eval_samples_per_second": 17.158, "eval_steps_per_second": 2.146, "step": 8100 }, { "epoch": 0.5274290580446667, "grad_norm": 0.30269181728363037, "learning_rate": 9.454180013818228e-05, "loss": 0.8886, "step": 8400 }, { "epoch": 0.5274290580446667, "eval_loss": 0.8892831802368164, "eval_runtime": 299.9819, "eval_samples_per_second": 17.164, "eval_steps_per_second": 2.147, "step": 8400 }, { "epoch": 0.5462658101176905, "grad_norm": 0.32455185055732727, "learning_rate": 9.077319263865335e-05, "loss": 0.8823, "step": 8700 }, { "epoch": 0.5462658101176905, "eval_loss": 0.887444019317627, "eval_runtime": 300.1485, "eval_samples_per_second": 17.155, "eval_steps_per_second": 2.146, "step": 8700 }, { "epoch": 0.5651025621907143, "grad_norm": 0.32726097106933594, "learning_rate": 8.700458513912443e-05, "loss": 0.8773, "step": 9000 }, { "epoch": 0.5651025621907143, "eval_loss": 0.885347306728363, "eval_runtime": 300.0218, "eval_samples_per_second": 17.162, "eval_steps_per_second": 2.147, "step": 9000 }, { "epoch": 0.5839393142637381, "grad_norm": 0.3211737275123596, "learning_rate": 8.323597763959551e-05, "loss": 0.8876, "step": 9300 }, { "epoch": 0.5839393142637381, "eval_loss": 0.8837311267852783, "eval_runtime": 299.8508, "eval_samples_per_second": 17.172, "eval_steps_per_second": 2.148, "step": 9300 }, { "epoch": 0.6027760663367618, "grad_norm": 0.3470586836338043, "learning_rate": 7.946737014006658e-05, "loss": 0.888, "step": 9600 }, { "epoch": 0.6027760663367618, "eval_loss": 0.8818086981773376, "eval_runtime": 299.9724, "eval_samples_per_second": 17.165, "eval_steps_per_second": 2.147, "step": 9600 }, { "epoch": 0.6216128184097857, "grad_norm": 0.3012458384037018, "learning_rate": 7.569876264053766e-05, "loss": 0.8833, "step": 9900 }, { "epoch": 0.6216128184097857, "eval_loss": 0.8803924322128296, "eval_runtime": 300.0969, "eval_samples_per_second": 17.158, "eval_steps_per_second": 2.146, "step": 9900 }, { "epoch": 0.6404495704828095, "grad_norm": 0.32445794343948364, "learning_rate": 7.194271716600717e-05, "loss": 0.8841, "step": 10200 }, { "epoch": 0.6404495704828095, "eval_loss": 0.8785931468009949, "eval_runtime": 299.8302, "eval_samples_per_second": 17.173, "eval_steps_per_second": 2.148, "step": 10200 }, { "epoch": 0.6592863225558333, "grad_norm": 0.33264926075935364, "learning_rate": 6.817410966647824e-05, "loss": 0.8852, "step": 10500 }, { "epoch": 0.6592863225558333, "eval_loss": 0.8771566152572632, "eval_runtime": 300.0665, "eval_samples_per_second": 17.16, "eval_steps_per_second": 2.146, "step": 10500 }, { "epoch": 0.6781230746288571, "grad_norm": 0.3084549307823181, "learning_rate": 6.440550216694932e-05, "loss": 0.8793, "step": 10800 }, { "epoch": 0.6781230746288571, "eval_loss": 0.8756351470947266, "eval_runtime": 300.155, "eval_samples_per_second": 17.154, "eval_steps_per_second": 2.146, "step": 10800 }, { "epoch": 0.696959826701881, "grad_norm": 0.3315499722957611, "learning_rate": 6.0636894667420396e-05, "loss": 0.8687, "step": 11100 }, { "epoch": 0.696959826701881, "eval_loss": 0.8744714260101318, "eval_runtime": 300.5183, "eval_samples_per_second": 17.134, "eval_steps_per_second": 2.143, "step": 11100 }, { "epoch": 0.7157965787749048, "grad_norm": 0.35962772369384766, "learning_rate": 5.686828716789147e-05, "loss": 0.8631, "step": 11400 }, { "epoch": 0.7157965787749048, "eval_loss": 0.8730462789535522, "eval_runtime": 299.7837, "eval_samples_per_second": 17.176, "eval_steps_per_second": 2.148, "step": 11400 }, { "epoch": 0.7346333308479286, "grad_norm": 0.33538639545440674, "learning_rate": 5.3099679668362547e-05, "loss": 0.879, "step": 11700 }, { "epoch": 0.7346333308479286, "eval_loss": 0.8714411854743958, "eval_runtime": 299.9502, "eval_samples_per_second": 17.166, "eval_steps_per_second": 2.147, "step": 11700 }, { "epoch": 0.7534700829209523, "grad_norm": 0.3434339165687561, "learning_rate": 4.933107216883362e-05, "loss": 0.8616, "step": 12000 }, { "epoch": 0.7534700829209523, "eval_loss": 0.8703322410583496, "eval_runtime": 300.0139, "eval_samples_per_second": 17.163, "eval_steps_per_second": 2.147, "step": 12000 }, { "epoch": 0.7723068349939761, "grad_norm": 0.34114760160446167, "learning_rate": 4.55624646693047e-05, "loss": 0.8708, "step": 12300 }, { "epoch": 0.7723068349939761, "eval_loss": 0.8692737817764282, "eval_runtime": 299.9274, "eval_samples_per_second": 17.167, "eval_steps_per_second": 2.147, "step": 12300 }, { "epoch": 0.7911435870669999, "grad_norm": 0.40352341532707214, "learning_rate": 4.18064191947742e-05, "loss": 0.8724, "step": 12600 }, { "epoch": 0.7911435870669999, "eval_loss": 0.8681650161743164, "eval_runtime": 300.4231, "eval_samples_per_second": 17.139, "eval_steps_per_second": 2.144, "step": 12600 }, { "epoch": 0.8099803391400238, "grad_norm": 0.36962220072746277, "learning_rate": 3.8037811695245274e-05, "loss": 0.8672, "step": 12900 }, { "epoch": 0.8099803391400238, "eval_loss": 0.8672531247138977, "eval_runtime": 299.8101, "eval_samples_per_second": 17.174, "eval_steps_per_second": 2.148, "step": 12900 }, { "epoch": 0.8288170912130476, "grad_norm": 0.4042891561985016, "learning_rate": 3.426920419571635e-05, "loss": 0.8643, "step": 13200 }, { "epoch": 0.8288170912130476, "eval_loss": 0.8664665818214417, "eval_runtime": 299.8585, "eval_samples_per_second": 17.171, "eval_steps_per_second": 2.148, "step": 13200 }, { "epoch": 0.8476538432860714, "grad_norm": 0.2992730140686035, "learning_rate": 3.0500596696187428e-05, "loss": 0.8632, "step": 13500 }, { "epoch": 0.8476538432860714, "eval_loss": 0.8653113842010498, "eval_runtime": 299.946, "eval_samples_per_second": 17.166, "eval_steps_per_second": 2.147, "step": 13500 }, { "epoch": 0.8664905953590952, "grad_norm": 0.31725963950157166, "learning_rate": 2.6731989196658503e-05, "loss": 0.8564, "step": 13800 }, { "epoch": 0.8664905953590952, "eval_loss": 0.8644812107086182, "eval_runtime": 299.8536, "eval_samples_per_second": 17.172, "eval_steps_per_second": 2.148, "step": 13800 }, { "epoch": 0.885327347432119, "grad_norm": 0.3101350963115692, "learning_rate": 2.296338169712958e-05, "loss": 0.861, "step": 14100 }, { "epoch": 0.885327347432119, "eval_loss": 0.8637036085128784, "eval_runtime": 300.356, "eval_samples_per_second": 17.143, "eval_steps_per_second": 2.144, "step": 14100 }, { "epoch": 0.9041640995051429, "grad_norm": 0.33058223128318787, "learning_rate": 1.9194774197600654e-05, "loss": 0.8543, "step": 14400 }, { "epoch": 0.9041640995051429, "eval_loss": 0.8630216121673584, "eval_runtime": 299.9507, "eval_samples_per_second": 17.166, "eval_steps_per_second": 2.147, "step": 14400 }, { "epoch": 0.9230008515781666, "grad_norm": 0.35784465074539185, "learning_rate": 1.5438728723070158e-05, "loss": 0.868, "step": 14700 }, { "epoch": 0.9230008515781666, "eval_loss": 0.8623820543289185, "eval_runtime": 299.933, "eval_samples_per_second": 17.167, "eval_steps_per_second": 2.147, "step": 14700 }, { "epoch": 0.9418376036511904, "grad_norm": 0.3938862383365631, "learning_rate": 1.1670121223541235e-05, "loss": 0.8607, "step": 15000 }, { "epoch": 0.9418376036511904, "eval_loss": 0.861863911151886, "eval_runtime": 299.8746, "eval_samples_per_second": 17.171, "eval_steps_per_second": 2.148, "step": 15000 }, { "epoch": 0.9606743557242142, "grad_norm": 0.3867338001728058, "learning_rate": 7.90151372401231e-06, "loss": 0.8491, "step": 15300 }, { "epoch": 0.9606743557242142, "eval_loss": 0.8613755106925964, "eval_runtime": 299.919, "eval_samples_per_second": 17.168, "eval_steps_per_second": 2.147, "step": 15300 }, { "epoch": 0.979511107797238, "grad_norm": 0.3372841477394104, "learning_rate": 4.132906224483387e-06, "loss": 0.8643, "step": 15600 }, { "epoch": 0.979511107797238, "eval_loss": 0.8610811829566956, "eval_runtime": 299.8287, "eval_samples_per_second": 17.173, "eval_steps_per_second": 2.148, "step": 15600 }, { "epoch": 0.9983478598702619, "grad_norm": 0.3431134819984436, "learning_rate": 3.642987249544627e-07, "loss": 0.8633, "step": 15900 }, { "epoch": 0.9983478598702619, "eval_loss": 0.8609164953231812, "eval_runtime": 299.9008, "eval_samples_per_second": 17.169, "eval_steps_per_second": 2.147, "step": 15900 } ], "logging_steps": 300, "max_steps": 15926, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 300, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 4.1684162719241994e+18, "train_batch_size": 2, "trial_name": null, "trial_params": null }