{ "best_metric": 0.26116234064102173, "best_model_checkpoint": "outputs/checkpoint-108", "epoch": 5.0, "eval_steps": 6, "global_step": 110, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.045454545454545456, "grad_norm": 12.375, "learning_rate": 4e-05, "loss": 8.7425, "step": 1 }, { "epoch": 0.09090909090909091, "grad_norm": 17.125, "learning_rate": 8e-05, "loss": 8.6536, "step": 2 }, { "epoch": 0.13636363636363635, "grad_norm": 12.0625, "learning_rate": 0.00012, "loss": 8.602, "step": 3 }, { "epoch": 0.18181818181818182, "grad_norm": 8.8125, "learning_rate": 0.00016, "loss": 8.3064, "step": 4 }, { "epoch": 0.22727272727272727, "grad_norm": 10.125, "learning_rate": 0.0002, "loss": 7.7739, "step": 5 }, { "epoch": 0.2727272727272727, "grad_norm": 7.25, "learning_rate": 0.00024, "loss": 7.1622, "step": 6 }, { "epoch": 0.2727272727272727, "eval_loss": 6.408970832824707, "eval_runtime": 1.4799, "eval_samples_per_second": 95.274, "eval_steps_per_second": 12.163, "step": 6 }, { "epoch": 0.3181818181818182, "grad_norm": 6.78125, "learning_rate": 0.00028000000000000003, "loss": 6.4402, "step": 7 }, { "epoch": 0.36363636363636365, "grad_norm": 10.1875, "learning_rate": 0.00032, "loss": 5.616, "step": 8 }, { "epoch": 0.4090909090909091, "grad_norm": 4.4375, "learning_rate": 0.00035999999999999997, "loss": 4.8702, "step": 9 }, { "epoch": 0.45454545454545453, "grad_norm": 3.765625, "learning_rate": 0.0004, "loss": 4.2606, "step": 10 }, { "epoch": 0.5, "grad_norm": 4.6875, "learning_rate": 0.00044, "loss": 3.7622, "step": 11 }, { "epoch": 0.5454545454545454, "grad_norm": 5.3125, "learning_rate": 0.00048, "loss": 3.5397, "step": 12 }, { "epoch": 0.5454545454545454, "eval_loss": 2.9970808029174805, "eval_runtime": 1.4818, "eval_samples_per_second": 95.155, "eval_steps_per_second": 12.147, "step": 12 }, { "epoch": 0.5909090909090909, "grad_norm": 3.390625, "learning_rate": 0.0005200000000000001, "loss": 2.9788, "step": 13 }, { "epoch": 0.6363636363636364, "grad_norm": 3.1875, "learning_rate": 0.0005600000000000001, "loss": 2.5801, "step": 14 }, { "epoch": 0.6818181818181818, "grad_norm": 2.578125, "learning_rate": 0.0006, "loss": 2.3461, "step": 15 }, { "epoch": 0.7272727272727273, "grad_norm": 1.6171875, "learning_rate": 0.00064, "loss": 2.0174, "step": 16 }, { "epoch": 0.7727272727272727, "grad_norm": 1.9609375, "learning_rate": 0.00068, "loss": 1.889, "step": 17 }, { "epoch": 0.8181818181818182, "grad_norm": 1.5625, "learning_rate": 0.0007199999999999999, "loss": 1.7954, "step": 18 }, { "epoch": 0.8181818181818182, "eval_loss": 1.589725136756897, "eval_runtime": 1.4857, "eval_samples_per_second": 94.907, "eval_steps_per_second": 12.116, "step": 18 }, { "epoch": 0.8636363636363636, "grad_norm": 2.140625, "learning_rate": 0.00076, "loss": 1.6871, "step": 19 }, { "epoch": 0.9090909090909091, "grad_norm": 2.28125, "learning_rate": 0.0008, "loss": 1.5787, "step": 20 }, { "epoch": 0.9545454545454546, "grad_norm": 1.703125, "learning_rate": 0.00084, "loss": 1.4167, "step": 21 }, { "epoch": 1.0, "grad_norm": 1.984375, "learning_rate": 0.00088, "loss": 1.3882, "step": 22 }, { "epoch": 1.0454545454545454, "grad_norm": 1.3671875, "learning_rate": 0.00092, "loss": 1.2626, "step": 23 }, { "epoch": 1.0909090909090908, "grad_norm": 1.140625, "learning_rate": 0.00096, "loss": 1.1743, "step": 24 }, { "epoch": 1.0909090909090908, "eval_loss": 1.088915467262268, "eval_runtime": 1.4822, "eval_samples_per_second": 95.126, "eval_steps_per_second": 
12.144, "step": 24 }, { "epoch": 1.1363636363636362, "grad_norm": 1.2734375, "learning_rate": 0.001, "loss": 1.1278, "step": 25 }, { "epoch": 1.1818181818181819, "grad_norm": 1.3515625, "learning_rate": 0.0009996585300715115, "loss": 1.1023, "step": 26 }, { "epoch": 1.2272727272727273, "grad_norm": 1.3359375, "learning_rate": 0.0009986345866928941, "loss": 1.0403, "step": 27 }, { "epoch": 1.2727272727272727, "grad_norm": 1.40625, "learning_rate": 0.000996929568447637, "loss": 1.0496, "step": 28 }, { "epoch": 1.3181818181818181, "grad_norm": 1.1796875, "learning_rate": 0.000994545804185573, "loss": 0.8593, "step": 29 }, { "epoch": 1.3636363636363638, "grad_norm": 0.94140625, "learning_rate": 0.000991486549841951, "loss": 0.9413, "step": 30 }, { "epoch": 1.3636363636363638, "eval_loss": 0.7773878574371338, "eval_runtime": 1.491, "eval_samples_per_second": 94.566, "eval_steps_per_second": 12.072, "step": 30 }, { "epoch": 1.4090909090909092, "grad_norm": 1.15625, "learning_rate": 0.0009877559839902184, "loss": 0.7758, "step": 31 }, { "epoch": 1.4545454545454546, "grad_norm": 0.93359375, "learning_rate": 0.0009833592021345938, "loss": 0.8344, "step": 32 }, { "epoch": 1.5, "grad_norm": 0.99609375, "learning_rate": 0.0009783022097502204, "loss": 0.6183, "step": 33 }, { "epoch": 1.5454545454545454, "grad_norm": 0.8046875, "learning_rate": 0.0009725919140804099, "loss": 0.7497, "step": 34 }, { "epoch": 1.5909090909090908, "grad_norm": 0.9140625, "learning_rate": 0.0009662361147021779, "loss": 0.7042, "step": 35 }, { "epoch": 1.6363636363636362, "grad_norm": 0.95703125, "learning_rate": 0.0009592434928729616, "loss": 0.7236, "step": 36 }, { "epoch": 1.6363636363636362, "eval_loss": 0.6186583042144775, "eval_runtime": 1.4853, "eval_samples_per_second": 94.931, "eval_steps_per_second": 12.119, "step": 36 }, { "epoch": 1.6818181818181817, "grad_norm": 0.84375, "learning_rate": 0.0009516235996730644, "loss": 0.6119, "step": 37 }, { "epoch": 1.7272727272727273, "grad_norm": 0.87109375, "learning_rate": 0.0009433868429600309, "loss": 0.606, "step": 38 }, { "epoch": 1.7727272727272727, "grad_norm": 17.625, "learning_rate": 0.0009345444731527642, "loss": 0.6787, "step": 39 }, { "epoch": 1.8181818181818183, "grad_norm": 0.859375, "learning_rate": 0.0009251085678648072, "loss": 0.6607, "step": 40 }, { "epoch": 1.8636363636363638, "grad_norm": 0.84765625, "learning_rate": 0.0009150920154077753, "loss": 0.6514, "step": 41 }, { "epoch": 1.9090909090909092, "grad_norm": 0.64453125, "learning_rate": 0.0009045084971874737, "loss": 0.6084, "step": 42 }, { "epoch": 1.9090909090909092, "eval_loss": 0.555232048034668, "eval_runtime": 1.4852, "eval_samples_per_second": 94.934, "eval_steps_per_second": 12.119, "step": 42 }, { "epoch": 1.9545454545454546, "grad_norm": 1.078125, "learning_rate": 0.0008933724690167416, "loss": 0.5991, "step": 43 }, { "epoch": 2.0, "grad_norm": 0.83984375, "learning_rate": 0.0008816991413705516, "loss": 0.6085, "step": 44 }, { "epoch": 2.0454545454545454, "grad_norm": 0.92578125, "learning_rate": 0.0008695044586103295, "loss": 0.4946, "step": 45 }, { "epoch": 2.090909090909091, "grad_norm": 1.0703125, "learning_rate": 0.0008568050772058762, "loss": 0.4987, "step": 46 }, { "epoch": 2.1363636363636362, "grad_norm": 0.6953125, "learning_rate": 0.0008436183429846313, "loss": 0.4656, "step": 47 }, { "epoch": 2.1818181818181817, "grad_norm": 0.98046875, "learning_rate": 0.0008299622674393614, "loss": 0.5454, "step": 48 }, { "epoch": 2.1818181818181817, "eval_loss": 0.4745166301727295, 
"eval_runtime": 1.4854, "eval_samples_per_second": 94.921, "eval_steps_per_second": 12.118, "step": 48 }, { "epoch": 2.227272727272727, "grad_norm": 0.71484375, "learning_rate": 0.0008158555031266255, "loss": 0.4058, "step": 49 }, { "epoch": 2.2727272727272725, "grad_norm": 0.84375, "learning_rate": 0.0008013173181896282, "loss": 0.5267, "step": 50 }, { "epoch": 2.3181818181818183, "grad_norm": 0.87109375, "learning_rate": 0.0007863675700402526, "loss": 0.524, "step": 51 }, { "epoch": 2.3636363636363638, "grad_norm": 0.73046875, "learning_rate": 0.0007710266782362247, "loss": 0.5331, "step": 52 }, { "epoch": 2.409090909090909, "grad_norm": 0.79296875, "learning_rate": 0.0007553155965904535, "loss": 0.4235, "step": 53 }, { "epoch": 2.4545454545454546, "grad_norm": 0.828125, "learning_rate": 0.0007392557845506433, "loss": 0.5147, "step": 54 }, { "epoch": 2.4545454545454546, "eval_loss": 0.437049001455307, "eval_runtime": 1.4804, "eval_samples_per_second": 95.243, "eval_steps_per_second": 12.159, "step": 54 }, { "epoch": 2.5, "grad_norm": 0.640625, "learning_rate": 0.0007228691778882692, "loss": 0.4376, "step": 55 }, { "epoch": 2.5454545454545454, "grad_norm": 0.71875, "learning_rate": 0.0007061781587369518, "loss": 0.4396, "step": 56 }, { "epoch": 2.590909090909091, "grad_norm": 0.81640625, "learning_rate": 0.0006892055250211552, "loss": 0.4257, "step": 57 }, { "epoch": 2.6363636363636362, "grad_norm": 0.66015625, "learning_rate": 0.0006719744593169641, "loss": 0.4447, "step": 58 }, { "epoch": 2.6818181818181817, "grad_norm": 0.75, "learning_rate": 0.0006545084971874737, "loss": 0.4591, "step": 59 }, { "epoch": 2.7272727272727275, "grad_norm": 0.7890625, "learning_rate": 0.0006368314950360416, "loss": 0.4645, "step": 60 }, { "epoch": 2.7272727272727275, "eval_loss": 0.3943060338497162, "eval_runtime": 1.4805, "eval_samples_per_second": 95.235, "eval_steps_per_second": 12.158, "step": 60 }, { "epoch": 2.7727272727272725, "grad_norm": 0.64453125, "learning_rate": 0.0006189675975213093, "loss": 0.4733, "step": 61 }, { "epoch": 2.8181818181818183, "grad_norm": 0.64453125, "learning_rate": 0.0006009412045785051, "loss": 0.4227, "step": 62 }, { "epoch": 2.8636363636363638, "grad_norm": 0.7265625, "learning_rate": 0.000582776938092065, "loss": 0.485, "step": 63 }, { "epoch": 2.909090909090909, "grad_norm": 0.58984375, "learning_rate": 0.0005644996082651017, "loss": 0.4154, "step": 64 }, { "epoch": 2.9545454545454546, "grad_norm": 0.625, "learning_rate": 0.000546134179731651, "loss": 0.4602, "step": 65 }, { "epoch": 3.0, "grad_norm": 0.6640625, "learning_rate": 0.000527705737457985, "loss": 0.4371, "step": 66 }, { "epoch": 3.0, "eval_loss": 0.35816648602485657, "eval_runtime": 1.4795, "eval_samples_per_second": 95.3, "eval_steps_per_second": 12.166, "step": 66 }, { "epoch": 3.0454545454545454, "grad_norm": 0.59375, "learning_rate": 0.000509239452479565, "loss": 0.3674, "step": 67 }, { "epoch": 3.090909090909091, "grad_norm": 0.578125, "learning_rate": 0.0004907605475204352, "loss": 0.3405, "step": 68 }, { "epoch": 3.1363636363636362, "grad_norm": 0.4921875, "learning_rate": 0.00047229426254201504, "loss": 0.3669, "step": 69 }, { "epoch": 3.1818181818181817, "grad_norm": 0.61328125, "learning_rate": 0.00045386582026834903, "loss": 0.3333, "step": 70 }, { "epoch": 3.227272727272727, "grad_norm": 0.5234375, "learning_rate": 0.0004355003917348985, "loss": 0.3032, "step": 71 }, { "epoch": 3.2727272727272725, "grad_norm": 0.66015625, "learning_rate": 0.000417223061907935, "loss": 0.3557, "step": 72 }, { 
"epoch": 3.2727272727272725, "eval_loss": 0.3237670361995697, "eval_runtime": 1.4942, "eval_samples_per_second": 94.367, "eval_steps_per_second": 12.047, "step": 72 }, { "epoch": 3.3181818181818183, "grad_norm": 0.59765625, "learning_rate": 0.000399058795421495, "loss": 0.3774, "step": 73 }, { "epoch": 3.3636363636363638, "grad_norm": 0.52734375, "learning_rate": 0.00038103240247869074, "loss": 0.3433, "step": 74 }, { "epoch": 3.409090909090909, "grad_norm": 0.609375, "learning_rate": 0.0003631685049639586, "loss": 0.3872, "step": 75 }, { "epoch": 3.4545454545454546, "grad_norm": 0.59375, "learning_rate": 0.00034549150281252633, "loss": 0.3675, "step": 76 }, { "epoch": 3.5, "grad_norm": 0.55859375, "learning_rate": 0.0003280255406830359, "loss": 0.3581, "step": 77 }, { "epoch": 3.5454545454545454, "grad_norm": 0.48828125, "learning_rate": 0.00031079447497884486, "loss": 0.3062, "step": 78 }, { "epoch": 3.5454545454545454, "eval_loss": 0.3086094558238983, "eval_runtime": 1.4971, "eval_samples_per_second": 94.182, "eval_steps_per_second": 12.023, "step": 78 }, { "epoch": 3.590909090909091, "grad_norm": 0.546875, "learning_rate": 0.00029382184126304836, "loss": 0.3324, "step": 79 }, { "epoch": 3.6363636363636362, "grad_norm": 0.55078125, "learning_rate": 0.0002771308221117309, "loss": 0.338, "step": 80 }, { "epoch": 3.6818181818181817, "grad_norm": 0.51171875, "learning_rate": 0.0002607442154493568, "loss": 0.3319, "step": 81 }, { "epoch": 3.7272727272727275, "grad_norm": 0.63671875, "learning_rate": 0.0002446844034095466, "loss": 0.3577, "step": 82 }, { "epoch": 3.7727272727272725, "grad_norm": 0.55078125, "learning_rate": 0.00022897332176377528, "loss": 0.3463, "step": 83 }, { "epoch": 3.8181818181818183, "grad_norm": 0.52734375, "learning_rate": 0.00021363242995974742, "loss": 0.3065, "step": 84 }, { "epoch": 3.8181818181818183, "eval_loss": 0.2896404266357422, "eval_runtime": 1.4869, "eval_samples_per_second": 94.829, "eval_steps_per_second": 12.106, "step": 84 }, { "epoch": 3.8636363636363638, "grad_norm": 0.58203125, "learning_rate": 0.00019868268181037185, "loss": 0.339, "step": 85 }, { "epoch": 3.909090909090909, "grad_norm": 0.5, "learning_rate": 0.00018414449687337466, "loss": 0.3104, "step": 86 }, { "epoch": 3.9545454545454546, "grad_norm": 0.52734375, "learning_rate": 0.0001700377325606388, "loss": 0.3248, "step": 87 }, { "epoch": 4.0, "grad_norm": 0.57421875, "learning_rate": 0.00015638165701536866, "loss": 0.3155, "step": 88 }, { "epoch": 4.045454545454546, "grad_norm": 0.455078125, "learning_rate": 0.00014319492279412388, "loss": 0.2769, "step": 89 }, { "epoch": 4.090909090909091, "grad_norm": 0.48828125, "learning_rate": 0.0001304955413896705, "loss": 0.2873, "step": 90 }, { "epoch": 4.090909090909091, "eval_loss": 0.274143785238266, "eval_runtime": 1.5006, "eval_samples_per_second": 93.962, "eval_steps_per_second": 11.995, "step": 90 }, { "epoch": 4.136363636363637, "grad_norm": 0.478515625, "learning_rate": 0.00011830085862944851, "loss": 0.2952, "step": 91 }, { "epoch": 4.181818181818182, "grad_norm": 0.46484375, "learning_rate": 0.00010662753098325839, "loss": 0.2559, "step": 92 }, { "epoch": 4.2272727272727275, "grad_norm": 0.451171875, "learning_rate": 9.549150281252633e-05, "loss": 0.2737, "step": 93 }, { "epoch": 4.2727272727272725, "grad_norm": 0.5234375, "learning_rate": 8.490798459222476e-05, "loss": 0.2822, "step": 94 }, { "epoch": 4.318181818181818, "grad_norm": 0.55078125, "learning_rate": 7.489143213519301e-05, "loss": 0.3014, "step": 95 }, { "epoch": 
4.363636363636363, "grad_norm": 0.51171875, "learning_rate": 6.545552684723583e-05, "loss": 0.2827, "step": 96 }, { "epoch": 4.363636363636363, "eval_loss": 0.26471802592277527, "eval_runtime": 1.4885, "eval_samples_per_second": 94.724, "eval_steps_per_second": 12.092, "step": 96 }, { "epoch": 4.409090909090909, "grad_norm": 0.46875, "learning_rate": 5.6613157039969057e-05, "loss": 0.2638, "step": 97 }, { "epoch": 4.454545454545454, "grad_norm": 0.451171875, "learning_rate": 4.8376400326935575e-05, "loss": 0.2592, "step": 98 }, { "epoch": 4.5, "grad_norm": 0.50390625, "learning_rate": 4.075650712703849e-05, "loss": 0.298, "step": 99 }, { "epoch": 4.545454545454545, "grad_norm": 0.50390625, "learning_rate": 3.376388529782215e-05, "loss": 0.2632, "step": 100 }, { "epoch": 4.590909090909091, "grad_norm": 0.453125, "learning_rate": 2.7408085919590266e-05, "loss": 0.2404, "step": 101 }, { "epoch": 4.636363636363637, "grad_norm": 0.4765625, "learning_rate": 2.1697790249779635e-05, "loss": 0.265, "step": 102 }, { "epoch": 4.636363636363637, "eval_loss": 0.26171576976776123, "eval_runtime": 1.4803, "eval_samples_per_second": 95.248, "eval_steps_per_second": 12.159, "step": 102 }, { "epoch": 4.681818181818182, "grad_norm": 0.53125, "learning_rate": 1.6640797865406288e-05, "loss": 0.3012, "step": 103 }, { "epoch": 4.7272727272727275, "grad_norm": 0.51171875, "learning_rate": 1.22440160097817e-05, "loss": 0.3019, "step": 104 }, { "epoch": 4.7727272727272725, "grad_norm": 0.47265625, "learning_rate": 8.513450158049108e-06, "loss": 0.2667, "step": 105 }, { "epoch": 4.818181818181818, "grad_norm": 0.5078125, "learning_rate": 5.454195814427021e-06, "loss": 0.2781, "step": 106 }, { "epoch": 4.863636363636363, "grad_norm": 0.5234375, "learning_rate": 3.0704315523631954e-06, "loss": 0.281, "step": 107 }, { "epoch": 4.909090909090909, "grad_norm": 0.52734375, "learning_rate": 1.3654133071059894e-06, "loss": 0.2935, "step": 108 }, { "epoch": 4.909090909090909, "eval_loss": 0.26116234064102173, "eval_runtime": 1.4806, "eval_samples_per_second": 95.232, "eval_steps_per_second": 12.157, "step": 108 }, { "epoch": 4.954545454545455, "grad_norm": 0.5390625, "learning_rate": 3.4146992848854695e-07, "loss": 0.2806, "step": 109 }, { "epoch": 5.0, "grad_norm": 0.5703125, "learning_rate": 0.0, "loss": 0.2668, "step": 110 }, { "epoch": 5.0, "step": 110, "total_flos": 5704372783549440.0, "train_loss": 1.2665127342397517, "train_runtime": 277.3258, "train_samples_per_second": 25.349, "train_steps_per_second": 0.397 } ], "logging_steps": 1, "max_steps": 110, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 6, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 5, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 5704372783549440.0, "train_batch_size": 64, "trial_name": null, "trial_params": null }
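
For reference, a minimal sketch of how a trainer state file like the one above could be consumed programmatically: it loads the JSON and splits `log_history` into a training-loss series and an eval-loss series, then reports the best checkpoint. The key names (`log_history`, `loss`, `eval_loss`, `step`, `best_metric`, `best_model_checkpoint`) are taken directly from the file itself; the input path is an assumption about where the file is stored.

```python
import json

# Path is an assumption; point this at wherever the trainer_state.json shown above lives.
with open("outputs/checkpoint-108/trainer_state.json") as f:
    state = json.load(f)

# log_history mixes per-step training entries ("loss") and evaluation entries ("eval_loss");
# the final training-summary entry carries "train_loss" instead and is excluded by both filters.
train = [(e["step"], e["loss"]) for e in state["log_history"] if "loss" in e]
evals = [(e["step"], e["eval_loss"]) for e in state["log_history"] if "eval_loss" in e]

print(f"best eval_loss: {state['best_metric']:.4f} at {state['best_model_checkpoint']}")
print(f"final train loss: {train[-1][1]:.4f} (step {train[-1][0]})")
print(f"final eval loss:  {evals[-1][1]:.4f} (step {evals[-1][0]})")
```

The two series could then be passed to any plotting library to visualize the loss curves; only the standard library is used here so the sketch runs without extra dependencies.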