{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 1638, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.030525030525030524, "grad_norm": 62.938530852833836, "learning_rate": 9.977026925653056e-07, "loss": 0.5083, "step": 50 }, { "epoch": 0.06105006105006105, "grad_norm": 213.3598267649049, "learning_rate": 9.908318807470199e-07, "loss": 0.5674, "step": 100 }, { "epoch": 0.09157509157509157, "grad_norm": 154.79969789365293, "learning_rate": 9.794507020134337e-07, "loss": 0.54, "step": 150 }, { "epoch": 0.1221001221001221, "grad_norm": 46.601082604459464, "learning_rate": 9.636637406306276e-07, "loss": 0.626, "step": 200 }, { "epoch": 0.15262515262515264, "grad_norm": 53.962192059634134, "learning_rate": 9.436160666136257e-07, "loss": 0.5212, "step": 250 }, { "epoch": 0.18315018315018314, "grad_norm": 87.58560621983881, "learning_rate": 9.194919026446978e-07, "loss": 0.5516, "step": 300 }, { "epoch": 0.21367521367521367, "grad_norm": 123.80979482151284, "learning_rate": 8.915129312088112e-07, "loss": 0.6218, "step": 350 }, { "epoch": 0.2442002442002442, "grad_norm": 390.7971062188635, "learning_rate": 8.599362575023441e-07, "loss": 0.6515, "step": 400 }, { "epoch": 0.27472527472527475, "grad_norm": 26.658076048563824, "learning_rate": 8.25052046834372e-07, "loss": 0.5756, "step": 450 }, { "epoch": 0.3052503052503053, "grad_norm": 57.30779689782776, "learning_rate": 7.87180858230979e-07, "loss": 0.5465, "step": 500 }, { "epoch": 0.3052503052503053, "eval_accuracy": 0.7755102040816326, "eval_loss": 0.5169934034347534, "eval_runtime": 38.1975, "eval_samples_per_second": 8.98, "eval_steps_per_second": 2.251, "step": 500 }, { "epoch": 0.33577533577533575, "grad_norm": 0.3421131228542828, "learning_rate": 7.466706987447221e-07, "loss": 0.573, "step": 550 }, { "epoch": 0.3663003663003663, "grad_norm": 322.32829569797076, "learning_rate": 7.03893825537875e-07, "loss": 0.5859, "step": 600 }, { "epoch": 0.3968253968253968, "grad_norm": 70.58010836437558, "learning_rate": 6.592433251258422e-07, "loss": 0.6098, "step": 650 }, { "epoch": 0.42735042735042733, "grad_norm": 46.90754337155108, "learning_rate": 6.131295012148612e-07, "loss": 0.5937, "step": 700 }, { "epoch": 0.45787545787545786, "grad_norm": 33.718249054223485, "learning_rate": 5.659761043269833e-07, "loss": 0.6069, "step": 750 }, { "epoch": 0.4884004884004884, "grad_norm": 67.18852674620265, "learning_rate": 5.182164378591751e-07, "loss": 0.5216, "step": 800 }, { "epoch": 0.518925518925519, "grad_norm": 63.20860748452122, "learning_rate": 4.7028937635885676e-07, "loss": 0.5404, "step": 850 }, { "epoch": 0.5494505494505495, "grad_norm": 101.12697436330178, "learning_rate": 4.226353326048593e-07, "loss": 0.6039, "step": 900 }, { "epoch": 0.57997557997558, "grad_norm": 16.22019917375954, "learning_rate": 3.7569221055322e-07, "loss": 0.4532, "step": 950 }, { "epoch": 0.6105006105006106, "grad_norm": 60.84282514547736, "learning_rate": 3.298913813371268e-07, "loss": 0.6788, "step": 1000 }, { "epoch": 0.6105006105006106, "eval_accuracy": 0.8104956268221575, "eval_loss": 0.45700323581695557, "eval_runtime": 39.2826, "eval_samples_per_second": 8.732, "eval_steps_per_second": 2.189, "step": 1000 }, { "epoch": 0.6410256410256411, "grad_norm": 27.664920029800854, "learning_rate": 2.856537192984728e-07, "loss": 0.498, "step": 1050 }, { "epoch": 0.6715506715506715, "grad_norm": 39.65159769933289, "learning_rate": 2.4338573447683793e-07, "loss": 0.4883, "step": 1100 }, { "epoch": 0.702075702075702, "grad_norm": 122.2483315006908, "learning_rate": 2.034758370953431e-07, "loss": 0.4834, "step": 1150 }, { "epoch": 0.7326007326007326, "grad_norm": 77.76945449705711, "learning_rate": 1.6629076836987782e-07, "loss": 0.6567, "step": 1200 }, { "epoch": 0.7631257631257631, "grad_norm": 294.4900200191884, "learning_rate": 1.3217223043981295e-07, "loss": 0.5157, "step": 1250 }, { "epoch": 0.7936507936507936, "grad_norm": 148.80279171380104, "learning_rate": 1.0143374638853891e-07, "loss": 0.5137, "step": 1300 }, { "epoch": 0.8241758241758241, "grad_norm": 196.55030736251504, "learning_rate": 7.435777920782443e-08, "loss": 0.5831, "step": 1350 }, { "epoch": 0.8547008547008547, "grad_norm": 16.26079979378068, "learning_rate": 5.119313618049309e-08, "loss": 0.5032, "step": 1400 }, { "epoch": 0.8852258852258852, "grad_norm": 56.51782762296821, "learning_rate": 3.2152682533139466e-08, "loss": 0.5128, "step": 1450 }, { "epoch": 0.9157509157509157, "grad_norm": 21.457330627788103, "learning_rate": 1.7411385368659936e-08, "loss": 0.5754, "step": 1500 }, { "epoch": 0.9157509157509157, "eval_accuracy": 0.7871720116618076, "eval_loss": 0.48293355107307434, "eval_runtime": 38.5213, "eval_samples_per_second": 8.904, "eval_steps_per_second": 2.233, "step": 1500 }, { "epoch": 0.9462759462759462, "grad_norm": 102.68064209299165, "learning_rate": 7.104705853346271e-09, "loss": 0.5787, "step": 1550 }, { "epoch": 0.9768009768009768, "grad_norm": 13.53505025487251, "learning_rate": 1.3273544331087737e-09, "loss": 0.5539, "step": 1600 }, { "epoch": 1.0, "step": 1638, "total_flos": 0.0, "train_loss": 0.5596484089945699, "train_runtime": 4783.2861, "train_samples_per_second": 1.369, "train_steps_per_second": 0.342 } ], "logging_steps": 50, "max_steps": 1638, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }