{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 1000, "global_step": 2184, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.045787545787545784, "grad_norm": 8.169079780578613, "learning_rate": 1.9896720173943657e-05, "loss": 0.8583, "step": 100 }, { "epoch": 0.09157509157509157, "grad_norm": 22.747098922729492, "learning_rate": 1.9589014040268678e-05, "loss": 0.759, "step": 200 }, { "epoch": 0.13736263736263737, "grad_norm": 3.93502140045166, "learning_rate": 1.908323756616754e-05, "loss": 0.7104, "step": 300 }, { "epoch": 0.18315018315018314, "grad_norm": 10.182877540588379, "learning_rate": 1.838983805289396e-05, "loss": 0.7041, "step": 400 }, { "epoch": 0.22893772893772893, "grad_norm": 2.300295352935791, "learning_rate": 1.7523138336671628e-05, "loss": 0.7295, "step": 500 }, { "epoch": 0.27472527472527475, "grad_norm": 1.7253031730651855, "learning_rate": 1.6501040936687444e-05, "loss": 0.7295, "step": 600 }, { "epoch": 0.32051282051282054, "grad_norm": 1.2007420063018799, "learning_rate": 1.5344658261278013e-05, "loss": 0.6888, "step": 700 }, { "epoch": 0.3663003663003663, "grad_norm": 2.432363986968994, "learning_rate": 1.4077876510757502e-05, "loss": 0.6686, "step": 800 }, { "epoch": 0.41208791208791207, "grad_norm": 1.7616468667984009, "learning_rate": 1.2726862284894939e-05, "loss": 0.779, "step": 900 }, { "epoch": 0.45787545787545786, "grad_norm": 4.531213760375977, "learning_rate": 1.1319522086539666e-05, "loss": 0.711, "step": 1000 }, { "epoch": 0.45787545787545786, "eval_accuracy": 0.6213872832369942, "eval_loss": 0.7638348937034607, "eval_runtime": 12.3349, "eval_samples_per_second": 28.051, "eval_steps_per_second": 9.404, "step": 1000 }, { "epoch": 0.5036630036630036, "grad_norm": 4.445523738861084, "learning_rate": 9.884925885869326e-06, "loss": 0.6899, "step": 1100 }, { "epoch": 0.5494505494505495, "grad_norm": 5.363688945770264, "learning_rate": 8.452706652097187e-06, "loss": 0.6949, "step": 1200 }, { "epoch": 0.5952380952380952, "grad_norm": 11.651291847229004, "learning_rate": 7.052448255890958e-06, "loss": 0.6604, "step": 1300 }, { "epoch": 0.6410256410256411, "grad_norm": 10.71049690246582, "learning_rate": 5.713074385969457e-06, "loss": 0.667, "step": 1400 }, { "epoch": 0.6868131868131868, "grad_norm": 5.3541436195373535, "learning_rate": 4.462251102394669e-06, "loss": 0.6748, "step": 1500 }, { "epoch": 0.7326007326007326, "grad_norm": 7.145966529846191, "learning_rate": 3.325815367397557e-06, "loss": 0.665, "step": 1600 }, { "epoch": 0.7783882783882784, "grad_norm": 6.2031168937683105, "learning_rate": 2.327241357985063e-06, "loss": 0.735, "step": 1700 }, { "epoch": 0.8241758241758241, "grad_norm": 3.511324882507324, "learning_rate": 1.4871555841564889e-06, "loss": 0.6776, "step": 1800 }, { "epoch": 0.86996336996337, "grad_norm": 5.494836330413818, "learning_rate": 8.229108284305176e-07, "loss": 0.6689, "step": 1900 }, { "epoch": 0.9157509157509157, "grad_norm": 5.310685157775879, "learning_rate": 3.482277073731988e-07, "loss": 0.6407, "step": 2000 }, { "epoch": 0.9157509157509157, "eval_accuracy": 0.6329479768786127, "eval_loss": 0.6654821634292603, "eval_runtime": 12.2552, "eval_samples_per_second": 28.233, "eval_steps_per_second": 9.465, "step": 2000 }, { "epoch": 0.9615384615384616, "grad_norm": 11.22006893157959, "learning_rate": 7.291125901946027e-08, "loss": 0.6801, "step": 2100 }, { "epoch": 1.0, "step": 2184, "total_flos": 0.0, "train_loss": 0.7032692458603408, "train_runtime": 2584.7937, "train_samples_per_second": 2.534, "train_steps_per_second": 0.845 } ], "logging_steps": 100, "max_steps": 2184, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }