itemClassification_Alpaca_Mistral / trainer_state.json
amiguel's picture
Upload 8 files
47c184e verified
raw
history blame contribute delete
No virus
22.5 kB
{
"best_metric": 0.26116234064102173,
"best_model_checkpoint": "outputs/checkpoint-108",
"epoch": 5.0,
"eval_steps": 6,
"global_step": 110,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.045454545454545456,
"grad_norm": 12.375,
"learning_rate": 4e-05,
"loss": 8.7425,
"step": 1
},
{
"epoch": 0.09090909090909091,
"grad_norm": 17.125,
"learning_rate": 8e-05,
"loss": 8.6536,
"step": 2
},
{
"epoch": 0.13636363636363635,
"grad_norm": 12.0625,
"learning_rate": 0.00012,
"loss": 8.602,
"step": 3
},
{
"epoch": 0.18181818181818182,
"grad_norm": 8.8125,
"learning_rate": 0.00016,
"loss": 8.3064,
"step": 4
},
{
"epoch": 0.22727272727272727,
"grad_norm": 10.125,
"learning_rate": 0.0002,
"loss": 7.7739,
"step": 5
},
{
"epoch": 0.2727272727272727,
"grad_norm": 7.25,
"learning_rate": 0.00024,
"loss": 7.1622,
"step": 6
},
{
"epoch": 0.2727272727272727,
"eval_loss": 6.408970832824707,
"eval_runtime": 1.4799,
"eval_samples_per_second": 95.274,
"eval_steps_per_second": 12.163,
"step": 6
},
{
"epoch": 0.3181818181818182,
"grad_norm": 6.78125,
"learning_rate": 0.00028000000000000003,
"loss": 6.4402,
"step": 7
},
{
"epoch": 0.36363636363636365,
"grad_norm": 10.1875,
"learning_rate": 0.00032,
"loss": 5.616,
"step": 8
},
{
"epoch": 0.4090909090909091,
"grad_norm": 4.4375,
"learning_rate": 0.00035999999999999997,
"loss": 4.8702,
"step": 9
},
{
"epoch": 0.45454545454545453,
"grad_norm": 3.765625,
"learning_rate": 0.0004,
"loss": 4.2606,
"step": 10
},
{
"epoch": 0.5,
"grad_norm": 4.6875,
"learning_rate": 0.00044,
"loss": 3.7622,
"step": 11
},
{
"epoch": 0.5454545454545454,
"grad_norm": 5.3125,
"learning_rate": 0.00048,
"loss": 3.5397,
"step": 12
},
{
"epoch": 0.5454545454545454,
"eval_loss": 2.9970808029174805,
"eval_runtime": 1.4818,
"eval_samples_per_second": 95.155,
"eval_steps_per_second": 12.147,
"step": 12
},
{
"epoch": 0.5909090909090909,
"grad_norm": 3.390625,
"learning_rate": 0.0005200000000000001,
"loss": 2.9788,
"step": 13
},
{
"epoch": 0.6363636363636364,
"grad_norm": 3.1875,
"learning_rate": 0.0005600000000000001,
"loss": 2.5801,
"step": 14
},
{
"epoch": 0.6818181818181818,
"grad_norm": 2.578125,
"learning_rate": 0.0006,
"loss": 2.3461,
"step": 15
},
{
"epoch": 0.7272727272727273,
"grad_norm": 1.6171875,
"learning_rate": 0.00064,
"loss": 2.0174,
"step": 16
},
{
"epoch": 0.7727272727272727,
"grad_norm": 1.9609375,
"learning_rate": 0.00068,
"loss": 1.889,
"step": 17
},
{
"epoch": 0.8181818181818182,
"grad_norm": 1.5625,
"learning_rate": 0.0007199999999999999,
"loss": 1.7954,
"step": 18
},
{
"epoch": 0.8181818181818182,
"eval_loss": 1.589725136756897,
"eval_runtime": 1.4857,
"eval_samples_per_second": 94.907,
"eval_steps_per_second": 12.116,
"step": 18
},
{
"epoch": 0.8636363636363636,
"grad_norm": 2.140625,
"learning_rate": 0.00076,
"loss": 1.6871,
"step": 19
},
{
"epoch": 0.9090909090909091,
"grad_norm": 2.28125,
"learning_rate": 0.0008,
"loss": 1.5787,
"step": 20
},
{
"epoch": 0.9545454545454546,
"grad_norm": 1.703125,
"learning_rate": 0.00084,
"loss": 1.4167,
"step": 21
},
{
"epoch": 1.0,
"grad_norm": 1.984375,
"learning_rate": 0.00088,
"loss": 1.3882,
"step": 22
},
{
"epoch": 1.0454545454545454,
"grad_norm": 1.3671875,
"learning_rate": 0.00092,
"loss": 1.2626,
"step": 23
},
{
"epoch": 1.0909090909090908,
"grad_norm": 1.140625,
"learning_rate": 0.00096,
"loss": 1.1743,
"step": 24
},
{
"epoch": 1.0909090909090908,
"eval_loss": 1.088915467262268,
"eval_runtime": 1.4822,
"eval_samples_per_second": 95.126,
"eval_steps_per_second": 12.144,
"step": 24
},
{
"epoch": 1.1363636363636362,
"grad_norm": 1.2734375,
"learning_rate": 0.001,
"loss": 1.1278,
"step": 25
},
{
"epoch": 1.1818181818181819,
"grad_norm": 1.3515625,
"learning_rate": 0.0009996585300715115,
"loss": 1.1023,
"step": 26
},
{
"epoch": 1.2272727272727273,
"grad_norm": 1.3359375,
"learning_rate": 0.0009986345866928941,
"loss": 1.0403,
"step": 27
},
{
"epoch": 1.2727272727272727,
"grad_norm": 1.40625,
"learning_rate": 0.000996929568447637,
"loss": 1.0496,
"step": 28
},
{
"epoch": 1.3181818181818181,
"grad_norm": 1.1796875,
"learning_rate": 0.000994545804185573,
"loss": 0.8593,
"step": 29
},
{
"epoch": 1.3636363636363638,
"grad_norm": 0.94140625,
"learning_rate": 0.000991486549841951,
"loss": 0.9413,
"step": 30
},
{
"epoch": 1.3636363636363638,
"eval_loss": 0.7773878574371338,
"eval_runtime": 1.491,
"eval_samples_per_second": 94.566,
"eval_steps_per_second": 12.072,
"step": 30
},
{
"epoch": 1.4090909090909092,
"grad_norm": 1.15625,
"learning_rate": 0.0009877559839902184,
"loss": 0.7758,
"step": 31
},
{
"epoch": 1.4545454545454546,
"grad_norm": 0.93359375,
"learning_rate": 0.0009833592021345938,
"loss": 0.8344,
"step": 32
},
{
"epoch": 1.5,
"grad_norm": 0.99609375,
"learning_rate": 0.0009783022097502204,
"loss": 0.6183,
"step": 33
},
{
"epoch": 1.5454545454545454,
"grad_norm": 0.8046875,
"learning_rate": 0.0009725919140804099,
"loss": 0.7497,
"step": 34
},
{
"epoch": 1.5909090909090908,
"grad_norm": 0.9140625,
"learning_rate": 0.0009662361147021779,
"loss": 0.7042,
"step": 35
},
{
"epoch": 1.6363636363636362,
"grad_norm": 0.95703125,
"learning_rate": 0.0009592434928729616,
"loss": 0.7236,
"step": 36
},
{
"epoch": 1.6363636363636362,
"eval_loss": 0.6186583042144775,
"eval_runtime": 1.4853,
"eval_samples_per_second": 94.931,
"eval_steps_per_second": 12.119,
"step": 36
},
{
"epoch": 1.6818181818181817,
"grad_norm": 0.84375,
"learning_rate": 0.0009516235996730644,
"loss": 0.6119,
"step": 37
},
{
"epoch": 1.7272727272727273,
"grad_norm": 0.87109375,
"learning_rate": 0.0009433868429600309,
"loss": 0.606,
"step": 38
},
{
"epoch": 1.7727272727272727,
"grad_norm": 17.625,
"learning_rate": 0.0009345444731527642,
"loss": 0.6787,
"step": 39
},
{
"epoch": 1.8181818181818183,
"grad_norm": 0.859375,
"learning_rate": 0.0009251085678648072,
"loss": 0.6607,
"step": 40
},
{
"epoch": 1.8636363636363638,
"grad_norm": 0.84765625,
"learning_rate": 0.0009150920154077753,
"loss": 0.6514,
"step": 41
},
{
"epoch": 1.9090909090909092,
"grad_norm": 0.64453125,
"learning_rate": 0.0009045084971874737,
"loss": 0.6084,
"step": 42
},
{
"epoch": 1.9090909090909092,
"eval_loss": 0.555232048034668,
"eval_runtime": 1.4852,
"eval_samples_per_second": 94.934,
"eval_steps_per_second": 12.119,
"step": 42
},
{
"epoch": 1.9545454545454546,
"grad_norm": 1.078125,
"learning_rate": 0.0008933724690167416,
"loss": 0.5991,
"step": 43
},
{
"epoch": 2.0,
"grad_norm": 0.83984375,
"learning_rate": 0.0008816991413705516,
"loss": 0.6085,
"step": 44
},
{
"epoch": 2.0454545454545454,
"grad_norm": 0.92578125,
"learning_rate": 0.0008695044586103295,
"loss": 0.4946,
"step": 45
},
{
"epoch": 2.090909090909091,
"grad_norm": 1.0703125,
"learning_rate": 0.0008568050772058762,
"loss": 0.4987,
"step": 46
},
{
"epoch": 2.1363636363636362,
"grad_norm": 0.6953125,
"learning_rate": 0.0008436183429846313,
"loss": 0.4656,
"step": 47
},
{
"epoch": 2.1818181818181817,
"grad_norm": 0.98046875,
"learning_rate": 0.0008299622674393614,
"loss": 0.5454,
"step": 48
},
{
"epoch": 2.1818181818181817,
"eval_loss": 0.4745166301727295,
"eval_runtime": 1.4854,
"eval_samples_per_second": 94.921,
"eval_steps_per_second": 12.118,
"step": 48
},
{
"epoch": 2.227272727272727,
"grad_norm": 0.71484375,
"learning_rate": 0.0008158555031266255,
"loss": 0.4058,
"step": 49
},
{
"epoch": 2.2727272727272725,
"grad_norm": 0.84375,
"learning_rate": 0.0008013173181896282,
"loss": 0.5267,
"step": 50
},
{
"epoch": 2.3181818181818183,
"grad_norm": 0.87109375,
"learning_rate": 0.0007863675700402526,
"loss": 0.524,
"step": 51
},
{
"epoch": 2.3636363636363638,
"grad_norm": 0.73046875,
"learning_rate": 0.0007710266782362247,
"loss": 0.5331,
"step": 52
},
{
"epoch": 2.409090909090909,
"grad_norm": 0.79296875,
"learning_rate": 0.0007553155965904535,
"loss": 0.4235,
"step": 53
},
{
"epoch": 2.4545454545454546,
"grad_norm": 0.828125,
"learning_rate": 0.0007392557845506433,
"loss": 0.5147,
"step": 54
},
{
"epoch": 2.4545454545454546,
"eval_loss": 0.437049001455307,
"eval_runtime": 1.4804,
"eval_samples_per_second": 95.243,
"eval_steps_per_second": 12.159,
"step": 54
},
{
"epoch": 2.5,
"grad_norm": 0.640625,
"learning_rate": 0.0007228691778882692,
"loss": 0.4376,
"step": 55
},
{
"epoch": 2.5454545454545454,
"grad_norm": 0.71875,
"learning_rate": 0.0007061781587369518,
"loss": 0.4396,
"step": 56
},
{
"epoch": 2.590909090909091,
"grad_norm": 0.81640625,
"learning_rate": 0.0006892055250211552,
"loss": 0.4257,
"step": 57
},
{
"epoch": 2.6363636363636362,
"grad_norm": 0.66015625,
"learning_rate": 0.0006719744593169641,
"loss": 0.4447,
"step": 58
},
{
"epoch": 2.6818181818181817,
"grad_norm": 0.75,
"learning_rate": 0.0006545084971874737,
"loss": 0.4591,
"step": 59
},
{
"epoch": 2.7272727272727275,
"grad_norm": 0.7890625,
"learning_rate": 0.0006368314950360416,
"loss": 0.4645,
"step": 60
},
{
"epoch": 2.7272727272727275,
"eval_loss": 0.3943060338497162,
"eval_runtime": 1.4805,
"eval_samples_per_second": 95.235,
"eval_steps_per_second": 12.158,
"step": 60
},
{
"epoch": 2.7727272727272725,
"grad_norm": 0.64453125,
"learning_rate": 0.0006189675975213093,
"loss": 0.4733,
"step": 61
},
{
"epoch": 2.8181818181818183,
"grad_norm": 0.64453125,
"learning_rate": 0.0006009412045785051,
"loss": 0.4227,
"step": 62
},
{
"epoch": 2.8636363636363638,
"grad_norm": 0.7265625,
"learning_rate": 0.000582776938092065,
"loss": 0.485,
"step": 63
},
{
"epoch": 2.909090909090909,
"grad_norm": 0.58984375,
"learning_rate": 0.0005644996082651017,
"loss": 0.4154,
"step": 64
},
{
"epoch": 2.9545454545454546,
"grad_norm": 0.625,
"learning_rate": 0.000546134179731651,
"loss": 0.4602,
"step": 65
},
{
"epoch": 3.0,
"grad_norm": 0.6640625,
"learning_rate": 0.000527705737457985,
"loss": 0.4371,
"step": 66
},
{
"epoch": 3.0,
"eval_loss": 0.35816648602485657,
"eval_runtime": 1.4795,
"eval_samples_per_second": 95.3,
"eval_steps_per_second": 12.166,
"step": 66
},
{
"epoch": 3.0454545454545454,
"grad_norm": 0.59375,
"learning_rate": 0.000509239452479565,
"loss": 0.3674,
"step": 67
},
{
"epoch": 3.090909090909091,
"grad_norm": 0.578125,
"learning_rate": 0.0004907605475204352,
"loss": 0.3405,
"step": 68
},
{
"epoch": 3.1363636363636362,
"grad_norm": 0.4921875,
"learning_rate": 0.00047229426254201504,
"loss": 0.3669,
"step": 69
},
{
"epoch": 3.1818181818181817,
"grad_norm": 0.61328125,
"learning_rate": 0.00045386582026834903,
"loss": 0.3333,
"step": 70
},
{
"epoch": 3.227272727272727,
"grad_norm": 0.5234375,
"learning_rate": 0.0004355003917348985,
"loss": 0.3032,
"step": 71
},
{
"epoch": 3.2727272727272725,
"grad_norm": 0.66015625,
"learning_rate": 0.000417223061907935,
"loss": 0.3557,
"step": 72
},
{
"epoch": 3.2727272727272725,
"eval_loss": 0.3237670361995697,
"eval_runtime": 1.4942,
"eval_samples_per_second": 94.367,
"eval_steps_per_second": 12.047,
"step": 72
},
{
"epoch": 3.3181818181818183,
"grad_norm": 0.59765625,
"learning_rate": 0.000399058795421495,
"loss": 0.3774,
"step": 73
},
{
"epoch": 3.3636363636363638,
"grad_norm": 0.52734375,
"learning_rate": 0.00038103240247869074,
"loss": 0.3433,
"step": 74
},
{
"epoch": 3.409090909090909,
"grad_norm": 0.609375,
"learning_rate": 0.0003631685049639586,
"loss": 0.3872,
"step": 75
},
{
"epoch": 3.4545454545454546,
"grad_norm": 0.59375,
"learning_rate": 0.00034549150281252633,
"loss": 0.3675,
"step": 76
},
{
"epoch": 3.5,
"grad_norm": 0.55859375,
"learning_rate": 0.0003280255406830359,
"loss": 0.3581,
"step": 77
},
{
"epoch": 3.5454545454545454,
"grad_norm": 0.48828125,
"learning_rate": 0.00031079447497884486,
"loss": 0.3062,
"step": 78
},
{
"epoch": 3.5454545454545454,
"eval_loss": 0.3086094558238983,
"eval_runtime": 1.4971,
"eval_samples_per_second": 94.182,
"eval_steps_per_second": 12.023,
"step": 78
},
{
"epoch": 3.590909090909091,
"grad_norm": 0.546875,
"learning_rate": 0.00029382184126304836,
"loss": 0.3324,
"step": 79
},
{
"epoch": 3.6363636363636362,
"grad_norm": 0.55078125,
"learning_rate": 0.0002771308221117309,
"loss": 0.338,
"step": 80
},
{
"epoch": 3.6818181818181817,
"grad_norm": 0.51171875,
"learning_rate": 0.0002607442154493568,
"loss": 0.3319,
"step": 81
},
{
"epoch": 3.7272727272727275,
"grad_norm": 0.63671875,
"learning_rate": 0.0002446844034095466,
"loss": 0.3577,
"step": 82
},
{
"epoch": 3.7727272727272725,
"grad_norm": 0.55078125,
"learning_rate": 0.00022897332176377528,
"loss": 0.3463,
"step": 83
},
{
"epoch": 3.8181818181818183,
"grad_norm": 0.52734375,
"learning_rate": 0.00021363242995974742,
"loss": 0.3065,
"step": 84
},
{
"epoch": 3.8181818181818183,
"eval_loss": 0.2896404266357422,
"eval_runtime": 1.4869,
"eval_samples_per_second": 94.829,
"eval_steps_per_second": 12.106,
"step": 84
},
{
"epoch": 3.8636363636363638,
"grad_norm": 0.58203125,
"learning_rate": 0.00019868268181037185,
"loss": 0.339,
"step": 85
},
{
"epoch": 3.909090909090909,
"grad_norm": 0.5,
"learning_rate": 0.00018414449687337466,
"loss": 0.3104,
"step": 86
},
{
"epoch": 3.9545454545454546,
"grad_norm": 0.52734375,
"learning_rate": 0.0001700377325606388,
"loss": 0.3248,
"step": 87
},
{
"epoch": 4.0,
"grad_norm": 0.57421875,
"learning_rate": 0.00015638165701536866,
"loss": 0.3155,
"step": 88
},
{
"epoch": 4.045454545454546,
"grad_norm": 0.455078125,
"learning_rate": 0.00014319492279412388,
"loss": 0.2769,
"step": 89
},
{
"epoch": 4.090909090909091,
"grad_norm": 0.48828125,
"learning_rate": 0.0001304955413896705,
"loss": 0.2873,
"step": 90
},
{
"epoch": 4.090909090909091,
"eval_loss": 0.274143785238266,
"eval_runtime": 1.5006,
"eval_samples_per_second": 93.962,
"eval_steps_per_second": 11.995,
"step": 90
},
{
"epoch": 4.136363636363637,
"grad_norm": 0.478515625,
"learning_rate": 0.00011830085862944851,
"loss": 0.2952,
"step": 91
},
{
"epoch": 4.181818181818182,
"grad_norm": 0.46484375,
"learning_rate": 0.00010662753098325839,
"loss": 0.2559,
"step": 92
},
{
"epoch": 4.2272727272727275,
"grad_norm": 0.451171875,
"learning_rate": 9.549150281252633e-05,
"loss": 0.2737,
"step": 93
},
{
"epoch": 4.2727272727272725,
"grad_norm": 0.5234375,
"learning_rate": 8.490798459222476e-05,
"loss": 0.2822,
"step": 94
},
{
"epoch": 4.318181818181818,
"grad_norm": 0.55078125,
"learning_rate": 7.489143213519301e-05,
"loss": 0.3014,
"step": 95
},
{
"epoch": 4.363636363636363,
"grad_norm": 0.51171875,
"learning_rate": 6.545552684723583e-05,
"loss": 0.2827,
"step": 96
},
{
"epoch": 4.363636363636363,
"eval_loss": 0.26471802592277527,
"eval_runtime": 1.4885,
"eval_samples_per_second": 94.724,
"eval_steps_per_second": 12.092,
"step": 96
},
{
"epoch": 4.409090909090909,
"grad_norm": 0.46875,
"learning_rate": 5.6613157039969057e-05,
"loss": 0.2638,
"step": 97
},
{
"epoch": 4.454545454545454,
"grad_norm": 0.451171875,
"learning_rate": 4.8376400326935575e-05,
"loss": 0.2592,
"step": 98
},
{
"epoch": 4.5,
"grad_norm": 0.50390625,
"learning_rate": 4.075650712703849e-05,
"loss": 0.298,
"step": 99
},
{
"epoch": 4.545454545454545,
"grad_norm": 0.50390625,
"learning_rate": 3.376388529782215e-05,
"loss": 0.2632,
"step": 100
},
{
"epoch": 4.590909090909091,
"grad_norm": 0.453125,
"learning_rate": 2.7408085919590266e-05,
"loss": 0.2404,
"step": 101
},
{
"epoch": 4.636363636363637,
"grad_norm": 0.4765625,
"learning_rate": 2.1697790249779635e-05,
"loss": 0.265,
"step": 102
},
{
"epoch": 4.636363636363637,
"eval_loss": 0.26171576976776123,
"eval_runtime": 1.4803,
"eval_samples_per_second": 95.248,
"eval_steps_per_second": 12.159,
"step": 102
},
{
"epoch": 4.681818181818182,
"grad_norm": 0.53125,
"learning_rate": 1.6640797865406288e-05,
"loss": 0.3012,
"step": 103
},
{
"epoch": 4.7272727272727275,
"grad_norm": 0.51171875,
"learning_rate": 1.22440160097817e-05,
"loss": 0.3019,
"step": 104
},
{
"epoch": 4.7727272727272725,
"grad_norm": 0.47265625,
"learning_rate": 8.513450158049108e-06,
"loss": 0.2667,
"step": 105
},
{
"epoch": 4.818181818181818,
"grad_norm": 0.5078125,
"learning_rate": 5.454195814427021e-06,
"loss": 0.2781,
"step": 106
},
{
"epoch": 4.863636363636363,
"grad_norm": 0.5234375,
"learning_rate": 3.0704315523631954e-06,
"loss": 0.281,
"step": 107
},
{
"epoch": 4.909090909090909,
"grad_norm": 0.52734375,
"learning_rate": 1.3654133071059894e-06,
"loss": 0.2935,
"step": 108
},
{
"epoch": 4.909090909090909,
"eval_loss": 0.26116234064102173,
"eval_runtime": 1.4806,
"eval_samples_per_second": 95.232,
"eval_steps_per_second": 12.157,
"step": 108
},
{
"epoch": 4.954545454545455,
"grad_norm": 0.5390625,
"learning_rate": 3.4146992848854695e-07,
"loss": 0.2806,
"step": 109
},
{
"epoch": 5.0,
"grad_norm": 0.5703125,
"learning_rate": 0.0,
"loss": 0.2668,
"step": 110
},
{
"epoch": 5.0,
"step": 110,
"total_flos": 5704372783549440.0,
"train_loss": 1.2665127342397517,
"train_runtime": 277.3258,
"train_samples_per_second": 25.349,
"train_steps_per_second": 0.397
}
],
"logging_steps": 1,
"max_steps": 110,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 6,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 5,
"early_stopping_threshold": 0.0
},
"attributes": {
"early_stopping_patience_counter": 0
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 5704372783549440.0,
"train_batch_size": 64,
"trial_name": null,
"trial_params": null
}