{ "best_metric": 0.09645664691925049, "best_model_checkpoint": "vit-base-patch16-224-in21k-finetuned_v2024-7-24-frost/checkpoint-500", "epoch": 30.0, "eval_steps": 100, "global_step": 1920, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.15625, "grad_norm": 0.3123115003108978, "learning_rate": 1.0416666666666668e-05, "loss": 0.0743, "step": 10 }, { "epoch": 0.3125, "grad_norm": 0.10650705546140671, "learning_rate": 2.0833333333333336e-05, "loss": 0.0699, "step": 20 }, { "epoch": 0.46875, "grad_norm": 0.40636828541755676, "learning_rate": 3.125e-05, "loss": 0.0732, "step": 30 }, { "epoch": 0.625, "grad_norm": 0.42912840843200684, "learning_rate": 4.166666666666667e-05, "loss": 0.075, "step": 40 }, { "epoch": 0.78125, "grad_norm": 0.3166373372077942, "learning_rate": 5.208333333333334e-05, "loss": 0.0695, "step": 50 }, { "epoch": 0.9375, "grad_norm": 0.8551476001739502, "learning_rate": 6.25e-05, "loss": 0.0883, "step": 60 }, { "epoch": 1.09375, "grad_norm": 0.18066875636577606, "learning_rate": 7.291666666666667e-05, "loss": 0.0699, "step": 70 }, { "epoch": 1.25, "grad_norm": 0.28325945138931274, "learning_rate": 8.333333333333334e-05, "loss": 0.0627, "step": 80 }, { "epoch": 1.40625, "grad_norm": 0.3701513409614563, "learning_rate": 9.375e-05, "loss": 0.0866, "step": 90 }, { "epoch": 1.5625, "grad_norm": 0.35587912797927856, "learning_rate": 0.00010416666666666667, "loss": 0.0728, "step": 100 }, { "epoch": 1.5625, "eval_accuracy": 0.984070796460177, "eval_f1": 0.9606986899563319, "eval_loss": 0.06593623757362366, "eval_precision": 0.9691629955947136, "eval_recall": 0.9523809523809523, "eval_runtime": 0.9039, "eval_samples_per_second": 125.009, "eval_steps_per_second": 16.594, "step": 100 }, { "epoch": 1.71875, "grad_norm": 0.23759329319000244, "learning_rate": 0.00011458333333333333, "loss": 0.0653, "step": 110 }, { "epoch": 1.875, "grad_norm": 0.6092272996902466, "learning_rate": 0.000125, "loss": 0.1015, "step": 120 }, { "epoch": 2.03125, "grad_norm": 0.15887708961963654, "learning_rate": 0.0001354166666666667, "loss": 0.09, "step": 130 }, { "epoch": 2.1875, "grad_norm": 0.3399417996406555, "learning_rate": 0.00014583333333333335, "loss": 0.0847, "step": 140 }, { "epoch": 2.34375, "grad_norm": 0.2599344253540039, "learning_rate": 0.00015625, "loss": 0.0722, "step": 150 }, { "epoch": 2.5, "grad_norm": 0.20714014768600464, "learning_rate": 0.0001666666666666667, "loss": 0.0915, "step": 160 }, { "epoch": 2.65625, "grad_norm": 0.7900287508964539, "learning_rate": 0.00017708333333333335, "loss": 0.1008, "step": 170 }, { "epoch": 2.8125, "grad_norm": 0.23315797746181488, "learning_rate": 0.0001875, "loss": 0.1142, "step": 180 }, { "epoch": 2.96875, "grad_norm": 1.258319616317749, "learning_rate": 0.0001979166666666667, "loss": 0.1027, "step": 190 }, { "epoch": 3.125, "grad_norm": 1.0168662071228027, "learning_rate": 0.0001990740740740741, "loss": 0.0871, "step": 200 }, { "epoch": 3.125, "eval_accuracy": 0.9566371681415929, "eval_f1": 0.8941684665226782, "eval_loss": 0.12436065077781677, "eval_precision": 0.8922413793103449, "eval_recall": 0.8961038961038961, "eval_runtime": 0.8601, "eval_samples_per_second": 131.383, "eval_steps_per_second": 17.44, "step": 200 }, { "epoch": 3.28125, "grad_norm": 0.38566353917121887, "learning_rate": 0.0001979166666666667, "loss": 0.1166, "step": 210 }, { "epoch": 3.4375, "grad_norm": 0.4687894284725189, "learning_rate": 0.00019675925925925926, "loss": 0.108, "step": 220 }, { "epoch": 3.59375, "grad_norm": 0.5190223455429077, "learning_rate": 0.00019560185185185186, "loss": 0.0901, "step": 230 }, { "epoch": 3.75, "grad_norm": 0.5094243288040161, "learning_rate": 0.00019444444444444446, "loss": 0.1144, "step": 240 }, { "epoch": 3.90625, "grad_norm": 0.5921277403831482, "learning_rate": 0.00019328703703703706, "loss": 0.1196, "step": 250 }, { "epoch": 4.0625, "grad_norm": 0.23840609192848206, "learning_rate": 0.00019212962962962963, "loss": 0.1122, "step": 260 }, { "epoch": 4.21875, "grad_norm": 0.9276812672615051, "learning_rate": 0.00019097222222222223, "loss": 0.1147, "step": 270 }, { "epoch": 4.375, "grad_norm": 0.7325614094734192, "learning_rate": 0.00018981481481481483, "loss": 0.1075, "step": 280 }, { "epoch": 4.53125, "grad_norm": 0.5574468374252319, "learning_rate": 0.00018865740740740743, "loss": 0.0958, "step": 290 }, { "epoch": 4.6875, "grad_norm": 0.3893429934978485, "learning_rate": 0.0001875, "loss": 0.0999, "step": 300 }, { "epoch": 4.6875, "eval_accuracy": 0.963716814159292, "eval_f1": 0.9125799573560768, "eval_loss": 0.10427873581647873, "eval_precision": 0.8991596638655462, "eval_recall": 0.9264069264069265, "eval_runtime": 0.8493, "eval_samples_per_second": 133.049, "eval_steps_per_second": 17.661, "step": 300 }, { "epoch": 4.84375, "grad_norm": 0.49028488993644714, "learning_rate": 0.0001863425925925926, "loss": 0.1087, "step": 310 }, { "epoch": 5.0, "grad_norm": 0.6510241627693176, "learning_rate": 0.0001851851851851852, "loss": 0.0949, "step": 320 }, { "epoch": 5.15625, "grad_norm": 0.3244408667087555, "learning_rate": 0.00018402777777777778, "loss": 0.0957, "step": 330 }, { "epoch": 5.3125, "grad_norm": 0.32894158363342285, "learning_rate": 0.00018287037037037038, "loss": 0.0761, "step": 340 }, { "epoch": 5.46875, "grad_norm": 0.4168912470340729, "learning_rate": 0.00018171296296296297, "loss": 0.1014, "step": 350 }, { "epoch": 5.625, "grad_norm": 0.30746978521347046, "learning_rate": 0.00018055555555555557, "loss": 0.0789, "step": 360 }, { "epoch": 5.78125, "grad_norm": 0.3337535262107849, "learning_rate": 0.00017939814814814815, "loss": 0.0891, "step": 370 }, { "epoch": 5.9375, "grad_norm": 0.2659320533275604, "learning_rate": 0.00017824074074074075, "loss": 0.0798, "step": 380 }, { "epoch": 6.09375, "grad_norm": 0.28791913390159607, "learning_rate": 0.00017708333333333335, "loss": 0.0961, "step": 390 }, { "epoch": 6.25, "grad_norm": 0.41803187131881714, "learning_rate": 0.00017592592592592595, "loss": 0.0743, "step": 400 }, { "epoch": 6.25, "eval_accuracy": 0.9610619469026549, "eval_f1": 0.9043478260869565, "eval_loss": 0.10431604832410812, "eval_precision": 0.9082969432314411, "eval_recall": 0.9004329004329005, "eval_runtime": 1.3126, "eval_samples_per_second": 86.086, "eval_steps_per_second": 11.427, "step": 400 }, { "epoch": 6.40625, "grad_norm": 0.398034930229187, "learning_rate": 0.00017476851851851852, "loss": 0.0798, "step": 410 }, { "epoch": 6.5625, "grad_norm": 0.533364474773407, "learning_rate": 0.00017361111111111112, "loss": 0.0808, "step": 420 }, { "epoch": 6.71875, "grad_norm": 0.6189862489700317, "learning_rate": 0.00017245370370370372, "loss": 0.091, "step": 430 }, { "epoch": 6.875, "grad_norm": 0.31593209505081177, "learning_rate": 0.00017129629629629632, "loss": 0.0729, "step": 440 }, { "epoch": 7.03125, "grad_norm": 0.3167741000652313, "learning_rate": 0.0001701388888888889, "loss": 0.0796, "step": 450 }, { "epoch": 7.1875, "grad_norm": 0.30901169776916504, "learning_rate": 0.0001689814814814815, "loss": 0.0867, "step": 460 }, { "epoch": 7.34375, "grad_norm": 0.13378705084323883, "learning_rate": 0.0001678240740740741, "loss": 0.0701, "step": 470 }, { "epoch": 7.5, "grad_norm": 0.15507709980010986, "learning_rate": 0.0001666666666666667, "loss": 0.0789, "step": 480 }, { "epoch": 7.65625, "grad_norm": 0.21113860607147217, "learning_rate": 0.00016550925925925926, "loss": 0.0647, "step": 490 }, { "epoch": 7.8125, "grad_norm": 0.15848499536514282, "learning_rate": 0.00016435185185185186, "loss": 0.0655, "step": 500 }, { "epoch": 7.8125, "eval_accuracy": 0.963716814159292, "eval_f1": 0.9118279569892475, "eval_loss": 0.09645664691925049, "eval_precision": 0.905982905982906, "eval_recall": 0.9177489177489178, "eval_runtime": 2.2304, "eval_samples_per_second": 50.664, "eval_steps_per_second": 6.725, "step": 500 }, { "epoch": 7.96875, "grad_norm": 0.19086486101150513, "learning_rate": 0.00016319444444444446, "loss": 0.0502, "step": 510 }, { "epoch": 8.125, "grad_norm": 0.4851354956626892, "learning_rate": 0.00016203703703703706, "loss": 0.0646, "step": 520 }, { "epoch": 8.28125, "grad_norm": 0.43803560733795166, "learning_rate": 0.00016087962962962963, "loss": 0.0668, "step": 530 }, { "epoch": 8.4375, "grad_norm": 0.26552197337150574, "learning_rate": 0.00015972222222222223, "loss": 0.0549, "step": 540 }, { "epoch": 8.59375, "grad_norm": 0.18909405171871185, "learning_rate": 0.00015856481481481483, "loss": 0.07, "step": 550 }, { "epoch": 8.75, "grad_norm": 0.2485276311635971, "learning_rate": 0.00015740740740740743, "loss": 0.0525, "step": 560 }, { "epoch": 8.90625, "grad_norm": 0.4424391984939575, "learning_rate": 0.00015625, "loss": 0.0775, "step": 570 }, { "epoch": 9.0625, "grad_norm": 0.2584344446659088, "learning_rate": 0.0001550925925925926, "loss": 0.064, "step": 580 }, { "epoch": 9.21875, "grad_norm": 0.6115286946296692, "learning_rate": 0.0001539351851851852, "loss": 0.0599, "step": 590 }, { "epoch": 9.375, "grad_norm": 0.5155323147773743, "learning_rate": 0.00015277777777777777, "loss": 0.0559, "step": 600 }, { "epoch": 9.375, "eval_accuracy": 0.9619469026548673, "eval_f1": 0.9087048832271762, "eval_loss": 0.10384609550237656, "eval_precision": 0.8916666666666667, "eval_recall": 0.9264069264069265, "eval_runtime": 2.2545, "eval_samples_per_second": 50.123, "eval_steps_per_second": 6.653, "step": 600 }, { "epoch": 9.53125, "grad_norm": 0.40844494104385376, "learning_rate": 0.00015162037037037037, "loss": 0.072, "step": 610 }, { "epoch": 9.6875, "grad_norm": 0.2091340720653534, "learning_rate": 0.00015046296296296297, "loss": 0.0513, "step": 620 }, { "epoch": 9.84375, "grad_norm": 0.22117160260677338, "learning_rate": 0.00014930555555555557, "loss": 0.0623, "step": 630 }, { "epoch": 10.0, "grad_norm": 0.31236401200294495, "learning_rate": 0.00014814814814814815, "loss": 0.0534, "step": 640 }, { "epoch": 10.15625, "grad_norm": 0.31281912326812744, "learning_rate": 0.00014699074074074075, "loss": 0.0644, "step": 650 }, { "epoch": 10.3125, "grad_norm": 0.5201927423477173, "learning_rate": 0.00014583333333333335, "loss": 0.057, "step": 660 }, { "epoch": 10.46875, "grad_norm": 0.2596763074398041, "learning_rate": 0.00014467592592592594, "loss": 0.0542, "step": 670 }, { "epoch": 10.625, "grad_norm": 0.3063810467720032, "learning_rate": 0.00014351851851851852, "loss": 0.0389, "step": 680 }, { "epoch": 10.78125, "grad_norm": 0.48713332414627075, "learning_rate": 0.00014236111111111112, "loss": 0.0742, "step": 690 }, { "epoch": 10.9375, "grad_norm": 0.21316884458065033, "learning_rate": 0.00014120370370370372, "loss": 0.0517, "step": 700 }, { "epoch": 10.9375, "eval_accuracy": 0.9584070796460177, "eval_f1": 0.8997867803837952, "eval_loss": 0.09719711542129517, "eval_precision": 0.8865546218487395, "eval_recall": 0.9134199134199135, "eval_runtime": 0.8764, "eval_samples_per_second": 128.937, "eval_steps_per_second": 17.115, "step": 700 }, { "epoch": 11.09375, "grad_norm": 0.9216361045837402, "learning_rate": 0.00014004629629629632, "loss": 0.0623, "step": 710 }, { "epoch": 11.25, "grad_norm": 0.31130528450012207, "learning_rate": 0.0001388888888888889, "loss": 0.0641, "step": 720 }, { "epoch": 11.40625, "grad_norm": 1.03948974609375, "learning_rate": 0.0001377314814814815, "loss": 0.0594, "step": 730 }, { "epoch": 11.5625, "grad_norm": 0.12757237255573273, "learning_rate": 0.0001365740740740741, "loss": 0.0572, "step": 740 }, { "epoch": 11.71875, "grad_norm": 0.25488346815109253, "learning_rate": 0.0001354166666666667, "loss": 0.0533, "step": 750 }, { "epoch": 11.875, "grad_norm": 0.2517576813697815, "learning_rate": 0.00013425925925925926, "loss": 0.0557, "step": 760 }, { "epoch": 12.03125, "grad_norm": 0.14332328736782074, "learning_rate": 0.00013310185185185186, "loss": 0.0433, "step": 770 }, { "epoch": 12.1875, "grad_norm": 0.7062014937400818, "learning_rate": 0.00013194444444444446, "loss": 0.0569, "step": 780 }, { "epoch": 12.34375, "grad_norm": 0.727057158946991, "learning_rate": 0.00013078703703703706, "loss": 0.0443, "step": 790 }, { "epoch": 12.5, "grad_norm": 0.17331984639167786, "learning_rate": 0.00012962962962962963, "loss": 0.0407, "step": 800 }, { "epoch": 12.5, "eval_accuracy": 0.963716814159292, "eval_f1": 0.9110629067245118, "eval_loss": 0.11198227852582932, "eval_precision": 0.9130434782608695, "eval_recall": 0.9090909090909091, "eval_runtime": 0.9019, "eval_samples_per_second": 125.294, "eval_steps_per_second": 16.632, "step": 800 }, { "epoch": 12.65625, "grad_norm": 0.104576975107193, "learning_rate": 0.00012847222222222223, "loss": 0.0465, "step": 810 }, { "epoch": 12.8125, "grad_norm": 0.6552168726921082, "learning_rate": 0.00012731481481481483, "loss": 0.0536, "step": 820 }, { "epoch": 12.96875, "grad_norm": 0.39452189207077026, "learning_rate": 0.00012615740740740743, "loss": 0.0514, "step": 830 }, { "epoch": 13.125, "grad_norm": 0.16756129264831543, "learning_rate": 0.000125, "loss": 0.0417, "step": 840 }, { "epoch": 13.28125, "grad_norm": 0.13866697251796722, "learning_rate": 0.00012384259259259258, "loss": 0.0419, "step": 850 }, { "epoch": 13.4375, "grad_norm": 0.9053749442100525, "learning_rate": 0.0001226851851851852, "loss": 0.0548, "step": 860 }, { "epoch": 13.59375, "grad_norm": 0.43149927258491516, "learning_rate": 0.00012152777777777777, "loss": 0.0503, "step": 870 }, { "epoch": 13.75, "grad_norm": 0.49532395601272583, "learning_rate": 0.00012037037037037037, "loss": 0.0476, "step": 880 }, { "epoch": 13.90625, "grad_norm": 0.12025842815637589, "learning_rate": 0.00011921296296296296, "loss": 0.049, "step": 890 }, { "epoch": 14.0625, "grad_norm": 0.8570975065231323, "learning_rate": 0.00011805555555555556, "loss": 0.0513, "step": 900 }, { "epoch": 14.0625, "eval_accuracy": 0.9557522123893806, "eval_f1": 0.8893805309734513, "eval_loss": 0.1092919260263443, "eval_precision": 0.9095022624434389, "eval_recall": 0.8701298701298701, "eval_runtime": 1.2237, "eval_samples_per_second": 92.344, "eval_steps_per_second": 12.258, "step": 900 }, { "epoch": 14.21875, "grad_norm": 0.4120664596557617, "learning_rate": 0.00011689814814814815, "loss": 0.0552, "step": 910 }, { "epoch": 14.375, "grad_norm": 0.24265483021736145, "learning_rate": 0.00011574074074074075, "loss": 0.0434, "step": 920 }, { "epoch": 14.53125, "grad_norm": 0.23618777096271515, "learning_rate": 0.00011458333333333333, "loss": 0.0482, "step": 930 }, { "epoch": 14.6875, "grad_norm": 0.1366555392742157, "learning_rate": 0.00011342592592592593, "loss": 0.045, "step": 940 }, { "epoch": 14.84375, "grad_norm": 0.1841152310371399, "learning_rate": 0.00011226851851851852, "loss": 0.0539, "step": 950 }, { "epoch": 15.0, "grad_norm": 0.6849538087844849, "learning_rate": 0.00011111111111111112, "loss": 0.0363, "step": 960 }, { "epoch": 15.15625, "grad_norm": 0.5442699790000916, "learning_rate": 0.0001099537037037037, "loss": 0.0372, "step": 970 }, { "epoch": 15.3125, "grad_norm": 0.3825988173484802, "learning_rate": 0.0001087962962962963, "loss": 0.0405, "step": 980 }, { "epoch": 15.46875, "grad_norm": 0.0459093414247036, "learning_rate": 0.00010763888888888889, "loss": 0.0386, "step": 990 }, { "epoch": 15.625, "grad_norm": 0.2602522373199463, "learning_rate": 0.00010648148148148149, "loss": 0.0378, "step": 1000 }, { "epoch": 15.625, "eval_accuracy": 0.9548672566371681, "eval_f1": 0.8888888888888888, "eval_loss": 0.11969945579767227, "eval_precision": 0.8947368421052632, "eval_recall": 0.8831168831168831, "eval_runtime": 0.8768, "eval_samples_per_second": 128.871, "eval_steps_per_second": 17.107, "step": 1000 }, { "epoch": 15.78125, "grad_norm": 0.07926033437252045, "learning_rate": 0.00010532407407407407, "loss": 0.0419, "step": 1010 }, { "epoch": 15.9375, "grad_norm": 0.2084084302186966, "learning_rate": 0.00010416666666666667, "loss": 0.0336, "step": 1020 }, { "epoch": 16.09375, "grad_norm": 0.11587415635585785, "learning_rate": 0.00010300925925925926, "loss": 0.0293, "step": 1030 }, { "epoch": 16.25, "grad_norm": 0.4128260314464569, "learning_rate": 0.00010185185185185186, "loss": 0.0346, "step": 1040 }, { "epoch": 16.40625, "grad_norm": 0.2051563411951065, "learning_rate": 0.00010069444444444445, "loss": 0.0404, "step": 1050 }, { "epoch": 16.5625, "grad_norm": 1.0257600545883179, "learning_rate": 9.953703703703704e-05, "loss": 0.0521, "step": 1060 }, { "epoch": 16.71875, "grad_norm": 0.13610199093818665, "learning_rate": 9.837962962962963e-05, "loss": 0.0513, "step": 1070 }, { "epoch": 16.875, "grad_norm": 0.5424107909202576, "learning_rate": 9.722222222222223e-05, "loss": 0.0662, "step": 1080 }, { "epoch": 17.03125, "grad_norm": 0.1417212188243866, "learning_rate": 9.606481481481482e-05, "loss": 0.0364, "step": 1090 }, { "epoch": 17.1875, "grad_norm": 0.15864621102809906, "learning_rate": 9.490740740740742e-05, "loss": 0.0487, "step": 1100 }, { "epoch": 17.1875, "eval_accuracy": 0.9646017699115044, "eval_f1": 0.9137931034482759, "eval_loss": 0.09552007168531418, "eval_precision": 0.9098712446351931, "eval_recall": 0.9177489177489178, "eval_runtime": 0.8603, "eval_samples_per_second": 131.352, "eval_steps_per_second": 17.436, "step": 1100 }, { "epoch": 17.34375, "grad_norm": 0.5382766127586365, "learning_rate": 9.375e-05, "loss": 0.0315, "step": 1110 }, { "epoch": 17.5, "grad_norm": 0.16078180074691772, "learning_rate": 9.25925925925926e-05, "loss": 0.0245, "step": 1120 }, { "epoch": 17.65625, "grad_norm": 0.05187100172042847, "learning_rate": 9.143518518518519e-05, "loss": 0.0342, "step": 1130 }, { "epoch": 17.8125, "grad_norm": 0.4016551077365875, "learning_rate": 9.027777777777779e-05, "loss": 0.0285, "step": 1140 }, { "epoch": 17.96875, "grad_norm": 0.22832362353801727, "learning_rate": 8.912037037037037e-05, "loss": 0.0379, "step": 1150 }, { "epoch": 18.125, "grad_norm": 1.3720444440841675, "learning_rate": 8.796296296296297e-05, "loss": 0.0369, "step": 1160 }, { "epoch": 18.28125, "grad_norm": 0.23146755993366241, "learning_rate": 8.680555555555556e-05, "loss": 0.0343, "step": 1170 }, { "epoch": 18.4375, "grad_norm": 0.2672041356563568, "learning_rate": 8.564814814814816e-05, "loss": 0.0353, "step": 1180 }, { "epoch": 18.59375, "grad_norm": 0.17212288081645966, "learning_rate": 8.449074074074074e-05, "loss": 0.0315, "step": 1190 }, { "epoch": 18.75, "grad_norm": 0.12989170849323273, "learning_rate": 8.333333333333334e-05, "loss": 0.0272, "step": 1200 }, { "epoch": 18.75, "eval_accuracy": 0.9566371681415929, "eval_f1": 0.8927789934354485, "eval_loss": 0.10875095427036285, "eval_precision": 0.9026548672566371, "eval_recall": 0.8831168831168831, "eval_runtime": 1.2152, "eval_samples_per_second": 92.985, "eval_steps_per_second": 12.343, "step": 1200 }, { "epoch": 18.90625, "grad_norm": 0.15251386165618896, "learning_rate": 8.217592592592593e-05, "loss": 0.0353, "step": 1210 }, { "epoch": 19.0625, "grad_norm": 0.1500956416130066, "learning_rate": 8.101851851851853e-05, "loss": 0.0258, "step": 1220 }, { "epoch": 19.21875, "grad_norm": 0.16236737370491028, "learning_rate": 7.986111111111112e-05, "loss": 0.0318, "step": 1230 }, { "epoch": 19.375, "grad_norm": 0.5188699960708618, "learning_rate": 7.870370370370372e-05, "loss": 0.0388, "step": 1240 }, { "epoch": 19.53125, "grad_norm": 0.14171747863292694, "learning_rate": 7.75462962962963e-05, "loss": 0.0371, "step": 1250 }, { "epoch": 19.6875, "grad_norm": 0.355496883392334, "learning_rate": 7.638888888888889e-05, "loss": 0.0278, "step": 1260 }, { "epoch": 19.84375, "grad_norm": 0.30447283387184143, "learning_rate": 7.523148148148149e-05, "loss": 0.0307, "step": 1270 }, { "epoch": 20.0, "grad_norm": 1.8779629468917847, "learning_rate": 7.407407407407407e-05, "loss": 0.0378, "step": 1280 }, { "epoch": 20.15625, "grad_norm": 0.24115116894245148, "learning_rate": 7.291666666666667e-05, "loss": 0.0234, "step": 1290 }, { "epoch": 20.3125, "grad_norm": 0.07465353608131409, "learning_rate": 7.175925925925926e-05, "loss": 0.0241, "step": 1300 }, { "epoch": 20.3125, "eval_accuracy": 0.963716814159292, "eval_f1": 0.9114470842332613, "eval_loss": 0.09792255610227585, "eval_precision": 0.9094827586206896, "eval_recall": 0.9134199134199135, "eval_runtime": 0.8811, "eval_samples_per_second": 128.253, "eval_steps_per_second": 17.025, "step": 1300 }, { "epoch": 20.46875, "grad_norm": 0.44642359018325806, "learning_rate": 7.060185185185186e-05, "loss": 0.0271, "step": 1310 }, { "epoch": 20.625, "grad_norm": 0.16677480936050415, "learning_rate": 6.944444444444444e-05, "loss": 0.0199, "step": 1320 }, { "epoch": 20.78125, "grad_norm": 0.05306961014866829, "learning_rate": 6.828703703703704e-05, "loss": 0.028, "step": 1330 }, { "epoch": 20.9375, "grad_norm": 0.7962948679924011, "learning_rate": 6.712962962962963e-05, "loss": 0.0238, "step": 1340 }, { "epoch": 21.09375, "grad_norm": 0.19253899157047272, "learning_rate": 6.597222222222223e-05, "loss": 0.0367, "step": 1350 }, { "epoch": 21.25, "grad_norm": 0.22666649520397186, "learning_rate": 6.481481481481482e-05, "loss": 0.021, "step": 1360 }, { "epoch": 21.40625, "grad_norm": 0.09341959655284882, "learning_rate": 6.365740740740742e-05, "loss": 0.0217, "step": 1370 }, { "epoch": 21.5625, "grad_norm": 0.40562504529953003, "learning_rate": 6.25e-05, "loss": 0.0268, "step": 1380 }, { "epoch": 21.71875, "grad_norm": 0.20743058621883392, "learning_rate": 6.13425925925926e-05, "loss": 0.0394, "step": 1390 }, { "epoch": 21.875, "grad_norm": 0.16062897443771362, "learning_rate": 6.018518518518519e-05, "loss": 0.0311, "step": 1400 }, { "epoch": 21.875, "eval_accuracy": 0.9654867256637168, "eval_f1": 0.9157667386609072, "eval_loss": 0.11342811584472656, "eval_precision": 0.9137931034482759, "eval_recall": 0.9177489177489178, "eval_runtime": 0.8884, "eval_samples_per_second": 127.2, "eval_steps_per_second": 16.885, "step": 1400 }, { "epoch": 22.03125, "grad_norm": 0.08394443988800049, "learning_rate": 5.902777777777778e-05, "loss": 0.0312, "step": 1410 }, { "epoch": 22.1875, "grad_norm": 0.6736553311347961, "learning_rate": 5.787037037037037e-05, "loss": 0.0383, "step": 1420 }, { "epoch": 22.34375, "grad_norm": 0.563914954662323, "learning_rate": 5.6712962962962965e-05, "loss": 0.0287, "step": 1430 }, { "epoch": 22.5, "grad_norm": 0.08304356783628464, "learning_rate": 5.555555555555556e-05, "loss": 0.026, "step": 1440 }, { "epoch": 22.65625, "grad_norm": 0.6314889788627625, "learning_rate": 5.439814814814815e-05, "loss": 0.0337, "step": 1450 }, { "epoch": 22.8125, "grad_norm": 0.1526585817337036, "learning_rate": 5.3240740740740744e-05, "loss": 0.0386, "step": 1460 }, { "epoch": 22.96875, "grad_norm": 0.4352094829082489, "learning_rate": 5.208333333333334e-05, "loss": 0.0225, "step": 1470 }, { "epoch": 23.125, "grad_norm": 0.07802680879831314, "learning_rate": 5.092592592592593e-05, "loss": 0.028, "step": 1480 }, { "epoch": 23.28125, "grad_norm": 0.06631523370742798, "learning_rate": 4.976851851851852e-05, "loss": 0.0216, "step": 1490 }, { "epoch": 23.4375, "grad_norm": 0.4568875730037689, "learning_rate": 4.8611111111111115e-05, "loss": 0.0303, "step": 1500 }, { "epoch": 23.4375, "eval_accuracy": 0.9628318584070796, "eval_f1": 0.9078947368421053, "eval_loss": 0.10922601819038391, "eval_precision": 0.92, "eval_recall": 0.8961038961038961, "eval_runtime": 1.1366, "eval_samples_per_second": 99.417, "eval_steps_per_second": 13.197, "step": 1500 }, { "epoch": 23.59375, "grad_norm": 0.16732257604599, "learning_rate": 4.745370370370371e-05, "loss": 0.0182, "step": 1510 }, { "epoch": 23.75, "grad_norm": 0.8489612340927124, "learning_rate": 4.62962962962963e-05, "loss": 0.0419, "step": 1520 }, { "epoch": 23.90625, "grad_norm": 0.23256537318229675, "learning_rate": 4.5138888888888894e-05, "loss": 0.0344, "step": 1530 }, { "epoch": 24.0625, "grad_norm": 0.23274816572666168, "learning_rate": 4.3981481481481486e-05, "loss": 0.0283, "step": 1540 }, { "epoch": 24.21875, "grad_norm": 0.1935439109802246, "learning_rate": 4.282407407407408e-05, "loss": 0.0293, "step": 1550 }, { "epoch": 24.375, "grad_norm": 0.4433891773223877, "learning_rate": 4.166666666666667e-05, "loss": 0.0341, "step": 1560 }, { "epoch": 24.53125, "grad_norm": 0.3329981863498688, "learning_rate": 4.0509259259259265e-05, "loss": 0.022, "step": 1570 }, { "epoch": 24.6875, "grad_norm": 0.774336040019989, "learning_rate": 3.935185185185186e-05, "loss": 0.0304, "step": 1580 }, { "epoch": 24.84375, "grad_norm": 0.45676717162132263, "learning_rate": 3.8194444444444444e-05, "loss": 0.0213, "step": 1590 }, { "epoch": 25.0, "grad_norm": 0.18224991858005524, "learning_rate": 3.7037037037037037e-05, "loss": 0.0225, "step": 1600 }, { "epoch": 25.0, "eval_accuracy": 0.9628318584070796, "eval_f1": 0.908296943231441, "eval_loss": 0.11213955283164978, "eval_precision": 0.9162995594713657, "eval_recall": 0.9004329004329005, "eval_runtime": 1.05, "eval_samples_per_second": 107.622, "eval_steps_per_second": 14.286, "step": 1600 }, { "epoch": 25.15625, "grad_norm": 0.45673811435699463, "learning_rate": 3.587962962962963e-05, "loss": 0.0253, "step": 1610 }, { "epoch": 25.3125, "grad_norm": 0.09005212038755417, "learning_rate": 3.472222222222222e-05, "loss": 0.0127, "step": 1620 }, { "epoch": 25.46875, "grad_norm": 0.20682398974895477, "learning_rate": 3.3564814814814815e-05, "loss": 0.0231, "step": 1630 }, { "epoch": 25.625, "grad_norm": 0.7664525508880615, "learning_rate": 3.240740740740741e-05, "loss": 0.0174, "step": 1640 }, { "epoch": 25.78125, "grad_norm": 0.20978455245494843, "learning_rate": 3.125e-05, "loss": 0.0203, "step": 1650 }, { "epoch": 25.9375, "grad_norm": 0.5540274977684021, "learning_rate": 3.0092592592592593e-05, "loss": 0.0205, "step": 1660 }, { "epoch": 26.09375, "grad_norm": 0.1240416169166565, "learning_rate": 2.8935185185185186e-05, "loss": 0.0168, "step": 1670 }, { "epoch": 26.25, "grad_norm": 0.04385749623179436, "learning_rate": 2.777777777777778e-05, "loss": 0.0162, "step": 1680 }, { "epoch": 26.40625, "grad_norm": 0.1799972951412201, "learning_rate": 2.6620370370370372e-05, "loss": 0.0196, "step": 1690 }, { "epoch": 26.5625, "grad_norm": 0.24593585729599, "learning_rate": 2.5462962962962965e-05, "loss": 0.0292, "step": 1700 }, { "epoch": 26.5625, "eval_accuracy": 0.9619469026548673, "eval_f1": 0.9071274298056156, "eval_loss": 0.11493521183729172, "eval_precision": 0.9051724137931034, "eval_recall": 0.9090909090909091, "eval_runtime": 0.8719, "eval_samples_per_second": 129.607, "eval_steps_per_second": 17.204, "step": 1700 }, { "epoch": 26.71875, "grad_norm": 0.2344673126935959, "learning_rate": 2.4305555555555558e-05, "loss": 0.0177, "step": 1710 }, { "epoch": 26.875, "grad_norm": 0.263621062040329, "learning_rate": 2.314814814814815e-05, "loss": 0.0288, "step": 1720 }, { "epoch": 27.03125, "grad_norm": 0.27248746156692505, "learning_rate": 2.1990740740740743e-05, "loss": 0.0158, "step": 1730 }, { "epoch": 27.1875, "grad_norm": 0.35065901279449463, "learning_rate": 2.0833333333333336e-05, "loss": 0.0198, "step": 1740 }, { "epoch": 27.34375, "grad_norm": 0.23319651186466217, "learning_rate": 1.967592592592593e-05, "loss": 0.0208, "step": 1750 }, { "epoch": 27.5, "grad_norm": 0.15196481347084045, "learning_rate": 1.8518518518518518e-05, "loss": 0.0161, "step": 1760 }, { "epoch": 27.65625, "grad_norm": 0.17273353040218353, "learning_rate": 1.736111111111111e-05, "loss": 0.0213, "step": 1770 }, { "epoch": 27.8125, "grad_norm": 0.31511059403419495, "learning_rate": 1.6203703703703704e-05, "loss": 0.012, "step": 1780 }, { "epoch": 27.96875, "grad_norm": 0.09265203773975372, "learning_rate": 1.5046296296296297e-05, "loss": 0.0218, "step": 1790 }, { "epoch": 28.125, "grad_norm": 0.2638147473335266, "learning_rate": 1.388888888888889e-05, "loss": 0.0261, "step": 1800 }, { "epoch": 28.125, "eval_accuracy": 0.9619469026548673, "eval_f1": 0.9079229122055675, "eval_loss": 0.11067904531955719, "eval_precision": 0.8983050847457628, "eval_recall": 0.9177489177489178, "eval_runtime": 0.9722, "eval_samples_per_second": 116.235, "eval_steps_per_second": 15.429, "step": 1800 }, { "epoch": 28.28125, "grad_norm": 0.2734526991844177, "learning_rate": 1.2731481481481482e-05, "loss": 0.0175, "step": 1810 }, { "epoch": 28.4375, "grad_norm": 0.06026133522391319, "learning_rate": 1.1574074074074075e-05, "loss": 0.0168, "step": 1820 }, { "epoch": 28.59375, "grad_norm": 0.02611556649208069, "learning_rate": 1.0416666666666668e-05, "loss": 0.0177, "step": 1830 }, { "epoch": 28.75, "grad_norm": 0.23434928059577942, "learning_rate": 9.259259259259259e-06, "loss": 0.0252, "step": 1840 }, { "epoch": 28.90625, "grad_norm": 0.07623735815286636, "learning_rate": 8.101851851851852e-06, "loss": 0.0235, "step": 1850 }, { "epoch": 29.0625, "grad_norm": 0.05061192065477371, "learning_rate": 6.944444444444445e-06, "loss": 0.0185, "step": 1860 }, { "epoch": 29.21875, "grad_norm": 0.03355779871344566, "learning_rate": 5.787037037037038e-06, "loss": 0.0157, "step": 1870 }, { "epoch": 29.375, "grad_norm": 0.023396974429488182, "learning_rate": 4.6296296296296296e-06, "loss": 0.016, "step": 1880 }, { "epoch": 29.53125, "grad_norm": 0.31754446029663086, "learning_rate": 3.4722222222222224e-06, "loss": 0.0213, "step": 1890 }, { "epoch": 29.6875, "grad_norm": 0.04646694287657738, "learning_rate": 2.3148148148148148e-06, "loss": 0.0166, "step": 1900 }, { "epoch": 29.6875, "eval_accuracy": 0.9610619469026549, "eval_f1": 0.9051724137931034, "eval_loss": 0.11101004481315613, "eval_precision": 0.9012875536480687, "eval_recall": 0.9090909090909091, "eval_runtime": 0.8549, "eval_samples_per_second": 132.182, "eval_steps_per_second": 17.546, "step": 1900 }, { "epoch": 29.84375, "grad_norm": 0.05433151125907898, "learning_rate": 1.1574074074074074e-06, "loss": 0.0174, "step": 1910 }, { "epoch": 30.0, "grad_norm": 0.3085578382015228, "learning_rate": 0.0, "loss": 0.0208, "step": 1920 }, { "epoch": 30.0, "step": 1920, "total_flos": 2.352825493649326e+18, "train_loss": 0.05003170374160012, "train_runtime": 517.118, "train_samples_per_second": 58.71, "train_steps_per_second": 3.713 } ], "logging_steps": 10, "max_steps": 1920, "num_input_tokens_seen": 0, "num_train_epochs": 30, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.352825493649326e+18, "train_batch_size": 16, "trial_name": null, "trial_params": null }