{ "best_metric": 0.34852299094200134, "best_model_checkpoint": "./convnext-base-3e-4/checkpoint-10990", "epoch": 10.0, "eval_steps": 500, "global_step": 10990, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.09, "grad_norm": 17.357587814331055, "learning_rate": 0.00029993871755982685, "loss": 2.1854, "step": 100 }, { "epoch": 0.18, "grad_norm": 28.103832244873047, "learning_rate": 0.0002997549203131404, "loss": 1.4323, "step": 200 }, { "epoch": 0.27, "grad_norm": 15.243412017822266, "learning_rate": 0.0002994487584405243, "loss": 1.3262, "step": 300 }, { "epoch": 0.36, "grad_norm": 13.571104049682617, "learning_rate": 0.00029902048210660057, "loss": 1.164, "step": 400 }, { "epoch": 0.45, "grad_norm": 11.755253791809082, "learning_rate": 0.00029847044125561983, "loss": 1.1175, "step": 500 }, { "epoch": 0.55, "grad_norm": 8.938959121704102, "learning_rate": 0.00029779908532552276, "loss": 1.0117, "step": 600 }, { "epoch": 0.64, "grad_norm": 8.597779273986816, "learning_rate": 0.00029700696288070426, "loss": 1.0719, "step": 700 }, { "epoch": 0.73, "grad_norm": 12.803329467773438, "learning_rate": 0.0002960947211637822, "loss": 1.0533, "step": 800 }, { "epoch": 0.82, "grad_norm": 12.126448631286621, "learning_rate": 0.00029506310556673567, "loss": 0.9138, "step": 900 }, { "epoch": 0.91, "grad_norm": 8.313648223876953, "learning_rate": 0.0002939129590218462, "loss": 0.8947, "step": 1000 }, { "epoch": 1.0, "eval_accuracy": 0.7884691848906561, "eval_loss": 0.7363528609275818, "eval_runtime": 103.2916, "eval_samples_per_second": 24.349, "eval_steps_per_second": 1.53, "step": 1099 }, { "epoch": 1.0, "grad_norm": 8.587052345275879, "learning_rate": 0.00029264522131293815, "loss": 0.9902, "step": 1100 }, { "epoch": 1.09, "grad_norm": 8.60452938079834, "learning_rate": 0.00029126092830748215, "loss": 0.8517, "step": 1200 }, { "epoch": 1.18, "grad_norm": 14.598617553710938, "learning_rate": 0.00028976121111018877, "loss": 0.802, "step": 1300 }, { "epoch": 1.27, "grad_norm": 7.155284881591797, "learning_rate": 0.00028814729513878363, "loss": 0.7962, "step": 1400 }, { "epoch": 1.36, "grad_norm": 13.24478816986084, "learning_rate": 0.00028642049912271946, "loss": 0.7782, "step": 1500 }, { "epoch": 1.46, "grad_norm": 10.02603530883789, "learning_rate": 0.0002845822340256436, "loss": 0.7813, "step": 1600 }, { "epoch": 1.55, "grad_norm": 10.052382469177246, "learning_rate": 0.00028263400189250057, "loss": 0.8079, "step": 1700 }, { "epoch": 1.64, "grad_norm": 8.431123733520508, "learning_rate": 0.0002805773946222121, "loss": 0.8041, "step": 1800 }, { "epoch": 1.73, "grad_norm": 7.3062944412231445, "learning_rate": 0.00027841409266693835, "loss": 0.8019, "step": 1900 }, { "epoch": 1.82, "grad_norm": 7.146261215209961, "learning_rate": 0.0002761458636589813, "loss": 0.679, "step": 2000 }, { "epoch": 1.91, "grad_norm": 12.160822868347168, "learning_rate": 0.0002737745609664539, "loss": 0.7643, "step": 2100 }, { "epoch": 2.0, "eval_accuracy": 0.8170974155069582, "eval_loss": 0.628582775592804, "eval_runtime": 103.2407, "eval_samples_per_second": 24.361, "eval_steps_per_second": 1.53, "step": 2198 }, { "epoch": 2.0, "grad_norm": 14.596354484558105, "learning_rate": 0.00027130212217889483, "loss": 0.7511, "step": 2200 }, { "epoch": 2.09, "grad_norm": 11.497203826904297, "learning_rate": 0.000268730567524065, "loss": 0.6527, "step": 2300 }, { "epoch": 2.18, "grad_norm": 6.976894855499268, "learning_rate": 0.00026606199821722166, "loss": 0.6289, "step": 2400 }, { "epoch": 2.27, "grad_norm": 9.718855857849121, "learning_rate": 0.0002632985947442167, "loss": 0.6755, "step": 2500 }, { "epoch": 2.37, "grad_norm": 6.231022357940674, "learning_rate": 0.00026044261507982355, "loss": 0.6377, "step": 2600 }, { "epoch": 2.46, "grad_norm": 15.673833847045898, "learning_rate": 0.0002574963928427478, "loss": 0.626, "step": 2700 }, { "epoch": 2.55, "grad_norm": 5.745795726776123, "learning_rate": 0.00025446233538882923, "loss": 0.6276, "step": 2800 }, { "epoch": 2.64, "grad_norm": 11.555608749389648, "learning_rate": 0.00025134292184399317, "loss": 0.695, "step": 2900 }, { "epoch": 2.73, "grad_norm": 7.4662089347839355, "learning_rate": 0.00024814070107855875, "loss": 0.6095, "step": 3000 }, { "epoch": 2.82, "grad_norm": 7.660247802734375, "learning_rate": 0.00024485828962455907, "loss": 0.631, "step": 3100 }, { "epoch": 2.91, "grad_norm": 9.731380462646484, "learning_rate": 0.00024149836953777485, "loss": 0.6036, "step": 3200 }, { "epoch": 3.0, "eval_accuracy": 0.8481113320079523, "eval_loss": 0.5258452892303467, "eval_runtime": 104.1346, "eval_samples_per_second": 24.151, "eval_steps_per_second": 1.517, "step": 3297 }, { "epoch": 3.0, "grad_norm": 5.469219207763672, "learning_rate": 0.00023806368620622872, "loss": 0.5889, "step": 3300 }, { "epoch": 3.09, "grad_norm": 8.50100040435791, "learning_rate": 0.0002345570461069312, "loss": 0.5034, "step": 3400 }, { "epoch": 3.18, "grad_norm": 12.361306190490723, "learning_rate": 0.00023098131451271015, "loss": 0.5181, "step": 3500 }, { "epoch": 3.28, "grad_norm": 7.584137439727783, "learning_rate": 0.0002273394131509988, "loss": 0.5336, "step": 3600 }, { "epoch": 3.37, "grad_norm": 10.401506423950195, "learning_rate": 0.0002236343178164948, "loss": 0.5216, "step": 3700 }, { "epoch": 3.46, "grad_norm": 5.352778434753418, "learning_rate": 0.00021986905593964046, "loss": 0.4939, "step": 3800 }, { "epoch": 3.55, "grad_norm": 7.9993109703063965, "learning_rate": 0.0002160467041129117, "loss": 0.521, "step": 3900 }, { "epoch": 3.64, "grad_norm": 9.176218032836914, "learning_rate": 0.00021217038557693726, "loss": 0.5288, "step": 4000 }, { "epoch": 3.73, "grad_norm": 9.322657585144043, "learning_rate": 0.0002082432676685007, "loss": 0.5168, "step": 4100 }, { "epoch": 3.82, "grad_norm": 5.861387252807617, "learning_rate": 0.00020426855923251228, "loss": 0.5081, "step": 4200 }, { "epoch": 3.91, "grad_norm": 4.290045261383057, "learning_rate": 0.00020024950800006462, "loss": 0.5012, "step": 4300 }, { "epoch": 4.0, "eval_accuracy": 0.869582504970179, "eval_loss": 0.49109867215156555, "eval_runtime": 102.6443, "eval_samples_per_second": 24.502, "eval_steps_per_second": 1.539, "step": 4396 }, { "epoch": 4.0, "grad_norm": 4.063934803009033, "learning_rate": 0.0001961893979347137, "loss": 0.4754, "step": 4400 }, { "epoch": 4.09, "grad_norm": 5.163869380950928, "learning_rate": 0.00019209154654915522, "loss": 0.4471, "step": 4500 }, { "epoch": 4.19, "grad_norm": 3.733952522277832, "learning_rate": 0.0001879593021944875, "loss": 0.4004, "step": 4600 }, { "epoch": 4.28, "grad_norm": 7.615368366241455, "learning_rate": 0.00018379604132427648, "loss": 0.4076, "step": 4700 }, { "epoch": 4.37, "grad_norm": 9.600367546081543, "learning_rate": 0.0001796051657356582, "loss": 0.4035, "step": 4800 }, { "epoch": 4.46, "grad_norm": 9.41919231414795, "learning_rate": 0.0001753900997897331, "loss": 0.4281, "step": 4900 }, { "epoch": 4.55, "grad_norm": 13.647310256958008, "learning_rate": 0.00017115428761352327, "loss": 0.3674, "step": 5000 }, { "epoch": 4.64, "grad_norm": 2.1058132648468018, "learning_rate": 0.00016690119028577906, "loss": 0.3917, "step": 5100 }, { "epoch": 4.73, "grad_norm": 4.259520053863525, "learning_rate": 0.0001626342830089342, "loss": 0.3899, "step": 5200 }, { "epoch": 4.82, "grad_norm": 4.13034200668335, "learning_rate": 0.0001583570522695211, "loss": 0.4178, "step": 5300 }, { "epoch": 4.91, "grad_norm": 13.085577011108398, "learning_rate": 0.00015407299298936486, "loss": 0.3926, "step": 5400 }, { "epoch": 5.0, "eval_accuracy": 0.8930417495029821, "eval_loss": 0.38039031624794006, "eval_runtime": 103.7463, "eval_samples_per_second": 24.242, "eval_steps_per_second": 1.523, "step": 5495 }, { "epoch": 5.0, "grad_norm": 3.522939443588257, "learning_rate": 0.000149785605669886, "loss": 0.3671, "step": 5500 }, { "epoch": 5.1, "grad_norm": 5.17425537109375, "learning_rate": 0.00014549839353184327, "loss": 0.3017, "step": 5600 }, { "epoch": 5.19, "grad_norm": 6.327219009399414, "learning_rate": 0.00014121485965285484, "loss": 0.2922, "step": 5700 }, { "epoch": 5.28, "grad_norm": 0.7092263102531433, "learning_rate": 0.00013693850410503614, "loss": 0.314, "step": 5800 }, { "epoch": 5.37, "grad_norm": 0.38373059034347534, "learning_rate": 0.0001326728210950942, "loss": 0.3141, "step": 5900 }, { "epoch": 5.46, "grad_norm": 5.553852558135986, "learning_rate": 0.00012842129610921376, "loss": 0.2821, "step": 6000 }, { "epoch": 5.55, "grad_norm": 3.678790330886841, "learning_rate": 0.00012418740306506922, "loss": 0.3359, "step": 6100 }, { "epoch": 5.64, "grad_norm": 4.428023338317871, "learning_rate": 0.00011997460147328983, "loss": 0.2825, "step": 6200 }, { "epoch": 5.73, "grad_norm": 3.3043198585510254, "learning_rate": 0.00011578633361069557, "loss": 0.3317, "step": 6300 }, { "epoch": 5.82, "grad_norm": 1.317456603050232, "learning_rate": 0.0001116260217076161, "loss": 0.2983, "step": 6400 }, { "epoch": 5.91, "grad_norm": 8.99087142944336, "learning_rate": 0.00010749706515158862, "loss": 0.3348, "step": 6500 }, { "epoch": 6.0, "eval_accuracy": 0.8970178926441352, "eval_loss": 0.41324833035469055, "eval_runtime": 103.3001, "eval_samples_per_second": 24.347, "eval_steps_per_second": 1.53, "step": 6594 }, { "epoch": 6.01, "grad_norm": 4.097568988800049, "learning_rate": 0.00010340283770972167, "loss": 0.3045, "step": 6600 }, { "epoch": 6.1, "grad_norm": 10.98578929901123, "learning_rate": 9.93466847719919e-05, "loss": 0.2327, "step": 6700 }, { "epoch": 6.19, "grad_norm": 13.919866561889648, "learning_rate": 9.533192061772917e-05, "loss": 0.2696, "step": 6800 }, { "epoch": 6.28, "grad_norm": 6.072042942047119, "learning_rate": 9.136182570752152e-05, "loss": 0.2258, "step": 6900 }, { "epoch": 6.37, "grad_norm": 0.17451171576976776, "learning_rate": 8.743964400275302e-05, "loss": 0.2406, "step": 7000 }, { "epoch": 6.46, "grad_norm": 0.33122172951698303, "learning_rate": 8.356858031496595e-05, "loss": 0.2505, "step": 7100 }, { "epoch": 6.55, "grad_norm": 1.151172161102295, "learning_rate": 7.975179768721186e-05, "loss": 0.1903, "step": 7200 }, { "epoch": 6.64, "grad_norm": 0.7031873464584351, "learning_rate": 7.59924148095311e-05, "loss": 0.2085, "step": 7300 }, { "epoch": 6.73, "grad_norm": 6.131903171539307, "learning_rate": 7.229350347067424e-05, "loss": 0.2471, "step": 7400 }, { "epoch": 6.82, "grad_norm": 6.110349178314209, "learning_rate": 6.865808604814564e-05, "loss": 0.2085, "step": 7500 }, { "epoch": 6.92, "grad_norm": 4.413149833679199, "learning_rate": 6.508913303862143e-05, "loss": 0.2594, "step": 7600 }, { "epoch": 7.0, "eval_accuracy": 0.9153081510934393, "eval_loss": 0.3626956641674042, "eval_runtime": 103.1814, "eval_samples_per_second": 24.375, "eval_steps_per_second": 1.531, "step": 7693 }, { "epoch": 7.01, "grad_norm": 3.3153018951416016, "learning_rate": 6.158956063075865e-05, "loss": 0.1743, "step": 7700 }, { "epoch": 7.1, "grad_norm": 2.5595600605010986, "learning_rate": 5.816222832238015e-05, "loss": 0.1699, "step": 7800 }, { "epoch": 7.19, "grad_norm": 3.9605636596679688, "learning_rate": 5.4809936583981286e-05, "loss": 0.2036, "step": 7900 }, { "epoch": 7.28, "grad_norm": 0.7597993612289429, "learning_rate": 5.1535424570467366e-05, "loss": 0.1829, "step": 8000 }, { "epoch": 7.37, "grad_norm": 5.694293022155762, "learning_rate": 4.834136788299248e-05, "loss": 0.2039, "step": 8100 }, { "epoch": 7.46, "grad_norm": 0.5163713097572327, "learning_rate": 4.523037638272821e-05, "loss": 0.1764, "step": 8200 }, { "epoch": 7.55, "grad_norm": 4.396867275238037, "learning_rate": 4.220499205834782e-05, "loss": 0.1862, "step": 8300 }, { "epoch": 7.64, "grad_norm": 0.054451316595077515, "learning_rate": 3.926768694896931e-05, "loss": 0.1773, "step": 8400 }, { "epoch": 7.73, "grad_norm": 0.23744052648544312, "learning_rate": 3.64208611242546e-05, "loss": 0.1648, "step": 8500 }, { "epoch": 7.83, "grad_norm": 3.540268659591675, "learning_rate": 3.366684072331414e-05, "loss": 0.1541, "step": 8600 }, { "epoch": 7.92, "grad_norm": 0.33744722604751587, "learning_rate": 3.100787605402072e-05, "loss": 0.1751, "step": 8700 }, { "epoch": 8.0, "eval_accuracy": 0.9308151093439364, "eval_loss": 0.3506681025028229, "eval_runtime": 103.8384, "eval_samples_per_second": 24.22, "eval_steps_per_second": 1.522, "step": 8792 }, { "epoch": 8.01, "grad_norm": 0.13831892609596252, "learning_rate": 2.844613975428448e-05, "loss": 0.1472, "step": 8800 }, { "epoch": 8.1, "grad_norm": 2.3098576068878174, "learning_rate": 2.5983725016792572e-05, "loss": 0.1772, "step": 8900 }, { "epoch": 8.19, "grad_norm": 0.10428429394960403, "learning_rate": 2.3622643878662696e-05, "loss": 0.1524, "step": 9000 }, { "epoch": 8.28, "grad_norm": 9.646160125732422, "learning_rate": 2.1364825577409422e-05, "loss": 0.1023, "step": 9100 }, { "epoch": 8.37, "grad_norm": 0.3879956305027008, "learning_rate": 1.9212114974565664e-05, "loss": 0.1421, "step": 9200 }, { "epoch": 8.46, "grad_norm": 0.022449787706136703, "learning_rate": 1.7166271048247792e-05, "loss": 0.1101, "step": 9300 }, { "epoch": 8.55, "grad_norm": 4.682805061340332, "learning_rate": 1.5228965455896053e-05, "loss": 0.1355, "step": 9400 }, { "epoch": 8.64, "grad_norm": 4.212618350982666, "learning_rate": 1.3401781168364589e-05, "loss": 0.1465, "step": 9500 }, { "epoch": 8.74, "grad_norm": 0.17709462344646454, "learning_rate": 1.1686211176477206e-05, "loss": 0.1375, "step": 9600 }, { "epoch": 8.83, "grad_norm": 7.981707572937012, "learning_rate": 1.0083657271105799e-05, "loss": 0.1498, "step": 9700 }, { "epoch": 8.92, "grad_norm": 7.477297306060791, "learning_rate": 8.59542889776807e-06, "loss": 0.1613, "step": 9800 }, { "epoch": 9.0, "eval_accuracy": 0.9300198807157057, "eval_loss": 0.34880414605140686, "eval_runtime": 103.3381, "eval_samples_per_second": 24.338, "eval_steps_per_second": 1.529, "step": 9891 }, { "epoch": 9.01, "grad_norm": 1.4414594173431396, "learning_rate": 7.222742086680755e-06, "loss": 0.1335, "step": 9900 }, { "epoch": 9.1, "grad_norm": 4.2091474533081055, "learning_rate": 5.966718459142195e-06, "loss": 0.1066, "step": 10000 }, { "epoch": 9.19, "grad_norm": 18.196033477783203, "learning_rate": 4.828384311056549e-06, "loss": 0.125, "step": 10100 }, { "epoch": 9.28, "grad_norm": 2.323212146759033, "learning_rate": 3.8086697743481664e-06, "loss": 0.1239, "step": 10200 }, { "epoch": 9.37, "grad_norm": 0.2004556953907013, "learning_rate": 2.9084080569515775e-06, "loss": 0.1076, "step": 10300 }, { "epoch": 9.46, "grad_norm": 12.655696868896484, "learning_rate": 2.128334761997924e-06, "loss": 0.1054, "step": 10400 }, { "epoch": 9.55, "grad_norm": 0.03721316158771515, "learning_rate": 1.469087286754289e-06, "loss": 0.125, "step": 10500 }, { "epoch": 9.65, "grad_norm": 3.7458605766296387, "learning_rate": 9.31204301806776e-07, "loss": 0.1161, "step": 10600 }, { "epoch": 9.74, "grad_norm": 0.63627690076828, "learning_rate": 5.151253109133391e-07, "loss": 0.1342, "step": 10700 }, { "epoch": 9.83, "grad_norm": 7.162803649902344, "learning_rate": 2.211902918855313e-07, "loss": 0.1365, "step": 10800 }, { "epoch": 9.92, "grad_norm": 1.5799212455749512, "learning_rate": 4.9639418792951634e-08, "loss": 0.1102, "step": 10900 }, { "epoch": 10.0, "eval_accuracy": 0.9280318091451292, "eval_loss": 0.34852299094200134, "eval_runtime": 103.4235, "eval_samples_per_second": 24.317, "eval_steps_per_second": 1.528, "step": 10990 }, { "epoch": 10.0, "step": 10990, "total_flos": 4.09349935387607e+19, "train_loss": 0.45005689912540897, "train_runtime": 16363.7477, "train_samples_per_second": 10.744, "train_steps_per_second": 0.672 } ], "logging_steps": 100, "max_steps": 10990, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 500, "total_flos": 4.09349935387607e+19, "train_batch_size": 16, "trial_name": null, "trial_params": null }