{ "best_metric": 0.11245531588792801, "best_model_checkpoint": "/home/tonyzhao6/Projects/urgency-detection-finetuning/results/model_training/gemma-2-2b-it-8bit-64-32-v4/checkpoint-700", "epoch": 0.970873786407767, "eval_steps": 100, "global_step": 700, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.013869625520110958, "grad_norm": 0.9987179040908813, "learning_rate": 1.834862385321101e-05, "loss": 2.1962, "step": 10 }, { "epoch": 0.027739251040221916, "grad_norm": 0.7583550810813904, "learning_rate": 3.669724770642202e-05, "loss": 1.6408, "step": 20 }, { "epoch": 0.04160887656033287, "grad_norm": 0.6101565361022949, "learning_rate": 5.504587155963303e-05, "loss": 0.8461, "step": 30 }, { "epoch": 0.05547850208044383, "grad_norm": 0.2617435157299042, "learning_rate": 7.339449541284404e-05, "loss": 0.3555, "step": 40 }, { "epoch": 0.06934812760055478, "grad_norm": 0.17355866730213165, "learning_rate": 9.174311926605506e-05, "loss": 0.2506, "step": 50 }, { "epoch": 0.08321775312066575, "grad_norm": 0.1640639752149582, "learning_rate": 0.00011009174311926606, "loss": 0.23, "step": 60 }, { "epoch": 0.0970873786407767, "grad_norm": 0.1592264175415039, "learning_rate": 0.00012844036697247707, "loss": 0.2178, "step": 70 }, { "epoch": 0.11095700416088766, "grad_norm": 0.13895617425441742, "learning_rate": 0.0001467889908256881, "loss": 0.1953, "step": 80 }, { "epoch": 0.12482662968099861, "grad_norm": 0.13263335824012756, "learning_rate": 0.0001651376146788991, "loss": 0.1851, "step": 90 }, { "epoch": 0.13869625520110956, "grad_norm": 0.14112189412117004, "learning_rate": 0.00018348623853211012, "loss": 0.1782, "step": 100 }, { "epoch": 0.13869625520110956, "eval_loss": 0.1666356474161148, "eval_runtime": 87.2237, "eval_samples_per_second": 14.48, "eval_steps_per_second": 0.906, "step": 100 }, { "epoch": 0.15256588072122051, "grad_norm": 0.1304333657026291, "learning_rate": 0.00019994279176201374, "loss": 0.1644, "step": 110 }, { "epoch": 0.1664355062413315, "grad_norm": 0.13556469976902008, "learning_rate": 0.00019937070938215104, "loss": 0.1614, "step": 120 }, { "epoch": 0.18030513176144244, "grad_norm": 0.12493357062339783, "learning_rate": 0.00019879862700228834, "loss": 0.148, "step": 130 }, { "epoch": 0.1941747572815534, "grad_norm": 0.12785165011882782, "learning_rate": 0.00019822654462242566, "loss": 0.1526, "step": 140 }, { "epoch": 0.20804438280166435, "grad_norm": 0.1414169818162918, "learning_rate": 0.00019765446224256295, "loss": 0.1498, "step": 150 }, { "epoch": 0.22191400832177532, "grad_norm": 0.11336012184619904, "learning_rate": 0.00019708237986270025, "loss": 0.1506, "step": 160 }, { "epoch": 0.23578363384188628, "grad_norm": 0.11893126368522644, "learning_rate": 0.00019651029748283754, "loss": 0.1343, "step": 170 }, { "epoch": 0.24965325936199723, "grad_norm": 0.12188615649938583, "learning_rate": 0.00019593821510297484, "loss": 0.1379, "step": 180 }, { "epoch": 0.2635228848821082, "grad_norm": 0.11430846899747849, "learning_rate": 0.00019536613272311214, "loss": 0.1344, "step": 190 }, { "epoch": 0.27739251040221913, "grad_norm": 0.11359121650457382, "learning_rate": 0.00019479405034324946, "loss": 0.139, "step": 200 }, { "epoch": 0.27739251040221913, "eval_loss": 0.13432957231998444, "eval_runtime": 86.9127, "eval_samples_per_second": 14.532, "eval_steps_per_second": 0.909, "step": 200 }, { "epoch": 0.2912621359223301, "grad_norm": 0.10335998982191086, "learning_rate": 0.00019422196796338675, "loss": 0.1374, "step": 210 }, { "epoch": 0.30513176144244103, "grad_norm": 0.09991727769374847, "learning_rate": 0.00019364988558352405, "loss": 0.1344, "step": 220 }, { "epoch": 0.31900138696255204, "grad_norm": 0.10995834320783615, "learning_rate": 0.00019307780320366135, "loss": 0.1394, "step": 230 }, { "epoch": 0.332871012482663, "grad_norm": 0.10396566987037659, "learning_rate": 0.00019250572082379864, "loss": 0.1223, "step": 240 }, { "epoch": 0.34674063800277394, "grad_norm": 0.10032226890325546, "learning_rate": 0.00019193363844393594, "loss": 0.1285, "step": 250 }, { "epoch": 0.3606102635228849, "grad_norm": 0.10445073246955872, "learning_rate": 0.00019136155606407323, "loss": 0.1261, "step": 260 }, { "epoch": 0.37447988904299584, "grad_norm": 0.11336586624383926, "learning_rate": 0.00019078947368421053, "loss": 0.1286, "step": 270 }, { "epoch": 0.3883495145631068, "grad_norm": 0.10205301642417908, "learning_rate": 0.00019021739130434782, "loss": 0.1208, "step": 280 }, { "epoch": 0.40221914008321774, "grad_norm": 0.09567493945360184, "learning_rate": 0.00018964530892448515, "loss": 0.1271, "step": 290 }, { "epoch": 0.4160887656033287, "grad_norm": 0.10747899860143661, "learning_rate": 0.00018907322654462244, "loss": 0.1233, "step": 300 }, { "epoch": 0.4160887656033287, "eval_loss": 0.1257271021604538, "eval_runtime": 86.6524, "eval_samples_per_second": 14.575, "eval_steps_per_second": 0.912, "step": 300 }, { "epoch": 0.42995839112343964, "grad_norm": 0.10108979046344757, "learning_rate": 0.00018850114416475974, "loss": 0.124, "step": 310 }, { "epoch": 0.44382801664355065, "grad_norm": 0.09316466003656387, "learning_rate": 0.00018792906178489703, "loss": 0.1212, "step": 320 }, { "epoch": 0.4576976421636616, "grad_norm": 0.10638488829135895, "learning_rate": 0.00018735697940503433, "loss": 0.1288, "step": 330 }, { "epoch": 0.47156726768377255, "grad_norm": 0.09914766252040863, "learning_rate": 0.00018678489702517162, "loss": 0.1259, "step": 340 }, { "epoch": 0.4854368932038835, "grad_norm": 0.09707864373922348, "learning_rate": 0.00018621281464530892, "loss": 0.124, "step": 350 }, { "epoch": 0.49930651872399445, "grad_norm": 0.09507231414318085, "learning_rate": 0.00018564073226544621, "loss": 0.1262, "step": 360 }, { "epoch": 0.5131761442441054, "grad_norm": 0.09129882603883743, "learning_rate": 0.0001850686498855835, "loss": 0.1211, "step": 370 }, { "epoch": 0.5270457697642164, "grad_norm": 0.09889239072799683, "learning_rate": 0.00018449656750572083, "loss": 0.1218, "step": 380 }, { "epoch": 0.5409153952843273, "grad_norm": 0.09886115044355392, "learning_rate": 0.00018392448512585813, "loss": 0.1214, "step": 390 }, { "epoch": 0.5547850208044383, "grad_norm": 0.09064166992902756, "learning_rate": 0.00018335240274599542, "loss": 0.126, "step": 400 }, { "epoch": 0.5547850208044383, "eval_loss": 0.12142250686883926, "eval_runtime": 86.64, "eval_samples_per_second": 14.578, "eval_steps_per_second": 0.912, "step": 400 }, { "epoch": 0.5686546463245492, "grad_norm": 0.10354544222354889, "learning_rate": 0.00018278032036613272, "loss": 0.1253, "step": 410 }, { "epoch": 0.5825242718446602, "grad_norm": 0.09165250509977341, "learning_rate": 0.00018220823798627001, "loss": 0.1224, "step": 420 }, { "epoch": 0.5963938973647711, "grad_norm": 0.09138130396604538, "learning_rate": 0.0001816361556064073, "loss": 0.1289, "step": 430 }, { "epoch": 0.6102635228848821, "grad_norm": 0.09735599905252457, "learning_rate": 0.00018106407322654463, "loss": 0.1181, "step": 440 }, { "epoch": 0.624133148404993, "grad_norm": 0.09955897927284241, "learning_rate": 0.00018049199084668193, "loss": 0.1207, "step": 450 }, { "epoch": 0.6380027739251041, "grad_norm": 0.09378518909215927, "learning_rate": 0.00017991990846681922, "loss": 0.1189, "step": 460 }, { "epoch": 0.651872399445215, "grad_norm": 0.09985518455505371, "learning_rate": 0.00017934782608695652, "loss": 0.1196, "step": 470 }, { "epoch": 0.665742024965326, "grad_norm": 0.09567826986312866, "learning_rate": 0.00017877574370709382, "loss": 0.1189, "step": 480 }, { "epoch": 0.6796116504854369, "grad_norm": 0.09133660793304443, "learning_rate": 0.0001782036613272311, "loss": 0.1199, "step": 490 }, { "epoch": 0.6934812760055479, "grad_norm": 0.07571779191493988, "learning_rate": 0.00017763157894736843, "loss": 0.1199, "step": 500 }, { "epoch": 0.6934812760055479, "eval_loss": 0.11764033138751984, "eval_runtime": 86.7125, "eval_samples_per_second": 14.565, "eval_steps_per_second": 0.911, "step": 500 }, { "epoch": 0.7073509015256588, "grad_norm": 0.07904700189828873, "learning_rate": 0.00017705949656750573, "loss": 0.1174, "step": 510 }, { "epoch": 0.7212205270457698, "grad_norm": 0.0874553844332695, "learning_rate": 0.00017648741418764302, "loss": 0.1191, "step": 520 }, { "epoch": 0.7350901525658807, "grad_norm": 0.09417985379695892, "learning_rate": 0.00017591533180778032, "loss": 0.1158, "step": 530 }, { "epoch": 0.7489597780859917, "grad_norm": 0.0866062194108963, "learning_rate": 0.00017534324942791762, "loss": 0.1106, "step": 540 }, { "epoch": 0.7628294036061026, "grad_norm": 0.08498796820640564, "learning_rate": 0.0001747711670480549, "loss": 0.1124, "step": 550 }, { "epoch": 0.7766990291262136, "grad_norm": 0.08251694589853287, "learning_rate": 0.00017419908466819223, "loss": 0.1136, "step": 560 }, { "epoch": 0.7905686546463245, "grad_norm": 0.08275240659713745, "learning_rate": 0.00017362700228832953, "loss": 0.1107, "step": 570 }, { "epoch": 0.8044382801664355, "grad_norm": 0.08751562237739563, "learning_rate": 0.00017305491990846682, "loss": 0.1169, "step": 580 }, { "epoch": 0.8183079056865464, "grad_norm": 0.09078636020421982, "learning_rate": 0.00017248283752860412, "loss": 0.1143, "step": 590 }, { "epoch": 0.8321775312066574, "grad_norm": 0.08412676304578781, "learning_rate": 0.00017191075514874142, "loss": 0.1197, "step": 600 }, { "epoch": 0.8321775312066574, "eval_loss": 0.11502571403980255, "eval_runtime": 86.625, "eval_samples_per_second": 14.58, "eval_steps_per_second": 0.912, "step": 600 }, { "epoch": 0.8460471567267683, "grad_norm": 0.08373397588729858, "learning_rate": 0.0001713386727688787, "loss": 0.1205, "step": 610 }, { "epoch": 0.8599167822468793, "grad_norm": 0.08933025598526001, "learning_rate": 0.00017076659038901603, "loss": 0.1147, "step": 620 }, { "epoch": 0.8737864077669902, "grad_norm": 0.08800772577524185, "learning_rate": 0.00017019450800915333, "loss": 0.1201, "step": 630 }, { "epoch": 0.8876560332871013, "grad_norm": 0.08623263984918594, "learning_rate": 0.00016962242562929063, "loss": 0.1144, "step": 640 }, { "epoch": 0.9015256588072122, "grad_norm": 0.0788191556930542, "learning_rate": 0.00016905034324942792, "loss": 0.1188, "step": 650 }, { "epoch": 0.9153952843273232, "grad_norm": 0.0787658542394638, "learning_rate": 0.00016847826086956522, "loss": 0.1077, "step": 660 }, { "epoch": 0.9292649098474342, "grad_norm": 0.08364666253328323, "learning_rate": 0.0001679061784897025, "loss": 0.1072, "step": 670 }, { "epoch": 0.9431345353675451, "grad_norm": 0.08853990584611893, "learning_rate": 0.00016733409610983983, "loss": 0.1097, "step": 680 }, { "epoch": 0.957004160887656, "grad_norm": 0.08456674963235855, "learning_rate": 0.00016676201372997713, "loss": 0.1167, "step": 690 }, { "epoch": 0.970873786407767, "grad_norm": 0.0840703621506691, "learning_rate": 0.00016618993135011443, "loss": 0.1231, "step": 700 }, { "epoch": 0.970873786407767, "eval_loss": 0.11245531588792801, "eval_runtime": 86.613, "eval_samples_per_second": 14.582, "eval_steps_per_second": 0.912, "step": 700 } ], "logging_steps": 10, "max_steps": 3605, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.0638251619228058e+17, "train_batch_size": 16, "trial_name": null, "trial_params": null }