{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 100, "global_step": 704, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.007106057914372002, "grad_norm": 66.55605725941018, "learning_rate": 3.6363636363636366e-06, "loss": 2.4888, "mean_token_accuracy": 0.6310219071805477, "num_tokens": 5923034.0, "step": 5 }, { "epoch": 0.014212115828744005, "grad_norm": 31.27595336039531, "learning_rate": 8.181818181818183e-06, "loss": 1.6522, "mean_token_accuracy": 0.6847351841628552, "num_tokens": 11832986.0, "step": 10 }, { "epoch": 0.021318173743116006, "grad_norm": 6.505788538701925, "learning_rate": 1.2727272727272728e-05, "loss": 1.0489, "mean_token_accuracy": 0.7373726107180119, "num_tokens": 17770055.0, "step": 15 }, { "epoch": 0.02842423165748801, "grad_norm": 3.4652580364487653, "learning_rate": 1.7272727272727274e-05, "loss": 0.8759, "mean_token_accuracy": 0.7604092583060265, "num_tokens": 23710854.0, "step": 20 }, { "epoch": 0.03553028957186001, "grad_norm": 3.073564258123934, "learning_rate": 1.999961805535155e-05, "loss": 0.8341, "mean_token_accuracy": 0.7634279936552048, "num_tokens": 29669291.0, "step": 25 }, { "epoch": 0.04263634748623201, "grad_norm": 2.908944153808897, "learning_rate": 1.9995321550350065e-05, "loss": 0.7871, "mean_token_accuracy": 0.7735667265951633, "num_tokens": 35600851.0, "step": 30 }, { "epoch": 0.04974240540060401, "grad_norm": 3.8874402850017673, "learning_rate": 1.998625339625423e-05, "loss": 0.7597, "mean_token_accuracy": 0.7763237416744232, "num_tokens": 41530042.0, "step": 35 }, { "epoch": 0.05684846331497602, "grad_norm": 2.5449731597634697, "learning_rate": 1.9972418403347817e-05, "loss": 0.7219, "mean_token_accuracy": 0.7790968000888825, "num_tokens": 47452248.0, "step": 40 }, { "epoch": 0.06395452122934801, "grad_norm": 1.7369247244872457, "learning_rate": 1.9953823910527057e-05, "loss": 0.6924, "mean_token_accuracy": 0.7809838131070137, "num_tokens": 53373989.0, "step": 45 }, { "epoch": 0.07106057914372002, "grad_norm": 1.619526678453925, "learning_rate": 1.993047978140764e-05, "loss": 0.6759, "mean_token_accuracy": 0.7882510013878345, "num_tokens": 59280737.0, "step": 50 }, { "epoch": 0.07816663705809203, "grad_norm": 1.4050312043738487, "learning_rate": 1.9902398399092494e-05, "loss": 0.6604, "mean_token_accuracy": 0.7918689742684364, "num_tokens": 65195627.0, "step": 55 }, { "epoch": 0.08527269497246402, "grad_norm": 3.4085768530761555, "learning_rate": 1.9869594659603032e-05, "loss": 0.6644, "mean_token_accuracy": 0.7887415766716004, "num_tokens": 71142550.0, "step": 60 }, { "epoch": 0.09237875288683603, "grad_norm": 2.1575833755380707, "learning_rate": 1.9832085963977445e-05, "loss": 0.6732, "mean_token_accuracy": 0.786578668653965, "num_tokens": 77081694.0, "step": 65 }, { "epoch": 0.09948481080120802, "grad_norm": 2.1880249716642743, "learning_rate": 1.978989220904016e-05, "loss": 0.6647, "mean_token_accuracy": 0.7869649574160575, "num_tokens": 83044225.0, "step": 70 }, { "epoch": 0.10659086871558003, "grad_norm": 3.500718658694026, "learning_rate": 1.9743035776847377e-05, "loss": 0.6618, "mean_token_accuracy": 0.7884958483278751, "num_tokens": 88988372.0, "step": 75 }, { "epoch": 0.11369692662995204, "grad_norm": 3.5435216425879, "learning_rate": 1.9691541522814327e-05, "loss": 0.6538, "mean_token_accuracy": 0.7917183473706245, "num_tokens": 94931302.0, "step": 80 }, { "epoch": 0.12080298454432403, "grad_norm": 3.3524381829737675, "learning_rate": 1.963543676253048e-05, "loss": 0.668, "mean_token_accuracy": 0.7873365215957164, "num_tokens": 100848612.0, "step": 85 }, { "epoch": 0.12790904245869603, "grad_norm": 2.5235394583310655, "learning_rate": 1.9574751257269748e-05, "loss": 0.6632, "mean_token_accuracy": 0.7879712589085102, "num_tokens": 106754467.0, "step": 90 }, { "epoch": 0.13501510037306805, "grad_norm": 2.1804248444209673, "learning_rate": 1.950951719820335e-05, "loss": 0.6519, "mean_token_accuracy": 0.7906477533280849, "num_tokens": 112664258.0, "step": 95 }, { "epoch": 0.14212115828744004, "grad_norm": 2.1726552942080497, "learning_rate": 1.9439769189323727e-05, "loss": 0.654, "mean_token_accuracy": 0.7900555059313774, "num_tokens": 118606688.0, "step": 100 }, { "epoch": 0.14212115828744004, "eval_loss": 0.6459761261940002, "eval_mean_token_accuracy": 0.7921642295101232, "eval_num_tokens": 118606688.0, "eval_runtime": 295.44, "eval_samples_per_second": 12.317, "eval_steps_per_second": 0.386, "step": 100 }, { "epoch": 0.14922721620181204, "grad_norm": 2.0323675408135484, "learning_rate": 1.9365544229088517e-05, "loss": 0.6521, "mean_token_accuracy": 0.7909861005842685, "num_tokens": 124512961.0, "step": 105 }, { "epoch": 0.15633327411618406, "grad_norm": 5.430519736360723, "learning_rate": 1.9286881690794425e-05, "loss": 0.6453, "mean_token_accuracy": 0.7935434632003308, "num_tokens": 130441640.0, "step": 110 }, { "epoch": 0.16343933203055605, "grad_norm": 2.6425373488919943, "learning_rate": 1.9203823301691272e-05, "loss": 0.6683, "mean_token_accuracy": 0.7861981622874736, "num_tokens": 136394287.0, "step": 115 }, { "epoch": 0.17054538994492804, "grad_norm": 3.2273755870117724, "learning_rate": 1.9116413120847425e-05, "loss": 0.6649, "mean_token_accuracy": 0.7885523498058319, "num_tokens": 142309319.0, "step": 120 }, { "epoch": 0.17765144785930007, "grad_norm": 4.163268789121899, "learning_rate": 1.902469751577826e-05, "loss": 0.6679, "mean_token_accuracy": 0.7868955120444298, "num_tokens": 148247490.0, "step": 125 }, { "epoch": 0.18475750577367206, "grad_norm": 1.6661402706531616, "learning_rate": 1.892872513785008e-05, "loss": 0.6394, "mean_token_accuracy": 0.7936947368085384, "num_tokens": 154195595.0, "step": 130 }, { "epoch": 0.19186356368804405, "grad_norm": 1.5794423300632663, "learning_rate": 1.88285468964726e-05, "loss": 0.6373, "mean_token_accuracy": 0.7937119543552399, "num_tokens": 160139401.0, "step": 135 }, { "epoch": 0.19896962160241605, "grad_norm": 1.9146265900976953, "learning_rate": 1.872421593209355e-05, "loss": 0.6367, "mean_token_accuracy": 0.7936570249497891, "num_tokens": 166057697.0, "step": 140 }, { "epoch": 0.20607567951678807, "grad_norm": 1.8920022402043013, "learning_rate": 1.861578758800989e-05, "loss": 0.6448, "mean_token_accuracy": 0.7935090765357018, "num_tokens": 171980981.0, "step": 145 }, { "epoch": 0.21318173743116006, "grad_norm": 1.8829721816030007, "learning_rate": 1.8503319381010414e-05, "loss": 0.6394, "mean_token_accuracy": 0.7929822854697705, "num_tokens": 177920811.0, "step": 150 }, { "epoch": 0.22028779534553206, "grad_norm": 2.710625896660201, "learning_rate": 1.8386870970865488e-05, "loss": 0.6297, "mean_token_accuracy": 0.795507474988699, "num_tokens": 183857404.0, "step": 155 }, { "epoch": 0.22739385325990408, "grad_norm": 1.7692011271897554, "learning_rate": 1.8266504128679988e-05, "loss": 0.6374, "mean_token_accuracy": 0.7952376157045364, "num_tokens": 189793021.0, "step": 160 }, { "epoch": 0.23449991117427607, "grad_norm": 1.226150395650258, "learning_rate": 1.814228270412624e-05, "loss": 0.6525, "mean_token_accuracy": 0.7921756438910961, "num_tokens": 195715230.0, "step": 165 }, { "epoch": 0.24160596908864806, "grad_norm": 2.1730739897367206, "learning_rate": 1.8014272591574405e-05, "loss": 0.6452, "mean_token_accuracy": 0.7933229982852936, "num_tokens": 201644724.0, "step": 170 }, { "epoch": 0.2487120270030201, "grad_norm": 1.1273239605881678, "learning_rate": 1.7882541695138224e-05, "loss": 0.6328, "mean_token_accuracy": 0.795696322619915, "num_tokens": 207585561.0, "step": 175 }, { "epoch": 0.25581808491739205, "grad_norm": 1.094910349166684, "learning_rate": 1.7747159892654646e-05, "loss": 0.6294, "mean_token_accuracy": 0.7952122300863266, "num_tokens": 213540458.0, "step": 180 }, { "epoch": 0.2629241428317641, "grad_norm": 1.4265158165299685, "learning_rate": 1.7608198998616533e-05, "loss": 0.6401, "mean_token_accuracy": 0.7934339419007301, "num_tokens": 219456523.0, "step": 185 }, { "epoch": 0.2700302007461361, "grad_norm": 1.1866353031490813, "learning_rate": 1.7465732726077993e-05, "loss": 0.6376, "mean_token_accuracy": 0.7937880590558052, "num_tokens": 225385658.0, "step": 190 }, { "epoch": 0.27713625866050806, "grad_norm": 1.1122272717695676, "learning_rate": 1.731983664755264e-05, "loss": 0.6434, "mean_token_accuracy": 0.793655838817358, "num_tokens": 231315244.0, "step": 195 }, { "epoch": 0.2842423165748801, "grad_norm": 0.9655859557982183, "learning_rate": 1.717058815492548e-05, "loss": 0.641, "mean_token_accuracy": 0.792879494279623, "num_tokens": 237275441.0, "step": 200 }, { "epoch": 0.2842423165748801, "eval_loss": 0.6347336769104004, "eval_mean_token_accuracy": 0.7943492432435354, "eval_num_tokens": 237275441.0, "eval_runtime": 296.27, "eval_samples_per_second": 12.283, "eval_steps_per_second": 0.385, "step": 200 }, { "epoch": 0.2913483744892521, "grad_norm": 0.9237497775331585, "learning_rate": 1.701806641839967e-05, "loss": 0.6373, "mean_token_accuracy": 0.7951489560306072, "num_tokens": 243189498.0, "step": 205 }, { "epoch": 0.29845443240362407, "grad_norm": 0.953221738403784, "learning_rate": 1.6862352344500004e-05, "loss": 0.6402, "mean_token_accuracy": 0.7936429545283318, "num_tokens": 249109887.0, "step": 210 }, { "epoch": 0.3055604903179961, "grad_norm": 0.9703848636195304, "learning_rate": 1.6703528533155283e-05, "loss": 0.6414, "mean_token_accuracy": 0.7941999517381191, "num_tokens": 255049150.0, "step": 215 }, { "epoch": 0.3126665482323681, "grad_norm": 0.96915858246581, "learning_rate": 1.6541679233882477e-05, "loss": 0.6298, "mean_token_accuracy": 0.7961237229406833, "num_tokens": 260976846.0, "step": 220 }, { "epoch": 0.3197726061467401, "grad_norm": 0.9933766730385373, "learning_rate": 1.63768903010958e-05, "loss": 0.6356, "mean_token_accuracy": 0.795130829513073, "num_tokens": 266905427.0, "step": 225 }, { "epoch": 0.3268786640611121, "grad_norm": 1.0546956761351753, "learning_rate": 1.6209249148564437e-05, "loss": 0.6304, "mean_token_accuracy": 0.7961406633257866, "num_tokens": 272824016.0, "step": 230 }, { "epoch": 0.3339847219754841, "grad_norm": 0.9224242489597514, "learning_rate": 1.603884470304318e-05, "loss": 0.6414, "mean_token_accuracy": 0.7928595051169396, "num_tokens": 278772935.0, "step": 235 }, { "epoch": 0.3410907798898561, "grad_norm": 1.775037599109117, "learning_rate": 1.5865767357100383e-05, "loss": 0.6247, "mean_token_accuracy": 0.7980937138199806, "num_tokens": 284699860.0, "step": 240 }, { "epoch": 0.3481968378042281, "grad_norm": 3.6986968651179866, "learning_rate": 1.5690108921168428e-05, "loss": 0.6282, "mean_token_accuracy": 0.7979529812932015, "num_tokens": 290642638.0, "step": 245 }, { "epoch": 0.35530289571860013, "grad_norm": 1.1970583265540296, "learning_rate": 1.5511962574842073e-05, "loss": 0.6265, "mean_token_accuracy": 0.7972055464982987, "num_tokens": 296584355.0, "step": 250 }, { "epoch": 0.3624089536329721, "grad_norm": 1.3725931963002411, "learning_rate": 1.5331422817450485e-05, "loss": 0.6316, "mean_token_accuracy": 0.7962015710771084, "num_tokens": 302514276.0, "step": 255 }, { "epoch": 0.3695150115473441, "grad_norm": 1.052505642053235, "learning_rate": 1.5148585417929212e-05, "loss": 0.6265, "mean_token_accuracy": 0.797739926725626, "num_tokens": 308442058.0, "step": 260 }, { "epoch": 0.37662106946171614, "grad_norm": 1.2266076226995661, "learning_rate": 1.4963547364018711e-05, "loss": 0.6228, "mean_token_accuracy": 0.7978610590100288, "num_tokens": 314358386.0, "step": 265 }, { "epoch": 0.3837271273760881, "grad_norm": 0.9779433419910959, "learning_rate": 1.477640681081632e-05, "loss": 0.6274, "mean_token_accuracy": 0.797017228603363, "num_tokens": 320330186.0, "step": 270 }, { "epoch": 0.39083318529046013, "grad_norm": 1.386190197186753, "learning_rate": 1.4587263028709013e-05, "loss": 0.6243, "mean_token_accuracy": 0.7981224037706852, "num_tokens": 326254997.0, "step": 275 }, { "epoch": 0.3979392432048321, "grad_norm": 2.614420120110884, "learning_rate": 1.4396216350714512e-05, "loss": 0.6262, "mean_token_accuracy": 0.7971527561545372, "num_tokens": 332131183.0, "step": 280 }, { "epoch": 0.4050453011192041, "grad_norm": 1.042309981896861, "learning_rate": 1.4203368119258759e-05, "loss": 0.6239, "mean_token_accuracy": 0.7973503857851029, "num_tokens": 338051217.0, "step": 285 }, { "epoch": 0.41215135903357614, "grad_norm": 1.3072882069993246, "learning_rate": 1.4008820632417906e-05, "loss": 0.6153, "mean_token_accuracy": 0.8006650306284427, "num_tokens": 343962755.0, "step": 290 }, { "epoch": 0.4192574169479481, "grad_norm": 0.9860476843101288, "learning_rate": 1.381267708965339e-05, "loss": 0.6203, "mean_token_accuracy": 0.7994598127901554, "num_tokens": 349868870.0, "step": 295 }, { "epoch": 0.4263634748623201, "grad_norm": 1.4036170847387148, "learning_rate": 1.3615041537068831e-05, "loss": 0.6295, "mean_token_accuracy": 0.7960586912930012, "num_tokens": 355805962.0, "step": 300 }, { "epoch": 0.4263634748623201, "eval_loss": 0.6163658499717712, "eval_mean_token_accuracy": 0.7995022409840634, "eval_num_tokens": 355805962.0, "eval_runtime": 296.606, "eval_samples_per_second": 12.269, "eval_steps_per_second": 0.384, "step": 300 }, { "epoch": 0.43346953277669215, "grad_norm": 1.4151324155929559, "learning_rate": 1.3416018812217866e-05, "loss": 0.6254, "mean_token_accuracy": 0.7966083332896232, "num_tokens": 361723150.0, "step": 305 }, { "epoch": 0.4405755906910641, "grad_norm": 1.2868115337774566, "learning_rate": 1.3215714488492121e-05, "loss": 0.6078, "mean_token_accuracy": 0.8015532568097115, "num_tokens": 367658526.0, "step": 310 }, { "epoch": 0.44768164860543613, "grad_norm": 1.928461708072047, "learning_rate": 1.3014234819118846e-05, "loss": 0.606, "mean_token_accuracy": 0.8015141606330871, "num_tokens": 373569982.0, "step": 315 }, { "epoch": 0.45478770651980815, "grad_norm": 1.899128993970621, "learning_rate": 1.2811686680797942e-05, "loss": 0.6182, "mean_token_accuracy": 0.7982937648892403, "num_tokens": 379503775.0, "step": 320 }, { "epoch": 0.4618937644341801, "grad_norm": 1.4589431652822673, "learning_rate": 1.2608177517008268e-05, "loss": 0.606, "mean_token_accuracy": 0.8024964012205601, "num_tokens": 385421798.0, "step": 325 }, { "epoch": 0.46899982234855214, "grad_norm": 2.312165029030224, "learning_rate": 1.240381528101327e-05, "loss": 0.6036, "mean_token_accuracy": 0.8019742712378501, "num_tokens": 391368024.0, "step": 330 }, { "epoch": 0.47610588026292416, "grad_norm": 1.538144906292264, "learning_rate": 1.2198708378596198e-05, "loss": 0.5993, "mean_token_accuracy": 0.804237449914217, "num_tokens": 397292736.0, "step": 335 }, { "epoch": 0.48321193817729613, "grad_norm": 1.307446043015124, "learning_rate": 1.19929656105553e-05, "loss": 0.6046, "mean_token_accuracy": 0.8020064242184162, "num_tokens": 403217739.0, "step": 340 }, { "epoch": 0.49031799609166815, "grad_norm": 1.4958823006704178, "learning_rate": 1.1786696114989455e-05, "loss": 0.6049, "mean_token_accuracy": 0.8029085315763951, "num_tokens": 409134483.0, "step": 345 }, { "epoch": 0.4974240540060402, "grad_norm": 1.3590504704358832, "learning_rate": 1.1580009309404887e-05, "loss": 0.6068, "mean_token_accuracy": 0.8020060114562512, "num_tokens": 415063752.0, "step": 350 }, { "epoch": 0.5045301119204122, "grad_norm": 1.5361640914741626, "learning_rate": 1.1373014832673661e-05, "loss": 0.6058, "mean_token_accuracy": 0.8034303903579711, "num_tokens": 420986440.0, "step": 355 }, { "epoch": 0.5116361698347841, "grad_norm": 1.7406253504574596, "learning_rate": 1.1165822486874773e-05, "loss": 0.6013, "mean_token_accuracy": 0.8025040835142135, "num_tokens": 426919043.0, "step": 360 }, { "epoch": 0.5187422277491561, "grad_norm": 1.4252805000356146, "learning_rate": 1.0958542179048637e-05, "loss": 0.5975, "mean_token_accuracy": 0.8049191392958164, "num_tokens": 432835321.0, "step": 365 }, { "epoch": 0.5258482856635281, "grad_norm": 1.6635488719233447, "learning_rate": 1.0751283862895914e-05, "loss": 0.6198, "mean_token_accuracy": 0.7999552808701992, "num_tokens": 438731328.0, "step": 370 }, { "epoch": 0.5329543435779002, "grad_norm": 1.3128810337611407, "learning_rate": 1.0544157480451586e-05, "loss": 0.6091, "mean_token_accuracy": 0.800326906144619, "num_tokens": 444678429.0, "step": 375 }, { "epoch": 0.5400604014922722, "grad_norm": 1.4889952603189804, "learning_rate": 1.033727290376522e-05, "loss": 0.6138, "mean_token_accuracy": 0.8001590967178345, "num_tokens": 450570676.0, "step": 380 }, { "epoch": 0.5471664594066442, "grad_norm": 1.466863294864097, "learning_rate": 1.013073987661834e-05, "loss": 0.6135, "mean_token_accuracy": 0.7993282429873944, "num_tokens": 456505695.0, "step": 385 }, { "epoch": 0.5542725173210161, "grad_norm": 2.0382279212397996, "learning_rate": 9.924667956309862e-06, "loss": 0.5994, "mean_token_accuracy": 0.8032713256776333, "num_tokens": 462436441.0, "step": 390 }, { "epoch": 0.5613785752353881, "grad_norm": 2.5944556078034373, "learning_rate": 9.719166455540437e-06, "loss": 0.6081, "mean_token_accuracy": 0.801172049343586, "num_tokens": 468376688.0, "step": 395 }, { "epoch": 0.5684846331497602, "grad_norm": 2.3431282794499113, "learning_rate": 9.51434438442655e-06, "loss": 0.6024, "mean_token_accuracy": 0.8036491274833679, "num_tokens": 474296296.0, "step": 400 }, { "epoch": 0.5684846331497602, "eval_loss": 0.6026122570037842, "eval_mean_token_accuracy": 0.8030629869092974, "eval_num_tokens": 474296296.0, "eval_runtime": 295.7635, "eval_samples_per_second": 12.304, "eval_steps_per_second": 0.385, "step": 400 }, { "epoch": 0.5755906910641322, "grad_norm": 2.004552904299561, "learning_rate": 9.310310392675132e-06, "loss": 0.5946, "mean_token_accuracy": 0.8059669084846973, "num_tokens": 480221232.0, "step": 405 }, { "epoch": 0.5826967489785042, "grad_norm": 2.182094919737894, "learning_rate": 9.107172711949324e-06, "loss": 0.6098, "mean_token_accuracy": 0.8020808771252632, "num_tokens": 486176414.0, "step": 410 }, { "epoch": 0.5898028068928762, "grad_norm": 2.181381325740844, "learning_rate": 8.905039098456049e-06, "loss": 0.6011, "mean_token_accuracy": 0.8030769810080528, "num_tokens": 492102670.0, "step": 415 }, { "epoch": 0.5969088648072481, "grad_norm": 1.8310617565549334, "learning_rate": 8.704016775785742e-06, "loss": 0.6047, "mean_token_accuracy": 0.8027667418122292, "num_tokens": 498034157.0, "step": 420 }, { "epoch": 0.6040149227216202, "grad_norm": 1.8408277960821589, "learning_rate": 8.50421237803464e-06, "loss": 0.6, "mean_token_accuracy": 0.8030680187046528, "num_tokens": 503966085.0, "step": 425 }, { "epoch": 0.6111209806359922, "grad_norm": 1.58781331586126, "learning_rate": 8.30573189323978e-06, "loss": 0.5923, "mean_token_accuracy": 0.8065764397382736, "num_tokens": 509912387.0, "step": 430 }, { "epoch": 0.6182270385503642, "grad_norm": 2.4917867524381085, "learning_rate": 8.108680607156669e-06, "loss": 0.6057, "mean_token_accuracy": 0.8019099831581116, "num_tokens": 515846667.0, "step": 435 }, { "epoch": 0.6253330964647362, "grad_norm": 1.4683997587556166, "learning_rate": 7.913163047409533e-06, "loss": 0.6007, "mean_token_accuracy": 0.8031819522380829, "num_tokens": 521759712.0, "step": 440 }, { "epoch": 0.6324391543791081, "grad_norm": 2.0497193131572993, "learning_rate": 7.719282928043688e-06, "loss": 0.6026, "mean_token_accuracy": 0.8027250319719315, "num_tokens": 527693723.0, "step": 445 }, { "epoch": 0.6395452122934802, "grad_norm": 1.9866627694997772, "learning_rate": 7.527143094509492e-06, "loss": 0.5962, "mean_token_accuracy": 0.8050611786544323, "num_tokens": 533641375.0, "step": 450 }, { "epoch": 0.6466512702078522, "grad_norm": 2.0671724220740284, "learning_rate": 7.336845469107061e-06, "loss": 0.6012, "mean_token_accuracy": 0.8034082941710949, "num_tokens": 539566506.0, "step": 455 }, { "epoch": 0.6537573281222242, "grad_norm": 1.1733461701395618, "learning_rate": 7.148490996920661e-06, "loss": 0.6005, "mean_token_accuracy": 0.8036525435745716, "num_tokens": 545484724.0, "step": 460 }, { "epoch": 0.6608633860365962, "grad_norm": 1.3593627896085958, "learning_rate": 6.9621795922714805e-06, "loss": 0.5938, "mean_token_accuracy": 0.8044769234955311, "num_tokens": 551415017.0, "step": 465 }, { "epoch": 0.6679694439509682, "grad_norm": 1.3812452385551917, "learning_rate": 6.778010085717202e-06, "loss": 0.5998, "mean_token_accuracy": 0.8046882562339306, "num_tokens": 557320461.0, "step": 470 }, { "epoch": 0.6750755018653402, "grad_norm": 1.8099599749434352, "learning_rate": 6.596080171626409e-06, "loss": 0.6023, "mean_token_accuracy": 0.8030730128288269, "num_tokens": 563258287.0, "step": 475 }, { "epoch": 0.6821815597797122, "grad_norm": 1.4003389577047656, "learning_rate": 6.416486356355769e-06, "loss": 0.6083, "mean_token_accuracy": 0.8014967061579228, "num_tokens": 569180910.0, "step": 480 }, { "epoch": 0.6892876176940842, "grad_norm": 1.064555140921138, "learning_rate": 6.239323907057342e-06, "loss": 0.6031, "mean_token_accuracy": 0.8031500183045864, "num_tokens": 575092087.0, "step": 485 }, { "epoch": 0.6963936756084562, "grad_norm": 1.42689507484952, "learning_rate": 6.064686801143271e-06, "loss": 0.5872, "mean_token_accuracy": 0.8078797787427903, "num_tokens": 581002946.0, "step": 490 }, { "epoch": 0.7034997335228282, "grad_norm": 1.4988954301366828, "learning_rate": 5.892667676434633e-06, "loss": 0.5969, "mean_token_accuracy": 0.805253654718399, "num_tokens": 586913668.0, "step": 495 }, { "epoch": 0.7106057914372003, "grad_norm": 1.812281786151278, "learning_rate": 5.723357782020867e-06, "loss": 0.5895, "mean_token_accuracy": 0.8048664882779122, "num_tokens": 592832114.0, "step": 500 }, { "epoch": 0.7106057914372003, "eval_loss": 0.5955030918121338, "eval_mean_token_accuracy": 0.804359252515592, "eval_num_tokens": 592832114.0, "eval_runtime": 295.6444, "eval_samples_per_second": 12.309, "eval_steps_per_second": 0.386, "step": 500 }, { "epoch": 0.7177118493515722, "grad_norm": 1.9139670158434299, "learning_rate": 5.556846929855857e-06, "loss": 0.5887, "mean_token_accuracy": 0.8057731881737709, "num_tokens": 598754045.0, "step": 505 }, { "epoch": 0.7248179072659442, "grad_norm": 1.5674173121754666, "learning_rate": 5.393223447116409e-06, "loss": 0.6035, "mean_token_accuracy": 0.803417231887579, "num_tokens": 604714198.0, "step": 510 }, { "epoch": 0.7319239651803162, "grad_norm": 1.4540694573929962, "learning_rate": 5.232574129348278e-06, "loss": 0.5945, "mean_token_accuracy": 0.804961483925581, "num_tokens": 610643984.0, "step": 515 }, { "epoch": 0.7390300230946882, "grad_norm": 1.5167979309975335, "learning_rate": 5.0749841944247e-06, "loss": 0.6049, "mean_token_accuracy": 0.8024425834417344, "num_tokens": 616589217.0, "step": 520 }, { "epoch": 0.7461360810090603, "grad_norm": 1.4872845521101352, "learning_rate": 4.92053723734182e-06, "loss": 0.6022, "mean_token_accuracy": 0.8034923203289509, "num_tokens": 622495105.0, "step": 525 }, { "epoch": 0.7532421389234323, "grad_norm": 2.4122501400628993, "learning_rate": 4.769315185874951e-06, "loss": 0.5961, "mean_token_accuracy": 0.8057848632335662, "num_tokens": 628430302.0, "step": 530 }, { "epoch": 0.7603481968378042, "grad_norm": 3.0932302891179235, "learning_rate": 4.621398257119266e-06, "loss": 0.5966, "mean_token_accuracy": 0.8043950840830802, "num_tokens": 634369704.0, "step": 535 }, { "epoch": 0.7674542547521762, "grad_norm": 2.761826923400515, "learning_rate": 4.476864914937923e-06, "loss": 0.5879, "mean_token_accuracy": 0.8060916163027286, "num_tokens": 640316507.0, "step": 540 }, { "epoch": 0.7745603126665482, "grad_norm": 1.810383003438599, "learning_rate": 4.335791828340183e-06, "loss": 0.6014, "mean_token_accuracy": 0.8044339507818222, "num_tokens": 646273380.0, "step": 545 }, { "epoch": 0.7816663705809203, "grad_norm": 1.57106874125145, "learning_rate": 4.1982538308116775e-06, "loss": 0.5933, "mean_token_accuracy": 0.8038996756076813, "num_tokens": 652201532.0, "step": 550 }, { "epoch": 0.7887724284952923, "grad_norm": 1.65882515510847, "learning_rate": 4.064323880618279e-06, "loss": 0.5979, "mean_token_accuracy": 0.8053264081478119, "num_tokens": 658135558.0, "step": 555 }, { "epoch": 0.7958784864096642, "grad_norm": 1.7520635112367258, "learning_rate": 3.934073022104759e-06, "loss": 0.5942, "mean_token_accuracy": 0.8051018618047238, "num_tokens": 664053021.0, "step": 560 }, { "epoch": 0.8029845443240362, "grad_norm": 2.747926156722073, "learning_rate": 3.807570348008672e-06, "loss": 0.5958, "mean_token_accuracy": 0.805073781311512, "num_tokens": 669989223.0, "step": 565 }, { "epoch": 0.8100906022384082, "grad_norm": 1.2910340620000669, "learning_rate": 3.684882962809484e-06, "loss": 0.6003, "mean_token_accuracy": 0.804193302989006, "num_tokens": 675921465.0, "step": 570 }, { "epoch": 0.8171966601527803, "grad_norm": 2.2556806715561817, "learning_rate": 3.5660759471324037e-06, "loss": 0.5971, "mean_token_accuracy": 0.8039252124726772, "num_tokens": 681852307.0, "step": 575 }, { "epoch": 0.8243027180671523, "grad_norm": 1.7373756069231372, "learning_rate": 3.451212323225786e-06, "loss": 0.5925, "mean_token_accuracy": 0.8057592801749707, "num_tokens": 687769737.0, "step": 580 }, { "epoch": 0.8314087759815243, "grad_norm": 3.192756357062325, "learning_rate": 3.340353021530409e-06, "loss": 0.5902, "mean_token_accuracy": 0.8047820582985878, "num_tokens": 693702256.0, "step": 585 }, { "epoch": 0.8385148338958962, "grad_norm": 2.3339162718389854, "learning_rate": 3.2335568483583708e-06, "loss": 0.5867, "mean_token_accuracy": 0.8085503794252873, "num_tokens": 699642883.0, "step": 590 }, { "epoch": 0.8456208918102682, "grad_norm": 1.7159764847482752, "learning_rate": 3.1308804546987615e-06, "loss": 0.5988, "mean_token_accuracy": 0.8035182796418667, "num_tokens": 705606507.0, "step": 595 }, { "epoch": 0.8527269497246402, "grad_norm": 2.8135588347439984, "learning_rate": 3.0323783061666307e-06, "loss": 0.5943, "mean_token_accuracy": 0.805966579914093, "num_tokens": 711533271.0, "step": 600 }, { "epoch": 0.8527269497246402, "eval_loss": 0.5903548002243042, "eval_mean_token_accuracy": 0.8059158665046358, "eval_num_tokens": 711533271.0, "eval_runtime": 296.5539, "eval_samples_per_second": 12.271, "eval_steps_per_second": 0.384, "step": 600 }, { "epoch": 0.8598330076390123, "grad_norm": 1.620026356285568, "learning_rate": 2.9381026541112145e-06, "loss": 0.5934, "mean_token_accuracy": 0.8051214568316937, "num_tokens": 717498751.0, "step": 605 }, { "epoch": 0.8669390655533843, "grad_norm": 1.346134723264547, "learning_rate": 2.848103507898745e-06, "loss": 0.5995, "mean_token_accuracy": 0.8026661708950996, "num_tokens": 723421175.0, "step": 610 }, { "epoch": 0.8740451234677563, "grad_norm": 1.3133061113418305, "learning_rate": 2.7624286083845187e-06, "loss": 0.5905, "mean_token_accuracy": 0.8046399556100369, "num_tokens": 729353013.0, "step": 615 }, { "epoch": 0.8811511813821282, "grad_norm": 1.7379818179026807, "learning_rate": 2.6811234025883457e-06, "loss": 0.5876, "mean_token_accuracy": 0.8071587301790715, "num_tokens": 735289319.0, "step": 620 }, { "epoch": 0.8882572392965002, "grad_norm": 4.0244229329833, "learning_rate": 2.604231019586761e-06, "loss": 0.5908, "mean_token_accuracy": 0.8060196414589882, "num_tokens": 741217107.0, "step": 625 }, { "epoch": 0.8953632972108723, "grad_norm": 1.7622879809387222, "learning_rate": 2.5317922476348194e-06, "loss": 0.5952, "mean_token_accuracy": 0.8054036945104599, "num_tokens": 747147403.0, "step": 630 }, { "epoch": 0.9024693551252443, "grad_norm": 2.2585412448119007, "learning_rate": 2.4638455125296043e-06, "loss": 0.5925, "mean_token_accuracy": 0.8054537117481232, "num_tokens": 753044605.0, "step": 635 }, { "epoch": 0.9095754130396163, "grad_norm": 1.5262303547489815, "learning_rate": 2.400426857226914e-06, "loss": 0.5879, "mean_token_accuracy": 0.8063779830932617, "num_tokens": 758944457.0, "step": 640 }, { "epoch": 0.9166814709539882, "grad_norm": 2.545493717270523, "learning_rate": 2.3415699227219517e-06, "loss": 0.5992, "mean_token_accuracy": 0.8029563590884209, "num_tokens": 764871608.0, "step": 645 }, { "epoch": 0.9237875288683602, "grad_norm": 1.7376685867634216, "learning_rate": 2.2873059302041627e-06, "loss": 0.5896, "mean_token_accuracy": 0.8063512444496155, "num_tokens": 770802003.0, "step": 650 }, { "epoch": 0.9308935867827323, "grad_norm": 1.7377001386795896, "learning_rate": 2.2376636644956656e-06, "loss": 0.5874, "mean_token_accuracy": 0.8068042829632759, "num_tokens": 776734106.0, "step": 655 }, { "epoch": 0.9379996446971043, "grad_norm": 3.942139353532877, "learning_rate": 2.192669458782096e-06, "loss": 0.5952, "mean_token_accuracy": 0.8051416061818599, "num_tokens": 782648208.0, "step": 660 }, { "epoch": 0.9451057026114763, "grad_norm": 2.1646964014669794, "learning_rate": 2.1523471806439205e-06, "loss": 0.6057, "mean_token_accuracy": 0.8031233668327331, "num_tokens": 788582279.0, "step": 665 }, { "epoch": 0.9522117605258483, "grad_norm": 2.4342297711554393, "learning_rate": 2.1167182193956738e-06, "loss": 0.5833, "mean_token_accuracy": 0.8084848992526531, "num_tokens": 794490250.0, "step": 670 }, { "epoch": 0.9593178184402202, "grad_norm": 1.3849504055994097, "learning_rate": 2.0858014747397952e-06, "loss": 0.5927, "mean_token_accuracy": 0.8059051290154458, "num_tokens": 800406765.0, "step": 675 }, { "epoch": 0.9664238763545923, "grad_norm": 1.8233541738404475, "learning_rate": 2.0596133467411213e-06, "loss": 0.5866, "mean_token_accuracy": 0.8072662524878979, "num_tokens": 806349672.0, "step": 680 }, { "epoch": 0.9735299342689643, "grad_norm": 1.3071754875862744, "learning_rate": 2.0381677271273177e-06, "loss": 0.589, "mean_token_accuracy": 0.8057045452296734, "num_tokens": 812264222.0, "step": 685 }, { "epoch": 0.9806359921833363, "grad_norm": 1.5289098269363515, "learning_rate": 2.0214759919198904e-06, "loss": 0.5831, "mean_token_accuracy": 0.8068839557468891, "num_tokens": 818190520.0, "step": 690 }, { "epoch": 0.9877420500977083, "grad_norm": 2.015373930862521, "learning_rate": 2.0095469953996724e-06, "loss": 0.5994, "mean_token_accuracy": 0.8041414134204388, "num_tokens": 824098000.0, "step": 695 }, { "epoch": 0.9948481080120803, "grad_norm": 2.2460839306695104, "learning_rate": 2.002387065409989e-06, "loss": 0.5962, "mean_token_accuracy": 0.8041631534695626, "num_tokens": 830080703.0, "step": 700 }, { "epoch": 0.9948481080120803, "eval_loss": 0.5880739688873291, "eval_mean_token_accuracy": 0.8062155152622023, "eval_num_tokens": 830080703.0, "eval_runtime": 296.6258, "eval_samples_per_second": 12.268, "eval_steps_per_second": 0.384, "step": 700 }, { "epoch": 1.0, "step": 704, "total_flos": 1.2125984924893184e+16, "train_loss": 0.0, "train_runtime": 1.6574, "train_samples_per_second": 217359.421, "train_steps_per_second": 424.761 } ], "logging_steps": 5, "max_steps": 704, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 1, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.2125984924893184e+16, "train_batch_size": 4, "trial_name": null, "trial_params": null }