| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 1.0, | |
| "eval_steps": 100, | |
| "global_step": 704, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.007106057914372002, | |
| "grad_norm": 66.55605725941018, | |
| "learning_rate": 3.6363636363636366e-06, | |
| "loss": 2.4888, | |
| "mean_token_accuracy": 0.6310219071805477, | |
| "num_tokens": 5923034.0, | |
| "step": 5 | |
| }, | |
| { | |
| "epoch": 0.014212115828744005, | |
| "grad_norm": 31.27595336039531, | |
| "learning_rate": 8.181818181818183e-06, | |
| "loss": 1.6522, | |
| "mean_token_accuracy": 0.6847351841628552, | |
| "num_tokens": 11832986.0, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.021318173743116006, | |
| "grad_norm": 6.505788538701925, | |
| "learning_rate": 1.2727272727272728e-05, | |
| "loss": 1.0489, | |
| "mean_token_accuracy": 0.7373726107180119, | |
| "num_tokens": 17770055.0, | |
| "step": 15 | |
| }, | |
| { | |
| "epoch": 0.02842423165748801, | |
| "grad_norm": 3.4652580364487653, | |
| "learning_rate": 1.7272727272727274e-05, | |
| "loss": 0.8759, | |
| "mean_token_accuracy": 0.7604092583060265, | |
| "num_tokens": 23710854.0, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.03553028957186001, | |
| "grad_norm": 3.073564258123934, | |
| "learning_rate": 1.999961805535155e-05, | |
| "loss": 0.8341, | |
| "mean_token_accuracy": 0.7634279936552048, | |
| "num_tokens": 29669291.0, | |
| "step": 25 | |
| }, | |
| { | |
| "epoch": 0.04263634748623201, | |
| "grad_norm": 2.908944153808897, | |
| "learning_rate": 1.9995321550350065e-05, | |
| "loss": 0.7871, | |
| "mean_token_accuracy": 0.7735667265951633, | |
| "num_tokens": 35600851.0, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.04974240540060401, | |
| "grad_norm": 3.8874402850017673, | |
| "learning_rate": 1.998625339625423e-05, | |
| "loss": 0.7597, | |
| "mean_token_accuracy": 0.7763237416744232, | |
| "num_tokens": 41530042.0, | |
| "step": 35 | |
| }, | |
| { | |
| "epoch": 0.05684846331497602, | |
| "grad_norm": 2.5449731597634697, | |
| "learning_rate": 1.9972418403347817e-05, | |
| "loss": 0.7219, | |
| "mean_token_accuracy": 0.7790968000888825, | |
| "num_tokens": 47452248.0, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.06395452122934801, | |
| "grad_norm": 1.7369247244872457, | |
| "learning_rate": 1.9953823910527057e-05, | |
| "loss": 0.6924, | |
| "mean_token_accuracy": 0.7809838131070137, | |
| "num_tokens": 53373989.0, | |
| "step": 45 | |
| }, | |
| { | |
| "epoch": 0.07106057914372002, | |
| "grad_norm": 1.619526678453925, | |
| "learning_rate": 1.993047978140764e-05, | |
| "loss": 0.6759, | |
| "mean_token_accuracy": 0.7882510013878345, | |
| "num_tokens": 59280737.0, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.07816663705809203, | |
| "grad_norm": 1.4050312043738487, | |
| "learning_rate": 1.9902398399092494e-05, | |
| "loss": 0.6604, | |
| "mean_token_accuracy": 0.7918689742684364, | |
| "num_tokens": 65195627.0, | |
| "step": 55 | |
| }, | |
| { | |
| "epoch": 0.08527269497246402, | |
| "grad_norm": 3.4085768530761555, | |
| "learning_rate": 1.9869594659603032e-05, | |
| "loss": 0.6644, | |
| "mean_token_accuracy": 0.7887415766716004, | |
| "num_tokens": 71142550.0, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.09237875288683603, | |
| "grad_norm": 2.1575833755380707, | |
| "learning_rate": 1.9832085963977445e-05, | |
| "loss": 0.6732, | |
| "mean_token_accuracy": 0.786578668653965, | |
| "num_tokens": 77081694.0, | |
| "step": 65 | |
| }, | |
| { | |
| "epoch": 0.09948481080120802, | |
| "grad_norm": 2.1880249716642743, | |
| "learning_rate": 1.978989220904016e-05, | |
| "loss": 0.6647, | |
| "mean_token_accuracy": 0.7869649574160575, | |
| "num_tokens": 83044225.0, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.10659086871558003, | |
| "grad_norm": 3.500718658694026, | |
| "learning_rate": 1.9743035776847377e-05, | |
| "loss": 0.6618, | |
| "mean_token_accuracy": 0.7884958483278751, | |
| "num_tokens": 88988372.0, | |
| "step": 75 | |
| }, | |
| { | |
| "epoch": 0.11369692662995204, | |
| "grad_norm": 3.5435216425879, | |
| "learning_rate": 1.9691541522814327e-05, | |
| "loss": 0.6538, | |
| "mean_token_accuracy": 0.7917183473706245, | |
| "num_tokens": 94931302.0, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.12080298454432403, | |
| "grad_norm": 3.3524381829737675, | |
| "learning_rate": 1.963543676253048e-05, | |
| "loss": 0.668, | |
| "mean_token_accuracy": 0.7873365215957164, | |
| "num_tokens": 100848612.0, | |
| "step": 85 | |
| }, | |
| { | |
| "epoch": 0.12790904245869603, | |
| "grad_norm": 2.5235394583310655, | |
| "learning_rate": 1.9574751257269748e-05, | |
| "loss": 0.6632, | |
| "mean_token_accuracy": 0.7879712589085102, | |
| "num_tokens": 106754467.0, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.13501510037306805, | |
| "grad_norm": 2.1804248444209673, | |
| "learning_rate": 1.950951719820335e-05, | |
| "loss": 0.6519, | |
| "mean_token_accuracy": 0.7906477533280849, | |
| "num_tokens": 112664258.0, | |
| "step": 95 | |
| }, | |
| { | |
| "epoch": 0.14212115828744004, | |
| "grad_norm": 2.1726552942080497, | |
| "learning_rate": 1.9439769189323727e-05, | |
| "loss": 0.654, | |
| "mean_token_accuracy": 0.7900555059313774, | |
| "num_tokens": 118606688.0, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.14212115828744004, | |
| "eval_loss": 0.6459761261940002, | |
| "eval_mean_token_accuracy": 0.7921642295101232, | |
| "eval_num_tokens": 118606688.0, | |
| "eval_runtime": 295.44, | |
| "eval_samples_per_second": 12.317, | |
| "eval_steps_per_second": 0.386, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.14922721620181204, | |
| "grad_norm": 2.0323675408135484, | |
| "learning_rate": 1.9365544229088517e-05, | |
| "loss": 0.6521, | |
| "mean_token_accuracy": 0.7909861005842685, | |
| "num_tokens": 124512961.0, | |
| "step": 105 | |
| }, | |
| { | |
| "epoch": 0.15633327411618406, | |
| "grad_norm": 5.430519736360723, | |
| "learning_rate": 1.9286881690794425e-05, | |
| "loss": 0.6453, | |
| "mean_token_accuracy": 0.7935434632003308, | |
| "num_tokens": 130441640.0, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.16343933203055605, | |
| "grad_norm": 2.6425373488919943, | |
| "learning_rate": 1.9203823301691272e-05, | |
| "loss": 0.6683, | |
| "mean_token_accuracy": 0.7861981622874736, | |
| "num_tokens": 136394287.0, | |
| "step": 115 | |
| }, | |
| { | |
| "epoch": 0.17054538994492804, | |
| "grad_norm": 3.2273755870117724, | |
| "learning_rate": 1.9116413120847425e-05, | |
| "loss": 0.6649, | |
| "mean_token_accuracy": 0.7885523498058319, | |
| "num_tokens": 142309319.0, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.17765144785930007, | |
| "grad_norm": 4.163268789121899, | |
| "learning_rate": 1.902469751577826e-05, | |
| "loss": 0.6679, | |
| "mean_token_accuracy": 0.7868955120444298, | |
| "num_tokens": 148247490.0, | |
| "step": 125 | |
| }, | |
| { | |
| "epoch": 0.18475750577367206, | |
| "grad_norm": 1.6661402706531616, | |
| "learning_rate": 1.892872513785008e-05, | |
| "loss": 0.6394, | |
| "mean_token_accuracy": 0.7936947368085384, | |
| "num_tokens": 154195595.0, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.19186356368804405, | |
| "grad_norm": 1.5794423300632663, | |
| "learning_rate": 1.88285468964726e-05, | |
| "loss": 0.6373, | |
| "mean_token_accuracy": 0.7937119543552399, | |
| "num_tokens": 160139401.0, | |
| "step": 135 | |
| }, | |
| { | |
| "epoch": 0.19896962160241605, | |
| "grad_norm": 1.9146265900976953, | |
| "learning_rate": 1.872421593209355e-05, | |
| "loss": 0.6367, | |
| "mean_token_accuracy": 0.7936570249497891, | |
| "num_tokens": 166057697.0, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.20607567951678807, | |
| "grad_norm": 1.8920022402043013, | |
| "learning_rate": 1.861578758800989e-05, | |
| "loss": 0.6448, | |
| "mean_token_accuracy": 0.7935090765357018, | |
| "num_tokens": 171980981.0, | |
| "step": 145 | |
| }, | |
| { | |
| "epoch": 0.21318173743116006, | |
| "grad_norm": 1.8829721816030007, | |
| "learning_rate": 1.8503319381010414e-05, | |
| "loss": 0.6394, | |
| "mean_token_accuracy": 0.7929822854697705, | |
| "num_tokens": 177920811.0, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.22028779534553206, | |
| "grad_norm": 2.710625896660201, | |
| "learning_rate": 1.8386870970865488e-05, | |
| "loss": 0.6297, | |
| "mean_token_accuracy": 0.795507474988699, | |
| "num_tokens": 183857404.0, | |
| "step": 155 | |
| }, | |
| { | |
| "epoch": 0.22739385325990408, | |
| "grad_norm": 1.7692011271897554, | |
| "learning_rate": 1.8266504128679988e-05, | |
| "loss": 0.6374, | |
| "mean_token_accuracy": 0.7952376157045364, | |
| "num_tokens": 189793021.0, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.23449991117427607, | |
| "grad_norm": 1.226150395650258, | |
| "learning_rate": 1.814228270412624e-05, | |
| "loss": 0.6525, | |
| "mean_token_accuracy": 0.7921756438910961, | |
| "num_tokens": 195715230.0, | |
| "step": 165 | |
| }, | |
| { | |
| "epoch": 0.24160596908864806, | |
| "grad_norm": 2.1730739897367206, | |
| "learning_rate": 1.8014272591574405e-05, | |
| "loss": 0.6452, | |
| "mean_token_accuracy": 0.7933229982852936, | |
| "num_tokens": 201644724.0, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.2487120270030201, | |
| "grad_norm": 1.1273239605881678, | |
| "learning_rate": 1.7882541695138224e-05, | |
| "loss": 0.6328, | |
| "mean_token_accuracy": 0.795696322619915, | |
| "num_tokens": 207585561.0, | |
| "step": 175 | |
| }, | |
| { | |
| "epoch": 0.25581808491739205, | |
| "grad_norm": 1.094910349166684, | |
| "learning_rate": 1.7747159892654646e-05, | |
| "loss": 0.6294, | |
| "mean_token_accuracy": 0.7952122300863266, | |
| "num_tokens": 213540458.0, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.2629241428317641, | |
| "grad_norm": 1.4265158165299685, | |
| "learning_rate": 1.7608198998616533e-05, | |
| "loss": 0.6401, | |
| "mean_token_accuracy": 0.7934339419007301, | |
| "num_tokens": 219456523.0, | |
| "step": 185 | |
| }, | |
| { | |
| "epoch": 0.2700302007461361, | |
| "grad_norm": 1.1866353031490813, | |
| "learning_rate": 1.7465732726077993e-05, | |
| "loss": 0.6376, | |
| "mean_token_accuracy": 0.7937880590558052, | |
| "num_tokens": 225385658.0, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.27713625866050806, | |
| "grad_norm": 1.1122272717695676, | |
| "learning_rate": 1.731983664755264e-05, | |
| "loss": 0.6434, | |
| "mean_token_accuracy": 0.793655838817358, | |
| "num_tokens": 231315244.0, | |
| "step": 195 | |
| }, | |
| { | |
| "epoch": 0.2842423165748801, | |
| "grad_norm": 0.9655859557982183, | |
| "learning_rate": 1.717058815492548e-05, | |
| "loss": 0.641, | |
| "mean_token_accuracy": 0.792879494279623, | |
| "num_tokens": 237275441.0, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.2842423165748801, | |
| "eval_loss": 0.6347336769104004, | |
| "eval_mean_token_accuracy": 0.7943492432435354, | |
| "eval_num_tokens": 237275441.0, | |
| "eval_runtime": 296.27, | |
| "eval_samples_per_second": 12.283, | |
| "eval_steps_per_second": 0.385, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.2913483744892521, | |
| "grad_norm": 0.9237497775331585, | |
| "learning_rate": 1.701806641839967e-05, | |
| "loss": 0.6373, | |
| "mean_token_accuracy": 0.7951489560306072, | |
| "num_tokens": 243189498.0, | |
| "step": 205 | |
| }, | |
| { | |
| "epoch": 0.29845443240362407, | |
| "grad_norm": 0.953221738403784, | |
| "learning_rate": 1.6862352344500004e-05, | |
| "loss": 0.6402, | |
| "mean_token_accuracy": 0.7936429545283318, | |
| "num_tokens": 249109887.0, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.3055604903179961, | |
| "grad_norm": 0.9703848636195304, | |
| "learning_rate": 1.6703528533155283e-05, | |
| "loss": 0.6414, | |
| "mean_token_accuracy": 0.7941999517381191, | |
| "num_tokens": 255049150.0, | |
| "step": 215 | |
| }, | |
| { | |
| "epoch": 0.3126665482323681, | |
| "grad_norm": 0.96915858246581, | |
| "learning_rate": 1.6541679233882477e-05, | |
| "loss": 0.6298, | |
| "mean_token_accuracy": 0.7961237229406833, | |
| "num_tokens": 260976846.0, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.3197726061467401, | |
| "grad_norm": 0.9933766730385373, | |
| "learning_rate": 1.63768903010958e-05, | |
| "loss": 0.6356, | |
| "mean_token_accuracy": 0.795130829513073, | |
| "num_tokens": 266905427.0, | |
| "step": 225 | |
| }, | |
| { | |
| "epoch": 0.3268786640611121, | |
| "grad_norm": 1.0546956761351753, | |
| "learning_rate": 1.6209249148564437e-05, | |
| "loss": 0.6304, | |
| "mean_token_accuracy": 0.7961406633257866, | |
| "num_tokens": 272824016.0, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.3339847219754841, | |
| "grad_norm": 0.9224242489597514, | |
| "learning_rate": 1.603884470304318e-05, | |
| "loss": 0.6414, | |
| "mean_token_accuracy": 0.7928595051169396, | |
| "num_tokens": 278772935.0, | |
| "step": 235 | |
| }, | |
| { | |
| "epoch": 0.3410907798898561, | |
| "grad_norm": 1.775037599109117, | |
| "learning_rate": 1.5865767357100383e-05, | |
| "loss": 0.6247, | |
| "mean_token_accuracy": 0.7980937138199806, | |
| "num_tokens": 284699860.0, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.3481968378042281, | |
| "grad_norm": 3.6986968651179866, | |
| "learning_rate": 1.5690108921168428e-05, | |
| "loss": 0.6282, | |
| "mean_token_accuracy": 0.7979529812932015, | |
| "num_tokens": 290642638.0, | |
| "step": 245 | |
| }, | |
| { | |
| "epoch": 0.35530289571860013, | |
| "grad_norm": 1.1970583265540296, | |
| "learning_rate": 1.5511962574842073e-05, | |
| "loss": 0.6265, | |
| "mean_token_accuracy": 0.7972055464982987, | |
| "num_tokens": 296584355.0, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.3624089536329721, | |
| "grad_norm": 1.3725931963002411, | |
| "learning_rate": 1.5331422817450485e-05, | |
| "loss": 0.6316, | |
| "mean_token_accuracy": 0.7962015710771084, | |
| "num_tokens": 302514276.0, | |
| "step": 255 | |
| }, | |
| { | |
| "epoch": 0.3695150115473441, | |
| "grad_norm": 1.052505642053235, | |
| "learning_rate": 1.5148585417929212e-05, | |
| "loss": 0.6265, | |
| "mean_token_accuracy": 0.797739926725626, | |
| "num_tokens": 308442058.0, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.37662106946171614, | |
| "grad_norm": 1.2266076226995661, | |
| "learning_rate": 1.4963547364018711e-05, | |
| "loss": 0.6228, | |
| "mean_token_accuracy": 0.7978610590100288, | |
| "num_tokens": 314358386.0, | |
| "step": 265 | |
| }, | |
| { | |
| "epoch": 0.3837271273760881, | |
| "grad_norm": 0.9779433419910959, | |
| "learning_rate": 1.477640681081632e-05, | |
| "loss": 0.6274, | |
| "mean_token_accuracy": 0.797017228603363, | |
| "num_tokens": 320330186.0, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.39083318529046013, | |
| "grad_norm": 1.386190197186753, | |
| "learning_rate": 1.4587263028709013e-05, | |
| "loss": 0.6243, | |
| "mean_token_accuracy": 0.7981224037706852, | |
| "num_tokens": 326254997.0, | |
| "step": 275 | |
| }, | |
| { | |
| "epoch": 0.3979392432048321, | |
| "grad_norm": 2.614420120110884, | |
| "learning_rate": 1.4396216350714512e-05, | |
| "loss": 0.6262, | |
| "mean_token_accuracy": 0.7971527561545372, | |
| "num_tokens": 332131183.0, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.4050453011192041, | |
| "grad_norm": 1.042309981896861, | |
| "learning_rate": 1.4203368119258759e-05, | |
| "loss": 0.6239, | |
| "mean_token_accuracy": 0.7973503857851029, | |
| "num_tokens": 338051217.0, | |
| "step": 285 | |
| }, | |
| { | |
| "epoch": 0.41215135903357614, | |
| "grad_norm": 1.3072882069993246, | |
| "learning_rate": 1.4008820632417906e-05, | |
| "loss": 0.6153, | |
| "mean_token_accuracy": 0.8006650306284427, | |
| "num_tokens": 343962755.0, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 0.4192574169479481, | |
| "grad_norm": 0.9860476843101288, | |
| "learning_rate": 1.381267708965339e-05, | |
| "loss": 0.6203, | |
| "mean_token_accuracy": 0.7994598127901554, | |
| "num_tokens": 349868870.0, | |
| "step": 295 | |
| }, | |
| { | |
| "epoch": 0.4263634748623201, | |
| "grad_norm": 1.4036170847387148, | |
| "learning_rate": 1.3615041537068831e-05, | |
| "loss": 0.6295, | |
| "mean_token_accuracy": 0.7960586912930012, | |
| "num_tokens": 355805962.0, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.4263634748623201, | |
| "eval_loss": 0.6163658499717712, | |
| "eval_mean_token_accuracy": 0.7995022409840634, | |
| "eval_num_tokens": 355805962.0, | |
| "eval_runtime": 296.606, | |
| "eval_samples_per_second": 12.269, | |
| "eval_steps_per_second": 0.384, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.43346953277669215, | |
| "grad_norm": 1.4151324155929559, | |
| "learning_rate": 1.3416018812217866e-05, | |
| "loss": 0.6254, | |
| "mean_token_accuracy": 0.7966083332896232, | |
| "num_tokens": 361723150.0, | |
| "step": 305 | |
| }, | |
| { | |
| "epoch": 0.4405755906910641, | |
| "grad_norm": 1.2868115337774566, | |
| "learning_rate": 1.3215714488492121e-05, | |
| "loss": 0.6078, | |
| "mean_token_accuracy": 0.8015532568097115, | |
| "num_tokens": 367658526.0, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 0.44768164860543613, | |
| "grad_norm": 1.928461708072047, | |
| "learning_rate": 1.3014234819118846e-05, | |
| "loss": 0.606, | |
| "mean_token_accuracy": 0.8015141606330871, | |
| "num_tokens": 373569982.0, | |
| "step": 315 | |
| }, | |
| { | |
| "epoch": 0.45478770651980815, | |
| "grad_norm": 1.899128993970621, | |
| "learning_rate": 1.2811686680797942e-05, | |
| "loss": 0.6182, | |
| "mean_token_accuracy": 0.7982937648892403, | |
| "num_tokens": 379503775.0, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 0.4618937644341801, | |
| "grad_norm": 1.4589431652822673, | |
| "learning_rate": 1.2608177517008268e-05, | |
| "loss": 0.606, | |
| "mean_token_accuracy": 0.8024964012205601, | |
| "num_tokens": 385421798.0, | |
| "step": 325 | |
| }, | |
| { | |
| "epoch": 0.46899982234855214, | |
| "grad_norm": 2.312165029030224, | |
| "learning_rate": 1.240381528101327e-05, | |
| "loss": 0.6036, | |
| "mean_token_accuracy": 0.8019742712378501, | |
| "num_tokens": 391368024.0, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 0.47610588026292416, | |
| "grad_norm": 1.538144906292264, | |
| "learning_rate": 1.2198708378596198e-05, | |
| "loss": 0.5993, | |
| "mean_token_accuracy": 0.804237449914217, | |
| "num_tokens": 397292736.0, | |
| "step": 335 | |
| }, | |
| { | |
| "epoch": 0.48321193817729613, | |
| "grad_norm": 1.307446043015124, | |
| "learning_rate": 1.19929656105553e-05, | |
| "loss": 0.6046, | |
| "mean_token_accuracy": 0.8020064242184162, | |
| "num_tokens": 403217739.0, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 0.49031799609166815, | |
| "grad_norm": 1.4958823006704178, | |
| "learning_rate": 1.1786696114989455e-05, | |
| "loss": 0.6049, | |
| "mean_token_accuracy": 0.8029085315763951, | |
| "num_tokens": 409134483.0, | |
| "step": 345 | |
| }, | |
| { | |
| "epoch": 0.4974240540060402, | |
| "grad_norm": 1.3590504704358832, | |
| "learning_rate": 1.1580009309404887e-05, | |
| "loss": 0.6068, | |
| "mean_token_accuracy": 0.8020060114562512, | |
| "num_tokens": 415063752.0, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.5045301119204122, | |
| "grad_norm": 1.5361640914741626, | |
| "learning_rate": 1.1373014832673661e-05, | |
| "loss": 0.6058, | |
| "mean_token_accuracy": 0.8034303903579711, | |
| "num_tokens": 420986440.0, | |
| "step": 355 | |
| }, | |
| { | |
| "epoch": 0.5116361698347841, | |
| "grad_norm": 1.7406253504574596, | |
| "learning_rate": 1.1165822486874773e-05, | |
| "loss": 0.6013, | |
| "mean_token_accuracy": 0.8025040835142135, | |
| "num_tokens": 426919043.0, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 0.5187422277491561, | |
| "grad_norm": 1.4252805000356146, | |
| "learning_rate": 1.0958542179048637e-05, | |
| "loss": 0.5975, | |
| "mean_token_accuracy": 0.8049191392958164, | |
| "num_tokens": 432835321.0, | |
| "step": 365 | |
| }, | |
| { | |
| "epoch": 0.5258482856635281, | |
| "grad_norm": 1.6635488719233447, | |
| "learning_rate": 1.0751283862895914e-05, | |
| "loss": 0.6198, | |
| "mean_token_accuracy": 0.7999552808701992, | |
| "num_tokens": 438731328.0, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 0.5329543435779002, | |
| "grad_norm": 1.3128810337611407, | |
| "learning_rate": 1.0544157480451586e-05, | |
| "loss": 0.6091, | |
| "mean_token_accuracy": 0.800326906144619, | |
| "num_tokens": 444678429.0, | |
| "step": 375 | |
| }, | |
| { | |
| "epoch": 0.5400604014922722, | |
| "grad_norm": 1.4889952603189804, | |
| "learning_rate": 1.033727290376522e-05, | |
| "loss": 0.6138, | |
| "mean_token_accuracy": 0.8001590967178345, | |
| "num_tokens": 450570676.0, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 0.5471664594066442, | |
| "grad_norm": 1.466863294864097, | |
| "learning_rate": 1.013073987661834e-05, | |
| "loss": 0.6135, | |
| "mean_token_accuracy": 0.7993282429873944, | |
| "num_tokens": 456505695.0, | |
| "step": 385 | |
| }, | |
| { | |
| "epoch": 0.5542725173210161, | |
| "grad_norm": 2.0382279212397996, | |
| "learning_rate": 9.924667956309862e-06, | |
| "loss": 0.5994, | |
| "mean_token_accuracy": 0.8032713256776333, | |
| "num_tokens": 462436441.0, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 0.5613785752353881, | |
| "grad_norm": 2.5944556078034373, | |
| "learning_rate": 9.719166455540437e-06, | |
| "loss": 0.6081, | |
| "mean_token_accuracy": 0.801172049343586, | |
| "num_tokens": 468376688.0, | |
| "step": 395 | |
| }, | |
| { | |
| "epoch": 0.5684846331497602, | |
| "grad_norm": 2.3431282794499113, | |
| "learning_rate": 9.51434438442655e-06, | |
| "loss": 0.6024, | |
| "mean_token_accuracy": 0.8036491274833679, | |
| "num_tokens": 474296296.0, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.5684846331497602, | |
| "eval_loss": 0.6026122570037842, | |
| "eval_mean_token_accuracy": 0.8030629869092974, | |
| "eval_num_tokens": 474296296.0, | |
| "eval_runtime": 295.7635, | |
| "eval_samples_per_second": 12.304, | |
| "eval_steps_per_second": 0.385, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.5755906910641322, | |
| "grad_norm": 2.004552904299561, | |
| "learning_rate": 9.310310392675132e-06, | |
| "loss": 0.5946, | |
| "mean_token_accuracy": 0.8059669084846973, | |
| "num_tokens": 480221232.0, | |
| "step": 405 | |
| }, | |
| { | |
| "epoch": 0.5826967489785042, | |
| "grad_norm": 2.182094919737894, | |
| "learning_rate": 9.107172711949324e-06, | |
| "loss": 0.6098, | |
| "mean_token_accuracy": 0.8020808771252632, | |
| "num_tokens": 486176414.0, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 0.5898028068928762, | |
| "grad_norm": 2.181381325740844, | |
| "learning_rate": 8.905039098456049e-06, | |
| "loss": 0.6011, | |
| "mean_token_accuracy": 0.8030769810080528, | |
| "num_tokens": 492102670.0, | |
| "step": 415 | |
| }, | |
| { | |
| "epoch": 0.5969088648072481, | |
| "grad_norm": 1.8310617565549334, | |
| "learning_rate": 8.704016775785742e-06, | |
| "loss": 0.6047, | |
| "mean_token_accuracy": 0.8027667418122292, | |
| "num_tokens": 498034157.0, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 0.6040149227216202, | |
| "grad_norm": 1.8408277960821589, | |
| "learning_rate": 8.50421237803464e-06, | |
| "loss": 0.6, | |
| "mean_token_accuracy": 0.8030680187046528, | |
| "num_tokens": 503966085.0, | |
| "step": 425 | |
| }, | |
| { | |
| "epoch": 0.6111209806359922, | |
| "grad_norm": 1.58781331586126, | |
| "learning_rate": 8.30573189323978e-06, | |
| "loss": 0.5923, | |
| "mean_token_accuracy": 0.8065764397382736, | |
| "num_tokens": 509912387.0, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 0.6182270385503642, | |
| "grad_norm": 2.4917867524381085, | |
| "learning_rate": 8.108680607156669e-06, | |
| "loss": 0.6057, | |
| "mean_token_accuracy": 0.8019099831581116, | |
| "num_tokens": 515846667.0, | |
| "step": 435 | |
| }, | |
| { | |
| "epoch": 0.6253330964647362, | |
| "grad_norm": 1.4683997587556166, | |
| "learning_rate": 7.913163047409533e-06, | |
| "loss": 0.6007, | |
| "mean_token_accuracy": 0.8031819522380829, | |
| "num_tokens": 521759712.0, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 0.6324391543791081, | |
| "grad_norm": 2.0497193131572993, | |
| "learning_rate": 7.719282928043688e-06, | |
| "loss": 0.6026, | |
| "mean_token_accuracy": 0.8027250319719315, | |
| "num_tokens": 527693723.0, | |
| "step": 445 | |
| }, | |
| { | |
| "epoch": 0.6395452122934802, | |
| "grad_norm": 1.9866627694997772, | |
| "learning_rate": 7.527143094509492e-06, | |
| "loss": 0.5962, | |
| "mean_token_accuracy": 0.8050611786544323, | |
| "num_tokens": 533641375.0, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.6466512702078522, | |
| "grad_norm": 2.0671724220740284, | |
| "learning_rate": 7.336845469107061e-06, | |
| "loss": 0.6012, | |
| "mean_token_accuracy": 0.8034082941710949, | |
| "num_tokens": 539566506.0, | |
| "step": 455 | |
| }, | |
| { | |
| "epoch": 0.6537573281222242, | |
| "grad_norm": 1.1733461701395618, | |
| "learning_rate": 7.148490996920661e-06, | |
| "loss": 0.6005, | |
| "mean_token_accuracy": 0.8036525435745716, | |
| "num_tokens": 545484724.0, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 0.6608633860365962, | |
| "grad_norm": 1.3593627896085958, | |
| "learning_rate": 6.9621795922714805e-06, | |
| "loss": 0.5938, | |
| "mean_token_accuracy": 0.8044769234955311, | |
| "num_tokens": 551415017.0, | |
| "step": 465 | |
| }, | |
| { | |
| "epoch": 0.6679694439509682, | |
| "grad_norm": 1.3812452385551917, | |
| "learning_rate": 6.778010085717202e-06, | |
| "loss": 0.5998, | |
| "mean_token_accuracy": 0.8046882562339306, | |
| "num_tokens": 557320461.0, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 0.6750755018653402, | |
| "grad_norm": 1.8099599749434352, | |
| "learning_rate": 6.596080171626409e-06, | |
| "loss": 0.6023, | |
| "mean_token_accuracy": 0.8030730128288269, | |
| "num_tokens": 563258287.0, | |
| "step": 475 | |
| }, | |
| { | |
| "epoch": 0.6821815597797122, | |
| "grad_norm": 1.4003389577047656, | |
| "learning_rate": 6.416486356355769e-06, | |
| "loss": 0.6083, | |
| "mean_token_accuracy": 0.8014967061579228, | |
| "num_tokens": 569180910.0, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 0.6892876176940842, | |
| "grad_norm": 1.064555140921138, | |
| "learning_rate": 6.239323907057342e-06, | |
| "loss": 0.6031, | |
| "mean_token_accuracy": 0.8031500183045864, | |
| "num_tokens": 575092087.0, | |
| "step": 485 | |
| }, | |
| { | |
| "epoch": 0.6963936756084562, | |
| "grad_norm": 1.42689507484952, | |
| "learning_rate": 6.064686801143271e-06, | |
| "loss": 0.5872, | |
| "mean_token_accuracy": 0.8078797787427903, | |
| "num_tokens": 581002946.0, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 0.7034997335228282, | |
| "grad_norm": 1.4988954301366828, | |
| "learning_rate": 5.892667676434633e-06, | |
| "loss": 0.5969, | |
| "mean_token_accuracy": 0.805253654718399, | |
| "num_tokens": 586913668.0, | |
| "step": 495 | |
| }, | |
| { | |
| "epoch": 0.7106057914372003, | |
| "grad_norm": 1.812281786151278, | |
| "learning_rate": 5.723357782020867e-06, | |
| "loss": 0.5895, | |
| "mean_token_accuracy": 0.8048664882779122, | |
| "num_tokens": 592832114.0, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.7106057914372003, | |
| "eval_loss": 0.5955030918121338, | |
| "eval_mean_token_accuracy": 0.804359252515592, | |
| "eval_num_tokens": 592832114.0, | |
| "eval_runtime": 295.6444, | |
| "eval_samples_per_second": 12.309, | |
| "eval_steps_per_second": 0.386, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.7177118493515722, | |
| "grad_norm": 1.9139670158434299, | |
| "learning_rate": 5.556846929855857e-06, | |
| "loss": 0.5887, | |
| "mean_token_accuracy": 0.8057731881737709, | |
| "num_tokens": 598754045.0, | |
| "step": 505 | |
| }, | |
| { | |
| "epoch": 0.7248179072659442, | |
| "grad_norm": 1.5674173121754666, | |
| "learning_rate": 5.393223447116409e-06, | |
| "loss": 0.6035, | |
| "mean_token_accuracy": 0.803417231887579, | |
| "num_tokens": 604714198.0, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 0.7319239651803162, | |
| "grad_norm": 1.4540694573929962, | |
| "learning_rate": 5.232574129348278e-06, | |
| "loss": 0.5945, | |
| "mean_token_accuracy": 0.804961483925581, | |
| "num_tokens": 610643984.0, | |
| "step": 515 | |
| }, | |
| { | |
| "epoch": 0.7390300230946882, | |
| "grad_norm": 1.5167979309975335, | |
| "learning_rate": 5.0749841944247e-06, | |
| "loss": 0.6049, | |
| "mean_token_accuracy": 0.8024425834417344, | |
| "num_tokens": 616589217.0, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 0.7461360810090603, | |
| "grad_norm": 1.4872845521101352, | |
| "learning_rate": 4.92053723734182e-06, | |
| "loss": 0.6022, | |
| "mean_token_accuracy": 0.8034923203289509, | |
| "num_tokens": 622495105.0, | |
| "step": 525 | |
| }, | |
| { | |
| "epoch": 0.7532421389234323, | |
| "grad_norm": 2.4122501400628993, | |
| "learning_rate": 4.769315185874951e-06, | |
| "loss": 0.5961, | |
| "mean_token_accuracy": 0.8057848632335662, | |
| "num_tokens": 628430302.0, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 0.7603481968378042, | |
| "grad_norm": 3.0932302891179235, | |
| "learning_rate": 4.621398257119266e-06, | |
| "loss": 0.5966, | |
| "mean_token_accuracy": 0.8043950840830802, | |
| "num_tokens": 634369704.0, | |
| "step": 535 | |
| }, | |
| { | |
| "epoch": 0.7674542547521762, | |
| "grad_norm": 2.761826923400515, | |
| "learning_rate": 4.476864914937923e-06, | |
| "loss": 0.5879, | |
| "mean_token_accuracy": 0.8060916163027286, | |
| "num_tokens": 640316507.0, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 0.7745603126665482, | |
| "grad_norm": 1.810383003438599, | |
| "learning_rate": 4.335791828340183e-06, | |
| "loss": 0.6014, | |
| "mean_token_accuracy": 0.8044339507818222, | |
| "num_tokens": 646273380.0, | |
| "step": 545 | |
| }, | |
| { | |
| "epoch": 0.7816663705809203, | |
| "grad_norm": 1.57106874125145, | |
| "learning_rate": 4.1982538308116775e-06, | |
| "loss": 0.5933, | |
| "mean_token_accuracy": 0.8038996756076813, | |
| "num_tokens": 652201532.0, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 0.7887724284952923, | |
| "grad_norm": 1.65882515510847, | |
| "learning_rate": 4.064323880618279e-06, | |
| "loss": 0.5979, | |
| "mean_token_accuracy": 0.8053264081478119, | |
| "num_tokens": 658135558.0, | |
| "step": 555 | |
| }, | |
| { | |
| "epoch": 0.7958784864096642, | |
| "grad_norm": 1.7520635112367258, | |
| "learning_rate": 3.934073022104759e-06, | |
| "loss": 0.5942, | |
| "mean_token_accuracy": 0.8051018618047238, | |
| "num_tokens": 664053021.0, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 0.8029845443240362, | |
| "grad_norm": 2.747926156722073, | |
| "learning_rate": 3.807570348008672e-06, | |
| "loss": 0.5958, | |
| "mean_token_accuracy": 0.805073781311512, | |
| "num_tokens": 669989223.0, | |
| "step": 565 | |
| }, | |
| { | |
| "epoch": 0.8100906022384082, | |
| "grad_norm": 1.2910340620000669, | |
| "learning_rate": 3.684882962809484e-06, | |
| "loss": 0.6003, | |
| "mean_token_accuracy": 0.804193302989006, | |
| "num_tokens": 675921465.0, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 0.8171966601527803, | |
| "grad_norm": 2.2556806715561817, | |
| "learning_rate": 3.5660759471324037e-06, | |
| "loss": 0.5971, | |
| "mean_token_accuracy": 0.8039252124726772, | |
| "num_tokens": 681852307.0, | |
| "step": 575 | |
| }, | |
| { | |
| "epoch": 0.8243027180671523, | |
| "grad_norm": 1.7373756069231372, | |
| "learning_rate": 3.451212323225786e-06, | |
| "loss": 0.5925, | |
| "mean_token_accuracy": 0.8057592801749707, | |
| "num_tokens": 687769737.0, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 0.8314087759815243, | |
| "grad_norm": 3.192756357062325, | |
| "learning_rate": 3.340353021530409e-06, | |
| "loss": 0.5902, | |
| "mean_token_accuracy": 0.8047820582985878, | |
| "num_tokens": 693702256.0, | |
| "step": 585 | |
| }, | |
| { | |
| "epoch": 0.8385148338958962, | |
| "grad_norm": 2.3339162718389854, | |
| "learning_rate": 3.2335568483583708e-06, | |
| "loss": 0.5867, | |
| "mean_token_accuracy": 0.8085503794252873, | |
| "num_tokens": 699642883.0, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 0.8456208918102682, | |
| "grad_norm": 1.7159764847482752, | |
| "learning_rate": 3.1308804546987615e-06, | |
| "loss": 0.5988, | |
| "mean_token_accuracy": 0.8035182796418667, | |
| "num_tokens": 705606507.0, | |
| "step": 595 | |
| }, | |
| { | |
| "epoch": 0.8527269497246402, | |
| "grad_norm": 2.8135588347439984, | |
| "learning_rate": 3.0323783061666307e-06, | |
| "loss": 0.5943, | |
| "mean_token_accuracy": 0.805966579914093, | |
| "num_tokens": 711533271.0, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.8527269497246402, | |
| "eval_loss": 0.5903548002243042, | |
| "eval_mean_token_accuracy": 0.8059158665046358, | |
| "eval_num_tokens": 711533271.0, | |
| "eval_runtime": 296.5539, | |
| "eval_samples_per_second": 12.271, | |
| "eval_steps_per_second": 0.384, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.8598330076390123, | |
| "grad_norm": 1.620026356285568, | |
| "learning_rate": 2.9381026541112145e-06, | |
| "loss": 0.5934, | |
| "mean_token_accuracy": 0.8051214568316937, | |
| "num_tokens": 717498751.0, | |
| "step": 605 | |
| }, | |
| { | |
| "epoch": 0.8669390655533843, | |
| "grad_norm": 1.346134723264547, | |
| "learning_rate": 2.848103507898745e-06, | |
| "loss": 0.5995, | |
| "mean_token_accuracy": 0.8026661708950996, | |
| "num_tokens": 723421175.0, | |
| "step": 610 | |
| }, | |
| { | |
| "epoch": 0.8740451234677563, | |
| "grad_norm": 1.3133061113418305, | |
| "learning_rate": 2.7624286083845187e-06, | |
| "loss": 0.5905, | |
| "mean_token_accuracy": 0.8046399556100369, | |
| "num_tokens": 729353013.0, | |
| "step": 615 | |
| }, | |
| { | |
| "epoch": 0.8811511813821282, | |
| "grad_norm": 1.7379818179026807, | |
| "learning_rate": 2.6811234025883457e-06, | |
| "loss": 0.5876, | |
| "mean_token_accuracy": 0.8071587301790715, | |
| "num_tokens": 735289319.0, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 0.8882572392965002, | |
| "grad_norm": 4.0244229329833, | |
| "learning_rate": 2.604231019586761e-06, | |
| "loss": 0.5908, | |
| "mean_token_accuracy": 0.8060196414589882, | |
| "num_tokens": 741217107.0, | |
| "step": 625 | |
| }, | |
| { | |
| "epoch": 0.8953632972108723, | |
| "grad_norm": 1.7622879809387222, | |
| "learning_rate": 2.5317922476348194e-06, | |
| "loss": 0.5952, | |
| "mean_token_accuracy": 0.8054036945104599, | |
| "num_tokens": 747147403.0, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 0.9024693551252443, | |
| "grad_norm": 2.2585412448119007, | |
| "learning_rate": 2.4638455125296043e-06, | |
| "loss": 0.5925, | |
| "mean_token_accuracy": 0.8054537117481232, | |
| "num_tokens": 753044605.0, | |
| "step": 635 | |
| }, | |
| { | |
| "epoch": 0.9095754130396163, | |
| "grad_norm": 1.5262303547489815, | |
| "learning_rate": 2.400426857226914e-06, | |
| "loss": 0.5879, | |
| "mean_token_accuracy": 0.8063779830932617, | |
| "num_tokens": 758944457.0, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 0.9166814709539882, | |
| "grad_norm": 2.545493717270523, | |
| "learning_rate": 2.3415699227219517e-06, | |
| "loss": 0.5992, | |
| "mean_token_accuracy": 0.8029563590884209, | |
| "num_tokens": 764871608.0, | |
| "step": 645 | |
| }, | |
| { | |
| "epoch": 0.9237875288683602, | |
| "grad_norm": 1.7376685867634216, | |
| "learning_rate": 2.2873059302041627e-06, | |
| "loss": 0.5896, | |
| "mean_token_accuracy": 0.8063512444496155, | |
| "num_tokens": 770802003.0, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 0.9308935867827323, | |
| "grad_norm": 1.7377001386795896, | |
| "learning_rate": 2.2376636644956656e-06, | |
| "loss": 0.5874, | |
| "mean_token_accuracy": 0.8068042829632759, | |
| "num_tokens": 776734106.0, | |
| "step": 655 | |
| }, | |
| { | |
| "epoch": 0.9379996446971043, | |
| "grad_norm": 3.942139353532877, | |
| "learning_rate": 2.192669458782096e-06, | |
| "loss": 0.5952, | |
| "mean_token_accuracy": 0.8051416061818599, | |
| "num_tokens": 782648208.0, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 0.9451057026114763, | |
| "grad_norm": 2.1646964014669794, | |
| "learning_rate": 2.1523471806439205e-06, | |
| "loss": 0.6057, | |
| "mean_token_accuracy": 0.8031233668327331, | |
| "num_tokens": 788582279.0, | |
| "step": 665 | |
| }, | |
| { | |
| "epoch": 0.9522117605258483, | |
| "grad_norm": 2.4342297711554393, | |
| "learning_rate": 2.1167182193956738e-06, | |
| "loss": 0.5833, | |
| "mean_token_accuracy": 0.8084848992526531, | |
| "num_tokens": 794490250.0, | |
| "step": 670 | |
| }, | |
| { | |
| "epoch": 0.9593178184402202, | |
| "grad_norm": 1.3849504055994097, | |
| "learning_rate": 2.0858014747397952e-06, | |
| "loss": 0.5927, | |
| "mean_token_accuracy": 0.8059051290154458, | |
| "num_tokens": 800406765.0, | |
| "step": 675 | |
| }, | |
| { | |
| "epoch": 0.9664238763545923, | |
| "grad_norm": 1.8233541738404475, | |
| "learning_rate": 2.0596133467411213e-06, | |
| "loss": 0.5866, | |
| "mean_token_accuracy": 0.8072662524878979, | |
| "num_tokens": 806349672.0, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 0.9735299342689643, | |
| "grad_norm": 1.3071754875862744, | |
| "learning_rate": 2.0381677271273177e-06, | |
| "loss": 0.589, | |
| "mean_token_accuracy": 0.8057045452296734, | |
| "num_tokens": 812264222.0, | |
| "step": 685 | |
| }, | |
| { | |
| "epoch": 0.9806359921833363, | |
| "grad_norm": 1.5289098269363515, | |
| "learning_rate": 2.0214759919198904e-06, | |
| "loss": 0.5831, | |
| "mean_token_accuracy": 0.8068839557468891, | |
| "num_tokens": 818190520.0, | |
| "step": 690 | |
| }, | |
| { | |
| "epoch": 0.9877420500977083, | |
| "grad_norm": 2.015373930862521, | |
| "learning_rate": 2.0095469953996724e-06, | |
| "loss": 0.5994, | |
| "mean_token_accuracy": 0.8041414134204388, | |
| "num_tokens": 824098000.0, | |
| "step": 695 | |
| }, | |
| { | |
| "epoch": 0.9948481080120803, | |
| "grad_norm": 2.2460839306695104, | |
| "learning_rate": 2.002387065409989e-06, | |
| "loss": 0.5962, | |
| "mean_token_accuracy": 0.8041631534695626, | |
| "num_tokens": 830080703.0, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.9948481080120803, | |
| "eval_loss": 0.5880739688873291, | |
| "eval_mean_token_accuracy": 0.8062155152622023, | |
| "eval_num_tokens": 830080703.0, | |
| "eval_runtime": 296.6258, | |
| "eval_samples_per_second": 12.268, | |
| "eval_steps_per_second": 0.384, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "step": 704, | |
| "total_flos": 1.2125984924893184e+16, | |
| "train_loss": 0.0, | |
| "train_runtime": 1.6574, | |
| "train_samples_per_second": 217359.421, | |
| "train_steps_per_second": 424.761 | |
| } | |
| ], | |
| "logging_steps": 5, | |
| "max_steps": 704, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 1, | |
| "save_steps": 1, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 1.2125984924893184e+16, | |
| "train_batch_size": 4, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |