| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 1.0, | |
| "eval_steps": 100, | |
| "global_step": 704, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.007106057914372002, | |
| "grad_norm": 50.32903949504469, | |
| "learning_rate": 3.6363636363636366e-06, | |
| "loss": 4.3632, | |
| "mean_token_accuracy": 0.441570908203721, | |
| "num_tokens": 5473393.0, | |
| "step": 5 | |
| }, | |
| { | |
| "epoch": 0.014212115828744005, | |
| "grad_norm": 14.807993883918904, | |
| "learning_rate": 8.181818181818183e-06, | |
| "loss": 3.1394, | |
| "mean_token_accuracy": 0.49803002886474135, | |
| "num_tokens": 10986730.0, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.021318173743116006, | |
| "grad_norm": 5.124010397941281, | |
| "learning_rate": 1.2727272727272728e-05, | |
| "loss": 1.5984, | |
| "mean_token_accuracy": 0.6562768064439297, | |
| "num_tokens": 16504629.0, | |
| "step": 15 | |
| }, | |
| { | |
| "epoch": 0.02842423165748801, | |
| "grad_norm": 2.392040251927866, | |
| "learning_rate": 1.7272727272727274e-05, | |
| "loss": 1.0563, | |
| "mean_token_accuracy": 0.7480768047273159, | |
| "num_tokens": 22018554.0, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.03553028957186001, | |
| "grad_norm": 3.2881890911900955, | |
| "learning_rate": 1.999961805535155e-05, | |
| "loss": 0.8892, | |
| "mean_token_accuracy": 0.7724978730082512, | |
| "num_tokens": 27528237.0, | |
| "step": 25 | |
| }, | |
| { | |
| "epoch": 0.04263634748623201, | |
| "grad_norm": 3.296418319427866, | |
| "learning_rate": 1.9995321550350065e-05, | |
| "loss": 0.7968, | |
| "mean_token_accuracy": 0.7858201645314693, | |
| "num_tokens": 33059234.0, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.04974240540060401, | |
| "grad_norm": 2.650113760947062, | |
| "learning_rate": 1.998625339625423e-05, | |
| "loss": 0.7639, | |
| "mean_token_accuracy": 0.7874479472637177, | |
| "num_tokens": 38579238.0, | |
| "step": 35 | |
| }, | |
| { | |
| "epoch": 0.05684846331497602, | |
| "grad_norm": 0.8030808619999289, | |
| "learning_rate": 1.9972418403347817e-05, | |
| "loss": 0.7136, | |
| "mean_token_accuracy": 0.7952406644821167, | |
| "num_tokens": 44087596.0, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.06395452122934801, | |
| "grad_norm": 0.38079517614839553, | |
| "learning_rate": 1.9953823910527057e-05, | |
| "loss": 0.6781, | |
| "mean_token_accuracy": 0.8054998718202114, | |
| "num_tokens": 49589200.0, | |
| "step": 45 | |
| }, | |
| { | |
| "epoch": 0.07106057914372002, | |
| "grad_norm": 0.35060604680564306, | |
| "learning_rate": 1.993047978140764e-05, | |
| "loss": 0.6594, | |
| "mean_token_accuracy": 0.8087423123419285, | |
| "num_tokens": 55106291.0, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.07816663705809203, | |
| "grad_norm": 0.4403926191014202, | |
| "learning_rate": 1.9902398399092494e-05, | |
| "loss": 0.6293, | |
| "mean_token_accuracy": 0.8166272558271885, | |
| "num_tokens": 60615746.0, | |
| "step": 55 | |
| }, | |
| { | |
| "epoch": 0.08527269497246402, | |
| "grad_norm": 0.4059804779836136, | |
| "learning_rate": 1.9869594659603032e-05, | |
| "loss": 0.633, | |
| "mean_token_accuracy": 0.8155130945146084, | |
| "num_tokens": 66132359.0, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.09237875288683603, | |
| "grad_norm": 0.35058497488520535, | |
| "learning_rate": 1.9832085963977445e-05, | |
| "loss": 0.6263, | |
| "mean_token_accuracy": 0.8166398376226425, | |
| "num_tokens": 71655901.0, | |
| "step": 65 | |
| }, | |
| { | |
| "epoch": 0.09948481080120802, | |
| "grad_norm": 0.3514975941052687, | |
| "learning_rate": 1.978989220904016e-05, | |
| "loss": 0.6166, | |
| "mean_token_accuracy": 0.817786256223917, | |
| "num_tokens": 77177506.0, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.10659086871558003, | |
| "grad_norm": 0.3305576165553241, | |
| "learning_rate": 1.9743035776847377e-05, | |
| "loss": 0.6112, | |
| "mean_token_accuracy": 0.8196637347340584, | |
| "num_tokens": 82719853.0, | |
| "step": 75 | |
| }, | |
| { | |
| "epoch": 0.11369692662995204, | |
| "grad_norm": 0.338564559273819, | |
| "learning_rate": 1.9691541522814327e-05, | |
| "loss": 0.5925, | |
| "mean_token_accuracy": 0.823684225231409, | |
| "num_tokens": 88237466.0, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.12080298454432403, | |
| "grad_norm": 0.3094787313199236, | |
| "learning_rate": 1.963543676253048e-05, | |
| "loss": 0.6006, | |
| "mean_token_accuracy": 0.8217748202383518, | |
| "num_tokens": 93758651.0, | |
| "step": 85 | |
| }, | |
| { | |
| "epoch": 0.12790904245869603, | |
| "grad_norm": 0.3244755207573469, | |
| "learning_rate": 1.9574751257269748e-05, | |
| "loss": 0.5922, | |
| "mean_token_accuracy": 0.8233424670994282, | |
| "num_tokens": 99280369.0, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.13501510037306805, | |
| "grad_norm": 0.37452298605350337, | |
| "learning_rate": 1.950951719820335e-05, | |
| "loss": 0.586, | |
| "mean_token_accuracy": 0.825210265815258, | |
| "num_tokens": 104773871.0, | |
| "step": 95 | |
| }, | |
| { | |
| "epoch": 0.14212115828744004, | |
| "grad_norm": 0.3637562399439702, | |
| "learning_rate": 1.9439769189323727e-05, | |
| "loss": 0.5942, | |
| "mean_token_accuracy": 0.8233202829957008, | |
| "num_tokens": 110286415.0, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.14212115828744004, | |
| "eval_loss": 0.5650674104690552, | |
| "eval_mean_token_accuracy": 0.8260843633559712, | |
| "eval_num_tokens": 110286415.0, | |
| "eval_runtime": 149.0036, | |
| "eval_samples_per_second": 24.422, | |
| "eval_steps_per_second": 0.765, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.14922721620181204, | |
| "grad_norm": 0.34962576562336867, | |
| "learning_rate": 1.9365544229088517e-05, | |
| "loss": 0.5897, | |
| "mean_token_accuracy": 0.8245358660817146, | |
| "num_tokens": 115819384.0, | |
| "step": 105 | |
| }, | |
| { | |
| "epoch": 0.15633327411618406, | |
| "grad_norm": 0.32080702182710497, | |
| "learning_rate": 1.9286881690794425e-05, | |
| "loss": 0.5795, | |
| "mean_token_accuracy": 0.827671080827713, | |
| "num_tokens": 121352740.0, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.16343933203055605, | |
| "grad_norm": 0.3302528447867004, | |
| "learning_rate": 1.9203823301691272e-05, | |
| "loss": 0.5898, | |
| "mean_token_accuracy": 0.8234031349420547, | |
| "num_tokens": 126898367.0, | |
| "step": 115 | |
| }, | |
| { | |
| "epoch": 0.17054538994492804, | |
| "grad_norm": 0.344422613860792, | |
| "learning_rate": 1.9116413120847425e-05, | |
| "loss": 0.5803, | |
| "mean_token_accuracy": 0.8264414891600609, | |
| "num_tokens": 132422935.0, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.17765144785930007, | |
| "grad_norm": 0.31498034575502054, | |
| "learning_rate": 1.902469751577826e-05, | |
| "loss": 0.5736, | |
| "mean_token_accuracy": 0.8282143533229828, | |
| "num_tokens": 137934164.0, | |
| "step": 125 | |
| }, | |
| { | |
| "epoch": 0.18475750577367206, | |
| "grad_norm": 0.29607953526948433, | |
| "learning_rate": 1.892872513785008e-05, | |
| "loss": 0.5625, | |
| "mean_token_accuracy": 0.8306705243885517, | |
| "num_tokens": 143442236.0, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.19186356368804405, | |
| "grad_norm": 0.33026938357250507, | |
| "learning_rate": 1.88285468964726e-05, | |
| "loss": 0.5674, | |
| "mean_token_accuracy": 0.8293713837862015, | |
| "num_tokens": 148967668.0, | |
| "step": 135 | |
| }, | |
| { | |
| "epoch": 0.19896962160241605, | |
| "grad_norm": 0.3098906923396821, | |
| "learning_rate": 1.872421593209355e-05, | |
| "loss": 0.5625, | |
| "mean_token_accuracy": 0.8305731259286404, | |
| "num_tokens": 154497475.0, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.20607567951678807, | |
| "grad_norm": 0.39805146321076146, | |
| "learning_rate": 1.861578758800989e-05, | |
| "loss": 0.569, | |
| "mean_token_accuracy": 0.8292202673852443, | |
| "num_tokens": 160003170.0, | |
| "step": 145 | |
| }, | |
| { | |
| "epoch": 0.21318173743116006, | |
| "grad_norm": 0.314641318728057, | |
| "learning_rate": 1.8503319381010414e-05, | |
| "loss": 0.5632, | |
| "mean_token_accuracy": 0.8299683950841427, | |
| "num_tokens": 165528828.0, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.22028779534553206, | |
| "grad_norm": 0.31172255219458456, | |
| "learning_rate": 1.8386870970865488e-05, | |
| "loss": 0.5561, | |
| "mean_token_accuracy": 0.8317106999456882, | |
| "num_tokens": 171050241.0, | |
| "step": 155 | |
| }, | |
| { | |
| "epoch": 0.22739385325990408, | |
| "grad_norm": 0.3316716520690995, | |
| "learning_rate": 1.8266504128679988e-05, | |
| "loss": 0.5572, | |
| "mean_token_accuracy": 0.8323395892977714, | |
| "num_tokens": 176567106.0, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.23449991117427607, | |
| "grad_norm": 0.3018790729986631, | |
| "learning_rate": 1.814228270412624e-05, | |
| "loss": 0.5717, | |
| "mean_token_accuracy": 0.8280466146767139, | |
| "num_tokens": 182090185.0, | |
| "step": 165 | |
| }, | |
| { | |
| "epoch": 0.24160596908864806, | |
| "grad_norm": 0.3340485136656981, | |
| "learning_rate": 1.8014272591574405e-05, | |
| "loss": 0.5666, | |
| "mean_token_accuracy": 0.8296592086553574, | |
| "num_tokens": 187606737.0, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.2487120270030201, | |
| "grad_norm": 0.31420140799198965, | |
| "learning_rate": 1.7882541695138224e-05, | |
| "loss": 0.5521, | |
| "mean_token_accuracy": 0.8335933439433575, | |
| "num_tokens": 193124335.0, | |
| "step": 175 | |
| }, | |
| { | |
| "epoch": 0.25581808491739205, | |
| "grad_norm": 0.3354458879208076, | |
| "learning_rate": 1.7747159892654646e-05, | |
| "loss": 0.5509, | |
| "mean_token_accuracy": 0.8328722730278969, | |
| "num_tokens": 198639349.0, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.2629241428317641, | |
| "grad_norm": 0.3022133339191553, | |
| "learning_rate": 1.7608198998616533e-05, | |
| "loss": 0.5573, | |
| "mean_token_accuracy": 0.8310446247458458, | |
| "num_tokens": 204194484.0, | |
| "step": 185 | |
| }, | |
| { | |
| "epoch": 0.2700302007461361, | |
| "grad_norm": 0.3224922846056182, | |
| "learning_rate": 1.7465732726077993e-05, | |
| "loss": 0.5535, | |
| "mean_token_accuracy": 0.8318519063293934, | |
| "num_tokens": 209683141.0, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.27713625866050806, | |
| "grad_norm": 0.29765302794192444, | |
| "learning_rate": 1.731983664755264e-05, | |
| "loss": 0.5569, | |
| "mean_token_accuracy": 0.8318051770329475, | |
| "num_tokens": 215203256.0, | |
| "step": 195 | |
| }, | |
| { | |
| "epoch": 0.2842423165748801, | |
| "grad_norm": 0.3410703478926894, | |
| "learning_rate": 1.717058815492548e-05, | |
| "loss": 0.5569, | |
| "mean_token_accuracy": 0.8310887739062309, | |
| "num_tokens": 220715591.0, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.2842423165748801, | |
| "eval_loss": 0.5341136455535889, | |
| "eval_mean_token_accuracy": 0.833399682714228, | |
| "eval_num_tokens": 220715591.0, | |
| "eval_runtime": 149.5883, | |
| "eval_samples_per_second": 24.327, | |
| "eval_steps_per_second": 0.762, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.2913483744892521, | |
| "grad_norm": 0.2886034985645996, | |
| "learning_rate": 1.701806641839967e-05, | |
| "loss": 0.5567, | |
| "mean_token_accuracy": 0.8324723578989506, | |
| "num_tokens": 226242581.0, | |
| "step": 205 | |
| }, | |
| { | |
| "epoch": 0.29845443240362407, | |
| "grad_norm": 0.28843151859178723, | |
| "learning_rate": 1.6862352344500004e-05, | |
| "loss": 0.5558, | |
| "mean_token_accuracy": 0.8317767918109894, | |
| "num_tokens": 231752698.0, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.3055604903179961, | |
| "grad_norm": 0.2942105037955124, | |
| "learning_rate": 1.6703528533155283e-05, | |
| "loss": 0.5512, | |
| "mean_token_accuracy": 0.8333536356687545, | |
| "num_tokens": 237265750.0, | |
| "step": 215 | |
| }, | |
| { | |
| "epoch": 0.3126665482323681, | |
| "grad_norm": 0.29970782334352336, | |
| "learning_rate": 1.6541679233882477e-05, | |
| "loss": 0.5467, | |
| "mean_token_accuracy": 0.8344343066215515, | |
| "num_tokens": 242787815.0, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.3197726061467401, | |
| "grad_norm": 0.29759819742183974, | |
| "learning_rate": 1.63768903010958e-05, | |
| "loss": 0.55, | |
| "mean_token_accuracy": 0.8330938413739204, | |
| "num_tokens": 248325122.0, | |
| "step": 225 | |
| }, | |
| { | |
| "epoch": 0.3268786640611121, | |
| "grad_norm": 0.2920108536684172, | |
| "learning_rate": 1.6209249148564437e-05, | |
| "loss": 0.5453, | |
| "mean_token_accuracy": 0.8345815449953079, | |
| "num_tokens": 253826880.0, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.3339847219754841, | |
| "grad_norm": 0.29667699234937334, | |
| "learning_rate": 1.603884470304318e-05, | |
| "loss": 0.5578, | |
| "mean_token_accuracy": 0.8316668353974819, | |
| "num_tokens": 259356528.0, | |
| "step": 235 | |
| }, | |
| { | |
| "epoch": 0.3410907798898561, | |
| "grad_norm": 0.2867819840469066, | |
| "learning_rate": 1.5865767357100383e-05, | |
| "loss": 0.5394, | |
| "mean_token_accuracy": 0.8358893245458603, | |
| "num_tokens": 264887477.0, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.3481968378042281, | |
| "grad_norm": 0.332209143957244, | |
| "learning_rate": 1.5690108921168428e-05, | |
| "loss": 0.5456, | |
| "mean_token_accuracy": 0.8347376808524132, | |
| "num_tokens": 270408845.0, | |
| "step": 245 | |
| }, | |
| { | |
| "epoch": 0.35530289571860013, | |
| "grad_norm": 0.330011762312303, | |
| "learning_rate": 1.5511962574842073e-05, | |
| "loss": 0.5446, | |
| "mean_token_accuracy": 0.8345297470688819, | |
| "num_tokens": 275923409.0, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.3624089536329721, | |
| "grad_norm": 0.31031497830420174, | |
| "learning_rate": 1.5331422817450485e-05, | |
| "loss": 0.5478, | |
| "mean_token_accuracy": 0.8336269296705723, | |
| "num_tokens": 281456923.0, | |
| "step": 255 | |
| }, | |
| { | |
| "epoch": 0.3695150115473441, | |
| "grad_norm": 0.2918876682786512, | |
| "learning_rate": 1.5148585417929212e-05, | |
| "loss": 0.5438, | |
| "mean_token_accuracy": 0.8351672604680062, | |
| "num_tokens": 286973486.0, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.37662106946171614, | |
| "grad_norm": 0.3365818739495239, | |
| "learning_rate": 1.4963547364018711e-05, | |
| "loss": 0.541, | |
| "mean_token_accuracy": 0.8354949586093425, | |
| "num_tokens": 292479427.0, | |
| "step": 265 | |
| }, | |
| { | |
| "epoch": 0.3837271273760881, | |
| "grad_norm": 0.293893859709652, | |
| "learning_rate": 1.477640681081632e-05, | |
| "loss": 0.5436, | |
| "mean_token_accuracy": 0.8349400483071804, | |
| "num_tokens": 298006653.0, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.39083318529046013, | |
| "grad_norm": 0.2944189536488024, | |
| "learning_rate": 1.4587263028709013e-05, | |
| "loss": 0.5401, | |
| "mean_token_accuracy": 0.8359036639332771, | |
| "num_tokens": 303515961.0, | |
| "step": 275 | |
| }, | |
| { | |
| "epoch": 0.3979392432048321, | |
| "grad_norm": 0.3027989379324982, | |
| "learning_rate": 1.4396216350714512e-05, | |
| "loss": 0.5421, | |
| "mean_token_accuracy": 0.8354827515780926, | |
| "num_tokens": 309030348.0, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.4050453011192041, | |
| "grad_norm": 0.2978819457639701, | |
| "learning_rate": 1.4203368119258759e-05, | |
| "loss": 0.538, | |
| "mean_token_accuracy": 0.8356474481523037, | |
| "num_tokens": 314557830.0, | |
| "step": 285 | |
| }, | |
| { | |
| "epoch": 0.41215135903357614, | |
| "grad_norm": 0.3149386649352245, | |
| "learning_rate": 1.4008820632417906e-05, | |
| "loss": 0.5339, | |
| "mean_token_accuracy": 0.8371641159057617, | |
| "num_tokens": 320080082.0, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 0.4192574169479481, | |
| "grad_norm": 0.3059908505846885, | |
| "learning_rate": 1.381267708965339e-05, | |
| "loss": 0.5379, | |
| "mean_token_accuracy": 0.8365371204912663, | |
| "num_tokens": 325602548.0, | |
| "step": 295 | |
| }, | |
| { | |
| "epoch": 0.4263634748623201, | |
| "grad_norm": 0.3021029670481725, | |
| "learning_rate": 1.3615041537068831e-05, | |
| "loss": 0.5462, | |
| "mean_token_accuracy": 0.8336855717003345, | |
| "num_tokens": 331139258.0, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.4263634748623201, | |
| "eval_loss": 0.5192646980285645, | |
| "eval_mean_token_accuracy": 0.8369944780542139, | |
| "eval_num_tokens": 331139258.0, | |
| "eval_runtime": 150.4327, | |
| "eval_samples_per_second": 24.19, | |
| "eval_steps_per_second": 0.758, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.43346953277669215, | |
| "grad_norm": 0.3062091073612256, | |
| "learning_rate": 1.3416018812217866e-05, | |
| "loss": 0.5441, | |
| "mean_token_accuracy": 0.8341693080961704, | |
| "num_tokens": 336661954.0, | |
| "step": 305 | |
| }, | |
| { | |
| "epoch": 0.4405755906910641, | |
| "grad_norm": 0.2877765065298344, | |
| "learning_rate": 1.3215714488492121e-05, | |
| "loss": 0.5288, | |
| "mean_token_accuracy": 0.838797652721405, | |
| "num_tokens": 342190308.0, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 0.44768164860543613, | |
| "grad_norm": 0.2986822105588957, | |
| "learning_rate": 1.3014234819118846e-05, | |
| "loss": 0.5269, | |
| "mean_token_accuracy": 0.8390726670622826, | |
| "num_tokens": 347716991.0, | |
| "step": 315 | |
| }, | |
| { | |
| "epoch": 0.45478770651980815, | |
| "grad_norm": 0.28753831464660323, | |
| "learning_rate": 1.2811686680797942e-05, | |
| "loss": 0.54, | |
| "mean_token_accuracy": 0.8348217740654945, | |
| "num_tokens": 353240462.0, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 0.4618937644341801, | |
| "grad_norm": 0.3189321553532571, | |
| "learning_rate": 1.2608177517008268e-05, | |
| "loss": 0.5316, | |
| "mean_token_accuracy": 0.8373772338032722, | |
| "num_tokens": 358757193.0, | |
| "step": 325 | |
| }, | |
| { | |
| "epoch": 0.46899982234855214, | |
| "grad_norm": 0.2740676502206635, | |
| "learning_rate": 1.240381528101327e-05, | |
| "loss": 0.5245, | |
| "mean_token_accuracy": 0.8392882093787193, | |
| "num_tokens": 364274287.0, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 0.47610588026292416, | |
| "grad_norm": 0.29826705795684294, | |
| "learning_rate": 1.2198708378596198e-05, | |
| "loss": 0.5201, | |
| "mean_token_accuracy": 0.8405162297189236, | |
| "num_tokens": 369781348.0, | |
| "step": 335 | |
| }, | |
| { | |
| "epoch": 0.48321193817729613, | |
| "grad_norm": 0.2810861157555765, | |
| "learning_rate": 1.19929656105553e-05, | |
| "loss": 0.5252, | |
| "mean_token_accuracy": 0.838694840669632, | |
| "num_tokens": 375291603.0, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 0.49031799609166815, | |
| "grad_norm": 0.28476568743444564, | |
| "learning_rate": 1.1786696114989455e-05, | |
| "loss": 0.5264, | |
| "mean_token_accuracy": 0.839257051050663, | |
| "num_tokens": 380805085.0, | |
| "step": 345 | |
| }, | |
| { | |
| "epoch": 0.4974240540060402, | |
| "grad_norm": 0.2977392563082, | |
| "learning_rate": 1.1580009309404887e-05, | |
| "loss": 0.5276, | |
| "mean_token_accuracy": 0.8389153242111206, | |
| "num_tokens": 386334037.0, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.5045301119204122, | |
| "grad_norm": 0.3033446749891465, | |
| "learning_rate": 1.1373014832673661e-05, | |
| "loss": 0.5298, | |
| "mean_token_accuracy": 0.8390403784811497, | |
| "num_tokens": 391841580.0, | |
| "step": 355 | |
| }, | |
| { | |
| "epoch": 0.5116361698347841, | |
| "grad_norm": 0.3969220524710466, | |
| "learning_rate": 1.1165822486874773e-05, | |
| "loss": 0.5229, | |
| "mean_token_accuracy": 0.8393726870417595, | |
| "num_tokens": 397371651.0, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 0.5187422277491561, | |
| "grad_norm": 0.30559826647342453, | |
| "learning_rate": 1.0958542179048637e-05, | |
| "loss": 0.5244, | |
| "mean_token_accuracy": 0.8402129337191582, | |
| "num_tokens": 402867415.0, | |
| "step": 365 | |
| }, | |
| { | |
| "epoch": 0.5258482856635281, | |
| "grad_norm": 0.3494017728215535, | |
| "learning_rate": 1.0751283862895914e-05, | |
| "loss": 0.5361, | |
| "mean_token_accuracy": 0.8366998687386513, | |
| "num_tokens": 408390957.0, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 0.5329543435779002, | |
| "grad_norm": 0.353265195510961, | |
| "learning_rate": 1.0544157480451586e-05, | |
| "loss": 0.534, | |
| "mean_token_accuracy": 0.8368604250252247, | |
| "num_tokens": 413913149.0, | |
| "step": 375 | |
| }, | |
| { | |
| "epoch": 0.5400604014922722, | |
| "grad_norm": 0.2611354161191268, | |
| "learning_rate": 1.033727290376522e-05, | |
| "loss": 0.5361, | |
| "mean_token_accuracy": 0.836609935760498, | |
| "num_tokens": 419431562.0, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 0.5471664594066442, | |
| "grad_norm": 0.2872728729171679, | |
| "learning_rate": 1.013073987661834e-05, | |
| "loss": 0.5338, | |
| "mean_token_accuracy": 0.8370331548154354, | |
| "num_tokens": 424955146.0, | |
| "step": 385 | |
| }, | |
| { | |
| "epoch": 0.5542725173210161, | |
| "grad_norm": 0.28934761011723314, | |
| "learning_rate": 9.924667956309862e-06, | |
| "loss": 0.5251, | |
| "mean_token_accuracy": 0.8398349188268185, | |
| "num_tokens": 430476718.0, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 0.5613785752353881, | |
| "grad_norm": 0.28067537969678, | |
| "learning_rate": 9.719166455540437e-06, | |
| "loss": 0.5304, | |
| "mean_token_accuracy": 0.8381435446441173, | |
| "num_tokens": 435994507.0, | |
| "step": 395 | |
| }, | |
| { | |
| "epoch": 0.5684846331497602, | |
| "grad_norm": 0.3568751332581294, | |
| "learning_rate": 9.51434438442655e-06, | |
| "loss": 0.5293, | |
| "mean_token_accuracy": 0.8387467741966248, | |
| "num_tokens": 441515444.0, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.5684846331497602, | |
| "eval_loss": 0.5093328356742859, | |
| "eval_mean_token_accuracy": 0.8396573615701575, | |
| "eval_num_tokens": 441515444.0, | |
| "eval_runtime": 150.3037, | |
| "eval_samples_per_second": 24.211, | |
| "eval_steps_per_second": 0.758, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.5755906910641322, | |
| "grad_norm": 0.2802098035879255, | |
| "learning_rate": 9.310310392675132e-06, | |
| "loss": 0.5167, | |
| "mean_token_accuracy": 0.8414627239108086, | |
| "num_tokens": 447005744.0, | |
| "step": 405 | |
| }, | |
| { | |
| "epoch": 0.5826967489785042, | |
| "grad_norm": 0.2668144948670277, | |
| "learning_rate": 9.107172711949324e-06, | |
| "loss": 0.5323, | |
| "mean_token_accuracy": 0.836710449308157, | |
| "num_tokens": 452533510.0, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 0.5898028068928762, | |
| "grad_norm": 0.282320393535823, | |
| "learning_rate": 8.905039098456049e-06, | |
| "loss": 0.5237, | |
| "mean_token_accuracy": 0.8391933210194111, | |
| "num_tokens": 458057489.0, | |
| "step": 415 | |
| }, | |
| { | |
| "epoch": 0.5969088648072481, | |
| "grad_norm": 0.2623104974680133, | |
| "learning_rate": 8.704016775785742e-06, | |
| "loss": 0.5282, | |
| "mean_token_accuracy": 0.8383334554731846, | |
| "num_tokens": 463589349.0, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 0.6040149227216202, | |
| "grad_norm": 0.28075733994367397, | |
| "learning_rate": 8.50421237803464e-06, | |
| "loss": 0.5226, | |
| "mean_token_accuracy": 0.8393978834152221, | |
| "num_tokens": 469104113.0, | |
| "step": 425 | |
| }, | |
| { | |
| "epoch": 0.6111209806359922, | |
| "grad_norm": 0.28552247132295744, | |
| "learning_rate": 8.30573189323978e-06, | |
| "loss": 0.5161, | |
| "mean_token_accuracy": 0.8426251098513603, | |
| "num_tokens": 474604196.0, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 0.6182270385503642, | |
| "grad_norm": 0.2792007208746605, | |
| "learning_rate": 8.108680607156669e-06, | |
| "loss": 0.5307, | |
| "mean_token_accuracy": 0.8380370497703552, | |
| "num_tokens": 480124231.0, | |
| "step": 435 | |
| }, | |
| { | |
| "epoch": 0.6253330964647362, | |
| "grad_norm": 0.2876628277081085, | |
| "learning_rate": 7.913163047409533e-06, | |
| "loss": 0.5235, | |
| "mean_token_accuracy": 0.839199036359787, | |
| "num_tokens": 485642165.0, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 0.6324391543791081, | |
| "grad_norm": 0.27864846137064453, | |
| "learning_rate": 7.719282928043688e-06, | |
| "loss": 0.5248, | |
| "mean_token_accuracy": 0.8390684366226197, | |
| "num_tokens": 491149290.0, | |
| "step": 445 | |
| }, | |
| { | |
| "epoch": 0.6395452122934802, | |
| "grad_norm": 0.28373247762189147, | |
| "learning_rate": 7.527143094509492e-06, | |
| "loss": 0.5234, | |
| "mean_token_accuracy": 0.8402359418570995, | |
| "num_tokens": 496664600.0, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.6466512702078522, | |
| "grad_norm": 0.26738376755844384, | |
| "learning_rate": 7.336845469107061e-06, | |
| "loss": 0.5229, | |
| "mean_token_accuracy": 0.839232936501503, | |
| "num_tokens": 502162941.0, | |
| "step": 455 | |
| }, | |
| { | |
| "epoch": 0.6537573281222242, | |
| "grad_norm": 0.3006774536256795, | |
| "learning_rate": 7.148490996920661e-06, | |
| "loss": 0.5253, | |
| "mean_token_accuracy": 0.8390106722712517, | |
| "num_tokens": 507685810.0, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 0.6608633860365962, | |
| "grad_norm": 0.26836022137138205, | |
| "learning_rate": 6.9621795922714805e-06, | |
| "loss": 0.5218, | |
| "mean_token_accuracy": 0.8404779210686684, | |
| "num_tokens": 513196397.0, | |
| "step": 465 | |
| }, | |
| { | |
| "epoch": 0.6679694439509682, | |
| "grad_norm": 0.26829584881370205, | |
| "learning_rate": 6.778010085717202e-06, | |
| "loss": 0.5209, | |
| "mean_token_accuracy": 0.8410870231688022, | |
| "num_tokens": 518716947.0, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 0.6750755018653402, | |
| "grad_norm": 0.2707573848559289, | |
| "learning_rate": 6.596080171626409e-06, | |
| "loss": 0.5239, | |
| "mean_token_accuracy": 0.8392590440809726, | |
| "num_tokens": 524218898.0, | |
| "step": 475 | |
| }, | |
| { | |
| "epoch": 0.6821815597797122, | |
| "grad_norm": 0.2739664325900379, | |
| "learning_rate": 6.416486356355769e-06, | |
| "loss": 0.5306, | |
| "mean_token_accuracy": 0.8375619657337665, | |
| "num_tokens": 529729639.0, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 0.6892876176940842, | |
| "grad_norm": 0.2868769682396871, | |
| "learning_rate": 6.239323907057342e-06, | |
| "loss": 0.5276, | |
| "mean_token_accuracy": 0.8388026498258114, | |
| "num_tokens": 535240450.0, | |
| "step": 485 | |
| }, | |
| { | |
| "epoch": 0.6963936756084562, | |
| "grad_norm": 0.27945127177338197, | |
| "learning_rate": 6.064686801143271e-06, | |
| "loss": 0.5096, | |
| "mean_token_accuracy": 0.8433919370174408, | |
| "num_tokens": 540730386.0, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 0.7034997335228282, | |
| "grad_norm": 0.2797688763148102, | |
| "learning_rate": 5.892667676434633e-06, | |
| "loss": 0.5176, | |
| "mean_token_accuracy": 0.8411184750497341, | |
| "num_tokens": 546264785.0, | |
| "step": 495 | |
| }, | |
| { | |
| "epoch": 0.7106057914372003, | |
| "grad_norm": 0.29234119222810084, | |
| "learning_rate": 5.723357782020867e-06, | |
| "loss": 0.5154, | |
| "mean_token_accuracy": 0.8415673337876797, | |
| "num_tokens": 551771408.0, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.7106057914372003, | |
| "eval_loss": 0.5027303099632263, | |
| "eval_mean_token_accuracy": 0.8409134248892466, | |
| "eval_num_tokens": 551771408.0, | |
| "eval_runtime": 150.6202, | |
| "eval_samples_per_second": 24.16, | |
| "eval_steps_per_second": 0.757, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.7177118493515722, | |
| "grad_norm": 0.28402079440718836, | |
| "learning_rate": 5.556846929855857e-06, | |
| "loss": 0.5133, | |
| "mean_token_accuracy": 0.8421028688549995, | |
| "num_tokens": 557283870.0, | |
| "step": 505 | |
| }, | |
| { | |
| "epoch": 0.7248179072659442, | |
| "grad_norm": 0.2831441468283664, | |
| "learning_rate": 5.393223447116409e-06, | |
| "loss": 0.5278, | |
| "mean_token_accuracy": 0.8389511182904243, | |
| "num_tokens": 562803110.0, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 0.7319239651803162, | |
| "grad_norm": 0.37193795016606795, | |
| "learning_rate": 5.232574129348278e-06, | |
| "loss": 0.5168, | |
| "mean_token_accuracy": 0.8417807504534721, | |
| "num_tokens": 568320103.0, | |
| "step": 515 | |
| }, | |
| { | |
| "epoch": 0.7390300230946882, | |
| "grad_norm": 0.27764965119819857, | |
| "learning_rate": 5.0749841944247e-06, | |
| "loss": 0.5274, | |
| "mean_token_accuracy": 0.8377346590161323, | |
| "num_tokens": 573851289.0, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 0.7461360810090603, | |
| "grad_norm": 0.46925453686500695, | |
| "learning_rate": 4.92053723734182e-06, | |
| "loss": 0.525, | |
| "mean_token_accuracy": 0.8391022063791752, | |
| "num_tokens": 579354449.0, | |
| "step": 525 | |
| }, | |
| { | |
| "epoch": 0.7532421389234323, | |
| "grad_norm": 0.28343319532373823, | |
| "learning_rate": 4.769315185874951e-06, | |
| "loss": 0.5215, | |
| "mean_token_accuracy": 0.840414184331894, | |
| "num_tokens": 584875200.0, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 0.7603481968378042, | |
| "grad_norm": 0.32287063748249534, | |
| "learning_rate": 4.621398257119266e-06, | |
| "loss": 0.5198, | |
| "mean_token_accuracy": 0.840663468837738, | |
| "num_tokens": 590401576.0, | |
| "step": 535 | |
| }, | |
| { | |
| "epoch": 0.7674542547521762, | |
| "grad_norm": 0.3396523052676484, | |
| "learning_rate": 4.476864914937923e-06, | |
| "loss": 0.5132, | |
| "mean_token_accuracy": 0.8424190938472748, | |
| "num_tokens": 595916751.0, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 0.7745603126665482, | |
| "grad_norm": 0.275059324923501, | |
| "learning_rate": 4.335791828340183e-06, | |
| "loss": 0.5229, | |
| "mean_token_accuracy": 0.8403938293457032, | |
| "num_tokens": 601460941.0, | |
| "step": 545 | |
| }, | |
| { | |
| "epoch": 0.7816663705809203, | |
| "grad_norm": 0.26807372924267187, | |
| "learning_rate": 4.1982538308116775e-06, | |
| "loss": 0.5178, | |
| "mean_token_accuracy": 0.8396266974508763, | |
| "num_tokens": 606975325.0, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 0.7887724284952923, | |
| "grad_norm": 0.3385442872306829, | |
| "learning_rate": 4.064323880618279e-06, | |
| "loss": 0.5207, | |
| "mean_token_accuracy": 0.8411053366959095, | |
| "num_tokens": 612497721.0, | |
| "step": 555 | |
| }, | |
| { | |
| "epoch": 0.7958784864096642, | |
| "grad_norm": 0.28005130030507386, | |
| "learning_rate": 3.934073022104759e-06, | |
| "loss": 0.517, | |
| "mean_token_accuracy": 0.8412538655102253, | |
| "num_tokens": 618029589.0, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 0.8029845443240362, | |
| "grad_norm": 0.29164566017921, | |
| "learning_rate": 3.807570348008672e-06, | |
| "loss": 0.5173, | |
| "mean_token_accuracy": 0.8412310920655728, | |
| "num_tokens": 623561843.0, | |
| "step": 565 | |
| }, | |
| { | |
| "epoch": 0.8100906022384082, | |
| "grad_norm": 0.27485579792509013, | |
| "learning_rate": 3.684882962809484e-06, | |
| "loss": 0.5247, | |
| "mean_token_accuracy": 0.839312057942152, | |
| "num_tokens": 629091377.0, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 0.8171966601527803, | |
| "grad_norm": 0.27816784714201154, | |
| "learning_rate": 3.5660759471324037e-06, | |
| "loss": 0.5226, | |
| "mean_token_accuracy": 0.8401588529348374, | |
| "num_tokens": 634600764.0, | |
| "step": 575 | |
| }, | |
| { | |
| "epoch": 0.8243027180671523, | |
| "grad_norm": 0.40137187990535683, | |
| "learning_rate": 3.451212323225786e-06, | |
| "loss": 0.5136, | |
| "mean_token_accuracy": 0.8420207679271698, | |
| "num_tokens": 640105985.0, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 0.8314087759815243, | |
| "grad_norm": 0.277470349562551, | |
| "learning_rate": 3.340353021530409e-06, | |
| "loss": 0.5147, | |
| "mean_token_accuracy": 0.8408644467592239, | |
| "num_tokens": 645630496.0, | |
| "step": 585 | |
| }, | |
| { | |
| "epoch": 0.8385148338958962, | |
| "grad_norm": 0.2915696671499121, | |
| "learning_rate": 3.2335568483583708e-06, | |
| "loss": 0.5102, | |
| "mean_token_accuracy": 0.8447316095232964, | |
| "num_tokens": 651136302.0, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 0.8456208918102682, | |
| "grad_norm": 0.31523011648160176, | |
| "learning_rate": 3.1308804546987615e-06, | |
| "loss": 0.5241, | |
| "mean_token_accuracy": 0.8398886010050773, | |
| "num_tokens": 656667592.0, | |
| "step": 595 | |
| }, | |
| { | |
| "epoch": 0.8527269497246402, | |
| "grad_norm": 0.2715441398857633, | |
| "learning_rate": 3.0323783061666307e-06, | |
| "loss": 0.5154, | |
| "mean_token_accuracy": 0.8416090242564678, | |
| "num_tokens": 662182702.0, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.8527269497246402, | |
| "eval_loss": 0.49905067682266235, | |
| "eval_mean_token_accuracy": 0.8420795897642771, | |
| "eval_num_tokens": 662182702.0, | |
| "eval_runtime": 150.127, | |
| "eval_samples_per_second": 24.239, | |
| "eval_steps_per_second": 0.759, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.8598330076390123, | |
| "grad_norm": 0.4459191186579175, | |
| "learning_rate": 2.9381026541112145e-06, | |
| "loss": 0.5176, | |
| "mean_token_accuracy": 0.8410927847027778, | |
| "num_tokens": 667713320.0, | |
| "step": 605 | |
| }, | |
| { | |
| "epoch": 0.8669390655533843, | |
| "grad_norm": 0.2700680983325809, | |
| "learning_rate": 2.848103507898745e-06, | |
| "loss": 0.5204, | |
| "mean_token_accuracy": 0.8398772545158864, | |
| "num_tokens": 673241578.0, | |
| "step": 610 | |
| }, | |
| { | |
| "epoch": 0.8740451234677563, | |
| "grad_norm": 0.3027315294815319, | |
| "learning_rate": 2.7624286083845187e-06, | |
| "loss": 0.5152, | |
| "mean_token_accuracy": 0.8407413326203823, | |
| "num_tokens": 678761901.0, | |
| "step": 615 | |
| }, | |
| { | |
| "epoch": 0.8811511813821282, | |
| "grad_norm": 0.35742788068442477, | |
| "learning_rate": 2.6811234025883457e-06, | |
| "loss": 0.5104, | |
| "mean_token_accuracy": 0.8433315142989158, | |
| "num_tokens": 684294891.0, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 0.8882572392965002, | |
| "grad_norm": 0.29102667879601235, | |
| "learning_rate": 2.604231019586761e-06, | |
| "loss": 0.5141, | |
| "mean_token_accuracy": 0.8427356474101544, | |
| "num_tokens": 689811922.0, | |
| "step": 625 | |
| }, | |
| { | |
| "epoch": 0.8953632972108723, | |
| "grad_norm": 0.27752275928788533, | |
| "learning_rate": 2.5317922476348194e-06, | |
| "loss": 0.5165, | |
| "mean_token_accuracy": 0.8411040998995304, | |
| "num_tokens": 695336104.0, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 0.9024693551252443, | |
| "grad_norm": 0.3055065438673597, | |
| "learning_rate": 2.4638455125296043e-06, | |
| "loss": 0.5184, | |
| "mean_token_accuracy": 0.8411155760288238, | |
| "num_tokens": 700859085.0, | |
| "step": 635 | |
| }, | |
| { | |
| "epoch": 0.9095754130396163, | |
| "grad_norm": 0.2832455318439677, | |
| "learning_rate": 2.400426857226914e-06, | |
| "loss": 0.5116, | |
| "mean_token_accuracy": 0.8422174222767354, | |
| "num_tokens": 706390161.0, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 0.9166814709539882, | |
| "grad_norm": 0.2887928206874307, | |
| "learning_rate": 2.3415699227219517e-06, | |
| "loss": 0.5234, | |
| "mean_token_accuracy": 0.8393123477697373, | |
| "num_tokens": 711902275.0, | |
| "step": 645 | |
| }, | |
| { | |
| "epoch": 0.9237875288683602, | |
| "grad_norm": 0.28711313316509707, | |
| "learning_rate": 2.2873059302041627e-06, | |
| "loss": 0.514, | |
| "mean_token_accuracy": 0.8423109248280525, | |
| "num_tokens": 717419225.0, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 0.9308935867827323, | |
| "grad_norm": 0.2736325147914108, | |
| "learning_rate": 2.2376636644956656e-06, | |
| "loss": 0.5109, | |
| "mean_token_accuracy": 0.8425532042980194, | |
| "num_tokens": 722935006.0, | |
| "step": 655 | |
| }, | |
| { | |
| "epoch": 0.9379996446971043, | |
| "grad_norm": 0.30955429096645193, | |
| "learning_rate": 2.192669458782096e-06, | |
| "loss": 0.5197, | |
| "mean_token_accuracy": 0.8405652604997158, | |
| "num_tokens": 728439084.0, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 0.9451057026114763, | |
| "grad_norm": 0.265130214896546, | |
| "learning_rate": 2.1523471806439205e-06, | |
| "loss": 0.5281, | |
| "mean_token_accuracy": 0.8385866686701775, | |
| "num_tokens": 733969356.0, | |
| "step": 665 | |
| }, | |
| { | |
| "epoch": 0.9522117605258483, | |
| "grad_norm": 0.3128163738373232, | |
| "learning_rate": 2.1167182193956738e-06, | |
| "loss": 0.5099, | |
| "mean_token_accuracy": 0.843552653491497, | |
| "num_tokens": 739459750.0, | |
| "step": 670 | |
| }, | |
| { | |
| "epoch": 0.9593178184402202, | |
| "grad_norm": 0.2766118277915724, | |
| "learning_rate": 2.0858014747397952e-06, | |
| "loss": 0.5183, | |
| "mean_token_accuracy": 0.8413214348256588, | |
| "num_tokens": 744974432.0, | |
| "step": 675 | |
| }, | |
| { | |
| "epoch": 0.9664238763545923, | |
| "grad_norm": 0.2848391339478646, | |
| "learning_rate": 2.0596133467411213e-06, | |
| "loss": 0.5109, | |
| "mean_token_accuracy": 0.8428529247641563, | |
| "num_tokens": 750470988.0, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 0.9735299342689643, | |
| "grad_norm": 0.26348775943038943, | |
| "learning_rate": 2.0381677271273177e-06, | |
| "loss": 0.5149, | |
| "mean_token_accuracy": 0.8410044960677624, | |
| "num_tokens": 756002818.0, | |
| "step": 685 | |
| }, | |
| { | |
| "epoch": 0.9806359921833363, | |
| "grad_norm": 0.2695158145512426, | |
| "learning_rate": 2.0214759919198904e-06, | |
| "loss": 0.5089, | |
| "mean_token_accuracy": 0.8422830864787102, | |
| "num_tokens": 761498903.0, | |
| "step": 690 | |
| }, | |
| { | |
| "epoch": 0.9877420500977083, | |
| "grad_norm": 0.2833155672506133, | |
| "learning_rate": 2.0095469953996724e-06, | |
| "loss": 0.5174, | |
| "mean_token_accuracy": 0.8406875729560852, | |
| "num_tokens": 767022510.0, | |
| "step": 695 | |
| }, | |
| { | |
| "epoch": 0.9948481080120803, | |
| "grad_norm": 0.28210706359667653, | |
| "learning_rate": 2.002387065409989e-06, | |
| "loss": 0.5208, | |
| "mean_token_accuracy": 0.8403361722826957, | |
| "num_tokens": 772560079.0, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.9948481080120803, | |
| "eval_loss": 0.4970676302909851, | |
| "eval_mean_token_accuracy": 0.8422517065416303, | |
| "eval_num_tokens": 772560079.0, | |
| "eval_runtime": 150.3658, | |
| "eval_samples_per_second": 24.201, | |
| "eval_steps_per_second": 0.758, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "mean_token_accuracy": 0.8405463315289596, | |
| "num_tokens": 776561265.0, | |
| "step": 704, | |
| "total_flos": 6033817814958080.0, | |
| "train_loss": 0.6046972061422738, | |
| "train_runtime": 27367.7567, | |
| "train_samples_per_second": 13.163, | |
| "train_steps_per_second": 0.026 | |
| } | |
| ], | |
| "logging_steps": 5, | |
| "max_steps": 704, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 1, | |
| "save_steps": 1, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 6033817814958080.0, | |
| "train_batch_size": 4, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |