diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,4086 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 6.468488566648076, + "eval_steps": 500, + "global_step": 5799, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.011154489682097044, + "grad_norm": 0.4311506450176239, + "learning_rate": 1e-05, + "loss": 3.2953, + "step": 10 + }, + { + "epoch": 0.022308979364194088, + "grad_norm": 0.6115325689315796, + "learning_rate": 2e-05, + "loss": 3.3283, + "step": 20 + }, + { + "epoch": 0.03346346904629113, + "grad_norm": 0.831686794757843, + "learning_rate": 1.9999852237819516e-05, + "loss": 2.9624, + "step": 30 + }, + { + "epoch": 0.044617958728388175, + "grad_norm": 1.2678371667861938, + "learning_rate": 1.9999408955644783e-05, + "loss": 2.891, + "step": 40 + }, + { + "epoch": 0.05577244841048522, + "grad_norm": 0.7378564476966858, + "learning_rate": 1.9998670166575877e-05, + "loss": 2.6735, + "step": 50 + }, + { + "epoch": 0.06692693809258227, + "grad_norm": 1.0785635709762573, + "learning_rate": 1.9997635892445808e-05, + "loss": 2.6967, + "step": 60 + }, + { + "epoch": 0.0780814277746793, + "grad_norm": 0.9325912594795227, + "learning_rate": 1.9996306163819902e-05, + "loss": 2.1375, + "step": 70 + }, + { + "epoch": 0.08923591745677635, + "grad_norm": 0.4600697159767151, + "learning_rate": 1.999468101999488e-05, + "loss": 2.1378, + "step": 80 + }, + { + "epoch": 0.1003904071388734, + "grad_norm": 7.8976664543151855, + "learning_rate": 1.9992760508997694e-05, + "loss": 2.0217, + "step": 90 + }, + { + "epoch": 0.11154489682097044, + "grad_norm": 1.2072306871414185, + "learning_rate": 1.999054468758413e-05, + "loss": 2.0504, + "step": 100 + }, + { + "epoch": 0.12269938650306748, + "grad_norm": 0.561038613319397, + "learning_rate": 1.9988033621237107e-05, + "loss": 1.9364, + "step": 110 + }, + { + "epoch": 0.13385387618516453, + "grad_norm": 0.6420221924781799, + "learning_rate": 1.998522738416475e-05, + "loss": 1.8163, + "step": 120 + }, + { + "epoch": 0.14500836586726157, + "grad_norm": 1.0922359228134155, + "learning_rate": 1.9982126059298202e-05, + "loss": 1.7226, + "step": 130 + }, + { + "epoch": 0.1561628555493586, + "grad_norm": 4.673393726348877, + "learning_rate": 1.997872973828917e-05, + "loss": 1.6478, + "step": 140 + }, + { + "epoch": 0.16731734523145567, + "grad_norm": 0.7612175345420837, + "learning_rate": 1.997503852150721e-05, + "loss": 1.5993, + "step": 150 + }, + { + "epoch": 0.1784718349135527, + "grad_norm": 0.4687439799308777, + "learning_rate": 1.997105251803677e-05, + "loss": 1.5498, + "step": 160 + }, + { + "epoch": 0.18962632459564974, + "grad_norm": 0.5883293747901917, + "learning_rate": 1.9966771845673968e-05, + "loss": 1.4306, + "step": 170 + }, + { + "epoch": 0.2007808142777468, + "grad_norm": 0.6468557715415955, + "learning_rate": 1.9962196630923095e-05, + "loss": 1.5006, + "step": 180 + }, + { + "epoch": 0.21193530395984383, + "grad_norm": 0.487519234418869, + "learning_rate": 1.9957327008992895e-05, + "loss": 1.4175, + "step": 190 + }, + { + "epoch": 0.22308979364194087, + "grad_norm": 0.5795869827270508, + "learning_rate": 1.995216312379256e-05, + "loss": 1.3625, + "step": 200 + }, + { + "epoch": 0.23424428332403793, + "grad_norm": 0.7335501313209534, + "learning_rate": 1.9946705127927474e-05, + "loss": 1.4419, + "step": 210 + }, + { + "epoch": 0.24539877300613497, + "grad_norm": 0.6072186231613159, + "learning_rate": 1.9940953182694716e-05, + "loss": 1.3177, + "step": 220 + }, + { + "epoch": 0.25655326268823203, + "grad_norm": 0.5896971225738525, + "learning_rate": 1.9934907458078274e-05, + "loss": 1.3797, + "step": 230 + }, + { + "epoch": 0.26770775237032907, + "grad_norm": 0.4922662675380707, + "learning_rate": 1.9928568132744042e-05, + "loss": 1.3181, + "step": 240 + }, + { + "epoch": 0.2788622420524261, + "grad_norm": 0.6160895228385925, + "learning_rate": 1.9921935394034525e-05, + "loss": 1.2891, + "step": 250 + }, + { + "epoch": 0.29001673173452314, + "grad_norm": 0.690238356590271, + "learning_rate": 1.9915009437963315e-05, + "loss": 1.2798, + "step": 260 + }, + { + "epoch": 0.30117122141662017, + "grad_norm": 1.5020575523376465, + "learning_rate": 1.9907790469209282e-05, + "loss": 1.2952, + "step": 270 + }, + { + "epoch": 0.3123257110987172, + "grad_norm": 0.5224294066429138, + "learning_rate": 1.9900278701110536e-05, + "loss": 1.2377, + "step": 280 + }, + { + "epoch": 0.3234802007808143, + "grad_norm": 0.5903538465499878, + "learning_rate": 1.989247435565813e-05, + "loss": 1.177, + "step": 290 + }, + { + "epoch": 0.33463469046291133, + "grad_norm": 0.5945698618888855, + "learning_rate": 1.988437766348948e-05, + "loss": 1.3077, + "step": 300 + }, + { + "epoch": 0.34578918014500837, + "grad_norm": 3.4969353675842285, + "learning_rate": 1.9875988863881562e-05, + "loss": 1.2398, + "step": 310 + }, + { + "epoch": 0.3569436698271054, + "grad_norm": 0.7088640332221985, + "learning_rate": 1.9867308204743846e-05, + "loss": 1.2185, + "step": 320 + }, + { + "epoch": 0.36809815950920244, + "grad_norm": 0.6107991933822632, + "learning_rate": 1.985833594261095e-05, + "loss": 1.2245, + "step": 330 + }, + { + "epoch": 0.3792526491912995, + "grad_norm": 0.513905942440033, + "learning_rate": 1.9849072342635086e-05, + "loss": 1.17, + "step": 340 + }, + { + "epoch": 0.39040713887339656, + "grad_norm": 0.7432478666305542, + "learning_rate": 1.9839517678578194e-05, + "loss": 1.1798, + "step": 350 + }, + { + "epoch": 0.4015616285554936, + "grad_norm": 0.4735497832298279, + "learning_rate": 1.9829672232803873e-05, + "loss": 1.1861, + "step": 360 + }, + { + "epoch": 0.41271611823759063, + "grad_norm": 0.4925360083580017, + "learning_rate": 1.9819536296269036e-05, + "loss": 1.1734, + "step": 370 + }, + { + "epoch": 0.42387060791968767, + "grad_norm": 0.48574864864349365, + "learning_rate": 1.980911016851529e-05, + "loss": 1.1775, + "step": 380 + }, + { + "epoch": 0.4350250976017847, + "grad_norm": 0.5457922220230103, + "learning_rate": 1.9798394157660116e-05, + "loss": 1.1733, + "step": 390 + }, + { + "epoch": 0.44617958728388174, + "grad_norm": 0.5231193900108337, + "learning_rate": 1.9787388580387738e-05, + "loss": 1.1468, + "step": 400 + }, + { + "epoch": 0.45733407696597883, + "grad_norm": 0.7397568821907043, + "learning_rate": 1.9776093761939776e-05, + "loss": 1.139, + "step": 410 + }, + { + "epoch": 0.46848856664807587, + "grad_norm": 0.5115352869033813, + "learning_rate": 1.976451003610563e-05, + "loss": 1.2086, + "step": 420 + }, + { + "epoch": 0.4796430563301729, + "grad_norm": 0.6861724853515625, + "learning_rate": 1.975263774521262e-05, + "loss": 1.1599, + "step": 430 + }, + { + "epoch": 0.49079754601226994, + "grad_norm": 0.5191764235496521, + "learning_rate": 1.974047724011586e-05, + "loss": 1.1327, + "step": 440 + }, + { + "epoch": 0.501952035694367, + "grad_norm": 0.5202570557594299, + "learning_rate": 1.97280288801879e-05, + "loss": 1.1402, + "step": 450 + }, + { + "epoch": 0.5131065253764641, + "grad_norm": 0.5279308557510376, + "learning_rate": 1.9715293033308102e-05, + "loss": 1.1664, + "step": 460 + }, + { + "epoch": 0.524261015058561, + "grad_norm": 0.6293380856513977, + "learning_rate": 1.9702270075851767e-05, + "loss": 1.1578, + "step": 470 + }, + { + "epoch": 0.5354155047406581, + "grad_norm": 6.1000518798828125, + "learning_rate": 1.9688960392679014e-05, + "loss": 1.1117, + "step": 480 + }, + { + "epoch": 0.5465699944227551, + "grad_norm": 0.5824970602989197, + "learning_rate": 1.9675364377123405e-05, + "loss": 1.1407, + "step": 490 + }, + { + "epoch": 0.5577244841048522, + "grad_norm": 0.5770475268363953, + "learning_rate": 1.966148243098032e-05, + "loss": 1.1315, + "step": 500 + }, + { + "epoch": 0.5688789737869493, + "grad_norm": 103.2529296875, + "learning_rate": 1.9647314964495084e-05, + "loss": 1.1193, + "step": 510 + }, + { + "epoch": 0.5800334634690463, + "grad_norm": 0.5626741051673889, + "learning_rate": 1.9632862396350845e-05, + "loss": 1.1103, + "step": 520 + }, + { + "epoch": 0.5911879531511434, + "grad_norm": 0.5797271132469177, + "learning_rate": 1.9618125153656204e-05, + "loss": 1.0689, + "step": 530 + }, + { + "epoch": 0.6023424428332403, + "grad_norm": 0.6117788553237915, + "learning_rate": 1.960310367193258e-05, + "loss": 1.0919, + "step": 540 + }, + { + "epoch": 0.6134969325153374, + "grad_norm": 0.9841444492340088, + "learning_rate": 1.958779839510135e-05, + "loss": 1.1139, + "step": 550 + }, + { + "epoch": 0.6246514221974344, + "grad_norm": 0.5128628015518188, + "learning_rate": 1.9572209775470735e-05, + "loss": 1.1207, + "step": 560 + }, + { + "epoch": 0.6358059118795315, + "grad_norm": 0.6127890348434448, + "learning_rate": 1.9556338273722415e-05, + "loss": 1.0485, + "step": 570 + }, + { + "epoch": 0.6469604015616286, + "grad_norm": 0.6080933809280396, + "learning_rate": 1.9540184358897932e-05, + "loss": 1.0677, + "step": 580 + }, + { + "epoch": 0.6581148912437256, + "grad_norm": 136.31642150878906, + "learning_rate": 1.9523748508384827e-05, + "loss": 1.0736, + "step": 590 + }, + { + "epoch": 0.6692693809258227, + "grad_norm": 0.5842283368110657, + "learning_rate": 1.9507031207902515e-05, + "loss": 1.1008, + "step": 600 + }, + { + "epoch": 0.6804238706079196, + "grad_norm": 0.8367534279823303, + "learning_rate": 1.9490032951487955e-05, + "loss": 1.1138, + "step": 610 + }, + { + "epoch": 0.6915783602900167, + "grad_norm": 0.5327929258346558, + "learning_rate": 1.9472754241481035e-05, + "loss": 1.0249, + "step": 620 + }, + { + "epoch": 0.7027328499721138, + "grad_norm": 0.5699859857559204, + "learning_rate": 1.9455195588509723e-05, + "loss": 1.0513, + "step": 630 + }, + { + "epoch": 0.7138873396542108, + "grad_norm": 0.7566335797309875, + "learning_rate": 1.9437357511474987e-05, + "loss": 1.1048, + "step": 640 + }, + { + "epoch": 0.7250418293363079, + "grad_norm": 0.5633962154388428, + "learning_rate": 1.9419240537535468e-05, + "loss": 1.0867, + "step": 650 + }, + { + "epoch": 0.7361963190184049, + "grad_norm": 0.6204596161842346, + "learning_rate": 1.9400845202091872e-05, + "loss": 1.0773, + "step": 660 + }, + { + "epoch": 0.747350808700502, + "grad_norm": 0.6118286848068237, + "learning_rate": 1.938217204877118e-05, + "loss": 1.0388, + "step": 670 + }, + { + "epoch": 0.758505298382599, + "grad_norm": 0.5015501379966736, + "learning_rate": 1.9363221629410558e-05, + "loss": 1.0807, + "step": 680 + }, + { + "epoch": 0.769659788064696, + "grad_norm": 0.5233261585235596, + "learning_rate": 1.9343994504041067e-05, + "loss": 0.9913, + "step": 690 + }, + { + "epoch": 0.7808142777467931, + "grad_norm": 0.5710963010787964, + "learning_rate": 1.9324491240871098e-05, + "loss": 1.1108, + "step": 700 + }, + { + "epoch": 0.7919687674288901, + "grad_norm": 1.604062795639038, + "learning_rate": 1.930471241626959e-05, + "loss": 1.0384, + "step": 710 + }, + { + "epoch": 0.8031232571109872, + "grad_norm": 0.5526170134544373, + "learning_rate": 1.928465861474899e-05, + "loss": 1.0397, + "step": 720 + }, + { + "epoch": 0.8142777467930842, + "grad_norm": 0.7330529689788818, + "learning_rate": 1.926433042894799e-05, + "loss": 1.0294, + "step": 730 + }, + { + "epoch": 0.8254322364751813, + "grad_norm": 1.0434292554855347, + "learning_rate": 1.9243728459614006e-05, + "loss": 1.0274, + "step": 740 + }, + { + "epoch": 0.8365867261572784, + "grad_norm": 0.5535615682601929, + "learning_rate": 1.922285331558541e-05, + "loss": 1.0717, + "step": 750 + }, + { + "epoch": 0.8477412158393753, + "grad_norm": 1.111730694770813, + "learning_rate": 1.9201705613773575e-05, + "loss": 1.0248, + "step": 760 + }, + { + "epoch": 0.8588957055214724, + "grad_norm": 0.8851047158241272, + "learning_rate": 1.9180285979144594e-05, + "loss": 1.1022, + "step": 770 + }, + { + "epoch": 0.8700501952035694, + "grad_norm": 0.5645832419395447, + "learning_rate": 1.9158595044700862e-05, + "loss": 1.0598, + "step": 780 + }, + { + "epoch": 0.8812046848856665, + "grad_norm": 0.536708414554596, + "learning_rate": 1.913663345146232e-05, + "loss": 1.0778, + "step": 790 + }, + { + "epoch": 0.8923591745677635, + "grad_norm": 0.7798624038696289, + "learning_rate": 1.911440184844756e-05, + "loss": 1.0878, + "step": 800 + }, + { + "epoch": 0.9035136642498606, + "grad_norm": 0.5899933576583862, + "learning_rate": 1.9091900892654605e-05, + "loss": 1.0836, + "step": 810 + }, + { + "epoch": 0.9146681539319577, + "grad_norm": 1.4796583652496338, + "learning_rate": 1.9069131249041515e-05, + "loss": 1.0304, + "step": 820 + }, + { + "epoch": 0.9258226436140546, + "grad_norm": 0.5310823321342468, + "learning_rate": 1.9046093590506727e-05, + "loss": 1.0294, + "step": 830 + }, + { + "epoch": 0.9369771332961517, + "grad_norm": 3.327803373336792, + "learning_rate": 1.9022788597869174e-05, + "loss": 1.0093, + "step": 840 + }, + { + "epoch": 0.9481316229782487, + "grad_norm": 0.5713508129119873, + "learning_rate": 1.8999216959848154e-05, + "loss": 1.1078, + "step": 850 + }, + { + "epoch": 0.9592861126603458, + "grad_norm": 1.0793242454528809, + "learning_rate": 1.8975379373043004e-05, + "loss": 1.0149, + "step": 860 + }, + { + "epoch": 0.9704406023424428, + "grad_norm": 0.6370890140533447, + "learning_rate": 1.895127654191248e-05, + "loss": 1.0509, + "step": 870 + }, + { + "epoch": 0.9815950920245399, + "grad_norm": 0.4927578270435333, + "learning_rate": 1.892690917875396e-05, + "loss": 1.0758, + "step": 880 + }, + { + "epoch": 0.992749581706637, + "grad_norm": 0.6699560880661011, + "learning_rate": 1.8902278003682384e-05, + "loss": 1.0646, + "step": 890 + }, + { + "epoch": 1.003904071388734, + "grad_norm": 0.6143092513084412, + "learning_rate": 1.8877383744608978e-05, + "loss": 1.043, + "step": 900 + }, + { + "epoch": 1.015058561070831, + "grad_norm": 0.9502522945404053, + "learning_rate": 1.885222713721975e-05, + "loss": 1.0772, + "step": 910 + }, + { + "epoch": 1.0262130507529281, + "grad_norm": 1.8059970140457153, + "learning_rate": 1.8826808924953727e-05, + "loss": 1.0019, + "step": 920 + }, + { + "epoch": 1.037367540435025, + "grad_norm": 0.6168321967124939, + "learning_rate": 1.8801129858981002e-05, + "loss": 1.0157, + "step": 930 + }, + { + "epoch": 1.048522030117122, + "grad_norm": 0.572365939617157, + "learning_rate": 1.8775190698180533e-05, + "loss": 1.0131, + "step": 940 + }, + { + "epoch": 1.0596765197992193, + "grad_norm": 0.832007884979248, + "learning_rate": 1.8748992209117707e-05, + "loss": 1.1186, + "step": 950 + }, + { + "epoch": 1.0708310094813163, + "grad_norm": 0.5764344334602356, + "learning_rate": 1.8722535166021712e-05, + "loss": 1.0064, + "step": 960 + }, + { + "epoch": 1.0819854991634132, + "grad_norm": 0.6819553971290588, + "learning_rate": 1.8695820350762608e-05, + "loss": 1.0168, + "step": 970 + }, + { + "epoch": 1.0931399888455102, + "grad_norm": 0.5008575916290283, + "learning_rate": 1.8668848552828272e-05, + "loss": 0.9948, + "step": 980 + }, + { + "epoch": 1.1042944785276074, + "grad_norm": 0.5437910556793213, + "learning_rate": 1.864162056930104e-05, + "loss": 1.0749, + "step": 990 + }, + { + "epoch": 1.1154489682097044, + "grad_norm": 0.5878276824951172, + "learning_rate": 1.8614137204834152e-05, + "loss": 1.0363, + "step": 1000 + }, + { + "epoch": 1.1266034578918014, + "grad_norm": 0.6696128845214844, + "learning_rate": 1.8586399271627985e-05, + "loss": 0.9949, + "step": 1010 + }, + { + "epoch": 1.1377579475738986, + "grad_norm": 0.7180065512657166, + "learning_rate": 1.855840758940603e-05, + "loss": 1.0338, + "step": 1020 + }, + { + "epoch": 1.1489124372559956, + "grad_norm": 0.5862113833427429, + "learning_rate": 1.8530162985390697e-05, + "loss": 0.9799, + "step": 1030 + }, + { + "epoch": 1.1600669269380925, + "grad_norm": 0.5569765567779541, + "learning_rate": 1.8501666294278832e-05, + "loss": 1.0163, + "step": 1040 + }, + { + "epoch": 1.1712214166201895, + "grad_norm": 0.5795998573303223, + "learning_rate": 1.8472918358217084e-05, + "loss": 1.0079, + "step": 1050 + }, + { + "epoch": 1.1823759063022867, + "grad_norm": 1.1413137912750244, + "learning_rate": 1.8443920026776994e-05, + "loss": 1.0019, + "step": 1060 + }, + { + "epoch": 1.1935303959843837, + "grad_norm": 0.6494612097740173, + "learning_rate": 1.84146721569299e-05, + "loss": 1.0275, + "step": 1070 + }, + { + "epoch": 1.2046848856664807, + "grad_norm": 0.5971665978431702, + "learning_rate": 1.8385175613021603e-05, + "loss": 1.0442, + "step": 1080 + }, + { + "epoch": 1.2158393753485779, + "grad_norm": 0.5472281575202942, + "learning_rate": 1.835543126674684e-05, + "loss": 0.9969, + "step": 1090 + }, + { + "epoch": 1.2269938650306749, + "grad_norm": 0.9368584156036377, + "learning_rate": 1.832543999712349e-05, + "loss": 1.0235, + "step": 1100 + }, + { + "epoch": 1.2381483547127718, + "grad_norm": 0.6323676705360413, + "learning_rate": 1.8295202690466648e-05, + "loss": 1.0082, + "step": 1110 + }, + { + "epoch": 1.2493028443948688, + "grad_norm": 0.5201751589775085, + "learning_rate": 1.8264720240362376e-05, + "loss": 1.0199, + "step": 1120 + }, + { + "epoch": 1.260457334076966, + "grad_norm": 0.6257196068763733, + "learning_rate": 1.823399354764134e-05, + "loss": 1.0356, + "step": 1130 + }, + { + "epoch": 1.271611823759063, + "grad_norm": 0.517178475856781, + "learning_rate": 1.8203023520352155e-05, + "loss": 1.0162, + "step": 1140 + }, + { + "epoch": 1.28276631344116, + "grad_norm": 0.5029575228691101, + "learning_rate": 1.817181107373458e-05, + "loss": 1.0381, + "step": 1150 + }, + { + "epoch": 1.2939208031232572, + "grad_norm": 0.49766603112220764, + "learning_rate": 1.8140357130192443e-05, + "loss": 0.9934, + "step": 1160 + }, + { + "epoch": 1.3050752928053542, + "grad_norm": 0.5907878875732422, + "learning_rate": 1.8108662619266405e-05, + "loss": 0.9735, + "step": 1170 + }, + { + "epoch": 1.3162297824874511, + "grad_norm": 0.5396254062652588, + "learning_rate": 1.8076728477606476e-05, + "loss": 1.0221, + "step": 1180 + }, + { + "epoch": 1.3273842721695481, + "grad_norm": 0.5513020753860474, + "learning_rate": 1.8044555648944335e-05, + "loss": 1.0379, + "step": 1190 + }, + { + "epoch": 1.3385387618516453, + "grad_norm": 0.6715849041938782, + "learning_rate": 1.8012145084065445e-05, + "loss": 1.0168, + "step": 1200 + }, + { + "epoch": 1.3496932515337423, + "grad_norm": 0.949293315410614, + "learning_rate": 1.7979497740780953e-05, + "loss": 0.9649, + "step": 1210 + }, + { + "epoch": 1.3608477412158395, + "grad_norm": 0.6600781679153442, + "learning_rate": 1.794661458389938e-05, + "loss": 1.0238, + "step": 1220 + }, + { + "epoch": 1.3720022308979365, + "grad_norm": 0.811071515083313, + "learning_rate": 1.791349658519813e-05, + "loss": 1.0417, + "step": 1230 + }, + { + "epoch": 1.3831567205800335, + "grad_norm": 0.47761473059654236, + "learning_rate": 1.7880144723394735e-05, + "loss": 1.0288, + "step": 1240 + }, + { + "epoch": 1.3943112102621305, + "grad_norm": 0.627876341342926, + "learning_rate": 1.7846559984117963e-05, + "loss": 0.989, + "step": 1250 + }, + { + "epoch": 1.4054656999442274, + "grad_norm": 0.7207409739494324, + "learning_rate": 1.7812743359878672e-05, + "loss": 0.9831, + "step": 1260 + }, + { + "epoch": 1.4166201896263246, + "grad_norm": 0.7529335618019104, + "learning_rate": 1.7778695850040493e-05, + "loss": 0.9723, + "step": 1270 + }, + { + "epoch": 1.4277746793084216, + "grad_norm": 0.5672315359115601, + "learning_rate": 1.7744418460790282e-05, + "loss": 1.0139, + "step": 1280 + }, + { + "epoch": 1.4389291689905188, + "grad_norm": 1.991326928138733, + "learning_rate": 1.77099122051084e-05, + "loss": 0.9925, + "step": 1290 + }, + { + "epoch": 1.4500836586726158, + "grad_norm": 0.7485338449478149, + "learning_rate": 1.767517810273875e-05, + "loss": 1.0279, + "step": 1300 + }, + { + "epoch": 1.4612381483547128, + "grad_norm": 0.5592474937438965, + "learning_rate": 1.7640217180158688e-05, + "loss": 1.0485, + "step": 1310 + }, + { + "epoch": 1.4723926380368098, + "grad_norm": 0.5996952652931213, + "learning_rate": 1.7605030470548632e-05, + "loss": 1.0164, + "step": 1320 + }, + { + "epoch": 1.4835471277189067, + "grad_norm": 0.5637550354003906, + "learning_rate": 1.756961901376158e-05, + "loss": 1.0138, + "step": 1330 + }, + { + "epoch": 1.494701617401004, + "grad_norm": 0.6679148077964783, + "learning_rate": 1.7533983856292337e-05, + "loss": 1.0139, + "step": 1340 + }, + { + "epoch": 1.505856107083101, + "grad_norm": 0.5069565773010254, + "learning_rate": 1.749812605124662e-05, + "loss": 1.0234, + "step": 1350 + }, + { + "epoch": 1.5170105967651981, + "grad_norm": 0.6209201216697693, + "learning_rate": 1.7462046658309923e-05, + "loss": 0.9937, + "step": 1360 + }, + { + "epoch": 1.528165086447295, + "grad_norm": 0.5736936926841736, + "learning_rate": 1.74257467437162e-05, + "loss": 0.9878, + "step": 1370 + }, + { + "epoch": 1.539319576129392, + "grad_norm": 0.5310850739479065, + "learning_rate": 1.7389227380216353e-05, + "loss": 0.9825, + "step": 1380 + }, + { + "epoch": 1.550474065811489, + "grad_norm": 0.48725244402885437, + "learning_rate": 1.735248964704654e-05, + "loss": 1.041, + "step": 1390 + }, + { + "epoch": 1.561628555493586, + "grad_norm": 0.5187913775444031, + "learning_rate": 1.7315534629896282e-05, + "loss": 1.0171, + "step": 1400 + }, + { + "epoch": 1.5727830451756832, + "grad_norm": 0.5149972438812256, + "learning_rate": 1.7278363420876346e-05, + "loss": 1.0164, + "step": 1410 + }, + { + "epoch": 1.5839375348577802, + "grad_norm": 0.5611596703529358, + "learning_rate": 1.7240977118486523e-05, + "loss": 1.006, + "step": 1420 + }, + { + "epoch": 1.5950920245398774, + "grad_norm": 0.5297011137008667, + "learning_rate": 1.720337682758312e-05, + "loss": 0.9757, + "step": 1430 + }, + { + "epoch": 1.6062465142219744, + "grad_norm": 0.7282742857933044, + "learning_rate": 1.7165563659346336e-05, + "loss": 1.0052, + "step": 1440 + }, + { + "epoch": 1.6174010039040714, + "grad_norm": 0.6049071550369263, + "learning_rate": 1.7127538731247405e-05, + "loss": 0.9891, + "step": 1450 + }, + { + "epoch": 1.6285554935861684, + "grad_norm": 0.7181625366210938, + "learning_rate": 1.7089303167015582e-05, + "loss": 0.9877, + "step": 1460 + }, + { + "epoch": 1.6397099832682653, + "grad_norm": 0.7313211560249329, + "learning_rate": 1.7050858096604943e-05, + "loss": 0.9828, + "step": 1470 + }, + { + "epoch": 1.6508644729503625, + "grad_norm": 0.49667495489120483, + "learning_rate": 1.7012204656160968e-05, + "loss": 1.0058, + "step": 1480 + }, + { + "epoch": 1.6620189626324595, + "grad_norm": 2.2346487045288086, + "learning_rate": 1.697334398798699e-05, + "loss": 0.9777, + "step": 1490 + }, + { + "epoch": 1.6731734523145567, + "grad_norm": 0.6199758052825928, + "learning_rate": 1.693427724051042e-05, + "loss": 0.9339, + "step": 1500 + }, + { + "epoch": 1.6843279419966537, + "grad_norm": 0.5861572027206421, + "learning_rate": 1.6895005568248818e-05, + "loss": 1.0539, + "step": 1510 + }, + { + "epoch": 1.6954824316787507, + "grad_norm": 0.5904344320297241, + "learning_rate": 1.6855530131775765e-05, + "loss": 1.0419, + "step": 1520 + }, + { + "epoch": 1.7066369213608477, + "grad_norm": 0.5856732130050659, + "learning_rate": 1.6815852097686577e-05, + "loss": 0.9253, + "step": 1530 + }, + { + "epoch": 1.7177914110429446, + "grad_norm": 0.5267894864082336, + "learning_rate": 1.677597263856382e-05, + "loss": 0.958, + "step": 1540 + }, + { + "epoch": 1.7289459007250418, + "grad_norm": 3.836613178253174, + "learning_rate": 1.673589293294267e-05, + "loss": 0.9697, + "step": 1550 + }, + { + "epoch": 1.7401003904071388, + "grad_norm": 0.5841746926307678, + "learning_rate": 1.6695614165276052e-05, + "loss": 1.011, + "step": 1560 + }, + { + "epoch": 1.751254880089236, + "grad_norm": 0.6810708045959473, + "learning_rate": 1.665513752589968e-05, + "loss": 0.9918, + "step": 1570 + }, + { + "epoch": 1.762409369771333, + "grad_norm": 0.537075936794281, + "learning_rate": 1.6614464210996856e-05, + "loss": 0.9936, + "step": 1580 + }, + { + "epoch": 1.77356385945343, + "grad_norm": 0.53858482837677, + "learning_rate": 1.6573595422563117e-05, + "loss": 1.0108, + "step": 1590 + }, + { + "epoch": 1.784718349135527, + "grad_norm": 0.8306734561920166, + "learning_rate": 1.653253236837072e-05, + "loss": 0.9351, + "step": 1600 + }, + { + "epoch": 1.795872838817624, + "grad_norm": 1.4435315132141113, + "learning_rate": 1.6491276261932952e-05, + "loss": 0.9706, + "step": 1610 + }, + { + "epoch": 1.8070273284997211, + "grad_norm": 0.6465569138526917, + "learning_rate": 1.6449828322468258e-05, + "loss": 1.0211, + "step": 1620 + }, + { + "epoch": 1.8181818181818183, + "grad_norm": 0.5587633848190308, + "learning_rate": 1.640818977486423e-05, + "loss": 0.9857, + "step": 1630 + }, + { + "epoch": 1.8293363078639153, + "grad_norm": 0.5517516732215881, + "learning_rate": 1.636636184964137e-05, + "loss": 0.9574, + "step": 1640 + }, + { + "epoch": 1.8404907975460123, + "grad_norm": 0.5436330437660217, + "learning_rate": 1.632434578291678e-05, + "loss": 0.9546, + "step": 1650 + }, + { + "epoch": 1.8516452872281093, + "grad_norm": 0.5434009432792664, + "learning_rate": 1.6282142816367582e-05, + "loss": 0.99, + "step": 1660 + }, + { + "epoch": 1.8627997769102063, + "grad_norm": 0.6184738874435425, + "learning_rate": 1.6239754197194245e-05, + "loss": 0.9811, + "step": 1670 + }, + { + "epoch": 1.8739542665923032, + "grad_norm": 0.5969653725624084, + "learning_rate": 1.6197181178083726e-05, + "loss": 0.9942, + "step": 1680 + }, + { + "epoch": 1.8851087562744004, + "grad_norm": 0.5081932544708252, + "learning_rate": 1.6154425017172462e-05, + "loss": 1.0081, + "step": 1690 + }, + { + "epoch": 1.8962632459564976, + "grad_norm": 0.5772440433502197, + "learning_rate": 1.6111486978009162e-05, + "loss": 0.9536, + "step": 1700 + }, + { + "epoch": 1.9074177356385946, + "grad_norm": 0.7058711647987366, + "learning_rate": 1.606836832951748e-05, + "loss": 0.9602, + "step": 1710 + }, + { + "epoch": 1.9185722253206916, + "grad_norm": 0.5237627029418945, + "learning_rate": 1.6025070345958523e-05, + "loss": 0.9813, + "step": 1720 + }, + { + "epoch": 1.9297267150027886, + "grad_norm": 0.5791120529174805, + "learning_rate": 1.5981594306893186e-05, + "loss": 0.9906, + "step": 1730 + }, + { + "epoch": 1.9408812046848856, + "grad_norm": 0.5721683502197266, + "learning_rate": 1.593794149714433e-05, + "loss": 0.9907, + "step": 1740 + }, + { + "epoch": 1.9520356943669828, + "grad_norm": 0.5660907030105591, + "learning_rate": 1.589411320675883e-05, + "loss": 0.9786, + "step": 1750 + }, + { + "epoch": 1.9631901840490797, + "grad_norm": 0.6208905577659607, + "learning_rate": 1.5850110730969433e-05, + "loss": 0.9764, + "step": 1760 + }, + { + "epoch": 1.974344673731177, + "grad_norm": 0.5601891279220581, + "learning_rate": 1.5805935370156494e-05, + "loss": 0.9596, + "step": 1770 + }, + { + "epoch": 1.985499163413274, + "grad_norm": 0.4851455092430115, + "learning_rate": 1.5761588429809544e-05, + "loss": 0.9706, + "step": 1780 + }, + { + "epoch": 1.996653653095371, + "grad_norm": 0.5879290103912354, + "learning_rate": 1.57170712204887e-05, + "loss": 0.9856, + "step": 1790 + }, + { + "epoch": 2.007808142777468, + "grad_norm": 0.5131300687789917, + "learning_rate": 1.5672385057785948e-05, + "loss": 0.9965, + "step": 1800 + }, + { + "epoch": 2.018962632459565, + "grad_norm": 0.6118858456611633, + "learning_rate": 1.5627531262286253e-05, + "loss": 0.976, + "step": 1810 + }, + { + "epoch": 2.030117122141662, + "grad_norm": 0.6357288956642151, + "learning_rate": 1.5582511159528544e-05, + "loss": 0.968, + "step": 1820 + }, + { + "epoch": 2.0412716118237593, + "grad_norm": 0.5152722597122192, + "learning_rate": 1.553732607996653e-05, + "loss": 0.9813, + "step": 1830 + }, + { + "epoch": 2.0524261015058562, + "grad_norm": 0.5387617945671082, + "learning_rate": 1.5491977358929382e-05, + "loss": 0.9899, + "step": 1840 + }, + { + "epoch": 2.0635805911879532, + "grad_norm": 0.5080958604812622, + "learning_rate": 1.544646633658229e-05, + "loss": 0.9904, + "step": 1850 + }, + { + "epoch": 2.07473508087005, + "grad_norm": 0.6057727932929993, + "learning_rate": 1.5400794357886823e-05, + "loss": 0.9732, + "step": 1860 + }, + { + "epoch": 2.085889570552147, + "grad_norm": 0.7106322646141052, + "learning_rate": 1.5354962772561218e-05, + "loss": 0.9706, + "step": 1870 + }, + { + "epoch": 2.097044060234244, + "grad_norm": 1.0154658555984497, + "learning_rate": 1.5308972935040472e-05, + "loss": 0.9527, + "step": 1880 + }, + { + "epoch": 2.108198549916341, + "grad_norm": 0.6521228551864624, + "learning_rate": 1.5262826204436325e-05, + "loss": 0.9869, + "step": 1890 + }, + { + "epoch": 2.1193530395984386, + "grad_norm": 0.5595114827156067, + "learning_rate": 1.5216523944497077e-05, + "loss": 0.9844, + "step": 1900 + }, + { + "epoch": 2.1305075292805356, + "grad_norm": 0.7357882857322693, + "learning_rate": 1.5170067523567303e-05, + "loss": 0.951, + "step": 1910 + }, + { + "epoch": 2.1416620189626325, + "grad_norm": 0.7726870179176331, + "learning_rate": 1.5123458314547422e-05, + "loss": 0.9568, + "step": 1920 + }, + { + "epoch": 2.1528165086447295, + "grad_norm": 0.5238220691680908, + "learning_rate": 1.50766976948531e-05, + "loss": 0.9825, + "step": 1930 + }, + { + "epoch": 2.1639709983268265, + "grad_norm": 0.5988953113555908, + "learning_rate": 1.5029787046374562e-05, + "loss": 0.9701, + "step": 1940 + }, + { + "epoch": 2.1751254880089235, + "grad_norm": 0.5412775278091431, + "learning_rate": 1.4982727755435755e-05, + "loss": 0.9319, + "step": 1950 + }, + { + "epoch": 2.1862799776910204, + "grad_norm": 0.5029479265213013, + "learning_rate": 1.4935521212753364e-05, + "loss": 0.9665, + "step": 1960 + }, + { + "epoch": 2.197434467373118, + "grad_norm": 0.5240495800971985, + "learning_rate": 1.488816881339572e-05, + "loss": 0.9788, + "step": 1970 + }, + { + "epoch": 2.208588957055215, + "grad_norm": 0.5389825701713562, + "learning_rate": 1.4840671956741589e-05, + "loss": 0.9754, + "step": 1980 + }, + { + "epoch": 2.219743446737312, + "grad_norm": 0.5244821310043335, + "learning_rate": 1.4793032046438783e-05, + "loss": 0.9618, + "step": 1990 + }, + { + "epoch": 2.230897936419409, + "grad_norm": 0.5422099828720093, + "learning_rate": 1.474525049036271e-05, + "loss": 0.9568, + "step": 2000 + }, + { + "epoch": 2.242052426101506, + "grad_norm": 0.6588592529296875, + "learning_rate": 1.4697328700574756e-05, + "loss": 0.9553, + "step": 2010 + }, + { + "epoch": 2.2532069157836028, + "grad_norm": 0.5385764241218567, + "learning_rate": 1.4649268093280552e-05, + "loss": 0.9679, + "step": 2020 + }, + { + "epoch": 2.2643614054656998, + "grad_norm": 0.5763164162635803, + "learning_rate": 1.4601070088788114e-05, + "loss": 0.9802, + "step": 2030 + }, + { + "epoch": 2.275515895147797, + "grad_norm": 0.5320225358009338, + "learning_rate": 1.45527361114659e-05, + "loss": 0.952, + "step": 2040 + }, + { + "epoch": 2.286670384829894, + "grad_norm": 0.5828911662101746, + "learning_rate": 1.4504267589700683e-05, + "loss": 0.9588, + "step": 2050 + }, + { + "epoch": 2.297824874511991, + "grad_norm": 0.5583097338676453, + "learning_rate": 1.4455665955855351e-05, + "loss": 0.9397, + "step": 2060 + }, + { + "epoch": 2.308979364194088, + "grad_norm": 0.5835484862327576, + "learning_rate": 1.440693264622659e-05, + "loss": 0.9765, + "step": 2070 + }, + { + "epoch": 2.320133853876185, + "grad_norm": 0.5030141472816467, + "learning_rate": 1.4358069101002413e-05, + "loss": 0.96, + "step": 2080 + }, + { + "epoch": 2.331288343558282, + "grad_norm": 1.1575989723205566, + "learning_rate": 1.4309076764219621e-05, + "loss": 0.9961, + "step": 2090 + }, + { + "epoch": 2.342442833240379, + "grad_norm": 0.5824646353721619, + "learning_rate": 1.4259957083721112e-05, + "loss": 0.9633, + "step": 2100 + }, + { + "epoch": 2.3535973229224765, + "grad_norm": 0.5903834104537964, + "learning_rate": 1.421071151111311e-05, + "loss": 0.9618, + "step": 2110 + }, + { + "epoch": 2.3647518126045735, + "grad_norm": 0.5352372527122498, + "learning_rate": 1.4161341501722251e-05, + "loss": 0.9595, + "step": 2120 + }, + { + "epoch": 2.3759063022866704, + "grad_norm": 0.5336405038833618, + "learning_rate": 1.4111848514552582e-05, + "loss": 0.9593, + "step": 2130 + }, + { + "epoch": 2.3870607919687674, + "grad_norm": 0.5585698485374451, + "learning_rate": 1.4062234012242444e-05, + "loss": 0.9849, + "step": 2140 + }, + { + "epoch": 2.3982152816508644, + "grad_norm": 161.10671997070312, + "learning_rate": 1.4012499461021243e-05, + "loss": 1.0208, + "step": 2150 + }, + { + "epoch": 2.4093697713329614, + "grad_norm": 0.5689246654510498, + "learning_rate": 1.396264633066613e-05, + "loss": 0.9939, + "step": 2160 + }, + { + "epoch": 2.420524261015059, + "grad_norm": 0.5657776594161987, + "learning_rate": 1.3912676094458552e-05, + "loss": 0.966, + "step": 2170 + }, + { + "epoch": 2.4316787506971558, + "grad_norm": 0.5576151013374329, + "learning_rate": 1.386259022914072e-05, + "loss": 0.9287, + "step": 2180 + }, + { + "epoch": 2.4428332403792528, + "grad_norm": 0.5689704418182373, + "learning_rate": 1.3812390214871973e-05, + "loss": 0.9863, + "step": 2190 + }, + { + "epoch": 2.4539877300613497, + "grad_norm": 0.5151479840278625, + "learning_rate": 1.3762077535185022e-05, + "loss": 0.9665, + "step": 2200 + }, + { + "epoch": 2.4651422197434467, + "grad_norm": 0.6008809804916382, + "learning_rate": 1.3711653676942115e-05, + "loss": 0.933, + "step": 2210 + }, + { + "epoch": 2.4762967094255437, + "grad_norm": 0.6128340363502502, + "learning_rate": 1.3661120130291106e-05, + "loss": 0.9411, + "step": 2220 + }, + { + "epoch": 2.4874511991076407, + "grad_norm": 0.5954192280769348, + "learning_rate": 1.3610478388621402e-05, + "loss": 0.9547, + "step": 2230 + }, + { + "epoch": 2.4986056887897377, + "grad_norm": 0.5410999059677124, + "learning_rate": 1.3559729948519835e-05, + "loss": 1.002, + "step": 2240 + }, + { + "epoch": 2.509760178471835, + "grad_norm": 0.5633314251899719, + "learning_rate": 1.3508876309726438e-05, + "loss": 0.9347, + "step": 2250 + }, + { + "epoch": 2.520914668153932, + "grad_norm": 0.701754629611969, + "learning_rate": 1.3457918975090127e-05, + "loss": 0.9471, + "step": 2260 + }, + { + "epoch": 2.532069157836029, + "grad_norm": 0.5823484659194946, + "learning_rate": 1.3406859450524271e-05, + "loss": 0.9618, + "step": 2270 + }, + { + "epoch": 2.543223647518126, + "grad_norm": 0.6154325008392334, + "learning_rate": 1.335569924496221e-05, + "loss": 0.9747, + "step": 2280 + }, + { + "epoch": 2.554378137200223, + "grad_norm": 0.5738359093666077, + "learning_rate": 1.3304439870312651e-05, + "loss": 0.9645, + "step": 2290 + }, + { + "epoch": 2.56553262688232, + "grad_norm": 0.7494403719902039, + "learning_rate": 1.3253082841414984e-05, + "loss": 0.9645, + "step": 2300 + }, + { + "epoch": 2.5766871165644174, + "grad_norm": 0.6298063397407532, + "learning_rate": 1.3201629675994525e-05, + "loss": 0.9645, + "step": 2310 + }, + { + "epoch": 2.5878416062465144, + "grad_norm": 0.561741292476654, + "learning_rate": 1.3150081894617662e-05, + "loss": 0.9951, + "step": 2320 + }, + { + "epoch": 2.5989960959286114, + "grad_norm": 0.655669093132019, + "learning_rate": 1.3098441020646904e-05, + "loss": 0.9809, + "step": 2330 + }, + { + "epoch": 2.6101505856107083, + "grad_norm": 0.5869964957237244, + "learning_rate": 1.3046708580195882e-05, + "loss": 0.968, + "step": 2340 + }, + { + "epoch": 2.6213050752928053, + "grad_norm": 0.5096555948257446, + "learning_rate": 1.2994886102084235e-05, + "loss": 0.982, + "step": 2350 + }, + { + "epoch": 2.6324595649749023, + "grad_norm": 0.4704292118549347, + "learning_rate": 1.2942975117792436e-05, + "loss": 0.9422, + "step": 2360 + }, + { + "epoch": 2.6436140546569993, + "grad_norm": 0.6242173314094543, + "learning_rate": 1.289097716141653e-05, + "loss": 0.984, + "step": 2370 + }, + { + "epoch": 2.6547685443390963, + "grad_norm": 0.5799645185470581, + "learning_rate": 1.2838893769622804e-05, + "loss": 0.9589, + "step": 2380 + }, + { + "epoch": 2.6659230340211937, + "grad_norm": 0.49811428785324097, + "learning_rate": 1.2786726481602365e-05, + "loss": 0.946, + "step": 2390 + }, + { + "epoch": 2.6770775237032907, + "grad_norm": 0.5958947539329529, + "learning_rate": 1.2734476839025654e-05, + "loss": 1.0, + "step": 2400 + }, + { + "epoch": 2.6882320133853876, + "grad_norm": 0.5652265548706055, + "learning_rate": 1.2682146385996896e-05, + "loss": 0.9196, + "step": 2410 + }, + { + "epoch": 2.6993865030674846, + "grad_norm": 0.5902454257011414, + "learning_rate": 1.2629736669008464e-05, + "loss": 0.9854, + "step": 2420 + }, + { + "epoch": 2.7105409927495816, + "grad_norm": 1.0020962953567505, + "learning_rate": 1.2577249236895166e-05, + "loss": 0.9863, + "step": 2430 + }, + { + "epoch": 2.721695482431679, + "grad_norm": 0.6766297221183777, + "learning_rate": 1.2524685640788488e-05, + "loss": 0.9449, + "step": 2440 + }, + { + "epoch": 2.732849972113776, + "grad_norm": 0.5196471214294434, + "learning_rate": 1.2472047434070743e-05, + "loss": 0.9362, + "step": 2450 + }, + { + "epoch": 2.744004461795873, + "grad_norm": 0.6119663119316101, + "learning_rate": 1.2419336172329175e-05, + "loss": 0.9933, + "step": 2460 + }, + { + "epoch": 2.75515895147797, + "grad_norm": 0.6461163759231567, + "learning_rate": 1.2366553413309971e-05, + "loss": 0.9695, + "step": 2470 + }, + { + "epoch": 2.766313441160067, + "grad_norm": 0.6933746337890625, + "learning_rate": 1.2313700716872253e-05, + "loss": 0.8987, + "step": 2480 + }, + { + "epoch": 2.777467930842164, + "grad_norm": 0.602975070476532, + "learning_rate": 1.2260779644941949e-05, + "loss": 0.9538, + "step": 2490 + }, + { + "epoch": 2.788622420524261, + "grad_norm": 0.6569437980651855, + "learning_rate": 1.2207791761465658e-05, + "loss": 0.9537, + "step": 2500 + }, + { + "epoch": 2.799776910206358, + "grad_norm": 0.6225653886795044, + "learning_rate": 1.2154738632364418e-05, + "loss": 0.956, + "step": 2510 + }, + { + "epoch": 2.810931399888455, + "grad_norm": 0.6410306692123413, + "learning_rate": 1.2101621825487438e-05, + "loss": 0.9102, + "step": 2520 + }, + { + "epoch": 2.8220858895705523, + "grad_norm": 0.4937409460544586, + "learning_rate": 1.2048442910565756e-05, + "loss": 0.9149, + "step": 2530 + }, + { + "epoch": 2.8332403792526493, + "grad_norm": 0.5088676810264587, + "learning_rate": 1.1995203459165866e-05, + "loss": 0.9319, + "step": 2540 + }, + { + "epoch": 2.8443948689347462, + "grad_norm": 0.5606518983840942, + "learning_rate": 1.1941905044643244e-05, + "loss": 0.925, + "step": 2550 + }, + { + "epoch": 2.8555493586168432, + "grad_norm": 0.541970431804657, + "learning_rate": 1.1888549242095881e-05, + "loss": 0.9692, + "step": 2560 + }, + { + "epoch": 2.86670384829894, + "grad_norm": 0.5640259385108948, + "learning_rate": 1.1835137628317728e-05, + "loss": 0.9588, + "step": 2570 + }, + { + "epoch": 2.8778583379810376, + "grad_norm": 0.7504070401191711, + "learning_rate": 1.1781671781752082e-05, + "loss": 0.9387, + "step": 2580 + }, + { + "epoch": 2.8890128276631346, + "grad_norm": 0.5966542363166809, + "learning_rate": 1.1728153282444956e-05, + "loss": 0.9601, + "step": 2590 + }, + { + "epoch": 2.9001673173452316, + "grad_norm": 0.5044750571250916, + "learning_rate": 1.1674583711998386e-05, + "loss": 0.9204, + "step": 2600 + }, + { + "epoch": 2.9113218070273286, + "grad_norm": 0.5839060544967651, + "learning_rate": 1.1620964653523679e-05, + "loss": 0.9646, + "step": 2610 + }, + { + "epoch": 2.9224762967094255, + "grad_norm": 0.6584033370018005, + "learning_rate": 1.1567297691594628e-05, + "loss": 0.9856, + "step": 2620 + }, + { + "epoch": 2.9336307863915225, + "grad_norm": 0.5905307531356812, + "learning_rate": 1.1513584412200702e-05, + "loss": 0.972, + "step": 2630 + }, + { + "epoch": 2.9447852760736195, + "grad_norm": 0.5503203868865967, + "learning_rate": 1.1459826402700159e-05, + "loss": 0.9639, + "step": 2640 + }, + { + "epoch": 2.9559397657557165, + "grad_norm": 0.6256836652755737, + "learning_rate": 1.1406025251773131e-05, + "loss": 0.965, + "step": 2650 + }, + { + "epoch": 2.9670942554378135, + "grad_norm": 0.4728027284145355, + "learning_rate": 1.1352182549374702e-05, + "loss": 0.9385, + "step": 2660 + }, + { + "epoch": 2.978248745119911, + "grad_norm": 0.5927023887634277, + "learning_rate": 1.1298299886687891e-05, + "loss": 0.9852, + "step": 2670 + }, + { + "epoch": 2.989403234802008, + "grad_norm": 0.4461129307746887, + "learning_rate": 1.124437885607664e-05, + "loss": 0.9615, + "step": 2680 + }, + { + "epoch": 3.000557724484105, + "grad_norm": 0.6086824536323547, + "learning_rate": 1.1190421051038766e-05, + "loss": 0.9557, + "step": 2690 + }, + { + "epoch": 3.011712214166202, + "grad_norm": 0.6160380244255066, + "learning_rate": 1.1136428066158852e-05, + "loss": 0.9859, + "step": 2700 + }, + { + "epoch": 3.022866703848299, + "grad_norm": 0.579237163066864, + "learning_rate": 1.1082401497061133e-05, + "loss": 0.9641, + "step": 2710 + }, + { + "epoch": 3.034021193530396, + "grad_norm": 0.5745778679847717, + "learning_rate": 1.102834294036234e-05, + "loss": 0.96, + "step": 2720 + }, + { + "epoch": 3.045175683212493, + "grad_norm": 0.6007333993911743, + "learning_rate": 1.0974253993624514e-05, + "loss": 0.9637, + "step": 2730 + }, + { + "epoch": 3.05633017289459, + "grad_norm": 0.5479885339736938, + "learning_rate": 1.0920136255307801e-05, + "loss": 0.9359, + "step": 2740 + }, + { + "epoch": 3.067484662576687, + "grad_norm": 0.6558302044868469, + "learning_rate": 1.08659913247232e-05, + "loss": 0.9709, + "step": 2750 + }, + { + "epoch": 3.078639152258784, + "grad_norm": 0.5437296628952026, + "learning_rate": 1.0811820801985316e-05, + "loss": 0.9479, + "step": 2760 + }, + { + "epoch": 3.089793641940881, + "grad_norm": 0.6434791684150696, + "learning_rate": 1.0757626287965057e-05, + "loss": 0.9421, + "step": 2770 + }, + { + "epoch": 3.100948131622978, + "grad_norm": 0.5511924624443054, + "learning_rate": 1.0703409384242335e-05, + "loss": 0.989, + "step": 2780 + }, + { + "epoch": 3.112102621305075, + "grad_norm": 0.6061602234840393, + "learning_rate": 1.0649171693058738e-05, + "loss": 1.0097, + "step": 2790 + }, + { + "epoch": 3.1232571109871725, + "grad_norm": 0.7485038042068481, + "learning_rate": 1.0594914817270165e-05, + "loss": 0.9886, + "step": 2800 + }, + { + "epoch": 3.1344116006692695, + "grad_norm": 0.7252965569496155, + "learning_rate": 1.054064036029947e-05, + "loss": 0.9489, + "step": 2810 + }, + { + "epoch": 3.1455660903513665, + "grad_norm": 0.5706859827041626, + "learning_rate": 1.0486349926089077e-05, + "loss": 0.98, + "step": 2820 + }, + { + "epoch": 3.1567205800334635, + "grad_norm": 0.5560559034347534, + "learning_rate": 1.0432045119053572e-05, + "loss": 0.9621, + "step": 2830 + }, + { + "epoch": 3.1678750697155604, + "grad_norm": 0.5444208383560181, + "learning_rate": 1.0377727544032292e-05, + "loss": 0.8854, + "step": 2840 + }, + { + "epoch": 3.1790295593976574, + "grad_norm": 0.5219970941543579, + "learning_rate": 1.0323398806241907e-05, + "loss": 0.9418, + "step": 2850 + }, + { + "epoch": 3.190184049079755, + "grad_norm": 0.5396592617034912, + "learning_rate": 1.0269060511228968e-05, + "loss": 0.9515, + "step": 2860 + }, + { + "epoch": 3.201338538761852, + "grad_norm": 0.6761412620544434, + "learning_rate": 1.0214714264822461e-05, + "loss": 0.9301, + "step": 2870 + }, + { + "epoch": 3.212493028443949, + "grad_norm": 0.5131610035896301, + "learning_rate": 1.0160361673086365e-05, + "loss": 0.9679, + "step": 2880 + }, + { + "epoch": 3.2236475181260458, + "grad_norm": 0.5312976837158203, + "learning_rate": 1.0106004342272176e-05, + "loss": 0.975, + "step": 2890 + }, + { + "epoch": 3.2348020078081428, + "grad_norm": 0.5575296878814697, + "learning_rate": 1.005164387877143e-05, + "loss": 0.9748, + "step": 2900 + }, + { + "epoch": 3.2459564974902397, + "grad_norm": 0.5677649974822998, + "learning_rate": 9.997281889068261e-06, + "loss": 0.954, + "step": 2910 + }, + { + "epoch": 3.2571109871723367, + "grad_norm": 0.5441506505012512, + "learning_rate": 9.94291997969189e-06, + "loss": 0.9781, + "step": 2920 + }, + { + "epoch": 3.2682654768544337, + "grad_norm": 0.6877113580703735, + "learning_rate": 9.888559757169172e-06, + "loss": 0.9531, + "step": 2930 + }, + { + "epoch": 3.279419966536531, + "grad_norm": 0.5976460576057434, + "learning_rate": 9.834202827977107e-06, + "loss": 0.9579, + "step": 2940 + }, + { + "epoch": 3.290574456218628, + "grad_norm": 0.5610713362693787, + "learning_rate": 9.77985079849538e-06, + "loss": 0.9615, + "step": 2950 + }, + { + "epoch": 3.301728945900725, + "grad_norm": 0.5829696655273438, + "learning_rate": 9.725505274958855e-06, + "loss": 0.9353, + "step": 2960 + }, + { + "epoch": 3.312883435582822, + "grad_norm": 0.5707248449325562, + "learning_rate": 9.671167863410156e-06, + "loss": 0.9578, + "step": 2970 + }, + { + "epoch": 3.324037925264919, + "grad_norm": 0.5084194540977478, + "learning_rate": 9.616840169652156e-06, + "loss": 0.9283, + "step": 2980 + }, + { + "epoch": 3.335192414947016, + "grad_norm": 0.5233070850372314, + "learning_rate": 9.562523799200558e-06, + "loss": 0.9249, + "step": 2990 + }, + { + "epoch": 3.3463469046291134, + "grad_norm": 0.5390496850013733, + "learning_rate": 9.508220357236431e-06, + "loss": 0.9806, + "step": 3000 + }, + { + "epoch": 3.3575013943112104, + "grad_norm": 0.5927379131317139, + "learning_rate": 9.453931448558768e-06, + "loss": 1.0214, + "step": 3010 + }, + { + "epoch": 3.3686558839933074, + "grad_norm": 0.5973530411720276, + "learning_rate": 9.39965867753708e-06, + "loss": 0.9858, + "step": 3020 + }, + { + "epoch": 3.3798103736754044, + "grad_norm": 0.5969598293304443, + "learning_rate": 9.345403648063958e-06, + "loss": 0.944, + "step": 3030 + }, + { + "epoch": 3.3909648633575014, + "grad_norm": 0.5890622735023499, + "learning_rate": 9.2911679635077e-06, + "loss": 0.9206, + "step": 3040 + }, + { + "epoch": 3.4021193530395983, + "grad_norm": 0.552482008934021, + "learning_rate": 9.2369532266649e-06, + "loss": 0.8879, + "step": 3050 + }, + { + "epoch": 3.4132738427216953, + "grad_norm": 0.7639123201370239, + "learning_rate": 9.182761039713112e-06, + "loss": 0.9069, + "step": 3060 + }, + { + "epoch": 3.4244283324037923, + "grad_norm": 0.6415414810180664, + "learning_rate": 9.12859300416347e-06, + "loss": 0.9366, + "step": 3070 + }, + { + "epoch": 3.4355828220858897, + "grad_norm": 0.5764932632446289, + "learning_rate": 9.07445072081339e-06, + "loss": 0.9503, + "step": 3080 + }, + { + "epoch": 3.4467373117679867, + "grad_norm": 0.5804705023765564, + "learning_rate": 9.020335789699238e-06, + "loss": 0.9067, + "step": 3090 + }, + { + "epoch": 3.4578918014500837, + "grad_norm": 0.48473402857780457, + "learning_rate": 8.966249810049057e-06, + "loss": 0.963, + "step": 3100 + }, + { + "epoch": 3.4690462911321807, + "grad_norm": 0.5405654311180115, + "learning_rate": 8.91219438023531e-06, + "loss": 0.965, + "step": 3110 + }, + { + "epoch": 3.4802007808142776, + "grad_norm": 0.5696051716804504, + "learning_rate": 8.85817109772762e-06, + "loss": 0.9686, + "step": 3120 + }, + { + "epoch": 3.4913552704963746, + "grad_norm": 0.45706358551979065, + "learning_rate": 8.804181559045609e-06, + "loss": 0.9337, + "step": 3130 + }, + { + "epoch": 3.502509760178472, + "grad_norm": 0.615371823310852, + "learning_rate": 8.750227359711652e-06, + "loss": 0.9665, + "step": 3140 + }, + { + "epoch": 3.513664249860569, + "grad_norm": 0.6421345472335815, + "learning_rate": 8.696310094203785e-06, + "loss": 0.9522, + "step": 3150 + }, + { + "epoch": 3.524818739542666, + "grad_norm": 0.5607454776763916, + "learning_rate": 8.642431355908549e-06, + "loss": 0.8975, + "step": 3160 + }, + { + "epoch": 3.535973229224763, + "grad_norm": 0.5480626225471497, + "learning_rate": 8.58859273707392e-06, + "loss": 0.9499, + "step": 3170 + }, + { + "epoch": 3.54712771890686, + "grad_norm": 0.6433860659599304, + "learning_rate": 8.53479582876223e-06, + "loss": 0.9716, + "step": 3180 + }, + { + "epoch": 3.558282208588957, + "grad_norm": 0.6692389845848083, + "learning_rate": 8.481042220803185e-06, + "loss": 0.9205, + "step": 3190 + }, + { + "epoch": 3.569436698271054, + "grad_norm": 0.529801070690155, + "learning_rate": 8.427333501746841e-06, + "loss": 0.9583, + "step": 3200 + }, + { + "epoch": 3.580591187953151, + "grad_norm": 0.47987642884254456, + "learning_rate": 8.373671258816692e-06, + "loss": 0.9565, + "step": 3210 + }, + { + "epoch": 3.5917456776352483, + "grad_norm": 0.5379712581634521, + "learning_rate": 8.32005707786274e-06, + "loss": 0.9489, + "step": 3220 + }, + { + "epoch": 3.6029001673173453, + "grad_norm": 0.5486000776290894, + "learning_rate": 8.266492543314642e-06, + "loss": 0.9547, + "step": 3230 + }, + { + "epoch": 3.6140546569994423, + "grad_norm": 0.5590048432350159, + "learning_rate": 8.212979238134883e-06, + "loss": 0.9159, + "step": 3240 + }, + { + "epoch": 3.6252091466815393, + "grad_norm": 0.5172351598739624, + "learning_rate": 8.159518743771992e-06, + "loss": 0.9404, + "step": 3250 + }, + { + "epoch": 3.6363636363636362, + "grad_norm": 0.5739352703094482, + "learning_rate": 8.10611264011382e-06, + "loss": 0.9201, + "step": 3260 + }, + { + "epoch": 3.6475181260457337, + "grad_norm": 0.6838170886039734, + "learning_rate": 8.052762505440822e-06, + "loss": 0.9567, + "step": 3270 + }, + { + "epoch": 3.6586726157278306, + "grad_norm": 0.4739402234554291, + "learning_rate": 7.999469916379452e-06, + "loss": 0.9166, + "step": 3280 + }, + { + "epoch": 3.6698271054099276, + "grad_norm": 0.5572711229324341, + "learning_rate": 7.946236447855535e-06, + "loss": 0.9301, + "step": 3290 + }, + { + "epoch": 3.6809815950920246, + "grad_norm": 0.47544723749160767, + "learning_rate": 7.893063673047758e-06, + "loss": 0.911, + "step": 3300 + }, + { + "epoch": 3.6921360847741216, + "grad_norm": 0.5537176132202148, + "learning_rate": 7.839953163341142e-06, + "loss": 0.9685, + "step": 3310 + }, + { + "epoch": 3.7032905744562186, + "grad_norm": 0.5580974221229553, + "learning_rate": 7.786906488280631e-06, + "loss": 0.9383, + "step": 3320 + }, + { + "epoch": 3.7144450641383155, + "grad_norm": 0.5065183639526367, + "learning_rate": 7.73392521552471e-06, + "loss": 0.9584, + "step": 3330 + }, + { + "epoch": 3.7255995538204125, + "grad_norm": 0.5675489902496338, + "learning_rate": 7.681010910799043e-06, + "loss": 0.9382, + "step": 3340 + }, + { + "epoch": 3.7367540435025095, + "grad_norm": 0.56146240234375, + "learning_rate": 7.628165137850252e-06, + "loss": 0.8888, + "step": 3350 + }, + { + "epoch": 3.747908533184607, + "grad_norm": 0.5241889953613281, + "learning_rate": 7.575389458399655e-06, + "loss": 0.8833, + "step": 3360 + }, + { + "epoch": 3.759063022866704, + "grad_norm": 0.6032584309577942, + "learning_rate": 7.522685432097152e-06, + "loss": 0.9508, + "step": 3370 + }, + { + "epoch": 3.770217512548801, + "grad_norm": 0.5508081912994385, + "learning_rate": 7.470054616475109e-06, + "loss": 0.9362, + "step": 3380 + }, + { + "epoch": 3.781372002230898, + "grad_norm": 0.5076966881752014, + "learning_rate": 7.417498566902344e-06, + "loss": 0.9168, + "step": 3390 + }, + { + "epoch": 3.792526491912995, + "grad_norm": 0.6123913526535034, + "learning_rate": 7.365018836538151e-06, + "loss": 0.9301, + "step": 3400 + }, + { + "epoch": 3.8036809815950923, + "grad_norm": 0.4558752477169037, + "learning_rate": 7.312616976286414e-06, + "loss": 0.9483, + "step": 3410 + }, + { + "epoch": 3.8148354712771892, + "grad_norm": 0.5639483332633972, + "learning_rate": 7.260294534749751e-06, + "loss": 0.9473, + "step": 3420 + }, + { + "epoch": 3.8259899609592862, + "grad_norm": 0.4958498477935791, + "learning_rate": 7.208053058183776e-06, + "loss": 0.9694, + "step": 3430 + }, + { + "epoch": 3.837144450641383, + "grad_norm": 0.5522082448005676, + "learning_rate": 7.15589409045139e-06, + "loss": 0.925, + "step": 3440 + }, + { + "epoch": 3.84829894032348, + "grad_norm": 0.512566328048706, + "learning_rate": 7.103819172977147e-06, + "loss": 0.8997, + "step": 3450 + }, + { + "epoch": 3.859453430005577, + "grad_norm": 0.6051592826843262, + "learning_rate": 7.0518298447017255e-06, + "loss": 0.966, + "step": 3460 + }, + { + "epoch": 3.870607919687674, + "grad_norm": 0.5349409580230713, + "learning_rate": 6.99992764203642e-06, + "loss": 0.9489, + "step": 3470 + }, + { + "epoch": 3.881762409369771, + "grad_norm": 0.568419873714447, + "learning_rate": 6.948114098817767e-06, + "loss": 0.9379, + "step": 3480 + }, + { + "epoch": 3.892916899051868, + "grad_norm": 0.5198797583580017, + "learning_rate": 6.8963907462621786e-06, + "loss": 0.9258, + "step": 3490 + }, + { + "epoch": 3.9040713887339655, + "grad_norm": 0.5569652915000916, + "learning_rate": 6.844759112920738e-06, + "loss": 0.9267, + "step": 3500 + }, + { + "epoch": 3.9152258784160625, + "grad_norm": 0.6207433938980103, + "learning_rate": 6.7932207246339845e-06, + "loss": 0.9034, + "step": 3510 + }, + { + "epoch": 3.9263803680981595, + "grad_norm": 0.8496944904327393, + "learning_rate": 6.741777104486843e-06, + "loss": 0.9285, + "step": 3520 + }, + { + "epoch": 3.9375348577802565, + "grad_norm": 0.5930597186088562, + "learning_rate": 6.690429772763616e-06, + "loss": 1.0073, + "step": 3530 + }, + { + "epoch": 3.9486893474623534, + "grad_norm": 0.5414257049560547, + "learning_rate": 6.6391802469030375e-06, + "loss": 0.9238, + "step": 3540 + }, + { + "epoch": 3.959843837144451, + "grad_norm": 0.5691171288490295, + "learning_rate": 6.58803004145345e-06, + "loss": 0.9394, + "step": 3550 + }, + { + "epoch": 3.970998326826548, + "grad_norm": 0.5480623841285706, + "learning_rate": 6.5369806680280275e-06, + "loss": 0.9349, + "step": 3560 + }, + { + "epoch": 3.982152816508645, + "grad_norm": 0.6629100441932678, + "learning_rate": 6.4860336352601225e-06, + "loss": 0.981, + "step": 3570 + }, + { + "epoch": 3.993307306190742, + "grad_norm": 0.5570813417434692, + "learning_rate": 6.435190448758657e-06, + "loss": 0.8973, + "step": 3580 + }, + { + "epoch": 4.004461795872839, + "grad_norm": 0.6893118619918823, + "learning_rate": 6.384452611063656e-06, + "loss": 0.9315, + "step": 3590 + }, + { + "epoch": 4.015616285554936, + "grad_norm": 0.5445614457130432, + "learning_rate": 6.3338216216018235e-06, + "loss": 0.9352, + "step": 3600 + }, + { + "epoch": 4.026770775237033, + "grad_norm": 0.5540854334831238, + "learning_rate": 6.283298976642242e-06, + "loss": 0.9668, + "step": 3610 + }, + { + "epoch": 4.03792526491913, + "grad_norm": 0.8352875113487244, + "learning_rate": 6.232886169252145e-06, + "loss": 0.9201, + "step": 3620 + }, + { + "epoch": 4.049079754601227, + "grad_norm": 0.5931369662284851, + "learning_rate": 6.182584689252804e-06, + "loss": 0.9525, + "step": 3630 + }, + { + "epoch": 4.060234244283324, + "grad_norm": 0.5365961790084839, + "learning_rate": 6.1323960231754955e-06, + "loss": 0.9167, + "step": 3640 + }, + { + "epoch": 4.071388733965421, + "grad_norm": 0.4611726999282837, + "learning_rate": 6.082321654217558e-06, + "loss": 0.9409, + "step": 3650 + }, + { + "epoch": 4.0825432236475185, + "grad_norm": 0.42769667506217957, + "learning_rate": 6.03236306219859e-06, + "loss": 0.9358, + "step": 3660 + }, + { + "epoch": 4.0936977133296155, + "grad_norm": 0.571824848651886, + "learning_rate": 5.982521723516683e-06, + "loss": 0.9714, + "step": 3670 + }, + { + "epoch": 4.1048522030117125, + "grad_norm": 0.5529113411903381, + "learning_rate": 5.932799111104815e-06, + "loss": 0.8918, + "step": 3680 + }, + { + "epoch": 4.1160066926938095, + "grad_norm": 0.57806795835495, + "learning_rate": 5.883196694387312e-06, + "loss": 0.9456, + "step": 3690 + }, + { + "epoch": 4.1271611823759065, + "grad_norm": 0.5577197670936584, + "learning_rate": 5.833715939236428e-06, + "loss": 0.9498, + "step": 3700 + }, + { + "epoch": 4.138315672058003, + "grad_norm": 0.7209545969963074, + "learning_rate": 5.78435830792901e-06, + "loss": 0.9183, + "step": 3710 + }, + { + "epoch": 4.1494701617401, + "grad_norm": 1.1457955837249756, + "learning_rate": 5.7351252591033065e-06, + "loss": 0.9535, + "step": 3720 + }, + { + "epoch": 4.160624651422197, + "grad_norm": 0.5285237431526184, + "learning_rate": 5.68601824771585e-06, + "loss": 0.94, + "step": 3730 + }, + { + "epoch": 4.171779141104294, + "grad_norm": 0.5437790751457214, + "learning_rate": 5.637038724998453e-06, + "loss": 0.9005, + "step": 3740 + }, + { + "epoch": 4.182933630786391, + "grad_norm": 0.5755524039268494, + "learning_rate": 5.588188138415336e-06, + "loss": 0.9191, + "step": 3750 + }, + { + "epoch": 4.194088120468488, + "grad_norm": 0.5454072952270508, + "learning_rate": 5.539467931620328e-06, + "loss": 0.9178, + "step": 3760 + }, + { + "epoch": 4.205242610150585, + "grad_norm": 0.5978593230247498, + "learning_rate": 5.490879544414238e-06, + "loss": 0.9391, + "step": 3770 + }, + { + "epoch": 4.216397099832682, + "grad_norm": 0.6521888971328735, + "learning_rate": 5.442424412702263e-06, + "loss": 0.9057, + "step": 3780 + }, + { + "epoch": 4.227551589514779, + "grad_norm": 0.5424159169197083, + "learning_rate": 5.394103968451592e-06, + "loss": 0.9175, + "step": 3790 + }, + { + "epoch": 4.238706079196877, + "grad_norm": 0.6317155361175537, + "learning_rate": 5.345919639649067e-06, + "loss": 0.9336, + "step": 3800 + }, + { + "epoch": 4.249860568878974, + "grad_norm": 0.5228768587112427, + "learning_rate": 5.297872850258987e-06, + "loss": 0.8923, + "step": 3810 + }, + { + "epoch": 4.261015058561071, + "grad_norm": 0.5018953084945679, + "learning_rate": 5.249965020181018e-06, + "loss": 0.9049, + "step": 3820 + }, + { + "epoch": 4.272169548243168, + "grad_norm": 0.5086571574211121, + "learning_rate": 5.202197565208256e-06, + "loss": 0.9354, + "step": 3830 + }, + { + "epoch": 4.283324037925265, + "grad_norm": 0.5657381415367126, + "learning_rate": 5.1545718969853546e-06, + "loss": 0.9411, + "step": 3840 + }, + { + "epoch": 4.294478527607362, + "grad_norm": 0.5901178121566772, + "learning_rate": 5.107089422966831e-06, + "loss": 0.9259, + "step": 3850 + }, + { + "epoch": 4.305633017289459, + "grad_norm": 0.5313717126846313, + "learning_rate": 5.059751546375469e-06, + "loss": 0.9341, + "step": 3860 + }, + { + "epoch": 4.316787506971556, + "grad_norm": 0.5543375015258789, + "learning_rate": 5.012559666160836e-06, + "loss": 0.9601, + "step": 3870 + }, + { + "epoch": 4.327941996653653, + "grad_norm": 0.6120020151138306, + "learning_rate": 4.965515176957966e-06, + "loss": 0.9203, + "step": 3880 + }, + { + "epoch": 4.33909648633575, + "grad_norm": 0.6483854651451111, + "learning_rate": 4.918619469046108e-06, + "loss": 0.9579, + "step": 3890 + }, + { + "epoch": 4.350250976017847, + "grad_norm": 0.5992943048477173, + "learning_rate": 4.871873928307684e-06, + "loss": 0.959, + "step": 3900 + }, + { + "epoch": 4.361405465699944, + "grad_norm": 0.547232985496521, + "learning_rate": 4.825279936187289e-06, + "loss": 0.9164, + "step": 3910 + }, + { + "epoch": 4.372559955382041, + "grad_norm": 0.5406577587127686, + "learning_rate": 4.7788388696509035e-06, + "loss": 0.9526, + "step": 3920 + }, + { + "epoch": 4.383714445064138, + "grad_norm": 0.5376970171928406, + "learning_rate": 4.732552101145179e-06, + "loss": 0.928, + "step": 3930 + }, + { + "epoch": 4.394868934746236, + "grad_norm": 0.5624706745147705, + "learning_rate": 4.686420998556885e-06, + "loss": 0.8763, + "step": 3940 + }, + { + "epoch": 4.406023424428333, + "grad_norm": 0.6041187644004822, + "learning_rate": 4.640446925172478e-06, + "loss": 0.9731, + "step": 3950 + }, + { + "epoch": 4.41717791411043, + "grad_norm": 0.5117301940917969, + "learning_rate": 4.594631239637826e-06, + "loss": 0.9687, + "step": 3960 + }, + { + "epoch": 4.428332403792527, + "grad_norm": 0.5197976231575012, + "learning_rate": 4.54897529591805e-06, + "loss": 0.9019, + "step": 3970 + }, + { + "epoch": 4.439486893474624, + "grad_norm": 0.5039201378822327, + "learning_rate": 4.503480443257508e-06, + "loss": 0.8707, + "step": 3980 + }, + { + "epoch": 4.450641383156721, + "grad_norm": 0.5956012606620789, + "learning_rate": 4.458148026139928e-06, + "loss": 0.9236, + "step": 3990 + }, + { + "epoch": 4.461795872838818, + "grad_norm": 0.47555774450302124, + "learning_rate": 4.412979384248665e-06, + "loss": 0.9178, + "step": 4000 + }, + { + "epoch": 4.472950362520915, + "grad_norm": 0.49644574522972107, + "learning_rate": 4.367975852427129e-06, + "loss": 0.9034, + "step": 4010 + }, + { + "epoch": 4.484104852203012, + "grad_norm": 0.6341298818588257, + "learning_rate": 4.32313876063931e-06, + "loss": 0.9602, + "step": 4020 + }, + { + "epoch": 4.495259341885109, + "grad_norm": 0.5583815574645996, + "learning_rate": 4.278469433930503e-06, + "loss": 0.9558, + "step": 4030 + }, + { + "epoch": 4.5064138315672055, + "grad_norm": 0.5553238391876221, + "learning_rate": 4.23396919238813e-06, + "loss": 0.934, + "step": 4040 + }, + { + "epoch": 4.5175683212493025, + "grad_norm": 0.47058847546577454, + "learning_rate": 4.1896393511027356e-06, + "loss": 0.9619, + "step": 4050 + }, + { + "epoch": 4.5287228109313995, + "grad_norm": 0.5800861120223999, + "learning_rate": 4.145481220129126e-06, + "loss": 0.9468, + "step": 4060 + }, + { + "epoch": 4.539877300613497, + "grad_norm": 0.5830948352813721, + "learning_rate": 4.101496104447637e-06, + "loss": 0.9265, + "step": 4070 + }, + { + "epoch": 4.551031790295594, + "grad_norm": 0.5482431054115295, + "learning_rate": 4.057685303925597e-06, + "loss": 0.8726, + "step": 4080 + }, + { + "epoch": 4.562186279977691, + "grad_norm": 0.5066527724266052, + "learning_rate": 4.014050113278881e-06, + "loss": 0.9462, + "step": 4090 + }, + { + "epoch": 4.573340769659788, + "grad_norm": 0.6379840970039368, + "learning_rate": 3.970591822033676e-06, + "loss": 0.946, + "step": 4100 + }, + { + "epoch": 4.584495259341885, + "grad_norm": 0.5422645807266235, + "learning_rate": 3.927311714488356e-06, + "loss": 0.9416, + "step": 4110 + }, + { + "epoch": 4.595649749023982, + "grad_norm": 0.5916184782981873, + "learning_rate": 3.884211069675539e-06, + "loss": 0.9111, + "step": 4120 + }, + { + "epoch": 4.606804238706079, + "grad_norm": 0.5947620272636414, + "learning_rate": 3.841291161324267e-06, + "loss": 0.9389, + "step": 4130 + }, + { + "epoch": 4.617958728388176, + "grad_norm": 0.8486486077308655, + "learning_rate": 3.7985532578223984e-06, + "loss": 0.9219, + "step": 4140 + }, + { + "epoch": 4.629113218070273, + "grad_norm": 0.5247789621353149, + "learning_rate": 3.755998622179087e-06, + "loss": 0.9282, + "step": 4150 + }, + { + "epoch": 4.64026770775237, + "grad_norm": 0.6193472743034363, + "learning_rate": 3.713628511987486e-06, + "loss": 0.9463, + "step": 4160 + }, + { + "epoch": 4.651422197434467, + "grad_norm": 0.5674899220466614, + "learning_rate": 3.671444179387572e-06, + "loss": 0.9793, + "step": 4170 + }, + { + "epoch": 4.662576687116564, + "grad_norm": 0.5814547538757324, + "learning_rate": 3.6294468710291365e-06, + "loss": 0.9772, + "step": 4180 + }, + { + "epoch": 4.673731176798661, + "grad_norm": 0.5413376092910767, + "learning_rate": 3.5876378280349543e-06, + "loss": 0.9171, + "step": 4190 + }, + { + "epoch": 4.684885666480758, + "grad_norm": 0.5505338311195374, + "learning_rate": 3.5460182859640914e-06, + "loss": 0.9548, + "step": 4200 + }, + { + "epoch": 4.696040156162855, + "grad_norm": 0.5410326719284058, + "learning_rate": 3.5045894747754104e-06, + "loss": 0.9124, + "step": 4210 + }, + { + "epoch": 4.707194645844953, + "grad_norm": 0.5259800553321838, + "learning_rate": 3.4633526187912004e-06, + "loss": 0.933, + "step": 4220 + }, + { + "epoch": 4.71834913552705, + "grad_norm": 0.583570122718811, + "learning_rate": 3.4223089366610174e-06, + "loss": 0.9591, + "step": 4230 + }, + { + "epoch": 4.729503625209147, + "grad_norm": 0.5068359375, + "learning_rate": 3.381459641325653e-06, + "loss": 0.9328, + "step": 4240 + }, + { + "epoch": 4.740658114891244, + "grad_norm": 0.5359495282173157, + "learning_rate": 3.3408059399813007e-06, + "loss": 0.979, + "step": 4250 + }, + { + "epoch": 4.751812604573341, + "grad_norm": 0.5905212163925171, + "learning_rate": 3.300349034043865e-06, + "loss": 0.9475, + "step": 4260 + }, + { + "epoch": 4.762967094255438, + "grad_norm": 0.529193103313446, + "learning_rate": 3.260090119113478e-06, + "loss": 0.9545, + "step": 4270 + }, + { + "epoch": 4.774121583937535, + "grad_norm": 0.6047744154930115, + "learning_rate": 3.22003038493915e-06, + "loss": 0.9565, + "step": 4280 + }, + { + "epoch": 4.785276073619632, + "grad_norm": 0.5448325276374817, + "learning_rate": 3.180171015383614e-06, + "loss": 0.9583, + "step": 4290 + }, + { + "epoch": 4.796430563301729, + "grad_norm": 0.6160260438919067, + "learning_rate": 3.1405131883883466e-06, + "loss": 0.9427, + "step": 4300 + }, + { + "epoch": 4.807585052983826, + "grad_norm": 0.5397651195526123, + "learning_rate": 3.1010580759387377e-06, + "loss": 0.934, + "step": 4310 + }, + { + "epoch": 4.818739542665923, + "grad_norm": 0.5519613027572632, + "learning_rate": 3.0618068440294847e-06, + "loss": 0.9485, + "step": 4320 + }, + { + "epoch": 4.82989403234802, + "grad_norm": 0.5627745985984802, + "learning_rate": 3.0227606526301032e-06, + "loss": 0.9403, + "step": 4330 + }, + { + "epoch": 4.841048522030118, + "grad_norm": 0.5761961340904236, + "learning_rate": 2.983920655650673e-06, + "loss": 0.9949, + "step": 4340 + }, + { + "epoch": 4.852203011712215, + "grad_norm": 0.4996737241744995, + "learning_rate": 2.9452880009077212e-06, + "loss": 0.8788, + "step": 4350 + }, + { + "epoch": 4.8633575013943116, + "grad_norm": 0.5728593468666077, + "learning_rate": 2.906863830090314e-06, + "loss": 0.9781, + "step": 4360 + }, + { + "epoch": 4.8745119910764085, + "grad_norm": 54.11509323120117, + "learning_rate": 2.8686492787262966e-06, + "loss": 0.9177, + "step": 4370 + }, + { + "epoch": 4.8856664807585055, + "grad_norm": 0.5546409487724304, + "learning_rate": 2.830645476148759e-06, + "loss": 0.9255, + "step": 4380 + }, + { + "epoch": 4.8968209704406025, + "grad_norm": 0.6528788805007935, + "learning_rate": 2.7928535454626514e-06, + "loss": 0.9241, + "step": 4390 + }, + { + "epoch": 4.9079754601226995, + "grad_norm": 0.7199203372001648, + "learning_rate": 2.755274603511585e-06, + "loss": 0.9972, + "step": 4400 + }, + { + "epoch": 4.9191299498047965, + "grad_norm": 0.5689542889595032, + "learning_rate": 2.7179097608448435e-06, + "loss": 0.9259, + "step": 4410 + }, + { + "epoch": 4.930284439486893, + "grad_norm": 0.5618848204612732, + "learning_rate": 2.680760121684551e-06, + "loss": 0.9542, + "step": 4420 + }, + { + "epoch": 4.94143892916899, + "grad_norm": 0.5599839091300964, + "learning_rate": 2.64382678389305e-06, + "loss": 0.9011, + "step": 4430 + }, + { + "epoch": 4.952593418851087, + "grad_norm": 0.5971745252609253, + "learning_rate": 2.6071108389404367e-06, + "loss": 0.9091, + "step": 4440 + }, + { + "epoch": 4.963747908533184, + "grad_norm": 0.5364637970924377, + "learning_rate": 2.570613371872336e-06, + "loss": 0.9159, + "step": 4450 + }, + { + "epoch": 4.974902398215281, + "grad_norm": 0.5795035362243652, + "learning_rate": 2.534335461277805e-06, + "loss": 0.92, + "step": 4460 + }, + { + "epoch": 4.986056887897378, + "grad_norm": 0.5911030769348145, + "learning_rate": 2.4982781792574794e-06, + "loss": 0.9499, + "step": 4470 + }, + { + "epoch": 4.997211377579475, + "grad_norm": 0.527425229549408, + "learning_rate": 2.462442591391885e-06, + "loss": 0.9368, + "step": 4480 + }, + { + "epoch": 5.008365867261573, + "grad_norm": 0.6527435183525085, + "learning_rate": 2.4268297567099395e-06, + "loss": 0.9342, + "step": 4490 + }, + { + "epoch": 5.01952035694367, + "grad_norm": 0.5617554783821106, + "learning_rate": 2.391440727657669e-06, + "loss": 0.9302, + "step": 4500 + }, + { + "epoch": 5.030674846625767, + "grad_norm": 0.5033459067344666, + "learning_rate": 2.356276550067086e-06, + "loss": 0.9211, + "step": 4510 + }, + { + "epoch": 5.041829336307864, + "grad_norm": 0.545575737953186, + "learning_rate": 2.321338263125308e-06, + "loss": 0.9204, + "step": 4520 + }, + { + "epoch": 5.052983825989961, + "grad_norm": 0.5373018383979797, + "learning_rate": 2.2866268993438214e-06, + "loss": 0.9736, + "step": 4530 + }, + { + "epoch": 5.064138315672058, + "grad_norm": 0.6192358732223511, + "learning_rate": 2.252143484527989e-06, + "loss": 0.9352, + "step": 4540 + }, + { + "epoch": 5.075292805354155, + "grad_norm": 0.514544665813446, + "learning_rate": 2.2178890377467234e-06, + "loss": 0.9198, + "step": 4550 + }, + { + "epoch": 5.086447295036252, + "grad_norm": 0.5495268702507019, + "learning_rate": 2.1838645713023767e-06, + "loss": 0.9173, + "step": 4560 + }, + { + "epoch": 5.097601784718349, + "grad_norm": 0.557684600353241, + "learning_rate": 2.150071090700815e-06, + "loss": 0.8829, + "step": 4570 + }, + { + "epoch": 5.108756274400446, + "grad_norm": 0.6613608002662659, + "learning_rate": 2.116509594621716e-06, + "loss": 0.9358, + "step": 4580 + }, + { + "epoch": 5.119910764082543, + "grad_norm": 0.537002444267273, + "learning_rate": 2.0831810748890513e-06, + "loss": 0.9361, + "step": 4590 + }, + { + "epoch": 5.13106525376464, + "grad_norm": 0.5655337572097778, + "learning_rate": 2.0500865164417614e-06, + "loss": 0.8873, + "step": 4600 + }, + { + "epoch": 5.142219743446737, + "grad_norm": 0.5133952498435974, + "learning_rate": 2.0172268973046815e-06, + "loss": 0.9147, + "step": 4610 + }, + { + "epoch": 5.153374233128835, + "grad_norm": 0.46780067682266235, + "learning_rate": 1.9846031885595984e-06, + "loss": 0.9471, + "step": 4620 + }, + { + "epoch": 5.164528722810932, + "grad_norm": 0.5493624210357666, + "learning_rate": 1.952216354316585e-06, + "loss": 0.9283, + "step": 4630 + }, + { + "epoch": 5.175683212493029, + "grad_norm": 0.53765469789505, + "learning_rate": 1.920067351685485e-06, + "loss": 0.9098, + "step": 4640 + }, + { + "epoch": 5.186837702175126, + "grad_norm": 0.6295993328094482, + "learning_rate": 1.8881571307476477e-06, + "loss": 0.9579, + "step": 4650 + }, + { + "epoch": 5.197992191857223, + "grad_norm": 0.5361995697021484, + "learning_rate": 1.8564866345278376e-06, + "loss": 0.9299, + "step": 4660 + }, + { + "epoch": 5.20914668153932, + "grad_norm": 0.5145257711410522, + "learning_rate": 1.8250567989663736e-06, + "loss": 0.8962, + "step": 4670 + }, + { + "epoch": 5.220301171221417, + "grad_norm": 0.5769006013870239, + "learning_rate": 1.7938685528914579e-06, + "loss": 0.9527, + "step": 4680 + }, + { + "epoch": 5.231455660903514, + "grad_norm": 0.5536583662033081, + "learning_rate": 1.7629228179917413e-06, + "loss": 0.9427, + "step": 4690 + }, + { + "epoch": 5.242610150585611, + "grad_norm": 0.6008259654045105, + "learning_rate": 1.7322205087890798e-06, + "loss": 0.959, + "step": 4700 + }, + { + "epoch": 5.253764640267708, + "grad_norm": 0.5460879802703857, + "learning_rate": 1.7017625326114983e-06, + "loss": 0.9517, + "step": 4710 + }, + { + "epoch": 5.264919129949805, + "grad_norm": 0.5374237298965454, + "learning_rate": 1.6715497895663945e-06, + "loss": 0.8733, + "step": 4720 + }, + { + "epoch": 5.276073619631902, + "grad_norm": 0.5077516436576843, + "learning_rate": 1.6415831725139275e-06, + "loss": 0.8602, + "step": 4730 + }, + { + "epoch": 5.287228109313999, + "grad_norm": 0.5702505707740784, + "learning_rate": 1.6118635670406346e-06, + "loss": 0.9652, + "step": 4740 + }, + { + "epoch": 5.2983825989960955, + "grad_norm": 0.5563054084777832, + "learning_rate": 1.5823918514332515e-06, + "loss": 0.9693, + "step": 4750 + }, + { + "epoch": 5.3095370886781925, + "grad_norm": 0.5390822887420654, + "learning_rate": 1.553168896652777e-06, + "loss": 0.9936, + "step": 4760 + }, + { + "epoch": 5.32069157836029, + "grad_norm": 0.6257708668708801, + "learning_rate": 1.5241955663087094e-06, + "loss": 0.9668, + "step": 4770 + }, + { + "epoch": 5.331846068042387, + "grad_norm": 0.5015426874160767, + "learning_rate": 1.4954727166335436e-06, + "loss": 0.8796, + "step": 4780 + }, + { + "epoch": 5.343000557724484, + "grad_norm": 0.5086983442306519, + "learning_rate": 1.4670011964574605e-06, + "loss": 0.9337, + "step": 4790 + }, + { + "epoch": 5.354155047406581, + "grad_norm": 0.5448817610740662, + "learning_rate": 1.4387818471832403e-06, + "loss": 0.9192, + "step": 4800 + }, + { + "epoch": 5.365309537088678, + "grad_norm": 0.65288907289505, + "learning_rate": 1.4108155027614e-06, + "loss": 0.878, + "step": 4810 + }, + { + "epoch": 5.376464026770775, + "grad_norm": 0.5269010663032532, + "learning_rate": 1.3831029896655446e-06, + "loss": 0.9213, + "step": 4820 + }, + { + "epoch": 5.387618516452872, + "grad_norm": 0.5757723450660706, + "learning_rate": 1.3556451268679483e-06, + "loss": 0.9588, + "step": 4830 + }, + { + "epoch": 5.398773006134969, + "grad_norm": 0.49955371022224426, + "learning_rate": 1.3284427258153433e-06, + "loss": 0.9057, + "step": 4840 + }, + { + "epoch": 5.409927495817066, + "grad_norm": 0.5367484092712402, + "learning_rate": 1.3014965904049492e-06, + "loss": 0.8871, + "step": 4850 + }, + { + "epoch": 5.421081985499163, + "grad_norm": 0.5186311602592468, + "learning_rate": 1.2748075169607132e-06, + "loss": 0.924, + "step": 4860 + }, + { + "epoch": 5.43223647518126, + "grad_norm": 0.4818533957004547, + "learning_rate": 1.2483762942097744e-06, + "loss": 0.9807, + "step": 4870 + }, + { + "epoch": 5.443390964863357, + "grad_norm": 0.501106858253479, + "learning_rate": 1.2222037032591484e-06, + "loss": 0.9318, + "step": 4880 + }, + { + "epoch": 5.454545454545454, + "grad_norm": 0.6269428133964539, + "learning_rate": 1.1962905175726626e-06, + "loss": 0.9632, + "step": 4890 + }, + { + "epoch": 5.465699944227552, + "grad_norm": 0.5876699090003967, + "learning_rate": 1.1706375029480755e-06, + "loss": 0.9098, + "step": 4900 + }, + { + "epoch": 5.476854433909649, + "grad_norm": 0.6998534202575684, + "learning_rate": 1.145245417494464e-06, + "loss": 0.9313, + "step": 4910 + }, + { + "epoch": 5.488008923591746, + "grad_norm": 0.5238382816314697, + "learning_rate": 1.1201150116098148e-06, + "loss": 0.9091, + "step": 4920 + }, + { + "epoch": 5.499163413273843, + "grad_norm": 0.562708854675293, + "learning_rate": 1.0952470279588378e-06, + "loss": 0.922, + "step": 4930 + }, + { + "epoch": 5.51031790295594, + "grad_norm": 0.5080922245979309, + "learning_rate": 1.0706422014510343e-06, + "loss": 0.9167, + "step": 4940 + }, + { + "epoch": 5.521472392638037, + "grad_norm": 0.524152398109436, + "learning_rate": 1.0463012592189636e-06, + "loss": 0.9885, + "step": 4950 + }, + { + "epoch": 5.532626882320134, + "grad_norm": 0.5330775380134583, + "learning_rate": 1.0222249205967693e-06, + "loss": 0.9665, + "step": 4960 + }, + { + "epoch": 5.543781372002231, + "grad_norm": 0.5838893055915833, + "learning_rate": 9.984138970989033e-07, + "loss": 0.9439, + "step": 4970 + }, + { + "epoch": 5.554935861684328, + "grad_norm": 0.5749791264533997, + "learning_rate": 9.748688923991245e-07, + "loss": 0.9415, + "step": 4980 + }, + { + "epoch": 5.566090351366425, + "grad_norm": 0.5631915926933289, + "learning_rate": 9.515906023096733e-07, + "loss": 0.9282, + "step": 4990 + }, + { + "epoch": 5.577244841048522, + "grad_norm": 0.5311354398727417, + "learning_rate": 9.285797147607356e-07, + "loss": 0.9404, + "step": 5000 + }, + { + "epoch": 5.588399330730619, + "grad_norm": 0.47747719287872314, + "learning_rate": 9.05836909780089e-07, + "loss": 0.9537, + "step": 5010 + }, + { + "epoch": 5.599553820412716, + "grad_norm": 0.484561562538147, + "learning_rate": 8.833628594730281e-07, + "loss": 0.9638, + "step": 5020 + }, + { + "epoch": 5.610708310094813, + "grad_norm": 0.5075972676277161, + "learning_rate": 8.611582280024877e-07, + "loss": 0.8905, + "step": 5030 + }, + { + "epoch": 5.62186279977691, + "grad_norm": 0.571488082408905, + "learning_rate": 8.392236715694213e-07, + "loss": 0.9218, + "step": 5040 + }, + { + "epoch": 5.633017289459008, + "grad_norm": 0.4815698266029358, + "learning_rate": 8.175598383934058e-07, + "loss": 0.9161, + "step": 5050 + }, + { + "epoch": 5.644171779141105, + "grad_norm": 0.4516790807247162, + "learning_rate": 7.961673686934857e-07, + "loss": 0.9217, + "step": 5060 + }, + { + "epoch": 5.6553262688232016, + "grad_norm": 0.582046389579773, + "learning_rate": 7.750468946692569e-07, + "loss": 0.8974, + "step": 5070 + }, + { + "epoch": 5.6664807585052985, + "grad_norm": 0.6478422284126282, + "learning_rate": 7.541990404821753e-07, + "loss": 0.9315, + "step": 5080 + }, + { + "epoch": 5.6776352481873955, + "grad_norm": 0.47425535321235657, + "learning_rate": 7.336244222371202e-07, + "loss": 0.9353, + "step": 5090 + }, + { + "epoch": 5.6887897378694925, + "grad_norm": 0.541813850402832, + "learning_rate": 7.133236479641847e-07, + "loss": 0.8781, + "step": 5100 + }, + { + "epoch": 5.6999442275515895, + "grad_norm": 0.6376804113388062, + "learning_rate": 6.932973176007019e-07, + "loss": 0.9331, + "step": 5110 + }, + { + "epoch": 5.7110987172336865, + "grad_norm": 0.5788676738739014, + "learning_rate": 6.735460229735213e-07, + "loss": 0.9374, + "step": 5120 + }, + { + "epoch": 5.722253206915783, + "grad_norm": 0.6712315678596497, + "learning_rate": 6.540703477815136e-07, + "loss": 0.9127, + "step": 5130 + }, + { + "epoch": 5.73340769659788, + "grad_norm": 0.5419293642044067, + "learning_rate": 6.348708675783266e-07, + "loss": 0.9239, + "step": 5140 + }, + { + "epoch": 5.744562186279977, + "grad_norm": 0.5727126002311707, + "learning_rate": 6.159481497553699e-07, + "loss": 0.9251, + "step": 5150 + }, + { + "epoch": 5.755716675962074, + "grad_norm": 0.5651090741157532, + "learning_rate": 5.973027535250541e-07, + "loss": 0.9339, + "step": 5160 + }, + { + "epoch": 5.766871165644172, + "grad_norm": 0.5012417435646057, + "learning_rate": 5.789352299042606e-07, + "loss": 0.9172, + "step": 5170 + }, + { + "epoch": 5.778025655326269, + "grad_norm": 0.4940890967845917, + "learning_rate": 5.608461216980587e-07, + "loss": 0.9517, + "step": 5180 + }, + { + "epoch": 5.789180145008366, + "grad_norm": 0.551123321056366, + "learning_rate": 5.43035963483659e-07, + "loss": 0.898, + "step": 5190 + }, + { + "epoch": 5.800334634690463, + "grad_norm": 0.5738024115562439, + "learning_rate": 5.255052815946271e-07, + "loss": 0.9385, + "step": 5200 + }, + { + "epoch": 5.81148912437256, + "grad_norm": 0.6367058753967285, + "learning_rate": 5.082545941053174e-07, + "loss": 0.9842, + "step": 5210 + }, + { + "epoch": 5.822643614054657, + "grad_norm": 0.5020179748535156, + "learning_rate": 4.912844108155701e-07, + "loss": 0.8957, + "step": 5220 + }, + { + "epoch": 5.833798103736754, + "grad_norm": 0.5717551112174988, + "learning_rate": 4.745952332356418e-07, + "loss": 0.929, + "step": 5230 + }, + { + "epoch": 5.844952593418851, + "grad_norm": 0.5765886902809143, + "learning_rate": 4.5818755457138876e-07, + "loss": 0.9565, + "step": 5240 + }, + { + "epoch": 5.856107083100948, + "grad_norm": 0.5317378044128418, + "learning_rate": 4.4206185970968486e-07, + "loss": 0.9791, + "step": 5250 + }, + { + "epoch": 5.867261572783045, + "grad_norm": 0.47482070326805115, + "learning_rate": 4.262186252040956e-07, + "loss": 0.9215, + "step": 5260 + }, + { + "epoch": 5.878416062465142, + "grad_norm": 0.5285121202468872, + "learning_rate": 4.1065831926079937e-07, + "loss": 0.9223, + "step": 5270 + }, + { + "epoch": 5.889570552147239, + "grad_norm": 0.5711480975151062, + "learning_rate": 3.9538140172473927e-07, + "loss": 0.9168, + "step": 5280 + }, + { + "epoch": 5.900725041829336, + "grad_norm": 0.594833254814148, + "learning_rate": 3.8038832406604775e-07, + "loss": 0.9215, + "step": 5290 + }, + { + "epoch": 5.911879531511433, + "grad_norm": 0.5133140683174133, + "learning_rate": 3.656795293666937e-07, + "loss": 0.934, + "step": 5300 + }, + { + "epoch": 5.92303402119353, + "grad_norm": 0.6132499575614929, + "learning_rate": 3.512554523073919e-07, + "loss": 0.9317, + "step": 5310 + }, + { + "epoch": 5.934188510875628, + "grad_norm": 0.6130938529968262, + "learning_rate": 3.3711651915475765e-07, + "loss": 0.9722, + "step": 5320 + }, + { + "epoch": 5.945343000557725, + "grad_norm": 0.5470441579818726, + "learning_rate": 3.2326314774870913e-07, + "loss": 0.9487, + "step": 5330 + }, + { + "epoch": 5.956497490239822, + "grad_norm": 0.6502159237861633, + "learning_rate": 3.0969574749012074e-07, + "loss": 0.9164, + "step": 5340 + }, + { + "epoch": 5.967651979921919, + "grad_norm": 0.4922711253166199, + "learning_rate": 2.964147193287215e-07, + "loss": 0.9312, + "step": 5350 + }, + { + "epoch": 5.978806469604016, + "grad_norm": 0.5844548344612122, + "learning_rate": 2.8342045575124764e-07, + "loss": 0.9549, + "step": 5360 + }, + { + "epoch": 5.989960959286113, + "grad_norm": 0.4878758192062378, + "learning_rate": 2.707133407698426e-07, + "loss": 0.9225, + "step": 5370 + }, + { + "epoch": 6.00111544896821, + "grad_norm": 0.5165712237358093, + "learning_rate": 2.5829374991070967e-07, + "loss": 0.9135, + "step": 5380 + }, + { + "epoch": 6.012269938650307, + "grad_norm": 0.5230659246444702, + "learning_rate": 2.4616205020301373e-07, + "loss": 0.9555, + "step": 5390 + }, + { + "epoch": 6.023424428332404, + "grad_norm": 0.6041128039360046, + "learning_rate": 2.3431860016803532e-07, + "loss": 0.9014, + "step": 5400 + }, + { + "epoch": 6.034578918014501, + "grad_norm": 0.525059700012207, + "learning_rate": 2.2276374980857552e-07, + "loss": 0.9216, + "step": 5410 + }, + { + "epoch": 6.045733407696598, + "grad_norm": 0.5333075523376465, + "learning_rate": 2.1149784059861233e-07, + "loss": 0.9157, + "step": 5420 + }, + { + "epoch": 6.056887897378695, + "grad_norm": 0.5331766605377197, + "learning_rate": 2.005212054732042e-07, + "loss": 0.9157, + "step": 5430 + }, + { + "epoch": 6.068042387060792, + "grad_norm": 0.6051472425460815, + "learning_rate": 1.8983416881866225e-07, + "loss": 0.952, + "step": 5440 + }, + { + "epoch": 6.079196876742889, + "grad_norm": 0.5551755428314209, + "learning_rate": 1.7943704646295356e-07, + "loss": 0.9298, + "step": 5450 + }, + { + "epoch": 6.090351366424986, + "grad_norm": 0.6593601703643799, + "learning_rate": 1.6933014566637206e-07, + "loss": 0.9707, + "step": 5460 + }, + { + "epoch": 6.101505856107083, + "grad_norm": 0.44987979531288147, + "learning_rate": 1.5951376511245676e-07, + "loss": 0.9041, + "step": 5470 + }, + { + "epoch": 6.11266034578918, + "grad_norm": 0.5908699631690979, + "learning_rate": 1.4998819489916771e-07, + "loss": 0.95, + "step": 5480 + }, + { + "epoch": 6.123814835471277, + "grad_norm": 0.5343087315559387, + "learning_rate": 1.4075371653031078e-07, + "loss": 0.9499, + "step": 5490 + }, + { + "epoch": 6.134969325153374, + "grad_norm": 0.47075924277305603, + "learning_rate": 1.3181060290721525e-07, + "loss": 0.9594, + "step": 5500 + }, + { + "epoch": 6.146123814835471, + "grad_norm": 0.5868487358093262, + "learning_rate": 1.2315911832067818e-07, + "loss": 0.8791, + "step": 5510 + }, + { + "epoch": 6.157278304517568, + "grad_norm": 0.5475588440895081, + "learning_rate": 1.1479951844314164e-07, + "loss": 0.8894, + "step": 5520 + }, + { + "epoch": 6.168432794199665, + "grad_norm": 0.4885677099227905, + "learning_rate": 1.0673205032114886e-07, + "loss": 0.9173, + "step": 5530 + }, + { + "epoch": 6.179587283881762, + "grad_norm": 0.5436893105506897, + "learning_rate": 9.895695236803559e-08, + "loss": 0.8971, + "step": 5540 + }, + { + "epoch": 6.190741773563859, + "grad_norm": 0.6072864532470703, + "learning_rate": 9.147445435688796e-08, + "loss": 0.9459, + "step": 5550 + }, + { + "epoch": 6.201896263245956, + "grad_norm": 0.7242813110351562, + "learning_rate": 8.428477741375008e-08, + "loss": 0.9318, + "step": 5560 + }, + { + "epoch": 6.213050752928053, + "grad_norm": 0.5594833493232727, + "learning_rate": 7.738813401108824e-08, + "loss": 0.9021, + "step": 5570 + }, + { + "epoch": 6.22420524261015, + "grad_norm": 0.5553674101829529, + "learning_rate": 7.078472796151925e-08, + "loss": 0.9286, + "step": 5580 + }, + { + "epoch": 6.235359732292248, + "grad_norm": 0.5133783221244812, + "learning_rate": 6.44747544117752e-08, + "loss": 0.9517, + "step": 5590 + }, + { + "epoch": 6.246514221974345, + "grad_norm": 1.0486400127410889, + "learning_rate": 5.845839983694812e-08, + "loss": 0.9505, + "step": 5600 + }, + { + "epoch": 6.257668711656442, + "grad_norm": 0.5520800352096558, + "learning_rate": 5.2735842034971065e-08, + "loss": 0.95, + "step": 5610 + }, + { + "epoch": 6.268823201338539, + "grad_norm": 0.5525377988815308, + "learning_rate": 4.730725012137005e-08, + "loss": 0.9417, + "step": 5620 + }, + { + "epoch": 6.279977691020636, + "grad_norm": 0.5942236185073853, + "learning_rate": 4.2172784524258106e-08, + "loss": 0.9356, + "step": 5630 + }, + { + "epoch": 6.291132180702733, + "grad_norm": 0.5851506590843201, + "learning_rate": 3.733259697960234e-08, + "loss": 0.9426, + "step": 5640 + }, + { + "epoch": 6.30228667038483, + "grad_norm": 0.5317173600196838, + "learning_rate": 3.278683052673648e-08, + "loss": 0.9644, + "step": 5650 + }, + { + "epoch": 6.313441160066927, + "grad_norm": 0.5416374802589417, + "learning_rate": 2.8535619504133082e-08, + "loss": 0.896, + "step": 5660 + }, + { + "epoch": 6.324595649749024, + "grad_norm": 0.5867795348167419, + "learning_rate": 2.457908954543453e-08, + "loss": 0.9476, + "step": 5670 + }, + { + "epoch": 6.335750139431121, + "grad_norm": 0.5497348308563232, + "learning_rate": 2.0917357575738206e-08, + "loss": 0.9689, + "step": 5680 + }, + { + "epoch": 6.346904629113218, + "grad_norm": 0.5706804990768433, + "learning_rate": 1.755053180814481e-08, + "loss": 0.9638, + "step": 5690 + }, + { + "epoch": 6.358059118795315, + "grad_norm": 0.5580919981002808, + "learning_rate": 1.4478711740558704e-08, + "loss": 0.933, + "step": 5700 + }, + { + "epoch": 6.369213608477412, + "grad_norm": 0.5722364783287048, + "learning_rate": 1.1701988152744703e-08, + "loss": 0.9465, + "step": 5710 + }, + { + "epoch": 6.38036809815951, + "grad_norm": 0.5530805587768555, + "learning_rate": 9.220443103650222e-09, + "loss": 0.9202, + "step": 5720 + }, + { + "epoch": 6.391522587841607, + "grad_norm": 0.6193326115608215, + "learning_rate": 7.034149928976108e-09, + "loss": 0.9017, + "step": 5730 + }, + { + "epoch": 6.402677077523704, + "grad_norm": 0.6208977699279785, + "learning_rate": 5.1431732390117004e-09, + "loss": 0.9061, + "step": 5740 + }, + { + "epoch": 6.413831567205801, + "grad_norm": 0.5320120453834534, + "learning_rate": 3.5475689167252525e-09, + "loss": 0.9409, + "step": 5750 + }, + { + "epoch": 6.424986056887898, + "grad_norm": 0.5276364088058472, + "learning_rate": 2.2473841161108068e-09, + "loss": 0.9083, + "step": 5760 + }, + { + "epoch": 6.436140546569995, + "grad_norm": 0.44757771492004395, + "learning_rate": 1.2426572607981967e-09, + "loss": 0.953, + "step": 5770 + }, + { + "epoch": 6.4472950362520915, + "grad_norm": 0.583735466003418, + "learning_rate": 5.334180429117374e-10, + "loss": 0.9369, + "step": 5780 + }, + { + "epoch": 6.4584495259341885, + "grad_norm": 0.5257657170295715, + "learning_rate": 1.196874221986999e-10, + "loss": 0.9185, + "step": 5790 + } + ], + "logging_steps": 10, + "max_steps": 5799, + "num_input_tokens_seen": 0, + "num_train_epochs": 7, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 8.275105081636946e+17, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}