{ "best_metric": null, "best_model_checkpoint": null, "epoch": 50.0, "eval_steps": 500, "global_step": 1700, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.14705882352941177, "grad_norm": 0.14238034188747406, "learning_rate": 2.9411764705882355e-06, "loss": 0.8446, "step": 5 }, { "epoch": 0.29411764705882354, "grad_norm": 0.14489340782165527, "learning_rate": 5.882352941176471e-06, "loss": 0.8425, "step": 10 }, { "epoch": 0.4411764705882353, "grad_norm": 0.1551039218902588, "learning_rate": 8.823529411764707e-06, "loss": 0.8918, "step": 15 }, { "epoch": 0.5882352941176471, "grad_norm": 0.15667259693145752, "learning_rate": 1.1764705882352942e-05, "loss": 0.8473, "step": 20 }, { "epoch": 0.7352941176470589, "grad_norm": 0.14174659550189972, "learning_rate": 1.4705882352941177e-05, "loss": 0.8519, "step": 25 }, { "epoch": 0.8823529411764706, "grad_norm": 0.15220147371292114, "learning_rate": 1.7647058823529414e-05, "loss": 0.8201, "step": 30 }, { "epoch": 1.0294117647058822, "grad_norm": 0.14088748395442963, "learning_rate": 2.058823529411765e-05, "loss": 0.8345, "step": 35 }, { "epoch": 1.1764705882352942, "grad_norm": 0.1412580907344818, "learning_rate": 2.3529411764705884e-05, "loss": 0.8549, "step": 40 }, { "epoch": 1.3235294117647058, "grad_norm": 0.12632407248020172, "learning_rate": 2.647058823529412e-05, "loss": 0.8431, "step": 45 }, { "epoch": 1.4705882352941178, "grad_norm": 0.11696045845746994, "learning_rate": 2.9411764705882354e-05, "loss": 0.8157, "step": 50 }, { "epoch": 1.6176470588235294, "grad_norm": 0.1272636502981186, "learning_rate": 3.235294117647059e-05, "loss": 0.8236, "step": 55 }, { "epoch": 1.7647058823529411, "grad_norm": 0.10813312977552414, "learning_rate": 3.529411764705883e-05, "loss": 0.8019, "step": 60 }, { "epoch": 1.9117647058823528, "grad_norm": 0.11896699666976929, "learning_rate": 3.8235294117647055e-05, "loss": 0.8185, "step": 65 }, { "epoch": 2.0588235294117645, "grad_norm": 0.10265181213617325, "learning_rate": 4.11764705882353e-05, "loss": 0.8044, "step": 70 }, { "epoch": 2.2058823529411766, "grad_norm": 0.10106126219034195, "learning_rate": 4.411764705882353e-05, "loss": 0.7858, "step": 75 }, { "epoch": 2.3529411764705883, "grad_norm": 0.10131911188364029, "learning_rate": 4.705882352941177e-05, "loss": 0.8084, "step": 80 }, { "epoch": 2.5, "grad_norm": 0.09996718913316727, "learning_rate": 5e-05, "loss": 0.814, "step": 85 }, { "epoch": 2.6470588235294117, "grad_norm": 0.0959576964378357, "learning_rate": 4.999893574965545e-05, "loss": 0.8105, "step": 90 }, { "epoch": 2.7941176470588234, "grad_norm": 0.11985825002193451, "learning_rate": 4.9995743099299886e-05, "loss": 0.7813, "step": 95 }, { "epoch": 2.9411764705882355, "grad_norm": 0.09690634161233902, "learning_rate": 4.9990422350958156e-05, "loss": 0.7917, "step": 100 }, { "epoch": 3.088235294117647, "grad_norm": 0.11959923803806305, "learning_rate": 4.99829740079732e-05, "loss": 0.7531, "step": 105 }, { "epoch": 3.235294117647059, "grad_norm": 0.09686623513698578, "learning_rate": 4.99733987749585e-05, "loss": 0.783, "step": 110 }, { "epoch": 3.3823529411764706, "grad_norm": 0.0896935760974884, "learning_rate": 4.996169755773138e-05, "loss": 0.7506, "step": 115 }, { "epoch": 3.5294117647058822, "grad_norm": 0.09192899614572525, "learning_rate": 4.9947871463227374e-05, "loss": 0.7597, "step": 120 }, { "epoch": 3.6764705882352944, "grad_norm": 0.09252317994832993, "learning_rate": 4.993192179939542e-05, "loss": 0.767, "step": 125 }, { "epoch": 3.8235294117647056, "grad_norm": 0.09957632422447205, "learning_rate": 4.991385007507422e-05, "loss": 0.8006, "step": 130 }, { "epoch": 3.9705882352941178, "grad_norm": 0.12217137217521667, "learning_rate": 4.989365799984943e-05, "loss": 0.7513, "step": 135 }, { "epoch": 4.117647058823529, "grad_norm": 0.09010059386491776, "learning_rate": 4.9871347483892006e-05, "loss": 0.7531, "step": 140 }, { "epoch": 4.264705882352941, "grad_norm": 0.09149183332920074, "learning_rate": 4.984692063777743e-05, "loss": 0.7518, "step": 145 }, { "epoch": 4.411764705882353, "grad_norm": 0.09295113384723663, "learning_rate": 4.9820379772286095e-05, "loss": 0.7665, "step": 150 }, { "epoch": 4.5588235294117645, "grad_norm": 0.09854214638471603, "learning_rate": 4.979172739818469e-05, "loss": 0.7739, "step": 155 }, { "epoch": 4.705882352941177, "grad_norm": 0.09157629311084747, "learning_rate": 4.9760966225988675e-05, "loss": 0.7522, "step": 160 }, { "epoch": 4.852941176470588, "grad_norm": 0.10833761096000671, "learning_rate": 4.9728099165705895e-05, "loss": 0.7605, "step": 165 }, { "epoch": 5.0, "grad_norm": 0.09924355149269104, "learning_rate": 4.9693129326561254e-05, "loss": 0.7153, "step": 170 }, { "epoch": 5.147058823529412, "grad_norm": 0.09313185513019562, "learning_rate": 4.9656060016702606e-05, "loss": 0.7494, "step": 175 }, { "epoch": 5.294117647058823, "grad_norm": 0.11171400547027588, "learning_rate": 4.961689474288779e-05, "loss": 0.733, "step": 180 }, { "epoch": 5.4411764705882355, "grad_norm": 0.09828388690948486, "learning_rate": 4.957563721015293e-05, "loss": 0.7663, "step": 185 }, { "epoch": 5.588235294117647, "grad_norm": 0.09972433745861053, "learning_rate": 4.953229132146186e-05, "loss": 0.7576, "step": 190 }, { "epoch": 5.735294117647059, "grad_norm": 0.11432339251041412, "learning_rate": 4.948686117733699e-05, "loss": 0.7379, "step": 195 }, { "epoch": 5.882352941176471, "grad_norm": 0.10343588888645172, "learning_rate": 4.9439351075471346e-05, "loss": 0.7066, "step": 200 }, { "epoch": 6.029411764705882, "grad_norm": 0.0964265912771225, "learning_rate": 4.9389765510322026e-05, "loss": 0.7322, "step": 205 }, { "epoch": 6.176470588235294, "grad_norm": 0.11457476019859314, "learning_rate": 4.9338109172685006e-05, "loss": 0.742, "step": 210 }, { "epoch": 6.323529411764706, "grad_norm": 0.10812544822692871, "learning_rate": 4.92843869492514e-05, "loss": 0.7572, "step": 215 }, { "epoch": 6.470588235294118, "grad_norm": 0.1057206243276596, "learning_rate": 4.9228603922145206e-05, "loss": 0.7342, "step": 220 }, { "epoch": 6.617647058823529, "grad_norm": 0.11412467062473297, "learning_rate": 4.917076536844248e-05, "loss": 0.7331, "step": 225 }, { "epoch": 6.764705882352941, "grad_norm": 0.11059483885765076, "learning_rate": 4.9110876759672184e-05, "loss": 0.718, "step": 230 }, { "epoch": 6.911764705882353, "grad_norm": 0.10819140076637268, "learning_rate": 4.9048943761298544e-05, "loss": 0.7153, "step": 235 }, { "epoch": 7.0588235294117645, "grad_norm": 0.11002287268638611, "learning_rate": 4.89849722321851e-05, "loss": 0.7201, "step": 240 }, { "epoch": 7.205882352941177, "grad_norm": 0.11289830505847931, "learning_rate": 4.891896822404046e-05, "loss": 0.7261, "step": 245 }, { "epoch": 7.352941176470588, "grad_norm": 0.12590822577476501, "learning_rate": 4.885093798084583e-05, "loss": 0.7329, "step": 250 }, { "epoch": 7.5, "grad_norm": 0.10964758694171906, "learning_rate": 4.878088793826428e-05, "loss": 0.7413, "step": 255 }, { "epoch": 7.647058823529412, "grad_norm": 0.10680090636014938, "learning_rate": 4.8708824723031995e-05, "loss": 0.7174, "step": 260 }, { "epoch": 7.794117647058823, "grad_norm": 0.10832036286592484, "learning_rate": 4.8634755152331355e-05, "loss": 0.7345, "step": 265 }, { "epoch": 7.9411764705882355, "grad_norm": 0.10789214819669724, "learning_rate": 4.8558686233145996e-05, "loss": 0.7213, "step": 270 }, { "epoch": 8.088235294117647, "grad_norm": 0.11145245283842087, "learning_rate": 4.8480625161598e-05, "loss": 0.7184, "step": 275 }, { "epoch": 8.235294117647058, "grad_norm": 0.12131233513355255, "learning_rate": 4.840057932226715e-05, "loss": 0.737, "step": 280 }, { "epoch": 8.382352941176471, "grad_norm": 0.11897268146276474, "learning_rate": 4.831855628749228e-05, "loss": 0.7254, "step": 285 }, { "epoch": 8.529411764705882, "grad_norm": 0.1130763441324234, "learning_rate": 4.823456381665501e-05, "loss": 0.7213, "step": 290 }, { "epoch": 8.676470588235293, "grad_norm": 0.11835259944200516, "learning_rate": 4.8148609855445624e-05, "loss": 0.7102, "step": 295 }, { "epoch": 8.823529411764707, "grad_norm": 0.12309901416301727, "learning_rate": 4.806070253511151e-05, "loss": 0.7227, "step": 300 }, { "epoch": 8.970588235294118, "grad_norm": 0.11361519992351532, "learning_rate": 4.797085017168787e-05, "loss": 0.7125, "step": 305 }, { "epoch": 9.117647058823529, "grad_norm": 0.13154913485050201, "learning_rate": 4.7879061265211e-05, "loss": 0.7293, "step": 310 }, { "epoch": 9.264705882352942, "grad_norm": 0.12245271354913712, "learning_rate": 4.778534449891428e-05, "loss": 0.7216, "step": 315 }, { "epoch": 9.411764705882353, "grad_norm": 0.10899204015731812, "learning_rate": 4.768970873840669e-05, "loss": 0.706, "step": 320 }, { "epoch": 9.558823529411764, "grad_norm": 0.1145118996500969, "learning_rate": 4.75921630308341e-05, "loss": 0.7039, "step": 325 }, { "epoch": 9.705882352941176, "grad_norm": 0.12111522257328033, "learning_rate": 4.749271660402341e-05, "loss": 0.7359, "step": 330 }, { "epoch": 9.852941176470589, "grad_norm": 0.11270228773355484, "learning_rate": 4.739137886560966e-05, "loss": 0.7006, "step": 335 }, { "epoch": 10.0, "grad_norm": 0.10989291220903397, "learning_rate": 4.7288159402146e-05, "loss": 0.7123, "step": 340 }, { "epoch": 10.147058823529411, "grad_norm": 0.11979430168867111, "learning_rate": 4.7183067978196855e-05, "loss": 0.7213, "step": 345 }, { "epoch": 10.294117647058824, "grad_norm": 0.11735141277313232, "learning_rate": 4.707611453541412e-05, "loss": 0.7061, "step": 350 }, { "epoch": 10.441176470588236, "grad_norm": 0.12181384861469269, "learning_rate": 4.696730919159677e-05, "loss": 0.6962, "step": 355 }, { "epoch": 10.588235294117647, "grad_norm": 0.11275137960910797, "learning_rate": 4.6856662239733666e-05, "loss": 0.7467, "step": 360 }, { "epoch": 10.735294117647058, "grad_norm": 0.13028523325920105, "learning_rate": 4.674418414702985e-05, "loss": 0.7047, "step": 365 }, { "epoch": 10.882352941176471, "grad_norm": 0.12034178525209427, "learning_rate": 4.662988555391632e-05, "loss": 0.7061, "step": 370 }, { "epoch": 11.029411764705882, "grad_norm": 0.11595606803894043, "learning_rate": 4.6513777273043495e-05, "loss": 0.7023, "step": 375 }, { "epoch": 11.176470588235293, "grad_norm": 0.11920719593763351, "learning_rate": 4.63958702882583e-05, "loss": 0.6886, "step": 380 }, { "epoch": 11.323529411764707, "grad_norm": 0.12535597383975983, "learning_rate": 4.6276175753565105e-05, "loss": 0.7209, "step": 385 }, { "epoch": 11.470588235294118, "grad_norm": 0.12857039272785187, "learning_rate": 4.615470499207056e-05, "loss": 0.7018, "step": 390 }, { "epoch": 11.617647058823529, "grad_norm": 0.13531994819641113, "learning_rate": 4.6031469494912416e-05, "loss": 0.7145, "step": 395 }, { "epoch": 11.764705882352942, "grad_norm": 0.10658453404903412, "learning_rate": 4.59064809201725e-05, "loss": 0.723, "step": 400 }, { "epoch": 11.911764705882353, "grad_norm": 0.12011521309614182, "learning_rate": 4.5779751091773774e-05, "loss": 0.7011, "step": 405 }, { "epoch": 12.058823529411764, "grad_norm": 0.11478458344936371, "learning_rate": 4.5651291998361926e-05, "loss": 0.7117, "step": 410 }, { "epoch": 12.205882352941176, "grad_norm": 0.1333089917898178, "learning_rate": 4.55211157921711e-05, "loss": 0.7148, "step": 415 }, { "epoch": 12.352941176470589, "grad_norm": 0.11957768350839615, "learning_rate": 4.538923478787439e-05, "loss": 0.7049, "step": 420 }, { "epoch": 12.5, "grad_norm": 0.11590797454118729, "learning_rate": 4.5255661461418854e-05, "loss": 0.6797, "step": 425 }, { "epoch": 12.647058823529411, "grad_norm": 0.12927260994911194, "learning_rate": 4.5120408448845264e-05, "loss": 0.7126, "step": 430 }, { "epoch": 12.794117647058824, "grad_norm": 0.13119827210903168, "learning_rate": 4.4983488545092753e-05, "loss": 0.7082, "step": 435 }, { "epoch": 12.941176470588236, "grad_norm": 0.1294083297252655, "learning_rate": 4.4844914702788386e-05, "loss": 0.699, "step": 440 }, { "epoch": 13.088235294117647, "grad_norm": 0.11839265376329422, "learning_rate": 4.470470003102192e-05, "loss": 0.71, "step": 445 }, { "epoch": 13.235294117647058, "grad_norm": 0.12290767580270767, "learning_rate": 4.456285779410558e-05, "loss": 0.7058, "step": 450 }, { "epoch": 13.382352941176471, "grad_norm": 0.12060663849115372, "learning_rate": 4.4419401410319334e-05, "loss": 0.6744, "step": 455 }, { "epoch": 13.529411764705882, "grad_norm": 0.12197393923997879, "learning_rate": 4.427434445064148e-05, "loss": 0.6919, "step": 460 }, { "epoch": 13.676470588235293, "grad_norm": 0.12659871578216553, "learning_rate": 4.4127700637464834e-05, "loss": 0.7102, "step": 465 }, { "epoch": 13.823529411764707, "grad_norm": 0.12616273760795593, "learning_rate": 4.3979483843298624e-05, "loss": 0.6924, "step": 470 }, { "epoch": 13.970588235294118, "grad_norm": 0.1336318999528885, "learning_rate": 4.382970808945612e-05, "loss": 0.7248, "step": 475 }, { "epoch": 14.117647058823529, "grad_norm": 0.12975798547267914, "learning_rate": 4.367838754472821e-05, "loss": 0.7266, "step": 480 }, { "epoch": 14.264705882352942, "grad_norm": 0.1267329454421997, "learning_rate": 4.3525536524043076e-05, "loss": 0.7028, "step": 485 }, { "epoch": 14.411764705882353, "grad_norm": 0.12307338416576385, "learning_rate": 4.337116948711195e-05, "loss": 0.7052, "step": 490 }, { "epoch": 14.558823529411764, "grad_norm": 0.14381247758865356, "learning_rate": 4.3215301037061244e-05, "loss": 0.6947, "step": 495 }, { "epoch": 14.705882352941176, "grad_norm": 0.11929916590452194, "learning_rate": 4.305794591905113e-05, "loss": 0.691, "step": 500 }, { "epoch": 14.852941176470589, "grad_norm": 0.12451142817735672, "learning_rate": 4.289911901888056e-05, "loss": 0.6859, "step": 505 }, { "epoch": 15.0, "grad_norm": 0.12542015314102173, "learning_rate": 4.2738835361579175e-05, "loss": 0.7139, "step": 510 }, { "epoch": 15.147058823529411, "grad_norm": 0.11630310118198395, "learning_rate": 4.257711010998586e-05, "loss": 0.705, "step": 515 }, { "epoch": 15.294117647058824, "grad_norm": 0.13719907402992249, "learning_rate": 4.241395856331437e-05, "loss": 0.7001, "step": 520 }, { "epoch": 15.441176470588236, "grad_norm": 0.13168473541736603, "learning_rate": 4.224939615570602e-05, "loss": 0.7047, "step": 525 }, { "epoch": 15.588235294117647, "grad_norm": 0.11908990889787674, "learning_rate": 4.2083438454769606e-05, "loss": 0.7086, "step": 530 }, { "epoch": 15.735294117647058, "grad_norm": 0.12613283097743988, "learning_rate": 4.1916101160108715e-05, "loss": 0.6911, "step": 535 }, { "epoch": 15.882352941176471, "grad_norm": 0.12481500208377838, "learning_rate": 4.174740010183656e-05, "loss": 0.6845, "step": 540 }, { "epoch": 16.029411764705884, "grad_norm": 0.1311793029308319, "learning_rate": 4.15773512390784e-05, "loss": 0.6976, "step": 545 }, { "epoch": 16.176470588235293, "grad_norm": 0.11820737272500992, "learning_rate": 4.140597065846188e-05, "loss": 0.7101, "step": 550 }, { "epoch": 16.323529411764707, "grad_norm": 0.12259554117918015, "learning_rate": 4.123327457259517e-05, "loss": 0.6973, "step": 555 }, { "epoch": 16.470588235294116, "grad_norm": 0.12584823369979858, "learning_rate": 4.105927931853327e-05, "loss": 0.6903, "step": 560 }, { "epoch": 16.61764705882353, "grad_norm": 0.13863661885261536, "learning_rate": 4.088400135623256e-05, "loss": 0.6726, "step": 565 }, { "epoch": 16.764705882352942, "grad_norm": 0.13399486243724823, "learning_rate": 4.070745726699363e-05, "loss": 0.6977, "step": 570 }, { "epoch": 16.91176470588235, "grad_norm": 0.1234726756811142, "learning_rate": 4.0529663751892734e-05, "loss": 0.6907, "step": 575 }, { "epoch": 17.058823529411764, "grad_norm": 0.11935710906982422, "learning_rate": 4.035063763020185e-05, "loss": 0.7128, "step": 580 }, { "epoch": 17.205882352941178, "grad_norm": 0.12929606437683105, "learning_rate": 4.017039583779756e-05, "loss": 0.7106, "step": 585 }, { "epoch": 17.352941176470587, "grad_norm": 0.1248982772231102, "learning_rate": 3.9988955425558965e-05, "loss": 0.6897, "step": 590 }, { "epoch": 17.5, "grad_norm": 0.1281740367412567, "learning_rate": 3.980633355775461e-05, "loss": 0.6871, "step": 595 }, { "epoch": 17.647058823529413, "grad_norm": 0.12564094364643097, "learning_rate": 3.962254751041877e-05, "loss": 0.7008, "step": 600 }, { "epoch": 17.794117647058822, "grad_norm": 0.1336313784122467, "learning_rate": 3.943761466971717e-05, "loss": 0.6851, "step": 605 }, { "epoch": 17.941176470588236, "grad_norm": 0.1354963481426239, "learning_rate": 3.9251552530302206e-05, "loss": 0.6951, "step": 610 }, { "epoch": 18.08823529411765, "grad_norm": 0.15230301022529602, "learning_rate": 3.906437869365795e-05, "loss": 0.693, "step": 615 }, { "epoch": 18.235294117647058, "grad_norm": 0.1296202540397644, "learning_rate": 3.887611086643508e-05, "loss": 0.6874, "step": 620 }, { "epoch": 18.38235294117647, "grad_norm": 0.13370412588119507, "learning_rate": 3.8686766858775843e-05, "loss": 0.7085, "step": 625 }, { "epoch": 18.529411764705884, "grad_norm": 0.13132105767726898, "learning_rate": 3.849636458262913e-05, "loss": 0.7037, "step": 630 }, { "epoch": 18.676470588235293, "grad_norm": 0.13370752334594727, "learning_rate": 3.830492205005612e-05, "loss": 0.679, "step": 635 }, { "epoch": 18.823529411764707, "grad_norm": 0.12619805335998535, "learning_rate": 3.811245737152624e-05, "loss": 0.6846, "step": 640 }, { "epoch": 18.970588235294116, "grad_norm": 0.13043031096458435, "learning_rate": 3.7918988754203985e-05, "loss": 0.6729, "step": 645 }, { "epoch": 19.11764705882353, "grad_norm": 0.132464200258255, "learning_rate": 3.772453450022649e-05, "loss": 0.7112, "step": 650 }, { "epoch": 19.264705882352942, "grad_norm": 0.1268807351589203, "learning_rate": 3.752911300497212e-05, "loss": 0.6804, "step": 655 }, { "epoch": 19.41176470588235, "grad_norm": 0.14288154244422913, "learning_rate": 3.73327427553203e-05, "loss": 0.6867, "step": 660 }, { "epoch": 19.558823529411764, "grad_norm": 0.14849698543548584, "learning_rate": 3.7135442327902695e-05, "loss": 0.694, "step": 665 }, { "epoch": 19.705882352941178, "grad_norm": 0.12607896327972412, "learning_rate": 3.6937230387345746e-05, "loss": 0.6873, "step": 670 }, { "epoch": 19.852941176470587, "grad_norm": 0.12860074639320374, "learning_rate": 3.673812568450513e-05, "loss": 0.6942, "step": 675 }, { "epoch": 20.0, "grad_norm": 0.12468240410089493, "learning_rate": 3.6538147054691817e-05, "loss": 0.6844, "step": 680 }, { "epoch": 20.147058823529413, "grad_norm": 0.1367003470659256, "learning_rate": 3.6337313415890315e-05, "loss": 0.7005, "step": 685 }, { "epoch": 20.294117647058822, "grad_norm": 0.13072577118873596, "learning_rate": 3.6135643766969e-05, "loss": 0.671, "step": 690 }, { "epoch": 20.441176470588236, "grad_norm": 0.1326008439064026, "learning_rate": 3.593315718588286e-05, "loss": 0.6727, "step": 695 }, { "epoch": 20.58823529411765, "grad_norm": 0.1257023960351944, "learning_rate": 3.572987282786864e-05, "loss": 0.7073, "step": 700 }, { "epoch": 20.735294117647058, "grad_norm": 0.14335250854492188, "learning_rate": 3.552580992363285e-05, "loss": 0.6821, "step": 705 }, { "epoch": 20.88235294117647, "grad_norm": 0.13271793723106384, "learning_rate": 3.5320987777532465e-05, "loss": 0.6959, "step": 710 }, { "epoch": 21.029411764705884, "grad_norm": 0.12265238165855408, "learning_rate": 3.5115425765748793e-05, "loss": 0.6767, "step": 715 }, { "epoch": 21.176470588235293, "grad_norm": 0.13558083772659302, "learning_rate": 3.4909143334454454e-05, "loss": 0.6859, "step": 720 }, { "epoch": 21.323529411764707, "grad_norm": 0.1432723104953766, "learning_rate": 3.4702159997973747e-05, "loss": 0.6921, "step": 725 }, { "epoch": 21.470588235294116, "grad_norm": 0.13662855327129364, "learning_rate": 3.449449533693664e-05, "loss": 0.7063, "step": 730 }, { "epoch": 21.61764705882353, "grad_norm": 0.1422967165708542, "learning_rate": 3.428616899642645e-05, "loss": 0.6987, "step": 735 }, { "epoch": 21.764705882352942, "grad_norm": 0.1233050599694252, "learning_rate": 3.4077200684121345e-05, "loss": 0.6831, "step": 740 }, { "epoch": 21.91176470588235, "grad_norm": 0.13728494942188263, "learning_rate": 3.3867610168430084e-05, "loss": 0.6873, "step": 745 }, { "epoch": 22.058823529411764, "grad_norm": 0.1322290301322937, "learning_rate": 3.365741727662187e-05, "loss": 0.651, "step": 750 }, { "epoch": 22.205882352941178, "grad_norm": 0.13796144723892212, "learning_rate": 3.3446641892950696e-05, "loss": 0.671, "step": 755 }, { "epoch": 22.352941176470587, "grad_norm": 0.13293515145778656, "learning_rate": 3.3235303956774324e-05, "loss": 0.7056, "step": 760 }, { "epoch": 22.5, "grad_norm": 0.13630028069019318, "learning_rate": 3.3023423460667985e-05, "loss": 0.6866, "step": 765 }, { "epoch": 22.647058823529413, "grad_norm": 0.1360238939523697, "learning_rate": 3.281102044853309e-05, "loss": 0.6991, "step": 770 }, { "epoch": 22.794117647058822, "grad_norm": 0.13794392347335815, "learning_rate": 3.2598115013701114e-05, "loss": 0.6959, "step": 775 }, { "epoch": 22.941176470588236, "grad_norm": 0.1369139850139618, "learning_rate": 3.2384727297032705e-05, "loss": 0.6657, "step": 780 }, { "epoch": 23.08823529411765, "grad_norm": 0.12718403339385986, "learning_rate": 3.217087748501237e-05, "loss": 0.6733, "step": 785 }, { "epoch": 23.235294117647058, "grad_norm": 0.13353672623634338, "learning_rate": 3.1956585807838914e-05, "loss": 0.6774, "step": 790 }, { "epoch": 23.38235294117647, "grad_norm": 0.13364404439926147, "learning_rate": 3.1741872537511535e-05, "loss": 0.6752, "step": 795 }, { "epoch": 23.529411764705884, "grad_norm": 0.14464542269706726, "learning_rate": 3.152675798591219e-05, "loss": 0.6667, "step": 800 }, { "epoch": 23.676470588235293, "grad_norm": 0.13043712079524994, "learning_rate": 3.131126250288405e-05, "loss": 0.6924, "step": 805 }, { "epoch": 23.823529411764707, "grad_norm": 0.12341820448637009, "learning_rate": 3.109540647430641e-05, "loss": 0.6969, "step": 810 }, { "epoch": 23.970588235294116, "grad_norm": 0.14009004831314087, "learning_rate": 3.087921032016619e-05, "loss": 0.6947, "step": 815 }, { "epoch": 24.11764705882353, "grad_norm": 0.13528534770011902, "learning_rate": 3.066269449262618e-05, "loss": 0.6833, "step": 820 }, { "epoch": 24.264705882352942, "grad_norm": 0.1405653953552246, "learning_rate": 3.04458794740903e-05, "loss": 0.6919, "step": 825 }, { "epoch": 24.41176470588235, "grad_norm": 0.14450417459011078, "learning_rate": 3.0228785775265943e-05, "loss": 0.7085, "step": 830 }, { "epoch": 24.558823529411764, "grad_norm": 0.1257210224866867, "learning_rate": 3.001143393322368e-05, "loss": 0.7022, "step": 835 }, { "epoch": 24.705882352941178, "grad_norm": 0.14650067687034607, "learning_rate": 2.9793844509454417e-05, "loss": 0.6559, "step": 840 }, { "epoch": 24.852941176470587, "grad_norm": 0.1673348844051361, "learning_rate": 2.9576038087924297e-05, "loss": 0.6628, "step": 845 }, { "epoch": 25.0, "grad_norm": 0.13868844509124756, "learning_rate": 2.9358035273127483e-05, "loss": 0.6761, "step": 850 }, { "epoch": 25.147058823529413, "grad_norm": 0.12067105621099472, "learning_rate": 2.9139856688136917e-05, "loss": 0.6735, "step": 855 }, { "epoch": 25.294117647058822, "grad_norm": 0.1306021362543106, "learning_rate": 2.8921522972653437e-05, "loss": 0.6711, "step": 860 }, { "epoch": 25.441176470588236, "grad_norm": 0.13525615632534027, "learning_rate": 2.8703054781053194e-05, "loss": 0.6723, "step": 865 }, { "epoch": 25.58823529411765, "grad_norm": 0.1306258738040924, "learning_rate": 2.8484472780433828e-05, "loss": 0.6922, "step": 870 }, { "epoch": 25.735294117647058, "grad_norm": 0.14182746410369873, "learning_rate": 2.8265797648659283e-05, "loss": 0.6911, "step": 875 }, { "epoch": 25.88235294117647, "grad_norm": 0.13599254190921783, "learning_rate": 2.8047050072403713e-05, "loss": 0.6891, "step": 880 }, { "epoch": 26.029411764705884, "grad_norm": 0.1291087120771408, "learning_rate": 2.7828250745194544e-05, "loss": 0.6971, "step": 885 }, { "epoch": 26.176470588235293, "grad_norm": 0.11979696899652481, "learning_rate": 2.7609420365454823e-05, "loss": 0.6921, "step": 890 }, { "epoch": 26.323529411764707, "grad_norm": 0.1369645744562149, "learning_rate": 2.7390579634545182e-05, "loss": 0.667, "step": 895 }, { "epoch": 26.470588235294116, "grad_norm": 0.1354684978723526, "learning_rate": 2.7171749254805458e-05, "loss": 0.6918, "step": 900 }, { "epoch": 26.61764705882353, "grad_norm": 0.1434841752052307, "learning_rate": 2.6952949927596295e-05, "loss": 0.6961, "step": 905 }, { "epoch": 26.764705882352942, "grad_norm": 0.13030685484409332, "learning_rate": 2.6734202351340726e-05, "loss": 0.6742, "step": 910 }, { "epoch": 26.91176470588235, "grad_norm": 0.1375734657049179, "learning_rate": 2.651552721956617e-05, "loss": 0.66, "step": 915 }, { "epoch": 27.058823529411764, "grad_norm": 0.1508912891149521, "learning_rate": 2.6296945218946804e-05, "loss": 0.6928, "step": 920 }, { "epoch": 27.205882352941178, "grad_norm": 0.13976359367370605, "learning_rate": 2.6078477027346572e-05, "loss": 0.6916, "step": 925 }, { "epoch": 27.352941176470587, "grad_norm": 0.13399522006511688, "learning_rate": 2.586014331186309e-05, "loss": 0.6617, "step": 930 }, { "epoch": 27.5, "grad_norm": 0.1378486156463623, "learning_rate": 2.5641964726872526e-05, "loss": 0.6779, "step": 935 }, { "epoch": 27.647058823529413, "grad_norm": 0.1410367488861084, "learning_rate": 2.5423961912075712e-05, "loss": 0.6951, "step": 940 }, { "epoch": 27.794117647058822, "grad_norm": 0.1448415368795395, "learning_rate": 2.5206155490545585e-05, "loss": 0.6958, "step": 945 }, { "epoch": 27.941176470588236, "grad_norm": 0.1381085067987442, "learning_rate": 2.4988566066776327e-05, "loss": 0.6629, "step": 950 }, { "epoch": 28.08823529411765, "grad_norm": 0.14611601829528809, "learning_rate": 2.4771214224734056e-05, "loss": 0.6642, "step": 955 }, { "epoch": 28.235294117647058, "grad_norm": 0.13046316802501678, "learning_rate": 2.4554120525909703e-05, "loss": 0.6554, "step": 960 }, { "epoch": 28.38235294117647, "grad_norm": 0.1373993307352066, "learning_rate": 2.4337305507373832e-05, "loss": 0.6791, "step": 965 }, { "epoch": 28.529411764705884, "grad_norm": 0.140591099858284, "learning_rate": 2.4120789679833815e-05, "loss": 0.6729, "step": 970 }, { "epoch": 28.676470588235293, "grad_norm": 0.1307932734489441, "learning_rate": 2.3904593525693593e-05, "loss": 0.6887, "step": 975 }, { "epoch": 28.823529411764707, "grad_norm": 0.13051795959472656, "learning_rate": 2.3688737497115953e-05, "loss": 0.6823, "step": 980 }, { "epoch": 28.970588235294116, "grad_norm": 0.12720821797847748, "learning_rate": 2.3473242014087814e-05, "loss": 0.7063, "step": 985 }, { "epoch": 29.11764705882353, "grad_norm": 0.137127086520195, "learning_rate": 2.3258127462488467e-05, "loss": 0.6744, "step": 990 }, { "epoch": 29.264705882352942, "grad_norm": 0.13432453572750092, "learning_rate": 2.30434141921611e-05, "loss": 0.68, "step": 995 }, { "epoch": 29.41176470588235, "grad_norm": 0.14380089938640594, "learning_rate": 2.2829122514987634e-05, "loss": 0.6808, "step": 1000 }, { "epoch": 29.558823529411764, "grad_norm": 0.12999729812145233, "learning_rate": 2.2615272702967304e-05, "loss": 0.6963, "step": 1005 }, { "epoch": 29.705882352941178, "grad_norm": 0.13407659530639648, "learning_rate": 2.2401884986298892e-05, "loss": 0.6729, "step": 1010 }, { "epoch": 29.852941176470587, "grad_norm": 0.13908743858337402, "learning_rate": 2.2188979551466916e-05, "loss": 0.6766, "step": 1015 }, { "epoch": 30.0, "grad_norm": 0.13133063912391663, "learning_rate": 2.1976576539332024e-05, "loss": 0.664, "step": 1020 }, { "epoch": 30.147058823529413, "grad_norm": 0.15663307905197144, "learning_rate": 2.1764696043225685e-05, "loss": 0.7082, "step": 1025 }, { "epoch": 30.294117647058822, "grad_norm": 0.13505025207996368, "learning_rate": 2.155335810704931e-05, "loss": 0.6463, "step": 1030 }, { "epoch": 30.441176470588236, "grad_norm": 0.1344403475522995, "learning_rate": 2.134258272337814e-05, "loss": 0.6753, "step": 1035 }, { "epoch": 30.58823529411765, "grad_norm": 0.14067409932613373, "learning_rate": 2.1132389831569915e-05, "loss": 0.6715, "step": 1040 }, { "epoch": 30.735294117647058, "grad_norm": 0.13444367051124573, "learning_rate": 2.092279931587866e-05, "loss": 0.6838, "step": 1045 }, { "epoch": 30.88235294117647, "grad_norm": 0.13275469839572906, "learning_rate": 2.0713831003573564e-05, "loss": 0.6842, "step": 1050 }, { "epoch": 31.029411764705884, "grad_norm": 0.12724100053310394, "learning_rate": 2.0505504663063364e-05, "loss": 0.6745, "step": 1055 }, { "epoch": 31.176470588235293, "grad_norm": 0.12783651053905487, "learning_rate": 2.029784000202627e-05, "loss": 0.6839, "step": 1060 }, { "epoch": 31.323529411764707, "grad_norm": 0.13505741953849792, "learning_rate": 2.0090856665545554e-05, "loss": 0.6577, "step": 1065 }, { "epoch": 31.470588235294116, "grad_norm": 0.14324721693992615, "learning_rate": 1.98845742342512e-05, "loss": 0.6786, "step": 1070 }, { "epoch": 31.61764705882353, "grad_norm": 0.14350536465644836, "learning_rate": 1.967901222246754e-05, "loss": 0.6715, "step": 1075 }, { "epoch": 31.764705882352942, "grad_norm": 0.12864898145198822, "learning_rate": 1.947419007636716e-05, "loss": 0.6901, "step": 1080 }, { "epoch": 31.91176470588235, "grad_norm": 0.13163405656814575, "learning_rate": 1.9270127172131363e-05, "loss": 0.6767, "step": 1085 }, { "epoch": 32.05882352941177, "grad_norm": 0.13823044300079346, "learning_rate": 1.906684281411715e-05, "loss": 0.6888, "step": 1090 }, { "epoch": 32.205882352941174, "grad_norm": 0.13260214030742645, "learning_rate": 1.8864356233031e-05, "loss": 0.6899, "step": 1095 }, { "epoch": 32.35294117647059, "grad_norm": 0.13542212545871735, "learning_rate": 1.866268658410969e-05, "loss": 0.6604, "step": 1100 }, { "epoch": 32.5, "grad_norm": 0.14194779098033905, "learning_rate": 1.8461852945308196e-05, "loss": 0.6538, "step": 1105 }, { "epoch": 32.64705882352941, "grad_norm": 0.13551092147827148, "learning_rate": 1.8261874315494874e-05, "loss": 0.6851, "step": 1110 }, { "epoch": 32.794117647058826, "grad_norm": 0.13539521396160126, "learning_rate": 1.806276961265425e-05, "loss": 0.6731, "step": 1115 }, { "epoch": 32.94117647058823, "grad_norm": 0.14235951006412506, "learning_rate": 1.786455767209732e-05, "loss": 0.6798, "step": 1120 }, { "epoch": 33.088235294117645, "grad_norm": 0.12894190847873688, "learning_rate": 1.7667257244679702e-05, "loss": 0.6815, "step": 1125 }, { "epoch": 33.23529411764706, "grad_norm": 0.13332705199718475, "learning_rate": 1.747088699502789e-05, "loss": 0.6709, "step": 1130 }, { "epoch": 33.38235294117647, "grad_norm": 0.13527055084705353, "learning_rate": 1.727546549977352e-05, "loss": 0.689, "step": 1135 }, { "epoch": 33.529411764705884, "grad_norm": 0.13612490892410278, "learning_rate": 1.7081011245796013e-05, "loss": 0.6744, "step": 1140 }, { "epoch": 33.6764705882353, "grad_norm": 0.13099683821201324, "learning_rate": 1.6887542628473763e-05, "loss": 0.6871, "step": 1145 }, { "epoch": 33.8235294117647, "grad_norm": 0.13698424398899078, "learning_rate": 1.6695077949943892e-05, "loss": 0.6852, "step": 1150 }, { "epoch": 33.970588235294116, "grad_norm": 0.13121846318244934, "learning_rate": 1.6503635417370882e-05, "loss": 0.6529, "step": 1155 }, { "epoch": 34.11764705882353, "grad_norm": 0.1369757056236267, "learning_rate": 1.6313233141224165e-05, "loss": 0.6855, "step": 1160 }, { "epoch": 34.26470588235294, "grad_norm": 0.13654442131519318, "learning_rate": 1.612388913356493e-05, "loss": 0.6596, "step": 1165 }, { "epoch": 34.411764705882355, "grad_norm": 0.136439248919487, "learning_rate": 1.5935621306342057e-05, "loss": 0.6843, "step": 1170 }, { "epoch": 34.55882352941177, "grad_norm": 0.1410278081893921, "learning_rate": 1.5748447469697803e-05, "loss": 0.6786, "step": 1175 }, { "epoch": 34.705882352941174, "grad_norm": 0.16095899045467377, "learning_rate": 1.556238533028283e-05, "loss": 0.6563, "step": 1180 }, { "epoch": 34.85294117647059, "grad_norm": 0.13262508809566498, "learning_rate": 1.5377452489581234e-05, "loss": 0.6888, "step": 1185 }, { "epoch": 35.0, "grad_norm": 0.13472063839435577, "learning_rate": 1.5193666442245402e-05, "loss": 0.681, "step": 1190 }, { "epoch": 35.14705882352941, "grad_norm": 0.13549183309078217, "learning_rate": 1.5011044574441036e-05, "loss": 0.6755, "step": 1195 }, { "epoch": 35.294117647058826, "grad_norm": 0.1411600261926651, "learning_rate": 1.4829604162202442e-05, "loss": 0.7007, "step": 1200 }, { "epoch": 35.44117647058823, "grad_norm": 0.14127956330776215, "learning_rate": 1.4649362369798152e-05, "loss": 0.6551, "step": 1205 }, { "epoch": 35.588235294117645, "grad_norm": 0.13209925591945648, "learning_rate": 1.4470336248107266e-05, "loss": 0.6762, "step": 1210 }, { "epoch": 35.73529411764706, "grad_norm": 0.12888824939727783, "learning_rate": 1.4292542733006372e-05, "loss": 0.6775, "step": 1215 }, { "epoch": 35.88235294117647, "grad_norm": 0.14431186020374298, "learning_rate": 1.4115998643767447e-05, "loss": 0.6654, "step": 1220 }, { "epoch": 36.029411764705884, "grad_norm": 0.12955108284950256, "learning_rate": 1.3940720681466734e-05, "loss": 0.6807, "step": 1225 }, { "epoch": 36.1764705882353, "grad_norm": 0.13727155327796936, "learning_rate": 1.3766725427404843e-05, "loss": 0.6925, "step": 1230 }, { "epoch": 36.3235294117647, "grad_norm": 0.13375459611415863, "learning_rate": 1.3594029341538128e-05, "loss": 0.6884, "step": 1235 }, { "epoch": 36.470588235294116, "grad_norm": 0.13129761815071106, "learning_rate": 1.34226487609216e-05, "loss": 0.6868, "step": 1240 }, { "epoch": 36.61764705882353, "grad_norm": 0.1358431726694107, "learning_rate": 1.3252599898163454e-05, "loss": 0.6538, "step": 1245 }, { "epoch": 36.76470588235294, "grad_norm": 0.14378570020198822, "learning_rate": 1.3083898839891284e-05, "loss": 0.6457, "step": 1250 }, { "epoch": 36.911764705882355, "grad_norm": 0.14497900009155273, "learning_rate": 1.29165615452304e-05, "loss": 0.6746, "step": 1255 }, { "epoch": 37.05882352941177, "grad_norm": 0.13402092456817627, "learning_rate": 1.275060384429398e-05, "loss": 0.6721, "step": 1260 }, { "epoch": 37.205882352941174, "grad_norm": 0.13633766770362854, "learning_rate": 1.258604143668563e-05, "loss": 0.6724, "step": 1265 }, { "epoch": 37.35294117647059, "grad_norm": 0.13490943610668182, "learning_rate": 1.2422889890014143e-05, "loss": 0.6578, "step": 1270 }, { "epoch": 37.5, "grad_norm": 0.1326485574245453, "learning_rate": 1.2261164638420832e-05, "loss": 0.6664, "step": 1275 }, { "epoch": 37.64705882352941, "grad_norm": 0.14504876732826233, "learning_rate": 1.2100880981119447e-05, "loss": 0.6856, "step": 1280 }, { "epoch": 37.794117647058826, "grad_norm": 0.13638907670974731, "learning_rate": 1.1942054080948878e-05, "loss": 0.6842, "step": 1285 }, { "epoch": 37.94117647058823, "grad_norm": 0.16528142988681793, "learning_rate": 1.1784698962938763e-05, "loss": 0.6759, "step": 1290 }, { "epoch": 38.088235294117645, "grad_norm": 0.15061551332473755, "learning_rate": 1.1628830512888057e-05, "loss": 0.6899, "step": 1295 }, { "epoch": 38.23529411764706, "grad_norm": 0.13696105778217316, "learning_rate": 1.1474463475956926e-05, "loss": 0.6624, "step": 1300 }, { "epoch": 38.38235294117647, "grad_norm": 0.12491544336080551, "learning_rate": 1.1321612455271793e-05, "loss": 0.6725, "step": 1305 }, { "epoch": 38.529411764705884, "grad_norm": 0.13985736668109894, "learning_rate": 1.117029191054389e-05, "loss": 0.6942, "step": 1310 }, { "epoch": 38.6764705882353, "grad_norm": 0.14015409350395203, "learning_rate": 1.1020516156701383e-05, "loss": 0.6759, "step": 1315 }, { "epoch": 38.8235294117647, "grad_norm": 0.14540641009807587, "learning_rate": 1.0872299362535173e-05, "loss": 0.6645, "step": 1320 }, { "epoch": 38.970588235294116, "grad_norm": 0.1425599455833435, "learning_rate": 1.0725655549358532e-05, "loss": 0.6711, "step": 1325 }, { "epoch": 39.11764705882353, "grad_norm": 0.13927870988845825, "learning_rate": 1.0580598589680664e-05, "loss": 0.6956, "step": 1330 }, { "epoch": 39.26470588235294, "grad_norm": 0.13224616646766663, "learning_rate": 1.0437142205894418e-05, "loss": 0.6868, "step": 1335 }, { "epoch": 39.411764705882355, "grad_norm": 0.13682135939598083, "learning_rate": 1.029529996897808e-05, "loss": 0.6735, "step": 1340 }, { "epoch": 39.55882352941177, "grad_norm": 0.1319390833377838, "learning_rate": 1.0155085297211618e-05, "loss": 0.6513, "step": 1345 }, { "epoch": 39.705882352941174, "grad_norm": 0.1452108919620514, "learning_rate": 1.001651145490726e-05, "loss": 0.6772, "step": 1350 }, { "epoch": 39.85294117647059, "grad_norm": 0.14989398419857025, "learning_rate": 9.87959155115474e-06, "loss": 0.6633, "step": 1355 }, { "epoch": 40.0, "grad_norm": 0.14220058917999268, "learning_rate": 9.744338538581147e-06, "loss": 0.6778, "step": 1360 }, { "epoch": 40.14705882352941, "grad_norm": 0.15334346890449524, "learning_rate": 9.610765212125607e-06, "loss": 0.6775, "step": 1365 }, { "epoch": 40.294117647058826, "grad_norm": 0.1356540322303772, "learning_rate": 9.478884207828912e-06, "loss": 0.6513, "step": 1370 }, { "epoch": 40.44117647058823, "grad_norm": 0.14519663155078888, "learning_rate": 9.34870800163808e-06, "loss": 0.6847, "step": 1375 }, { "epoch": 40.588235294117645, "grad_norm": 0.13579830527305603, "learning_rate": 9.220248908226224e-06, "loss": 0.6661, "step": 1380 }, { "epoch": 40.73529411764706, "grad_norm": 0.13304731249809265, "learning_rate": 9.09351907982751e-06, "loss": 0.6569, "step": 1385 }, { "epoch": 40.88235294117647, "grad_norm": 0.14108242094516754, "learning_rate": 8.968530505087582e-06, "loss": 0.6894, "step": 1390 }, { "epoch": 41.029411764705884, "grad_norm": 0.13457804918289185, "learning_rate": 8.845295007929446e-06, "loss": 0.6814, "step": 1395 }, { "epoch": 41.1764705882353, "grad_norm": 0.13951298594474792, "learning_rate": 8.7238242464349e-06, "loss": 0.6721, "step": 1400 }, { "epoch": 41.3235294117647, "grad_norm": 0.14416338503360748, "learning_rate": 8.604129711741706e-06, "loss": 0.6881, "step": 1405 }, { "epoch": 41.470588235294116, "grad_norm": 0.13295041024684906, "learning_rate": 8.486222726956508e-06, "loss": 0.6624, "step": 1410 }, { "epoch": 41.61764705882353, "grad_norm": 0.1342659443616867, "learning_rate": 8.370114446083686e-06, "loss": 0.6956, "step": 1415 }, { "epoch": 41.76470588235294, "grad_norm": 0.13162069022655487, "learning_rate": 8.255815852970153e-06, "loss": 0.6646, "step": 1420 }, { "epoch": 41.911764705882355, "grad_norm": 0.12931868433952332, "learning_rate": 8.143337760266331e-06, "loss": 0.6618, "step": 1425 }, { "epoch": 42.05882352941177, "grad_norm": 0.13857227563858032, "learning_rate": 8.032690808403232e-06, "loss": 0.6672, "step": 1430 }, { "epoch": 42.205882352941174, "grad_norm": 0.13812746107578278, "learning_rate": 7.923885464585884e-06, "loss": 0.6866, "step": 1435 }, { "epoch": 42.35294117647059, "grad_norm": 0.1503993570804596, "learning_rate": 7.816932021803154e-06, "loss": 0.6885, "step": 1440 }, { "epoch": 42.5, "grad_norm": 0.13175919651985168, "learning_rate": 7.711840597853998e-06, "loss": 0.6686, "step": 1445 }, { "epoch": 42.64705882352941, "grad_norm": 0.13319700956344604, "learning_rate": 7.608621134390344e-06, "loss": 0.6561, "step": 1450 }, { "epoch": 42.794117647058826, "grad_norm": 0.1399184763431549, "learning_rate": 7.507283395976592e-06, "loss": 0.6537, "step": 1455 }, { "epoch": 42.94117647058823, "grad_norm": 0.13498006761074066, "learning_rate": 7.407836969165911e-06, "loss": 0.6886, "step": 1460 }, { "epoch": 43.088235294117645, "grad_norm": 0.1388946920633316, "learning_rate": 7.310291261593308e-06, "loss": 0.6797, "step": 1465 }, { "epoch": 43.23529411764706, "grad_norm": 0.13527587056159973, "learning_rate": 7.2146555010857155e-06, "loss": 0.6813, "step": 1470 }, { "epoch": 43.38235294117647, "grad_norm": 0.13819634914398193, "learning_rate": 7.120938734789012e-06, "loss": 0.6752, "step": 1475 }, { "epoch": 43.529411764705884, "grad_norm": 0.12921284139156342, "learning_rate": 7.029149828312145e-06, "loss": 0.6761, "step": 1480 }, { "epoch": 43.6764705882353, "grad_norm": 0.13316689431667328, "learning_rate": 6.93929746488849e-06, "loss": 0.6632, "step": 1485 }, { "epoch": 43.8235294117647, "grad_norm": 0.13092860579490662, "learning_rate": 6.851390144554372e-06, "loss": 0.6705, "step": 1490 }, { "epoch": 43.970588235294116, "grad_norm": 0.13236363232135773, "learning_rate": 6.765436183344996e-06, "loss": 0.6602, "step": 1495 }, { "epoch": 44.11764705882353, "grad_norm": 0.14111852645874023, "learning_rate": 6.6814437125077135e-06, "loss": 0.6554, "step": 1500 }, { "epoch": 44.26470588235294, "grad_norm": 0.13777071237564087, "learning_rate": 6.599420677732848e-06, "loss": 0.6783, "step": 1505 }, { "epoch": 44.411764705882355, "grad_norm": 0.15148292481899261, "learning_rate": 6.519374838401997e-06, "loss": 0.6818, "step": 1510 }, { "epoch": 44.55882352941177, "grad_norm": 0.14915376901626587, "learning_rate": 6.44131376685401e-06, "loss": 0.6758, "step": 1515 }, { "epoch": 44.705882352941174, "grad_norm": 0.13784313201904297, "learning_rate": 6.36524484766865e-06, "loss": 0.6652, "step": 1520 }, { "epoch": 44.85294117647059, "grad_norm": 0.13148203492164612, "learning_rate": 6.291175276968002e-06, "loss": 0.6758, "step": 1525 }, { "epoch": 45.0, "grad_norm": 0.14285942912101746, "learning_rate": 6.219112061735721e-06, "loss": 0.6716, "step": 1530 }, { "epoch": 45.14705882352941, "grad_norm": 0.14051543176174164, "learning_rate": 6.149062019154174e-06, "loss": 0.6833, "step": 1535 }, { "epoch": 45.294117647058826, "grad_norm": 0.12927637994289398, "learning_rate": 6.081031775959542e-06, "loss": 0.648, "step": 1540 }, { "epoch": 45.44117647058823, "grad_norm": 0.1433332860469818, "learning_rate": 6.0150277678149055e-06, "loss": 0.6377, "step": 1545 }, { "epoch": 45.588235294117645, "grad_norm": 0.1298450231552124, "learning_rate": 5.951056238701456e-06, "loss": 0.6866, "step": 1550 }, { "epoch": 45.73529411764706, "grad_norm": 0.13296933472156525, "learning_rate": 5.889123240327819e-06, "loss": 0.6747, "step": 1555 }, { "epoch": 45.88235294117647, "grad_norm": 0.1386398822069168, "learning_rate": 5.829234631557524e-06, "loss": 0.6827, "step": 1560 }, { "epoch": 46.029411764705884, "grad_norm": 0.12945467233657837, "learning_rate": 5.771396077854802e-06, "loss": 0.6823, "step": 1565 }, { "epoch": 46.1764705882353, "grad_norm": 0.1447058618068695, "learning_rate": 5.715613050748604e-06, "loss": 0.6542, "step": 1570 }, { "epoch": 46.3235294117647, "grad_norm": 0.13116587698459625, "learning_rate": 5.661890827315004e-06, "loss": 0.664, "step": 1575 }, { "epoch": 46.470588235294116, "grad_norm": 0.13510237634181976, "learning_rate": 5.61023448967798e-06, "loss": 0.6698, "step": 1580 }, { "epoch": 46.61764705882353, "grad_norm": 0.13816939294338226, "learning_rate": 5.560648924528657e-06, "loss": 0.7097, "step": 1585 }, { "epoch": 46.76470588235294, "grad_norm": 0.1419830620288849, "learning_rate": 5.513138822663016e-06, "loss": 0.6905, "step": 1590 }, { "epoch": 46.911764705882355, "grad_norm": 0.13188646733760834, "learning_rate": 5.467708678538148e-06, "loss": 0.6457, "step": 1595 }, { "epoch": 47.05882352941177, "grad_norm": 0.1413869559764862, "learning_rate": 5.424362789847082e-06, "loss": 0.6766, "step": 1600 }, { "epoch": 47.205882352941174, "grad_norm": 0.13414201140403748, "learning_rate": 5.38310525711221e-06, "loss": 0.6795, "step": 1605 }, { "epoch": 47.35294117647059, "grad_norm": 0.1363234668970108, "learning_rate": 5.343939983297398e-06, "loss": 0.6713, "step": 1610 }, { "epoch": 47.5, "grad_norm": 0.1344790905714035, "learning_rate": 5.3068706734387484e-06, "loss": 0.6584, "step": 1615 }, { "epoch": 47.64705882352941, "grad_norm": 0.1452033668756485, "learning_rate": 5.271900834294105e-06, "loss": 0.667, "step": 1620 }, { "epoch": 47.794117647058826, "grad_norm": 0.13405530154705048, "learning_rate": 5.239033774011322e-06, "loss": 0.669, "step": 1625 }, { "epoch": 47.94117647058823, "grad_norm": 0.13740584254264832, "learning_rate": 5.208272601815313e-06, "loss": 0.6836, "step": 1630 }, { "epoch": 48.088235294117645, "grad_norm": 0.12757079303264618, "learning_rate": 5.1796202277139075e-06, "loss": 0.6909, "step": 1635 }, { "epoch": 48.23529411764706, "grad_norm": 0.12381689995527267, "learning_rate": 5.1530793622225725e-06, "loss": 0.6605, "step": 1640 }, { "epoch": 48.38235294117647, "grad_norm": 0.13950037956237793, "learning_rate": 5.128652516107996e-06, "loss": 0.6814, "step": 1645 }, { "epoch": 48.529411764705884, "grad_norm": 0.13073165714740753, "learning_rate": 5.10634200015057e-06, "loss": 0.6866, "step": 1650 }, { "epoch": 48.6764705882353, "grad_norm": 0.1424126774072647, "learning_rate": 5.086149924925788e-06, "loss": 0.6697, "step": 1655 }, { "epoch": 48.8235294117647, "grad_norm": 0.15078318119049072, "learning_rate": 5.068078200604584e-06, "loss": 0.6615, "step": 1660 }, { "epoch": 48.970588235294116, "grad_norm": 0.1373935043811798, "learning_rate": 5.052128536772629e-06, "loss": 0.6665, "step": 1665 }, { "epoch": 49.11764705882353, "grad_norm": 0.1401982456445694, "learning_rate": 5.038302442268617e-06, "loss": 0.6597, "step": 1670 }, { "epoch": 49.26470588235294, "grad_norm": 0.1401190608739853, "learning_rate": 5.026601225041503e-06, "loss": 0.6929, "step": 1675 }, { "epoch": 49.411764705882355, "grad_norm": 0.1368054449558258, "learning_rate": 5.0170259920268025e-06, "loss": 0.6923, "step": 1680 }, { "epoch": 49.55882352941177, "grad_norm": 0.13621118664741516, "learning_rate": 5.009577649041847e-06, "loss": 0.6574, "step": 1685 }, { "epoch": 49.705882352941174, "grad_norm": 0.13294735550880432, "learning_rate": 5.004256900700115e-06, "loss": 0.6646, "step": 1690 }, { "epoch": 49.85294117647059, "grad_norm": 0.14144855737686157, "learning_rate": 5.001064250344557e-06, "loss": 0.666, "step": 1695 }, { "epoch": 50.0, "grad_norm": 0.13993091881275177, "learning_rate": 5e-06, "loss": 0.6593, "step": 1700 }, { "epoch": 50.0, "step": 1700, "total_flos": 2.629944131882844e+18, "train_loss": 0.6986723100437837, "train_runtime": 41334.8641, "train_samples_per_second": 0.481, "train_steps_per_second": 0.041 } ], "logging_steps": 5, "max_steps": 1700, "num_input_tokens_seen": 0, "num_train_epochs": 50, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.629944131882844e+18, "train_batch_size": 6, "trial_name": null, "trial_params": null }