{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 200, "global_step": 878, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.04555808656036447, "grad_norm": 14.260233879089355, "learning_rate": 5.000000000000001e-07, "loss": 0.6404, "step": 20 }, { "epoch": 0.09111617312072894, "grad_norm": 13.856605529785156, "learning_rate": 1.0000000000000002e-06, "loss": 0.6072, "step": 40 }, { "epoch": 0.1366742596810934, "grad_norm": 10.69395637512207, "learning_rate": 1.5e-06, "loss": 0.4863, "step": 60 }, { "epoch": 0.18223234624145787, "grad_norm": 6.665396213531494, "learning_rate": 2.0000000000000003e-06, "loss": 0.3266, "step": 80 }, { "epoch": 0.22779043280182232, "grad_norm": 2.674818754196167, "learning_rate": 2.5e-06, "loss": 0.1972, "step": 100 }, { "epoch": 0.2733485193621868, "grad_norm": 0.46903184056282043, "learning_rate": 3e-06, "loss": 0.0999, "step": 120 }, { "epoch": 0.31890660592255127, "grad_norm": 0.6415174603462219, "learning_rate": 3.5e-06, "loss": 0.0698, "step": 140 }, { "epoch": 0.36446469248291574, "grad_norm": 0.3701508343219757, "learning_rate": 4.000000000000001e-06, "loss": 0.0656, "step": 160 }, { "epoch": 0.41002277904328016, "grad_norm": 0.4025450348854065, "learning_rate": 4.5e-06, "loss": 0.0574, "step": 180 }, { "epoch": 0.45558086560364464, "grad_norm": 0.5695033073425293, "learning_rate": 5e-06, "loss": 0.0562, "step": 200 }, { "epoch": 0.45558086560364464, "eval_accuracy": 0.8391241361293846, "eval_f1": 0.8391241361293846, "eval_f1_marco": 0.8249262659790968, "eval_loss": 0.056919749826192856, "eval_negative_f1": 0.8747828015823136, "eval_positive_f1": 0.7750697303758799, "eval_precision": 0.8391241361293846, "eval_recall": 0.8391241361293846, "eval_runtime": 9.3166, "eval_samples_per_second": 79.106, "eval_steps_per_second": 1.288, "step": 200 }, { "epoch": 0.5011389521640092, "grad_norm": 0.527057945728302, "learning_rate": 4.852507374631269e-06, "loss": 0.0622, "step": 220 }, { "epoch": 0.5466970387243736, "grad_norm": 0.47804608941078186, "learning_rate": 4.705014749262537e-06, "loss": 0.0527, "step": 240 }, { "epoch": 0.592255125284738, "grad_norm": 0.3202660381793976, "learning_rate": 4.557522123893805e-06, "loss": 0.0555, "step": 260 }, { "epoch": 0.6378132118451025, "grad_norm": 0.40933147072792053, "learning_rate": 4.410029498525074e-06, "loss": 0.0563, "step": 280 }, { "epoch": 0.683371298405467, "grad_norm": 0.4464198648929596, "learning_rate": 4.2625368731563425e-06, "loss": 0.054, "step": 300 }, { "epoch": 0.7289293849658315, "grad_norm": 0.5946183204650879, "learning_rate": 4.115044247787611e-06, "loss": 0.0543, "step": 320 }, { "epoch": 0.7744874715261959, "grad_norm": 0.8823751211166382, "learning_rate": 3.967551622418879e-06, "loss": 0.0552, "step": 340 }, { "epoch": 0.8200455580865603, "grad_norm": 0.29086050391197205, "learning_rate": 3.820058997050148e-06, "loss": 0.0556, "step": 360 }, { "epoch": 0.8656036446469249, "grad_norm": 0.36109957098960876, "learning_rate": 3.6725663716814163e-06, "loss": 0.0547, "step": 380 }, { "epoch": 0.9111617312072893, "grad_norm": 0.357105553150177, "learning_rate": 3.5250737463126845e-06, "loss": 0.054, "step": 400 }, { "epoch": 0.9111617312072893, "eval_accuracy": 0.857600873963949, "eval_f1": 0.857600873963949, "eval_f1_marco": 0.8454311469742184, "eval_loss": 0.05031890422105789, "eval_negative_f1": 0.8888023441267016, "eval_positive_f1": 0.8020599498217351, "eval_precision": 0.857600873963949, "eval_recall": 0.857600873963949, "eval_runtime": 9.081, "eval_samples_per_second": 81.158, "eval_steps_per_second": 1.321, "step": 400 }, { "epoch": 0.9567198177676538, "grad_norm": 0.3978097438812256, "learning_rate": 3.3775811209439528e-06, "loss": 0.0513, "step": 420 }, { "epoch": 1.0022779043280183, "grad_norm": 0.327373206615448, "learning_rate": 3.2300884955752214e-06, "loss": 0.0527, "step": 440 }, { "epoch": 1.0478359908883828, "grad_norm": 0.39979490637779236, "learning_rate": 3.08259587020649e-06, "loss": 0.0474, "step": 460 }, { "epoch": 1.0933940774487472, "grad_norm": 0.37922340631484985, "learning_rate": 2.935103244837758e-06, "loss": 0.0501, "step": 480 }, { "epoch": 1.1389521640091116, "grad_norm": 0.4099065363407135, "learning_rate": 2.7876106194690266e-06, "loss": 0.0461, "step": 500 }, { "epoch": 1.184510250569476, "grad_norm": 0.3328123390674591, "learning_rate": 2.6401179941002952e-06, "loss": 0.048, "step": 520 }, { "epoch": 1.2300683371298406, "grad_norm": 0.647693932056427, "learning_rate": 2.4926253687315635e-06, "loss": 0.0495, "step": 540 }, { "epoch": 1.275626423690205, "grad_norm": 0.6742229461669922, "learning_rate": 2.345132743362832e-06, "loss": 0.0496, "step": 560 }, { "epoch": 1.3211845102505695, "grad_norm": 0.2932397425174713, "learning_rate": 2.1976401179941004e-06, "loss": 0.0479, "step": 580 }, { "epoch": 1.366742596810934, "grad_norm": 0.32655268907546997, "learning_rate": 2.050147492625369e-06, "loss": 0.0472, "step": 600 }, { "epoch": 1.366742596810934, "eval_accuracy": 0.8680504429192296, "eval_f1": 0.8680504429192296, "eval_f1_marco": 0.853667569896885, "eval_loss": 0.04778573289513588, "eval_negative_f1": 0.8995443697114341, "eval_positive_f1": 0.8077907700823358, "eval_precision": 0.8680504429192296, "eval_recall": 0.8680504429192296, "eval_runtime": 9.6373, "eval_samples_per_second": 76.473, "eval_steps_per_second": 1.245, "step": 600 }, { "epoch": 1.4123006833712983, "grad_norm": 0.3308158814907074, "learning_rate": 1.9026548672566373e-06, "loss": 0.0455, "step": 620 }, { "epoch": 1.4578587699316627, "grad_norm": 0.4326237738132477, "learning_rate": 1.7551622418879058e-06, "loss": 0.0476, "step": 640 }, { "epoch": 1.5034168564920274, "grad_norm": 0.9873289465904236, "learning_rate": 1.607669616519174e-06, "loss": 0.0469, "step": 660 }, { "epoch": 1.5489749430523918, "grad_norm": 0.4288870096206665, "learning_rate": 1.4601769911504427e-06, "loss": 0.0459, "step": 680 }, { "epoch": 1.5945330296127562, "grad_norm": 0.4720146358013153, "learning_rate": 1.312684365781711e-06, "loss": 0.0494, "step": 700 }, { "epoch": 1.6400911161731209, "grad_norm": 0.6934795379638672, "learning_rate": 1.1651917404129796e-06, "loss": 0.05, "step": 720 }, { "epoch": 1.6856492027334853, "grad_norm": 0.3552420735359192, "learning_rate": 1.017699115044248e-06, "loss": 0.0499, "step": 740 }, { "epoch": 1.7312072892938497, "grad_norm": 0.32909882068634033, "learning_rate": 8.702064896755164e-07, "loss": 0.0485, "step": 760 }, { "epoch": 1.7767653758542141, "grad_norm": 0.2789745628833771, "learning_rate": 7.227138643067848e-07, "loss": 0.0468, "step": 780 }, { "epoch": 1.8223234624145785, "grad_norm": 0.3232771158218384, "learning_rate": 5.752212389380532e-07, "loss": 0.0481, "step": 800 }, { "epoch": 1.8223234624145785, "eval_accuracy": 0.8677417056546417, "eval_f1": 0.8677417056546417, "eval_f1_marco": 0.8536354633333805, "eval_loss": 0.04745788872241974, "eval_negative_f1": 0.8990739230504359, "eval_positive_f1": 0.8081970036163251, "eval_precision": 0.8677417056546417, "eval_recall": 0.8677417056546417, "eval_runtime": 8.8278, "eval_samples_per_second": 83.487, "eval_steps_per_second": 1.359, "step": 800 }, { "epoch": 1.867881548974943, "grad_norm": 0.4403178095817566, "learning_rate": 4.277286135693216e-07, "loss": 0.0502, "step": 820 }, { "epoch": 1.9134396355353074, "grad_norm": 0.4974011182785034, "learning_rate": 2.8023598820059e-07, "loss": 0.0453, "step": 840 }, { "epoch": 1.958997722095672, "grad_norm": 0.3315702974796295, "learning_rate": 1.327433628318584e-07, "loss": 0.0494, "step": 860 } ], "logging_steps": 20, "max_steps": 878, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, "total_flos": 1.952805421392077e+16, "train_batch_size": 32, "trial_name": null, "trial_params": null }