{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 11.971223021582734,
  "eval_steps": 25,
  "global_step": 468,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.64,
      "grad_norm": 0.9966348377433891,
      "learning_rate": 9.999874838141888e-05,
      "loss": 0.1831,
      "step": 25
    },
    {
      "epoch": 0.64,
      "eval_loss": 0.12574630975723267,
      "eval_runtime": 110.3252,
      "eval_samples_per_second": 90.65,
      "eval_steps_per_second": 1.423,
      "step": 25
    },
    {
      "epoch": 1.28,
      "grad_norm": 0.13412292959827216,
      "learning_rate": 9.915628588978522e-05,
      "loss": 0.1239,
      "step": 50
    },
    {
      "epoch": 1.28,
      "eval_loss": 0.10444429516792297,
      "eval_runtime": 110.3167,
      "eval_samples_per_second": 90.657,
      "eval_steps_per_second": 1.423,
      "step": 50
    },
    {
      "epoch": 1.92,
      "grad_norm": 0.1395974887205158,
      "learning_rate": 9.67797005288181e-05,
      "loss": 0.108,
      "step": 75
    },
    {
      "epoch": 1.92,
      "eval_loss": 0.0995471403002739,
      "eval_runtime": 110.0262,
      "eval_samples_per_second": 90.897,
      "eval_steps_per_second": 1.427,
      "step": 75
    },
    {
      "epoch": 2.56,
      "grad_norm": 0.1280464988861011,
      "learning_rate": 9.294316336102132e-05,
      "loss": 0.0976,
      "step": 100
    },
    {
      "epoch": 2.56,
      "eval_loss": 0.09775934368371964,
      "eval_runtime": 109.8969,
      "eval_samples_per_second": 91.003,
      "eval_steps_per_second": 1.429,
      "step": 100
    },
    {
      "epoch": 3.2,
      "grad_norm": 0.09561572064129414,
      "learning_rate": 8.776640921382584e-05,
      "loss": 0.094,
      "step": 125
    },
    {
      "epoch": 3.2,
      "eval_loss": 0.08864283561706543,
      "eval_runtime": 110.0582,
      "eval_samples_per_second": 90.87,
      "eval_steps_per_second": 1.427,
      "step": 125
    },
    {
      "epoch": 3.84,
      "grad_norm": 0.11127416265410695,
      "learning_rate": 8.141099986478212e-05,
      "loss": 0.0828,
      "step": 150
    },
    {
      "epoch": 3.84,
      "eval_loss": 0.08932521939277649,
      "eval_runtime": 109.3425,
      "eval_samples_per_second": 91.465,
      "eval_steps_per_second": 1.436,
      "step": 150
    },
    {
      "epoch": 4.48,
      "grad_norm": 0.22132698379917157,
      "learning_rate": 7.407528184577019e-05,
      "loss": 0.078,
      "step": 175
    },
    {
      "epoch": 4.48,
      "eval_loss": 0.09065766632556915,
      "eval_runtime": 110.1178,
      "eval_samples_per_second": 90.821,
      "eval_steps_per_second": 1.426,
      "step": 175
    },
    {
      "epoch": 5.12,
      "grad_norm": 0.08955720132434862,
      "learning_rate": 6.598819622856227e-05,
      "loss": 0.0767,
      "step": 200
    },
    {
      "epoch": 5.12,
      "eval_loss": 0.08660481870174408,
      "eval_runtime": 109.7782,
      "eval_samples_per_second": 91.102,
      "eval_steps_per_second": 1.43,
      "step": 200
    },
    {
      "epoch": 5.76,
      "grad_norm": 0.09375419122940627,
      "learning_rate": 5.7402133582686576e-05,
      "loss": 0.0697,
      "step": 225
    },
    {
      "epoch": 5.76,
      "eval_loss": 0.08401340246200562,
      "eval_runtime": 109.3041,
      "eval_samples_per_second": 91.497,
      "eval_steps_per_second": 1.436,
      "step": 225
    },
    {
      "epoch": 6.39,
      "grad_norm": 0.0990995516325273,
      "learning_rate": 4.85850570958441e-05,
      "loss": 0.0646,
      "step": 250
    },
    {
      "epoch": 6.39,
      "eval_loss": 0.08192210644483566,
      "eval_runtime": 109.5799,
      "eval_samples_per_second": 91.267,
      "eval_steps_per_second": 1.433,
      "step": 250
    },
    {
      "epoch": 7.03,
      "grad_norm": 0.07754859448629634,
      "learning_rate": 3.9812139687108815e-05,
      "loss": 0.0594,
      "step": 275
    },
    {
      "epoch": 7.03,
      "eval_loss": 0.07945634424686432,
      "eval_runtime": 109.4058,
      "eval_samples_per_second": 91.412,
      "eval_steps_per_second": 1.435,
      "step": 275
    },
    {
      "epoch": 7.67,
      "grad_norm": 0.06681798482966637,
      "learning_rate": 3.135717611098458e-05,
      "loss": 0.052,
      "step": 300
    },
    {
      "epoch": 7.67,
      "eval_loss": 0.07952920347452164,
      "eval_runtime": 110.1672,
      "eval_samples_per_second": 90.78,
      "eval_steps_per_second": 1.425,
      "step": 300
    },
    {
      "epoch": 8.31,
      "grad_norm": 0.10690045013292092,
      "learning_rate": 2.3484038072721758e-05,
      "loss": 0.0478,
      "step": 325
    },
    {
      "epoch": 8.31,
      "eval_loss": 0.08034859597682953,
      "eval_runtime": 109.4321,
      "eval_samples_per_second": 91.39,
      "eval_steps_per_second": 1.435,
      "step": 325
    },
    {
      "epoch": 8.95,
      "grad_norm": 0.08719559737170716,
      "learning_rate": 1.6438439032954855e-05,
      "loss": 0.0447,
      "step": 350
    },
    {
      "epoch": 8.95,
      "eval_loss": 0.07858795672655106,
      "eval_runtime": 110.1518,
      "eval_samples_per_second": 90.793,
      "eval_steps_per_second": 1.425,
      "step": 350
    },
    {
      "epoch": 9.59,
      "grad_norm": 0.04932389885679807,
      "learning_rate": 1.0440265714600572e-05,
      "loss": 0.0392,
      "step": 375
    },
    {
      "epoch": 9.59,
      "eval_loss": 0.07998502999544144,
      "eval_runtime": 109.5251,
      "eval_samples_per_second": 91.312,
      "eval_steps_per_second": 1.433,
      "step": 375
    },
    {
      "epoch": 10.23,
      "grad_norm": 0.03799019571227359,
      "learning_rate": 5.676715638695063e-06,
      "loss": 0.038,
      "step": 400
    },
    {
      "epoch": 10.23,
      "eval_loss": 0.08129393309354782,
      "eval_runtime": 109.5381,
      "eval_samples_per_second": 91.302,
      "eval_steps_per_second": 1.433,
      "step": 400
    },
    {
      "epoch": 10.87,
      "grad_norm": 0.034874323591440555,
      "learning_rate": 2.2964548604209213e-06,
      "loss": 0.0357,
      "step": 425
    },
    {
      "epoch": 10.87,
      "eval_loss": 0.08102953433990479,
      "eval_runtime": 110.2721,
      "eval_samples_per_second": 90.694,
      "eval_steps_per_second": 1.424,
      "step": 425
    },
    {
      "epoch": 11.51,
      "grad_norm": 0.038884021048955025,
      "learning_rate": 4.049782370561583e-07,
      "loss": 0.035,
      "step": 450
    },
    {
      "epoch": 11.51,
      "eval_loss": 0.08157423138618469,
      "eval_runtime": 109.9602,
      "eval_samples_per_second": 90.951,
      "eval_steps_per_second": 1.428,
      "step": 450
    },
    {
      "epoch": 11.97,
      "step": 468,
      "total_flos": 1.0672624631808e+16,
      "train_loss": 0.07237311357106918,
      "train_runtime": 39649.873,
      "train_samples_per_second": 24.213,
      "train_steps_per_second": 0.012
    }
  ],
  "logging_steps": 25,
  "max_steps": 468,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 12,
  "save_steps": 2000,
  "total_flos": 1.0672624631808e+16,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}