{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 471, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.06369426751592357, "grad_norm": 6.031618595123291, "learning_rate": 1.8750000000000002e-05, "loss": 3.2979, "step": 10 }, { "epoch": 0.12738853503184713, "grad_norm": 5.124261379241943, "learning_rate": 3.958333333333333e-05, "loss": 2.8325, "step": 20 }, { "epoch": 0.1910828025477707, "grad_norm": 2.4147489070892334, "learning_rate": 6.041666666666667e-05, "loss": 1.716, "step": 30 }, { "epoch": 0.25477707006369427, "grad_norm": 1.0314959287643433, "learning_rate": 8.125000000000001e-05, "loss": 1.2752, "step": 40 }, { "epoch": 0.3184713375796178, "grad_norm": 0.5906504988670349, "learning_rate": 9.999862102299873e-05, "loss": 1.207, "step": 50 }, { "epoch": 0.3821656050955414, "grad_norm": 0.43713438510894775, "learning_rate": 9.983323579940351e-05, "loss": 1.1241, "step": 60 }, { "epoch": 0.445859872611465, "grad_norm": 0.40896832942962646, "learning_rate": 9.939310009499348e-05, "loss": 1.0958, "step": 70 }, { "epoch": 0.5095541401273885, "grad_norm": 0.31001320481300354, "learning_rate": 9.868064055324204e-05, "loss": 1.059, "step": 80 }, { "epoch": 0.5732484076433121, "grad_norm": 0.3313826024532318, "learning_rate": 9.769978524742229e-05, "loss": 1.0795, "step": 90 }, { "epoch": 0.6369426751592356, "grad_norm": 0.3064286708831787, "learning_rate": 9.645594202357439e-05, "loss": 1.0693, "step": 100 }, { "epoch": 0.7006369426751592, "grad_norm": 0.3159961402416229, "learning_rate": 9.495596868489588e-05, "loss": 1.0465, "step": 110 }, { "epoch": 0.7643312101910829, "grad_norm": 0.39494845271110535, "learning_rate": 9.320813518194084e-05, "loss": 1.0324, "step": 120 }, { "epoch": 0.8280254777070064, "grad_norm": 0.44708824157714844, "learning_rate": 9.122207801708802e-05, "loss": 1.0234, "step": 130 }, { "epoch": 0.89171974522293, "grad_norm": 0.41689348220825195, "learning_rate": 8.900874711466435e-05, "loss": 1.0376, "step": 140 }, { "epoch": 0.9554140127388535, "grad_norm": 0.44199317693710327, "learning_rate": 8.658034544965003e-05, "loss": 0.9996, "step": 150 }, { "epoch": 1.019108280254777, "grad_norm": 0.4607829451560974, "learning_rate": 8.395026176781627e-05, "loss": 0.9987, "step": 160 }, { "epoch": 1.0828025477707006, "grad_norm": 0.47777435183525085, "learning_rate": 8.113299676823614e-05, "loss": 0.9745, "step": 170 }, { "epoch": 1.1464968152866242, "grad_norm": 0.48650094866752625, "learning_rate": 7.814408315515418e-05, "loss": 1.0014, "step": 180 }, { "epoch": 1.2101910828025477, "grad_norm": 0.42078331112861633, "learning_rate": 7.500000000000001e-05, "loss": 1.0072, "step": 190 }, { "epoch": 1.2738853503184713, "grad_norm": 0.5042803883552551, "learning_rate": 7.171808188570291e-05, "loss": 1.0023, "step": 200 }, { "epoch": 1.3375796178343948, "grad_norm": 0.4602188766002655, "learning_rate": 6.831642333423067e-05, "loss": 0.9621, "step": 210 }, { "epoch": 1.4012738853503186, "grad_norm": 0.4462142586708069, "learning_rate": 6.481377904428171e-05, "loss": 0.9819, "step": 220 }, { "epoch": 1.4649681528662422, "grad_norm": 0.4639708697795868, "learning_rate": 6.122946048915991e-05, "loss": 0.9851, "step": 230 }, { "epoch": 1.5286624203821657, "grad_norm": 0.4281213581562042, "learning_rate": 5.75832294449293e-05, "loss": 0.9506, "step": 240 }, { "epoch": 1.5923566878980893, "grad_norm": 0.4628894627094269, "learning_rate": 5.389518903587017e-05, "loss": 0.9534, "step": 250 }, { "epoch": 1.6560509554140128, "grad_norm": 0.42661815881729126, "learning_rate": 5.018567289794651e-05, "loss": 0.9577, "step": 260 }, { "epoch": 1.7197452229299364, "grad_norm": 0.5284398198127747, "learning_rate": 4.647513307137076e-05, "loss": 0.9683, "step": 270 }, { "epoch": 1.78343949044586, "grad_norm": 0.4475821852684021, "learning_rate": 4.278402724035867e-05, "loss": 0.9667, "step": 280 }, { "epoch": 1.8471337579617835, "grad_norm": 0.44738808274269104, "learning_rate": 3.913270594176664e-05, "loss": 0.9544, "step": 290 }, { "epoch": 1.910828025477707, "grad_norm": 0.4267480969429016, "learning_rate": 3.554130036447506e-05, "loss": 0.9899, "step": 300 }, { "epoch": 1.9745222929936306, "grad_norm": 0.47479352355003357, "learning_rate": 3.202961135812437e-05, "loss": 0.9587, "step": 310 }, { "epoch": 2.038216560509554, "grad_norm": 0.41422757506370544, "learning_rate": 2.8617000263143078e-05, "loss": 0.9553, "step": 320 }, { "epoch": 2.1019108280254777, "grad_norm": 0.53452068567276, "learning_rate": 2.5322282163965095e-05, "loss": 0.9506, "step": 330 }, { "epoch": 2.1656050955414012, "grad_norm": 0.4785228371620178, "learning_rate": 2.216362215397393e-05, "loss": 0.9602, "step": 340 }, { "epoch": 2.229299363057325, "grad_norm": 0.46735599637031555, "learning_rate": 1.91584351841065e-05, "loss": 0.9424, "step": 350 }, { "epoch": 2.2929936305732483, "grad_norm": 0.47307127714157104, "learning_rate": 1.6323290047291194e-05, "loss": 0.9036, "step": 360 }, { "epoch": 2.356687898089172, "grad_norm": 0.4797765910625458, "learning_rate": 1.367381802809185e-05, "loss": 0.9354, "step": 370 }, { "epoch": 2.4203821656050954, "grad_norm": 0.4451310932636261, "learning_rate": 1.122462672120914e-05, "loss": 0.9319, "step": 380 }, { "epoch": 2.484076433121019, "grad_norm": 0.47275781631469727, "learning_rate": 8.98921949399179e-06, "loss": 0.99, "step": 390 }, { "epoch": 2.5477707006369426, "grad_norm": 0.449569970369339, "learning_rate": 6.979921036993042e-06, "loss": 0.9659, "step": 400 }, { "epoch": 2.611464968152866, "grad_norm": 0.5072051882743835, "learning_rate": 5.207809413041914e-06, "loss": 0.9192, "step": 410 }, { "epoch": 2.6751592356687897, "grad_norm": 0.44743847846984863, "learning_rate": 3.6826549794698074e-06, "loss": 0.9026, "step": 420 }, { "epoch": 2.738853503184713, "grad_norm": 0.4364466369152069, "learning_rate": 2.4128665202382326e-06, "loss": 0.9621, "step": 430 }, { "epoch": 2.802547770700637, "grad_norm": 0.4684942960739136, "learning_rate": 1.4054448849631085e-06, "loss": 0.9417, "step": 440 }, { "epoch": 2.8662420382165603, "grad_norm": 0.47080230712890625, "learning_rate": 6.659443904419637e-07, "loss": 0.9432, "step": 450 }, { "epoch": 2.9299363057324843, "grad_norm": 0.4364038109779358, "learning_rate": 1.984421974927375e-07, "loss": 0.9389, "step": 460 }, { "epoch": 2.9936305732484074, "grad_norm": 0.458974152803421, "learning_rate": 5.515831941993455e-09, "loss": 0.9422, "step": 470 }, { "epoch": 3.0, "step": 471, "total_flos": 1.2627781654203597e+17, "train_loss": 1.0978564871850793, "train_runtime": 364.5287, "train_samples_per_second": 165.378, "train_steps_per_second": 1.292 } ], "logging_steps": 10, "max_steps": 471, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.2627781654203597e+17, "train_batch_size": 8, "trial_name": null, "trial_params": null }