{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 3.0,
  "eval_steps": 500,
  "global_step": 471,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.06369426751592357,
      "grad_norm": 6.031618595123291,
      "learning_rate": 1.8750000000000002e-05,
      "loss": 3.2979,
      "step": 10
    },
    {
      "epoch": 0.12738853503184713,
      "grad_norm": 5.124261379241943,
      "learning_rate": 3.958333333333333e-05,
      "loss": 2.8325,
      "step": 20
    },
    {
      "epoch": 0.1910828025477707,
      "grad_norm": 2.4147489070892334,
      "learning_rate": 6.041666666666667e-05,
      "loss": 1.716,
      "step": 30
    },
    {
      "epoch": 0.25477707006369427,
      "grad_norm": 1.0314959287643433,
      "learning_rate": 8.125000000000001e-05,
      "loss": 1.2752,
      "step": 40
    },
    {
      "epoch": 0.3184713375796178,
      "grad_norm": 0.5906504988670349,
      "learning_rate": 9.999862102299873e-05,
      "loss": 1.207,
      "step": 50
    },
    {
      "epoch": 0.3821656050955414,
      "grad_norm": 0.43713438510894775,
      "learning_rate": 9.983323579940351e-05,
      "loss": 1.1241,
      "step": 60
    },
    {
      "epoch": 0.445859872611465,
      "grad_norm": 0.40896832942962646,
      "learning_rate": 9.939310009499348e-05,
      "loss": 1.0958,
      "step": 70
    },
    {
      "epoch": 0.5095541401273885,
      "grad_norm": 0.31001320481300354,
      "learning_rate": 9.868064055324204e-05,
      "loss": 1.059,
      "step": 80
    },
    {
      "epoch": 0.5732484076433121,
      "grad_norm": 0.3313826024532318,
      "learning_rate": 9.769978524742229e-05,
      "loss": 1.0795,
      "step": 90
    },
    {
      "epoch": 0.6369426751592356,
      "grad_norm": 0.3064286708831787,
      "learning_rate": 9.645594202357439e-05,
      "loss": 1.0693,
      "step": 100
    },
    {
      "epoch": 0.7006369426751592,
      "grad_norm": 0.3159961402416229,
      "learning_rate": 9.495596868489588e-05,
      "loss": 1.0465,
      "step": 110
    },
    {
      "epoch": 0.7643312101910829,
      "grad_norm": 0.39494845271110535,
      "learning_rate": 9.320813518194084e-05,
      "loss": 1.0324,
      "step": 120
    },
    {
      "epoch": 0.8280254777070064,
      "grad_norm": 0.44708824157714844,
      "learning_rate": 9.122207801708802e-05,
      "loss": 1.0234,
      "step": 130
    },
    {
      "epoch": 0.89171974522293,
      "grad_norm": 0.41689348220825195,
      "learning_rate": 8.900874711466435e-05,
      "loss": 1.0376,
      "step": 140
    },
    {
      "epoch": 0.9554140127388535,
      "grad_norm": 0.44199317693710327,
      "learning_rate": 8.658034544965003e-05,
      "loss": 0.9996,
      "step": 150
    },
    {
      "epoch": 1.019108280254777,
      "grad_norm": 0.4607829451560974,
      "learning_rate": 8.395026176781627e-05,
      "loss": 0.9987,
      "step": 160
    },
    {
      "epoch": 1.0828025477707006,
      "grad_norm": 0.47777435183525085,
      "learning_rate": 8.113299676823614e-05,
      "loss": 0.9745,
      "step": 170
    },
    {
      "epoch": 1.1464968152866242,
      "grad_norm": 0.48650094866752625,
      "learning_rate": 7.814408315515418e-05,
      "loss": 1.0014,
      "step": 180
    },
    {
      "epoch": 1.2101910828025477,
      "grad_norm": 0.42078331112861633,
      "learning_rate": 7.500000000000001e-05,
      "loss": 1.0072,
      "step": 190
    },
    {
      "epoch": 1.2738853503184713,
      "grad_norm": 0.5042803883552551,
      "learning_rate": 7.171808188570291e-05,
      "loss": 1.0023,
      "step": 200
    },
    {
      "epoch": 1.3375796178343948,
      "grad_norm": 0.4602188766002655,
      "learning_rate": 6.831642333423067e-05,
      "loss": 0.9621,
      "step": 210
    },
    {
      "epoch": 1.4012738853503186,
      "grad_norm": 0.4462142586708069,
      "learning_rate": 6.481377904428171e-05,
      "loss": 0.9819,
      "step": 220
    },
    {
      "epoch": 1.4649681528662422,
      "grad_norm": 0.4639708697795868,
      "learning_rate": 6.122946048915991e-05,
      "loss": 0.9851,
      "step": 230
    },
    {
      "epoch": 1.5286624203821657,
      "grad_norm": 0.4281213581562042,
      "learning_rate": 5.75832294449293e-05,
      "loss": 0.9506,
      "step": 240
    },
    {
      "epoch": 1.5923566878980893,
      "grad_norm": 0.4628894627094269,
      "learning_rate": 5.389518903587017e-05,
      "loss": 0.9534,
      "step": 250
    },
    {
      "epoch": 1.6560509554140128,
      "grad_norm": 0.42661815881729126,
      "learning_rate": 5.018567289794651e-05,
      "loss": 0.9577,
      "step": 260
    },
    {
      "epoch": 1.7197452229299364,
      "grad_norm": 0.5284398198127747,
      "learning_rate": 4.647513307137076e-05,
      "loss": 0.9683,
      "step": 270
    },
    {
      "epoch": 1.78343949044586,
      "grad_norm": 0.4475821852684021,
      "learning_rate": 4.278402724035867e-05,
      "loss": 0.9667,
      "step": 280
    },
    {
      "epoch": 1.8471337579617835,
      "grad_norm": 0.44738808274269104,
      "learning_rate": 3.913270594176664e-05,
      "loss": 0.9544,
      "step": 290
    },
    {
      "epoch": 1.910828025477707,
      "grad_norm": 0.4267480969429016,
      "learning_rate": 3.554130036447506e-05,
      "loss": 0.9899,
      "step": 300
    },
    {
      "epoch": 1.9745222929936306,
      "grad_norm": 0.47479352355003357,
      "learning_rate": 3.202961135812437e-05,
      "loss": 0.9587,
      "step": 310
    },
    {
      "epoch": 2.038216560509554,
      "grad_norm": 0.41422757506370544,
      "learning_rate": 2.8617000263143078e-05,
      "loss": 0.9553,
      "step": 320
    },
    {
      "epoch": 2.1019108280254777,
      "grad_norm": 0.53452068567276,
      "learning_rate": 2.5322282163965095e-05,
      "loss": 0.9506,
      "step": 330
    },
    {
      "epoch": 2.1656050955414012,
      "grad_norm": 0.4785228371620178,
      "learning_rate": 2.216362215397393e-05,
      "loss": 0.9602,
      "step": 340
    },
    {
      "epoch": 2.229299363057325,
      "grad_norm": 0.46735599637031555,
      "learning_rate": 1.91584351841065e-05,
      "loss": 0.9424,
      "step": 350
    },
    {
      "epoch": 2.2929936305732483,
      "grad_norm": 0.47307127714157104,
      "learning_rate": 1.6323290047291194e-05,
      "loss": 0.9036,
      "step": 360
    },
    {
      "epoch": 2.356687898089172,
      "grad_norm": 0.4797765910625458,
      "learning_rate": 1.367381802809185e-05,
      "loss": 0.9354,
      "step": 370
    },
    {
      "epoch": 2.4203821656050954,
      "grad_norm": 0.4451310932636261,
      "learning_rate": 1.122462672120914e-05,
      "loss": 0.9319,
      "step": 380
    },
    {
      "epoch": 2.484076433121019,
      "grad_norm": 0.47275781631469727,
      "learning_rate": 8.98921949399179e-06,
      "loss": 0.99,
      "step": 390
    },
    {
      "epoch": 2.5477707006369426,
      "grad_norm": 0.449569970369339,
      "learning_rate": 6.979921036993042e-06,
      "loss": 0.9659,
      "step": 400
    },
    {
      "epoch": 2.611464968152866,
      "grad_norm": 0.5072051882743835,
      "learning_rate": 5.207809413041914e-06,
      "loss": 0.9192,
      "step": 410
    },
    {
      "epoch": 2.6751592356687897,
      "grad_norm": 0.44743847846984863,
      "learning_rate": 3.6826549794698074e-06,
      "loss": 0.9026,
      "step": 420
    },
    {
      "epoch": 2.738853503184713,
      "grad_norm": 0.4364466369152069,
      "learning_rate": 2.4128665202382326e-06,
      "loss": 0.9621,
      "step": 430
    },
    {
      "epoch": 2.802547770700637,
      "grad_norm": 0.4684942960739136,
      "learning_rate": 1.4054448849631085e-06,
      "loss": 0.9417,
      "step": 440
    },
    {
      "epoch": 2.8662420382165603,
      "grad_norm": 0.47080230712890625,
      "learning_rate": 6.659443904419637e-07,
      "loss": 0.9432,
      "step": 450
    },
    {
      "epoch": 2.9299363057324843,
      "grad_norm": 0.4364038109779358,
      "learning_rate": 1.984421974927375e-07,
      "loss": 0.9389,
      "step": 460
    },
    {
      "epoch": 2.9936305732484074,
      "grad_norm": 0.458974152803421,
      "learning_rate": 5.515831941993455e-09,
      "loss": 0.9422,
      "step": 470
    },
    {
      "epoch": 3.0,
      "step": 471,
      "total_flos": 1.2627781654203597e+17,
      "train_loss": 1.0978564871850793,
      "train_runtime": 364.5287,
      "train_samples_per_second": 165.378,
      "train_steps_per_second": 1.292
    }
  ],
  "logging_steps": 10,
  "max_steps": 471,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 1.2627781654203597e+17,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}