{ "best_metric": null, "best_model_checkpoint": null, "epoch": 5.619834710743802, "eval_steps": 10, "global_step": 340, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.1652892561983471, "grad_norm": 2.5004961490631104, "learning_rate": 3.998781654038192e-05, "loss": 1.3959, "step": 10 }, { "epoch": 0.1652892561983471, "eval_loss": 0.9773627519607544, "eval_runtime": 1.0911, "eval_samples_per_second": 10.998, "eval_steps_per_second": 1.833, "step": 10 }, { "epoch": 0.3305785123966942, "grad_norm": 3.3259832859039307, "learning_rate": 3.9951281005196486e-05, "loss": 1.195, "step": 20 }, { "epoch": 0.3305785123966942, "eval_loss": 0.783854603767395, "eval_runtime": 1.1063, "eval_samples_per_second": 10.847, "eval_steps_per_second": 1.808, "step": 20 }, { "epoch": 0.49586776859504134, "grad_norm": 2.37343168258667, "learning_rate": 3.989043790736547e-05, "loss": 1.035, "step": 30 }, { "epoch": 0.49586776859504134, "eval_loss": 0.6894669532775879, "eval_runtime": 1.4555, "eval_samples_per_second": 8.245, "eval_steps_per_second": 1.374, "step": 30 }, { "epoch": 0.6611570247933884, "grad_norm": 3.1549763679504395, "learning_rate": 3.980536137483141e-05, "loss": 0.9097, "step": 40 }, { "epoch": 0.6611570247933884, "eval_loss": 0.6716279983520508, "eval_runtime": 1.1151, "eval_samples_per_second": 10.761, "eval_steps_per_second": 1.794, "step": 40 }, { "epoch": 0.8264462809917356, "grad_norm": 2.8315067291259766, "learning_rate": 3.9696155060244166e-05, "loss": 0.8614, "step": 50 }, { "epoch": 0.8264462809917356, "eval_loss": 0.6319628357887268, "eval_runtime": 1.1073, "eval_samples_per_second": 10.837, "eval_steps_per_second": 1.806, "step": 50 }, { "epoch": 0.9917355371900827, "grad_norm": 3.369000196456909, "learning_rate": 3.9562952014676116e-05, "loss": 0.9259, "step": 60 }, { "epoch": 0.9917355371900827, "eval_loss": 0.5961965918540955, "eval_runtime": 1.3743, "eval_samples_per_second": 8.732, "eval_steps_per_second": 1.455, "step": 60 }, { "epoch": 1.1570247933884297, "grad_norm": 2.863161087036133, "learning_rate": 3.940591452551993e-05, "loss": 0.7896, "step": 70 }, { "epoch": 1.1570247933884297, "eval_loss": 0.5863456726074219, "eval_runtime": 1.0994, "eval_samples_per_second": 10.915, "eval_steps_per_second": 1.819, "step": 70 }, { "epoch": 1.322314049586777, "grad_norm": 2.5636465549468994, "learning_rate": 3.922523391876638e-05, "loss": 0.8601, "step": 80 }, { "epoch": 1.322314049586777, "eval_loss": 0.5487304925918579, "eval_runtime": 1.0935, "eval_samples_per_second": 10.974, "eval_steps_per_second": 1.829, "step": 80 }, { "epoch": 1.487603305785124, "grad_norm": 2.7444987297058105, "learning_rate": 3.9021130325903076e-05, "loss": 0.7592, "step": 90 }, { "epoch": 1.487603305785124, "eval_loss": 0.543647050857544, "eval_runtime": 1.1357, "eval_samples_per_second": 10.566, "eval_steps_per_second": 1.761, "step": 90 }, { "epoch": 1.6528925619834711, "grad_norm": 4.245235919952393, "learning_rate": 3.879385241571817e-05, "loss": 0.7439, "step": 100 }, { "epoch": 1.6528925619834711, "eval_loss": 0.5209221839904785, "eval_runtime": 1.1049, "eval_samples_per_second": 10.861, "eval_steps_per_second": 1.81, "step": 100 }, { "epoch": 1.8181818181818183, "grad_norm": 3.720158815383911, "learning_rate": 3.854367709133575e-05, "loss": 0.7675, "step": 110 }, { "epoch": 1.8181818181818183, "eval_loss": 0.502068281173706, "eval_runtime": 1.416, "eval_samples_per_second": 8.474, "eval_steps_per_second": 1.412, "step": 110 }, { "epoch": 1.9834710743801653, "grad_norm": 3.368499279022217, "learning_rate": 3.827090915285202e-05, "loss": 0.7396, "step": 120 }, { "epoch": 1.9834710743801653, "eval_loss": 0.4688035845756531, "eval_runtime": 1.1068, "eval_samples_per_second": 10.842, "eval_steps_per_second": 1.807, "step": 120 }, { "epoch": 2.1487603305785123, "grad_norm": 3.335733652114868, "learning_rate": 3.7975880925983345e-05, "loss": 0.5938, "step": 130 }, { "epoch": 2.1487603305785123, "eval_loss": 0.45896804332733154, "eval_runtime": 1.0901, "eval_samples_per_second": 11.008, "eval_steps_per_second": 1.835, "step": 130 }, { "epoch": 2.3140495867768593, "grad_norm": 4.038167953491211, "learning_rate": 3.7658951857178544e-05, "loss": 0.5712, "step": 140 }, { "epoch": 2.3140495867768593, "eval_loss": 0.4418785274028778, "eval_runtime": 1.3332, "eval_samples_per_second": 9.001, "eval_steps_per_second": 1.5, "step": 140 }, { "epoch": 2.479338842975207, "grad_norm": 4.076087951660156, "learning_rate": 3.732050807568878e-05, "loss": 0.6814, "step": 150 }, { "epoch": 2.479338842975207, "eval_loss": 0.4231901168823242, "eval_runtime": 1.0886, "eval_samples_per_second": 11.023, "eval_steps_per_second": 1.837, "step": 150 }, { "epoch": 2.644628099173554, "grad_norm": 4.039748668670654, "learning_rate": 3.696096192312852e-05, "loss": 0.6155, "step": 160 }, { "epoch": 2.644628099173554, "eval_loss": 0.4101436138153076, "eval_runtime": 1.1018, "eval_samples_per_second": 10.891, "eval_steps_per_second": 1.815, "step": 160 }, { "epoch": 2.809917355371901, "grad_norm": 4.491166114807129, "learning_rate": 3.658075145110083e-05, "loss": 0.5797, "step": 170 }, { "epoch": 2.809917355371901, "eval_loss": 0.3929840326309204, "eval_runtime": 1.3687, "eval_samples_per_second": 8.768, "eval_steps_per_second": 1.461, "step": 170 }, { "epoch": 2.975206611570248, "grad_norm": 4.6367106437683105, "learning_rate": 3.6180339887498953e-05, "loss": 0.6346, "step": 180 }, { "epoch": 2.975206611570248, "eval_loss": 0.3631528615951538, "eval_runtime": 1.1071, "eval_samples_per_second": 10.839, "eval_steps_per_second": 1.806, "step": 180 }, { "epoch": 3.1404958677685952, "grad_norm": 3.62372088432312, "learning_rate": 3.576021507213444e-05, "loss": 0.4537, "step": 190 }, { "epoch": 3.1404958677685952, "eval_loss": 0.35405832529067993, "eval_runtime": 1.2103, "eval_samples_per_second": 9.915, "eval_steps_per_second": 1.652, "step": 190 }, { "epoch": 3.3057851239669422, "grad_norm": 3.1658225059509277, "learning_rate": 3.532088886237956e-05, "loss": 0.4568, "step": 200 }, { "epoch": 3.3057851239669422, "eval_loss": 0.3364166021347046, "eval_runtime": 1.1013, "eval_samples_per_second": 10.896, "eval_steps_per_second": 1.816, "step": 200 }, { "epoch": 3.4710743801652892, "grad_norm": 5.640577793121338, "learning_rate": 3.4862896509547886e-05, "loss": 0.4796, "step": 210 }, { "epoch": 3.4710743801652892, "eval_loss": 0.31573551893234253, "eval_runtime": 1.0982, "eval_samples_per_second": 10.927, "eval_steps_per_second": 1.821, "step": 210 }, { "epoch": 3.6363636363636362, "grad_norm": 5.163906097412109, "learning_rate": 3.438679600677303e-05, "loss": 0.4309, "step": 220 }, { "epoch": 3.6363636363636362, "eval_loss": 0.29981058835983276, "eval_runtime": 1.4641, "eval_samples_per_second": 8.196, "eval_steps_per_second": 1.366, "step": 220 }, { "epoch": 3.8016528925619832, "grad_norm": 10.097945213317871, "learning_rate": 3.3893167409179945e-05, "loss": 0.5423, "step": 230 }, { "epoch": 3.8016528925619832, "eval_loss": 0.30080342292785645, "eval_runtime": 1.0952, "eval_samples_per_second": 10.957, "eval_steps_per_second": 1.826, "step": 230 }, { "epoch": 3.9669421487603307, "grad_norm": 5.08881950378418, "learning_rate": 3.3382612127177166e-05, "loss": 0.51, "step": 240 }, { "epoch": 3.9669421487603307, "eval_loss": 0.30431026220321655, "eval_runtime": 1.1103, "eval_samples_per_second": 10.807, "eval_steps_per_second": 1.801, "step": 240 }, { "epoch": 4.132231404958677, "grad_norm": 3.142008066177368, "learning_rate": 3.285575219373079e-05, "loss": 0.4218, "step": 250 }, { "epoch": 4.132231404958677, "eval_loss": 0.29047852754592896, "eval_runtime": 1.2712, "eval_samples_per_second": 9.44, "eval_steps_per_second": 1.573, "step": 250 }, { "epoch": 4.297520661157025, "grad_norm": 4.357353687286377, "learning_rate": 3.2313229506513167e-05, "loss": 0.3503, "step": 260 }, { "epoch": 4.297520661157025, "eval_loss": 0.29693418741226196, "eval_runtime": 1.0964, "eval_samples_per_second": 10.945, "eval_steps_per_second": 1.824, "step": 260 }, { "epoch": 4.462809917355372, "grad_norm": 5.129684925079346, "learning_rate": 3.1755705045849465e-05, "loss": 0.3235, "step": 270 }, { "epoch": 4.462809917355372, "eval_loss": 0.27224814891815186, "eval_runtime": 1.0905, "eval_samples_per_second": 11.004, "eval_steps_per_second": 1.834, "step": 270 }, { "epoch": 4.628099173553719, "grad_norm": 5.414126396179199, "learning_rate": 3.1183858069414936e-05, "loss": 0.3671, "step": 280 }, { "epoch": 4.628099173553719, "eval_loss": 0.2563490867614746, "eval_runtime": 1.0963, "eval_samples_per_second": 10.946, "eval_steps_per_second": 1.824, "step": 280 }, { "epoch": 4.793388429752066, "grad_norm": 4.964554309844971, "learning_rate": 3.05983852846641e-05, "loss": 0.3399, "step": 290 }, { "epoch": 4.793388429752066, "eval_loss": 0.25828373432159424, "eval_runtime": 1.1181, "eval_samples_per_second": 10.732, "eval_steps_per_second": 1.789, "step": 290 }, { "epoch": 4.958677685950414, "grad_norm": 4.63615083694458, "learning_rate": 3.0000000000000004e-05, "loss": 0.3685, "step": 300 }, { "epoch": 4.958677685950414, "eval_loss": 0.24292020499706268, "eval_runtime": 1.445, "eval_samples_per_second": 8.305, "eval_steps_per_second": 1.384, "step": 300 }, { "epoch": 5.12396694214876, "grad_norm": 4.34980583190918, "learning_rate": 2.938943125571782e-05, "loss": 0.2943, "step": 310 }, { "epoch": 5.12396694214876, "eval_loss": 0.20842806994915009, "eval_runtime": 1.1081, "eval_samples_per_second": 10.829, "eval_steps_per_second": 1.805, "step": 310 }, { "epoch": 5.289256198347108, "grad_norm": 6.3195881843566895, "learning_rate": 2.876742293578155e-05, "loss": 0.2589, "step": 320 }, { "epoch": 5.289256198347108, "eval_loss": 0.217214435338974, "eval_runtime": 1.0894, "eval_samples_per_second": 11.015, "eval_steps_per_second": 1.836, "step": 320 }, { "epoch": 5.454545454545454, "grad_norm": 6.084163665771484, "learning_rate": 2.813473286151601e-05, "loss": 0.2143, "step": 330 }, { "epoch": 5.454545454545454, "eval_loss": 0.22991180419921875, "eval_runtime": 1.2554, "eval_samples_per_second": 9.559, "eval_steps_per_second": 1.593, "step": 330 }, { "epoch": 5.619834710743802, "grad_norm": 5.4227728843688965, "learning_rate": 2.7492131868318247e-05, "loss": 0.3084, "step": 340 }, { "epoch": 5.619834710743802, "eval_loss": 0.21594808995723724, "eval_runtime": 1.1006, "eval_samples_per_second": 10.903, "eval_steps_per_second": 1.817, "step": 340 } ], "logging_steps": 10, "max_steps": 900, "num_input_tokens_seen": 0, "num_train_epochs": 15, "save_steps": 10, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 545316196581376.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }