{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.9984,
  "eval_steps": 500,
  "global_step": 52,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0192,
      "grad_norm": 2.5694146156311035,
      "learning_rate": 1.6666666666666667e-06,
      "loss": 0.5167,
      "step": 1
    },
    {
      "epoch": 0.0384,
      "grad_norm": 2.4537222385406494,
      "learning_rate": 3.3333333333333333e-06,
      "loss": 0.5203,
      "step": 2
    },
    {
      "epoch": 0.0576,
      "grad_norm": 2.321495532989502,
      "learning_rate": 5e-06,
      "loss": 0.4748,
      "step": 3
    },
    {
      "epoch": 0.0768,
      "grad_norm": 1.8380646705627441,
      "learning_rate": 6.666666666666667e-06,
      "loss": 0.5253,
      "step": 4
    },
    {
      "epoch": 0.096,
      "grad_norm": 1.4723880290985107,
      "learning_rate": 8.333333333333334e-06,
      "loss": 0.4606,
      "step": 5
    },
    {
      "epoch": 0.1152,
      "grad_norm": 1.1131113767623901,
      "learning_rate": 1e-05,
      "loss": 0.4019,
      "step": 6
    },
    {
      "epoch": 0.1344,
      "grad_norm": 0.9251970648765564,
      "learning_rate": 9.988343845952697e-06,
      "loss": 0.4271,
      "step": 7
    },
    {
      "epoch": 0.1536,
      "grad_norm": 1.1454741954803467,
      "learning_rate": 9.953429730181653e-06,
      "loss": 0.3827,
      "step": 8
    },
    {
      "epoch": 0.1728,
      "grad_norm": 0.9471750259399414,
      "learning_rate": 9.895420438411616e-06,
      "loss": 0.3547,
      "step": 9
    },
    {
      "epoch": 0.192,
      "grad_norm": 0.7063102126121521,
      "learning_rate": 9.814586436738998e-06,
      "loss": 0.3375,
      "step": 10
    },
    {
      "epoch": 0.2112,
      "grad_norm": 0.6931116580963135,
      "learning_rate": 9.711304610594104e-06,
      "loss": 0.3156,
      "step": 11
    },
    {
      "epoch": 0.2304,
      "grad_norm": 0.7782188653945923,
      "learning_rate": 9.586056507527266e-06,
      "loss": 0.3111,
      "step": 12
    },
    {
      "epoch": 0.2496,
      "grad_norm": 0.7079610228538513,
      "learning_rate": 9.439426092011877e-06,
      "loss": 0.2994,
      "step": 13
    },
    {
      "epoch": 0.2688,
      "grad_norm": 0.6569077968597412,
      "learning_rate": 9.272097022732444e-06,
      "loss": 0.2973,
      "step": 14
    },
    {
      "epoch": 0.288,
      "grad_norm": 0.680906355381012,
      "learning_rate": 9.08484946505221e-06,
      "loss": 0.2882,
      "step": 15
    },
    {
      "epoch": 0.3072,
      "grad_norm": 0.5492838025093079,
      "learning_rate": 8.8785564535221e-06,
      "loss": 0.2531,
      "step": 16
    },
    {
      "epoch": 0.3264,
      "grad_norm": 0.6430338025093079,
      "learning_rate": 8.65417982139062e-06,
      "loss": 0.2819,
      "step": 17
    },
    {
      "epoch": 0.3456,
      "grad_norm": 0.5319820046424866,
      "learning_rate": 8.412765716093273e-06,
      "loss": 0.2765,
      "step": 18
    },
    {
      "epoch": 0.3648,
      "grad_norm": 0.4264802932739258,
      "learning_rate": 8.155439721630265e-06,
      "loss": 0.2588,
      "step": 19
    },
    {
      "epoch": 0.384,
      "grad_norm": 0.42277446389198303,
      "learning_rate": 7.883401610574338e-06,
      "loss": 0.2572,
      "step": 20
    },
    {
      "epoch": 0.4032,
      "grad_norm": 0.46798449754714966,
      "learning_rate": 7.597919750177168e-06,
      "loss": 0.2319,
      "step": 21
    },
    {
      "epoch": 0.4224,
      "grad_norm": 0.4637551009654999,
      "learning_rate": 7.300325188655762e-06,
      "loss": 0.2307,
      "step": 22
    },
    {
      "epoch": 0.4416,
      "grad_norm": 0.477437824010849,
      "learning_rate": 6.9920054492312086e-06,
      "loss": 0.2495,
      "step": 23
    },
    {
      "epoch": 0.4608,
      "grad_norm": 0.40543755888938904,
      "learning_rate": 6.674398060854931e-06,
      "loss": 0.2272,
      "step": 24
    },
    {
      "epoch": 0.48,
      "grad_norm": 0.4067147672176361,
      "learning_rate": 6.348983855785122e-06,
      "loss": 0.2349,
      "step": 25
    },
    {
      "epoch": 0.4992,
      "grad_norm": 0.37864550948143005,
      "learning_rate": 6.0172800652631706e-06,
      "loss": 0.2345,
      "step": 26
    },
    {
      "epoch": 0.5184,
      "grad_norm": 0.3472854197025299,
      "learning_rate": 5.680833245481234e-06,
      "loss": 0.2192,
      "step": 27
    },
    {
      "epoch": 0.5376,
      "grad_norm": 0.36279794573783875,
      "learning_rate": 5.341212066823356e-06,
      "loss": 0.2212,
      "step": 28
    },
    {
      "epoch": 0.5568,
      "grad_norm": 0.35561856627464294,
      "learning_rate": 5e-06,
      "loss": 0.2218,
      "step": 29
    },
    {
      "epoch": 0.576,
      "grad_norm": 0.34567970037460327,
      "learning_rate": 4.6587879331766465e-06,
      "loss": 0.2157,
      "step": 30
    },
    {
      "epoch": 0.5952,
      "grad_norm": 0.3285064101219177,
      "learning_rate": 4.319166754518768e-06,
      "loss": 0.2214,
      "step": 31
    },
    {
      "epoch": 0.6144,
      "grad_norm": 0.28714433312416077,
      "learning_rate": 3.982719934736832e-06,
      "loss": 0.1882,
      "step": 32
    },
    {
      "epoch": 0.6336,
      "grad_norm": 0.31616437435150146,
      "learning_rate": 3.6510161442148783e-06,
      "loss": 0.1859,
      "step": 33
    },
    {
      "epoch": 0.6528,
      "grad_norm": 0.3225463330745697,
      "learning_rate": 3.3256019391450696e-06,
      "loss": 0.2063,
      "step": 34
    },
    {
      "epoch": 0.672,
      "grad_norm": 0.33429035544395447,
      "learning_rate": 3.007994550768793e-06,
      "loss": 0.2043,
      "step": 35
    },
    {
      "epoch": 0.6912,
      "grad_norm": 0.33725297451019287,
      "learning_rate": 2.6996748113442397e-06,
      "loss": 0.2168,
      "step": 36
    },
    {
      "epoch": 0.7104,
      "grad_norm": 0.3270690143108368,
      "learning_rate": 2.4020802498228333e-06,
      "loss": 0.2076,
      "step": 37
    },
    {
      "epoch": 0.7296,
      "grad_norm": 0.34343093633651733,
      "learning_rate": 2.1165983894256647e-06,
      "loss": 0.2022,
      "step": 38
    },
    {
      "epoch": 0.7488,
      "grad_norm": 0.3212158977985382,
      "learning_rate": 1.8445602783697375e-06,
      "loss": 0.2043,
      "step": 39
    },
    {
      "epoch": 0.768,
      "grad_norm": 0.4333406388759613,
      "learning_rate": 1.5872342839067305e-06,
      "loss": 0.1961,
      "step": 40
    },
    {
      "epoch": 0.7872,
      "grad_norm": 0.3265644609928131,
      "learning_rate": 1.3458201786093795e-06,
      "loss": 0.206,
      "step": 41
    },
    {
      "epoch": 0.8064,
      "grad_norm": 0.3291531503200531,
      "learning_rate": 1.1214435464779006e-06,
      "loss": 0.2164,
      "step": 42
    },
    {
      "epoch": 0.8256,
      "grad_norm": 0.3214084506034851,
      "learning_rate": 9.151505349477901e-07,
      "loss": 0.1954,
      "step": 43
    },
    {
      "epoch": 0.8448,
      "grad_norm": 0.33091598749160767,
      "learning_rate": 7.279029772675572e-07,
      "loss": 0.2029,
      "step": 44
    },
    {
      "epoch": 0.864,
      "grad_norm": 0.305397629737854,
      "learning_rate": 5.60573907988124e-07,
      "loss": 0.1895,
      "step": 45
    },
    {
      "epoch": 0.8832,
      "grad_norm": 0.3404708504676819,
      "learning_rate": 4.139434924727359e-07,
      "loss": 0.206,
      "step": 46
    },
    {
      "epoch": 0.9024,
      "grad_norm": 0.3258639872074127,
      "learning_rate": 2.88695389405898e-07,
      "loss": 0.2152,
      "step": 47
    },
    {
      "epoch": 0.9216,
      "grad_norm": 0.3110244870185852,
      "learning_rate": 1.8541356326100436e-07,
      "loss": 0.2068,
      "step": 48
    },
    {
      "epoch": 0.9408,
      "grad_norm": 0.31024959683418274,
      "learning_rate": 1.0457956158838545e-07,
      "loss": 0.1988,
      "step": 49
    },
    {
      "epoch": 0.96,
      "grad_norm": 0.3411162495613098,
      "learning_rate": 4.657026981834623e-08,
      "loss": 0.2048,
      "step": 50
    },
    {
      "epoch": 0.9792,
      "grad_norm": 0.32382944226264954,
      "learning_rate": 1.1656154047303691e-08,
      "loss": 0.2035,
      "step": 51
    },
    {
      "epoch": 0.9984,
      "grad_norm": 0.3142213821411133,
      "learning_rate": 0.0,
      "loss": 0.2071,
      "step": 52
    },
    {
      "epoch": 0.9984,
      "step": 52,
      "total_flos": 149313677950976.0,
      "train_loss": 0.26948727323458743,
      "train_runtime": 7475.3088,
      "train_samples_per_second": 0.669,
      "train_steps_per_second": 0.007
    }
  ],
  "logging_steps": 1,
  "max_steps": 52,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 100,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 149313677950976.0,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}