| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 0.384, | |
| "eval_steps": 500, | |
| "global_step": 60, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.0064, | |
| "grad_norm": 6.651687052805929, | |
| "learning_rate": 3.125e-07, | |
| "loss": 0.4684, | |
| "step": 1 | |
| }, | |
| { | |
| "epoch": 0.0128, | |
| "grad_norm": 6.265915305107489, | |
| "learning_rate": 6.25e-07, | |
| "loss": 0.4563, | |
| "step": 2 | |
| }, | |
| { | |
| "epoch": 0.0192, | |
| "grad_norm": 6.401273380714525, | |
| "learning_rate": 9.375000000000001e-07, | |
| "loss": 0.4547, | |
| "step": 3 | |
| }, | |
| { | |
| "epoch": 0.0256, | |
| "grad_norm": 6.56315483941447, | |
| "learning_rate": 1.25e-06, | |
| "loss": 0.4494, | |
| "step": 4 | |
| }, | |
| { | |
| "epoch": 0.032, | |
| "grad_norm": 6.59719209633345, | |
| "learning_rate": 1.5625e-06, | |
| "loss": 0.4575, | |
| "step": 5 | |
| }, | |
| { | |
| "epoch": 0.0384, | |
| "grad_norm": 7.53537793152979, | |
| "learning_rate": 1.8750000000000003e-06, | |
| "loss": 0.4427, | |
| "step": 6 | |
| }, | |
| { | |
| "epoch": 0.0448, | |
| "grad_norm": 9.571011007558802, | |
| "learning_rate": 2.1875000000000002e-06, | |
| "loss": 0.4378, | |
| "step": 7 | |
| }, | |
| { | |
| "epoch": 0.0512, | |
| "grad_norm": 10.148947574276539, | |
| "learning_rate": 2.5e-06, | |
| "loss": 0.4369, | |
| "step": 8 | |
| }, | |
| { | |
| "epoch": 0.0576, | |
| "grad_norm": 5.2264536848502035, | |
| "learning_rate": 2.8125e-06, | |
| "loss": 0.4002, | |
| "step": 9 | |
| }, | |
| { | |
| "epoch": 0.064, | |
| "grad_norm": 4.788413945220771, | |
| "learning_rate": 3.125e-06, | |
| "loss": 0.3976, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.0704, | |
| "grad_norm": 4.514781509921602, | |
| "learning_rate": 3.4375e-06, | |
| "loss": 0.3942, | |
| "step": 11 | |
| }, | |
| { | |
| "epoch": 0.0768, | |
| "grad_norm": 4.538614240888282, | |
| "learning_rate": 3.7500000000000005e-06, | |
| "loss": 0.3761, | |
| "step": 12 | |
| }, | |
| { | |
| "epoch": 0.0832, | |
| "grad_norm": 3.0503754653246355, | |
| "learning_rate": 4.0625000000000005e-06, | |
| "loss": 0.3664, | |
| "step": 13 | |
| }, | |
| { | |
| "epoch": 0.0896, | |
| "grad_norm": 3.0554055571037093, | |
| "learning_rate": 4.3750000000000005e-06, | |
| "loss": 0.3585, | |
| "step": 14 | |
| }, | |
| { | |
| "epoch": 0.096, | |
| "grad_norm": 1.9763698617399021, | |
| "learning_rate": 4.6875000000000004e-06, | |
| "loss": 0.3561, | |
| "step": 15 | |
| }, | |
| { | |
| "epoch": 0.1024, | |
| "grad_norm": 1.5108193822789298, | |
| "learning_rate": 5e-06, | |
| "loss": 0.3205, | |
| "step": 16 | |
| }, | |
| { | |
| "epoch": 0.1088, | |
| "grad_norm": 1.3652592836546582, | |
| "learning_rate": 4.999370587356267e-06, | |
| "loss": 0.2957, | |
| "step": 17 | |
| }, | |
| { | |
| "epoch": 0.1152, | |
| "grad_norm": 1.8791692065602286, | |
| "learning_rate": 4.997482666353287e-06, | |
| "loss": 0.2974, | |
| "step": 18 | |
| }, | |
| { | |
| "epoch": 0.1216, | |
| "grad_norm": 1.6499778792766742, | |
| "learning_rate": 4.99433718761614e-06, | |
| "loss": 0.2882, | |
| "step": 19 | |
| }, | |
| { | |
| "epoch": 0.128, | |
| "grad_norm": 0.7803281647114307, | |
| "learning_rate": 4.989935734988098e-06, | |
| "loss": 0.2822, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.1344, | |
| "grad_norm": 0.6228676679830198, | |
| "learning_rate": 4.984280524733107e-06, | |
| "loss": 0.2708, | |
| "step": 21 | |
| }, | |
| { | |
| "epoch": 0.1408, | |
| "grad_norm": 0.6503139687960523, | |
| "learning_rate": 4.977374404419838e-06, | |
| "loss": 0.2624, | |
| "step": 22 | |
| }, | |
| { | |
| "epoch": 0.1472, | |
| "grad_norm": 0.6050613097065756, | |
| "learning_rate": 4.9692208514878445e-06, | |
| "loss": 0.2595, | |
| "step": 23 | |
| }, | |
| { | |
| "epoch": 0.1536, | |
| "grad_norm": 0.513863899627231, | |
| "learning_rate": 4.959823971496575e-06, | |
| "loss": 0.24, | |
| "step": 24 | |
| }, | |
| { | |
| "epoch": 0.16, | |
| "grad_norm": 0.5401602458085882, | |
| "learning_rate": 4.949188496058089e-06, | |
| "loss": 0.2564, | |
| "step": 25 | |
| }, | |
| { | |
| "epoch": 0.1664, | |
| "grad_norm": 0.4588626341682895, | |
| "learning_rate": 4.937319780454559e-06, | |
| "loss": 0.2559, | |
| "step": 26 | |
| }, | |
| { | |
| "epoch": 0.1728, | |
| "grad_norm": 0.5618821751922192, | |
| "learning_rate": 4.924223800941718e-06, | |
| "loss": 0.2396, | |
| "step": 27 | |
| }, | |
| { | |
| "epoch": 0.1792, | |
| "grad_norm": 0.47667362996506163, | |
| "learning_rate": 4.909907151739634e-06, | |
| "loss": 0.2417, | |
| "step": 28 | |
| }, | |
| { | |
| "epoch": 0.1856, | |
| "grad_norm": 0.4186548264796007, | |
| "learning_rate": 4.894377041712327e-06, | |
| "loss": 0.2439, | |
| "step": 29 | |
| }, | |
| { | |
| "epoch": 0.192, | |
| "grad_norm": 0.4531599370801774, | |
| "learning_rate": 4.8776412907378845e-06, | |
| "loss": 0.2415, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.1984, | |
| "grad_norm": 0.3909364624586131, | |
| "learning_rate": 4.859708325770919e-06, | |
| "loss": 0.2302, | |
| "step": 31 | |
| }, | |
| { | |
| "epoch": 0.2048, | |
| "grad_norm": 0.31305049271337076, | |
| "learning_rate": 4.8405871765993435e-06, | |
| "loss": 0.2333, | |
| "step": 32 | |
| }, | |
| { | |
| "epoch": 0.2112, | |
| "grad_norm": 0.3219167869220544, | |
| "learning_rate": 4.820287471297598e-06, | |
| "loss": 0.2341, | |
| "step": 33 | |
| }, | |
| { | |
| "epoch": 0.2176, | |
| "grad_norm": 0.3118803828567028, | |
| "learning_rate": 4.7988194313786275e-06, | |
| "loss": 0.2189, | |
| "step": 34 | |
| }, | |
| { | |
| "epoch": 0.224, | |
| "grad_norm": 0.29310345514929165, | |
| "learning_rate": 4.7761938666470405e-06, | |
| "loss": 0.2114, | |
| "step": 35 | |
| }, | |
| { | |
| "epoch": 0.2304, | |
| "grad_norm": 0.3347074091997152, | |
| "learning_rate": 4.752422169756048e-06, | |
| "loss": 0.2249, | |
| "step": 36 | |
| }, | |
| { | |
| "epoch": 0.2368, | |
| "grad_norm": 0.29649389449972713, | |
| "learning_rate": 4.72751631047092e-06, | |
| "loss": 0.2206, | |
| "step": 37 | |
| }, | |
| { | |
| "epoch": 0.2432, | |
| "grad_norm": 0.2837151778804135, | |
| "learning_rate": 4.701488829641845e-06, | |
| "loss": 0.2304, | |
| "step": 38 | |
| }, | |
| { | |
| "epoch": 0.2496, | |
| "grad_norm": 0.2806309744043982, | |
| "learning_rate": 4.674352832889239e-06, | |
| "loss": 0.2207, | |
| "step": 39 | |
| }, | |
| { | |
| "epoch": 0.256, | |
| "grad_norm": 0.27783183257142374, | |
| "learning_rate": 4.646121984004666e-06, | |
| "loss": 0.2155, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.2624, | |
| "grad_norm": 0.27666228945567495, | |
| "learning_rate": 4.6168104980707105e-06, | |
| "loss": 0.2127, | |
| "step": 41 | |
| }, | |
| { | |
| "epoch": 0.2688, | |
| "grad_norm": 0.275485776092337, | |
| "learning_rate": 4.586433134303257e-06, | |
| "loss": 0.2238, | |
| "step": 42 | |
| }, | |
| { | |
| "epoch": 0.2752, | |
| "grad_norm": 0.27153628095668647, | |
| "learning_rate": 4.555005188619776e-06, | |
| "loss": 0.21, | |
| "step": 43 | |
| }, | |
| { | |
| "epoch": 0.2816, | |
| "grad_norm": 0.274290824493659, | |
| "learning_rate": 4.522542485937369e-06, | |
| "loss": 0.2131, | |
| "step": 44 | |
| }, | |
| { | |
| "epoch": 0.288, | |
| "grad_norm": 0.2694663744414641, | |
| "learning_rate": 4.4890613722044526e-06, | |
| "loss": 0.2108, | |
| "step": 45 | |
| }, | |
| { | |
| "epoch": 0.2944, | |
| "grad_norm": 0.2642992894492684, | |
| "learning_rate": 4.454578706170075e-06, | |
| "loss": 0.2069, | |
| "step": 46 | |
| }, | |
| { | |
| "epoch": 0.3008, | |
| "grad_norm": 0.26815076119929016, | |
| "learning_rate": 4.4191118508950286e-06, | |
| "loss": 0.2063, | |
| "step": 47 | |
| }, | |
| { | |
| "epoch": 0.3072, | |
| "grad_norm": 0.25684981864747414, | |
| "learning_rate": 4.382678665009028e-06, | |
| "loss": 0.2121, | |
| "step": 48 | |
| }, | |
| { | |
| "epoch": 0.3136, | |
| "grad_norm": 0.2536186069276322, | |
| "learning_rate": 4.345297493718352e-06, | |
| "loss": 0.2074, | |
| "step": 49 | |
| }, | |
| { | |
| "epoch": 0.32, | |
| "grad_norm": 0.256667169168501, | |
| "learning_rate": 4.3069871595684795e-06, | |
| "loss": 0.2106, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.3264, | |
| "grad_norm": 0.2631748867077966, | |
| "learning_rate": 4.267766952966369e-06, | |
| "loss": 0.2031, | |
| "step": 51 | |
| }, | |
| { | |
| "epoch": 0.3328, | |
| "grad_norm": 0.2567671139005443, | |
| "learning_rate": 4.227656622467162e-06, | |
| "loss": 0.2063, | |
| "step": 52 | |
| }, | |
| { | |
| "epoch": 0.3392, | |
| "grad_norm": 0.26280340427063503, | |
| "learning_rate": 4.186676364830187e-06, | |
| "loss": 0.212, | |
| "step": 53 | |
| }, | |
| { | |
| "epoch": 0.3456, | |
| "grad_norm": 0.2599237100252332, | |
| "learning_rate": 4.144846814849282e-06, | |
| "loss": 0.2161, | |
| "step": 54 | |
| }, | |
| { | |
| "epoch": 0.352, | |
| "grad_norm": 0.25711041735736345, | |
| "learning_rate": 4.102189034962561e-06, | |
| "loss": 0.1959, | |
| "step": 55 | |
| }, | |
| { | |
| "epoch": 0.3584, | |
| "grad_norm": 0.26568396343464284, | |
| "learning_rate": 4.058724504646834e-06, | |
| "loss": 0.2053, | |
| "step": 56 | |
| }, | |
| { | |
| "epoch": 0.3648, | |
| "grad_norm": 0.26965052490303654, | |
| "learning_rate": 4.01447510960205e-06, | |
| "loss": 0.197, | |
| "step": 57 | |
| }, | |
| { | |
| "epoch": 0.3712, | |
| "grad_norm": 0.2474576437319212, | |
| "learning_rate": 3.969463130731183e-06, | |
| "loss": 0.1963, | |
| "step": 58 | |
| }, | |
| { | |
| "epoch": 0.3776, | |
| "grad_norm": 0.26050605159298984, | |
| "learning_rate": 3.92371123292113e-06, | |
| "loss": 0.1985, | |
| "step": 59 | |
| }, | |
| { | |
| "epoch": 0.384, | |
| "grad_norm": 0.25523756004269815, | |
| "learning_rate": 3.8772424536302565e-06, | |
| "loss": 0.1957, | |
| "step": 60 | |
| } | |
| ], | |
| "logging_steps": 1, | |
| "max_steps": 156, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 1, | |
| "save_steps": 10, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 121473953169408.0, | |
| "train_batch_size": 8, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |