{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 1000, "global_step": 19079, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.02620682425703653, "grad_norm": 1.2087392807006836, "learning_rate": 4.868965878714817e-05, "loss": 4.3671, "step": 500 }, { "epoch": 0.05241364851407306, "grad_norm": 2.226062774658203, "learning_rate": 4.737931757429635e-05, "loss": 2.91, "step": 1000 }, { "epoch": 0.05241364851407306, "eval_accuracy": 0.442662621636582, "eval_loss": 2.4653749465942383, "eval_runtime": 54.3093, "eval_samples_per_second": 113.093, "eval_steps_per_second": 3.535, "step": 1000 }, { "epoch": 0.0786204727711096, "grad_norm": 1.7887016534805298, "learning_rate": 4.606897636144452e-05, "loss": 2.2957, "step": 1500 }, { "epoch": 0.10482729702814612, "grad_norm": 2.325981378555298, "learning_rate": 4.47586351485927e-05, "loss": 1.978, "step": 2000 }, { "epoch": 0.10482729702814612, "eval_accuracy": 0.5670936102339134, "eval_loss": 1.8015469312667847, "eval_runtime": 54.291, "eval_samples_per_second": 113.131, "eval_steps_per_second": 3.536, "step": 2000 }, { "epoch": 0.13103412128518266, "grad_norm": 1.324131727218628, "learning_rate": 4.344829393574087e-05, "loss": 1.8209, "step": 2500 }, { "epoch": 0.1572409455422192, "grad_norm": 1.4347504377365112, "learning_rate": 4.213795272288904e-05, "loss": 1.7257, "step": 3000 }, { "epoch": 0.1572409455422192, "eval_accuracy": 0.6006511581715623, "eval_loss": 1.613455057144165, "eval_runtime": 53.9449, "eval_samples_per_second": 113.857, "eval_steps_per_second": 3.559, "step": 3000 }, { "epoch": 0.18344776979925573, "grad_norm": 1.4435532093048096, "learning_rate": 4.0827611510037215e-05, "loss": 1.6598, "step": 3500 }, { "epoch": 0.20965459405629225, "grad_norm": 1.3218313455581665, "learning_rate": 3.9517270297185385e-05, "loss": 1.6065, "step": 4000 }, { "epoch": 0.20965459405629225, "eval_accuracy": 0.6195605915776923, "eval_loss": 1.5118882656097412, "eval_runtime": 54.1194, "eval_samples_per_second": 113.49, "eval_steps_per_second": 3.548, "step": 4000 }, { "epoch": 0.2358614183133288, "grad_norm": 1.2389506101608276, "learning_rate": 3.820692908433356e-05, "loss": 1.5659, "step": 4500 }, { "epoch": 0.2620682425703653, "grad_norm": 1.2128058671951294, "learning_rate": 3.689658787148173e-05, "loss": 1.535, "step": 5000 }, { "epoch": 0.2620682425703653, "eval_accuracy": 0.6311397607549959, "eval_loss": 1.4495291709899902, "eval_runtime": 52.7273, "eval_samples_per_second": 116.486, "eval_steps_per_second": 3.641, "step": 5000 }, { "epoch": 0.28827506682740184, "grad_norm": 1.1584937572479248, "learning_rate": 3.558624665862991e-05, "loss": 1.5057, "step": 5500 }, { "epoch": 0.3144818910844384, "grad_norm": 1.3666248321533203, "learning_rate": 3.427590544577809e-05, "loss": 1.4837, "step": 6000 }, { "epoch": 0.3144818910844384, "eval_accuracy": 0.6397868560713489, "eval_loss": 1.4026538133621216, "eval_runtime": 52.6912, "eval_samples_per_second": 116.566, "eval_steps_per_second": 3.644, "step": 6000 }, { "epoch": 0.34068871534147493, "grad_norm": 1.1634854078292847, "learning_rate": 3.296556423292626e-05, "loss": 1.4618, "step": 6500 }, { "epoch": 0.36689553959851146, "grad_norm": 1.1630429029464722, "learning_rate": 3.1655223020074435e-05, "loss": 1.441, "step": 7000 }, { "epoch": 0.36689553959851146, "eval_accuracy": 0.6463188093580632, "eval_loss": 1.3684494495391846, "eval_runtime": 52.879, "eval_samples_per_second": 116.152, "eval_steps_per_second": 3.631, "step": 7000 }, { "epoch": 0.393102363855548, "grad_norm": 1.2840286493301392, "learning_rate": 3.03448818072226e-05, "loss": 1.4246, "step": 7500 }, { "epoch": 0.4193091881125845, "grad_norm": 1.2869335412979126, "learning_rate": 2.9034540594370775e-05, "loss": 1.4106, "step": 8000 }, { "epoch": 0.4193091881125845, "eval_accuracy": 0.6521773867284943, "eval_loss": 1.3395406007766724, "eval_runtime": 52.7351, "eval_samples_per_second": 116.469, "eval_steps_per_second": 3.641, "step": 8000 }, { "epoch": 0.4455160123696211, "grad_norm": 1.1085759401321411, "learning_rate": 2.772419938151895e-05, "loss": 1.3958, "step": 8500 }, { "epoch": 0.4717228366266576, "grad_norm": 1.141212821006775, "learning_rate": 2.6413858168667123e-05, "loss": 1.3837, "step": 9000 }, { "epoch": 0.4717228366266576, "eval_accuracy": 0.6567033768743835, "eval_loss": 1.3167238235473633, "eval_runtime": 52.6041, "eval_samples_per_second": 116.759, "eval_steps_per_second": 3.65, "step": 9000 }, { "epoch": 0.4979296608836941, "grad_norm": 1.125022530555725, "learning_rate": 2.51035169558153e-05, "loss": 1.3756, "step": 9500 }, { "epoch": 0.5241364851407306, "grad_norm": 1.1555612087249756, "learning_rate": 2.379317574296347e-05, "loss": 1.3638, "step": 10000 }, { "epoch": 0.5241364851407306, "eval_accuracy": 0.6602091969367523, "eval_loss": 1.2976551055908203, "eval_runtime": 52.7145, "eval_samples_per_second": 116.514, "eval_steps_per_second": 3.642, "step": 10000 }, { "epoch": 0.5503433093977672, "grad_norm": 1.1034129858016968, "learning_rate": 2.2482834530111644e-05, "loss": 1.3533, "step": 10500 }, { "epoch": 0.5765501336548037, "grad_norm": 1.1435010433197021, "learning_rate": 2.1172493317259814e-05, "loss": 1.3448, "step": 11000 }, { "epoch": 0.5765501336548037, "eval_accuracy": 0.6630680604640962, "eval_loss": 1.2827322483062744, "eval_runtime": 52.5848, "eval_samples_per_second": 116.802, "eval_steps_per_second": 3.651, "step": 11000 }, { "epoch": 0.6027569579118403, "grad_norm": 1.1294775009155273, "learning_rate": 1.9862152104407988e-05, "loss": 1.3362, "step": 11500 }, { "epoch": 0.6289637821688768, "grad_norm": 1.1538615226745605, "learning_rate": 1.855181089155616e-05, "loss": 1.3316, "step": 12000 }, { "epoch": 0.6289637821688768, "eval_accuracy": 0.6663852843409781, "eval_loss": 1.2661738395690918, "eval_runtime": 52.8399, "eval_samples_per_second": 116.238, "eval_steps_per_second": 3.634, "step": 12000 }, { "epoch": 0.6551706064259133, "grad_norm": 1.1159409284591675, "learning_rate": 1.7241469678704335e-05, "loss": 1.3235, "step": 12500 }, { "epoch": 0.6813774306829499, "grad_norm": 1.1080243587493896, "learning_rate": 1.593112846585251e-05, "loss": 1.3128, "step": 13000 }, { "epoch": 0.6813774306829499, "eval_accuracy": 0.6692328480124827, "eval_loss": 1.2530109882354736, "eval_runtime": 53.754, "eval_samples_per_second": 114.261, "eval_steps_per_second": 3.572, "step": 13000 }, { "epoch": 0.7075842549399863, "grad_norm": 1.0835435390472412, "learning_rate": 1.4620787253000681e-05, "loss": 1.3105, "step": 13500 }, { "epoch": 0.7337910791970229, "grad_norm": 1.0971386432647705, "learning_rate": 1.3310446040148855e-05, "loss": 1.3066, "step": 14000 }, { "epoch": 0.7337910791970229, "eval_accuracy": 0.671052443108409, "eval_loss": 1.2431377172470093, "eval_runtime": 53.5059, "eval_samples_per_second": 114.791, "eval_steps_per_second": 3.588, "step": 14000 }, { "epoch": 0.7599979034540595, "grad_norm": 1.1002813577651978, "learning_rate": 1.2000104827297029e-05, "loss": 1.2971, "step": 14500 }, { "epoch": 0.786204727711096, "grad_norm": 1.1184098720550537, "learning_rate": 1.0689763614445202e-05, "loss": 1.2959, "step": 15000 }, { "epoch": 0.786204727711096, "eval_accuracy": 0.6728834972130736, "eval_loss": 1.234721302986145, "eval_runtime": 53.5694, "eval_samples_per_second": 114.655, "eval_steps_per_second": 3.584, "step": 15000 }, { "epoch": 0.8124115519681325, "grad_norm": 1.1383655071258545, "learning_rate": 9.379422401593376e-06, "loss": 1.2886, "step": 15500 }, { "epoch": 0.838618376225169, "grad_norm": 1.1217840909957886, "learning_rate": 8.06908118874155e-06, "loss": 1.2861, "step": 16000 }, { "epoch": 0.838618376225169, "eval_accuracy": 0.6747158245409314, "eval_loss": 1.2258507013320923, "eval_runtime": 53.5862, "eval_samples_per_second": 114.619, "eval_steps_per_second": 3.583, "step": 16000 }, { "epoch": 0.8648252004822056, "grad_norm": 1.128594994544983, "learning_rate": 6.758739975889722e-06, "loss": 1.2784, "step": 16500 }, { "epoch": 0.8910320247392421, "grad_norm": 1.0965055227279663, "learning_rate": 5.4483987630378955e-06, "loss": 1.2782, "step": 17000 }, { "epoch": 0.8910320247392421, "eval_accuracy": 0.6760406132734155, "eval_loss": 1.219490647315979, "eval_runtime": 52.7149, "eval_samples_per_second": 116.513, "eval_steps_per_second": 3.642, "step": 17000 }, { "epoch": 0.9172388489962786, "grad_norm": 1.155154824256897, "learning_rate": 4.138057550186069e-06, "loss": 1.2761, "step": 17500 }, { "epoch": 0.9434456732533152, "grad_norm": 1.1318458318710327, "learning_rate": 2.8277163373342417e-06, "loss": 1.2709, "step": 18000 }, { "epoch": 0.9434456732533152, "eval_accuracy": 0.6772598836337662, "eval_loss": 1.2137212753295898, "eval_runtime": 52.7841, "eval_samples_per_second": 116.361, "eval_steps_per_second": 3.637, "step": 18000 }, { "epoch": 0.9696524975103517, "grad_norm": 1.1269466876983643, "learning_rate": 1.5173751244824152e-06, "loss": 1.2703, "step": 18500 }, { "epoch": 0.9958593217673882, "grad_norm": 1.1108003854751587, "learning_rate": 2.070339116305886e-07, "loss": 1.2679, "step": 19000 }, { "epoch": 0.9958593217673882, "eval_accuracy": 0.6779942151104219, "eval_loss": 1.2103650569915771, "eval_runtime": 52.8651, "eval_samples_per_second": 116.183, "eval_steps_per_second": 3.632, "step": 19000 }, { "epoch": 1.0, "step": 19079, "total_flos": 3.19050113089536e+17, "train_loss": 1.555102582010085, "train_runtime": 7445.2613, "train_samples_per_second": 82.002, "train_steps_per_second": 2.563 } ], "logging_steps": 500, "max_steps": 19079, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.19050113089536e+17, "train_batch_size": 32, "trial_name": null, "trial_params": null }