{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.0,
  "eval_steps": 1000,
  "global_step": 19048,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.02624947501049979,
      "grad_norm": 1.2762126922607422,
      "learning_rate": 4.868752624947501e-05,
      "loss": 4.3755,
      "step": 500
    },
    {
      "epoch": 0.05249895002099958,
      "grad_norm": 2.1579689979553223,
      "learning_rate": 4.7375052498950025e-05,
      "loss": 2.9041,
      "step": 1000
    },
    {
      "epoch": 0.05249895002099958,
      "eval_accuracy": 0.4456327704194102,
      "eval_loss": 2.4551732540130615,
      "eval_runtime": 52.3921,
      "eval_samples_per_second": 116.945,
      "eval_steps_per_second": 3.665,
      "step": 1000
    },
    {
      "epoch": 0.07874842503149937,
      "grad_norm": 2.8109207153320312,
      "learning_rate": 4.606257874842503e-05,
      "loss": 2.2866,
      "step": 1500
    },
    {
      "epoch": 0.10499790004199916,
      "grad_norm": 1.555198073387146,
      "learning_rate": 4.475010499790005e-05,
      "loss": 1.975,
      "step": 2000
    },
    {
      "epoch": 0.10499790004199916,
      "eval_accuracy": 0.566995340241206,
      "eval_loss": 1.800054907798767,
      "eval_runtime": 52.4411,
      "eval_samples_per_second": 116.836,
      "eval_steps_per_second": 3.661,
      "step": 2000
    },
    {
      "epoch": 0.13124737505249895,
      "grad_norm": 1.6156566143035889,
      "learning_rate": 4.3437631247375055e-05,
      "loss": 1.8239,
      "step": 2500
    },
    {
      "epoch": 0.15749685006299874,
      "grad_norm": 1.3470197916030884,
      "learning_rate": 4.212515749685006e-05,
      "loss": 1.7304,
      "step": 3000
    },
    {
      "epoch": 0.15749685006299874,
      "eval_accuracy": 0.6007325235911557,
      "eval_loss": 1.610120177268982,
      "eval_runtime": 52.0749,
      "eval_samples_per_second": 117.657,
      "eval_steps_per_second": 3.687,
      "step": 3000
    },
    {
      "epoch": 0.18374632507349853,
      "grad_norm": 1.3028109073638916,
      "learning_rate": 4.081268374632508e-05,
      "loss": 1.6586,
      "step": 3500
    },
    {
      "epoch": 0.20999580008399832,
      "grad_norm": 1.4646774530410767,
      "learning_rate": 3.9500209995800084e-05,
      "loss": 1.6082,
      "step": 4000
    },
    {
      "epoch": 0.20999580008399832,
      "eval_accuracy": 0.6184141440200028,
      "eval_loss": 1.5135186910629272,
      "eval_runtime": 52.2531,
      "eval_samples_per_second": 117.256,
      "eval_steps_per_second": 3.674,
      "step": 4000
    },
    {
      "epoch": 0.2362452750944981,
      "grad_norm": 1.2850710153579712,
      "learning_rate": 3.81877362452751e-05,
      "loss": 1.5687,
      "step": 4500
    },
    {
      "epoch": 0.2624947501049979,
      "grad_norm": 1.3608484268188477,
      "learning_rate": 3.687526249475011e-05,
      "loss": 1.5347,
      "step": 5000
    },
    {
      "epoch": 0.2624947501049979,
      "eval_accuracy": 0.6303795149938871,
      "eval_loss": 1.4518604278564453,
      "eval_runtime": 52.3842,
      "eval_samples_per_second": 116.963,
      "eval_steps_per_second": 3.665,
      "step": 5000
    },
    {
      "epoch": 0.2887442251154977,
      "grad_norm": 1.2262423038482666,
      "learning_rate": 3.5562788744225114e-05,
      "loss": 1.5052,
      "step": 5500
    },
    {
      "epoch": 0.3149937001259975,
      "grad_norm": 1.1985517740249634,
      "learning_rate": 3.425031499370013e-05,
      "loss": 1.481,
      "step": 6000
    },
    {
      "epoch": 0.3149937001259975,
      "eval_accuracy": 0.6388963740927813,
      "eval_loss": 1.406346082687378,
      "eval_runtime": 52.3896,
      "eval_samples_per_second": 116.951,
      "eval_steps_per_second": 3.665,
      "step": 6000
    },
    {
      "epoch": 0.34124317513649727,
      "grad_norm": 1.2099275588989258,
      "learning_rate": 3.2937841243175137e-05,
      "loss": 1.4638,
      "step": 6500
    },
    {
      "epoch": 0.36749265014699706,
      "grad_norm": 1.30404794216156,
      "learning_rate": 3.162536749265015e-05,
      "loss": 1.4437,
      "step": 7000
    },
    {
      "epoch": 0.36749265014699706,
      "eval_accuracy": 0.6458597675369553,
      "eval_loss": 1.3707749843597412,
      "eval_runtime": 51.9205,
      "eval_samples_per_second": 118.007,
      "eval_steps_per_second": 3.698,
      "step": 7000
    },
    {
      "epoch": 0.39374212515749685,
      "grad_norm": 1.2142497301101685,
      "learning_rate": 3.031289374212516e-05,
      "loss": 1.4271,
      "step": 7500
    },
    {
      "epoch": 0.41999160016799664,
      "grad_norm": 1.209383487701416,
      "learning_rate": 2.900041999160017e-05,
      "loss": 1.4146,
      "step": 8000
    },
    {
      "epoch": 0.41999160016799664,
      "eval_accuracy": 0.6510902099755246,
      "eval_loss": 1.342362880706787,
      "eval_runtime": 52.0734,
      "eval_samples_per_second": 117.661,
      "eval_steps_per_second": 3.687,
      "step": 8000
    },
    {
      "epoch": 0.4462410751784964,
      "grad_norm": 1.0856986045837402,
      "learning_rate": 2.768794624107518e-05,
      "loss": 1.399,
      "step": 8500
    },
    {
      "epoch": 0.4724905501889962,
      "grad_norm": 1.1674331426620483,
      "learning_rate": 2.6375472490550192e-05,
      "loss": 1.3851,
      "step": 9000
    },
    {
      "epoch": 0.4724905501889962,
      "eval_accuracy": 0.6559099899312707,
      "eval_loss": 1.3188802003860474,
      "eval_runtime": 52.2608,
      "eval_samples_per_second": 117.239,
      "eval_steps_per_second": 3.674,
      "step": 9000
    },
    {
      "epoch": 0.498740025199496,
      "grad_norm": 1.103853464126587,
      "learning_rate": 2.50629987400252e-05,
      "loss": 1.3755,
      "step": 9500
    },
    {
      "epoch": 0.5249895002099958,
      "grad_norm": 1.1234028339385986,
      "learning_rate": 2.375052498950021e-05,
      "loss": 1.3649,
      "step": 10000
    },
    {
      "epoch": 0.5249895002099958,
      "eval_accuracy": 0.659545964283851,
      "eval_loss": 1.2999330759048462,
      "eval_runtime": 52.336,
      "eval_samples_per_second": 117.07,
      "eval_steps_per_second": 3.669,
      "step": 10000
    },
    {
      "epoch": 0.5512389752204956,
      "grad_norm": 1.092689037322998,
      "learning_rate": 2.2438051238975222e-05,
      "loss": 1.3586,
      "step": 10500
    },
    {
      "epoch": 0.5774884502309954,
      "grad_norm": 1.1136916875839233,
      "learning_rate": 2.1125577488450233e-05,
      "loss": 1.3483,
      "step": 11000
    },
    {
      "epoch": 0.5774884502309954,
      "eval_accuracy": 0.6628488138251902,
      "eval_loss": 1.284113883972168,
      "eval_runtime": 52.2557,
      "eval_samples_per_second": 117.25,
      "eval_steps_per_second": 3.674,
      "step": 11000
    },
    {
      "epoch": 0.6037379252414952,
      "grad_norm": 1.1153841018676758,
      "learning_rate": 1.9813103737925244e-05,
      "loss": 1.3389,
      "step": 11500
    },
    {
      "epoch": 0.629987400251995,
      "grad_norm": 1.117702841758728,
      "learning_rate": 1.8500629987400252e-05,
      "loss": 1.3335,
      "step": 12000
    },
    {
      "epoch": 0.629987400251995,
      "eval_accuracy": 0.6658040201846832,
      "eval_loss": 1.2690930366516113,
      "eval_runtime": 51.8646,
      "eval_samples_per_second": 118.135,
      "eval_steps_per_second": 3.702,
      "step": 12000
    },
    {
      "epoch": 0.6562368752624947,
      "grad_norm": 1.1310999393463135,
      "learning_rate": 1.7188156236875263e-05,
      "loss": 1.3266,
      "step": 12500
    },
    {
      "epoch": 0.6824863502729945,
      "grad_norm": 1.1650059223175049,
      "learning_rate": 1.5875682486350274e-05,
      "loss": 1.3166,
      "step": 13000
    },
    {
      "epoch": 0.6824863502729945,
      "eval_accuracy": 0.6683760053772215,
      "eval_loss": 1.2568169832229614,
      "eval_runtime": 51.9065,
      "eval_samples_per_second": 118.039,
      "eval_steps_per_second": 3.699,
      "step": 13000
    },
    {
      "epoch": 0.7087358252834943,
      "grad_norm": 1.0919125080108643,
      "learning_rate": 1.4563208735825285e-05,
      "loss": 1.3098,
      "step": 13500
    },
    {
      "epoch": 0.7349853002939941,
      "grad_norm": 1.129116177558899,
      "learning_rate": 1.3250734985300295e-05,
      "loss": 1.307,
      "step": 14000
    },
    {
      "epoch": 0.7349853002939941,
      "eval_accuracy": 0.6705609403819863,
      "eval_loss": 1.2448893785476685,
      "eval_runtime": 51.8742,
      "eval_samples_per_second": 118.113,
      "eval_steps_per_second": 3.701,
      "step": 14000
    },
    {
      "epoch": 0.7612347753044939,
      "grad_norm": 1.131659984588623,
      "learning_rate": 1.1938261234775306e-05,
      "loss": 1.2994,
      "step": 14500
    },
    {
      "epoch": 0.7874842503149937,
      "grad_norm": 1.100225567817688,
      "learning_rate": 1.0625787484250315e-05,
      "loss": 1.2959,
      "step": 15000
    },
    {
      "epoch": 0.7874842503149937,
      "eval_accuracy": 0.6726351528680722,
      "eval_loss": 1.234655499458313,
      "eval_runtime": 52.0418,
      "eval_samples_per_second": 117.732,
      "eval_steps_per_second": 3.689,
      "step": 15000
    },
    {
      "epoch": 0.8137337253254935,
      "grad_norm": 1.1060478687286377,
      "learning_rate": 9.313313733725326e-06,
      "loss": 1.2905,
      "step": 15500
    },
    {
      "epoch": 0.8399832003359933,
      "grad_norm": 1.1289230585098267,
      "learning_rate": 8.000839983200337e-06,
      "loss": 1.286,
      "step": 16000
    },
    {
      "epoch": 0.8399832003359933,
      "eval_accuracy": 0.6743912375411241,
      "eval_loss": 1.2270058393478394,
      "eval_runtime": 52.0522,
      "eval_samples_per_second": 117.709,
      "eval_steps_per_second": 3.689,
      "step": 16000
    },
    {
      "epoch": 0.8662326753464931,
      "grad_norm": 1.1142446994781494,
      "learning_rate": 6.6883662326753475e-06,
      "loss": 1.2826,
      "step": 16500
    },
    {
      "epoch": 0.8924821503569929,
      "grad_norm": 1.1280709505081177,
      "learning_rate": 5.375892482150358e-06,
      "loss": 1.2818,
      "step": 17000
    },
    {
      "epoch": 0.8924821503569929,
      "eval_accuracy": 0.6755184055446775,
      "eval_loss": 1.2210115194320679,
      "eval_runtime": 52.3903,
      "eval_samples_per_second": 116.949,
      "eval_steps_per_second": 3.665,
      "step": 17000
    },
    {
      "epoch": 0.9187316253674926,
      "grad_norm": 1.1283568143844604,
      "learning_rate": 4.063418731625368e-06,
      "loss": 1.2752,
      "step": 17500
    },
    {
      "epoch": 0.9449811003779924,
      "grad_norm": 1.1261204481124878,
      "learning_rate": 2.7509449811003783e-06,
      "loss": 1.2701,
      "step": 18000
    },
    {
      "epoch": 0.9449811003779924,
      "eval_accuracy": 0.6768737193720215,
      "eval_loss": 1.2154804468154907,
      "eval_runtime": 51.9422,
      "eval_samples_per_second": 117.958,
      "eval_steps_per_second": 3.696,
      "step": 18000
    },
    {
      "epoch": 0.9712305753884922,
      "grad_norm": 1.1426098346710205,
      "learning_rate": 1.4384712305753885e-06,
      "loss": 1.2699,
      "step": 18500
    },
    {
      "epoch": 0.997480050398992,
      "grad_norm": 1.1207588911056519,
      "learning_rate": 1.25997480050399e-07,
      "loss": 1.2691,
      "step": 19000
    },
    {
      "epoch": 0.997480050398992,
      "eval_accuracy": 0.6775645385447583,
      "eval_loss": 1.2121435403823853,
      "eval_runtime": 51.8514,
      "eval_samples_per_second": 118.165,
      "eval_steps_per_second": 3.703,
      "step": 19000
    },
    {
      "epoch": 1.0,
      "step": 19048,
      "total_flos": 3.18526483857408e+17,
      "train_loss": 1.5567941711710962,
      "train_runtime": 7364.253,
      "train_samples_per_second": 82.768,
      "train_steps_per_second": 2.587
    }
  ],
  "logging_steps": 500,
  "max_steps": 19048,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 1000,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 3.18526483857408e+17,
  "train_batch_size": 32,
  "trial_name": null,
  "trial_params": null
}