{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.0,
  "eval_steps": 500,
  "global_step": 1250,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.004,
      "grad_norm": 716.0,
      "learning_rate": 2.105263157894737e-06,
      "loss": 21.0165,
      "mean_token_accuracy": 0.6174646154046058,
      "num_input_tokens_seen": 27461,
      "num_tokens": 27461.0,
      "step": 5,
      "train_runtime": 58.529,
      "train_tokens_per_second": 469.186
    },
    {
      "epoch": 0.008,
      "grad_norm": 420.0,
      "learning_rate": 4.736842105263158e-06,
      "loss": 17.7281,
      "mean_token_accuracy": 0.6165577113628388,
      "num_input_tokens_seen": 55051,
      "num_tokens": 55051.0,
      "step": 10,
      "train_runtime": 84.4052,
      "train_tokens_per_second": 652.223
    },
    {
      "epoch": 0.012,
      "grad_norm": 284.0,
      "learning_rate": 7.368421052631579e-06,
      "loss": 15.0203,
      "mean_token_accuracy": 0.6702655613422394,
      "num_input_tokens_seen": 83951,
      "num_tokens": 83951.0,
      "step": 15,
      "train_runtime": 111.8863,
      "train_tokens_per_second": 750.324
    },
    {
      "epoch": 0.016,
      "grad_norm": 224.0,
      "learning_rate": 1e-05,
      "loss": 11.6076,
      "mean_token_accuracy": 0.7083307519555092,
      "num_input_tokens_seen": 109657,
      "num_tokens": 109657.0,
      "step": 20,
      "train_runtime": 136.978,
      "train_tokens_per_second": 800.544
    },
    {
      "epoch": 0.02,
      "grad_norm": 270.0,
      "learning_rate": 1.263157894736842e-05,
      "loss": 9.6461,
      "mean_token_accuracy": 0.7309058234095573,
      "num_input_tokens_seen": 139061,
      "num_tokens": 139061.0,
      "step": 25,
      "train_runtime": 165.0733,
      "train_tokens_per_second": 842.419
    },
    {
      "epoch": 0.024,
      "grad_norm": 50.25,
      "learning_rate": 1.5263157894736846e-05,
      "loss": 8.7466,
      "mean_token_accuracy": 0.7324644327163696,
      "num_input_tokens_seen": 173335,
      "num_tokens": 173335.0,
      "step": 30,
      "train_runtime": 195.3505,
      "train_tokens_per_second": 887.303
    },
    {
      "epoch": 0.028,
      "grad_norm": 101.0,
      "learning_rate": 1.7894736842105264e-05,
      "loss": 8.494,
      "mean_token_accuracy": 0.7436975419521332,
      "num_input_tokens_seen": 201778,
      "num_tokens": 201778.0,
      "step": 35,
      "train_runtime": 223.7451,
      "train_tokens_per_second": 901.821
    },
    {
      "epoch": 0.032,
      "grad_norm": 316.0,
      "learning_rate": 1.9999966405802828e-05,
      "loss": 7.8429,
      "mean_token_accuracy": 0.733650079369545,
      "num_input_tokens_seen": 234594,
      "num_tokens": 234594.0,
      "step": 40,
      "train_runtime": 254.1938,
      "train_tokens_per_second": 922.894
    },
    {
      "epoch": 0.036,
      "grad_norm": 67.0,
      "learning_rate": 1.9998790632601496e-05,
      "loss": 7.2857,
      "mean_token_accuracy": 0.7769168972969055,
      "num_input_tokens_seen": 263946,
      "num_tokens": 263946.0,
      "step": 45,
      "train_runtime": 282.687,
      "train_tokens_per_second": 933.704
    },
    {
      "epoch": 0.04,
      "grad_norm": 56.25,
      "learning_rate": 1.9995935375248608e-05,
      "loss": 7.2708,
      "mean_token_accuracy": 0.7696839615702629,
      "num_input_tokens_seen": 292127,
      "num_tokens": 292127.0,
      "step": 50,
      "train_runtime": 309.685,
      "train_tokens_per_second": 943.304
    },
    {
      "epoch": 0.044,
      "grad_norm": 79.0,
      "learning_rate": 1.9991401113338103e-05,
      "loss": 7.5369,
      "mean_token_accuracy": 0.7532998159527778,
      "num_input_tokens_seen": 318452,
      "num_tokens": 318452.0,
      "step": 55,
      "train_runtime": 336.3519,
      "train_tokens_per_second": 946.782
    },
    {
      "epoch": 0.048,
      "grad_norm": 52.75,
      "learning_rate": 1.99851886084842e-05,
      "loss": 7.4304,
      "mean_token_accuracy": 0.7594221189618111,
      "num_input_tokens_seen": 344023,
      "num_tokens": 344023.0,
      "step": 60,
      "train_runtime": 361.6326,
      "train_tokens_per_second": 951.305
    },
    {
      "epoch": 0.052,
      "grad_norm": 53.5,
      "learning_rate": 1.9977298904193438e-05,
      "loss": 7.1512,
      "mean_token_accuracy": 0.7550393640995026,
      "num_input_tokens_seen": 372600,
      "num_tokens": 372600.0,
      "step": 65,
      "train_runtime": 389.9343,
      "train_tokens_per_second": 955.546
    },
    {
      "epoch": 0.056,
      "grad_norm": 52.75,
      "learning_rate": 1.9967733325689412e-05,
      "loss": 6.7062,
      "mean_token_accuracy": 0.7626852974295616,
      "num_input_tokens_seen": 399843,
      "num_tokens": 399843.0,
      "step": 70,
      "train_runtime": 415.5105,
      "train_tokens_per_second": 962.293
    },
    {
      "epoch": 0.06,
      "grad_norm": 51.75,
      "learning_rate": 1.995649347969019e-05,
      "loss": 6.7049,
      "mean_token_accuracy": 0.76889388859272,
      "num_input_tokens_seen": 426039,
      "num_tokens": 426039.0,
      "step": 75,
      "train_runtime": 440.456,
      "train_tokens_per_second": 967.268
    },
    {
      "epoch": 0.064,
      "grad_norm": 171.0,
      "learning_rate": 1.994358125413841e-05,
      "loss": 6.1025,
      "mean_token_accuracy": 0.7738867923617363,
      "num_input_tokens_seen": 454551,
      "num_tokens": 454551.0,
      "step": 80,
      "train_runtime": 468.9331,
      "train_tokens_per_second": 969.33
    },
    {
      "epoch": 0.068,
      "grad_norm": 207.0,
      "learning_rate": 1.9928998817884185e-05,
      "loss": 6.4376,
      "mean_token_accuracy": 0.772473418712616,
      "num_input_tokens_seen": 480692,
      "num_tokens": 480692.0,
      "step": 85,
      "train_runtime": 493.6013,
      "train_tokens_per_second": 973.847
    },
    {
      "epoch": 0.072,
      "grad_norm": 214.0,
      "learning_rate": 1.9912748620320796e-05,
      "loss": 6.6575,
      "mean_token_accuracy": 0.7715476334095002,
      "num_input_tokens_seen": 509235,
      "num_tokens": 509235.0,
      "step": 90,
      "train_runtime": 521.9092,
      "train_tokens_per_second": 975.716
    },
    {
      "epoch": 0.076,
      "grad_norm": 55.0,
      "learning_rate": 1.9894833390973266e-05,
      "loss": 6.8634,
      "mean_token_accuracy": 0.761625699698925,
      "num_input_tokens_seen": 541734,
      "num_tokens": 541734.0,
      "step": 95,
      "train_runtime": 551.2186,
      "train_tokens_per_second": 982.793
    },
    {
      "epoch": 0.08,
      "grad_norm": 79.0,
      "learning_rate": 1.98752561390399e-05,
      "loss": 6.1263,
      "mean_token_accuracy": 0.7742740377783776,
      "num_input_tokens_seen": 572168,
      "num_tokens": 572168.0,
      "step": 100,
      "train_runtime": 579.6467,
      "train_tokens_per_second": 987.098
    },
    {
      "epoch": 0.084,
      "grad_norm": 89.5,
      "learning_rate": 1.9854020152886816e-05,
      "loss": 6.3594,
      "mean_token_accuracy": 0.7709073334932327,
      "num_input_tokens_seen": 598451,
      "num_tokens": 598451.0,
      "step": 105,
      "train_runtime": 605.7065,
      "train_tokens_per_second": 988.021
    },
    {
      "epoch": 0.088,
      "grad_norm": 260.0,
      "learning_rate": 1.9831128999495605e-05,
      "loss": 6.1262,
      "mean_token_accuracy": 0.7783429339528084,
      "num_input_tokens_seen": 626393,
      "num_tokens": 626393.0,
      "step": 110,
      "train_runtime": 633.7147,
      "train_tokens_per_second": 988.446
    },
    {
      "epoch": 0.092,
      "grad_norm": 46.25,
      "learning_rate": 1.9806586523864212e-05,
      "loss": 6.1485,
      "mean_token_accuracy": 0.7764704540371895,
      "num_input_tokens_seen": 649813,
      "num_tokens": 649813.0,
      "step": 115,
      "train_runtime": 657.2228,
      "train_tokens_per_second": 988.726
    },
    {
      "epoch": 0.096,
      "grad_norm": 81.0,
      "learning_rate": 1.978039684836106e-05,
      "loss": 5.7248,
      "mean_token_accuracy": 0.7734859913587571,
      "num_input_tokens_seen": 676687,
      "num_tokens": 676687.0,
      "step": 120,
      "train_runtime": 683.5183,
      "train_tokens_per_second": 990.006
    },
    {
      "epoch": 0.1,
      "grad_norm": 44.0,
      "learning_rate": 1.9752564372032655e-05,
      "loss": 5.9166,
      "mean_token_accuracy": 0.7785823866724968,
      "num_input_tokens_seen": 708413,
      "num_tokens": 708413.0,
      "step": 125,
      "train_runtime": 714.4677,
      "train_tokens_per_second": 991.526
    },
    {
      "epoch": 0.104,
      "grad_norm": 624.0,
      "learning_rate": 1.9723093769864663e-05,
      "loss": 6.1637,
      "mean_token_accuracy": 0.7725436985492706,
      "num_input_tokens_seen": 733682,
      "num_tokens": 733682.0,
      "step": 130,
      "train_runtime": 740.5204,
      "train_tokens_per_second": 990.765
    },
    {
      "epoch": 0.108,
      "grad_norm": 63.75,
      "learning_rate": 1.9691989991996663e-05,
      "loss": 6.0692,
      "mean_token_accuracy": 0.7738721042871475,
      "num_input_tokens_seen": 763284,
      "num_tokens": 763284.0,
      "step": 135,
      "train_runtime": 769.434,
      "train_tokens_per_second": 992.007
    },
    {
      "epoch": 0.112,
      "grad_norm": 71.5,
      "learning_rate": 1.9659258262890683e-05,
      "loss": 5.9618,
      "mean_token_accuracy": 0.7771038174629211,
      "num_input_tokens_seen": 791236,
      "num_tokens": 791236.0,
      "step": 140,
      "train_runtime": 795.0855,
      "train_tokens_per_second": 995.158
    },
    {
      "epoch": 0.116,
      "grad_norm": 50.5,
      "learning_rate": 1.9624904080453656e-05,
      "loss": 6.2847,
      "mean_token_accuracy": 0.768642008304596,
      "num_input_tokens_seen": 818107,
      "num_tokens": 818107.0,
      "step": 145,
      "train_runtime": 821.0436,
      "train_tokens_per_second": 996.423
    },
    {
      "epoch": 0.12,
      "grad_norm": 113.5,
      "learning_rate": 1.9588933215113926e-05,
      "loss": 6.0329,
      "mean_token_accuracy": 0.7712928548455238,
      "num_input_tokens_seen": 846017,
      "num_tokens": 846017.0,
      "step": 150,
      "train_runtime": 849.152,
      "train_tokens_per_second": 996.308
    },
    {
      "epoch": 0.124,
      "grad_norm": 564.0,
      "learning_rate": 1.955135170885202e-05,
      "loss": 5.7569,
      "mean_token_accuracy": 0.7793401271104813,
      "num_input_tokens_seen": 879137,
      "num_tokens": 879137.0,
      "step": 155,
      "train_runtime": 879.9983,
      "train_tokens_per_second": 999.021
    },
    {
      "epoch": 0.128,
      "grad_norm": 59.25,
      "learning_rate": 1.9512165874185768e-05,
      "loss": 5.9181,
      "mean_token_accuracy": 0.7819835215806961,
      "num_input_tokens_seen": 905824,
      "num_tokens": 905824.0,
      "step": 160,
      "train_runtime": 905.886,
      "train_tokens_per_second": 999.932
    },
    {
      "epoch": 0.132,
      "grad_norm": 85.5,
      "learning_rate": 1.9471382293110004e-05,
      "loss": 5.6098,
      "mean_token_accuracy": 0.7914141818881035,
      "num_input_tokens_seen": 932292,
      "num_tokens": 932292.0,
      "step": 165,
      "train_runtime": 931.5649,
      "train_tokens_per_second": 1000.781
    },
    {
      "epoch": 0.136,
      "grad_norm": 58.0,
      "learning_rate": 1.9429007815990995e-05,
      "loss": 6.4777,
      "mean_token_accuracy": 0.763878983259201,
      "num_input_tokens_seen": 965956,
      "num_tokens": 965956.0,
      "step": 170,
      "train_runtime": 962.6472,
      "train_tokens_per_second": 1003.437
    },
    {
      "epoch": 0.14,
      "grad_norm": 130.0,
      "learning_rate": 1.9385049560415794e-05,
      "loss": 6.0058,
      "mean_token_accuracy": 0.7746358260512352,
      "num_input_tokens_seen": 992908,
      "num_tokens": 992908.0,
      "step": 175,
      "train_runtime": 988.7648,
      "train_tokens_per_second": 1004.19
    },
    {
      "epoch": 0.144,
      "grad_norm": 620.0,
      "learning_rate": 1.9339514909996706e-05,
      "loss": 6.0002,
      "mean_token_accuracy": 0.7761899515986442,
      "num_input_tokens_seen": 1027079,
      "num_tokens": 1027079.0,
      "step": 180,
      "train_runtime": 1019.9491,
      "train_tokens_per_second": 1006.99
    },
    {
      "epoch": 0.148,
      "grad_norm": 83.5,
      "learning_rate": 1.929241151313108e-05,
      "loss": 5.872,
      "mean_token_accuracy": 0.7852786988019943,
      "num_input_tokens_seen": 1056613,
      "num_tokens": 1056613.0,
      "step": 185,
      "train_runtime": 1048.5847,
      "train_tokens_per_second": 1007.656
    },
    {
      "epoch": 0.152,
      "grad_norm": 840.0,
      "learning_rate": 1.9243747281716604e-05,
      "loss": 5.682,
      "mean_token_accuracy": 0.7868907496333122,
      "num_input_tokens_seen": 1085315,
      "num_tokens": 1085315.0,
      "step": 190,
      "train_runtime": 1074.4166,
      "train_tokens_per_second": 1010.144
    },
    {
      "epoch": 0.156,
      "grad_norm": 81.5,
      "learning_rate": 1.9193530389822364e-05,
      "loss": 5.5209,
      "mean_token_accuracy": 0.7940514251589775,
      "num_input_tokens_seen": 1115115,
      "num_tokens": 1115115.0,
      "step": 195,
      "train_runtime": 1103.2538,
      "train_tokens_per_second": 1010.751
    },
    {
      "epoch": 0.16,
      "grad_norm": 100.5,
      "learning_rate": 1.9141769272315857e-05,
      "loss": 5.5272,
      "mean_token_accuracy": 0.7890612185001373,
      "num_input_tokens_seen": 1139079,
      "num_tokens": 1139079.0,
      "step": 200,
      "train_runtime": 1127.1395,
      "train_tokens_per_second": 1010.593
    },
    {
      "epoch": 0.164,
      "grad_norm": 62.25,
      "learning_rate": 1.9088472623446182e-05,
      "loss": 5.7363,
      "mean_token_accuracy": 0.7819481372833252,
      "num_input_tokens_seen": 1171419,
      "num_tokens": 1171419.0,
      "step": 205,
      "train_runtime": 1158.0624,
      "train_tokens_per_second": 1011.534
    },
    {
      "epoch": 0.168,
      "grad_norm": 169.0,
      "learning_rate": 1.90336493953837e-05,
      "loss": 5.914,
      "mean_token_accuracy": 0.7838666513562202,
      "num_input_tokens_seen": 1199784,
      "num_tokens": 1199784.0,
      "step": 210,
      "train_runtime": 1184.5189,
      "train_tokens_per_second": 1012.887
    },
    {
      "epoch": 0.172,
      "grad_norm": 78.0,
      "learning_rate": 1.897730879671634e-05,
      "loss": 5.3131,
      "mean_token_accuracy": 0.7945496052503586,
      "num_input_tokens_seen": 1224308,
      "num_tokens": 1224308.0,
      "step": 215,
      "train_runtime": 1209.327,
      "train_tokens_per_second": 1012.388
    },
    {
      "epoch": 0.176,
      "grad_norm": 1656.0,
      "learning_rate": 1.891946029090283e-05,
      "loss": 5.6709,
      "mean_token_accuracy": 0.7843748390674591,
      "num_input_tokens_seen": 1252051,
      "num_tokens": 1252051.0,
      "step": 220,
      "train_runtime": 1236.4054,
      "train_tokens_per_second": 1012.654
    },
    {
      "epoch": 0.18,
      "grad_norm": 95.5,
      "learning_rate": 1.8860113594683148e-05,
      "loss": 5.9523,
      "mean_token_accuracy": 0.7736531987786293,
      "num_input_tokens_seen": 1278020,
      "num_tokens": 1278020.0,
      "step": 225,
      "train_runtime": 1262.0193,
      "train_tokens_per_second": 1012.679
    },
    {
      "epoch": 0.184,
      "grad_norm": 75.0,
      "learning_rate": 1.8799278676446425e-05,
      "loss": 5.5729,
      "mean_token_accuracy": 0.7859392121434212,
      "num_input_tokens_seen": 1307055,
      "num_tokens": 1307055.0,
      "step": 230,
      "train_runtime": 1289.4069,
      "train_tokens_per_second": 1013.687
    },
    {
      "epoch": 0.188,
      "grad_norm": 54.75,
      "learning_rate": 1.8736965754556527e-05,
      "loss": 5.6597,
      "mean_token_accuracy": 0.7832028537988662,
      "num_input_tokens_seen": 1332482,
      "num_tokens": 1332482.0,
      "step": 235,
      "train_runtime": 1314.3047,
      "train_tokens_per_second": 1013.83
    },
    {
      "epoch": 0.192,
      "grad_norm": 60.75,
      "learning_rate": 1.867318529563574e-05,
      "loss": 6.0631,
      "mean_token_accuracy": 0.7801717698574067,
      "num_input_tokens_seen": 1359469,
      "num_tokens": 1359469.0,
      "step": 240,
      "train_runtime": 1341.0816,
      "train_tokens_per_second": 1013.711
    },
    {
      "epoch": 0.196,
      "grad_norm": 72.0,
      "learning_rate": 1.8607948012806664e-05,
      "loss": 5.7384,
      "mean_token_accuracy": 0.7812554731965065,
      "num_input_tokens_seen": 1390888,
      "num_tokens": 1390888.0,
      "step": 245,
      "train_runtime": 1371.2979,
      "train_tokens_per_second": 1014.286
    },
    {
      "epoch": 0.2,
      "grad_norm": 96.0,
      "learning_rate": 1.8541264863892755e-05,
      "loss": 5.871,
      "mean_token_accuracy": 0.7782064586877823,
      "num_input_tokens_seen": 1414969,
      "num_tokens": 1414969.0,
      "step": 250,
      "train_runtime": 1395.692,
      "train_tokens_per_second": 1013.812
    },
    {
      "epoch": 0.204,
      "grad_norm": 43.25,
      "learning_rate": 1.8473147049577777e-05,
      "loss": 5.9369,
      "mean_token_accuracy": 0.7823293015360833,
      "num_input_tokens_seen": 1441261,
      "num_tokens": 1441261.0,
      "step": 255,
      "train_runtime": 1420.1833,
      "train_tokens_per_second": 1014.842
    },
    {
      "epoch": 0.208,
      "grad_norm": 1168.0,
      "learning_rate": 1.84036060115244e-05,
      "loss": 5.7154,
      "mean_token_accuracy": 0.7839734643697739,
      "num_input_tokens_seen": 1470281,
      "num_tokens": 1470281.0,
      "step": 260,
      "train_runtime": 1447.4831,
      "train_tokens_per_second": 1015.75
    },
    {
      "epoch": 0.212,
      "grad_norm": 65.0,
      "learning_rate": 1.8332653430452375e-05,
      "loss": 5.4737,
      "mean_token_accuracy": 0.7898797050118447,
      "num_input_tokens_seen": 1502818,
      "num_tokens": 1502818.0,
      "step": 265,
      "train_runtime": 1477.0712,
      "train_tokens_per_second": 1017.431
    },
    {
      "epoch": 0.216,
      "grad_norm": 175.0,
      "learning_rate": 1.826030122417656e-05,
      "loss": 5.5671,
      "mean_token_accuracy": 0.780795156955719,
      "num_input_tokens_seen": 1531913,
      "num_tokens": 1531913.0,
      "step": 270,
      "train_runtime": 1503.8595,
      "train_tokens_per_second": 1018.654
    },
    {
      "epoch": 0.22,
      "grad_norm": 62.75,
      "learning_rate": 1.8186561545605055e-05,
      "loss": 5.9096,
      "mean_token_accuracy": 0.7873492911458015,
      "num_input_tokens_seen": 1563135,
      "num_tokens": 1563135.0,
      "step": 275,
      "train_runtime": 1532.3365,
      "train_tokens_per_second": 1020.099
    },
    {
      "epoch": 0.224,
      "grad_norm": 200.0,
      "learning_rate": 1.811144678069793e-05,
      "loss": 5.2624,
      "mean_token_accuracy": 0.7928400427103043,
      "num_input_tokens_seen": 1591144,
      "num_tokens": 1591144.0,
      "step": 280,
      "train_runtime": 1560.0406,
      "train_tokens_per_second": 1019.938
    },
    {
      "epoch": 0.228,
      "grad_norm": 38.5,
      "learning_rate": 1.803496954638676e-05,
      "loss": 5.4277,
      "mean_token_accuracy": 0.7846548587083817,
      "num_input_tokens_seen": 1621253,
      "num_tokens": 1621253.0,
      "step": 285,
      "train_runtime": 1590.4126,
      "train_tokens_per_second": 1019.391
    },
    {
      "epoch": 0.232,
      "grad_norm": 67.0,
      "learning_rate": 1.7957142688455362e-05,
      "loss": 5.6897,
      "mean_token_accuracy": 0.7835722789168358,
      "num_input_tokens_seen": 1646466,
      "num_tokens": 1646466.0,
      "step": 290,
      "train_runtime": 1615.728,
      "train_tokens_per_second": 1019.024
    },
    {
      "epoch": 0.236,
      "grad_norm": 121.5,
      "learning_rate": 1.7877979279382135e-05,
      "loss": 5.8008,
      "mean_token_accuracy": 0.777139276266098,
      "num_input_tokens_seen": 1674583,
      "num_tokens": 1674583.0,
      "step": 295,
      "train_runtime": 1641.5179,
      "train_tokens_per_second": 1020.143
    },
    {
      "epoch": 0.24,
      "grad_norm": 43.5,
      "learning_rate": 1.7797492616144256e-05,
      "loss": 5.794,
      "mean_token_accuracy": 0.7790872991085053,
      "num_input_tokens_seen": 1702490,
      "num_tokens": 1702490.0,
      "step": 300,
      "train_runtime": 1667.5303,
      "train_tokens_per_second": 1020.965
    },
    {
      "epoch": 0.244,
      "grad_norm": 84.0,
      "learning_rate": 1.7715696217984233e-05,
      "loss": 5.7744,
      "mean_token_accuracy": 0.7830787718296051,
      "num_input_tokens_seen": 1732134,
      "num_tokens": 1732134.0,
      "step": 305,
      "train_runtime": 1695.751,
      "train_tokens_per_second": 1021.455
    },
    {
      "epoch": 0.248,
      "grad_norm": 97.5,
      "learning_rate": 1.7632603824139086e-05,
      "loss": 5.7534,
      "mean_token_accuracy": 0.7765944376587868,
      "num_input_tokens_seen": 1760431,
      "num_tokens": 1760431.0,
      "step": 310,
      "train_runtime": 1721.7874,
      "train_tokens_per_second": 1022.444
    },
    {
      "epoch": 0.252,
      "grad_norm": 214.0,
      "learning_rate": 1.7548229391532572e-05,
      "loss": 5.5022,
      "mean_token_accuracy": 0.7858106374740601,
      "num_input_tokens_seen": 1786890,
      "num_tokens": 1786890.0,
      "step": 315,
      "train_runtime": 1746.8899,
      "train_tokens_per_second": 1022.898
    },
    {
      "epoch": 0.256,
      "grad_norm": 128.0,
      "learning_rate": 1.7462587092430877e-05,
      "loss": 5.6599,
      "mean_token_accuracy": 0.7865968465805053,
      "num_input_tokens_seen": 1812677,
      "num_tokens": 1812677.0,
      "step": 320,
      "train_runtime": 1774.44,
      "train_tokens_per_second": 1021.549
    },
    {
      "epoch": 0.26,
      "grad_norm": 75.0,
      "learning_rate": 1.7375691312062102e-05,
      "loss": 5.5823,
      "mean_token_accuracy": 0.798132348060608,
      "num_input_tokens_seen": 1844191,
      "num_tokens": 1844191.0,
      "step": 325,
      "train_runtime": 1803.6261,
      "train_tokens_per_second": 1022.491
    },
    {
      "epoch": 0.264,
      "grad_norm": 68.0,
      "learning_rate": 1.728755664620002e-05,
      "loss": 5.4933,
      "mean_token_accuracy": 0.7956582695245743,
      "num_input_tokens_seen": 1873707,
      "num_tokens": 1873707.0,
      "step": 330,
      "train_runtime": 1831.8481,
      "train_tokens_per_second": 1022.851
    },
    {
      "epoch": 0.268,
      "grad_norm": 174.0,
      "learning_rate": 1.7198197898712402e-05,
      "loss": 5.6207,
      "mean_token_accuracy": 0.7940072804689408,
      "num_input_tokens_seen": 1897405,
      "num_tokens": 1897405.0,
      "step": 335,
      "train_runtime": 1855.6387,
      "train_tokens_per_second": 1022.508
    },
    {
      "epoch": 0.272,
      "grad_norm": 87.0,
      "learning_rate": 1.7107630079074477e-05,
      "loss": 5.3281,
      "mean_token_accuracy": 0.7956176668405532,
      "num_input_tokens_seen": 1924306,
      "num_tokens": 1924306.0,
      "step": 340,
      "train_runtime": 1882.2812,
      "train_tokens_per_second": 1022.327
    },
    {
      "epoch": 0.276,
      "grad_norm": 58.0,
      "learning_rate": 1.7015868399847768e-05,
      "loss": 5.3952,
      "mean_token_accuracy": 0.795125538110733,
      "num_input_tokens_seen": 1957493,
      "num_tokens": 1957493.0,
      "step": 345,
      "train_runtime": 1912.1661,
      "train_tokens_per_second": 1023.704
    },
    {
      "epoch": 0.28,
      "grad_norm": 69.5,
      "learning_rate": 1.6922928274124887e-05,
      "loss": 5.5721,
      "mean_token_accuracy": 0.786569619178772,
      "num_input_tokens_seen": 1989461,
      "num_tokens": 1989461.0,
      "step": 350,
      "train_runtime": 1941.3276,
      "train_tokens_per_second": 1024.794
    },
    {
      "epoch": 0.284,
      "grad_norm": 52.75,
      "learning_rate": 1.6828825312940594e-05,
      "loss": 5.6287,
      "mean_token_accuracy": 0.7822471752762794,
      "num_input_tokens_seen": 2018840,
      "num_tokens": 2018840.0,
      "step": 355,
      "train_runtime": 1968.6202,
      "train_tokens_per_second": 1025.51
    },
    {
      "epoch": 0.288,
      "grad_norm": 78.0,
      "learning_rate": 1.673357532264966e-05,
      "loss": 5.6751,
      "mean_token_accuracy": 0.7819222688674927,
      "num_input_tokens_seen": 2046554,
      "num_tokens": 2046554.0,
      "step": 360,
      "train_runtime": 1994.9582,
      "train_tokens_per_second": 1025.863
    },
    {
      "epoch": 0.292,
      "grad_norm": 58.25,
      "learning_rate": 1.663719430227186e-05,
      "loss": 5.3833,
      "mean_token_accuracy": 0.7945975109934806,
      "num_input_tokens_seen": 2074389,
      "num_tokens": 2074389.0,
      "step": 365,
      "train_runtime": 2023.2385,
      "train_tokens_per_second": 1025.282
    },
    {
      "epoch": 0.296,
      "grad_norm": 63.25,
      "learning_rate": 1.653969844080466e-05,
      "loss": 5.3062,
      "mean_token_accuracy": 0.7965321630239487,
      "num_input_tokens_seen": 2102788,
      "num_tokens": 2102788.0,
      "step": 370,
      "train_runtime": 2051.2147,
      "train_tokens_per_second": 1025.143
    },
    {
      "epoch": 0.3,
      "grad_norm": 169.0,
      "learning_rate": 1.644110411450398e-05,
      "loss": 5.6184,
      "mean_token_accuracy": 0.7859084010124207,
      "num_input_tokens_seen": 2127585,
      "num_tokens": 2127585.0,
      "step": 375,
      "train_runtime": 2076.5681,
      "train_tokens_per_second": 1024.568
    },
    {
      "epoch": 0.304,
      "grad_norm": 67.0,
      "learning_rate": 1.634142788413346e-05,
      "loss": 5.7921,
      "mean_token_accuracy": 0.7838741362094879,
      "num_input_tokens_seen": 2153488,
      "num_tokens": 2153488.0,
      "step": 380,
      "train_runtime": 2101.2637,
      "train_tokens_per_second": 1024.854
    },
    {
      "epoch": 0.308,
      "grad_norm": 139.0,
      "learning_rate": 1.6240686492182806e-05,
      "loss": 5.7157,
      "mean_token_accuracy": 0.7820939481258392,
      "num_input_tokens_seen": 2180494,
      "num_tokens": 2180494.0,
      "step": 385,
      "train_runtime": 2127.9074,
      "train_tokens_per_second": 1024.713
    },
    {
      "epoch": 0.312,
      "grad_norm": 45.25,
      "learning_rate": 1.6138896860055555e-05,
      "loss": 5.3245,
      "mean_token_accuracy": 0.7927197381854058,
      "num_input_tokens_seen": 2209057,
      "num_tokens": 2209057.0,
      "step": 390,
      "train_runtime": 2153.5427,
      "train_tokens_per_second": 1025.778
    },
    {
      "epoch": 0.316,
      "grad_norm": 131.0,
      "learning_rate": 1.6036076085226813e-05,
      "loss": 5.2268,
      "mean_token_accuracy": 0.7993880152702332,
      "num_input_tokens_seen": 2238624,
      "num_tokens": 2238624.0,
      "step": 395,
      "train_runtime": 2181.8136,
      "train_tokens_per_second": 1026.038
    },
    {
      "epoch": 0.32,
      "grad_norm": 94.5,
      "learning_rate": 1.593224143837142e-05,
      "loss": 5.6083,
      "mean_token_accuracy": 0.7860068812966347,
      "num_input_tokens_seen": 2266912,
      "num_tokens": 2266912.0,
      "step": 400,
      "train_runtime": 2208.6845,
      "train_tokens_per_second": 1026.363
    },
    {
      "epoch": 0.324,
      "grad_norm": 67.5,
      "learning_rate": 1.582741036046301e-05,
      "loss": 5.6723,
      "mean_token_accuracy": 0.7925399646162987,
      "num_input_tokens_seen": 2298813,
      "num_tokens": 2298813.0,
      "step": 405,
      "train_runtime": 2237.0379,
      "train_tokens_per_second": 1027.615
    },
    {
      "epoch": 0.328,
      "grad_norm": 39.0,
      "learning_rate": 1.572160045984447e-05,
      "loss": 5.3201,
      "mean_token_accuracy": 0.8001187354326248,
      "num_input_tokens_seen": 2331446,
      "num_tokens": 2331446.0,
      "step": 410,
      "train_runtime": 2266.6842,
      "train_tokens_per_second": 1028.571
    },
    {
      "epoch": 0.332,
      "grad_norm": 39.0,
      "learning_rate": 1.561482950927029e-05,
      "loss": 5.587,
      "mean_token_accuracy": 0.7832968756556511,
      "num_input_tokens_seen": 2363201,
      "num_tokens": 2363201.0,
      "step": 415,
      "train_runtime": 2296.0603,
      "train_tokens_per_second": 1029.242
    },
    {
      "epoch": 0.336,
      "grad_norm": 142.0,
      "learning_rate": 1.550711544292131e-05,
      "loss": 5.6473,
      "mean_token_accuracy": 0.7904362455010414,
      "num_input_tokens_seen": 2389216,
      "num_tokens": 2389216.0,
      "step": 420,
      "train_runtime": 2320.7153,
      "train_tokens_per_second": 1029.517
    },
    {
      "epoch": 0.34,
      "grad_norm": 47.5,
      "learning_rate": 1.5398476353392323e-05,
      "loss": 5.4438,
      "mean_token_accuracy": 0.7908162623643875,
      "num_input_tokens_seen": 2419743,
      "num_tokens": 2419743.0,
      "step": 425,
      "train_runtime": 2350.136,
      "train_tokens_per_second": 1029.618
    },
    {
      "epoch": 0.344,
      "grad_norm": 49.25,
      "learning_rate": 1.5288930488653094e-05,
      "loss": 5.2794,
      "mean_token_accuracy": 0.7933614462614059,
      "num_input_tokens_seen": 2447011,
      "num_tokens": 2447011.0,
      "step": 430,
      "train_runtime": 2375.2075,
      "train_tokens_per_second": 1030.23
    },
    {
      "epoch": 0.348,
      "grad_norm": 83.5,
      "learning_rate": 1.5178496248983254e-05,
      "loss": 6.0266,
      "mean_token_accuracy": 0.7747324109077454,
      "num_input_tokens_seen": 2475216,
      "num_tokens": 2475216.0,
      "step": 435,
      "train_runtime": 2401.9371,
      "train_tokens_per_second": 1030.508
    },
    {
      "epoch": 0.352,
      "grad_norm": 89.0,
      "learning_rate": 1.5067192183881658e-05,
      "loss": 5.4756,
      "mean_token_accuracy": 0.7997275143861771,
      "num_input_tokens_seen": 2502081,
      "num_tokens": 2502081.0,
      "step": 440,
      "train_runtime": 2428.7281,
      "train_tokens_per_second": 1030.202
    },
    {
      "epoch": 0.356,
      "grad_norm": 52.0,
      "learning_rate": 1.4955036988950617e-05,
      "loss": 6.1068,
      "mean_token_accuracy": 0.7708079561591148,
      "num_input_tokens_seen": 2529391,
      "num_tokens": 2529391.0,
      "step": 445,
      "train_runtime": 2455.553,
      "train_tokens_per_second": 1030.07
    },
    {
      "epoch": 0.36,
      "grad_norm": 127.5,
      "learning_rate": 1.484204950275565e-05,
      "loss": 5.5882,
      "mean_token_accuracy": 0.7801107332110405,
      "num_input_tokens_seen": 2554232,
      "num_tokens": 2554232.0,
      "step": 450,
      "train_runtime": 2479.6181,
      "train_tokens_per_second": 1030.091
    },
    {
      "epoch": 0.364,
      "grad_norm": 49.5,
      "learning_rate": 1.4728248703661183e-05,
      "loss": 5.4756,
      "mean_token_accuracy": 0.7862519830465317,
      "num_input_tokens_seen": 2582182,
      "num_tokens": 2582182.0,
      "step": 455,
      "train_runtime": 2505.889,
      "train_tokens_per_second": 1030.445
    },
    {
      "epoch": 0.368,
      "grad_norm": 66.5,
      "learning_rate": 1.461365370664276e-05,
      "loss": 5.3923,
      "mean_token_accuracy": 0.7940330818295479,
      "num_input_tokens_seen": 2617794,
      "num_tokens": 2617794.0,
      "step": 460,
      "train_runtime": 2539.941,
      "train_tokens_per_second": 1030.652
    },
    {
      "epoch": 0.372,
      "grad_norm": 104.5,
      "learning_rate": 1.4498283760076362e-05,
      "loss": 5.5707,
      "mean_token_accuracy": 0.7927709832787514,
      "num_input_tokens_seen": 2643272,
      "num_tokens": 2643272.0,
      "step": 465,
      "train_runtime": 2564.5252,
      "train_tokens_per_second": 1030.706
    },
    {
      "epoch": 0.376,
      "grad_norm": 59.25,
      "learning_rate": 1.4382158242505236e-05,
      "loss": 5.644,
      "mean_token_accuracy": 0.7871902465820313,
      "num_input_tokens_seen": 2671869,
      "num_tokens": 2671869.0,
      "step": 470,
      "train_runtime": 2592.8854,
      "train_tokens_per_second": 1030.462
    },
    {
      "epoch": 0.38,
      "grad_norm": 31.625,
      "learning_rate": 1.4265296659384956e-05,
      "loss": 5.6562,
      "mean_token_accuracy": 0.7877990290522575,
      "num_input_tokens_seen": 2702041,
      "num_tokens": 2702041.0,
      "step": 475,
      "train_runtime": 2621.0737,
      "train_tokens_per_second": 1030.891
    },
    {
      "epoch": 0.384,
      "grad_norm": 86.5,
      "learning_rate": 1.4147718639807071e-05,
      "loss": 5.621,
      "mean_token_accuracy": 0.7925810098648072,
      "num_input_tokens_seen": 2732153,
      "num_tokens": 2732153.0,
      "step": 480,
      "train_runtime": 2649.9389,
      "train_tokens_per_second": 1031.025
    },
    {
      "epoch": 0.388,
      "grad_norm": 74.5,
      "learning_rate": 1.4029443933202059e-05,
      "loss": 5.4204,
      "mean_token_accuracy": 0.7914870575070381,
      "num_input_tokens_seen": 2758024,
      "num_tokens": 2758024.0,
      "step": 485,
      "train_runtime": 2676.6477,
      "train_tokens_per_second": 1030.402
    },
    {
      "epoch": 0.392,
      "grad_norm": 70.5,
      "learning_rate": 1.3910492406022033e-05,
      "loss": 5.7675,
      "mean_token_accuracy": 0.7754184618592262,
      "num_input_tokens_seen": 2786096,
      "num_tokens": 2786096.0,
      "step": 490,
      "train_runtime": 2703.9298,
      "train_tokens_per_second": 1030.388
    },
    {
      "epoch": 0.396,
      "grad_norm": 57.0,
      "learning_rate": 1.3790884038403796e-05,
      "loss": 5.5642,
      "mean_token_accuracy": 0.7880642995238304,
      "num_input_tokens_seen": 2814487,
      "num_tokens": 2814487.0,
      "step": 495,
      "train_runtime": 2731.3935,
      "train_tokens_per_second": 1030.422
    },
    {
      "epoch": 0.4,
      "grad_norm": 248.0,
      "learning_rate": 1.36706389208128e-05,
      "loss": 5.5255,
      "mean_token_accuracy": 0.7882839411497116,
      "num_input_tokens_seen": 2847405,
      "num_tokens": 2847405.0,
      "step": 500,
      "train_runtime": 2762.1229,
      "train_tokens_per_second": 1030.876
    },
    {
      "epoch": 0.404,
      "grad_norm": 98.0,
      "learning_rate": 1.354977725066859e-05,
      "loss": 5.3838,
      "mean_token_accuracy": 0.7869982674717904,
      "num_input_tokens_seen": 2876359,
      "num_tokens": 2876359.0,
      "step": 505,
      "train_runtime": 2787.1872,
      "train_tokens_per_second": 1031.993
    },
    {
      "epoch": 0.408,
      "grad_norm": 352.0,
      "learning_rate": 1.3428319328952254e-05,
      "loss": 5.4099,
      "mean_token_accuracy": 0.7843140512704849,
      "num_input_tokens_seen": 2902835,
      "num_tokens": 2902835.0,
      "step": 510,
      "train_runtime": 2811.8431,
      "train_tokens_per_second": 1032.36
    },
    {
      "epoch": 0.412,
      "grad_norm": 112.0,
      "learning_rate": 1.3306285556796494e-05,
      "loss": 5.3675,
      "mean_token_accuracy": 0.7961455345153808,
      "num_input_tokens_seen": 2932928,
      "num_tokens": 2932928.0,
      "step": 515,
      "train_runtime": 2839.8383,
      "train_tokens_per_second": 1032.78
    },
    {
      "epoch": 0.416,
      "grad_norm": 163.0,
      "learning_rate": 1.3183696432058889e-05,
      "loss": 5.2575,
      "mean_token_accuracy": 0.7971223339438438,
      "num_input_tokens_seen": 2957517,
      "num_tokens": 2957517.0,
      "step": 520,
      "train_runtime": 2865.8475,
      "train_tokens_per_second": 1031.987
    },
    {
      "epoch": 0.42,
      "grad_norm": 56.0,
      "learning_rate": 1.3060572545878875e-05,
      "loss": 5.4625,
      "mean_token_accuracy": 0.7884187951683999,
      "num_input_tokens_seen": 2987924,
      "num_tokens": 2987924.0,
      "step": 525,
      "train_runtime": 2893.0838,
      "train_tokens_per_second": 1032.782
    },
    {
      "epoch": 0.424,
      "grad_norm": 84.5,
      "learning_rate": 1.2936934579219094e-05,
      "loss": 4.9978,
      "mean_token_accuracy": 0.8107392936944962,
      "num_input_tokens_seen": 3014169,
      "num_tokens": 3014169.0,
      "step": 530,
      "train_runtime": 2919.0783,
      "train_tokens_per_second": 1032.576
    },
    {
      "epoch": 0.428,
      "grad_norm": 66.5,
      "learning_rate": 1.2812803299391629e-05,
      "loss": 5.7573,
      "mean_token_accuracy": 0.789200983941555,
      "num_input_tokens_seen": 3046708,
      "num_tokens": 3046708.0,
      "step": 535,
      "train_runtime": 2948.6574,
      "train_tokens_per_second": 1033.253
    },
    {
      "epoch": 0.432,
      "grad_norm": 70.5,
      "learning_rate": 1.2688199556569753e-05,
      "loss": 5.4901,
      "mean_token_accuracy": 0.7852542266249657,
      "num_input_tokens_seen": 3074318,
      "num_tokens": 3074318.0,
      "step": 540,
      "train_runtime": 2975.382,
      "train_tokens_per_second": 1033.252
    },
    {
      "epoch": 0.436,
      "grad_norm": 146.0,
      "learning_rate": 1.2563144280285742e-05,
      "loss": 5.1747,
      "mean_token_accuracy": 0.8006948977708817,
      "num_input_tokens_seen": 3102044,
      "num_tokens": 3102044.0,
      "step": 545,
      "train_runtime": 3001.1559,
      "train_tokens_per_second": 1033.616
    },
    {
      "epoch": 0.44,
      "grad_norm": 63.5,
      "learning_rate": 1.2437658475915378e-05,
      "loss": 5.5294,
      "mean_token_accuracy": 0.7853314474225044,
      "num_input_tokens_seen": 3129406,
      "num_tokens": 3129406.0,
      "step": 550,
      "train_runtime": 3027.9577,
      "train_tokens_per_second": 1033.504
    },
    {
      "epoch": 0.444,
      "grad_norm": 72.5,
      "learning_rate": 1.23117632211497e-05,
      "loss": 5.1169,
      "mean_token_accuracy": 0.8012081518769264,
      "num_input_tokens_seen": 3159496,
      "num_tokens": 3159496.0,
      "step": 555,
      "train_runtime": 3057.971,
      "train_tokens_per_second": 1033.2
    },
    {
      "epoch": 0.448,
      "grad_norm": 102.5,
      "learning_rate": 1.2185479662454596e-05,
      "loss": 5.5137,
      "mean_token_accuracy": 0.7913802459836006,
      "num_input_tokens_seen": 3185619,
      "num_tokens": 3185619.0,
      "step": 560,
      "train_runtime": 3083.8761,
      "train_tokens_per_second": 1032.992
    },
    {
      "epoch": 0.452,
      "grad_norm": 132.0,
      "learning_rate": 1.2058829011518896e-05,
      "loss": 5.2765,
      "mean_token_accuracy": 0.7947102382779121,
      "num_input_tokens_seen": 3212881,
      "num_tokens": 3212881.0,
      "step": 565,
      "train_runtime": 3109.5104,
      "train_tokens_per_second": 1033.243
    },
    {
      "epoch": 0.456,
      "grad_norm": 167.0,
      "learning_rate": 1.193183254169142e-05,
      "loss": 5.3215,
      "mean_token_accuracy": 0.7931090787053108,
      "num_input_tokens_seen": 3240673,
      "num_tokens": 3240673.0,
      "step": 570,
      "train_runtime": 3136.4109,
      "train_tokens_per_second": 1033.242
    },
    {
      "epoch": 0.46,
      "grad_norm": 40.5,
      "learning_rate": 1.1804511584407763e-05,
      "loss": 5.9359,
      "mean_token_accuracy": 0.7751367390155792,
      "num_input_tokens_seen": 3272505,
      "num_tokens": 3272505.0,
      "step": 575,
      "train_runtime": 3164.4979,
      "train_tokens_per_second": 1034.131
    },
    {
      "epoch": 0.464,
      "grad_norm": 108.0,
      "learning_rate": 1.1676887525607272e-05,
      "loss": 5.4725,
      "mean_token_accuracy": 0.7907213315367698,
      "num_input_tokens_seen": 3302432,
      "num_tokens": 3302432.0,
      "step": 580,
      "train_runtime": 3193.0515,
      "train_tokens_per_second": 1034.256
    },
    {
      "epoch": 0.468,
      "grad_norm": 86.0,
      "learning_rate": 1.1548981802140849e-05,
      "loss": 5.2601,
      "mean_token_accuracy": 0.7978497371077538,
      "num_input_tokens_seen": 3328723,
      "num_tokens": 3328723.0,
      "step": 585,
      "train_runtime": 3219.6782,
      "train_tokens_per_second": 1033.868
    },
    {
      "epoch": 0.472,
      "grad_norm": 90.5,
      "learning_rate": 1.142081589817027e-05,
      "loss": 4.9631,
      "mean_token_accuracy": 0.8012796014547348,
      "num_input_tokens_seen": 3356380,
      "num_tokens": 3356380.0,
      "step": 590,
      "train_runtime": 3246.7315,
      "train_tokens_per_second": 1033.772
    },
    {
      "epoch": 0.476,
      "grad_norm": 229.0,
      "learning_rate": 1.129241134155949e-05,
      "loss": 5.4675,
      "mean_token_accuracy": 0.7908360511064529,
      "num_input_tokens_seen": 3383429,
      "num_tokens": 3383429.0,
      "step": 595,
      "train_runtime": 3273.3996,
      "train_tokens_per_second": 1033.613
    },
    {
      "epoch": 0.48,
      "grad_norm": 50.75,
      "learning_rate": 1.1163789700258656e-05,
      "loss": 5.1986,
      "mean_token_accuracy": 0.8053984194993973,
      "num_input_tokens_seen": 3415705,
      "num_tokens": 3415705.0,
      "step": 600,
      "train_runtime": 3305.0605,
      "train_tokens_per_second": 1033.477
    },
    {
      "epoch": 0.484,
      "grad_norm": 219.0,
      "learning_rate": 1.1034972578681338e-05,
      "loss": 5.1812,
      "mean_token_accuracy": 0.7935044363141059,
      "num_input_tokens_seen": 3443164,
      "num_tokens": 3443164.0,
      "step": 605,
      "train_runtime": 3332.0068,
      "train_tokens_per_second": 1033.36
    },
    {
      "epoch": 0.488,
      "grad_norm": 157.0,
      "learning_rate": 1.0905981614075693e-05,
      "loss": 5.1947,
      "mean_token_accuracy": 0.796344393491745,
      "num_input_tokens_seen": 3467326,
      "num_tokens": 3467326.0,
      "step": 610,
      "train_runtime": 3356.439,
      "train_tokens_per_second": 1033.037
    },
    {
      "epoch": 0.492,
      "grad_norm": 716.0,
      "learning_rate": 1.0776838472890065e-05,
      "loss": 5.292,
      "mean_token_accuracy": 0.7951879158616066,
      "num_input_tokens_seen": 3495396,
      "num_tokens": 3495396.0,
      "step": 615,
      "train_runtime": 3383.0011,
      "train_tokens_per_second": 1033.223
    },
    {
      "epoch": 0.496,
      "grad_norm": 49.5,
      "learning_rate": 1.06475648471337e-05,
      "loss": 5.0782,
      "mean_token_accuracy": 0.8010394781827926,
      "num_input_tokens_seen": 3520634,
      "num_tokens": 3520634.0,
      "step": 620,
      "train_runtime": 3409.0668,
      "train_tokens_per_second": 1032.727
    },
    {
      "epoch": 0.5,
      "grad_norm": 95.5,
      "learning_rate": 1.0518182450733185e-05,
      "loss": 5.5994,
      "mean_token_accuracy": 0.7855334684252739,
      "num_input_tokens_seen": 3549657,
      "num_tokens": 3549657.0,
      "step": 625,
      "train_runtime": 3436.945,
      "train_tokens_per_second": 1032.794
    },
    {
      "epoch": 0.504,
      "grad_norm": 76.0,
      "learning_rate": 1.0388713015885161e-05,
      "loss": 5.5592,
      "mean_token_accuracy": 0.7845589280128479,
      "num_input_tokens_seen": 3579684,
      "num_tokens": 3579684.0,
      "step": 630,
      "train_runtime": 3464.2838,
      "train_tokens_per_second": 1033.311
    },
    {
      "epoch": 0.508,
      "grad_norm": 44.5,
      "learning_rate": 1.0259178289406011e-05,
      "loss": 5.3878,
      "mean_token_accuracy": 0.7951082989573479,
      "num_input_tokens_seen": 3605087,
      "num_tokens": 3605087.0,
      "step": 635,
      "train_runtime": 3488.9594,
      "train_tokens_per_second": 1033.284
    },
    {
      "epoch": 0.512,
      "grad_norm": 59.5,
      "learning_rate": 1.0129600029079072e-05,
      "loss": 5.7061,
      "mean_token_accuracy": 0.7883853644132615,
      "num_input_tokens_seen": 3632957,
      "num_tokens": 3632957.0,
      "step": 640,
      "train_runtime": 3515.7879,
      "train_tokens_per_second": 1033.327
    },
    {
      "epoch": 0.516,
      "grad_norm": 52.25,
      "learning_rate": 1e-05,
      "loss": 5.2977,
      "mean_token_accuracy": 0.7912828177213669,
      "num_input_tokens_seen": 3660791,
      "num_tokens": 3660791.0,
      "step": 645,
      "train_runtime": 3543.3758,
      "train_tokens_per_second": 1033.137
    },
    {
      "epoch": 0.52,
      "grad_norm": 52.0,
      "learning_rate": 9.870399970920932e-06,
      "loss": 5.4416,
      "mean_token_accuracy": 0.7928792417049408,
      "num_input_tokens_seen": 3689640,
      "num_tokens": 3689640.0,
      "step": 650,
      "train_runtime": 3570.9162,
      "train_tokens_per_second": 1033.247
    },
    {
      "epoch": 0.524,
      "grad_norm": 62.75,
      "learning_rate": 9.740821710593989e-06,
      "loss": 5.3682,
      "mean_token_accuracy": 0.798756355047226,
      "num_input_tokens_seen": 3715172,
      "num_tokens": 3715172.0,
      "step": 655,
      "train_runtime": 3596.1899,
      "train_tokens_per_second": 1033.086
    },
    {
      "epoch": 0.528,
      "grad_norm": 89.5,
      "learning_rate": 9.61128698411484e-06,
      "loss": 5.168,
      "mean_token_accuracy": 0.7963975608348847,
      "num_input_tokens_seen": 3741030,
      "num_tokens": 3741030.0,
      "step": 660,
      "train_runtime": 3620.5686,
      "train_tokens_per_second": 1033.271
    },
    {
      "epoch": 0.532,
      "grad_norm": 74.0,
      "learning_rate": 9.481817549266817e-06,
      "loss": 4.9416,
      "mean_token_accuracy": 0.8047078907489776,
      "num_input_tokens_seen": 3768537,
      "num_tokens": 3768537.0,
      "step": 665,
      "train_runtime": 3647.1969,
      "train_tokens_per_second": 1033.269
    },
    {
      "epoch": 0.536,
      "grad_norm": 45.25,
      "learning_rate": 9.352435152866299e-06,
      "loss": 5.3385,
      "mean_token_accuracy": 0.7905179291963578,
      "num_input_tokens_seen": 3799526,
      "num_tokens": 3799526.0,
      "step": 670,
      "train_runtime": 3674.2376,
      "train_tokens_per_second": 1034.099
    },
    {
      "epoch": 0.54,
      "grad_norm": 50.75,
      "learning_rate": 9.223161527109938e-06,
      "loss": 5.1579,
      "mean_token_accuracy": 0.8002543315291405,
      "num_input_tokens_seen": 3833970,
      "num_tokens": 3833970.0,
      "step": 675,
      "train_runtime": 3706.0243,
      "train_tokens_per_second": 1034.524
    },
    {
      "epoch": 0.544,
      "grad_norm": 185.0,
      "learning_rate": 9.09401838592431e-06,
      "loss": 5.1117,
      "mean_token_accuracy": 0.7966769933700562,
      "num_input_tokens_seen": 3860184,
      "num_tokens": 3860184.0,
      "step": 680,
      "train_runtime": 3730.8409,
      "train_tokens_per_second": 1034.669
    },
    {
      "epoch": 0.548,
      "grad_norm": 94.0,
      "learning_rate": 8.965027421318666e-06,
      "loss": 5.1979,
      "mean_token_accuracy": 0.7981098249554635,
      "num_input_tokens_seen": 3886677,
      "num_tokens": 3886677.0,
      "step": 685,
      "train_runtime": 3756.8191,
      "train_tokens_per_second": 1034.566
    },
    {
      "epoch": 0.552,
      "grad_norm": 47.75,
      "learning_rate": 8.836210299741346e-06,
      "loss": 5.6591,
      "mean_token_accuracy": 0.7767333656549453,
      "num_input_tokens_seen": 3914353,
      "num_tokens": 3914353.0,
      "step": 690,
      "train_runtime": 3789.086,
      "train_tokens_per_second": 1033.06
    },
    {
      "epoch": 0.556,
      "grad_norm": 106.0,
      "learning_rate": 8.707588658440511e-06,
      "loss": 5.5255,
      "mean_token_accuracy": 0.7878315180540085,
      "num_input_tokens_seen": 3940719,
      "num_tokens": 3940719.0,
      "step": 695,
      "train_runtime": 3814.8928,
      "train_tokens_per_second": 1032.983
    },
    {
      "epoch": 0.56,
      "grad_norm": 41.25,
      "learning_rate": 8.579184101829734e-06,
      "loss": 5.5184,
      "mean_token_accuracy": 0.7867645308375358,
      "num_input_tokens_seen": 3978888,
      "num_tokens": 3978888.0,
      "step": 700,
      "train_runtime": 3851.4853,
      "train_tokens_per_second": 1033.079
    },
    {
      "epoch": 0.564,
      "grad_norm": 90.5,
      "learning_rate": 8.451018197859153e-06,
      "loss": 5.3507,
      "mean_token_accuracy": 0.7968938469886779,
      "num_input_tokens_seen": 4011090,
      "num_tokens": 4011090.0,
      "step": 705,
      "train_runtime": 3880.8486,
      "train_tokens_per_second": 1033.56
    },
    {
      "epoch": 0.568,
      "grad_norm": 101.0,
      "learning_rate": 8.323112474392731e-06,
      "loss": 5.4665,
      "mean_token_accuracy": 0.7853416830301285,
      "num_input_tokens_seen": 4038326,
      "num_tokens": 4038326.0,
      "step": 710,
      "train_runtime": 3907.9772,
      "train_tokens_per_second": 1033.355
    },
    {
      "epoch": 0.572,
      "grad_norm": 318.0,
      "learning_rate": 8.195488415592238e-06,
      "loss": 5.5897,
      "mean_token_accuracy": 0.7923433750867843,
      "num_input_tokens_seen": 4069168,
      "num_tokens": 4069168.0,
      "step": 715,
      "train_runtime": 3938.7796,
      "train_tokens_per_second": 1033.104
    },
    {
      "epoch": 0.576,
      "grad_norm": 192.0,
      "learning_rate": 8.068167458308582e-06,
      "loss": 5.4862,
      "mean_token_accuracy": 0.7898807466030121,
      "num_input_tokens_seen": 4096317,
      "num_tokens": 4096317.0,
      "step": 720,
      "train_runtime": 3965.423,
      "train_tokens_per_second": 1033.009
    },
    {
      "epoch": 0.58,
      "grad_norm": 54.25,
      "learning_rate": 7.941170988481108e-06,
      "loss": 5.6047,
      "mean_token_accuracy": 0.7875970765948296,
      "num_input_tokens_seen": 4125700,
      "num_tokens": 4125700.0,
      "step": 725,
      "train_runtime": 3992.1983,
      "train_tokens_per_second": 1033.441
    },
    {
      "epoch": 0.584,
      "grad_norm": 56.0,
      "learning_rate": 7.814520337545405e-06,
      "loss": 5.628,
      "mean_token_accuracy": 0.7885335445404053,
      "num_input_tokens_seen": 4151858,
      "num_tokens": 4151858.0,
      "step": 730,
      "train_runtime": 4017.6605,
      "train_tokens_per_second": 1033.402
    },
    {
      "epoch": 0.588,
      "grad_norm": 79.0,
      "learning_rate": 7.688236778850307e-06,
      "loss": 5.6834,
      "mean_token_accuracy": 0.7778642490506172,
      "num_input_tokens_seen": 4175955,
      "num_tokens": 4175955.0,
      "step": 735,
      "train_runtime": 4042.321,
      "train_tokens_per_second": 1033.059
    },
    {
      "epoch": 0.592,
      "grad_norm": 57.5,
      "learning_rate": 7.5623415240846235e-06,
      "loss": 5.4957,
      "mean_token_accuracy": 0.7904580295085907,
      "num_input_tokens_seen": 4203815,
      "num_tokens": 4203815.0,
      "step": 740,
      "train_runtime": 4068.3684,
      "train_tokens_per_second": 1033.293
    },
    {
      "epoch": 0.596,
      "grad_norm": 63.0,
      "learning_rate": 7.4368557197142596e-06,
      "loss": 4.8703,
      "mean_token_accuracy": 0.8159982651472092,
      "num_input_tokens_seen": 4233192,
      "num_tokens": 4233192.0,
      "step": 745,
      "train_runtime": 4097.3421,
      "train_tokens_per_second": 1033.156
    },
    {
      "epoch": 0.6,
      "grad_norm": 71.5,
      "learning_rate": 7.311800443430251e-06,
      "loss": 5.3462,
      "mean_token_accuracy": 0.794153805077076,
      "num_input_tokens_seen": 4262718,
      "num_tokens": 4262718.0,
      "step": 750,
      "train_runtime": 4124.7729,
      "train_tokens_per_second": 1033.443
    },
    {
      "epoch": 0.604,
      "grad_norm": 152.0,
      "learning_rate": 7.187196700608373e-06,
      "loss": 4.9215,
      "mean_token_accuracy": 0.8081203132867814,
      "num_input_tokens_seen": 4288296,
      "num_tokens": 4288296.0,
      "step": 755,
      "train_runtime": 4151.2234,
      "train_tokens_per_second": 1033.02
    },
    {
      "epoch": 0.608,
      "grad_norm": 73.5,
      "learning_rate": 7.063065420780909e-06,
      "loss": 5.2529,
      "mean_token_accuracy": 0.7946424350142479,
      "num_input_tokens_seen": 4318154,
      "num_tokens": 4318154.0,
      "step": 760,
      "train_runtime": 4180.0353,
      "train_tokens_per_second": 1033.042
    },
    {
      "epoch": 0.612,
      "grad_norm": 247.0,
      "learning_rate": 6.939427454121128e-06,
      "loss": 4.9659,
      "mean_token_accuracy": 0.8127795070409775,
      "num_input_tokens_seen": 4345188,
      "num_tokens": 4345188.0,
      "step": 765,
      "train_runtime": 4205.4405,
      "train_tokens_per_second": 1033.23
    },
    {
      "epoch": 0.616,
      "grad_norm": 80.0,
      "learning_rate": 6.816303567941111e-06,
      "loss": 5.2259,
      "mean_token_accuracy": 0.7914159163832665,
      "num_input_tokens_seen": 4373786,
      "num_tokens": 4373786.0,
      "step": 770,
      "train_runtime": 4233.1681,
      "train_tokens_per_second": 1033.218
    },
    {
      "epoch": 0.62,
      "grad_norm": 130.0,
      "learning_rate": 6.693714443203507e-06,
      "loss": 5.6243,
      "mean_token_accuracy": 0.7848341032862663,
      "num_input_tokens_seen": 4404430,
      "num_tokens": 4404430.0,
      "step": 775,
      "train_runtime": 4261.1517,
      "train_tokens_per_second": 1033.624
    },
    {
      "epoch": 0.624,
      "grad_norm": 59.25,
      "learning_rate": 6.571680671047749e-06,
      "loss": 4.9764,
      "mean_token_accuracy": 0.8033898919820786,
      "num_input_tokens_seen": 4434585,
      "num_tokens": 4434585.0,
      "step": 780,
      "train_runtime": 4290.2703,
      "train_tokens_per_second": 1033.638
    },
    {
      "epoch": 0.628,
      "grad_norm": 109.5,
      "learning_rate": 6.450222749331414e-06,
      "loss": 5.7304,
      "mean_token_accuracy": 0.7839483708143234,
      "num_input_tokens_seen": 4463472,
      "num_tokens": 4463472.0,
      "step": 785,
      "train_runtime": 4317.149,
      "train_tokens_per_second": 1033.893
    },
    {
      "epoch": 0.632,
      "grad_norm": 64.5,
      "learning_rate": 6.329361079187199e-06,
      "loss": 5.7255,
      "mean_token_accuracy": 0.7803121000528336,
      "num_input_tokens_seen": 4490469,
      "num_tokens": 4490469.0,
      "step": 790,
      "train_runtime": 4342.3523,
      "train_tokens_per_second": 1034.11
    },
    {
      "epoch": 0.636,
      "grad_norm": 116.0,
      "learning_rate": 6.209115961596208e-06,
      "loss": 5.347,
      "mean_token_accuracy": 0.7927216425538063,
      "num_input_tokens_seen": 4517649,
      "num_tokens": 4517649.0,
      "step": 795,
      "train_runtime": 4368.0708,
      "train_tokens_per_second": 1034.244
    },
    {
      "epoch": 0.64,
      "grad_norm": 57.5,
      "learning_rate": 6.0895075939779705e-06,
      "loss": 5.5641,
      "mean_token_accuracy": 0.795795188844204,
      "num_input_tokens_seen": 4550712,
      "num_tokens": 4550712.0,
      "step": 800,
      "train_runtime": 4400.2836,
      "train_tokens_per_second": 1034.186
    },
    {
      "epoch": 0.644,
      "grad_norm": 61.5,
      "learning_rate": 5.970556066797941e-06,
      "loss": 5.8032,
      "mean_token_accuracy": 0.7773613944649697,
      "num_input_tokens_seen": 4585129,
      "num_tokens": 4585129.0,
      "step": 805,
      "train_runtime": 4430.3262,
      "train_tokens_per_second": 1034.942
    },
    {
      "epoch": 0.648,
      "grad_norm": 46.75,
      "learning_rate": 5.852281360192933e-06,
      "loss": 5.4492,
      "mean_token_accuracy": 0.7958677127957344,
      "num_input_tokens_seen": 4614693,
      "num_tokens": 4614693.0,
      "step": 810,
      "train_runtime": 4458.5188,
      "train_tokens_per_second": 1035.028
    },
    {
      "epoch": 0.652,
      "grad_norm": 135.0,
      "learning_rate": 5.7347033406150494e-06,
      "loss": 5.2903,
      "mean_token_accuracy": 0.7904482677578926,
      "num_input_tokens_seen": 4644174,
      "num_tokens": 4644174.0,
      "step": 815,
      "train_runtime": 4486.8043,
      "train_tokens_per_second": 1035.074
    },
    {
      "epoch": 0.656,
      "grad_norm": 62.0,
      "learning_rate": 5.617841757494762e-06,
      "loss": 5.322,
      "mean_token_accuracy": 0.7915823593735695,
      "num_input_tokens_seen": 4673847,
      "num_tokens": 4673847.0,
      "step": 820,
      "train_runtime": 4515.9617,
      "train_tokens_per_second": 1034.962
    },
    {
      "epoch": 0.66,
      "grad_norm": 258.0,
      "learning_rate": 5.501716239923642e-06,
      "loss": 5.0579,
      "mean_token_accuracy": 0.8000400334596633,
      "num_input_tokens_seen": 4700972,
      "num_tokens": 4700972.0,
      "step": 825,
      "train_runtime": 4541.9592,
      "train_tokens_per_second": 1035.01
    },
    {
      "epoch": 0.664,
      "grad_norm": 70.0,
      "learning_rate": 5.386346293357242e-06,
      "loss": 5.2416,
      "mean_token_accuracy": 0.8032521203160286,
      "num_input_tokens_seen": 4728665,
      "num_tokens": 4728665.0,
      "step": 830,
      "train_runtime": 4570.1941,
      "train_tokens_per_second": 1034.675
    },
    {
      "epoch": 0.668,
      "grad_norm": 100.5,
      "learning_rate": 5.271751296338823e-06,
      "loss": 5.4391,
      "mean_token_accuracy": 0.7925701707601547,
      "num_input_tokens_seen": 4756105,
      "num_tokens": 4756105.0,
      "step": 835,
      "train_runtime": 4596.0174,
      "train_tokens_per_second": 1034.832
    },
    {
      "epoch": 0.672,
      "grad_norm": 109.5,
      "learning_rate": 5.15795049724435e-06,
      "loss": 5.7334,
      "mean_token_accuracy": 0.7834463611245155,
      "num_input_tokens_seen": 4784316,
      "num_tokens": 4784316.0,
      "step": 840,
      "train_runtime": 4623.5278,
      "train_tokens_per_second": 1034.776
    },
    {
      "epoch": 0.676,
      "grad_norm": 58.75,
      "learning_rate": 5.044963011049384e-06,
      "loss": 5.3186,
      "mean_token_accuracy": 0.7899114429950714,
      "num_input_tokens_seen": 4810582,
      "num_tokens": 4810582.0,
      "step": 845,
      "train_runtime": 4648.9466,
      "train_tokens_per_second": 1034.768
    },
    {
      "epoch": 0.68,
      "grad_norm": 47.0,
      "learning_rate": 4.932807816118347e-06,
      "loss": 5.5814,
      "mean_token_accuracy": 0.7850132435560226,
      "num_input_tokens_seen": 4846569,
      "num_tokens": 4846569.0,
      "step": 850,
      "train_runtime": 4682.0829,
      "train_tokens_per_second": 1035.131
    },
    {
      "epoch": 0.684,
      "grad_norm": 34.5,
      "learning_rate": 4.821503751016746e-06,
      "loss": 5.3389,
      "mean_token_accuracy": 0.7990380316972733,
      "num_input_tokens_seen": 4876152,
      "num_tokens": 4876152.0,
      "step": 855,
      "train_runtime": 4709.9784,
      "train_tokens_per_second": 1035.281
    },
    {
      "epoch": 0.688,
      "grad_norm": 412.0,
      "learning_rate": 4.711069511346909e-06,
      "loss": 5.7378,
      "mean_token_accuracy": 0.7886842951178551,
      "num_input_tokens_seen": 4913773,
      "num_tokens": 4913773.0,
      "step": 860,
      "train_runtime": 4744.369,
      "train_tokens_per_second": 1035.706
    },
    {
      "epoch": 0.692,
      "grad_norm": 62.75,
      "learning_rate": 4.601523646607675e-06,
      "loss": 5.8298,
      "mean_token_accuracy": 0.7812601879239083,
      "num_input_tokens_seen": 4943904,
      "num_tokens": 4943904.0,
      "step": 865,
      "train_runtime": 4772.8549,
      "train_tokens_per_second": 1035.838
    },
    {
      "epoch": 0.696,
      "grad_norm": 57.75,
      "learning_rate": 4.492884557078688e-06,
      "loss": 4.8728,
      "mean_token_accuracy": 0.8109473079442978,
      "num_input_tokens_seen": 4969286,
      "num_tokens": 4969286.0,
      "step": 870,
| "train_runtime": 4796.4072, | |
| "train_tokens_per_second": 1036.043 | |
| }, | |
| { | |
| "epoch": 0.7, | |
| "grad_norm": 49.5, | |
| "learning_rate": 4.385170490729712e-06, | |
| "loss": 5.3615, | |
| "mean_token_accuracy": 0.7962699040770531, | |
| "num_input_tokens_seen": 4996130, | |
| "num_tokens": 4996130.0, | |
| "step": 875, | |
| "train_runtime": 4822.3119, | |
| "train_tokens_per_second": 1036.045 | |
| }, | |
| { | |
| "epoch": 0.704, | |
| "grad_norm": 36.75, | |
| "learning_rate": 4.278399540155536e-06, | |
| "loss": 5.6546, | |
| "mean_token_accuracy": 0.7811813220381737, | |
| "num_input_tokens_seen": 5024440, | |
| "num_tokens": 5024440.0, | |
| "step": 880, | |
| "train_runtime": 4849.0415, | |
| "train_tokens_per_second": 1036.172 | |
| }, | |
| { | |
| "epoch": 0.708, | |
| "grad_norm": 198.0, | |
| "learning_rate": 4.172589639536992e-06, | |
| "loss": 5.3273, | |
| "mean_token_accuracy": 0.7892009258270264, | |
| "num_input_tokens_seen": 5051605, | |
| "num_tokens": 5051605.0, | |
| "step": 885, | |
| "train_runtime": 4873.5248, | |
| "train_tokens_per_second": 1036.54 | |
| }, | |
| { | |
| "epoch": 0.712, | |
| "grad_norm": 91.0, | |
| "learning_rate": 4.067758561628577e-06, | |
| "loss": 5.7275, | |
| "mean_token_accuracy": 0.7812389150261879, | |
| "num_input_tokens_seen": 5080104, | |
| "num_tokens": 5080104.0, | |
| "step": 890, | |
| "train_runtime": 4901.1022, | |
| "train_tokens_per_second": 1036.523 | |
| }, | |
| { | |
| "epoch": 0.716, | |
| "grad_norm": 35.5, | |
| "learning_rate": 3.9639239147731865e-06, | |
| "loss": 5.425, | |
| "mean_token_accuracy": 0.7942001700401307, | |
| "num_input_tokens_seen": 5113018, | |
| "num_tokens": 5113018.0, | |
| "step": 895, | |
| "train_runtime": 4930.903, | |
| "train_tokens_per_second": 1036.933 | |
| }, | |
| { | |
| "epoch": 0.72, | |
| "grad_norm": 61.5, | |
| "learning_rate": 3.861103139944448e-06, | |
| "loss": 5.2248, | |
| "mean_token_accuracy": 0.8011772260069847, | |
| "num_input_tokens_seen": 5142941, | |
| "num_tokens": 5142941.0, | |
| "step": 900, | |
| "train_runtime": 4959.3927, | |
| "train_tokens_per_second": 1037.01 | |
| }, | |
| { | |
| "epoch": 0.724, | |
| "grad_norm": 41.75, | |
| "learning_rate": 3.759313507817196e-06, | |
| "loss": 5.1091, | |
| "mean_token_accuracy": 0.7965022444725036, | |
| "num_input_tokens_seen": 5172026, | |
| "num_tokens": 5172026.0, | |
| "step": 905, | |
| "train_runtime": 4985.9195, | |
| "train_tokens_per_second": 1037.326 | |
| }, | |
| { | |
| "epoch": 0.728, | |
| "grad_norm": 53.25, | |
| "learning_rate": 3.658572115866541e-06, | |
| "loss": 5.7217, | |
| "mean_token_accuracy": 0.7808126404881477, | |
| "num_input_tokens_seen": 5202257, | |
| "num_tokens": 5202257.0, | |
| "step": 910, | |
| "train_runtime": 5014.4393, | |
| "train_tokens_per_second": 1037.455 | |
| }, | |
| { | |
| "epoch": 0.732, | |
| "grad_norm": 44.5, | |
| "learning_rate": 3.558895885496023e-06, | |
| "loss": 5.3946, | |
| "mean_token_accuracy": 0.789019052684307, | |
| "num_input_tokens_seen": 5229589, | |
| "num_tokens": 5229589.0, | |
| "step": 915, | |
| "train_runtime": 5040.3771, | |
| "train_tokens_per_second": 1037.539 | |
| }, | |
| { | |
| "epoch": 0.736, | |
| "grad_norm": 318.0, | |
| "learning_rate": 3.4603015591953393e-06, | |
| "loss": 5.3473, | |
| "mean_token_accuracy": 0.805972746014595, | |
| "num_input_tokens_seen": 5258125, | |
| "num_tokens": 5258125.0, | |
| "step": 920, | |
| "train_runtime": 5066.3915, | |
| "train_tokens_per_second": 1037.844 | |
| }, | |
| { | |
| "epoch": 0.74, | |
| "grad_norm": 41.0, | |
| "learning_rate": 3.3628056977281456e-06, | |
| "loss": 5.6571, | |
| "mean_token_accuracy": 0.7789395034313202, | |
| "num_input_tokens_seen": 5286827, | |
| "num_tokens": 5286827.0, | |
| "step": 925, | |
| "train_runtime": 5093.3897, | |
| "train_tokens_per_second": 1037.978 | |
| }, | |
| { | |
| "epoch": 0.744, | |
| "grad_norm": 50.25, | |
| "learning_rate": 3.266424677350346e-06, | |
| "loss": 5.2024, | |
| "mean_token_accuracy": 0.7979300260543823, | |
| "num_input_tokens_seen": 5313346, | |
| "num_tokens": 5313346.0, | |
| "step": 930, | |
| "train_runtime": 5119.0652, | |
| "train_tokens_per_second": 1037.952 | |
| }, | |
| { | |
| "epoch": 0.748, | |
| "grad_norm": 70.0, | |
| "learning_rate": 3.1711746870594083e-06, | |
| "loss": 5.2941, | |
| "mean_token_accuracy": 0.805306826531887, | |
| "num_input_tokens_seen": 5343136, | |
| "num_tokens": 5343136.0, | |
| "step": 935, | |
| "train_runtime": 5148.1407, | |
| "train_tokens_per_second": 1037.877 | |
| }, | |
| { | |
| "epoch": 0.752, | |
| "grad_norm": 56.25, | |
| "learning_rate": 3.077071725875116e-06, | |
| "loss": 5.1137, | |
| "mean_token_accuracy": 0.8032099828124046, | |
| "num_input_tokens_seen": 5372738, | |
| "num_tokens": 5372738.0, | |
| "step": 940, | |
| "train_runtime": 5176.4597, | |
| "train_tokens_per_second": 1037.917 | |
| }, | |
| { | |
| "epoch": 0.756, | |
| "grad_norm": 46.75, | |
| "learning_rate": 2.9841316001522345e-06, | |
| "loss": 4.861, | |
| "mean_token_accuracy": 0.809606908261776, | |
| "num_input_tokens_seen": 5400053, | |
| "num_tokens": 5400053.0, | |
| "step": 945, | |
| "train_runtime": 5203.2867, | |
| "train_tokens_per_second": 1037.816 | |
| }, | |
| { | |
| "epoch": 0.76, | |
| "grad_norm": 306.0, | |
| "learning_rate": 2.8923699209255285e-06, | |
| "loss": 5.4282, | |
| "mean_token_accuracy": 0.7930289775133132, | |
| "num_input_tokens_seen": 5423237, | |
| "num_tokens": 5423237.0, | |
| "step": 950, | |
| "train_runtime": 5227.2135, | |
| "train_tokens_per_second": 1037.501 | |
| }, | |
| { | |
| "epoch": 0.764, | |
| "grad_norm": 61.25, | |
| "learning_rate": 2.8018021012875994e-06, | |
| "loss": 5.0416, | |
| "mean_token_accuracy": 0.8067665219306945, | |
| "num_input_tokens_seen": 5451018, | |
| "num_tokens": 5451018.0, | |
| "step": 955, | |
| "train_runtime": 5254.5977, | |
| "train_tokens_per_second": 1037.381 | |
| }, | |
| { | |
| "epoch": 0.768, | |
| "grad_norm": 76.5, | |
| "learning_rate": 2.7124433537999838e-06, | |
| "loss": 5.3888, | |
| "mean_token_accuracy": 0.7953536674380303, | |
| "num_input_tokens_seen": 5481559, | |
| "num_tokens": 5481559.0, | |
| "step": 960, | |
| "train_runtime": 5281.8099, | |
| "train_tokens_per_second": 1037.818 | |
| }, | |
| { | |
| "epoch": 0.772, | |
| "grad_norm": 76.5, | |
| "learning_rate": 2.6243086879379e-06, | |
| "loss": 5.3031, | |
| "mean_token_accuracy": 0.7893404349684715, | |
| "num_input_tokens_seen": 5509600, | |
| "num_tokens": 5509600.0, | |
| "step": 965, | |
| "train_runtime": 5308.2353, | |
| "train_tokens_per_second": 1037.934 | |
| }, | |
| { | |
| "epoch": 0.776, | |
| "grad_norm": 53.0, | |
| "learning_rate": 2.537412907569127e-06, | |
| "loss": 5.2369, | |
| "mean_token_accuracy": 0.7978512957692147, | |
| "num_input_tokens_seen": 5538862, | |
| "num_tokens": 5538862.0, | |
| "step": 970, | |
| "train_runtime": 5335.0361, | |
| "train_tokens_per_second": 1038.205 | |
| }, | |
| { | |
| "epoch": 0.78, | |
| "grad_norm": 66.5, | |
| "learning_rate": 2.451770608467432e-06, | |
| "loss": 5.1098, | |
| "mean_token_accuracy": 0.805266946554184, | |
| "num_input_tokens_seen": 5566918, | |
| "num_tokens": 5566918.0, | |
| "step": 975, | |
| "train_runtime": 5362.4471, | |
| "train_tokens_per_second": 1038.13 | |
| }, | |
| { | |
| "epoch": 0.784, | |
| "grad_norm": 49.25, | |
| "learning_rate": 2.3673961758609156e-06, | |
| "loss": 5.186, | |
| "mean_token_accuracy": 0.8020475476980209, | |
| "num_input_tokens_seen": 5593453, | |
| "num_tokens": 5593453.0, | |
| "step": 980, | |
| "train_runtime": 5387.3852, | |
| "train_tokens_per_second": 1038.25 | |
| }, | |
| { | |
| "epoch": 0.788, | |
| "grad_norm": 83.0, | |
| "learning_rate": 2.2843037820157678e-06, | |
| "loss": 5.2522, | |
| "mean_token_accuracy": 0.7931860506534576, | |
| "num_input_tokens_seen": 5621006, | |
| "num_tokens": 5621006.0, | |
| "step": 985, | |
| "train_runtime": 5413.5032, | |
| "train_tokens_per_second": 1038.331 | |
| }, | |
| { | |
| "epoch": 0.792, | |
| "grad_norm": 121.5, | |
| "learning_rate": 2.2025073838557454e-06, | |
| "loss": 5.142, | |
| "mean_token_accuracy": 0.7944848969578743, | |
| "num_input_tokens_seen": 5649809, | |
| "num_tokens": 5649809.0, | |
| "step": 990, | |
| "train_runtime": 5441.3683, | |
| "train_tokens_per_second": 1038.307 | |
| }, | |
| { | |
| "epoch": 0.796, | |
| "grad_norm": 70.0, | |
| "learning_rate": 2.122020720617869e-06, | |
| "loss": 5.3737, | |
| "mean_token_accuracy": 0.7889134347438812, | |
| "num_input_tokens_seen": 5676379, | |
| "num_tokens": 5676379.0, | |
| "step": 995, | |
| "train_runtime": 5465.6045, | |
| "train_tokens_per_second": 1038.564 | |
| }, | |
| { | |
| "epoch": 0.8, | |
| "grad_norm": 63.25, | |
| "learning_rate": 2.0428573115446394e-06, | |
| "loss": 5.336, | |
| "mean_token_accuracy": 0.791542237997055, | |
| "num_input_tokens_seen": 5700207, | |
| "num_tokens": 5700207.0, | |
| "step": 1000, | |
| "train_runtime": 5489.0151, | |
| "train_tokens_per_second": 1038.475 | |
| }, | |
| { | |
| "epoch": 0.804, | |
| "grad_norm": 260.0, | |
| "learning_rate": 1.9650304536132426e-06, | |
| "loss": 5.263, | |
| "mean_token_accuracy": 0.7995662987232208, | |
| "num_input_tokens_seen": 5725964, | |
| "num_tokens": 5725964.0, | |
| "step": 1005, | |
| "train_runtime": 5514.6903, | |
| "train_tokens_per_second": 1038.311 | |
| }, | |
| { | |
| "epoch": 0.808, | |
| "grad_norm": 158.0, | |
| "learning_rate": 1.8885532193020706e-06, | |
| "loss": 5.3317, | |
| "mean_token_accuracy": 0.7876490637660026, | |
| "num_input_tokens_seen": 5754571, | |
| "num_tokens": 5754571.0, | |
| "step": 1010, | |
| "train_runtime": 5540.9562, | |
| "train_tokens_per_second": 1038.552 | |
| }, | |
| { | |
| "epoch": 0.812, | |
| "grad_norm": 45.25, | |
| "learning_rate": 1.813438454394948e-06, | |
| "loss": 5.2275, | |
| "mean_token_accuracy": 0.7981337189674378, | |
| "num_input_tokens_seen": 5783614, | |
| "num_tokens": 5783614.0, | |
| "step": 1015, | |
| "train_runtime": 5567.5443, | |
| "train_tokens_per_second": 1038.809 | |
| }, | |
| { | |
| "epoch": 0.816, | |
| "grad_norm": 36.25, | |
| "learning_rate": 1.7396987758234418e-06, | |
| "loss": 4.7787, | |
| "mean_token_accuracy": 0.8220131769776344, | |
| "num_input_tokens_seen": 5809495, | |
| "num_tokens": 5809495.0, | |
| "step": 1020, | |
| "train_runtime": 5592.6221, | |
| "train_tokens_per_second": 1038.778 | |
| }, | |
| { | |
| "epoch": 0.82, | |
| "grad_norm": 80.5, | |
| "learning_rate": 1.6673465695476233e-06, | |
| "loss": 4.8219, | |
| "mean_token_accuracy": 0.8117146581411362, | |
| "num_input_tokens_seen": 5836071, | |
| "num_tokens": 5836071.0, | |
| "step": 1025, | |
| "train_runtime": 5618.3237, | |
| "train_tokens_per_second": 1038.757 | |
| }, | |
| { | |
| "epoch": 0.824, | |
| "grad_norm": 49.5, | |
| "learning_rate": 1.5963939884756042e-06, | |
| "loss": 5.5483, | |
| "mean_token_accuracy": 0.7852835282683372, | |
| "num_input_tokens_seen": 5867317, | |
| "num_tokens": 5867317.0, | |
| "step": 1030, | |
| "train_runtime": 5647.6112, | |
| "train_tokens_per_second": 1038.902 | |
| }, | |
| { | |
| "epoch": 0.828, | |
| "grad_norm": 56.25, | |
| "learning_rate": 1.5268529504222262e-06, | |
| "loss": 5.2143, | |
| "mean_token_accuracy": 0.799188706278801, | |
| "num_input_tokens_seen": 5897945, | |
| "num_tokens": 5897945.0, | |
| "step": 1035, | |
| "train_runtime": 5677.9014, | |
| "train_tokens_per_second": 1038.754 | |
| }, | |
| { | |
| "epoch": 0.832, | |
| "grad_norm": 59.0, | |
| "learning_rate": 1.4587351361072455e-06, | |
| "loss": 5.4266, | |
| "mean_token_accuracy": 0.7901453331112862, | |
| "num_input_tokens_seen": 5923523, | |
| "num_tokens": 5923523.0, | |
| "step": 1040, | |
| "train_runtime": 5702.9066, | |
| "train_tokens_per_second": 1038.685 | |
| }, | |
| { | |
| "epoch": 0.836, | |
| "grad_norm": 144.0, | |
| "learning_rate": 1.3920519871933425e-06, | |
| "loss": 5.4357, | |
| "mean_token_accuracy": 0.7978359222412109, | |
| "num_input_tokens_seen": 5956415, | |
| "num_tokens": 5956415.0, | |
| "step": 1045, | |
| "train_runtime": 5733.0236, | |
| "train_tokens_per_second": 1038.966 | |
| }, | |
| { | |
| "epoch": 0.84, | |
| "grad_norm": 177.0, | |
| "learning_rate": 1.326814704364262e-06, | |
| "loss": 5.3211, | |
| "mean_token_accuracy": 0.7962174996733665, | |
| "num_input_tokens_seen": 5985449, | |
| "num_tokens": 5985449.0, | |
| "step": 1050, | |
| "train_runtime": 5761.1481, | |
| "train_tokens_per_second": 1038.933 | |
| }, | |
| { | |
| "epoch": 0.844, | |
| "grad_norm": 74.0, | |
| "learning_rate": 1.263034245443473e-06, | |
| "loss": 5.2974, | |
| "mean_token_accuracy": 0.7944592133164405, | |
| "num_input_tokens_seen": 6013339, | |
| "num_tokens": 6013339.0, | |
| "step": 1055, | |
| "train_runtime": 5788.1881, | |
| "train_tokens_per_second": 1038.898 | |
| }, | |
| { | |
| "epoch": 0.848, | |
| "grad_norm": 75.0, | |
| "learning_rate": 1.2007213235535785e-06, | |
| "loss": 5.3725, | |
| "mean_token_accuracy": 0.7937130227684974, | |
| "num_input_tokens_seen": 6043191, | |
| "num_tokens": 6043191.0, | |
| "step": 1060, | |
| "train_runtime": 5815.2046, | |
| "train_tokens_per_second": 1039.205 | |
| }, | |
| { | |
| "epoch": 0.852, | |
| "grad_norm": 60.0, | |
| "learning_rate": 1.1398864053168534e-06, | |
| "loss": 4.9906, | |
| "mean_token_accuracy": 0.8006555408239364, | |
| "num_input_tokens_seen": 6071081, | |
| "num_tokens": 6071081.0, | |
| "step": 1065, | |
| "train_runtime": 5841.7991, | |
| "train_tokens_per_second": 1039.249 | |
| }, | |
| { | |
| "epoch": 0.856, | |
| "grad_norm": 60.5, | |
| "learning_rate": 1.0805397090971738e-06, | |
| "loss": 4.9086, | |
| "mean_token_accuracy": 0.8100662231445312, | |
| "num_input_tokens_seen": 6097563, | |
| "num_tokens": 6097563.0, | |
| "step": 1070, | |
| "train_runtime": 5868.5095, | |
| "train_tokens_per_second": 1039.031 | |
| }, | |
| { | |
| "epoch": 0.86, | |
| "grad_norm": 57.25, | |
| "learning_rate": 1.022691203283661e-06, | |
| "loss": 5.4238, | |
| "mean_token_accuracy": 0.792490765452385, | |
| "num_input_tokens_seen": 6126398, | |
| "num_tokens": 6126398.0, | |
| "step": 1075, | |
| "train_runtime": 5895.323, | |
| "train_tokens_per_second": 1039.196 | |
| }, | |
| { | |
| "epoch": 0.864, | |
| "grad_norm": 38.5, | |
| "learning_rate": 9.663506046162986e-07, | |
| "loss": 5.1983, | |
| "mean_token_accuracy": 0.7941671445965767, | |
| "num_input_tokens_seen": 6154407, | |
| "num_tokens": 6154407.0, | |
| "step": 1080, | |
| "train_runtime": 5920.8046, | |
| "train_tokens_per_second": 1039.455 | |
| }, | |
| { | |
| "epoch": 0.868, | |
| "grad_norm": 536.0, | |
| "learning_rate": 9.115273765538202e-07, | |
| "loss": 5.5089, | |
| "mean_token_accuracy": 0.7930781245231628, | |
| "num_input_tokens_seen": 6182939, | |
| "num_tokens": 6182939.0, | |
| "step": 1085, | |
| "train_runtime": 5947.8298, | |
| "train_tokens_per_second": 1039.529 | |
| }, | |
| { | |
| "epoch": 0.872, | |
| "grad_norm": 47.25, | |
| "learning_rate": 8.582307276841461e-07, | |
| "loss": 5.3598, | |
| "mean_token_accuracy": 0.7864043831825256, | |
| "num_input_tokens_seen": 6212094, | |
| "num_tokens": 6212094.0, | |
| "step": 1090, | |
| "train_runtime": 5975.8227, | |
| "train_tokens_per_second": 1039.538 | |
| }, | |
| { | |
| "epoch": 0.876, | |
| "grad_norm": 41.5, | |
| "learning_rate": 8.06469610177636e-07, | |
| "loss": 5.3994, | |
| "mean_token_accuracy": 0.7908027723431588, | |
| "num_input_tokens_seen": 6238778, | |
| "num_tokens": 6238778.0, | |
| "step": 1095, | |
| "train_runtime": 6000.9384, | |
| "train_tokens_per_second": 1039.634 | |
| }, | |
| { | |
| "epoch": 0.88, | |
| "grad_norm": 312.0, | |
| "learning_rate": 7.562527182833978e-07, | |
| "loss": 5.3973, | |
| "mean_token_accuracy": 0.793465219438076, | |
| "num_input_tokens_seen": 6265105, | |
| "num_tokens": 6265105.0, | |
| "step": 1100, | |
| "train_runtime": 6025.9546, | |
| "train_tokens_per_second": 1039.687 | |
| }, | |
| { | |
| "epoch": 0.884, | |
| "grad_norm": 128.0, | |
| "learning_rate": 7.07588486868922e-07, | |
| "loss": 5.1888, | |
| "mean_token_accuracy": 0.8035556092858315, | |
| "num_input_tokens_seen": 6290027, | |
| "num_tokens": 6290027.0, | |
| "step": 1105, | |
| "train_runtime": 6051.3714, | |
| "train_tokens_per_second": 1039.438 | |
| }, | |
| { | |
| "epoch": 0.888, | |
| "grad_norm": 49.0, | |
| "learning_rate": 6.604850900032956e-07, | |
| "loss": 4.7405, | |
| "mean_token_accuracy": 0.8212268218398094, | |
| "num_input_tokens_seen": 6317712, | |
| "num_tokens": 6317712.0, | |
| "step": 1110, | |
| "train_runtime": 6077.8737, | |
| "train_tokens_per_second": 1039.461 | |
| }, | |
| { | |
| "epoch": 0.892, | |
| "grad_norm": 79.5, | |
| "learning_rate": 6.149504395842087e-07, | |
| "loss": 5.3393, | |
| "mean_token_accuracy": 0.7951319962739944, | |
| "num_input_tokens_seen": 6343602, | |
| "num_tokens": 6343602.0, | |
| "step": 1115, | |
| "train_runtime": 6105.1625, | |
| "train_tokens_per_second": 1039.055 | |
| }, | |
| { | |
| "epoch": 0.896, | |
| "grad_norm": 123.0, | |
| "learning_rate": 5.709921840090072e-07, | |
| "loss": 5.2021, | |
| "mean_token_accuracy": 0.7978611201047897, | |
| "num_input_tokens_seen": 6382499, | |
| "num_tokens": 6382499.0, | |
| "step": 1120, | |
| "train_runtime": 6145.3873, | |
| "train_tokens_per_second": 1038.584 | |
| }, | |
| { | |
| "epoch": 0.9, | |
| "grad_norm": 108.5, | |
| "learning_rate": 5.286177068899989e-07, | |
| "loss": 5.2466, | |
| "mean_token_accuracy": 0.7941580578684807, | |
| "num_input_tokens_seen": 6409279, | |
| "num_tokens": 6409279.0, | |
| "step": 1125, | |
| "train_runtime": 6172.0603, | |
| "train_tokens_per_second": 1038.434 | |
| }, | |
| { | |
| "epoch": 0.904, | |
| "grad_norm": 50.0, | |
| "learning_rate": 4.878341258142349e-07, | |
| "loss": 5.4412, | |
| "mean_token_accuracy": 0.7916087701916694, | |
| "num_input_tokens_seen": 6440759, | |
| "num_tokens": 6440759.0, | |
| "step": 1130, | |
| "train_runtime": 6198.541, | |
| "train_tokens_per_second": 1039.077 | |
| }, | |
| { | |
| "epoch": 0.908, | |
| "grad_norm": 130.0, | |
| "learning_rate": 4.4864829114798394e-07, | |
| "loss": 4.9766, | |
| "mean_token_accuracy": 0.8005422234535218, | |
| "num_input_tokens_seen": 6468905, | |
| "num_tokens": 6468905.0, | |
| "step": 1135, | |
| "train_runtime": 6225.8798, | |
| "train_tokens_per_second": 1039.035 | |
| }, | |
| { | |
| "epoch": 0.912, | |
| "grad_norm": 79.0, | |
| "learning_rate": 4.11066784886075e-07, | |
| "loss": 5.3727, | |
| "mean_token_accuracy": 0.7998930081725121, | |
| "num_input_tokens_seen": 6493165, | |
| "num_tokens": 6493165.0, | |
| "step": 1140, | |
| "train_runtime": 6249.8312, | |
| "train_tokens_per_second": 1038.934 | |
| }, | |
| { | |
| "epoch": 0.916, | |
| "grad_norm": 54.0, | |
| "learning_rate": 3.750959195463466e-07, | |
| "loss": 5.3525, | |
| "mean_token_accuracy": 0.7876800760626793, | |
| "num_input_tokens_seen": 6522770, | |
| "num_tokens": 6522770.0, | |
| "step": 1145, | |
| "train_runtime": 6278.1358, | |
| "train_tokens_per_second": 1038.966 | |
| }, | |
| { | |
| "epoch": 0.92, | |
| "grad_norm": 206.0, | |
| "learning_rate": 3.4074173710931804e-07, | |
| "loss": 5.7993, | |
| "mean_token_accuracy": 0.7791803061962128, | |
| "num_input_tokens_seen": 6549386, | |
| "num_tokens": 6549386.0, | |
| "step": 1150, | |
| "train_runtime": 6302.9235, | |
| "train_tokens_per_second": 1039.103 | |
| }, | |
| { | |
| "epoch": 0.924, | |
| "grad_norm": 99.0, | |
| "learning_rate": 3.080100080033388e-07, | |
| "loss": 5.0963, | |
| "mean_token_accuracy": 0.7987044736742973, | |
| "num_input_tokens_seen": 6578069, | |
| "num_tokens": 6578069.0, | |
| "step": 1155, | |
| "train_runtime": 6330.1523, | |
| "train_tokens_per_second": 1039.164 | |
| }, | |
| { | |
| "epoch": 0.928, | |
| "grad_norm": 83.0, | |
| "learning_rate": 2.769062301353398e-07, | |
| "loss": 5.5875, | |
| "mean_token_accuracy": 0.7888873621821404, | |
| "num_input_tokens_seen": 6603075, | |
| "num_tokens": 6603075.0, | |
| "step": 1160, | |
| "train_runtime": 6354.1556, | |
| "train_tokens_per_second": 1039.174 | |
| }, | |
| { | |
| "epoch": 0.932, | |
| "grad_norm": 102.0, | |
| "learning_rate": 2.474356279673462e-07, | |
| "loss": 5.6995, | |
| "mean_token_accuracy": 0.7825249642133713, | |
| "num_input_tokens_seen": 6635809, | |
| "num_tokens": 6635809.0, | |
| "step": 1165, | |
| "train_runtime": 6382.7967, | |
| "train_tokens_per_second": 1039.64 | |
| }, | |
| { | |
| "epoch": 0.936, | |
| "grad_norm": 56.5, | |
| "learning_rate": 2.1960315163894075e-07, | |
| "loss": 5.1911, | |
| "mean_token_accuracy": 0.7973916217684746, | |
| "num_input_tokens_seen": 6661327, | |
| "num_tokens": 6661327.0, | |
| "step": 1170, | |
| "train_runtime": 6406.6683, | |
| "train_tokens_per_second": 1039.749 | |
| }, | |
| { | |
| "epoch": 0.94, | |
| "grad_norm": 69.5, | |
| "learning_rate": 1.9341347613579086e-07, | |
| "loss": 5.1047, | |
| "mean_token_accuracy": 0.8013954371213913, | |
| "num_input_tokens_seen": 6690372, | |
| "num_tokens": 6690372.0, | |
| "step": 1175, | |
| "train_runtime": 6435.8803, | |
| "train_tokens_per_second": 1039.543 | |
| }, | |
| { | |
| "epoch": 0.944, | |
| "grad_norm": 73.5, | |
| "learning_rate": 1.6887100050439587e-07, | |
| "loss": 5.6123, | |
| "mean_token_accuracy": 0.7876136094331742, | |
| "num_input_tokens_seen": 6719639, | |
| "num_tokens": 6719639.0, | |
| "step": 1180, | |
| "train_runtime": 6464.1419, | |
| "train_tokens_per_second": 1039.525 | |
| }, | |
| { | |
| "epoch": 0.948, | |
| "grad_norm": 87.0, | |
| "learning_rate": 1.459798471131868e-07, | |
| "loss": 5.4049, | |
| "mean_token_accuracy": 0.7874793767929077, | |
| "num_input_tokens_seen": 6745687, | |
| "num_tokens": 6745687.0, | |
| "step": 1185, | |
| "train_runtime": 6489.0041, | |
| "train_tokens_per_second": 1039.557 | |
| }, | |
| { | |
| "epoch": 0.952, | |
| "grad_norm": 454.0, | |
| "learning_rate": 1.2474386096010037e-07, | |
| "loss": 5.0158, | |
| "mean_token_accuracy": 0.8049899056553841, | |
| "num_input_tokens_seen": 6774080, | |
| "num_tokens": 6774080.0, | |
| "step": 1190, | |
| "train_runtime": 6514.6768, | |
| "train_tokens_per_second": 1039.818 | |
| }, | |
| { | |
| "epoch": 0.956, | |
| "grad_norm": 74.5, | |
| "learning_rate": 1.0516660902673448e-07, | |
| "loss": 5.4421, | |
| "mean_token_accuracy": 0.7932335063815117, | |
| "num_input_tokens_seen": 6803981, | |
| "num_tokens": 6803981.0, | |
| "step": 1195, | |
| "train_runtime": 6542.1628, | |
| "train_tokens_per_second": 1040.02 | |
| }, | |
| { | |
| "epoch": 0.96, | |
| "grad_norm": 67.5, | |
| "learning_rate": 8.725137967920739e-08, | |
| "loss": 5.6285, | |
| "mean_token_accuracy": 0.7887401878833771, | |
| "num_input_tokens_seen": 6829351, | |
| "num_tokens": 6829351.0, | |
| "step": 1200, | |
| "train_runtime": 6567.4241, | |
| "train_tokens_per_second": 1039.883 | |
| }, | |
| { | |
| "epoch": 0.964, | |
| "grad_norm": 98.0, | |
| "learning_rate": 7.100118211581852e-08, | |
| "loss": 5.5201, | |
| "mean_token_accuracy": 0.7889957845211029, | |
| "num_input_tokens_seen": 6856748, | |
| "num_tokens": 6856748.0, | |
| "step": 1205, | |
| "train_runtime": 6594.1889, | |
| "train_tokens_per_second": 1039.817 | |
| }, | |
| { | |
| "epoch": 0.968, | |
| "grad_norm": 46.75, | |
| "learning_rate": 5.6418745861593905e-08, | |
| "loss": 5.0326, | |
| "mean_token_accuracy": 0.8010287463665009, | |
| "num_input_tokens_seen": 6886116, | |
| "num_tokens": 6886116.0, | |
| "step": 1210, | |
| "train_runtime": 6621.5933, | |
| "train_tokens_per_second": 1039.949 | |
| }, | |
| { | |
| "epoch": 0.972, | |
| "grad_norm": 60.25, | |
| "learning_rate": 4.350652030981395e-08, | |
| "loss": 5.7011, | |
| "mean_token_accuracy": 0.7844116255640984, | |
| "num_input_tokens_seen": 6912415, | |
| "num_tokens": 6912415.0, | |
| "step": 1215, | |
| "train_runtime": 6646.1851, | |
| "train_tokens_per_second": 1040.058 | |
| }, | |
| { | |
| "epoch": 0.976, | |
| "grad_norm": 50.75, | |
| "learning_rate": 3.2266674310589276e-08, | |
| "loss": 5.481, | |
| "mean_token_accuracy": 0.7872747302055358, | |
| "num_input_tokens_seen": 6937018, | |
| "num_tokens": 6937018.0, | |
| "step": 1220, | |
| "train_runtime": 6670.2576, | |
| "train_tokens_per_second": 1039.993 | |
| }, | |
| { | |
| "epoch": 0.98, | |
| "grad_norm": 81.0, | |
| "learning_rate": 2.2701095806565432e-08, | |
| "loss": 5.5155, | |
| "mean_token_accuracy": 0.7937034830451012, | |
| "num_input_tokens_seen": 6968725, | |
| "num_tokens": 6968725.0, | |
| "step": 1225, | |
| "train_runtime": 6700.0416, | |
| "train_tokens_per_second": 1040.102 | |
| }, | |
| { | |
| "epoch": 0.984, | |
| "grad_norm": 42.75, | |
| "learning_rate": 1.4811391515799911e-08, | |
| "loss": 5.0932, | |
| "mean_token_accuracy": 0.7983416199684144, | |
| "num_input_tokens_seen": 6996046, | |
| "num_tokens": 6996046.0, | |
| "step": 1230, | |
| "train_runtime": 6725.8984, | |
| "train_tokens_per_second": 1040.165 | |
| }, | |
| { | |
| "epoch": 0.988, | |
| "grad_norm": 47.0, | |
| "learning_rate": 8.59888666189579e-09, | |
| "loss": 5.6719, | |
| "mean_token_accuracy": 0.783245125412941, | |
| "num_input_tokens_seen": 7026420, | |
| "num_tokens": 7026420.0, | |
| "step": 1235, | |
| "train_runtime": 6754.2343, | |
| "train_tokens_per_second": 1040.299 | |
| }, | |
| { | |
| "epoch": 0.992, | |
| "grad_norm": 211.0, | |
| "learning_rate": 4.064624751394242e-09, | |
| "loss": 5.7314, | |
| "mean_token_accuracy": 0.7820679724216462, | |
| "num_input_tokens_seen": 7056594, | |
| "num_tokens": 7056594.0, | |
| "step": 1240, | |
| "train_runtime": 6782.773, | |
| "train_tokens_per_second": 1040.37 | |
| }, | |
| { | |
| "epoch": 0.996, | |
| "grad_norm": 114.0, | |
| "learning_rate": 1.209367398504746e-09, | |
| "loss": 5.0323, | |
| "mean_token_accuracy": 0.8049638271331787, | |
| "num_input_tokens_seen": 7081035, | |
| "num_tokens": 7081035.0, | |
| "step": 1245, | |
| "train_runtime": 6807.8837, | |
| "train_tokens_per_second": 1040.123 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "grad_norm": 57.0, | |
| "learning_rate": 3.3594197175190743e-11, | |
| "loss": 5.217, | |
| "mean_token_accuracy": 0.8012784749269486, | |
| "num_input_tokens_seen": 7107438, | |
| "num_tokens": 7107438.0, | |
| "step": 1250, | |
| "train_runtime": 6833.6244, | |
| "train_tokens_per_second": 1040.069 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "num_input_tokens_seen": 7107438, | |
| "step": 1250, | |
| "total_flos": 1.4684498749056614e+17, | |
| "train_loss": 5.760735061645508, | |
| "train_runtime": 6833.6666, | |
| "train_samples_per_second": 1.463, | |
| "train_steps_per_second": 0.183, | |
| "train_tokens_per_second": 1040.062 | |
| } | |
| ], | |
| "logging_steps": 5, | |
| "max_steps": 1250, | |
| "num_input_tokens_seen": 7107438, | |
| "num_train_epochs": 1, | |
| "save_steps": 0, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": false, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 1.4684498749056614e+17, | |
| "train_batch_size": 1, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |