{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 1250, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.004, "grad_norm": 716.0, "learning_rate": 2.105263157894737e-06, "loss": 21.0165, "mean_token_accuracy": 0.6174646154046058, "num_input_tokens_seen": 27461, "num_tokens": 27461.0, "step": 5, "train_runtime": 58.529, "train_tokens_per_second": 469.186 }, { "epoch": 0.008, "grad_norm": 420.0, "learning_rate": 4.736842105263158e-06, "loss": 17.7281, "mean_token_accuracy": 0.6165577113628388, "num_input_tokens_seen": 55051, "num_tokens": 55051.0, "step": 10, "train_runtime": 84.4052, "train_tokens_per_second": 652.223 }, { "epoch": 0.012, "grad_norm": 284.0, "learning_rate": 7.368421052631579e-06, "loss": 15.0203, "mean_token_accuracy": 0.6702655613422394, "num_input_tokens_seen": 83951, "num_tokens": 83951.0, "step": 15, "train_runtime": 111.8863, "train_tokens_per_second": 750.324 }, { "epoch": 0.016, "grad_norm": 224.0, "learning_rate": 1e-05, "loss": 11.6076, "mean_token_accuracy": 0.7083307519555092, "num_input_tokens_seen": 109657, "num_tokens": 109657.0, "step": 20, "train_runtime": 136.978, "train_tokens_per_second": 800.544 }, { "epoch": 0.02, "grad_norm": 270.0, "learning_rate": 1.263157894736842e-05, "loss": 9.6461, "mean_token_accuracy": 0.7309058234095573, "num_input_tokens_seen": 139061, "num_tokens": 139061.0, "step": 25, "train_runtime": 165.0733, "train_tokens_per_second": 842.419 }, { "epoch": 0.024, "grad_norm": 50.25, "learning_rate": 1.5263157894736846e-05, "loss": 8.7466, "mean_token_accuracy": 0.7324644327163696, "num_input_tokens_seen": 173335, "num_tokens": 173335.0, "step": 30, "train_runtime": 195.3505, "train_tokens_per_second": 887.303 }, { "epoch": 0.028, "grad_norm": 101.0, "learning_rate": 1.7894736842105264e-05, "loss": 8.494, "mean_token_accuracy": 0.7436975419521332, "num_input_tokens_seen": 201778, "num_tokens": 201778.0, "step": 35, "train_runtime": 223.7451, "train_tokens_per_second": 901.821 }, { "epoch": 0.032, "grad_norm": 316.0, "learning_rate": 1.9999966405802828e-05, "loss": 7.8429, "mean_token_accuracy": 0.733650079369545, "num_input_tokens_seen": 234594, "num_tokens": 234594.0, "step": 40, "train_runtime": 254.1938, "train_tokens_per_second": 922.894 }, { "epoch": 0.036, "grad_norm": 67.0, "learning_rate": 1.9998790632601496e-05, "loss": 7.2857, "mean_token_accuracy": 0.7769168972969055, "num_input_tokens_seen": 263946, "num_tokens": 263946.0, "step": 45, "train_runtime": 282.687, "train_tokens_per_second": 933.704 }, { "epoch": 0.04, "grad_norm": 56.25, "learning_rate": 1.9995935375248608e-05, "loss": 7.2708, "mean_token_accuracy": 0.7696839615702629, "num_input_tokens_seen": 292127, "num_tokens": 292127.0, "step": 50, "train_runtime": 309.685, "train_tokens_per_second": 943.304 }, { "epoch": 0.044, "grad_norm": 79.0, "learning_rate": 1.9991401113338103e-05, "loss": 7.5369, "mean_token_accuracy": 0.7532998159527778, "num_input_tokens_seen": 318452, "num_tokens": 318452.0, "step": 55, "train_runtime": 336.3519, "train_tokens_per_second": 946.782 }, { "epoch": 0.048, "grad_norm": 52.75, "learning_rate": 1.99851886084842e-05, "loss": 7.4304, "mean_token_accuracy": 0.7594221189618111, "num_input_tokens_seen": 344023, "num_tokens": 344023.0, "step": 60, "train_runtime": 361.6326, "train_tokens_per_second": 951.305 }, { "epoch": 0.052, "grad_norm": 53.5, "learning_rate": 1.9977298904193438e-05, "loss": 7.1512, "mean_token_accuracy": 0.7550393640995026, "num_input_tokens_seen": 372600, "num_tokens": 372600.0, "step": 65, "train_runtime": 389.9343, "train_tokens_per_second": 955.546 }, { "epoch": 0.056, "grad_norm": 52.75, "learning_rate": 1.9967733325689412e-05, "loss": 6.7062, "mean_token_accuracy": 0.7626852974295616, "num_input_tokens_seen": 399843, "num_tokens": 399843.0, "step": 70, "train_runtime": 415.5105, "train_tokens_per_second": 962.293 }, { "epoch": 0.06, "grad_norm": 51.75, "learning_rate": 1.995649347969019e-05, "loss": 6.7049, "mean_token_accuracy": 0.76889388859272, "num_input_tokens_seen": 426039, "num_tokens": 426039.0, "step": 75, "train_runtime": 440.456, "train_tokens_per_second": 967.268 }, { "epoch": 0.064, "grad_norm": 171.0, "learning_rate": 1.994358125413841e-05, "loss": 6.1025, "mean_token_accuracy": 0.7738867923617363, "num_input_tokens_seen": 454551, "num_tokens": 454551.0, "step": 80, "train_runtime": 468.9331, "train_tokens_per_second": 969.33 }, { "epoch": 0.068, "grad_norm": 207.0, "learning_rate": 1.9928998817884185e-05, "loss": 6.4376, "mean_token_accuracy": 0.772473418712616, "num_input_tokens_seen": 480692, "num_tokens": 480692.0, "step": 85, "train_runtime": 493.6013, "train_tokens_per_second": 973.847 }, { "epoch": 0.072, "grad_norm": 214.0, "learning_rate": 1.9912748620320796e-05, "loss": 6.6575, "mean_token_accuracy": 0.7715476334095002, "num_input_tokens_seen": 509235, "num_tokens": 509235.0, "step": 90, "train_runtime": 521.9092, "train_tokens_per_second": 975.716 }, { "epoch": 0.076, "grad_norm": 55.0, "learning_rate": 1.9894833390973266e-05, "loss": 6.8634, "mean_token_accuracy": 0.761625699698925, "num_input_tokens_seen": 541734, "num_tokens": 541734.0, "step": 95, "train_runtime": 551.2186, "train_tokens_per_second": 982.793 }, { "epoch": 0.08, "grad_norm": 79.0, "learning_rate": 1.98752561390399e-05, "loss": 6.1263, "mean_token_accuracy": 0.7742740377783776, "num_input_tokens_seen": 572168, "num_tokens": 572168.0, "step": 100, "train_runtime": 579.6467, "train_tokens_per_second": 987.098 }, { "epoch": 0.084, "grad_norm": 89.5, "learning_rate": 1.9854020152886816e-05, "loss": 6.3594, "mean_token_accuracy": 0.7709073334932327, "num_input_tokens_seen": 598451, "num_tokens": 598451.0, "step": 105, "train_runtime": 605.7065, "train_tokens_per_second": 988.021 }, { "epoch": 0.088, "grad_norm": 260.0, "learning_rate": 1.9831128999495605e-05, "loss": 6.1262, "mean_token_accuracy": 0.7783429339528084, "num_input_tokens_seen": 626393, "num_tokens": 626393.0, "step": 110, "train_runtime": 633.7147, "train_tokens_per_second": 988.446 }, { "epoch": 0.092, "grad_norm": 46.25, "learning_rate": 1.9806586523864212e-05, "loss": 6.1485, "mean_token_accuracy": 0.7764704540371895, "num_input_tokens_seen": 649813, "num_tokens": 649813.0, "step": 115, "train_runtime": 657.2228, "train_tokens_per_second": 988.726 }, { "epoch": 0.096, "grad_norm": 81.0, "learning_rate": 1.978039684836106e-05, "loss": 5.7248, "mean_token_accuracy": 0.7734859913587571, "num_input_tokens_seen": 676687, "num_tokens": 676687.0, "step": 120, "train_runtime": 683.5183, "train_tokens_per_second": 990.006 }, { "epoch": 0.1, "grad_norm": 44.0, "learning_rate": 1.9752564372032655e-05, "loss": 5.9166, "mean_token_accuracy": 0.7785823866724968, "num_input_tokens_seen": 708413, "num_tokens": 708413.0, "step": 125, "train_runtime": 714.4677, "train_tokens_per_second": 991.526 }, { "epoch": 0.104, "grad_norm": 624.0, "learning_rate": 1.9723093769864663e-05, "loss": 6.1637, "mean_token_accuracy": 0.7725436985492706, "num_input_tokens_seen": 733682, "num_tokens": 733682.0, "step": 130, "train_runtime": 740.5204, "train_tokens_per_second": 990.765 }, { "epoch": 0.108, "grad_norm": 63.75, "learning_rate": 1.9691989991996663e-05, "loss": 6.0692, "mean_token_accuracy": 0.7738721042871475, "num_input_tokens_seen": 763284, "num_tokens": 763284.0, "step": 135, "train_runtime": 769.434, "train_tokens_per_second": 992.007 }, { "epoch": 0.112, "grad_norm": 71.5, "learning_rate": 1.9659258262890683e-05, "loss": 5.9618, "mean_token_accuracy": 0.7771038174629211, "num_input_tokens_seen": 791236, "num_tokens": 791236.0, "step": 140, "train_runtime": 795.0855, "train_tokens_per_second": 995.158 }, { "epoch": 0.116, "grad_norm": 50.5, "learning_rate": 1.9624904080453656e-05, "loss": 6.2847, "mean_token_accuracy": 0.768642008304596, "num_input_tokens_seen": 818107, "num_tokens": 818107.0, "step": 145, "train_runtime": 821.0436, "train_tokens_per_second": 996.423 }, { "epoch": 0.12, "grad_norm": 113.5, "learning_rate": 1.9588933215113926e-05, "loss": 6.0329, "mean_token_accuracy": 0.7712928548455238, "num_input_tokens_seen": 846017, "num_tokens": 846017.0, "step": 150, "train_runtime": 849.152, "train_tokens_per_second": 996.308 }, { "epoch": 0.124, "grad_norm": 564.0, "learning_rate": 1.955135170885202e-05, "loss": 5.7569, "mean_token_accuracy": 0.7793401271104813, "num_input_tokens_seen": 879137, "num_tokens": 879137.0, "step": 155, "train_runtime": 879.9983, "train_tokens_per_second": 999.021 }, { "epoch": 0.128, "grad_norm": 59.25, "learning_rate": 1.9512165874185768e-05, "loss": 5.9181, "mean_token_accuracy": 0.7819835215806961, "num_input_tokens_seen": 905824, "num_tokens": 905824.0, "step": 160, "train_runtime": 905.886, "train_tokens_per_second": 999.932 }, { "epoch": 0.132, "grad_norm": 85.5, "learning_rate": 1.9471382293110004e-05, "loss": 5.6098, "mean_token_accuracy": 0.7914141818881035, "num_input_tokens_seen": 932292, "num_tokens": 932292.0, "step": 165, "train_runtime": 931.5649, "train_tokens_per_second": 1000.781 }, { "epoch": 0.136, "grad_norm": 58.0, "learning_rate": 1.9429007815990995e-05, "loss": 6.4777, "mean_token_accuracy": 0.763878983259201, "num_input_tokens_seen": 965956, "num_tokens": 965956.0, "step": 170, "train_runtime": 962.6472, "train_tokens_per_second": 1003.437 }, { "epoch": 0.14, "grad_norm": 130.0, "learning_rate": 1.9385049560415794e-05, "loss": 6.0058, "mean_token_accuracy": 0.7746358260512352, "num_input_tokens_seen": 992908, "num_tokens": 992908.0, "step": 175, "train_runtime": 988.7648, "train_tokens_per_second": 1004.19 }, { "epoch": 0.144, "grad_norm": 620.0, "learning_rate": 1.9339514909996706e-05, "loss": 6.0002, "mean_token_accuracy": 0.7761899515986442, "num_input_tokens_seen": 1027079, "num_tokens": 1027079.0, "step": 180, "train_runtime": 1019.9491, "train_tokens_per_second": 1006.99 }, { "epoch": 0.148, "grad_norm": 83.5, "learning_rate": 1.929241151313108e-05, "loss": 5.872, "mean_token_accuracy": 0.7852786988019943, "num_input_tokens_seen": 1056613, "num_tokens": 1056613.0, "step": 185, "train_runtime": 1048.5847, "train_tokens_per_second": 1007.656 }, { "epoch": 0.152, "grad_norm": 840.0, "learning_rate": 1.9243747281716604e-05, "loss": 5.682, "mean_token_accuracy": 0.7868907496333122, "num_input_tokens_seen": 1085315, "num_tokens": 1085315.0, "step": 190, "train_runtime": 1074.4166, "train_tokens_per_second": 1010.144 }, { "epoch": 0.156, "grad_norm": 81.5, "learning_rate": 1.9193530389822364e-05, "loss": 5.5209, "mean_token_accuracy": 0.7940514251589775, "num_input_tokens_seen": 1115115, "num_tokens": 1115115.0, "step": 195, "train_runtime": 1103.2538, "train_tokens_per_second": 1010.751 }, { "epoch": 0.16, "grad_norm": 100.5, "learning_rate": 1.9141769272315857e-05, "loss": 5.5272, "mean_token_accuracy": 0.7890612185001373, "num_input_tokens_seen": 1139079, "num_tokens": 1139079.0, "step": 200, "train_runtime": 1127.1395, "train_tokens_per_second": 1010.593 }, { "epoch": 0.164, "grad_norm": 62.25, "learning_rate": 1.9088472623446182e-05, "loss": 5.7363, "mean_token_accuracy": 0.7819481372833252, "num_input_tokens_seen": 1171419, "num_tokens": 1171419.0, "step": 205, "train_runtime": 1158.0624, "train_tokens_per_second": 1011.534 }, { "epoch": 0.168, "grad_norm": 169.0, "learning_rate": 1.90336493953837e-05, "loss": 5.914, "mean_token_accuracy": 0.7838666513562202, "num_input_tokens_seen": 1199784, "num_tokens": 1199784.0, "step": 210, "train_runtime": 1184.5189, "train_tokens_per_second": 1012.887 }, { "epoch": 0.172, "grad_norm": 78.0, "learning_rate": 1.897730879671634e-05, "loss": 5.3131, "mean_token_accuracy": 0.7945496052503586, "num_input_tokens_seen": 1224308, "num_tokens": 1224308.0, "step": 215, "train_runtime": 1209.327, "train_tokens_per_second": 1012.388 }, { "epoch": 0.176, "grad_norm": 1656.0, "learning_rate": 1.891946029090283e-05, "loss": 5.6709, "mean_token_accuracy": 0.7843748390674591, "num_input_tokens_seen": 1252051, "num_tokens": 1252051.0, "step": 220, "train_runtime": 1236.4054, "train_tokens_per_second": 1012.654 }, { "epoch": 0.18, "grad_norm": 95.5, "learning_rate": 1.8860113594683148e-05, "loss": 5.9523, "mean_token_accuracy": 0.7736531987786293, "num_input_tokens_seen": 1278020, "num_tokens": 1278020.0, "step": 225, "train_runtime": 1262.0193, "train_tokens_per_second": 1012.679 }, { "epoch": 0.184, "grad_norm": 75.0, "learning_rate": 1.8799278676446425e-05, "loss": 5.5729, "mean_token_accuracy": 0.7859392121434212, "num_input_tokens_seen": 1307055, "num_tokens": 1307055.0, "step": 230, "train_runtime": 1289.4069, "train_tokens_per_second": 1013.687 }, { "epoch": 0.188, "grad_norm": 54.75, "learning_rate": 1.8736965754556527e-05, "loss": 5.6597, "mean_token_accuracy": 0.7832028537988662, "num_input_tokens_seen": 1332482, "num_tokens": 1332482.0, "step": 235, "train_runtime": 1314.3047, "train_tokens_per_second": 1013.83 }, { "epoch": 0.192, "grad_norm": 60.75, "learning_rate": 1.867318529563574e-05, "loss": 6.0631, "mean_token_accuracy": 0.7801717698574067, "num_input_tokens_seen": 1359469, "num_tokens": 1359469.0, "step": 240, "train_runtime": 1341.0816, "train_tokens_per_second": 1013.711 }, { "epoch": 0.196, "grad_norm": 72.0, "learning_rate": 1.8607948012806664e-05, "loss": 5.7384, "mean_token_accuracy": 0.7812554731965065, "num_input_tokens_seen": 1390888, "num_tokens": 1390888.0, "step": 245, "train_runtime": 1371.2979, "train_tokens_per_second": 1014.286 }, { "epoch": 0.2, "grad_norm": 96.0, "learning_rate": 1.8541264863892755e-05, "loss": 5.871, "mean_token_accuracy": 0.7782064586877823, "num_input_tokens_seen": 1414969, "num_tokens": 1414969.0, "step": 250, "train_runtime": 1395.692, "train_tokens_per_second": 1013.812 }, { "epoch": 0.204, "grad_norm": 43.25, "learning_rate": 1.8473147049577777e-05, "loss": 5.9369, "mean_token_accuracy": 0.7823293015360833, "num_input_tokens_seen": 1441261, "num_tokens": 1441261.0, "step": 255, "train_runtime": 1420.1833, "train_tokens_per_second": 1014.842 }, { "epoch": 0.208, "grad_norm": 1168.0, "learning_rate": 1.84036060115244e-05, "loss": 5.7154, "mean_token_accuracy": 0.7839734643697739, "num_input_tokens_seen": 1470281, "num_tokens": 1470281.0, "step": 260, "train_runtime": 1447.4831, "train_tokens_per_second": 1015.75 }, { "epoch": 0.212, "grad_norm": 65.0, "learning_rate": 1.8332653430452375e-05, "loss": 5.4737, "mean_token_accuracy": 0.7898797050118447, "num_input_tokens_seen": 1502818, "num_tokens": 1502818.0, "step": 265, "train_runtime": 1477.0712, "train_tokens_per_second": 1017.431 }, { "epoch": 0.216, "grad_norm": 175.0, "learning_rate": 1.826030122417656e-05, "loss": 5.5671, "mean_token_accuracy": 0.780795156955719, "num_input_tokens_seen": 1531913, "num_tokens": 1531913.0, "step": 270, "train_runtime": 1503.8595, "train_tokens_per_second": 1018.654 }, { "epoch": 0.22, "grad_norm": 62.75, "learning_rate": 1.8186561545605055e-05, "loss": 5.9096, "mean_token_accuracy": 0.7873492911458015, "num_input_tokens_seen": 1563135, "num_tokens": 1563135.0, "step": 275, "train_runtime": 1532.3365, "train_tokens_per_second": 1020.099 }, { "epoch": 0.224, "grad_norm": 200.0, "learning_rate": 1.811144678069793e-05, "loss": 5.2624, "mean_token_accuracy": 0.7928400427103043, "num_input_tokens_seen": 1591144, "num_tokens": 1591144.0, "step": 280, "train_runtime": 1560.0406, "train_tokens_per_second": 1019.938 }, { "epoch": 0.228, "grad_norm": 38.5, "learning_rate": 1.803496954638676e-05, "loss": 5.4277, "mean_token_accuracy": 0.7846548587083817, "num_input_tokens_seen": 1621253, "num_tokens": 1621253.0, "step": 285, "train_runtime": 1590.4126, "train_tokens_per_second": 1019.391 }, { "epoch": 0.232, "grad_norm": 67.0, "learning_rate": 1.7957142688455362e-05, "loss": 5.6897, "mean_token_accuracy": 0.7835722789168358, "num_input_tokens_seen": 1646466, "num_tokens": 1646466.0, "step": 290, "train_runtime": 1615.728, "train_tokens_per_second": 1019.024 }, { "epoch": 0.236, "grad_norm": 121.5, "learning_rate": 1.7877979279382135e-05, "loss": 5.8008, "mean_token_accuracy": 0.777139276266098, "num_input_tokens_seen": 1674583, "num_tokens": 1674583.0, "step": 295, "train_runtime": 1641.5179, "train_tokens_per_second": 1020.143 }, { "epoch": 0.24, "grad_norm": 43.5, "learning_rate": 1.7797492616144256e-05, "loss": 5.794, "mean_token_accuracy": 0.7790872991085053, "num_input_tokens_seen": 1702490, "num_tokens": 1702490.0, "step": 300, "train_runtime": 1667.5303, "train_tokens_per_second": 1020.965 }, { "epoch": 0.244, "grad_norm": 84.0, "learning_rate": 1.7715696217984233e-05, "loss": 5.7744, "mean_token_accuracy": 0.7830787718296051, "num_input_tokens_seen": 1732134, "num_tokens": 1732134.0, "step": 305, "train_runtime": 1695.751, "train_tokens_per_second": 1021.455 }, { "epoch": 0.248, "grad_norm": 97.5, "learning_rate": 1.7632603824139086e-05, "loss": 5.7534, "mean_token_accuracy": 0.7765944376587868, "num_input_tokens_seen": 1760431, "num_tokens": 1760431.0, "step": 310, "train_runtime": 1721.7874, "train_tokens_per_second": 1022.444 }, { "epoch": 0.252, "grad_norm": 214.0, "learning_rate": 1.7548229391532572e-05, "loss": 5.5022, "mean_token_accuracy": 0.7858106374740601, "num_input_tokens_seen": 1786890, "num_tokens": 1786890.0, "step": 315, "train_runtime": 1746.8899, "train_tokens_per_second": 1022.898 }, { "epoch": 0.256, "grad_norm": 128.0, "learning_rate": 1.7462587092430877e-05, "loss": 5.6599, "mean_token_accuracy": 0.7865968465805053, "num_input_tokens_seen": 1812677, "num_tokens": 1812677.0, "step": 320, "train_runtime": 1774.44, "train_tokens_per_second": 1021.549 }, { "epoch": 0.26, "grad_norm": 75.0, "learning_rate": 1.7375691312062102e-05, "loss": 5.5823, "mean_token_accuracy": 0.798132348060608, "num_input_tokens_seen": 1844191, "num_tokens": 1844191.0, "step": 325, "train_runtime": 1803.6261, "train_tokens_per_second": 1022.491 }, { "epoch": 0.264, "grad_norm": 68.0, "learning_rate": 1.728755664620002e-05, "loss": 5.4933, "mean_token_accuracy": 0.7956582695245743, "num_input_tokens_seen": 1873707, "num_tokens": 1873707.0, "step": 330, "train_runtime": 1831.8481, "train_tokens_per_second": 1022.851 }, { "epoch": 0.268, "grad_norm": 174.0, "learning_rate": 1.7198197898712402e-05, "loss": 5.6207, "mean_token_accuracy": 0.7940072804689408, "num_input_tokens_seen": 1897405, "num_tokens": 1897405.0, "step": 335, "train_runtime": 1855.6387, "train_tokens_per_second": 1022.508 }, { "epoch": 0.272, "grad_norm": 87.0, "learning_rate": 1.7107630079074477e-05, "loss": 5.3281, "mean_token_accuracy": 0.7956176668405532, "num_input_tokens_seen": 1924306, "num_tokens": 1924306.0, "step": 340, "train_runtime": 1882.2812, "train_tokens_per_second": 1022.327 }, { "epoch": 0.276, "grad_norm": 58.0, "learning_rate": 1.7015868399847768e-05, "loss": 5.3952, "mean_token_accuracy": 0.795125538110733, "num_input_tokens_seen": 1957493, "num_tokens": 1957493.0, "step": 345, "train_runtime": 1912.1661, "train_tokens_per_second": 1023.704 }, { "epoch": 0.28, "grad_norm": 69.5, "learning_rate": 1.6922928274124887e-05, "loss": 5.5721, "mean_token_accuracy": 0.786569619178772, "num_input_tokens_seen": 1989461, "num_tokens": 1989461.0, "step": 350, "train_runtime": 1941.3276, "train_tokens_per_second": 1024.794 }, { "epoch": 0.284, "grad_norm": 52.75, "learning_rate": 1.6828825312940594e-05, "loss": 5.6287, "mean_token_accuracy": 0.7822471752762794, "num_input_tokens_seen": 2018840, "num_tokens": 2018840.0, "step": 355, "train_runtime": 1968.6202, "train_tokens_per_second": 1025.51 }, { "epoch": 0.288, "grad_norm": 78.0, "learning_rate": 1.673357532264966e-05, "loss": 5.6751, "mean_token_accuracy": 0.7819222688674927, "num_input_tokens_seen": 2046554, "num_tokens": 2046554.0, "step": 360, "train_runtime": 1994.9582, "train_tokens_per_second": 1025.863 }, { "epoch": 0.292, "grad_norm": 58.25, "learning_rate": 1.663719430227186e-05, "loss": 5.3833, "mean_token_accuracy": 0.7945975109934806, "num_input_tokens_seen": 2074389, "num_tokens": 2074389.0, "step": 365, "train_runtime": 2023.2385, "train_tokens_per_second": 1025.282 }, { "epoch": 0.296, "grad_norm": 63.25, "learning_rate": 1.653969844080466e-05, "loss": 5.3062, "mean_token_accuracy": 0.7965321630239487, "num_input_tokens_seen": 2102788, "num_tokens": 2102788.0, "step": 370, "train_runtime": 2051.2147, "train_tokens_per_second": 1025.143 }, { "epoch": 0.3, "grad_norm": 169.0, "learning_rate": 1.644110411450398e-05, "loss": 5.6184, "mean_token_accuracy": 0.7859084010124207, "num_input_tokens_seen": 2127585, "num_tokens": 2127585.0, "step": 375, "train_runtime": 2076.5681, "train_tokens_per_second": 1024.568 }, { "epoch": 0.304, "grad_norm": 67.0, "learning_rate": 1.634142788413346e-05, "loss": 5.7921, "mean_token_accuracy": 0.7838741362094879, "num_input_tokens_seen": 2153488, "num_tokens": 2153488.0, "step": 380, "train_runtime": 2101.2637, "train_tokens_per_second": 1024.854 }, { "epoch": 0.308, "grad_norm": 139.0, "learning_rate": 1.6240686492182806e-05, "loss": 5.7157, "mean_token_accuracy": 0.7820939481258392, "num_input_tokens_seen": 2180494, "num_tokens": 2180494.0, "step": 385, "train_runtime": 2127.9074, "train_tokens_per_second": 1024.713 }, { "epoch": 0.312, "grad_norm": 45.25, "learning_rate": 1.6138896860055555e-05, "loss": 5.3245, "mean_token_accuracy": 0.7927197381854058, "num_input_tokens_seen": 2209057, "num_tokens": 2209057.0, "step": 390, "train_runtime": 2153.5427, "train_tokens_per_second": 1025.778 }, { "epoch": 0.316, "grad_norm": 131.0, "learning_rate": 1.6036076085226813e-05, "loss": 5.2268, "mean_token_accuracy": 0.7993880152702332, "num_input_tokens_seen": 2238624, "num_tokens": 2238624.0, "step": 395, "train_runtime": 2181.8136, "train_tokens_per_second": 1026.038 }, { "epoch": 0.32, "grad_norm": 94.5, "learning_rate": 1.593224143837142e-05, "loss": 5.6083, "mean_token_accuracy": 0.7860068812966347, "num_input_tokens_seen": 2266912, "num_tokens": 2266912.0, "step": 400, "train_runtime": 2208.6845, "train_tokens_per_second": 1026.363 }, { "epoch": 0.324, "grad_norm": 67.5, "learning_rate": 1.582741036046301e-05, "loss": 5.6723, "mean_token_accuracy": 0.7925399646162987, "num_input_tokens_seen": 2298813, "num_tokens": 2298813.0, "step": 405, "train_runtime": 2237.0379, "train_tokens_per_second": 1027.615 }, { "epoch": 0.328, "grad_norm": 39.0, "learning_rate": 1.572160045984447e-05, "loss": 5.3201, "mean_token_accuracy": 0.8001187354326248, "num_input_tokens_seen": 2331446, "num_tokens": 2331446.0, "step": 410, "train_runtime": 2266.6842, "train_tokens_per_second": 1028.571 }, { "epoch": 0.332, "grad_norm": 39.0, "learning_rate": 1.561482950927029e-05, "loss": 5.587, "mean_token_accuracy": 0.7832968756556511, "num_input_tokens_seen": 2363201, "num_tokens": 2363201.0, "step": 415, "train_runtime": 2296.0603, "train_tokens_per_second": 1029.242 }, { "epoch": 0.336, "grad_norm": 142.0, "learning_rate": 1.550711544292131e-05, "loss": 5.6473, "mean_token_accuracy": 0.7904362455010414, "num_input_tokens_seen": 2389216, "num_tokens": 2389216.0, "step": 420, "train_runtime": 2320.7153, "train_tokens_per_second": 1029.517 }, { "epoch": 0.34, "grad_norm": 47.5, "learning_rate": 1.5398476353392323e-05, "loss": 5.4438, "mean_token_accuracy": 0.7908162623643875, "num_input_tokens_seen": 2419743, "num_tokens": 2419743.0, "step": 425, "train_runtime": 2350.136, "train_tokens_per_second": 1029.618 }, { "epoch": 0.344, "grad_norm": 49.25, "learning_rate": 1.5288930488653094e-05, "loss": 5.2794, "mean_token_accuracy": 0.7933614462614059, "num_input_tokens_seen": 2447011, "num_tokens": 2447011.0, "step": 430, "train_runtime": 2375.2075, "train_tokens_per_second": 1030.23 }, { "epoch": 0.348, "grad_norm": 83.5, "learning_rate": 1.5178496248983254e-05, "loss": 6.0266, "mean_token_accuracy": 0.7747324109077454, "num_input_tokens_seen": 2475216, "num_tokens": 2475216.0, "step": 435, "train_runtime": 2401.9371, "train_tokens_per_second": 1030.508 }, { "epoch": 0.352, "grad_norm": 89.0, "learning_rate": 1.5067192183881658e-05, "loss": 5.4756, "mean_token_accuracy": 0.7997275143861771, "num_input_tokens_seen": 2502081, "num_tokens": 2502081.0, "step": 440, "train_runtime": 2428.7281, "train_tokens_per_second": 1030.202 }, { "epoch": 0.356, "grad_norm": 52.0, "learning_rate": 1.4955036988950617e-05, "loss": 6.1068, "mean_token_accuracy": 0.7708079561591148, "num_input_tokens_seen": 2529391, "num_tokens": 2529391.0, "step": 445, "train_runtime": 2455.553, "train_tokens_per_second": 1030.07 }, { "epoch": 0.36, "grad_norm": 127.5, "learning_rate": 1.484204950275565e-05, "loss": 5.5882, "mean_token_accuracy": 0.7801107332110405, "num_input_tokens_seen": 2554232, "num_tokens": 2554232.0, "step": 450, "train_runtime": 2479.6181, "train_tokens_per_second": 1030.091 }, { "epoch": 0.364, "grad_norm": 49.5, "learning_rate": 1.4728248703661183e-05, "loss": 5.4756, "mean_token_accuracy": 0.7862519830465317, "num_input_tokens_seen": 2582182, "num_tokens": 2582182.0, "step": 455, "train_runtime": 2505.889, "train_tokens_per_second": 1030.445 }, { "epoch": 0.368, "grad_norm": 66.5, "learning_rate": 1.461365370664276e-05, "loss": 5.3923, "mean_token_accuracy": 0.7940330818295479, "num_input_tokens_seen": 2617794, "num_tokens": 2617794.0, "step": 460, "train_runtime": 2539.941, "train_tokens_per_second": 1030.652 }, { "epoch": 0.372, "grad_norm": 104.5, "learning_rate": 1.4498283760076362e-05, "loss": 5.5707, "mean_token_accuracy": 0.7927709832787514, "num_input_tokens_seen": 2643272, "num_tokens": 2643272.0, "step": 465, "train_runtime": 2564.5252, "train_tokens_per_second": 1030.706 }, { "epoch": 0.376, "grad_norm": 59.25, "learning_rate": 1.4382158242505236e-05, "loss": 5.644, "mean_token_accuracy": 0.7871902465820313, "num_input_tokens_seen": 2671869, "num_tokens": 2671869.0, "step": 470, "train_runtime": 2592.8854, "train_tokens_per_second": 1030.462 }, { "epoch": 0.38, "grad_norm": 31.625, "learning_rate": 1.4265296659384956e-05, "loss": 5.6562, "mean_token_accuracy": 0.7877990290522575, "num_input_tokens_seen": 2702041, "num_tokens": 2702041.0, "step": 475, "train_runtime": 2621.0737, "train_tokens_per_second": 1030.891 }, { "epoch": 0.384, "grad_norm": 86.5, "learning_rate": 1.4147718639807071e-05, "loss": 5.621, "mean_token_accuracy": 0.7925810098648072, "num_input_tokens_seen": 2732153, "num_tokens": 2732153.0, "step": 480, "train_runtime": 2649.9389, "train_tokens_per_second": 1031.025 }, { "epoch": 0.388, "grad_norm": 74.5, "learning_rate": 1.4029443933202059e-05, "loss": 5.4204, "mean_token_accuracy": 0.7914870575070381, "num_input_tokens_seen": 2758024, "num_tokens": 2758024.0, "step": 485, "train_runtime": 2676.6477, "train_tokens_per_second": 1030.402 }, { "epoch": 0.392, "grad_norm": 70.5, "learning_rate": 1.3910492406022033e-05, "loss": 5.7675, "mean_token_accuracy": 0.7754184618592262, "num_input_tokens_seen": 2786096, "num_tokens": 2786096.0, "step": 490, "train_runtime": 2703.9298, "train_tokens_per_second": 1030.388 }, { "epoch": 0.396, "grad_norm": 57.0, "learning_rate": 1.3790884038403796e-05, "loss": 5.5642, "mean_token_accuracy": 0.7880642995238304, "num_input_tokens_seen": 2814487, "num_tokens": 2814487.0, "step": 495, "train_runtime": 2731.3935, "train_tokens_per_second": 1030.422 }, { "epoch": 0.4, "grad_norm": 248.0, "learning_rate": 1.36706389208128e-05, "loss": 5.5255, "mean_token_accuracy": 0.7882839411497116, "num_input_tokens_seen": 2847405, "num_tokens": 2847405.0, "step": 500, "train_runtime": 2762.1229, "train_tokens_per_second": 1030.876 }, { "epoch": 0.404, "grad_norm": 98.0, "learning_rate": 1.354977725066859e-05, "loss": 5.3838, "mean_token_accuracy": 0.7869982674717904, "num_input_tokens_seen": 2876359, "num_tokens": 2876359.0, "step": 505, "train_runtime": 2787.1872, "train_tokens_per_second": 1031.993 }, { "epoch": 0.408, "grad_norm": 352.0, "learning_rate": 1.3428319328952254e-05, "loss": 5.4099, "mean_token_accuracy": 0.7843140512704849, "num_input_tokens_seen": 2902835, "num_tokens": 2902835.0, "step": 510, "train_runtime": 2811.8431, "train_tokens_per_second": 1032.36 }, { "epoch": 0.412, "grad_norm": 112.0, "learning_rate": 1.3306285556796494e-05, "loss": 5.3675, "mean_token_accuracy": 0.7961455345153808, "num_input_tokens_seen": 2932928, "num_tokens": 2932928.0, "step": 515, "train_runtime": 2839.8383, "train_tokens_per_second": 1032.78 }, { "epoch": 0.416, "grad_norm": 163.0, "learning_rate": 1.3183696432058889e-05, "loss": 5.2575, "mean_token_accuracy": 0.7971223339438438, "num_input_tokens_seen": 2957517, "num_tokens": 2957517.0, "step": 520, "train_runtime": 2865.8475, "train_tokens_per_second": 1031.987 }, { "epoch": 0.42, "grad_norm": 56.0, "learning_rate": 1.3060572545878875e-05, "loss": 5.4625, "mean_token_accuracy": 0.7884187951683999, "num_input_tokens_seen": 2987924, "num_tokens": 2987924.0, "step": 525, "train_runtime": 2893.0838, "train_tokens_per_second": 1032.782 }, { "epoch": 0.424, "grad_norm": 84.5, "learning_rate": 1.2936934579219094e-05, "loss": 4.9978, "mean_token_accuracy": 0.8107392936944962, "num_input_tokens_seen": 3014169, "num_tokens": 3014169.0, "step": 530, "train_runtime": 2919.0783, "train_tokens_per_second": 1032.576 }, { "epoch": 0.428, "grad_norm": 66.5, "learning_rate": 1.2812803299391629e-05, "loss": 5.7573, "mean_token_accuracy": 0.789200983941555, "num_input_tokens_seen": 3046708, "num_tokens": 3046708.0, "step": 535, "train_runtime": 2948.6574, "train_tokens_per_second": 1033.253 }, { "epoch": 0.432, "grad_norm": 70.5, "learning_rate": 1.2688199556569753e-05, "loss": 5.4901, "mean_token_accuracy": 0.7852542266249657, "num_input_tokens_seen": 3074318, "num_tokens": 3074318.0, "step": 540, "train_runtime": 2975.382, "train_tokens_per_second": 1033.252 }, { "epoch": 0.436, "grad_norm": 146.0, "learning_rate": 1.2563144280285742e-05, "loss": 5.1747, "mean_token_accuracy": 0.8006948977708817, "num_input_tokens_seen": 3102044, "num_tokens": 3102044.0, "step": 545, "train_runtime": 3001.1559, "train_tokens_per_second": 1033.616 }, { "epoch": 0.44, "grad_norm": 63.5, "learning_rate": 1.2437658475915378e-05, "loss": 5.5294, "mean_token_accuracy": 0.7853314474225044, "num_input_tokens_seen": 3129406, "num_tokens": 3129406.0, "step": 550, "train_runtime": 3027.9577, "train_tokens_per_second": 1033.504 }, { "epoch": 0.444, "grad_norm": 72.5, "learning_rate": 1.23117632211497e-05, "loss": 5.1169, "mean_token_accuracy": 0.8012081518769264, "num_input_tokens_seen": 3159496, "num_tokens": 3159496.0, "step": 555, "train_runtime": 3057.971, "train_tokens_per_second": 1033.2 }, { "epoch": 0.448, "grad_norm": 102.5, "learning_rate": 1.2185479662454596e-05, "loss": 5.5137, "mean_token_accuracy": 0.7913802459836006, "num_input_tokens_seen": 3185619, "num_tokens": 3185619.0, "step": 560, "train_runtime": 3083.8761, "train_tokens_per_second": 1032.992 }, { "epoch": 0.452, "grad_norm": 132.0, "learning_rate": 1.2058829011518896e-05, "loss": 5.2765, "mean_token_accuracy": 0.7947102382779121, "num_input_tokens_seen": 3212881, "num_tokens": 3212881.0, "step": 565, "train_runtime": 3109.5104, "train_tokens_per_second": 1033.243 }, { "epoch": 0.456, "grad_norm": 167.0, "learning_rate": 1.193183254169142e-05, "loss": 5.3215, "mean_token_accuracy": 0.7931090787053108, "num_input_tokens_seen": 3240673, "num_tokens": 3240673.0, "step": 570, "train_runtime": 3136.4109, "train_tokens_per_second": 1033.242 }, { "epoch": 0.46, "grad_norm": 40.5, "learning_rate": 1.1804511584407763e-05, "loss": 5.9359, "mean_token_accuracy": 0.7751367390155792, "num_input_tokens_seen": 3272505, "num_tokens": 3272505.0, "step": 575, "train_runtime": 3164.4979, "train_tokens_per_second": 1034.131 }, { "epoch": 0.464, "grad_norm": 108.0, "learning_rate": 1.1676887525607272e-05, "loss": 5.4725, "mean_token_accuracy": 0.7907213315367698, "num_input_tokens_seen": 3302432, "num_tokens": 3302432.0, "step": 580, "train_runtime": 3193.0515, "train_tokens_per_second": 1034.256 }, { "epoch": 0.468, "grad_norm": 86.0, "learning_rate": 1.1548981802140849e-05, "loss": 5.2601, "mean_token_accuracy": 0.7978497371077538, "num_input_tokens_seen": 3328723, "num_tokens": 3328723.0, "step": 585, "train_runtime": 3219.6782, "train_tokens_per_second": 1033.868 }, { "epoch": 0.472, "grad_norm": 90.5, "learning_rate": 1.142081589817027e-05, "loss": 4.9631, "mean_token_accuracy": 0.8012796014547348, "num_input_tokens_seen": 3356380, "num_tokens": 3356380.0, "step": 590, "train_runtime": 3246.7315, "train_tokens_per_second": 1033.772 }, { "epoch": 0.476, "grad_norm": 229.0, "learning_rate": 1.129241134155949e-05, "loss": 5.4675, "mean_token_accuracy": 0.7908360511064529, "num_input_tokens_seen": 3383429, "num_tokens": 3383429.0, "step": 595, "train_runtime": 3273.3996, "train_tokens_per_second": 1033.613 }, { "epoch": 0.48, "grad_norm": 50.75, "learning_rate": 1.1163789700258656e-05, "loss": 5.1986, "mean_token_accuracy": 0.8053984194993973, "num_input_tokens_seen": 3415705, "num_tokens": 3415705.0, "step": 600, "train_runtime": 3305.0605, "train_tokens_per_second": 1033.477 }, { "epoch": 0.484, "grad_norm": 219.0, "learning_rate": 1.1034972578681338e-05, "loss": 5.1812, "mean_token_accuracy": 0.7935044363141059, "num_input_tokens_seen": 3443164, "num_tokens": 3443164.0, "step": 605, "train_runtime": 3332.0068, "train_tokens_per_second": 1033.36 }, { "epoch": 0.488, "grad_norm": 157.0, "learning_rate": 1.0905981614075693e-05, "loss": 5.1947, "mean_token_accuracy": 0.796344393491745, "num_input_tokens_seen": 3467326, "num_tokens": 3467326.0, "step": 610, "train_runtime": 3356.439, "train_tokens_per_second": 1033.037 }, { "epoch": 0.492, "grad_norm": 716.0, "learning_rate": 1.0776838472890065e-05, "loss": 5.292, "mean_token_accuracy": 0.7951879158616066, "num_input_tokens_seen": 3495396, "num_tokens": 3495396.0, "step": 615, "train_runtime": 3383.0011, "train_tokens_per_second": 1033.223 }, { "epoch": 0.496, "grad_norm": 49.5, "learning_rate": 1.06475648471337e-05, "loss": 5.0782, "mean_token_accuracy": 0.8010394781827926, "num_input_tokens_seen": 3520634, "num_tokens": 3520634.0, "step": 620, "train_runtime": 3409.0668, "train_tokens_per_second": 1032.727 }, { "epoch": 0.5, "grad_norm": 95.5, "learning_rate": 1.0518182450733185e-05, "loss": 5.5994, "mean_token_accuracy": 0.7855334684252739, "num_input_tokens_seen": 3549657, "num_tokens": 3549657.0, "step": 625, "train_runtime": 3436.945, "train_tokens_per_second": 1032.794 }, { "epoch": 0.504, "grad_norm": 76.0, "learning_rate": 1.0388713015885161e-05, "loss": 5.5592, "mean_token_accuracy": 0.7845589280128479, "num_input_tokens_seen": 3579684, "num_tokens": 3579684.0, "step": 630, "train_runtime": 3464.2838, "train_tokens_per_second": 1033.311 }, { "epoch": 0.508, "grad_norm": 44.5, "learning_rate": 1.0259178289406011e-05, "loss": 5.3878, "mean_token_accuracy": 0.7951082989573479, "num_input_tokens_seen": 3605087, "num_tokens": 3605087.0, "step": 635, "train_runtime": 3488.9594, "train_tokens_per_second": 1033.284 }, { "epoch": 0.512, "grad_norm": 59.5, "learning_rate": 1.0129600029079072e-05, "loss": 5.7061, "mean_token_accuracy": 0.7883853644132615, "num_input_tokens_seen": 3632957, "num_tokens": 3632957.0, "step": 640, "train_runtime": 3515.7879, "train_tokens_per_second": 1033.327 }, { "epoch": 0.516, "grad_norm": 52.25, "learning_rate": 1e-05, "loss": 5.2977, "mean_token_accuracy": 0.7912828177213669, "num_input_tokens_seen": 3660791, "num_tokens": 3660791.0, "step": 645, "train_runtime": 3543.3758, "train_tokens_per_second": 1033.137 }, { "epoch": 0.52, "grad_norm": 52.0, "learning_rate": 9.870399970920932e-06, "loss": 5.4416, "mean_token_accuracy": 0.7928792417049408, "num_input_tokens_seen": 3689640, "num_tokens": 3689640.0, "step": 650, "train_runtime": 3570.9162, "train_tokens_per_second": 1033.247 }, { "epoch": 0.524, "grad_norm": 62.75, "learning_rate": 9.740821710593989e-06, "loss": 5.3682, "mean_token_accuracy": 0.798756355047226, "num_input_tokens_seen": 3715172, "num_tokens": 3715172.0, "step": 655, "train_runtime": 3596.1899, "train_tokens_per_second": 1033.086 }, { "epoch": 0.528, "grad_norm": 89.5, "learning_rate": 9.61128698411484e-06, "loss": 5.168, "mean_token_accuracy": 0.7963975608348847, "num_input_tokens_seen": 3741030, "num_tokens": 3741030.0, "step": 660, "train_runtime": 3620.5686, "train_tokens_per_second": 1033.271 }, { "epoch": 0.532, "grad_norm": 74.0, "learning_rate": 9.481817549266817e-06, "loss": 4.9416, "mean_token_accuracy": 0.8047078907489776, "num_input_tokens_seen": 3768537, "num_tokens": 3768537.0, "step": 665, "train_runtime": 3647.1969, "train_tokens_per_second": 1033.269 }, { "epoch": 0.536, "grad_norm": 45.25, "learning_rate": 9.352435152866299e-06, "loss": 5.3385, "mean_token_accuracy": 0.7905179291963578, "num_input_tokens_seen": 3799526, "num_tokens": 3799526.0, "step": 670, "train_runtime": 3674.2376, "train_tokens_per_second": 1034.099 }, { "epoch": 0.54, "grad_norm": 50.75, "learning_rate": 9.223161527109938e-06, "loss": 5.1579, "mean_token_accuracy": 0.8002543315291405, "num_input_tokens_seen": 3833970, "num_tokens": 3833970.0, "step": 675, "train_runtime": 3706.0243, "train_tokens_per_second": 1034.524 }, { "epoch": 0.544, "grad_norm": 185.0, "learning_rate": 9.09401838592431e-06, "loss": 5.1117, "mean_token_accuracy": 0.7966769933700562, "num_input_tokens_seen": 3860184, "num_tokens": 3860184.0, "step": 680, "train_runtime": 3730.8409, "train_tokens_per_second": 1034.669 }, { "epoch": 0.548, "grad_norm": 94.0, "learning_rate": 8.965027421318666e-06, "loss": 5.1979, "mean_token_accuracy": 0.7981098249554635, "num_input_tokens_seen": 3886677, "num_tokens": 3886677.0, "step": 685, "train_runtime": 3756.8191, "train_tokens_per_second": 1034.566 }, { "epoch": 0.552, "grad_norm": 47.75, "learning_rate": 8.836210299741346e-06, "loss": 5.6591, "mean_token_accuracy": 0.7767333656549453, "num_input_tokens_seen": 3914353, "num_tokens": 3914353.0, "step": 690, "train_runtime": 3789.086, "train_tokens_per_second": 1033.06 }, { "epoch": 0.556, "grad_norm": 106.0, "learning_rate": 8.707588658440511e-06, "loss": 5.5255, "mean_token_accuracy": 0.7878315180540085, "num_input_tokens_seen": 3940719, "num_tokens": 3940719.0, "step": 695, "train_runtime": 3814.8928, "train_tokens_per_second": 1032.983 }, { "epoch": 0.56, "grad_norm": 41.25, "learning_rate": 8.579184101829734e-06, "loss": 5.5184, "mean_token_accuracy": 0.7867645308375358, "num_input_tokens_seen": 3978888, "num_tokens": 3978888.0, "step": 700, "train_runtime": 3851.4853, "train_tokens_per_second": 1033.079 }, { "epoch": 0.564, "grad_norm": 90.5, "learning_rate": 8.451018197859153e-06, "loss": 5.3507, "mean_token_accuracy": 0.7968938469886779, "num_input_tokens_seen": 4011090, "num_tokens": 4011090.0, "step": 705, "train_runtime": 3880.8486, "train_tokens_per_second": 1033.56 }, { "epoch": 0.568, "grad_norm": 101.0, "learning_rate": 8.323112474392731e-06, "loss": 5.4665, "mean_token_accuracy": 0.7853416830301285, "num_input_tokens_seen": 4038326, "num_tokens": 4038326.0, "step": 710, "train_runtime": 3907.9772, "train_tokens_per_second": 1033.355 }, { "epoch": 0.572, "grad_norm": 318.0, "learning_rate": 8.195488415592238e-06, "loss": 5.5897, "mean_token_accuracy": 0.7923433750867843, "num_input_tokens_seen": 4069168, "num_tokens": 4069168.0, "step": 715, "train_runtime": 3938.7796, "train_tokens_per_second": 1033.104 }, { "epoch": 0.576, "grad_norm": 192.0, "learning_rate": 8.068167458308582e-06, "loss": 5.4862, "mean_token_accuracy": 0.7898807466030121, "num_input_tokens_seen": 4096317, "num_tokens": 4096317.0, "step": 720, "train_runtime": 3965.423, "train_tokens_per_second": 1033.009 }, { "epoch": 0.58, "grad_norm": 54.25, "learning_rate": 7.941170988481108e-06, "loss": 5.6047, "mean_token_accuracy": 0.7875970765948296, "num_input_tokens_seen": 4125700, "num_tokens": 4125700.0, "step": 725, "train_runtime": 3992.1983, "train_tokens_per_second": 1033.441 }, { "epoch": 0.584, "grad_norm": 56.0, "learning_rate": 7.814520337545405e-06, "loss": 5.628, "mean_token_accuracy": 0.7885335445404053, "num_input_tokens_seen": 4151858, "num_tokens": 4151858.0, "step": 730, "train_runtime": 4017.6605, "train_tokens_per_second": 1033.402 }, { "epoch": 0.588, "grad_norm": 79.0, "learning_rate": 7.688236778850307e-06, "loss": 5.6834, "mean_token_accuracy": 0.7778642490506172, "num_input_tokens_seen": 4175955, "num_tokens": 4175955.0, "step": 735, "train_runtime": 4042.321, "train_tokens_per_second": 1033.059 }, { "epoch": 0.592, "grad_norm": 57.5, "learning_rate": 7.5623415240846235e-06, "loss": 5.4957, "mean_token_accuracy": 0.7904580295085907, "num_input_tokens_seen": 4203815, "num_tokens": 4203815.0, "step": 740, "train_runtime": 4068.3684, "train_tokens_per_second": 1033.293 }, { "epoch": 0.596, "grad_norm": 63.0, "learning_rate": 7.4368557197142596e-06, "loss": 4.8703, "mean_token_accuracy": 0.8159982651472092, "num_input_tokens_seen": 4233192, "num_tokens": 4233192.0, "step": 745, "train_runtime": 4097.3421, "train_tokens_per_second": 1033.156 }, { "epoch": 0.6, "grad_norm": 71.5, "learning_rate": 7.311800443430251e-06, "loss": 5.3462, "mean_token_accuracy": 0.794153805077076, "num_input_tokens_seen": 4262718, "num_tokens": 4262718.0, "step": 750, "train_runtime": 4124.7729, "train_tokens_per_second": 1033.443 }, { "epoch": 0.604, "grad_norm": 152.0, "learning_rate": 7.187196700608373e-06, "loss": 4.9215, "mean_token_accuracy": 0.8081203132867814, "num_input_tokens_seen": 4288296, "num_tokens": 4288296.0, "step": 755, "train_runtime": 4151.2234, "train_tokens_per_second": 1033.02 }, { "epoch": 0.608, "grad_norm": 73.5, "learning_rate": 7.063065420780909e-06, "loss": 5.2529, "mean_token_accuracy": 0.7946424350142479, "num_input_tokens_seen": 4318154, "num_tokens": 4318154.0, "step": 760, "train_runtime": 4180.0353, "train_tokens_per_second": 1033.042 }, { "epoch": 0.612, "grad_norm": 247.0, "learning_rate": 6.939427454121128e-06, "loss": 4.9659, "mean_token_accuracy": 0.8127795070409775, "num_input_tokens_seen": 4345188, "num_tokens": 4345188.0, "step": 765, "train_runtime": 4205.4405, "train_tokens_per_second": 1033.23 }, { "epoch": 0.616, "grad_norm": 80.0, "learning_rate": 6.816303567941111e-06, "loss": 5.2259, "mean_token_accuracy": 0.7914159163832665, "num_input_tokens_seen": 4373786, "num_tokens": 4373786.0, "step": 770, "train_runtime": 4233.1681, "train_tokens_per_second": 1033.218 }, { "epoch": 0.62, "grad_norm": 130.0, "learning_rate": 6.693714443203507e-06, "loss": 5.6243, "mean_token_accuracy": 0.7848341032862663, "num_input_tokens_seen": 4404430, "num_tokens": 4404430.0, "step": 775, "train_runtime": 4261.1517, "train_tokens_per_second": 1033.624 }, { "epoch": 0.624, "grad_norm": 59.25, "learning_rate": 6.571680671047749e-06, "loss": 4.9764, "mean_token_accuracy": 0.8033898919820786, "num_input_tokens_seen": 4434585, "num_tokens": 4434585.0, "step": 780, "train_runtime": 4290.2703, "train_tokens_per_second": 1033.638 }, { "epoch": 0.628, "grad_norm": 109.5, "learning_rate": 6.450222749331414e-06, "loss": 5.7304, "mean_token_accuracy": 0.7839483708143234, "num_input_tokens_seen": 4463472, "num_tokens": 4463472.0, "step": 785, "train_runtime": 4317.149, "train_tokens_per_second": 1033.893 }, { "epoch": 0.632, "grad_norm": 64.5, "learning_rate": 6.329361079187199e-06, "loss": 5.7255, "mean_token_accuracy": 0.7803121000528336, "num_input_tokens_seen": 4490469, "num_tokens": 4490469.0, "step": 790, "train_runtime": 4342.3523, "train_tokens_per_second": 1034.11 }, { "epoch": 0.636, "grad_norm": 116.0, "learning_rate": 6.209115961596208e-06, "loss": 5.347, "mean_token_accuracy": 0.7927216425538063, "num_input_tokens_seen": 4517649, "num_tokens": 4517649.0, "step": 795, "train_runtime": 4368.0708, "train_tokens_per_second": 1034.244 }, { "epoch": 0.64, "grad_norm": 57.5, "learning_rate": 6.0895075939779705e-06, "loss": 5.5641, "mean_token_accuracy": 0.795795188844204, "num_input_tokens_seen": 4550712, "num_tokens": 4550712.0, "step": 800, "train_runtime": 4400.2836, "train_tokens_per_second": 1034.186 }, { "epoch": 0.644, "grad_norm": 61.5, "learning_rate": 5.970556066797941e-06, "loss": 5.8032, "mean_token_accuracy": 0.7773613944649697, "num_input_tokens_seen": 4585129, "num_tokens": 4585129.0, "step": 805, "train_runtime": 4430.3262, "train_tokens_per_second": 1034.942 }, { "epoch": 0.648, "grad_norm": 46.75, "learning_rate": 5.852281360192933e-06, "loss": 5.4492, "mean_token_accuracy": 0.7958677127957344, "num_input_tokens_seen": 4614693, "num_tokens": 4614693.0, "step": 810, "train_runtime": 4458.5188, "train_tokens_per_second": 1035.028 }, { "epoch": 0.652, "grad_norm": 135.0, "learning_rate": 5.7347033406150494e-06, "loss": 5.2903, "mean_token_accuracy": 0.7904482677578926, "num_input_tokens_seen": 4644174, "num_tokens": 4644174.0, "step": 815, "train_runtime": 4486.8043, "train_tokens_per_second": 1035.074 }, { "epoch": 0.656, "grad_norm": 62.0, "learning_rate": 5.617841757494762e-06, "loss": 5.322, "mean_token_accuracy": 0.7915823593735695, "num_input_tokens_seen": 4673847, "num_tokens": 4673847.0, "step": 820, "train_runtime": 4515.9617, "train_tokens_per_second": 1034.962 }, { "epoch": 0.66, "grad_norm": 258.0, "learning_rate": 5.501716239923642e-06, "loss": 5.0579, "mean_token_accuracy": 0.8000400334596633, "num_input_tokens_seen": 4700972, "num_tokens": 4700972.0, "step": 825, "train_runtime": 4541.9592, "train_tokens_per_second": 1035.01 }, { "epoch": 0.664, "grad_norm": 70.0, "learning_rate": 5.386346293357242e-06, "loss": 5.2416, "mean_token_accuracy": 0.8032521203160286, "num_input_tokens_seen": 4728665, "num_tokens": 4728665.0, "step": 830, "train_runtime": 4570.1941, "train_tokens_per_second": 1034.675 }, { "epoch": 0.668, "grad_norm": 100.5, "learning_rate": 5.271751296338823e-06, "loss": 5.4391, "mean_token_accuracy": 0.7925701707601547, "num_input_tokens_seen": 4756105, "num_tokens": 4756105.0, "step": 835, "train_runtime": 4596.0174, "train_tokens_per_second": 1034.832 }, { "epoch": 0.672, "grad_norm": 109.5, "learning_rate": 5.15795049724435e-06, "loss": 5.7334, "mean_token_accuracy": 0.7834463611245155, "num_input_tokens_seen": 4784316, "num_tokens": 4784316.0, "step": 840, "train_runtime": 4623.5278, "train_tokens_per_second": 1034.776 }, { "epoch": 0.676, "grad_norm": 58.75, "learning_rate": 5.044963011049384e-06, "loss": 5.3186, "mean_token_accuracy": 0.7899114429950714, "num_input_tokens_seen": 4810582, "num_tokens": 4810582.0, "step": 845, "train_runtime": 4648.9466, "train_tokens_per_second": 1034.768 }, { "epoch": 0.68, "grad_norm": 47.0, "learning_rate": 4.932807816118347e-06, "loss": 5.5814, "mean_token_accuracy": 0.7850132435560226, "num_input_tokens_seen": 4846569, "num_tokens": 4846569.0, "step": 850, "train_runtime": 4682.0829, "train_tokens_per_second": 1035.131 }, { "epoch": 0.684, "grad_norm": 34.5, "learning_rate": 4.821503751016746e-06, "loss": 5.3389, "mean_token_accuracy": 0.7990380316972733, "num_input_tokens_seen": 4876152, "num_tokens": 4876152.0, "step": 855, "train_runtime": 4709.9784, "train_tokens_per_second": 1035.281 }, { "epoch": 0.688, "grad_norm": 412.0, "learning_rate": 4.711069511346909e-06, "loss": 5.7378, "mean_token_accuracy": 0.7886842951178551, "num_input_tokens_seen": 4913773, "num_tokens": 4913773.0, "step": 860, "train_runtime": 4744.369, "train_tokens_per_second": 1035.706 }, { "epoch": 0.692, "grad_norm": 62.75, "learning_rate": 4.601523646607675e-06, "loss": 5.8298, "mean_token_accuracy": 0.7812601879239083, "num_input_tokens_seen": 4943904, "num_tokens": 4943904.0, "step": 865, "train_runtime": 4772.8549, "train_tokens_per_second": 1035.838 }, { "epoch": 0.696, "grad_norm": 57.75, "learning_rate": 4.492884557078688e-06, "loss": 4.8728, "mean_token_accuracy": 0.8109473079442978, "num_input_tokens_seen": 4969286, "num_tokens": 4969286.0, "step": 870, "train_runtime": 4796.4072, "train_tokens_per_second": 1036.043 }, { "epoch": 0.7, "grad_norm": 49.5, "learning_rate": 4.385170490729712e-06, "loss": 5.3615, "mean_token_accuracy": 0.7962699040770531, "num_input_tokens_seen": 4996130, "num_tokens": 4996130.0, "step": 875, "train_runtime": 4822.3119, "train_tokens_per_second": 1036.045 }, { "epoch": 0.704, "grad_norm": 36.75, "learning_rate": 4.278399540155536e-06, "loss": 5.6546, "mean_token_accuracy": 0.7811813220381737, "num_input_tokens_seen": 5024440, "num_tokens": 5024440.0, "step": 880, "train_runtime": 4849.0415, "train_tokens_per_second": 1036.172 }, { "epoch": 0.708, "grad_norm": 198.0, "learning_rate": 4.172589639536992e-06, "loss": 5.3273, "mean_token_accuracy": 0.7892009258270264, "num_input_tokens_seen": 5051605, "num_tokens": 5051605.0, "step": 885, "train_runtime": 4873.5248, "train_tokens_per_second": 1036.54 }, { "epoch": 0.712, "grad_norm": 91.0, "learning_rate": 4.067758561628577e-06, "loss": 5.7275, "mean_token_accuracy": 0.7812389150261879, "num_input_tokens_seen": 5080104, "num_tokens": 5080104.0, "step": 890, "train_runtime": 4901.1022, "train_tokens_per_second": 1036.523 }, { "epoch": 0.716, "grad_norm": 35.5, "learning_rate": 3.9639239147731865e-06, "loss": 5.425, "mean_token_accuracy": 0.7942001700401307, "num_input_tokens_seen": 5113018, "num_tokens": 5113018.0, "step": 895, "train_runtime": 4930.903, "train_tokens_per_second": 1036.933 }, { "epoch": 0.72, "grad_norm": 61.5, "learning_rate": 3.861103139944448e-06, "loss": 5.2248, "mean_token_accuracy": 0.8011772260069847, "num_input_tokens_seen": 5142941, "num_tokens": 5142941.0, "step": 900, "train_runtime": 4959.3927, "train_tokens_per_second": 1037.01 }, { "epoch": 0.724, "grad_norm": 41.75, "learning_rate": 3.759313507817196e-06, "loss": 5.1091, "mean_token_accuracy": 0.7965022444725036, "num_input_tokens_seen": 5172026, "num_tokens": 5172026.0, "step": 905, "train_runtime": 4985.9195, "train_tokens_per_second": 1037.326 }, { "epoch": 0.728, "grad_norm": 53.25, "learning_rate": 3.658572115866541e-06, "loss": 5.7217, "mean_token_accuracy": 0.7808126404881477, "num_input_tokens_seen": 5202257, "num_tokens": 5202257.0, "step": 910, "train_runtime": 5014.4393, "train_tokens_per_second": 1037.455 }, { "epoch": 0.732, "grad_norm": 44.5, "learning_rate": 3.558895885496023e-06, "loss": 5.3946, "mean_token_accuracy": 0.789019052684307, "num_input_tokens_seen": 5229589, "num_tokens": 5229589.0, "step": 915, "train_runtime": 5040.3771, "train_tokens_per_second": 1037.539 }, { "epoch": 0.736, "grad_norm": 318.0, "learning_rate": 3.4603015591953393e-06, "loss": 5.3473, "mean_token_accuracy": 0.805972746014595, "num_input_tokens_seen": 5258125, "num_tokens": 5258125.0, "step": 920, "train_runtime": 5066.3915, "train_tokens_per_second": 1037.844 }, { "epoch": 0.74, "grad_norm": 41.0, "learning_rate": 3.3628056977281456e-06, "loss": 5.6571, "mean_token_accuracy": 0.7789395034313202, "num_input_tokens_seen": 5286827, "num_tokens": 5286827.0, "step": 925, "train_runtime": 5093.3897, "train_tokens_per_second": 1037.978 }, { "epoch": 0.744, "grad_norm": 50.25, "learning_rate": 3.266424677350346e-06, "loss": 5.2024, "mean_token_accuracy": 0.7979300260543823, "num_input_tokens_seen": 5313346, "num_tokens": 5313346.0, "step": 930, "train_runtime": 5119.0652, "train_tokens_per_second": 1037.952 }, { "epoch": 0.748, "grad_norm": 70.0, "learning_rate": 3.1711746870594083e-06, "loss": 5.2941, "mean_token_accuracy": 0.805306826531887, "num_input_tokens_seen": 5343136, "num_tokens": 5343136.0, "step": 935, "train_runtime": 5148.1407, "train_tokens_per_second": 1037.877 }, { "epoch": 0.752, "grad_norm": 56.25, "learning_rate": 3.077071725875116e-06, "loss": 5.1137, "mean_token_accuracy": 0.8032099828124046, "num_input_tokens_seen": 5372738, "num_tokens": 5372738.0, "step": 940, "train_runtime": 5176.4597, "train_tokens_per_second": 1037.917 }, { "epoch": 0.756, "grad_norm": 46.75, "learning_rate": 2.9841316001522345e-06, "loss": 4.861, "mean_token_accuracy": 0.809606908261776, "num_input_tokens_seen": 5400053, "num_tokens": 5400053.0, "step": 945, "train_runtime": 5203.2867, "train_tokens_per_second": 1037.816 }, { "epoch": 0.76, "grad_norm": 306.0, "learning_rate": 2.8923699209255285e-06, "loss": 5.4282, "mean_token_accuracy": 0.7930289775133132, "num_input_tokens_seen": 5423237, "num_tokens": 5423237.0, "step": 950, "train_runtime": 5227.2135, "train_tokens_per_second": 1037.501 }, { "epoch": 0.764, "grad_norm": 61.25, "learning_rate": 2.8018021012875994e-06, "loss": 5.0416, "mean_token_accuracy": 0.8067665219306945, "num_input_tokens_seen": 5451018, "num_tokens": 5451018.0, "step": 955, "train_runtime": 5254.5977, "train_tokens_per_second": 1037.381 }, { "epoch": 0.768, "grad_norm": 76.5, "learning_rate": 2.7124433537999838e-06, "loss": 5.3888, "mean_token_accuracy": 0.7953536674380303, "num_input_tokens_seen": 5481559, "num_tokens": 5481559.0, "step": 960, "train_runtime": 5281.8099, "train_tokens_per_second": 1037.818 }, { "epoch": 0.772, "grad_norm": 76.5, "learning_rate": 2.6243086879379e-06, "loss": 5.3031, "mean_token_accuracy": 0.7893404349684715, "num_input_tokens_seen": 5509600, "num_tokens": 5509600.0, "step": 965, "train_runtime": 5308.2353, "train_tokens_per_second": 1037.934 }, { "epoch": 0.776, "grad_norm": 53.0, "learning_rate": 2.537412907569127e-06, "loss": 5.2369, "mean_token_accuracy": 0.7978512957692147, "num_input_tokens_seen": 5538862, "num_tokens": 5538862.0, "step": 970, "train_runtime": 5335.0361, "train_tokens_per_second": 1038.205 }, { "epoch": 0.78, "grad_norm": 66.5, "learning_rate": 2.451770608467432e-06, "loss": 5.1098, "mean_token_accuracy": 0.805266946554184, "num_input_tokens_seen": 5566918, "num_tokens": 5566918.0, "step": 975, "train_runtime": 5362.4471, "train_tokens_per_second": 1038.13 }, { "epoch": 0.784, "grad_norm": 49.25, "learning_rate": 2.3673961758609156e-06, "loss": 5.186, "mean_token_accuracy": 0.8020475476980209, "num_input_tokens_seen": 5593453, "num_tokens": 5593453.0, "step": 980, "train_runtime": 5387.3852, "train_tokens_per_second": 1038.25 }, { "epoch": 0.788, "grad_norm": 83.0, "learning_rate": 2.2843037820157678e-06, "loss": 5.2522, "mean_token_accuracy": 0.7931860506534576, "num_input_tokens_seen": 5621006, "num_tokens": 5621006.0, "step": 985, "train_runtime": 5413.5032, "train_tokens_per_second": 1038.331 }, { "epoch": 0.792, "grad_norm": 121.5, "learning_rate": 2.2025073838557454e-06, "loss": 5.142, "mean_token_accuracy": 0.7944848969578743, "num_input_tokens_seen": 5649809, "num_tokens": 5649809.0, "step": 990, "train_runtime": 5441.3683, "train_tokens_per_second": 1038.307 }, { "epoch": 0.796, "grad_norm": 70.0, "learning_rate": 2.122020720617869e-06, "loss": 5.3737, "mean_token_accuracy": 0.7889134347438812, "num_input_tokens_seen": 5676379, "num_tokens": 5676379.0, "step": 995, "train_runtime": 5465.6045, "train_tokens_per_second": 1038.564 }, { "epoch": 0.8, "grad_norm": 63.25, "learning_rate": 2.0428573115446394e-06, "loss": 5.336, "mean_token_accuracy": 0.791542237997055, "num_input_tokens_seen": 5700207, "num_tokens": 5700207.0, "step": 1000, "train_runtime": 5489.0151, "train_tokens_per_second": 1038.475 }, { "epoch": 0.804, "grad_norm": 260.0, "learning_rate": 1.9650304536132426e-06, "loss": 5.263, "mean_token_accuracy": 0.7995662987232208, "num_input_tokens_seen": 5725964, "num_tokens": 5725964.0, "step": 1005, "train_runtime": 5514.6903, "train_tokens_per_second": 1038.311 }, { "epoch": 0.808, "grad_norm": 158.0, "learning_rate": 1.8885532193020706e-06, "loss": 5.3317, "mean_token_accuracy": 0.7876490637660026, "num_input_tokens_seen": 5754571, "num_tokens": 5754571.0, "step": 1010, "train_runtime": 5540.9562, "train_tokens_per_second": 1038.552 }, { "epoch": 0.812, "grad_norm": 45.25, "learning_rate": 1.813438454394948e-06, "loss": 5.2275, "mean_token_accuracy": 0.7981337189674378, "num_input_tokens_seen": 5783614, "num_tokens": 5783614.0, "step": 1015, "train_runtime": 5567.5443, "train_tokens_per_second": 1038.809 }, { "epoch": 0.816, "grad_norm": 36.25, "learning_rate": 1.7396987758234418e-06, "loss": 4.7787, "mean_token_accuracy": 0.8220131769776344, "num_input_tokens_seen": 5809495, "num_tokens": 5809495.0, "step": 1020, "train_runtime": 5592.6221, "train_tokens_per_second": 1038.778 }, { "epoch": 0.82, "grad_norm": 80.5, "learning_rate": 1.6673465695476233e-06, "loss": 4.8219, "mean_token_accuracy": 0.8117146581411362, "num_input_tokens_seen": 5836071, "num_tokens": 5836071.0, "step": 1025, "train_runtime": 5618.3237, "train_tokens_per_second": 1038.757 }, { "epoch": 0.824, "grad_norm": 49.5, "learning_rate": 1.5963939884756042e-06, "loss": 5.5483, "mean_token_accuracy": 0.7852835282683372, "num_input_tokens_seen": 5867317, "num_tokens": 5867317.0, "step": 1030, "train_runtime": 5647.6112, "train_tokens_per_second": 1038.902 }, { "epoch": 0.828, "grad_norm": 56.25, "learning_rate": 1.5268529504222262e-06, "loss": 5.2143, "mean_token_accuracy": 0.799188706278801, "num_input_tokens_seen": 5897945, "num_tokens": 5897945.0, "step": 1035, "train_runtime": 5677.9014, "train_tokens_per_second": 1038.754 }, { "epoch": 0.832, "grad_norm": 59.0, "learning_rate": 1.4587351361072455e-06, "loss": 5.4266, "mean_token_accuracy": 0.7901453331112862, "num_input_tokens_seen": 5923523, "num_tokens": 5923523.0, "step": 1040, "train_runtime": 5702.9066, "train_tokens_per_second": 1038.685 }, { "epoch": 0.836, "grad_norm": 144.0, "learning_rate": 1.3920519871933425e-06, "loss": 5.4357, "mean_token_accuracy": 0.7978359222412109, "num_input_tokens_seen": 5956415, "num_tokens": 5956415.0, "step": 1045, "train_runtime": 5733.0236, "train_tokens_per_second": 1038.966 }, { "epoch": 0.84, "grad_norm": 177.0, "learning_rate": 1.326814704364262e-06, "loss": 5.3211, "mean_token_accuracy": 0.7962174996733665, "num_input_tokens_seen": 5985449, "num_tokens": 5985449.0, "step": 1050, "train_runtime": 5761.1481, "train_tokens_per_second": 1038.933 }, { "epoch": 0.844, "grad_norm": 74.0, "learning_rate": 1.263034245443473e-06, "loss": 5.2974, "mean_token_accuracy": 0.7944592133164405, "num_input_tokens_seen": 6013339, "num_tokens": 6013339.0, "step": 1055, "train_runtime": 5788.1881, "train_tokens_per_second": 1038.898 }, { "epoch": 0.848, "grad_norm": 75.0, "learning_rate": 1.2007213235535785e-06, "loss": 5.3725, "mean_token_accuracy": 0.7937130227684974, "num_input_tokens_seen": 6043191, "num_tokens": 6043191.0, "step": 1060, "train_runtime": 5815.2046, "train_tokens_per_second": 1039.205 }, { "epoch": 0.852, "grad_norm": 60.0, "learning_rate": 1.1398864053168534e-06, "loss": 4.9906, "mean_token_accuracy": 0.8006555408239364, "num_input_tokens_seen": 6071081, "num_tokens": 6071081.0, "step": 1065, "train_runtime": 5841.7991, "train_tokens_per_second": 1039.249 }, { "epoch": 0.856, "grad_norm": 60.5, "learning_rate": 1.0805397090971738e-06, "loss": 4.9086, "mean_token_accuracy": 0.8100662231445312, "num_input_tokens_seen": 6097563, "num_tokens": 6097563.0, "step": 1070, "train_runtime": 5868.5095, "train_tokens_per_second": 1039.031 }, { "epoch": 0.86, "grad_norm": 57.25, "learning_rate": 1.022691203283661e-06, "loss": 5.4238, "mean_token_accuracy": 0.792490765452385, "num_input_tokens_seen": 6126398, "num_tokens": 6126398.0, "step": 1075, "train_runtime": 5895.323, "train_tokens_per_second": 1039.196 }, { "epoch": 0.864, "grad_norm": 38.5, "learning_rate": 9.663506046162986e-07, "loss": 5.1983, "mean_token_accuracy": 0.7941671445965767, "num_input_tokens_seen": 6154407, "num_tokens": 6154407.0, "step": 1080, "train_runtime": 5920.8046, "train_tokens_per_second": 1039.455 }, { "epoch": 0.868, "grad_norm": 536.0, "learning_rate": 9.115273765538202e-07, "loss": 5.5089, "mean_token_accuracy": 0.7930781245231628, "num_input_tokens_seen": 6182939, "num_tokens": 6182939.0, "step": 1085, "train_runtime": 5947.8298, "train_tokens_per_second": 1039.529 }, { "epoch": 0.872, "grad_norm": 47.25, "learning_rate": 8.582307276841461e-07, "loss": 5.3598, "mean_token_accuracy": 0.7864043831825256, "num_input_tokens_seen": 6212094, "num_tokens": 6212094.0, "step": 1090, "train_runtime": 5975.8227, "train_tokens_per_second": 1039.538 }, { "epoch": 0.876, "grad_norm": 41.5, "learning_rate": 8.06469610177636e-07, "loss": 5.3994, "mean_token_accuracy": 0.7908027723431588, "num_input_tokens_seen": 6238778, "num_tokens": 6238778.0, "step": 1095, "train_runtime": 6000.9384, "train_tokens_per_second": 1039.634 }, { "epoch": 0.88, "grad_norm": 312.0, "learning_rate": 7.562527182833978e-07, "loss": 5.3973, "mean_token_accuracy": 0.793465219438076, "num_input_tokens_seen": 6265105, "num_tokens": 6265105.0, "step": 1100, "train_runtime": 6025.9546, "train_tokens_per_second": 1039.687 }, { "epoch": 0.884, "grad_norm": 128.0, "learning_rate": 7.07588486868922e-07, "loss": 5.1888, "mean_token_accuracy": 0.8035556092858315, "num_input_tokens_seen": 6290027, "num_tokens": 6290027.0, "step": 1105, "train_runtime": 6051.3714, "train_tokens_per_second": 1039.438 }, { "epoch": 0.888, "grad_norm": 49.0, "learning_rate": 6.604850900032956e-07, "loss": 4.7405, "mean_token_accuracy": 0.8212268218398094, "num_input_tokens_seen": 6317712, "num_tokens": 6317712.0, "step": 1110, "train_runtime": 6077.8737, "train_tokens_per_second": 1039.461 }, { "epoch": 0.892, "grad_norm": 79.5, "learning_rate": 6.149504395842087e-07, "loss": 5.3393, "mean_token_accuracy": 0.7951319962739944, "num_input_tokens_seen": 6343602, "num_tokens": 6343602.0, "step": 1115, "train_runtime": 6105.1625, "train_tokens_per_second": 1039.055 }, { "epoch": 0.896, "grad_norm": 123.0, "learning_rate": 5.709921840090072e-07, "loss": 5.2021, "mean_token_accuracy": 0.7978611201047897, "num_input_tokens_seen": 6382499, "num_tokens": 6382499.0, "step": 1120, "train_runtime": 6145.3873, "train_tokens_per_second": 1038.584 }, { "epoch": 0.9, "grad_norm": 108.5, "learning_rate": 5.286177068899989e-07, "loss": 5.2466, "mean_token_accuracy": 0.7941580578684807, "num_input_tokens_seen": 6409279, "num_tokens": 6409279.0, "step": 1125, "train_runtime": 6172.0603, "train_tokens_per_second": 1038.434 }, { "epoch": 0.904, "grad_norm": 50.0, "learning_rate": 4.878341258142349e-07, "loss": 5.4412, "mean_token_accuracy": 0.7916087701916694, "num_input_tokens_seen": 6440759, "num_tokens": 6440759.0, "step": 1130, "train_runtime": 6198.541, "train_tokens_per_second": 1039.077 }, { "epoch": 0.908, "grad_norm": 130.0, "learning_rate": 4.4864829114798394e-07, "loss": 4.9766, "mean_token_accuracy": 0.8005422234535218, "num_input_tokens_seen": 6468905, "num_tokens": 6468905.0, "step": 1135, "train_runtime": 6225.8798, "train_tokens_per_second": 1039.035 }, { "epoch": 0.912, "grad_norm": 79.0, "learning_rate": 4.11066784886075e-07, "loss": 5.3727, "mean_token_accuracy": 0.7998930081725121, "num_input_tokens_seen": 6493165, "num_tokens": 6493165.0, "step": 1140, "train_runtime": 6249.8312, "train_tokens_per_second": 1038.934 }, { "epoch": 0.916, "grad_norm": 54.0, "learning_rate": 3.750959195463466e-07, "loss": 5.3525, "mean_token_accuracy": 0.7876800760626793, "num_input_tokens_seen": 6522770, "num_tokens": 6522770.0, "step": 1145, "train_runtime": 6278.1358, "train_tokens_per_second": 1038.966 }, { "epoch": 0.92, "grad_norm": 206.0, "learning_rate": 3.4074173710931804e-07, "loss": 5.7993, "mean_token_accuracy": 0.7791803061962128, "num_input_tokens_seen": 6549386, "num_tokens": 6549386.0, "step": 1150, "train_runtime": 6302.9235, "train_tokens_per_second": 1039.103 }, { "epoch": 0.924, "grad_norm": 99.0, "learning_rate": 3.080100080033388e-07, "loss": 5.0963, "mean_token_accuracy": 0.7987044736742973, "num_input_tokens_seen": 6578069, "num_tokens": 6578069.0, "step": 1155, "train_runtime": 6330.1523, "train_tokens_per_second": 1039.164 }, { "epoch": 0.928, "grad_norm": 83.0, "learning_rate": 2.769062301353398e-07, "loss": 5.5875, "mean_token_accuracy": 0.7888873621821404, "num_input_tokens_seen": 6603075, "num_tokens": 6603075.0, "step": 1160, "train_runtime": 6354.1556, "train_tokens_per_second": 1039.174 }, { "epoch": 0.932, "grad_norm": 102.0, "learning_rate": 2.474356279673462e-07, "loss": 5.6995, "mean_token_accuracy": 0.7825249642133713, "num_input_tokens_seen": 6635809, "num_tokens": 6635809.0, "step": 1165, "train_runtime": 6382.7967, "train_tokens_per_second": 1039.64 }, { "epoch": 0.936, "grad_norm": 56.5, "learning_rate": 2.1960315163894075e-07, "loss": 5.1911, "mean_token_accuracy": 0.7973916217684746, "num_input_tokens_seen": 6661327, "num_tokens": 6661327.0, "step": 1170, "train_runtime": 6406.6683, "train_tokens_per_second": 1039.749 }, { "epoch": 0.94, "grad_norm": 69.5, "learning_rate": 1.9341347613579086e-07, "loss": 5.1047, "mean_token_accuracy": 0.8013954371213913, "num_input_tokens_seen": 6690372, "num_tokens": 6690372.0, "step": 1175, "train_runtime": 6435.8803, "train_tokens_per_second": 1039.543 }, { "epoch": 0.944, "grad_norm": 73.5, "learning_rate": 1.6887100050439587e-07, "loss": 5.6123, "mean_token_accuracy": 0.7876136094331742, "num_input_tokens_seen": 6719639, "num_tokens": 6719639.0, "step": 1180, "train_runtime": 6464.1419, "train_tokens_per_second": 1039.525 }, { "epoch": 0.948, "grad_norm": 87.0, "learning_rate": 1.459798471131868e-07, "loss": 5.4049, "mean_token_accuracy": 0.7874793767929077, "num_input_tokens_seen": 6745687, "num_tokens": 6745687.0, "step": 1185, "train_runtime": 6489.0041, "train_tokens_per_second": 1039.557 }, { "epoch": 0.952, "grad_norm": 454.0, "learning_rate": 1.2474386096010037e-07, "loss": 5.0158, "mean_token_accuracy": 0.8049899056553841, "num_input_tokens_seen": 6774080, "num_tokens": 6774080.0, "step": 1190, "train_runtime": 6514.6768, "train_tokens_per_second": 1039.818 }, { "epoch": 0.956, "grad_norm": 74.5, "learning_rate": 1.0516660902673448e-07, "loss": 5.4421, "mean_token_accuracy": 0.7932335063815117, "num_input_tokens_seen": 6803981, "num_tokens": 6803981.0, "step": 1195, "train_runtime": 6542.1628, "train_tokens_per_second": 1040.02 }, { "epoch": 0.96, "grad_norm": 67.5, "learning_rate": 8.725137967920739e-08, "loss": 5.6285, "mean_token_accuracy": 0.7887401878833771, "num_input_tokens_seen": 6829351, "num_tokens": 6829351.0, "step": 1200, "train_runtime": 6567.4241, "train_tokens_per_second": 1039.883 }, { "epoch": 0.964, "grad_norm": 98.0, "learning_rate": 7.100118211581852e-08, "loss": 5.5201, "mean_token_accuracy": 0.7889957845211029, "num_input_tokens_seen": 6856748, "num_tokens": 6856748.0, "step": 1205, "train_runtime": 6594.1889, "train_tokens_per_second": 1039.817 }, { "epoch": 0.968, "grad_norm": 46.75, "learning_rate": 5.6418745861593905e-08, "loss": 5.0326, "mean_token_accuracy": 0.8010287463665009, "num_input_tokens_seen": 6886116, "num_tokens": 6886116.0, "step": 1210, "train_runtime": 6621.5933, "train_tokens_per_second": 1039.949 }, { "epoch": 0.972, "grad_norm": 60.25, "learning_rate": 4.350652030981395e-08, "loss": 5.7011, "mean_token_accuracy": 0.7844116255640984, "num_input_tokens_seen": 6912415, "num_tokens": 6912415.0, "step": 1215, "train_runtime": 6646.1851, "train_tokens_per_second": 1040.058 }, { "epoch": 0.976, "grad_norm": 50.75, "learning_rate": 3.2266674310589276e-08, "loss": 5.481, "mean_token_accuracy": 0.7872747302055358, "num_input_tokens_seen": 6937018, "num_tokens": 6937018.0, "step": 1220, "train_runtime": 6670.2576, "train_tokens_per_second": 1039.993 }, { "epoch": 0.98, "grad_norm": 81.0, "learning_rate": 2.2701095806565432e-08, "loss": 5.5155, "mean_token_accuracy": 0.7937034830451012, "num_input_tokens_seen": 6968725, "num_tokens": 6968725.0, "step": 1225, "train_runtime": 6700.0416, "train_tokens_per_second": 1040.102 }, { "epoch": 0.984, "grad_norm": 42.75, "learning_rate": 1.4811391515799911e-08, "loss": 5.0932, "mean_token_accuracy": 0.7983416199684144, "num_input_tokens_seen": 6996046, "num_tokens": 6996046.0, "step": 1230, "train_runtime": 6725.8984, "train_tokens_per_second": 1040.165 }, { "epoch": 0.988, "grad_norm": 47.0, "learning_rate": 8.59888666189579e-09, "loss": 5.6719, "mean_token_accuracy": 0.783245125412941, "num_input_tokens_seen": 7026420, "num_tokens": 7026420.0, "step": 1235, "train_runtime": 6754.2343, "train_tokens_per_second": 1040.299 }, { "epoch": 0.992, "grad_norm": 211.0, "learning_rate": 4.064624751394242e-09, "loss": 5.7314, "mean_token_accuracy": 0.7820679724216462, "num_input_tokens_seen": 7056594, "num_tokens": 7056594.0, "step": 1240, "train_runtime": 6782.773, "train_tokens_per_second": 1040.37 }, { "epoch": 0.996, "grad_norm": 114.0, "learning_rate": 1.209367398504746e-09, "loss": 5.0323, "mean_token_accuracy": 0.8049638271331787, "num_input_tokens_seen": 7081035, "num_tokens": 7081035.0, "step": 1245, "train_runtime": 6807.8837, "train_tokens_per_second": 1040.123 }, { "epoch": 1.0, "grad_norm": 57.0, "learning_rate": 3.3594197175190743e-11, "loss": 5.217, "mean_token_accuracy": 0.8012784749269486, "num_input_tokens_seen": 7107438, "num_tokens": 7107438.0, "step": 1250, "train_runtime": 6833.6244, "train_tokens_per_second": 1040.069 }, { "epoch": 1.0, "num_input_tokens_seen": 7107438, "step": 1250, "total_flos": 1.4684498749056614e+17, "train_loss": 5.760735061645508, "train_runtime": 6833.6666, "train_samples_per_second": 1.463, "train_steps_per_second": 0.183, "train_tokens_per_second": 1040.062 } ], "logging_steps": 5, "max_steps": 1250, "num_input_tokens_seen": 7107438, "num_train_epochs": 1, "save_steps": 0, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.4684498749056614e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }