qwen-synth / trainer_state.json
geopar's picture
Upload folder using huggingface_hub
1ade774 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 500,
"global_step": 1250,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.004,
"grad_norm": 716.0,
"learning_rate": 2.105263157894737e-06,
"loss": 21.0165,
"mean_token_accuracy": 0.6174646154046058,
"num_input_tokens_seen": 27461,
"num_tokens": 27461.0,
"step": 5,
"train_runtime": 58.529,
"train_tokens_per_second": 469.186
},
{
"epoch": 0.008,
"grad_norm": 420.0,
"learning_rate": 4.736842105263158e-06,
"loss": 17.7281,
"mean_token_accuracy": 0.6165577113628388,
"num_input_tokens_seen": 55051,
"num_tokens": 55051.0,
"step": 10,
"train_runtime": 84.4052,
"train_tokens_per_second": 652.223
},
{
"epoch": 0.012,
"grad_norm": 284.0,
"learning_rate": 7.368421052631579e-06,
"loss": 15.0203,
"mean_token_accuracy": 0.6702655613422394,
"num_input_tokens_seen": 83951,
"num_tokens": 83951.0,
"step": 15,
"train_runtime": 111.8863,
"train_tokens_per_second": 750.324
},
{
"epoch": 0.016,
"grad_norm": 224.0,
"learning_rate": 1e-05,
"loss": 11.6076,
"mean_token_accuracy": 0.7083307519555092,
"num_input_tokens_seen": 109657,
"num_tokens": 109657.0,
"step": 20,
"train_runtime": 136.978,
"train_tokens_per_second": 800.544
},
{
"epoch": 0.02,
"grad_norm": 270.0,
"learning_rate": 1.263157894736842e-05,
"loss": 9.6461,
"mean_token_accuracy": 0.7309058234095573,
"num_input_tokens_seen": 139061,
"num_tokens": 139061.0,
"step": 25,
"train_runtime": 165.0733,
"train_tokens_per_second": 842.419
},
{
"epoch": 0.024,
"grad_norm": 50.25,
"learning_rate": 1.5263157894736846e-05,
"loss": 8.7466,
"mean_token_accuracy": 0.7324644327163696,
"num_input_tokens_seen": 173335,
"num_tokens": 173335.0,
"step": 30,
"train_runtime": 195.3505,
"train_tokens_per_second": 887.303
},
{
"epoch": 0.028,
"grad_norm": 101.0,
"learning_rate": 1.7894736842105264e-05,
"loss": 8.494,
"mean_token_accuracy": 0.7436975419521332,
"num_input_tokens_seen": 201778,
"num_tokens": 201778.0,
"step": 35,
"train_runtime": 223.7451,
"train_tokens_per_second": 901.821
},
{
"epoch": 0.032,
"grad_norm": 316.0,
"learning_rate": 1.9999966405802828e-05,
"loss": 7.8429,
"mean_token_accuracy": 0.733650079369545,
"num_input_tokens_seen": 234594,
"num_tokens": 234594.0,
"step": 40,
"train_runtime": 254.1938,
"train_tokens_per_second": 922.894
},
{
"epoch": 0.036,
"grad_norm": 67.0,
"learning_rate": 1.9998790632601496e-05,
"loss": 7.2857,
"mean_token_accuracy": 0.7769168972969055,
"num_input_tokens_seen": 263946,
"num_tokens": 263946.0,
"step": 45,
"train_runtime": 282.687,
"train_tokens_per_second": 933.704
},
{
"epoch": 0.04,
"grad_norm": 56.25,
"learning_rate": 1.9995935375248608e-05,
"loss": 7.2708,
"mean_token_accuracy": 0.7696839615702629,
"num_input_tokens_seen": 292127,
"num_tokens": 292127.0,
"step": 50,
"train_runtime": 309.685,
"train_tokens_per_second": 943.304
},
{
"epoch": 0.044,
"grad_norm": 79.0,
"learning_rate": 1.9991401113338103e-05,
"loss": 7.5369,
"mean_token_accuracy": 0.7532998159527778,
"num_input_tokens_seen": 318452,
"num_tokens": 318452.0,
"step": 55,
"train_runtime": 336.3519,
"train_tokens_per_second": 946.782
},
{
"epoch": 0.048,
"grad_norm": 52.75,
"learning_rate": 1.99851886084842e-05,
"loss": 7.4304,
"mean_token_accuracy": 0.7594221189618111,
"num_input_tokens_seen": 344023,
"num_tokens": 344023.0,
"step": 60,
"train_runtime": 361.6326,
"train_tokens_per_second": 951.305
},
{
"epoch": 0.052,
"grad_norm": 53.5,
"learning_rate": 1.9977298904193438e-05,
"loss": 7.1512,
"mean_token_accuracy": 0.7550393640995026,
"num_input_tokens_seen": 372600,
"num_tokens": 372600.0,
"step": 65,
"train_runtime": 389.9343,
"train_tokens_per_second": 955.546
},
{
"epoch": 0.056,
"grad_norm": 52.75,
"learning_rate": 1.9967733325689412e-05,
"loss": 6.7062,
"mean_token_accuracy": 0.7626852974295616,
"num_input_tokens_seen": 399843,
"num_tokens": 399843.0,
"step": 70,
"train_runtime": 415.5105,
"train_tokens_per_second": 962.293
},
{
"epoch": 0.06,
"grad_norm": 51.75,
"learning_rate": 1.995649347969019e-05,
"loss": 6.7049,
"mean_token_accuracy": 0.76889388859272,
"num_input_tokens_seen": 426039,
"num_tokens": 426039.0,
"step": 75,
"train_runtime": 440.456,
"train_tokens_per_second": 967.268
},
{
"epoch": 0.064,
"grad_norm": 171.0,
"learning_rate": 1.994358125413841e-05,
"loss": 6.1025,
"mean_token_accuracy": 0.7738867923617363,
"num_input_tokens_seen": 454551,
"num_tokens": 454551.0,
"step": 80,
"train_runtime": 468.9331,
"train_tokens_per_second": 969.33
},
{
"epoch": 0.068,
"grad_norm": 207.0,
"learning_rate": 1.9928998817884185e-05,
"loss": 6.4376,
"mean_token_accuracy": 0.772473418712616,
"num_input_tokens_seen": 480692,
"num_tokens": 480692.0,
"step": 85,
"train_runtime": 493.6013,
"train_tokens_per_second": 973.847
},
{
"epoch": 0.072,
"grad_norm": 214.0,
"learning_rate": 1.9912748620320796e-05,
"loss": 6.6575,
"mean_token_accuracy": 0.7715476334095002,
"num_input_tokens_seen": 509235,
"num_tokens": 509235.0,
"step": 90,
"train_runtime": 521.9092,
"train_tokens_per_second": 975.716
},
{
"epoch": 0.076,
"grad_norm": 55.0,
"learning_rate": 1.9894833390973266e-05,
"loss": 6.8634,
"mean_token_accuracy": 0.761625699698925,
"num_input_tokens_seen": 541734,
"num_tokens": 541734.0,
"step": 95,
"train_runtime": 551.2186,
"train_tokens_per_second": 982.793
},
{
"epoch": 0.08,
"grad_norm": 79.0,
"learning_rate": 1.98752561390399e-05,
"loss": 6.1263,
"mean_token_accuracy": 0.7742740377783776,
"num_input_tokens_seen": 572168,
"num_tokens": 572168.0,
"step": 100,
"train_runtime": 579.6467,
"train_tokens_per_second": 987.098
},
{
"epoch": 0.084,
"grad_norm": 89.5,
"learning_rate": 1.9854020152886816e-05,
"loss": 6.3594,
"mean_token_accuracy": 0.7709073334932327,
"num_input_tokens_seen": 598451,
"num_tokens": 598451.0,
"step": 105,
"train_runtime": 605.7065,
"train_tokens_per_second": 988.021
},
{
"epoch": 0.088,
"grad_norm": 260.0,
"learning_rate": 1.9831128999495605e-05,
"loss": 6.1262,
"mean_token_accuracy": 0.7783429339528084,
"num_input_tokens_seen": 626393,
"num_tokens": 626393.0,
"step": 110,
"train_runtime": 633.7147,
"train_tokens_per_second": 988.446
},
{
"epoch": 0.092,
"grad_norm": 46.25,
"learning_rate": 1.9806586523864212e-05,
"loss": 6.1485,
"mean_token_accuracy": 0.7764704540371895,
"num_input_tokens_seen": 649813,
"num_tokens": 649813.0,
"step": 115,
"train_runtime": 657.2228,
"train_tokens_per_second": 988.726
},
{
"epoch": 0.096,
"grad_norm": 81.0,
"learning_rate": 1.978039684836106e-05,
"loss": 5.7248,
"mean_token_accuracy": 0.7734859913587571,
"num_input_tokens_seen": 676687,
"num_tokens": 676687.0,
"step": 120,
"train_runtime": 683.5183,
"train_tokens_per_second": 990.006
},
{
"epoch": 0.1,
"grad_norm": 44.0,
"learning_rate": 1.9752564372032655e-05,
"loss": 5.9166,
"mean_token_accuracy": 0.7785823866724968,
"num_input_tokens_seen": 708413,
"num_tokens": 708413.0,
"step": 125,
"train_runtime": 714.4677,
"train_tokens_per_second": 991.526
},
{
"epoch": 0.104,
"grad_norm": 624.0,
"learning_rate": 1.9723093769864663e-05,
"loss": 6.1637,
"mean_token_accuracy": 0.7725436985492706,
"num_input_tokens_seen": 733682,
"num_tokens": 733682.0,
"step": 130,
"train_runtime": 740.5204,
"train_tokens_per_second": 990.765
},
{
"epoch": 0.108,
"grad_norm": 63.75,
"learning_rate": 1.9691989991996663e-05,
"loss": 6.0692,
"mean_token_accuracy": 0.7738721042871475,
"num_input_tokens_seen": 763284,
"num_tokens": 763284.0,
"step": 135,
"train_runtime": 769.434,
"train_tokens_per_second": 992.007
},
{
"epoch": 0.112,
"grad_norm": 71.5,
"learning_rate": 1.9659258262890683e-05,
"loss": 5.9618,
"mean_token_accuracy": 0.7771038174629211,
"num_input_tokens_seen": 791236,
"num_tokens": 791236.0,
"step": 140,
"train_runtime": 795.0855,
"train_tokens_per_second": 995.158
},
{
"epoch": 0.116,
"grad_norm": 50.5,
"learning_rate": 1.9624904080453656e-05,
"loss": 6.2847,
"mean_token_accuracy": 0.768642008304596,
"num_input_tokens_seen": 818107,
"num_tokens": 818107.0,
"step": 145,
"train_runtime": 821.0436,
"train_tokens_per_second": 996.423
},
{
"epoch": 0.12,
"grad_norm": 113.5,
"learning_rate": 1.9588933215113926e-05,
"loss": 6.0329,
"mean_token_accuracy": 0.7712928548455238,
"num_input_tokens_seen": 846017,
"num_tokens": 846017.0,
"step": 150,
"train_runtime": 849.152,
"train_tokens_per_second": 996.308
},
{
"epoch": 0.124,
"grad_norm": 564.0,
"learning_rate": 1.955135170885202e-05,
"loss": 5.7569,
"mean_token_accuracy": 0.7793401271104813,
"num_input_tokens_seen": 879137,
"num_tokens": 879137.0,
"step": 155,
"train_runtime": 879.9983,
"train_tokens_per_second": 999.021
},
{
"epoch": 0.128,
"grad_norm": 59.25,
"learning_rate": 1.9512165874185768e-05,
"loss": 5.9181,
"mean_token_accuracy": 0.7819835215806961,
"num_input_tokens_seen": 905824,
"num_tokens": 905824.0,
"step": 160,
"train_runtime": 905.886,
"train_tokens_per_second": 999.932
},
{
"epoch": 0.132,
"grad_norm": 85.5,
"learning_rate": 1.9471382293110004e-05,
"loss": 5.6098,
"mean_token_accuracy": 0.7914141818881035,
"num_input_tokens_seen": 932292,
"num_tokens": 932292.0,
"step": 165,
"train_runtime": 931.5649,
"train_tokens_per_second": 1000.781
},
{
"epoch": 0.136,
"grad_norm": 58.0,
"learning_rate": 1.9429007815990995e-05,
"loss": 6.4777,
"mean_token_accuracy": 0.763878983259201,
"num_input_tokens_seen": 965956,
"num_tokens": 965956.0,
"step": 170,
"train_runtime": 962.6472,
"train_tokens_per_second": 1003.437
},
{
"epoch": 0.14,
"grad_norm": 130.0,
"learning_rate": 1.9385049560415794e-05,
"loss": 6.0058,
"mean_token_accuracy": 0.7746358260512352,
"num_input_tokens_seen": 992908,
"num_tokens": 992908.0,
"step": 175,
"train_runtime": 988.7648,
"train_tokens_per_second": 1004.19
},
{
"epoch": 0.144,
"grad_norm": 620.0,
"learning_rate": 1.9339514909996706e-05,
"loss": 6.0002,
"mean_token_accuracy": 0.7761899515986442,
"num_input_tokens_seen": 1027079,
"num_tokens": 1027079.0,
"step": 180,
"train_runtime": 1019.9491,
"train_tokens_per_second": 1006.99
},
{
"epoch": 0.148,
"grad_norm": 83.5,
"learning_rate": 1.929241151313108e-05,
"loss": 5.872,
"mean_token_accuracy": 0.7852786988019943,
"num_input_tokens_seen": 1056613,
"num_tokens": 1056613.0,
"step": 185,
"train_runtime": 1048.5847,
"train_tokens_per_second": 1007.656
},
{
"epoch": 0.152,
"grad_norm": 840.0,
"learning_rate": 1.9243747281716604e-05,
"loss": 5.682,
"mean_token_accuracy": 0.7868907496333122,
"num_input_tokens_seen": 1085315,
"num_tokens": 1085315.0,
"step": 190,
"train_runtime": 1074.4166,
"train_tokens_per_second": 1010.144
},
{
"epoch": 0.156,
"grad_norm": 81.5,
"learning_rate": 1.9193530389822364e-05,
"loss": 5.5209,
"mean_token_accuracy": 0.7940514251589775,
"num_input_tokens_seen": 1115115,
"num_tokens": 1115115.0,
"step": 195,
"train_runtime": 1103.2538,
"train_tokens_per_second": 1010.751
},
{
"epoch": 0.16,
"grad_norm": 100.5,
"learning_rate": 1.9141769272315857e-05,
"loss": 5.5272,
"mean_token_accuracy": 0.7890612185001373,
"num_input_tokens_seen": 1139079,
"num_tokens": 1139079.0,
"step": 200,
"train_runtime": 1127.1395,
"train_tokens_per_second": 1010.593
},
{
"epoch": 0.164,
"grad_norm": 62.25,
"learning_rate": 1.9088472623446182e-05,
"loss": 5.7363,
"mean_token_accuracy": 0.7819481372833252,
"num_input_tokens_seen": 1171419,
"num_tokens": 1171419.0,
"step": 205,
"train_runtime": 1158.0624,
"train_tokens_per_second": 1011.534
},
{
"epoch": 0.168,
"grad_norm": 169.0,
"learning_rate": 1.90336493953837e-05,
"loss": 5.914,
"mean_token_accuracy": 0.7838666513562202,
"num_input_tokens_seen": 1199784,
"num_tokens": 1199784.0,
"step": 210,
"train_runtime": 1184.5189,
"train_tokens_per_second": 1012.887
},
{
"epoch": 0.172,
"grad_norm": 78.0,
"learning_rate": 1.897730879671634e-05,
"loss": 5.3131,
"mean_token_accuracy": 0.7945496052503586,
"num_input_tokens_seen": 1224308,
"num_tokens": 1224308.0,
"step": 215,
"train_runtime": 1209.327,
"train_tokens_per_second": 1012.388
},
{
"epoch": 0.176,
"grad_norm": 1656.0,
"learning_rate": 1.891946029090283e-05,
"loss": 5.6709,
"mean_token_accuracy": 0.7843748390674591,
"num_input_tokens_seen": 1252051,
"num_tokens": 1252051.0,
"step": 220,
"train_runtime": 1236.4054,
"train_tokens_per_second": 1012.654
},
{
"epoch": 0.18,
"grad_norm": 95.5,
"learning_rate": 1.8860113594683148e-05,
"loss": 5.9523,
"mean_token_accuracy": 0.7736531987786293,
"num_input_tokens_seen": 1278020,
"num_tokens": 1278020.0,
"step": 225,
"train_runtime": 1262.0193,
"train_tokens_per_second": 1012.679
},
{
"epoch": 0.184,
"grad_norm": 75.0,
"learning_rate": 1.8799278676446425e-05,
"loss": 5.5729,
"mean_token_accuracy": 0.7859392121434212,
"num_input_tokens_seen": 1307055,
"num_tokens": 1307055.0,
"step": 230,
"train_runtime": 1289.4069,
"train_tokens_per_second": 1013.687
},
{
"epoch": 0.188,
"grad_norm": 54.75,
"learning_rate": 1.8736965754556527e-05,
"loss": 5.6597,
"mean_token_accuracy": 0.7832028537988662,
"num_input_tokens_seen": 1332482,
"num_tokens": 1332482.0,
"step": 235,
"train_runtime": 1314.3047,
"train_tokens_per_second": 1013.83
},
{
"epoch": 0.192,
"grad_norm": 60.75,
"learning_rate": 1.867318529563574e-05,
"loss": 6.0631,
"mean_token_accuracy": 0.7801717698574067,
"num_input_tokens_seen": 1359469,
"num_tokens": 1359469.0,
"step": 240,
"train_runtime": 1341.0816,
"train_tokens_per_second": 1013.711
},
{
"epoch": 0.196,
"grad_norm": 72.0,
"learning_rate": 1.8607948012806664e-05,
"loss": 5.7384,
"mean_token_accuracy": 0.7812554731965065,
"num_input_tokens_seen": 1390888,
"num_tokens": 1390888.0,
"step": 245,
"train_runtime": 1371.2979,
"train_tokens_per_second": 1014.286
},
{
"epoch": 0.2,
"grad_norm": 96.0,
"learning_rate": 1.8541264863892755e-05,
"loss": 5.871,
"mean_token_accuracy": 0.7782064586877823,
"num_input_tokens_seen": 1414969,
"num_tokens": 1414969.0,
"step": 250,
"train_runtime": 1395.692,
"train_tokens_per_second": 1013.812
},
{
"epoch": 0.204,
"grad_norm": 43.25,
"learning_rate": 1.8473147049577777e-05,
"loss": 5.9369,
"mean_token_accuracy": 0.7823293015360833,
"num_input_tokens_seen": 1441261,
"num_tokens": 1441261.0,
"step": 255,
"train_runtime": 1420.1833,
"train_tokens_per_second": 1014.842
},
{
"epoch": 0.208,
"grad_norm": 1168.0,
"learning_rate": 1.84036060115244e-05,
"loss": 5.7154,
"mean_token_accuracy": 0.7839734643697739,
"num_input_tokens_seen": 1470281,
"num_tokens": 1470281.0,
"step": 260,
"train_runtime": 1447.4831,
"train_tokens_per_second": 1015.75
},
{
"epoch": 0.212,
"grad_norm": 65.0,
"learning_rate": 1.8332653430452375e-05,
"loss": 5.4737,
"mean_token_accuracy": 0.7898797050118447,
"num_input_tokens_seen": 1502818,
"num_tokens": 1502818.0,
"step": 265,
"train_runtime": 1477.0712,
"train_tokens_per_second": 1017.431
},
{
"epoch": 0.216,
"grad_norm": 175.0,
"learning_rate": 1.826030122417656e-05,
"loss": 5.5671,
"mean_token_accuracy": 0.780795156955719,
"num_input_tokens_seen": 1531913,
"num_tokens": 1531913.0,
"step": 270,
"train_runtime": 1503.8595,
"train_tokens_per_second": 1018.654
},
{
"epoch": 0.22,
"grad_norm": 62.75,
"learning_rate": 1.8186561545605055e-05,
"loss": 5.9096,
"mean_token_accuracy": 0.7873492911458015,
"num_input_tokens_seen": 1563135,
"num_tokens": 1563135.0,
"step": 275,
"train_runtime": 1532.3365,
"train_tokens_per_second": 1020.099
},
{
"epoch": 0.224,
"grad_norm": 200.0,
"learning_rate": 1.811144678069793e-05,
"loss": 5.2624,
"mean_token_accuracy": 0.7928400427103043,
"num_input_tokens_seen": 1591144,
"num_tokens": 1591144.0,
"step": 280,
"train_runtime": 1560.0406,
"train_tokens_per_second": 1019.938
},
{
"epoch": 0.228,
"grad_norm": 38.5,
"learning_rate": 1.803496954638676e-05,
"loss": 5.4277,
"mean_token_accuracy": 0.7846548587083817,
"num_input_tokens_seen": 1621253,
"num_tokens": 1621253.0,
"step": 285,
"train_runtime": 1590.4126,
"train_tokens_per_second": 1019.391
},
{
"epoch": 0.232,
"grad_norm": 67.0,
"learning_rate": 1.7957142688455362e-05,
"loss": 5.6897,
"mean_token_accuracy": 0.7835722789168358,
"num_input_tokens_seen": 1646466,
"num_tokens": 1646466.0,
"step": 290,
"train_runtime": 1615.728,
"train_tokens_per_second": 1019.024
},
{
"epoch": 0.236,
"grad_norm": 121.5,
"learning_rate": 1.7877979279382135e-05,
"loss": 5.8008,
"mean_token_accuracy": 0.777139276266098,
"num_input_tokens_seen": 1674583,
"num_tokens": 1674583.0,
"step": 295,
"train_runtime": 1641.5179,
"train_tokens_per_second": 1020.143
},
{
"epoch": 0.24,
"grad_norm": 43.5,
"learning_rate": 1.7797492616144256e-05,
"loss": 5.794,
"mean_token_accuracy": 0.7790872991085053,
"num_input_tokens_seen": 1702490,
"num_tokens": 1702490.0,
"step": 300,
"train_runtime": 1667.5303,
"train_tokens_per_second": 1020.965
},
{
"epoch": 0.244,
"grad_norm": 84.0,
"learning_rate": 1.7715696217984233e-05,
"loss": 5.7744,
"mean_token_accuracy": 0.7830787718296051,
"num_input_tokens_seen": 1732134,
"num_tokens": 1732134.0,
"step": 305,
"train_runtime": 1695.751,
"train_tokens_per_second": 1021.455
},
{
"epoch": 0.248,
"grad_norm": 97.5,
"learning_rate": 1.7632603824139086e-05,
"loss": 5.7534,
"mean_token_accuracy": 0.7765944376587868,
"num_input_tokens_seen": 1760431,
"num_tokens": 1760431.0,
"step": 310,
"train_runtime": 1721.7874,
"train_tokens_per_second": 1022.444
},
{
"epoch": 0.252,
"grad_norm": 214.0,
"learning_rate": 1.7548229391532572e-05,
"loss": 5.5022,
"mean_token_accuracy": 0.7858106374740601,
"num_input_tokens_seen": 1786890,
"num_tokens": 1786890.0,
"step": 315,
"train_runtime": 1746.8899,
"train_tokens_per_second": 1022.898
},
{
"epoch": 0.256,
"grad_norm": 128.0,
"learning_rate": 1.7462587092430877e-05,
"loss": 5.6599,
"mean_token_accuracy": 0.7865968465805053,
"num_input_tokens_seen": 1812677,
"num_tokens": 1812677.0,
"step": 320,
"train_runtime": 1774.44,
"train_tokens_per_second": 1021.549
},
{
"epoch": 0.26,
"grad_norm": 75.0,
"learning_rate": 1.7375691312062102e-05,
"loss": 5.5823,
"mean_token_accuracy": 0.798132348060608,
"num_input_tokens_seen": 1844191,
"num_tokens": 1844191.0,
"step": 325,
"train_runtime": 1803.6261,
"train_tokens_per_second": 1022.491
},
{
"epoch": 0.264,
"grad_norm": 68.0,
"learning_rate": 1.728755664620002e-05,
"loss": 5.4933,
"mean_token_accuracy": 0.7956582695245743,
"num_input_tokens_seen": 1873707,
"num_tokens": 1873707.0,
"step": 330,
"train_runtime": 1831.8481,
"train_tokens_per_second": 1022.851
},
{
"epoch": 0.268,
"grad_norm": 174.0,
"learning_rate": 1.7198197898712402e-05,
"loss": 5.6207,
"mean_token_accuracy": 0.7940072804689408,
"num_input_tokens_seen": 1897405,
"num_tokens": 1897405.0,
"step": 335,
"train_runtime": 1855.6387,
"train_tokens_per_second": 1022.508
},
{
"epoch": 0.272,
"grad_norm": 87.0,
"learning_rate": 1.7107630079074477e-05,
"loss": 5.3281,
"mean_token_accuracy": 0.7956176668405532,
"num_input_tokens_seen": 1924306,
"num_tokens": 1924306.0,
"step": 340,
"train_runtime": 1882.2812,
"train_tokens_per_second": 1022.327
},
{
"epoch": 0.276,
"grad_norm": 58.0,
"learning_rate": 1.7015868399847768e-05,
"loss": 5.3952,
"mean_token_accuracy": 0.795125538110733,
"num_input_tokens_seen": 1957493,
"num_tokens": 1957493.0,
"step": 345,
"train_runtime": 1912.1661,
"train_tokens_per_second": 1023.704
},
{
"epoch": 0.28,
"grad_norm": 69.5,
"learning_rate": 1.6922928274124887e-05,
"loss": 5.5721,
"mean_token_accuracy": 0.786569619178772,
"num_input_tokens_seen": 1989461,
"num_tokens": 1989461.0,
"step": 350,
"train_runtime": 1941.3276,
"train_tokens_per_second": 1024.794
},
{
"epoch": 0.284,
"grad_norm": 52.75,
"learning_rate": 1.6828825312940594e-05,
"loss": 5.6287,
"mean_token_accuracy": 0.7822471752762794,
"num_input_tokens_seen": 2018840,
"num_tokens": 2018840.0,
"step": 355,
"train_runtime": 1968.6202,
"train_tokens_per_second": 1025.51
},
{
"epoch": 0.288,
"grad_norm": 78.0,
"learning_rate": 1.673357532264966e-05,
"loss": 5.6751,
"mean_token_accuracy": 0.7819222688674927,
"num_input_tokens_seen": 2046554,
"num_tokens": 2046554.0,
"step": 360,
"train_runtime": 1994.9582,
"train_tokens_per_second": 1025.863
},
{
"epoch": 0.292,
"grad_norm": 58.25,
"learning_rate": 1.663719430227186e-05,
"loss": 5.3833,
"mean_token_accuracy": 0.7945975109934806,
"num_input_tokens_seen": 2074389,
"num_tokens": 2074389.0,
"step": 365,
"train_runtime": 2023.2385,
"train_tokens_per_second": 1025.282
},
{
"epoch": 0.296,
"grad_norm": 63.25,
"learning_rate": 1.653969844080466e-05,
"loss": 5.3062,
"mean_token_accuracy": 0.7965321630239487,
"num_input_tokens_seen": 2102788,
"num_tokens": 2102788.0,
"step": 370,
"train_runtime": 2051.2147,
"train_tokens_per_second": 1025.143
},
{
"epoch": 0.3,
"grad_norm": 169.0,
"learning_rate": 1.644110411450398e-05,
"loss": 5.6184,
"mean_token_accuracy": 0.7859084010124207,
"num_input_tokens_seen": 2127585,
"num_tokens": 2127585.0,
"step": 375,
"train_runtime": 2076.5681,
"train_tokens_per_second": 1024.568
},
{
"epoch": 0.304,
"grad_norm": 67.0,
"learning_rate": 1.634142788413346e-05,
"loss": 5.7921,
"mean_token_accuracy": 0.7838741362094879,
"num_input_tokens_seen": 2153488,
"num_tokens": 2153488.0,
"step": 380,
"train_runtime": 2101.2637,
"train_tokens_per_second": 1024.854
},
{
"epoch": 0.308,
"grad_norm": 139.0,
"learning_rate": 1.6240686492182806e-05,
"loss": 5.7157,
"mean_token_accuracy": 0.7820939481258392,
"num_input_tokens_seen": 2180494,
"num_tokens": 2180494.0,
"step": 385,
"train_runtime": 2127.9074,
"train_tokens_per_second": 1024.713
},
{
"epoch": 0.312,
"grad_norm": 45.25,
"learning_rate": 1.6138896860055555e-05,
"loss": 5.3245,
"mean_token_accuracy": 0.7927197381854058,
"num_input_tokens_seen": 2209057,
"num_tokens": 2209057.0,
"step": 390,
"train_runtime": 2153.5427,
"train_tokens_per_second": 1025.778
},
{
"epoch": 0.316,
"grad_norm": 131.0,
"learning_rate": 1.6036076085226813e-05,
"loss": 5.2268,
"mean_token_accuracy": 0.7993880152702332,
"num_input_tokens_seen": 2238624,
"num_tokens": 2238624.0,
"step": 395,
"train_runtime": 2181.8136,
"train_tokens_per_second": 1026.038
},
{
"epoch": 0.32,
"grad_norm": 94.5,
"learning_rate": 1.593224143837142e-05,
"loss": 5.6083,
"mean_token_accuracy": 0.7860068812966347,
"num_input_tokens_seen": 2266912,
"num_tokens": 2266912.0,
"step": 400,
"train_runtime": 2208.6845,
"train_tokens_per_second": 1026.363
},
{
"epoch": 0.324,
"grad_norm": 67.5,
"learning_rate": 1.582741036046301e-05,
"loss": 5.6723,
"mean_token_accuracy": 0.7925399646162987,
"num_input_tokens_seen": 2298813,
"num_tokens": 2298813.0,
"step": 405,
"train_runtime": 2237.0379,
"train_tokens_per_second": 1027.615
},
{
"epoch": 0.328,
"grad_norm": 39.0,
"learning_rate": 1.572160045984447e-05,
"loss": 5.3201,
"mean_token_accuracy": 0.8001187354326248,
"num_input_tokens_seen": 2331446,
"num_tokens": 2331446.0,
"step": 410,
"train_runtime": 2266.6842,
"train_tokens_per_second": 1028.571
},
{
"epoch": 0.332,
"grad_norm": 39.0,
"learning_rate": 1.561482950927029e-05,
"loss": 5.587,
"mean_token_accuracy": 0.7832968756556511,
"num_input_tokens_seen": 2363201,
"num_tokens": 2363201.0,
"step": 415,
"train_runtime": 2296.0603,
"train_tokens_per_second": 1029.242
},
{
"epoch": 0.336,
"grad_norm": 142.0,
"learning_rate": 1.550711544292131e-05,
"loss": 5.6473,
"mean_token_accuracy": 0.7904362455010414,
"num_input_tokens_seen": 2389216,
"num_tokens": 2389216.0,
"step": 420,
"train_runtime": 2320.7153,
"train_tokens_per_second": 1029.517
},
{
"epoch": 0.34,
"grad_norm": 47.5,
"learning_rate": 1.5398476353392323e-05,
"loss": 5.4438,
"mean_token_accuracy": 0.7908162623643875,
"num_input_tokens_seen": 2419743,
"num_tokens": 2419743.0,
"step": 425,
"train_runtime": 2350.136,
"train_tokens_per_second": 1029.618
},
{
"epoch": 0.344,
"grad_norm": 49.25,
"learning_rate": 1.5288930488653094e-05,
"loss": 5.2794,
"mean_token_accuracy": 0.7933614462614059,
"num_input_tokens_seen": 2447011,
"num_tokens": 2447011.0,
"step": 430,
"train_runtime": 2375.2075,
"train_tokens_per_second": 1030.23
},
{
"epoch": 0.348,
"grad_norm": 83.5,
"learning_rate": 1.5178496248983254e-05,
"loss": 6.0266,
"mean_token_accuracy": 0.7747324109077454,
"num_input_tokens_seen": 2475216,
"num_tokens": 2475216.0,
"step": 435,
"train_runtime": 2401.9371,
"train_tokens_per_second": 1030.508
},
{
"epoch": 0.352,
"grad_norm": 89.0,
"learning_rate": 1.5067192183881658e-05,
"loss": 5.4756,
"mean_token_accuracy": 0.7997275143861771,
"num_input_tokens_seen": 2502081,
"num_tokens": 2502081.0,
"step": 440,
"train_runtime": 2428.7281,
"train_tokens_per_second": 1030.202
},
{
"epoch": 0.356,
"grad_norm": 52.0,
"learning_rate": 1.4955036988950617e-05,
"loss": 6.1068,
"mean_token_accuracy": 0.7708079561591148,
"num_input_tokens_seen": 2529391,
"num_tokens": 2529391.0,
"step": 445,
"train_runtime": 2455.553,
"train_tokens_per_second": 1030.07
},
{
"epoch": 0.36,
"grad_norm": 127.5,
"learning_rate": 1.484204950275565e-05,
"loss": 5.5882,
"mean_token_accuracy": 0.7801107332110405,
"num_input_tokens_seen": 2554232,
"num_tokens": 2554232.0,
"step": 450,
"train_runtime": 2479.6181,
"train_tokens_per_second": 1030.091
},
{
"epoch": 0.364,
"grad_norm": 49.5,
"learning_rate": 1.4728248703661183e-05,
"loss": 5.4756,
"mean_token_accuracy": 0.7862519830465317,
"num_input_tokens_seen": 2582182,
"num_tokens": 2582182.0,
"step": 455,
"train_runtime": 2505.889,
"train_tokens_per_second": 1030.445
},
{
"epoch": 0.368,
"grad_norm": 66.5,
"learning_rate": 1.461365370664276e-05,
"loss": 5.3923,
"mean_token_accuracy": 0.7940330818295479,
"num_input_tokens_seen": 2617794,
"num_tokens": 2617794.0,
"step": 460,
"train_runtime": 2539.941,
"train_tokens_per_second": 1030.652
},
{
"epoch": 0.372,
"grad_norm": 104.5,
"learning_rate": 1.4498283760076362e-05,
"loss": 5.5707,
"mean_token_accuracy": 0.7927709832787514,
"num_input_tokens_seen": 2643272,
"num_tokens": 2643272.0,
"step": 465,
"train_runtime": 2564.5252,
"train_tokens_per_second": 1030.706
},
{
"epoch": 0.376,
"grad_norm": 59.25,
"learning_rate": 1.4382158242505236e-05,
"loss": 5.644,
"mean_token_accuracy": 0.7871902465820313,
"num_input_tokens_seen": 2671869,
"num_tokens": 2671869.0,
"step": 470,
"train_runtime": 2592.8854,
"train_tokens_per_second": 1030.462
},
{
"epoch": 0.38,
"grad_norm": 31.625,
"learning_rate": 1.4265296659384956e-05,
"loss": 5.6562,
"mean_token_accuracy": 0.7877990290522575,
"num_input_tokens_seen": 2702041,
"num_tokens": 2702041.0,
"step": 475,
"train_runtime": 2621.0737,
"train_tokens_per_second": 1030.891
},
{
"epoch": 0.384,
"grad_norm": 86.5,
"learning_rate": 1.4147718639807071e-05,
"loss": 5.621,
"mean_token_accuracy": 0.7925810098648072,
"num_input_tokens_seen": 2732153,
"num_tokens": 2732153.0,
"step": 480,
"train_runtime": 2649.9389,
"train_tokens_per_second": 1031.025
},
{
"epoch": 0.388,
"grad_norm": 74.5,
"learning_rate": 1.4029443933202059e-05,
"loss": 5.4204,
"mean_token_accuracy": 0.7914870575070381,
"num_input_tokens_seen": 2758024,
"num_tokens": 2758024.0,
"step": 485,
"train_runtime": 2676.6477,
"train_tokens_per_second": 1030.402
},
{
"epoch": 0.392,
"grad_norm": 70.5,
"learning_rate": 1.3910492406022033e-05,
"loss": 5.7675,
"mean_token_accuracy": 0.7754184618592262,
"num_input_tokens_seen": 2786096,
"num_tokens": 2786096.0,
"step": 490,
"train_runtime": 2703.9298,
"train_tokens_per_second": 1030.388
},
{
"epoch": 0.396,
"grad_norm": 57.0,
"learning_rate": 1.3790884038403796e-05,
"loss": 5.5642,
"mean_token_accuracy": 0.7880642995238304,
"num_input_tokens_seen": 2814487,
"num_tokens": 2814487.0,
"step": 495,
"train_runtime": 2731.3935,
"train_tokens_per_second": 1030.422
},
{
"epoch": 0.4,
"grad_norm": 248.0,
"learning_rate": 1.36706389208128e-05,
"loss": 5.5255,
"mean_token_accuracy": 0.7882839411497116,
"num_input_tokens_seen": 2847405,
"num_tokens": 2847405.0,
"step": 500,
"train_runtime": 2762.1229,
"train_tokens_per_second": 1030.876
},
{
"epoch": 0.404,
"grad_norm": 98.0,
"learning_rate": 1.354977725066859e-05,
"loss": 5.3838,
"mean_token_accuracy": 0.7869982674717904,
"num_input_tokens_seen": 2876359,
"num_tokens": 2876359.0,
"step": 505,
"train_runtime": 2787.1872,
"train_tokens_per_second": 1031.993
},
{
"epoch": 0.408,
"grad_norm": 352.0,
"learning_rate": 1.3428319328952254e-05,
"loss": 5.4099,
"mean_token_accuracy": 0.7843140512704849,
"num_input_tokens_seen": 2902835,
"num_tokens": 2902835.0,
"step": 510,
"train_runtime": 2811.8431,
"train_tokens_per_second": 1032.36
},
{
"epoch": 0.412,
"grad_norm": 112.0,
"learning_rate": 1.3306285556796494e-05,
"loss": 5.3675,
"mean_token_accuracy": 0.7961455345153808,
"num_input_tokens_seen": 2932928,
"num_tokens": 2932928.0,
"step": 515,
"train_runtime": 2839.8383,
"train_tokens_per_second": 1032.78
},
{
"epoch": 0.416,
"grad_norm": 163.0,
"learning_rate": 1.3183696432058889e-05,
"loss": 5.2575,
"mean_token_accuracy": 0.7971223339438438,
"num_input_tokens_seen": 2957517,
"num_tokens": 2957517.0,
"step": 520,
"train_runtime": 2865.8475,
"train_tokens_per_second": 1031.987
},
{
"epoch": 0.42,
"grad_norm": 56.0,
"learning_rate": 1.3060572545878875e-05,
"loss": 5.4625,
"mean_token_accuracy": 0.7884187951683999,
"num_input_tokens_seen": 2987924,
"num_tokens": 2987924.0,
"step": 525,
"train_runtime": 2893.0838,
"train_tokens_per_second": 1032.782
},
{
"epoch": 0.424,
"grad_norm": 84.5,
"learning_rate": 1.2936934579219094e-05,
"loss": 4.9978,
"mean_token_accuracy": 0.8107392936944962,
"num_input_tokens_seen": 3014169,
"num_tokens": 3014169.0,
"step": 530,
"train_runtime": 2919.0783,
"train_tokens_per_second": 1032.576
},
{
"epoch": 0.428,
"grad_norm": 66.5,
"learning_rate": 1.2812803299391629e-05,
"loss": 5.7573,
"mean_token_accuracy": 0.789200983941555,
"num_input_tokens_seen": 3046708,
"num_tokens": 3046708.0,
"step": 535,
"train_runtime": 2948.6574,
"train_tokens_per_second": 1033.253
},
{
"epoch": 0.432,
"grad_norm": 70.5,
"learning_rate": 1.2688199556569753e-05,
"loss": 5.4901,
"mean_token_accuracy": 0.7852542266249657,
"num_input_tokens_seen": 3074318,
"num_tokens": 3074318.0,
"step": 540,
"train_runtime": 2975.382,
"train_tokens_per_second": 1033.252
},
{
"epoch": 0.436,
"grad_norm": 146.0,
"learning_rate": 1.2563144280285742e-05,
"loss": 5.1747,
"mean_token_accuracy": 0.8006948977708817,
"num_input_tokens_seen": 3102044,
"num_tokens": 3102044.0,
"step": 545,
"train_runtime": 3001.1559,
"train_tokens_per_second": 1033.616
},
{
"epoch": 0.44,
"grad_norm": 63.5,
"learning_rate": 1.2437658475915378e-05,
"loss": 5.5294,
"mean_token_accuracy": 0.7853314474225044,
"num_input_tokens_seen": 3129406,
"num_tokens": 3129406.0,
"step": 550,
"train_runtime": 3027.9577,
"train_tokens_per_second": 1033.504
},
{
"epoch": 0.444,
"grad_norm": 72.5,
"learning_rate": 1.23117632211497e-05,
"loss": 5.1169,
"mean_token_accuracy": 0.8012081518769264,
"num_input_tokens_seen": 3159496,
"num_tokens": 3159496.0,
"step": 555,
"train_runtime": 3057.971,
"train_tokens_per_second": 1033.2
},
{
"epoch": 0.448,
"grad_norm": 102.5,
"learning_rate": 1.2185479662454596e-05,
"loss": 5.5137,
"mean_token_accuracy": 0.7913802459836006,
"num_input_tokens_seen": 3185619,
"num_tokens": 3185619.0,
"step": 560,
"train_runtime": 3083.8761,
"train_tokens_per_second": 1032.992
},
{
"epoch": 0.452,
"grad_norm": 132.0,
"learning_rate": 1.2058829011518896e-05,
"loss": 5.2765,
"mean_token_accuracy": 0.7947102382779121,
"num_input_tokens_seen": 3212881,
"num_tokens": 3212881.0,
"step": 565,
"train_runtime": 3109.5104,
"train_tokens_per_second": 1033.243
},
{
"epoch": 0.456,
"grad_norm": 167.0,
"learning_rate": 1.193183254169142e-05,
"loss": 5.3215,
"mean_token_accuracy": 0.7931090787053108,
"num_input_tokens_seen": 3240673,
"num_tokens": 3240673.0,
"step": 570,
"train_runtime": 3136.4109,
"train_tokens_per_second": 1033.242
},
{
"epoch": 0.46,
"grad_norm": 40.5,
"learning_rate": 1.1804511584407763e-05,
"loss": 5.9359,
"mean_token_accuracy": 0.7751367390155792,
"num_input_tokens_seen": 3272505,
"num_tokens": 3272505.0,
"step": 575,
"train_runtime": 3164.4979,
"train_tokens_per_second": 1034.131
},
{
"epoch": 0.464,
"grad_norm": 108.0,
"learning_rate": 1.1676887525607272e-05,
"loss": 5.4725,
"mean_token_accuracy": 0.7907213315367698,
"num_input_tokens_seen": 3302432,
"num_tokens": 3302432.0,
"step": 580,
"train_runtime": 3193.0515,
"train_tokens_per_second": 1034.256
},
{
"epoch": 0.468,
"grad_norm": 86.0,
"learning_rate": 1.1548981802140849e-05,
"loss": 5.2601,
"mean_token_accuracy": 0.7978497371077538,
"num_input_tokens_seen": 3328723,
"num_tokens": 3328723.0,
"step": 585,
"train_runtime": 3219.6782,
"train_tokens_per_second": 1033.868
},
{
"epoch": 0.472,
"grad_norm": 90.5,
"learning_rate": 1.142081589817027e-05,
"loss": 4.9631,
"mean_token_accuracy": 0.8012796014547348,
"num_input_tokens_seen": 3356380,
"num_tokens": 3356380.0,
"step": 590,
"train_runtime": 3246.7315,
"train_tokens_per_second": 1033.772
},
{
"epoch": 0.476,
"grad_norm": 229.0,
"learning_rate": 1.129241134155949e-05,
"loss": 5.4675,
"mean_token_accuracy": 0.7908360511064529,
"num_input_tokens_seen": 3383429,
"num_tokens": 3383429.0,
"step": 595,
"train_runtime": 3273.3996,
"train_tokens_per_second": 1033.613
},
{
"epoch": 0.48,
"grad_norm": 50.75,
"learning_rate": 1.1163789700258656e-05,
"loss": 5.1986,
"mean_token_accuracy": 0.8053984194993973,
"num_input_tokens_seen": 3415705,
"num_tokens": 3415705.0,
"step": 600,
"train_runtime": 3305.0605,
"train_tokens_per_second": 1033.477
},
{
"epoch": 0.484,
"grad_norm": 219.0,
"learning_rate": 1.1034972578681338e-05,
"loss": 5.1812,
"mean_token_accuracy": 0.7935044363141059,
"num_input_tokens_seen": 3443164,
"num_tokens": 3443164.0,
"step": 605,
"train_runtime": 3332.0068,
"train_tokens_per_second": 1033.36
},
{
"epoch": 0.488,
"grad_norm": 157.0,
"learning_rate": 1.0905981614075693e-05,
"loss": 5.1947,
"mean_token_accuracy": 0.796344393491745,
"num_input_tokens_seen": 3467326,
"num_tokens": 3467326.0,
"step": 610,
"train_runtime": 3356.439,
"train_tokens_per_second": 1033.037
},
{
"epoch": 0.492,
"grad_norm": 716.0,
"learning_rate": 1.0776838472890065e-05,
"loss": 5.292,
"mean_token_accuracy": 0.7951879158616066,
"num_input_tokens_seen": 3495396,
"num_tokens": 3495396.0,
"step": 615,
"train_runtime": 3383.0011,
"train_tokens_per_second": 1033.223
},
{
"epoch": 0.496,
"grad_norm": 49.5,
"learning_rate": 1.06475648471337e-05,
"loss": 5.0782,
"mean_token_accuracy": 0.8010394781827926,
"num_input_tokens_seen": 3520634,
"num_tokens": 3520634.0,
"step": 620,
"train_runtime": 3409.0668,
"train_tokens_per_second": 1032.727
},
{
"epoch": 0.5,
"grad_norm": 95.5,
"learning_rate": 1.0518182450733185e-05,
"loss": 5.5994,
"mean_token_accuracy": 0.7855334684252739,
"num_input_tokens_seen": 3549657,
"num_tokens": 3549657.0,
"step": 625,
"train_runtime": 3436.945,
"train_tokens_per_second": 1032.794
},
{
"epoch": 0.504,
"grad_norm": 76.0,
"learning_rate": 1.0388713015885161e-05,
"loss": 5.5592,
"mean_token_accuracy": 0.7845589280128479,
"num_input_tokens_seen": 3579684,
"num_tokens": 3579684.0,
"step": 630,
"train_runtime": 3464.2838,
"train_tokens_per_second": 1033.311
},
{
"epoch": 0.508,
"grad_norm": 44.5,
"learning_rate": 1.0259178289406011e-05,
"loss": 5.3878,
"mean_token_accuracy": 0.7951082989573479,
"num_input_tokens_seen": 3605087,
"num_tokens": 3605087.0,
"step": 635,
"train_runtime": 3488.9594,
"train_tokens_per_second": 1033.284
},
{
"epoch": 0.512,
"grad_norm": 59.5,
"learning_rate": 1.0129600029079072e-05,
"loss": 5.7061,
"mean_token_accuracy": 0.7883853644132615,
"num_input_tokens_seen": 3632957,
"num_tokens": 3632957.0,
"step": 640,
"train_runtime": 3515.7879,
"train_tokens_per_second": 1033.327
},
{
"epoch": 0.516,
"grad_norm": 52.25,
"learning_rate": 1e-05,
"loss": 5.2977,
"mean_token_accuracy": 0.7912828177213669,
"num_input_tokens_seen": 3660791,
"num_tokens": 3660791.0,
"step": 645,
"train_runtime": 3543.3758,
"train_tokens_per_second": 1033.137
},
{
"epoch": 0.52,
"grad_norm": 52.0,
"learning_rate": 9.870399970920932e-06,
"loss": 5.4416,
"mean_token_accuracy": 0.7928792417049408,
"num_input_tokens_seen": 3689640,
"num_tokens": 3689640.0,
"step": 650,
"train_runtime": 3570.9162,
"train_tokens_per_second": 1033.247
},
{
"epoch": 0.524,
"grad_norm": 62.75,
"learning_rate": 9.740821710593989e-06,
"loss": 5.3682,
"mean_token_accuracy": 0.798756355047226,
"num_input_tokens_seen": 3715172,
"num_tokens": 3715172.0,
"step": 655,
"train_runtime": 3596.1899,
"train_tokens_per_second": 1033.086
},
{
"epoch": 0.528,
"grad_norm": 89.5,
"learning_rate": 9.61128698411484e-06,
"loss": 5.168,
"mean_token_accuracy": 0.7963975608348847,
"num_input_tokens_seen": 3741030,
"num_tokens": 3741030.0,
"step": 660,
"train_runtime": 3620.5686,
"train_tokens_per_second": 1033.271
},
{
"epoch": 0.532,
"grad_norm": 74.0,
"learning_rate": 9.481817549266817e-06,
"loss": 4.9416,
"mean_token_accuracy": 0.8047078907489776,
"num_input_tokens_seen": 3768537,
"num_tokens": 3768537.0,
"step": 665,
"train_runtime": 3647.1969,
"train_tokens_per_second": 1033.269
},
{
"epoch": 0.536,
"grad_norm": 45.25,
"learning_rate": 9.352435152866299e-06,
"loss": 5.3385,
"mean_token_accuracy": 0.7905179291963578,
"num_input_tokens_seen": 3799526,
"num_tokens": 3799526.0,
"step": 670,
"train_runtime": 3674.2376,
"train_tokens_per_second": 1034.099
},
{
"epoch": 0.54,
"grad_norm": 50.75,
"learning_rate": 9.223161527109938e-06,
"loss": 5.1579,
"mean_token_accuracy": 0.8002543315291405,
"num_input_tokens_seen": 3833970,
"num_tokens": 3833970.0,
"step": 675,
"train_runtime": 3706.0243,
"train_tokens_per_second": 1034.524
},
{
"epoch": 0.544,
"grad_norm": 185.0,
"learning_rate": 9.09401838592431e-06,
"loss": 5.1117,
"mean_token_accuracy": 0.7966769933700562,
"num_input_tokens_seen": 3860184,
"num_tokens": 3860184.0,
"step": 680,
"train_runtime": 3730.8409,
"train_tokens_per_second": 1034.669
},
{
"epoch": 0.548,
"grad_norm": 94.0,
"learning_rate": 8.965027421318666e-06,
"loss": 5.1979,
"mean_token_accuracy": 0.7981098249554635,
"num_input_tokens_seen": 3886677,
"num_tokens": 3886677.0,
"step": 685,
"train_runtime": 3756.8191,
"train_tokens_per_second": 1034.566
},
{
"epoch": 0.552,
"grad_norm": 47.75,
"learning_rate": 8.836210299741346e-06,
"loss": 5.6591,
"mean_token_accuracy": 0.7767333656549453,
"num_input_tokens_seen": 3914353,
"num_tokens": 3914353.0,
"step": 690,
"train_runtime": 3789.086,
"train_tokens_per_second": 1033.06
},
{
"epoch": 0.556,
"grad_norm": 106.0,
"learning_rate": 8.707588658440511e-06,
"loss": 5.5255,
"mean_token_accuracy": 0.7878315180540085,
"num_input_tokens_seen": 3940719,
"num_tokens": 3940719.0,
"step": 695,
"train_runtime": 3814.8928,
"train_tokens_per_second": 1032.983
},
{
"epoch": 0.56,
"grad_norm": 41.25,
"learning_rate": 8.579184101829734e-06,
"loss": 5.5184,
"mean_token_accuracy": 0.7867645308375358,
"num_input_tokens_seen": 3978888,
"num_tokens": 3978888.0,
"step": 700,
"train_runtime": 3851.4853,
"train_tokens_per_second": 1033.079
},
{
"epoch": 0.564,
"grad_norm": 90.5,
"learning_rate": 8.451018197859153e-06,
"loss": 5.3507,
"mean_token_accuracy": 0.7968938469886779,
"num_input_tokens_seen": 4011090,
"num_tokens": 4011090.0,
"step": 705,
"train_runtime": 3880.8486,
"train_tokens_per_second": 1033.56
},
{
"epoch": 0.568,
"grad_norm": 101.0,
"learning_rate": 8.323112474392731e-06,
"loss": 5.4665,
"mean_token_accuracy": 0.7853416830301285,
"num_input_tokens_seen": 4038326,
"num_tokens": 4038326.0,
"step": 710,
"train_runtime": 3907.9772,
"train_tokens_per_second": 1033.355
},
{
"epoch": 0.572,
"grad_norm": 318.0,
"learning_rate": 8.195488415592238e-06,
"loss": 5.5897,
"mean_token_accuracy": 0.7923433750867843,
"num_input_tokens_seen": 4069168,
"num_tokens": 4069168.0,
"step": 715,
"train_runtime": 3938.7796,
"train_tokens_per_second": 1033.104
},
{
"epoch": 0.576,
"grad_norm": 192.0,
"learning_rate": 8.068167458308582e-06,
"loss": 5.4862,
"mean_token_accuracy": 0.7898807466030121,
"num_input_tokens_seen": 4096317,
"num_tokens": 4096317.0,
"step": 720,
"train_runtime": 3965.423,
"train_tokens_per_second": 1033.009
},
{
"epoch": 0.58,
"grad_norm": 54.25,
"learning_rate": 7.941170988481108e-06,
"loss": 5.6047,
"mean_token_accuracy": 0.7875970765948296,
"num_input_tokens_seen": 4125700,
"num_tokens": 4125700.0,
"step": 725,
"train_runtime": 3992.1983,
"train_tokens_per_second": 1033.441
},
{
"epoch": 0.584,
"grad_norm": 56.0,
"learning_rate": 7.814520337545405e-06,
"loss": 5.628,
"mean_token_accuracy": 0.7885335445404053,
"num_input_tokens_seen": 4151858,
"num_tokens": 4151858.0,
"step": 730,
"train_runtime": 4017.6605,
"train_tokens_per_second": 1033.402
},
{
"epoch": 0.588,
"grad_norm": 79.0,
"learning_rate": 7.688236778850307e-06,
"loss": 5.6834,
"mean_token_accuracy": 0.7778642490506172,
"num_input_tokens_seen": 4175955,
"num_tokens": 4175955.0,
"step": 735,
"train_runtime": 4042.321,
"train_tokens_per_second": 1033.059
},
{
"epoch": 0.592,
"grad_norm": 57.5,
"learning_rate": 7.5623415240846235e-06,
"loss": 5.4957,
"mean_token_accuracy": 0.7904580295085907,
"num_input_tokens_seen": 4203815,
"num_tokens": 4203815.0,
"step": 740,
"train_runtime": 4068.3684,
"train_tokens_per_second": 1033.293
},
{
"epoch": 0.596,
"grad_norm": 63.0,
"learning_rate": 7.4368557197142596e-06,
"loss": 4.8703,
"mean_token_accuracy": 0.8159982651472092,
"num_input_tokens_seen": 4233192,
"num_tokens": 4233192.0,
"step": 745,
"train_runtime": 4097.3421,
"train_tokens_per_second": 1033.156
},
{
"epoch": 0.6,
"grad_norm": 71.5,
"learning_rate": 7.311800443430251e-06,
"loss": 5.3462,
"mean_token_accuracy": 0.794153805077076,
"num_input_tokens_seen": 4262718,
"num_tokens": 4262718.0,
"step": 750,
"train_runtime": 4124.7729,
"train_tokens_per_second": 1033.443
},
{
"epoch": 0.604,
"grad_norm": 152.0,
"learning_rate": 7.187196700608373e-06,
"loss": 4.9215,
"mean_token_accuracy": 0.8081203132867814,
"num_input_tokens_seen": 4288296,
"num_tokens": 4288296.0,
"step": 755,
"train_runtime": 4151.2234,
"train_tokens_per_second": 1033.02
},
{
"epoch": 0.608,
"grad_norm": 73.5,
"learning_rate": 7.063065420780909e-06,
"loss": 5.2529,
"mean_token_accuracy": 0.7946424350142479,
"num_input_tokens_seen": 4318154,
"num_tokens": 4318154.0,
"step": 760,
"train_runtime": 4180.0353,
"train_tokens_per_second": 1033.042
},
{
"epoch": 0.612,
"grad_norm": 247.0,
"learning_rate": 6.939427454121128e-06,
"loss": 4.9659,
"mean_token_accuracy": 0.8127795070409775,
"num_input_tokens_seen": 4345188,
"num_tokens": 4345188.0,
"step": 765,
"train_runtime": 4205.4405,
"train_tokens_per_second": 1033.23
},
{
"epoch": 0.616,
"grad_norm": 80.0,
"learning_rate": 6.816303567941111e-06,
"loss": 5.2259,
"mean_token_accuracy": 0.7914159163832665,
"num_input_tokens_seen": 4373786,
"num_tokens": 4373786.0,
"step": 770,
"train_runtime": 4233.1681,
"train_tokens_per_second": 1033.218
},
{
"epoch": 0.62,
"grad_norm": 130.0,
"learning_rate": 6.693714443203507e-06,
"loss": 5.6243,
"mean_token_accuracy": 0.7848341032862663,
"num_input_tokens_seen": 4404430,
"num_tokens": 4404430.0,
"step": 775,
"train_runtime": 4261.1517,
"train_tokens_per_second": 1033.624
},
{
"epoch": 0.624,
"grad_norm": 59.25,
"learning_rate": 6.571680671047749e-06,
"loss": 4.9764,
"mean_token_accuracy": 0.8033898919820786,
"num_input_tokens_seen": 4434585,
"num_tokens": 4434585.0,
"step": 780,
"train_runtime": 4290.2703,
"train_tokens_per_second": 1033.638
},
{
"epoch": 0.628,
"grad_norm": 109.5,
"learning_rate": 6.450222749331414e-06,
"loss": 5.7304,
"mean_token_accuracy": 0.7839483708143234,
"num_input_tokens_seen": 4463472,
"num_tokens": 4463472.0,
"step": 785,
"train_runtime": 4317.149,
"train_tokens_per_second": 1033.893
},
{
"epoch": 0.632,
"grad_norm": 64.5,
"learning_rate": 6.329361079187199e-06,
"loss": 5.7255,
"mean_token_accuracy": 0.7803121000528336,
"num_input_tokens_seen": 4490469,
"num_tokens": 4490469.0,
"step": 790,
"train_runtime": 4342.3523,
"train_tokens_per_second": 1034.11
},
{
"epoch": 0.636,
"grad_norm": 116.0,
"learning_rate": 6.209115961596208e-06,
"loss": 5.347,
"mean_token_accuracy": 0.7927216425538063,
"num_input_tokens_seen": 4517649,
"num_tokens": 4517649.0,
"step": 795,
"train_runtime": 4368.0708,
"train_tokens_per_second": 1034.244
},
{
"epoch": 0.64,
"grad_norm": 57.5,
"learning_rate": 6.0895075939779705e-06,
"loss": 5.5641,
"mean_token_accuracy": 0.795795188844204,
"num_input_tokens_seen": 4550712,
"num_tokens": 4550712.0,
"step": 800,
"train_runtime": 4400.2836,
"train_tokens_per_second": 1034.186
},
{
"epoch": 0.644,
"grad_norm": 61.5,
"learning_rate": 5.970556066797941e-06,
"loss": 5.8032,
"mean_token_accuracy": 0.7773613944649697,
"num_input_tokens_seen": 4585129,
"num_tokens": 4585129.0,
"step": 805,
"train_runtime": 4430.3262,
"train_tokens_per_second": 1034.942
},
{
"epoch": 0.648,
"grad_norm": 46.75,
"learning_rate": 5.852281360192933e-06,
"loss": 5.4492,
"mean_token_accuracy": 0.7958677127957344,
"num_input_tokens_seen": 4614693,
"num_tokens": 4614693.0,
"step": 810,
"train_runtime": 4458.5188,
"train_tokens_per_second": 1035.028
},
{
"epoch": 0.652,
"grad_norm": 135.0,
"learning_rate": 5.7347033406150494e-06,
"loss": 5.2903,
"mean_token_accuracy": 0.7904482677578926,
"num_input_tokens_seen": 4644174,
"num_tokens": 4644174.0,
"step": 815,
"train_runtime": 4486.8043,
"train_tokens_per_second": 1035.074
},
{
"epoch": 0.656,
"grad_norm": 62.0,
"learning_rate": 5.617841757494762e-06,
"loss": 5.322,
"mean_token_accuracy": 0.7915823593735695,
"num_input_tokens_seen": 4673847,
"num_tokens": 4673847.0,
"step": 820,
"train_runtime": 4515.9617,
"train_tokens_per_second": 1034.962
},
{
"epoch": 0.66,
"grad_norm": 258.0,
"learning_rate": 5.501716239923642e-06,
"loss": 5.0579,
"mean_token_accuracy": 0.8000400334596633,
"num_input_tokens_seen": 4700972,
"num_tokens": 4700972.0,
"step": 825,
"train_runtime": 4541.9592,
"train_tokens_per_second": 1035.01
},
{
"epoch": 0.664,
"grad_norm": 70.0,
"learning_rate": 5.386346293357242e-06,
"loss": 5.2416,
"mean_token_accuracy": 0.8032521203160286,
"num_input_tokens_seen": 4728665,
"num_tokens": 4728665.0,
"step": 830,
"train_runtime": 4570.1941,
"train_tokens_per_second": 1034.675
},
{
"epoch": 0.668,
"grad_norm": 100.5,
"learning_rate": 5.271751296338823e-06,
"loss": 5.4391,
"mean_token_accuracy": 0.7925701707601547,
"num_input_tokens_seen": 4756105,
"num_tokens": 4756105.0,
"step": 835,
"train_runtime": 4596.0174,
"train_tokens_per_second": 1034.832
},
{
"epoch": 0.672,
"grad_norm": 109.5,
"learning_rate": 5.15795049724435e-06,
"loss": 5.7334,
"mean_token_accuracy": 0.7834463611245155,
"num_input_tokens_seen": 4784316,
"num_tokens": 4784316.0,
"step": 840,
"train_runtime": 4623.5278,
"train_tokens_per_second": 1034.776
},
{
"epoch": 0.676,
"grad_norm": 58.75,
"learning_rate": 5.044963011049384e-06,
"loss": 5.3186,
"mean_token_accuracy": 0.7899114429950714,
"num_input_tokens_seen": 4810582,
"num_tokens": 4810582.0,
"step": 845,
"train_runtime": 4648.9466,
"train_tokens_per_second": 1034.768
},
{
"epoch": 0.68,
"grad_norm": 47.0,
"learning_rate": 4.932807816118347e-06,
"loss": 5.5814,
"mean_token_accuracy": 0.7850132435560226,
"num_input_tokens_seen": 4846569,
"num_tokens": 4846569.0,
"step": 850,
"train_runtime": 4682.0829,
"train_tokens_per_second": 1035.131
},
{
"epoch": 0.684,
"grad_norm": 34.5,
"learning_rate": 4.821503751016746e-06,
"loss": 5.3389,
"mean_token_accuracy": 0.7990380316972733,
"num_input_tokens_seen": 4876152,
"num_tokens": 4876152.0,
"step": 855,
"train_runtime": 4709.9784,
"train_tokens_per_second": 1035.281
},
{
"epoch": 0.688,
"grad_norm": 412.0,
"learning_rate": 4.711069511346909e-06,
"loss": 5.7378,
"mean_token_accuracy": 0.7886842951178551,
"num_input_tokens_seen": 4913773,
"num_tokens": 4913773.0,
"step": 860,
"train_runtime": 4744.369,
"train_tokens_per_second": 1035.706
},
{
"epoch": 0.692,
"grad_norm": 62.75,
"learning_rate": 4.601523646607675e-06,
"loss": 5.8298,
"mean_token_accuracy": 0.7812601879239083,
"num_input_tokens_seen": 4943904,
"num_tokens": 4943904.0,
"step": 865,
"train_runtime": 4772.8549,
"train_tokens_per_second": 1035.838
},
{
"epoch": 0.696,
"grad_norm": 57.75,
"learning_rate": 4.492884557078688e-06,
"loss": 4.8728,
"mean_token_accuracy": 0.8109473079442978,
"num_input_tokens_seen": 4969286,
"num_tokens": 4969286.0,
"step": 870,
"train_runtime": 4796.4072,
"train_tokens_per_second": 1036.043
},
{
"epoch": 0.7,
"grad_norm": 49.5,
"learning_rate": 4.385170490729712e-06,
"loss": 5.3615,
"mean_token_accuracy": 0.7962699040770531,
"num_input_tokens_seen": 4996130,
"num_tokens": 4996130.0,
"step": 875,
"train_runtime": 4822.3119,
"train_tokens_per_second": 1036.045
},
{
"epoch": 0.704,
"grad_norm": 36.75,
"learning_rate": 4.278399540155536e-06,
"loss": 5.6546,
"mean_token_accuracy": 0.7811813220381737,
"num_input_tokens_seen": 5024440,
"num_tokens": 5024440.0,
"step": 880,
"train_runtime": 4849.0415,
"train_tokens_per_second": 1036.172
},
{
"epoch": 0.708,
"grad_norm": 198.0,
"learning_rate": 4.172589639536992e-06,
"loss": 5.3273,
"mean_token_accuracy": 0.7892009258270264,
"num_input_tokens_seen": 5051605,
"num_tokens": 5051605.0,
"step": 885,
"train_runtime": 4873.5248,
"train_tokens_per_second": 1036.54
},
{
"epoch": 0.712,
"grad_norm": 91.0,
"learning_rate": 4.067758561628577e-06,
"loss": 5.7275,
"mean_token_accuracy": 0.7812389150261879,
"num_input_tokens_seen": 5080104,
"num_tokens": 5080104.0,
"step": 890,
"train_runtime": 4901.1022,
"train_tokens_per_second": 1036.523
},
{
"epoch": 0.716,
"grad_norm": 35.5,
"learning_rate": 3.9639239147731865e-06,
"loss": 5.425,
"mean_token_accuracy": 0.7942001700401307,
"num_input_tokens_seen": 5113018,
"num_tokens": 5113018.0,
"step": 895,
"train_runtime": 4930.903,
"train_tokens_per_second": 1036.933
},
{
"epoch": 0.72,
"grad_norm": 61.5,
"learning_rate": 3.861103139944448e-06,
"loss": 5.2248,
"mean_token_accuracy": 0.8011772260069847,
"num_input_tokens_seen": 5142941,
"num_tokens": 5142941.0,
"step": 900,
"train_runtime": 4959.3927,
"train_tokens_per_second": 1037.01
},
{
"epoch": 0.724,
"grad_norm": 41.75,
"learning_rate": 3.759313507817196e-06,
"loss": 5.1091,
"mean_token_accuracy": 0.7965022444725036,
"num_input_tokens_seen": 5172026,
"num_tokens": 5172026.0,
"step": 905,
"train_runtime": 4985.9195,
"train_tokens_per_second": 1037.326
},
{
"epoch": 0.728,
"grad_norm": 53.25,
"learning_rate": 3.658572115866541e-06,
"loss": 5.7217,
"mean_token_accuracy": 0.7808126404881477,
"num_input_tokens_seen": 5202257,
"num_tokens": 5202257.0,
"step": 910,
"train_runtime": 5014.4393,
"train_tokens_per_second": 1037.455
},
{
"epoch": 0.732,
"grad_norm": 44.5,
"learning_rate": 3.558895885496023e-06,
"loss": 5.3946,
"mean_token_accuracy": 0.789019052684307,
"num_input_tokens_seen": 5229589,
"num_tokens": 5229589.0,
"step": 915,
"train_runtime": 5040.3771,
"train_tokens_per_second": 1037.539
},
{
"epoch": 0.736,
"grad_norm": 318.0,
"learning_rate": 3.4603015591953393e-06,
"loss": 5.3473,
"mean_token_accuracy": 0.805972746014595,
"num_input_tokens_seen": 5258125,
"num_tokens": 5258125.0,
"step": 920,
"train_runtime": 5066.3915,
"train_tokens_per_second": 1037.844
},
{
"epoch": 0.74,
"grad_norm": 41.0,
"learning_rate": 3.3628056977281456e-06,
"loss": 5.6571,
"mean_token_accuracy": 0.7789395034313202,
"num_input_tokens_seen": 5286827,
"num_tokens": 5286827.0,
"step": 925,
"train_runtime": 5093.3897,
"train_tokens_per_second": 1037.978
},
{
"epoch": 0.744,
"grad_norm": 50.25,
"learning_rate": 3.266424677350346e-06,
"loss": 5.2024,
"mean_token_accuracy": 0.7979300260543823,
"num_input_tokens_seen": 5313346,
"num_tokens": 5313346.0,
"step": 930,
"train_runtime": 5119.0652,
"train_tokens_per_second": 1037.952
},
{
"epoch": 0.748,
"grad_norm": 70.0,
"learning_rate": 3.1711746870594083e-06,
"loss": 5.2941,
"mean_token_accuracy": 0.805306826531887,
"num_input_tokens_seen": 5343136,
"num_tokens": 5343136.0,
"step": 935,
"train_runtime": 5148.1407,
"train_tokens_per_second": 1037.877
},
{
"epoch": 0.752,
"grad_norm": 56.25,
"learning_rate": 3.077071725875116e-06,
"loss": 5.1137,
"mean_token_accuracy": 0.8032099828124046,
"num_input_tokens_seen": 5372738,
"num_tokens": 5372738.0,
"step": 940,
"train_runtime": 5176.4597,
"train_tokens_per_second": 1037.917
},
{
"epoch": 0.756,
"grad_norm": 46.75,
"learning_rate": 2.9841316001522345e-06,
"loss": 4.861,
"mean_token_accuracy": 0.809606908261776,
"num_input_tokens_seen": 5400053,
"num_tokens": 5400053.0,
"step": 945,
"train_runtime": 5203.2867,
"train_tokens_per_second": 1037.816
},
{
"epoch": 0.76,
"grad_norm": 306.0,
"learning_rate": 2.8923699209255285e-06,
"loss": 5.4282,
"mean_token_accuracy": 0.7930289775133132,
"num_input_tokens_seen": 5423237,
"num_tokens": 5423237.0,
"step": 950,
"train_runtime": 5227.2135,
"train_tokens_per_second": 1037.501
},
{
"epoch": 0.764,
"grad_norm": 61.25,
"learning_rate": 2.8018021012875994e-06,
"loss": 5.0416,
"mean_token_accuracy": 0.8067665219306945,
"num_input_tokens_seen": 5451018,
"num_tokens": 5451018.0,
"step": 955,
"train_runtime": 5254.5977,
"train_tokens_per_second": 1037.381
},
{
"epoch": 0.768,
"grad_norm": 76.5,
"learning_rate": 2.7124433537999838e-06,
"loss": 5.3888,
"mean_token_accuracy": 0.7953536674380303,
"num_input_tokens_seen": 5481559,
"num_tokens": 5481559.0,
"step": 960,
"train_runtime": 5281.8099,
"train_tokens_per_second": 1037.818
},
{
"epoch": 0.772,
"grad_norm": 76.5,
"learning_rate": 2.6243086879379e-06,
"loss": 5.3031,
"mean_token_accuracy": 0.7893404349684715,
"num_input_tokens_seen": 5509600,
"num_tokens": 5509600.0,
"step": 965,
"train_runtime": 5308.2353,
"train_tokens_per_second": 1037.934
},
{
"epoch": 0.776,
"grad_norm": 53.0,
"learning_rate": 2.537412907569127e-06,
"loss": 5.2369,
"mean_token_accuracy": 0.7978512957692147,
"num_input_tokens_seen": 5538862,
"num_tokens": 5538862.0,
"step": 970,
"train_runtime": 5335.0361,
"train_tokens_per_second": 1038.205
},
{
"epoch": 0.78,
"grad_norm": 66.5,
"learning_rate": 2.451770608467432e-06,
"loss": 5.1098,
"mean_token_accuracy": 0.805266946554184,
"num_input_tokens_seen": 5566918,
"num_tokens": 5566918.0,
"step": 975,
"train_runtime": 5362.4471,
"train_tokens_per_second": 1038.13
},
{
"epoch": 0.784,
"grad_norm": 49.25,
"learning_rate": 2.3673961758609156e-06,
"loss": 5.186,
"mean_token_accuracy": 0.8020475476980209,
"num_input_tokens_seen": 5593453,
"num_tokens": 5593453.0,
"step": 980,
"train_runtime": 5387.3852,
"train_tokens_per_second": 1038.25
},
{
"epoch": 0.788,
"grad_norm": 83.0,
"learning_rate": 2.2843037820157678e-06,
"loss": 5.2522,
"mean_token_accuracy": 0.7931860506534576,
"num_input_tokens_seen": 5621006,
"num_tokens": 5621006.0,
"step": 985,
"train_runtime": 5413.5032,
"train_tokens_per_second": 1038.331
},
{
"epoch": 0.792,
"grad_norm": 121.5,
"learning_rate": 2.2025073838557454e-06,
"loss": 5.142,
"mean_token_accuracy": 0.7944848969578743,
"num_input_tokens_seen": 5649809,
"num_tokens": 5649809.0,
"step": 990,
"train_runtime": 5441.3683,
"train_tokens_per_second": 1038.307
},
{
"epoch": 0.796,
"grad_norm": 70.0,
"learning_rate": 2.122020720617869e-06,
"loss": 5.3737,
"mean_token_accuracy": 0.7889134347438812,
"num_input_tokens_seen": 5676379,
"num_tokens": 5676379.0,
"step": 995,
"train_runtime": 5465.6045,
"train_tokens_per_second": 1038.564
},
{
"epoch": 0.8,
"grad_norm": 63.25,
"learning_rate": 2.0428573115446394e-06,
"loss": 5.336,
"mean_token_accuracy": 0.791542237997055,
"num_input_tokens_seen": 5700207,
"num_tokens": 5700207.0,
"step": 1000,
"train_runtime": 5489.0151,
"train_tokens_per_second": 1038.475
},
{
"epoch": 0.804,
"grad_norm": 260.0,
"learning_rate": 1.9650304536132426e-06,
"loss": 5.263,
"mean_token_accuracy": 0.7995662987232208,
"num_input_tokens_seen": 5725964,
"num_tokens": 5725964.0,
"step": 1005,
"train_runtime": 5514.6903,
"train_tokens_per_second": 1038.311
},
{
"epoch": 0.808,
"grad_norm": 158.0,
"learning_rate": 1.8885532193020706e-06,
"loss": 5.3317,
"mean_token_accuracy": 0.7876490637660026,
"num_input_tokens_seen": 5754571,
"num_tokens": 5754571.0,
"step": 1010,
"train_runtime": 5540.9562,
"train_tokens_per_second": 1038.552
},
{
"epoch": 0.812,
"grad_norm": 45.25,
"learning_rate": 1.813438454394948e-06,
"loss": 5.2275,
"mean_token_accuracy": 0.7981337189674378,
"num_input_tokens_seen": 5783614,
"num_tokens": 5783614.0,
"step": 1015,
"train_runtime": 5567.5443,
"train_tokens_per_second": 1038.809
},
{
"epoch": 0.816,
"grad_norm": 36.25,
"learning_rate": 1.7396987758234418e-06,
"loss": 4.7787,
"mean_token_accuracy": 0.8220131769776344,
"num_input_tokens_seen": 5809495,
"num_tokens": 5809495.0,
"step": 1020,
"train_runtime": 5592.6221,
"train_tokens_per_second": 1038.778
},
{
"epoch": 0.82,
"grad_norm": 80.5,
"learning_rate": 1.6673465695476233e-06,
"loss": 4.8219,
"mean_token_accuracy": 0.8117146581411362,
"num_input_tokens_seen": 5836071,
"num_tokens": 5836071.0,
"step": 1025,
"train_runtime": 5618.3237,
"train_tokens_per_second": 1038.757
},
{
"epoch": 0.824,
"grad_norm": 49.5,
"learning_rate": 1.5963939884756042e-06,
"loss": 5.5483,
"mean_token_accuracy": 0.7852835282683372,
"num_input_tokens_seen": 5867317,
"num_tokens": 5867317.0,
"step": 1030,
"train_runtime": 5647.6112,
"train_tokens_per_second": 1038.902
},
{
"epoch": 0.828,
"grad_norm": 56.25,
"learning_rate": 1.5268529504222262e-06,
"loss": 5.2143,
"mean_token_accuracy": 0.799188706278801,
"num_input_tokens_seen": 5897945,
"num_tokens": 5897945.0,
"step": 1035,
"train_runtime": 5677.9014,
"train_tokens_per_second": 1038.754
},
{
"epoch": 0.832,
"grad_norm": 59.0,
"learning_rate": 1.4587351361072455e-06,
"loss": 5.4266,
"mean_token_accuracy": 0.7901453331112862,
"num_input_tokens_seen": 5923523,
"num_tokens": 5923523.0,
"step": 1040,
"train_runtime": 5702.9066,
"train_tokens_per_second": 1038.685
},
{
"epoch": 0.836,
"grad_norm": 144.0,
"learning_rate": 1.3920519871933425e-06,
"loss": 5.4357,
"mean_token_accuracy": 0.7978359222412109,
"num_input_tokens_seen": 5956415,
"num_tokens": 5956415.0,
"step": 1045,
"train_runtime": 5733.0236,
"train_tokens_per_second": 1038.966
},
{
"epoch": 0.84,
"grad_norm": 177.0,
"learning_rate": 1.326814704364262e-06,
"loss": 5.3211,
"mean_token_accuracy": 0.7962174996733665,
"num_input_tokens_seen": 5985449,
"num_tokens": 5985449.0,
"step": 1050,
"train_runtime": 5761.1481,
"train_tokens_per_second": 1038.933
},
{
"epoch": 0.844,
"grad_norm": 74.0,
"learning_rate": 1.263034245443473e-06,
"loss": 5.2974,
"mean_token_accuracy": 0.7944592133164405,
"num_input_tokens_seen": 6013339,
"num_tokens": 6013339.0,
"step": 1055,
"train_runtime": 5788.1881,
"train_tokens_per_second": 1038.898
},
{
"epoch": 0.848,
"grad_norm": 75.0,
"learning_rate": 1.2007213235535785e-06,
"loss": 5.3725,
"mean_token_accuracy": 0.7937130227684974,
"num_input_tokens_seen": 6043191,
"num_tokens": 6043191.0,
"step": 1060,
"train_runtime": 5815.2046,
"train_tokens_per_second": 1039.205
},
{
"epoch": 0.852,
"grad_norm": 60.0,
"learning_rate": 1.1398864053168534e-06,
"loss": 4.9906,
"mean_token_accuracy": 0.8006555408239364,
"num_input_tokens_seen": 6071081,
"num_tokens": 6071081.0,
"step": 1065,
"train_runtime": 5841.7991,
"train_tokens_per_second": 1039.249
},
{
"epoch": 0.856,
"grad_norm": 60.5,
"learning_rate": 1.0805397090971738e-06,
"loss": 4.9086,
"mean_token_accuracy": 0.8100662231445312,
"num_input_tokens_seen": 6097563,
"num_tokens": 6097563.0,
"step": 1070,
"train_runtime": 5868.5095,
"train_tokens_per_second": 1039.031
},
{
"epoch": 0.86,
"grad_norm": 57.25,
"learning_rate": 1.022691203283661e-06,
"loss": 5.4238,
"mean_token_accuracy": 0.792490765452385,
"num_input_tokens_seen": 6126398,
"num_tokens": 6126398.0,
"step": 1075,
"train_runtime": 5895.323,
"train_tokens_per_second": 1039.196
},
{
"epoch": 0.864,
"grad_norm": 38.5,
"learning_rate": 9.663506046162986e-07,
"loss": 5.1983,
"mean_token_accuracy": 0.7941671445965767,
"num_input_tokens_seen": 6154407,
"num_tokens": 6154407.0,
"step": 1080,
"train_runtime": 5920.8046,
"train_tokens_per_second": 1039.455
},
{
"epoch": 0.868,
"grad_norm": 536.0,
"learning_rate": 9.115273765538202e-07,
"loss": 5.5089,
"mean_token_accuracy": 0.7930781245231628,
"num_input_tokens_seen": 6182939,
"num_tokens": 6182939.0,
"step": 1085,
"train_runtime": 5947.8298,
"train_tokens_per_second": 1039.529
},
{
"epoch": 0.872,
"grad_norm": 47.25,
"learning_rate": 8.582307276841461e-07,
"loss": 5.3598,
"mean_token_accuracy": 0.7864043831825256,
"num_input_tokens_seen": 6212094,
"num_tokens": 6212094.0,
"step": 1090,
"train_runtime": 5975.8227,
"train_tokens_per_second": 1039.538
},
{
"epoch": 0.876,
"grad_norm": 41.5,
"learning_rate": 8.06469610177636e-07,
"loss": 5.3994,
"mean_token_accuracy": 0.7908027723431588,
"num_input_tokens_seen": 6238778,
"num_tokens": 6238778.0,
"step": 1095,
"train_runtime": 6000.9384,
"train_tokens_per_second": 1039.634
},
{
"epoch": 0.88,
"grad_norm": 312.0,
"learning_rate": 7.562527182833978e-07,
"loss": 5.3973,
"mean_token_accuracy": 0.793465219438076,
"num_input_tokens_seen": 6265105,
"num_tokens": 6265105.0,
"step": 1100,
"train_runtime": 6025.9546,
"train_tokens_per_second": 1039.687
},
{
"epoch": 0.884,
"grad_norm": 128.0,
"learning_rate": 7.07588486868922e-07,
"loss": 5.1888,
"mean_token_accuracy": 0.8035556092858315,
"num_input_tokens_seen": 6290027,
"num_tokens": 6290027.0,
"step": 1105,
"train_runtime": 6051.3714,
"train_tokens_per_second": 1039.438
},
{
"epoch": 0.888,
"grad_norm": 49.0,
"learning_rate": 6.604850900032956e-07,
"loss": 4.7405,
"mean_token_accuracy": 0.8212268218398094,
"num_input_tokens_seen": 6317712,
"num_tokens": 6317712.0,
"step": 1110,
"train_runtime": 6077.8737,
"train_tokens_per_second": 1039.461
},
{
"epoch": 0.892,
"grad_norm": 79.5,
"learning_rate": 6.149504395842087e-07,
"loss": 5.3393,
"mean_token_accuracy": 0.7951319962739944,
"num_input_tokens_seen": 6343602,
"num_tokens": 6343602.0,
"step": 1115,
"train_runtime": 6105.1625,
"train_tokens_per_second": 1039.055
},
{
"epoch": 0.896,
"grad_norm": 123.0,
"learning_rate": 5.709921840090072e-07,
"loss": 5.2021,
"mean_token_accuracy": 0.7978611201047897,
"num_input_tokens_seen": 6382499,
"num_tokens": 6382499.0,
"step": 1120,
"train_runtime": 6145.3873,
"train_tokens_per_second": 1038.584
},
{
"epoch": 0.9,
"grad_norm": 108.5,
"learning_rate": 5.286177068899989e-07,
"loss": 5.2466,
"mean_token_accuracy": 0.7941580578684807,
"num_input_tokens_seen": 6409279,
"num_tokens": 6409279.0,
"step": 1125,
"train_runtime": 6172.0603,
"train_tokens_per_second": 1038.434
},
{
"epoch": 0.904,
"grad_norm": 50.0,
"learning_rate": 4.878341258142349e-07,
"loss": 5.4412,
"mean_token_accuracy": 0.7916087701916694,
"num_input_tokens_seen": 6440759,
"num_tokens": 6440759.0,
"step": 1130,
"train_runtime": 6198.541,
"train_tokens_per_second": 1039.077
},
{
"epoch": 0.908,
"grad_norm": 130.0,
"learning_rate": 4.4864829114798394e-07,
"loss": 4.9766,
"mean_token_accuracy": 0.8005422234535218,
"num_input_tokens_seen": 6468905,
"num_tokens": 6468905.0,
"step": 1135,
"train_runtime": 6225.8798,
"train_tokens_per_second": 1039.035
},
{
"epoch": 0.912,
"grad_norm": 79.0,
"learning_rate": 4.11066784886075e-07,
"loss": 5.3727,
"mean_token_accuracy": 0.7998930081725121,
"num_input_tokens_seen": 6493165,
"num_tokens": 6493165.0,
"step": 1140,
"train_runtime": 6249.8312,
"train_tokens_per_second": 1038.934
},
{
"epoch": 0.916,
"grad_norm": 54.0,
"learning_rate": 3.750959195463466e-07,
"loss": 5.3525,
"mean_token_accuracy": 0.7876800760626793,
"num_input_tokens_seen": 6522770,
"num_tokens": 6522770.0,
"step": 1145,
"train_runtime": 6278.1358,
"train_tokens_per_second": 1038.966
},
{
"epoch": 0.92,
"grad_norm": 206.0,
"learning_rate": 3.4074173710931804e-07,
"loss": 5.7993,
"mean_token_accuracy": 0.7791803061962128,
"num_input_tokens_seen": 6549386,
"num_tokens": 6549386.0,
"step": 1150,
"train_runtime": 6302.9235,
"train_tokens_per_second": 1039.103
},
{
"epoch": 0.924,
"grad_norm": 99.0,
"learning_rate": 3.080100080033388e-07,
"loss": 5.0963,
"mean_token_accuracy": 0.7987044736742973,
"num_input_tokens_seen": 6578069,
"num_tokens": 6578069.0,
"step": 1155,
"train_runtime": 6330.1523,
"train_tokens_per_second": 1039.164
},
{
"epoch": 0.928,
"grad_norm": 83.0,
"learning_rate": 2.769062301353398e-07,
"loss": 5.5875,
"mean_token_accuracy": 0.7888873621821404,
"num_input_tokens_seen": 6603075,
"num_tokens": 6603075.0,
"step": 1160,
"train_runtime": 6354.1556,
"train_tokens_per_second": 1039.174
},
{
"epoch": 0.932,
"grad_norm": 102.0,
"learning_rate": 2.474356279673462e-07,
"loss": 5.6995,
"mean_token_accuracy": 0.7825249642133713,
"num_input_tokens_seen": 6635809,
"num_tokens": 6635809.0,
"step": 1165,
"train_runtime": 6382.7967,
"train_tokens_per_second": 1039.64
},
{
"epoch": 0.936,
"grad_norm": 56.5,
"learning_rate": 2.1960315163894075e-07,
"loss": 5.1911,
"mean_token_accuracy": 0.7973916217684746,
"num_input_tokens_seen": 6661327,
"num_tokens": 6661327.0,
"step": 1170,
"train_runtime": 6406.6683,
"train_tokens_per_second": 1039.749
},
{
"epoch": 0.94,
"grad_norm": 69.5,
"learning_rate": 1.9341347613579086e-07,
"loss": 5.1047,
"mean_token_accuracy": 0.8013954371213913,
"num_input_tokens_seen": 6690372,
"num_tokens": 6690372.0,
"step": 1175,
"train_runtime": 6435.8803,
"train_tokens_per_second": 1039.543
},
{
"epoch": 0.944,
"grad_norm": 73.5,
"learning_rate": 1.6887100050439587e-07,
"loss": 5.6123,
"mean_token_accuracy": 0.7876136094331742,
"num_input_tokens_seen": 6719639,
"num_tokens": 6719639.0,
"step": 1180,
"train_runtime": 6464.1419,
"train_tokens_per_second": 1039.525
},
{
"epoch": 0.948,
"grad_norm": 87.0,
"learning_rate": 1.459798471131868e-07,
"loss": 5.4049,
"mean_token_accuracy": 0.7874793767929077,
"num_input_tokens_seen": 6745687,
"num_tokens": 6745687.0,
"step": 1185,
"train_runtime": 6489.0041,
"train_tokens_per_second": 1039.557
},
{
"epoch": 0.952,
"grad_norm": 454.0,
"learning_rate": 1.2474386096010037e-07,
"loss": 5.0158,
"mean_token_accuracy": 0.8049899056553841,
"num_input_tokens_seen": 6774080,
"num_tokens": 6774080.0,
"step": 1190,
"train_runtime": 6514.6768,
"train_tokens_per_second": 1039.818
},
{
"epoch": 0.956,
"grad_norm": 74.5,
"learning_rate": 1.0516660902673448e-07,
"loss": 5.4421,
"mean_token_accuracy": 0.7932335063815117,
"num_input_tokens_seen": 6803981,
"num_tokens": 6803981.0,
"step": 1195,
"train_runtime": 6542.1628,
"train_tokens_per_second": 1040.02
},
{
"epoch": 0.96,
"grad_norm": 67.5,
"learning_rate": 8.725137967920739e-08,
"loss": 5.6285,
"mean_token_accuracy": 0.7887401878833771,
"num_input_tokens_seen": 6829351,
"num_tokens": 6829351.0,
"step": 1200,
"train_runtime": 6567.4241,
"train_tokens_per_second": 1039.883
},
{
"epoch": 0.964,
"grad_norm": 98.0,
"learning_rate": 7.100118211581852e-08,
"loss": 5.5201,
"mean_token_accuracy": 0.7889957845211029,
"num_input_tokens_seen": 6856748,
"num_tokens": 6856748.0,
"step": 1205,
"train_runtime": 6594.1889,
"train_tokens_per_second": 1039.817
},
{
"epoch": 0.968,
"grad_norm": 46.75,
"learning_rate": 5.6418745861593905e-08,
"loss": 5.0326,
"mean_token_accuracy": 0.8010287463665009,
"num_input_tokens_seen": 6886116,
"num_tokens": 6886116.0,
"step": 1210,
"train_runtime": 6621.5933,
"train_tokens_per_second": 1039.949
},
{
"epoch": 0.972,
"grad_norm": 60.25,
"learning_rate": 4.350652030981395e-08,
"loss": 5.7011,
"mean_token_accuracy": 0.7844116255640984,
"num_input_tokens_seen": 6912415,
"num_tokens": 6912415.0,
"step": 1215,
"train_runtime": 6646.1851,
"train_tokens_per_second": 1040.058
},
{
"epoch": 0.976,
"grad_norm": 50.75,
"learning_rate": 3.2266674310589276e-08,
"loss": 5.481,
"mean_token_accuracy": 0.7872747302055358,
"num_input_tokens_seen": 6937018,
"num_tokens": 6937018.0,
"step": 1220,
"train_runtime": 6670.2576,
"train_tokens_per_second": 1039.993
},
{
"epoch": 0.98,
"grad_norm": 81.0,
"learning_rate": 2.2701095806565432e-08,
"loss": 5.5155,
"mean_token_accuracy": 0.7937034830451012,
"num_input_tokens_seen": 6968725,
"num_tokens": 6968725.0,
"step": 1225,
"train_runtime": 6700.0416,
"train_tokens_per_second": 1040.102
},
{
"epoch": 0.984,
"grad_norm": 42.75,
"learning_rate": 1.4811391515799911e-08,
"loss": 5.0932,
"mean_token_accuracy": 0.7983416199684144,
"num_input_tokens_seen": 6996046,
"num_tokens": 6996046.0,
"step": 1230,
"train_runtime": 6725.8984,
"train_tokens_per_second": 1040.165
},
{
"epoch": 0.988,
"grad_norm": 47.0,
"learning_rate": 8.59888666189579e-09,
"loss": 5.6719,
"mean_token_accuracy": 0.783245125412941,
"num_input_tokens_seen": 7026420,
"num_tokens": 7026420.0,
"step": 1235,
"train_runtime": 6754.2343,
"train_tokens_per_second": 1040.299
},
{
"epoch": 0.992,
"grad_norm": 211.0,
"learning_rate": 4.064624751394242e-09,
"loss": 5.7314,
"mean_token_accuracy": 0.7820679724216462,
"num_input_tokens_seen": 7056594,
"num_tokens": 7056594.0,
"step": 1240,
"train_runtime": 6782.773,
"train_tokens_per_second": 1040.37
},
{
"epoch": 0.996,
"grad_norm": 114.0,
"learning_rate": 1.209367398504746e-09,
"loss": 5.0323,
"mean_token_accuracy": 0.8049638271331787,
"num_input_tokens_seen": 7081035,
"num_tokens": 7081035.0,
"step": 1245,
"train_runtime": 6807.8837,
"train_tokens_per_second": 1040.123
},
{
"epoch": 1.0,
"grad_norm": 57.0,
"learning_rate": 3.3594197175190743e-11,
"loss": 5.217,
"mean_token_accuracy": 0.8012784749269486,
"num_input_tokens_seen": 7107438,
"num_tokens": 7107438.0,
"step": 1250,
"train_runtime": 6833.6244,
"train_tokens_per_second": 1040.069
},
{
"epoch": 1.0,
"num_input_tokens_seen": 7107438,
"step": 1250,
"total_flos": 1.4684498749056614e+17,
"train_loss": 5.760735061645508,
"train_runtime": 6833.6666,
"train_samples_per_second": 1.463,
"train_steps_per_second": 0.183,
"train_tokens_per_second": 1040.062
}
],
"logging_steps": 5,
"max_steps": 1250,
"num_input_tokens_seen": 7107438,
"num_train_epochs": 1,
"save_steps": 0,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": false,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1.4684498749056614e+17,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}