{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9997605937275557, "eval_steps": 500, "global_step": 261, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0038305003591094086, "grad_norm": 0.9418857097625732, "learning_rate": 1.0000000000000002e-06, "logits/chosen": -1.0421228408813477, "logits/rejected": -1.0402593612670898, "logps/chosen": -905.7581787109375, "logps/rejected": -924.099853515625, "loss": 0.6931, "num_input_tokens_seen": 362496, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.007661000718218817, "grad_norm": 1.0012181997299194, "learning_rate": 2.0000000000000003e-06, "logits/chosen": -1.0420148372650146, "logits/rejected": -1.0282371044158936, "logps/chosen": -898.2119140625, "logps/rejected": -893.538818359375, "loss": 0.6931, "num_input_tokens_seen": 718144, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 2 }, { "epoch": 0.011491501077328227, "grad_norm": 0.9935199618339539, "learning_rate": 3e-06, "logits/chosen": -1.0312358140945435, "logits/rejected": -1.023416519165039, "logps/chosen": -918.4049072265625, "logps/rejected": -903.5130615234375, "loss": 0.7036, "num_input_tokens_seen": 1075392, "rewards/accuracies": 0.4375, "rewards/chosen": -0.012404728680849075, "rewards/margins": -0.017508292570710182, "rewards/rejected": 0.0051035634241998196, "step": 3 }, { "epoch": 0.015322001436437634, "grad_norm": 0.9774389266967773, "learning_rate": 4.000000000000001e-06, "logits/chosen": -1.0343425273895264, "logits/rejected": -1.036562204360962, "logps/chosen": -896.435302734375, "logps/rejected": -899.7354736328125, "loss": 0.6913, "num_input_tokens_seen": 1430080, "rewards/accuracies": 0.5390625, "rewards/chosen": -0.0023043155670166016, "rewards/margins": 0.00819547288119793, "rewards/rejected": -0.010499788448214531, "step": 4 }, { "epoch": 0.019152501795547044, "grad_norm": 0.9151395559310913, "learning_rate": 5e-06, "logits/chosen": -1.0331459045410156, "logits/rejected": -1.0269978046417236, "logps/chosen": -948.02978515625, "logps/rejected": -949.6331787109375, "loss": 0.6978, "num_input_tokens_seen": 1786304, "rewards/accuracies": 0.5, "rewards/chosen": -0.003812384558841586, "rewards/margins": -0.005953050684183836, "rewards/rejected": 0.002140665426850319, "step": 5 }, { "epoch": 0.022983002154656453, "grad_norm": 0.9881334900856018, "learning_rate": 6e-06, "logits/chosen": -1.0443257093429565, "logits/rejected": -1.0362162590026855, "logps/chosen": -881.51611328125, "logps/rejected": -884.4502563476562, "loss": 0.6808, "num_input_tokens_seen": 2132544, "rewards/accuracies": 0.6015625, "rewards/chosen": 0.005164409056305885, "rewards/margins": 0.028560878708958626, "rewards/rejected": -0.02339646965265274, "step": 6 }, { "epoch": 0.02681350251376586, "grad_norm": 0.9441390037536621, "learning_rate": 7e-06, "logits/chosen": -1.0445983409881592, "logits/rejected": -1.0422747135162354, "logps/chosen": -899.3116455078125, "logps/rejected": -894.371337890625, "loss": 0.6866, "num_input_tokens_seen": 2488832, "rewards/accuracies": 0.546875, "rewards/chosen": -0.0004974845796823502, "rewards/margins": 0.016697216778993607, "rewards/rejected": -0.017194701358675957, "step": 7 }, { "epoch": 0.03064400287287527, "grad_norm": 1.0369701385498047, "learning_rate": 8.000000000000001e-06, "logits/chosen": -1.0434613227844238, "logits/rejected": -1.0450818538665771, "logps/chosen": -924.5731201171875, "logps/rejected": -928.7008056640625, "loss": 0.6897, "num_input_tokens_seen": 2852544, "rewards/accuracies": 0.515625, "rewards/chosen": 0.007696032524108887, "rewards/margins": 0.011814999394118786, "rewards/rejected": -0.004118966404348612, "step": 8 }, { "epoch": 0.03447450323198468, "grad_norm": 0.9359210133552551, "learning_rate": 9e-06, "logits/chosen": -1.0368328094482422, "logits/rejected": -1.0320663452148438, "logps/chosen": -965.3587036132812, "logps/rejected": -946.9695434570312, "loss": 0.6901, "num_input_tokens_seen": 3220608, "rewards/accuracies": 0.5546875, "rewards/chosen": 0.005610942840576172, "rewards/margins": 0.009852622635662556, "rewards/rejected": -0.004241681657731533, "step": 9 }, { "epoch": 0.03830500359109409, "grad_norm": 0.9655279517173767, "learning_rate": 1e-05, "logits/chosen": -1.0187113285064697, "logits/rejected": -1.0166434049606323, "logps/chosen": -953.6986083984375, "logps/rejected": -927.5408935546875, "loss": 0.691, "num_input_tokens_seen": 3592128, "rewards/accuracies": 0.53125, "rewards/chosen": -0.0018309114966541529, "rewards/margins": 0.008346128277480602, "rewards/rejected": -0.010177039541304111, "step": 10 }, { "epoch": 0.0421355039502035, "grad_norm": 0.9258586168289185, "learning_rate": 9.999608360361114e-06, "logits/chosen": -1.0451351404190063, "logits/rejected": -1.0583577156066895, "logps/chosen": -958.3239135742188, "logps/rejected": -946.181640625, "loss": 0.6915, "num_input_tokens_seen": 3971392, "rewards/accuracies": 0.5, "rewards/chosen": 0.004320908337831497, "rewards/margins": 0.008112478069961071, "rewards/rejected": -0.003791570197790861, "step": 11 }, { "epoch": 0.045966004309312906, "grad_norm": 1.1154330968856812, "learning_rate": 9.998433502797097e-06, "logits/chosen": -1.0229029655456543, "logits/rejected": -1.026119589805603, "logps/chosen": -947.627685546875, "logps/rejected": -935.3385009765625, "loss": 0.691, "num_input_tokens_seen": 4327552, "rewards/accuracies": 0.515625, "rewards/chosen": -0.007967257872223854, "rewards/margins": 0.008088444359600544, "rewards/rejected": -0.016055703163146973, "step": 12 }, { "epoch": 0.049796504668422316, "grad_norm": 1.0335172414779663, "learning_rate": 9.996475611356265e-06, "logits/chosen": -1.0391089916229248, "logits/rejected": -1.0464394092559814, "logps/chosen": -949.6978149414062, "logps/rejected": -913.878173828125, "loss": 0.6828, "num_input_tokens_seen": 4701568, "rewards/accuracies": 0.59375, "rewards/chosen": -0.006509041413664818, "rewards/margins": 0.025559592992067337, "rewards/rejected": -0.0320686399936676, "step": 13 }, { "epoch": 0.05362700502753172, "grad_norm": 1.0432441234588623, "learning_rate": 9.993734992753777e-06, "logits/chosen": -1.0264198780059814, "logits/rejected": -1.039010763168335, "logps/chosen": -957.4710693359375, "logps/rejected": -918.60400390625, "loss": 0.6787, "num_input_tokens_seen": 5056448, "rewards/accuracies": 0.6640625, "rewards/chosen": -0.005599617958068848, "rewards/margins": 0.03315434232354164, "rewards/rejected": -0.03875396400690079, "step": 14 }, { "epoch": 0.05745750538664113, "grad_norm": 1.0883227586746216, "learning_rate": 9.990212076323587e-06, "logits/chosen": -1.047149896621704, "logits/rejected": -1.0479631423950195, "logps/chosen": -911.3246459960938, "logps/rejected": -907.6549072265625, "loss": 0.6765, "num_input_tokens_seen": 5405056, "rewards/accuracies": 0.6015625, "rewards/chosen": -0.0018613815773278475, "rewards/margins": 0.0383245013654232, "rewards/rejected": -0.04018588364124298, "step": 15 }, { "epoch": 0.06128800574575054, "grad_norm": 0.9944784045219421, "learning_rate": 9.98590741395118e-06, "logits/chosen": -1.0353460311889648, "logits/rejected": -1.0215399265289307, "logps/chosen": -875.39697265625, "logps/rejected": -897.5616455078125, "loss": 0.6792, "num_input_tokens_seen": 5758464, "rewards/accuracies": 0.6015625, "rewards/chosen": -0.004133629146963358, "rewards/margins": 0.034194350242614746, "rewards/rejected": -0.038327984511852264, "step": 16 }, { "epoch": 0.06511850610485995, "grad_norm": 1.0136407613754272, "learning_rate": 9.980821679987125e-06, "logits/chosen": -1.0320684909820557, "logits/rejected": -1.033224105834961, "logps/chosen": -929.647216796875, "logps/rejected": -911.69580078125, "loss": 0.6786, "num_input_tokens_seen": 6116480, "rewards/accuracies": 0.59375, "rewards/chosen": -0.0052222274243831635, "rewards/margins": 0.036130644381046295, "rewards/rejected": -0.04135286808013916, "step": 17 }, { "epoch": 0.06894900646396936, "grad_norm": 0.9946112036705017, "learning_rate": 9.974955671141425e-06, "logits/chosen": -1.0013747215270996, "logits/rejected": -0.9874509572982788, "logps/chosen": -966.796875, "logps/rejected": -983.128173828125, "loss": 0.6739, "num_input_tokens_seen": 6482624, "rewards/accuracies": 0.5703125, "rewards/chosen": 0.006748510058969259, "rewards/margins": 0.044564343988895416, "rewards/rejected": -0.03781583532691002, "step": 18 }, { "epoch": 0.07277950682307877, "grad_norm": 1.0340311527252197, "learning_rate": 9.968310306358715e-06, "logits/chosen": -1.027139663696289, "logits/rejected": -1.0141856670379639, "logps/chosen": -920.3916625976562, "logps/rejected": -924.4229736328125, "loss": 0.66, "num_input_tokens_seen": 6835712, "rewards/accuracies": 0.6640625, "rewards/chosen": 0.0031435242854058743, "rewards/margins": 0.07400324940681458, "rewards/rejected": -0.07085972279310226, "step": 19 }, { "epoch": 0.07661000718218818, "grad_norm": 1.0329723358154297, "learning_rate": 9.960886626674302e-06, "logits/chosen": -1.0163955688476562, "logits/rejected": -1.0183823108673096, "logps/chosen": -912.9906616210938, "logps/rejected": -900.5574951171875, "loss": 0.6575, "num_input_tokens_seen": 7183040, "rewards/accuracies": 0.6484375, "rewards/chosen": 0.028547190129756927, "rewards/margins": 0.07918091118335724, "rewards/rejected": -0.050633713603019714, "step": 20 }, { "epoch": 0.08044050754129758, "grad_norm": 0.9984006881713867, "learning_rate": 9.952685795051078e-06, "logits/chosen": -1.0189419984817505, "logits/rejected": -1.0193971395492554, "logps/chosen": -962.4049072265625, "logps/rejected": -968.8514404296875, "loss": 0.6635, "num_input_tokens_seen": 7552000, "rewards/accuracies": 0.6484375, "rewards/chosen": 0.0059728133492171764, "rewards/margins": 0.06682717800140381, "rewards/rejected": -0.060854364186525345, "step": 21 }, { "epoch": 0.084271007900407, "grad_norm": 0.9608243107795715, "learning_rate": 9.943709096197334e-06, "logits/chosen": -1.0346640348434448, "logits/rejected": -1.0270400047302246, "logps/chosen": -943.638427734375, "logps/rejected": -949.8948364257812, "loss": 0.6437, "num_input_tokens_seen": 7918784, "rewards/accuracies": 0.7109375, "rewards/chosen": 0.031516481190919876, "rewards/margins": 0.11247380077838898, "rewards/rejected": -0.0809573158621788, "step": 22 }, { "epoch": 0.0881015082595164, "grad_norm": 0.9745583534240723, "learning_rate": 9.933957936365515e-06, "logits/chosen": -1.0197912454605103, "logits/rejected": -1.0225633382797241, "logps/chosen": -924.9276733398438, "logps/rejected": -902.6036987304688, "loss": 0.6572, "num_input_tokens_seen": 8275392, "rewards/accuracies": 0.6953125, "rewards/chosen": 0.015636922791600227, "rewards/margins": 0.08563442528247833, "rewards/rejected": -0.06999750435352325, "step": 23 }, { "epoch": 0.09193200861862581, "grad_norm": 1.012017846107483, "learning_rate": 9.9234338431319e-06, "logits/chosen": -1.0310735702514648, "logits/rejected": -1.0211362838745117, "logps/chosen": -943.5421752929688, "logps/rejected": -943.68896484375, "loss": 0.6214, "num_input_tokens_seen": 8630976, "rewards/accuracies": 0.734375, "rewards/chosen": 0.049514107406139374, "rewards/margins": 0.16673865914344788, "rewards/rejected": -0.1172245517373085, "step": 24 }, { "epoch": 0.09576250897773522, "grad_norm": 1.0184485912322998, "learning_rate": 9.912138465157325e-06, "logits/chosen": -1.040213942527771, "logits/rejected": -1.0284395217895508, "logps/chosen": -944.1600952148438, "logps/rejected": -950.0694580078125, "loss": 0.6145, "num_input_tokens_seen": 8994496, "rewards/accuracies": 0.75, "rewards/chosen": 0.04350156709551811, "rewards/margins": 0.18440237641334534, "rewards/rejected": -0.14090080559253693, "step": 25 }, { "epoch": 0.09959300933684463, "grad_norm": 0.9871705174446106, "learning_rate": 9.900073571928887e-06, "logits/chosen": -1.0415468215942383, "logits/rejected": -1.02827787399292, "logps/chosen": -930.43798828125, "logps/rejected": -943.5054931640625, "loss": 0.6296, "num_input_tokens_seen": 9363648, "rewards/accuracies": 0.75, "rewards/chosen": 0.01750366762280464, "rewards/margins": 0.15038560330867767, "rewards/rejected": -0.13288193941116333, "step": 26 }, { "epoch": 0.10342350969595403, "grad_norm": 1.0247187614440918, "learning_rate": 9.887241053482756e-06, "logits/chosen": -1.0407733917236328, "logits/rejected": -1.0286006927490234, "logps/chosen": -901.0194702148438, "logps/rejected": -897.4530029296875, "loss": 0.6069, "num_input_tokens_seen": 9716544, "rewards/accuracies": 0.7578125, "rewards/chosen": 0.0669078379869461, "rewards/margins": 0.2108609676361084, "rewards/rejected": -0.1439531445503235, "step": 27 }, { "epoch": 0.10725401005506344, "grad_norm": 1.011189341545105, "learning_rate": 9.87364292010809e-06, "logits/chosen": -1.0111464262008667, "logits/rejected": -1.0069133043289185, "logps/chosen": -919.2281494140625, "logps/rejected": -911.4672241210938, "loss": 0.5903, "num_input_tokens_seen": 10062592, "rewards/accuracies": 0.8046875, "rewards/chosen": 0.10726039111614227, "rewards/margins": 0.2591054439544678, "rewards/rejected": -0.1518450379371643, "step": 28 }, { "epoch": 0.11108451041417285, "grad_norm": 0.9787644147872925, "learning_rate": 9.859281302032107e-06, "logits/chosen": -1.0300339460372925, "logits/rejected": -1.0144660472869873, "logps/chosen": -905.2968139648438, "logps/rejected": -912.3560791015625, "loss": 0.5922, "num_input_tokens_seen": 10416320, "rewards/accuracies": 0.8203125, "rewards/chosen": 0.07246110588312149, "rewards/margins": 0.2484126091003418, "rewards/rejected": -0.1759515106678009, "step": 29 }, { "epoch": 0.11491501077328226, "grad_norm": 1.0253512859344482, "learning_rate": 9.844158449086372e-06, "logits/chosen": -1.0572880506515503, "logits/rejected": -1.0502582788467407, "logps/chosen": -945.9561767578125, "logps/rejected": -944.3990478515625, "loss": 0.5985, "num_input_tokens_seen": 10785856, "rewards/accuracies": 0.796875, "rewards/chosen": 0.06250870227813721, "rewards/margins": 0.23862135410308838, "rewards/rejected": -0.17611265182495117, "step": 30 }, { "epoch": 0.11874551113239167, "grad_norm": 1.0238397121429443, "learning_rate": 9.828276730354353e-06, "logits/chosen": -1.076204776763916, "logits/rejected": -1.0662384033203125, "logps/chosen": -930.0040893554688, "logps/rejected": -932.3923950195312, "loss": 0.5576, "num_input_tokens_seen": 11150656, "rewards/accuracies": 0.859375, "rewards/chosen": 0.07753083854913712, "rewards/margins": 0.3478483259677887, "rewards/rejected": -0.2703174948692322, "step": 31 }, { "epoch": 0.12257601149150107, "grad_norm": 0.9527409672737122, "learning_rate": 9.811638633800287e-06, "logits/chosen": -1.0208882093429565, "logits/rejected": -1.0182850360870361, "logps/chosen": -911.5631103515625, "logps/rejected": -909.9766845703125, "loss": 0.5511, "num_input_tokens_seen": 11493568, "rewards/accuracies": 0.84375, "rewards/chosen": 0.11385183781385422, "rewards/margins": 0.3859643340110779, "rewards/rejected": -0.27211251854896545, "step": 32 }, { "epoch": 0.1264065118506105, "grad_norm": 1.0301215648651123, "learning_rate": 9.794246765879421e-06, "logits/chosen": -1.041532278060913, "logits/rejected": -1.0406360626220703, "logps/chosen": -962.494873046875, "logps/rejected": -962.85986328125, "loss": 0.5723, "num_input_tokens_seen": 11861824, "rewards/accuracies": 0.828125, "rewards/chosen": 0.08621297031641006, "rewards/margins": 0.34169405698776245, "rewards/rejected": -0.25548112392425537, "step": 33 }, { "epoch": 0.1302370122097199, "grad_norm": 0.9654497504234314, "learning_rate": 9.776103851129706e-06, "logits/chosen": -1.0220344066619873, "logits/rejected": -1.015979528427124, "logps/chosen": -949.3598022460938, "logps/rejected": -960.7059326171875, "loss": 0.5751, "num_input_tokens_seen": 12229824, "rewards/accuracies": 0.828125, "rewards/chosen": 0.09405691176652908, "rewards/margins": 0.32782119512557983, "rewards/rejected": -0.23376427590847015, "step": 34 }, { "epoch": 0.13406751256882932, "grad_norm": 1.038521647453308, "learning_rate": 9.757212731744973e-06, "logits/chosen": -1.076894998550415, "logits/rejected": -1.0720102787017822, "logps/chosen": -925.4364013671875, "logps/rejected": -950.203369140625, "loss": 0.5121, "num_input_tokens_seen": 12610880, "rewards/accuracies": 0.8984375, "rewards/chosen": 0.059632543474435806, "rewards/margins": 0.48721498250961304, "rewards/rejected": -0.42758244276046753, "step": 35 }, { "epoch": 0.1378980129279387, "grad_norm": 1.0083974599838257, "learning_rate": 9.737576367129694e-06, "logits/chosen": -1.034104585647583, "logits/rejected": -1.0210139751434326, "logps/chosen": -871.239013671875, "logps/rejected": -879.6558837890625, "loss": 0.536, "num_input_tokens_seen": 12958592, "rewards/accuracies": 0.796875, "rewards/chosen": 0.1341094672679901, "rewards/margins": 0.45023247599601746, "rewards/rejected": -0.31612300872802734, "step": 36 }, { "epoch": 0.1417285132870481, "grad_norm": 1.0032159090042114, "learning_rate": 9.717197833435367e-06, "logits/chosen": -1.0570846796035767, "logits/rejected": -1.049383521080017, "logps/chosen": -875.153076171875, "logps/rejected": -868.0505981445312, "loss": 0.5294, "num_input_tokens_seen": 13310336, "rewards/accuracies": 0.8203125, "rewards/chosen": 0.11251354217529297, "rewards/margins": 0.4998120069503784, "rewards/rejected": -0.38729846477508545, "step": 37 }, { "epoch": 0.14555901364615753, "grad_norm": 0.973970353603363, "learning_rate": 9.696080323078621e-06, "logits/chosen": -1.0454506874084473, "logits/rejected": -1.0396976470947266, "logps/chosen": -945.59521484375, "logps/rejected": -946.391845703125, "loss": 0.5191, "num_input_tokens_seen": 13681920, "rewards/accuracies": 0.8046875, "rewards/chosen": 0.1408233493566513, "rewards/margins": 0.5279620885848999, "rewards/rejected": -0.3871387839317322, "step": 38 }, { "epoch": 0.14938951400526693, "grad_norm": 0.9903539419174194, "learning_rate": 9.67422714424111e-06, "logits/chosen": -1.0471606254577637, "logits/rejected": -1.029019832611084, "logps/chosen": -898.5994262695312, "logps/rejected": -893.9972534179688, "loss": 0.5371, "num_input_tokens_seen": 14033792, "rewards/accuracies": 0.8515625, "rewards/chosen": 0.05680917203426361, "rewards/margins": 0.4695061147212982, "rewards/rejected": -0.4126969575881958, "step": 39 }, { "epoch": 0.15322001436437635, "grad_norm": 0.9689764976501465, "learning_rate": 9.651641720351262e-06, "logits/chosen": -1.0580934286117554, "logits/rejected": -1.065984845161438, "logps/chosen": -908.4832763671875, "logps/rejected": -895.956298828125, "loss": 0.49, "num_input_tokens_seen": 14392064, "rewards/accuracies": 0.8828125, "rewards/chosen": 0.17145198583602905, "rewards/margins": 0.5947294235229492, "rewards/rejected": -0.42327749729156494, "step": 40 }, { "epoch": 0.15705051472348575, "grad_norm": 0.9584202170372009, "learning_rate": 9.628327589547977e-06, "logits/chosen": -1.0466926097869873, "logits/rejected": -1.0385541915893555, "logps/chosen": -913.1895141601562, "logps/rejected": -910.8951416015625, "loss": 0.4843, "num_input_tokens_seen": 14750144, "rewards/accuracies": 0.8828125, "rewards/chosen": 0.12090866267681122, "rewards/margins": 0.5805420875549316, "rewards/rejected": -0.45963340997695923, "step": 41 }, { "epoch": 0.16088101508259517, "grad_norm": 0.9737739562988281, "learning_rate": 9.604288404126362e-06, "logits/chosen": -1.0408737659454346, "logits/rejected": -1.0337262153625488, "logps/chosen": -947.9314575195312, "logps/rejected": -940.6116943359375, "loss": 0.4864, "num_input_tokens_seen": 15115648, "rewards/accuracies": 0.9140625, "rewards/chosen": 0.10129190981388092, "rewards/margins": 0.609031081199646, "rewards/rejected": -0.5077391862869263, "step": 42 }, { "epoch": 0.16471151544170456, "grad_norm": 1.0550665855407715, "learning_rate": 9.579527929965581e-06, "logits/chosen": -1.0519779920578003, "logits/rejected": -1.0420559644699097, "logps/chosen": -938.4093017578125, "logps/rejected": -936.85498046875, "loss": 0.4906, "num_input_tokens_seen": 15473984, "rewards/accuracies": 0.890625, "rewards/chosen": 0.12262112647294998, "rewards/margins": 0.6415243148803711, "rewards/rejected": -0.5189031958580017, "step": 43 }, { "epoch": 0.168542015800814, "grad_norm": 0.951257050037384, "learning_rate": 9.554050045938893e-06, "logits/chosen": -1.0564297437667847, "logits/rejected": -1.036995530128479, "logps/chosen": -922.6273803710938, "logps/rejected": -950.3002319335938, "loss": 0.4451, "num_input_tokens_seen": 15818304, "rewards/accuracies": 0.8671875, "rewards/chosen": 0.0952005386352539, "rewards/margins": 0.8160958886146545, "rewards/rejected": -0.7208954095840454, "step": 44 }, { "epoch": 0.17237251615992338, "grad_norm": 0.8960257172584534, "learning_rate": 9.52785874330602e-06, "logits/chosen": -1.0616629123687744, "logits/rejected": -1.059366226196289, "logps/chosen": -919.501220703125, "logps/rejected": -901.478759765625, "loss": 0.448, "num_input_tokens_seen": 16173312, "rewards/accuracies": 0.875, "rewards/chosen": 0.227450430393219, "rewards/margins": 0.8265661001205444, "rewards/rejected": -0.5991157293319702, "step": 45 }, { "epoch": 0.1762030165190328, "grad_norm": 0.8991801738739014, "learning_rate": 9.500958125087882e-06, "logits/chosen": -1.0398144721984863, "logits/rejected": -1.0337408781051636, "logps/chosen": -921.608154296875, "logps/rejected": -926.8184814453125, "loss": 0.4443, "num_input_tokens_seen": 16542400, "rewards/accuracies": 0.8671875, "rewards/chosen": 0.1689867377281189, "rewards/margins": 0.7900327444076538, "rewards/rejected": -0.6210460066795349, "step": 46 }, { "epoch": 0.1800335168781422, "grad_norm": 0.9368561506271362, "learning_rate": 9.473352405423845e-06, "logits/chosen": -1.0370045900344849, "logits/rejected": -1.034510612487793, "logps/chosen": -927.1217041015625, "logps/rejected": -940.380615234375, "loss": 0.4219, "num_input_tokens_seen": 16906368, "rewards/accuracies": 0.921875, "rewards/chosen": 0.12228920310735703, "rewards/margins": 0.91481614112854, "rewards/rejected": -0.7925269603729248, "step": 47 }, { "epoch": 0.18386401723725163, "grad_norm": 0.8737442493438721, "learning_rate": 9.445045908911536e-06, "logits/chosen": -1.0180749893188477, "logits/rejected": -1.0195181369781494, "logps/chosen": -914.4266357421875, "logps/rejected": -919.2938842773438, "loss": 0.4229, "num_input_tokens_seen": 17244160, "rewards/accuracies": 0.8828125, "rewards/chosen": 0.22764116525650024, "rewards/margins": 0.9235501289367676, "rewards/rejected": -0.6959089040756226, "step": 48 }, { "epoch": 0.18769451759636102, "grad_norm": 0.9246760606765747, "learning_rate": 9.416043069929389e-06, "logits/chosen": -1.0696072578430176, "logits/rejected": -1.0749962329864502, "logps/chosen": -915.3636474609375, "logps/rejected": -915.0205688476562, "loss": 0.421, "num_input_tokens_seen": 17605632, "rewards/accuracies": 0.8828125, "rewards/chosen": 0.03486424311995506, "rewards/margins": 0.9285318851470947, "rewards/rejected": -0.8936675786972046, "step": 49 }, { "epoch": 0.19152501795547044, "grad_norm": 0.8932781219482422, "learning_rate": 9.386348431941953e-06, "logits/chosen": -1.030189871788025, "logits/rejected": -1.0274410247802734, "logps/chosen": -927.2352905273438, "logps/rejected": -913.6710205078125, "loss": 0.4248, "num_input_tokens_seen": 17966784, "rewards/accuracies": 0.890625, "rewards/chosen": 0.15712372958660126, "rewards/margins": 0.8464722037315369, "rewards/rejected": -0.6893484592437744, "step": 50 }, { "epoch": 0.19535551831457984, "grad_norm": 0.8208088278770447, "learning_rate": 9.355966646788152e-06, "logits/chosen": -1.0399882793426514, "logits/rejected": -1.049922227859497, "logps/chosen": -922.302490234375, "logps/rejected": -926.0870971679688, "loss": 0.3664, "num_input_tokens_seen": 18328768, "rewards/accuracies": 0.8984375, "rewards/chosen": 0.2352968007326126, "rewards/margins": 1.2056996822357178, "rewards/rejected": -0.9704028367996216, "step": 51 }, { "epoch": 0.19918601867368926, "grad_norm": 0.9681217670440674, "learning_rate": 9.324902473952529e-06, "logits/chosen": -1.0520392656326294, "logits/rejected": -1.0435035228729248, "logps/chosen": -912.5722045898438, "logps/rejected": -910.1470947265625, "loss": 0.427, "num_input_tokens_seen": 18682304, "rewards/accuracies": 0.859375, "rewards/chosen": 0.007225107401609421, "rewards/margins": 0.9377115368843079, "rewards/rejected": -0.9304863810539246, "step": 52 }, { "epoch": 0.20301651903279866, "grad_norm": 0.8848323225975037, "learning_rate": 9.293160779819658e-06, "logits/chosen": -1.0435223579406738, "logits/rejected": -1.0460057258605957, "logps/chosen": -913.4359741210938, "logps/rejected": -926.9989624023438, "loss": 0.3614, "num_input_tokens_seen": 19037760, "rewards/accuracies": 0.890625, "rewards/chosen": 0.2528911828994751, "rewards/margins": 1.2186728715896606, "rewards/rejected": -0.9657817482948303, "step": 53 }, { "epoch": 0.20684701939190805, "grad_norm": 0.8267402052879333, "learning_rate": 9.260746536911792e-06, "logits/chosen": -1.0281553268432617, "logits/rejected": -1.0281143188476562, "logps/chosen": -927.7498779296875, "logps/rejected": -912.3658447265625, "loss": 0.3549, "num_input_tokens_seen": 19394688, "rewards/accuracies": 0.9140625, "rewards/chosen": 0.14229370653629303, "rewards/margins": 1.2576112747192383, "rewards/rejected": -1.1153175830841064, "step": 54 }, { "epoch": 0.21067751975101748, "grad_norm": 0.8928641080856323, "learning_rate": 9.227664823109884e-06, "logits/chosen": -1.0537580251693726, "logits/rejected": -1.0493597984313965, "logps/chosen": -946.2432250976562, "logps/rejected": -960.2678833007812, "loss": 0.3531, "num_input_tokens_seen": 19760704, "rewards/accuracies": 0.890625, "rewards/chosen": 0.15718036890029907, "rewards/margins": 1.3443400859832764, "rewards/rejected": -1.187159776687622, "step": 55 }, { "epoch": 0.21450802011012687, "grad_norm": 0.8100174069404602, "learning_rate": 9.193920820858113e-06, "logits/chosen": -1.0525925159454346, "logits/rejected": -1.0595781803131104, "logps/chosen": -925.456298828125, "logps/rejected": -904.4075927734375, "loss": 0.3455, "num_input_tokens_seen": 20121856, "rewards/accuracies": 0.9140625, "rewards/chosen": 0.1917523741722107, "rewards/margins": 1.3067724704742432, "rewards/rejected": -1.1150201559066772, "step": 56 }, { "epoch": 0.2183385204692363, "grad_norm": 0.7760931253433228, "learning_rate": 9.159519816352021e-06, "logits/chosen": -1.0448260307312012, "logits/rejected": -1.0453829765319824, "logps/chosen": -930.4881591796875, "logps/rejected": -940.2239990234375, "loss": 0.3191, "num_input_tokens_seen": 20482304, "rewards/accuracies": 0.9296875, "rewards/chosen": 0.07732844352722168, "rewards/margins": 1.3429336547851562, "rewards/rejected": -1.2656052112579346, "step": 57 }, { "epoch": 0.2221690208283457, "grad_norm": 0.7795583605766296, "learning_rate": 9.124467198710401e-06, "logits/chosen": -1.0545520782470703, "logits/rejected": -1.0434613227844238, "logps/chosen": -903.4138793945312, "logps/rejected": -914.3572998046875, "loss": 0.3171, "num_input_tokens_seen": 20822656, "rewards/accuracies": 0.9296875, "rewards/chosen": -0.00938415713608265, "rewards/margins": 1.46409273147583, "rewards/rejected": -1.4734768867492676, "step": 58 }, { "epoch": 0.22599952118745512, "grad_norm": 0.7819092273712158, "learning_rate": 9.08876845913106e-06, "logits/chosen": -1.065345287322998, "logits/rejected": -1.071925163269043, "logps/chosen": -912.1345825195312, "logps/rejected": -898.795166015625, "loss": 0.3078, "num_input_tokens_seen": 21187392, "rewards/accuracies": 0.9609375, "rewards/chosen": 0.2630062997341156, "rewards/margins": 1.5684618949890137, "rewards/rejected": -1.3054554462432861, "step": 59 }, { "epoch": 0.2298300215465645, "grad_norm": 0.810053288936615, "learning_rate": 9.052429190030589e-06, "logits/chosen": -1.0523853302001953, "logits/rejected": -1.0394606590270996, "logps/chosen": -940.7644653320312, "logps/rejected": -968.588134765625, "loss": 0.3253, "num_input_tokens_seen": 21545600, "rewards/accuracies": 0.90625, "rewards/chosen": 0.11970774829387665, "rewards/margins": 1.4579362869262695, "rewards/rejected": -1.3382285833358765, "step": 60 }, { "epoch": 0.23366052190567393, "grad_norm": 0.8415746092796326, "learning_rate": 9.015455084168279e-06, "logits/chosen": -1.0666999816894531, "logits/rejected": -1.0684211254119873, "logps/chosen": -869.7279052734375, "logps/rejected": -885.7716064453125, "loss": 0.3212, "num_input_tokens_seen": 21887936, "rewards/accuracies": 0.9140625, "rewards/chosen": 0.0877489447593689, "rewards/margins": 1.599064588546753, "rewards/rejected": -1.5113155841827393, "step": 61 }, { "epoch": 0.23749102226478333, "grad_norm": 0.8196543455123901, "learning_rate": 8.977851933754317e-06, "logits/chosen": -1.0270411968231201, "logits/rejected": -1.0330016613006592, "logps/chosen": -931.991455078125, "logps/rejected": -925.2451171875, "loss": 0.3188, "num_input_tokens_seen": 22233024, "rewards/accuracies": 0.9609375, "rewards/chosen": 0.1586177945137024, "rewards/margins": 1.4711058139801025, "rewards/rejected": -1.312488079071045, "step": 62 }, { "epoch": 0.24132152262389275, "grad_norm": 0.7564411759376526, "learning_rate": 8.939625629542401e-06, "logits/chosen": -1.0505956411361694, "logits/rejected": -1.0580633878707886, "logps/chosen": -902.9009399414062, "logps/rejected": -906.3607177734375, "loss": 0.2957, "num_input_tokens_seen": 22585472, "rewards/accuracies": 0.9140625, "rewards/chosen": 0.10947957634925842, "rewards/margins": 1.675947666168213, "rewards/rejected": -1.5664680004119873, "step": 63 }, { "epoch": 0.24515202298300215, "grad_norm": 0.805608868598938, "learning_rate": 8.900782159906927e-06, "logits/chosen": -1.0741575956344604, "logits/rejected": -1.0680464506149292, "logps/chosen": -903.8414306640625, "logps/rejected": -929.549560546875, "loss": 0.2977, "num_input_tokens_seen": 22946880, "rewards/accuracies": 0.890625, "rewards/chosen": 0.1019095927476883, "rewards/margins": 1.7661054134368896, "rewards/rejected": -1.6641957759857178, "step": 64 }, { "epoch": 0.24898252334211157, "grad_norm": 0.733465313911438, "learning_rate": 8.861327609904859e-06, "logits/chosen": -1.0529414415359497, "logits/rejected": -1.048985481262207, "logps/chosen": -935.5690307617188, "logps/rejected": -958.1524658203125, "loss": 0.2786, "num_input_tokens_seen": 23304320, "rewards/accuracies": 0.921875, "rewards/chosen": 0.12524667382240295, "rewards/margins": 1.8969340324401855, "rewards/rejected": -1.7716875076293945, "step": 65 }, { "epoch": 0.252813023701221, "grad_norm": 0.8141180872917175, "learning_rate": 8.821268160322482e-06, "logits/chosen": -1.0694968700408936, "logits/rejected": -1.0694319009780884, "logps/chosen": -927.3983154296875, "logps/rejected": -939.122314453125, "loss": 0.2977, "num_input_tokens_seen": 23665536, "rewards/accuracies": 0.859375, "rewards/chosen": 0.02337331511080265, "rewards/margins": 1.768088936805725, "rewards/rejected": -1.7447155714035034, "step": 66 }, { "epoch": 0.25664352406033036, "grad_norm": 0.7073566317558289, "learning_rate": 8.780610086707149e-06, "logits/chosen": -1.0607355833053589, "logits/rejected": -1.0568872690200806, "logps/chosen": -948.0847778320312, "logps/rejected": -965.6436767578125, "loss": 0.2613, "num_input_tokens_seen": 24021376, "rewards/accuracies": 0.9296875, "rewards/chosen": -0.10131228715181351, "rewards/margins": 1.7949880361557007, "rewards/rejected": -1.8963003158569336, "step": 67 }, { "epoch": 0.2604740244194398, "grad_norm": 0.7342793941497803, "learning_rate": 8.739359758384162e-06, "logits/chosen": -1.0782382488250732, "logits/rejected": -1.077387809753418, "logps/chosen": -932.740478515625, "logps/rejected": -940.6168823242188, "loss": 0.2722, "num_input_tokens_seen": 24385216, "rewards/accuracies": 0.9296875, "rewards/chosen": -0.0452754907310009, "rewards/margins": 1.871610164642334, "rewards/rejected": -1.916885495185852, "step": 68 }, { "epoch": 0.2643045247785492, "grad_norm": 0.812954306602478, "learning_rate": 8.697523637458997e-06, "logits/chosen": -1.063349962234497, "logits/rejected": -1.0582802295684814, "logps/chosen": -923.201171875, "logps/rejected": -935.93603515625, "loss": 0.2872, "num_input_tokens_seen": 24750080, "rewards/accuracies": 0.90625, "rewards/chosen": 0.011177586391568184, "rewards/margins": 1.758663296699524, "rewards/rejected": -1.747485637664795, "step": 69 }, { "epoch": 0.26813502513765863, "grad_norm": 0.7448921203613281, "learning_rate": 8.655108277804975e-06, "logits/chosen": -1.0851333141326904, "logits/rejected": -1.079848289489746, "logps/chosen": -936.8907470703125, "logps/rejected": -967.2071533203125, "loss": 0.272, "num_input_tokens_seen": 25116480, "rewards/accuracies": 0.9140625, "rewards/chosen": -0.16820263862609863, "rewards/margins": 1.8837871551513672, "rewards/rejected": -2.051989793777466, "step": 70 }, { "epoch": 0.271965525496768, "grad_norm": 0.6441535353660583, "learning_rate": 8.612120324036548e-06, "logits/chosen": -1.0761862993240356, "logits/rejected": -1.0810930728912354, "logps/chosen": -910.4297485351562, "logps/rejected": -922.0003662109375, "loss": 0.2075, "num_input_tokens_seen": 25465536, "rewards/accuracies": 0.9609375, "rewards/chosen": -0.0027351118624210358, "rewards/margins": 2.2052128314971924, "rewards/rejected": -2.2079479694366455, "step": 71 }, { "epoch": 0.2757960258558774, "grad_norm": 0.7206379771232605, "learning_rate": 8.568566510468392e-06, "logits/chosen": -1.0868092775344849, "logits/rejected": -1.0811575651168823, "logps/chosen": -941.665771484375, "logps/rejected": -973.3070068359375, "loss": 0.2464, "num_input_tokens_seen": 25833344, "rewards/accuracies": 0.9375, "rewards/chosen": -0.2859920263290405, "rewards/margins": 2.0870699882507324, "rewards/rejected": -2.3730621337890625, "step": 72 }, { "epoch": 0.27962652621498685, "grad_norm": 0.8260279297828674, "learning_rate": 8.524453660060434e-06, "logits/chosen": -1.0668997764587402, "logits/rejected": -1.06461763381958, "logps/chosen": -894.1995849609375, "logps/rejected": -909.6748657226562, "loss": 0.2724, "num_input_tokens_seen": 26183872, "rewards/accuracies": 0.921875, "rewards/chosen": -0.35972511768341064, "rewards/margins": 1.8394525051116943, "rewards/rejected": -2.1991777420043945, "step": 73 }, { "epoch": 0.2834570265740962, "grad_norm": 0.8776343464851379, "learning_rate": 8.479788683348996e-06, "logits/chosen": -1.0618414878845215, "logits/rejected": -1.0610969066619873, "logps/chosen": -920.0272216796875, "logps/rejected": -902.9821166992188, "loss": 0.2695, "num_input_tokens_seen": 26539904, "rewards/accuracies": 0.8984375, "rewards/chosen": -0.3319213390350342, "rewards/margins": 1.9274811744689941, "rewards/rejected": -2.259402275085449, "step": 74 }, { "epoch": 0.28728752693320564, "grad_norm": 0.7757173180580139, "learning_rate": 8.434578577364218e-06, "logits/chosen": -1.0758130550384521, "logits/rejected": -1.0636892318725586, "logps/chosen": -931.8839111328125, "logps/rejected": -947.8310546875, "loss": 0.266, "num_input_tokens_seen": 26902464, "rewards/accuracies": 0.9140625, "rewards/chosen": -0.3294548988342285, "rewards/margins": 2.0559473037719727, "rewards/rejected": -2.385402202606201, "step": 75 }, { "epoch": 0.29111802729231506, "grad_norm": 0.7610207796096802, "learning_rate": 8.388830424533935e-06, "logits/chosen": -1.078747272491455, "logits/rejected": -1.0726932287216187, "logps/chosen": -928.5173950195312, "logps/rejected": -959.848876953125, "loss": 0.2346, "num_input_tokens_seen": 27269504, "rewards/accuracies": 0.9296875, "rewards/chosen": -0.27425873279571533, "rewards/margins": 2.1390914916992188, "rewards/rejected": -2.4133503437042236, "step": 76 }, { "epoch": 0.2949485276514245, "grad_norm": 0.7980623841285706, "learning_rate": 8.342551391574165e-06, "logits/chosen": -1.0808157920837402, "logits/rejected": -1.0889337062835693, "logps/chosen": -942.5945434570312, "logps/rejected": -927.933349609375, "loss": 0.263, "num_input_tokens_seen": 27629248, "rewards/accuracies": 0.9296875, "rewards/chosen": -0.3413712978363037, "rewards/margins": 2.096808433532715, "rewards/rejected": -2.4381794929504395, "step": 77 }, { "epoch": 0.29877902801053385, "grad_norm": 0.7272222638130188, "learning_rate": 8.295748728366414e-06, "logits/chosen": -1.0566456317901611, "logits/rejected": -1.051896333694458, "logps/chosen": -941.3970947265625, "logps/rejected": -983.9376220703125, "loss": 0.2319, "num_input_tokens_seen": 27988096, "rewards/accuracies": 0.921875, "rewards/chosen": -0.41651400923728943, "rewards/margins": 2.1791977882385254, "rewards/rejected": -2.5957119464874268, "step": 78 }, { "epoch": 0.3026095283696433, "grad_norm": 0.7870813608169556, "learning_rate": 8.248429766821925e-06, "logits/chosen": -1.0612484216690063, "logits/rejected": -1.0511322021484375, "logps/chosen": -965.112548828125, "logps/rejected": -980.8787231445312, "loss": 0.2636, "num_input_tokens_seen": 28346432, "rewards/accuracies": 0.9296875, "rewards/chosen": -0.5220478177070618, "rewards/margins": 2.1551966667175293, "rewards/rejected": -2.6772446632385254, "step": 79 }, { "epoch": 0.3064400287287527, "grad_norm": 0.8286476731300354, "learning_rate": 8.200601919733106e-06, "logits/chosen": -1.084940791130066, "logits/rejected": -1.0818970203399658, "logps/chosen": -959.9788818359375, "logps/rejected": -981.2055053710938, "loss": 0.2616, "num_input_tokens_seen": 28715200, "rewards/accuracies": 0.8984375, "rewards/chosen": -0.49848473072052, "rewards/margins": 2.191138744354248, "rewards/rejected": -2.6896233558654785, "step": 80 }, { "epoch": 0.3102705290878621, "grad_norm": 0.7395047545433044, "learning_rate": 8.15227267961226e-06, "logits/chosen": -1.071365237236023, "logits/rejected": -1.0694466829299927, "logps/chosen": -950.6376342773438, "logps/rejected": -960.358154296875, "loss": 0.2285, "num_input_tokens_seen": 29068800, "rewards/accuracies": 0.921875, "rewards/chosen": -0.37492138147354126, "rewards/margins": 2.5624656677246094, "rewards/rejected": -2.937386989593506, "step": 81 }, { "epoch": 0.3141010294469715, "grad_norm": 0.8469851613044739, "learning_rate": 8.10344961751785e-06, "logits/chosen": -1.0703625679016113, "logits/rejected": -1.0763499736785889, "logps/chosen": -921.09326171875, "logps/rejected": -924.1490478515625, "loss": 0.257, "num_input_tokens_seen": 29416000, "rewards/accuracies": 0.875, "rewards/chosen": -0.47378402948379517, "rewards/margins": 2.3074848651885986, "rewards/rejected": -2.781269073486328, "step": 82 }, { "epoch": 0.3179315298060809, "grad_norm": 0.7990094423294067, "learning_rate": 8.054140381868435e-06, "logits/chosen": -1.1002495288848877, "logits/rejected": -1.086364984512329, "logps/chosen": -932.9931640625, "logps/rejected": -955.93017578125, "loss": 0.2235, "num_input_tokens_seen": 29768704, "rewards/accuracies": 0.9453125, "rewards/chosen": -0.4593415856361389, "rewards/margins": 2.4236037731170654, "rewards/rejected": -2.8829455375671387, "step": 83 }, { "epoch": 0.32176203016519034, "grad_norm": 0.8141796588897705, "learning_rate": 8.004352697244516e-06, "logits/chosen": -1.0791590213775635, "logits/rejected": -1.07072114944458, "logps/chosen": -971.5861206054688, "logps/rejected": -981.2337646484375, "loss": 0.2401, "num_input_tokens_seen": 30143360, "rewards/accuracies": 0.9296875, "rewards/chosen": -0.5980307459831238, "rewards/margins": 2.2348849773406982, "rewards/rejected": -2.8329155445098877, "step": 84 }, { "epoch": 0.32559253052429976, "grad_norm": 0.6741338968276978, "learning_rate": 7.954094363178421e-06, "logits/chosen": -1.0647037029266357, "logits/rejected": -1.053682804107666, "logps/chosen": -924.1261596679688, "logps/rejected": -958.7186889648438, "loss": 0.2107, "num_input_tokens_seen": 30496832, "rewards/accuracies": 0.9453125, "rewards/chosen": -0.48919373750686646, "rewards/margins": 2.4839816093444824, "rewards/rejected": -2.973175525665283, "step": 85 }, { "epoch": 0.32942303088340913, "grad_norm": 0.7606660723686218, "learning_rate": 7.903373252932474e-06, "logits/chosen": -1.090849757194519, "logits/rejected": -1.0853286981582642, "logps/chosen": -919.6220092773438, "logps/rejected": -942.2991943359375, "loss": 0.2237, "num_input_tokens_seen": 30854400, "rewards/accuracies": 0.90625, "rewards/chosen": -0.6518514156341553, "rewards/margins": 2.4913079738616943, "rewards/rejected": -3.1431593894958496, "step": 86 }, { "epoch": 0.33325353124251855, "grad_norm": 0.7608671188354492, "learning_rate": 7.852197312265592e-06, "logits/chosen": -1.0895733833312988, "logits/rejected": -1.0881158113479614, "logps/chosen": -909.59716796875, "logps/rejected": -909.6011962890625, "loss": 0.2435, "num_input_tokens_seen": 31212288, "rewards/accuracies": 0.9140625, "rewards/chosen": -0.43335431814193726, "rewards/margins": 2.310985565185547, "rewards/rejected": -2.744339942932129, "step": 87 }, { "epoch": 0.337084031601628, "grad_norm": 0.8353620171546936, "learning_rate": 7.800574558188548e-06, "logits/chosen": -1.090893030166626, "logits/rejected": -1.0922658443450928, "logps/chosen": -939.388427734375, "logps/rejected": -945.8134765625, "loss": 0.2273, "num_input_tokens_seen": 31576000, "rewards/accuracies": 0.921875, "rewards/chosen": -0.6613324284553528, "rewards/margins": 2.4176292419433594, "rewards/rejected": -3.0789618492126465, "step": 88 }, { "epoch": 0.34091453196073734, "grad_norm": 0.8224096298217773, "learning_rate": 7.748513077708044e-06, "logits/chosen": -1.0749074220657349, "logits/rejected": -1.0710772275924683, "logps/chosen": -918.1564331054688, "logps/rejected": -931.9786376953125, "loss": 0.2568, "num_input_tokens_seen": 31930560, "rewards/accuracies": 0.90625, "rewards/chosen": -0.7582589387893677, "rewards/margins": 2.1596531867980957, "rewards/rejected": -2.917912006378174, "step": 89 }, { "epoch": 0.34474503231984677, "grad_norm": 0.6878222227096558, "learning_rate": 7.69602102655985e-06, "logits/chosen": -1.1127848625183105, "logits/rejected": -1.113338589668274, "logps/chosen": -934.9884643554688, "logps/rejected": -962.1505126953125, "loss": 0.1811, "num_input_tokens_seen": 32295488, "rewards/accuracies": 0.9453125, "rewards/chosen": -0.605439305305481, "rewards/margins": 2.958923816680908, "rewards/rejected": -3.5643632411956787, "step": 90 }, { "epoch": 0.3485755326789562, "grad_norm": 0.6399231553077698, "learning_rate": 7.643106627931148e-06, "logits/chosen": -1.1128592491149902, "logits/rejected": -1.107898473739624, "logps/chosen": -984.710205078125, "logps/rejected": -1000.18701171875, "loss": 0.1776, "num_input_tokens_seen": 32673984, "rewards/accuracies": 0.9609375, "rewards/chosen": -0.5592378377914429, "rewards/margins": 2.7699828147888184, "rewards/rejected": -3.3292205333709717, "step": 91 }, { "epoch": 0.3524060330380656, "grad_norm": 0.8195556998252869, "learning_rate": 7.5897781711723215e-06, "logits/chosen": -1.097868800163269, "logits/rejected": -1.0952892303466797, "logps/chosen": -920.0769653320312, "logps/rejected": -936.2384033203125, "loss": 0.2421, "num_input_tokens_seen": 33035968, "rewards/accuracies": 0.9140625, "rewards/chosen": -0.6490311622619629, "rewards/margins": 2.5610485076904297, "rewards/rejected": -3.2100796699523926, "step": 92 }, { "epoch": 0.356236533397175, "grad_norm": 0.7686013579368591, "learning_rate": 7.536044010498396e-06, "logits/chosen": -1.095731258392334, "logits/rejected": -1.1059951782226562, "logps/chosen": -896.857421875, "logps/rejected": -911.7064208984375, "loss": 0.2218, "num_input_tokens_seen": 33392192, "rewards/accuracies": 0.921875, "rewards/chosen": -0.6852568984031677, "rewards/margins": 2.4506115913391113, "rewards/rejected": -3.1358680725097656, "step": 93 }, { "epoch": 0.3600670337562844, "grad_norm": 0.6556101441383362, "learning_rate": 7.48191256368028e-06, "logits/chosen": -1.1224572658538818, "logits/rejected": -1.114153265953064, "logps/chosen": -952.65966796875, "logps/rejected": -986.4073486328125, "loss": 0.1707, "num_input_tokens_seen": 33758976, "rewards/accuracies": 0.9375, "rewards/chosen": -0.6452797651290894, "rewards/margins": 3.0503456592559814, "rewards/rejected": -3.6956255435943604, "step": 94 }, { "epoch": 0.36389753411539383, "grad_norm": 0.7410134673118591, "learning_rate": 7.427392310726088e-06, "logits/chosen": -1.1127829551696777, "logits/rejected": -1.098555326461792, "logps/chosen": -939.870849609375, "logps/rejected": -954.7718505859375, "loss": 0.2029, "num_input_tokens_seen": 34123840, "rewards/accuracies": 0.9140625, "rewards/chosen": -0.7152698040008545, "rewards/margins": 2.7910680770874023, "rewards/rejected": -3.5063376426696777, "step": 95 }, { "epoch": 0.36772803447450325, "grad_norm": 0.6477031707763672, "learning_rate": 7.372491792552694e-06, "logits/chosen": -1.1184864044189453, "logits/rejected": -1.110473394393921, "logps/chosen": -943.9148559570312, "logps/rejected": -957.20068359375, "loss": 0.1596, "num_input_tokens_seen": 34479552, "rewards/accuracies": 0.9765625, "rewards/chosen": -0.680740237236023, "rewards/margins": 2.777618885040283, "rewards/rejected": -3.4583587646484375, "step": 96 }, { "epoch": 0.3715585348336126, "grad_norm": 0.6712774634361267, "learning_rate": 7.31721960964774e-06, "logits/chosen": -1.0910961627960205, "logits/rejected": -1.0890021324157715, "logps/chosen": -918.433349609375, "logps/rejected": -942.1868896484375, "loss": 0.1881, "num_input_tokens_seen": 34835840, "rewards/accuracies": 0.9453125, "rewards/chosen": -0.40874814987182617, "rewards/margins": 2.9342246055603027, "rewards/rejected": -3.342972755432129, "step": 97 }, { "epoch": 0.37538903519272204, "grad_norm": 0.8931744694709778, "learning_rate": 7.261584420722328e-06, "logits/chosen": -1.111809253692627, "logits/rejected": -1.117790699005127, "logps/chosen": -947.8694458007812, "logps/rejected": -973.1971435546875, "loss": 0.2405, "num_input_tokens_seen": 35210176, "rewards/accuracies": 0.9296875, "rewards/chosen": -0.7285137176513672, "rewards/margins": 2.551473617553711, "rewards/rejected": -3.279987335205078, "step": 98 }, { "epoch": 0.37921953555183147, "grad_norm": 0.722402811050415, "learning_rate": 7.20559494135458e-06, "logits/chosen": -1.0994579792022705, "logits/rejected": -1.1112189292907715, "logps/chosen": -933.018798828125, "logps/rejected": -954.1309814453125, "loss": 0.195, "num_input_tokens_seen": 35568576, "rewards/accuracies": 0.953125, "rewards/chosen": -0.6893086433410645, "rewards/margins": 2.9537758827209473, "rewards/rejected": -3.643084764480591, "step": 99 }, { "epoch": 0.3830500359109409, "grad_norm": 0.7066813707351685, "learning_rate": 7.149259942624287e-06, "logits/chosen": -1.118371605873108, "logits/rejected": -1.1083605289459229, "logps/chosen": -933.5869750976562, "logps/rejected": -955.2203369140625, "loss": 0.186, "num_input_tokens_seen": 35929024, "rewards/accuracies": 0.9375, "rewards/chosen": -0.6603685617446899, "rewards/margins": 2.6682770252227783, "rewards/rejected": -3.328645706176758, "step": 100 }, { "epoch": 0.38688053627005026, "grad_norm": 0.7495232224464417, "learning_rate": 7.092588249738871e-06, "logits/chosen": -1.1121894121170044, "logits/rejected": -1.1247451305389404, "logps/chosen": -929.92431640625, "logps/rejected": -921.3439331054688, "loss": 0.1837, "num_input_tokens_seen": 36292032, "rewards/accuracies": 0.9453125, "rewards/chosen": -0.5730222463607788, "rewards/margins": 2.735011100769043, "rewards/rejected": -3.3080334663391113, "step": 101 }, { "epoch": 0.3907110366291597, "grad_norm": 0.7650550007820129, "learning_rate": 7.03558874065087e-06, "logits/chosen": -1.1010173559188843, "logits/rejected": -1.1006340980529785, "logps/chosen": -895.2073974609375, "logps/rejected": -915.1663208007812, "loss": 0.181, "num_input_tokens_seen": 36647104, "rewards/accuracies": 0.9375, "rewards/chosen": -0.6195077896118164, "rewards/margins": 2.826021432876587, "rewards/rejected": -3.4455294609069824, "step": 102 }, { "epoch": 0.3945415369882691, "grad_norm": 0.6909847259521484, "learning_rate": 6.978270344667143e-06, "logits/chosen": -1.1063586473464966, "logits/rejected": -1.105713129043579, "logps/chosen": -921.406005859375, "logps/rejected": -936.5244140625, "loss": 0.1815, "num_input_tokens_seen": 36996736, "rewards/accuracies": 0.9453125, "rewards/chosen": -0.5552771687507629, "rewards/margins": 2.764892578125, "rewards/rejected": -3.320169448852539, "step": 103 }, { "epoch": 0.3983720373473785, "grad_norm": 0.9186336398124695, "learning_rate": 6.920642041050055e-06, "logits/chosen": -1.0915818214416504, "logits/rejected": -1.094931960105896, "logps/chosen": -947.0763549804688, "logps/rejected": -944.1019897460938, "loss": 0.2464, "num_input_tokens_seen": 37351296, "rewards/accuracies": 0.8828125, "rewards/chosen": -0.7677267789840698, "rewards/margins": 2.4657530784606934, "rewards/rejected": -3.2334797382354736, "step": 104 }, { "epoch": 0.4022025377064879, "grad_norm": 0.6091768741607666, "learning_rate": 6.862712857610812e-06, "logits/chosen": -1.100638508796692, "logits/rejected": -1.102898120880127, "logps/chosen": -956.29833984375, "logps/rejected": -961.6322021484375, "loss": 0.128, "num_input_tokens_seen": 37710464, "rewards/accuracies": 0.96875, "rewards/chosen": -0.5590561032295227, "rewards/margins": 3.1229941844940186, "rewards/rejected": -3.6820502281188965, "step": 105 }, { "epoch": 0.4060330380655973, "grad_norm": 0.7067527770996094, "learning_rate": 6.804491869295207e-06, "logits/chosen": -1.1310827732086182, "logits/rejected": -1.1312439441680908, "logps/chosen": -952.262451171875, "logps/rejected": -952.248291015625, "loss": 0.1922, "num_input_tokens_seen": 38075136, "rewards/accuracies": 0.953125, "rewards/chosen": -0.574321448802948, "rewards/margins": 2.7026026248931885, "rewards/rejected": -3.2769241333007812, "step": 106 }, { "epoch": 0.40986353842470674, "grad_norm": 0.6532292366027832, "learning_rate": 6.745988196761976e-06, "logits/chosen": -1.0836384296417236, "logits/rejected": -1.082061529159546, "logps/chosen": -958.8799438476562, "logps/rejected": -977.3197021484375, "loss": 0.1652, "num_input_tokens_seen": 38438464, "rewards/accuracies": 0.953125, "rewards/chosen": -0.4467155337333679, "rewards/margins": 3.1544971466064453, "rewards/rejected": -3.601212739944458, "step": 107 }, { "epoch": 0.4136940387838161, "grad_norm": 0.593960165977478, "learning_rate": 6.687211004953992e-06, "logits/chosen": -1.1067237854003906, "logits/rejected": -1.0953024625778198, "logps/chosen": -911.4755249023438, "logps/rejected": -923.3955078125, "loss": 0.1384, "num_input_tokens_seen": 38786112, "rewards/accuracies": 0.96875, "rewards/chosen": -0.5207453370094299, "rewards/margins": 3.25771427154541, "rewards/rejected": -3.7784600257873535, "step": 108 }, { "epoch": 0.41752453914292553, "grad_norm": 0.6579384803771973, "learning_rate": 6.628169501662527e-06, "logits/chosen": -1.1251742839813232, "logits/rejected": -1.1185029745101929, "logps/chosen": -912.025390625, "logps/rejected": -927.5040283203125, "loss": 0.1425, "num_input_tokens_seen": 39137920, "rewards/accuracies": 0.9609375, "rewards/chosen": -0.6473366618156433, "rewards/margins": 3.030540943145752, "rewards/rejected": -3.67787766456604, "step": 109 }, { "epoch": 0.42135503950203496, "grad_norm": 0.8059095144271851, "learning_rate": 6.568872936084789e-06, "logits/chosen": -1.120704174041748, "logits/rejected": -1.1155285835266113, "logps/chosen": -918.2825317382812, "logps/rejected": -955.4237060546875, "loss": 0.2094, "num_input_tokens_seen": 39496896, "rewards/accuracies": 0.9296875, "rewards/chosen": -0.7922378778457642, "rewards/margins": 2.839639902114868, "rewards/rejected": -3.6318776607513428, "step": 110 }, { "epoch": 0.4251855398611444, "grad_norm": 0.7774308323860168, "learning_rate": 6.509330597374993e-06, "logits/chosen": -1.1296982765197754, "logits/rejected": -1.12125825881958, "logps/chosen": -955.875, "logps/rejected": -982.0862426757812, "loss": 0.1983, "num_input_tokens_seen": 39862784, "rewards/accuracies": 0.9140625, "rewards/chosen": -0.7407379150390625, "rewards/margins": 2.9459829330444336, "rewards/rejected": -3.686720848083496, "step": 111 }, { "epoch": 0.42901604022025375, "grad_norm": 0.8787282705307007, "learning_rate": 6.44955181318915e-06, "logits/chosen": -1.130476951599121, "logits/rejected": -1.124819040298462, "logps/chosen": -951.8047485351562, "logps/rejected": -974.5203857421875, "loss": 0.2063, "num_input_tokens_seen": 40229504, "rewards/accuracies": 0.90625, "rewards/chosen": -0.7864696383476257, "rewards/margins": 3.234025478363037, "rewards/rejected": -4.020495414733887, "step": 112 }, { "epoch": 0.43284654057936317, "grad_norm": 0.8070006966590881, "learning_rate": 6.389545948223841e-06, "logits/chosen": -1.1072964668273926, "logits/rejected": -1.0939278602600098, "logps/chosen": -906.2698974609375, "logps/rejected": -934.371337890625, "loss": 0.2079, "num_input_tokens_seen": 40578304, "rewards/accuracies": 0.9296875, "rewards/chosen": -0.6210307478904724, "rewards/margins": 2.9884490966796875, "rewards/rejected": -3.6094799041748047, "step": 113 }, { "epoch": 0.4366770409384726, "grad_norm": 0.6278772354125977, "learning_rate": 6.329322402749181e-06, "logits/chosen": -1.1068530082702637, "logits/rejected": -1.0973081588745117, "logps/chosen": -923.3222045898438, "logps/rejected": -973.787109375, "loss": 0.1346, "num_input_tokens_seen": 40926400, "rewards/accuracies": 0.96875, "rewards/chosen": -0.6330747604370117, "rewards/margins": 3.4524285793304443, "rewards/rejected": -4.085503578186035, "step": 114 }, { "epoch": 0.440507541297582, "grad_norm": 0.7459933161735535, "learning_rate": 6.2688906111362115e-06, "logits/chosen": -1.1147935390472412, "logits/rejected": -1.1260608434677124, "logps/chosen": -942.9789428710938, "logps/rejected": -942.0307006835938, "loss": 0.1798, "num_input_tokens_seen": 41283008, "rewards/accuracies": 0.953125, "rewards/chosen": -0.8582703471183777, "rewards/margins": 3.0465266704559326, "rewards/rejected": -3.904797077178955, "step": 115 }, { "epoch": 0.4443380416566914, "grad_norm": 0.7103105187416077, "learning_rate": 6.208260040378946e-06, "logits/chosen": -1.1097609996795654, "logits/rejected": -1.09922194480896, "logps/chosen": -877.43603515625, "logps/rejected": -892.90771484375, "loss": 0.1762, "num_input_tokens_seen": 41619520, "rewards/accuracies": 0.9296875, "rewards/chosen": -0.48140305280685425, "rewards/margins": 3.118536949157715, "rewards/rejected": -3.5999398231506348, "step": 116 }, { "epoch": 0.4481685420158008, "grad_norm": 0.7059844136238098, "learning_rate": 6.147440188611324e-06, "logits/chosen": -1.1247339248657227, "logits/rejected": -1.1204876899719238, "logps/chosen": -969.3895263671875, "logps/rejected": -1005.1538696289062, "loss": 0.1582, "num_input_tokens_seen": 41986368, "rewards/accuracies": 0.953125, "rewards/chosen": -0.6705014109611511, "rewards/margins": 3.128121852874756, "rewards/rejected": -3.7986230850219727, "step": 117 }, { "epoch": 0.45199904237491023, "grad_norm": 0.683424711227417, "learning_rate": 6.0864405836192575e-06, "logits/chosen": -1.1361314058303833, "logits/rejected": -1.1225978136062622, "logps/chosen": -896.8311157226562, "logps/rejected": -937.1458129882812, "loss": 0.1576, "num_input_tokens_seen": 42347008, "rewards/accuracies": 0.9453125, "rewards/chosen": -0.5880584716796875, "rewards/margins": 3.3888418674468994, "rewards/rejected": -3.976900339126587, "step": 118 }, { "epoch": 0.45582954273401965, "grad_norm": 0.6680045127868652, "learning_rate": 6.025270781348055e-06, "logits/chosen": -1.114820957183838, "logits/rejected": -1.1184210777282715, "logps/chosen": -940.8297119140625, "logps/rejected": -953.9190673828125, "loss": 0.1499, "num_input_tokens_seen": 42698240, "rewards/accuracies": 0.953125, "rewards/chosen": -0.6581835746765137, "rewards/margins": 3.291698932647705, "rewards/rejected": -3.9498825073242188, "step": 119 }, { "epoch": 0.459660043093129, "grad_norm": 0.7042851448059082, "learning_rate": 5.963940364405425e-06, "logits/chosen": -1.1057615280151367, "logits/rejected": -1.1188658475875854, "logps/chosen": -935.0254516601562, "logps/rejected": -929.1004028320312, "loss": 0.1678, "num_input_tokens_seen": 43050688, "rewards/accuracies": 0.9375, "rewards/chosen": -0.7741080522537231, "rewards/margins": 3.301640033721924, "rewards/rejected": -4.075747966766357, "step": 120 }, { "epoch": 0.46349054345223845, "grad_norm": 0.773540198802948, "learning_rate": 5.902458940560304e-06, "logits/chosen": -1.1311380863189697, "logits/rejected": -1.1221275329589844, "logps/chosen": -964.8699951171875, "logps/rejected": -970.51171875, "loss": 0.1805, "num_input_tokens_seen": 43415744, "rewards/accuracies": 0.921875, "rewards/chosen": -0.7004190683364868, "rewards/margins": 3.115402936935425, "rewards/rejected": -3.815822124481201, "step": 121 }, { "epoch": 0.46732104381134787, "grad_norm": 0.6558127999305725, "learning_rate": 5.8408361412377475e-06, "logits/chosen": -1.1317683458328247, "logits/rejected": -1.1370477676391602, "logps/chosen": -915.1818237304688, "logps/rejected": -931.9642333984375, "loss": 0.1412, "num_input_tokens_seen": 43769984, "rewards/accuracies": 0.953125, "rewards/chosen": -0.46988123655319214, "rewards/margins": 3.3140010833740234, "rewards/rejected": -3.7838823795318604, "step": 122 }, { "epoch": 0.4711515441704573, "grad_norm": 0.608486533164978, "learning_rate": 5.779081620010104e-06, "logits/chosen": -1.1101739406585693, "logits/rejected": -1.1085362434387207, "logps/chosen": -950.239501953125, "logps/rejected": -966.5133056640625, "loss": 0.1359, "num_input_tokens_seen": 44125376, "rewards/accuracies": 0.9765625, "rewards/chosen": -0.5609409213066101, "rewards/margins": 3.2361843585968018, "rewards/rejected": -3.7971253395080566, "step": 123 }, { "epoch": 0.47498204452956666, "grad_norm": 0.6579767465591431, "learning_rate": 5.717205051084731e-06, "logits/chosen": -1.12186861038208, "logits/rejected": -1.119635820388794, "logps/chosen": -970.2237548828125, "logps/rejected": -1001.27880859375, "loss": 0.159, "num_input_tokens_seen": 44496896, "rewards/accuracies": 0.9609375, "rewards/chosen": -0.5873010754585266, "rewards/margins": 3.099958896636963, "rewards/rejected": -3.687260150909424, "step": 124 }, { "epoch": 0.4788125448886761, "grad_norm": 0.6986194849014282, "learning_rate": 5.655216127788472e-06, "logits/chosen": -1.1113070249557495, "logits/rejected": -1.1141784191131592, "logps/chosen": -978.4093017578125, "logps/rejected": -995.634765625, "loss": 0.1569, "num_input_tokens_seen": 44858496, "rewards/accuracies": 0.9609375, "rewards/chosen": -0.5494914650917053, "rewards/margins": 3.107781171798706, "rewards/rejected": -3.6572725772857666, "step": 125 }, { "epoch": 0.4826430452477855, "grad_norm": 0.727834939956665, "learning_rate": 5.593124561049141e-06, "logits/chosen": -1.1294147968292236, "logits/rejected": -1.1388496160507202, "logps/chosen": -930.8519287109375, "logps/rejected": -939.8653564453125, "loss": 0.1527, "num_input_tokens_seen": 45215680, "rewards/accuracies": 0.9609375, "rewards/chosen": -0.6654321551322937, "rewards/margins": 3.4994919300079346, "rewards/rejected": -4.164923667907715, "step": 126 }, { "epoch": 0.4864735456068949, "grad_norm": 0.8437095880508423, "learning_rate": 5.530940077874248e-06, "logits/chosen": -1.1250782012939453, "logits/rejected": -1.1162904500961304, "logps/chosen": -915.8817138671875, "logps/rejected": -932.8961181640625, "loss": 0.1848, "num_input_tokens_seen": 45575872, "rewards/accuracies": 0.9296875, "rewards/chosen": -0.8156818151473999, "rewards/margins": 2.9767351150512695, "rewards/rejected": -3.792417049407959, "step": 127 }, { "epoch": 0.4903040459660043, "grad_norm": 0.6814424991607666, "learning_rate": 5.468672419827208e-06, "logits/chosen": -1.1594264507293701, "logits/rejected": -1.150148868560791, "logps/chosen": -945.5958862304688, "logps/rejected": -989.4842529296875, "loss": 0.1428, "num_input_tokens_seen": 45951488, "rewards/accuracies": 0.9375, "rewards/chosen": -0.5141391754150391, "rewards/margins": 3.470471143722534, "rewards/rejected": -3.984610080718994, "step": 128 }, { "epoch": 0.4941345463251137, "grad_norm": 0.8450974822044373, "learning_rate": 5.406331341501264e-06, "logits/chosen": -1.1080952882766724, "logits/rejected": -1.1018576622009277, "logps/chosen": -936.5384521484375, "logps/rejected": -965.4146728515625, "loss": 0.1905, "num_input_tokens_seen": 46313472, "rewards/accuracies": 0.9296875, "rewards/chosen": -0.655163049697876, "rewards/margins": 3.0373594760894775, "rewards/rejected": -3.6925225257873535, "step": 129 }, { "epoch": 0.49796504668422314, "grad_norm": 0.7701146006584167, "learning_rate": 5.34392660899138e-06, "logits/chosen": -1.1277241706848145, "logits/rejected": -1.1296637058258057, "logps/chosen": -920.0867919921875, "logps/rejected": -939.5274658203125, "loss": 0.1683, "num_input_tokens_seen": 46683136, "rewards/accuracies": 0.953125, "rewards/chosen": -0.764032244682312, "rewards/margins": 3.104192018508911, "rewards/rejected": -3.8682241439819336, "step": 130 }, { "epoch": 0.5017955470433325, "grad_norm": 0.8093336820602417, "learning_rate": 5.281467998364314e-06, "logits/chosen": -1.1174012422561646, "logits/rejected": -1.1064536571502686, "logps/chosen": -930.655517578125, "logps/rejected": -972.0757446289062, "loss": 0.1818, "num_input_tokens_seen": 47033152, "rewards/accuracies": 0.921875, "rewards/chosen": -0.6495652198791504, "rewards/margins": 3.3375585079193115, "rewards/rejected": -3.987123966217041, "step": 131 }, { "epoch": 0.505626047402442, "grad_norm": 0.6671730875968933, "learning_rate": 5.218965294127155e-06, "logits/chosen": -1.1446601152420044, "logits/rejected": -1.1331120729446411, "logps/chosen": -955.3788452148438, "logps/rejected": -990.6216430664062, "loss": 0.1355, "num_input_tokens_seen": 47407552, "rewards/accuracies": 0.96875, "rewards/chosen": -0.708296537399292, "rewards/margins": 3.5515570640563965, "rewards/rejected": -4.259853363037109, "step": 132 }, { "epoch": 0.5094565477615514, "grad_norm": 0.7050759196281433, "learning_rate": 5.156428287694508e-06, "logits/chosen": -1.1224825382232666, "logits/rejected": -1.1169312000274658, "logps/chosen": -975.88671875, "logps/rejected": -1000.4472045898438, "loss": 0.1517, "num_input_tokens_seen": 47777088, "rewards/accuracies": 0.9609375, "rewards/chosen": -0.9042822122573853, "rewards/margins": 3.1744394302368164, "rewards/rejected": -4.078721523284912, "step": 133 }, { "epoch": 0.5132870481206607, "grad_norm": 0.6690219640731812, "learning_rate": 5.093866775854618e-06, "logits/chosen": -1.1281201839447021, "logits/rejected": -1.118111252784729, "logps/chosen": -950.8082885742188, "logps/rejected": -967.8411865234375, "loss": 0.1486, "num_input_tokens_seen": 48146176, "rewards/accuracies": 0.9453125, "rewards/chosen": -0.4145764708518982, "rewards/margins": 3.500164031982422, "rewards/rejected": -3.914740562438965, "step": 134 }, { "epoch": 0.5171175484797702, "grad_norm": 0.6712304353713989, "learning_rate": 5.03129055923465e-06, "logits/chosen": -1.152693510055542, "logits/rejected": -1.1446518898010254, "logps/chosen": -931.6079711914062, "logps/rejected": -960.1605834960938, "loss": 0.1411, "num_input_tokens_seen": 48511488, "rewards/accuracies": 0.9609375, "rewards/chosen": -0.683695912361145, "rewards/margins": 3.693258285522461, "rewards/rejected": -4.376954078674316, "step": 135 }, { "epoch": 0.5209480488388796, "grad_norm": 0.6588895320892334, "learning_rate": 4.968709440765352e-06, "logits/chosen": -1.1270694732666016, "logits/rejected": -1.144219160079956, "logps/chosen": -921.58251953125, "logps/rejected": -936.573486328125, "loss": 0.1288, "num_input_tokens_seen": 48862336, "rewards/accuracies": 0.953125, "rewards/chosen": -0.6117515563964844, "rewards/margins": 3.4521889686584473, "rewards/rejected": -4.063940525054932, "step": 136 }, { "epoch": 0.5247785491979889, "grad_norm": 0.7437635660171509, "learning_rate": 4.906133224145384e-06, "logits/chosen": -1.1290967464447021, "logits/rejected": -1.1110491752624512, "logps/chosen": -939.525634765625, "logps/rejected": -999.9234008789062, "loss": 0.1461, "num_input_tokens_seen": 49226048, "rewards/accuracies": 0.96875, "rewards/chosen": -0.5727760791778564, "rewards/margins": 3.2480969429016113, "rewards/rejected": -3.820873260498047, "step": 137 }, { "epoch": 0.5286090495570984, "grad_norm": 0.7835173010826111, "learning_rate": 4.843571712305493e-06, "logits/chosen": -1.116170883178711, "logits/rejected": -1.123787760734558, "logps/chosen": -927.2295532226562, "logps/rejected": -945.8253173828125, "loss": 0.1788, "num_input_tokens_seen": 49581888, "rewards/accuracies": 0.9453125, "rewards/chosen": -0.6745204925537109, "rewards/margins": 3.3101115226745605, "rewards/rejected": -3.9846320152282715, "step": 138 }, { "epoch": 0.5324395499162078, "grad_norm": 0.5597019791603088, "learning_rate": 4.781034705872846e-06, "logits/chosen": -1.1176984310150146, "logits/rejected": -1.117140531539917, "logps/chosen": -933.116455078125, "logps/rejected": -970.36669921875, "loss": 0.1061, "num_input_tokens_seen": 49936512, "rewards/accuracies": 0.9921875, "rewards/chosen": -0.6796407103538513, "rewards/margins": 3.7050676345825195, "rewards/rejected": -4.384708404541016, "step": 139 }, { "epoch": 0.5362700502753173, "grad_norm": 0.615868091583252, "learning_rate": 4.7185320016356865e-06, "logits/chosen": -1.1357049942016602, "logits/rejected": -1.1324001550674438, "logps/chosen": -943.2921142578125, "logps/rejected": -975.155517578125, "loss": 0.1376, "num_input_tokens_seen": 50309824, "rewards/accuracies": 0.9609375, "rewards/chosen": -0.8460093140602112, "rewards/margins": 3.435059070587158, "rewards/rejected": -4.281068801879883, "step": 140 }, { "epoch": 0.5401005506344266, "grad_norm": 0.6606157422065735, "learning_rate": 4.656073391008622e-06, "logits/chosen": -1.1140908002853394, "logits/rejected": -1.108850121498108, "logps/chosen": -895.8781127929688, "logps/rejected": -936.751220703125, "loss": 0.1519, "num_input_tokens_seen": 50659712, "rewards/accuracies": 0.9453125, "rewards/chosen": -0.7830679416656494, "rewards/margins": 3.3882458209991455, "rewards/rejected": -4.171314239501953, "step": 141 }, { "epoch": 0.543931050993536, "grad_norm": 0.7001786231994629, "learning_rate": 4.593668658498737e-06, "logits/chosen": -1.149137020111084, "logits/rejected": -1.1395349502563477, "logps/chosen": -941.0504150390625, "logps/rejected": -963.2518310546875, "loss": 0.1491, "num_input_tokens_seen": 51020992, "rewards/accuracies": 0.9375, "rewards/chosen": -0.6404644846916199, "rewards/margins": 3.440929412841797, "rewards/rejected": -4.081393718719482, "step": 142 }, { "epoch": 0.5477615513526455, "grad_norm": 0.6836583614349365, "learning_rate": 4.531327580172794e-06, "logits/chosen": -1.1339120864868164, "logits/rejected": -1.1352245807647705, "logps/chosen": -964.3081665039062, "logps/rejected": -978.4052734375, "loss": 0.1244, "num_input_tokens_seen": 51378624, "rewards/accuracies": 0.9609375, "rewards/chosen": -0.8491271734237671, "rewards/margins": 3.6870431900024414, "rewards/rejected": -4.536170482635498, "step": 143 }, { "epoch": 0.5515920517117548, "grad_norm": 0.7100102305412292, "learning_rate": 4.469059922125753e-06, "logits/chosen": -1.148242712020874, "logits/rejected": -1.1385154724121094, "logps/chosen": -998.3412475585938, "logps/rejected": -1044.08935546875, "loss": 0.1593, "num_input_tokens_seen": 51760384, "rewards/accuracies": 0.9453125, "rewards/chosen": -1.0328764915466309, "rewards/margins": 3.6171908378601074, "rewards/rejected": -4.650067329406738, "step": 144 }, { "epoch": 0.5554225520708642, "grad_norm": 0.7961605191230774, "learning_rate": 4.4068754389508616e-06, "logits/chosen": -1.1213231086730957, "logits/rejected": -1.1205980777740479, "logps/chosen": -946.205810546875, "logps/rejected": -974.6113891601562, "loss": 0.1374, "num_input_tokens_seen": 52114304, "rewards/accuracies": 0.9453125, "rewards/chosen": -0.7596480250358582, "rewards/margins": 3.813307762145996, "rewards/rejected": -4.572955131530762, "step": 145 }, { "epoch": 0.5592530524299737, "grad_norm": 0.7323804497718811, "learning_rate": 4.34478387221153e-06, "logits/chosen": -1.115981101989746, "logits/rejected": -1.1200889348983765, "logps/chosen": -942.8148193359375, "logps/rejected": -965.3985595703125, "loss": 0.1643, "num_input_tokens_seen": 52486784, "rewards/accuracies": 0.9296875, "rewards/chosen": -0.8299667239189148, "rewards/margins": 3.4589433670043945, "rewards/rejected": -4.288909912109375, "step": 146 }, { "epoch": 0.5630835527890831, "grad_norm": 0.5200232267379761, "learning_rate": 4.282794948915271e-06, "logits/chosen": -1.120424747467041, "logits/rejected": -1.1166388988494873, "logps/chosen": -906.1971435546875, "logps/rejected": -953.6253662109375, "loss": 0.0999, "num_input_tokens_seen": 52831680, "rewards/accuracies": 0.9765625, "rewards/chosen": -0.5943716764450073, "rewards/margins": 3.9285550117492676, "rewards/rejected": -4.522926330566406, "step": 147 }, { "epoch": 0.5669140531481924, "grad_norm": 0.5851389765739441, "learning_rate": 4.220918379989898e-06, "logits/chosen": -1.1159402132034302, "logits/rejected": -1.1205713748931885, "logps/chosen": -923.9879760742188, "logps/rejected": -925.5596923828125, "loss": 0.1095, "num_input_tokens_seen": 53175680, "rewards/accuracies": 0.9609375, "rewards/chosen": -0.8688675165176392, "rewards/margins": 3.639319896697998, "rewards/rejected": -4.508187770843506, "step": 148 }, { "epoch": 0.5707445535073019, "grad_norm": 0.6805673241615295, "learning_rate": 4.159163858762255e-06, "logits/chosen": -1.108860731124878, "logits/rejected": -1.1102187633514404, "logps/chosen": -937.15478515625, "logps/rejected": -952.2811279296875, "loss": 0.1384, "num_input_tokens_seen": 53537024, "rewards/accuracies": 0.953125, "rewards/chosen": -0.9211418628692627, "rewards/margins": 3.680717706680298, "rewards/rejected": -4.601859092712402, "step": 149 }, { "epoch": 0.5745750538664113, "grad_norm": 0.631395161151886, "learning_rate": 4.097541059439698e-06, "logits/chosen": -1.1475733518600464, "logits/rejected": -1.1469578742980957, "logps/chosen": -928.3204956054688, "logps/rejected": -947.0354614257812, "loss": 0.1174, "num_input_tokens_seen": 53894528, "rewards/accuracies": 0.96875, "rewards/chosen": -1.0530691146850586, "rewards/margins": 3.8212642669677734, "rewards/rejected": -4.874333381652832, "step": 150 }, { "epoch": 0.5784055542255208, "grad_norm": 0.7877935171127319, "learning_rate": 4.036059635594578e-06, "logits/chosen": -1.1463123559951782, "logits/rejected": -1.1365106105804443, "logps/chosen": -949.924072265625, "logps/rejected": -962.1156005859375, "loss": 0.1603, "num_input_tokens_seen": 54262848, "rewards/accuracies": 0.9609375, "rewards/chosen": -0.9331613779067993, "rewards/margins": 3.1643664836883545, "rewards/rejected": -4.097527980804443, "step": 151 }, { "epoch": 0.5822360545846301, "grad_norm": 0.5540483593940735, "learning_rate": 3.974729218651946e-06, "logits/chosen": -1.166491150856018, "logits/rejected": -1.1675529479980469, "logps/chosen": -940.0262451171875, "logps/rejected": -979.6585083007812, "loss": 0.1102, "num_input_tokens_seen": 54635776, "rewards/accuracies": 0.96875, "rewards/chosen": -1.0611553192138672, "rewards/margins": 3.9694066047668457, "rewards/rejected": -5.030562400817871, "step": 152 }, { "epoch": 0.5860665549437395, "grad_norm": 0.6366962790489197, "learning_rate": 3.913559416380743e-06, "logits/chosen": -1.135746955871582, "logits/rejected": -1.1493985652923584, "logps/chosen": -921.271484375, "logps/rejected": -946.207763671875, "loss": 0.1277, "num_input_tokens_seen": 54999744, "rewards/accuracies": 0.9609375, "rewards/chosen": -0.9022237658500671, "rewards/margins": 3.6466073989868164, "rewards/rejected": -4.548830986022949, "step": 153 }, { "epoch": 0.589897055302849, "grad_norm": 0.4817214012145996, "learning_rate": 3.852559811388676e-06, "logits/chosen": -1.1166200637817383, "logits/rejected": -1.1204478740692139, "logps/chosen": -932.076416015625, "logps/rejected": -951.6412353515625, "loss": 0.0814, "num_input_tokens_seen": 55337280, "rewards/accuracies": 0.96875, "rewards/chosen": -0.8710342049598694, "rewards/margins": 4.16388463973999, "rewards/rejected": -5.034918785095215, "step": 154 }, { "epoch": 0.5937275556619583, "grad_norm": 0.7406041026115417, "learning_rate": 3.791739959621054e-06, "logits/chosen": -1.1301652193069458, "logits/rejected": -1.1277787685394287, "logps/chosen": -939.067626953125, "logps/rejected": -976.2159423828125, "loss": 0.1583, "num_input_tokens_seen": 55702336, "rewards/accuracies": 0.96875, "rewards/chosen": -1.148180603981018, "rewards/margins": 3.349360704421997, "rewards/rejected": -4.497541427612305, "step": 155 }, { "epoch": 0.5975580560210677, "grad_norm": 0.6545127034187317, "learning_rate": 3.7311093888637906e-06, "logits/chosen": -1.1367216110229492, "logits/rejected": -1.135441541671753, "logps/chosen": -947.7169799804688, "logps/rejected": -957.501953125, "loss": 0.1345, "num_input_tokens_seen": 56070336, "rewards/accuracies": 0.9453125, "rewards/chosen": -0.793830394744873, "rewards/margins": 3.778103828430176, "rewards/rejected": -4.571934223175049, "step": 156 }, { "epoch": 0.6013885563801772, "grad_norm": 0.6249314546585083, "learning_rate": 3.670677597250819e-06, "logits/chosen": -1.1367524862289429, "logits/rejected": -1.1322413682937622, "logps/chosen": -919.898193359375, "logps/rejected": -953.7801513671875, "loss": 0.1229, "num_input_tokens_seen": 56428800, "rewards/accuracies": 0.953125, "rewards/chosen": -1.0889718532562256, "rewards/margins": 3.635096788406372, "rewards/rejected": -4.724068641662598, "step": 157 }, { "epoch": 0.6052190567392866, "grad_norm": 0.6867994070053101, "learning_rate": 3.6104540517761594e-06, "logits/chosen": -1.1208783388137817, "logits/rejected": -1.1281899213790894, "logps/chosen": -942.1203002929688, "logps/rejected": -941.297607421875, "loss": 0.1324, "num_input_tokens_seen": 56783040, "rewards/accuracies": 0.9375, "rewards/chosen": -0.8960638046264648, "rewards/margins": 3.8797450065612793, "rewards/rejected": -4.775808811187744, "step": 158 }, { "epoch": 0.609049557098396, "grad_norm": 0.5042504668235779, "learning_rate": 3.55044818681085e-06, "logits/chosen": -1.1335363388061523, "logits/rejected": -1.1184139251708984, "logps/chosen": -954.19921875, "logps/rejected": -998.2352905273438, "loss": 0.0872, "num_input_tokens_seen": 57145920, "rewards/accuracies": 0.9765625, "rewards/chosen": -0.7512755990028381, "rewards/margins": 4.050450325012207, "rewards/rejected": -4.801725387573242, "step": 159 }, { "epoch": 0.6128800574575054, "grad_norm": 0.6282093524932861, "learning_rate": 3.4906694026250075e-06, "logits/chosen": -1.1352434158325195, "logits/rejected": -1.1456270217895508, "logps/chosen": -935.1751708984375, "logps/rejected": -959.360107421875, "loss": 0.1244, "num_input_tokens_seen": 57502528, "rewards/accuracies": 0.9609375, "rewards/chosen": -0.9671937227249146, "rewards/margins": 3.746589422225952, "rewards/rejected": -4.713783264160156, "step": 160 }, { "epoch": 0.6167105578166148, "grad_norm": 0.7335207462310791, "learning_rate": 3.431127063915213e-06, "logits/chosen": -1.1380847692489624, "logits/rejected": -1.1418529748916626, "logps/chosen": -939.71142578125, "logps/rejected": -956.5693359375, "loss": 0.1404, "num_input_tokens_seen": 57864896, "rewards/accuracies": 0.9453125, "rewards/chosen": -0.9701119065284729, "rewards/margins": 3.5857200622558594, "rewards/rejected": -4.5558319091796875, "step": 161 }, { "epoch": 0.6205410581757242, "grad_norm": 0.525834321975708, "learning_rate": 3.371830498337475e-06, "logits/chosen": -1.1229634284973145, "logits/rejected": -1.1113239526748657, "logps/chosen": -906.3638916015625, "logps/rejected": -968.2791748046875, "loss": 0.0902, "num_input_tokens_seen": 58218496, "rewards/accuracies": 0.9765625, "rewards/chosen": -0.853689432144165, "rewards/margins": 4.11014986038208, "rewards/rejected": -4.963839054107666, "step": 162 }, { "epoch": 0.6243715585348336, "grad_norm": 0.597028911113739, "learning_rate": 3.3127889950460094e-06, "logits/chosen": -1.115204095840454, "logits/rejected": -1.1108593940734863, "logps/chosen": -898.75390625, "logps/rejected": -959.1620483398438, "loss": 0.1118, "num_input_tokens_seen": 58561152, "rewards/accuracies": 0.9609375, "rewards/chosen": -0.8724937438964844, "rewards/margins": 3.783143997192383, "rewards/rejected": -4.655637741088867, "step": 163 }, { "epoch": 0.628202058893943, "grad_norm": 0.773404598236084, "learning_rate": 3.254011803238026e-06, "logits/chosen": -1.1355056762695312, "logits/rejected": -1.1327035427093506, "logps/chosen": -964.1258544921875, "logps/rejected": -1013.5973510742188, "loss": 0.154, "num_input_tokens_seen": 58925504, "rewards/accuracies": 0.9453125, "rewards/chosen": -1.280595302581787, "rewards/margins": 3.811772346496582, "rewards/rejected": -5.092367649078369, "step": 164 }, { "epoch": 0.6320325592530525, "grad_norm": 0.6448091268539429, "learning_rate": 3.195508130704795e-06, "logits/chosen": -1.1301665306091309, "logits/rejected": -1.1308307647705078, "logps/chosen": -919.0938720703125, "logps/rejected": -935.1650390625, "loss": 0.1232, "num_input_tokens_seen": 59269120, "rewards/accuracies": 0.96875, "rewards/chosen": -0.9420739412307739, "rewards/margins": 3.73829984664917, "rewards/rejected": -4.680373668670654, "step": 165 }, { "epoch": 0.6358630596121618, "grad_norm": 0.717473566532135, "learning_rate": 3.1372871423891894e-06, "logits/chosen": -1.1233844757080078, "logits/rejected": -1.1225996017456055, "logps/chosen": -926.3175659179688, "logps/rejected": -951.1182861328125, "loss": 0.1412, "num_input_tokens_seen": 59620928, "rewards/accuracies": 0.9609375, "rewards/chosen": -0.8571826219558716, "rewards/margins": 3.7891440391540527, "rewards/rejected": -4.646326065063477, "step": 166 }, { "epoch": 0.6396935599712712, "grad_norm": 0.5268615484237671, "learning_rate": 3.079357958949946e-06, "logits/chosen": -1.1460094451904297, "logits/rejected": -1.1440532207489014, "logps/chosen": -923.2747802734375, "logps/rejected": -965.303955078125, "loss": 0.0887, "num_input_tokens_seen": 59974656, "rewards/accuracies": 0.9765625, "rewards/chosen": -0.8316411972045898, "rewards/margins": 4.237669944763184, "rewards/rejected": -5.069311618804932, "step": 167 }, { "epoch": 0.6435240603303807, "grad_norm": 0.7504249811172485, "learning_rate": 3.021729655332858e-06, "logits/chosen": -1.1353933811187744, "logits/rejected": -1.126386046409607, "logps/chosen": -941.4724731445312, "logps/rejected": -971.0452880859375, "loss": 0.1555, "num_input_tokens_seen": 60329216, "rewards/accuracies": 0.953125, "rewards/chosen": -0.8113993406295776, "rewards/margins": 3.449709892272949, "rewards/rejected": -4.261109352111816, "step": 168 }, { "epoch": 0.64735456068949, "grad_norm": 0.5045068264007568, "learning_rate": 2.9644112593491315e-06, "logits/chosen": -1.113010048866272, "logits/rejected": -1.1124825477600098, "logps/chosen": -913.4754638671875, "logps/rejected": -939.3057861328125, "loss": 0.086, "num_input_tokens_seen": 60686464, "rewards/accuracies": 0.9765625, "rewards/chosen": -0.6832062005996704, "rewards/margins": 4.223135948181152, "rewards/rejected": -4.906341552734375, "step": 169 }, { "epoch": 0.6511850610485995, "grad_norm": 0.7200180888175964, "learning_rate": 2.90741175026113e-06, "logits/chosen": -1.143190622329712, "logits/rejected": -1.1372324228286743, "logps/chosen": -933.2850952148438, "logps/rejected": -970.13232421875, "loss": 0.134, "num_input_tokens_seen": 61046144, "rewards/accuracies": 0.9375, "rewards/chosen": -0.9384927153587341, "rewards/margins": 4.193242073059082, "rewards/rejected": -5.131734848022461, "step": 170 }, { "epoch": 0.6550155614077089, "grad_norm": 0.770698070526123, "learning_rate": 2.850740057375716e-06, "logits/chosen": -1.1460022926330566, "logits/rejected": -1.1353541612625122, "logps/chosen": -906.2474365234375, "logps/rejected": -944.6781005859375, "loss": 0.1397, "num_input_tokens_seen": 61395776, "rewards/accuracies": 0.953125, "rewards/chosen": -1.2490543127059937, "rewards/margins": 3.7003769874572754, "rewards/rejected": -4.949431419372559, "step": 171 }, { "epoch": 0.6588460617668183, "grad_norm": 0.7854964137077332, "learning_rate": 2.7944050586454215e-06, "logits/chosen": -1.1241507530212402, "logits/rejected": -1.1199979782104492, "logps/chosen": -881.9456787109375, "logps/rejected": -925.2673950195312, "loss": 0.1393, "num_input_tokens_seen": 61739072, "rewards/accuracies": 0.9296875, "rewards/chosen": -1.119002342224121, "rewards/margins": 4.081979274749756, "rewards/rejected": -5.200981616973877, "step": 172 }, { "epoch": 0.6626765621259277, "grad_norm": 0.614799976348877, "learning_rate": 2.7384155792776724e-06, "logits/chosen": -1.1314318180084229, "logits/rejected": -1.1282424926757812, "logps/chosen": -937.385498046875, "logps/rejected": -974.9067993164062, "loss": 0.107, "num_input_tokens_seen": 62091840, "rewards/accuracies": 0.9609375, "rewards/chosen": -1.2679835557937622, "rewards/margins": 4.420315265655518, "rewards/rejected": -5.688299179077148, "step": 173 }, { "epoch": 0.6665070624850371, "grad_norm": 0.7183716297149658, "learning_rate": 2.682780390352262e-06, "logits/chosen": -1.147839903831482, "logits/rejected": -1.1522760391235352, "logps/chosen": -915.7791748046875, "logps/rejected": -955.96240234375, "loss": 0.1178, "num_input_tokens_seen": 62452416, "rewards/accuracies": 0.9609375, "rewards/chosen": -1.1561710834503174, "rewards/margins": 4.244651794433594, "rewards/rejected": -5.40082311630249, "step": 174 }, { "epoch": 0.6703375628441465, "grad_norm": 0.7553685903549194, "learning_rate": 2.627508207447308e-06, "logits/chosen": -1.1484794616699219, "logits/rejected": -1.1376214027404785, "logps/chosen": -959.5919799804688, "logps/rejected": -1002.1358642578125, "loss": 0.1382, "num_input_tokens_seen": 62826432, "rewards/accuracies": 0.9453125, "rewards/chosen": -1.2018121480941772, "rewards/margins": 4.0351972579956055, "rewards/rejected": -5.237009048461914, "step": 175 }, { "epoch": 0.674168063203256, "grad_norm": 0.7371348738670349, "learning_rate": 2.5726076892739127e-06, "logits/chosen": -1.133622646331787, "logits/rejected": -1.1346787214279175, "logps/chosen": -981.44091796875, "logps/rejected": -1008.0592041015625, "loss": 0.1451, "num_input_tokens_seen": 63188736, "rewards/accuracies": 0.9453125, "rewards/chosen": -0.9973090887069702, "rewards/margins": 3.770475387573242, "rewards/rejected": -4.767784118652344, "step": 176 }, { "epoch": 0.6779985635623653, "grad_norm": 0.6027622222900391, "learning_rate": 2.5180874363197217e-06, "logits/chosen": -1.1473917961120605, "logits/rejected": -1.136796474456787, "logps/chosen": -966.965087890625, "logps/rejected": -992.8527221679688, "loss": 0.1113, "num_input_tokens_seen": 63559040, "rewards/accuracies": 0.9765625, "rewards/chosen": -1.037949562072754, "rewards/margins": 3.698408365249634, "rewards/rejected": -4.736357688903809, "step": 177 }, { "epoch": 0.6818290639214747, "grad_norm": 0.6934034824371338, "learning_rate": 2.463955989501607e-06, "logits/chosen": -1.1472060680389404, "logits/rejected": -1.142443299293518, "logps/chosen": -948.6500244140625, "logps/rejected": -976.03955078125, "loss": 0.1304, "num_input_tokens_seen": 63918080, "rewards/accuracies": 0.9609375, "rewards/chosen": -1.2286975383758545, "rewards/margins": 4.057483673095703, "rewards/rejected": -5.286181449890137, "step": 178 }, { "epoch": 0.6856595642805842, "grad_norm": 0.574459969997406, "learning_rate": 2.41022182882768e-06, "logits/chosen": -1.1256879568099976, "logits/rejected": -1.1263935565948486, "logps/chosen": -920.3141479492188, "logps/rejected": -937.0470581054688, "loss": 0.1064, "num_input_tokens_seen": 64264384, "rewards/accuracies": 0.9765625, "rewards/chosen": -1.0772805213928223, "rewards/margins": 3.8612425327301025, "rewards/rejected": -4.938523292541504, "step": 179 }, { "epoch": 0.6894900646396935, "grad_norm": 0.6900708079338074, "learning_rate": 2.356893372068855e-06, "logits/chosen": -1.146414041519165, "logits/rejected": -1.1333107948303223, "logps/chosen": -920.6194458007812, "logps/rejected": -948.978515625, "loss": 0.1286, "num_input_tokens_seen": 64627520, "rewards/accuracies": 0.9453125, "rewards/chosen": -1.1199493408203125, "rewards/margins": 4.144634246826172, "rewards/rejected": -5.264584064483643, "step": 180 }, { "epoch": 0.693320564998803, "grad_norm": 0.6781615018844604, "learning_rate": 2.3039789734401524e-06, "logits/chosen": -1.1684128046035767, "logits/rejected": -1.156698226928711, "logps/chosen": -886.8367919921875, "logps/rejected": -941.868896484375, "loss": 0.1146, "num_input_tokens_seen": 64985472, "rewards/accuracies": 0.96875, "rewards/chosen": -1.0930626392364502, "rewards/margins": 4.096789836883545, "rewards/rejected": -5.189852714538574, "step": 181 }, { "epoch": 0.6971510653579124, "grad_norm": 0.6015796661376953, "learning_rate": 2.251486922291957e-06, "logits/chosen": -1.1457183361053467, "logits/rejected": -1.1400107145309448, "logps/chosen": -878.5657958984375, "logps/rejected": -924.116943359375, "loss": 0.1146, "num_input_tokens_seen": 65333056, "rewards/accuracies": 0.96875, "rewards/chosen": -1.3387953042984009, "rewards/margins": 3.838289976119995, "rewards/rejected": -5.177084922790527, "step": 182 }, { "epoch": 0.7009815657170217, "grad_norm": 0.7410080432891846, "learning_rate": 2.1994254418114524e-06, "logits/chosen": -1.1427900791168213, "logits/rejected": -1.1322269439697266, "logps/chosen": -970.937255859375, "logps/rejected": -1017.3334350585938, "loss": 0.1325, "num_input_tokens_seen": 65701376, "rewards/accuracies": 0.9453125, "rewards/chosen": -1.2159329652786255, "rewards/margins": 3.69895601272583, "rewards/rejected": -4.914888858795166, "step": 183 }, { "epoch": 0.7048120660761312, "grad_norm": 0.6679302453994751, "learning_rate": 2.147802687734409e-06, "logits/chosen": -1.124457597732544, "logits/rejected": -1.1152386665344238, "logps/chosen": -939.2366943359375, "logps/rejected": -980.7069702148438, "loss": 0.1278, "num_input_tokens_seen": 66057152, "rewards/accuracies": 0.953125, "rewards/chosen": -1.2831361293792725, "rewards/margins": 3.8988122940063477, "rewards/rejected": -5.181948661804199, "step": 184 }, { "epoch": 0.7086425664352406, "grad_norm": 0.6172199249267578, "learning_rate": 2.0966267470675273e-06, "logits/chosen": -1.1368470191955566, "logits/rejected": -1.1430565118789673, "logps/chosen": -935.0543212890625, "logps/rejected": -954.5626220703125, "loss": 0.1137, "num_input_tokens_seen": 66413248, "rewards/accuracies": 0.9609375, "rewards/chosen": -1.1827101707458496, "rewards/margins": 4.264597415924072, "rewards/rejected": -5.447307586669922, "step": 185 }, { "epoch": 0.71247306679435, "grad_norm": 0.7237995862960815, "learning_rate": 2.0459056368215786e-06, "logits/chosen": -1.142282247543335, "logits/rejected": -1.1388722658157349, "logps/chosen": -920.2362060546875, "logps/rejected": -950.0621337890625, "loss": 0.1468, "num_input_tokens_seen": 66777600, "rewards/accuracies": 0.921875, "rewards/chosen": -1.1261134147644043, "rewards/margins": 3.742077350616455, "rewards/rejected": -4.868190765380859, "step": 186 }, { "epoch": 0.7163035671534594, "grad_norm": 0.8806514739990234, "learning_rate": 1.9956473027554846e-06, "logits/chosen": -1.1200953722000122, "logits/rejected": -1.12364661693573, "logps/chosen": -913.1375732421875, "logps/rejected": -942.7782592773438, "loss": 0.1509, "num_input_tokens_seen": 67126784, "rewards/accuracies": 0.953125, "rewards/chosen": -1.018437385559082, "rewards/margins": 3.621182918548584, "rewards/rejected": -4.639620304107666, "step": 187 }, { "epoch": 0.7201340675125688, "grad_norm": 0.6537996530532837, "learning_rate": 1.9458596181315643e-06, "logits/chosen": -1.1244375705718994, "logits/rejected": -1.1293549537658691, "logps/chosen": -950.2227783203125, "logps/rejected": -959.3658447265625, "loss": 0.1212, "num_input_tokens_seen": 67486848, "rewards/accuracies": 0.96875, "rewards/chosen": -1.301206111907959, "rewards/margins": 3.810688018798828, "rewards/rejected": -5.111893653869629, "step": 188 }, { "epoch": 0.7239645678716783, "grad_norm": 0.5897997617721558, "learning_rate": 1.8965503824821496e-06, "logits/chosen": -1.115061640739441, "logits/rejected": -1.113053321838379, "logps/chosen": -918.2599487304688, "logps/rejected": -964.2479248046875, "loss": 0.1096, "num_input_tokens_seen": 67830400, "rewards/accuracies": 0.9609375, "rewards/chosen": -1.3825678825378418, "rewards/margins": 4.108716011047363, "rewards/rejected": -5.491284370422363, "step": 189 }, { "epoch": 0.7277950682307877, "grad_norm": 0.7189119458198547, "learning_rate": 1.84772732038774e-06, "logits/chosen": -1.1144726276397705, "logits/rejected": -1.1201146841049194, "logps/chosen": -930.0636596679688, "logps/rejected": -959.1786499023438, "loss": 0.156, "num_input_tokens_seen": 68184448, "rewards/accuracies": 0.9453125, "rewards/chosen": -1.131813406944275, "rewards/margins": 3.7312142848968506, "rewards/rejected": -4.863027572631836, "step": 190 }, { "epoch": 0.731625568589897, "grad_norm": 0.5770811438560486, "learning_rate": 1.7993980802668947e-06, "logits/chosen": -1.1150379180908203, "logits/rejected": -1.1045951843261719, "logps/chosen": -927.9925537109375, "logps/rejected": -981.8740234375, "loss": 0.105, "num_input_tokens_seen": 68536640, "rewards/accuracies": 0.96875, "rewards/chosen": -0.8180875778198242, "rewards/margins": 4.335508823394775, "rewards/rejected": -5.153596878051758, "step": 191 }, { "epoch": 0.7354560689490065, "grad_norm": 0.6197463870048523, "learning_rate": 1.7515702331780753e-06, "logits/chosen": -1.1397392749786377, "logits/rejected": -1.1332669258117676, "logps/chosen": -888.0272216796875, "logps/rejected": -920.6478271484375, "loss": 0.124, "num_input_tokens_seen": 68881216, "rewards/accuracies": 0.9765625, "rewards/chosen": -1.044264793395996, "rewards/margins": 3.917402744293213, "rewards/rejected": -4.961667060852051, "step": 192 }, { "epoch": 0.7392865693081159, "grad_norm": 0.6910933256149292, "learning_rate": 1.7042512716335873e-06, "logits/chosen": -1.144860863685608, "logits/rejected": -1.1406577825546265, "logps/chosen": -968.2261962890625, "logps/rejected": -1015.7372436523438, "loss": 0.1209, "num_input_tokens_seen": 69258368, "rewards/accuracies": 0.96875, "rewards/chosen": -1.4211889505386353, "rewards/margins": 3.7208251953125, "rewards/rejected": -5.142014026641846, "step": 193 }, { "epoch": 0.7431170696672252, "grad_norm": 0.591352105140686, "learning_rate": 1.6574486084258369e-06, "logits/chosen": -1.1517504453659058, "logits/rejected": -1.1535696983337402, "logps/chosen": -941.3185424804688, "logps/rejected": -941.9823608398438, "loss": 0.1065, "num_input_tokens_seen": 69617152, "rewards/accuracies": 0.9609375, "rewards/chosen": -1.0724523067474365, "rewards/margins": 4.041698932647705, "rewards/rejected": -5.114151477813721, "step": 194 }, { "epoch": 0.7469475700263347, "grad_norm": 0.6189924478530884, "learning_rate": 1.6111695754660667e-06, "logits/chosen": -1.1325167417526245, "logits/rejected": -1.1246585845947266, "logps/chosen": -957.8402099609375, "logps/rejected": -987.0758056640625, "loss": 0.1183, "num_input_tokens_seen": 69979072, "rewards/accuracies": 0.9609375, "rewards/chosen": -1.395351767539978, "rewards/margins": 4.022106170654297, "rewards/rejected": -5.4174580574035645, "step": 195 }, { "epoch": 0.7507780703854441, "grad_norm": 0.48755350708961487, "learning_rate": 1.5654214226357822e-06, "logits/chosen": -1.1281023025512695, "logits/rejected": -1.1127030849456787, "logps/chosen": -953.8341064453125, "logps/rejected": -1023.0802001953125, "loss": 0.0792, "num_input_tokens_seen": 70344768, "rewards/accuracies": 0.9921875, "rewards/chosen": -1.2984752655029297, "rewards/margins": 3.9046237468719482, "rewards/rejected": -5.203099250793457, "step": 196 }, { "epoch": 0.7546085707445535, "grad_norm": 0.7763097882270813, "learning_rate": 1.5202113166510058e-06, "logits/chosen": -1.1633872985839844, "logits/rejected": -1.1649155616760254, "logps/chosen": -940.2249755859375, "logps/rejected": -974.9447021484375, "loss": 0.1402, "num_input_tokens_seen": 70721984, "rewards/accuracies": 0.9453125, "rewards/chosen": -1.1531100273132324, "rewards/margins": 4.258032321929932, "rewards/rejected": -5.411142349243164, "step": 197 }, { "epoch": 0.7584390711036629, "grad_norm": 0.5113739371299744, "learning_rate": 1.475546339939568e-06, "logits/chosen": -1.1158440113067627, "logits/rejected": -1.1148247718811035, "logps/chosen": -949.8948974609375, "logps/rejected": -1000.0040283203125, "loss": 0.0939, "num_input_tokens_seen": 71082368, "rewards/accuracies": 0.9765625, "rewards/chosen": -1.1646056175231934, "rewards/margins": 3.9723587036132812, "rewards/rejected": -5.136964797973633, "step": 198 }, { "epoch": 0.7622695714627723, "grad_norm": 0.655764102935791, "learning_rate": 1.4314334895316095e-06, "logits/chosen": -1.1430144309997559, "logits/rejected": -1.1407995223999023, "logps/chosen": -922.4736328125, "logps/rejected": -955.0093383789062, "loss": 0.1234, "num_input_tokens_seen": 71436928, "rewards/accuracies": 0.96875, "rewards/chosen": -1.1026368141174316, "rewards/margins": 4.110426425933838, "rewards/rejected": -5.2130632400512695, "step": 199 }, { "epoch": 0.7661000718218818, "grad_norm": 0.5658289790153503, "learning_rate": 1.3878796759634544e-06, "logits/chosen": -1.1523005962371826, "logits/rejected": -1.1472749710083008, "logps/chosen": -942.7454833984375, "logps/rejected": -991.8628540039062, "loss": 0.1007, "num_input_tokens_seen": 71801024, "rewards/accuracies": 0.984375, "rewards/chosen": -1.088900089263916, "rewards/margins": 4.152194976806641, "rewards/rejected": -5.241095542907715, "step": 200 }, { "epoch": 0.7699305721809911, "grad_norm": 0.6669675707817078, "learning_rate": 1.3448917221950264e-06, "logits/chosen": -1.1476324796676636, "logits/rejected": -1.1498050689697266, "logps/chosen": -956.4420166015625, "logps/rejected": -968.7861328125, "loss": 0.1182, "num_input_tokens_seen": 72172800, "rewards/accuracies": 0.96875, "rewards/chosen": -1.1832457780838013, "rewards/margins": 3.7966558933258057, "rewards/rejected": -4.9799017906188965, "step": 201 }, { "epoch": 0.7737610725401005, "grad_norm": 0.5420801639556885, "learning_rate": 1.3024763625410025e-06, "logits/chosen": -1.133588194847107, "logits/rejected": -1.1364024877548218, "logps/chosen": -943.519775390625, "logps/rejected": -960.7150268554688, "loss": 0.0918, "num_input_tokens_seen": 72536000, "rewards/accuracies": 0.9765625, "rewards/chosen": -0.925828218460083, "rewards/margins": 4.087068557739258, "rewards/rejected": -5.012896537780762, "step": 202 }, { "epoch": 0.77759157289921, "grad_norm": 0.6389860510826111, "learning_rate": 1.2606402416158391e-06, "logits/chosen": -1.1481473445892334, "logits/rejected": -1.1378352642059326, "logps/chosen": -939.3446655273438, "logps/rejected": -977.73876953125, "loss": 0.1069, "num_input_tokens_seen": 72900672, "rewards/accuracies": 0.984375, "rewards/chosen": -1.294952154159546, "rewards/margins": 4.067165374755859, "rewards/rejected": -5.362117767333984, "step": 203 }, { "epoch": 0.7814220732583194, "grad_norm": 0.6677370667457581, "learning_rate": 1.2193899132928539e-06, "logits/chosen": -1.1154191493988037, "logits/rejected": -1.1181483268737793, "logps/chosen": -905.6656494140625, "logps/rejected": -916.131103515625, "loss": 0.1164, "num_input_tokens_seen": 73242240, "rewards/accuracies": 0.96875, "rewards/chosen": -1.1728456020355225, "rewards/margins": 4.02401065826416, "rewards/rejected": -5.1968560218811035, "step": 204 }, { "epoch": 0.7852525736174287, "grad_norm": 0.6846520900726318, "learning_rate": 1.1787318396775188e-06, "logits/chosen": -1.1279726028442383, "logits/rejected": -1.1289410591125488, "logps/chosen": -945.6762084960938, "logps/rejected": -985.7901611328125, "loss": 0.1268, "num_input_tokens_seen": 73604288, "rewards/accuracies": 0.953125, "rewards/chosen": -1.2765809297561646, "rewards/margins": 4.0222930908203125, "rewards/rejected": -5.2988739013671875, "step": 205 }, { "epoch": 0.7890830739765382, "grad_norm": 0.7878302335739136, "learning_rate": 1.138672390095143e-06, "logits/chosen": -1.1450525522232056, "logits/rejected": -1.1443026065826416, "logps/chosen": -924.56103515625, "logps/rejected": -933.9475708007812, "loss": 0.1272, "num_input_tokens_seen": 73953984, "rewards/accuracies": 0.9453125, "rewards/chosen": -1.205392837524414, "rewards/margins": 4.218493938446045, "rewards/rejected": -5.423886775970459, "step": 206 }, { "epoch": 0.7929135743356476, "grad_norm": 0.7352907061576843, "learning_rate": 1.0992178400930753e-06, "logits/chosen": -1.1588523387908936, "logits/rejected": -1.1516337394714355, "logps/chosen": -935.2378540039062, "logps/rejected": -979.25244140625, "loss": 0.1464, "num_input_tokens_seen": 74322752, "rewards/accuracies": 0.9609375, "rewards/chosen": -1.1177600622177124, "rewards/margins": 4.066137313842773, "rewards/rejected": -5.183897018432617, "step": 207 }, { "epoch": 0.796744074694757, "grad_norm": 0.5740320682525635, "learning_rate": 1.0603743704575992e-06, "logits/chosen": -1.1424620151519775, "logits/rejected": -1.1347261667251587, "logps/chosen": -982.45751953125, "logps/rejected": -1003.0057373046875, "loss": 0.0975, "num_input_tokens_seen": 74692672, "rewards/accuracies": 0.96875, "rewards/chosen": -1.184654712677002, "rewards/margins": 4.110013484954834, "rewards/rejected": -5.294668197631836, "step": 208 }, { "epoch": 0.8005745750538664, "grad_norm": 0.7261523008346558, "learning_rate": 1.0221480662456845e-06, "logits/chosen": -1.1375901699066162, "logits/rejected": -1.1277306079864502, "logps/chosen": -949.3741455078125, "logps/rejected": -985.18896484375, "loss": 0.1429, "num_input_tokens_seen": 75043648, "rewards/accuracies": 0.96875, "rewards/chosen": -1.4194461107254028, "rewards/margins": 3.990281105041504, "rewards/rejected": -5.409727096557617, "step": 209 }, { "epoch": 0.8044050754129758, "grad_norm": 0.5886514782905579, "learning_rate": 9.845449158317216e-07, "logits/chosen": -1.1304702758789062, "logits/rejected": -1.1260240077972412, "logps/chosen": -939.6072998046875, "logps/rejected": -961.4407348632812, "loss": 0.1222, "num_input_tokens_seen": 75408832, "rewards/accuracies": 0.96875, "rewards/chosen": -1.1726312637329102, "rewards/margins": 3.7976341247558594, "rewards/rejected": -4.9702653884887695, "step": 210 }, { "epoch": 0.8082355757720853, "grad_norm": 0.5627762079238892, "learning_rate": 9.475708099694125e-07, "logits/chosen": -1.1464478969573975, "logits/rejected": -1.1441315412521362, "logps/chosen": -963.1434936523438, "logps/rejected": -995.056884765625, "loss": 0.09, "num_input_tokens_seen": 75767744, "rewards/accuracies": 0.984375, "rewards/chosen": -1.3474948406219482, "rewards/margins": 4.602136135101318, "rewards/rejected": -5.949631214141846, "step": 211 }, { "epoch": 0.8120660761311946, "grad_norm": 0.8463510274887085, "learning_rate": 9.112315408689415e-07, "logits/chosen": -1.1504496335983276, "logits/rejected": -1.145203709602356, "logps/chosen": -953.384521484375, "logps/rejected": -979.9056396484375, "loss": 0.151, "num_input_tokens_seen": 76143744, "rewards/accuracies": 0.9453125, "rewards/chosen": -1.280893087387085, "rewards/margins": 3.798523426055908, "rewards/rejected": -5.079416275024414, "step": 212 }, { "epoch": 0.815896576490304, "grad_norm": 0.6893050670623779, "learning_rate": 8.755328012896002e-07, "logits/chosen": -1.1258955001831055, "logits/rejected": -1.1326565742492676, "logps/chosen": -915.5339965820312, "logps/rejected": -946.0889892578125, "loss": 0.1176, "num_input_tokens_seen": 76490112, "rewards/accuracies": 0.9609375, "rewards/chosen": -1.3296691179275513, "rewards/margins": 4.028443336486816, "rewards/rejected": -5.358112335205078, "step": 213 }, { "epoch": 0.8197270768494135, "grad_norm": 0.7727059125900269, "learning_rate": 8.404801836479809e-07, "logits/chosen": -1.1532236337661743, "logits/rejected": -1.1491987705230713, "logps/chosen": -971.9912109375, "logps/rejected": -1012.7716064453125, "loss": 0.1373, "num_input_tokens_seen": 76866048, "rewards/accuracies": 0.9609375, "rewards/chosen": -1.4460994005203247, "rewards/margins": 4.311342716217041, "rewards/rejected": -5.757441997528076, "step": 214 }, { "epoch": 0.8235575772085229, "grad_norm": 0.6790155172348022, "learning_rate": 8.060791791418887e-07, "logits/chosen": -1.1398448944091797, "logits/rejected": -1.1423518657684326, "logps/chosen": -952.3850708007812, "logps/rejected": -974.2009887695312, "loss": 0.1295, "num_input_tokens_seen": 77235840, "rewards/accuracies": 0.9453125, "rewards/chosen": -1.5031237602233887, "rewards/margins": 4.125448703765869, "rewards/rejected": -5.628572463989258, "step": 215 }, { "epoch": 0.8273880775676322, "grad_norm": 0.6497118473052979, "learning_rate": 7.723351768901172e-07, "logits/chosen": -1.125891923904419, "logits/rejected": -1.1162381172180176, "logps/chosen": -928.42919921875, "logps/rejected": -950.609375, "loss": 0.1176, "num_input_tokens_seen": 77590592, "rewards/accuracies": 0.9765625, "rewards/chosen": -1.1432021856307983, "rewards/margins": 4.01569938659668, "rewards/rejected": -5.158901214599609, "step": 216 }, { "epoch": 0.8312185779267417, "grad_norm": 0.6960030198097229, "learning_rate": 7.392534630882092e-07, "logits/chosen": -1.1320910453796387, "logits/rejected": -1.1252555847167969, "logps/chosen": -906.5370483398438, "logps/rejected": -943.4966430664062, "loss": 0.1177, "num_input_tokens_seen": 77947072, "rewards/accuracies": 0.9609375, "rewards/chosen": -1.3496477603912354, "rewards/margins": 3.8906993865966797, "rewards/rejected": -5.240346908569336, "step": 217 }, { "epoch": 0.8350490782858511, "grad_norm": 0.5776065587997437, "learning_rate": 7.06839220180342e-07, "logits/chosen": -1.124521017074585, "logits/rejected": -1.1157386302947998, "logps/chosen": -957.765380859375, "logps/rejected": -1006.8399658203125, "loss": 0.1055, "num_input_tokens_seen": 78299264, "rewards/accuracies": 0.953125, "rewards/chosen": -1.0568280220031738, "rewards/margins": 4.409466743469238, "rewards/rejected": -5.46629524230957, "step": 218 }, { "epoch": 0.8388795786449605, "grad_norm": 0.7293028235435486, "learning_rate": 6.750975260474718e-07, "logits/chosen": -1.125768780708313, "logits/rejected": -1.1296476125717163, "logps/chosen": -926.0774536132812, "logps/rejected": -962.322998046875, "loss": 0.1392, "num_input_tokens_seen": 78658112, "rewards/accuracies": 0.9609375, "rewards/chosen": -1.2768231630325317, "rewards/margins": 3.9475696086883545, "rewards/rejected": -5.224392890930176, "step": 219 }, { "epoch": 0.8427100790040699, "grad_norm": 0.5765385627746582, "learning_rate": 6.440333532118503e-07, "logits/chosen": -1.127885103225708, "logits/rejected": -1.141706943511963, "logps/chosen": -926.9733276367188, "logps/rejected": -950.8040771484375, "loss": 0.1061, "num_input_tokens_seen": 79021824, "rewards/accuracies": 0.9765625, "rewards/chosen": -0.9148169159889221, "rewards/margins": 4.253884792327881, "rewards/rejected": -5.168701648712158, "step": 220 }, { "epoch": 0.8465405793631793, "grad_norm": 0.4936072826385498, "learning_rate": 6.136515680580479e-07, "logits/chosen": -1.1449276208877563, "logits/rejected": -1.144270420074463, "logps/chosen": -951.5328979492188, "logps/rejected": -974.8268432617188, "loss": 0.0769, "num_input_tokens_seen": 79377856, "rewards/accuracies": 0.984375, "rewards/chosen": -1.3788504600524902, "rewards/margins": 4.367871284484863, "rewards/rejected": -5.746721267700195, "step": 221 }, { "epoch": 0.8503710797222888, "grad_norm": 0.7145355343818665, "learning_rate": 5.839569300706127e-07, "logits/chosen": -1.1404030323028564, "logits/rejected": -1.1440632343292236, "logps/chosen": -955.458984375, "logps/rejected": -985.0751342773438, "loss": 0.0993, "num_input_tokens_seen": 79735616, "rewards/accuracies": 0.9609375, "rewards/chosen": -1.1539299488067627, "rewards/margins": 4.464937210083008, "rewards/rejected": -5.618866920471191, "step": 222 }, { "epoch": 0.8542015800813981, "grad_norm": 0.728387713432312, "learning_rate": 5.549540910884649e-07, "logits/chosen": -1.149441123008728, "logits/rejected": -1.142416000366211, "logps/chosen": -958.7587890625, "logps/rejected": -992.5391845703125, "loss": 0.0954, "num_input_tokens_seen": 80104704, "rewards/accuracies": 0.96875, "rewards/chosen": -1.1392351388931274, "rewards/margins": 4.274951934814453, "rewards/rejected": -5.414187431335449, "step": 223 }, { "epoch": 0.8580320804405075, "grad_norm": 0.44472503662109375, "learning_rate": 5.266475945761562e-07, "logits/chosen": -1.1332907676696777, "logits/rejected": -1.1400041580200195, "logps/chosen": -939.7073364257812, "logps/rejected": -959.638916015625, "loss": 0.0731, "num_input_tokens_seen": 80450752, "rewards/accuracies": 0.9921875, "rewards/chosen": -1.034857153892517, "rewards/margins": 4.585856914520264, "rewards/rejected": -5.62071418762207, "step": 224 }, { "epoch": 0.861862580799617, "grad_norm": 0.6553906202316284, "learning_rate": 4.990418749121179e-07, "logits/chosen": -1.157515525817871, "logits/rejected": -1.1576440334320068, "logps/chosen": -981.6168212890625, "logps/rejected": -996.4295043945312, "loss": 0.1237, "num_input_tokens_seen": 80818432, "rewards/accuracies": 0.96875, "rewards/chosen": -1.253407597541809, "rewards/margins": 3.791903257369995, "rewards/rejected": -5.045310974121094, "step": 225 }, { "epoch": 0.8656930811587263, "grad_norm": 0.5193009972572327, "learning_rate": 4.721412566939804e-07, "logits/chosen": -1.1454250812530518, "logits/rejected": -1.1320409774780273, "logps/chosen": -903.3690185546875, "logps/rejected": -966.2847900390625, "loss": 0.0816, "num_input_tokens_seen": 81175296, "rewards/accuracies": 0.9765625, "rewards/chosen": -1.2726759910583496, "rewards/margins": 4.386380195617676, "rewards/rejected": -5.659055709838867, "step": 226 }, { "epoch": 0.8695235815178358, "grad_norm": 0.558116614818573, "learning_rate": 4.4594995406110785e-07, "logits/chosen": -1.1296035051345825, "logits/rejected": -1.126936912536621, "logps/chosen": -904.9466552734375, "logps/rejected": -930.12255859375, "loss": 0.1063, "num_input_tokens_seen": 81528576, "rewards/accuracies": 0.9609375, "rewards/chosen": -1.2420897483825684, "rewards/margins": 4.35035514831543, "rewards/rejected": -5.59244441986084, "step": 227 }, { "epoch": 0.8733540818769452, "grad_norm": 0.7199344038963318, "learning_rate": 4.2047207003442003e-07, "logits/chosen": -1.1620718240737915, "logits/rejected": -1.1633644104003906, "logps/chosen": -944.5686645507812, "logps/rejected": -986.4454345703125, "loss": 0.1303, "num_input_tokens_seen": 81893888, "rewards/accuracies": 0.9609375, "rewards/chosen": -1.367161750793457, "rewards/margins": 3.982255220413208, "rewards/rejected": -5.349416732788086, "step": 228 }, { "epoch": 0.8771845822360546, "grad_norm": 0.770560085773468, "learning_rate": 3.957115958736374e-07, "logits/chosen": -1.1352251768112183, "logits/rejected": -1.1215541362762451, "logps/chosen": -881.5318603515625, "logps/rejected": -917.5421142578125, "loss": 0.1263, "num_input_tokens_seen": 82220416, "rewards/accuracies": 0.9453125, "rewards/chosen": -1.1326651573181152, "rewards/margins": 4.0631184577941895, "rewards/rejected": -5.195783615112305, "step": 229 }, { "epoch": 0.881015082595164, "grad_norm": 0.6659558415412903, "learning_rate": 3.7167241045202474e-07, "logits/chosen": -1.143093228340149, "logits/rejected": -1.1408135890960693, "logps/chosen": -967.25830078125, "logps/rejected": -995.8428955078125, "loss": 0.1151, "num_input_tokens_seen": 82582784, "rewards/accuracies": 0.9609375, "rewards/chosen": -1.2594966888427734, "rewards/margins": 4.3952860832214355, "rewards/rejected": -5.654782772064209, "step": 230 }, { "epoch": 0.8848455829542734, "grad_norm": 0.5715085864067078, "learning_rate": 3.483582796487395e-07, "logits/chosen": -1.1340765953063965, "logits/rejected": -1.1376596689224243, "logps/chosen": -950.77294921875, "logps/rejected": -955.8455200195312, "loss": 0.0953, "num_input_tokens_seen": 82945856, "rewards/accuracies": 0.9765625, "rewards/chosen": -1.2390849590301514, "rewards/margins": 4.258786201477051, "rewards/rejected": -5.497871398925781, "step": 231 }, { "epoch": 0.8886760833133828, "grad_norm": 0.6102214455604553, "learning_rate": 3.257728557588902e-07, "logits/chosen": -1.151562213897705, "logits/rejected": -1.1459643840789795, "logps/chosen": -951.6468505859375, "logps/rejected": -975.72314453125, "loss": 0.1001, "num_input_tokens_seen": 83325760, "rewards/accuracies": 0.9765625, "rewards/chosen": -1.3845793008804321, "rewards/margins": 4.108867645263672, "rewards/rejected": -5.493447303771973, "step": 232 }, { "epoch": 0.8925065836724922, "grad_norm": 0.5366233587265015, "learning_rate": 3.039196769213787e-07, "logits/chosen": -1.140844464302063, "logits/rejected": -1.1291863918304443, "logps/chosen": -923.5592041015625, "logps/rejected": -968.033203125, "loss": 0.0911, "num_input_tokens_seen": 83684032, "rewards/accuracies": 0.984375, "rewards/chosen": -1.2117785215377808, "rewards/margins": 4.085620880126953, "rewards/rejected": -5.297399520874023, "step": 233 }, { "epoch": 0.8963370840316016, "grad_norm": 0.6304879784584045, "learning_rate": 2.828021665646341e-07, "logits/chosen": -1.1628239154815674, "logits/rejected": -1.1564610004425049, "logps/chosen": -984.918701171875, "logps/rejected": -1041.814453125, "loss": 0.1165, "num_input_tokens_seen": 84058624, "rewards/accuracies": 0.96875, "rewards/chosen": -1.4553375244140625, "rewards/margins": 4.140695571899414, "rewards/rejected": -5.596033096313477, "step": 234 }, { "epoch": 0.900167584390711, "grad_norm": 0.8794420957565308, "learning_rate": 2.6242363287030617e-07, "logits/chosen": -1.153059959411621, "logits/rejected": -1.152005910873413, "logps/chosen": -919.1626586914062, "logps/rejected": -951.9708251953125, "loss": 0.1757, "num_input_tokens_seen": 84418816, "rewards/accuracies": 0.9296875, "rewards/chosen": -1.485947608947754, "rewards/margins": 3.8721084594726562, "rewards/rejected": -5.35805606842041, "step": 235 }, { "epoch": 0.9039980847498205, "grad_norm": 0.8899986743927002, "learning_rate": 2.4278726825502696e-07, "logits/chosen": -1.172439694404602, "logits/rejected": -1.1706933975219727, "logps/chosen": -941.1187744140625, "logps/rejected": -960.712158203125, "loss": 0.1672, "num_input_tokens_seen": 84791232, "rewards/accuracies": 0.9296875, "rewards/chosen": -1.478002905845642, "rewards/margins": 4.020918846130371, "rewards/rejected": -5.4989213943481445, "step": 236 }, { "epoch": 0.9078285851089298, "grad_norm": 0.7009980082511902, "learning_rate": 2.2389614887029564e-07, "logits/chosen": -1.158771276473999, "logits/rejected": -1.1504409313201904, "logps/chosen": -931.8646240234375, "logps/rejected": -975.0748901367188, "loss": 0.1262, "num_input_tokens_seen": 85148544, "rewards/accuracies": 0.96875, "rewards/chosen": -1.1419326066970825, "rewards/margins": 4.0716447830200195, "rewards/rejected": -5.2135772705078125, "step": 237 }, { "epoch": 0.9116590854680393, "grad_norm": 0.8356382846832275, "learning_rate": 2.0575323412058036e-07, "logits/chosen": -1.1342267990112305, "logits/rejected": -1.1222342252731323, "logps/chosen": -956.0548095703125, "logps/rejected": -1007.9989013671875, "loss": 0.1671, "num_input_tokens_seen": 85514624, "rewards/accuracies": 0.9453125, "rewards/chosen": -1.1702017784118652, "rewards/margins": 3.732792377471924, "rewards/rejected": -4.902994155883789, "step": 238 }, { "epoch": 0.9154895858271487, "grad_norm": 0.5944716930389404, "learning_rate": 1.8836136619971468e-07, "logits/chosen": -1.1205250024795532, "logits/rejected": -1.1232792139053345, "logps/chosen": -959.98681640625, "logps/rejected": -1005.0180053710938, "loss": 0.1018, "num_input_tokens_seen": 85866560, "rewards/accuracies": 0.9765625, "rewards/chosen": -1.3545148372650146, "rewards/margins": 4.15582275390625, "rewards/rejected": -5.510337829589844, "step": 239 }, { "epoch": 0.919320086186258, "grad_norm": 0.6897103190422058, "learning_rate": 1.7172326964564777e-07, "logits/chosen": -1.1565879583358765, "logits/rejected": -1.1416594982147217, "logps/chosen": -908.073974609375, "logps/rejected": -963.4183349609375, "loss": 0.1222, "num_input_tokens_seen": 86227584, "rewards/accuracies": 0.96875, "rewards/chosen": -1.198516607284546, "rewards/margins": 4.089000225067139, "rewards/rejected": -5.2875165939331055, "step": 240 }, { "epoch": 0.9231505865453675, "grad_norm": 0.7914372682571411, "learning_rate": 1.5584155091362907e-07, "logits/chosen": -1.161057949066162, "logits/rejected": -1.1563315391540527, "logps/chosen": -974.1826171875, "logps/rejected": -986.8939208984375, "loss": 0.1439, "num_input_tokens_seen": 86599360, "rewards/accuracies": 0.9609375, "rewards/chosen": -1.2627431154251099, "rewards/margins": 3.8127615451812744, "rewards/rejected": -5.075504302978516, "step": 241 }, { "epoch": 0.9269810869044769, "grad_norm": 0.622022807598114, "learning_rate": 1.4071869796789427e-07, "logits/chosen": -1.1303247213363647, "logits/rejected": -1.122231125831604, "logps/chosen": -922.9158935546875, "logps/rejected": -966.453125, "loss": 0.1047, "num_input_tokens_seen": 86962112, "rewards/accuracies": 0.9765625, "rewards/chosen": -1.1618643999099731, "rewards/margins": 4.100822448730469, "rewards/rejected": -5.262686729431152, "step": 242 }, { "epoch": 0.9308115872635863, "grad_norm": 0.6456294059753418, "learning_rate": 1.263570798919106e-07, "logits/chosen": -1.153860330581665, "logits/rejected": -1.1510493755340576, "logps/chosen": -967.277587890625, "logps/rejected": -1002.4358520507812, "loss": 0.0951, "num_input_tokens_seen": 87320064, "rewards/accuracies": 0.9609375, "rewards/chosen": -1.1851540803909302, "rewards/margins": 4.55943489074707, "rewards/rejected": -5.744588851928711, "step": 243 }, { "epoch": 0.9346420876226957, "grad_norm": 0.6258938908576965, "learning_rate": 1.1275894651724517e-07, "logits/chosen": -1.1263501644134521, "logits/rejected": -1.1272790431976318, "logps/chosen": -916.5689086914062, "logps/rejected": -943.59814453125, "loss": 0.111, "num_input_tokens_seen": 87670976, "rewards/accuracies": 0.9765625, "rewards/chosen": -1.3743226528167725, "rewards/margins": 4.170286178588867, "rewards/rejected": -5.5446085929870605, "step": 244 }, { "epoch": 0.9384725879818051, "grad_norm": 0.6042916178703308, "learning_rate": 9.992642807111486e-08, "logits/chosen": -1.1254693269729614, "logits/rejected": -1.1179457902908325, "logps/chosen": -866.78955078125, "logps/rejected": -914.8248901367188, "loss": 0.0918, "num_input_tokens_seen": 88002176, "rewards/accuracies": 0.984375, "rewards/chosen": -1.1663532257080078, "rewards/margins": 4.326272010803223, "rewards/rejected": -5.4926252365112305, "step": 245 }, { "epoch": 0.9423030883409146, "grad_norm": 0.4170897603034973, "learning_rate": 8.78615348426759e-08, "logits/chosen": -1.1225149631500244, "logits/rejected": -1.121935486793518, "logps/chosen": -905.6043701171875, "logps/rejected": -942.7872314453125, "loss": 0.0682, "num_input_tokens_seen": 88350976, "rewards/accuracies": 0.9921875, "rewards/chosen": -0.9678192734718323, "rewards/margins": 4.1514763832092285, "rewards/rejected": -5.119295597076416, "step": 246 }, { "epoch": 0.946133588700024, "grad_norm": 0.5161164999008179, "learning_rate": 7.656615686809976e-08, "logits/chosen": -1.1459851264953613, "logits/rejected": -1.1481428146362305, "logps/chosen": -931.0963134765625, "logps/rejected": -971.57080078125, "loss": 0.0867, "num_input_tokens_seen": 88715456, "rewards/accuracies": 0.984375, "rewards/chosen": -1.2464854717254639, "rewards/margins": 4.32740592956543, "rewards/rejected": -5.5738911628723145, "step": 247 }, { "epoch": 0.9499640890591333, "grad_norm": 0.5249192118644714, "learning_rate": 6.604206363448662e-08, "logits/chosen": -1.1482356786727905, "logits/rejected": -1.1453156471252441, "logps/chosen": -953.91796875, "logps/rejected": -984.6043701171875, "loss": 0.0874, "num_input_tokens_seen": 89086656, "rewards/accuracies": 0.984375, "rewards/chosen": -1.0581538677215576, "rewards/margins": 4.51418399810791, "rewards/rejected": -5.572338104248047, "step": 248 }, { "epoch": 0.9537945894182428, "grad_norm": 0.5978475213050842, "learning_rate": 5.6290903802665444e-08, "logits/chosen": -1.1580530405044556, "logits/rejected": -1.1452796459197998, "logps/chosen": -985.2161865234375, "logps/rejected": -1021.5433959960938, "loss": 0.0937, "num_input_tokens_seen": 89469120, "rewards/accuracies": 0.9765625, "rewards/chosen": -1.0257568359375, "rewards/margins": 4.238351821899414, "rewards/rejected": -5.264108657836914, "step": 249 }, { "epoch": 0.9576250897773522, "grad_norm": 0.726586103439331, "learning_rate": 4.7314204948923356e-08, "logits/chosen": -1.126878261566162, "logits/rejected": -1.1194556951522827, "logps/chosen": -931.3563842773438, "logps/rejected": -967.1663818359375, "loss": 0.1485, "num_input_tokens_seen": 89832832, "rewards/accuracies": 0.953125, "rewards/chosen": -1.1478817462921143, "rewards/margins": 3.753807544708252, "rewards/rejected": -4.901689529418945, "step": 250 }, { "epoch": 0.9614555901364615, "grad_norm": 0.705651581287384, "learning_rate": 3.911337332569876e-08, "logits/chosen": -1.1198232173919678, "logits/rejected": -1.1242961883544922, "logps/chosen": -925.5709228515625, "logps/rejected": -947.575927734375, "loss": 0.143, "num_input_tokens_seen": 90184960, "rewards/accuracies": 0.953125, "rewards/chosen": -1.1102354526519775, "rewards/margins": 4.033443450927734, "rewards/rejected": -5.143678665161133, "step": 251 }, { "epoch": 0.965286090495571, "grad_norm": 0.619276762008667, "learning_rate": 3.168969364128527e-08, "logits/chosen": -1.1234852075576782, "logits/rejected": -1.1284223794937134, "logps/chosen": -901.2196044921875, "logps/rejected": -934.9398193359375, "loss": 0.1005, "num_input_tokens_seen": 90540608, "rewards/accuracies": 0.9609375, "rewards/chosen": -1.3671045303344727, "rewards/margins": 4.210000991821289, "rewards/rejected": -5.577105522155762, "step": 252 }, { "epoch": 0.9691165908546804, "grad_norm": 0.6134979128837585, "learning_rate": 2.5044328858576105e-08, "logits/chosen": -1.1416277885437012, "logits/rejected": -1.1393930912017822, "logps/chosen": -944.0597534179688, "logps/rejected": -983.457275390625, "loss": 0.1082, "num_input_tokens_seen": 90903552, "rewards/accuracies": 0.96875, "rewards/chosen": -1.03385329246521, "rewards/margins": 4.257451057434082, "rewards/rejected": -5.291304111480713, "step": 253 }, { "epoch": 0.9729470912137897, "grad_norm": 0.6683414578437805, "learning_rate": 1.917832001287645e-08, "logits/chosen": -1.1274126768112183, "logits/rejected": -1.1147723197937012, "logps/chosen": -930.3221435546875, "logps/rejected": -962.2817993164062, "loss": 0.1193, "num_input_tokens_seen": 91277824, "rewards/accuracies": 0.9453125, "rewards/chosen": -1.2914443016052246, "rewards/margins": 4.013041019439697, "rewards/rejected": -5.304485321044922, "step": 254 }, { "epoch": 0.9767775915728992, "grad_norm": 0.6220396757125854, "learning_rate": 1.4092586048820578e-08, "logits/chosen": -1.156324863433838, "logits/rejected": -1.1507492065429688, "logps/chosen": -936.469482421875, "logps/rejected": -965.8038330078125, "loss": 0.1073, "num_input_tokens_seen": 91637568, "rewards/accuracies": 0.96875, "rewards/chosen": -1.1197013854980469, "rewards/margins": 4.163107872009277, "rewards/rejected": -5.282809257507324, "step": 255 }, { "epoch": 0.9806080919320086, "grad_norm": 0.6376728415489197, "learning_rate": 9.787923676414235e-09, "logits/chosen": -1.131098985671997, "logits/rejected": -1.1358366012573242, "logps/chosen": -904.1517944335938, "logps/rejected": -921.351806640625, "loss": 0.1165, "num_input_tokens_seen": 91986624, "rewards/accuracies": 0.9765625, "rewards/chosen": -1.3765130043029785, "rewards/margins": 3.9553894996643066, "rewards/rejected": -5.331902503967285, "step": 256 }, { "epoch": 0.9844385922911181, "grad_norm": 0.6460384726524353, "learning_rate": 6.265007246223365e-09, "logits/chosen": -1.1434887647628784, "logits/rejected": -1.1374716758728027, "logps/chosen": -952.5870361328125, "logps/rejected": -980.957275390625, "loss": 0.1232, "num_input_tokens_seen": 92359232, "rewards/accuracies": 0.9765625, "rewards/chosen": -1.1729092597961426, "rewards/margins": 3.816133499145508, "rewards/rejected": -4.98904275894165, "step": 257 }, { "epoch": 0.9882690926502274, "grad_norm": 0.7076448202133179, "learning_rate": 3.524388643736387e-09, "logits/chosen": -1.1206657886505127, "logits/rejected": -1.1284191608428955, "logps/chosen": -945.7650146484375, "logps/rejected": -951.6602783203125, "loss": 0.1287, "num_input_tokens_seen": 92725888, "rewards/accuracies": 0.9609375, "rewards/chosen": -1.2057055234909058, "rewards/margins": 3.72121000289917, "rewards/rejected": -4.926915645599365, "step": 258 }, { "epoch": 0.9920995930093368, "grad_norm": 0.8474130034446716, "learning_rate": 1.566497202904471e-09, "logits/chosen": -1.1205601692199707, "logits/rejected": -1.1278849840164185, "logps/chosen": -967.0809326171875, "logps/rejected": -996.765380859375, "loss": 0.1515, "num_input_tokens_seen": 93092224, "rewards/accuracies": 0.9375, "rewards/chosen": -0.9331145882606506, "rewards/margins": 3.8447437286376953, "rewards/rejected": -4.777858734130859, "step": 259 }, { "epoch": 0.9959300933684463, "grad_norm": 0.6645534634590149, "learning_rate": 3.916396388869981e-10, "logits/chosen": -1.1607704162597656, "logits/rejected": -1.154091238975525, "logps/chosen": -940.1096801757812, "logps/rejected": -995.0182495117188, "loss": 0.1015, "num_input_tokens_seen": 93450688, "rewards/accuracies": 0.9765625, "rewards/chosen": -1.526294231414795, "rewards/margins": 4.032198429107666, "rewards/rejected": -5.558492660522461, "step": 260 }, { "epoch": 0.9997605937275557, "grad_norm": 0.7298867106437683, "learning_rate": 0.0, "logits/chosen": -1.1293647289276123, "logits/rejected": -1.123779296875, "logps/chosen": -938.12451171875, "logps/rejected": -974.994140625, "loss": 0.1225, "num_input_tokens_seen": 93801152, "rewards/accuracies": 0.9453125, "rewards/chosen": -1.6063673496246338, "rewards/margins": 4.141149044036865, "rewards/rejected": -5.747516632080078, "step": 261 }, { "epoch": 0.9997605937275557, "num_input_tokens_seen": 93801152, "step": 261, "total_flos": 4.318246812239528e+18, "train_loss": 0.2454147177009747, "train_runtime": 22841.5151, "train_samples_per_second": 1.463, "train_steps_per_second": 0.011 } ], "logging_steps": 1.0, "max_steps": 261, "num_input_tokens_seen": 93801152, "num_train_epochs": 1, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 4.318246812239528e+18, "train_batch_size": 4, "trial_name": null, "trial_params": null }