{ "best_global_step": null, "best_metric": 4.09207010269165, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 10, "global_step": 697, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0, "eval_kl/ref_to_policy/chosen": -0.40358495712280273, "eval_kl/ref_to_policy/mean": -0.15629473328590393, "eval_kl/ref_to_policy/rejected": 0.0909954234957695, "eval_logits/chosen": -0.35763809084892273, "eval_logits/rejected": -0.24047470092773438, "eval_logps/chosen": -1523.8323974609375, "eval_logps/rejected": -1523.813720703125, "eval_loss": 18.300949096679688, "eval_nll_loss": 4.402440071105957, "eval_rewards/accuracies": 0.5797872543334961, "eval_rewards/chosen": 0.004035849589854479, "eval_rewards/margins": 0.004945802967995405, "eval_rewards/rejected": -0.0009099542512558401, "eval_runtime": 120.1404, "eval_samples_per_second": 3.13, "eval_steps_per_second": 1.565, "step": 0 }, { "epoch": 0.0014347202295552368, "grad_norm": 494.91876220703125, "kl/ref_to_policy/chosen": 0.908806562423706, "kl/ref_to_policy/mean": -0.009529799222946167, "kl/ref_to_policy/rejected": -0.9278661608695984, "learning_rate": 0.0, "logits/chosen": -0.3938221037387848, "logits/rejected": -0.26003432273864746, "logps/chosen": -1292.9713134765625, "logps/rejected": -1292.991943359375, "loss": 17.3741, "nll_loss": 4.167889595031738, "rewards/accuracies": 0.3125, "rewards/chosen": -0.009088065475225449, "rewards/margins": -0.018366727977991104, "rewards/rejected": 0.009278660640120506, "step": 1 }, { "epoch": 0.0028694404591104736, "grad_norm": 344.77386474609375, "kl/ref_to_policy/chosen": 2.1636548042297363, "kl/ref_to_policy/mean": 0.08187121152877808, "kl/ref_to_policy/rejected": -1.9999126195907593, "learning_rate": 1.4285714285714287e-07, "logits/chosen": -0.1529161036014557, "logits/rejected": -0.09012061357498169, "logps/chosen": -1941.6636962890625, "logps/rejected": -1949.24609375, "loss": 12.9316, "nll_loss": 3.0542612075805664, "rewards/accuracies": 0.1875, "rewards/chosen": -0.02163654938340187, "rewards/margins": -0.041635677218437195, "rewards/rejected": 0.019999125972390175, "step": 2 }, { "epoch": 0.00430416068866571, "grad_norm": 545.0651245117188, "kl/ref_to_policy/chosen": -1.0799988508224487, "kl/ref_to_policy/mean": -0.1950250267982483, "kl/ref_to_policy/rejected": 0.6899487972259521, "learning_rate": 2.8571428571428575e-07, "logits/chosen": -0.0428161583840847, "logits/rejected": -0.04000319913029671, "logps/chosen": -1827.5684814453125, "logps/rejected": -1835.48486328125, "loss": 15.9548, "nll_loss": 3.817466974258423, "rewards/accuracies": 0.625, "rewards/chosen": 0.010799989104270935, "rewards/margins": 0.017699476331472397, "rewards/rejected": -0.006899488158524036, "step": 3 }, { "epoch": 0.005738880918220947, "grad_norm": 434.3913879394531, "kl/ref_to_policy/chosen": -0.4099351167678833, "kl/ref_to_policy/mean": -0.7823785543441772, "kl/ref_to_policy/rejected": -1.1548219919204712, "learning_rate": 4.285714285714286e-07, "logits/chosen": -0.17474456131458282, "logits/rejected": -0.06484867632389069, "logps/chosen": -2235.12890625, "logps/rejected": -2240.85888671875, "loss": 14.5612, "nll_loss": 3.4658567905426025, "rewards/accuracies": 0.5625, "rewards/chosen": 0.004099351353943348, "rewards/margins": -0.007448868826031685, "rewards/rejected": 0.011548219248652458, "step": 4 }, { "epoch": 0.007173601147776184, "grad_norm": 620.6605834960938, "kl/ref_to_policy/chosen": 0.4555070400238037, "kl/ref_to_policy/mean": -0.22258111834526062, "kl/ref_to_policy/rejected": -0.9006692171096802, "learning_rate": 5.714285714285715e-07, "logits/chosen": -0.4763789772987366, "logits/rejected": -0.3004745841026306, "logps/chosen": -1180.2523193359375, "logps/rejected": -1177.632568359375, "loss": 18.9678, "nll_loss": 4.566905975341797, "rewards/accuracies": 0.3125, "rewards/chosen": -0.004555069841444492, "rewards/margins": -0.013561764732003212, "rewards/rejected": 0.009006692096590996, "step": 5 }, { "epoch": 0.00860832137733142, "grad_norm": 592.2278442382812, "kl/ref_to_policy/chosen": -0.08633410930633545, "kl/ref_to_policy/mean": -0.8875598907470703, "kl/ref_to_policy/rejected": -1.6887859106063843, "learning_rate": 7.142857142857143e-07, "logits/chosen": -0.4700797200202942, "logits/rejected": -0.295685738325119, "logps/chosen": -1534.57763671875, "logps/rejected": -1534.541748046875, "loss": 17.2371, "nll_loss": 4.133915901184082, "rewards/accuracies": 0.5625, "rewards/chosen": 0.0008633416146039963, "rewards/margins": -0.0160245168954134, "rewards/rejected": 0.016887856647372246, "step": 6 }, { "epoch": 0.010043041606886656, "grad_norm": 423.8679504394531, "kl/ref_to_policy/chosen": 0.3851618766784668, "kl/ref_to_policy/mean": 0.8802648186683655, "kl/ref_to_policy/rejected": 1.3753677606582642, "learning_rate": 8.571428571428572e-07, "logits/chosen": -0.21322765946388245, "logits/rejected": -0.17988765239715576, "logps/chosen": -1304.90087890625, "logps/rejected": -1313.3792724609375, "loss": 13.9299, "nll_loss": 3.3103785514831543, "rewards/accuracies": 0.8125, "rewards/chosen": -0.0038516188506036997, "rewards/margins": 0.00990205816924572, "rewards/rejected": -0.013753676787018776, "step": 7 }, { "epoch": 0.011477761836441894, "grad_norm": 423.3901062011719, "kl/ref_to_policy/chosen": -1.3573721647262573, "kl/ref_to_policy/mean": -0.6721571683883667, "kl/ref_to_policy/rejected": 0.013057827949523926, "learning_rate": 1.0000000000000002e-06, "logits/chosen": -0.1933058500289917, "logits/rejected": -0.08955292403697968, "logps/chosen": -1956.424072265625, "logps/rejected": -1961.333251953125, "loss": 15.8245, "nll_loss": 3.7842984199523926, "rewards/accuracies": 0.625, "rewards/chosen": 0.013573720119893551, "rewards/margins": 0.013704298995435238, "rewards/rejected": -0.0001305784098803997, "step": 8 }, { "epoch": 0.01291248206599713, "grad_norm": 846.3291625976562, "kl/ref_to_policy/chosen": 0.07702922821044922, "kl/ref_to_policy/mean": -0.9662662744522095, "kl/ref_to_policy/rejected": -2.009561538696289, "learning_rate": 1.142857142857143e-06, "logits/chosen": -0.3511194884777069, "logits/rejected": -0.1783822476863861, "logps/chosen": -1127.5302734375, "logps/rejected": -1122.570556640625, "loss": 21.9661, "nll_loss": 5.315551280975342, "rewards/accuracies": 0.6875, "rewards/chosen": -0.0007702915463596582, "rewards/margins": -0.0208659078925848, "rewards/rejected": 0.020095618441700935, "step": 9 }, { "epoch": 0.014347202295552367, "grad_norm": 591.8719482421875, "kl/ref_to_policy/chosen": -2.226840019226074, "kl/ref_to_policy/mean": -1.4663748741149902, "kl/ref_to_policy/rejected": -0.7059094905853271, "learning_rate": 1.2857142857142856e-06, "logits/chosen": -0.4361830949783325, "logits/rejected": -0.3252297639846802, "logps/chosen": -1535.7576904296875, "logps/rejected": -1534.882080078125, "loss": 18.2582, "nll_loss": 4.392968654632568, "rewards/accuracies": 0.6875, "rewards/chosen": 0.022268401458859444, "rewards/margins": 0.015209305100142956, "rewards/rejected": 0.007059094496071339, "step": 10 }, { "epoch": 0.014347202295552367, "eval_kl/ref_to_policy/chosen": -1.6686835289001465, "eval_kl/ref_to_policy/mean": -1.4389091730117798, "eval_kl/ref_to_policy/rejected": -1.2091349363327026, "eval_logits/chosen": -0.37488141655921936, "eval_logits/rejected": -0.25654885172843933, "eval_logps/chosen": -1522.5672607421875, "eval_logps/rejected": -1522.513427734375, "eval_loss": 17.374561309814453, "eval_nll_loss": 4.17080020904541, "eval_rewards/accuracies": 0.7739361524581909, "eval_rewards/chosen": 0.016686834394931793, "eval_rewards/margins": 0.004595485981553793, "eval_rewards/rejected": 0.012091348879039288, "eval_runtime": 114.5905, "eval_samples_per_second": 3.281, "eval_steps_per_second": 1.641, "step": 10 }, { "epoch": 0.015781922525107604, "grad_norm": 453.10882568359375, "kl/ref_to_policy/chosen": -1.1838700771331787, "kl/ref_to_policy/mean": -2.083493709564209, "kl/ref_to_policy/rejected": -2.98311710357666, "learning_rate": 1.4285714285714286e-06, "logits/chosen": -0.28012779355049133, "logits/rejected": -0.16846787929534912, "logps/chosen": -1643.2847900390625, "logps/rejected": -1648.188720703125, "loss": 14.3737, "nll_loss": 3.417686939239502, "rewards/accuracies": 0.5, "rewards/chosen": 0.011838700622320175, "rewards/margins": -0.017992470413446426, "rewards/rejected": 0.0298311710357666, "step": 11 }, { "epoch": 0.01721664275466284, "grad_norm": 710.6464233398438, "kl/ref_to_policy/chosen": -0.9157615900039673, "kl/ref_to_policy/mean": -0.3812723755836487, "kl/ref_to_policy/rejected": 0.15321683883666992, "learning_rate": 1.5714285714285714e-06, "logits/chosen": -0.41443413496017456, "logits/rejected": -0.3108136057853699, "logps/chosen": -893.217041015625, "logps/rejected": -887.3524780273438, "loss": 21.7227, "nll_loss": 5.258667469024658, "rewards/accuracies": 0.875, "rewards/chosen": 0.009157615713775158, "rewards/margins": 0.010689783841371536, "rewards/rejected": -0.0015321681275963783, "step": 12 }, { "epoch": 0.018651362984218076, "grad_norm": 693.8386840820312, "kl/ref_to_policy/chosen": -2.712230682373047, "kl/ref_to_policy/mean": -2.5489916801452637, "kl/ref_to_policy/rejected": -2.3857524394989014, "learning_rate": 1.7142857142857145e-06, "logits/chosen": -0.38992393016815186, "logits/rejected": -0.28846949338912964, "logps/chosen": -1236.6002197265625, "logps/rejected": -1230.9364013671875, "loss": 22.3184, "nll_loss": 5.406492233276367, "rewards/accuracies": 0.75, "rewards/chosen": 0.02712230756878853, "rewards/margins": 0.0032647824846208096, "rewards/rejected": 0.02385752461850643, "step": 13 }, { "epoch": 0.020086083213773313, "grad_norm": 397.45074462890625, "kl/ref_to_policy/chosen": -2.579573631286621, "kl/ref_to_policy/mean": -2.8969709873199463, "kl/ref_to_policy/rejected": -3.2143678665161133, "learning_rate": 1.8571428571428573e-06, "logits/chosen": -0.30835121870040894, "logits/rejected": -0.17859502136707306, "logps/chosen": -1614.02978515625, "logps/rejected": -1614.91455078125, "loss": 17.3558, "nll_loss": 4.164644718170166, "rewards/accuracies": 0.75, "rewards/chosen": 0.02579573728144169, "rewards/margins": -0.006347940303385258, "rewards/rejected": 0.03214367851614952, "step": 14 }, { "epoch": 0.021520803443328552, "grad_norm": 390.5135192871094, "kl/ref_to_policy/chosen": -2.97192120552063, "kl/ref_to_policy/mean": -4.118235111236572, "kl/ref_to_policy/rejected": -5.264548301696777, "learning_rate": 2.0000000000000003e-06, "logits/chosen": -0.38788124918937683, "logits/rejected": -0.3091478943824768, "logps/chosen": -1678.3560791015625, "logps/rejected": -1681.3897705078125, "loss": 14.1834, "nll_loss": 3.369521141052246, "rewards/accuracies": 0.5, "rewards/chosen": 0.029719211161136627, "rewards/margins": -0.022926270961761475, "rewards/rejected": 0.0526454858481884, "step": 15 }, { "epoch": 0.02295552367288379, "grad_norm": 542.6211547851562, "kl/ref_to_policy/chosen": -3.9361000061035156, "kl/ref_to_policy/mean": -3.2760801315307617, "kl/ref_to_policy/rejected": -2.616060256958008, "learning_rate": 2.1428571428571427e-06, "logits/chosen": -0.32236912846565247, "logits/rejected": -0.19862160086631775, "logps/chosen": -1271.2904052734375, "logps/rejected": -1269.6846923828125, "loss": 17.3647, "nll_loss": 4.169492721557617, "rewards/accuracies": 0.6875, "rewards/chosen": 0.03936099633574486, "rewards/margins": 0.013200395740568638, "rewards/rejected": 0.026160601526498795, "step": 16 }, { "epoch": 0.024390243902439025, "grad_norm": 536.9942016601562, "kl/ref_to_policy/chosen": -5.522911071777344, "kl/ref_to_policy/mean": -5.77244758605957, "kl/ref_to_policy/rejected": -6.021984100341797, "learning_rate": 2.285714285714286e-06, "logits/chosen": -0.7363728880882263, "logits/rejected": -0.5525289177894592, "logps/chosen": -1153.40673828125, "logps/rejected": -1145.98583984375, "loss": 15.6922, "nll_loss": 3.7488489151000977, "rewards/accuracies": 0.75, "rewards/chosen": 0.05522910878062248, "rewards/margins": -0.004990733694285154, "rewards/rejected": 0.060219842940568924, "step": 17 }, { "epoch": 0.02582496413199426, "grad_norm": 252.41664123535156, "kl/ref_to_policy/chosen": -3.6294994354248047, "kl/ref_to_policy/mean": -3.5256941318511963, "kl/ref_to_policy/rejected": -3.4218881130218506, "learning_rate": 2.428571428571429e-06, "logits/chosen": -0.26896944642066956, "logits/rejected": -0.193424254655838, "logps/chosen": -1605.8314208984375, "logps/rejected": -1610.07421875, "loss": 11.7977, "nll_loss": 2.7763473987579346, "rewards/accuracies": 0.5, "rewards/chosen": 0.03629499673843384, "rewards/margins": 0.0020761124324053526, "rewards/rejected": 0.034218885004520416, "step": 18 }, { "epoch": 0.027259684361549498, "grad_norm": 1089.07373046875, "kl/ref_to_policy/chosen": -5.052062034606934, "kl/ref_to_policy/mean": -5.504865646362305, "kl/ref_to_policy/rejected": -5.957669734954834, "learning_rate": 2.571428571428571e-06, "logits/chosen": -0.6185619831085205, "logits/rejected": -0.42389124631881714, "logps/chosen": -739.924560546875, "logps/rejected": -737.1302490234375, "loss": 14.7166, "nll_loss": 3.504467487335205, "rewards/accuracies": 0.625, "rewards/chosen": 0.05052062124013901, "rewards/margins": -0.009056070819497108, "rewards/rejected": 0.05957669019699097, "step": 19 }, { "epoch": 0.028694404591104734, "grad_norm": 810.7783203125, "kl/ref_to_policy/chosen": -3.87860369682312, "kl/ref_to_policy/mean": -2.7028255462646484, "kl/ref_to_policy/rejected": -1.5270473957061768, "learning_rate": 2.7142857142857144e-06, "logits/chosen": -0.8962723016738892, "logits/rejected": -0.7409090399742126, "logps/chosen": -759.0901489257812, "logps/rejected": -754.060791015625, "loss": 13.8619, "nll_loss": 3.2950050830841064, "rewards/accuracies": 0.6875, "rewards/chosen": 0.038786035031080246, "rewards/margins": 0.02351555973291397, "rewards/rejected": 0.015270473435521126, "step": 20 }, { "epoch": 0.028694404591104734, "eval_kl/ref_to_policy/chosen": -3.6462671756744385, "eval_kl/ref_to_policy/mean": -3.354346990585327, "eval_kl/ref_to_policy/rejected": -3.062427282333374, "eval_logits/chosen": -0.5590272545814514, "eval_logits/rejected": -0.4421873986721039, "eval_logps/chosen": -1520.5897216796875, "eval_logps/rejected": -1520.6602783203125, "eval_loss": 12.646102905273438, "eval_nll_loss": 2.9887468814849854, "eval_rewards/accuracies": 0.5691489577293396, "eval_rewards/chosen": 0.03646266832947731, "eval_rewards/margins": 0.0058383941650390625, "eval_rewards/rejected": 0.030624276027083397, "eval_runtime": 116.116, "eval_samples_per_second": 3.238, "eval_steps_per_second": 1.619, "step": 20 }, { "epoch": 0.03012912482065997, "grad_norm": 147.5976104736328, "kl/ref_to_policy/chosen": 1.2663776874542236, "kl/ref_to_policy/mean": 1.3666975498199463, "kl/ref_to_policy/rejected": 1.4670178890228271, "learning_rate": 2.8571428571428573e-06, "logits/chosen": -0.2961276173591614, "logits/rejected": -0.20615200698375702, "logps/chosen": -1549.055419921875, "logps/rejected": -1555.3009033203125, "loss": 9.643, "nll_loss": 2.2376596927642822, "rewards/accuracies": 0.375, "rewards/chosen": -0.01266377605497837, "rewards/margins": 0.0020064006093889475, "rewards/rejected": -0.014670176431536674, "step": 21 }, { "epoch": 0.03156384505021521, "grad_norm": 586.3958129882812, "kl/ref_to_policy/chosen": -1.8029770851135254, "kl/ref_to_policy/mean": -0.7705388069152832, "kl/ref_to_policy/rejected": 0.2618992328643799, "learning_rate": 3e-06, "logits/chosen": -0.534240186214447, "logits/rejected": -0.4590243101119995, "logps/chosen": -1673.009521484375, "logps/rejected": -1675.183837890625, "loss": 12.0589, "nll_loss": 2.8437447547912598, "rewards/accuracies": 0.625, "rewards/chosen": 0.018029768019914627, "rewards/margins": 0.020648760721087456, "rewards/rejected": -0.0026189908385276794, "step": 22 }, { "epoch": 0.03299856527977044, "grad_norm": 390.13592529296875, "kl/ref_to_policy/chosen": -0.18882310390472412, "kl/ref_to_policy/mean": -0.422491192817688, "kl/ref_to_policy/rejected": -0.6561591625213623, "learning_rate": 3.142857142857143e-06, "logits/chosen": -0.6055192947387695, "logits/rejected": -0.5022357702255249, "logps/chosen": -1406.764404296875, "logps/rejected": -1406.03515625, "loss": 11.1682, "nll_loss": 2.6179728507995605, "rewards/accuracies": 0.5, "rewards/chosen": 0.0018882360309362411, "rewards/margins": -0.0046733589842915535, "rewards/rejected": 0.006561591289937496, "step": 23 }, { "epoch": 0.03443328550932568, "grad_norm": 232.59466552734375, "kl/ref_to_policy/chosen": -0.3575623035430908, "kl/ref_to_policy/mean": -0.5770483016967773, "kl/ref_to_policy/rejected": -0.7965340614318848, "learning_rate": 3.285714285714286e-06, "logits/chosen": -0.5532061457633972, "logits/rejected": -0.4555804431438446, "logps/chosen": -1315.470458984375, "logps/rejected": -1315.5980224609375, "loss": 12.4159, "nll_loss": 2.9297842979431152, "rewards/accuracies": 0.5625, "rewards/chosen": 0.003575623035430908, "rewards/margins": -0.004389718174934387, "rewards/rejected": 0.007965339347720146, "step": 24 }, { "epoch": 0.035868005738880916, "grad_norm": 142.84181213378906, "kl/ref_to_policy/chosen": 12.437005996704102, "kl/ref_to_policy/mean": 11.34588623046875, "kl/ref_to_policy/rejected": 10.254768371582031, "learning_rate": 3.428571428571429e-06, "logits/chosen": -0.2829191982746124, "logits/rejected": -0.22633709013462067, "logps/chosen": -2207.249267578125, "logps/rejected": -2210.3779296875, "loss": 10.643, "nll_loss": 2.4843459129333496, "rewards/accuracies": 0.4375, "rewards/chosen": -0.12437005341053009, "rewards/margins": -0.021822378039360046, "rewards/rejected": -0.10254767537117004, "step": 25 }, { "epoch": 0.03730272596843615, "grad_norm": 175.3594207763672, "kl/ref_to_policy/chosen": 7.294755935668945, "kl/ref_to_policy/mean": 6.262661933898926, "kl/ref_to_policy/rejected": 5.230567932128906, "learning_rate": 3.5714285714285718e-06, "logits/chosen": -0.47494372725486755, "logits/rejected": -0.4126063883304596, "logps/chosen": -1286.747802734375, "logps/rejected": -1288.6407470703125, "loss": 10.341, "nll_loss": 2.4089815616607666, "rewards/accuracies": 0.4375, "rewards/chosen": -0.07294755429029465, "rewards/margins": -0.020641878247261047, "rewards/rejected": -0.0523056760430336, "step": 26 }, { "epoch": 0.03873744619799139, "grad_norm": 183.50201416015625, "kl/ref_to_policy/chosen": 11.912699699401855, "kl/ref_to_policy/mean": 10.4098482131958, "kl/ref_to_policy/rejected": 8.906997680664062, "learning_rate": 3.7142857142857146e-06, "logits/chosen": -0.3228733241558075, "logits/rejected": -0.25247856974601746, "logps/chosen": -1097.08447265625, "logps/rejected": -1100.199951171875, "loss": 10.0605, "nll_loss": 2.3377304077148438, "rewards/accuracies": 0.4375, "rewards/chosen": -0.11912700533866882, "rewards/margins": -0.030057024210691452, "rewards/rejected": -0.08906996995210648, "step": 27 }, { "epoch": 0.040172166427546625, "grad_norm": 205.595947265625, "kl/ref_to_policy/chosen": 13.346490859985352, "kl/ref_to_policy/mean": 11.828468322753906, "kl/ref_to_policy/rejected": 10.310447692871094, "learning_rate": 3.857142857142858e-06, "logits/chosen": -0.6047613620758057, "logits/rejected": -0.4682534337043762, "logps/chosen": -1790.2606201171875, "logps/rejected": -1790.5703125, "loss": 10.4708, "nll_loss": 2.439980983734131, "rewards/accuracies": 0.5, "rewards/chosen": -0.13346490263938904, "rewards/margins": -0.03036043420433998, "rewards/rejected": -0.10310447216033936, "step": 28 }, { "epoch": 0.04160688665710186, "grad_norm": 164.3492889404297, "kl/ref_to_policy/chosen": 14.946104049682617, "kl/ref_to_policy/mean": 13.865564346313477, "kl/ref_to_policy/rejected": 12.785024642944336, "learning_rate": 4.000000000000001e-06, "logits/chosen": -0.6123009324073792, "logits/rejected": -0.4982730746269226, "logps/chosen": -1651.8133544921875, "logps/rejected": -1653.3592529296875, "loss": 10.0555, "nll_loss": 2.3373565673828125, "rewards/accuracies": 0.5, "rewards/chosen": -0.14946101605892181, "rewards/margins": -0.021610792726278305, "rewards/rejected": -0.1278502494096756, "step": 29 }, { "epoch": 0.043041606886657105, "grad_norm": 310.773681640625, "kl/ref_to_policy/chosen": 21.70225715637207, "kl/ref_to_policy/mean": 21.553295135498047, "kl/ref_to_policy/rejected": 21.40433120727539, "learning_rate": 4.1428571428571435e-06, "logits/chosen": -0.6566712260246277, "logits/rejected": -0.4915165305137634, "logps/chosen": -1746.2255859375, "logps/rejected": -1746.751953125, "loss": 10.4887, "nll_loss": 2.4479622840881348, "rewards/accuracies": 0.5625, "rewards/chosen": -0.21702256798744202, "rewards/margins": -0.0029792729765176773, "rewards/rejected": -0.2140432894229889, "step": 30 }, { "epoch": 0.043041606886657105, "eval_kl/ref_to_policy/chosen": 13.84704303741455, "eval_kl/ref_to_policy/mean": 14.104690551757812, "eval_kl/ref_to_policy/rejected": 14.362338066101074, "eval_logits/chosen": -0.7097444534301758, "eval_logits/rejected": -0.5530003309249878, "eval_logps/chosen": -1538.0831298828125, "eval_logps/rejected": -1538.0849609375, "eval_loss": 10.201334953308105, "eval_nll_loss": 2.3772478103637695, "eval_rewards/accuracies": 0.5585106611251831, "eval_rewards/chosen": -0.13847042620182037, "eval_rewards/margins": 0.005152958445250988, "eval_rewards/rejected": -0.14362338185310364, "eval_runtime": 117.0141, "eval_samples_per_second": 3.213, "eval_steps_per_second": 1.607, "step": 30 }, { "epoch": 0.04447632711621234, "grad_norm": 147.33013916015625, "kl/ref_to_policy/chosen": 17.50128936767578, "kl/ref_to_policy/mean": 16.066198348999023, "kl/ref_to_policy/rejected": 14.631108283996582, "learning_rate": 4.2857142857142855e-06, "logits/chosen": -0.6007938981056213, "logits/rejected": -0.44816553592681885, "logps/chosen": -1629.821533203125, "logps/rejected": -1631.3519287109375, "loss": 9.1783, "nll_loss": 2.1171088218688965, "rewards/accuracies": 0.5, "rewards/chosen": -0.17501290142536163, "rewards/margins": -0.028701812028884888, "rewards/rejected": -0.14631108939647675, "step": 31 }, { "epoch": 0.04591104734576758, "grad_norm": 83.03153228759766, "kl/ref_to_policy/chosen": 37.65632247924805, "kl/ref_to_policy/mean": 33.31318283081055, "kl/ref_to_policy/rejected": 28.97003936767578, "learning_rate": 4.428571428571429e-06, "logits/chosen": -0.7389390468597412, "logits/rejected": -0.5787756443023682, "logps/chosen": -2450.1796875, "logps/rejected": -2453.27880859375, "loss": 9.3751, "nll_loss": 2.1587131023406982, "rewards/accuracies": 0.375, "rewards/chosen": -0.3765631914138794, "rewards/margins": -0.08686283230781555, "rewards/rejected": -0.28970035910606384, "step": 32 }, { "epoch": 0.047345767575322814, "grad_norm": 127.7071304321289, "kl/ref_to_policy/chosen": 14.79667854309082, "kl/ref_to_policy/mean": 15.92458438873291, "kl/ref_to_policy/rejected": 17.052490234375, "learning_rate": 4.571428571428572e-06, "logits/chosen": -0.8169029355049133, "logits/rejected": -0.6144357919692993, "logps/chosen": -904.234130859375, "logps/rejected": -903.4177856445312, "loss": 9.9209, "nll_loss": 2.3093130588531494, "rewards/accuracies": 0.5625, "rewards/chosen": -0.14796678721904755, "rewards/margins": 0.022558096796274185, "rewards/rejected": -0.17052488029003143, "step": 33 }, { "epoch": 0.04878048780487805, "grad_norm": 99.5516586303711, "kl/ref_to_policy/chosen": 34.05644989013672, "kl/ref_to_policy/mean": 34.9261589050293, "kl/ref_to_policy/rejected": 35.79586410522461, "learning_rate": 4.714285714285715e-06, "logits/chosen": -0.6680092215538025, "logits/rejected": -0.44610312581062317, "logps/chosen": -2374.525146484375, "logps/rejected": -2374.6787109375, "loss": 10.1768, "nll_loss": 2.3724586963653564, "rewards/accuracies": 0.625, "rewards/chosen": -0.34056445956230164, "rewards/margins": 0.017394136637449265, "rewards/rejected": -0.3579586446285248, "step": 34 }, { "epoch": 0.05021520803443329, "grad_norm": 138.3177947998047, "kl/ref_to_policy/chosen": 43.08303451538086, "kl/ref_to_policy/mean": 42.21685791015625, "kl/ref_to_policy/rejected": 41.350677490234375, "learning_rate": 4.857142857142858e-06, "logits/chosen": -0.4769934117794037, "logits/rejected": -0.339036226272583, "logps/chosen": -1580.1925048828125, "logps/rejected": -1582.4366455078125, "loss": 9.0461, "nll_loss": 2.0855720043182373, "rewards/accuracies": 0.4375, "rewards/chosen": -0.43083032965660095, "rewards/margins": -0.01732354611158371, "rewards/rejected": -0.41350680589675903, "step": 35 }, { "epoch": 0.05164992826398852, "grad_norm": 116.54885864257812, "kl/ref_to_policy/chosen": 14.099658012390137, "kl/ref_to_policy/mean": 15.921082496643066, "kl/ref_to_policy/rejected": 17.742504119873047, "learning_rate": 5e-06, "logits/chosen": -0.853381335735321, "logits/rejected": -0.5676892995834351, "logps/chosen": -1137.615234375, "logps/rejected": -1136.1156005859375, "loss": 10.3682, "nll_loss": 2.4227492809295654, "rewards/accuracies": 0.6875, "rewards/chosen": -0.14099659025669098, "rewards/margins": 0.03642846643924713, "rewards/rejected": -0.1774250566959381, "step": 36 }, { "epoch": 0.05308464849354376, "grad_norm": 75.70368194580078, "kl/ref_to_policy/chosen": 45.902164459228516, "kl/ref_to_policy/mean": 43.580230712890625, "kl/ref_to_policy/rejected": 41.25830078125, "learning_rate": 5.142857142857142e-06, "logits/chosen": -0.6782353520393372, "logits/rejected": -0.4936344623565674, "logps/chosen": -2163.136474609375, "logps/rejected": -2164.43603515625, "loss": 9.4221, "nll_loss": 2.175588607788086, "rewards/accuracies": 0.5, "rewards/chosen": -0.459021657705307, "rewards/margins": -0.04643864184617996, "rewards/rejected": -0.41258299350738525, "step": 37 }, { "epoch": 0.054519368723098996, "grad_norm": 139.2948760986328, "kl/ref_to_policy/chosen": 37.24257278442383, "kl/ref_to_policy/mean": 37.88835144042969, "kl/ref_to_policy/rejected": 38.53413009643555, "learning_rate": 5.285714285714286e-06, "logits/chosen": -0.7857111692428589, "logits/rejected": -0.4988228678703308, "logps/chosen": -1387.031494140625, "logps/rejected": -1386.1668701171875, "loss": 9.4702, "nll_loss": 2.195127010345459, "rewards/accuracies": 0.625, "rewards/chosen": -0.37242570519447327, "rewards/margins": 0.012915574945509434, "rewards/rejected": -0.3853413164615631, "step": 38 }, { "epoch": 0.05595408895265423, "grad_norm": 92.82568359375, "kl/ref_to_policy/chosen": 17.142776489257812, "kl/ref_to_policy/mean": 18.945871353149414, "kl/ref_to_policy/rejected": 20.74896240234375, "learning_rate": 5.428571428571429e-06, "logits/chosen": -0.8582590818405151, "logits/rejected": -0.5574302673339844, "logps/chosen": -944.814697265625, "logps/rejected": -943.0985717773438, "loss": 9.5759, "nll_loss": 2.224743127822876, "rewards/accuracies": 0.625, "rewards/chosen": -0.17142778635025024, "rewards/margins": 0.0360618457198143, "rewards/rejected": -0.20748960971832275, "step": 39 }, { "epoch": 0.05738880918220947, "grad_norm": 74.48859405517578, "kl/ref_to_policy/chosen": 42.407901763916016, "kl/ref_to_policy/mean": 40.72885513305664, "kl/ref_to_policy/rejected": 39.049808502197266, "learning_rate": 5.571428571428572e-06, "logits/chosen": -0.5759834051132202, "logits/rejected": -0.39960235357284546, "logps/chosen": -1610.6031494140625, "logps/rejected": -1611.6318359375, "loss": 9.1385, "nll_loss": 2.1064229011535645, "rewards/accuracies": 0.4375, "rewards/chosen": -0.424079030752182, "rewards/margins": -0.033580973744392395, "rewards/rejected": -0.3904981017112732, "step": 40 }, { "epoch": 0.05738880918220947, "eval_kl/ref_to_policy/chosen": 35.68086242675781, "eval_kl/ref_to_policy/mean": 35.88503646850586, "eval_kl/ref_to_policy/rejected": 36.08921432495117, "eval_logits/chosen": -0.6401399970054626, "eval_logits/rejected": -0.4124494194984436, "eval_logps/chosen": -1559.9168701171875, "eval_logps/rejected": -1559.8118896484375, "eval_loss": 9.266788482666016, "eval_nll_loss": 2.1433265209198, "eval_rewards/accuracies": 0.5585106611251831, "eval_rewards/chosen": -0.356808602809906, "eval_rewards/margins": 0.004083483945578337, "eval_rewards/rejected": -0.36089208722114563, "eval_runtime": 116.0882, "eval_samples_per_second": 3.239, "eval_steps_per_second": 1.619, "step": 40 }, { "epoch": 0.058823529411764705, "grad_norm": 81.45085144042969, "kl/ref_to_policy/chosen": 45.937557220458984, "kl/ref_to_policy/mean": 43.52786636352539, "kl/ref_to_policy/rejected": 41.11817169189453, "learning_rate": 5.7142857142857145e-06, "logits/chosen": -0.38241833448410034, "logits/rejected": -0.22093409299850464, "logps/chosen": -1874.3046875, "logps/rejected": -1876.3897705078125, "loss": 9.2865, "nll_loss": 2.141667366027832, "rewards/accuracies": 0.4375, "rewards/chosen": -0.4593755900859833, "rewards/margins": -0.04819389432668686, "rewards/rejected": -0.41118165850639343, "step": 41 }, { "epoch": 0.06025824964131994, "grad_norm": 69.51065826416016, "kl/ref_to_policy/chosen": 23.142425537109375, "kl/ref_to_policy/mean": 24.827072143554688, "kl/ref_to_policy/rejected": 26.51171875, "learning_rate": 5.857142857142858e-06, "logits/chosen": -0.6976562738418579, "logits/rejected": -0.4269261956214905, "logps/chosen": -1241.6658935546875, "logps/rejected": -1241.111083984375, "loss": 9.0736, "nll_loss": 2.098804235458374, "rewards/accuracies": 0.625, "rewards/chosen": -0.2314242422580719, "rewards/margins": 0.03369291126728058, "rewards/rejected": -0.26511716842651367, "step": 42 }, { "epoch": 0.06169296987087518, "grad_norm": 67.34141540527344, "kl/ref_to_policy/chosen": 33.76701736450195, "kl/ref_to_policy/mean": 34.06694412231445, "kl/ref_to_policy/rejected": 34.36686706542969, "learning_rate": 6e-06, "logits/chosen": -0.4527082145214081, "logits/rejected": -0.23598480224609375, "logps/chosen": -1570.454345703125, "logps/rejected": -1571.591552734375, "loss": 8.9751, "nll_loss": 2.0706355571746826, "rewards/accuracies": 0.5625, "rewards/chosen": -0.3376702070236206, "rewards/margins": 0.005998471286147833, "rewards/rejected": -0.3436686396598816, "step": 43 }, { "epoch": 0.06312769010043041, "grad_norm": 66.52509307861328, "kl/ref_to_policy/chosen": 24.926050186157227, "kl/ref_to_policy/mean": 24.77595329284668, "kl/ref_to_policy/rejected": 24.625858306884766, "learning_rate": 6.142857142857144e-06, "logits/chosen": -0.66541987657547, "logits/rejected": -0.36377108097076416, "logps/chosen": -1118.246337890625, "logps/rejected": -1117.9051513671875, "loss": 8.9965, "nll_loss": 2.0745370388031006, "rewards/accuracies": 0.5625, "rewards/chosen": -0.24926048517227173, "rewards/margins": -0.003001917153596878, "rewards/rejected": -0.24625854194164276, "step": 44 }, { "epoch": 0.06456241032998565, "grad_norm": 56.859676361083984, "kl/ref_to_policy/chosen": 19.461811065673828, "kl/ref_to_policy/mean": 17.866352081298828, "kl/ref_to_policy/rejected": 16.27089500427246, "learning_rate": 6.285714285714286e-06, "logits/chosen": -0.4730224013328552, "logits/rejected": -0.26287344098091125, "logps/chosen": -1304.7742919921875, "logps/rejected": -1305.49365234375, "loss": 8.8041, "nll_loss": 2.0228688716888428, "rewards/accuracies": 0.4375, "rewards/chosen": -0.1946180760860443, "rewards/margins": -0.031909164041280746, "rewards/rejected": -0.16270893812179565, "step": 45 }, { "epoch": 0.06599713055954089, "grad_norm": 73.23810577392578, "kl/ref_to_policy/chosen": 22.29429054260254, "kl/ref_to_policy/mean": 22.225746154785156, "kl/ref_to_policy/rejected": 22.157197952270508, "learning_rate": 6.4285714285714295e-06, "logits/chosen": -0.8113654255867004, "logits/rejected": -0.4611293077468872, "logps/chosen": -1802.770751953125, "logps/rejected": -1802.3836669921875, "loss": 9.1792, "nll_loss": 2.12013840675354, "rewards/accuracies": 0.625, "rewards/chosen": -0.22294288873672485, "rewards/margins": -0.001370944082736969, "rewards/rejected": -0.22157195210456848, "step": 46 }, { "epoch": 0.06743185078909612, "grad_norm": 44.25760269165039, "kl/ref_to_policy/chosen": 25.83917999267578, "kl/ref_to_policy/mean": 24.35340118408203, "kl/ref_to_policy/rejected": 22.86762237548828, "learning_rate": 6.571428571428572e-06, "logits/chosen": -0.5725135207176208, "logits/rejected": -0.38416242599487305, "logps/chosen": -2144.62060546875, "logps/rejected": -2145.984619140625, "loss": 8.3919, "nll_loss": 1.920141339302063, "rewards/accuracies": 0.5, "rewards/chosen": -0.25839176774024963, "rewards/margins": -0.029715517535805702, "rewards/rejected": -0.22867624461650848, "step": 47 }, { "epoch": 0.06886657101865136, "grad_norm": 57.058677673339844, "kl/ref_to_policy/chosen": 16.616636276245117, "kl/ref_to_policy/mean": 16.120519638061523, "kl/ref_to_policy/rejected": 15.624404907226562, "learning_rate": 6.714285714285714e-06, "logits/chosen": -0.6281272172927856, "logits/rejected": -0.3071502149105072, "logps/chosen": -1776.84326171875, "logps/rejected": -1776.776611328125, "loss": 9.0602, "nll_loss": 2.0895743370056152, "rewards/accuracies": 0.5, "rewards/chosen": -0.16616636514663696, "rewards/margins": -0.009922307915985584, "rewards/rejected": -0.15624403953552246, "step": 48 }, { "epoch": 0.0703012912482066, "grad_norm": 48.79561233520508, "kl/ref_to_policy/chosen": 20.498491287231445, "kl/ref_to_policy/mean": 19.105344772338867, "kl/ref_to_policy/rejected": 17.712196350097656, "learning_rate": 6.857142857142858e-06, "logits/chosen": -0.5798385739326477, "logits/rejected": -0.327796995639801, "logps/chosen": -2112.33447265625, "logps/rejected": -2112.741455078125, "loss": 8.3832, "nll_loss": 1.917919635772705, "rewards/accuracies": 0.5, "rewards/chosen": -0.20498493313789368, "rewards/margins": -0.027862947434186935, "rewards/rejected": -0.17712196707725525, "step": 49 }, { "epoch": 0.07173601147776183, "grad_norm": 52.90082931518555, "kl/ref_to_policy/chosen": 19.256507873535156, "kl/ref_to_policy/mean": 16.65262794494629, "kl/ref_to_policy/rejected": 14.048749923706055, "learning_rate": 7e-06, "logits/chosen": -0.6716729402542114, "logits/rejected": -0.4553607106208801, "logps/chosen": -1363.115966796875, "logps/rejected": -1363.505859375, "loss": 8.1228, "nll_loss": 1.8499079942703247, "rewards/accuracies": 0.4375, "rewards/chosen": -0.19256505370140076, "rewards/margins": -0.05207756906747818, "rewards/rejected": -0.14048749208450317, "step": 50 }, { "epoch": 0.07173601147776183, "eval_kl/ref_to_policy/chosen": 22.066265106201172, "eval_kl/ref_to_policy/mean": 22.298574447631836, "eval_kl/ref_to_policy/rejected": 22.5308780670166, "eval_logits/chosen": -0.9036331176757812, "eval_logits/rejected": -0.6333881616592407, "eval_logps/chosen": -1546.302490234375, "eval_logps/rejected": -1546.2535400390625, "eval_loss": 8.49919605255127, "eval_nll_loss": 1.9512840509414673, "eval_rewards/accuracies": 0.5585106611251831, "eval_rewards/chosen": -0.2206626683473587, "eval_rewards/margins": 0.004646116402000189, "eval_rewards/rejected": -0.22530879080295563, "eval_runtime": 112.5273, "eval_samples_per_second": 3.341, "eval_steps_per_second": 1.671, "step": 50 }, { "epoch": 0.07317073170731707, "grad_norm": 72.79331970214844, "kl/ref_to_policy/chosen": 5.951180934906006, "kl/ref_to_policy/mean": 8.069159507751465, "kl/ref_to_policy/rejected": 10.187139511108398, "learning_rate": 7.1428571428571436e-06, "logits/chosen": -1.1963375806808472, "logits/rejected": -0.8610094785690308, "logps/chosen": -1092.732421875, "logps/rejected": -1091.2808837890625, "loss": 8.4121, "nll_loss": 1.934317946434021, "rewards/accuracies": 0.6875, "rewards/chosen": -0.05951179563999176, "rewards/margins": 0.042359575629234314, "rewards/rejected": -0.10187138617038727, "step": 51 }, { "epoch": 0.0746054519368723, "grad_norm": 78.18092346191406, "kl/ref_to_policy/chosen": -1.0044903755187988, "kl/ref_to_policy/mean": 1.288290023803711, "kl/ref_to_policy/rejected": 3.5810701847076416, "learning_rate": 7.285714285714286e-06, "logits/chosen": -1.2153067588806152, "logits/rejected": -0.8909232020378113, "logps/chosen": -968.0267944335938, "logps/rejected": -966.8643188476562, "loss": 8.0669, "nll_loss": 1.8484219312667847, "rewards/accuracies": 0.625, "rewards/chosen": 0.010044906288385391, "rewards/margins": 0.045855604112148285, "rewards/rejected": -0.03581070154905319, "step": 52 }, { "epoch": 0.07604017216642754, "grad_norm": 50.289207458496094, "kl/ref_to_policy/chosen": 26.262279510498047, "kl/ref_to_policy/mean": 23.400026321411133, "kl/ref_to_policy/rejected": 20.53777503967285, "learning_rate": 7.428571428571429e-06, "logits/chosen": -0.682560384273529, "logits/rejected": -0.4731484353542328, "logps/chosen": -2087.31689453125, "logps/rejected": -2088.34033203125, "loss": 7.9799, "nll_loss": 1.8134710788726807, "rewards/accuracies": 0.375, "rewards/chosen": -0.26262277364730835, "rewards/margins": -0.05724504217505455, "rewards/rejected": -0.2053777575492859, "step": 53 }, { "epoch": 0.07747489239598278, "grad_norm": 64.03902435302734, "kl/ref_to_policy/chosen": 17.070436477661133, "kl/ref_to_policy/mean": 16.919193267822266, "kl/ref_to_policy/rejected": 16.767946243286133, "learning_rate": 7.571428571428572e-06, "logits/chosen": -1.1147396564483643, "logits/rejected": -0.861160159111023, "logps/chosen": -1445.898681640625, "logps/rejected": -1446.0433349609375, "loss": 8.0335, "nll_loss": 1.833875298500061, "rewards/accuracies": 0.5, "rewards/chosen": -0.17070437967777252, "rewards/margins": -0.003024909645318985, "rewards/rejected": -0.16767947375774384, "step": 54 }, { "epoch": 0.07890961262553801, "grad_norm": 74.27228546142578, "kl/ref_to_policy/chosen": 7.539156913757324, "kl/ref_to_policy/mean": 8.022860527038574, "kl/ref_to_policy/rejected": 8.506563186645508, "learning_rate": 7.714285714285716e-06, "logits/chosen": -1.2243831157684326, "logits/rejected": -0.9699686169624329, "logps/chosen": -1108.6016845703125, "logps/rejected": -1108.098388671875, "loss": 8.177, "nll_loss": 1.8710371255874634, "rewards/accuracies": 0.5625, "rewards/chosen": -0.07539156079292297, "rewards/margins": 0.009674077853560448, "rewards/rejected": -0.08506564050912857, "step": 55 }, { "epoch": 0.08034433285509325, "grad_norm": 77.92454528808594, "kl/ref_to_policy/chosen": 7.597426891326904, "kl/ref_to_policy/mean": 7.985602378845215, "kl/ref_to_policy/rejected": 8.373777389526367, "learning_rate": 7.857142857142858e-06, "logits/chosen": -1.1577574014663696, "logits/rejected": -0.9139986038208008, "logps/chosen": -1466.136474609375, "logps/rejected": -1465.709716796875, "loss": 8.2448, "nll_loss": 1.887803316116333, "rewards/accuracies": 0.5625, "rewards/chosen": -0.07597425580024719, "rewards/margins": 0.007763510569930077, "rewards/rejected": -0.08373777568340302, "step": 56 }, { "epoch": 0.08177905308464849, "grad_norm": 78.5572509765625, "kl/ref_to_policy/chosen": 11.760835647583008, "kl/ref_to_policy/mean": 14.299156188964844, "kl/ref_to_policy/rejected": 16.837478637695312, "learning_rate": 8.000000000000001e-06, "logits/chosen": -1.2760138511657715, "logits/rejected": -1.046396255493164, "logps/chosen": -1142.6593017578125, "logps/rejected": -1142.3941650390625, "loss": 7.7155, "nll_loss": 1.7612359523773193, "rewards/accuracies": 0.75, "rewards/chosen": -0.11760835349559784, "rewards/margins": 0.05076642334461212, "rewards/rejected": -0.16837477684020996, "step": 57 }, { "epoch": 0.08321377331420372, "grad_norm": 72.0765380859375, "kl/ref_to_policy/chosen": 25.862668991088867, "kl/ref_to_policy/mean": 26.122331619262695, "kl/ref_to_policy/rejected": 26.381990432739258, "learning_rate": 8.142857142857143e-06, "logits/chosen": -1.0769097805023193, "logits/rejected": -0.8992606401443481, "logps/chosen": -1547.6439208984375, "logps/rejected": -1548.065673828125, "loss": 7.4532, "nll_loss": 1.6898409128189087, "rewards/accuracies": 0.5625, "rewards/chosen": -0.25862669944763184, "rewards/margins": 0.005193240009248257, "rewards/rejected": -0.26381993293762207, "step": 58 }, { "epoch": 0.08464849354375897, "grad_norm": 99.10498809814453, "kl/ref_to_policy/chosen": -0.8381727933883667, "kl/ref_to_policy/mean": 3.6313138008117676, "kl/ref_to_policy/rejected": 8.100800514221191, "learning_rate": 8.285714285714287e-06, "logits/chosen": -1.4393999576568604, "logits/rejected": -1.1980133056640625, "logps/chosen": -1157.1148681640625, "logps/rejected": -1155.882080078125, "loss": 7.5334, "nll_loss": 1.7205944061279297, "rewards/accuracies": 0.8125, "rewards/chosen": 0.008381731808185577, "rewards/margins": 0.08938972651958466, "rewards/rejected": -0.08100799471139908, "step": 59 }, { "epoch": 0.08608321377331421, "grad_norm": 74.4801254272461, "kl/ref_to_policy/chosen": 18.521602630615234, "kl/ref_to_policy/mean": 17.77753257751465, "kl/ref_to_policy/rejected": 17.033462524414062, "learning_rate": 8.428571428571429e-06, "logits/chosen": -1.0407847166061401, "logits/rejected": -0.8711041212081909, "logps/chosen": -1556.881103515625, "logps/rejected": -1557.0589599609375, "loss": 7.5395, "nll_loss": 1.708603024482727, "rewards/accuracies": 0.5625, "rewards/chosen": -0.185216024518013, "rewards/margins": -0.014881400391459465, "rewards/rejected": -0.17033463716506958, "step": 60 }, { "epoch": 0.08608321377331421, "eval_kl/ref_to_policy/chosen": 22.090404510498047, "eval_kl/ref_to_policy/mean": 22.37346076965332, "eval_kl/ref_to_policy/rejected": 22.656513214111328, "eval_logits/chosen": -1.0422927141189575, "eval_logits/rejected": -0.8642475605010986, "eval_logps/chosen": -1546.326416015625, "eval_logps/rejected": -1546.379150390625, "eval_loss": 7.789914608001709, "eval_nll_loss": 1.7739406824111938, "eval_rewards/accuracies": 0.5585106611251831, "eval_rewards/chosen": -0.22090405225753784, "eval_rewards/margins": 0.005661075934767723, "eval_rewards/rejected": -0.2265651375055313, "eval_runtime": 114.8261, "eval_samples_per_second": 3.275, "eval_steps_per_second": 1.637, "step": 60 }, { "epoch": 0.08751793400286945, "grad_norm": 96.24207305908203, "kl/ref_to_policy/chosen": 7.721226692199707, "kl/ref_to_policy/mean": 9.88934326171875, "kl/ref_to_policy/rejected": 12.057458877563477, "learning_rate": 8.571428571428571e-06, "logits/chosen": -1.3250696659088135, "logits/rejected": -1.1054030656814575, "logps/chosen": -1576.22900390625, "logps/rejected": -1575.4012451171875, "loss": 7.6495, "nll_loss": 1.7435615062713623, "rewards/accuracies": 0.6875, "rewards/chosen": -0.07721227407455444, "rewards/margins": 0.043362319469451904, "rewards/rejected": -0.12057458609342575, "step": 61 }, { "epoch": 0.08895265423242468, "grad_norm": 48.23530960083008, "kl/ref_to_policy/chosen": 31.286867141723633, "kl/ref_to_policy/mean": 28.658611297607422, "kl/ref_to_policy/rejected": 26.030353546142578, "learning_rate": 8.714285714285715e-06, "logits/chosen": -0.6825488805770874, "logits/rejected": -0.5764737129211426, "logps/chosen": -2188.997802734375, "logps/rejected": -2190.376220703125, "loss": 7.7835, "nll_loss": 1.7650232315063477, "rewards/accuracies": 0.375, "rewards/chosen": -0.3128686547279358, "rewards/margins": -0.05256511643528938, "rewards/rejected": -0.2603035271167755, "step": 62 }, { "epoch": 0.09038737446197992, "grad_norm": 100.5356216430664, "kl/ref_to_policy/chosen": -2.0972652435302734, "kl/ref_to_policy/mean": -0.20614206790924072, "kl/ref_to_policy/rejected": 1.6849807500839233, "learning_rate": 8.857142857142858e-06, "logits/chosen": -1.287955403327942, "logits/rejected": -1.0551402568817139, "logps/chosen": -1479.4339599609375, "logps/rejected": -1478.8587646484375, "loss": 7.363, "nll_loss": 1.6709693670272827, "rewards/accuracies": 0.6875, "rewards/chosen": 0.020972643047571182, "rewards/margins": 0.037822458893060684, "rewards/rejected": -0.016849806532263756, "step": 63 }, { "epoch": 0.09182209469153516, "grad_norm": 61.11951446533203, "kl/ref_to_policy/chosen": 15.723905563354492, "kl/ref_to_policy/mean": 14.636232376098633, "kl/ref_to_policy/rejected": 13.54856014251709, "learning_rate": 9e-06, "logits/chosen": -0.8386430740356445, "logits/rejected": -0.7211064100265503, "logps/chosen": -1655.1771240234375, "logps/rejected": -1655.8980712890625, "loss": 7.0858, "nll_loss": 1.5945569276809692, "rewards/accuracies": 0.5, "rewards/chosen": -0.15723904967308044, "rewards/margins": -0.021753448992967606, "rewards/rejected": -0.13548560440540314, "step": 64 }, { "epoch": 0.09325681492109039, "grad_norm": 101.93685913085938, "kl/ref_to_policy/chosen": 1.2923557758331299, "kl/ref_to_policy/mean": 3.599538564682007, "kl/ref_to_policy/rejected": 5.9067206382751465, "learning_rate": 9.142857142857144e-06, "logits/chosen": -1.3574756383895874, "logits/rejected": -1.142805576324463, "logps/chosen": -856.8626708984375, "logps/rejected": -856.31494140625, "loss": 6.6973, "nll_loss": 1.5057082176208496, "rewards/accuracies": 0.625, "rewards/chosen": -0.012923553586006165, "rewards/margins": 0.046143654733896255, "rewards/rejected": -0.05906721204519272, "step": 65 }, { "epoch": 0.09469153515064563, "grad_norm": 75.61996459960938, "kl/ref_to_policy/chosen": 3.880415439605713, "kl/ref_to_policy/mean": 2.9148917198181152, "kl/ref_to_policy/rejected": 1.9493670463562012, "learning_rate": 9.285714285714288e-06, "logits/chosen": -0.8782755136489868, "logits/rejected": -0.7198506593704224, "logps/chosen": -1597.34130859375, "logps/rejected": -1597.5684814453125, "loss": 7.4133, "nll_loss": 1.6765087842941284, "rewards/accuracies": 0.5625, "rewards/chosen": -0.03880416601896286, "rewards/margins": -0.019310493022203445, "rewards/rejected": -0.019493667408823967, "step": 66 }, { "epoch": 0.09612625538020086, "grad_norm": 96.23664093017578, "kl/ref_to_policy/chosen": -1.0575326681137085, "kl/ref_to_policy/mean": 0.9983001947402954, "kl/ref_to_policy/rejected": 3.054133415222168, "learning_rate": 9.42857142857143e-06, "logits/chosen": -1.143807291984558, "logits/rejected": -0.9573706388473511, "logps/chosen": -1304.4189453125, "logps/rejected": -1304.3275146484375, "loss": 7.0846, "nll_loss": 1.601832628250122, "rewards/accuracies": 0.6875, "rewards/chosen": 0.010575324296951294, "rewards/margins": 0.04111666604876518, "rewards/rejected": -0.03054133802652359, "step": 67 }, { "epoch": 0.0975609756097561, "grad_norm": 78.10585021972656, "kl/ref_to_policy/chosen": 13.002607345581055, "kl/ref_to_policy/mean": 11.581298828125, "kl/ref_to_policy/rejected": 10.159991264343262, "learning_rate": 9.571428571428573e-06, "logits/chosen": -0.7354602813720703, "logits/rejected": -0.5843961834907532, "logps/chosen": -1579.7056884765625, "logps/rejected": -1579.9237060546875, "loss": 7.2511, "nll_loss": 1.6346244812011719, "rewards/accuracies": 0.5, "rewards/chosen": -0.13002605736255646, "rewards/margins": -0.0284261554479599, "rewards/rejected": -0.10159990191459656, "step": 68 }, { "epoch": 0.09899569583931134, "grad_norm": 70.20247650146484, "kl/ref_to_policy/chosen": 5.5443925857543945, "kl/ref_to_policy/mean": 4.243047714233398, "kl/ref_to_policy/rejected": 2.941702365875244, "learning_rate": 9.714285714285715e-06, "logits/chosen": -0.8709947466850281, "logits/rejected": -0.7180661559104919, "logps/chosen": -1447.81787109375, "logps/rejected": -1448.121337890625, "loss": 6.6621, "nll_loss": 1.4874240159988403, "rewards/accuracies": 0.5, "rewards/chosen": -0.05544392392039299, "rewards/margins": -0.026026900857686996, "rewards/rejected": -0.029417024925351143, "step": 69 }, { "epoch": 0.10043041606886657, "grad_norm": 76.77359771728516, "kl/ref_to_policy/chosen": 6.9545369148254395, "kl/ref_to_policy/mean": 6.6963958740234375, "kl/ref_to_policy/rejected": 6.438254356384277, "learning_rate": 9.857142857142859e-06, "logits/chosen": -0.9056985378265381, "logits/rejected": -0.7312473058700562, "logps/chosen": -1403.7669677734375, "logps/rejected": -1403.891357421875, "loss": 6.5728, "nll_loss": 1.4680273532867432, "rewards/accuracies": 0.5625, "rewards/chosen": -0.06954536586999893, "rewards/margins": -0.005162823013961315, "rewards/rejected": -0.06438253819942474, "step": 70 }, { "epoch": 0.10043041606886657, "eval_kl/ref_to_policy/chosen": 11.483498573303223, "eval_kl/ref_to_policy/mean": 11.801331520080566, "eval_kl/ref_to_policy/rejected": 12.119165420532227, "eval_logits/chosen": -0.8223150372505188, "eval_logits/rejected": -0.6363703012466431, "eval_logps/chosen": -1535.7196044921875, "eval_logps/rejected": -1535.841796875, "eval_loss": 7.04022216796875, "eval_nll_loss": 1.5863534212112427, "eval_rewards/accuracies": 0.5585106611251831, "eval_rewards/chosen": -0.1148349717259407, "eval_rewards/margins": 0.006356681231409311, "eval_rewards/rejected": -0.12119164317846298, "eval_runtime": 114.9823, "eval_samples_per_second": 3.27, "eval_steps_per_second": 1.635, "step": 70 }, { "epoch": 0.10186513629842181, "grad_norm": 66.38375091552734, "kl/ref_to_policy/chosen": 2.262831449508667, "kl/ref_to_policy/mean": -0.12483334541320801, "kl/ref_to_policy/rejected": -2.512498378753662, "learning_rate": 1e-05, "logits/chosen": -0.657391369342804, "logits/rejected": -0.5054954886436462, "logps/chosen": -1099.200439453125, "logps/rejected": -1099.4542236328125, "loss": 6.5832, "nll_loss": 1.4650670289993286, "rewards/accuracies": 0.4375, "rewards/chosen": -0.022628314793109894, "rewards/margins": -0.04775330424308777, "rewards/rejected": 0.025124981999397278, "step": 71 }, { "epoch": 0.10329985652797705, "grad_norm": 105.94052124023438, "kl/ref_to_policy/chosen": -3.0876810550689697, "kl/ref_to_policy/mean": 0.34024739265441895, "kl/ref_to_policy/rejected": 3.768174648284912, "learning_rate": 9.99993723698994e-06, "logits/chosen": -1.1243045330047607, "logits/rejected": -0.8781906366348267, "logps/chosen": -1228.1544189453125, "logps/rejected": -1228.0025634765625, "loss": 6.077, "nll_loss": 1.353310227394104, "rewards/accuracies": 0.75, "rewards/chosen": 0.030876807868480682, "rewards/margins": 0.06855855882167816, "rewards/rejected": -0.03768175095319748, "step": 72 }, { "epoch": 0.10473457675753228, "grad_norm": 66.2980728149414, "kl/ref_to_policy/chosen": 10.519880294799805, "kl/ref_to_policy/mean": 7.133124828338623, "kl/ref_to_policy/rejected": 3.746368169784546, "learning_rate": 9.999748949535436e-06, "logits/chosen": -0.48436933755874634, "logits/rejected": -0.35865575075149536, "logps/chosen": -1990.0980224609375, "logps/rejected": -1990.504150390625, "loss": 7.0374, "nll_loss": 1.5760952234268188, "rewards/accuracies": 0.375, "rewards/chosen": -0.10519880801439285, "rewards/margins": -0.06773511320352554, "rewards/rejected": -0.03746368736028671, "step": 73 }, { "epoch": 0.10616929698708752, "grad_norm": 87.7349624633789, "kl/ref_to_policy/chosen": -4.301197528839111, "kl/ref_to_policy/mean": -2.0634801387786865, "kl/ref_to_policy/rejected": 0.17423784732818604, "learning_rate": 9.999435142363484e-06, "logits/chosen": -0.8438023924827576, "logits/rejected": -0.598275899887085, "logps/chosen": -1069.25341796875, "logps/rejected": -1069.2406005859375, "loss": 6.5081, "nll_loss": 1.4581992626190186, "rewards/accuracies": 0.625, "rewards/chosen": 0.04301197826862335, "rewards/margins": 0.044754352420568466, "rewards/rejected": -0.001742372289299965, "step": 74 }, { "epoch": 0.10760401721664276, "grad_norm": 67.996337890625, "kl/ref_to_policy/chosen": 4.256232261657715, "kl/ref_to_policy/mean": 3.0298848152160645, "kl/ref_to_policy/rejected": 1.8035364151000977, "learning_rate": 9.998995823352276e-06, "logits/chosen": -0.6617578864097595, "logits/rejected": -0.4818864166736603, "logps/chosen": -1150.2484130859375, "logps/rejected": -1150.502685546875, "loss": 6.2665, "nll_loss": 1.3891692161560059, "rewards/accuracies": 0.4375, "rewards/chosen": -0.042562320828437805, "rewards/margins": -0.024526961147785187, "rewards/rejected": -0.018035363405942917, "step": 75 }, { "epoch": 0.10903873744619799, "grad_norm": 89.04920959472656, "kl/ref_to_policy/chosen": 2.7033581733703613, "kl/ref_to_policy/mean": 4.906732559204102, "kl/ref_to_policy/rejected": 7.110106945037842, "learning_rate": 9.998431003531008e-06, "logits/chosen": -0.8681896328926086, "logits/rejected": -0.6382002234458923, "logps/chosen": -1826.445556640625, "logps/rejected": -1826.510498046875, "loss": 6.3734, "nll_loss": 1.4243934154510498, "rewards/accuracies": 0.6875, "rewards/chosen": -0.027033578604459763, "rewards/margins": 0.04406748712062836, "rewards/rejected": -0.07110106945037842, "step": 76 }, { "epoch": 0.11047345767575323, "grad_norm": 86.5269775390625, "kl/ref_to_policy/chosen": 4.224468231201172, "kl/ref_to_policy/mean": 6.2589826583862305, "kl/ref_to_policy/rejected": 8.293498039245605, "learning_rate": 9.997740697079595e-06, "logits/chosen": -0.6520290374755859, "logits/rejected": -0.42504939436912537, "logps/chosen": -1125.8260498046875, "logps/rejected": -1125.83251953125, "loss": 6.3002, "nll_loss": 1.405788779258728, "rewards/accuracies": 0.5625, "rewards/chosen": -0.042244672775268555, "rewards/margins": 0.04069029539823532, "rewards/rejected": -0.08293496817350388, "step": 77 }, { "epoch": 0.11190817790530846, "grad_norm": 58.087852478027344, "kl/ref_to_policy/chosen": 32.66326904296875, "kl/ref_to_policy/mean": 30.176591873168945, "kl/ref_to_policy/rejected": 27.689910888671875, "learning_rate": 9.99692492132832e-06, "logits/chosen": -0.3749508559703827, "logits/rejected": -0.22214524447917938, "logps/chosen": -2529.639892578125, "logps/rejected": -2529.94580078125, "loss": 7.3232, "nll_loss": 1.6498167514801025, "rewards/accuracies": 0.5, "rewards/chosen": -0.32663270831108093, "rewards/margins": -0.04973362386226654, "rewards/rejected": -0.2768990993499756, "step": 78 }, { "epoch": 0.1133428981348637, "grad_norm": 77.1606674194336, "kl/ref_to_policy/chosen": 0.5363078117370605, "kl/ref_to_policy/mean": -0.07653665542602539, "kl/ref_to_policy/rejected": -0.6893808841705322, "learning_rate": 9.9959836967574e-06, "logits/chosen": -0.6037003397941589, "logits/rejected": -0.40836018323898315, "logps/chosen": -869.2238159179688, "logps/rejected": -869.3242797851562, "loss": 5.8402, "nll_loss": 1.2837796211242676, "rewards/accuracies": 0.5, "rewards/chosen": -0.005363075062632561, "rewards/margins": -0.012256890535354614, "rewards/rejected": 0.006893808022141457, "step": 79 }, { "epoch": 0.11477761836441894, "grad_norm": 90.070556640625, "kl/ref_to_policy/chosen": 4.565455436706543, "kl/ref_to_policy/mean": 5.676449775695801, "kl/ref_to_policy/rejected": 6.7874436378479, "learning_rate": 9.994917046996472e-06, "logits/chosen": -0.7000424265861511, "logits/rejected": -0.46428582072257996, "logps/chosen": -1124.5732421875, "logps/rejected": -1124.5062255859375, "loss": 6.2327, "nll_loss": 1.386307716369629, "rewards/accuracies": 0.5625, "rewards/chosen": -0.045654550194740295, "rewards/margins": 0.022219890728592873, "rewards/rejected": -0.06787443906068802, "step": 80 }, { "epoch": 0.11477761836441894, "eval_kl/ref_to_policy/chosen": 19.861896514892578, "eval_kl/ref_to_policy/mean": 20.176925659179688, "eval_kl/ref_to_policy/rejected": 20.491952896118164, "eval_logits/chosen": -0.6412636637687683, "eval_logits/rejected": -0.4468497037887573, "eval_logps/chosen": -1544.0980224609375, "eval_logps/rejected": -1544.214599609375, "eval_loss": 6.325852394104004, "eval_nll_loss": 1.4075939655303955, "eval_rewards/accuracies": 0.5106382966041565, "eval_rewards/chosen": -0.19861899316310883, "eval_rewards/margins": 0.006300552748143673, "eval_rewards/rejected": -0.20491953194141388, "eval_runtime": 113.5574, "eval_samples_per_second": 3.311, "eval_steps_per_second": 1.656, "step": 80 }, { "epoch": 0.11621233859397417, "grad_norm": 57.97358703613281, "kl/ref_to_policy/chosen": 28.2091064453125, "kl/ref_to_policy/mean": 24.196674346923828, "kl/ref_to_policy/rejected": 20.184240341186523, "learning_rate": 9.993724998823995e-06, "logits/chosen": -0.44470199942588806, "logits/rejected": -0.2945561408996582, "logps/chosen": -2100.958984375, "logps/rejected": -2101.10595703125, "loss": 6.5188, "nll_loss": 1.4445264339447021, "rewards/accuracies": 0.375, "rewards/chosen": -0.28209108114242554, "rewards/margins": -0.08024867624044418, "rewards/rejected": -0.20184241235256195, "step": 81 }, { "epoch": 0.11764705882352941, "grad_norm": 78.67740631103516, "kl/ref_to_policy/chosen": 15.638644218444824, "kl/ref_to_policy/mean": 15.563911437988281, "kl/ref_to_policy/rejected": 15.489177703857422, "learning_rate": 9.992407582166582e-06, "logits/chosen": -0.6755179762840271, "logits/rejected": -0.4735002815723419, "logps/chosen": -1385.201171875, "logps/rejected": -1385.4814453125, "loss": 6.514, "nll_loss": 1.4535471200942993, "rewards/accuracies": 0.5, "rewards/chosen": -0.15638643503189087, "rewards/margins": -0.001494680531322956, "rewards/rejected": -0.1548917591571808, "step": 82 }, { "epoch": 0.11908177905308465, "grad_norm": 102.8233871459961, "kl/ref_to_policy/chosen": 3.074310779571533, "kl/ref_to_policy/mean": 5.33935546875, "kl/ref_to_policy/rejected": 7.604400634765625, "learning_rate": 9.990964830098246e-06, "logits/chosen": -0.9323773980140686, "logits/rejected": -0.6492084264755249, "logps/chosen": -1457.83984375, "logps/rejected": -1458.1572265625, "loss": 5.2986, "nll_loss": 1.1552757024765015, "rewards/accuracies": 0.6875, "rewards/chosen": -0.030743110924959183, "rewards/margins": 0.04530090093612671, "rewards/rejected": -0.076044000685215, "step": 83 }, { "epoch": 0.12051649928263988, "grad_norm": 85.58009338378906, "kl/ref_to_policy/chosen": 6.815189361572266, "kl/ref_to_policy/mean": 8.99878978729248, "kl/ref_to_policy/rejected": 11.182390213012695, "learning_rate": 9.989396778839572e-06, "logits/chosen": -0.729337751865387, "logits/rejected": -0.5008273720741272, "logps/chosen": -1069.072998046875, "logps/rejected": -1069.141357421875, "loss": 5.3137, "nll_loss": 1.1594445705413818, "rewards/accuracies": 0.5625, "rewards/chosen": -0.06815189123153687, "rewards/margins": 0.04367200657725334, "rewards/rejected": -0.1118239089846611, "step": 84 }, { "epoch": 0.12195121951219512, "grad_norm": 60.47288513183594, "kl/ref_to_policy/chosen": 18.322364807128906, "kl/ref_to_policy/mean": 15.078882217407227, "kl/ref_to_policy/rejected": 11.835400581359863, "learning_rate": 9.987703467756807e-06, "logits/chosen": -0.47163185477256775, "logits/rejected": -0.32760512828826904, "logps/chosen": -1701.330322265625, "logps/rejected": -1701.30029296875, "loss": 6.2917, "nll_loss": 1.3898141384124756, "rewards/accuracies": 0.375, "rewards/chosen": -0.1832236349582672, "rewards/margins": -0.06486963480710983, "rewards/rejected": -0.11835401505231857, "step": 85 }, { "epoch": 0.12338593974175036, "grad_norm": 86.1303939819336, "kl/ref_to_policy/chosen": 0.4786018133163452, "kl/ref_to_policy/mean": 2.3628921508789062, "kl/ref_to_policy/rejected": 4.2471818923950195, "learning_rate": 9.985884939360873e-06, "logits/chosen": -0.7038267850875854, "logits/rejected": -0.48366624116897583, "logps/chosen": -1125.9317626953125, "logps/rejected": -1126.2569580078125, "loss": 5.4617, "nll_loss": 1.1958404779434204, "rewards/accuracies": 0.5625, "rewards/chosen": -0.0047860220074653625, "rewards/margins": 0.03768579661846161, "rewards/rejected": -0.04247182235121727, "step": 86 }, { "epoch": 0.12482065997130559, "grad_norm": 93.10389709472656, "kl/ref_to_policy/chosen": -8.219244956970215, "kl/ref_to_policy/mean": -5.4103875160217285, "kl/ref_to_policy/rejected": -2.6015305519104004, "learning_rate": 9.983941239306291e-06, "logits/chosen": -0.8225407004356384, "logits/rejected": -0.5915071964263916, "logps/chosen": -953.33544921875, "logps/rejected": -953.9414672851562, "loss": 4.7797, "nll_loss": 1.0275015830993652, "rewards/accuracies": 0.625, "rewards/chosen": 0.08219244331121445, "rewards/margins": 0.056177135556936264, "rewards/rejected": 0.026015304028987885, "step": 87 }, { "epoch": 0.12625538020086083, "grad_norm": 73.23225402832031, "kl/ref_to_policy/chosen": 1.443718433380127, "kl/ref_to_policy/mean": 0.9695166349411011, "kl/ref_to_policy/rejected": 0.4953155517578125, "learning_rate": 9.981872416390055e-06, "logits/chosen": -0.7230777740478516, "logits/rejected": -0.5364927053451538, "logps/chosen": -1483.37353515625, "logps/rejected": -1483.575439453125, "loss": 5.3615, "nll_loss": 1.1645348072052002, "rewards/accuracies": 0.5, "rewards/chosen": -0.01443717535585165, "rewards/margins": -0.00948402564972639, "rewards/rejected": -0.004953155294060707, "step": 88 }, { "epoch": 0.12769010043041606, "grad_norm": 84.9209213256836, "kl/ref_to_policy/chosen": 8.940335273742676, "kl/ref_to_policy/mean": 8.550710678100586, "kl/ref_to_policy/rejected": 8.161087036132812, "learning_rate": 9.979678522550382e-06, "logits/chosen": -0.7617268562316895, "logits/rejected": -0.5671170353889465, "logps/chosen": -1538.1826171875, "logps/rejected": -1538.3291015625, "loss": 5.2851, "nll_loss": 1.1450724601745605, "rewards/accuracies": 0.5625, "rewards/chosen": -0.08940334618091583, "rewards/margins": -0.0077924784272909164, "rewards/rejected": -0.08161088079214096, "step": 89 }, { "epoch": 0.1291248206599713, "grad_norm": 70.6360855102539, "kl/ref_to_policy/chosen": 0.9885406494140625, "kl/ref_to_policy/mean": 0.7452189922332764, "kl/ref_to_policy/rejected": 0.5018982887268066, "learning_rate": 9.977359612865424e-06, "logits/chosen": -0.8027845025062561, "logits/rejected": -0.6260030269622803, "logps/chosen": -1421.7542724609375, "logps/rejected": -1422.2421875, "loss": 5.3187, "nll_loss": 1.1541624069213867, "rewards/accuracies": 0.5, "rewards/chosen": -0.009885404258966446, "rewards/margins": -0.004866423085331917, "rewards/rejected": -0.005018983036279678, "step": 90 }, { "epoch": 0.1291248206599713, "eval_kl/ref_to_policy/chosen": 13.54210090637207, "eval_kl/ref_to_policy/mean": 13.881000518798828, "eval_kl/ref_to_policy/rejected": 14.21989917755127, "eval_logits/chosen": -0.6553236842155457, "eval_logits/rejected": -0.47267410159111023, "eval_logps/chosen": -1537.778076171875, "eval_logps/rejected": -1537.9425048828125, "eval_loss": 5.648195743560791, "eval_nll_loss": 1.238097906112671, "eval_rewards/accuracies": 0.5106382966041565, "eval_rewards/chosen": -0.1354210078716278, "eval_rewards/margins": 0.006777983624488115, "eval_rewards/rejected": -0.14219899475574493, "eval_runtime": 114.7687, "eval_samples_per_second": 3.276, "eval_steps_per_second": 1.638, "step": 90 }, { "epoch": 0.13055954088952654, "grad_norm": 83.60857391357422, "kl/ref_to_policy/chosen": -0.8914337158203125, "kl/ref_to_policy/mean": 0.32557249069213867, "kl/ref_to_policy/rejected": 1.5425785779953003, "learning_rate": 9.974915745551882e-06, "logits/chosen": -0.6549862623214722, "logits/rejected": -0.4791857898235321, "logps/chosen": -1015.7804565429688, "logps/rejected": -1016.0972900390625, "loss": 5.3881, "nll_loss": 1.1752805709838867, "rewards/accuracies": 0.5625, "rewards/chosen": 0.008914332836866379, "rewards/margins": 0.024340122938156128, "rewards/rejected": -0.0154257882386446, "step": 91 }, { "epoch": 0.13199426111908177, "grad_norm": 89.37378692626953, "kl/ref_to_policy/chosen": 1.4905399084091187, "kl/ref_to_policy/mean": 4.178305149078369, "kl/ref_to_policy/rejected": 6.866070747375488, "learning_rate": 9.972346981963546e-06, "logits/chosen": -0.8073133826255798, "logits/rejected": -0.5823205709457397, "logps/chosen": -1556.310791015625, "logps/rejected": -1556.8641357421875, "loss": 4.6823, "nll_loss": 1.0022286176681519, "rewards/accuracies": 0.625, "rewards/chosen": -0.01490539126098156, "rewards/margins": 0.05375530570745468, "rewards/rejected": -0.06866070628166199, "step": 92 }, { "epoch": 0.133428981348637, "grad_norm": 94.29769134521484, "kl/ref_to_policy/chosen": -9.31818962097168, "kl/ref_to_policy/mean": -5.367629528045654, "kl/ref_to_policy/rejected": -1.4170703887939453, "learning_rate": 9.969653386589749e-06, "logits/chosen": -1.0164096355438232, "logits/rejected": -0.7488709688186646, "logps/chosen": -941.4318237304688, "logps/rejected": -942.2586669921875, "loss": 4.0949, "nll_loss": 0.8585785031318665, "rewards/accuracies": 0.6875, "rewards/chosen": 0.09318189322948456, "rewards/margins": 0.07901118695735931, "rewards/rejected": 0.01417070534080267, "step": 93 }, { "epoch": 0.13486370157819225, "grad_norm": 84.36872863769531, "kl/ref_to_policy/chosen": -3.436522960662842, "kl/ref_to_policy/mean": -2.2605044841766357, "kl/ref_to_policy/rejected": -1.084485650062561, "learning_rate": 9.96683502705375e-06, "logits/chosen": -0.795971155166626, "logits/rejected": -0.5920226573944092, "logps/chosen": -956.6649780273438, "logps/rejected": -957.5463256835938, "loss": 4.6171, "nll_loss": 0.9821864366531372, "rewards/accuracies": 0.625, "rewards/chosen": 0.034365225583314896, "rewards/margins": 0.023520370945334435, "rewards/rejected": 0.010844854637980461, "step": 94 }, { "epoch": 0.13629842180774748, "grad_norm": 85.2784423828125, "kl/ref_to_policy/chosen": 2.544079542160034, "kl/ref_to_policy/mean": 5.865750789642334, "kl/ref_to_policy/rejected": 9.187422752380371, "learning_rate": 9.963891974111042e-06, "logits/chosen": -0.8111896514892578, "logits/rejected": -0.5620436072349548, "logps/chosen": -1309.1533203125, "logps/rejected": -1309.8045654296875, "loss": 4.7226, "nll_loss": 1.0140666961669922, "rewards/accuracies": 0.625, "rewards/chosen": -0.02544078603386879, "rewards/margins": 0.06643343716859818, "rewards/rejected": -0.09187422692775726, "step": 95 }, { "epoch": 0.13773314203730272, "grad_norm": 86.06370544433594, "kl/ref_to_policy/chosen": -8.701515197753906, "kl/ref_to_policy/mean": -5.492672920227051, "kl/ref_to_policy/rejected": -2.2838306427001953, "learning_rate": 9.960824301647569e-06, "logits/chosen": -0.8862168788909912, "logits/rejected": -0.645530104637146, "logps/chosen": -795.2594604492188, "logps/rejected": -795.9212036132812, "loss": 4.0645, "nll_loss": 0.8491889238357544, "rewards/accuracies": 0.625, "rewards/chosen": 0.08701513707637787, "rewards/margins": 0.06417682766914368, "rewards/rejected": 0.022838309407234192, "step": 96 }, { "epoch": 0.13916786226685796, "grad_norm": 48.50733184814453, "kl/ref_to_policy/chosen": 18.794328689575195, "kl/ref_to_policy/mean": 15.455883026123047, "kl/ref_to_policy/rejected": 12.117437362670898, "learning_rate": 9.957632086677876e-06, "logits/chosen": -0.45304301381111145, "logits/rejected": -0.3156452775001526, "logps/chosen": -2057.997314453125, "logps/rejected": -2057.65087890625, "loss": 6.2315, "nll_loss": 1.3742098808288574, "rewards/accuracies": 0.375, "rewards/chosen": -0.18794330954551697, "rewards/margins": -0.06676892191171646, "rewards/rejected": -0.12117437273263931, "step": 97 }, { "epoch": 0.1406025824964132, "grad_norm": 51.869441986083984, "kl/ref_to_policy/chosen": 7.714349746704102, "kl/ref_to_policy/mean": 5.272354602813721, "kl/ref_to_policy/rejected": 2.83035945892334, "learning_rate": 9.95431540934317e-06, "logits/chosen": -0.5075920820236206, "logits/rejected": -0.35007256269454956, "logps/chosen": -1907.7611083984375, "logps/rejected": -1907.72900390625, "loss": 5.595, "nll_loss": 1.2173631191253662, "rewards/accuracies": 0.4375, "rewards/chosen": -0.07714349776506424, "rewards/margins": -0.04883990436792374, "rewards/rejected": -0.028303595259785652, "step": 98 }, { "epoch": 0.14203730272596843, "grad_norm": 62.844688415527344, "kl/ref_to_policy/chosen": 3.257598876953125, "kl/ref_to_policy/mean": 2.1695780754089355, "kl/ref_to_policy/rejected": 1.081557273864746, "learning_rate": 9.95087435290931e-06, "logits/chosen": -0.5996955037117004, "logits/rejected": -0.4300953149795532, "logps/chosen": -1042.093505859375, "logps/rejected": -1042.33349609375, "loss": 4.959, "nll_loss": 1.0618348121643066, "rewards/accuracies": 0.5, "rewards/chosen": -0.03257599472999573, "rewards/margins": -0.021760407835245132, "rewards/rejected": -0.010815568268299103, "step": 99 }, { "epoch": 0.14347202295552366, "grad_norm": 60.18775939941406, "kl/ref_to_policy/chosen": -3.889958620071411, "kl/ref_to_policy/mean": -2.4161384105682373, "kl/ref_to_policy/rejected": -0.9423189163208008, "learning_rate": 9.947309003764723e-06, "logits/chosen": -0.5812790393829346, "logits/rejected": -0.41142016649246216, "logps/chosen": -1449.19580078125, "logps/rejected": -1449.4757080078125, "loss": 5.1882, "nll_loss": 1.1262184381484985, "rewards/accuracies": 0.5, "rewards/chosen": 0.03889958932995796, "rewards/margins": 0.02947639860212803, "rewards/rejected": 0.009423185139894485, "step": 100 }, { "epoch": 0.14347202295552366, "eval_kl/ref_to_policy/chosen": 9.456500053405762, "eval_kl/ref_to_policy/mean": 9.812389373779297, "eval_kl/ref_to_policy/rejected": 10.168277740478516, "eval_logits/chosen": -0.5990070700645447, "eval_logits/rejected": -0.42038506269454956, "eval_logps/chosen": -1533.6925048828125, "eval_logps/rejected": -1533.8907470703125, "eval_loss": 5.074648380279541, "eval_nll_loss": 1.094603180885315, "eval_rewards/accuracies": 0.5079787373542786, "eval_rewards/chosen": -0.0945649966597557, "eval_rewards/margins": 0.007117779925465584, "eval_rewards/rejected": -0.10168277472257614, "eval_runtime": 113.3836, "eval_samples_per_second": 3.316, "eval_steps_per_second": 1.658, "step": 100 }, { "epoch": 0.1449067431850789, "grad_norm": 47.1476936340332, "kl/ref_to_policy/chosen": 1.906801700592041, "kl/ref_to_policy/mean": 0.027881622314453125, "kl/ref_to_policy/rejected": -1.8510394096374512, "learning_rate": 9.943619451418225e-06, "logits/chosen": -0.5374348759651184, "logits/rejected": -0.38886475563049316, "logps/chosen": -1027.96142578125, "logps/rejected": -1027.69970703125, "loss": 4.822, "nll_loss": 1.0257850885391235, "rewards/accuracies": 0.4375, "rewards/chosen": -0.019068021327257156, "rewards/margins": -0.037578411400318146, "rewards/rejected": 0.018510393798351288, "step": 101 }, { "epoch": 0.14634146341463414, "grad_norm": 63.29978561401367, "kl/ref_to_policy/chosen": -0.7393722534179688, "kl/ref_to_policy/mean": 0.5490775108337402, "kl/ref_to_policy/rejected": 1.837526798248291, "learning_rate": 9.939805788496778e-06, "logits/chosen": -0.6672026515007019, "logits/rejected": -0.4571641683578491, "logps/chosen": -1238.2919921875, "logps/rejected": -1239.3023681640625, "loss": 4.3519, "nll_loss": 0.9160887002944946, "rewards/accuracies": 0.625, "rewards/chosen": 0.0073937177658081055, "rewards/margins": 0.025768987834453583, "rewards/rejected": -0.018375268206000328, "step": 102 }, { "epoch": 0.14777618364418937, "grad_norm": 73.44212341308594, "kl/ref_to_policy/chosen": -14.790895462036133, "kl/ref_to_policy/mean": -8.893753051757812, "kl/ref_to_policy/rejected": -2.996610164642334, "learning_rate": 9.935868110743175e-06, "logits/chosen": -0.8988282084465027, "logits/rejected": -0.6607550382614136, "logps/chosen": -392.7091064453125, "logps/rejected": -394.55224609375, "loss": 2.7311, "nll_loss": 0.52251136302948, "rewards/accuracies": 0.75, "rewards/chosen": 0.14790895581245422, "rewards/margins": 0.11794283986091614, "rewards/rejected": 0.029966097325086594, "step": 103 }, { "epoch": 0.1492109038737446, "grad_norm": 47.284400939941406, "kl/ref_to_policy/chosen": 5.610379219055176, "kl/ref_to_policy/mean": 5.411188125610352, "kl/ref_to_policy/rejected": 5.211998462677002, "learning_rate": 9.931806517013612e-06, "logits/chosen": -0.4949027895927429, "logits/rejected": -0.307862251996994, "logps/chosen": -1391.3681640625, "logps/rejected": -1391.5792236328125, "loss": 4.984, "nll_loss": 1.0704588890075684, "rewards/accuracies": 0.5, "rewards/chosen": -0.056103792041540146, "rewards/margins": -0.003983806818723679, "rewards/rejected": -0.05211998522281647, "step": 104 }, { "epoch": 0.15064562410329985, "grad_norm": 54.30876541137695, "kl/ref_to_policy/chosen": -6.305841445922852, "kl/ref_to_policy/mean": -2.133237361907959, "kl/ref_to_policy/rejected": 2.0393662452697754, "learning_rate": 9.927621109275233e-06, "logits/chosen": -0.6424197554588318, "logits/rejected": -0.4147929251194, "logps/chosen": -1110.967529296875, "logps/rejected": -1111.9886474609375, "loss": 3.8251, "nll_loss": 0.7916878461837769, "rewards/accuracies": 0.625, "rewards/chosen": 0.06305840611457825, "rewards/margins": 0.08345206826925278, "rewards/rejected": -0.02039366215467453, "step": 105 }, { "epoch": 0.15208034433285508, "grad_norm": 31.4960994720459, "kl/ref_to_policy/chosen": 12.754409790039062, "kl/ref_to_policy/mean": 9.404555320739746, "kl/ref_to_policy/rejected": 6.0547027587890625, "learning_rate": 9.92331199260355e-06, "logits/chosen": -0.4596009850502014, "logits/rejected": -0.3301039934158325, "logps/chosen": -2077.22314453125, "logps/rejected": -2076.48828125, "loss": 5.0962, "nll_loss": 1.0903637409210205, "rewards/accuracies": 0.375, "rewards/chosen": -0.1275440901517868, "rewards/margins": -0.06699706614017487, "rewards/rejected": -0.06054702028632164, "step": 106 }, { "epoch": 0.15351506456241032, "grad_norm": 42.60166931152344, "kl/ref_to_policy/chosen": -5.523712158203125, "kl/ref_to_policy/mean": -3.19888973236084, "kl/ref_to_policy/rejected": -0.8740676641464233, "learning_rate": 9.918879275179819e-06, "logits/chosen": -0.7127547860145569, "logits/rejected": -0.511856198310852, "logps/chosen": -916.281005859375, "logps/rejected": -916.9091186523438, "loss": 3.7507, "nll_loss": 0.7685686945915222, "rewards/accuracies": 0.5625, "rewards/chosen": 0.05523711442947388, "rewards/margins": 0.04649643227458, "rewards/rejected": 0.008740676566958427, "step": 107 }, { "epoch": 0.15494978479196556, "grad_norm": 32.062255859375, "kl/ref_to_policy/chosen": 8.873767852783203, "kl/ref_to_policy/mean": 6.143346786499023, "kl/ref_to_policy/rejected": 3.4129276275634766, "learning_rate": 9.914323068288312e-06, "logits/chosen": -0.4850109815597534, "logits/rejected": -0.3274956941604614, "logps/chosen": -1869.483154296875, "logps/rejected": -1869.7158203125, "loss": 4.9265, "nll_loss": 1.0491516590118408, "rewards/accuracies": 0.4375, "rewards/chosen": -0.08873766660690308, "rewards/margins": -0.05460838973522186, "rewards/rejected": -0.034129273146390915, "step": 108 }, { "epoch": 0.1563845050215208, "grad_norm": 21.45846939086914, "kl/ref_to_policy/chosen": 12.215178489685059, "kl/ref_to_policy/mean": 7.874290943145752, "kl/ref_to_policy/rejected": 3.5334043502807617, "learning_rate": 9.909643486313533e-06, "logits/chosen": -0.2649412155151367, "logits/rejected": -0.16828405857086182, "logps/chosen": -2056.566650390625, "logps/rejected": -2055.730712890625, "loss": 6.1839, "nll_loss": 1.3598480224609375, "rewards/accuracies": 0.3125, "rewards/chosen": -0.12215177714824677, "rewards/margins": -0.08681774139404297, "rewards/rejected": -0.03533405065536499, "step": 109 }, { "epoch": 0.15781922525107603, "grad_norm": 19.650623321533203, "kl/ref_to_policy/chosen": 12.63276481628418, "kl/ref_to_policy/mean": 9.10662841796875, "kl/ref_to_policy/rejected": 5.5804924964904785, "learning_rate": 9.904840646737346e-06, "logits/chosen": -0.4217444956302643, "logits/rejected": -0.3180479109287262, "logps/chosen": -2070.625, "logps/rejected": -2069.472900390625, "loss": 5.9472, "nll_loss": 1.3030997514724731, "rewards/accuracies": 0.3125, "rewards/chosen": -0.12632764875888824, "rewards/margins": -0.0705227255821228, "rewards/rejected": -0.05580492690205574, "step": 110 }, { "epoch": 0.15781922525107603, "eval_kl/ref_to_policy/chosen": 6.735382556915283, "eval_kl/ref_to_policy/mean": 7.134402751922607, "eval_kl/ref_to_policy/rejected": 7.533421516418457, "eval_logits/chosen": -0.522655725479126, "eval_logits/rejected": -0.34433406591415405, "eval_logps/chosen": -1530.9715576171875, "eval_logps/rejected": -1531.255859375, "eval_loss": 4.71306848526001, "eval_nll_loss": 1.004192590713501, "eval_rewards/accuracies": 0.5079787373542786, "eval_rewards/chosen": -0.06735382974147797, "eval_rewards/margins": 0.007980386726558208, "eval_rewards/rejected": -0.07533421367406845, "eval_runtime": 114.7909, "eval_samples_per_second": 3.276, "eval_steps_per_second": 1.638, "step": 110 }, { "epoch": 0.15925394548063126, "grad_norm": 50.48379898071289, "kl/ref_to_policy/chosen": -5.728445053100586, "kl/ref_to_policy/mean": -2.624593734741211, "kl/ref_to_policy/rejected": 0.47925806045532227, "learning_rate": 9.899914670136016e-06, "logits/chosen": -0.7350459098815918, "logits/rejected": -0.5589559674263, "logps/chosen": -1873.708984375, "logps/rejected": -1874.358154296875, "loss": 4.2456, "nll_loss": 0.8942334055900574, "rewards/accuracies": 0.5625, "rewards/chosen": 0.05728444457054138, "rewards/margins": 0.06207702308893204, "rewards/rejected": -0.0047925785183906555, "step": 111 }, { "epoch": 0.1606886657101865, "grad_norm": 30.93092918395996, "kl/ref_to_policy/chosen": -5.688694000244141, "kl/ref_to_policy/mean": -4.172355651855469, "kl/ref_to_policy/rejected": -2.656017303466797, "learning_rate": 9.89486568017719e-06, "logits/chosen": -0.5591096878051758, "logits/rejected": -0.3694317936897278, "logps/chosen": -1360.2098388671875, "logps/rejected": -1360.924072265625, "loss": 3.756, "nll_loss": 0.7676513195037842, "rewards/accuracies": 0.5625, "rewards/chosen": 0.0568869411945343, "rewards/margins": 0.03032676875591278, "rewards/rejected": 0.026560164988040924, "step": 112 }, { "epoch": 0.16212338593974174, "grad_norm": 33.52889633178711, "kl/ref_to_policy/chosen": -1.6662812232971191, "kl/ref_to_policy/mean": 2.1054978370666504, "kl/ref_to_policy/rejected": 5.87727689743042, "learning_rate": 9.889693803616793e-06, "logits/chosen": -0.6623775959014893, "logits/rejected": -0.44893863797187805, "logps/chosen": -1069.544677734375, "logps/rejected": -1070.5394287109375, "loss": 3.4539, "nll_loss": 0.6981315016746521, "rewards/accuracies": 0.625, "rewards/chosen": 0.016662809997797012, "rewards/margins": 0.0754355788230896, "rewards/rejected": -0.05877276882529259, "step": 113 }, { "epoch": 0.16355810616929697, "grad_norm": 32.55335235595703, "kl/ref_to_policy/chosen": -9.267871856689453, "kl/ref_to_policy/mean": -5.8346781730651855, "kl/ref_to_policy/rejected": -2.4014854431152344, "learning_rate": 9.884399170295839e-06, "logits/chosen": -0.7026369571685791, "logits/rejected": -0.484952449798584, "logps/chosen": -724.4821166992188, "logps/rejected": -726.6531982421875, "loss": 2.9947, "nll_loss": 0.5818889737129211, "rewards/accuracies": 0.6875, "rewards/chosen": 0.0926787257194519, "rewards/margins": 0.06866385787725449, "rewards/rejected": 0.024014852941036224, "step": 114 }, { "epoch": 0.1649928263988522, "grad_norm": 17.272872924804688, "kl/ref_to_policy/chosen": 46.77111053466797, "kl/ref_to_policy/mean": 43.695098876953125, "kl/ref_to_policy/rejected": 40.61908721923828, "learning_rate": 9.878981913137178e-06, "logits/chosen": -0.4978514313697815, "logits/rejected": -0.3522443473339081, "logps/chosen": -1888.2945556640625, "logps/rejected": -1887.185546875, "loss": 5.2074, "nll_loss": 1.1188626289367676, "rewards/accuracies": 0.375, "rewards/chosen": -0.46771109104156494, "rewards/margins": -0.061520226299762726, "rewards/rejected": -0.40619081258773804, "step": 115 }, { "epoch": 0.16642754662840745, "grad_norm": 19.70751953125, "kl/ref_to_policy/chosen": 6.208740711212158, "kl/ref_to_policy/mean": 5.516138076782227, "kl/ref_to_policy/rejected": 4.823534965515137, "learning_rate": 9.873442168142158e-06, "logits/chosen": -0.624593198299408, "logits/rejected": -0.459818571805954, "logps/chosen": -1188.380615234375, "logps/rejected": -1188.7474365234375, "loss": 4.2371, "nll_loss": 0.8822128176689148, "rewards/accuracies": 0.5, "rewards/chosen": -0.06208740919828415, "rewards/margins": -0.013852051459252834, "rewards/rejected": -0.04823535680770874, "step": 116 }, { "epoch": 0.1678622668579627, "grad_norm": 19.647659301757812, "kl/ref_to_policy/chosen": 6.5472822189331055, "kl/ref_to_policy/mean": 7.6205291748046875, "kl/ref_to_policy/rejected": 8.693775177001953, "learning_rate": 9.867780074387207e-06, "logits/chosen": -0.6745280623435974, "logits/rejected": -0.46164262294769287, "logps/chosen": -1208.06298828125, "logps/rejected": -1208.8558349609375, "loss": 3.8994, "nll_loss": 0.8019770979881287, "rewards/accuracies": 0.5625, "rewards/chosen": -0.06547282636165619, "rewards/margins": 0.021464938297867775, "rewards/rejected": -0.08693776279687881, "step": 117 }, { "epoch": 0.16929698708751795, "grad_norm": 15.209737777709961, "kl/ref_to_policy/chosen": 14.828804016113281, "kl/ref_to_policy/mean": 12.394536972045898, "kl/ref_to_policy/rejected": 9.9602689743042, "learning_rate": 9.861995774020341e-06, "logits/chosen": -0.6476424336433411, "logits/rejected": -0.4938022196292877, "logps/chosen": -1099.313232421875, "logps/rejected": -1098.848876953125, "loss": 4.5693, "nll_loss": 0.9608502388000488, "rewards/accuracies": 0.4375, "rewards/chosen": -0.1482880413532257, "rewards/margins": -0.04868534952402115, "rewards/rejected": -0.09960268437862396, "step": 118 }, { "epoch": 0.17073170731707318, "grad_norm": 14.61567497253418, "kl/ref_to_policy/chosen": 21.43667221069336, "kl/ref_to_policy/mean": 19.525413513183594, "kl/ref_to_policy/rejected": 17.614158630371094, "learning_rate": 9.856089412257605e-06, "logits/chosen": -0.47055914998054504, "logits/rejected": -0.33139604330062866, "logps/chosen": -1423.6788330078125, "logps/rejected": -1423.5946044921875, "loss": 5.097, "nll_loss": 1.0940258502960205, "rewards/accuracies": 0.4375, "rewards/chosen": -0.21436670422554016, "rewards/margins": -0.03822513669729233, "rewards/rejected": -0.17614156007766724, "step": 119 }, { "epoch": 0.17216642754662842, "grad_norm": 13.1753568649292, "kl/ref_to_policy/chosen": 15.124740600585938, "kl/ref_to_policy/mean": 14.4599027633667, "kl/ref_to_policy/rejected": 13.795063018798828, "learning_rate": 9.850061137379414e-06, "logits/chosen": -0.6101362705230713, "logits/rejected": -0.45278218388557434, "logps/chosen": -2011.476318359375, "logps/rejected": -2011.3458251953125, "loss": 5.1596, "nll_loss": 1.1130117177963257, "rewards/accuracies": 0.4375, "rewards/chosen": -0.15124741196632385, "rewards/margins": -0.01329677551984787, "rewards/rejected": -0.13795062899589539, "step": 120 }, { "epoch": 0.17216642754662842, "eval_kl/ref_to_policy/chosen": 11.454007148742676, "eval_kl/ref_to_policy/mean": 11.944790840148926, "eval_kl/ref_to_policy/rejected": 12.435572624206543, "eval_logits/chosen": -0.5870837569236755, "eval_logits/rejected": -0.4015708565711975, "eval_logps/chosen": -1535.68994140625, "eval_logps/rejected": -1536.158203125, "eval_loss": 4.584718704223633, "eval_nll_loss": 0.9722355008125305, "eval_rewards/accuracies": 0.5079787373542786, "eval_rewards/chosen": -0.11454006284475327, "eval_rewards/margins": 0.009815654717385769, "eval_rewards/rejected": -0.12435571849346161, "eval_runtime": 114.0846, "eval_samples_per_second": 3.296, "eval_steps_per_second": 1.648, "step": 120 }, { "epoch": 0.17360114777618366, "grad_norm": 15.51179313659668, "kl/ref_to_policy/chosen": 9.166370391845703, "kl/ref_to_policy/mean": 10.66579532623291, "kl/ref_to_policy/rejected": 12.16522216796875, "learning_rate": 9.843911100726838e-06, "logits/chosen": -0.5674393773078918, "logits/rejected": -0.33535444736480713, "logps/chosen": -1326.025634765625, "logps/rejected": -1326.760986328125, "loss": 4.0978, "nll_loss": 0.8526591658592224, "rewards/accuracies": 0.5625, "rewards/chosen": -0.09166368097066879, "rewards/margins": 0.029988521710038185, "rewards/rejected": -0.12165220826864243, "step": 121 }, { "epoch": 0.1750358680057389, "grad_norm": 11.017143249511719, "kl/ref_to_policy/chosen": 20.671873092651367, "kl/ref_to_policy/mean": 19.624740600585938, "kl/ref_to_policy/rejected": 18.57761001586914, "learning_rate": 9.837639456697802e-06, "logits/chosen": -0.5354692935943604, "logits/rejected": -0.3517917990684509, "logps/chosen": -1696.21875, "logps/rejected": -1695.95458984375, "loss": 4.7685, "nll_loss": 1.0142829418182373, "rewards/accuracies": 0.4375, "rewards/chosen": -0.20671874284744263, "rewards/margins": -0.02094263769686222, "rewards/rejected": -0.18577608466148376, "step": 122 }, { "epoch": 0.17647058823529413, "grad_norm": 8.48554515838623, "kl/ref_to_policy/chosen": 34.06566619873047, "kl/ref_to_policy/mean": 29.795997619628906, "kl/ref_to_policy/rejected": 25.526325225830078, "learning_rate": 9.83124636274321e-06, "logits/chosen": -0.39746949076652527, "logits/rejected": -0.29508745670318604, "logps/chosen": -2546.721923828125, "logps/rejected": -2545.6376953125, "loss": 5.8355, "nll_loss": 1.2727149724960327, "rewards/accuracies": 0.3125, "rewards/chosen": -0.34065666794776917, "rewards/margins": -0.08539342135190964, "rewards/rejected": -0.25526323914527893, "step": 123 }, { "epoch": 0.17790530846484937, "grad_norm": 44.085479736328125, "kl/ref_to_policy/chosen": 24.198863983154297, "kl/ref_to_policy/mean": 19.835105895996094, "kl/ref_to_policy/rejected": 15.471346855163574, "learning_rate": 9.824731979362991e-06, "logits/chosen": -0.5799437165260315, "logits/rejected": -0.43394869565963745, "logps/chosen": -1976.98779296875, "logps/rejected": -1977.143310546875, "loss": 4.9715, "nll_loss": 1.0558922290802002, "rewards/accuracies": 0.4375, "rewards/chosen": -0.2419886440038681, "rewards/margins": -0.0872751846909523, "rewards/rejected": -0.1547134518623352, "step": 124 }, { "epoch": 0.1793400286944046, "grad_norm": 8.130631446838379, "kl/ref_to_policy/chosen": 45.31373596191406, "kl/ref_to_policy/mean": 39.398929595947266, "kl/ref_to_policy/rejected": 33.48412322998047, "learning_rate": 9.818096470102067e-06, "logits/chosen": -0.3536359965801239, "logits/rejected": -0.2844098210334778, "logps/chosen": -2857.876220703125, "logps/rejected": -2856.06005859375, "loss": 6.5845, "nll_loss": 1.4559299945831299, "rewards/accuracies": 0.25, "rewards/chosen": -0.4531373381614685, "rewards/margins": -0.11829613894224167, "rewards/rejected": -0.33484119176864624, "step": 125 }, { "epoch": 0.18077474892395984, "grad_norm": 147.34657287597656, "kl/ref_to_policy/chosen": 5.439140319824219, "kl/ref_to_policy/mean": 8.157654762268066, "kl/ref_to_policy/rejected": 10.876168251037598, "learning_rate": 9.811340001546252e-06, "logits/chosen": -0.6936448216438293, "logits/rejected": -0.49105241894721985, "logps/chosen": -1211.298583984375, "logps/rejected": -1213.458251953125, "loss": 3.6015, "nll_loss": 0.7319586873054504, "rewards/accuracies": 0.625, "rewards/chosen": -0.05439140647649765, "rewards/margins": 0.05437028408050537, "rewards/rejected": -0.10876168310642242, "step": 126 }, { "epoch": 0.18220946915351507, "grad_norm": 8.124923706054688, "kl/ref_to_policy/chosen": 30.12948989868164, "kl/ref_to_policy/mean": 26.55831527709961, "kl/ref_to_policy/rejected": 22.987146377563477, "learning_rate": 9.80446274331807e-06, "logits/chosen": -0.5211549997329712, "logits/rejected": -0.4041650891304016, "logps/chosen": -1621.85595703125, "logps/rejected": -1621.5887451171875, "loss": 5.0723, "nll_loss": 1.0838840007781982, "rewards/accuracies": 0.375, "rewards/chosen": -0.30129486322402954, "rewards/margins": -0.07142342627048492, "rewards/rejected": -0.2298714518547058, "step": 127 }, { "epoch": 0.1836441893830703, "grad_norm": 8.032658576965332, "kl/ref_to_policy/chosen": 31.416397094726562, "kl/ref_to_policy/mean": 31.02690887451172, "kl/ref_to_policy/rejected": 30.637420654296875, "learning_rate": 9.797464868072489e-06, "logits/chosen": -0.5431579351425171, "logits/rejected": -0.4094155728816986, "logps/chosen": -1677.9320068359375, "logps/rejected": -1677.5650634765625, "loss": 4.5228, "nll_loss": 0.954755961894989, "rewards/accuracies": 0.4375, "rewards/chosen": -0.3141639828681946, "rewards/margins": -0.007789762690663338, "rewards/rejected": -0.3063742220401764, "step": 128 }, { "epoch": 0.18507890961262555, "grad_norm": 9.075834274291992, "kl/ref_to_policy/chosen": 16.206899642944336, "kl/ref_to_policy/mean": 18.658628463745117, "kl/ref_to_policy/rejected": 21.11035919189453, "learning_rate": 9.790346551492594e-06, "logits/chosen": -0.6675095558166504, "logits/rejected": -0.4734486937522888, "logps/chosen": -1057.7381591796875, "logps/rejected": -1058.8466796875, "loss": 3.5186, "nll_loss": 0.7104941606521606, "rewards/accuracies": 0.5625, "rewards/chosen": -0.16206903755664825, "rewards/margins": 0.04903458058834076, "rewards/rejected": -0.21110357344150543, "step": 129 }, { "epoch": 0.18651362984218078, "grad_norm": 8.93906021118164, "kl/ref_to_policy/chosen": 39.24082946777344, "kl/ref_to_policy/mean": 33.12590026855469, "kl/ref_to_policy/rejected": 27.010969161987305, "learning_rate": 9.783107972285177e-06, "logits/chosen": -0.4032834470272064, "logits/rejected": -0.30013278126716614, "logps/chosen": -1944.5767822265625, "logps/rejected": -1944.235595703125, "loss": 5.2584, "nll_loss": 1.1235220432281494, "rewards/accuracies": 0.3125, "rewards/chosen": -0.3924083113670349, "rewards/margins": -0.12229861319065094, "rewards/rejected": -0.2701096832752228, "step": 130 }, { "epoch": 0.18651362984218078, "eval_kl/ref_to_policy/chosen": 20.385833740234375, "eval_kl/ref_to_policy/mean": 21.06956672668457, "eval_kl/ref_to_policy/rejected": 21.753297805786133, "eval_logits/chosen": -0.588394820690155, "eval_logits/rejected": -0.4252239763736725, "eval_logps/chosen": -1544.6219482421875, "eval_logps/rejected": -1545.4759521484375, "eval_loss": 4.586004257202148, "eval_nll_loss": 0.9730575084686279, "eval_rewards/accuracies": 0.5186170339584351, "eval_rewards/chosen": -0.20385831594467163, "eval_rewards/margins": 0.013674644753336906, "eval_rewards/rejected": -0.21753299236297607, "eval_runtime": 113.5468, "eval_samples_per_second": 3.311, "eval_steps_per_second": 1.656, "step": 130 }, { "epoch": 0.18794835007173602, "grad_norm": 11.206088066101074, "kl/ref_to_policy/chosen": 3.835909843444824, "kl/ref_to_policy/mean": 6.999944686889648, "kl/ref_to_policy/rejected": 10.163979530334473, "learning_rate": 9.775749312176249e-06, "logits/chosen": -0.7665029764175415, "logits/rejected": -0.5633811950683594, "logps/chosen": -1285.0703125, "logps/rejected": -1287.5018310546875, "loss": 3.4078, "nll_loss": 0.684897780418396, "rewards/accuracies": 0.625, "rewards/chosen": -0.03835909813642502, "rewards/margins": 0.06328068673610687, "rewards/rejected": -0.10163978487253189, "step": 131 }, { "epoch": 0.18938307030129126, "grad_norm": 97.48048400878906, "kl/ref_to_policy/chosen": 22.698341369628906, "kl/ref_to_policy/mean": 24.682615280151367, "kl/ref_to_policy/rejected": 26.66689109802246, "learning_rate": 9.768270755906467e-06, "logits/chosen": -0.8167471885681152, "logits/rejected": -0.6589804291725159, "logps/chosen": -1232.4639892578125, "logps/rejected": -1233.5595703125, "loss": 3.8677, "nll_loss": 0.7968628406524658, "rewards/accuracies": 0.5625, "rewards/chosen": -0.22698339819908142, "rewards/margins": 0.039685484021902084, "rewards/rejected": -0.2666689157485962, "step": 132 }, { "epoch": 0.1908177905308465, "grad_norm": 7.153334617614746, "kl/ref_to_policy/chosen": 21.923828125, "kl/ref_to_policy/mean": 22.512109756469727, "kl/ref_to_policy/rejected": 23.10038948059082, "learning_rate": 9.760672491226515e-06, "logits/chosen": -0.718183159828186, "logits/rejected": -0.5227647423744202, "logps/chosen": -1481.6033935546875, "logps/rejected": -1482.2939453125, "loss": 4.0833, "nll_loss": 0.8467991352081299, "rewards/accuracies": 0.5, "rewards/chosen": -0.2192382514476776, "rewards/margins": 0.011765611357986927, "rewards/rejected": -0.23100388050079346, "step": 133 }, { "epoch": 0.19225251076040173, "grad_norm": 7.122723579406738, "kl/ref_to_policy/chosen": 18.79424285888672, "kl/ref_to_policy/mean": 22.814491271972656, "kl/ref_to_policy/rejected": 26.834735870361328, "learning_rate": 9.752954708892379e-06, "logits/chosen": -0.6467876434326172, "logits/rejected": -0.39852550625801086, "logps/chosen": -1288.606689453125, "logps/rejected": -1290.672607421875, "loss": 3.5073, "nll_loss": 0.711543619632721, "rewards/accuracies": 0.625, "rewards/chosen": -0.18794241547584534, "rewards/margins": 0.08040495216846466, "rewards/rejected": -0.2683473527431488, "step": 134 }, { "epoch": 0.19368723098995697, "grad_norm": 6.877944469451904, "kl/ref_to_policy/chosen": 14.593789100646973, "kl/ref_to_policy/mean": 18.924894332885742, "kl/ref_to_policy/rejected": 23.25600242614746, "learning_rate": 9.745117602660556e-06, "logits/chosen": -0.6899006366729736, "logits/rejected": -0.46275794506073, "logps/chosen": -1416.671875, "logps/rejected": -1419.4688720703125, "loss": 3.4752, "nll_loss": 0.7043485641479492, "rewards/accuracies": 0.625, "rewards/chosen": -0.14593788981437683, "rewards/margins": 0.08662213385105133, "rewards/rejected": -0.23256002366542816, "step": 135 }, { "epoch": 0.1951219512195122, "grad_norm": 6.903712749481201, "kl/ref_to_policy/chosen": 35.58076477050781, "kl/ref_to_policy/mean": 32.95863342285156, "kl/ref_to_policy/rejected": 30.336505889892578, "learning_rate": 9.737161369283201e-06, "logits/chosen": -0.4098990857601166, "logits/rejected": -0.23099996149539948, "logps/chosen": -1804.9400634765625, "logps/rejected": -1804.6041259765625, "loss": 5.4789, "nll_loss": 1.1874853372573853, "rewards/accuracies": 0.375, "rewards/chosen": -0.35580769181251526, "rewards/margins": -0.05244257673621178, "rewards/rejected": -0.3033650815486908, "step": 136 }, { "epoch": 0.19655667144906744, "grad_norm": 6.525818347930908, "kl/ref_to_policy/chosen": 11.262283325195312, "kl/ref_to_policy/mean": 11.405228614807129, "kl/ref_to_policy/rejected": 11.548171997070312, "learning_rate": 9.729086208503174e-06, "logits/chosen": -0.5147634148597717, "logits/rejected": -0.2751272916793823, "logps/chosen": -1351.1138916015625, "logps/rejected": -1352.0404052734375, "loss": 4.3291, "nll_loss": 0.9069387316703796, "rewards/accuracies": 0.5, "rewards/chosen": -0.11262284219264984, "rewards/margins": 0.002858884632587433, "rewards/rejected": -0.11548171192407608, "step": 137 }, { "epoch": 0.19799139167862267, "grad_norm": 6.457612037658691, "kl/ref_to_policy/chosen": 8.36099910736084, "kl/ref_to_policy/mean": 12.69914436340332, "kl/ref_to_policy/rejected": 17.03728675842285, "learning_rate": 9.720892323049034e-06, "logits/chosen": -0.6930458545684814, "logits/rejected": -0.39990800619125366, "logps/chosen": -1510.704833984375, "logps/rejected": -1514.252197265625, "loss": 3.523, "nll_loss": 0.7160840630531311, "rewards/accuracies": 0.625, "rewards/chosen": -0.08361000567674637, "rewards/margins": 0.08676289021968842, "rewards/rejected": -0.170372873544693, "step": 138 }, { "epoch": 0.1994261119081779, "grad_norm": 7.3405985832214355, "kl/ref_to_policy/chosen": 27.651111602783203, "kl/ref_to_policy/mean": 29.786026000976562, "kl/ref_to_policy/rejected": 31.920944213867188, "learning_rate": 9.712579918629947e-06, "logits/chosen": -0.7024210095405579, "logits/rejected": -0.4241674244403839, "logps/chosen": -1545.666748046875, "logps/rejected": -1548.5380859375, "loss": 3.7438, "nll_loss": 0.7658035755157471, "rewards/accuracies": 0.5625, "rewards/chosen": -0.27651113271713257, "rewards/margins": 0.042698320001363754, "rewards/rejected": -0.31920939683914185, "step": 139 }, { "epoch": 0.20086083213773315, "grad_norm": 15.318578720092773, "kl/ref_to_policy/chosen": 27.349000930786133, "kl/ref_to_policy/mean": 30.215002059936523, "kl/ref_to_policy/rejected": 33.08100891113281, "learning_rate": 9.704149203930522e-06, "logits/chosen": -0.6991807818412781, "logits/rejected": -0.3996809124946594, "logps/chosen": -1499.731689453125, "logps/rejected": -1501.8250732421875, "loss": 4.1556, "nll_loss": 0.8707681894302368, "rewards/accuracies": 0.5625, "rewards/chosen": -0.2734900116920471, "rewards/margins": 0.057320043444633484, "rewards/rejected": -0.3308100402355194, "step": 140 }, { "epoch": 0.20086083213773315, "eval_kl/ref_to_policy/chosen": 29.86239242553711, "eval_kl/ref_to_policy/mean": 30.876121520996094, "eval_kl/ref_to_policy/rejected": 31.889854431152344, "eval_logits/chosen": -0.546277642250061, "eval_logits/rejected": -0.270348459482193, "eval_logps/chosen": -1554.098388671875, "eval_logps/rejected": -1555.6124267578125, "eval_loss": 4.5627264976501465, "eval_nll_loss": 0.9674803614616394, "eval_rewards/accuracies": 0.5079787373542786, "eval_rewards/chosen": -0.2986239492893219, "eval_rewards/margins": 0.020274560898542404, "eval_rewards/rejected": -0.318898469209671, "eval_runtime": 113.2451, "eval_samples_per_second": 3.32, "eval_steps_per_second": 1.66, "step": 140 }, { "epoch": 0.20229555236728838, "grad_norm": 8.536910057067871, "kl/ref_to_policy/chosen": 72.77719116210938, "kl/ref_to_policy/mean": 65.33346557617188, "kl/ref_to_policy/rejected": 57.88974380493164, "learning_rate": 9.695600390605573e-06, "logits/chosen": -0.08609353005886078, "logits/rejected": 0.027152590453624725, "logps/chosen": -2436.245849609375, "logps/rejected": -2432.8642578125, "loss": 6.6646, "nll_loss": 1.4718749523162842, "rewards/accuracies": 0.1875, "rewards/chosen": -0.7277718782424927, "rewards/margins": -0.14887449145317078, "rewards/rejected": -0.5788974165916443, "step": 141 }, { "epoch": 0.20373027259684362, "grad_norm": 7.435892105102539, "kl/ref_to_policy/chosen": 12.308944702148438, "kl/ref_to_policy/mean": 14.980998992919922, "kl/ref_to_policy/rejected": 17.653057098388672, "learning_rate": 9.686933693274801e-06, "logits/chosen": -0.7479989528656006, "logits/rejected": -0.40818917751312256, "logps/chosen": -687.5432739257812, "logps/rejected": -690.085693359375, "loss": 3.7499, "nll_loss": 0.7684676647186279, "rewards/accuracies": 0.5625, "rewards/chosen": -0.12308944761753082, "rewards/margins": 0.053441114723682404, "rewards/rejected": -0.17653055489063263, "step": 142 }, { "epoch": 0.20516499282639886, "grad_norm": 253.88882446289062, "kl/ref_to_policy/chosen": 11.257946014404297, "kl/ref_to_policy/mean": 14.870843887329102, "kl/ref_to_policy/rejected": 18.48374366760254, "learning_rate": 9.67814932951741e-06, "logits/chosen": -0.7631012797355652, "logits/rejected": -0.4111711084842682, "logps/chosen": -747.2345581054688, "logps/rejected": -748.8157348632812, "loss": 3.7544, "nll_loss": 0.7718849778175354, "rewards/accuracies": 0.625, "rewards/chosen": -0.11257946491241455, "rewards/margins": 0.07225794345140457, "rewards/rejected": -0.18483740091323853, "step": 143 }, { "epoch": 0.2065997130559541, "grad_norm": 6.624530792236328, "kl/ref_to_policy/chosen": 53.72500228881836, "kl/ref_to_policy/mean": 53.28506088256836, "kl/ref_to_policy/rejected": 52.84511947631836, "learning_rate": 9.669247519866645e-06, "logits/chosen": -0.4603515863418579, "logits/rejected": -0.18222564458847046, "logps/chosen": -1909.64697265625, "logps/rejected": -1910.24951171875, "loss": 5.1627, "nll_loss": 1.1138935089111328, "rewards/accuracies": 0.4375, "rewards/chosen": -0.5372499823570251, "rewards/margins": -0.008798733353614807, "rewards/rejected": -0.5284512042999268, "step": 144 }, { "epoch": 0.20803443328550933, "grad_norm": 7.028384685516357, "kl/ref_to_policy/chosen": 27.774599075317383, "kl/ref_to_policy/mean": 26.084796905517578, "kl/ref_to_policy/rejected": 24.394996643066406, "learning_rate": 9.660228487804254e-06, "logits/chosen": -0.6692523956298828, "logits/rejected": -0.4022386372089386, "logps/chosen": -1314.477783203125, "logps/rejected": -1315.13916015625, "loss": 4.5122, "nll_loss": 0.9478066563606262, "rewards/accuracies": 0.4375, "rewards/chosen": -0.27774596214294434, "rewards/margins": -0.03379599004983902, "rewards/rejected": -0.24394994974136353, "step": 145 }, { "epoch": 0.20946915351506457, "grad_norm": 6.422809600830078, "kl/ref_to_policy/chosen": 30.820213317871094, "kl/ref_to_policy/mean": 31.35479736328125, "kl/ref_to_policy/rejected": 31.889379501342773, "learning_rate": 9.651092459754879e-06, "logits/chosen": -0.6933527588844299, "logits/rejected": -0.3653371036052704, "logps/chosen": -1383.5703125, "logps/rejected": -1384.840576171875, "loss": 4.2784, "nll_loss": 0.8953120112419128, "rewards/accuracies": 0.5, "rewards/chosen": -0.3082021474838257, "rewards/margins": 0.010691642761230469, "rewards/rejected": -0.31889379024505615, "step": 146 }, { "epoch": 0.2109038737446198, "grad_norm": 6.517394542694092, "kl/ref_to_policy/chosen": 29.487890243530273, "kl/ref_to_policy/mean": 32.486122131347656, "kl/ref_to_policy/rejected": 35.48435592651367, "learning_rate": 9.641839665080363e-06, "logits/chosen": -0.8189574480056763, "logits/rejected": -0.4594953656196594, "logps/chosen": -1645.78466796875, "logps/rejected": -1649.5428466796875, "loss": 4.2322, "nll_loss": 0.8902266025543213, "rewards/accuracies": 0.5625, "rewards/chosen": -0.29487890005111694, "rewards/margins": 0.05996464937925339, "rewards/rejected": -0.35484352707862854, "step": 147 }, { "epoch": 0.21233859397417504, "grad_norm": 6.93446159362793, "kl/ref_to_policy/chosen": 25.07514190673828, "kl/ref_to_policy/mean": 27.362844467163086, "kl/ref_to_policy/rejected": 29.650548934936523, "learning_rate": 9.632470336074009e-06, "logits/chosen": -0.7348756194114685, "logits/rejected": -0.4335314929485321, "logps/chosen": -1333.9969482421875, "logps/rejected": -1336.252197265625, "loss": 4.5493, "nll_loss": 0.9680567383766174, "rewards/accuracies": 0.5, "rewards/chosen": -0.2507513761520386, "rewards/margins": 0.045754097402095795, "rewards/rejected": -0.29650548100471497, "step": 148 }, { "epoch": 0.21377331420373027, "grad_norm": 10.338397979736328, "kl/ref_to_policy/chosen": 54.02285385131836, "kl/ref_to_policy/mean": 52.00590515136719, "kl/ref_to_policy/rejected": 49.98895263671875, "learning_rate": 9.622984707954732e-06, "logits/chosen": -0.7203717231750488, "logits/rejected": -0.43954625725746155, "logps/chosen": -1955.06201171875, "logps/rejected": -1955.5220947265625, "loss": 5.2954, "nll_loss": 1.1431293487548828, "rewards/accuracies": 0.4375, "rewards/chosen": -0.5402286052703857, "rewards/margins": -0.04033900797367096, "rewards/rejected": -0.4998894929885864, "step": 149 }, { "epoch": 0.2152080344332855, "grad_norm": 5.496667861938477, "kl/ref_to_policy/chosen": 2.0531718730926514, "kl/ref_to_policy/mean": 6.068507671356201, "kl/ref_to_policy/rejected": 10.083842277526855, "learning_rate": 9.613383018861159e-06, "logits/chosen": -1.0209285020828247, "logits/rejected": -0.616409957408905, "logps/chosen": -849.9437866210938, "logps/rejected": -852.4295043945312, "loss": 2.9363, "nll_loss": 0.56839919090271, "rewards/accuracies": 0.625, "rewards/chosen": -0.020531706511974335, "rewards/margins": 0.08030670881271362, "rewards/rejected": -0.10083843022584915, "step": 150 }, { "epoch": 0.2152080344332855, "eval_kl/ref_to_policy/chosen": 31.20492935180664, "eval_kl/ref_to_policy/mean": 32.45994186401367, "eval_kl/ref_to_policy/rejected": 33.71495819091797, "eval_logits/chosen": -0.7891390919685364, "eval_logits/rejected": -0.46379226446151733, "eval_logps/chosen": -1555.4410400390625, "eval_logps/rejected": -1557.4376220703125, "eval_loss": 4.567503452301025, "eval_nll_loss": 0.9692649841308594, "eval_rewards/accuracies": 0.5106382966041565, "eval_rewards/chosen": -0.3120492994785309, "eval_rewards/margins": 0.025100277736783028, "eval_rewards/rejected": -0.33714956045150757, "eval_runtime": 112.8008, "eval_samples_per_second": 3.333, "eval_steps_per_second": 1.667, "step": 150 }, { "epoch": 0.21664275466284075, "grad_norm": 6.225917816162109, "kl/ref_to_policy/chosen": 27.977169036865234, "kl/ref_to_policy/mean": 27.782814025878906, "kl/ref_to_policy/rejected": 27.58846092224121, "learning_rate": 9.603665509845657e-06, "logits/chosen": -0.802638053894043, "logits/rejected": -0.4601612985134125, "logps/chosen": -1269.84423828125, "logps/rejected": -1271.6802978515625, "loss": 4.1717, "nll_loss": 0.8663291931152344, "rewards/accuracies": 0.5, "rewards/chosen": -0.27977168560028076, "rewards/margins": -0.0038870982825756073, "rewards/rejected": -0.27588459849357605, "step": 151 }, { "epoch": 0.21807747489239598, "grad_norm": 7.411525249481201, "kl/ref_to_policy/chosen": 21.93979835510254, "kl/ref_to_policy/mean": 23.338991165161133, "kl/ref_to_policy/rejected": 24.738183975219727, "learning_rate": 9.593832424868271e-06, "logits/chosen": -0.7182157039642334, "logits/rejected": -0.3840704560279846, "logps/chosen": -937.3538208007812, "logps/rejected": -938.8573608398438, "loss": 4.1106, "nll_loss": 0.855283796787262, "rewards/accuracies": 0.5, "rewards/chosen": -0.21939799189567566, "rewards/margins": 0.027983810752630234, "rewards/rejected": -0.2473818063735962, "step": 152 }, { "epoch": 0.21951219512195122, "grad_norm": 195.0580291748047, "kl/ref_to_policy/chosen": 44.935157775878906, "kl/ref_to_policy/mean": 41.69145965576172, "kl/ref_to_policy/rejected": 38.44776153564453, "learning_rate": 9.583884010790605e-06, "logits/chosen": -0.696082592010498, "logits/rejected": -0.384784460067749, "logps/chosen": -1430.3336181640625, "logps/rejected": -1430.2525634765625, "loss": 5.479, "nll_loss": 1.1847437620162964, "rewards/accuracies": 0.4375, "rewards/chosen": -0.44935157895088196, "rewards/margins": -0.06487396359443665, "rewards/rejected": -0.3844775855541229, "step": 153 }, { "epoch": 0.22094691535150646, "grad_norm": 8.415170669555664, "kl/ref_to_policy/chosen": 46.58022689819336, "kl/ref_to_policy/mean": 44.777889251708984, "kl/ref_to_policy/rejected": 42.97555160522461, "learning_rate": 9.573820517369623e-06, "logits/chosen": -0.47212427854537964, "logits/rejected": -0.19186003506183624, "logps/chosen": -1600.260009765625, "logps/rejected": -1601.7117919921875, "loss": 5.5193, "nll_loss": 1.1989941596984863, "rewards/accuracies": 0.375, "rewards/chosen": -0.46580225229263306, "rewards/margins": -0.036046743392944336, "rewards/rejected": -0.4297555088996887, "step": 154 }, { "epoch": 0.2223816355810617, "grad_norm": 6.464853286743164, "kl/ref_to_policy/chosen": 31.65397071838379, "kl/ref_to_policy/mean": 34.93175506591797, "kl/ref_to_policy/rejected": 38.20954132080078, "learning_rate": 9.563642197251382e-06, "logits/chosen": -0.8584099411964417, "logits/rejected": -0.45577096939086914, "logps/chosen": -1436.044677734375, "logps/rejected": -1440.74853515625, "loss": 4.0968, "nll_loss": 0.8562841415405273, "rewards/accuracies": 0.5625, "rewards/chosen": -0.3165397047996521, "rewards/margins": 0.06555572897195816, "rewards/rejected": -0.3820953965187073, "step": 155 }, { "epoch": 0.22381635581061693, "grad_norm": 6.4498724937438965, "kl/ref_to_policy/chosen": 19.75579071044922, "kl/ref_to_policy/mean": 24.889585494995117, "kl/ref_to_policy/rejected": 30.023378372192383, "learning_rate": 9.553349305964687e-06, "logits/chosen": -1.0515908002853394, "logits/rejected": -0.5717988610267639, "logps/chosen": -1268.451171875, "logps/rejected": -1273.759765625, "loss": 3.6158, "nll_loss": 0.7410756945610046, "rewards/accuracies": 0.625, "rewards/chosen": -0.19755789637565613, "rewards/margins": 0.10267588496208191, "rewards/rejected": -0.3002338111400604, "step": 156 }, { "epoch": 0.22525107604017217, "grad_norm": 8.606592178344727, "kl/ref_to_policy/chosen": 29.529159545898438, "kl/ref_to_policy/mean": 29.455854415893555, "kl/ref_to_policy/rejected": 29.38254737854004, "learning_rate": 9.54294210191467e-06, "logits/chosen": -0.587514340877533, "logits/rejected": -0.22416959702968597, "logps/chosen": -1326.560302734375, "logps/rejected": -1328.781005859375, "loss": 5.1561, "nll_loss": 1.1130114793777466, "rewards/accuracies": 0.4375, "rewards/chosen": -0.29529160261154175, "rewards/margins": -0.0014661457389593124, "rewards/rejected": -0.2938254475593567, "step": 157 }, { "epoch": 0.2266857962697274, "grad_norm": 170.24087524414062, "kl/ref_to_policy/chosen": 12.559683799743652, "kl/ref_to_policy/mean": 17.8001766204834, "kl/ref_to_policy/rejected": 23.04067039489746, "learning_rate": 9.532420846376316e-06, "logits/chosen": -0.9948787093162537, "logits/rejected": -0.42699146270751953, "logps/chosen": -926.4581909179688, "logps/rejected": -931.2672729492188, "loss": 3.9589, "nll_loss": 0.8270803093910217, "rewards/accuracies": 0.625, "rewards/chosen": -0.12559683620929718, "rewards/margins": 0.10480985790491104, "rewards/rejected": -0.23040671646595, "step": 158 }, { "epoch": 0.22812051649928264, "grad_norm": 6.003561019897461, "kl/ref_to_policy/chosen": 33.427032470703125, "kl/ref_to_policy/mean": 34.082977294921875, "kl/ref_to_policy/rejected": 34.73892593383789, "learning_rate": 9.521785803487888e-06, "logits/chosen": -0.6413944363594055, "logits/rejected": -0.24858957529067993, "logps/chosen": -1651.473876953125, "logps/rejected": -1654.3385009765625, "loss": 5.1645, "nll_loss": 1.1170666217803955, "rewards/accuracies": 0.4375, "rewards/chosen": -0.33427032828330994, "rewards/margins": 0.013118898496031761, "rewards/rejected": -0.34738922119140625, "step": 159 }, { "epoch": 0.22955523672883787, "grad_norm": 55.64314270019531, "kl/ref_to_policy/chosen": 70.73506164550781, "kl/ref_to_policy/mean": 71.55496215820312, "kl/ref_to_policy/rejected": 72.37486267089844, "learning_rate": 9.51103724024431e-06, "logits/chosen": -0.6708917021751404, "logits/rejected": -0.3111037611961365, "logps/chosen": -2169.742919921875, "logps/rejected": -2172.181640625, "loss": 4.8816, "nll_loss": 1.0469530820846558, "rewards/accuracies": 0.5, "rewards/chosen": -0.7073505520820618, "rewards/margins": 0.016397984698414803, "rewards/rejected": -0.7237485647201538, "step": 160 }, { "epoch": 0.22955523672883787, "eval_kl/ref_to_policy/chosen": 33.01432800292969, "eval_kl/ref_to_policy/mean": 35.26668167114258, "eval_kl/ref_to_policy/rejected": 37.519039154052734, "eval_logits/chosen": -0.7303560972213745, "eval_logits/rejected": -0.28293198347091675, "eval_logps/chosen": -1557.2503662109375, "eval_logps/rejected": -1561.24169921875, "eval_loss": 4.542534828186035, "eval_nll_loss": 0.9654771685600281, "eval_rewards/accuracies": 0.5186170339584351, "eval_rewards/chosen": -0.3301433026790619, "eval_rewards/margins": 0.045047082006931305, "eval_rewards/rejected": -0.3751903772354126, "eval_runtime": 112.9288, "eval_samples_per_second": 3.33, "eval_steps_per_second": 1.665, "step": 160 }, { "epoch": 0.2309899569583931, "grad_norm": 100.83583068847656, "kl/ref_to_policy/chosen": 4.661296844482422, "kl/ref_to_policy/mean": 12.89354419708252, "kl/ref_to_policy/rejected": 21.12579345703125, "learning_rate": 9.500175426490455e-06, "logits/chosen": -1.135246992111206, "logits/rejected": -0.49183815717697144, "logps/chosen": -647.2862548828125, "logps/rejected": -652.4129028320312, "loss": 2.5284, "nll_loss": 0.4768775999546051, "rewards/accuracies": 0.75, "rewards/chosen": -0.04661297798156738, "rewards/margins": 0.16464495658874512, "rewards/rejected": -0.2112579345703125, "step": 161 }, { "epoch": 0.23242467718794835, "grad_norm": 10.721485137939453, "kl/ref_to_policy/chosen": 71.73443603515625, "kl/ref_to_policy/mean": 68.6573486328125, "kl/ref_to_policy/rejected": 65.58026885986328, "learning_rate": 9.489200634914373e-06, "logits/chosen": -0.4835962653160095, "logits/rejected": -0.17549122869968414, "logps/chosen": -2043.171875, "logps/rejected": -2043.2637939453125, "loss": 5.7067, "nll_loss": 1.2433998584747314, "rewards/accuracies": 0.375, "rewards/chosen": -0.717344343662262, "rewards/margins": -0.06154165044426918, "rewards/rejected": -0.6558026671409607, "step": 162 }, { "epoch": 0.23385939741750358, "grad_norm": 74.76397705078125, "kl/ref_to_policy/chosen": -4.4256110191345215, "kl/ref_to_policy/mean": 4.19760799407959, "kl/ref_to_policy/rejected": 12.820826530456543, "learning_rate": 9.478113141040444e-06, "logits/chosen": -1.0608221292495728, "logits/rejected": -0.3821842074394226, "logps/chosen": -816.3400268554688, "logps/rejected": -824.0376586914062, "loss": 2.4377, "nll_loss": 0.45456743240356445, "rewards/accuracies": 0.75, "rewards/chosen": 0.04425611346960068, "rewards/margins": 0.17246437072753906, "rewards/rejected": -0.12820827960968018, "step": 163 }, { "epoch": 0.23529411764705882, "grad_norm": 8.318624496459961, "kl/ref_to_policy/chosen": 67.51856231689453, "kl/ref_to_policy/mean": 65.28811645507812, "kl/ref_to_policy/rejected": 63.05767059326172, "learning_rate": 9.466913223222467e-06, "logits/chosen": -0.5166719555854797, "logits/rejected": -0.15286272764205933, "logps/chosen": -2114.4716796875, "logps/rejected": -2115.7138671875, "loss": 5.0366, "nll_loss": 1.0775697231292725, "rewards/accuracies": 0.375, "rewards/chosen": -0.6751855611801147, "rewards/margins": -0.04460892081260681, "rewards/rejected": -0.6305767297744751, "step": 164 }, { "epoch": 0.23672883787661406, "grad_norm": 9.164430618286133, "kl/ref_to_policy/chosen": 62.56611251831055, "kl/ref_to_policy/mean": 60.64094161987305, "kl/ref_to_policy/rejected": 58.71577835083008, "learning_rate": 9.455601162636662e-06, "logits/chosen": -0.6408874988555908, "logits/rejected": -0.3168572187423706, "logps/chosen": -2011.745849609375, "logps/rejected": -2012.0732421875, "loss": 5.0191, "nll_loss": 1.074327826499939, "rewards/accuracies": 0.375, "rewards/chosen": -0.6256611347198486, "rewards/margins": -0.038503363728523254, "rewards/rejected": -0.5871577858924866, "step": 165 }, { "epoch": 0.2381635581061693, "grad_norm": 8.405735969543457, "kl/ref_to_policy/chosen": -5.897089958190918, "kl/ref_to_policy/mean": 5.311347007751465, "kl/ref_to_policy/rejected": 16.51978302001953, "learning_rate": 9.444177243274619e-06, "logits/chosen": -1.181861400604248, "logits/rejected": -0.41021376848220825, "logps/chosen": -402.7435302734375, "logps/rejected": -412.574462890625, "loss": 2.003, "nll_loss": 0.35253679752349854, "rewards/accuracies": 0.8125, "rewards/chosen": 0.058970868587493896, "rewards/margins": 0.22416870296001434, "rewards/rejected": -0.16519781947135925, "step": 166 }, { "epoch": 0.23959827833572453, "grad_norm": 9.312651634216309, "kl/ref_to_policy/chosen": -7.641769886016846, "kl/ref_to_policy/mean": 0.7786208391189575, "kl/ref_to_policy/rejected": 9.199010848999023, "learning_rate": 9.432641751936162e-06, "logits/chosen": -1.2366045713424683, "logits/rejected": -0.6039983034133911, "logps/chosen": -512.8972778320312, "logps/rejected": -519.6658325195312, "loss": 2.573, "nll_loss": 0.4889419972896576, "rewards/accuracies": 0.75, "rewards/chosen": 0.07641769200563431, "rewards/margins": 0.16840781271457672, "rewards/rejected": -0.09199010580778122, "step": 167 }, { "epoch": 0.24103299856527977, "grad_norm": 10.332749366760254, "kl/ref_to_policy/chosen": 52.54487609863281, "kl/ref_to_policy/mean": 48.687644958496094, "kl/ref_to_policy/rejected": 44.830413818359375, "learning_rate": 9.420994978222156e-06, "logits/chosen": -0.6786137819290161, "logits/rejected": -0.3790303170681, "logps/chosen": -1942.521484375, "logps/rejected": -1944.572509765625, "loss": 5.5058, "nll_loss": 1.190721869468689, "rewards/accuracies": 0.375, "rewards/chosen": -0.525448739528656, "rewards/margins": -0.07714461535215378, "rewards/rejected": -0.4483041763305664, "step": 168 }, { "epoch": 0.242467718794835, "grad_norm": 111.71300506591797, "kl/ref_to_policy/chosen": 13.402259826660156, "kl/ref_to_policy/mean": 18.741073608398438, "kl/ref_to_policy/rejected": 24.079879760742188, "learning_rate": 9.40923721452723e-06, "logits/chosen": -1.0402662754058838, "logits/rejected": -0.4131658673286438, "logps/chosen": -982.0897216796875, "logps/rejected": -988.527099609375, "loss": 3.2523, "nll_loss": 0.6499794125556946, "rewards/accuracies": 0.625, "rewards/chosen": -0.13402259349822998, "rewards/margins": 0.10677620023488998, "rewards/rejected": -0.24079880118370056, "step": 169 }, { "epoch": 0.24390243902439024, "grad_norm": 9.230712890625, "kl/ref_to_policy/chosen": 53.239891052246094, "kl/ref_to_policy/mean": 50.28522491455078, "kl/ref_to_policy/rejected": 47.330562591552734, "learning_rate": 9.397368756032445e-06, "logits/chosen": -0.49676355719566345, "logits/rejected": -0.1914852261543274, "logps/chosen": -1987.53662109375, "logps/rejected": -1988.37548828125, "loss": 6.0044, "nll_loss": 1.317487359046936, "rewards/accuracies": 0.3125, "rewards/chosen": -0.5323988795280457, "rewards/margins": -0.059093303978443146, "rewards/rejected": -0.4733055830001831, "step": 170 }, { "epoch": 0.24390243902439024, "eval_kl/ref_to_policy/chosen": 32.8104362487793, "eval_kl/ref_to_policy/mean": 35.67975997924805, "eval_kl/ref_to_policy/rejected": 38.54908752441406, "eval_logits/chosen": -0.866045355796814, "eval_logits/rejected": -0.3837307393550873, "eval_logps/chosen": -1557.04638671875, "eval_logps/rejected": -1562.2716064453125, "eval_loss": 4.523464679718018, "eval_nll_loss": 0.961448073387146, "eval_rewards/accuracies": 0.5106382966041565, "eval_rewards/chosen": -0.3281043767929077, "eval_rewards/margins": 0.057386480271816254, "eval_rewards/rejected": -0.3854908347129822, "eval_runtime": 113.9936, "eval_samples_per_second": 3.298, "eval_steps_per_second": 1.649, "step": 170 }, { "epoch": 0.24533715925394547, "grad_norm": 5.056923866271973, "kl/ref_to_policy/chosen": 29.908226013183594, "kl/ref_to_policy/mean": 32.92023468017578, "kl/ref_to_policy/rejected": 35.93224334716797, "learning_rate": 9.38538990069787e-06, "logits/chosen": -1.0505067110061646, "logits/rejected": -0.5280241370201111, "logps/chosen": -1575.744140625, "logps/rejected": -1582.07275390625, "loss": 3.7991, "nll_loss": 0.7802188396453857, "rewards/accuracies": 0.5625, "rewards/chosen": -0.2990822196006775, "rewards/margins": 0.06024019420146942, "rewards/rejected": -0.3593224287033081, "step": 171 }, { "epoch": 0.2467718794835007, "grad_norm": 4.37679386138916, "kl/ref_to_policy/chosen": 17.829124450683594, "kl/ref_to_policy/mean": 29.443849563598633, "kl/ref_to_policy/rejected": 41.05856704711914, "learning_rate": 9.373300949255112e-06, "logits/chosen": -1.3818542957305908, "logits/rejected": -0.6000950336456299, "logps/chosen": -1050.8370361328125, "logps/rejected": -1064.1875, "loss": 2.5129, "nll_loss": 0.4799208343029022, "rewards/accuracies": 0.75, "rewards/chosen": -0.17829126119613647, "rewards/margins": 0.23229441046714783, "rewards/rejected": -0.4105857014656067, "step": 172 }, { "epoch": 0.24820659971305595, "grad_norm": 11.218392372131348, "kl/ref_to_policy/chosen": 60.324432373046875, "kl/ref_to_policy/mean": 53.47661209106445, "kl/ref_to_policy/rejected": 46.62879180908203, "learning_rate": 9.361102205199762e-06, "logits/chosen": -0.4972975552082062, "logits/rejected": -0.3079904615879059, "logps/chosen": -2295.715087890625, "logps/rejected": -2296.72265625, "loss": 6.8479, "nll_loss": 1.5186749696731567, "rewards/accuracies": 0.1875, "rewards/chosen": -0.6032443046569824, "rewards/margins": -0.13695642352104187, "rewards/rejected": -0.46628788113594055, "step": 173 }, { "epoch": 0.24964131994261118, "grad_norm": 12.603170394897461, "kl/ref_to_policy/chosen": 54.21894073486328, "kl/ref_to_policy/mean": 54.34278106689453, "kl/ref_to_policy/rejected": 54.466617584228516, "learning_rate": 9.348793974783778e-06, "logits/chosen": -0.45173487067222595, "logits/rejected": -0.09126909077167511, "logps/chosen": -2116.703369140625, "logps/rejected": -2122.013671875, "loss": 6.1259, "nll_loss": 1.3557064533233643, "rewards/accuracies": 0.4375, "rewards/chosen": -0.5421894788742065, "rewards/margins": 0.0024767741560935974, "rewards/rejected": -0.5446661710739136, "step": 174 }, { "epoch": 0.25107604017216645, "grad_norm": 39.08280563354492, "kl/ref_to_policy/chosen": 22.42992401123047, "kl/ref_to_policy/mean": 24.447311401367188, "kl/ref_to_policy/rejected": 26.46469497680664, "learning_rate": 9.336376567007799e-06, "logits/chosen": -0.8746179342269897, "logits/rejected": -0.32921111583709717, "logps/chosen": -1369.8909912109375, "logps/rejected": -1377.0447998046875, "loss": 4.6201, "nll_loss": 0.9838672280311584, "rewards/accuracies": 0.5, "rewards/chosen": -0.22429925203323364, "rewards/margins": 0.04034772142767906, "rewards/rejected": -0.264646977186203, "step": 175 }, { "epoch": 0.25251076040172166, "grad_norm": 87.87013244628906, "kl/ref_to_policy/chosen": 1.510673999786377, "kl/ref_to_policy/mean": 10.44352912902832, "kl/ref_to_policy/rejected": 19.37638282775879, "learning_rate": 9.32385029361338e-06, "logits/chosen": -1.2215807437896729, "logits/rejected": -0.49342870712280273, "logps/chosen": -880.7997436523438, "logps/rejected": -891.3040771484375, "loss": 2.9142, "nll_loss": 0.5746318697929382, "rewards/accuracies": 0.75, "rewards/chosen": -0.015106745064258575, "rewards/margins": 0.17865708470344543, "rewards/rejected": -0.1937638372182846, "step": 176 }, { "epoch": 0.2539454806312769, "grad_norm": 9.087642669677734, "kl/ref_to_policy/chosen": 47.24858093261719, "kl/ref_to_policy/mean": 42.950660705566406, "kl/ref_to_policy/rejected": 38.65274429321289, "learning_rate": 9.311215469075168e-06, "logits/chosen": -0.6295865774154663, "logits/rejected": -0.28967463970184326, "logps/chosen": -1858.91259765625, "logps/rejected": -1856.7952880859375, "loss": 5.5508, "nll_loss": 1.2003200054168701, "rewards/accuracies": 0.3125, "rewards/chosen": -0.47248575091362, "rewards/margins": -0.08595834672451019, "rewards/rejected": -0.386527419090271, "step": 177 }, { "epoch": 0.25538020086083213, "grad_norm": 8.238442420959473, "kl/ref_to_policy/chosen": 35.154319763183594, "kl/ref_to_policy/mean": 35.76603698730469, "kl/ref_to_policy/rejected": 36.37775421142578, "learning_rate": 9.298472410593013e-06, "logits/chosen": -0.6295448541641235, "logits/rejected": -0.16780740022659302, "logps/chosen": -1534.099609375, "logps/rejected": -1539.321044921875, "loss": 5.4377, "nll_loss": 1.1848169565200806, "rewards/accuracies": 0.4375, "rewards/chosen": -0.351543128490448, "rewards/margins": 0.01223438885062933, "rewards/rejected": -0.3637775778770447, "step": 178 }, { "epoch": 0.2568149210903874, "grad_norm": 7.554957866668701, "kl/ref_to_policy/chosen": 36.12876892089844, "kl/ref_to_policy/mean": 34.025611877441406, "kl/ref_to_policy/rejected": 31.922460556030273, "learning_rate": 9.285621438083997e-06, "logits/chosen": -0.4774588644504547, "logits/rejected": -0.1138167679309845, "logps/chosen": -1572.222412109375, "logps/rejected": -1574.19384765625, "loss": 5.4834, "nll_loss": 1.1898192167282104, "rewards/accuracies": 0.375, "rewards/chosen": -0.3612876534461975, "rewards/margins": -0.04206307604908943, "rewards/rejected": -0.31922459602355957, "step": 179 }, { "epoch": 0.2582496413199426, "grad_norm": 7.410195350646973, "kl/ref_to_policy/chosen": 17.453224182128906, "kl/ref_to_policy/mean": 19.170515060424805, "kl/ref_to_policy/rejected": 20.887805938720703, "learning_rate": 9.27266287417441e-06, "logits/chosen": -0.9920313358306885, "logits/rejected": -0.44373470544815063, "logps/chosen": -1010.4063720703125, "logps/rejected": -1014.4561767578125, "loss": 3.977, "nll_loss": 0.8228877782821655, "rewards/accuracies": 0.5, "rewards/chosen": -0.17453224956989288, "rewards/margins": 0.03434579819440842, "rewards/rejected": -0.2088780552148819, "step": 180 }, { "epoch": 0.2582496413199426, "eval_kl/ref_to_policy/chosen": 22.979707717895508, "eval_kl/ref_to_policy/mean": 25.49992561340332, "eval_kl/ref_to_policy/rejected": 28.020143508911133, "eval_logits/chosen": -0.8978632092475891, "eval_logits/rejected": -0.3981076180934906, "eval_logps/chosen": -1547.2156982421875, "eval_logps/rejected": -1551.74267578125, "eval_loss": 4.537942886352539, "eval_nll_loss": 0.9654968976974487, "eval_rewards/accuracies": 0.5505319237709045, "eval_rewards/chosen": -0.22979706525802612, "eval_rewards/margins": 0.05040436238050461, "eval_rewards/rejected": -0.28020143508911133, "eval_runtime": 111.7888, "eval_samples_per_second": 3.363, "eval_steps_per_second": 1.682, "step": 180 }, { "epoch": 0.25968436154949787, "grad_norm": 58.81148147583008, "kl/ref_to_policy/chosen": 4.369189739227295, "kl/ref_to_policy/mean": 9.174757957458496, "kl/ref_to_policy/rejected": 13.980328559875488, "learning_rate": 9.259597044191635e-06, "logits/chosen": -1.4524314403533936, "logits/rejected": -0.8246263265609741, "logps/chosen": -708.033203125, "logps/rejected": -713.8338012695312, "loss": 2.9974, "nll_loss": 0.5859439373016357, "rewards/accuracies": 0.625, "rewards/chosen": -0.04369189962744713, "rewards/margins": 0.09611137956380844, "rewards/rejected": -0.13980329036712646, "step": 181 }, { "epoch": 0.2611190817790531, "grad_norm": 5.574890613555908, "kl/ref_to_policy/chosen": 14.570411682128906, "kl/ref_to_policy/mean": 17.01894760131836, "kl/ref_to_policy/rejected": 19.467483520507812, "learning_rate": 9.246424276156008e-06, "logits/chosen": -0.8628742694854736, "logits/rejected": -0.36380571126937866, "logps/chosen": -1020.6485595703125, "logps/rejected": -1028.080810546875, "loss": 3.71, "nll_loss": 0.7574123740196228, "rewards/accuracies": 0.5, "rewards/chosen": -0.14570412039756775, "rewards/margins": 0.04897074028849602, "rewards/rejected": -0.19467484951019287, "step": 182 }, { "epoch": 0.26255380200860834, "grad_norm": 25.006397247314453, "kl/ref_to_policy/chosen": 7.040089130401611, "kl/ref_to_policy/mean": 13.339167594909668, "kl/ref_to_policy/rejected": 19.63824462890625, "learning_rate": 9.233144900772553e-06, "logits/chosen": -1.1692028045654297, "logits/rejected": -0.5084843039512634, "logps/chosen": -1340.4931640625, "logps/rejected": -1347.6617431640625, "loss": 3.1017, "nll_loss": 0.6157010793685913, "rewards/accuracies": 0.6875, "rewards/chosen": -0.0704008936882019, "rewards/margins": 0.12598156929016113, "rewards/rejected": -0.19638246297836304, "step": 183 }, { "epoch": 0.26398852223816355, "grad_norm": 54.69405746459961, "kl/ref_to_policy/chosen": 10.106935501098633, "kl/ref_to_policy/mean": 10.59068775177002, "kl/ref_to_policy/rejected": 11.07443904876709, "learning_rate": 9.2197592514227e-06, "logits/chosen": -0.8555445671081543, "logits/rejected": -0.4445149004459381, "logps/chosen": -1213.27099609375, "logps/rejected": -1218.525634765625, "loss": 4.4514, "nll_loss": 0.9387896060943604, "rewards/accuracies": 0.4375, "rewards/chosen": -0.10106935352087021, "rewards/margins": 0.009675038047134876, "rewards/rejected": -0.11074438691139221, "step": 184 }, { "epoch": 0.2654232424677188, "grad_norm": 62.01688766479492, "kl/ref_to_policy/chosen": 23.695087432861328, "kl/ref_to_policy/mean": 26.09160804748535, "kl/ref_to_policy/rejected": 28.48813247680664, "learning_rate": 9.206267664155906e-06, "logits/chosen": -0.8559294939041138, "logits/rejected": -0.36129096150398254, "logps/chosen": -1549.632568359375, "logps/rejected": -1556.8319091796875, "loss": 4.5066, "nll_loss": 0.9568601250648499, "rewards/accuracies": 0.5, "rewards/chosen": -0.23695087432861328, "rewards/margins": 0.047930411994457245, "rewards/rejected": -0.2848812937736511, "step": 185 }, { "epoch": 0.266857962697274, "grad_norm": 87.28421783447266, "kl/ref_to_policy/chosen": 19.851211547851562, "kl/ref_to_policy/mean": 23.450021743774414, "kl/ref_to_policy/rejected": 27.048831939697266, "learning_rate": 9.192670477681224e-06, "logits/chosen": -0.8540453910827637, "logits/rejected": -0.36588254570961, "logps/chosen": -1268.6871337890625, "logps/rejected": -1272.611328125, "loss": 4.6249, "nll_loss": 0.9897589087486267, "rewards/accuracies": 0.625, "rewards/chosen": -0.19851210713386536, "rewards/margins": 0.0719761997461319, "rewards/rejected": -0.27048832178115845, "step": 186 }, { "epoch": 0.2682926829268293, "grad_norm": 8.24028205871582, "kl/ref_to_policy/chosen": 24.97346305847168, "kl/ref_to_policy/mean": 28.896530151367188, "kl/ref_to_policy/rejected": 32.81959533691406, "learning_rate": 9.178968033358792e-06, "logits/chosen": -0.9029431343078613, "logits/rejected": -0.36688095331192017, "logps/chosen": -1437.0281982421875, "logps/rejected": -1442.321533203125, "loss": 4.0001, "nll_loss": 0.833765983581543, "rewards/accuracies": 0.5625, "rewards/chosen": -0.24973464012145996, "rewards/margins": 0.07846132665872574, "rewards/rejected": -0.3281959593296051, "step": 187 }, { "epoch": 0.2697274031563845, "grad_norm": 5.293621063232422, "kl/ref_to_policy/chosen": 28.543983459472656, "kl/ref_to_policy/mean": 32.987091064453125, "kl/ref_to_policy/rejected": 37.43021011352539, "learning_rate": 9.165160675191272e-06, "logits/chosen": -0.7673455476760864, "logits/rejected": -0.205832839012146, "logps/chosen": -1203.9970703125, "logps/rejected": -1209.8170166015625, "loss": 3.4398, "nll_loss": 0.6941101551055908, "rewards/accuracies": 0.625, "rewards/chosen": -0.2854398488998413, "rewards/margins": 0.08886224031448364, "rewards/rejected": -0.37430205941200256, "step": 188 }, { "epoch": 0.27116212338593976, "grad_norm": 7.898053169250488, "kl/ref_to_policy/chosen": 63.38465118408203, "kl/ref_to_policy/mean": 59.78373718261719, "kl/ref_to_policy/rejected": 56.18281936645508, "learning_rate": 9.151248749815208e-06, "logits/chosen": -0.3778899908065796, "logits/rejected": -0.13493813574314117, "logps/chosen": -2186.994140625, "logps/rejected": -2186.643798828125, "loss": 5.6367, "nll_loss": 1.22372305393219, "rewards/accuracies": 0.3125, "rewards/chosen": -0.6338464617729187, "rewards/margins": -0.0720183476805687, "rewards/rejected": -0.5618281960487366, "step": 189 }, { "epoch": 0.27259684361549497, "grad_norm": 6.803033351898193, "kl/ref_to_policy/chosen": 38.10912322998047, "kl/ref_to_policy/mean": 40.09611892700195, "kl/ref_to_policy/rejected": 42.0831184387207, "learning_rate": 9.137232606492323e-06, "logits/chosen": -0.6468863487243652, "logits/rejected": -0.23045098781585693, "logps/chosen": -1696.382568359375, "logps/rejected": -1703.120361328125, "loss": 4.7339, "nll_loss": 1.0114028453826904, "rewards/accuracies": 0.5, "rewards/chosen": -0.3810912072658539, "rewards/margins": 0.039739981293678284, "rewards/rejected": -0.42083117365837097, "step": 190 }, { "epoch": 0.27259684361549497, "eval_kl/ref_to_policy/chosen": 31.65225601196289, "eval_kl/ref_to_policy/mean": 35.09895324707031, "eval_kl/ref_to_policy/rejected": 38.545654296875, "eval_logits/chosen": -0.5628092885017395, "eval_logits/rejected": -0.15072283148765564, "eval_logps/chosen": -1555.8883056640625, "eval_logps/rejected": -1562.268310546875, "eval_loss": 4.514738082885742, "eval_nll_loss": 0.9599937796592712, "eval_rewards/accuracies": 0.5159574747085571, "eval_rewards/chosen": -0.3165225088596344, "eval_rewards/margins": 0.06893400847911835, "eval_rewards/rejected": -0.38545656204223633, "eval_runtime": 112.5181, "eval_samples_per_second": 3.342, "eval_steps_per_second": 1.671, "step": 190 }, { "epoch": 0.27403156384505023, "grad_norm": 10.088262557983398, "kl/ref_to_policy/chosen": 47.51959228515625, "kl/ref_to_policy/mean": 47.136226654052734, "kl/ref_to_policy/rejected": 46.75286102294922, "learning_rate": 9.123112597100759e-06, "logits/chosen": -0.4109794795513153, "logits/rejected": -0.13396932184696198, "logps/chosen": -1521.12109375, "logps/rejected": -1524.1434326171875, "loss": 5.389, "nll_loss": 1.1692910194396973, "rewards/accuracies": 0.375, "rewards/chosen": -0.47519591450691223, "rewards/margins": -0.007667304947972298, "rewards/rejected": -0.4675285816192627, "step": 191 }, { "epoch": 0.27546628407460544, "grad_norm": 5.198230743408203, "kl/ref_to_policy/chosen": 27.213865280151367, "kl/ref_to_policy/mean": 33.464054107666016, "kl/ref_to_policy/rejected": 39.7142448425293, "learning_rate": 9.108889076126226e-06, "logits/chosen": -0.6877369284629822, "logits/rejected": -0.26970627903938293, "logps/chosen": -1673.4482421875, "logps/rejected": -1683.20654296875, "loss": 3.9542, "nll_loss": 0.8274222612380981, "rewards/accuracies": 0.5625, "rewards/chosen": -0.27213865518569946, "rewards/margins": 0.12500375509262085, "rewards/rejected": -0.3971423804759979, "step": 192 }, { "epoch": 0.2769010043041607, "grad_norm": 6.585834503173828, "kl/ref_to_policy/chosen": 59.88514709472656, "kl/ref_to_policy/mean": 60.047325134277344, "kl/ref_to_policy/rejected": 60.209510803222656, "learning_rate": 9.094562400653127e-06, "logits/chosen": -0.5245810747146606, "logits/rejected": -0.19258901476860046, "logps/chosen": -2222.78369140625, "logps/rejected": -2228.28369140625, "loss": 4.6216, "nll_loss": 0.9785953760147095, "rewards/accuracies": 0.5, "rewards/chosen": -0.5988514423370361, "rewards/margins": 0.003243653103709221, "rewards/rejected": -0.6020951271057129, "step": 193 }, { "epoch": 0.2783357245337159, "grad_norm": 7.821066379547119, "kl/ref_to_policy/chosen": 3.767179489135742, "kl/ref_to_policy/mean": 8.750235557556152, "kl/ref_to_policy/rejected": 13.733291625976562, "learning_rate": 9.080132930355567e-06, "logits/chosen": -0.6247912645339966, "logits/rejected": -0.2055768519639969, "logps/chosen": -639.7593383789062, "logps/rejected": -647.2019653320312, "loss": 3.6613, "nll_loss": 0.7507696151733398, "rewards/accuracies": 0.5625, "rewards/chosen": -0.03767179325222969, "rewards/margins": 0.09966111183166504, "rewards/rejected": -0.13733290135860443, "step": 194 }, { "epoch": 0.2797704447632712, "grad_norm": 8.089599609375, "kl/ref_to_policy/chosen": 15.95822525024414, "kl/ref_to_policy/mean": 21.726728439331055, "kl/ref_to_policy/rejected": 27.49523162841797, "learning_rate": 9.065601027488345e-06, "logits/chosen": -0.7049504518508911, "logits/rejected": -0.31010767817497253, "logps/chosen": -1650.3800048828125, "logps/rejected": -1660.6787109375, "loss": 4.3439, "nll_loss": 0.9239566326141357, "rewards/accuracies": 0.625, "rewards/chosen": -0.159582257270813, "rewards/margins": 0.11537003517150879, "rewards/rejected": -0.2749522924423218, "step": 195 }, { "epoch": 0.2812051649928264, "grad_norm": 7.740196228027344, "kl/ref_to_policy/chosen": 49.30718231201172, "kl/ref_to_policy/mean": 46.662227630615234, "kl/ref_to_policy/rejected": 44.01726531982422, "learning_rate": 9.050967056877846e-06, "logits/chosen": -0.5275700092315674, "logits/rejected": -0.2914316952228546, "logps/chosen": -1909.6353759765625, "logps/rejected": -1913.316162109375, "loss": 5.7974, "nll_loss": 1.2668676376342773, "rewards/accuracies": 0.3125, "rewards/chosen": -0.4930717945098877, "rewards/margins": -0.05289916694164276, "rewards/rejected": -0.4401726722717285, "step": 196 }, { "epoch": 0.28263988522238165, "grad_norm": 17.757570266723633, "kl/ref_to_policy/chosen": 28.111661911010742, "kl/ref_to_policy/mean": 35.90179443359375, "kl/ref_to_policy/rejected": 43.691932678222656, "learning_rate": 9.03623138591289e-06, "logits/chosen": -0.7748942375183105, "logits/rejected": -0.32973721623420715, "logps/chosen": -1555.9007568359375, "logps/rejected": -1568.61181640625, "loss": 3.4862, "nll_loss": 0.7136948704719543, "rewards/accuracies": 0.625, "rewards/chosen": -0.2811165750026703, "rewards/margins": 0.15580272674560547, "rewards/rejected": -0.43691933155059814, "step": 197 }, { "epoch": 0.28407460545193686, "grad_norm": 7.541652679443359, "kl/ref_to_policy/chosen": 52.51357650756836, "kl/ref_to_policy/mean": 50.427978515625, "kl/ref_to_policy/rejected": 48.34238052368164, "learning_rate": 9.0213943845355e-06, "logits/chosen": -0.34789717197418213, "logits/rejected": -0.11904049664735794, "logps/chosen": -1828.107421875, "logps/rejected": -1830.1197509765625, "loss": 5.277, "nll_loss": 1.1368095874786377, "rewards/accuracies": 0.375, "rewards/chosen": -0.5251357555389404, "rewards/margins": -0.04171191155910492, "rewards/rejected": -0.4834238290786743, "step": 198 }, { "epoch": 0.2855093256814921, "grad_norm": 5.884748935699463, "kl/ref_to_policy/chosen": 41.95630645751953, "kl/ref_to_policy/mean": 45.93510055541992, "kl/ref_to_policy/rejected": 49.91388702392578, "learning_rate": 9.006456425231624e-06, "logits/chosen": -0.4860832691192627, "logits/rejected": -0.15749605000019073, "logps/chosen": -1585.425048828125, "logps/rejected": -1596.5291748046875, "loss": 4.4247, "nll_loss": 0.9393361210823059, "rewards/accuracies": 0.5, "rewards/chosen": -0.41956308484077454, "rewards/margins": 0.07957583665847778, "rewards/rejected": -0.49913889169692993, "step": 199 }, { "epoch": 0.28694404591104733, "grad_norm": 60.068790435791016, "kl/ref_to_policy/chosen": 26.394201278686523, "kl/ref_to_policy/mean": 33.03697204589844, "kl/ref_to_policy/rejected": 39.679744720458984, "learning_rate": 8.99141788302178e-06, "logits/chosen": -0.7804693579673767, "logits/rejected": -0.3320049047470093, "logps/chosen": -1714.089599609375, "logps/rejected": -1725.169921875, "loss": 4.0403, "nll_loss": 0.8499698638916016, "rewards/accuracies": 0.5625, "rewards/chosen": -0.26394200325012207, "rewards/margins": 0.13285541534423828, "rewards/rejected": -0.39679741859436035, "step": 200 }, { "epoch": 0.28694404591104733, "eval_kl/ref_to_policy/chosen": 25.1722354888916, "eval_kl/ref_to_policy/mean": 30.282751083374023, "eval_kl/ref_to_policy/rejected": 35.39326477050781, "eval_logits/chosen": -0.6430355906486511, "eval_logits/rejected": -0.23033617436885834, "eval_logps/chosen": -1549.408203125, "eval_logps/rejected": -1559.11572265625, "eval_loss": 4.472827434539795, "eval_nll_loss": 0.9542362689971924, "eval_rewards/accuracies": 0.5478723645210266, "eval_rewards/chosen": -0.2517223656177521, "eval_rewards/margins": 0.10221029072999954, "eval_rewards/rejected": -0.353932648897171, "eval_runtime": 113.0716, "eval_samples_per_second": 3.325, "eval_steps_per_second": 1.663, "step": 200 }, { "epoch": 0.2883787661406026, "grad_norm": 5.397548675537109, "kl/ref_to_policy/chosen": 9.775898933410645, "kl/ref_to_policy/mean": 19.408903121948242, "kl/ref_to_policy/rejected": 29.04190444946289, "learning_rate": 8.976279135451636e-06, "logits/chosen": -0.7836517691612244, "logits/rejected": -0.23830324411392212, "logps/chosen": -991.0728149414062, "logps/rejected": -1005.4188842773438, "loss": 3.2996, "nll_loss": 0.6717767119407654, "rewards/accuracies": 0.6875, "rewards/chosen": -0.09775899350643158, "rewards/margins": 0.19266009330749512, "rewards/rejected": -0.2904190421104431, "step": 201 }, { "epoch": 0.2898134863701578, "grad_norm": 6.576742172241211, "kl/ref_to_policy/chosen": 43.03358459472656, "kl/ref_to_policy/mean": 45.421836853027344, "kl/ref_to_policy/rejected": 47.81008529663086, "learning_rate": 8.96104056258254e-06, "logits/chosen": -0.5674750804901123, "logits/rejected": -0.21360409259796143, "logps/chosen": -1632.7913818359375, "logps/rejected": -1641.5894775390625, "loss": 4.7441, "nll_loss": 1.0152902603149414, "rewards/accuracies": 0.4375, "rewards/chosen": -0.4303358793258667, "rewards/margins": 0.047765035182237625, "rewards/rejected": -0.47810089588165283, "step": 202 }, { "epoch": 0.29124820659971307, "grad_norm": 8.239521980285645, "kl/ref_to_policy/chosen": 31.703815460205078, "kl/ref_to_policy/mean": 36.3531494140625, "kl/ref_to_policy/rejected": 41.00248718261719, "learning_rate": 8.94570254698197e-06, "logits/chosen": -0.7047522068023682, "logits/rejected": -0.24015969038009644, "logps/chosen": -1724.258544921875, "logps/rejected": -1733.64501953125, "loss": 4.6222, "nll_loss": 0.9910228848457336, "rewards/accuracies": 0.5, "rewards/chosen": -0.3170381486415863, "rewards/margins": 0.09298671036958694, "rewards/rejected": -0.41002488136291504, "step": 203 }, { "epoch": 0.2926829268292683, "grad_norm": 5.141488075256348, "kl/ref_to_policy/chosen": 7.284406661987305, "kl/ref_to_policy/mean": 14.819525718688965, "kl/ref_to_policy/rejected": 22.354644775390625, "learning_rate": 8.930265473713939e-06, "logits/chosen": -0.9209145307540894, "logits/rejected": -0.38577690720558167, "logps/chosen": -1113.995361328125, "logps/rejected": -1126.7198486328125, "loss": 3.2624, "nll_loss": 0.6570707559585571, "rewards/accuracies": 0.625, "rewards/chosen": -0.07284407317638397, "rewards/margins": 0.1507023572921753, "rewards/rejected": -0.22354643046855927, "step": 204 }, { "epoch": 0.29411764705882354, "grad_norm": 5.184057712554932, "kl/ref_to_policy/chosen": 9.373686790466309, "kl/ref_to_policy/mean": 16.11374282836914, "kl/ref_to_policy/rejected": 22.853796005249023, "learning_rate": 8.914729730329321e-06, "logits/chosen": -0.7614907026290894, "logits/rejected": -0.26475098729133606, "logps/chosen": -857.7330322265625, "logps/rejected": -867.748291015625, "loss": 2.9298, "nll_loss": 0.5716598033905029, "rewards/accuracies": 0.625, "rewards/chosen": -0.09373687207698822, "rewards/margins": 0.13480107486248016, "rewards/rejected": -0.22853796184062958, "step": 205 }, { "epoch": 0.29555236728837875, "grad_norm": 5.604503631591797, "kl/ref_to_policy/chosen": 28.30584716796875, "kl/ref_to_policy/mean": 35.64642333984375, "kl/ref_to_policy/rejected": 42.98699951171875, "learning_rate": 8.899095706856122e-06, "logits/chosen": -0.8350744247436523, "logits/rejected": -0.29157158732414246, "logps/chosen": -1322.438720703125, "logps/rejected": -1337.3638916015625, "loss": 3.6864, "nll_loss": 0.7619428038597107, "rewards/accuracies": 0.5625, "rewards/chosen": -0.28305843472480774, "rewards/margins": 0.14681148529052734, "rewards/rejected": -0.42986997961997986, "step": 206 }, { "epoch": 0.296987087517934, "grad_norm": 6.323026657104492, "kl/ref_to_policy/chosen": 20.951061248779297, "kl/ref_to_policy/mean": 26.39972686767578, "kl/ref_to_policy/rejected": 31.848392486572266, "learning_rate": 8.883363795789694e-06, "logits/chosen": -0.8063746690750122, "logits/rejected": -0.31797605752944946, "logps/chosen": -1095.8182373046875, "logps/rejected": -1105.647216796875, "loss": 4.1113, "nll_loss": 0.8633996844291687, "rewards/accuracies": 0.5, "rewards/chosen": -0.20951059460639954, "rewards/margins": 0.10897330194711685, "rewards/rejected": -0.3184839189052582, "step": 207 }, { "epoch": 0.2984218077474892, "grad_norm": 6.781062126159668, "kl/ref_to_policy/chosen": 9.527532577514648, "kl/ref_to_policy/mean": 19.596208572387695, "kl/ref_to_policy/rejected": 29.664888381958008, "learning_rate": 8.867534392082873e-06, "logits/chosen": -0.7070360779762268, "logits/rejected": -0.17720362544059753, "logps/chosen": -1216.871337890625, "logps/rejected": -1235.5106201171875, "loss": 4.0068, "nll_loss": 0.8488202095031738, "rewards/accuracies": 0.5625, "rewards/chosen": -0.09527530521154404, "rewards/margins": 0.20137354731559753, "rewards/rejected": -0.29664885997772217, "step": 208 }, { "epoch": 0.2998565279770445, "grad_norm": 7.711390495300293, "kl/ref_to_policy/chosen": 5.877452850341797, "kl/ref_to_policy/mean": 13.411396026611328, "kl/ref_to_policy/rejected": 20.945341110229492, "learning_rate": 8.851607893136065e-06, "logits/chosen": -0.9390814900398254, "logits/rejected": -0.29844018816947937, "logps/chosen": -977.1351928710938, "logps/rejected": -991.8302001953125, "loss": 3.3928, "nll_loss": 0.6882990598678589, "rewards/accuracies": 0.5625, "rewards/chosen": -0.058774515986442566, "rewards/margins": 0.15067890286445618, "rewards/rejected": -0.20945340394973755, "step": 209 }, { "epoch": 0.3012912482065997, "grad_norm": 111.2918701171875, "kl/ref_to_policy/chosen": 6.652979850769043, "kl/ref_to_policy/mean": 16.112367630004883, "kl/ref_to_policy/rejected": 25.571754455566406, "learning_rate": 8.83558469878728e-06, "logits/chosen": -1.1701258420944214, "logits/rejected": -0.49117445945739746, "logps/chosen": -874.8560791015625, "logps/rejected": -889.40576171875, "loss": 3.1286, "nll_loss": 0.6282524466514587, "rewards/accuracies": 0.625, "rewards/chosen": -0.06652979552745819, "rewards/margins": 0.18918773531913757, "rewards/rejected": -0.25571754574775696, "step": 210 }, { "epoch": 0.3012912482065997, "eval_kl/ref_to_policy/chosen": 24.97813606262207, "eval_kl/ref_to_policy/mean": 31.242136001586914, "eval_kl/ref_to_policy/rejected": 37.50613021850586, "eval_logits/chosen": -0.9815424084663391, "eval_logits/rejected": -0.38320451974868774, "eval_logps/chosen": -1549.214111328125, "eval_logps/rejected": -1561.228759765625, "eval_loss": 4.602940559387207, "eval_nll_loss": 0.9897182583808899, "eval_rewards/accuracies": 0.5531914830207825, "eval_rewards/chosen": -0.24978137016296387, "eval_rewards/margins": 0.1252799779176712, "eval_rewards/rejected": -0.3750613331794739, "eval_runtime": 111.5487, "eval_samples_per_second": 3.371, "eval_steps_per_second": 1.685, "step": 210 }, { "epoch": 0.30272596843615496, "grad_norm": 7.874096870422363, "kl/ref_to_policy/chosen": 72.53424072265625, "kl/ref_to_policy/mean": 70.34923553466797, "kl/ref_to_policy/rejected": 68.16421508789062, "learning_rate": 8.819465211302081e-06, "logits/chosen": -0.5612860321998596, "logits/rejected": -0.2831851541996002, "logps/chosen": -2700.7919921875, "logps/rejected": -2708.412353515625, "loss": 6.0781, "nll_loss": 1.3388416767120361, "rewards/accuracies": 0.3125, "rewards/chosen": -0.7253424525260925, "rewards/margins": -0.04370028153061867, "rewards/rejected": -0.6816421747207642, "step": 211 }, { "epoch": 0.30416068866571017, "grad_norm": 81.19525146484375, "kl/ref_to_policy/chosen": 6.841052055358887, "kl/ref_to_policy/mean": 14.326626777648926, "kl/ref_to_policy/rejected": 21.812196731567383, "learning_rate": 8.803249835363486e-06, "logits/chosen": -1.099914312362671, "logits/rejected": -0.4616975784301758, "logps/chosen": -1232.012939453125, "logps/rejected": -1244.3726806640625, "loss": 4.3409, "nll_loss": 0.9279814958572388, "rewards/accuracies": 0.625, "rewards/chosen": -0.06841052323579788, "rewards/margins": 0.14971144497394562, "rewards/rejected": -0.21812200546264648, "step": 212 }, { "epoch": 0.30559540889526543, "grad_norm": 25.916107177734375, "kl/ref_to_policy/chosen": 33.300392150878906, "kl/ref_to_policy/mean": 38.9493522644043, "kl/ref_to_policy/rejected": 44.59831237792969, "learning_rate": 8.78693897806182e-06, "logits/chosen": -0.9697288870811462, "logits/rejected": -0.35302403569221497, "logps/chosen": -1768.7847900390625, "logps/rejected": -1779.6744384765625, "loss": 4.7271, "nll_loss": 1.019060492515564, "rewards/accuracies": 0.5, "rewards/chosen": -0.3330039381980896, "rewards/margins": 0.11297918111085892, "rewards/rejected": -0.4459831416606903, "step": 213 }, { "epoch": 0.30703012912482064, "grad_norm": 59.425804138183594, "kl/ref_to_policy/chosen": 2.1362037658691406, "kl/ref_to_policy/mean": 12.630874633789062, "kl/ref_to_policy/rejected": 23.12554931640625, "learning_rate": 8.770533048884483e-06, "logits/chosen": -1.399181842803955, "logits/rejected": -0.5502836108207703, "logps/chosen": -793.8392333984375, "logps/rejected": -809.1392822265625, "loss": 2.7436, "nll_loss": 0.5344328284263611, "rewards/accuracies": 0.6875, "rewards/chosen": -0.02136204019188881, "rewards/margins": 0.20989343523979187, "rewards/rejected": -0.23125547170639038, "step": 214 }, { "epoch": 0.3084648493543759, "grad_norm": 8.431618690490723, "kl/ref_to_policy/chosen": 45.55063247680664, "kl/ref_to_policy/mean": 46.91383743286133, "kl/ref_to_policy/rejected": 48.277042388916016, "learning_rate": 8.754032459705672e-06, "logits/chosen": -0.7127687931060791, "logits/rejected": -0.29065200686454773, "logps/chosen": -1897.05859375, "logps/rejected": -1905.833740234375, "loss": 5.5159, "nll_loss": 1.2059334516525269, "rewards/accuracies": 0.375, "rewards/chosen": -0.4555063545703888, "rewards/margins": 0.027264069765806198, "rewards/rejected": -0.4827704131603241, "step": 215 }, { "epoch": 0.3098995695839311, "grad_norm": 5.022409915924072, "kl/ref_to_policy/chosen": 14.122779846191406, "kl/ref_to_policy/mean": 30.58447265625, "kl/ref_to_policy/rejected": 47.04616928100586, "learning_rate": 8.737437624776047e-06, "logits/chosen": -1.069779872894287, "logits/rejected": -0.2210991382598877, "logps/chosen": -832.630615234375, "logps/rejected": -856.3333740234375, "loss": 2.7621, "nll_loss": 0.5511717200279236, "rewards/accuracies": 0.75, "rewards/chosen": -0.14122778177261353, "rewards/margins": 0.32923388481140137, "rewards/rejected": -0.4704616367816925, "step": 216 }, { "epoch": 0.3113342898134864, "grad_norm": 14.917909622192383, "kl/ref_to_policy/chosen": 26.755300521850586, "kl/ref_to_policy/mean": 34.79834747314453, "kl/ref_to_policy/rejected": 42.841400146484375, "learning_rate": 8.720748960712323e-06, "logits/chosen": -1.0618577003479004, "logits/rejected": -0.3806677460670471, "logps/chosen": -1343.8411865234375, "logps/rejected": -1357.9781494140625, "loss": 3.9189, "nll_loss": 0.8191716074943542, "rewards/accuracies": 0.5625, "rewards/chosen": -0.26755303144454956, "rewards/margins": 0.16086100041866302, "rewards/rejected": -0.428413987159729, "step": 217 }, { "epoch": 0.3127690100430416, "grad_norm": 6.281200408935547, "kl/ref_to_policy/chosen": 15.764364242553711, "kl/ref_to_policy/mean": 29.836088180541992, "kl/ref_to_policy/rejected": 43.907814025878906, "learning_rate": 8.703966886486819e-06, "logits/chosen": -1.004652738571167, "logits/rejected": -0.15472739934921265, "logps/chosen": -1085.303466796875, "logps/rejected": -1108.50146484375, "loss": 2.8781, "nll_loss": 0.5736629962921143, "rewards/accuracies": 0.6875, "rewards/chosen": -0.1576436460018158, "rewards/margins": 0.28143447637557983, "rewards/rejected": -0.43907809257507324, "step": 218 }, { "epoch": 0.31420373027259685, "grad_norm": 8.473213195800781, "kl/ref_to_policy/chosen": 52.01457214355469, "kl/ref_to_policy/mean": 56.21446990966797, "kl/ref_to_policy/rejected": 60.41436767578125, "learning_rate": 8.68709182341693e-06, "logits/chosen": -0.7823724150657654, "logits/rejected": -0.2676909267902374, "logps/chosen": -1718.303466796875, "logps/rejected": -1730.342041015625, "loss": 4.7789, "nll_loss": 1.0260653495788574, "rewards/accuracies": 0.4375, "rewards/chosen": -0.5201456546783447, "rewards/margins": 0.08399796485900879, "rewards/rejected": -0.6041436791419983, "step": 219 }, { "epoch": 0.31563845050215206, "grad_norm": 4.827953815460205, "kl/ref_to_policy/chosen": 10.464986801147461, "kl/ref_to_policy/mean": 23.27364730834961, "kl/ref_to_policy/rejected": 36.082305908203125, "learning_rate": 8.670124195154557e-06, "logits/chosen": -1.1901170015335083, "logits/rejected": -0.31342554092407227, "logps/chosen": -1072.011962890625, "logps/rejected": -1091.0198974609375, "loss": 2.9076, "nll_loss": 0.5805972814559937, "rewards/accuracies": 0.6875, "rewards/chosen": -0.1046498566865921, "rewards/margins": 0.2561732530593872, "rewards/rejected": -0.3608230948448181, "step": 220 }, { "epoch": 0.31563845050215206, "eval_kl/ref_to_policy/chosen": 23.779306411743164, "eval_kl/ref_to_policy/mean": 29.23830223083496, "eval_kl/ref_to_policy/rejected": 34.69729995727539, "eval_logits/chosen": -1.0532567501068115, "eval_logits/rejected": -0.42650213837623596, "eval_logps/chosen": -1548.0152587890625, "eval_logps/rejected": -1558.419921875, "eval_loss": 4.517608165740967, "eval_nll_loss": 0.9666510224342346, "eval_rewards/accuracies": 0.5531914830207825, "eval_rewards/chosen": -0.23779308795928955, "eval_rewards/margins": 0.10917989909648895, "eval_rewards/rejected": -0.3469729721546173, "eval_runtime": 112.3554, "eval_samples_per_second": 3.347, "eval_steps_per_second": 1.673, "step": 220 }, { "epoch": 0.3170731707317073, "grad_norm": 7.035848140716553, "kl/ref_to_policy/chosen": 33.544212341308594, "kl/ref_to_policy/mean": 39.0369873046875, "kl/ref_to_policy/rejected": 44.529762268066406, "learning_rate": 8.65306442767547e-06, "logits/chosen": -0.9009273052215576, "logits/rejected": -0.3466491401195526, "logps/chosen": -1616.2083740234375, "logps/rejected": -1625.91650390625, "loss": 4.9337, "nll_loss": 1.071380853652954, "rewards/accuracies": 0.5625, "rewards/chosen": -0.33544212579727173, "rewards/margins": 0.10985550284385681, "rewards/rejected": -0.44529759883880615, "step": 221 }, { "epoch": 0.31850789096126253, "grad_norm": 9.251448631286621, "kl/ref_to_policy/chosen": 43.392696380615234, "kl/ref_to_policy/mean": 48.37132263183594, "kl/ref_to_policy/rejected": 53.34994125366211, "learning_rate": 8.635912949268614e-06, "logits/chosen": -0.9636327028274536, "logits/rejected": -0.3522037863731384, "logps/chosen": -1771.6661376953125, "logps/rejected": -1784.396728515625, "loss": 4.3937, "nll_loss": 0.933577299118042, "rewards/accuracies": 0.5, "rewards/chosen": -0.43392693996429443, "rewards/margins": 0.09957244247198105, "rewards/rejected": -0.5334994196891785, "step": 222 }, { "epoch": 0.3199426111908178, "grad_norm": 7.237243175506592, "kl/ref_to_policy/chosen": 36.59054946899414, "kl/ref_to_policy/mean": 35.1761474609375, "kl/ref_to_policy/rejected": 33.761749267578125, "learning_rate": 8.61867019052535e-06, "logits/chosen": -0.7293698787689209, "logits/rejected": -0.3791010081768036, "logps/chosen": -1642.012451171875, "logps/rejected": -1647.603515625, "loss": 5.3645, "nll_loss": 1.1623173952102661, "rewards/accuracies": 0.3125, "rewards/chosen": -0.3659054636955261, "rewards/margins": -0.028287984430789948, "rewards/rejected": -0.3376174867153168, "step": 223 }, { "epoch": 0.321377331420373, "grad_norm": 77.25817108154297, "kl/ref_to_policy/chosen": 26.815460205078125, "kl/ref_to_policy/mean": 30.215049743652344, "kl/ref_to_policy/rejected": 33.61463165283203, "learning_rate": 8.601336584328659e-06, "logits/chosen": -1.1146494150161743, "logits/rejected": -0.5750897526741028, "logps/chosen": -1292.6201171875, "logps/rejected": -1302.20947265625, "loss": 4.9739, "nll_loss": 1.0759408473968506, "rewards/accuracies": 0.4375, "rewards/chosen": -0.2681545913219452, "rewards/margins": 0.0679917261004448, "rewards/rejected": -0.3361463248729706, "step": 224 }, { "epoch": 0.32281205164992827, "grad_norm": 7.569235324859619, "kl/ref_to_policy/chosen": 39.070289611816406, "kl/ref_to_policy/mean": 36.03038024902344, "kl/ref_to_policy/rejected": 32.99047088623047, "learning_rate": 8.583912565842258e-06, "logits/chosen": -0.6169043183326721, "logits/rejected": -0.2844257652759552, "logps/chosen": -1757.7890625, "logps/rejected": -1764.771728515625, "loss": 5.8705, "nll_loss": 1.2844352722167969, "rewards/accuracies": 0.3125, "rewards/chosen": -0.39070287346839905, "rewards/margins": -0.060798224061727524, "rewards/rejected": -0.3299046754837036, "step": 225 }, { "epoch": 0.3242467718794835, "grad_norm": 5.692795753479004, "kl/ref_to_policy/chosen": 45.55862045288086, "kl/ref_to_policy/mean": 47.93159484863281, "kl/ref_to_policy/rejected": 50.30458068847656, "learning_rate": 8.566398572499685e-06, "logits/chosen": -0.9059635996818542, "logits/rejected": -0.47320035099983215, "logps/chosen": -1947.44140625, "logps/rejected": -1957.7498779296875, "loss": 5.1354, "nll_loss": 1.1135331392288208, "rewards/accuracies": 0.4375, "rewards/chosen": -0.45558616518974304, "rewards/margins": 0.04745958745479584, "rewards/rejected": -0.5030457973480225, "step": 226 }, { "epoch": 0.32568149210903874, "grad_norm": 6.257844924926758, "kl/ref_to_policy/chosen": 18.959747314453125, "kl/ref_to_policy/mean": 28.028995513916016, "kl/ref_to_policy/rejected": 37.098243713378906, "learning_rate": 8.548795043993316e-06, "logits/chosen": -1.107492446899414, "logits/rejected": -0.4205125570297241, "logps/chosen": -1505.272216796875, "logps/rejected": -1522.3502197265625, "loss": 4.6143, "nll_loss": 0.9983636736869812, "rewards/accuracies": 0.5625, "rewards/chosen": -0.1895974576473236, "rewards/margins": 0.18138496577739716, "rewards/rejected": -0.37098240852355957, "step": 227 }, { "epoch": 0.32711621233859395, "grad_norm": 11.428338050842285, "kl/ref_to_policy/chosen": 3.5465595722198486, "kl/ref_to_policy/mean": 17.351764678955078, "kl/ref_to_policy/rejected": 31.156963348388672, "learning_rate": 8.53110242226333e-06, "logits/chosen": -1.4788084030151367, "logits/rejected": -0.6359157562255859, "logps/chosen": -642.2574462890625, "logps/rejected": -662.483154296875, "loss": 2.4977, "nll_loss": 0.47944146394729614, "rewards/accuracies": 0.75, "rewards/chosen": -0.035465605556964874, "rewards/margins": 0.27610403299331665, "rewards/rejected": -0.31156960129737854, "step": 228 }, { "epoch": 0.3285509325681492, "grad_norm": 4.905177116394043, "kl/ref_to_policy/chosen": 33.57714080810547, "kl/ref_to_policy/mean": 43.55616760253906, "kl/ref_to_policy/rejected": 53.53519821166992, "learning_rate": 8.513321151486602e-06, "logits/chosen": -1.260353684425354, "logits/rejected": -0.50921630859375, "logps/chosen": -1569.3038330078125, "logps/rejected": -1587.048828125, "loss": 3.5895, "nll_loss": 0.7438629269599915, "rewards/accuracies": 0.625, "rewards/chosen": -0.335771381855011, "rewards/margins": 0.1995805948972702, "rewards/rejected": -0.5353519916534424, "step": 229 }, { "epoch": 0.3299856527977044, "grad_norm": 10.42033576965332, "kl/ref_to_policy/chosen": 38.13383102416992, "kl/ref_to_policy/mean": 43.025001525878906, "kl/ref_to_policy/rejected": 47.91616439819336, "learning_rate": 8.495451678065563e-06, "logits/chosen": -0.7523488402366638, "logits/rejected": -0.27693381905555725, "logps/chosen": -2236.1728515625, "logps/rejected": -2249.27734375, "loss": 5.3702, "nll_loss": 1.1765350103378296, "rewards/accuracies": 0.5, "rewards/chosen": -0.38133829832077026, "rewards/margins": 0.09782334417104721, "rewards/rejected": -0.47916167974472046, "step": 230 }, { "epoch": 0.3299856527977044, "eval_kl/ref_to_policy/chosen": 21.317075729370117, "eval_kl/ref_to_policy/mean": 30.473983764648438, "eval_kl/ref_to_policy/rejected": 39.63088607788086, "eval_logits/chosen": -1.1000688076019287, "eval_logits/rejected": -0.4434739649295807, "eval_logps/chosen": -1545.552978515625, "eval_logps/rejected": -1563.3533935546875, "eval_loss": 4.438828945159912, "eval_nll_loss": 0.9535277485847473, "eval_rewards/accuracies": 0.5505319237709045, "eval_rewards/chosen": -0.21317076683044434, "eval_rewards/margins": 0.18313813209533691, "eval_rewards/rejected": -0.39630889892578125, "eval_runtime": 111.6779, "eval_samples_per_second": 3.367, "eval_steps_per_second": 1.683, "step": 230 }, { "epoch": 0.3314203730272597, "grad_norm": 5.695046424865723, "kl/ref_to_policy/chosen": 18.893970489501953, "kl/ref_to_policy/mean": 27.79522705078125, "kl/ref_to_policy/rejected": 36.69648742675781, "learning_rate": 8.477494450616988e-06, "logits/chosen": -0.9462900757789612, "logits/rejected": -0.38366609811782837, "logps/chosen": -1717.75341796875, "logps/rejected": -1737.157470703125, "loss": 4.72, "nll_loss": 1.0234942436218262, "rewards/accuracies": 0.5, "rewards/chosen": -0.18893969058990479, "rewards/margins": 0.17802517116069794, "rewards/rejected": -0.3669648766517639, "step": 231 }, { "epoch": 0.3328550932568149, "grad_norm": 7.859055995941162, "kl/ref_to_policy/chosen": 3.101163864135742, "kl/ref_to_policy/mean": 12.720863342285156, "kl/ref_to_policy/rejected": 22.34056282043457, "learning_rate": 8.459449919960737e-06, "logits/chosen": -0.9838207960128784, "logits/rejected": -0.368125855922699, "logps/chosen": -1008.8511962890625, "logps/rejected": -1028.313232421875, "loss": 4.1016, "nll_loss": 0.8701032996177673, "rewards/accuracies": 0.5625, "rewards/chosen": -0.031011637300252914, "rewards/margins": 0.19239400327205658, "rewards/rejected": -0.2234056442975998, "step": 232 }, { "epoch": 0.33428981348637016, "grad_norm": 4.472298622131348, "kl/ref_to_policy/chosen": -4.7023420333862305, "kl/ref_to_policy/mean": 12.165298461914062, "kl/ref_to_policy/rejected": 29.032936096191406, "learning_rate": 8.441318539108433e-06, "logits/chosen": -1.539496898651123, "logits/rejected": -0.7116971015930176, "logps/chosen": -1258.649169921875, "logps/rejected": -1287.601318359375, "loss": 3.0336, "nll_loss": 0.6193868517875671, "rewards/accuracies": 0.6875, "rewards/chosen": 0.04702340438961983, "rewards/margins": 0.33735281229019165, "rewards/rejected": -0.29032936692237854, "step": 233 }, { "epoch": 0.3357245337159254, "grad_norm": 85.06675720214844, "kl/ref_to_policy/chosen": 39.69082260131836, "kl/ref_to_policy/mean": 45.29262161254883, "kl/ref_to_policy/rejected": 50.89441680908203, "learning_rate": 8.423100763252094e-06, "logits/chosen": -1.1182711124420166, "logits/rejected": -0.5532751083374023, "logps/chosen": -1506.2376708984375, "logps/rejected": -1518.7889404296875, "loss": 4.5156, "nll_loss": 0.9607747197151184, "rewards/accuracies": 0.5, "rewards/chosen": -0.3969082236289978, "rewards/margins": 0.11203595995903015, "rewards/rejected": -0.5089441537857056, "step": 234 }, { "epoch": 0.33715925394548063, "grad_norm": 4.813022136688232, "kl/ref_to_policy/chosen": 20.863122940063477, "kl/ref_to_policy/mean": 39.66992950439453, "kl/ref_to_policy/rejected": 58.47673797607422, "learning_rate": 8.404797049752697e-06, "logits/chosen": -1.212156057357788, "logits/rejected": -0.514800488948822, "logps/chosen": -1418.5037841796875, "logps/rejected": -1452.603759765625, "loss": 3.0991, "nll_loss": 0.6378509998321533, "rewards/accuracies": 0.6875, "rewards/chosen": -0.2086312472820282, "rewards/margins": 0.37613606452941895, "rewards/rejected": -0.5847673416137695, "step": 235 }, { "epoch": 0.3385939741750359, "grad_norm": 7.22500467300415, "kl/ref_to_policy/chosen": 56.91231155395508, "kl/ref_to_policy/mean": 61.77404022216797, "kl/ref_to_policy/rejected": 66.6357650756836, "learning_rate": 8.386407858128707e-06, "logits/chosen": -0.5803598165512085, "logits/rejected": -0.16086967289447784, "logps/chosen": -1965.654541015625, "logps/rejected": -1986.7705078125, "loss": 5.6113, "nll_loss": 1.233773112297058, "rewards/accuracies": 0.375, "rewards/chosen": -0.5691230893135071, "rewards/margins": 0.09723450988531113, "rewards/rejected": -0.6663576364517212, "step": 236 }, { "epoch": 0.3400286944045911, "grad_norm": 21.65779685974121, "kl/ref_to_policy/chosen": 21.02992057800293, "kl/ref_to_policy/mean": 34.64501953125, "kl/ref_to_policy/rejected": 48.26012420654297, "learning_rate": 8.367933650044526e-06, "logits/chosen": -0.993233323097229, "logits/rejected": -0.529592752456665, "logps/chosen": -1263.3502197265625, "logps/rejected": -1290.9241943359375, "loss": 4.0469, "nll_loss": 0.862508237361908, "rewards/accuracies": 0.5, "rewards/chosen": -0.21029922366142273, "rewards/margins": 0.2723020315170288, "rewards/rejected": -0.48260125517845154, "step": 237 }, { "epoch": 0.34146341463414637, "grad_norm": 6.712571620941162, "kl/ref_to_policy/chosen": 32.61610412597656, "kl/ref_to_policy/mean": 42.11166000366211, "kl/ref_to_policy/rejected": 51.60721969604492, "learning_rate": 8.349374889298923e-06, "logits/chosen": -0.7958390712738037, "logits/rejected": -0.4499036371707916, "logps/chosen": -2153.303466796875, "logps/rejected": -2179.921630859375, "loss": 5.2697, "nll_loss": 1.1608816385269165, "rewards/accuracies": 0.4375, "rewards/chosen": -0.32616105675697327, "rewards/margins": 0.1899111270904541, "rewards/rejected": -0.516072154045105, "step": 238 }, { "epoch": 0.3428981348637016, "grad_norm": 6.632086753845215, "kl/ref_to_policy/chosen": 28.887794494628906, "kl/ref_to_policy/mean": 39.7597541809082, "kl/ref_to_policy/rejected": 50.63172149658203, "learning_rate": 8.330732041813367e-06, "logits/chosen": -0.9702951908111572, "logits/rejected": -0.5898388028144836, "logps/chosen": -1837.6689453125, "logps/rejected": -1865.8211669921875, "loss": 5.2391, "nll_loss": 1.155412197113037, "rewards/accuracies": 0.375, "rewards/chosen": -0.2888779640197754, "rewards/margins": 0.2174391895532608, "rewards/rejected": -0.506317138671875, "step": 239 }, { "epoch": 0.34433285509325684, "grad_norm": 6.578116416931152, "kl/ref_to_policy/chosen": 22.61591339111328, "kl/ref_to_policy/mean": 38.437286376953125, "kl/ref_to_policy/rejected": 54.25865173339844, "learning_rate": 8.312005575620355e-06, "logits/chosen": -0.8463482856750488, "logits/rejected": -0.39940422773361206, "logps/chosen": -1464.280029296875, "logps/rejected": -1496.2899169921875, "loss": 4.652, "nll_loss": 1.018854022026062, "rewards/accuracies": 0.5, "rewards/chosen": -0.22615914046764374, "rewards/margins": 0.3164273500442505, "rewards/rejected": -0.5425864458084106, "step": 240 }, { "epoch": 0.34433285509325684, "eval_kl/ref_to_policy/chosen": 17.169864654541016, "eval_kl/ref_to_policy/mean": 36.62061309814453, "eval_kl/ref_to_policy/rejected": 56.07136535644531, "eval_logits/chosen": -1.0018715858459473, "eval_logits/rejected": -0.5424321889877319, "eval_logps/chosen": -1541.4058837890625, "eval_logps/rejected": -1579.7940673828125, "eval_loss": 4.379458427429199, "eval_nll_loss": 0.9567639231681824, "eval_rewards/accuracies": 0.5531914830207825, "eval_rewards/chosen": -0.17169862985610962, "eval_rewards/margins": 0.38901498913764954, "eval_rewards/rejected": -0.5607136487960815, "eval_runtime": 112.0827, "eval_samples_per_second": 3.355, "eval_steps_per_second": 1.677, "step": 240 }, { "epoch": 0.34576757532281205, "grad_norm": 25.422452926635742, "kl/ref_to_policy/chosen": -7.484866142272949, "kl/ref_to_policy/mean": 15.0338134765625, "kl/ref_to_policy/rejected": 37.552494049072266, "learning_rate": 8.293195960851634e-06, "logits/chosen": -1.3117115497589111, "logits/rejected": -0.7235029935836792, "logps/chosen": -747.61669921875, "logps/rejected": -786.290771484375, "loss": 3.3667, "nll_loss": 0.7088727355003357, "rewards/accuracies": 0.6875, "rewards/chosen": 0.07484866678714752, "rewards/margins": 0.4503735899925232, "rewards/rejected": -0.37552493810653687, "step": 241 }, { "epoch": 0.3472022955523673, "grad_norm": 15.401252746582031, "kl/ref_to_policy/chosen": 17.324237823486328, "kl/ref_to_policy/mean": 35.11741638183594, "kl/ref_to_policy/rejected": 52.91059112548828, "learning_rate": 8.274303669726427e-06, "logits/chosen": -1.0597269535064697, "logits/rejected": -0.5660406351089478, "logps/chosen": -1168.91015625, "logps/rejected": -1204.27197265625, "loss": 3.876, "nll_loss": 0.8275965452194214, "rewards/accuracies": 0.5, "rewards/chosen": -0.17324239015579224, "rewards/margins": 0.3558635413646698, "rewards/rejected": -0.5291059017181396, "step": 242 }, { "epoch": 0.3486370157819225, "grad_norm": 115.6594009399414, "kl/ref_to_policy/chosen": 24.91469955444336, "kl/ref_to_policy/mean": 47.08078384399414, "kl/ref_to_policy/rejected": 69.24685668945312, "learning_rate": 8.255329176539552e-06, "logits/chosen": -1.2614933252334595, "logits/rejected": -0.6769732236862183, "logps/chosen": -1960.5084228515625, "logps/rejected": -2000.348388671875, "loss": 3.8228, "nll_loss": 0.8245071768760681, "rewards/accuracies": 0.6875, "rewards/chosen": -0.24914702773094177, "rewards/margins": 0.44332155585289, "rewards/rejected": -0.6924685835838318, "step": 243 }, { "epoch": 0.3500717360114778, "grad_norm": 45.91886901855469, "kl/ref_to_policy/chosen": 29.474626541137695, "kl/ref_to_policy/mean": 42.299278259277344, "kl/ref_to_policy/rejected": 55.123931884765625, "learning_rate": 8.236272957649534e-06, "logits/chosen": -0.9548101425170898, "logits/rejected": -0.5192294120788574, "logps/chosen": -1589.0294189453125, "logps/rejected": -1619.5693359375, "loss": 5.0035, "nll_loss": 1.0993833541870117, "rewards/accuracies": 0.4375, "rewards/chosen": -0.2947462499141693, "rewards/margins": 0.2564930319786072, "rewards/rejected": -0.5512392520904541, "step": 244 }, { "epoch": 0.351506456241033, "grad_norm": 7.5245161056518555, "kl/ref_to_policy/chosen": 30.347501754760742, "kl/ref_to_policy/mean": 45.829898834228516, "kl/ref_to_policy/rejected": 61.31229019165039, "learning_rate": 8.217135491466636e-06, "logits/chosen": -0.9801848530769348, "logits/rejected": -0.580032229423523, "logps/chosen": -1981.058349609375, "logps/rejected": -2016.710693359375, "loss": 4.9376, "nll_loss": 1.087804913520813, "rewards/accuracies": 0.4375, "rewards/chosen": -0.3034749925136566, "rewards/margins": 0.30964791774749756, "rewards/rejected": -0.6131228804588318, "step": 245 }, { "epoch": 0.35294117647058826, "grad_norm": 19.82720375061035, "kl/ref_to_policy/chosen": 32.571048736572266, "kl/ref_to_policy/mean": 47.724220275878906, "kl/ref_to_policy/rejected": 62.87739562988281, "learning_rate": 8.197917258440851e-06, "logits/chosen": -0.8870335221290588, "logits/rejected": -0.5863248109817505, "logps/chosen": -1700.922607421875, "logps/rejected": -1735.76611328125, "loss": 5.0059, "nll_loss": 1.0981236696243286, "rewards/accuracies": 0.4375, "rewards/chosen": -0.3257104456424713, "rewards/margins": 0.30306345224380493, "rewards/rejected": -0.6287739276885986, "step": 246 }, { "epoch": 0.35437589670014347, "grad_norm": 8.00979232788086, "kl/ref_to_policy/chosen": 5.792469501495361, "kl/ref_to_policy/mean": 31.7496395111084, "kl/ref_to_policy/rejected": 57.706809997558594, "learning_rate": 8.178618741049841e-06, "logits/chosen": -1.2319694757461548, "logits/rejected": -0.7779315114021301, "logps/chosen": -1522.44091796875, "logps/rejected": -1573.792724609375, "loss": 3.3516, "nll_loss": 0.7086329460144043, "rewards/accuracies": 0.625, "rewards/chosen": -0.05792468041181564, "rewards/margins": 0.5191434025764465, "rewards/rejected": -0.5770680904388428, "step": 247 }, { "epoch": 0.35581061692969873, "grad_norm": 7.451022148132324, "kl/ref_to_policy/chosen": 10.17514419555664, "kl/ref_to_policy/mean": 38.460018157958984, "kl/ref_to_policy/rejected": 66.7448959350586, "learning_rate": 8.15924042378682e-06, "logits/chosen": -1.052493691444397, "logits/rejected": -0.7804156541824341, "logps/chosen": -1287.21435546875, "logps/rejected": -1341.2508544921875, "loss": 3.6872, "nll_loss": 0.7962843775749207, "rewards/accuracies": 0.5625, "rewards/chosen": -0.1017514318227768, "rewards/margins": 0.5656975507736206, "rewards/rejected": -0.6674489974975586, "step": 248 }, { "epoch": 0.35724533715925394, "grad_norm": 5.169087886810303, "kl/ref_to_policy/chosen": 5.0238518714904785, "kl/ref_to_policy/mean": 39.98102569580078, "kl/ref_to_policy/rejected": 74.93820190429688, "learning_rate": 8.1397827931484e-06, "logits/chosen": -1.375598669052124, "logits/rejected": -0.9089187383651733, "logps/chosen": -963.9144287109375, "logps/rejected": -1025.3717041015625, "loss": 2.6163, "nll_loss": 0.5422717332839966, "rewards/accuracies": 0.6875, "rewards/chosen": -0.05023851990699768, "rewards/margins": 0.6991434693336487, "rewards/rejected": -0.7493820190429688, "step": 249 }, { "epoch": 0.3586800573888092, "grad_norm": 13.00700855255127, "kl/ref_to_policy/chosen": 10.62474250793457, "kl/ref_to_policy/mean": 33.92547607421875, "kl/ref_to_policy/rejected": 57.22620391845703, "learning_rate": 8.120246337622364e-06, "logits/chosen": -1.2421627044677734, "logits/rejected": -0.7955504655838013, "logps/chosen": -983.241943359375, "logps/rejected": -1026.5308837890625, "loss": 3.6016, "nll_loss": 0.7638582587242126, "rewards/accuracies": 0.5625, "rewards/chosen": -0.10624740272760391, "rewards/margins": 0.4660146236419678, "rewards/rejected": -0.5722620487213135, "step": 250 }, { "epoch": 0.3586800573888092, "eval_kl/ref_to_policy/chosen": 16.018667221069336, "eval_kl/ref_to_policy/mean": 38.4786262512207, "eval_kl/ref_to_policy/rejected": 60.9385871887207, "eval_logits/chosen": -1.1075433492660522, "eval_logits/rejected": -0.6289094090461731, "eval_logps/chosen": -1540.254638671875, "eval_logps/rejected": -1584.6612548828125, "eval_loss": 4.33317232131958, "eval_nll_loss": 0.9463233947753906, "eval_rewards/accuracies": 0.5531914830207825, "eval_rewards/chosen": -0.16018667817115784, "eval_rewards/margins": 0.44919919967651367, "eval_rewards/rejected": -0.6093858480453491, "eval_runtime": 111.8838, "eval_samples_per_second": 3.361, "eval_steps_per_second": 1.68, "step": 250 }, { "epoch": 0.3601147776183644, "grad_norm": 7.613894462585449, "kl/ref_to_policy/chosen": -11.865091323852539, "kl/ref_to_policy/mean": 32.320701599121094, "kl/ref_to_policy/rejected": 76.5064926147461, "learning_rate": 8.100631547675417e-06, "logits/chosen": -1.652207612991333, "logits/rejected": -0.8896211981773376, "logps/chosen": -359.9083557128906, "logps/rejected": -435.5748291015625, "loss": 1.5834, "nll_loss": 0.29873883724212646, "rewards/accuracies": 0.8125, "rewards/chosen": 0.11865091323852539, "rewards/margins": 0.883715808391571, "rewards/rejected": -0.7650648951530457, "step": 251 }, { "epoch": 0.3615494978479197, "grad_norm": 15.11418628692627, "kl/ref_to_policy/chosen": 13.661087036132812, "kl/ref_to_policy/mean": 33.67607116699219, "kl/ref_to_policy/rejected": 53.69105529785156, "learning_rate": 8.080938915740863e-06, "logits/chosen": -1.231611967086792, "logits/rejected": -0.7438386678695679, "logps/chosen": -1225.3955078125, "logps/rejected": -1267.8160400390625, "loss": 4.0462, "nll_loss": 0.8692629933357239, "rewards/accuracies": 0.5, "rewards/chosen": -0.13661088049411774, "rewards/margins": 0.40029963850975037, "rewards/rejected": -0.5369105339050293, "step": 252 }, { "epoch": 0.3629842180774749, "grad_norm": 5.607687950134277, "kl/ref_to_policy/chosen": 40.60003662109375, "kl/ref_to_policy/mean": 68.18260192871094, "kl/ref_to_policy/rejected": 95.7651596069336, "learning_rate": 8.06116893620624e-06, "logits/chosen": -1.0766286849975586, "logits/rejected": -0.676904559135437, "logps/chosen": -1781.170166015625, "logps/rejected": -1836.76513671875, "loss": 4.294, "nll_loss": 0.942596971988678, "rewards/accuracies": 0.5625, "rewards/chosen": -0.4060003459453583, "rewards/margins": 0.5516511797904968, "rewards/rejected": -0.9576514959335327, "step": 253 }, { "epoch": 0.36441893830703015, "grad_norm": 7.121146202087402, "kl/ref_to_policy/chosen": 39.372772216796875, "kl/ref_to_policy/mean": 53.40095520019531, "kl/ref_to_policy/rejected": 67.42913055419922, "learning_rate": 8.041322105400923e-06, "logits/chosen": -0.657172679901123, "logits/rejected": -0.4716494083404541, "logps/chosen": -2248.763427734375, "logps/rejected": -2283.64208984375, "loss": 6.4841, "nll_loss": 1.4607887268066406, "rewards/accuracies": 0.375, "rewards/chosen": -0.39372771978378296, "rewards/margins": 0.280563622713089, "rewards/rejected": -0.6742913126945496, "step": 254 }, { "epoch": 0.36585365853658536, "grad_norm": 7.964898109436035, "kl/ref_to_policy/chosen": 28.117183685302734, "kl/ref_to_policy/mean": 48.636077880859375, "kl/ref_to_policy/rejected": 69.15496826171875, "learning_rate": 8.021398921583644e-06, "logits/chosen": -0.8078469038009644, "logits/rejected": -0.5894169807434082, "logps/chosen": -1380.8515625, "logps/rejected": -1429.643310546875, "loss": 4.9908, "nll_loss": 1.102419137954712, "rewards/accuracies": 0.375, "rewards/chosen": -0.2811718285083771, "rewards/margins": 0.4103778600692749, "rewards/rejected": -0.6915497183799744, "step": 255 }, { "epoch": 0.3672883787661406, "grad_norm": 27.105947494506836, "kl/ref_to_policy/chosen": 16.077402114868164, "kl/ref_to_policy/mean": 40.47259521484375, "kl/ref_to_policy/rejected": 64.86779022216797, "learning_rate": 8.001399884930004e-06, "logits/chosen": -1.1149896383285522, "logits/rejected": -0.8922098278999329, "logps/chosen": -960.405517578125, "logps/rejected": -1009.9439086914062, "loss": 3.5352, "nll_loss": 0.737130343914032, "rewards/accuracies": 0.5, "rewards/chosen": -0.16077402234077454, "rewards/margins": 0.4879039227962494, "rewards/rejected": -0.6486778855323792, "step": 256 }, { "epoch": 0.36872309899569583, "grad_norm": 28.845355987548828, "kl/ref_to_policy/chosen": 61.18776321411133, "kl/ref_to_policy/mean": 69.74577331542969, "kl/ref_to_policy/rejected": 78.30377960205078, "learning_rate": 7.981325497519892e-06, "logits/chosen": -0.6666136384010315, "logits/rejected": -0.5567719340324402, "logps/chosen": -2319.234619140625, "logps/rejected": -2349.6416015625, "loss": 6.1946, "nll_loss": 1.3781743049621582, "rewards/accuracies": 0.25, "rewards/chosen": -0.6118776798248291, "rewards/margins": 0.17116019129753113, "rewards/rejected": -0.7830377817153931, "step": 257 }, { "epoch": 0.3701578192252511, "grad_norm": 29.876934051513672, "kl/ref_to_policy/chosen": 3.3559603691101074, "kl/ref_to_policy/mean": 37.15166091918945, "kl/ref_to_policy/rejected": 70.94735717773438, "learning_rate": 7.961176263324902e-06, "logits/chosen": -1.2599719762802124, "logits/rejected": -0.8327604532241821, "logps/chosen": -1085.28564453125, "logps/rejected": -1149.087890625, "loss": 3.4588, "nll_loss": 0.7463341951370239, "rewards/accuracies": 0.625, "rewards/chosen": -0.03355959802865982, "rewards/margins": 0.67591392993927, "rewards/rejected": -0.7094736099243164, "step": 258 }, { "epoch": 0.3715925394548063, "grad_norm": 9.871208190917969, "kl/ref_to_policy/chosen": 9.048161506652832, "kl/ref_to_policy/mean": 37.6701545715332, "kl/ref_to_policy/rejected": 66.29214477539062, "learning_rate": 7.940952688195668e-06, "logits/chosen": -0.9458192586898804, "logits/rejected": -0.5606811046600342, "logps/chosen": -1291.993408203125, "logps/rejected": -1349.3995361328125, "loss": 4.4821, "nll_loss": 0.9939285516738892, "rewards/accuracies": 0.5625, "rewards/chosen": -0.09048160910606384, "rewards/margins": 0.5724398493766785, "rewards/rejected": -0.6629214286804199, "step": 259 }, { "epoch": 0.37302725968436157, "grad_norm": 109.36133575439453, "kl/ref_to_policy/chosen": 13.995379447937012, "kl/ref_to_policy/mean": 33.1147575378418, "kl/ref_to_policy/rejected": 52.23413848876953, "learning_rate": 7.920655279849173e-06, "logits/chosen": -1.135756015777588, "logits/rejected": -0.7083947658538818, "logps/chosen": -1423.432861328125, "logps/rejected": -1460.466552734375, "loss": 5.221, "nll_loss": 1.164181113243103, "rewards/accuracies": 0.625, "rewards/chosen": -0.13995379209518433, "rewards/margins": 0.38238751888275146, "rewards/rejected": -0.5223413109779358, "step": 260 }, { "epoch": 0.37302725968436157, "eval_kl/ref_to_policy/chosen": 14.8417387008667, "eval_kl/ref_to_policy/mean": 27.570377349853516, "eval_kl/ref_to_policy/rejected": 40.299015045166016, "eval_logits/chosen": -1.2718884944915771, "eval_logits/rejected": -0.8461806774139404, "eval_logps/chosen": -1539.0777587890625, "eval_logps/rejected": -1564.0216064453125, "eval_loss": 5.2694878578186035, "eval_nll_loss": 1.165606141090393, "eval_rewards/accuracies": 0.5585106611251831, "eval_rewards/chosen": -0.1484173834323883, "eval_rewards/margins": 0.2545727789402008, "eval_rewards/rejected": -0.4029901325702667, "eval_runtime": 111.304, "eval_samples_per_second": 3.378, "eval_steps_per_second": 1.689, "step": 260 }, { "epoch": 0.3744619799139168, "grad_norm": 252.8405303955078, "kl/ref_to_policy/chosen": 1.284433364868164, "kl/ref_to_policy/mean": 10.821643829345703, "kl/ref_to_policy/rejected": 20.358854293823242, "learning_rate": 7.900284547855992e-06, "logits/chosen": -1.717612624168396, "logits/rejected": -1.259250283241272, "logps/chosen": -905.550048828125, "logps/rejected": -921.3549194335938, "loss": 5.9719, "nll_loss": 1.335500717163086, "rewards/accuracies": 0.625, "rewards/chosen": -0.012844335287809372, "rewards/margins": 0.19074422121047974, "rewards/rejected": -0.20358853042125702, "step": 261 }, { "epoch": 0.37589670014347204, "grad_norm": 162.8564910888672, "kl/ref_to_policy/chosen": 19.935516357421875, "kl/ref_to_policy/mean": 32.5312614440918, "kl/ref_to_policy/rejected": 45.12701416015625, "learning_rate": 7.87984100362751e-06, "logits/chosen": -1.4650988578796387, "logits/rejected": -0.9517897367477417, "logps/chosen": -1464.412109375, "logps/rejected": -1484.2032470703125, "loss": 4.6987, "nll_loss": 1.0253028869628906, "rewards/accuracies": 0.6875, "rewards/chosen": -0.19935515522956848, "rewards/margins": 0.25191494822502136, "rewards/rejected": -0.45127007365226746, "step": 262 }, { "epoch": 0.37733142037302725, "grad_norm": 99.16685485839844, "kl/ref_to_policy/chosen": 14.522640228271484, "kl/ref_to_policy/mean": 38.65250015258789, "kl/ref_to_policy/rejected": 62.7823600769043, "learning_rate": 7.859325160403073e-06, "logits/chosen": -1.26975679397583, "logits/rejected": -0.7215903997421265, "logps/chosen": -1346.3585205078125, "logps/rejected": -1392.7296142578125, "loss": 3.3995, "nll_loss": 0.7170107364654541, "rewards/accuracies": 0.5625, "rewards/chosen": -0.145226389169693, "rewards/margins": 0.4825972020626068, "rewards/rejected": -0.6278235912322998, "step": 263 }, { "epoch": 0.3787661406025825, "grad_norm": 10.159520149230957, "kl/ref_to_policy/chosen": -3.856257915496826, "kl/ref_to_policy/mean": 27.614505767822266, "kl/ref_to_policy/rejected": 59.08526611328125, "learning_rate": 7.838737533237111e-06, "logits/chosen": -1.52314293384552, "logits/rejected": -0.9002086520195007, "logps/chosen": -1034.339111328125, "logps/rejected": -1089.241455078125, "loss": 2.9157, "nll_loss": 0.6085465550422668, "rewards/accuracies": 0.75, "rewards/chosen": 0.03856257349252701, "rewards/margins": 0.6294152736663818, "rewards/rejected": -0.5908526182174683, "step": 264 }, { "epoch": 0.3802008608321377, "grad_norm": 83.3934097290039, "kl/ref_to_policy/chosen": 19.88848114013672, "kl/ref_to_policy/mean": 47.45388412475586, "kl/ref_to_policy/rejected": 75.019287109375, "learning_rate": 7.818078638986208e-06, "logits/chosen": -1.2715070247650146, "logits/rejected": -0.846549928188324, "logps/chosen": -1306.075927734375, "logps/rejected": -1361.624755859375, "loss": 3.6918, "nll_loss": 0.7912173867225647, "rewards/accuracies": 0.5625, "rewards/chosen": -0.19888480007648468, "rewards/margins": 0.5513080954551697, "rewards/rejected": -0.7501928806304932, "step": 265 }, { "epoch": 0.381635581061693, "grad_norm": 61.0827522277832, "kl/ref_to_policy/chosen": 45.19354248046875, "kl/ref_to_policy/mean": 70.34971618652344, "kl/ref_to_policy/rejected": 95.50589752197266, "learning_rate": 7.797348996296116e-06, "logits/chosen": -1.0887806415557861, "logits/rejected": -0.6628750562667847, "logps/chosen": -1316.732421875, "logps/rejected": -1364.6484375, "loss": 3.8612, "nll_loss": 0.8183290362358093, "rewards/accuracies": 0.5625, "rewards/chosen": -0.45193544030189514, "rewards/margins": 0.5031235218048096, "rewards/rejected": -0.9550589323043823, "step": 266 }, { "epoch": 0.3830703012912482, "grad_norm": 46.22964859008789, "kl/ref_to_policy/chosen": 64.69705200195312, "kl/ref_to_policy/mean": 91.2848129272461, "kl/ref_to_policy/rejected": 117.87257385253906, "learning_rate": 7.776549125588743e-06, "logits/chosen": -1.1336941719055176, "logits/rejected": -0.7271771430969238, "logps/chosen": -1468.556396484375, "logps/rejected": -1519.43896484375, "loss": 3.9874, "nll_loss": 0.8530172109603882, "rewards/accuracies": 0.5625, "rewards/chosen": -0.6469704508781433, "rewards/margins": 0.5317552089691162, "rewards/rejected": -1.1787256002426147, "step": 267 }, { "epoch": 0.38450502152080346, "grad_norm": 15.211583137512207, "kl/ref_to_policy/chosen": 82.80394744873047, "kl/ref_to_policy/mean": 87.29264068603516, "kl/ref_to_policy/rejected": 91.78133392333984, "learning_rate": 7.755679549049093e-06, "logits/chosen": -0.6956257820129395, "logits/rejected": -0.47820228338241577, "logps/chosen": -2220.81201171875, "logps/rejected": -2238.127197265625, "loss": 5.9226, "nll_loss": 1.3077993392944336, "rewards/accuracies": 0.3125, "rewards/chosen": -0.8280395269393921, "rewards/margins": 0.08977380394935608, "rewards/rejected": -0.9178133010864258, "step": 268 }, { "epoch": 0.38593974175035867, "grad_norm": 17.50469970703125, "kl/ref_to_policy/chosen": 96.14873504638672, "kl/ref_to_policy/mean": 114.79085540771484, "kl/ref_to_policy/rejected": 133.4329833984375, "learning_rate": 7.734740790612137e-06, "logits/chosen": -0.9103230834007263, "logits/rejected": -0.5452337861061096, "logps/chosen": -2529.820556640625, "logps/rejected": -2570.60693359375, "loss": 5.6537, "nll_loss": 1.26478910446167, "rewards/accuracies": 0.375, "rewards/chosen": -0.961487352848053, "rewards/margins": 0.3728424906730652, "rewards/rejected": -1.3343297243118286, "step": 269 }, { "epoch": 0.38737446197991393, "grad_norm": 9.052054405212402, "kl/ref_to_policy/chosen": 69.96853637695312, "kl/ref_to_policy/mean": 80.99673461914062, "kl/ref_to_policy/rejected": 92.02494049072266, "learning_rate": 7.713733375949677e-06, "logits/chosen": -0.7047710418701172, "logits/rejected": -0.4739341735839844, "logps/chosen": -2303.078125, "logps/rejected": -2329.27685546875, "loss": 5.557, "nll_loss": 1.2285293340682983, "rewards/accuracies": 0.375, "rewards/chosen": -0.699685275554657, "rewards/margins": 0.2205641269683838, "rewards/rejected": -0.920249342918396, "step": 270 }, { "epoch": 0.38737446197991393, "eval_kl/ref_to_policy/chosen": 19.828245162963867, "eval_kl/ref_to_policy/mean": 39.073604583740234, "eval_kl/ref_to_policy/rejected": 58.3189582824707, "eval_logits/chosen": -1.0306862592697144, "eval_logits/rejected": -0.5821533799171448, "eval_logps/chosen": -1544.064453125, "eval_logps/rejected": -1582.0413818359375, "eval_loss": 4.364048957824707, "eval_nll_loss": 0.9471612572669983, "eval_rewards/accuracies": 0.5558510422706604, "eval_rewards/chosen": -0.1982824206352234, "eval_rewards/margins": 0.38490718603134155, "eval_rewards/rejected": -0.5831896066665649, "eval_runtime": 111.7319, "eval_samples_per_second": 3.365, "eval_steps_per_second": 1.683, "step": 270 }, { "epoch": 0.38880918220946914, "grad_norm": 11.30400276184082, "kl/ref_to_policy/chosen": 10.004558563232422, "kl/ref_to_policy/mean": 34.760955810546875, "kl/ref_to_policy/rejected": 59.517356872558594, "learning_rate": 7.692657832457146e-06, "logits/chosen": -1.1612435579299927, "logits/rejected": -0.6689969301223755, "logps/chosen": -1346.6060791015625, "logps/rejected": -1394.61328125, "loss": 4.0145, "nll_loss": 0.868366539478302, "rewards/accuracies": 0.5625, "rewards/chosen": -0.1000455766916275, "rewards/margins": 0.49512794613838196, "rewards/rejected": -0.5951735973358154, "step": 271 }, { "epoch": 0.3902439024390244, "grad_norm": 18.888046264648438, "kl/ref_to_policy/chosen": 11.432906150817871, "kl/ref_to_policy/mean": 33.38811111450195, "kl/ref_to_policy/rejected": 55.34331512451172, "learning_rate": 7.671514689240366e-06, "logits/chosen": -1.0917103290557861, "logits/rejected": -0.596222996711731, "logps/chosen": -1406.8626708984375, "logps/rejected": -1451.23193359375, "loss": 4.0959, "nll_loss": 0.8845667243003845, "rewards/accuracies": 0.5, "rewards/chosen": -0.11432904750108719, "rewards/margins": 0.4391041100025177, "rewards/rejected": -0.5534331202507019, "step": 272 }, { "epoch": 0.3916786226685796, "grad_norm": 7.387451648712158, "kl/ref_to_policy/chosen": 40.37288284301758, "kl/ref_to_policy/mean": 66.18487548828125, "kl/ref_to_policy/rejected": 91.99687194824219, "learning_rate": 7.650304477102258e-06, "logits/chosen": -0.9579944610595703, "logits/rejected": -0.6191481947898865, "logps/chosen": -1449.1854248046875, "logps/rejected": -1503.9324951171875, "loss": 4.5279, "nll_loss": 0.9933135509490967, "rewards/accuracies": 0.4375, "rewards/chosen": -0.4037288427352905, "rewards/margins": 0.5162398815155029, "rewards/rejected": -0.9199687242507935, "step": 273 }, { "epoch": 0.3931133428981349, "grad_norm": 6.750729560852051, "kl/ref_to_policy/chosen": 18.91887664794922, "kl/ref_to_policy/mean": 37.749778747558594, "kl/ref_to_policy/rejected": 56.5806770324707, "learning_rate": 7.629027728529527e-06, "logits/chosen": -0.9772347211837769, "logits/rejected": -0.6896437406539917, "logps/chosen": -1535.2001953125, "logps/rejected": -1575.0670166015625, "loss": 4.4085, "nll_loss": 0.9565789699554443, "rewards/accuracies": 0.5, "rewards/chosen": -0.18918877840042114, "rewards/margins": 0.37661799788475037, "rewards/rejected": -0.5658068060874939, "step": 274 }, { "epoch": 0.3945480631276901, "grad_norm": 10.25978946685791, "kl/ref_to_policy/chosen": 20.60467529296875, "kl/ref_to_policy/mean": 31.57822036743164, "kl/ref_to_policy/rejected": 42.551761627197266, "learning_rate": 7.607684977679284e-06, "logits/chosen": -0.8952125310897827, "logits/rejected": -0.6147925853729248, "logps/chosen": -1838.7686767578125, "logps/rejected": -1866.6708984375, "loss": 5.3468, "nll_loss": 1.178180456161499, "rewards/accuracies": 0.375, "rewards/chosen": -0.20604674518108368, "rewards/margins": 0.21947085857391357, "rewards/rejected": -0.42551761865615845, "step": 275 }, { "epoch": 0.39598278335724535, "grad_norm": 41.287742614746094, "kl/ref_to_policy/chosen": 17.4279842376709, "kl/ref_to_policy/mean": 44.05318069458008, "kl/ref_to_policy/rejected": 70.67837524414062, "learning_rate": 7.586276760365645e-06, "logits/chosen": -1.2635231018066406, "logits/rejected": -0.9259144067764282, "logps/chosen": -1701.0609130859375, "logps/rejected": -1750.563232421875, "loss": 4.1755, "nll_loss": 0.9122381806373596, "rewards/accuracies": 0.6875, "rewards/chosen": -0.1742798537015915, "rewards/margins": 0.5325038433074951, "rewards/rejected": -0.7067837119102478, "step": 276 }, { "epoch": 0.39741750358680056, "grad_norm": 6.462818145751953, "kl/ref_to_policy/chosen": 38.39432144165039, "kl/ref_to_policy/mean": 60.10942459106445, "kl/ref_to_policy/rejected": 81.82453155517578, "learning_rate": 7.564803614046276e-06, "logits/chosen": -1.0375885963439941, "logits/rejected": -0.8021103739738464, "logps/chosen": -2010.206787109375, "logps/rejected": -2056.322509765625, "loss": 4.4251, "nll_loss": 0.9644858837127686, "rewards/accuracies": 0.5625, "rewards/chosen": -0.38394320011138916, "rewards/margins": 0.43430209159851074, "rewards/rejected": -0.8182452321052551, "step": 277 }, { "epoch": 0.3988522238163558, "grad_norm": 3457.1455078125, "kl/ref_to_policy/chosen": 27.474071502685547, "kl/ref_to_policy/mean": 53.26298522949219, "kl/ref_to_policy/rejected": 79.05189514160156, "learning_rate": 7.543266077808893e-06, "logits/chosen": -1.087677001953125, "logits/rejected": -0.9330421090126038, "logps/chosen": -1806.02880859375, "logps/rejected": -1862.960205078125, "loss": 6.0225, "nll_loss": 1.3682893514633179, "rewards/accuracies": 0.5, "rewards/chosen": -0.27474069595336914, "rewards/margins": 0.5157782435417175, "rewards/rejected": -0.7905189990997314, "step": 278 }, { "epoch": 0.40028694404591103, "grad_norm": 6.894753456115723, "kl/ref_to_policy/chosen": 43.226341247558594, "kl/ref_to_policy/mean": 59.48128128051758, "kl/ref_to_policy/rejected": 75.73622131347656, "learning_rate": 7.521664692357737e-06, "logits/chosen": -0.7117177248001099, "logits/rejected": -0.6439226865768433, "logps/chosen": -2516.663818359375, "logps/rejected": -2557.806640625, "loss": 5.973, "nll_loss": 1.3320398330688477, "rewards/accuracies": 0.375, "rewards/chosen": -0.43226343393325806, "rewards/margins": 0.32509875297546387, "rewards/rejected": -0.7573621869087219, "step": 279 }, { "epoch": 0.4017216642754663, "grad_norm": 7.228130340576172, "kl/ref_to_policy/chosen": 54.54338073730469, "kl/ref_to_policy/mean": 67.46078491210938, "kl/ref_to_policy/rejected": 80.37818908691406, "learning_rate": 7.500000000000001e-06, "logits/chosen": -0.818718671798706, "logits/rejected": -0.6446835398674011, "logps/chosen": -2470.46533203125, "logps/rejected": -2504.73779296875, "loss": 5.6518, "nll_loss": 1.2466131448745728, "rewards/accuracies": 0.375, "rewards/chosen": -0.545433759689331, "rewards/margins": 0.25834810733795166, "rewards/rejected": -0.8037818670272827, "step": 280 }, { "epoch": 0.4017216642754663, "eval_kl/ref_to_policy/chosen": 15.38228702545166, "eval_kl/ref_to_policy/mean": 47.744876861572266, "eval_kl/ref_to_policy/rejected": 80.10746765136719, "eval_logits/chosen": -1.242561936378479, "eval_logits/rejected": -0.949858546257019, "eval_logps/chosen": -1539.618408203125, "eval_logps/rejected": -1603.830078125, "eval_loss": 4.327410697937012, "eval_nll_loss": 0.9525725841522217, "eval_rewards/accuracies": 0.5558510422706604, "eval_rewards/chosen": -0.1538228541612625, "eval_rewards/margins": 0.6472518444061279, "eval_rewards/rejected": -0.8010746836662292, "eval_runtime": 110.5424, "eval_samples_per_second": 3.401, "eval_steps_per_second": 1.701, "step": 280 }, { "epoch": 0.4031563845050215, "grad_norm": 6.946725368499756, "kl/ref_to_policy/chosen": 13.30961799621582, "kl/ref_to_policy/mean": 44.215049743652344, "kl/ref_to_policy/rejected": 75.12047576904297, "learning_rate": 7.478272544632204e-06, "logits/chosen": -1.1837881803512573, "logits/rejected": -0.9548963904380798, "logps/chosen": -1323.6455078125, "logps/rejected": -1386.3665771484375, "loss": 4.0566, "nll_loss": 0.8865386247634888, "rewards/accuracies": 0.5, "rewards/chosen": -0.13309617340564728, "rewards/margins": 0.6181086301803589, "rewards/rejected": -0.751204788684845, "step": 281 }, { "epoch": 0.40459110473457677, "grad_norm": 25.24003028869629, "kl/ref_to_policy/chosen": 46.334083557128906, "kl/ref_to_policy/mean": 56.70280075073242, "kl/ref_to_policy/rejected": 67.07151794433594, "learning_rate": 7.456482871726545e-06, "logits/chosen": -1.1379586458206177, "logits/rejected": -0.8321510553359985, "logps/chosen": -1729.0059814453125, "logps/rejected": -1753.610595703125, "loss": 5.327, "nll_loss": 1.1597989797592163, "rewards/accuracies": 0.4375, "rewards/chosen": -0.4633408188819885, "rewards/margins": 0.20737433433532715, "rewards/rejected": -0.6707152128219604, "step": 282 }, { "epoch": 0.406025824964132, "grad_norm": 24.22191047668457, "kl/ref_to_policy/chosen": -8.239789962768555, "kl/ref_to_policy/mean": 55.34055709838867, "kl/ref_to_policy/rejected": 118.9208984375, "learning_rate": 7.434631528317209e-06, "logits/chosen": -1.970827579498291, "logits/rejected": -1.3303250074386597, "logps/chosen": -608.4320678710938, "logps/rejected": -723.3134765625, "loss": 1.7474, "nll_loss": 0.358827143907547, "rewards/accuracies": 0.8125, "rewards/chosen": 0.08239787817001343, "rewards/margins": 1.2716069221496582, "rewards/rejected": -1.189208984375, "step": 283 }, { "epoch": 0.40746054519368724, "grad_norm": 25.976903915405273, "kl/ref_to_policy/chosen": 35.52540969848633, "kl/ref_to_policy/mean": 46.29151916503906, "kl/ref_to_policy/rejected": 57.05763244628906, "learning_rate": 7.412719062986632e-06, "logits/chosen": -0.9022115468978882, "logits/rejected": -0.6660032868385315, "logps/chosen": -2133.908203125, "logps/rejected": -2166.1318359375, "loss": 5.6551, "nll_loss": 1.2492789030075073, "rewards/accuracies": 0.3125, "rewards/chosen": -0.35525405406951904, "rewards/margins": 0.21532228589057922, "rewards/rejected": -0.5705763101577759, "step": 284 }, { "epoch": 0.40889526542324245, "grad_norm": 29.840560913085938, "kl/ref_to_policy/chosen": -15.530769348144531, "kl/ref_to_policy/mean": 14.047222137451172, "kl/ref_to_policy/rejected": 43.62521743774414, "learning_rate": 7.390746025851725e-06, "logits/chosen": -1.949154257774353, "logits/rejected": -1.1195555925369263, "logps/chosen": -544.7269897460938, "logps/rejected": -595.3810424804688, "loss": 2.2312, "nll_loss": 0.44038304686546326, "rewards/accuracies": 0.75, "rewards/chosen": 0.15530769526958466, "rewards/margins": 0.5915598273277283, "rewards/rejected": -0.4362521767616272, "step": 285 }, { "epoch": 0.4103299856527977, "grad_norm": 10.175561904907227, "kl/ref_to_policy/chosen": 11.382495880126953, "kl/ref_to_policy/mean": 28.769811630249023, "kl/ref_to_policy/rejected": 46.157127380371094, "learning_rate": 7.368712968550068e-06, "logits/chosen": -1.2893773317337036, "logits/rejected": -0.8395211696624756, "logps/chosen": -1798.6636962890625, "logps/rejected": -1839.0535888671875, "loss": 4.9358, "nll_loss": 1.0880094766616821, "rewards/accuracies": 0.4375, "rewards/chosen": -0.11382494866847992, "rewards/margins": 0.3477463126182556, "rewards/rejected": -0.46157124638557434, "step": 286 }, { "epoch": 0.4117647058823529, "grad_norm": 7.461395263671875, "kl/ref_to_policy/chosen": 18.15418243408203, "kl/ref_to_policy/mean": 37.51105880737305, "kl/ref_to_policy/rejected": 56.86793518066406, "learning_rate": 7.3466204442260605e-06, "logits/chosen": -1.1644994020462036, "logits/rejected": -0.790250301361084, "logps/chosen": -1914.0213623046875, "logps/rejected": -1959.883544921875, "loss": 5.4119, "nll_loss": 1.2079440355300903, "rewards/accuracies": 0.4375, "rewards/chosen": -0.1815418303012848, "rewards/margins": 0.3871375322341919, "rewards/rejected": -0.5686793327331543, "step": 287 }, { "epoch": 0.4131994261119082, "grad_norm": 11.239782333374023, "kl/ref_to_policy/chosen": 15.57080078125, "kl/ref_to_policy/mean": 57.60736846923828, "kl/ref_to_policy/rejected": 99.64393615722656, "learning_rate": 7.324469007517035e-06, "logits/chosen": -1.6499102115631104, "logits/rejected": -1.0718927383422852, "logps/chosen": -1407.87548828125, "logps/rejected": -1486.22998046875, "loss": 3.3449, "nll_loss": 0.7241158485412598, "rewards/accuracies": 0.6875, "rewards/chosen": -0.15570801496505737, "rewards/margins": 0.8407312631607056, "rewards/rejected": -0.9964392781257629, "step": 288 }, { "epoch": 0.4146341463414634, "grad_norm": 6.01121187210083, "kl/ref_to_policy/chosen": 7.373027801513672, "kl/ref_to_policy/mean": 48.14250946044922, "kl/ref_to_policy/rejected": 88.9119873046875, "learning_rate": 7.302259214539327e-06, "logits/chosen": -1.4867548942565918, "logits/rejected": -0.9414687156677246, "logps/chosen": -1363.8572998046875, "logps/rejected": -1443.429443359375, "loss": 3.6349, "nll_loss": 0.7981788516044617, "rewards/accuracies": 0.625, "rewards/chosen": -0.07373027503490448, "rewards/margins": 0.8153896331787109, "rewards/rejected": -0.8891199827194214, "step": 289 }, { "epoch": 0.41606886657101866, "grad_norm": 187.83030700683594, "kl/ref_to_policy/chosen": 18.6325626373291, "kl/ref_to_policy/mean": 45.684906005859375, "kl/ref_to_policy/rejected": 72.73724365234375, "learning_rate": 7.279991622874319e-06, "logits/chosen": -1.1036524772644043, "logits/rejected": -0.7877974510192871, "logps/chosen": -1476.718505859375, "logps/rejected": -1536.3505859375, "loss": 5.1295, "nll_loss": 1.1460914611816406, "rewards/accuracies": 0.375, "rewards/chosen": -0.18632560968399048, "rewards/margins": 0.5410469174385071, "rewards/rejected": -0.7273725271224976, "step": 290 }, { "epoch": 0.41606886657101866, "eval_kl/ref_to_policy/chosen": 14.834433555603027, "eval_kl/ref_to_policy/mean": 51.463191986083984, "eval_kl/ref_to_policy/rejected": 88.0919418334961, "eval_logits/chosen": -1.3617727756500244, "eval_logits/rejected": -0.8848711252212524, "eval_logps/chosen": -1539.0704345703125, "eval_logps/rejected": -1611.814453125, "eval_loss": 4.287560939788818, "eval_nll_loss": 0.950203001499176, "eval_rewards/accuracies": 0.5558510422706604, "eval_rewards/chosen": -0.14834433794021606, "eval_rewards/margins": 0.7325751185417175, "eval_rewards/rejected": -0.8809194564819336, "eval_runtime": 111.6509, "eval_samples_per_second": 3.368, "eval_steps_per_second": 1.684, "step": 290 }, { "epoch": 0.41750358680057387, "grad_norm": 5.895266532897949, "kl/ref_to_policy/chosen": 23.956592559814453, "kl/ref_to_policy/mean": 62.88701629638672, "kl/ref_to_policy/rejected": 101.81744384765625, "learning_rate": 7.257666791554448e-06, "logits/chosen": -1.2714165449142456, "logits/rejected": -0.8176424503326416, "logps/chosen": -1858.48779296875, "logps/rejected": -1937.1357421875, "loss": 4.4657, "nll_loss": 1.001304268836975, "rewards/accuracies": 0.5625, "rewards/chosen": -0.23956593871116638, "rewards/margins": 0.7786084413528442, "rewards/rejected": -1.018174409866333, "step": 291 }, { "epoch": 0.41893830703012913, "grad_norm": 4.940492630004883, "kl/ref_to_policy/chosen": 28.86663055419922, "kl/ref_to_policy/mean": 60.54585647583008, "kl/ref_to_policy/rejected": 92.22508239746094, "learning_rate": 7.235285281049154e-06, "logits/chosen": -1.2772539854049683, "logits/rejected": -0.8488619327545166, "logps/chosen": -2066.81201171875, "logps/rejected": -2135.096435546875, "loss": 4.5089, "nll_loss": 0.9983846545219421, "rewards/accuracies": 0.4375, "rewards/chosen": -0.2886662781238556, "rewards/margins": 0.6335845589637756, "rewards/rejected": -0.9222508072853088, "step": 292 }, { "epoch": 0.42037302725968434, "grad_norm": 15.042583465576172, "kl/ref_to_policy/chosen": 35.72322082519531, "kl/ref_to_policy/mean": 63.715492248535156, "kl/ref_to_policy/rejected": 91.70777130126953, "learning_rate": 7.212847653250828e-06, "logits/chosen": -1.104468822479248, "logits/rejected": -0.7812088131904602, "logps/chosen": -2302.71533203125, "logps/rejected": -2363.524169921875, "loss": 5.0273, "nll_loss": 1.1223615407943726, "rewards/accuracies": 0.5, "rewards/chosen": -0.3572322130203247, "rewards/margins": 0.5598454475402832, "rewards/rejected": -0.9170777201652527, "step": 293 }, { "epoch": 0.4218077474892396, "grad_norm": 9.900491714477539, "kl/ref_to_policy/chosen": 33.60314178466797, "kl/ref_to_policy/mean": 53.08051681518555, "kl/ref_to_policy/rejected": 72.55789184570312, "learning_rate": 7.190354471460692e-06, "logits/chosen": -1.028242588043213, "logits/rejected": -0.7283084392547607, "logps/chosen": -2312.32958984375, "logps/rejected": -2359.718017578125, "loss": 5.7469, "nll_loss": 1.2904824018478394, "rewards/accuracies": 0.375, "rewards/chosen": -0.336031436920166, "rewards/margins": 0.3895474076271057, "rewards/rejected": -0.7255788445472717, "step": 294 }, { "epoch": 0.4232424677187948, "grad_norm": 5.218917369842529, "kl/ref_to_policy/chosen": -14.119500160217285, "kl/ref_to_policy/mean": 47.75885772705078, "kl/ref_to_policy/rejected": 109.63722229003906, "learning_rate": 7.167806300374665e-06, "logits/chosen": -1.8007712364196777, "logits/rejected": -1.1580289602279663, "logps/chosen": -432.5680236816406, "logps/rejected": -547.14892578125, "loss": 1.9891, "nll_loss": 0.41854771971702576, "rewards/accuracies": 0.75, "rewards/chosen": 0.14119498431682587, "rewards/margins": 1.2375670671463013, "rewards/rejected": -1.096372127532959, "step": 295 }, { "epoch": 0.4246771879483501, "grad_norm": 5.107214450836182, "kl/ref_to_policy/chosen": 20.421886444091797, "kl/ref_to_policy/mean": 55.836849212646484, "kl/ref_to_policy/rejected": 91.25181579589844, "learning_rate": 7.145203706069183e-06, "logits/chosen": -1.3593839406967163, "logits/rejected": -0.950861930847168, "logps/chosen": -1405.6873779296875, "logps/rejected": -1475.478271484375, "loss": 3.8307, "nll_loss": 0.837748646736145, "rewards/accuracies": 0.5625, "rewards/chosen": -0.20421886444091797, "rewards/margins": 0.7082992792129517, "rewards/rejected": -0.9125180840492249, "step": 296 }, { "epoch": 0.4261119081779053, "grad_norm": 5.961255073547363, "kl/ref_to_policy/chosen": 23.21874237060547, "kl/ref_to_policy/mean": 48.93105697631836, "kl/ref_to_policy/rejected": 74.64337158203125, "learning_rate": 7.122547255986985e-06, "logits/chosen": -0.9941931962966919, "logits/rejected": -0.6904710531234741, "logps/chosen": -2038.1092529296875, "logps/rejected": -2094.002685546875, "loss": 5.2395, "nll_loss": 1.17426598072052, "rewards/accuracies": 0.5, "rewards/chosen": -0.232187420129776, "rewards/margins": 0.5142462253570557, "rewards/rejected": -0.746433675289154, "step": 297 }, { "epoch": 0.42754662840746055, "grad_norm": 142.90431213378906, "kl/ref_to_policy/chosen": -3.035701274871826, "kl/ref_to_policy/mean": 44.505882263183594, "kl/ref_to_policy/rejected": 92.04745483398438, "learning_rate": 7.099837518922873e-06, "logits/chosen": -1.5652508735656738, "logits/rejected": -1.000985860824585, "logps/chosen": -1089.40185546875, "logps/rejected": -1180.50830078125, "loss": 3.2761, "nll_loss": 0.7179206609725952, "rewards/accuracies": 0.6875, "rewards/chosen": 0.030357014387845993, "rewards/margins": 0.9508316516876221, "rewards/rejected": -0.9204745888710022, "step": 298 }, { "epoch": 0.42898134863701576, "grad_norm": 162.0907745361328, "kl/ref_to_policy/chosen": 4.420159339904785, "kl/ref_to_policy/mean": 51.949859619140625, "kl/ref_to_policy/rejected": 99.47954559326172, "learning_rate": 7.0770750650094335e-06, "logits/chosen": -1.521538257598877, "logits/rejected": -0.9830626845359802, "logps/chosen": -1383.068603515625, "logps/rejected": -1473.7490234375, "loss": 3.3924, "nll_loss": 0.7468375563621521, "rewards/accuracies": 0.6875, "rewards/chosen": -0.044201597571372986, "rewards/margins": 0.9505940079689026, "rewards/rejected": -0.994795560836792, "step": 299 }, { "epoch": 0.430416068866571, "grad_norm": 8.106657028198242, "kl/ref_to_policy/chosen": 55.001338958740234, "kl/ref_to_policy/mean": 66.37379455566406, "kl/ref_to_policy/rejected": 77.74623107910156, "learning_rate": 7.054260465702712e-06, "logits/chosen": -0.622576892375946, "logits/rejected": -0.4242229163646698, "logps/chosen": -2929.469970703125, "logps/rejected": -2963.6640625, "loss": 6.4801, "nll_loss": 1.4606928825378418, "rewards/accuracies": 0.375, "rewards/chosen": -0.5500134229660034, "rewards/margins": 0.2274489402770996, "rewards/rejected": -0.7774623036384583, "step": 300 }, { "epoch": 0.430416068866571, "eval_kl/ref_to_policy/chosen": 14.126577377319336, "eval_kl/ref_to_policy/mean": 53.157020568847656, "eval_kl/ref_to_policy/rejected": 92.18746185302734, "eval_logits/chosen": -1.1335643529891968, "eval_logits/rejected": -0.7704039216041565, "eval_logps/chosen": -1538.3626708984375, "eval_logps/rejected": -1615.909912109375, "eval_loss": 4.2674384117126465, "eval_nll_loss": 0.9447013139724731, "eval_rewards/accuracies": 0.5718085169792175, "eval_rewards/chosen": -0.14126577973365784, "eval_rewards/margins": 0.7806088328361511, "eval_rewards/rejected": -0.9218745827674866, "eval_runtime": 111.3785, "eval_samples_per_second": 3.376, "eval_steps_per_second": 1.688, "step": 300 }, { "epoch": 0.43185078909612623, "grad_norm": 25.958404541015625, "kl/ref_to_policy/chosen": 21.211362838745117, "kl/ref_to_policy/mean": 40.88221740722656, "kl/ref_to_policy/rejected": 60.553077697753906, "learning_rate": 7.031394293767879e-06, "logits/chosen": -1.0442531108856201, "logits/rejected": -0.7606640458106995, "logps/chosen": -1572.68017578125, "logps/rejected": -1616.4515380859375, "loss": 5.0399, "nll_loss": 1.0946316719055176, "rewards/accuracies": 0.4375, "rewards/chosen": -0.2121136337518692, "rewards/margins": 0.3934171199798584, "rewards/rejected": -0.6055306792259216, "step": 301 }, { "epoch": 0.4332855093256815, "grad_norm": 19.770246505737305, "kl/ref_to_policy/chosen": -10.429632186889648, "kl/ref_to_policy/mean": 30.428489685058594, "kl/ref_to_policy/rejected": 71.28661346435547, "learning_rate": 7.008477123264849e-06, "logits/chosen": -1.3379642963409424, "logits/rejected": -0.8935960531234741, "logps/chosen": -1085.9141845703125, "logps/rejected": -1164.8082275390625, "loss": 3.6942, "nll_loss": 0.8084920644760132, "rewards/accuracies": 0.625, "rewards/chosen": 0.10429631173610687, "rewards/margins": 0.8171623945236206, "rewards/rejected": -0.7128661870956421, "step": 302 }, { "epoch": 0.4347202295552367, "grad_norm": 5.400252342224121, "kl/ref_to_policy/chosen": 8.154787063598633, "kl/ref_to_policy/mean": 50.71799850463867, "kl/ref_to_policy/rejected": 93.28120422363281, "learning_rate": 6.985509529533859e-06, "logits/chosen": -1.0949137210845947, "logits/rejected": -0.7418695092201233, "logps/chosen": -2062.85791015625, "logps/rejected": -2147.630859375, "loss": 4.4755, "nll_loss": 1.0079482793807983, "rewards/accuracies": 0.5625, "rewards/chosen": -0.08154787123203278, "rewards/margins": 0.8512642979621887, "rewards/rejected": -0.9328121542930603, "step": 303 }, { "epoch": 0.43615494978479197, "grad_norm": 6.57516622543335, "kl/ref_to_policy/chosen": 4.171238899230957, "kl/ref_to_policy/mean": 46.485984802246094, "kl/ref_to_policy/rejected": 88.80074310302734, "learning_rate": 6.96249208918104e-06, "logits/chosen": -1.0770481824874878, "logits/rejected": -0.7910947203636169, "logps/chosen": -1291.608642578125, "logps/rejected": -1375.734375, "loss": 3.9887, "nll_loss": 0.8856726884841919, "rewards/accuracies": 0.625, "rewards/chosen": -0.04171239212155342, "rewards/margins": 0.8462949395179749, "rewards/rejected": -0.8880073428153992, "step": 304 }, { "epoch": 0.4375896700143472, "grad_norm": 5.296945095062256, "kl/ref_to_policy/chosen": 16.090246200561523, "kl/ref_to_policy/mean": 59.25251770019531, "kl/ref_to_policy/rejected": 102.414794921875, "learning_rate": 6.939425380063924e-06, "logits/chosen": -1.305524468421936, "logits/rejected": -0.8478037714958191, "logps/chosen": -1765.0120849609375, "logps/rejected": -1848.158935546875, "loss": 3.836, "nll_loss": 0.8508728742599487, "rewards/accuracies": 0.6875, "rewards/chosen": -0.1609024703502655, "rewards/margins": 0.86324542760849, "rewards/rejected": -1.0241479873657227, "step": 305 }, { "epoch": 0.43902439024390244, "grad_norm": 6.805392742156982, "kl/ref_to_policy/chosen": 1.547071099281311, "kl/ref_to_policy/mean": 47.09822082519531, "kl/ref_to_policy/rejected": 92.64936828613281, "learning_rate": 6.916309981276954e-06, "logits/chosen": -1.2990212440490723, "logits/rejected": -0.8315710425376892, "logps/chosen": -1078.2579345703125, "logps/rejected": -1167.4031982421875, "loss": 3.7931, "nll_loss": 0.8431646823883057, "rewards/accuracies": 0.625, "rewards/chosen": -0.0154707171022892, "rewards/margins": 0.9110230207443237, "rewards/rejected": -0.9264938235282898, "step": 306 }, { "epoch": 0.44045911047345765, "grad_norm": 4.678521633148193, "kl/ref_to_policy/chosen": 1.9967041015625, "kl/ref_to_policy/mean": 42.562095642089844, "kl/ref_to_policy/rejected": 83.12747955322266, "learning_rate": 6.89314647313693e-06, "logits/chosen": -1.3437225818634033, "logits/rejected": -0.8766223192214966, "logps/chosen": -1540.301513671875, "logps/rejected": -1619.04541015625, "loss": 3.7877, "nll_loss": 0.835116982460022, "rewards/accuracies": 0.6875, "rewards/chosen": -0.019967030733823776, "rewards/margins": 0.8113077282905579, "rewards/rejected": -0.8312748074531555, "step": 307 }, { "epoch": 0.4418938307030129, "grad_norm": 93.10907745361328, "kl/ref_to_policy/chosen": 21.05111312866211, "kl/ref_to_policy/mean": 70.00315856933594, "kl/ref_to_policy/rejected": 118.9552001953125, "learning_rate": 6.869935437168449e-06, "logits/chosen": -1.367333173751831, "logits/rejected": -0.996738076210022, "logps/chosen": -1739.1683349609375, "logps/rejected": -1834.189697265625, "loss": 3.5201, "nll_loss": 0.7794352173805237, "rewards/accuracies": 0.6875, "rewards/chosen": -0.21051111817359924, "rewards/margins": 0.9790409803390503, "rewards/rejected": -1.1895520687103271, "step": 308 }, { "epoch": 0.4433285509325681, "grad_norm": 209.0909423828125, "kl/ref_to_policy/chosen": -1.096451759338379, "kl/ref_to_policy/mean": 54.6810302734375, "kl/ref_to_policy/rejected": 110.45851135253906, "learning_rate": 6.846677456089305e-06, "logits/chosen": -1.6266694068908691, "logits/rejected": -1.1172118186950684, "logps/chosen": -1382.512939453125, "logps/rejected": -1487.9486083984375, "loss": 2.9387, "nll_loss": 0.6439303159713745, "rewards/accuracies": 0.75, "rewards/chosen": 0.010964524000883102, "rewards/margins": 1.1155495643615723, "rewards/rejected": -1.1045849323272705, "step": 309 }, { "epoch": 0.4447632711621234, "grad_norm": 13.572793006896973, "kl/ref_to_policy/chosen": 67.92689514160156, "kl/ref_to_policy/mean": 73.0342025756836, "kl/ref_to_policy/rejected": 78.14151000976562, "learning_rate": 6.82337311379586e-06, "logits/chosen": -0.7497025728225708, "logits/rejected": -0.5990791916847229, "logps/chosen": -2986.34130859375, "logps/rejected": -3010.7119140625, "loss": 6.4404, "nll_loss": 1.4332658052444458, "rewards/accuracies": 0.4375, "rewards/chosen": -0.6792689561843872, "rewards/margins": 0.10214617103338242, "rewards/rejected": -0.7814151048660278, "step": 310 }, { "epoch": 0.4447632711621234, "eval_kl/ref_to_policy/chosen": 8.07588005065918, "eval_kl/ref_to_policy/mean": 49.193885803222656, "eval_kl/ref_to_policy/rejected": 90.3118896484375, "eval_logits/chosen": -1.2302896976470947, "eval_logits/rejected": -0.8726959228515625, "eval_logps/chosen": -1532.3118896484375, "eval_logps/rejected": -1614.034423828125, "eval_loss": 4.225353717803955, "eval_nll_loss": 0.9438403248786926, "eval_rewards/accuracies": 0.667553186416626, "eval_rewards/chosen": -0.08075880259275436, "eval_rewards/margins": 0.8223599195480347, "eval_rewards/rejected": -0.9031187891960144, "eval_runtime": 111.1934, "eval_samples_per_second": 3.381, "eval_steps_per_second": 1.691, "step": 310 }, { "epoch": 0.44619799139167865, "grad_norm": 5.314009189605713, "kl/ref_to_policy/chosen": 8.749518394470215, "kl/ref_to_policy/mean": 49.455169677734375, "kl/ref_to_policy/rejected": 90.16081237792969, "learning_rate": 6.800022995348381e-06, "logits/chosen": -1.2027347087860107, "logits/rejected": -0.8302821516990662, "logps/chosen": -1362.60888671875, "logps/rejected": -1442.434326171875, "loss": 3.8708, "nll_loss": 0.8550440073013306, "rewards/accuracies": 0.6875, "rewards/chosen": -0.08749519288539886, "rewards/margins": 0.8141130208969116, "rewards/rejected": -0.9016082286834717, "step": 311 }, { "epoch": 0.44763271162123386, "grad_norm": 7.17567777633667, "kl/ref_to_policy/chosen": 3.4714202880859375, "kl/ref_to_policy/mean": 31.93658447265625, "kl/ref_to_policy/rejected": 60.40174865722656, "learning_rate": 6.776627686956354e-06, "logits/chosen": -1.0685971975326538, "logits/rejected": -0.7983371615409851, "logps/chosen": -1513.682373046875, "logps/rejected": -1575.65185546875, "loss": 4.6991, "nll_loss": 1.0435463190078735, "rewards/accuracies": 0.5625, "rewards/chosen": -0.03471419960260391, "rewards/margins": 0.5693032741546631, "rewards/rejected": -0.6040175557136536, "step": 312 }, { "epoch": 0.4490674318507891, "grad_norm": 124.07605743408203, "kl/ref_to_policy/chosen": 0.07867097854614258, "kl/ref_to_policy/mean": 52.178646087646484, "kl/ref_to_policy/rejected": 104.27862548828125, "learning_rate": 6.753187775963773e-06, "logits/chosen": -1.4759998321533203, "logits/rejected": -1.0837522745132446, "logps/chosen": -1018.91455078125, "logps/rejected": -1116.876220703125, "loss": 3.349, "nll_loss": 0.7409859895706177, "rewards/accuracies": 0.8125, "rewards/chosen": -0.0007866993546485901, "rewards/margins": 1.0419995784759521, "rewards/rejected": -1.0427862405776978, "step": 313 }, { "epoch": 0.45050215208034433, "grad_norm": 7.0183000564575195, "kl/ref_to_policy/chosen": 7.6307830810546875, "kl/ref_to_policy/mean": 46.665565490722656, "kl/ref_to_policy/rejected": 85.7003402709961, "learning_rate": 6.729703850834381e-06, "logits/chosen": -1.2075762748718262, "logits/rejected": -0.8927096724510193, "logps/chosen": -1140.7713623046875, "logps/rejected": -1219.1671142578125, "loss": 3.8299, "nll_loss": 0.8401798009872437, "rewards/accuracies": 0.6875, "rewards/chosen": -0.07630782574415207, "rewards/margins": 0.7806955575942993, "rewards/rejected": -0.857003390789032, "step": 314 }, { "epoch": 0.4519368723098996, "grad_norm": 5.7515692710876465, "kl/ref_to_policy/chosen": 12.633678436279297, "kl/ref_to_policy/mean": 51.713340759277344, "kl/ref_to_policy/rejected": 90.7929916381836, "learning_rate": 6.706176501136914e-06, "logits/chosen": -0.9837530255317688, "logits/rejected": -0.728460431098938, "logps/chosen": -1632.6024169921875, "logps/rejected": -1713.9296875, "loss": 4.7105, "nll_loss": 1.0605417490005493, "rewards/accuracies": 0.625, "rewards/chosen": -0.12633676826953888, "rewards/margins": 0.7815932035446167, "rewards/rejected": -0.9079298973083496, "step": 315 }, { "epoch": 0.4533715925394548, "grad_norm": 7.764797210693359, "kl/ref_to_policy/chosen": 16.847686767578125, "kl/ref_to_policy/mean": 57.04197311401367, "kl/ref_to_policy/rejected": 97.23625946044922, "learning_rate": 6.682606317530284e-06, "logits/chosen": -1.0016483068466187, "logits/rejected": -0.810640275478363, "logps/chosen": -2126.39453125, "logps/rejected": -2207.9921875, "loss": 4.7358, "nll_loss": 1.0686633586883545, "rewards/accuracies": 0.625, "rewards/chosen": -0.1684768795967102, "rewards/margins": 0.8038857579231262, "rewards/rejected": -0.9723625183105469, "step": 316 }, { "epoch": 0.45480631276901007, "grad_norm": 53.56513595581055, "kl/ref_to_policy/chosen": 6.692622184753418, "kl/ref_to_policy/mean": 65.45423889160156, "kl/ref_to_policy/rejected": 124.21585845947266, "learning_rate": 6.65899389174876e-06, "logits/chosen": -1.3163331747055054, "logits/rejected": -0.9640593528747559, "logps/chosen": -1140.7393798828125, "logps/rejected": -1251.3057861328125, "loss": 2.8817, "nll_loss": 0.6230954527854919, "rewards/accuracies": 0.75, "rewards/chosen": -0.06692621111869812, "rewards/margins": 1.1752324104309082, "rewards/rejected": -1.2421586513519287, "step": 317 }, { "epoch": 0.4562410329985653, "grad_norm": 8.96833324432373, "kl/ref_to_policy/chosen": 21.092700958251953, "kl/ref_to_policy/mean": 44.10459518432617, "kl/ref_to_policy/rejected": 67.11648559570312, "learning_rate": 6.635339816587109e-06, "logits/chosen": -0.7624435424804688, "logits/rejected": -0.6264288425445557, "logps/chosen": -2002.7490234375, "logps/rejected": -2056.497314453125, "loss": 5.6578, "nll_loss": 1.2740211486816406, "rewards/accuracies": 0.5, "rewards/chosen": -0.21092700958251953, "rewards/margins": 0.4602378010749817, "rewards/rejected": -0.671164870262146, "step": 318 }, { "epoch": 0.45767575322812054, "grad_norm": 6.199533939361572, "kl/ref_to_policy/chosen": 13.73574447631836, "kl/ref_to_policy/mean": 53.12657928466797, "kl/ref_to_policy/rejected": 92.51742553710938, "learning_rate": 6.611644685885713e-06, "logits/chosen": -1.0482523441314697, "logits/rejected": -0.8647012114524841, "logps/chosen": -1517.04833984375, "logps/rejected": -1597.1346435546875, "loss": 4.255, "nll_loss": 0.9476186037063599, "rewards/accuracies": 0.6875, "rewards/chosen": -0.1373574435710907, "rewards/margins": 0.7878166437149048, "rewards/rejected": -0.9251742362976074, "step": 319 }, { "epoch": 0.45911047345767575, "grad_norm": 5.924163818359375, "kl/ref_to_policy/chosen": 14.936875343322754, "kl/ref_to_policy/mean": 49.192893981933594, "kl/ref_to_policy/rejected": 83.44891357421875, "learning_rate": 6.587909094515663e-06, "logits/chosen": -0.9662331342697144, "logits/rejected": -0.7395594716072083, "logps/chosen": -1633.147705078125, "logps/rejected": -1706.523193359375, "loss": 4.5201, "nll_loss": 1.0057048797607422, "rewards/accuracies": 0.625, "rewards/chosen": -0.1493687629699707, "rewards/margins": 0.6851202845573425, "rewards/rejected": -0.834489107131958, "step": 320 }, { "epoch": 0.45911047345767575, "eval_kl/ref_to_policy/chosen": 5.159896373748779, "eval_kl/ref_to_policy/mean": 46.71841812133789, "eval_kl/ref_to_policy/rejected": 88.27693939208984, "eval_logits/chosen": -1.199634313583374, "eval_logits/rejected": -0.8832560777664185, "eval_logps/chosen": -1529.39599609375, "eval_logps/rejected": -1611.99951171875, "eval_loss": 4.241010665893555, "eval_nll_loss": 0.9483423233032227, "eval_rewards/accuracies": 0.7234042286872864, "eval_rewards/chosen": -0.051598966121673584, "eval_rewards/margins": 0.8311703205108643, "eval_rewards/rejected": -0.8827693462371826, "eval_runtime": 112.5103, "eval_samples_per_second": 3.342, "eval_steps_per_second": 1.671, "step": 320 }, { "epoch": 0.460545193687231, "grad_norm": 24.336105346679688, "kl/ref_to_policy/chosen": 0.7648868560791016, "kl/ref_to_policy/mean": 33.29643249511719, "kl/ref_to_policy/rejected": 65.8279800415039, "learning_rate": 6.564133638363823e-06, "logits/chosen": -1.4279861450195312, "logits/rejected": -1.0014832019805908, "logps/chosen": -1087.874267578125, "logps/rejected": -1149.3944091796875, "loss": 3.3624, "nll_loss": 0.7170210480690002, "rewards/accuracies": 0.75, "rewards/chosen": -0.00764886662364006, "rewards/margins": 0.6506308913230896, "rewards/rejected": -0.6582797765731812, "step": 321 }, { "epoch": 0.4619799139167862, "grad_norm": 7.571940898895264, "kl/ref_to_policy/chosen": 16.154708862304688, "kl/ref_to_policy/mean": 49.02022933959961, "kl/ref_to_policy/rejected": 81.88575744628906, "learning_rate": 6.5403189143178725e-06, "logits/chosen": -1.1058598756790161, "logits/rejected": -0.8875933289527893, "logps/chosen": -1307.08447265625, "logps/rejected": -1380.4853515625, "loss": 4.7516, "nll_loss": 1.0607961416244507, "rewards/accuracies": 0.4375, "rewards/chosen": -0.16154709458351135, "rewards/margins": 0.657310426235199, "rewards/rejected": -0.8188575506210327, "step": 322 }, { "epoch": 0.4634146341463415, "grad_norm": 98.3878402709961, "kl/ref_to_policy/chosen": 34.106632232666016, "kl/ref_to_policy/mean": 60.4587287902832, "kl/ref_to_policy/rejected": 86.81082153320312, "learning_rate": 6.5164655202513135e-06, "logits/chosen": -1.1276358366012573, "logits/rejected": -0.8883501291275024, "logps/chosen": -2058.317626953125, "logps/rejected": -2116.328369140625, "loss": 5.581, "nll_loss": 1.259551763534546, "rewards/accuracies": 0.625, "rewards/chosen": -0.34106630086898804, "rewards/margins": 0.5270418524742126, "rewards/rejected": -0.8681081533432007, "step": 323 }, { "epoch": 0.4648493543758967, "grad_norm": 41.70835876464844, "kl/ref_to_policy/chosen": 4.7934064865112305, "kl/ref_to_policy/mean": 46.996192932128906, "kl/ref_to_policy/rejected": 89.19898986816406, "learning_rate": 6.492574055008474e-06, "logits/chosen": -1.3471020460128784, "logits/rejected": -0.9881587624549866, "logps/chosen": -1367.0980224609375, "logps/rejected": -1450.443115234375, "loss": 4.4865, "nll_loss": 1.0097664594650269, "rewards/accuracies": 0.6875, "rewards/chosen": -0.04793405532836914, "rewards/margins": 0.8440558314323425, "rewards/rejected": -0.8919898271560669, "step": 324 }, { "epoch": 0.46628407460545196, "grad_norm": 6.348065376281738, "kl/ref_to_policy/chosen": 23.49587631225586, "kl/ref_to_policy/mean": 57.27253723144531, "kl/ref_to_policy/rejected": 91.04920196533203, "learning_rate": 6.4686451183894604e-06, "logits/chosen": -0.8702314496040344, "logits/rejected": -0.712561845779419, "logps/chosen": -1791.792724609375, "logps/rejected": -1864.9359130859375, "loss": 5.0878, "nll_loss": 1.146569848060608, "rewards/accuracies": 0.5625, "rewards/chosen": -0.23495876789093018, "rewards/margins": 0.6755332946777344, "rewards/rejected": -0.9104920625686646, "step": 325 }, { "epoch": 0.46771879483500717, "grad_norm": 6.056787967681885, "kl/ref_to_policy/chosen": 0.06308650970458984, "kl/ref_to_policy/mean": 40.791114807128906, "kl/ref_to_policy/rejected": 81.5191421508789, "learning_rate": 6.444679311135112e-06, "logits/chosen": -0.9944266080856323, "logits/rejected": -0.8216566443443298, "logps/chosen": -1573.3131103515625, "logps/rejected": -1656.0269775390625, "loss": 4.8039, "nll_loss": 1.085984230041504, "rewards/accuracies": 0.8125, "rewards/chosen": -0.0006308667361736298, "rewards/margins": 0.8145604729652405, "rewards/rejected": -0.815191388130188, "step": 326 }, { "epoch": 0.46915351506456243, "grad_norm": 18.08167266845703, "kl/ref_to_policy/chosen": 17.494096755981445, "kl/ref_to_policy/mean": 55.628822326660156, "kl/ref_to_policy/rejected": 93.76355743408203, "learning_rate": 6.420677234911908e-06, "logits/chosen": -0.990520179271698, "logits/rejected": -0.8254861831665039, "logps/chosen": -1213.376953125, "logps/rejected": -1291.6170654296875, "loss": 4.3122, "nll_loss": 0.9562997221946716, "rewards/accuracies": 0.5, "rewards/chosen": -0.17494095861911774, "rewards/margins": 0.7626946568489075, "rewards/rejected": -0.9376355409622192, "step": 327 }, { "epoch": 0.47058823529411764, "grad_norm": 6.720556259155273, "kl/ref_to_policy/chosen": 11.14324951171875, "kl/ref_to_policy/mean": 51.22264099121094, "kl/ref_to_policy/rejected": 91.3020248413086, "learning_rate": 6.396639492296868e-06, "logits/chosen": -0.9332765936851501, "logits/rejected": -0.7677888870239258, "logps/chosen": -1258.229248046875, "logps/rejected": -1340.016845703125, "loss": 4.7664, "nll_loss": 1.0751161575317383, "rewards/accuracies": 0.6875, "rewards/chosen": -0.11143249273300171, "rewards/margins": 0.8015878200531006, "rewards/rejected": -0.9130203127861023, "step": 328 }, { "epoch": 0.4720229555236729, "grad_norm": 6.143362045288086, "kl/ref_to_policy/chosen": 6.509381294250488, "kl/ref_to_policy/mean": 41.17169952392578, "kl/ref_to_policy/rejected": 75.83401489257812, "learning_rate": 6.372566686762427e-06, "logits/chosen": -1.0576491355895996, "logits/rejected": -0.8472113609313965, "logps/chosen": -1463.572021484375, "logps/rejected": -1538.162109375, "loss": 4.6124, "nll_loss": 1.0281378030776978, "rewards/accuracies": 0.5625, "rewards/chosen": -0.06509380042552948, "rewards/margins": 0.6932463049888611, "rewards/rejected": -0.7583401799201965, "step": 329 }, { "epoch": 0.4734576757532281, "grad_norm": 5.784033298492432, "kl/ref_to_policy/chosen": 16.18281364440918, "kl/ref_to_policy/mean": 57.78590393066406, "kl/ref_to_policy/rejected": 99.38899230957031, "learning_rate": 6.348459422661276e-06, "logits/chosen": -1.0161361694335938, "logits/rejected": -0.9008207321166992, "logps/chosen": -1855.620361328125, "logps/rejected": -1940.3773193359375, "loss": 4.6772, "nll_loss": 1.056098222732544, "rewards/accuracies": 0.875, "rewards/chosen": -0.16182813048362732, "rewards/margins": 0.8320618271827698, "rewards/rejected": -0.9938898682594299, "step": 330 }, { "epoch": 0.4734576757532281, "eval_kl/ref_to_policy/chosen": 4.819427013397217, "eval_kl/ref_to_policy/mean": 49.16254425048828, "eval_kl/ref_to_policy/rejected": 93.50566101074219, "eval_logits/chosen": -1.1376440525054932, "eval_logits/rejected": -0.9213968515396118, "eval_logps/chosen": -1529.0555419921875, "eval_logps/rejected": -1617.228271484375, "eval_loss": 4.206663608551025, "eval_nll_loss": 0.9417508840560913, "eval_rewards/accuracies": 0.6808510422706604, "eval_rewards/chosen": -0.048194266855716705, "eval_rewards/margins": 0.8868623971939087, "eval_rewards/rejected": -0.9350566267967224, "eval_runtime": 111.5513, "eval_samples_per_second": 3.371, "eval_steps_per_second": 1.685, "step": 330 }, { "epoch": 0.4748923959827834, "grad_norm": 20.26960563659668, "kl/ref_to_policy/chosen": -14.065112113952637, "kl/ref_to_policy/mean": 53.41499710083008, "kl/ref_to_policy/rejected": 120.89510345458984, "learning_rate": 6.324318305211201e-06, "logits/chosen": -1.7075858116149902, "logits/rejected": -1.2912442684173584, "logps/chosen": -454.1224060058594, "logps/rejected": -576.9459228515625, "loss": 1.526, "nll_loss": 0.3065105974674225, "rewards/accuracies": 0.8125, "rewards/chosen": 0.14065112173557281, "rewards/margins": 1.349602222442627, "rewards/rejected": -1.2089509963989258, "step": 331 }, { "epoch": 0.4763271162123386, "grad_norm": 5.032298564910889, "kl/ref_to_policy/chosen": -1.499843716621399, "kl/ref_to_policy/mean": 58.159027099609375, "kl/ref_to_policy/rejected": 117.81790161132812, "learning_rate": 6.300143940479881e-06, "logits/chosen": -1.3354206085205078, "logits/rejected": -1.1005311012268066, "logps/chosen": -1274.2579345703125, "logps/rejected": -1387.91162109375, "loss": 3.2789, "nll_loss": 0.7323300838470459, "rewards/accuracies": 0.75, "rewards/chosen": 0.014998444356024265, "rewards/margins": 1.193177580833435, "rewards/rejected": -1.1781789064407349, "step": 332 }, { "epoch": 0.47776183644189385, "grad_norm": 5.6181111335754395, "kl/ref_to_policy/chosen": 10.11408805847168, "kl/ref_to_policy/mean": 61.761783599853516, "kl/ref_to_policy/rejected": 113.40946960449219, "learning_rate": 6.275936935369675e-06, "logits/chosen": -1.2420992851257324, "logits/rejected": -1.0597316026687622, "logps/chosen": -1544.832763671875, "logps/rejected": -1649.3392333984375, "loss": 3.8279, "nll_loss": 0.8565700054168701, "rewards/accuracies": 0.625, "rewards/chosen": -0.10114088654518127, "rewards/margins": 1.0329538583755493, "rewards/rejected": -1.1340947151184082, "step": 333 }, { "epoch": 0.47919655667144906, "grad_norm": 18.349193572998047, "kl/ref_to_policy/chosen": -0.015684127807617188, "kl/ref_to_policy/mean": 46.75641632080078, "kl/ref_to_policy/rejected": 93.52851867675781, "learning_rate": 6.251697897602384e-06, "logits/chosen": -1.2628589868545532, "logits/rejected": -1.0226833820343018, "logps/chosen": -1122.0076904296875, "logps/rejected": -1213.714111328125, "loss": 3.5175, "nll_loss": 0.7739236354827881, "rewards/accuracies": 0.6875, "rewards/chosen": 0.00015683472156524658, "rewards/margins": 0.9354419708251953, "rewards/rejected": -0.935285210609436, "step": 334 }, { "epoch": 0.4806312769010043, "grad_norm": 7.275036334991455, "kl/ref_to_policy/chosen": 4.427461624145508, "kl/ref_to_policy/mean": 40.21718978881836, "kl/ref_to_policy/rejected": 76.00691986083984, "learning_rate": 6.227427435703997e-06, "logits/chosen": -0.7970739603042603, "logits/rejected": -0.754729688167572, "logps/chosen": -1370.6851806640625, "logps/rejected": -1445.6710205078125, "loss": 4.937, "nll_loss": 1.1112356185913086, "rewards/accuracies": 0.6875, "rewards/chosen": -0.04427460953593254, "rewards/margins": 0.715794563293457, "rewards/rejected": -0.7600691914558411, "step": 335 }, { "epoch": 0.48206599713055953, "grad_norm": 5.571835994720459, "kl/ref_to_policy/chosen": 15.534433364868164, "kl/ref_to_policy/mean": 51.980255126953125, "kl/ref_to_policy/rejected": 88.42607116699219, "learning_rate": 6.203126158989411e-06, "logits/chosen": -0.8202882409095764, "logits/rejected": -0.7550958395004272, "logps/chosen": -2065.518798828125, "logps/rejected": -2142.59375, "loss": 5.0116, "nll_loss": 1.1327928304672241, "rewards/accuracies": 0.8125, "rewards/chosen": -0.15534433722496033, "rewards/margins": 0.7289164066314697, "rewards/rejected": -0.8842607736587524, "step": 336 }, { "epoch": 0.4835007173601148, "grad_norm": 7.996979713439941, "kl/ref_to_policy/chosen": 7.556571006774902, "kl/ref_to_policy/mean": 55.010406494140625, "kl/ref_to_policy/rejected": 102.46424865722656, "learning_rate": 6.178794677547138e-06, "logits/chosen": -1.0681945085525513, "logits/rejected": -0.9394057989120483, "logps/chosen": -1345.0330810546875, "logps/rejected": -1440.2435302734375, "loss": 4.1128, "nll_loss": 0.9217970967292786, "rewards/accuracies": 0.625, "rewards/chosen": -0.07556569576263428, "rewards/margins": 0.9490767121315002, "rewards/rejected": -1.0246425867080688, "step": 337 }, { "epoch": 0.48493543758967, "grad_norm": 4.4268479347229, "kl/ref_to_policy/chosen": 22.778844833374023, "kl/ref_to_policy/mean": 70.65411376953125, "kl/ref_to_policy/rejected": 118.52938842773438, "learning_rate": 6.154433602223979e-06, "logits/chosen": -1.2267556190490723, "logits/rejected": -1.0579419136047363, "logps/chosen": -1514.94287109375, "logps/rejected": -1610.4765625, "loss": 3.7095, "nll_loss": 0.8092830181121826, "rewards/accuracies": 0.6875, "rewards/chosen": -0.22778841853141785, "rewards/margins": 0.9575052857398987, "rewards/rejected": -1.1852937936782837, "step": 338 }, { "epoch": 0.48637015781922527, "grad_norm": 10.477421760559082, "kl/ref_to_policy/chosen": 3.841935873031616, "kl/ref_to_policy/mean": 40.51751708984375, "kl/ref_to_policy/rejected": 77.19309997558594, "learning_rate": 6.130043544609707e-06, "logits/chosen": -1.055694580078125, "logits/rejected": -0.906654953956604, "logps/chosen": -1518.3497314453125, "logps/rejected": -1595.003662109375, "loss": 4.7496, "nll_loss": 1.0537593364715576, "rewards/accuracies": 0.6875, "rewards/chosen": -0.03841935470700264, "rewards/margins": 0.7335115671157837, "rewards/rejected": -0.7719309329986572, "step": 339 }, { "epoch": 0.4878048780487805, "grad_norm": 49.8398323059082, "kl/ref_to_policy/chosen": 3.8557887077331543, "kl/ref_to_policy/mean": 48.210777282714844, "kl/ref_to_policy/rejected": 92.56575775146484, "learning_rate": 6.105625117021692e-06, "logits/chosen": -1.1998947858810425, "logits/rejected": -1.0364854335784912, "logps/chosen": -1352.8167724609375, "logps/rejected": -1444.9368896484375, "loss": 4.0749, "nll_loss": 0.9046375751495361, "rewards/accuracies": 0.625, "rewards/chosen": -0.03855788707733154, "rewards/margins": 0.8870996236801147, "rewards/rejected": -0.9256575107574463, "step": 340 }, { "epoch": 0.4878048780487805, "eval_kl/ref_to_policy/chosen": 4.092605113983154, "eval_kl/ref_to_policy/mean": 48.87360763549805, "eval_kl/ref_to_policy/rejected": 93.65460968017578, "eval_logits/chosen": -1.3059179782867432, "eval_logits/rejected": -1.026095986366272, "eval_logps/chosen": -1528.32861328125, "eval_logps/rejected": -1617.377197265625, "eval_loss": 4.225151538848877, "eval_nll_loss": 0.9482244849205017, "eval_rewards/accuracies": 0.75, "eval_rewards/chosen": -0.04092605412006378, "eval_rewards/margins": 0.8956199884414673, "eval_rewards/rejected": -0.9365460872650146, "eval_runtime": 112.1344, "eval_samples_per_second": 3.353, "eval_steps_per_second": 1.677, "step": 340 }, { "epoch": 0.48923959827833574, "grad_norm": 5.7393975257873535, "kl/ref_to_policy/chosen": 5.704531669616699, "kl/ref_to_policy/mean": 54.07828140258789, "kl/ref_to_policy/rejected": 102.4520263671875, "learning_rate": 6.0811789324895365e-06, "logits/chosen": -1.1647731065750122, "logits/rejected": -0.9435247182846069, "logps/chosen": -1441.1810302734375, "logps/rejected": -1538.5341796875, "loss": 4.2344, "nll_loss": 0.9540988802909851, "rewards/accuracies": 0.6875, "rewards/chosen": -0.05704530328512192, "rewards/margins": 0.9674749374389648, "rewards/rejected": -1.0245201587677002, "step": 341 }, { "epoch": 0.49067431850789095, "grad_norm": 10.273384094238281, "kl/ref_to_policy/chosen": 32.831626892089844, "kl/ref_to_policy/mean": 56.326045989990234, "kl/ref_to_policy/rejected": 79.82046508789062, "learning_rate": 6.056705604739696e-06, "logits/chosen": -0.9623711705207825, "logits/rejected": -0.743865430355072, "logps/chosen": -2233.83935546875, "logps/rejected": -2292.25390625, "loss": 5.3036, "nll_loss": 1.184647560119629, "rewards/accuracies": 0.375, "rewards/chosen": -0.32831627130508423, "rewards/margins": 0.4698883593082428, "rewards/rejected": -0.7982045412063599, "step": 342 }, { "epoch": 0.4921090387374462, "grad_norm": 6.8998188972473145, "kl/ref_to_policy/chosen": 23.94830894470215, "kl/ref_to_policy/mean": 57.75409698486328, "kl/ref_to_policy/rejected": 91.55988311767578, "learning_rate": 6.032205748180054e-06, "logits/chosen": -1.1260114908218384, "logits/rejected": -0.9394336342811584, "logps/chosen": -1900.8310546875, "logps/rejected": -1974.6126708984375, "loss": 5.0627, "nll_loss": 1.1414772272109985, "rewards/accuracies": 0.6875, "rewards/chosen": -0.23948310315608978, "rewards/margins": 0.6761156916618347, "rewards/rejected": -0.9155988693237305, "step": 343 }, { "epoch": 0.4935437589670014, "grad_norm": 9.76743221282959, "kl/ref_to_policy/chosen": 25.8916015625, "kl/ref_to_policy/mean": 37.83639907836914, "kl/ref_to_policy/rejected": 49.781192779541016, "learning_rate": 6.0076799778845105e-06, "logits/chosen": -0.9068439602851868, "logits/rejected": -0.7654686570167542, "logps/chosen": -1643.1260986328125, "logps/rejected": -1679.5330810546875, "loss": 5.516, "nll_loss": 1.2235770225524902, "rewards/accuracies": 0.4375, "rewards/chosen": -0.2589160203933716, "rewards/margins": 0.23889587819576263, "rewards/rejected": -0.4978119134902954, "step": 344 }, { "epoch": 0.4949784791965567, "grad_norm": 5.689603805541992, "kl/ref_to_policy/chosen": 6.654494285583496, "kl/ref_to_policy/mean": 51.25757598876953, "kl/ref_to_policy/rejected": 95.86064910888672, "learning_rate": 5.983128909577532e-06, "logits/chosen": -1.6008415222167969, "logits/rejected": -1.290475845336914, "logps/chosen": -1588.654541015625, "logps/rejected": -1675.2142333984375, "loss": 3.8356, "nll_loss": 0.8512968420982361, "rewards/accuracies": 0.75, "rewards/chosen": -0.06654495000839233, "rewards/margins": 0.8920615911483765, "rewards/rejected": -0.9586065411567688, "step": 345 }, { "epoch": 0.4964131994261119, "grad_norm": 97.76753234863281, "kl/ref_to_policy/chosen": -8.360301971435547, "kl/ref_to_policy/mean": 34.51365280151367, "kl/ref_to_policy/rejected": 77.38761138916016, "learning_rate": 5.958553159618693e-06, "logits/chosen": -2.0033507347106934, "logits/rejected": -1.5053941011428833, "logps/chosen": -1057.6865234375, "logps/rejected": -1134.2177734375, "loss": 2.6321, "nll_loss": 0.5509340763092041, "rewards/accuracies": 0.8125, "rewards/chosen": 0.08360303193330765, "rewards/margins": 0.8574790358543396, "rewards/rejected": -0.7738760113716125, "step": 346 }, { "epoch": 0.49784791965566716, "grad_norm": 21.505842208862305, "kl/ref_to_policy/chosen": 9.268850326538086, "kl/ref_to_policy/mean": 40.20740509033203, "kl/ref_to_policy/rejected": 71.14596557617188, "learning_rate": 5.933953344987215e-06, "logits/chosen": -1.2347581386566162, "logits/rejected": -0.9891963005065918, "logps/chosen": -1600.737548828125, "logps/rejected": -1666.35693359375, "loss": 4.6509, "nll_loss": 1.0364649295806885, "rewards/accuracies": 0.6875, "rewards/chosen": -0.09268851578235626, "rewards/margins": 0.6187711358070374, "rewards/rejected": -0.7114596366882324, "step": 347 }, { "epoch": 0.49928263988522237, "grad_norm": 6.035835266113281, "kl/ref_to_policy/chosen": 3.3488306999206543, "kl/ref_to_policy/mean": 45.39918518066406, "kl/ref_to_policy/rejected": 87.44953918457031, "learning_rate": 5.9093300832664625e-06, "logits/chosen": -1.1150873899459839, "logits/rejected": -0.9633195996284485, "logps/chosen": -1516.9737548828125, "logps/rejected": -1605.162353515625, "loss": 4.5192, "nll_loss": 1.015221118927002, "rewards/accuracies": 0.625, "rewards/chosen": -0.03348831087350845, "rewards/margins": 0.8410071134567261, "rewards/rejected": -0.8744954466819763, "step": 348 }, { "epoch": 0.5007173601147776, "grad_norm": 5.899621963500977, "kl/ref_to_policy/chosen": 16.851417541503906, "kl/ref_to_policy/mean": 70.32414245605469, "kl/ref_to_policy/rejected": 123.79685974121094, "learning_rate": 5.8846839926284435e-06, "logits/chosen": -1.39223313331604, "logits/rejected": -1.2008764743804932, "logps/chosen": -1901.413818359375, "logps/rejected": -2009.0975341796875, "loss": 3.91, "nll_loss": 0.8796438574790955, "rewards/accuracies": 0.625, "rewards/chosen": -0.1685141921043396, "rewards/margins": 1.069454312324524, "rewards/rejected": -1.2379684448242188, "step": 349 }, { "epoch": 0.5021520803443329, "grad_norm": 7.023991107940674, "kl/ref_to_policy/chosen": -5.308874130249023, "kl/ref_to_policy/mean": 55.58097839355469, "kl/ref_to_policy/rejected": 116.47084045410156, "learning_rate": 5.860015691818292e-06, "logits/chosen": -1.4385908842086792, "logits/rejected": -1.3226823806762695, "logps/chosen": -1131.019287109375, "logps/rejected": -1249.4415283203125, "loss": 3.031, "nll_loss": 0.6706801652908325, "rewards/accuracies": 0.75, "rewards/chosen": 0.053088754415512085, "rewards/margins": 1.2177971601486206, "rewards/rejected": -1.1647083759307861, "step": 350 }, { "epoch": 0.5021520803443329, "eval_kl/ref_to_policy/chosen": 9.303476333618164, "eval_kl/ref_to_policy/mean": 56.63798141479492, "eval_kl/ref_to_policy/rejected": 103.97250366210938, "eval_logits/chosen": -1.1736336946487427, "eval_logits/rejected": -1.1213321685791016, "eval_logps/chosen": -1533.53955078125, "eval_logps/rejected": -1627.6951904296875, "eval_loss": 4.19527006149292, "eval_nll_loss": 0.9383165836334229, "eval_rewards/accuracies": 0.7180851101875305, "eval_rewards/chosen": -0.09303475171327591, "eval_rewards/margins": 0.9466902017593384, "eval_rewards/rejected": -1.0397248268127441, "eval_runtime": 111.5602, "eval_samples_per_second": 3.37, "eval_steps_per_second": 1.685, "step": 350 }, { "epoch": 0.503586800573888, "grad_norm": 18.217313766479492, "kl/ref_to_policy/chosen": 40.13957595825195, "kl/ref_to_policy/mean": 74.30939483642578, "kl/ref_to_policy/rejected": 108.47921752929688, "learning_rate": 5.835325800138736e-06, "logits/chosen": -0.928200900554657, "logits/rejected": -0.8896225690841675, "logps/chosen": -2130.9990234375, "logps/rejected": -2208.047607421875, "loss": 5.0844, "nll_loss": 1.1431514024734497, "rewards/accuracies": 0.4375, "rewards/chosen": -0.4013957679271698, "rewards/margins": 0.6833963394165039, "rewards/rejected": -1.084792137145996, "step": 351 }, { "epoch": 0.5050215208034433, "grad_norm": 37.48910140991211, "kl/ref_to_policy/chosen": 32.86738204956055, "kl/ref_to_policy/mean": 69.92405700683594, "kl/ref_to_policy/rejected": 106.98074340820312, "learning_rate": 5.810614937434537e-06, "logits/chosen": -1.1754424571990967, "logits/rejected": -1.0910226106643677, "logps/chosen": -1446.1719970703125, "logps/rejected": -1521.604736328125, "loss": 3.6892, "nll_loss": 0.7761685252189636, "rewards/accuracies": 0.625, "rewards/chosen": -0.3286738991737366, "rewards/margins": 0.7411335110664368, "rewards/rejected": -1.0698074102401733, "step": 352 }, { "epoch": 0.5064562410329986, "grad_norm": 9.786066055297852, "kl/ref_to_policy/chosen": 7.768579483032227, "kl/ref_to_policy/mean": 63.650550842285156, "kl/ref_to_policy/rejected": 119.53252410888672, "learning_rate": 5.78588372407695e-06, "logits/chosen": -1.4095731973648071, "logits/rejected": -1.3450546264648438, "logps/chosen": -1012.9483032226562, "logps/rejected": -1120.0189208984375, "loss": 2.8798, "nll_loss": 0.6132818460464478, "rewards/accuracies": 0.8125, "rewards/chosen": -0.07768578827381134, "rewards/margins": 1.117639422416687, "rewards/rejected": -1.195325255393982, "step": 353 }, { "epoch": 0.5078909612625538, "grad_norm": 7.9153852462768555, "kl/ref_to_policy/chosen": 22.34161949157715, "kl/ref_to_policy/mean": 46.65962600708008, "kl/ref_to_policy/rejected": 70.97763061523438, "learning_rate": 5.761132780948132e-06, "logits/chosen": -0.6674848794937134, "logits/rejected": -0.5817829370498657, "logps/chosen": -2112.958984375, "logps/rejected": -2172.1416015625, "loss": 6.356, "nll_loss": 1.4518743753433228, "rewards/accuracies": 0.6875, "rewards/chosen": -0.22341617941856384, "rewards/margins": 0.4863601326942444, "rewards/rejected": -0.7097762823104858, "step": 354 }, { "epoch": 0.509325681492109, "grad_norm": 9.255871772766113, "kl/ref_to_policy/chosen": 14.151511192321777, "kl/ref_to_policy/mean": 62.019474029541016, "kl/ref_to_policy/rejected": 109.88743591308594, "learning_rate": 5.736362729425558e-06, "logits/chosen": -1.1482772827148438, "logits/rejected": -0.9748789072036743, "logps/chosen": -1586.604736328125, "logps/rejected": -1684.6470947265625, "loss": 4.2881, "nll_loss": 0.9678654074668884, "rewards/accuracies": 0.75, "rewards/chosen": -0.1415151059627533, "rewards/margins": 0.957359254360199, "rewards/rejected": -1.0988743305206299, "step": 355 }, { "epoch": 0.5107604017216643, "grad_norm": 6.694395065307617, "kl/ref_to_policy/chosen": 22.336395263671875, "kl/ref_to_policy/mean": 41.56150817871094, "kl/ref_to_policy/rejected": 60.78662109375, "learning_rate": 5.711574191366427e-06, "logits/chosen": -0.6844134330749512, "logits/rejected": -0.551520049571991, "logps/chosen": -2800.750732421875, "logps/rejected": -2851.45068359375, "loss": 6.416, "nll_loss": 1.458146095275879, "rewards/accuracies": 0.5625, "rewards/chosen": -0.22336392104625702, "rewards/margins": 0.3845022916793823, "rewards/rejected": -0.6078662276268005, "step": 356 }, { "epoch": 0.5121951219512195, "grad_norm": 5.784806728363037, "kl/ref_to_policy/chosen": 15.293659210205078, "kl/ref_to_policy/mean": 54.83060073852539, "kl/ref_to_policy/rejected": 94.36754608154297, "learning_rate": 5.686767789092041e-06, "logits/chosen": -0.9030659198760986, "logits/rejected": -0.8151221871376038, "logps/chosen": -1920.3953857421875, "logps/rejected": -2002.38330078125, "loss": 5.2775, "nll_loss": 1.2045488357543945, "rewards/accuracies": 0.9375, "rewards/chosen": -0.15293657779693604, "rewards/margins": 0.7907388210296631, "rewards/rejected": -0.9436755180358887, "step": 357 }, { "epoch": 0.5136298421807748, "grad_norm": 164.99282836914062, "kl/ref_to_policy/chosen": 0.47776365280151367, "kl/ref_to_policy/mean": 49.71124267578125, "kl/ref_to_policy/rejected": 98.9447250366211, "learning_rate": 5.661944145372193e-06, "logits/chosen": -1.3739523887634277, "logits/rejected": -1.239046573638916, "logps/chosen": -1374.790283203125, "logps/rejected": -1471.8763427734375, "loss": 3.8581, "nll_loss": 0.8603375554084778, "rewards/accuracies": 0.75, "rewards/chosen": -0.004777628928422928, "rewards/margins": 0.9846695065498352, "rewards/rejected": -0.9894472360610962, "step": 358 }, { "epoch": 0.5150645624103299, "grad_norm": 183.15687561035156, "kl/ref_to_policy/chosen": 0.6292085647583008, "kl/ref_to_policy/mean": 44.48313903808594, "kl/ref_to_policy/rejected": 88.33707427978516, "learning_rate": 5.637103883409525e-06, "logits/chosen": -1.635554552078247, "logits/rejected": -1.4607982635498047, "logps/chosen": -958.496337890625, "logps/rejected": -1042.9815673828125, "loss": 3.7379, "nll_loss": 0.8257752656936646, "rewards/accuracies": 0.8125, "rewards/chosen": -0.006292089819908142, "rewards/margins": 0.8770785927772522, "rewards/rejected": -0.8833706378936768, "step": 359 }, { "epoch": 0.5164992826398852, "grad_norm": 21.513031005859375, "kl/ref_to_policy/chosen": 4.595380783081055, "kl/ref_to_policy/mean": 53.00019073486328, "kl/ref_to_policy/rejected": 101.40499877929688, "learning_rate": 5.612247626823878e-06, "logits/chosen": -1.5863902568817139, "logits/rejected": -1.3630592823028564, "logps/chosen": -1126.6422119140625, "logps/rejected": -1221.0689697265625, "loss": 3.1794, "nll_loss": 0.6935610175132751, "rewards/accuracies": 0.875, "rewards/chosen": -0.04595380648970604, "rewards/margins": 0.968096137046814, "rewards/rejected": -1.0140498876571655, "step": 360 }, { "epoch": 0.5164992826398852, "eval_kl/ref_to_policy/chosen": 1.7497131824493408, "eval_kl/ref_to_policy/mean": 50.13136672973633, "eval_kl/ref_to_policy/rejected": 98.51299285888672, "eval_logits/chosen": -1.3035045862197876, "eval_logits/rejected": -1.134438395500183, "eval_logps/chosen": -1525.9857177734375, "eval_logps/rejected": -1622.235595703125, "eval_loss": 4.241216659545898, "eval_nll_loss": 0.9567817449569702, "eval_rewards/accuracies": 0.8031914830207825, "eval_rewards/chosen": -0.01749713160097599, "eval_rewards/margins": 0.9676329493522644, "eval_rewards/rejected": -0.9851300716400146, "eval_runtime": 111.4365, "eval_samples_per_second": 3.374, "eval_steps_per_second": 1.687, "step": 360 }, { "epoch": 0.5179340028694405, "grad_norm": 5.016998291015625, "kl/ref_to_policy/chosen": 19.36288070678711, "kl/ref_to_policy/mean": 61.91156768798828, "kl/ref_to_policy/rejected": 104.46025085449219, "learning_rate": 5.587375999636645e-06, "logits/chosen": -1.0544835329055786, "logits/rejected": -0.9775680303573608, "logps/chosen": -2076.47216796875, "logps/rejected": -2166.507080078125, "loss": 4.3779, "nll_loss": 0.9816074967384338, "rewards/accuracies": 0.6875, "rewards/chosen": -0.19362880289554596, "rewards/margins": 0.8509737253189087, "rewards/rejected": -1.0446025133132935, "step": 361 }, { "epoch": 0.5193687230989957, "grad_norm": 45.716896057128906, "kl/ref_to_policy/chosen": 12.134966850280762, "kl/ref_to_policy/mean": 40.32981872558594, "kl/ref_to_policy/rejected": 68.52466583251953, "learning_rate": 5.562489626255104e-06, "logits/chosen": -0.8069714307785034, "logits/rejected": -0.660819947719574, "logps/chosen": -1930.7755126953125, "logps/rejected": -1993.5599365234375, "loss": 5.5343, "nll_loss": 1.2536870241165161, "rewards/accuracies": 0.9375, "rewards/chosen": -0.12134966254234314, "rewards/margins": 0.5638970136642456, "rewards/rejected": -0.6852466464042664, "step": 362 }, { "epoch": 0.5208034433285509, "grad_norm": 5.289909362792969, "kl/ref_to_policy/chosen": -2.545130729675293, "kl/ref_to_policy/mean": 46.492897033691406, "kl/ref_to_policy/rejected": 95.53092193603516, "learning_rate": 5.5375891314567335e-06, "logits/chosen": -1.1584421396255493, "logits/rejected": -1.1341710090637207, "logps/chosen": -1217.650146484375, "logps/rejected": -1319.191162109375, "loss": 3.7421, "nll_loss": 0.832006573677063, "rewards/accuracies": 0.6875, "rewards/chosen": 0.02545131742954254, "rewards/margins": 0.9807604551315308, "rewards/rejected": -0.9553091526031494, "step": 363 }, { "epoch": 0.5222381635581061, "grad_norm": 4.64686918258667, "kl/ref_to_policy/chosen": 6.898123264312744, "kl/ref_to_policy/mean": 69.71121215820312, "kl/ref_to_policy/rejected": 132.5242919921875, "learning_rate": 5.512675140373537e-06, "logits/chosen": -1.386826992034912, "logits/rejected": -1.3713603019714355, "logps/chosen": -1175.413818359375, "logps/rejected": -1292.8739013671875, "loss": 2.9345, "nll_loss": 0.6485722661018372, "rewards/accuracies": 0.8125, "rewards/chosen": -0.06898123025894165, "rewards/margins": 1.2562618255615234, "rewards/rejected": -1.3252428770065308, "step": 364 }, { "epoch": 0.5236728837876614, "grad_norm": 6.243799686431885, "kl/ref_to_policy/chosen": 11.483375549316406, "kl/ref_to_policy/mean": 43.50446319580078, "kl/ref_to_policy/rejected": 75.52554321289062, "learning_rate": 5.487748278476342e-06, "logits/chosen": -0.827081561088562, "logits/rejected": -0.8129632472991943, "logps/chosen": -1752.8896484375, "logps/rejected": -1824.974853515625, "loss": 5.3837, "nll_loss": 1.218531847000122, "rewards/accuracies": 0.625, "rewards/chosen": -0.11483374238014221, "rewards/margins": 0.6404216289520264, "rewards/rejected": -0.7552554607391357, "step": 365 }, { "epoch": 0.5251076040172167, "grad_norm": 6.064340114593506, "kl/ref_to_policy/chosen": 11.58596420288086, "kl/ref_to_policy/mean": 38.47605895996094, "kl/ref_to_policy/rejected": 65.36614990234375, "learning_rate": 5.462809171559104e-06, "logits/chosen": -0.7301549911499023, "logits/rejected": -0.7482314109802246, "logps/chosen": -2434.828125, "logps/rejected": -2498.7490234375, "loss": 6.1516, "nll_loss": 1.4041728973388672, "rewards/accuracies": 0.8125, "rewards/chosen": -0.11585964262485504, "rewards/margins": 0.5378019213676453, "rewards/rejected": -0.6536614894866943, "step": 366 }, { "epoch": 0.5265423242467718, "grad_norm": 16.156448364257812, "kl/ref_to_policy/chosen": 23.099042892456055, "kl/ref_to_policy/mean": 53.36954879760742, "kl/ref_to_policy/rejected": 83.64006042480469, "learning_rate": 5.437858445723191e-06, "logits/chosen": -0.9227613806724548, "logits/rejected": -0.9586368799209595, "logps/chosen": -1758.39599609375, "logps/rejected": -1826.22998046875, "loss": 4.7395, "nll_loss": 1.0392786264419556, "rewards/accuracies": 0.5, "rewards/chosen": -0.2309904545545578, "rewards/margins": 0.6054101586341858, "rewards/rejected": -0.8364006280899048, "step": 367 }, { "epoch": 0.5279770444763271, "grad_norm": 3.976388931274414, "kl/ref_to_policy/chosen": -10.59432601928711, "kl/ref_to_policy/mean": 58.2822380065918, "kl/ref_to_policy/rejected": 127.15880584716797, "learning_rate": 5.412896727361663e-06, "logits/chosen": -1.524924397468567, "logits/rejected": -1.4996705055236816, "logps/chosen": -1139.1400146484375, "logps/rejected": -1271.3795166015625, "loss": 2.6784, "nll_loss": 0.5929587483406067, "rewards/accuracies": 0.8125, "rewards/chosen": 0.1059432327747345, "rewards/margins": 1.3775312900543213, "rewards/rejected": -1.2715880870819092, "step": 368 }, { "epoch": 0.5294117647058824, "grad_norm": 7.52467155456543, "kl/ref_to_policy/chosen": 11.7392578125, "kl/ref_to_policy/mean": 68.54437255859375, "kl/ref_to_policy/rejected": 125.3494873046875, "learning_rate": 5.387924643143553e-06, "logits/chosen": -1.3135604858398438, "logits/rejected": -1.2801151275634766, "logps/chosen": -1102.3720703125, "logps/rejected": -1213.0584716796875, "loss": 3.7578, "nll_loss": 0.8452622890472412, "rewards/accuracies": 0.8125, "rewards/chosen": -0.11739255487918854, "rewards/margins": 1.136102318763733, "rewards/rejected": -1.2534948587417603, "step": 369 }, { "epoch": 0.5308464849354376, "grad_norm": 5.837443828582764, "kl/ref_to_policy/chosen": -5.705494403839111, "kl/ref_to_policy/mean": 45.27572250366211, "kl/ref_to_policy/rejected": 96.2569351196289, "learning_rate": 5.362942819998131e-06, "logits/chosen": -1.0967882871627808, "logits/rejected": -1.1076215505599976, "logps/chosen": -1688.35693359375, "logps/rejected": -1790.7218017578125, "loss": 4.1384, "nll_loss": 0.9343914985656738, "rewards/accuracies": 0.875, "rewards/chosen": 0.05705494433641434, "rewards/margins": 1.0196243524551392, "rewards/rejected": -0.9625694751739502, "step": 370 }, { "epoch": 0.5308464849354376, "eval_kl/ref_to_policy/chosen": 1.0891292095184326, "eval_kl/ref_to_policy/mean": 52.000030517578125, "eval_kl/ref_to_policy/rejected": 102.91093444824219, "eval_logits/chosen": -1.234020709991455, "eval_logits/rejected": -1.1872740983963013, "eval_logps/chosen": -1525.3250732421875, "eval_logps/rejected": -1626.6334228515625, "eval_loss": 4.172474384307861, "eval_nll_loss": 0.9411702156066895, "eval_rewards/accuracies": 0.813829779624939, "eval_rewards/chosen": -0.010891294106841087, "eval_rewards/margins": 1.0182180404663086, "eval_rewards/rejected": -1.0291093587875366, "eval_runtime": 110.8377, "eval_samples_per_second": 3.392, "eval_steps_per_second": 1.696, "step": 370 }, { "epoch": 0.5322812051649928, "grad_norm": 6.001221179962158, "kl/ref_to_policy/chosen": 33.589881896972656, "kl/ref_to_policy/mean": 78.4565200805664, "kl/ref_to_policy/rejected": 123.32316589355469, "learning_rate": 5.337951885099167e-06, "logits/chosen": -0.9773876070976257, "logits/rejected": -0.9773867130279541, "logps/chosen": -2497.97265625, "logps/rejected": -2589.659423828125, "loss": 4.7124, "nll_loss": 1.0682263374328613, "rewards/accuracies": 0.75, "rewards/chosen": -0.33589887619018555, "rewards/margins": 0.8973326683044434, "rewards/rejected": -1.233231544494629, "step": 371 }, { "epoch": 0.533715925394548, "grad_norm": 239.86520385742188, "kl/ref_to_policy/chosen": -7.88067102432251, "kl/ref_to_policy/mean": 49.546348571777344, "kl/ref_to_policy/rejected": 106.97337341308594, "learning_rate": 5.312952465849173e-06, "logits/chosen": -1.6249310970306396, "logits/rejected": -1.4596364498138428, "logps/chosen": -657.2755737304688, "logps/rejected": -767.1178588867188, "loss": 2.6854, "nll_loss": 0.5795174837112427, "rewards/accuracies": 0.8125, "rewards/chosen": 0.07880672812461853, "rewards/margins": 1.1485404968261719, "rewards/rejected": -1.0697336196899414, "step": 372 }, { "epoch": 0.5351506456241033, "grad_norm": 4.606301307678223, "kl/ref_to_policy/chosen": 0.5979658365249634, "kl/ref_to_policy/mean": 52.73517990112305, "kl/ref_to_policy/rejected": 104.87239837646484, "learning_rate": 5.287945189863676e-06, "logits/chosen": -1.3392421007156372, "logits/rejected": -1.287502646446228, "logps/chosen": -1700.855712890625, "logps/rejected": -1803.6224365234375, "loss": 3.8953, "nll_loss": 0.8764121532440186, "rewards/accuracies": 0.875, "rewards/chosen": -0.005979647859930992, "rewards/margins": 1.0427442789077759, "rewards/rejected": -1.0487239360809326, "step": 373 }, { "epoch": 0.5365853658536586, "grad_norm": 13.558262825012207, "kl/ref_to_policy/chosen": 23.53346061706543, "kl/ref_to_policy/mean": 67.501953125, "kl/ref_to_policy/rejected": 111.47045135498047, "learning_rate": 5.262930684955439e-06, "logits/chosen": -1.1455934047698975, "logits/rejected": -1.088233470916748, "logps/chosen": -2321.416259765625, "logps/rejected": -2411.825927734375, "loss": 4.8865, "nll_loss": 1.1124602556228638, "rewards/accuracies": 0.8125, "rewards/chosen": -0.2353345900774002, "rewards/margins": 0.8793699145317078, "rewards/rejected": -1.1147044897079468, "step": 374 }, { "epoch": 0.5380200860832137, "grad_norm": 4.905727863311768, "kl/ref_to_policy/chosen": 12.58437442779541, "kl/ref_to_policy/mean": 68.90825653076172, "kl/ref_to_policy/rejected": 125.23214721679688, "learning_rate": 5.237909579118713e-06, "logits/chosen": -1.4035656452178955, "logits/rejected": -1.3195676803588867, "logps/chosen": -1497.9046630859375, "logps/rejected": -1610.165771484375, "loss": 3.58, "nll_loss": 0.8014729619026184, "rewards/accuracies": 0.75, "rewards/chosen": -0.12584374845027924, "rewards/margins": 1.1264777183532715, "rewards/rejected": -1.2523213624954224, "step": 375 }, { "epoch": 0.539454806312769, "grad_norm": 4.889274597167969, "kl/ref_to_policy/chosen": 10.468426704406738, "kl/ref_to_policy/mean": 62.35423278808594, "kl/ref_to_policy/rejected": 114.24003601074219, "learning_rate": 5.212882500513462e-06, "logits/chosen": -1.379805564880371, "logits/rejected": -1.305220603942871, "logps/chosen": -1651.7574462890625, "logps/rejected": -1753.5037841796875, "loss": 3.8142, "nll_loss": 0.855253279209137, "rewards/accuracies": 0.875, "rewards/chosen": -0.1046842634677887, "rewards/margins": 1.0377161502838135, "rewards/rejected": -1.1424003839492798, "step": 376 }, { "epoch": 0.5408895265423243, "grad_norm": 6.366039752960205, "kl/ref_to_policy/chosen": 6.980680465698242, "kl/ref_to_policy/mean": 45.35420227050781, "kl/ref_to_policy/rejected": 83.72772979736328, "learning_rate": 5.187850077449604e-06, "logits/chosen": -0.9482517242431641, "logits/rejected": -0.8902103304862976, "logps/chosen": -1843.785400390625, "logps/rejected": -1927.0948486328125, "loss": 5.3341, "nll_loss": 1.2156373262405396, "rewards/accuracies": 0.8125, "rewards/chosen": -0.06980679929256439, "rewards/margins": 0.7674704790115356, "rewards/rejected": -0.8372772932052612, "step": 377 }, { "epoch": 0.5423242467718795, "grad_norm": 5.7003326416015625, "kl/ref_to_policy/chosen": -1.3734694719314575, "kl/ref_to_policy/mean": 56.17259216308594, "kl/ref_to_policy/rejected": 113.7186508178711, "learning_rate": 5.162812938371226e-06, "logits/chosen": -1.4033408164978027, "logits/rejected": -1.3586876392364502, "logps/chosen": -1017.6343383789062, "logps/rejected": -1131.2615966796875, "loss": 3.1098, "nll_loss": 0.6855173110961914, "rewards/accuracies": 0.875, "rewards/chosen": 0.013734695501625538, "rewards/margins": 1.150921106338501, "rewards/rejected": -1.1371865272521973, "step": 378 }, { "epoch": 0.5437589670014347, "grad_norm": 8.612738609313965, "kl/ref_to_policy/chosen": 4.167932510375977, "kl/ref_to_policy/mean": 61.72252655029297, "kl/ref_to_policy/rejected": 119.27711486816406, "learning_rate": 5.137771711840811e-06, "logits/chosen": -1.4963198900222778, "logits/rejected": -1.431056261062622, "logps/chosen": -1290.23828125, "logps/rejected": -1402.732177734375, "loss": 3.4745, "nll_loss": 0.7770085334777832, "rewards/accuracies": 0.8125, "rewards/chosen": -0.041679322719573975, "rewards/margins": 1.1510918140411377, "rewards/rejected": -1.192771077156067, "step": 379 }, { "epoch": 0.5451936872309899, "grad_norm": 5.020176410675049, "kl/ref_to_policy/chosen": -1.0601049661636353, "kl/ref_to_policy/mean": 50.461490631103516, "kl/ref_to_policy/rejected": 101.98309326171875, "learning_rate": 5.112727026523461e-06, "logits/chosen": -1.3709948062896729, "logits/rejected": -1.312089204788208, "logps/chosen": -1447.9241943359375, "logps/rejected": -1551.1724853515625, "loss": 3.9111, "nll_loss": 0.8781472444534302, "rewards/accuracies": 0.8125, "rewards/chosen": 0.010601047426462173, "rewards/margins": 1.030431866645813, "rewards/rejected": -1.0198308229446411, "step": 380 }, { "epoch": 0.5451936872309899, "eval_kl/ref_to_policy/chosen": -0.3715679943561554, "eval_kl/ref_to_policy/mean": 50.92209243774414, "eval_kl/ref_to_policy/rejected": 102.21572875976562, "eval_logits/chosen": -1.3736153841018677, "eval_logits/rejected": -1.2959487438201904, "eval_logps/chosen": -1523.8643798828125, "eval_logps/rejected": -1625.9384765625, "eval_loss": 4.248674392700195, "eval_nll_loss": 0.962543785572052, "eval_rewards/accuracies": 0.8457446694374084, "eval_rewards/chosen": 0.003715681843459606, "eval_rewards/margins": 1.0258729457855225, "eval_rewards/rejected": -1.0221573114395142, "eval_runtime": 112.113, "eval_samples_per_second": 3.354, "eval_steps_per_second": 1.677, "step": 380 }, { "epoch": 0.5466284074605452, "grad_norm": 6.769894123077393, "kl/ref_to_policy/chosen": 7.9261393547058105, "kl/ref_to_policy/mean": 54.344764709472656, "kl/ref_to_policy/rejected": 100.76338958740234, "learning_rate": 5.087679511171113e-06, "logits/chosen": -1.1798062324523926, "logits/rejected": -1.1271846294403076, "logps/chosen": -1547.0042724609375, "logps/rejected": -1641.179443359375, "loss": 4.578, "nll_loss": 1.0382330417633057, "rewards/accuracies": 0.875, "rewards/chosen": -0.07926138490438461, "rewards/margins": 0.9283724427223206, "rewards/rejected": -1.007633924484253, "step": 381 }, { "epoch": 0.5480631276901005, "grad_norm": 5.459246635437012, "kl/ref_to_policy/chosen": 9.306493759155273, "kl/ref_to_policy/mean": 60.99442672729492, "kl/ref_to_policy/rejected": 112.68235778808594, "learning_rate": 5.062629794606748e-06, "logits/chosen": -1.3231124877929688, "logits/rejected": -1.2338792085647583, "logps/chosen": -1272.032958984375, "logps/rejected": -1375.475341796875, "loss": 3.9642, "nll_loss": 0.8923043012619019, "rewards/accuracies": 0.9375, "rewards/chosen": -0.09306493401527405, "rewards/margins": 1.0337586402893066, "rewards/rejected": -1.1268235445022583, "step": 382 }, { "epoch": 0.5494978479196556, "grad_norm": 225.01499938964844, "kl/ref_to_policy/chosen": -1.9321845769882202, "kl/ref_to_policy/mean": 42.442630767822266, "kl/ref_to_policy/rejected": 86.81744384765625, "learning_rate": 5.03757850570861e-06, "logits/chosen": -1.4379554986953735, "logits/rejected": -1.30702543258667, "logps/chosen": -1225.5364990234375, "logps/rejected": -1314.8408203125, "loss": 4.0161, "nll_loss": 0.8962928652763367, "rewards/accuracies": 0.875, "rewards/chosen": 0.01932184398174286, "rewards/margins": 0.887496292591095, "rewards/rejected": -0.8681743144989014, "step": 383 }, { "epoch": 0.5509325681492109, "grad_norm": 5.699438571929932, "kl/ref_to_policy/chosen": 13.276183128356934, "kl/ref_to_policy/mean": 47.65713882446289, "kl/ref_to_policy/rejected": 82.03809356689453, "learning_rate": 5.01252627339442e-06, "logits/chosen": -1.0395678281784058, "logits/rejected": -0.9894018769264221, "logps/chosen": -2282.1123046875, "logps/rejected": -2356.8544921875, "loss": 5.2203, "nll_loss": 1.1830039024353027, "rewards/accuracies": 0.875, "rewards/chosen": -0.13276182115077972, "rewards/margins": 0.6876190304756165, "rewards/rejected": -0.8203808069229126, "step": 384 }, { "epoch": 0.5523672883787661, "grad_norm": 6.316893100738525, "kl/ref_to_policy/chosen": -3.0178375244140625, "kl/ref_to_policy/mean": 44.319549560546875, "kl/ref_to_policy/rejected": 91.65693664550781, "learning_rate": 4.987473726605581e-06, "logits/chosen": -1.2160422801971436, "logits/rejected": -1.1475642919540405, "logps/chosen": -1618.841552734375, "logps/rejected": -1712.211669921875, "loss": 4.7204, "nll_loss": 1.07742178440094, "rewards/accuracies": 1.0, "rewards/chosen": 0.030178368091583252, "rewards/margins": 0.9467476606369019, "rewards/rejected": -0.9165693521499634, "step": 385 }, { "epoch": 0.5538020086083214, "grad_norm": 6.3273820877075195, "kl/ref_to_policy/chosen": -0.8721070289611816, "kl/ref_to_policy/mean": 59.24018096923828, "kl/ref_to_policy/rejected": 119.35247802734375, "learning_rate": 4.9624214942913916e-06, "logits/chosen": -1.454271912574768, "logits/rejected": -1.435474157333374, "logps/chosen": -1475.4090576171875, "logps/rejected": -1590.3759765625, "loss": 3.5842, "nll_loss": 0.8093664646148682, "rewards/accuracies": 0.9375, "rewards/chosen": 0.00872107595205307, "rewards/margins": 1.2022457122802734, "rewards/rejected": -1.193524718284607, "step": 386 }, { "epoch": 0.5552367288378766, "grad_norm": 5.633467674255371, "kl/ref_to_policy/chosen": -8.973152160644531, "kl/ref_to_policy/mean": 43.379825592041016, "kl/ref_to_policy/rejected": 95.73279571533203, "learning_rate": 4.9373702053932534e-06, "logits/chosen": -1.3758502006530762, "logits/rejected": -1.3249030113220215, "logps/chosen": -1045.21435546875, "logps/rejected": -1148.74853515625, "loss": 3.5087, "nll_loss": 0.7789348363876343, "rewards/accuracies": 0.875, "rewards/chosen": 0.08973151445388794, "rewards/margins": 1.0470595359802246, "rewards/rejected": -0.9573280215263367, "step": 387 }, { "epoch": 0.5566714490674318, "grad_norm": 5.732989311218262, "kl/ref_to_policy/chosen": 2.347059726715088, "kl/ref_to_policy/mean": 49.00839614868164, "kl/ref_to_policy/rejected": 95.66972351074219, "learning_rate": 4.912320488828887e-06, "logits/chosen": -1.1681679487228394, "logits/rejected": -1.149032711982727, "logps/chosen": -1780.7298583984375, "logps/rejected": -1875.431396484375, "loss": 4.5678, "nll_loss": 1.0361605882644653, "rewards/accuracies": 0.875, "rewards/chosen": -0.023470595479011536, "rewards/margins": 0.9332266449928284, "rewards/rejected": -0.9566972255706787, "step": 388 }, { "epoch": 0.5581061692969871, "grad_norm": 5.176486492156982, "kl/ref_to_policy/chosen": -2.355103015899658, "kl/ref_to_policy/mean": 38.28343200683594, "kl/ref_to_policy/rejected": 78.92195892333984, "learning_rate": 4.88727297347654e-06, "logits/chosen": -1.359843373298645, "logits/rejected": -1.3072311878204346, "logps/chosen": -1528.6761474609375, "logps/rejected": -1614.2384033203125, "loss": 4.3671, "nll_loss": 0.9776178598403931, "rewards/accuracies": 0.875, "rewards/chosen": 0.023551028221845627, "rewards/margins": 0.8127706050872803, "rewards/rejected": -0.7892196178436279, "step": 389 }, { "epoch": 0.5595408895265424, "grad_norm": 48.79985046386719, "kl/ref_to_policy/chosen": -8.351747512817383, "kl/ref_to_policy/mean": 57.549678802490234, "kl/ref_to_policy/rejected": 123.45111083984375, "learning_rate": 4.862228288159191e-06, "logits/chosen": -1.7881708145141602, "logits/rejected": -1.6643574237823486, "logps/chosen": -988.7810668945312, "logps/rejected": -1114.2205810546875, "loss": 2.5712, "nll_loss": 0.5638289451599121, "rewards/accuracies": 0.875, "rewards/chosen": 0.08351748436689377, "rewards/margins": 1.318028450012207, "rewards/rejected": -1.2345110177993774, "step": 390 }, { "epoch": 0.5595408895265424, "eval_kl/ref_to_policy/chosen": -3.116706609725952, "eval_kl/ref_to_policy/mean": 49.32328796386719, "eval_kl/ref_to_policy/rejected": 101.76327514648438, "eval_logits/chosen": -1.3607906103134155, "eval_logits/rejected": -1.3187333345413208, "eval_logps/chosen": -1521.119384765625, "eval_logps/rejected": -1625.48583984375, "eval_loss": 4.174102783203125, "eval_nll_loss": 0.9447272419929504, "eval_rewards/accuracies": 0.8377659320831299, "eval_rewards/chosen": 0.03116706572473049, "eval_rewards/margins": 1.048799991607666, "eval_rewards/rejected": -1.0176328420639038, "eval_runtime": 111.5453, "eval_samples_per_second": 3.371, "eval_steps_per_second": 1.685, "step": 390 }, { "epoch": 0.5609756097560976, "grad_norm": 4.5093607902526855, "kl/ref_to_policy/chosen": -4.2494940757751465, "kl/ref_to_policy/mean": 60.227806091308594, "kl/ref_to_policy/rejected": 124.7051010131836, "learning_rate": 4.837187061628777e-06, "logits/chosen": -1.5180609226226807, "logits/rejected": -1.4754811525344849, "logps/chosen": -1014.07080078125, "logps/rejected": -1139.25390625, "loss": 3.185, "nll_loss": 0.7137941718101501, "rewards/accuracies": 0.875, "rewards/chosen": 0.04249493032693863, "rewards/margins": 1.289546012878418, "rewards/rejected": -1.2470510005950928, "step": 391 }, { "epoch": 0.5624103299856528, "grad_norm": 7.946601390838623, "kl/ref_to_policy/chosen": 3.3847389221191406, "kl/ref_to_policy/mean": 35.95988082885742, "kl/ref_to_policy/rejected": 68.53501892089844, "learning_rate": 4.8121499225503974e-06, "logits/chosen": -0.9745339155197144, "logits/rejected": -0.986267626285553, "logps/chosen": -1600.2833251953125, "logps/rejected": -1674.601318359375, "loss": 5.0963, "nll_loss": 1.146464467048645, "rewards/accuracies": 0.75, "rewards/chosen": -0.0338473916053772, "rewards/margins": 0.651502788066864, "rewards/rejected": -0.6853501796722412, "step": 392 }, { "epoch": 0.563845050215208, "grad_norm": 5.1216607093811035, "kl/ref_to_policy/chosen": 2.8856282234191895, "kl/ref_to_policy/mean": 55.19523239135742, "kl/ref_to_policy/rejected": 107.50483703613281, "learning_rate": 4.787117499486539e-06, "logits/chosen": -1.3143188953399658, "logits/rejected": -1.2610045671463013, "logps/chosen": -1940.9091796875, "logps/rejected": -2046.0985107421875, "loss": 4.3932, "nll_loss": 0.9986753463745117, "rewards/accuracies": 0.6875, "rewards/chosen": -0.028856277465820312, "rewards/margins": 1.0461920499801636, "rewards/rejected": -1.0750483274459839, "step": 393 }, { "epoch": 0.5652797704447633, "grad_norm": 5.632513999938965, "kl/ref_to_policy/chosen": -1.17985200881958, "kl/ref_to_policy/mean": 49.60930633544922, "kl/ref_to_policy/rejected": 100.39846801757812, "learning_rate": 4.762090420881289e-06, "logits/chosen": -1.4886454343795776, "logits/rejected": -1.4515339136123657, "logps/chosen": -1428.468505859375, "logps/rejected": -1533.744140625, "loss": 4.1614, "nll_loss": 0.9382956624031067, "rewards/accuracies": 0.6875, "rewards/chosen": 0.011798523366451263, "rewards/margins": 1.0157831907272339, "rewards/rejected": -1.0039846897125244, "step": 394 }, { "epoch": 0.5667144906743186, "grad_norm": 19.612878799438477, "kl/ref_to_policy/chosen": 0.9695491790771484, "kl/ref_to_policy/mean": 45.47045135498047, "kl/ref_to_policy/rejected": 89.97135162353516, "learning_rate": 4.737069315044562e-06, "logits/chosen": -1.4310654401779175, "logits/rejected": -1.4112693071365356, "logps/chosen": -908.2898559570312, "logps/rejected": -998.2037963867188, "loss": 3.8024, "nll_loss": 0.825285792350769, "rewards/accuracies": 0.75, "rewards/chosen": -0.009695488959550858, "rewards/margins": 0.8900179862976074, "rewards/rejected": -0.8997135162353516, "step": 395 }, { "epoch": 0.5681492109038737, "grad_norm": 5.603536605834961, "kl/ref_to_policy/chosen": 9.506034851074219, "kl/ref_to_policy/mean": 61.13364791870117, "kl/ref_to_policy/rejected": 112.76126861572266, "learning_rate": 4.712054810136327e-06, "logits/chosen": -1.4026142358779907, "logits/rejected": -1.3499438762664795, "logps/chosen": -1657.0919189453125, "logps/rejected": -1761.42529296875, "loss": 3.7398, "nll_loss": 0.835257351398468, "rewards/accuracies": 0.75, "rewards/chosen": -0.09506034106016159, "rewards/margins": 1.0325522422790527, "rewards/rejected": -1.127612590789795, "step": 396 }, { "epoch": 0.569583931133429, "grad_norm": 6.108913421630859, "kl/ref_to_policy/chosen": 22.443222045898438, "kl/ref_to_policy/mean": 58.871360778808594, "kl/ref_to_policy/rejected": 95.29949951171875, "learning_rate": 4.687047534150829e-06, "logits/chosen": -1.0945103168487549, "logits/rejected": -1.0314733982086182, "logps/chosen": -2314.763671875, "logps/rejected": -2390.76220703125, "loss": 5.7093, "nll_loss": 1.3082373142242432, "rewards/accuracies": 0.875, "rewards/chosen": -0.22443221509456635, "rewards/margins": 0.7285627126693726, "rewards/rejected": -0.9529950022697449, "step": 397 }, { "epoch": 0.5710186513629842, "grad_norm": 5.096310615539551, "kl/ref_to_policy/chosen": -15.199684143066406, "kl/ref_to_policy/mean": 51.021183013916016, "kl/ref_to_policy/rejected": 117.24205017089844, "learning_rate": 4.662048114900837e-06, "logits/chosen": -1.6071901321411133, "logits/rejected": -1.5513932704925537, "logps/chosen": -1076.47216796875, "logps/rejected": -1202.666015625, "loss": 3.2342, "nll_loss": 0.7293750643730164, "rewards/accuracies": 0.875, "rewards/chosen": 0.15199683606624603, "rewards/margins": 1.3244173526763916, "rewards/rejected": -1.1724205017089844, "step": 398 }, { "epoch": 0.5724533715925395, "grad_norm": 4.823779106140137, "kl/ref_to_policy/chosen": -9.385717391967773, "kl/ref_to_policy/mean": 48.27999496459961, "kl/ref_to_policy/rejected": 105.94570922851562, "learning_rate": 4.6370571800018695e-06, "logits/chosen": -1.646618127822876, "logits/rejected": -1.569930076599121, "logps/chosen": -1108.4658203125, "logps/rejected": -1223.4222412109375, "loss": 3.3799, "nll_loss": 0.7533062696456909, "rewards/accuracies": 0.875, "rewards/chosen": 0.09385718405246735, "rewards/margins": 1.1533141136169434, "rewards/rejected": -1.0594570636749268, "step": 399 }, { "epoch": 0.5738880918220947, "grad_norm": 506.1974792480469, "kl/ref_to_policy/chosen": 9.607348442077637, "kl/ref_to_policy/mean": 67.64738464355469, "kl/ref_to_policy/rejected": 125.68742370605469, "learning_rate": 4.612075356856447e-06, "logits/chosen": -1.8655595779418945, "logits/rejected": -1.7069151401519775, "logps/chosen": -1063.224609375, "logps/rejected": -1176.14453125, "loss": 3.4009, "nll_loss": 0.7589124441146851, "rewards/accuracies": 0.8125, "rewards/chosen": -0.09607348591089249, "rewards/margins": 1.1608006954193115, "rewards/rejected": -1.2568742036819458, "step": 400 }, { "epoch": 0.5738880918220947, "eval_kl/ref_to_policy/chosen": -4.123345375061035, "eval_kl/ref_to_policy/mean": 48.78786087036133, "eval_kl/ref_to_policy/rejected": 101.69908142089844, "eval_logits/chosen": -1.4351859092712402, "eval_logits/rejected": -1.3691633939743042, "eval_logps/chosen": -1520.1126708984375, "eval_logps/rejected": -1625.4215087890625, "eval_loss": 4.178154468536377, "eval_nll_loss": 0.9462413191795349, "eval_rewards/accuracies": 0.8430851101875305, "eval_rewards/chosen": 0.041233453899621964, "eval_rewards/margins": 1.0582242012023926, "eval_rewards/rejected": -1.0169906616210938, "eval_runtime": 111.7873, "eval_samples_per_second": 3.364, "eval_steps_per_second": 1.682, "step": 400 }, { "epoch": 0.5753228120516499, "grad_norm": 4.690700054168701, "kl/ref_to_policy/chosen": -0.16639471054077148, "kl/ref_to_policy/mean": 58.6862678527832, "kl/ref_to_policy/rejected": 117.53893280029297, "learning_rate": 4.587103272638339e-06, "logits/chosen": -1.5521228313446045, "logits/rejected": -1.5264124870300293, "logps/chosen": -1420.0133056640625, "logps/rejected": -1537.3963623046875, "loss": 3.4826, "nll_loss": 0.7802143692970276, "rewards/accuracies": 0.75, "rewards/chosen": 0.0016639456152915955, "rewards/margins": 1.1770532131195068, "rewards/rejected": -1.175389289855957, "step": 401 }, { "epoch": 0.5767575322812052, "grad_norm": 2374.511474609375, "kl/ref_to_policy/chosen": -16.114253997802734, "kl/ref_to_policy/mean": 47.53107452392578, "kl/ref_to_policy/rejected": 111.17640686035156, "learning_rate": 4.562141554276811e-06, "logits/chosen": -1.9774914979934692, "logits/rejected": -1.8730156421661377, "logps/chosen": -869.6513671875, "logps/rejected": -992.2750244140625, "loss": 2.7061, "nll_loss": 0.5920149087905884, "rewards/accuracies": 0.8125, "rewards/chosen": 0.1611425131559372, "rewards/margins": 1.2729065418243408, "rewards/rejected": -1.1117640733718872, "step": 402 }, { "epoch": 0.5781922525107605, "grad_norm": 5.516356468200684, "kl/ref_to_policy/chosen": 8.479299545288086, "kl/ref_to_policy/mean": 61.578216552734375, "kl/ref_to_policy/rejected": 114.67713165283203, "learning_rate": 4.537190828440898e-06, "logits/chosen": -1.4027427434921265, "logits/rejected": -1.3763501644134521, "logps/chosen": -1385.338623046875, "logps/rejected": -1490.1907958984375, "loss": 3.8828, "nll_loss": 0.8732732534408569, "rewards/accuracies": 0.875, "rewards/chosen": -0.08479299396276474, "rewards/margins": 1.0619783401489258, "rewards/rejected": -1.1467711925506592, "step": 403 }, { "epoch": 0.5796269727403156, "grad_norm": 6.273988246917725, "kl/ref_to_policy/chosen": 10.052167892456055, "kl/ref_to_policy/mean": 49.864871978759766, "kl/ref_to_policy/rejected": 89.67758178710938, "learning_rate": 4.512251721523659e-06, "logits/chosen": -1.0999003648757935, "logits/rejected": -1.099595069885254, "logps/chosen": -1652.81005859375, "logps/rejected": -1739.4627685546875, "loss": 4.9695, "nll_loss": 1.1259114742279053, "rewards/accuracies": 0.75, "rewards/chosen": -0.10052168369293213, "rewards/margins": 0.79625403881073, "rewards/rejected": -0.8967757225036621, "step": 404 }, { "epoch": 0.5810616929698709, "grad_norm": 2.8000636100769043, "kl/ref_to_policy/chosen": -1.5120997428894043, "kl/ref_to_policy/mean": 84.16641998291016, "kl/ref_to_policy/rejected": 169.8449249267578, "learning_rate": 4.487324859626465e-06, "logits/chosen": -2.0532729625701904, "logits/rejected": -2.0230753421783447, "logps/chosen": -775.2691040039062, "logps/rejected": -932.5948486328125, "loss": 1.5347, "nll_loss": 0.3303828835487366, "rewards/accuracies": 1.0, "rewards/chosen": 0.015121009200811386, "rewards/margins": 1.7135701179504395, "rewards/rejected": -1.6984492540359497, "step": 405 }, { "epoch": 0.5824964131994261, "grad_norm": 5.423611164093018, "kl/ref_to_policy/chosen": -14.971969604492188, "kl/ref_to_policy/mean": 38.22177505493164, "kl/ref_to_policy/rejected": 91.41551971435547, "learning_rate": 4.462410868543268e-06, "logits/chosen": -1.368178129196167, "logits/rejected": -1.3120393753051758, "logps/chosen": -1056.762451171875, "logps/rejected": -1163.74755859375, "loss": 3.6615, "nll_loss": 0.8179622292518616, "rewards/accuracies": 0.9375, "rewards/chosen": 0.14971968531608582, "rewards/margins": 1.0638748407363892, "rewards/rejected": -0.914155125617981, "step": 406 }, { "epoch": 0.5839311334289814, "grad_norm": 4.353668689727783, "kl/ref_to_policy/chosen": 22.527400970458984, "kl/ref_to_policy/mean": 89.45516967773438, "kl/ref_to_policy/rejected": 156.38294982910156, "learning_rate": 4.437510373744897e-06, "logits/chosen": -1.777951955795288, "logits/rejected": -1.6700246334075928, "logps/chosen": -1564.4569091796875, "logps/rejected": -1690.690673828125, "loss": 3.1043, "nll_loss": 0.6973256468772888, "rewards/accuracies": 0.875, "rewards/chosen": -0.225273996591568, "rewards/margins": 1.3385554552078247, "rewards/rejected": -1.5638294219970703, "step": 407 }, { "epoch": 0.5853658536585366, "grad_norm": 5.976798057556152, "kl/ref_to_policy/chosen": 10.214315414428711, "kl/ref_to_policy/mean": 43.338199615478516, "kl/ref_to_policy/rejected": 76.46208190917969, "learning_rate": 4.4126240003633565e-06, "logits/chosen": -0.9567282199859619, "logits/rejected": -0.9722916483879089, "logps/chosen": -2294.94091796875, "logps/rejected": -2370.280517578125, "loss": 5.6879, "nll_loss": 1.2958608865737915, "rewards/accuracies": 0.6875, "rewards/chosen": -0.10214316099882126, "rewards/margins": 0.6624776721000671, "rewards/rejected": -0.7646209001541138, "step": 408 }, { "epoch": 0.5868005738880918, "grad_norm": 4.064777374267578, "kl/ref_to_policy/chosen": -11.047256469726562, "kl/ref_to_policy/mean": 52.798057556152344, "kl/ref_to_policy/rejected": 116.64337158203125, "learning_rate": 4.387752373176123e-06, "logits/chosen": -1.739648461341858, "logits/rejected": -1.7244627475738525, "logps/chosen": -1138.8511962890625, "logps/rejected": -1261.4871826171875, "loss": 2.8877, "nll_loss": 0.6238464713096619, "rewards/accuracies": 0.75, "rewards/chosen": 0.11047255992889404, "rewards/margins": 1.2769062519073486, "rewards/rejected": -1.166433572769165, "step": 409 }, { "epoch": 0.5882352941176471, "grad_norm": 5.80195951461792, "kl/ref_to_policy/chosen": -5.806215286254883, "kl/ref_to_policy/mean": 41.78667449951172, "kl/ref_to_policy/rejected": 89.37957000732422, "learning_rate": 4.362896116590475e-06, "logits/chosen": -1.4001725912094116, "logits/rejected": -1.3940598964691162, "logps/chosen": -1369.2093505859375, "logps/rejected": -1463.8853759765625, "loss": 4.0693, "nll_loss": 0.898479700088501, "rewards/accuracies": 0.8125, "rewards/chosen": 0.05806215852499008, "rewards/margins": 0.95185786485672, "rewards/rejected": -0.8937956094741821, "step": 410 }, { "epoch": 0.5882352941176471, "eval_kl/ref_to_policy/chosen": -1.1124635934829712, "eval_kl/ref_to_policy/mean": 51.12594985961914, "eval_kl/ref_to_policy/rejected": 103.36436462402344, "eval_logits/chosen": -1.4315229654312134, "eval_logits/rejected": -1.4328784942626953, "eval_logps/chosen": -1523.1236572265625, "eval_logps/rejected": -1627.087158203125, "eval_loss": 4.15667200088501, "eval_nll_loss": 0.9358400106430054, "eval_rewards/accuracies": 0.8191489577293396, "eval_rewards/chosen": 0.011124636046588421, "eval_rewards/margins": 1.0447683334350586, "eval_rewards/rejected": -1.0336437225341797, "eval_runtime": 112.0109, "eval_samples_per_second": 3.357, "eval_steps_per_second": 1.678, "step": 410 }, { "epoch": 0.5896700143472023, "grad_norm": 6.554864883422852, "kl/ref_to_policy/chosen": -9.449349403381348, "kl/ref_to_policy/mean": 44.46142578125, "kl/ref_to_policy/rejected": 98.3721923828125, "learning_rate": 4.3380558546278075e-06, "logits/chosen": -1.3611984252929688, "logits/rejected": -1.375022530555725, "logps/chosen": -1089.331787109375, "logps/rejected": -1196.7432861328125, "loss": 4.0773, "nll_loss": 0.9224861860275269, "rewards/accuracies": 0.875, "rewards/chosen": 0.09449350833892822, "rewards/margins": 1.078215479850769, "rewards/rejected": -0.983721911907196, "step": 411 }, { "epoch": 0.5911047345767575, "grad_norm": 6.736931324005127, "kl/ref_to_policy/chosen": -15.947629928588867, "kl/ref_to_policy/mean": 44.47575378417969, "kl/ref_to_policy/rejected": 104.89913940429688, "learning_rate": 4.313232210907959e-06, "logits/chosen": -1.6171536445617676, "logits/rejected": -1.6014463901519775, "logps/chosen": -735.9195556640625, "logps/rejected": -853.3873901367188, "loss": 3.2284, "nll_loss": 0.7191522121429443, "rewards/accuracies": 0.9375, "rewards/chosen": 0.15947628021240234, "rewards/margins": 1.2084674835205078, "rewards/rejected": -1.048991322517395, "step": 412 }, { "epoch": 0.5925394548063128, "grad_norm": 4.544855117797852, "kl/ref_to_policy/chosen": 8.978421211242676, "kl/ref_to_policy/mean": 75.35363006591797, "kl/ref_to_policy/rejected": 141.7288360595703, "learning_rate": 4.2884258086335755e-06, "logits/chosen": -1.6505423784255981, "logits/rejected": -1.6162493228912354, "logps/chosen": -1210.4237060546875, "logps/rejected": -1338.164306640625, "loss": 3.098, "nll_loss": 0.6945643424987793, "rewards/accuracies": 0.9375, "rewards/chosen": -0.08978421241044998, "rewards/margins": 1.3275041580200195, "rewards/rejected": -1.4172884225845337, "step": 413 }, { "epoch": 0.593974175035868, "grad_norm": 132.81776428222656, "kl/ref_to_policy/chosen": -9.936433792114258, "kl/ref_to_policy/mean": 29.894073486328125, "kl/ref_to_policy/rejected": 69.7245864868164, "learning_rate": 4.2636372705744425e-06, "logits/chosen": -1.4317315816879272, "logits/rejected": -1.4125726222991943, "logps/chosen": -2364.2744140625, "logps/rejected": -2447.73974609375, "loss": 6.2959, "nll_loss": 1.4573208093643188, "rewards/accuracies": 0.625, "rewards/chosen": 0.09936432540416718, "rewards/margins": 0.7966101169586182, "rewards/rejected": -0.697245717048645, "step": 414 }, { "epoch": 0.5954088952654233, "grad_norm": 5.326428413391113, "kl/ref_to_policy/chosen": -0.5159912109375, "kl/ref_to_policy/mean": 65.82026672363281, "kl/ref_to_policy/rejected": 132.15652465820312, "learning_rate": 4.238867219051868e-06, "logits/chosen": -1.7562410831451416, "logits/rejected": -1.7865678071975708, "logps/chosen": -1067.30419921875, "logps/rejected": -1193.353759765625, "loss": 2.883, "nll_loss": 0.6401160955429077, "rewards/accuracies": 0.9375, "rewards/chosen": 0.005159911699593067, "rewards/margins": 1.3267251253128052, "rewards/rejected": -1.3215652704238892, "step": 415 }, { "epoch": 0.5968436154949784, "grad_norm": 4.467376708984375, "kl/ref_to_policy/chosen": 0.23457026481628418, "kl/ref_to_policy/mean": 67.31661987304688, "kl/ref_to_policy/rejected": 134.39866638183594, "learning_rate": 4.214116275923051e-06, "logits/chosen": -1.5913994312286377, "logits/rejected": -1.6538106203079224, "logps/chosen": -1247.1251220703125, "logps/rejected": -1374.426025390625, "loss": 3.4499, "nll_loss": 0.783737063407898, "rewards/accuracies": 0.9375, "rewards/chosen": -0.002345699816942215, "rewards/margins": 1.341640830039978, "rewards/rejected": -1.3439867496490479, "step": 416 }, { "epoch": 0.5982783357245337, "grad_norm": 30.67816925048828, "kl/ref_to_policy/chosen": 3.180957794189453, "kl/ref_to_policy/mean": 43.693695068359375, "kl/ref_to_policy/rejected": 84.20643615722656, "learning_rate": 4.1893850625654626e-06, "logits/chosen": -1.5442421436309814, "logits/rejected": -1.5575432777404785, "logps/chosen": -1934.532470703125, "logps/rejected": -2019.1536865234375, "loss": 6.0759, "nll_loss": 1.403237223625183, "rewards/accuracies": 0.6875, "rewards/chosen": -0.031809575855731964, "rewards/margins": 0.8102548122406006, "rewards/rejected": -0.8420642614364624, "step": 417 }, { "epoch": 0.599713055954089, "grad_norm": 5.150177478790283, "kl/ref_to_policy/chosen": -8.71183967590332, "kl/ref_to_policy/mean": 50.76544952392578, "kl/ref_to_policy/rejected": 110.24275207519531, "learning_rate": 4.1646741998612676e-06, "logits/chosen": -1.608337640762329, "logits/rejected": -1.628230333328247, "logps/chosen": -1505.5927734375, "logps/rejected": -1622.617919921875, "loss": 3.4492, "nll_loss": 0.7721635103225708, "rewards/accuracies": 0.8125, "rewards/chosen": 0.08711838722229004, "rewards/margins": 1.189545750617981, "rewards/rejected": -1.1024274826049805, "step": 418 }, { "epoch": 0.6011477761836442, "grad_norm": 6.019632816314697, "kl/ref_to_policy/chosen": -7.114205360412598, "kl/ref_to_policy/mean": 45.113548278808594, "kl/ref_to_policy/rejected": 97.34130859375, "learning_rate": 4.1399843081817085e-06, "logits/chosen": -1.3934800624847412, "logits/rejected": -1.4676146507263184, "logps/chosen": -1509.9638671875, "logps/rejected": -1617.1187744140625, "loss": 4.2787, "nll_loss": 0.9699848890304565, "rewards/accuracies": 0.8125, "rewards/chosen": 0.0711420550942421, "rewards/margins": 1.0445549488067627, "rewards/rejected": -0.9734129905700684, "step": 419 }, { "epoch": 0.6025824964131994, "grad_norm": 5.4262895584106445, "kl/ref_to_policy/chosen": 2.816416025161743, "kl/ref_to_policy/mean": 58.00640106201172, "kl/ref_to_policy/rejected": 113.19638061523438, "learning_rate": 4.115316007371557e-06, "logits/chosen": -1.3833173513412476, "logits/rejected": -1.470914363861084, "logps/chosen": -1596.49609375, "logps/rejected": -1704.9638671875, "loss": 3.6643, "nll_loss": 0.8210842609405518, "rewards/accuracies": 0.9375, "rewards/chosen": -0.02816416695713997, "rewards/margins": 1.10379958152771, "rewards/rejected": -1.131963849067688, "step": 420 }, { "epoch": 0.6025824964131994, "eval_kl/ref_to_policy/chosen": -4.721065998077393, "eval_kl/ref_to_policy/mean": 49.07633972167969, "eval_kl/ref_to_policy/rejected": 102.87373352050781, "eval_logits/chosen": -1.4141281843185425, "eval_logits/rejected": -1.4949676990509033, "eval_logps/chosen": -1519.5150146484375, "eval_logps/rejected": -1626.596435546875, "eval_loss": 4.168780326843262, "eval_nll_loss": 0.9441843032836914, "eval_rewards/accuracies": 0.832446813583374, "eval_rewards/chosen": 0.04721065238118172, "eval_rewards/margins": 1.0759479999542236, "eval_rewards/rejected": -1.0287373065948486, "eval_runtime": 111.2939, "eval_samples_per_second": 3.378, "eval_steps_per_second": 1.689, "step": 420 }, { "epoch": 0.6040172166427547, "grad_norm": 4.505352020263672, "kl/ref_to_policy/chosen": -4.967510223388672, "kl/ref_to_policy/mean": 67.32703399658203, "kl/ref_to_policy/rejected": 139.62158203125, "learning_rate": 4.090669916733539e-06, "logits/chosen": -1.8094496726989746, "logits/rejected": -1.9133285284042358, "logps/chosen": -838.423095703125, "logps/rejected": -975.4515991210938, "loss": 2.6145, "nll_loss": 0.581328809261322, "rewards/accuracies": 0.875, "rewards/chosen": 0.04967509210109711, "rewards/margins": 1.4458909034729004, "rewards/rejected": -1.3962156772613525, "step": 421 }, { "epoch": 0.6054519368723099, "grad_norm": 6.5674662590026855, "kl/ref_to_policy/chosen": -17.220149993896484, "kl/ref_to_policy/mean": 35.477359771728516, "kl/ref_to_policy/rejected": 88.17487335205078, "learning_rate": 4.066046655012786e-06, "logits/chosen": -1.3705723285675049, "logits/rejected": -1.4610885381698608, "logps/chosen": -1516.7330322265625, "logps/rejected": -1624.5284423828125, "loss": 4.1411, "nll_loss": 0.9361301064491272, "rewards/accuracies": 0.75, "rewards/chosen": 0.1722015142440796, "rewards/margins": 1.053950309753418, "rewards/rejected": -0.8817487955093384, "step": 422 }, { "epoch": 0.6068866571018652, "grad_norm": 6.900933265686035, "kl/ref_to_policy/chosen": -18.6115779876709, "kl/ref_to_policy/mean": 47.28262710571289, "kl/ref_to_policy/rejected": 113.17681884765625, "learning_rate": 4.041446840381309e-06, "logits/chosen": -1.6113947629928589, "logits/rejected": -1.703546166419983, "logps/chosen": -642.2868041992188, "logps/rejected": -768.5538330078125, "loss": 2.7274, "nll_loss": 0.6017166972160339, "rewards/accuracies": 1.0, "rewards/chosen": 0.1861157864332199, "rewards/margins": 1.3178839683532715, "rewards/rejected": -1.1317682266235352, "step": 423 }, { "epoch": 0.6083213773314203, "grad_norm": 41.993377685546875, "kl/ref_to_policy/chosen": 4.89040470123291, "kl/ref_to_policy/mean": 48.46468734741211, "kl/ref_to_policy/rejected": 92.03897094726562, "learning_rate": 4.016871090422471e-06, "logits/chosen": -1.2449434995651245, "logits/rejected": -1.359140396118164, "logps/chosen": -1882.960693359375, "logps/rejected": -1969.6190185546875, "loss": 4.1299, "nll_loss": 0.9115633964538574, "rewards/accuracies": 0.8125, "rewards/chosen": -0.04890405014157295, "rewards/margins": 0.8714856505393982, "rewards/rejected": -0.920389711856842, "step": 424 }, { "epoch": 0.6097560975609756, "grad_norm": 5.290921211242676, "kl/ref_to_policy/chosen": -7.043755531311035, "kl/ref_to_policy/mean": 52.033721923828125, "kl/ref_to_policy/rejected": 111.1112060546875, "learning_rate": 3.992320022115492e-06, "logits/chosen": -1.4275174140930176, "logits/rejected": -1.5402029752731323, "logps/chosen": -1365.776611328125, "logps/rejected": -1483.537109375, "loss": 3.8205, "nll_loss": 0.8644694089889526, "rewards/accuracies": 0.8125, "rewards/chosen": 0.07043756544589996, "rewards/margins": 1.1815495491027832, "rewards/rejected": -1.1111118793487549, "step": 425 }, { "epoch": 0.6111908177905309, "grad_norm": 4.545032501220703, "kl/ref_to_policy/chosen": -7.812516212463379, "kl/ref_to_policy/mean": 51.20738983154297, "kl/ref_to_policy/rejected": 110.227294921875, "learning_rate": 3.9677942518199465e-06, "logits/chosen": -1.5848941802978516, "logits/rejected": -1.6740314960479736, "logps/chosen": -1561.84375, "logps/rejected": -1680.205810546875, "loss": 3.6164, "nll_loss": 0.8123309016227722, "rewards/accuracies": 0.75, "rewards/chosen": 0.07812516391277313, "rewards/margins": 1.1803981065750122, "rewards/rejected": -1.102272868156433, "step": 426 }, { "epoch": 0.6126255380200861, "grad_norm": 4.2160749435424805, "kl/ref_to_policy/chosen": -11.15659236907959, "kl/ref_to_policy/mean": 54.2663459777832, "kl/ref_to_policy/rejected": 119.68928527832031, "learning_rate": 3.943294395260305e-06, "logits/chosen": -1.7281231880187988, "logits/rejected": -1.776158094406128, "logps/chosen": -1064.546630859375, "logps/rejected": -1192.687255859375, "loss": 2.6813, "nll_loss": 0.5873638987541199, "rewards/accuracies": 0.8125, "rewards/chosen": 0.11156593263149261, "rewards/margins": 1.3084588050842285, "rewards/rejected": -1.1968927383422852, "step": 427 }, { "epoch": 0.6140602582496413, "grad_norm": 5.568368911743164, "kl/ref_to_policy/chosen": -2.160310745239258, "kl/ref_to_policy/mean": 37.63197326660156, "kl/ref_to_policy/rejected": 77.42426300048828, "learning_rate": 3.918821067510464e-06, "logits/chosen": -1.1351292133331299, "logits/rejected": -1.196396827697754, "logps/chosen": -1772.753662109375, "logps/rejected": -1860.63525390625, "loss": 4.885, "nll_loss": 1.104393482208252, "rewards/accuracies": 0.6875, "rewards/chosen": 0.021603113040328026, "rewards/margins": 0.7958456873893738, "rewards/rejected": -0.774242639541626, "step": 428 }, { "epoch": 0.6154949784791965, "grad_norm": 5.1095709800720215, "kl/ref_to_policy/chosen": -3.297218084335327, "kl/ref_to_policy/mean": 62.268028259277344, "kl/ref_to_policy/rejected": 127.8332748413086, "learning_rate": 3.89437488297831e-06, "logits/chosen": -1.7122689485549927, "logits/rejected": -1.7646564245224, "logps/chosen": -1060.7965087890625, "logps/rejected": -1188.315673828125, "loss": 3.3452, "nll_loss": 0.7546336650848389, "rewards/accuracies": 0.875, "rewards/chosen": 0.03297217935323715, "rewards/margins": 1.3113049268722534, "rewards/rejected": -1.2783327102661133, "step": 429 }, { "epoch": 0.6169296987087518, "grad_norm": 134.95411682128906, "kl/ref_to_policy/chosen": 27.393798828125, "kl/ref_to_policy/mean": 60.72160720825195, "kl/ref_to_policy/rejected": 94.04943084716797, "learning_rate": 3.869956455390295e-06, "logits/chosen": -1.2082608938217163, "logits/rejected": -1.2366836071014404, "logps/chosen": -2200.526611328125, "logps/rejected": -2275.60986328125, "loss": 6.5915, "nll_loss": 1.5204741954803467, "rewards/accuracies": 0.625, "rewards/chosen": -0.27393797039985657, "rewards/margins": 0.6665562391281128, "rewards/rejected": -0.940494179725647, "step": 430 }, { "epoch": 0.6169296987087518, "eval_kl/ref_to_policy/chosen": -4.777546405792236, "eval_kl/ref_to_policy/mean": 48.76932907104492, "eval_kl/ref_to_policy/rejected": 102.31620025634766, "eval_logits/chosen": -1.4833072423934937, "eval_logits/rejected": -1.5274466276168823, "eval_logps/chosen": -1519.45849609375, "eval_logps/rejected": -1626.0386962890625, "eval_loss": 4.197213172912598, "eval_nll_loss": 0.9524522423744202, "eval_rewards/accuracies": 0.8563829660415649, "eval_rewards/chosen": 0.04777546226978302, "eval_rewards/margins": 1.0709375143051147, "eval_rewards/rejected": -1.0231620073318481, "eval_runtime": 112.8706, "eval_samples_per_second": 3.331, "eval_steps_per_second": 1.666, "step": 430 }, { "epoch": 0.6183644189383071, "grad_norm": 13.104586601257324, "kl/ref_to_policy/chosen": -7.360800743103027, "kl/ref_to_policy/mean": 45.619110107421875, "kl/ref_to_policy/rejected": 98.5990219116211, "learning_rate": 3.845566397776022e-06, "logits/chosen": -1.427729606628418, "logits/rejected": -1.4790613651275635, "logps/chosen": -1239.73291015625, "logps/rejected": -1346.3294677734375, "loss": 3.7121, "nll_loss": 0.8300065398216248, "rewards/accuracies": 0.875, "rewards/chosen": 0.07360801100730896, "rewards/margins": 1.0595982074737549, "rewards/rejected": -0.9859901666641235, "step": 431 }, { "epoch": 0.6197991391678622, "grad_norm": 5.537590026855469, "kl/ref_to_policy/chosen": -2.354727029800415, "kl/ref_to_policy/mean": 45.54377746582031, "kl/ref_to_policy/rejected": 93.44229125976562, "learning_rate": 3.821205322452863e-06, "logits/chosen": -1.2969714403152466, "logits/rejected": -1.3566882610321045, "logps/chosen": -1885.9423828125, "logps/rejected": -1983.0537109375, "loss": 4.7002, "nll_loss": 1.0707893371582031, "rewards/accuracies": 0.8125, "rewards/chosen": 0.02354726940393448, "rewards/margins": 0.9579702615737915, "rewards/rejected": -0.9344229102134705, "step": 432 }, { "epoch": 0.6212338593974175, "grad_norm": 5.700631141662598, "kl/ref_to_policy/chosen": 0.3154259920120239, "kl/ref_to_policy/mean": 45.52178955078125, "kl/ref_to_policy/rejected": 90.72815704345703, "learning_rate": 3.796873841010591e-06, "logits/chosen": -1.404000163078308, "logits/rejected": -1.4276986122131348, "logps/chosen": -2193.151611328125, "logps/rejected": -2290.400146484375, "loss": 4.8452, "nll_loss": 1.1008906364440918, "rewards/accuracies": 0.75, "rewards/chosen": -0.0031542638316750526, "rewards/margins": 0.9041271805763245, "rewards/rejected": -0.9072813987731934, "step": 433 }, { "epoch": 0.6226685796269728, "grad_norm": 5.229866981506348, "kl/ref_to_policy/chosen": -13.785663604736328, "kl/ref_to_policy/mean": 48.29197692871094, "kl/ref_to_policy/rejected": 110.36961364746094, "learning_rate": 3.7725725642960047e-06, "logits/chosen": -1.5804271697998047, "logits/rejected": -1.6971588134765625, "logps/chosen": -1274.53466796875, "logps/rejected": -1392.5902099609375, "loss": 3.6985, "nll_loss": 0.8402240872383118, "rewards/accuracies": 1.0, "rewards/chosen": 0.1378566324710846, "rewards/margins": 1.2415525913238525, "rewards/rejected": -1.1036961078643799, "step": 434 }, { "epoch": 0.624103299856528, "grad_norm": 6.307126998901367, "kl/ref_to_policy/chosen": 2.216780185699463, "kl/ref_to_policy/mean": 40.98124694824219, "kl/ref_to_policy/rejected": 79.74571228027344, "learning_rate": 3.748302102397618e-06, "logits/chosen": -1.2669188976287842, "logits/rejected": -1.3734973669052124, "logps/chosen": -1747.107421875, "logps/rejected": -1833.54052734375, "loss": 4.5927, "nll_loss": 1.0283247232437134, "rewards/accuracies": 0.625, "rewards/chosen": -0.022167809307575226, "rewards/margins": 0.7752892971038818, "rewards/rejected": -0.7974569797515869, "step": 435 }, { "epoch": 0.6255380200860832, "grad_norm": 6.0968708992004395, "kl/ref_to_policy/chosen": 0.3103342056274414, "kl/ref_to_policy/mean": 33.865013122558594, "kl/ref_to_policy/rejected": 67.41969299316406, "learning_rate": 3.7240630646303262e-06, "logits/chosen": -0.8753349184989929, "logits/rejected": -0.9407546520233154, "logps/chosen": -2172.529052734375, "logps/rejected": -2250.013427734375, "loss": 5.7164, "nll_loss": 1.3026361465454102, "rewards/accuracies": 0.625, "rewards/chosen": -0.0031033456325531006, "rewards/margins": 0.6710935831069946, "rewards/rejected": -0.6741969585418701, "step": 436 }, { "epoch": 0.6269727403156384, "grad_norm": 4.7568678855896, "kl/ref_to_policy/chosen": -12.300594329833984, "kl/ref_to_policy/mean": 55.162296295166016, "kl/ref_to_policy/rejected": 122.62519836425781, "learning_rate": 3.6998560595201188e-06, "logits/chosen": -1.7391958236694336, "logits/rejected": -1.8312371969223022, "logps/chosen": -1432.5352783203125, "logps/rejected": -1560.7110595703125, "loss": 3.0544, "nll_loss": 0.6840566992759705, "rewards/accuracies": 0.8125, "rewards/chosen": 0.1230059489607811, "rewards/margins": 1.3492578268051147, "rewards/rejected": -1.2262519598007202, "step": 437 }, { "epoch": 0.6284074605451937, "grad_norm": 5.005058288574219, "kl/ref_to_policy/chosen": -12.826765060424805, "kl/ref_to_policy/mean": 47.30526351928711, "kl/ref_to_policy/rejected": 107.43728637695312, "learning_rate": 3.675681694788801e-06, "logits/chosen": -1.4457519054412842, "logits/rejected": -1.613981008529663, "logps/chosen": -1037.5872802734375, "logps/rejected": -1156.695068359375, "loss": 3.3012, "nll_loss": 0.7368203401565552, "rewards/accuracies": 0.9375, "rewards/chosen": 0.12826764583587646, "rewards/margins": 1.2026405334472656, "rewards/rejected": -1.0743728876113892, "step": 438 }, { "epoch": 0.629842180774749, "grad_norm": 6.385729789733887, "kl/ref_to_policy/chosen": -0.9008989334106445, "kl/ref_to_policy/mean": 43.20972442626953, "kl/ref_to_policy/rejected": 87.32034301757812, "learning_rate": 3.6515405773387257e-06, "logits/chosen": -1.1643054485321045, "logits/rejected": -1.2158626317977905, "logps/chosen": -1492.09912109375, "logps/rejected": -1580.916015625, "loss": 4.5442, "nll_loss": 1.0275274515151978, "rewards/accuracies": 0.9375, "rewards/chosen": 0.009008996188640594, "rewards/margins": 0.8822124004364014, "rewards/rejected": -0.8732033967971802, "step": 439 }, { "epoch": 0.6312769010043041, "grad_norm": 7.521716117858887, "kl/ref_to_policy/chosen": -14.286945343017578, "kl/ref_to_policy/mean": 33.47196960449219, "kl/ref_to_policy/rejected": 81.23088836669922, "learning_rate": 3.627433313237576e-06, "logits/chosen": -1.2730430364608765, "logits/rejected": -1.3669947385787964, "logps/chosen": -889.5177001953125, "logps/rejected": -987.3831787109375, "loss": 4.0648, "nll_loss": 0.9109519124031067, "rewards/accuracies": 0.9375, "rewards/chosen": 0.14286945760250092, "rewards/margins": 0.9551783800125122, "rewards/rejected": -0.8123087882995605, "step": 440 }, { "epoch": 0.6312769010043041, "eval_kl/ref_to_policy/chosen": -6.053585529327393, "eval_kl/ref_to_policy/mean": 48.452335357666016, "eval_kl/ref_to_policy/rejected": 102.95825958251953, "eval_logits/chosen": -1.396902322769165, "eval_logits/rejected": -1.5045232772827148, "eval_logps/chosen": -1518.182373046875, "eval_logps/rejected": -1626.680908203125, "eval_loss": 4.129555702209473, "eval_nll_loss": 0.9358544945716858, "eval_rewards/accuracies": 0.8510638475418091, "eval_rewards/chosen": 0.06053586304187775, "eval_rewards/margins": 1.090118408203125, "eval_rewards/rejected": -1.0295826196670532, "eval_runtime": 112.2362, "eval_samples_per_second": 3.35, "eval_steps_per_second": 1.675, "step": 440 }, { "epoch": 0.6327116212338594, "grad_norm": 6.680513381958008, "kl/ref_to_policy/chosen": 2.9124088287353516, "kl/ref_to_policy/mean": 63.7879638671875, "kl/ref_to_policy/rejected": 124.66352844238281, "learning_rate": 3.603360507703133e-06, "logits/chosen": -1.4314745664596558, "logits/rejected": -1.548475980758667, "logps/chosen": -1214.8812255859375, "logps/rejected": -1333.86865234375, "loss": 3.5797, "nll_loss": 0.8067311644554138, "rewards/accuracies": 0.875, "rewards/chosen": -0.02912408858537674, "rewards/margins": 1.2175111770629883, "rewards/rejected": -1.2466351985931396, "step": 441 }, { "epoch": 0.6341463414634146, "grad_norm": 3.985724925994873, "kl/ref_to_policy/chosen": -8.472085952758789, "kl/ref_to_policy/mean": 58.80925750732422, "kl/ref_to_policy/rejected": 126.09060668945312, "learning_rate": 3.5793227650880928e-06, "logits/chosen": -1.557076334953308, "logits/rejected": -1.7157920598983765, "logps/chosen": -1512.4498291015625, "logps/rejected": -1643.236083984375, "loss": 3.2542, "nll_loss": 0.7332147359848022, "rewards/accuracies": 0.75, "rewards/chosen": 0.08472084999084473, "rewards/margins": 1.3456268310546875, "rewards/rejected": -1.2609059810638428, "step": 442 }, { "epoch": 0.6355810616929699, "grad_norm": 4.561957836151123, "kl/ref_to_policy/chosen": -15.929816246032715, "kl/ref_to_policy/mean": 51.246002197265625, "kl/ref_to_policy/rejected": 118.42181396484375, "learning_rate": 3.555320688864889e-06, "logits/chosen": -1.6923432350158691, "logits/rejected": -1.8288158178329468, "logps/chosen": -1024.859130859375, "logps/rejected": -1154.4725341796875, "loss": 3.1128, "nll_loss": 0.6991428136825562, "rewards/accuracies": 0.875, "rewards/chosen": 0.15929816663265228, "rewards/margins": 1.3435163497924805, "rewards/rejected": -1.184218168258667, "step": 443 }, { "epoch": 0.6370157819225251, "grad_norm": 5.09556770324707, "kl/ref_to_policy/chosen": -19.914152145385742, "kl/ref_to_policy/mean": 47.56230545043945, "kl/ref_to_policy/rejected": 115.03875732421875, "learning_rate": 3.531354881610539e-06, "logits/chosen": -1.6345596313476562, "logits/rejected": -1.7938514947891235, "logps/chosen": -1105.3436279296875, "logps/rejected": -1234.6871337890625, "loss": 3.0972, "nll_loss": 0.6960327625274658, "rewards/accuracies": 0.875, "rewards/chosen": 0.1991415023803711, "rewards/margins": 1.3495290279388428, "rewards/rejected": -1.1503875255584717, "step": 444 }, { "epoch": 0.6384505021520803, "grad_norm": 6.283167839050293, "kl/ref_to_policy/chosen": -10.050113677978516, "kl/ref_to_policy/mean": 37.858829498291016, "kl/ref_to_policy/rejected": 85.76776885986328, "learning_rate": 3.507425944991529e-06, "logits/chosen": -1.3719936609268188, "logits/rejected": -1.4710239171981812, "logps/chosen": -1337.668701171875, "logps/rejected": -1435.034423828125, "loss": 4.0105, "nll_loss": 0.898844301700592, "rewards/accuracies": 0.9375, "rewards/chosen": 0.10050112754106522, "rewards/margins": 0.9581788182258606, "rewards/rejected": -0.8576776385307312, "step": 445 }, { "epoch": 0.6398852223816356, "grad_norm": 5.424286842346191, "kl/ref_to_policy/chosen": -5.5419697761535645, "kl/ref_to_policy/mean": 42.91436767578125, "kl/ref_to_policy/rejected": 91.3707046508789, "learning_rate": 3.483534479748688e-06, "logits/chosen": -1.2868430614471436, "logits/rejected": -1.4019988775253296, "logps/chosen": -1443.2158203125, "logps/rejected": -1543.4254150390625, "loss": 3.9819, "nll_loss": 0.8900128602981567, "rewards/accuracies": 0.875, "rewards/chosen": 0.05541970953345299, "rewards/margins": 0.96912682056427, "rewards/rejected": -0.9137070178985596, "step": 446 }, { "epoch": 0.6413199426111909, "grad_norm": 5.815247535705566, "kl/ref_to_policy/chosen": -1.5060486793518066, "kl/ref_to_policy/mean": 38.78763961791992, "kl/ref_to_policy/rejected": 79.08132934570312, "learning_rate": 3.4596810856821304e-06, "logits/chosen": -1.0126584768295288, "logits/rejected": -1.133022427558899, "logps/chosen": -1977.290771484375, "logps/rejected": -2066.677001953125, "loss": 4.9831, "nll_loss": 1.1299247741699219, "rewards/accuracies": 0.6875, "rewards/chosen": 0.015060493722558022, "rewards/margins": 0.8058737516403198, "rewards/rejected": -0.7908133268356323, "step": 447 }, { "epoch": 0.642754662840746, "grad_norm": 4.416863918304443, "kl/ref_to_policy/chosen": -8.071674346923828, "kl/ref_to_policy/mean": 64.17245483398438, "kl/ref_to_policy/rejected": 136.4165802001953, "learning_rate": 3.4358663616361775e-06, "logits/chosen": -1.9243191480636597, "logits/rejected": -2.0578725337982178, "logps/chosen": -918.6549072265625, "logps/rejected": -1056.5728759765625, "loss": 2.2689, "nll_loss": 0.4931521415710449, "rewards/accuracies": 0.875, "rewards/chosen": 0.08071674406528473, "rewards/margins": 1.4448826313018799, "rewards/rejected": -1.3641659021377563, "step": 448 }, { "epoch": 0.6441893830703013, "grad_norm": 6.185230731964111, "kl/ref_to_policy/chosen": 2.20093035697937, "kl/ref_to_policy/mean": 43.34120559692383, "kl/ref_to_policy/rejected": 84.48148345947266, "learning_rate": 3.4120909054843375e-06, "logits/chosen": -1.0861625671386719, "logits/rejected": -1.1656841039657593, "logps/chosen": -2428.716552734375, "logps/rejected": -2518.8525390625, "loss": 5.5857, "nll_loss": 1.2818419933319092, "rewards/accuracies": 0.6875, "rewards/chosen": -0.02200930565595627, "rewards/margins": 0.8228055834770203, "rewards/rejected": -0.8448148369789124, "step": 449 }, { "epoch": 0.6456241032998565, "grad_norm": 88.14004516601562, "kl/ref_to_policy/chosen": -5.632075309753418, "kl/ref_to_policy/mean": 55.078495025634766, "kl/ref_to_policy/rejected": 115.7890625, "learning_rate": 3.3883553141142884e-06, "logits/chosen": -1.665715217590332, "logits/rejected": -1.8027511835098267, "logps/chosen": -1094.016357421875, "logps/rejected": -1212.6484375, "loss": 3.1332, "nll_loss": 0.6968303322792053, "rewards/accuracies": 1.0, "rewards/chosen": 0.056320756673812866, "rewards/margins": 1.2142114639282227, "rewards/rejected": -1.1578906774520874, "step": 450 }, { "epoch": 0.6456241032998565, "eval_kl/ref_to_policy/chosen": -7.163722991943359, "eval_kl/ref_to_policy/mean": 48.42877197265625, "eval_kl/ref_to_policy/rejected": 104.02127838134766, "eval_logits/chosen": -1.415946125984192, "eval_logits/rejected": -1.5345262289047241, "eval_logps/chosen": -1517.072265625, "eval_logps/rejected": -1627.7440185546875, "eval_loss": 4.143941879272461, "eval_nll_loss": 0.9409931302070618, "eval_rewards/accuracies": 0.8563829660415649, "eval_rewards/chosen": 0.07163724303245544, "eval_rewards/margins": 1.1118500232696533, "eval_rewards/rejected": -1.0402127504348755, "eval_runtime": 112.3868, "eval_samples_per_second": 3.346, "eval_steps_per_second": 1.673, "step": 450 }, { "epoch": 0.6470588235294118, "grad_norm": 4.433711051940918, "kl/ref_to_policy/chosen": -24.56841278076172, "kl/ref_to_policy/mean": 42.12389373779297, "kl/ref_to_policy/rejected": 108.81619262695312, "learning_rate": 3.3646601834128924e-06, "logits/chosen": -1.6519229412078857, "logits/rejected": -1.824701189994812, "logps/chosen": -1186.244140625, "logps/rejected": -1316.4764404296875, "loss": 3.2229, "nll_loss": 0.7258384823799133, "rewards/accuracies": 0.875, "rewards/chosen": 0.24568410217761993, "rewards/margins": 1.3338459730148315, "rewards/rejected": -1.0881619453430176, "step": 451 }, { "epoch": 0.648493543758967, "grad_norm": 4.3636860847473145, "kl/ref_to_policy/chosen": -0.608184814453125, "kl/ref_to_policy/mean": 62.662418365478516, "kl/ref_to_policy/rejected": 125.93302154541016, "learning_rate": 3.3410061082512422e-06, "logits/chosen": -1.5333032608032227, "logits/rejected": -1.6694138050079346, "logps/chosen": -1592.0091552734375, "logps/rejected": -1713.046142578125, "loss": 3.6156, "nll_loss": 0.8205978870391846, "rewards/accuracies": 1.0, "rewards/chosen": 0.006081845611333847, "rewards/margins": 1.2654119729995728, "rewards/rejected": -1.259330153465271, "step": 452 }, { "epoch": 0.6499282639885222, "grad_norm": 234.9413299560547, "kl/ref_to_policy/chosen": -16.95105743408203, "kl/ref_to_policy/mean": 36.8200798034668, "kl/ref_to_policy/rejected": 90.59121704101562, "learning_rate": 3.3173936824697174e-06, "logits/chosen": -1.5385065078735352, "logits/rejected": -1.5907411575317383, "logps/chosen": -1534.4786376953125, "logps/rejected": -1639.92724609375, "loss": 5.6654, "nll_loss": 1.3175313472747803, "rewards/accuracies": 0.75, "rewards/chosen": 0.16951057314872742, "rewards/margins": 1.075422763824463, "rewards/rejected": -0.9059121012687683, "step": 453 }, { "epoch": 0.6513629842180775, "grad_norm": 3.399510622024536, "kl/ref_to_policy/chosen": -26.076181411743164, "kl/ref_to_policy/mean": 60.39512634277344, "kl/ref_to_policy/rejected": 146.86642456054688, "learning_rate": 3.293823498863087e-06, "logits/chosen": -2.0643064975738525, "logits/rejected": -2.2719857692718506, "logps/chosen": -392.205078125, "logps/rejected": -553.2745971679688, "loss": 1.4218, "nll_loss": 0.30171072483062744, "rewards/accuracies": 1.0, "rewards/chosen": 0.2607618272304535, "rewards/margins": 1.7294261455535889, "rewards/rejected": -1.4686644077301025, "step": 454 }, { "epoch": 0.6527977044476327, "grad_norm": 6.615200519561768, "kl/ref_to_policy/chosen": -11.324848175048828, "kl/ref_to_policy/mean": 44.21576690673828, "kl/ref_to_policy/rejected": 99.75637817382812, "learning_rate": 3.2702961491656197e-06, "logits/chosen": -1.3770004510879517, "logits/rejected": -1.5401452779769897, "logps/chosen": -1180.4017333984375, "logps/rejected": -1290.46923828125, "loss": 3.7807, "nll_loss": 0.8508464097976685, "rewards/accuracies": 0.9375, "rewards/chosen": 0.11324848234653473, "rewards/margins": 1.1108123064041138, "rewards/rejected": -0.9975638389587402, "step": 455 }, { "epoch": 0.6542324246771879, "grad_norm": 5.505100250244141, "kl/ref_to_policy/chosen": 1.2773332595825195, "kl/ref_to_policy/mean": 43.29863739013672, "kl/ref_to_policy/rejected": 85.31993865966797, "learning_rate": 3.2468122240362287e-06, "logits/chosen": -1.0227785110473633, "logits/rejected": -1.1505262851715088, "logps/chosen": -2354.32421875, "logps/rejected": -2443.118896484375, "loss": 5.1917, "nll_loss": 1.1841880083084106, "rewards/accuracies": 0.8125, "rewards/chosen": -0.012773338705301285, "rewards/margins": 0.840425968170166, "rewards/rejected": -0.8531993627548218, "step": 456 }, { "epoch": 0.6556671449067432, "grad_norm": 37.463829040527344, "kl/ref_to_policy/chosen": 34.547271728515625, "kl/ref_to_policy/mean": 66.81319427490234, "kl/ref_to_policy/rejected": 99.07910919189453, "learning_rate": 3.223372313043647e-06, "logits/chosen": -1.1739940643310547, "logits/rejected": -1.3233078718185425, "logps/chosen": -2017.5394287109375, "logps/rejected": -2082.9931640625, "loss": 5.1093, "nll_loss": 1.1270710229873657, "rewards/accuracies": 0.8125, "rewards/chosen": -0.3454727530479431, "rewards/margins": 0.6453183889389038, "rewards/rejected": -0.9907910823822021, "step": 457 }, { "epoch": 0.6571018651362984, "grad_norm": 5.899302005767822, "kl/ref_to_policy/chosen": -18.249897003173828, "kl/ref_to_policy/mean": 42.83144760131836, "kl/ref_to_policy/rejected": 103.91279602050781, "learning_rate": 3.1999770046516198e-06, "logits/chosen": -1.531745195388794, "logits/rejected": -1.6854267120361328, "logps/chosen": -1143.4844970703125, "logps/rejected": -1264.3931884765625, "loss": 3.6607, "nll_loss": 0.8277583718299866, "rewards/accuracies": 0.875, "rewards/chosen": 0.18249894678592682, "rewards/margins": 1.221626877784729, "rewards/rejected": -1.0391278266906738, "step": 458 }, { "epoch": 0.6585365853658537, "grad_norm": 4.8511810302734375, "kl/ref_to_policy/chosen": -5.659524917602539, "kl/ref_to_policy/mean": 55.037818908691406, "kl/ref_to_policy/rejected": 115.73515319824219, "learning_rate": 3.1766268862041406e-06, "logits/chosen": -1.5271413326263428, "logits/rejected": -1.689583659172058, "logps/chosen": -1285.426513671875, "logps/rejected": -1405.114013671875, "loss": 3.3731, "nll_loss": 0.7544063925743103, "rewards/accuracies": 0.8125, "rewards/chosen": 0.056595250964164734, "rewards/margins": 1.21394681930542, "rewards/rejected": -1.1573514938354492, "step": 459 }, { "epoch": 0.6599713055954088, "grad_norm": 900.0671997070312, "kl/ref_to_policy/chosen": -11.583290100097656, "kl/ref_to_policy/mean": 36.76082229614258, "kl/ref_to_policy/rejected": 85.10493469238281, "learning_rate": 3.1533225439106965e-06, "logits/chosen": -1.5829908847808838, "logits/rejected": -1.6584911346435547, "logps/chosen": -1291.7135009765625, "logps/rejected": -1389.6126708984375, "loss": 4.2513, "nll_loss": 0.958099901676178, "rewards/accuracies": 0.8125, "rewards/chosen": 0.11583289504051208, "rewards/margins": 0.9668822288513184, "rewards/rejected": -0.8510493040084839, "step": 460 }, { "epoch": 0.6599713055954088, "eval_kl/ref_to_policy/chosen": -6.669688701629639, "eval_kl/ref_to_policy/mean": 49.169517517089844, "eval_kl/ref_to_policy/rejected": 105.00872802734375, "eval_logits/chosen": -1.386007308959961, "eval_logits/rejected": -1.544393539428711, "eval_logps/chosen": -1517.5662841796875, "eval_logps/rejected": -1628.7313232421875, "eval_loss": 4.09207010269165, "eval_nll_loss": 0.9275339245796204, "eval_rewards/accuracies": 0.8510638475418091, "eval_rewards/chosen": 0.0666968896985054, "eval_rewards/margins": 1.1167839765548706, "eval_rewards/rejected": -1.0500870943069458, "eval_runtime": 111.7787, "eval_samples_per_second": 3.364, "eval_steps_per_second": 1.682, "step": 460 }, { "epoch": 0.6614060258249641, "grad_norm": 4.971432685852051, "kl/ref_to_policy/chosen": -20.755615234375, "kl/ref_to_policy/mean": 34.231407165527344, "kl/ref_to_policy/rejected": 89.21842956542969, "learning_rate": 3.130064562831553e-06, "logits/chosen": -1.365018606185913, "logits/rejected": -1.4906644821166992, "logps/chosen": -1910.939697265625, "logps/rejected": -2022.118896484375, "loss": 4.1854, "nll_loss": 0.9516895413398743, "rewards/accuracies": 0.875, "rewards/chosen": 0.20755614340305328, "rewards/margins": 1.0997403860092163, "rewards/rejected": -0.8921841979026794, "step": 461 }, { "epoch": 0.6628407460545194, "grad_norm": 5.390341758728027, "kl/ref_to_policy/chosen": 14.88597297668457, "kl/ref_to_policy/mean": 52.0570182800293, "kl/ref_to_policy/rejected": 89.22806549072266, "learning_rate": 3.106853526863073e-06, "logits/chosen": -0.9651604890823364, "logits/rejected": -1.073264718055725, "logps/chosen": -2418.78466796875, "logps/rejected": -2499.482177734375, "loss": 5.3657, "nll_loss": 1.2225309610366821, "rewards/accuracies": 0.8125, "rewards/chosen": -0.14885972440242767, "rewards/margins": 0.7434208393096924, "rewards/rejected": -0.8922805786132812, "step": 462 }, { "epoch": 0.6642754662840746, "grad_norm": 5.515178680419922, "kl/ref_to_policy/chosen": -8.9137544631958, "kl/ref_to_policy/mean": 34.32785415649414, "kl/ref_to_policy/rejected": 77.56946563720703, "learning_rate": 3.0836900187230475e-06, "logits/chosen": -1.1476374864578247, "logits/rejected": -1.2437094449996948, "logps/chosen": -1982.3831787109375, "logps/rejected": -2073.180908203125, "loss": 5.0695, "nll_loss": 1.1573328971862793, "rewards/accuracies": 0.875, "rewards/chosen": 0.08913755416870117, "rewards/margins": 0.8648322224617004, "rewards/rejected": -0.7756946682929993, "step": 463 }, { "epoch": 0.6657101865136298, "grad_norm": 146.64767456054688, "kl/ref_to_policy/chosen": 9.754310607910156, "kl/ref_to_policy/mean": 42.15980911254883, "kl/ref_to_policy/rejected": 74.5653076171875, "learning_rate": 3.0605746199360755e-06, "logits/chosen": -1.052375316619873, "logits/rejected": -1.179836630821228, "logps/chosen": -1811.3948974609375, "logps/rejected": -1881.3680419921875, "loss": 5.4273, "nll_loss": 1.223235845565796, "rewards/accuracies": 0.75, "rewards/chosen": -0.09754310548305511, "rewards/margins": 0.6481099724769592, "rewards/rejected": -0.7456530332565308, "step": 464 }, { "epoch": 0.667144906743185, "grad_norm": 6.6193928718566895, "kl/ref_to_policy/chosen": -21.686349868774414, "kl/ref_to_policy/mean": 38.191802978515625, "kl/ref_to_policy/rejected": 98.06995391845703, "learning_rate": 3.0375079108189613e-06, "logits/chosen": -1.6115944385528564, "logits/rejected": -1.7799557447433472, "logps/chosen": -865.9073486328125, "logps/rejected": -984.1832885742188, "loss": 3.3004, "nll_loss": 0.7339641451835632, "rewards/accuracies": 0.8125, "rewards/chosen": 0.2168634831905365, "rewards/margins": 1.1975629329681396, "rewards/rejected": -0.9806994199752808, "step": 465 }, { "epoch": 0.6685796269727403, "grad_norm": 6.3114519119262695, "kl/ref_to_policy/chosen": -19.32021713256836, "kl/ref_to_policy/mean": 47.66094970703125, "kl/ref_to_policy/rejected": 114.6421127319336, "learning_rate": 3.0144904704661413e-06, "logits/chosen": -1.5378996133804321, "logits/rejected": -1.7611557245254517, "logps/chosen": -537.5938110351562, "logps/rejected": -667.33740234375, "loss": 2.6427, "nll_loss": 0.5802564024925232, "rewards/accuracies": 0.9375, "rewards/chosen": 0.1932021677494049, "rewards/margins": 1.3396233320236206, "rewards/rejected": -1.146421194076538, "step": 466 }, { "epoch": 0.6700143472022956, "grad_norm": 6.901356220245361, "kl/ref_to_policy/chosen": -7.277001857757568, "kl/ref_to_policy/mean": 34.59872817993164, "kl/ref_to_policy/rejected": 76.47444915771484, "learning_rate": 2.991522876735154e-06, "logits/chosen": -1.089463472366333, "logits/rejected": -1.15988290309906, "logps/chosen": -1272.4964599609375, "logps/rejected": -1362.6912841796875, "loss": 4.9067, "nll_loss": 1.1130706071853638, "rewards/accuracies": 0.8125, "rewards/chosen": 0.07277002185583115, "rewards/margins": 0.8375146389007568, "rewards/rejected": -0.7647445201873779, "step": 467 }, { "epoch": 0.6714490674318508, "grad_norm": 28.171506881713867, "kl/ref_to_policy/chosen": -8.359810829162598, "kl/ref_to_policy/mean": 34.02877426147461, "kl/ref_to_policy/rejected": 76.41735076904297, "learning_rate": 2.9686057062321226e-06, "logits/chosen": -1.470019817352295, "logits/rejected": -1.5128443241119385, "logps/chosen": -1292.40576171875, "logps/rejected": -1379.6165771484375, "loss": 6.1967, "nll_loss": 1.4364336729049683, "rewards/accuracies": 0.8125, "rewards/chosen": 0.08359810709953308, "rewards/margins": 0.8477716445922852, "rewards/rejected": -0.7641735672950745, "step": 468 }, { "epoch": 0.672883787661406, "grad_norm": 6.219433307647705, "kl/ref_to_policy/chosen": -5.266351699829102, "kl/ref_to_policy/mean": 30.195634841918945, "kl/ref_to_policy/rejected": 65.65763092041016, "learning_rate": 2.9457395342972904e-06, "logits/chosen": -0.9100340008735657, "logits/rejected": -1.0132665634155273, "logps/chosen": -1670.069580078125, "logps/rejected": -1748.606689453125, "loss": 4.8658, "nll_loss": 1.0945452451705933, "rewards/accuracies": 0.875, "rewards/chosen": 0.052663519978523254, "rewards/margins": 0.7092397212982178, "rewards/rejected": -0.6565762162208557, "step": 469 }, { "epoch": 0.6743185078909613, "grad_norm": 4.700960636138916, "kl/ref_to_policy/chosen": -13.14570140838623, "kl/ref_to_policy/mean": 55.938018798828125, "kl/ref_to_policy/rejected": 125.02174377441406, "learning_rate": 2.9229249349905686e-06, "logits/chosen": -1.642045259475708, "logits/rejected": -1.8113658428192139, "logps/chosen": -1147.0731201171875, "logps/rejected": -1278.658447265625, "loss": 3.1968, "nll_loss": 0.7222731709480286, "rewards/accuracies": 0.9375, "rewards/chosen": 0.13145700097084045, "rewards/margins": 1.3816744089126587, "rewards/rejected": -1.2502175569534302, "step": 470 }, { "epoch": 0.6743185078909613, "eval_kl/ref_to_policy/chosen": -9.53772258758545, "eval_kl/ref_to_policy/mean": 45.34699249267578, "eval_kl/ref_to_policy/rejected": 100.23169708251953, "eval_logits/chosen": -1.4335790872573853, "eval_logits/rejected": -1.548112154006958, "eval_logps/chosen": -1514.6983642578125, "eval_logps/rejected": -1623.954345703125, "eval_loss": 4.159414768218994, "eval_nll_loss": 0.9442088603973389, "eval_rewards/accuracies": 0.8590425252914429, "eval_rewards/chosen": 0.09537722170352936, "eval_rewards/margins": 1.0976942777633667, "eval_rewards/rejected": -1.002316951751709, "eval_runtime": 111.9099, "eval_samples_per_second": 3.36, "eval_steps_per_second": 1.68, "step": 470 }, { "epoch": 0.6757532281205165, "grad_norm": 5.420197486877441, "kl/ref_to_policy/chosen": -12.609674453735352, "kl/ref_to_policy/mean": 42.32634735107422, "kl/ref_to_policy/rejected": 97.26237487792969, "learning_rate": 2.900162481077126e-06, "logits/chosen": -1.4062694311141968, "logits/rejected": -1.5580259561538696, "logps/chosen": -1245.687255859375, "logps/rejected": -1357.6275634765625, "loss": 3.9135, "nll_loss": 0.8806906938552856, "rewards/accuracies": 0.8125, "rewards/chosen": 0.12609674036502838, "rewards/margins": 1.0987204313278198, "rewards/rejected": -0.9726237058639526, "step": 471 }, { "epoch": 0.6771879483500718, "grad_norm": 6.745048522949219, "kl/ref_to_policy/chosen": -11.152179718017578, "kl/ref_to_policy/mean": 31.350826263427734, "kl/ref_to_policy/rejected": 73.85382843017578, "learning_rate": 2.8774527440130173e-06, "logits/chosen": -1.1674861907958984, "logits/rejected": -1.2892733812332153, "logps/chosen": -1166.1055908203125, "logps/rejected": -1255.912353515625, "loss": 4.4198, "nll_loss": 0.9924097061157227, "rewards/accuracies": 0.875, "rewards/chosen": 0.11152178794145584, "rewards/margins": 0.8500600457191467, "rewards/rejected": -0.7385382652282715, "step": 472 }, { "epoch": 0.6786226685796269, "grad_norm": 4.530329704284668, "kl/ref_to_policy/chosen": -11.923574447631836, "kl/ref_to_policy/mean": 56.00350570678711, "kl/ref_to_policy/rejected": 123.93058776855469, "learning_rate": 2.8547962939308187e-06, "logits/chosen": -1.7089204788208008, "logits/rejected": -1.9003266096115112, "logps/chosen": -884.7171630859375, "logps/rejected": -1016.4617919921875, "loss": 2.8167, "nll_loss": 0.6254026889801025, "rewards/accuracies": 0.9375, "rewards/chosen": 0.11923574656248093, "rewards/margins": 1.358541488647461, "rewards/rejected": -1.2393057346343994, "step": 473 }, { "epoch": 0.6800573888091822, "grad_norm": 5.101683616638184, "kl/ref_to_policy/chosen": -8.812845230102539, "kl/ref_to_policy/mean": 39.8847770690918, "kl/ref_to_policy/rejected": 88.58238983154297, "learning_rate": 2.8321936996253368e-06, "logits/chosen": -1.3109699487686157, "logits/rejected": -1.43738853931427, "logps/chosen": -1854.76513671875, "logps/rejected": -1954.25244140625, "loss": 4.3181, "nll_loss": 0.9759806394577026, "rewards/accuracies": 0.875, "rewards/chosen": 0.08812844753265381, "rewards/margins": 0.9739523530006409, "rewards/rejected": -0.8858239054679871, "step": 474 }, { "epoch": 0.6814921090387375, "grad_norm": 6.387237548828125, "kl/ref_to_policy/chosen": -4.434447288513184, "kl/ref_to_policy/mean": 31.196914672851562, "kl/ref_to_policy/rejected": 66.8282699584961, "learning_rate": 2.8096455285393094e-06, "logits/chosen": -0.9354070425033569, "logits/rejected": -1.0648095607757568, "logps/chosen": -2072.1279296875, "logps/rejected": -2152.734619140625, "loss": 5.3464, "nll_loss": 1.2147822380065918, "rewards/accuracies": 0.75, "rewards/chosen": 0.044344477355480194, "rewards/margins": 0.7126271724700928, "rewards/rejected": -0.6682827472686768, "step": 475 }, { "epoch": 0.6829268292682927, "grad_norm": 4.87832498550415, "kl/ref_to_policy/chosen": -0.527452826499939, "kl/ref_to_policy/mean": 62.79393768310547, "kl/ref_to_policy/rejected": 126.11532592773438, "learning_rate": 2.787152346749173e-06, "logits/chosen": -1.5003941059112549, "logits/rejected": -1.6911869049072266, "logps/chosen": -1369.1209716796875, "logps/rejected": -1490.8282470703125, "loss": 3.4827, "nll_loss": 0.7869483232498169, "rewards/accuracies": 0.9375, "rewards/chosen": 0.005274530500173569, "rewards/margins": 1.266427755355835, "rewards/rejected": -1.2611533403396606, "step": 476 }, { "epoch": 0.6843615494978479, "grad_norm": 8.790543556213379, "kl/ref_to_policy/chosen": -19.91948890686035, "kl/ref_to_policy/mean": 33.84300231933594, "kl/ref_to_policy/rejected": 87.6054916381836, "learning_rate": 2.7647147189508485e-06, "logits/chosen": -1.3552526235580444, "logits/rejected": -1.5272183418273926, "logps/chosen": -1108.71337890625, "logps/rejected": -1216.37109375, "loss": 3.8797, "nll_loss": 0.8739398717880249, "rewards/accuracies": 0.9375, "rewards/chosen": 0.19919487833976746, "rewards/margins": 1.0752499103546143, "rewards/rejected": -0.8760548830032349, "step": 477 }, { "epoch": 0.6857962697274032, "grad_norm": 30.648624420166016, "kl/ref_to_policy/chosen": -13.53242301940918, "kl/ref_to_policy/mean": 41.49503707885742, "kl/ref_to_policy/rejected": 96.52250671386719, "learning_rate": 2.7423332084455543e-06, "logits/chosen": -1.454900860786438, "logits/rejected": -1.6500945091247559, "logps/chosen": -1585.0692138671875, "logps/rejected": -1693.1417236328125, "loss": 5.9411, "nll_loss": 1.3886293172836304, "rewards/accuracies": 0.8125, "rewards/chosen": 0.13532423973083496, "rewards/margins": 1.1005492210388184, "rewards/rejected": -0.9652249813079834, "step": 478 }, { "epoch": 0.6872309899569584, "grad_norm": 19.67469024658203, "kl/ref_to_policy/chosen": -3.519893169403076, "kl/ref_to_policy/mean": 59.1533203125, "kl/ref_to_policy/rejected": 121.82653045654297, "learning_rate": 2.720008377125682e-06, "logits/chosen": -1.6749579906463623, "logits/rejected": -1.8122625350952148, "logps/chosen": -1555.9332275390625, "logps/rejected": -1676.3323974609375, "loss": 3.0646, "nll_loss": 0.6817018389701843, "rewards/accuracies": 0.875, "rewards/chosen": 0.03519892692565918, "rewards/margins": 1.2534642219543457, "rewards/rejected": -1.2182652950286865, "step": 479 }, { "epoch": 0.6886657101865137, "grad_norm": 5.181328296661377, "kl/ref_to_policy/chosen": -11.498885154724121, "kl/ref_to_policy/mean": 43.72309875488281, "kl/ref_to_policy/rejected": 98.94508361816406, "learning_rate": 2.697740785460675e-06, "logits/chosen": -1.1936097145080566, "logits/rejected": -1.4307589530944824, "logps/chosen": -1323.3839111328125, "logps/rejected": -1434.439208984375, "loss": 4.0557, "nll_loss": 0.9180054664611816, "rewards/accuracies": 0.875, "rewards/chosen": 0.11498884856700897, "rewards/margins": 1.1044394969940186, "rewards/rejected": -0.9894508123397827, "step": 480 }, { "epoch": 0.6886657101865137, "eval_kl/ref_to_policy/chosen": -10.332962036132812, "eval_kl/ref_to_policy/mean": 44.97636032104492, "eval_kl/ref_to_policy/rejected": 100.28567504882812, "eval_logits/chosen": -1.3632856607437134, "eval_logits/rejected": -1.578959584236145, "eval_logps/chosen": -1513.903076171875, "eval_logps/rejected": -1624.0081787109375, "eval_loss": 4.149866104125977, "eval_nll_loss": 0.9419829845428467, "eval_rewards/accuracies": 0.8590425252914429, "eval_rewards/chosen": 0.1033296212553978, "eval_rewards/margins": 1.1061863899230957, "eval_rewards/rejected": -1.0028568506240845, "eval_runtime": 111.3017, "eval_samples_per_second": 3.378, "eval_steps_per_second": 1.689, "step": 480 }, { "epoch": 0.6901004304160688, "grad_norm": 5.50925350189209, "kl/ref_to_policy/chosen": -5.572969436645508, "kl/ref_to_policy/mean": 42.42257308959961, "kl/ref_to_policy/rejected": 90.4181137084961, "learning_rate": 2.6755309924829657e-06, "logits/chosen": -1.1705896854400635, "logits/rejected": -1.3993726968765259, "logps/chosen": -1847.098876953125, "logps/rejected": -1948.539306640625, "loss": 4.6917, "nll_loss": 1.0657145977020264, "rewards/accuracies": 0.75, "rewards/chosen": 0.055729687213897705, "rewards/margins": 0.9599108695983887, "rewards/rejected": -0.904181182384491, "step": 481 }, { "epoch": 0.6915351506456241, "grad_norm": 5.369027137756348, "kl/ref_to_policy/chosen": -18.482486724853516, "kl/ref_to_policy/mean": 31.104782104492188, "kl/ref_to_policy/rejected": 80.69204711914062, "learning_rate": 2.6533795557739407e-06, "logits/chosen": -1.2778416872024536, "logits/rejected": -1.478403091430664, "logps/chosen": -1693.556640625, "logps/rejected": -1794.2818603515625, "loss": 4.4977, "nll_loss": 1.0220686197280884, "rewards/accuracies": 0.9375, "rewards/chosen": 0.1848248541355133, "rewards/margins": 0.9917453527450562, "rewards/rejected": -0.8069204688072205, "step": 482 }, { "epoch": 0.6929698708751794, "grad_norm": 5.019474983215332, "kl/ref_to_policy/chosen": -5.629670143127441, "kl/ref_to_policy/mean": 54.89984130859375, "kl/ref_to_policy/rejected": 115.42935180664062, "learning_rate": 2.6312870314499335e-06, "logits/chosen": -1.399906873703003, "logits/rejected": -1.6722991466522217, "logps/chosen": -1219.034423828125, "logps/rejected": -1340.3043212890625, "loss": 3.5289, "nll_loss": 0.7924644947052002, "rewards/accuracies": 0.8125, "rewards/chosen": 0.0562966987490654, "rewards/margins": 1.2105903625488281, "rewards/rejected": -1.1542936563491821, "step": 483 }, { "epoch": 0.6944045911047346, "grad_norm": 4.942447185516357, "kl/ref_to_policy/chosen": 7.972014427185059, "kl/ref_to_policy/mean": 70.8379135131836, "kl/ref_to_policy/rejected": 133.7038116455078, "learning_rate": 2.609253974148278e-06, "logits/chosen": -1.396716594696045, "logits/rejected": -1.6911447048187256, "logps/chosen": -1461.61279296875, "logps/rejected": -1582.840576171875, "loss": 3.6149, "nll_loss": 0.8186135292053223, "rewards/accuracies": 0.9375, "rewards/chosen": -0.079720139503479, "rewards/margins": 1.2573180198669434, "rewards/rejected": -1.3370380401611328, "step": 484 }, { "epoch": 0.6958393113342898, "grad_norm": 2.8762168884277344, "kl/ref_to_policy/chosen": -20.006664276123047, "kl/ref_to_policy/mean": 59.731510162353516, "kl/ref_to_policy/rejected": 139.4696807861328, "learning_rate": 2.5872809370133704e-06, "logits/chosen": -1.688684344291687, "logits/rejected": -2.1474950313568115, "logps/chosen": -1109.602783203125, "logps/rejected": -1261.7314453125, "loss": 2.0969, "nll_loss": 0.46065402030944824, "rewards/accuracies": 0.875, "rewards/chosen": 0.20006662607192993, "rewards/margins": 1.5947633981704712, "rewards/rejected": -1.394696593284607, "step": 485 }, { "epoch": 0.697274031563845, "grad_norm": 4.558497428894043, "kl/ref_to_policy/chosen": -4.0993452072143555, "kl/ref_to_policy/mean": 52.34306335449219, "kl/ref_to_policy/rejected": 108.78547668457031, "learning_rate": 2.5653684716827904e-06, "logits/chosen": -1.2076250314712524, "logits/rejected": -1.5147953033447266, "logps/chosen": -1937.408447265625, "logps/rejected": -2048.40673828125, "loss": 4.1939, "nll_loss": 0.9549069404602051, "rewards/accuracies": 0.875, "rewards/chosen": 0.04099345952272415, "rewards/margins": 1.1288481950759888, "rewards/rejected": -1.0878548622131348, "step": 486 }, { "epoch": 0.6987087517934003, "grad_norm": 7.842737674713135, "kl/ref_to_policy/chosen": -26.469486236572266, "kl/ref_to_policy/mean": 14.805898666381836, "kl/ref_to_policy/rejected": 56.08128356933594, "learning_rate": 2.5435171282734563e-06, "logits/chosen": -1.0935803651809692, "logits/rejected": -1.3310425281524658, "logps/chosen": -1619.382080078125, "logps/rejected": -1707.7608642578125, "loss": 4.5224, "nll_loss": 1.0147616863250732, "rewards/accuracies": 0.75, "rewards/chosen": 0.26469483971595764, "rewards/margins": 0.825507640838623, "rewards/rejected": -0.5608128309249878, "step": 487 }, { "epoch": 0.7001434720229556, "grad_norm": 4.921175003051758, "kl/ref_to_policy/chosen": 4.4043498039245605, "kl/ref_to_policy/mean": 59.703208923339844, "kl/ref_to_policy/rejected": 115.00206756591797, "learning_rate": 2.5217274553677975e-06, "logits/chosen": -1.3472169637680054, "logits/rejected": -1.6538087129592896, "logps/chosen": -1868.701171875, "logps/rejected": -1979.349609375, "loss": 3.9393, "nll_loss": 0.8879867792129517, "rewards/accuracies": 0.75, "rewards/chosen": -0.044043488800525665, "rewards/margins": 1.1059770584106445, "rewards/rejected": -1.1500205993652344, "step": 488 }, { "epoch": 0.7015781922525107, "grad_norm": 5.937678337097168, "kl/ref_to_policy/chosen": -3.7295303344726562, "kl/ref_to_policy/mean": 45.989864349365234, "kl/ref_to_policy/rejected": 95.70925903320312, "learning_rate": 2.5000000000000015e-06, "logits/chosen": -1.0600227117538452, "logits/rejected": -1.310704231262207, "logps/chosen": -1547.4422607421875, "logps/rejected": -1648.9222412109375, "loss": 4.6198, "nll_loss": 1.0531233549118042, "rewards/accuracies": 0.9375, "rewards/chosen": 0.03729529678821564, "rewards/margins": 0.9943878650665283, "rewards/rejected": -0.9570926427841187, "step": 489 }, { "epoch": 0.703012912482066, "grad_norm": 5.825986385345459, "kl/ref_to_policy/chosen": -3.4904263019561768, "kl/ref_to_policy/mean": 51.687156677246094, "kl/ref_to_policy/rejected": 106.86474609375, "learning_rate": 2.478335307642264e-06, "logits/chosen": -1.3439878225326538, "logits/rejected": -1.627063274383545, "logps/chosen": -1297.6737060546875, "logps/rejected": -1409.34521484375, "loss": 3.8555, "nll_loss": 0.8668357133865356, "rewards/accuracies": 0.8125, "rewards/chosen": 0.034904249012470245, "rewards/margins": 1.1035516262054443, "rewards/rejected": -1.0686473846435547, "step": 490 }, { "epoch": 0.703012912482066, "eval_kl/ref_to_policy/chosen": -10.640630722045898, "eval_kl/ref_to_policy/mean": 45.24263381958008, "eval_kl/ref_to_policy/rejected": 101.12590026855469, "eval_logits/chosen": -1.3710262775421143, "eval_logits/rejected": -1.6262575387954712, "eval_logps/chosen": -1513.595458984375, "eval_logps/rejected": -1624.848388671875, "eval_loss": 4.142261981964111, "eval_nll_loss": 0.9399645924568176, "eval_rewards/accuracies": 0.8510638475418091, "eval_rewards/chosen": 0.1064063012599945, "eval_rewards/margins": 1.117665410041809, "eval_rewards/rejected": -1.0112590789794922, "eval_runtime": 111.5455, "eval_samples_per_second": 3.371, "eval_steps_per_second": 1.685, "step": 490 }, { "epoch": 0.7044476327116213, "grad_norm": 6.093170166015625, "kl/ref_to_policy/chosen": -4.395424842834473, "kl/ref_to_policy/mean": 30.03952407836914, "kl/ref_to_policy/rejected": 64.47447204589844, "learning_rate": 2.4567339221911086e-06, "logits/chosen": -0.9565767645835876, "logits/rejected": -1.08722984790802, "logps/chosen": -1950.3389892578125, "logps/rejected": -2030.2733154296875, "loss": 5.6347, "nll_loss": 1.2847232818603516, "rewards/accuracies": 0.6875, "rewards/chosen": 0.04395424574613571, "rewards/margins": 0.6886990070343018, "rewards/rejected": -0.6447446942329407, "step": 491 }, { "epoch": 0.7058823529411765, "grad_norm": 5.094422340393066, "kl/ref_to_policy/chosen": 12.528589248657227, "kl/ref_to_policy/mean": 61.91465759277344, "kl/ref_to_policy/rejected": 111.30072784423828, "learning_rate": 2.435196385953727e-06, "logits/chosen": -1.1953191757202148, "logits/rejected": -1.4215575456619263, "logps/chosen": -2246.81005859375, "logps/rejected": -2348.626953125, "loss": 4.621, "nll_loss": 1.0516688823699951, "rewards/accuracies": 0.75, "rewards/chosen": -0.12528586387634277, "rewards/margins": 0.9877214431762695, "rewards/rejected": -1.1130073070526123, "step": 492 }, { "epoch": 0.7073170731707317, "grad_norm": 7.969398498535156, "kl/ref_to_policy/chosen": -18.160842895507812, "kl/ref_to_policy/mean": 42.874454498291016, "kl/ref_to_policy/rejected": 103.90974426269531, "learning_rate": 2.413723239634356e-06, "logits/chosen": -1.504673957824707, "logits/rejected": -1.7745985984802246, "logps/chosen": -914.2844848632812, "logps/rejected": -1034.62841796875, "loss": 2.9513, "nll_loss": 0.6484355926513672, "rewards/accuracies": 0.875, "rewards/chosen": 0.1816084235906601, "rewards/margins": 1.2207059860229492, "rewards/rejected": -1.0390974283218384, "step": 493 }, { "epoch": 0.7087517934002869, "grad_norm": 10.517597198486328, "kl/ref_to_policy/chosen": -5.934850692749023, "kl/ref_to_policy/mean": 22.16928482055664, "kl/ref_to_policy/rejected": 50.27342224121094, "learning_rate": 2.3923150223207176e-06, "logits/chosen": -0.9117950797080994, "logits/rejected": -1.0267395973205566, "logps/chosen": -2137.182373046875, "logps/rejected": -2205.98046875, "loss": 6.0942, "nll_loss": 1.390339970588684, "rewards/accuracies": 0.75, "rewards/chosen": 0.059348512440919876, "rewards/margins": 0.5620827078819275, "rewards/rejected": -0.5027341842651367, "step": 494 }, { "epoch": 0.7101865136298422, "grad_norm": 112.20967102050781, "kl/ref_to_policy/chosen": -0.7529566287994385, "kl/ref_to_policy/mean": 47.62729263305664, "kl/ref_to_policy/rejected": 96.00753784179688, "learning_rate": 2.370972271470475e-06, "logits/chosen": -1.4364676475524902, "logits/rejected": -1.6079751253128052, "logps/chosen": -1380.2470703125, "logps/rejected": -1478.2672119140625, "loss": 4.1353, "nll_loss": 0.9309688806533813, "rewards/accuracies": 0.9375, "rewards/chosen": 0.007529575377702713, "rewards/margins": 0.9676049947738647, "rewards/rejected": -0.9600753784179688, "step": 495 }, { "epoch": 0.7116212338593975, "grad_norm": 4.6165008544921875, "kl/ref_to_policy/chosen": -11.840313911437988, "kl/ref_to_policy/mean": 44.252384185791016, "kl/ref_to_policy/rejected": 100.34508514404297, "learning_rate": 2.3496955228977437e-06, "logits/chosen": -1.4672824144363403, "logits/rejected": -1.6944730281829834, "logps/chosen": -1614.0582275390625, "logps/rejected": -1726.468017578125, "loss": 3.8473, "nll_loss": 0.8666391372680664, "rewards/accuracies": 0.8125, "rewards/chosen": 0.1184031292796135, "rewards/margins": 1.1218539476394653, "rewards/rejected": -1.003450870513916, "step": 496 }, { "epoch": 0.7130559540889526, "grad_norm": 4.735752105712891, "kl/ref_to_policy/chosen": -20.592870712280273, "kl/ref_to_policy/mean": 34.504554748535156, "kl/ref_to_policy/rejected": 89.60197448730469, "learning_rate": 2.328485310759635e-06, "logits/chosen": -1.4709446430206299, "logits/rejected": -1.6772103309631348, "logps/chosen": -1662.5634765625, "logps/rejected": -1772.871337890625, "loss": 3.8627, "nll_loss": 0.8702866435050964, "rewards/accuracies": 0.875, "rewards/chosen": 0.2059287130832672, "rewards/margins": 1.1019483804702759, "rewards/rejected": -0.8960197567939758, "step": 497 }, { "epoch": 0.7144906743185079, "grad_norm": 5.701773166656494, "kl/ref_to_policy/chosen": -10.443414688110352, "kl/ref_to_policy/mean": 45.748512268066406, "kl/ref_to_policy/rejected": 101.94044494628906, "learning_rate": 2.307342167542854e-06, "logits/chosen": -1.5278314352035522, "logits/rejected": -1.727997064590454, "logps/chosen": -1541.189453125, "logps/rejected": -1651.8328857421875, "loss": 3.8169, "nll_loss": 0.8621950149536133, "rewards/accuracies": 0.9375, "rewards/chosen": 0.10443414002656937, "rewards/margins": 1.1238386631011963, "rewards/rejected": -1.019404411315918, "step": 498 }, { "epoch": 0.7159253945480631, "grad_norm": 5.502771854400635, "kl/ref_to_policy/chosen": -11.967199325561523, "kl/ref_to_policy/mean": 37.99469757080078, "kl/ref_to_policy/rejected": 87.95658874511719, "learning_rate": 2.286266624050326e-06, "logits/chosen": -1.2199686765670776, "logits/rejected": -1.4388612508773804, "logps/chosen": -1716.0718994140625, "logps/rejected": -1817.68310546875, "loss": 4.584, "nll_loss": 1.0443775653839111, "rewards/accuracies": 0.9375, "rewards/chosen": 0.11967199295759201, "rewards/margins": 0.9992378354072571, "rewards/rejected": -0.8795658349990845, "step": 499 }, { "epoch": 0.7173601147776184, "grad_norm": 8.042586326599121, "kl/ref_to_policy/chosen": -0.31061553955078125, "kl/ref_to_policy/mean": 37.175113677978516, "kl/ref_to_policy/rejected": 74.66083526611328, "learning_rate": 2.265259209387867e-06, "logits/chosen": -0.9562553763389587, "logits/rejected": -1.1253788471221924, "logps/chosen": -1844.6424560546875, "logps/rejected": -1925.778076171875, "loss": 5.3917, "nll_loss": 1.229865312576294, "rewards/accuracies": 0.875, "rewards/chosen": 0.0031061600893735886, "rewards/margins": 0.7497144937515259, "rewards/rejected": -0.7466082572937012, "step": 500 }, { "epoch": 0.7173601147776184, "eval_kl/ref_to_policy/chosen": -12.07337760925293, "eval_kl/ref_to_policy/mean": 44.288875579833984, "eval_kl/ref_to_policy/rejected": 100.65113067626953, "eval_logits/chosen": -1.4184246063232422, "eval_logits/rejected": -1.6243008375167847, "eval_logps/chosen": -1512.16259765625, "eval_logps/rejected": -1624.3736572265625, "eval_loss": 4.148153781890869, "eval_nll_loss": 0.9430283904075623, "eval_rewards/accuracies": 0.8643617033958435, "eval_rewards/chosen": 0.12073376029729843, "eval_rewards/margins": 1.1272449493408203, "eval_rewards/rejected": -1.0065112113952637, "eval_runtime": 112.0546, "eval_samples_per_second": 3.356, "eval_steps_per_second": 1.678, "step": 500 }, { "epoch": 0.7187948350071736, "grad_norm": 5.533142566680908, "kl/ref_to_policy/chosen": -24.58734893798828, "kl/ref_to_policy/mean": 49.462955474853516, "kl/ref_to_policy/rejected": 123.51325225830078, "learning_rate": 2.2443204509509094e-06, "logits/chosen": -1.77785325050354, "logits/rejected": -2.036353349685669, "logps/chosen": -820.24072265625, "logps/rejected": -961.13623046875, "loss": 2.4846, "nll_loss": 0.5513791441917419, "rewards/accuracies": 0.875, "rewards/chosen": 0.24587351083755493, "rewards/margins": 1.4810060262680054, "rewards/rejected": -1.2351324558258057, "step": 501 }, { "epoch": 0.7202295552367288, "grad_norm": 5.7426838874816895, "kl/ref_to_policy/chosen": -14.552848815917969, "kl/ref_to_policy/mean": 35.944786071777344, "kl/ref_to_policy/rejected": 86.44241333007812, "learning_rate": 2.2234508744112564e-06, "logits/chosen": -1.2500426769256592, "logits/rejected": -1.451236605644226, "logps/chosen": -1719.3377685546875, "logps/rejected": -1821.5733642578125, "loss": 4.7416, "nll_loss": 1.0843920707702637, "rewards/accuracies": 0.875, "rewards/chosen": 0.14552848041057587, "rewards/margins": 1.0099525451660156, "rewards/rejected": -0.8644241690635681, "step": 502 }, { "epoch": 0.7216642754662841, "grad_norm": 13.424676895141602, "kl/ref_to_policy/chosen": -21.440908432006836, "kl/ref_to_policy/mean": 31.1007080078125, "kl/ref_to_policy/rejected": 83.64231872558594, "learning_rate": 2.202651003703885e-06, "logits/chosen": -1.4129269123077393, "logits/rejected": -1.5471047163009644, "logps/chosen": -1445.57470703125, "logps/rejected": -1554.302490234375, "loss": 3.895, "nll_loss": 0.8743141293525696, "rewards/accuracies": 0.75, "rewards/chosen": 0.21440908312797546, "rewards/margins": 1.0508322715759277, "rewards/rejected": -0.8364231586456299, "step": 503 }, { "epoch": 0.7230989956958394, "grad_norm": 9.040212631225586, "kl/ref_to_policy/chosen": 0.7662220001220703, "kl/ref_to_policy/mean": 50.567222595214844, "kl/ref_to_policy/rejected": 100.36822509765625, "learning_rate": 2.181921361013794e-06, "logits/chosen": -1.313890814781189, "logits/rejected": -1.609748363494873, "logps/chosen": -1259.9381103515625, "logps/rejected": -1359.5068359375, "loss": 4.1103, "nll_loss": 0.9099230766296387, "rewards/accuracies": 0.875, "rewards/chosen": -0.007662218064069748, "rewards/margins": 0.9960199594497681, "rewards/rejected": -1.003682255744934, "step": 504 }, { "epoch": 0.7245337159253945, "grad_norm": 6.519081115722656, "kl/ref_to_policy/chosen": -18.771629333496094, "kl/ref_to_policy/mean": 42.90791702270508, "kl/ref_to_policy/rejected": 104.58747100830078, "learning_rate": 2.16126246676289e-06, "logits/chosen": -1.5587403774261475, "logits/rejected": -1.8224290609359741, "logps/chosen": -843.2205810546875, "logps/rejected": -964.4547119140625, "loss": 3.1765, "nll_loss": 0.7066909670829773, "rewards/accuracies": 0.9375, "rewards/chosen": 0.1877162903547287, "rewards/margins": 1.2335909605026245, "rewards/rejected": -1.0458747148513794, "step": 505 }, { "epoch": 0.7259684361549498, "grad_norm": 4.1061859130859375, "kl/ref_to_policy/chosen": -26.659786224365234, "kl/ref_to_policy/mean": 54.89186477661133, "kl/ref_to_policy/rejected": 136.4435272216797, "learning_rate": 2.140674839596931e-06, "logits/chosen": -1.8138046264648438, "logits/rejected": -2.2253878116607666, "logps/chosen": -759.1840209960938, "logps/rejected": -912.4747924804688, "loss": 2.3163, "nll_loss": 0.5190324783325195, "rewards/accuracies": 0.9375, "rewards/chosen": 0.26659783720970154, "rewards/margins": 1.6310330629348755, "rewards/rejected": -1.3644351959228516, "step": 506 }, { "epoch": 0.727403156384505, "grad_norm": 182.78001403808594, "kl/ref_to_policy/chosen": -17.197832107543945, "kl/ref_to_policy/mean": 57.00033950805664, "kl/ref_to_policy/rejected": 131.19850158691406, "learning_rate": 2.1201589963724933e-06, "logits/chosen": -1.9041879177093506, "logits/rejected": -2.24188494682312, "logps/chosen": -1157.8177490234375, "logps/rejected": -1297.942626953125, "loss": 3.8039, "nll_loss": 0.8791078329086304, "rewards/accuracies": 0.75, "rewards/chosen": 0.17197832465171814, "rewards/margins": 1.4839634895324707, "rewards/rejected": -1.3119851350784302, "step": 507 }, { "epoch": 0.7288378766140603, "grad_norm": 94.83061218261719, "kl/ref_to_policy/chosen": -1.8500261306762695, "kl/ref_to_policy/mean": 52.1847038269043, "kl/ref_to_policy/rejected": 106.21942901611328, "learning_rate": 2.09971545214401e-06, "logits/chosen": -1.2867578268051147, "logits/rejected": -1.5363314151763916, "logps/chosen": -1702.11865234375, "logps/rejected": -1808.3111572265625, "loss": 4.0277, "nll_loss": 0.9045674800872803, "rewards/accuracies": 0.875, "rewards/chosen": 0.018500253558158875, "rewards/margins": 1.0806944370269775, "rewards/rejected": -1.0621942281723022, "step": 508 }, { "epoch": 0.7302725968436155, "grad_norm": 5.981005668640137, "kl/ref_to_policy/chosen": -15.369230270385742, "kl/ref_to_policy/mean": 27.826129913330078, "kl/ref_to_policy/rejected": 71.021484375, "learning_rate": 2.0793447201508288e-06, "logits/chosen": -1.2343388795852661, "logits/rejected": -1.4072051048278809, "logps/chosen": -1843.2713623046875, "logps/rejected": -1934.9744873046875, "loss": 4.9771, "nll_loss": 1.1335108280181885, "rewards/accuracies": 0.875, "rewards/chosen": 0.1536923050880432, "rewards/margins": 0.8639071583747864, "rewards/rejected": -0.7102148532867432, "step": 509 }, { "epoch": 0.7317073170731707, "grad_norm": 4.973477363586426, "kl/ref_to_policy/chosen": -24.832992553710938, "kl/ref_to_policy/mean": 38.82828140258789, "kl/ref_to_policy/rejected": 102.48955535888672, "learning_rate": 2.0590473118043326e-06, "logits/chosen": -1.5021836757659912, "logits/rejected": -1.7391595840454102, "logps/chosen": -1212.1749267578125, "logps/rejected": -1334.866455078125, "loss": 3.4674, "nll_loss": 0.7829406261444092, "rewards/accuracies": 1.0, "rewards/chosen": 0.24832990765571594, "rewards/margins": 1.2732254266738892, "rewards/rejected": -1.024895429611206, "step": 510 }, { "epoch": 0.7317073170731707, "eval_kl/ref_to_policy/chosen": -11.910983085632324, "eval_kl/ref_to_policy/mean": 44.386878967285156, "eval_kl/ref_to_policy/rejected": 100.68474578857422, "eval_logits/chosen": -1.3484349250793457, "eval_logits/rejected": -1.6048318147659302, "eval_logps/chosen": -1512.3250732421875, "eval_logps/rejected": -1624.407470703125, "eval_loss": 4.140196323394775, "eval_nll_loss": 0.9396961331367493, "eval_rewards/accuracies": 0.8617021441459656, "eval_rewards/chosen": 0.11910983175039291, "eval_rewards/margins": 1.1259572505950928, "eval_rewards/rejected": -1.0068473815917969, "eval_runtime": 110.0087, "eval_samples_per_second": 3.418, "eval_steps_per_second": 1.709, "step": 510 }, { "epoch": 0.733142037302726, "grad_norm": 98.67194366455078, "kl/ref_to_policy/chosen": 3.817441940307617, "kl/ref_to_policy/mean": 42.331478118896484, "kl/ref_to_policy/rejected": 80.84551239013672, "learning_rate": 2.0388237366751005e-06, "logits/chosen": -1.0594611167907715, "logits/rejected": -1.2171419858932495, "logps/chosen": -1558.5830078125, "logps/rejected": -1639.1024169921875, "loss": 5.175, "nll_loss": 1.1648898124694824, "rewards/accuracies": 0.9375, "rewards/chosen": -0.03817442059516907, "rewards/margins": 0.7702807188034058, "rewards/rejected": -0.8084551095962524, "step": 511 }, { "epoch": 0.7345767575322812, "grad_norm": 3.171644449234009, "kl/ref_to_policy/chosen": -18.931854248046875, "kl/ref_to_policy/mean": 61.68438720703125, "kl/ref_to_policy/rejected": 142.3006134033203, "learning_rate": 2.01867450248011e-06, "logits/chosen": -1.880043625831604, "logits/rejected": -2.2824454307556152, "logps/chosen": -674.6384887695312, "logps/rejected": -828.140380859375, "loss": 1.8611, "nll_loss": 0.40248847007751465, "rewards/accuracies": 0.875, "rewards/chosen": 0.18931853771209717, "rewards/margins": 1.6123247146606445, "rewards/rejected": -1.4230061769485474, "step": 512 }, { "epoch": 0.7360114777618364, "grad_norm": 6.042587757110596, "kl/ref_to_policy/chosen": -11.825763702392578, "kl/ref_to_policy/mean": 43.96443176269531, "kl/ref_to_policy/rejected": 99.75462341308594, "learning_rate": 1.998600115069998e-06, "logits/chosen": -1.3038644790649414, "logits/rejected": -1.5255067348480225, "logps/chosen": -999.0882568359375, "logps/rejected": -1111.838623046875, "loss": 3.776, "nll_loss": 0.8486377000808716, "rewards/accuracies": 0.9375, "rewards/chosen": 0.11825762689113617, "rewards/margins": 1.115803837776184, "rewards/rejected": -0.9975461959838867, "step": 513 }, { "epoch": 0.7374461979913917, "grad_norm": 5.06346321105957, "kl/ref_to_policy/chosen": -16.80782127380371, "kl/ref_to_policy/mean": 26.3073673248291, "kl/ref_to_policy/rejected": 69.42255401611328, "learning_rate": 1.978601078416357e-06, "logits/chosen": -1.1360232830047607, "logits/rejected": -1.3310678005218506, "logps/chosen": -1898.9344482421875, "logps/rejected": -1991.4237060546875, "loss": 4.8823, "nll_loss": 1.1085320711135864, "rewards/accuracies": 0.8125, "rewards/chosen": 0.16807821393013, "rewards/margins": 0.8623036742210388, "rewards/rejected": -0.6942254304885864, "step": 514 }, { "epoch": 0.7388809182209469, "grad_norm": 5.183386325836182, "kl/ref_to_policy/chosen": -1.0298519134521484, "kl/ref_to_policy/mean": 49.183231353759766, "kl/ref_to_policy/rejected": 99.39631652832031, "learning_rate": 1.9586778945990785e-06, "logits/chosen": -1.1221206188201904, "logits/rejected": -1.3639217615127563, "logps/chosen": -1504.4857177734375, "logps/rejected": -1607.257568359375, "loss": 4.2443, "nll_loss": 0.9584910869598389, "rewards/accuracies": 0.9375, "rewards/chosen": 0.010298512876033783, "rewards/margins": 1.0042616128921509, "rewards/rejected": -0.9939632415771484, "step": 515 }, { "epoch": 0.7403156384505022, "grad_norm": 4.5829668045043945, "kl/ref_to_policy/chosen": -9.165569305419922, "kl/ref_to_policy/mean": 53.219669342041016, "kl/ref_to_policy/rejected": 115.60490417480469, "learning_rate": 1.9388310637937606e-06, "logits/chosen": -1.4025002717971802, "logits/rejected": -1.718590497970581, "logps/chosen": -1530.7174072265625, "logps/rejected": -1653.81982421875, "loss": 3.5713, "nll_loss": 0.8061051964759827, "rewards/accuracies": 0.8125, "rewards/chosen": 0.0916556864976883, "rewards/margins": 1.2477047443389893, "rewards/rejected": -1.1560490131378174, "step": 516 }, { "epoch": 0.7417503586800573, "grad_norm": 76.23786163330078, "kl/ref_to_policy/chosen": -4.277089595794678, "kl/ref_to_policy/mean": 34.50190353393555, "kl/ref_to_policy/rejected": 73.28089904785156, "learning_rate": 1.9190610842591386e-06, "logits/chosen": -1.0691708326339722, "logits/rejected": -1.2478785514831543, "logps/chosen": -1715.7620849609375, "logps/rejected": -1797.958740234375, "loss": 5.3117, "nll_loss": 1.2011919021606445, "rewards/accuracies": 0.875, "rewards/chosen": 0.042770884931087494, "rewards/margins": 0.7755798101425171, "rewards/rejected": -0.7328089475631714, "step": 517 }, { "epoch": 0.7431850789096126, "grad_norm": 7.383810520172119, "kl/ref_to_policy/chosen": 0.46415138244628906, "kl/ref_to_policy/mean": 43.13538360595703, "kl/ref_to_policy/rejected": 85.80661010742188, "learning_rate": 1.8993684523245842e-06, "logits/chosen": -1.1695936918258667, "logits/rejected": -1.3085628747940063, "logps/chosen": -1537.214111328125, "logps/rejected": -1626.89453125, "loss": 4.6077, "nll_loss": 1.0409454107284546, "rewards/accuracies": 0.9375, "rewards/chosen": -0.004641521722078323, "rewards/margins": 0.853424608707428, "rewards/rejected": -0.8580661416053772, "step": 518 }, { "epoch": 0.7446197991391679, "grad_norm": 4.99336051940918, "kl/ref_to_policy/chosen": -0.8871443271636963, "kl/ref_to_policy/mean": 47.52973556518555, "kl/ref_to_policy/rejected": 95.94660186767578, "learning_rate": 1.879753662377637e-06, "logits/chosen": -1.2079107761383057, "logits/rejected": -1.4480524063110352, "logps/chosen": -1833.7926025390625, "logps/rejected": -1936.579345703125, "loss": 4.606, "nll_loss": 1.0463061332702637, "rewards/accuracies": 0.75, "rewards/chosen": 0.008871445432305336, "rewards/margins": 0.9683375358581543, "rewards/rejected": -0.9594659805297852, "step": 519 }, { "epoch": 0.7460545193687231, "grad_norm": 4.207171440124512, "kl/ref_to_policy/chosen": -21.04468536376953, "kl/ref_to_policy/mean": 47.20522689819336, "kl/ref_to_policy/rejected": 115.45513916015625, "learning_rate": 1.8602172068516011e-06, "logits/chosen": -1.689624309539795, "logits/rejected": -1.981498122215271, "logps/chosen": -1066.400634765625, "logps/rejected": -1199.4493408203125, "loss": 2.9054, "nll_loss": 0.6482254266738892, "rewards/accuracies": 0.875, "rewards/chosen": 0.21044686436653137, "rewards/margins": 1.3649982213974, "rewards/rejected": -1.154551386833191, "step": 520 }, { "epoch": 0.7460545193687231, "eval_kl/ref_to_policy/chosen": -13.713561058044434, "eval_kl/ref_to_policy/mean": 42.28884506225586, "eval_kl/ref_to_policy/rejected": 98.29124450683594, "eval_logits/chosen": -1.4079235792160034, "eval_logits/rejected": -1.6074658632278442, "eval_logps/chosen": -1510.5224609375, "eval_logps/rejected": -1622.013916015625, "eval_loss": 4.193199634552002, "eval_nll_loss": 0.9537328481674194, "eval_rewards/accuracies": 0.8723404407501221, "eval_rewards/chosen": 0.1371356099843979, "eval_rewards/margins": 1.12004816532135, "eval_rewards/rejected": -0.9829124808311462, "eval_runtime": 110.3021, "eval_samples_per_second": 3.409, "eval_steps_per_second": 1.704, "step": 520 }, { "epoch": 0.7474892395982783, "grad_norm": 5.772912979125977, "kl/ref_to_policy/chosen": -10.01593017578125, "kl/ref_to_policy/mean": 38.69423294067383, "kl/ref_to_policy/rejected": 87.4043960571289, "learning_rate": 1.8407595762131814e-06, "logits/chosen": -1.2288503646850586, "logits/rejected": -1.4201205968856812, "logps/chosen": -1533.6246337890625, "logps/rejected": -1635.712890625, "loss": 3.7635, "nll_loss": 0.8358808755874634, "rewards/accuracies": 0.8125, "rewards/chosen": 0.10015930235385895, "rewards/margins": 0.9742032289505005, "rewards/rejected": -0.874043881893158, "step": 521 }, { "epoch": 0.7489239598278336, "grad_norm": 5.085772514343262, "kl/ref_to_policy/chosen": -5.203364372253418, "kl/ref_to_policy/mean": 51.21775817871094, "kl/ref_to_policy/rejected": 107.63888549804688, "learning_rate": 1.8213812589501611e-06, "logits/chosen": -1.3918474912643433, "logits/rejected": -1.623002052307129, "logps/chosen": -1545.5545654296875, "logps/rejected": -1658.5274658203125, "loss": 3.9719, "nll_loss": 0.8983528017997742, "rewards/accuracies": 0.875, "rewards/chosen": 0.05203364044427872, "rewards/margins": 1.1284223794937134, "rewards/rejected": -1.0763887166976929, "step": 522 }, { "epoch": 0.7503586800573888, "grad_norm": 6.8691277503967285, "kl/ref_to_policy/chosen": -12.082393646240234, "kl/ref_to_policy/mean": 44.35764694213867, "kl/ref_to_policy/rejected": 100.79768371582031, "learning_rate": 1.8020827415591496e-06, "logits/chosen": -1.307745337486267, "logits/rejected": -1.5293015241622925, "logps/chosen": -1297.2222900390625, "logps/rejected": -1410.0965576171875, "loss": 3.6361, "nll_loss": 0.8148936629295349, "rewards/accuracies": 0.9375, "rewards/chosen": 0.120823934674263, "rewards/margins": 1.1288007497787476, "rewards/rejected": -1.007976770401001, "step": 523 }, { "epoch": 0.7517934002869441, "grad_norm": 4.089823246002197, "kl/ref_to_policy/chosen": 8.644673347473145, "kl/ref_to_policy/mean": 73.16548919677734, "kl/ref_to_policy/rejected": 137.68630981445312, "learning_rate": 1.7828645085333645e-06, "logits/chosen": -1.515762448310852, "logits/rejected": -1.7707185745239258, "logps/chosen": -1934.1837158203125, "logps/rejected": -2057.151123046875, "loss": 3.8754, "nll_loss": 0.8874233961105347, "rewards/accuracies": 1.0, "rewards/chosen": -0.08644673228263855, "rewards/margins": 1.2904161214828491, "rewards/rejected": -1.3768630027770996, "step": 524 }, { "epoch": 0.7532281205164992, "grad_norm": 3.798339366912842, "kl/ref_to_policy/chosen": -19.19326400756836, "kl/ref_to_policy/mean": 62.791099548339844, "kl/ref_to_policy/rejected": 144.7754669189453, "learning_rate": 1.7637270423504664e-06, "logits/chosen": -1.8490954637527466, "logits/rejected": -2.1683578491210938, "logps/chosen": -656.7677001953125, "logps/rejected": -811.2305908203125, "loss": 1.9136, "nll_loss": 0.41721442341804504, "rewards/accuracies": 0.9375, "rewards/chosen": 0.19193261861801147, "rewards/margins": 1.639687180519104, "rewards/rejected": -1.4477545022964478, "step": 525 }, { "epoch": 0.7546628407460545, "grad_norm": 4.047372341156006, "kl/ref_to_policy/chosen": -13.79440975189209, "kl/ref_to_policy/mean": 61.494239807128906, "kl/ref_to_policy/rejected": 136.78289794921875, "learning_rate": 1.7446708234604498e-06, "logits/chosen": -1.73604154586792, "logits/rejected": -2.015712022781372, "logps/chosen": -1009.2211303710938, "logps/rejected": -1152.7694091796875, "loss": 2.8893, "nll_loss": 0.652437686920166, "rewards/accuracies": 0.9375, "rewards/chosen": 0.1379440873861313, "rewards/margins": 1.5057729482650757, "rewards/rejected": -1.3678288459777832, "step": 526 }, { "epoch": 0.7560975609756098, "grad_norm": 5.74457311630249, "kl/ref_to_policy/chosen": -17.76784896850586, "kl/ref_to_policy/mean": 45.048561096191406, "kl/ref_to_policy/rejected": 107.8649673461914, "learning_rate": 1.7256963302735752e-06, "logits/chosen": -1.5608892440795898, "logits/rejected": -1.788352370262146, "logps/chosen": -1093.099365234375, "logps/rejected": -1215.945068359375, "loss": 3.451, "nll_loss": 0.7783290147781372, "rewards/accuracies": 1.0, "rewards/chosen": 0.17767848074436188, "rewards/margins": 1.2563282251358032, "rewards/rejected": -1.078649640083313, "step": 527 }, { "epoch": 0.757532281205165, "grad_norm": 3.75237774848938, "kl/ref_to_policy/chosen": -21.040607452392578, "kl/ref_to_policy/mean": 59.5205192565918, "kl/ref_to_policy/rejected": 140.08164978027344, "learning_rate": 1.7068040391483676e-06, "logits/chosen": -1.8607100248336792, "logits/rejected": -2.1892223358154297, "logps/chosen": -816.7172241210938, "logps/rejected": -968.8199462890625, "loss": 2.2988, "nll_loss": 0.5112686157226562, "rewards/accuracies": 0.875, "rewards/chosen": 0.21040606498718262, "rewards/margins": 1.611222505569458, "rewards/rejected": -1.4008164405822754, "step": 528 }, { "epoch": 0.7589670014347202, "grad_norm": 6.380819320678711, "kl/ref_to_policy/chosen": -14.27849292755127, "kl/ref_to_policy/mean": 33.665584564208984, "kl/ref_to_policy/rejected": 81.60966491699219, "learning_rate": 1.6879944243796477e-06, "logits/chosen": -1.2527108192443848, "logits/rejected": -1.438629150390625, "logps/chosen": -1884.221435546875, "logps/rejected": -1985.959228515625, "loss": 4.7986, "nll_loss": 1.0926461219787598, "rewards/accuracies": 0.75, "rewards/chosen": 0.1427849382162094, "rewards/margins": 0.9588814377784729, "rewards/rejected": -0.8160965442657471, "step": 529 }, { "epoch": 0.7604017216642754, "grad_norm": 5.0438761711120605, "kl/ref_to_policy/chosen": -26.76934242248535, "kl/ref_to_policy/mean": 24.654033660888672, "kl/ref_to_policy/rejected": 76.07740783691406, "learning_rate": 1.6692679581866334e-06, "logits/chosen": -1.161961555480957, "logits/rejected": -1.363606572151184, "logps/chosen": -1769.8712158203125, "logps/rejected": -1872.9405517578125, "loss": 4.7228, "nll_loss": 1.0807268619537354, "rewards/accuracies": 0.9375, "rewards/chosen": 0.2676934003829956, "rewards/margins": 1.0284674167633057, "rewards/rejected": -0.7607741355895996, "step": 530 }, { "epoch": 0.7604017216642754, "eval_kl/ref_to_policy/chosen": -14.351677894592285, "eval_kl/ref_to_policy/mean": 41.58321762084961, "eval_kl/ref_to_policy/rejected": 97.51811981201172, "eval_logits/chosen": -1.4394503831863403, "eval_logits/rejected": -1.640151023864746, "eval_logps/chosen": -1509.884521484375, "eval_logps/rejected": -1621.24072265625, "eval_loss": 4.2044806480407715, "eval_nll_loss": 0.9565162658691406, "eval_rewards/accuracies": 0.8776595592498779, "eval_rewards/chosen": 0.14351677894592285, "eval_rewards/margins": 1.1186978816986084, "eval_rewards/rejected": -0.9751811623573303, "eval_runtime": 111.3318, "eval_samples_per_second": 3.377, "eval_steps_per_second": 1.689, "step": 530 }, { "epoch": 0.7618364418938307, "grad_norm": 5.250414848327637, "kl/ref_to_policy/chosen": -12.767526626586914, "kl/ref_to_policy/mean": 31.38230323791504, "kl/ref_to_policy/rejected": 75.53213500976562, "learning_rate": 1.650625110701079e-06, "logits/chosen": -1.0424963235855103, "logits/rejected": -1.2280936241149902, "logps/chosen": -2097.380859375, "logps/rejected": -2189.0751953125, "loss": 5.1006, "nll_loss": 1.165658712387085, "rewards/accuracies": 0.8125, "rewards/chosen": 0.12767526507377625, "rewards/margins": 0.8829965591430664, "rewards/rejected": -0.7553213238716125, "step": 531 }, { "epoch": 0.763271162123386, "grad_norm": 4.727962493896484, "kl/ref_to_policy/chosen": -9.137120246887207, "kl/ref_to_policy/mean": 60.63360595703125, "kl/ref_to_policy/rejected": 130.40432739257812, "learning_rate": 1.632066349955474e-06, "logits/chosen": -1.672489881515503, "logits/rejected": -1.9020241498947144, "logps/chosen": -1099.2769775390625, "logps/rejected": -1232.951171875, "loss": 3.1506, "nll_loss": 0.7117711901664734, "rewards/accuracies": 0.9375, "rewards/chosen": 0.09137119352817535, "rewards/margins": 1.3954145908355713, "rewards/rejected": -1.3040432929992676, "step": 532 }, { "epoch": 0.7647058823529411, "grad_norm": 5.820644855499268, "kl/ref_to_policy/chosen": -14.23892593383789, "kl/ref_to_policy/mean": 29.36435890197754, "kl/ref_to_policy/rejected": 72.9676513671875, "learning_rate": 1.6135921418712959e-06, "logits/chosen": -1.1612519025802612, "logits/rejected": -1.3381497859954834, "logps/chosen": -1920.784423828125, "logps/rejected": -2013.0595703125, "loss": 5.0774, "nll_loss": 1.1583547592163086, "rewards/accuracies": 0.75, "rewards/chosen": 0.14238928258419037, "rewards/margins": 0.8720657229423523, "rewards/rejected": -0.729676365852356, "step": 533 }, { "epoch": 0.7661406025824964, "grad_norm": 4.365135192871094, "kl/ref_to_policy/chosen": -22.472675323486328, "kl/ref_to_policy/mean": 51.71137619018555, "kl/ref_to_policy/rejected": 125.89542388916016, "learning_rate": 1.5952029502473032e-06, "logits/chosen": -1.823563814163208, "logits/rejected": -2.126143217086792, "logps/chosen": -796.4882202148438, "logps/rejected": -939.3660888671875, "loss": 2.4874, "nll_loss": 0.5509485006332397, "rewards/accuracies": 0.875, "rewards/chosen": 0.22472673654556274, "rewards/margins": 1.4836809635162354, "rewards/rejected": -1.2589542865753174, "step": 534 }, { "epoch": 0.7675753228120517, "grad_norm": 6.302175521850586, "kl/ref_to_policy/chosen": 0.15185308456420898, "kl/ref_to_policy/mean": 37.90290069580078, "kl/ref_to_policy/rejected": 75.65394592285156, "learning_rate": 1.5768992367479058e-06, "logits/chosen": -0.9195979833602905, "logits/rejected": -1.073012113571167, "logps/chosen": -2268.30810546875, "logps/rejected": -2350.78271484375, "loss": 5.5408, "nll_loss": 1.2673530578613281, "rewards/accuracies": 0.8125, "rewards/chosen": -0.0015185251832008362, "rewards/margins": 0.7550209760665894, "rewards/rejected": -0.7565394639968872, "step": 535 }, { "epoch": 0.7690100430416069, "grad_norm": 4.625567436218262, "kl/ref_to_policy/chosen": -23.026552200317383, "kl/ref_to_policy/mean": 46.80744171142578, "kl/ref_to_policy/rejected": 116.64144897460938, "learning_rate": 1.5586814608915673e-06, "logits/chosen": -1.6268221139907837, "logits/rejected": -1.9640793800354004, "logps/chosen": -1073.5516357421875, "logps/rejected": -1207.4969482421875, "loss": 3.1192, "nll_loss": 0.7033362984657288, "rewards/accuracies": 1.0, "rewards/chosen": 0.2302655279636383, "rewards/margins": 1.3966798782348633, "rewards/rejected": -1.1664143800735474, "step": 536 }, { "epoch": 0.7704447632711621, "grad_norm": 4.909739017486572, "kl/ref_to_policy/chosen": -13.93475341796875, "kl/ref_to_policy/mean": 36.13282775878906, "kl/ref_to_policy/rejected": 86.2004165649414, "learning_rate": 1.5405500800392643e-06, "logits/chosen": -1.333832025527954, "logits/rejected": -1.5565879344940186, "logps/chosen": -1809.7998046875, "logps/rejected": -1913.0853271484375, "loss": 4.2515, "nll_loss": 0.9599754810333252, "rewards/accuracies": 0.8125, "rewards/chosen": 0.13934752345085144, "rewards/margins": 1.0013515949249268, "rewards/rejected": -0.8620040416717529, "step": 537 }, { "epoch": 0.7718794835007173, "grad_norm": 5.9314374923706055, "kl/ref_to_policy/chosen": -16.077157974243164, "kl/ref_to_policy/mean": 26.730501174926758, "kl/ref_to_policy/rejected": 69.53816223144531, "learning_rate": 1.5225055493830132e-06, "logits/chosen": -1.2011831998825073, "logits/rejected": -1.3768320083618164, "logps/chosen": -1887.4697265625, "logps/rejected": -1978.7646484375, "loss": 4.759, "nll_loss": 1.0781821012496948, "rewards/accuracies": 0.8125, "rewards/chosen": 0.16077157855033875, "rewards/margins": 0.8561532497406006, "rewards/rejected": -0.6953816413879395, "step": 538 }, { "epoch": 0.7733142037302726, "grad_norm": 94.7508544921875, "kl/ref_to_policy/chosen": -6.235273361206055, "kl/ref_to_policy/mean": 57.70908737182617, "kl/ref_to_policy/rejected": 121.6534423828125, "learning_rate": 1.5045483219344387e-06, "logits/chosen": -1.6827564239501953, "logits/rejected": -2.0446903705596924, "logps/chosen": -967.4053344726562, "logps/rejected": -1091.0772705078125, "loss": 3.0276, "nll_loss": 0.6635472774505615, "rewards/accuracies": 0.8125, "rewards/chosen": 0.0623527392745018, "rewards/margins": 1.278887152671814, "rewards/rejected": -1.2165343761444092, "step": 539 }, { "epoch": 0.7747489239598279, "grad_norm": 4.747532844543457, "kl/ref_to_policy/chosen": -4.419392108917236, "kl/ref_to_policy/mean": 52.28040313720703, "kl/ref_to_policy/rejected": 108.98019409179688, "learning_rate": 1.4866788485133988e-06, "logits/chosen": -1.340464472770691, "logits/rejected": -1.5727869272232056, "logps/chosen": -1686.43994140625, "logps/rejected": -1799.833740234375, "loss": 3.9157, "nll_loss": 0.885349690914154, "rewards/accuracies": 0.875, "rewards/chosen": 0.044193923473358154, "rewards/margins": 1.133995771408081, "rewards/rejected": -1.0898019075393677, "step": 540 }, { "epoch": 0.7747489239598279, "eval_kl/ref_to_policy/chosen": -14.383479118347168, "eval_kl/ref_to_policy/mean": 41.926700592041016, "eval_kl/ref_to_policy/rejected": 98.23687744140625, "eval_logits/chosen": -1.4316227436065674, "eval_logits/rejected": -1.6606601476669312, "eval_logps/chosen": -1509.8525390625, "eval_logps/rejected": -1621.95947265625, "eval_loss": 4.187948226928711, "eval_nll_loss": 0.9527829885482788, "eval_rewards/accuracies": 0.896276593208313, "eval_rewards/chosen": 0.14383478462696075, "eval_rewards/margins": 1.1262035369873047, "eval_rewards/rejected": -0.9823687672615051, "eval_runtime": 112.0368, "eval_samples_per_second": 3.356, "eval_steps_per_second": 1.678, "step": 540 }, { "epoch": 0.776183644189383, "grad_norm": 4.304272174835205, "kl/ref_to_policy/chosen": 12.670039176940918, "kl/ref_to_policy/mean": 74.8685073852539, "kl/ref_to_policy/rejected": 137.06698608398438, "learning_rate": 1.4688975777366716e-06, "logits/chosen": -1.4051613807678223, "logits/rejected": -1.7338708639144897, "logps/chosen": -1626.045166015625, "logps/rejected": -1748.973388671875, "loss": 3.6738, "nll_loss": 0.8312311172485352, "rewards/accuracies": 0.875, "rewards/chosen": -0.12670037150382996, "rewards/margins": 1.243969440460205, "rewards/rejected": -1.3706697225570679, "step": 541 }, { "epoch": 0.7776183644189383, "grad_norm": 7.179723262786865, "kl/ref_to_policy/chosen": -20.44873046875, "kl/ref_to_policy/mean": 20.100208282470703, "kl/ref_to_policy/rejected": 60.649147033691406, "learning_rate": 1.4512049560066837e-06, "logits/chosen": -1.1685632467269897, "logits/rejected": -1.34490966796875, "logps/chosen": -1628.66064453125, "logps/rejected": -1718.5455322265625, "loss": 4.8471, "nll_loss": 1.095329999923706, "rewards/accuracies": 0.75, "rewards/chosen": 0.20448730885982513, "rewards/margins": 0.8109787702560425, "rewards/rejected": -0.6064914464950562, "step": 542 }, { "epoch": 0.7790530846484935, "grad_norm": 4.502772331237793, "kl/ref_to_policy/chosen": -16.513835906982422, "kl/ref_to_policy/mean": 45.87990951538086, "kl/ref_to_policy/rejected": 108.27365112304688, "learning_rate": 1.433601427500318e-06, "logits/chosen": -1.5241159200668335, "logits/rejected": -1.7786550521850586, "logps/chosen": -1482.7908935546875, "logps/rejected": -1605.395751953125, "loss": 3.3753, "nll_loss": 0.7589468359947205, "rewards/accuracies": 1.0, "rewards/chosen": 0.1651383638381958, "rewards/margins": 1.2478749752044678, "rewards/rejected": -1.082736611366272, "step": 543 }, { "epoch": 0.7804878048780488, "grad_norm": 5.651638031005859, "kl/ref_to_policy/chosen": -20.070735931396484, "kl/ref_to_policy/mean": 34.533267974853516, "kl/ref_to_policy/rejected": 89.13728332519531, "learning_rate": 1.4160874341577447e-06, "logits/chosen": -1.4679409265518188, "logits/rejected": -1.6658294200897217, "logps/chosen": -1565.8355712890625, "logps/rejected": -1676.938232421875, "loss": 3.6743, "nll_loss": 0.8214325904846191, "rewards/accuracies": 0.8125, "rewards/chosen": 0.200707346200943, "rewards/margins": 1.0920801162719727, "rewards/rejected": -0.8913727402687073, "step": 544 }, { "epoch": 0.7819225251076041, "grad_norm": 6.361689567565918, "kl/ref_to_policy/chosen": -29.211702346801758, "kl/ref_to_policy/mean": 14.490235328674316, "kl/ref_to_policy/rejected": 58.19217300415039, "learning_rate": 1.3986634156713418e-06, "logits/chosen": -1.140486240386963, "logits/rejected": -1.3248112201690674, "logps/chosen": -1389.280517578125, "logps/rejected": -1482.4521484375, "loss": 4.872, "nll_loss": 1.1066635847091675, "rewards/accuracies": 0.9375, "rewards/chosen": 0.29211702942848206, "rewards/margins": 0.8740387558937073, "rewards/rejected": -0.5819217562675476, "step": 545 }, { "epoch": 0.7833572453371592, "grad_norm": 6.428523540496826, "kl/ref_to_policy/chosen": 5.4936299324035645, "kl/ref_to_policy/mean": 56.061519622802734, "kl/ref_to_policy/rejected": 106.62940979003906, "learning_rate": 1.3813298094746491e-06, "logits/chosen": -1.396311640739441, "logits/rejected": -1.5635417699813843, "logps/chosen": -1610.3138427734375, "logps/rejected": -1711.70947265625, "loss": 3.8877, "nll_loss": 0.8709261417388916, "rewards/accuracies": 0.9375, "rewards/chosen": -0.054936304688453674, "rewards/margins": 1.0113576650619507, "rewards/rejected": -1.0662940740585327, "step": 546 }, { "epoch": 0.7847919655667145, "grad_norm": 5.649950981140137, "kl/ref_to_policy/chosen": -22.771081924438477, "kl/ref_to_policy/mean": 40.74188995361328, "kl/ref_to_policy/rejected": 104.25486755371094, "learning_rate": 1.3640870507313859e-06, "logits/chosen": -1.4940752983093262, "logits/rejected": -1.7878514528274536, "logps/chosen": -1306.6351318359375, "logps/rejected": -1430.0145263671875, "loss": 3.5059, "nll_loss": 0.7924089431762695, "rewards/accuracies": 1.0, "rewards/chosen": 0.22771084308624268, "rewards/margins": 1.2702592611312866, "rewards/rejected": -1.0425485372543335, "step": 547 }, { "epoch": 0.7862266857962698, "grad_norm": 5.7815375328063965, "kl/ref_to_policy/chosen": -14.479900360107422, "kl/ref_to_policy/mean": 36.024925231933594, "kl/ref_to_policy/rejected": 86.52975463867188, "learning_rate": 1.3469355723245303e-06, "logits/chosen": -1.3585150241851807, "logits/rejected": -1.5821619033813477, "logps/chosen": -1092.0123291015625, "logps/rejected": -1194.99462890625, "loss": 3.8536, "nll_loss": 0.8615529537200928, "rewards/accuracies": 0.9375, "rewards/chosen": 0.14479900896549225, "rewards/margins": 1.010096549987793, "rewards/rejected": -0.8652974963188171, "step": 548 }, { "epoch": 0.787661406025825, "grad_norm": 4.350789546966553, "kl/ref_to_policy/chosen": -14.311101913452148, "kl/ref_to_policy/mean": 47.77650451660156, "kl/ref_to_policy/rejected": 109.8641128540039, "learning_rate": 1.3298758048454436e-06, "logits/chosen": -1.4425537586212158, "logits/rejected": -1.7568602561950684, "logps/chosen": -1250.28271484375, "logps/rejected": -1373.3348388671875, "loss": 3.3682, "nll_loss": 0.7556155920028687, "rewards/accuracies": 1.0, "rewards/chosen": 0.14311102032661438, "rewards/margins": 1.2417521476745605, "rewards/rejected": -1.0986411571502686, "step": 549 }, { "epoch": 0.7890961262553802, "grad_norm": 5.388617038726807, "kl/ref_to_policy/chosen": -7.312671184539795, "kl/ref_to_policy/mean": 48.793338775634766, "kl/ref_to_policy/rejected": 104.89935302734375, "learning_rate": 1.3129081765830725e-06, "logits/chosen": -1.4827687740325928, "logits/rejected": -1.7667561769485474, "logps/chosen": -1147.0467529296875, "logps/rejected": -1259.2705078125, "loss": 3.5101, "nll_loss": 0.7823395133018494, "rewards/accuracies": 0.9375, "rewards/chosen": 0.07312670350074768, "rewards/margins": 1.1221201419830322, "rewards/rejected": -1.048993468284607, "step": 550 }, { "epoch": 0.7890961262553802, "eval_kl/ref_to_policy/chosen": -15.004451751708984, "eval_kl/ref_to_policy/mean": 41.9472541809082, "eval_kl/ref_to_policy/rejected": 98.89895629882812, "eval_logits/chosen": -1.4130795001983643, "eval_logits/rejected": -1.6750434637069702, "eval_logps/chosen": -1509.2315673828125, "eval_logps/rejected": -1622.6217041015625, "eval_loss": 4.130539894104004, "eval_nll_loss": 0.9392409920692444, "eval_rewards/accuracies": 0.9122340679168701, "eval_rewards/chosen": 0.1500445008277893, "eval_rewards/margins": 1.1390341520309448, "eval_rewards/rejected": -0.9889895915985107, "eval_runtime": 111.1426, "eval_samples_per_second": 3.383, "eval_steps_per_second": 1.692, "step": 550 }, { "epoch": 0.7905308464849354, "grad_norm": 17.873903274536133, "kl/ref_to_policy/chosen": 2.5297493934631348, "kl/ref_to_policy/mean": 45.02005386352539, "kl/ref_to_policy/rejected": 87.51036071777344, "learning_rate": 1.2960331135131826e-06, "logits/chosen": -1.292668104171753, "logits/rejected": -1.5666697025299072, "logps/chosen": -1373.90478515625, "logps/rejected": -1462.6666259765625, "loss": 4.2346, "nll_loss": 0.9301596283912659, "rewards/accuracies": 0.75, "rewards/chosen": -0.025297492742538452, "rewards/margins": 0.8498060703277588, "rewards/rejected": -0.8751035332679749, "step": 551 }, { "epoch": 0.7919655667144907, "grad_norm": 6.301400184631348, "kl/ref_to_policy/chosen": -13.65475082397461, "kl/ref_to_policy/mean": 30.920045852661133, "kl/ref_to_policy/rejected": 75.49484252929688, "learning_rate": 1.2792510392876777e-06, "logits/chosen": -1.0862138271331787, "logits/rejected": -1.298716425895691, "logps/chosen": -1813.47802734375, "logps/rejected": -1906.653076171875, "loss": 5.1821, "nll_loss": 1.1871871948242188, "rewards/accuracies": 0.9375, "rewards/chosen": 0.1365474909543991, "rewards/margins": 0.8914958834648132, "rewards/rejected": -0.7549483776092529, "step": 552 }, { "epoch": 0.793400286944046, "grad_norm": 4.52197265625, "kl/ref_to_policy/chosen": -8.385957717895508, "kl/ref_to_policy/mean": 42.37968444824219, "kl/ref_to_policy/rejected": 93.14531707763672, "learning_rate": 1.262562375223954e-06, "logits/chosen": -1.2709332704544067, "logits/rejected": -1.475311279296875, "logps/chosen": -2511.6064453125, "logps/rejected": -2614.9794921875, "loss": 4.8487, "nll_loss": 1.111647367477417, "rewards/accuracies": 0.875, "rewards/chosen": 0.08385959267616272, "rewards/margins": 1.0153127908706665, "rewards/rejected": -0.9314531683921814, "step": 553 }, { "epoch": 0.7948350071736011, "grad_norm": 4.757059574127197, "kl/ref_to_policy/chosen": -5.184198379516602, "kl/ref_to_policy/mean": 62.174163818359375, "kl/ref_to_policy/rejected": 129.53253173828125, "learning_rate": 1.245967540294329e-06, "logits/chosen": -1.6113595962524414, "logits/rejected": -1.936597466468811, "logps/chosen": -1146.370361328125, "logps/rejected": -1279.190673828125, "loss": 3.066, "nll_loss": 0.6865803003311157, "rewards/accuracies": 0.9375, "rewards/chosen": 0.05184199661016464, "rewards/margins": 1.3471670150756836, "rewards/rejected": -1.2953251600265503, "step": 554 }, { "epoch": 0.7962697274031564, "grad_norm": 5.474193096160889, "kl/ref_to_policy/chosen": -13.49106216430664, "kl/ref_to_policy/mean": 28.1215763092041, "kl/ref_to_policy/rejected": 69.73421478271484, "learning_rate": 1.2294669511155193e-06, "logits/chosen": -1.1674062013626099, "logits/rejected": -1.3550868034362793, "logps/chosen": -2216.430908203125, "logps/rejected": -2308.626953125, "loss": 4.9502, "nll_loss": 1.1221798658370972, "rewards/accuracies": 1.0, "rewards/chosen": 0.13491061329841614, "rewards/margins": 0.8322526812553406, "rewards/rejected": -0.6973421573638916, "step": 555 }, { "epoch": 0.7977044476327116, "grad_norm": 4.65912389755249, "kl/ref_to_policy/chosen": -14.188389778137207, "kl/ref_to_policy/mean": 41.1667594909668, "kl/ref_to_policy/rejected": 96.52191925048828, "learning_rate": 1.2130610219381811e-06, "logits/chosen": -1.4553542137145996, "logits/rejected": -1.7253572940826416, "logps/chosen": -1718.6240234375, "logps/rejected": -1832.0921630859375, "loss": 3.9077, "nll_loss": 0.8797246217727661, "rewards/accuracies": 0.9375, "rewards/chosen": 0.14188390970230103, "rewards/margins": 1.1071029901504517, "rewards/rejected": -0.9652190804481506, "step": 556 }, { "epoch": 0.7991391678622669, "grad_norm": 7.164508819580078, "kl/ref_to_policy/chosen": -21.232616424560547, "kl/ref_to_policy/mean": 34.089351654052734, "kl/ref_to_policy/rejected": 89.41132354736328, "learning_rate": 1.1967501646365147e-06, "logits/chosen": -1.456015706062317, "logits/rejected": -1.7398555278778076, "logps/chosen": -1213.259521484375, "logps/rejected": -1326.509033203125, "loss": 3.8902, "nll_loss": 0.8758388757705688, "rewards/accuracies": 0.8125, "rewards/chosen": 0.21232616901397705, "rewards/margins": 1.1064393520355225, "rewards/rejected": -0.8941132426261902, "step": 557 }, { "epoch": 0.8005738880918221, "grad_norm": 6.188491344451904, "kl/ref_to_policy/chosen": -8.412431716918945, "kl/ref_to_policy/mean": 40.71172332763672, "kl/ref_to_policy/rejected": 89.83587646484375, "learning_rate": 1.1805347886979219e-06, "logits/chosen": -1.2771034240722656, "logits/rejected": -1.4952105283737183, "logps/chosen": -1540.1265869140625, "logps/rejected": -1641.5194091796875, "loss": 4.3769, "nll_loss": 0.9914687871932983, "rewards/accuracies": 1.0, "rewards/chosen": 0.08412431180477142, "rewards/margins": 0.9824830293655396, "rewards/rejected": -0.8983587026596069, "step": 558 }, { "epoch": 0.8020086083213773, "grad_norm": 290.3334655761719, "kl/ref_to_policy/chosen": -40.624977111816406, "kl/ref_to_policy/mean": 8.507970809936523, "kl/ref_to_policy/rejected": 57.64091491699219, "learning_rate": 1.1644153012127208e-06, "logits/chosen": -1.7104086875915527, "logits/rejected": -1.8591207265853882, "logps/chosen": -946.0078125, "logps/rejected": -1045.2503662109375, "loss": 4.6913, "nll_loss": 1.0684884786605835, "rewards/accuracies": 0.9375, "rewards/chosen": 0.4062497615814209, "rewards/margins": 0.9826589226722717, "rewards/rejected": -0.5764090418815613, "step": 559 }, { "epoch": 0.8034433285509326, "grad_norm": 5.132367134094238, "kl/ref_to_policy/chosen": -16.856252670288086, "kl/ref_to_policy/mean": 47.150306701660156, "kl/ref_to_policy/rejected": 111.1568603515625, "learning_rate": 1.1483921068639353e-06, "logits/chosen": -1.4721901416778564, "logits/rejected": -1.7698158025741577, "logps/chosen": -1770.2843017578125, "logps/rejected": -1894.528076171875, "loss": 3.5496, "nll_loss": 0.8047613501548767, "rewards/accuracies": 1.0, "rewards/chosen": 0.16856253147125244, "rewards/margins": 1.2801311016082764, "rewards/rejected": -1.1115686893463135, "step": 560 }, { "epoch": 0.8034433285509326, "eval_kl/ref_to_policy/chosen": -15.642666816711426, "eval_kl/ref_to_policy/mean": 41.155555725097656, "eval_kl/ref_to_policy/rejected": 97.95376586914062, "eval_logits/chosen": -1.422311544418335, "eval_logits/rejected": -1.6756362915039062, "eval_logps/chosen": -1508.59326171875, "eval_logps/rejected": -1621.6763916015625, "eval_loss": 4.143180847167969, "eval_nll_loss": 0.9422029256820679, "eval_rewards/accuracies": 0.9202127456665039, "eval_rewards/chosen": 0.15642666816711426, "eval_rewards/margins": 1.1359643936157227, "eval_rewards/rejected": -0.9795377850532532, "eval_runtime": 110.9729, "eval_samples_per_second": 3.388, "eval_steps_per_second": 1.694, "step": 560 }, { "epoch": 0.8048780487804879, "grad_norm": 4.849057674407959, "kl/ref_to_policy/chosen": -18.38163948059082, "kl/ref_to_policy/mean": 50.575469970703125, "kl/ref_to_policy/rejected": 119.53257751464844, "learning_rate": 1.1324656079171288e-06, "logits/chosen": -1.6433569192886353, "logits/rejected": -1.9974632263183594, "logps/chosen": -965.8214111328125, "logps/rejected": -1099.581298828125, "loss": 3.0617, "nll_loss": 0.6872400641441345, "rewards/accuracies": 0.9375, "rewards/chosen": 0.18381640315055847, "rewards/margins": 1.379142165184021, "rewards/rejected": -1.1953258514404297, "step": 561 }, { "epoch": 0.806312769010043, "grad_norm": 630.5310668945312, "kl/ref_to_policy/chosen": -14.284662246704102, "kl/ref_to_policy/mean": 25.020957946777344, "kl/ref_to_policy/rejected": 64.32658386230469, "learning_rate": 1.1166362042103056e-06, "logits/chosen": -1.2671411037445068, "logits/rejected": -1.3612844944000244, "logps/chosen": -1812.94189453125, "logps/rejected": -1895.7601318359375, "loss": 6.3783, "nll_loss": 1.4797091484069824, "rewards/accuracies": 1.0, "rewards/chosen": 0.14284662902355194, "rewards/margins": 0.7861124277114868, "rewards/rejected": -0.6432658433914185, "step": 562 }, { "epoch": 0.8077474892395983, "grad_norm": 4.878719329833984, "kl/ref_to_policy/chosen": -11.827132225036621, "kl/ref_to_policy/mean": 45.581581115722656, "kl/ref_to_policy/rejected": 102.99030303955078, "learning_rate": 1.1009042931438784e-06, "logits/chosen": -1.3971648216247559, "logits/rejected": -1.6584280729293823, "logps/chosen": -1732.1728515625, "logps/rejected": -1845.9512939453125, "loss": 3.954, "nll_loss": 0.8960073590278625, "rewards/accuracies": 0.9375, "rewards/chosen": 0.11827132105827332, "rewards/margins": 1.1481742858886719, "rewards/rejected": -1.0299030542373657, "step": 563 }, { "epoch": 0.8091822094691535, "grad_norm": 4.920968532562256, "kl/ref_to_policy/chosen": -15.727143287658691, "kl/ref_to_policy/mean": 40.4184455871582, "kl/ref_to_policy/rejected": 96.56403350830078, "learning_rate": 1.0852702696706807e-06, "logits/chosen": -1.3866803646087646, "logits/rejected": -1.6574771404266357, "logps/chosen": -1535.1947021484375, "logps/rejected": -1649.42333984375, "loss": 4.0148, "nll_loss": 0.908441960811615, "rewards/accuracies": 1.0, "rewards/chosen": 0.15727141499519348, "rewards/margins": 1.1229116916656494, "rewards/rejected": -0.9656403064727783, "step": 564 }, { "epoch": 0.8106169296987088, "grad_norm": 6.236594200134277, "kl/ref_to_policy/chosen": -21.670236587524414, "kl/ref_to_policy/mean": 22.041765213012695, "kl/ref_to_policy/rejected": 65.75376892089844, "learning_rate": 1.0697345262860638e-06, "logits/chosen": -1.0589745044708252, "logits/rejected": -1.2771201133728027, "logps/chosen": -1372.657958984375, "logps/rejected": -1465.581787109375, "loss": 4.5569, "nll_loss": 1.0297414064407349, "rewards/accuracies": 0.875, "rewards/chosen": 0.21670235693454742, "rewards/margins": 0.874239981174469, "rewards/rejected": -0.6575376391410828, "step": 565 }, { "epoch": 0.812051649928264, "grad_norm": 3.1621267795562744, "kl/ref_to_policy/chosen": -17.274005889892578, "kl/ref_to_policy/mean": 59.43462371826172, "kl/ref_to_policy/rejected": 136.14324951171875, "learning_rate": 1.0542974530180327e-06, "logits/chosen": -1.7628132104873657, "logits/rejected": -2.082540512084961, "logps/chosen": -1233.28857421875, "logps/rejected": -1378.002685546875, "loss": 2.495, "nll_loss": 0.5566748380661011, "rewards/accuracies": 1.0, "rewards/chosen": 0.17274005711078644, "rewards/margins": 1.534172534942627, "rewards/rejected": -1.361432433128357, "step": 566 }, { "epoch": 0.8134863701578192, "grad_norm": 8.522574424743652, "kl/ref_to_policy/chosen": 5.575809001922607, "kl/ref_to_policy/mean": 56.98033905029297, "kl/ref_to_policy/rejected": 108.3848648071289, "learning_rate": 1.0389594374174628e-06, "logits/chosen": -1.4810903072357178, "logits/rejected": -1.8035002946853638, "logps/chosen": -1382.20849609375, "logps/rejected": -1482.4766845703125, "loss": 3.5597, "nll_loss": 0.7752220034599304, "rewards/accuracies": 0.9375, "rewards/chosen": -0.05575808137655258, "rewards/margins": 1.0280905961990356, "rewards/rejected": -1.0838487148284912, "step": 567 }, { "epoch": 0.8149210903873745, "grad_norm": 5.822363376617432, "kl/ref_to_policy/chosen": -9.691393852233887, "kl/ref_to_policy/mean": 29.202495574951172, "kl/ref_to_policy/rejected": 68.09638214111328, "learning_rate": 1.0237208645483648e-06, "logits/chosen": -1.0409969091415405, "logits/rejected": -1.1971362829208374, "logps/chosen": -2149.71484375, "logps/rejected": -2232.334716796875, "loss": 5.4963, "nll_loss": 1.2589654922485352, "rewards/accuracies": 0.9375, "rewards/chosen": 0.09691394120454788, "rewards/margins": 0.7778777480125427, "rewards/rejected": -0.6809638142585754, "step": 568 }, { "epoch": 0.8163558106169297, "grad_norm": 4.482673168182373, "kl/ref_to_policy/chosen": -13.935821533203125, "kl/ref_to_policy/mean": 49.15494918823242, "kl/ref_to_policy/rejected": 112.24571990966797, "learning_rate": 1.00858211697822e-06, "logits/chosen": -1.4732277393341064, "logits/rejected": -1.782694697380066, "logps/chosen": -1351.181396484375, "logps/rejected": -1475.4664306640625, "loss": 3.2198, "nll_loss": 0.7194546461105347, "rewards/accuracies": 0.875, "rewards/chosen": 0.13935820758342743, "rewards/margins": 1.2618154287338257, "rewards/rejected": -1.1224572658538818, "step": 569 }, { "epoch": 0.8177905308464849, "grad_norm": 4.884852886199951, "kl/ref_to_policy/chosen": -23.554027557373047, "kl/ref_to_policy/mean": 52.393882751464844, "kl/ref_to_policy/rejected": 128.34178161621094, "learning_rate": 9.935435747683758e-07, "logits/chosen": -1.698089599609375, "logits/rejected": -2.105529308319092, "logps/chosen": -904.6022338867188, "logps/rejected": -1048.861328125, "loss": 2.6169, "nll_loss": 0.5858930945396423, "rewards/accuracies": 0.9375, "rewards/chosen": 0.23554028570652008, "rewards/margins": 1.5189582109451294, "rewards/rejected": -1.283417820930481, "step": 570 }, { "epoch": 0.8177905308464849, "eval_kl/ref_to_policy/chosen": -16.161903381347656, "eval_kl/ref_to_policy/mean": 40.64240264892578, "eval_kl/ref_to_policy/rejected": 97.44671630859375, "eval_logits/chosen": -1.415872573852539, "eval_logits/rejected": -1.6787534952163696, "eval_logps/chosen": -1508.0740966796875, "eval_logps/rejected": -1621.169189453125, "eval_loss": 4.14785623550415, "eval_nll_loss": 0.9433109164237976, "eval_rewards/accuracies": 0.9202127456665039, "eval_rewards/chosen": 0.16161903738975525, "eval_rewards/margins": 1.1360862255096436, "eval_rewards/rejected": -0.9744671583175659, "eval_runtime": 110.0475, "eval_samples_per_second": 3.417, "eval_steps_per_second": 1.708, "step": 570 }, { "epoch": 0.8192252510760402, "grad_norm": 5.826337814331055, "kl/ref_to_policy/chosen": -20.7706241607666, "kl/ref_to_policy/mean": 22.98196029663086, "kl/ref_to_policy/rejected": 66.73454284667969, "learning_rate": 9.786056154645001e-07, "logits/chosen": -1.120529294013977, "logits/rejected": -1.3322911262512207, "logps/chosen": -1793.3387451171875, "logps/rejected": -1886.664306640625, "loss": 4.6503, "nll_loss": 1.051374912261963, "rewards/accuracies": 0.8125, "rewards/chosen": 0.2077062577009201, "rewards/margins": 0.8750516176223755, "rewards/rejected": -0.667345404624939, "step": 571 }, { "epoch": 0.8206599713055954, "grad_norm": 5.22569465637207, "kl/ref_to_policy/chosen": -20.724464416503906, "kl/ref_to_policy/mean": 37.09492492675781, "kl/ref_to_policy/rejected": 94.91432189941406, "learning_rate": 9.637686140871121e-07, "logits/chosen": -1.4448275566101074, "logits/rejected": -1.6904977560043335, "logps/chosen": -980.9083251953125, "logps/rejected": -1093.429931640625, "loss": 3.368, "nll_loss": 0.7507247924804688, "rewards/accuracies": 1.0, "rewards/chosen": 0.2072446644306183, "rewards/margins": 1.1563878059387207, "rewards/rejected": -0.94914311170578, "step": 572 }, { "epoch": 0.8220946915351507, "grad_norm": 5.017503261566162, "kl/ref_to_policy/chosen": -36.785457611083984, "kl/ref_to_policy/mean": 32.07963943481445, "kl/ref_to_policy/rejected": 100.94474029541016, "learning_rate": 9.490329431221545e-07, "logits/chosen": -1.7123961448669434, "logits/rejected": -2.032925844192505, "logps/chosen": -1073.0738525390625, "logps/rejected": -1206.8900146484375, "loss": 3.035, "nll_loss": 0.6807990670204163, "rewards/accuracies": 0.875, "rewards/chosen": 0.3678545653820038, "rewards/margins": 1.3773020505905151, "rewards/rejected": -1.0094473361968994, "step": 573 }, { "epoch": 0.8235294117647058, "grad_norm": 5.987112045288086, "kl/ref_to_policy/chosen": -25.21862030029297, "kl/ref_to_policy/mean": 31.920991897583008, "kl/ref_to_policy/rejected": 89.06059265136719, "learning_rate": 9.34398972511656e-07, "logits/chosen": -1.229670524597168, "logits/rejected": -1.5564374923706055, "logps/chosen": -1017.5782470703125, "logps/rejected": -1131.0958251953125, "loss": 4.1212, "nll_loss": 0.9372463822364807, "rewards/accuracies": 1.0, "rewards/chosen": 0.25218620896339417, "rewards/margins": 1.1427921056747437, "rewards/rejected": -0.8906059861183167, "step": 574 }, { "epoch": 0.8249641319942611, "grad_norm": 16.86886978149414, "kl/ref_to_policy/chosen": -15.805822372436523, "kl/ref_to_policy/mean": 37.127105712890625, "kl/ref_to_policy/rejected": 90.06004333496094, "learning_rate": 9.198670696444339e-07, "logits/chosen": -1.494951605796814, "logits/rejected": -1.6322343349456787, "logps/chosen": -1618.4442138671875, "logps/rejected": -1722.59130859375, "loss": 3.5421, "nll_loss": 0.7901593446731567, "rewards/accuracies": 1.0, "rewards/chosen": 0.15805824100971222, "rewards/margins": 1.0586585998535156, "rewards/rejected": -0.9006003141403198, "step": 575 }, { "epoch": 0.8263988522238164, "grad_norm": 4.158964157104492, "kl/ref_to_policy/chosen": -9.821043968200684, "kl/ref_to_policy/mean": 66.12427520751953, "kl/ref_to_policy/rejected": 142.06959533691406, "learning_rate": 9.054375993468745e-07, "logits/chosen": -1.7422614097595215, "logits/rejected": -2.168452739715576, "logps/chosen": -949.960205078125, "logps/rejected": -1095.3468017578125, "loss": 2.5431, "nll_loss": 0.5660082101821899, "rewards/accuracies": 0.9375, "rewards/chosen": 0.09821043908596039, "rewards/margins": 1.5189063549041748, "rewards/rejected": -1.4206960201263428, "step": 576 }, { "epoch": 0.8278335724533716, "grad_norm": 84.55091857910156, "kl/ref_to_policy/chosen": -17.91026496887207, "kl/ref_to_policy/mean": 50.861106872558594, "kl/ref_to_policy/rejected": 119.63249206542969, "learning_rate": 8.911109238737748e-07, "logits/chosen": -1.9992750883102417, "logits/rejected": -2.3028769493103027, "logps/chosen": -519.3709716796875, "logps/rejected": -649.1441650390625, "loss": 3.7334, "nll_loss": 0.8538476228713989, "rewards/accuracies": 0.875, "rewards/chosen": 0.17910262942314148, "rewards/margins": 1.3754276037216187, "rewards/rejected": -1.1963248252868652, "step": 577 }, { "epoch": 0.8292682926829268, "grad_norm": 5.552534103393555, "kl/ref_to_policy/chosen": -21.41107177734375, "kl/ref_to_policy/mean": 29.421417236328125, "kl/ref_to_policy/rejected": 80.25390625, "learning_rate": 8.768874028992431e-07, "logits/chosen": -1.1803185939788818, "logits/rejected": -1.457988977432251, "logps/chosen": -1202.7921142578125, "logps/rejected": -1306.6092529296875, "loss": 4.0389, "nll_loss": 0.9083975553512573, "rewards/accuracies": 1.0, "rewards/chosen": 0.21411073207855225, "rewards/margins": 1.0166497230529785, "rewards/rejected": -0.802539050579071, "step": 578 }, { "epoch": 0.830703012912482, "grad_norm": 82.863037109375, "kl/ref_to_policy/chosen": -18.802732467651367, "kl/ref_to_policy/mean": 37.441959381103516, "kl/ref_to_policy/rejected": 93.68665313720703, "learning_rate": 8.627673935076769e-07, "logits/chosen": -1.6365185976028442, "logits/rejected": -1.8818168640136719, "logps/chosen": -1331.853515625, "logps/rejected": -1442.384033203125, "loss": 4.8371, "nll_loss": 1.1139674186706543, "rewards/accuracies": 0.8125, "rewards/chosen": 0.1880273073911667, "rewards/margins": 1.1248937845230103, "rewards/rejected": -0.9368664622306824, "step": 579 }, { "epoch": 0.8321377331420373, "grad_norm": 5.639092445373535, "kl/ref_to_policy/chosen": -8.93535327911377, "kl/ref_to_policy/mean": 40.410072326660156, "kl/ref_to_policy/rejected": 89.7554931640625, "learning_rate": 8.487512501847933e-07, "logits/chosen": -1.2189199924468994, "logits/rejected": -1.4962635040283203, "logps/chosen": -1755.9083251953125, "logps/rejected": -1859.9683837890625, "loss": 4.6435, "nll_loss": 1.055832862854004, "rewards/accuracies": 0.9375, "rewards/chosen": 0.0893535390496254, "rewards/margins": 0.9869083762168884, "rewards/rejected": -0.897554874420166, "step": 580 }, { "epoch": 0.8321377331420373, "eval_kl/ref_to_policy/chosen": -16.266746520996094, "eval_kl/ref_to_policy/mean": 40.6291618347168, "eval_kl/ref_to_policy/rejected": 97.52507781982422, "eval_logits/chosen": -1.3965736627578735, "eval_logits/rejected": -1.6777756214141846, "eval_logps/chosen": -1507.96923828125, "eval_logps/rejected": -1621.2476806640625, "eval_loss": 4.117298603057861, "eval_nll_loss": 0.9354483485221863, "eval_rewards/accuracies": 0.9202127456665039, "eval_rewards/chosen": 0.16266748309135437, "eval_rewards/margins": 1.13791823387146, "eval_rewards/rejected": -0.9752506613731384, "eval_runtime": 111.1394, "eval_samples_per_second": 3.383, "eval_steps_per_second": 1.692, "step": 580 }, { "epoch": 0.8335724533715926, "grad_norm": 4.680318355560303, "kl/ref_to_policy/chosen": -27.959861755371094, "kl/ref_to_policy/mean": 42.915138244628906, "kl/ref_to_policy/rejected": 113.79014587402344, "learning_rate": 8.348393248087289e-07, "logits/chosen": -1.630407452583313, "logits/rejected": -2.024937152862549, "logps/chosen": -864.3292236328125, "logps/rejected": -998.9385986328125, "loss": 2.806, "nll_loss": 0.6267791986465454, "rewards/accuracies": 1.0, "rewards/chosen": 0.2795986235141754, "rewards/margins": 1.4175000190734863, "rewards/rejected": -1.1379014253616333, "step": 581 }, { "epoch": 0.8350071736011477, "grad_norm": 6.3898773193359375, "kl/ref_to_policy/chosen": -18.261709213256836, "kl/ref_to_policy/mean": 18.83921241760254, "kl/ref_to_policy/rejected": 55.94013977050781, "learning_rate": 8.210319666412087e-07, "logits/chosen": -0.9703482389450073, "logits/rejected": -1.1098589897155762, "logps/chosen": -1925.0430908203125, "logps/rejected": -2007.1629638671875, "loss": 5.0703, "nll_loss": 1.1485967636108398, "rewards/accuracies": 1.0, "rewards/chosen": 0.18261709809303284, "rewards/margins": 0.7420184016227722, "rewards/rejected": -0.5594013333320618, "step": 582 }, { "epoch": 0.836441893830703, "grad_norm": 3.7882657051086426, "kl/ref_to_policy/chosen": -14.283226013183594, "kl/ref_to_policy/mean": 67.82647705078125, "kl/ref_to_policy/rejected": 149.93618774414062, "learning_rate": 8.073295223187766e-07, "logits/chosen": -1.9380536079406738, "logits/rejected": -2.3205044269561768, "logps/chosen": -909.0214233398438, "logps/rejected": -1064.2701416015625, "loss": 2.0158, "nll_loss": 0.4430646598339081, "rewards/accuracies": 0.875, "rewards/chosen": 0.14283224940299988, "rewards/margins": 1.6421940326690674, "rewards/rejected": -1.4993617534637451, "step": 583 }, { "epoch": 0.8378766140602583, "grad_norm": 6.495886325836182, "kl/ref_to_policy/chosen": 3.1351728439331055, "kl/ref_to_policy/mean": 47.52293014526367, "kl/ref_to_policy/rejected": 91.91069030761719, "learning_rate": 7.937323358440935e-07, "logits/chosen": -1.0279157161712646, "logits/rejected": -1.240728735923767, "logps/chosen": -1712.2757568359375, "logps/rejected": -1805.48486328125, "loss": 5.0512, "nll_loss": 1.1543118953704834, "rewards/accuracies": 1.0, "rewards/chosen": -0.0313517302274704, "rewards/margins": 0.8877551555633545, "rewards/rejected": -0.9191069006919861, "step": 584 }, { "epoch": 0.8393113342898135, "grad_norm": 4.205379486083984, "kl/ref_to_policy/chosen": -21.04732894897461, "kl/ref_to_policy/mean": 54.49264907836914, "kl/ref_to_policy/rejected": 130.03262329101562, "learning_rate": 7.802407485773011e-07, "logits/chosen": -1.782392978668213, "logits/rejected": -2.2054765224456787, "logps/chosen": -690.0151977539062, "logps/rejected": -834.3392333984375, "loss": 2.234, "nll_loss": 0.4891546070575714, "rewards/accuracies": 1.0, "rewards/chosen": 0.21047329902648926, "rewards/margins": 1.510799527168274, "rewards/rejected": -1.3003262281417847, "step": 585 }, { "epoch": 0.8407460545193687, "grad_norm": 6.703197956085205, "kl/ref_to_policy/chosen": -34.01860046386719, "kl/ref_to_policy/mean": 16.463760375976562, "kl/ref_to_policy/rejected": 66.94612121582031, "learning_rate": 7.668550992274476e-07, "logits/chosen": -1.2186574935913086, "logits/rejected": -1.489408016204834, "logps/chosen": -1110.9578857421875, "logps/rejected": -1214.7410888671875, "loss": 4.0941, "nll_loss": 0.921536386013031, "rewards/accuracies": 0.9375, "rewards/chosen": 0.3401859402656555, "rewards/margins": 1.009647250175476, "rewards/rejected": -0.6694613099098206, "step": 586 }, { "epoch": 0.8421807747489239, "grad_norm": 6.433321475982666, "kl/ref_to_policy/chosen": -25.67558479309082, "kl/ref_to_policy/mean": 10.174150466918945, "kl/ref_to_policy/rejected": 46.023887634277344, "learning_rate": 7.535757238439939e-07, "logits/chosen": -0.9177906513214111, "logits/rejected": -1.0823986530303955, "logps/chosen": -1730.105712890625, "logps/rejected": -1812.96923828125, "loss": 5.6071, "nll_loss": 1.2787494659423828, "rewards/accuracies": 0.8125, "rewards/chosen": 0.2567558288574219, "rewards/margins": 0.7169946432113647, "rewards/rejected": -0.46023887395858765, "step": 587 }, { "epoch": 0.8436154949784792, "grad_norm": 4.275094985961914, "kl/ref_to_policy/chosen": -23.886436462402344, "kl/ref_to_policy/mean": 34.87532043457031, "kl/ref_to_policy/rejected": 93.63706970214844, "learning_rate": 7.404029558083653e-07, "logits/chosen": -1.398421287536621, "logits/rejected": -1.6754438877105713, "logps/chosen": -1715.1058349609375, "logps/rejected": -1828.57666015625, "loss": 3.8325, "nll_loss": 0.8686447143554688, "rewards/accuracies": 1.0, "rewards/chosen": 0.23886433243751526, "rewards/margins": 1.1752351522445679, "rewards/rejected": -0.9363706707954407, "step": 588 }, { "epoch": 0.8450502152080345, "grad_norm": 4.52432918548584, "kl/ref_to_policy/chosen": -16.613704681396484, "kl/ref_to_policy/mean": 40.997352600097656, "kl/ref_to_policy/rejected": 98.6084213256836, "learning_rate": 7.273371258255923e-07, "logits/chosen": -1.2948335409164429, "logits/rejected": -1.5863099098205566, "logps/chosen": -1980.64404296875, "logps/rejected": -2094.6787109375, "loss": 3.961, "nll_loss": 0.8984989523887634, "rewards/accuracies": 0.9375, "rewards/chosen": 0.16613703966140747, "rewards/margins": 1.1522212028503418, "rewards/rejected": -0.9860841035842896, "step": 589 }, { "epoch": 0.8464849354375896, "grad_norm": 8.912546157836914, "kl/ref_to_policy/chosen": -23.361854553222656, "kl/ref_to_policy/mean": 26.043899536132812, "kl/ref_to_policy/rejected": 75.44966125488281, "learning_rate": 7.143785619160026e-07, "logits/chosen": -1.2405920028686523, "logits/rejected": -1.5062370300292969, "logps/chosen": -1359.9583740234375, "logps/rejected": -1462.03564453125, "loss": 4.5314, "nll_loss": 1.0285452604293823, "rewards/accuracies": 0.875, "rewards/chosen": 0.23361855745315552, "rewards/margins": 0.9881150722503662, "rewards/rejected": -0.7544965147972107, "step": 590 }, { "epoch": 0.8464849354375896, "eval_kl/ref_to_policy/chosen": -16.308929443359375, "eval_kl/ref_to_policy/mean": 40.58087921142578, "eval_kl/ref_to_policy/rejected": 97.47069549560547, "eval_logits/chosen": -1.394832730293274, "eval_logits/rejected": -1.6874507665634155, "eval_logps/chosen": -1507.92724609375, "eval_logps/rejected": -1621.193359375, "eval_loss": 4.108944416046143, "eval_nll_loss": 0.9329833388328552, "eval_rewards/accuracies": 0.9202127456665039, "eval_rewards/chosen": 0.163089320063591, "eval_rewards/margins": 1.13779616355896, "eval_rewards/rejected": -0.9747068285942078, "eval_runtime": 110.0508, "eval_samples_per_second": 3.417, "eval_steps_per_second": 1.708, "step": 590 }, { "epoch": 0.8479196556671449, "grad_norm": 8.499542236328125, "kl/ref_to_policy/chosen": -18.379215240478516, "kl/ref_to_policy/mean": 18.88759422302246, "kl/ref_to_policy/rejected": 56.15439987182617, "learning_rate": 7.015275894069862e-07, "logits/chosen": -1.2401812076568604, "logits/rejected": -1.4443126916885376, "logps/chosen": -1080.8863525390625, "logps/rejected": -1162.615234375, "loss": 4.9533, "nll_loss": 1.1189898252487183, "rewards/accuracies": 0.9375, "rewards/chosen": 0.1837921440601349, "rewards/margins": 0.74533611536026, "rewards/rejected": -0.5615439414978027, "step": 591 }, { "epoch": 0.8493543758967002, "grad_norm": 5.432191848754883, "kl/ref_to_policy/chosen": -24.447425842285156, "kl/ref_to_policy/mean": 31.571340560913086, "kl/ref_to_policy/rejected": 87.59010314941406, "learning_rate": 6.887845309248326e-07, "logits/chosen": -1.4218577146530151, "logits/rejected": -1.7623529434204102, "logps/chosen": -1155.3338623046875, "logps/rejected": -1269.6995849609375, "loss": 3.7773, "nll_loss": 0.8476359844207764, "rewards/accuracies": 0.9375, "rewards/chosen": 0.24447426199913025, "rewards/margins": 1.120375156402588, "rewards/rejected": -0.8759009838104248, "step": 592 }, { "epoch": 0.8507890961262554, "grad_norm": 6.224435806274414, "kl/ref_to_policy/chosen": -10.420528411865234, "kl/ref_to_policy/mean": 33.17827606201172, "kl/ref_to_policy/rejected": 76.77708435058594, "learning_rate": 6.761497063866207e-07, "logits/chosen": -1.2016135454177856, "logits/rejected": -1.4605642557144165, "logps/chosen": -1751.969970703125, "logps/rejected": -1844.75927734375, "loss": 4.3921, "nll_loss": 0.9863642454147339, "rewards/accuracies": 1.0, "rewards/chosen": 0.10420528799295425, "rewards/margins": 0.8719760179519653, "rewards/rejected": -0.7677708268165588, "step": 593 }, { "epoch": 0.8522238163558106, "grad_norm": 6.962981224060059, "kl/ref_to_policy/chosen": -10.492706298828125, "kl/ref_to_policy/mean": 27.434358596801758, "kl/ref_to_policy/rejected": 65.36141967773438, "learning_rate": 6.63623432992202e-07, "logits/chosen": -1.052018404006958, "logits/rejected": -1.2224020957946777, "logps/chosen": -2033.17919921875, "logps/rejected": -2114.95703125, "loss": 5.5463, "nll_loss": 1.2688941955566406, "rewards/accuracies": 0.875, "rewards/chosen": 0.10492704808712006, "rewards/margins": 0.7585412263870239, "rewards/rejected": -0.6536142230033875, "step": 594 }, { "epoch": 0.8536585365853658, "grad_norm": 7.513665676116943, "kl/ref_to_policy/chosen": -22.44939422607422, "kl/ref_to_policy/mean": 14.777470588684082, "kl/ref_to_policy/rejected": 52.004337310791016, "learning_rate": 6.512060252162228e-07, "logits/chosen": -1.0558853149414062, "logits/rejected": -1.253065824508667, "logps/chosen": -1879.698974609375, "logps/rejected": -1960.9359130859375, "loss": 5.3978, "nll_loss": 1.2302145957946777, "rewards/accuracies": 0.8125, "rewards/chosen": 0.22449393570423126, "rewards/margins": 0.7445372939109802, "rewards/rejected": -0.5200433135032654, "step": 595 }, { "epoch": 0.8550932568149211, "grad_norm": 5.950935363769531, "kl/ref_to_policy/chosen": -26.490093231201172, "kl/ref_to_policy/mean": 23.893596649169922, "kl/ref_to_policy/rejected": 74.27729034423828, "learning_rate": 6.388977948002406e-07, "logits/chosen": -1.1907087564468384, "logits/rejected": -1.4643000364303589, "logps/chosen": -1434.8380126953125, "logps/rejected": -1537.6424560546875, "loss": 4.4421, "nll_loss": 1.0086359977722168, "rewards/accuracies": 0.9375, "rewards/chosen": 0.26490095257759094, "rewards/margins": 1.007673740386963, "rewards/rejected": -0.7427728176116943, "step": 596 }, { "epoch": 0.8565279770444764, "grad_norm": 5.3698859214782715, "kl/ref_to_policy/chosen": -17.0977725982666, "kl/ref_to_policy/mean": 33.01145935058594, "kl/ref_to_policy/rejected": 83.12068939208984, "learning_rate": 6.2669905074489e-07, "logits/chosen": -1.3890159130096436, "logits/rejected": -1.6278325319290161, "logps/chosen": -1567.308349609375, "logps/rejected": -1670.6461181640625, "loss": 4.3254, "nll_loss": 0.9790010452270508, "rewards/accuracies": 0.9375, "rewards/chosen": 0.17097774147987366, "rewards/margins": 1.0021846294403076, "rewards/rejected": -0.8312069177627563, "step": 597 }, { "epoch": 0.8579626972740315, "grad_norm": 5.109411716461182, "kl/ref_to_policy/chosen": 3.443758487701416, "kl/ref_to_policy/mean": 54.032447814941406, "kl/ref_to_policy/rejected": 104.62113189697266, "learning_rate": 6.146100993021308e-07, "logits/chosen": -1.237080454826355, "logits/rejected": -1.4954807758331299, "logps/chosen": -2126.693603515625, "logps/rejected": -2230.963134765625, "loss": 4.6361, "nll_loss": 1.0574281215667725, "rewards/accuracies": 1.0, "rewards/chosen": -0.03443756699562073, "rewards/margins": 1.0117738246917725, "rewards/rejected": -1.0462113618850708, "step": 598 }, { "epoch": 0.8593974175035868, "grad_norm": 4.8077006340026855, "kl/ref_to_policy/chosen": -12.700933456420898, "kl/ref_to_policy/mean": 50.25029754638672, "kl/ref_to_policy/rejected": 113.20152282714844, "learning_rate": 6.026312439675553e-07, "logits/chosen": -1.485308051109314, "logits/rejected": -1.835515022277832, "logps/chosen": -1241.9708251953125, "logps/rejected": -1365.293212890625, "loss": 3.4229, "nll_loss": 0.76894211769104, "rewards/accuracies": 0.8125, "rewards/chosen": 0.1270093470811844, "rewards/margins": 1.2590245008468628, "rewards/rejected": -1.1320152282714844, "step": 599 }, { "epoch": 0.860832137733142, "grad_norm": 6.1690354347229, "kl/ref_to_policy/chosen": -21.29623794555664, "kl/ref_to_policy/mean": 27.85297203063965, "kl/ref_to_policy/rejected": 77.00218200683594, "learning_rate": 5.907627854727688e-07, "logits/chosen": -1.2993284463882446, "logits/rejected": -1.5344257354736328, "logps/chosen": -1194.5709228515625, "logps/rejected": -1296.019775390625, "loss": 4.1006, "nll_loss": 0.9207570552825928, "rewards/accuracies": 0.9375, "rewards/chosen": 0.21296237409114838, "rewards/margins": 0.9829842448234558, "rewards/rejected": -0.7700218558311462, "step": 600 }, { "epoch": 0.860832137733142, "eval_kl/ref_to_policy/chosen": -17.110971450805664, "eval_kl/ref_to_policy/mean": 39.80633544921875, "eval_kl/ref_to_policy/rejected": 96.72364044189453, "eval_logits/chosen": -1.4161990880966187, "eval_logits/rejected": -1.6918079853057861, "eval_logps/chosen": -1507.1251220703125, "eval_logps/rejected": -1620.4462890625, "eval_loss": 4.129255771636963, "eval_nll_loss": 0.938656747341156, "eval_rewards/accuracies": 0.9255319237709045, "eval_rewards/chosen": 0.17110970616340637, "eval_rewards/margins": 1.1383461952209473, "eval_rewards/rejected": -0.9672363996505737, "eval_runtime": 111.4923, "eval_samples_per_second": 3.372, "eval_steps_per_second": 1.686, "step": 600 }, { "epoch": 0.8622668579626973, "grad_norm": 7.41853141784668, "kl/ref_to_policy/chosen": -15.512503623962402, "kl/ref_to_policy/mean": 40.663387298583984, "kl/ref_to_policy/rejected": 96.83928680419922, "learning_rate": 5.790050217778442e-07, "logits/chosen": -1.3708429336547852, "logits/rejected": -1.701097846031189, "logps/chosen": -949.5499267578125, "logps/rejected": -1063.3157958984375, "loss": 3.9354, "nll_loss": 0.8895400166511536, "rewards/accuracies": 0.9375, "rewards/chosen": 0.15512502193450928, "rewards/margins": 1.1235177516937256, "rewards/rejected": -0.9683927893638611, "step": 601 }, { "epoch": 0.8637015781922525, "grad_norm": 6.895825386047363, "kl/ref_to_policy/chosen": -18.879737854003906, "kl/ref_to_policy/mean": 7.151588439941406, "kl/ref_to_policy/rejected": 33.18291473388672, "learning_rate": 5.673582480638395e-07, "logits/chosen": -0.8206638693809509, "logits/rejected": -0.903295636177063, "logps/chosen": -2215.018798828125, "logps/rejected": -2275.985595703125, "loss": 5.7167, "nll_loss": 1.2970541715621948, "rewards/accuracies": 1.0, "rewards/chosen": 0.18879736959934235, "rewards/margins": 0.5206265449523926, "rewards/rejected": -0.33182916045188904, "step": 602 }, { "epoch": 0.8651362984218077, "grad_norm": 7.283055782318115, "kl/ref_to_policy/chosen": -18.823543548583984, "kl/ref_to_policy/mean": 37.263893127441406, "kl/ref_to_policy/rejected": 93.3513412475586, "learning_rate": 5.558227567253832e-07, "logits/chosen": -1.516948938369751, "logits/rejected": -1.782738447189331, "logps/chosen": -1392.028076171875, "logps/rejected": -1505.759521484375, "loss": 3.9983, "nll_loss": 0.9044390916824341, "rewards/accuracies": 0.875, "rewards/chosen": 0.18823541700839996, "rewards/margins": 1.1217488050460815, "rewards/rejected": -0.9335132837295532, "step": 603 }, { "epoch": 0.866571018651363, "grad_norm": 5.374007225036621, "kl/ref_to_policy/chosen": -1.8257246017456055, "kl/ref_to_policy/mean": 42.3155517578125, "kl/ref_to_policy/rejected": 86.45681762695312, "learning_rate": 5.443988373633397e-07, "logits/chosen": -1.2051126956939697, "logits/rejected": -1.4026520252227783, "logps/chosen": -2390.79931640625, "logps/rejected": -2483.515380859375, "loss": 5.2794, "nll_loss": 1.2099766731262207, "rewards/accuracies": 1.0, "rewards/chosen": 0.01825723797082901, "rewards/margins": 0.882825493812561, "rewards/rejected": -0.8645683526992798, "step": 604 }, { "epoch": 0.8680057388809183, "grad_norm": 3.996739625930786, "kl/ref_to_policy/chosen": -16.849952697753906, "kl/ref_to_policy/mean": 65.98097229003906, "kl/ref_to_policy/rejected": 148.8118896484375, "learning_rate": 5.330867767775333e-07, "logits/chosen": -1.8794902563095093, "logits/rejected": -2.3127284049987793, "logps/chosen": -748.4859619140625, "logps/rejected": -904.4505615234375, "loss": 2.1979, "nll_loss": 0.48902565240859985, "rewards/accuracies": 1.0, "rewards/chosen": 0.16849951446056366, "rewards/margins": 1.656618356704712, "rewards/rejected": -1.4881188869476318, "step": 605 }, { "epoch": 0.8694404591104734, "grad_norm": 21.723936080932617, "kl/ref_to_policy/chosen": -11.356147766113281, "kl/ref_to_policy/mean": 40.74607467651367, "kl/ref_to_policy/rejected": 92.84829711914062, "learning_rate": 5.218868589595555e-07, "logits/chosen": -1.5024693012237549, "logits/rejected": -1.6817326545715332, "logps/chosen": -1314.768798828125, "logps/rejected": -1418.5513916015625, "loss": 3.6807, "nll_loss": 0.8209651708602905, "rewards/accuracies": 0.875, "rewards/chosen": 0.1135614812374115, "rewards/margins": 1.0420444011688232, "rewards/rejected": -0.9284829497337341, "step": 606 }, { "epoch": 0.8708751793400287, "grad_norm": 4.309367656707764, "kl/ref_to_policy/chosen": -26.2618465423584, "kl/ref_to_policy/mean": 44.187294006347656, "kl/ref_to_policy/rejected": 114.63643646240234, "learning_rate": 5.107993650856285e-07, "logits/chosen": -1.5919941663742065, "logits/rejected": -1.9421714544296265, "logps/chosen": -874.8396606445312, "logps/rejected": -1009.4215698242188, "loss": 2.6231, "nll_loss": 0.5798934102058411, "rewards/accuracies": 1.0, "rewards/chosen": 0.26261845231056213, "rewards/margins": 1.408982753753662, "rewards/rejected": -1.1463643312454224, "step": 607 }, { "epoch": 0.8723098995695839, "grad_norm": 5.461764812469482, "kl/ref_to_policy/chosen": -5.4127326011657715, "kl/ref_to_policy/mean": 44.10942459106445, "kl/ref_to_policy/rejected": 93.63157653808594, "learning_rate": 4.998245735095459e-07, "logits/chosen": -1.2386846542358398, "logits/rejected": -1.4683257341384888, "logps/chosen": -1729.76708984375, "logps/rejected": -1832.9656982421875, "loss": 4.4837, "nll_loss": 1.0174630880355835, "rewards/accuracies": 0.875, "rewards/chosen": 0.05412732809782028, "rewards/margins": 0.9904431104660034, "rewards/rejected": -0.9363157749176025, "step": 608 }, { "epoch": 0.8737446197991392, "grad_norm": 5.341058254241943, "kl/ref_to_policy/chosen": -0.24441266059875488, "kl/ref_to_policy/mean": 41.25947570800781, "kl/ref_to_policy/rejected": 82.76336669921875, "learning_rate": 4.889627597556911e-07, "logits/chosen": -1.1642308235168457, "logits/rejected": -1.3882381916046143, "logps/chosen": -2239.232177734375, "logps/rejected": -2333.16943359375, "loss": 5.1917, "nll_loss": 1.1814175844192505, "rewards/accuracies": 0.8125, "rewards/chosen": 0.0024441108107566833, "rewards/margins": 0.8300777673721313, "rewards/rejected": -0.8276336193084717, "step": 609 }, { "epoch": 0.8751793400286944, "grad_norm": 5.46244478225708, "kl/ref_to_policy/chosen": -11.677900314331055, "kl/ref_to_policy/mean": 32.798667907714844, "kl/ref_to_policy/rejected": 77.27523803710938, "learning_rate": 4.782141965121129e-07, "logits/chosen": -1.1255029439926147, "logits/rejected": -1.361769676208496, "logps/chosen": -1561.7518310546875, "logps/rejected": -1654.4315185546875, "loss": 4.6492, "nll_loss": 1.0534652471542358, "rewards/accuracies": 0.9375, "rewards/chosen": 0.11677897721529007, "rewards/margins": 0.8895313143730164, "rewards/rejected": -0.7727523446083069, "step": 610 }, { "epoch": 0.8751793400286944, "eval_kl/ref_to_policy/chosen": -17.203529357910156, "eval_kl/ref_to_policy/mean": 39.726806640625, "eval_kl/ref_to_policy/rejected": 96.65714263916016, "eval_logits/chosen": -1.4214683771133423, "eval_logits/rejected": -1.6979544162750244, "eval_logps/chosen": -1507.032470703125, "eval_logps/rejected": -1620.3797607421875, "eval_loss": 4.140796184539795, "eval_nll_loss": 0.9416362047195435, "eval_rewards/accuracies": 0.936170220375061, "eval_rewards/chosen": 0.17203529179096222, "eval_rewards/margins": 1.1386066675186157, "eval_rewards/rejected": -0.9665713906288147, "eval_runtime": 112.3949, "eval_samples_per_second": 3.345, "eval_steps_per_second": 1.673, "step": 610 }, { "epoch": 0.8766140602582496, "grad_norm": 5.4254608154296875, "kl/ref_to_policy/chosen": -13.430669784545898, "kl/ref_to_policy/mean": 36.05692672729492, "kl/ref_to_policy/rejected": 85.54452514648438, "learning_rate": 4.6757915362368567e-07, "logits/chosen": -1.3475139141082764, "logits/rejected": -1.637538194656372, "logps/chosen": -1794.0849609375, "logps/rejected": -1897.8551025390625, "loss": 4.2829, "nll_loss": 0.9668095111846924, "rewards/accuracies": 0.9375, "rewards/chosen": 0.13430668413639069, "rewards/margins": 0.989751935005188, "rewards/rejected": -0.8554452061653137, "step": 611 }, { "epoch": 0.8780487804878049, "grad_norm": 6.451540946960449, "kl/ref_to_policy/chosen": -30.97838592529297, "kl/ref_to_policy/mean": 23.353565216064453, "kl/ref_to_policy/rejected": 77.68550872802734, "learning_rate": 4.570578980853302e-07, "logits/chosen": -1.4640135765075684, "logits/rejected": -1.7294178009033203, "logps/chosen": -1519.7559814453125, "logps/rejected": -1633.3184814453125, "loss": 3.7649, "nll_loss": 0.8422784209251404, "rewards/accuracies": 0.875, "rewards/chosen": 0.30978384613990784, "rewards/margins": 1.0866389274597168, "rewards/rejected": -0.7768551111221313, "step": 612 }, { "epoch": 0.8794835007173601, "grad_norm": 5.632730960845947, "kl/ref_to_policy/chosen": -22.342731475830078, "kl/ref_to_policy/mean": 26.944961547851562, "kl/ref_to_policy/rejected": 76.23265838623047, "learning_rate": 4.466506940353138e-07, "logits/chosen": -1.2617111206054688, "logits/rejected": -1.5164433717727661, "logps/chosen": -1345.9620361328125, "logps/rejected": -1448.6192626953125, "loss": 4.0781, "nll_loss": 0.9153661727905273, "rewards/accuracies": 0.9375, "rewards/chosen": 0.22342732548713684, "rewards/margins": 0.9857538938522339, "rewards/rejected": -0.7623265981674194, "step": 613 }, { "epoch": 0.8809182209469153, "grad_norm": 3.4749670028686523, "kl/ref_to_policy/chosen": -26.091026306152344, "kl/ref_to_policy/mean": 49.881385803222656, "kl/ref_to_policy/rejected": 125.85380554199219, "learning_rate": 4.363578027486187e-07, "logits/chosen": -1.7217013835906982, "logits/rejected": -2.1357903480529785, "logps/chosen": -1092.510498046875, "logps/rejected": -1238.1702880859375, "loss": 2.3606, "nll_loss": 0.5207383036613464, "rewards/accuracies": 0.9375, "rewards/chosen": 0.2609102725982666, "rewards/margins": 1.5194482803344727, "rewards/rejected": -1.258538007736206, "step": 614 }, { "epoch": 0.8823529411764706, "grad_norm": 4.51090145111084, "kl/ref_to_policy/chosen": -24.650169372558594, "kl/ref_to_policy/mean": 38.67722702026367, "kl/ref_to_policy/rejected": 102.0046157836914, "learning_rate": 4.261794826303783e-07, "logits/chosen": -1.447562336921692, "logits/rejected": -1.7942482233047485, "logps/chosen": -1019.5139770507812, "logps/rejected": -1143.598388671875, "loss": 2.9855, "nll_loss": 0.6611955165863037, "rewards/accuracies": 0.9375, "rewards/chosen": 0.24650168418884277, "rewards/margins": 1.2665477991104126, "rewards/rejected": -1.0200461149215698, "step": 615 }, { "epoch": 0.8837876614060258, "grad_norm": 4.193504333496094, "kl/ref_to_policy/chosen": -0.7611985206604004, "kl/ref_to_policy/mean": 68.01996612548828, "kl/ref_to_policy/rejected": 136.8011474609375, "learning_rate": 4.16115989209398e-07, "logits/chosen": -1.6044471263885498, "logits/rejected": -1.9398854970932007, "logps/chosen": -1286.2701416015625, "logps/rejected": -1420.87939453125, "loss": 3.2334, "nll_loss": 0.7297648191452026, "rewards/accuracies": 0.875, "rewards/chosen": 0.007611950859427452, "rewards/margins": 1.375623345375061, "rewards/rejected": -1.368011236190796, "step": 616 }, { "epoch": 0.8852223816355811, "grad_norm": 5.4579997062683105, "kl/ref_to_policy/chosen": -16.072351455688477, "kl/ref_to_policy/mean": 46.01536178588867, "kl/ref_to_policy/rejected": 108.10308074951172, "learning_rate": 4.0616757513173123e-07, "logits/chosen": -1.5481315851211548, "logits/rejected": -1.8903717994689941, "logps/chosen": -1106.2010498046875, "logps/rejected": -1229.0152587890625, "loss": 3.3453, "nll_loss": 0.7474454641342163, "rewards/accuracies": 0.8125, "rewards/chosen": 0.1607235074043274, "rewards/margins": 1.2417542934417725, "rewards/rejected": -1.0810308456420898, "step": 617 }, { "epoch": 0.8866571018651362, "grad_norm": 5.697551727294922, "kl/ref_to_policy/chosen": -7.637320041656494, "kl/ref_to_policy/mean": 34.81425094604492, "kl/ref_to_policy/rejected": 77.26582336425781, "learning_rate": 3.963344901543437e-07, "logits/chosen": -1.1867974996566772, "logits/rejected": -1.428100824356079, "logps/chosen": -1615.65966796875, "logps/rejected": -1708.590087890625, "loss": 4.6375, "nll_loss": 1.0455132722854614, "rewards/accuracies": 0.875, "rewards/chosen": 0.07637319713830948, "rewards/margins": 0.849031388759613, "rewards/rejected": -0.7726582288742065, "step": 618 }, { "epoch": 0.8880918220946915, "grad_norm": 5.264880657196045, "kl/ref_to_policy/chosen": -26.787555694580078, "kl/ref_to_policy/mean": 42.53929901123047, "kl/ref_to_policy/rejected": 111.86615753173828, "learning_rate": 3.866169811388415e-07, "logits/chosen": -1.726571798324585, "logits/rejected": -2.0484726428985596, "logps/chosen": -857.8140258789062, "logps/rejected": -992.1259155273438, "loss": 2.7382, "nll_loss": 0.606389582157135, "rewards/accuracies": 1.0, "rewards/chosen": 0.2678755521774292, "rewards/margins": 1.3865370750427246, "rewards/rejected": -1.1186615228652954, "step": 619 }, { "epoch": 0.8895265423242468, "grad_norm": 5.060810089111328, "kl/ref_to_policy/chosen": -18.998743057250977, "kl/ref_to_policy/mean": 30.867141723632812, "kl/ref_to_policy/rejected": 80.7330322265625, "learning_rate": 3.7701529204526856e-07, "logits/chosen": -1.3011690378189087, "logits/rejected": -1.5695563554763794, "logps/chosen": -1574.251220703125, "logps/rejected": -1678.38330078125, "loss": 4.1734, "nll_loss": 0.9402675628662109, "rewards/accuracies": 1.0, "rewards/chosen": 0.1899874210357666, "rewards/margins": 0.9973176717758179, "rewards/rejected": -0.8073302507400513, "step": 620 }, { "epoch": 0.8895265423242468, "eval_kl/ref_to_policy/chosen": -17.163597106933594, "eval_kl/ref_to_policy/mean": 39.841880798339844, "eval_kl/ref_to_policy/rejected": 96.84735870361328, "eval_logits/chosen": -1.4197100400924683, "eval_logits/rejected": -1.7006374597549438, "eval_logps/chosen": -1507.072509765625, "eval_logps/rejected": -1620.5699462890625, "eval_loss": 4.14015531539917, "eval_nll_loss": 0.9416007995605469, "eval_rewards/accuracies": 0.936170220375061, "eval_rewards/chosen": 0.1716359704732895, "eval_rewards/margins": 1.1401094198226929, "eval_rewards/rejected": -0.9684735536575317, "eval_runtime": 113.5401, "eval_samples_per_second": 3.312, "eval_steps_per_second": 1.656, "step": 620 }, { "epoch": 0.890961262553802, "grad_norm": 45.04426574707031, "kl/ref_to_policy/chosen": -20.28866958618164, "kl/ref_to_policy/mean": 40.56842803955078, "kl/ref_to_policy/rejected": 101.42552185058594, "learning_rate": 3.675296639259912e-07, "logits/chosen": -1.7693495750427246, "logits/rejected": -2.036048650741577, "logps/chosen": -821.9613647460938, "logps/rejected": -942.423583984375, "loss": 2.8394, "nll_loss": 0.622097909450531, "rewards/accuracies": 0.9375, "rewards/chosen": 0.2028866857290268, "rewards/margins": 1.21714186668396, "rewards/rejected": -1.014255166053772, "step": 621 }, { "epoch": 0.8923959827833573, "grad_norm": 5.1058220863342285, "kl/ref_to_policy/chosen": -28.242536544799805, "kl/ref_to_policy/mean": 28.717350006103516, "kl/ref_to_policy/rejected": 85.67723846435547, "learning_rate": 3.581603349196372e-07, "logits/chosen": -1.4029499292373657, "logits/rejected": -1.7227356433868408, "logps/chosen": -992.4450073242188, "logps/rejected": -1107.04833984375, "loss": 3.3239, "nll_loss": 0.7378114461898804, "rewards/accuracies": 1.0, "rewards/chosen": 0.2824253439903259, "rewards/margins": 1.1391977071762085, "rewards/rejected": -0.8567723631858826, "step": 622 }, { "epoch": 0.8938307030129125, "grad_norm": 5.036569595336914, "kl/ref_to_policy/chosen": -3.5088918209075928, "kl/ref_to_policy/mean": 44.56452178955078, "kl/ref_to_policy/rejected": 92.637939453125, "learning_rate": 3.4890754024512254e-07, "logits/chosen": -1.1561803817749023, "logits/rejected": -1.4114694595336914, "logps/chosen": -1800.4207763671875, "logps/rejected": -1904.561767578125, "loss": 4.7652, "nll_loss": 1.0851035118103027, "rewards/accuracies": 0.8125, "rewards/chosen": 0.0350889191031456, "rewards/margins": 0.9614682197570801, "rewards/rejected": -0.9263793230056763, "step": 623 }, { "epoch": 0.8952654232424677, "grad_norm": 5.509184837341309, "kl/ref_to_policy/chosen": -8.820626258850098, "kl/ref_to_policy/mean": 42.4105110168457, "kl/ref_to_policy/rejected": 93.64165496826172, "learning_rate": 3.397715121957468e-07, "logits/chosen": -1.2623847723007202, "logits/rejected": -1.5266954898834229, "logps/chosen": -1485.210693359375, "logps/rejected": -1588.6044921875, "loss": 4.2496, "nll_loss": 0.9621097445487976, "rewards/accuracies": 1.0, "rewards/chosen": 0.08820626139640808, "rewards/margins": 1.0246226787567139, "rewards/rejected": -0.936416506767273, "step": 624 }, { "epoch": 0.896700143472023, "grad_norm": 3.3208630084991455, "kl/ref_to_policy/chosen": -15.965128898620605, "kl/ref_to_policy/mean": 60.26859664916992, "kl/ref_to_policy/rejected": 136.5023193359375, "learning_rate": 3.3075248013335614e-07, "logits/chosen": -1.7449660301208496, "logits/rejected": -2.1670286655426025, "logps/chosen": -1338.292724609375, "logps/rejected": -1484.40673828125, "loss": 2.7716, "nll_loss": 0.623875617980957, "rewards/accuracies": 1.0, "rewards/chosen": 0.1596512794494629, "rewards/margins": 1.524674415588379, "rewards/rejected": -1.365023136138916, "step": 625 }, { "epoch": 0.8981348637015782, "grad_norm": 4.961740016937256, "kl/ref_to_policy/chosen": -28.681474685668945, "kl/ref_to_policy/mean": 29.529897689819336, "kl/ref_to_policy/rejected": 87.74127197265625, "learning_rate": 3.2185067048259245e-07, "logits/chosen": -1.3515844345092773, "logits/rejected": -1.666420817375183, "logps/chosen": -1551.920166015625, "logps/rejected": -1666.5322265625, "loss": 4.2161, "nll_loss": 0.9626041650772095, "rewards/accuracies": 1.0, "rewards/chosen": 0.28681471943855286, "rewards/margins": 1.1642273664474487, "rewards/rejected": -0.877412736415863, "step": 626 }, { "epoch": 0.8995695839311334, "grad_norm": 3.253243923187256, "kl/ref_to_policy/chosen": -20.21104621887207, "kl/ref_to_policy/mean": 56.21403503417969, "kl/ref_to_policy/rejected": 132.63909912109375, "learning_rate": 3.1306630672520153e-07, "logits/chosen": -1.87485933303833, "logits/rejected": -2.245018243789673, "logps/chosen": -1078.2108154296875, "logps/rejected": -1223.19580078125, "loss": 2.3744, "nll_loss": 0.5252514481544495, "rewards/accuracies": 0.9375, "rewards/chosen": 0.2021104395389557, "rewards/margins": 1.5285015106201172, "rewards/rejected": -1.3263909816741943, "step": 627 }, { "epoch": 0.9010043041606887, "grad_norm": 3.981308698654175, "kl/ref_to_policy/chosen": -10.931556701660156, "kl/ref_to_policy/mean": 60.1900634765625, "kl/ref_to_policy/rejected": 131.31167602539062, "learning_rate": 3.0439960939442794e-07, "logits/chosen": -1.5179647207260132, "logits/rejected": -1.9210275411605835, "logps/chosen": -1141.0450439453125, "logps/rejected": -1276.5753173828125, "loss": 2.8613, "nll_loss": 0.6405057907104492, "rewards/accuracies": 0.9375, "rewards/chosen": 0.10931553691625595, "rewards/margins": 1.42243230342865, "rewards/rejected": -1.3131169080734253, "step": 628 }, { "epoch": 0.9024390243902439, "grad_norm": 7.278995037078857, "kl/ref_to_policy/chosen": 8.053503036499023, "kl/ref_to_policy/mean": 31.88091278076172, "kl/ref_to_policy/rejected": 55.70832061767578, "learning_rate": 2.9585079606947843e-07, "logits/chosen": -0.8360357284545898, "logits/rejected": -0.9585114121437073, "logps/chosen": -2271.1865234375, "logps/rejected": -2331.03076171875, "loss": 6.2749, "nll_loss": 1.431099534034729, "rewards/accuracies": 0.9375, "rewards/chosen": -0.08053503185510635, "rewards/margins": 0.4765481650829315, "rewards/rejected": -0.5570833086967468, "step": 629 }, { "epoch": 0.9038737446197992, "grad_norm": 4.589001655578613, "kl/ref_to_policy/chosen": -5.972783088684082, "kl/ref_to_policy/mean": 51.01249694824219, "kl/ref_to_policy/rejected": 107.9977798461914, "learning_rate": 2.874200813700534e-07, "logits/chosen": -1.3745108842849731, "logits/rejected": -1.6660573482513428, "logps/chosen": -1906.167724609375, "logps/rejected": -2020.9293212890625, "loss": 3.9083, "nll_loss": 0.8840414881706238, "rewards/accuracies": 1.0, "rewards/chosen": 0.059727832674980164, "rewards/margins": 1.1397056579589844, "rewards/rejected": -1.0799777507781982, "step": 630 }, { "epoch": 0.9038737446197992, "eval_kl/ref_to_policy/chosen": -17.133895874023438, "eval_kl/ref_to_policy/mean": 39.947418212890625, "eval_kl/ref_to_policy/rejected": 97.02872467041016, "eval_logits/chosen": -1.4138108491897583, "eval_logits/rejected": -1.7049050331115723, "eval_logps/chosen": -1507.10205078125, "eval_logps/rejected": -1620.7513427734375, "eval_loss": 4.125217437744141, "eval_nll_loss": 0.9378679394721985, "eval_rewards/accuracies": 0.9335106611251831, "eval_rewards/chosen": 0.17133893072605133, "eval_rewards/margins": 1.141626238822937, "eval_rewards/rejected": -0.9702872633934021, "eval_runtime": 113.0821, "eval_samples_per_second": 3.325, "eval_steps_per_second": 1.663, "step": 630 }, { "epoch": 0.9053084648493543, "grad_norm": 5.754022121429443, "kl/ref_to_policy/chosen": -26.471263885498047, "kl/ref_to_policy/mean": 50.63972854614258, "kl/ref_to_policy/rejected": 127.75074005126953, "learning_rate": 2.7910767695096707e-07, "logits/chosen": -1.772430419921875, "logits/rejected": -2.1906304359436035, "logps/chosen": -678.7864990234375, "logps/rejected": -824.2291870117188, "loss": 2.335, "nll_loss": 0.5166870355606079, "rewards/accuracies": 1.0, "rewards/chosen": 0.2647126317024231, "rewards/margins": 1.5422199964523315, "rewards/rejected": -1.2775073051452637, "step": 631 }, { "epoch": 0.9067431850789096, "grad_norm": 5.355777740478516, "kl/ref_to_policy/chosen": -22.16982650756836, "kl/ref_to_policy/mean": 46.25419616699219, "kl/ref_to_policy/rejected": 114.67821502685547, "learning_rate": 2.7091379149682683e-07, "logits/chosen": -1.604055643081665, "logits/rejected": -2.000837802886963, "logps/chosen": -935.0410766601562, "logps/rejected": -1069.92626953125, "loss": 2.9361, "nll_loss": 0.6537454724311829, "rewards/accuracies": 1.0, "rewards/chosen": 0.22169825434684753, "rewards/margins": 1.3684804439544678, "rewards/rejected": -1.1467821598052979, "step": 632 }, { "epoch": 0.9081779053084649, "grad_norm": 4.050642490386963, "kl/ref_to_policy/chosen": -32.73173904418945, "kl/ref_to_policy/mean": 37.04167938232422, "kl/ref_to_policy/rejected": 106.81509399414062, "learning_rate": 2.628386307167996e-07, "logits/chosen": -1.5850826501846313, "logits/rejected": -1.966376543045044, "logps/chosen": -968.135498046875, "logps/rejected": -1103.244873046875, "loss": 2.7489, "nll_loss": 0.6106653809547424, "rewards/accuracies": 0.9375, "rewards/chosen": 0.32731738686561584, "rewards/margins": 1.3954684734344482, "rewards/rejected": -1.0681509971618652, "step": 633 }, { "epoch": 0.9096126255380201, "grad_norm": 5.714278221130371, "kl/ref_to_policy/chosen": -19.872581481933594, "kl/ref_to_policy/mean": 43.34891891479492, "kl/ref_to_policy/rejected": 106.57042694091797, "learning_rate": 2.548823973394449e-07, "logits/chosen": -1.5790001153945923, "logits/rejected": -1.9378613233566284, "logps/chosen": -909.61181640625, "logps/rejected": -1034.505859375, "loss": 3.1299, "nll_loss": 0.6964945793151855, "rewards/accuracies": 1.0, "rewards/chosen": 0.19872578978538513, "rewards/margins": 1.264430046081543, "rewards/rejected": -1.0657042264938354, "step": 634 }, { "epoch": 0.9110473457675753, "grad_norm": 4.584503173828125, "kl/ref_to_policy/chosen": -7.816714286804199, "kl/ref_to_policy/mean": 49.6978874206543, "kl/ref_to_policy/rejected": 107.21247863769531, "learning_rate": 2.470452911076227e-07, "logits/chosen": -1.2773919105529785, "logits/rejected": -1.5998426675796509, "logps/chosen": -1692.222412109375, "logps/rejected": -1806.16748046875, "loss": 3.9393, "nll_loss": 0.8927252292633057, "rewards/accuracies": 1.0, "rewards/chosen": 0.07816711813211441, "rewards/margins": 1.150291919708252, "rewards/rejected": -1.0721248388290405, "step": 635 }, { "epoch": 0.9124820659971306, "grad_norm": 5.936883926391602, "kl/ref_to_policy/chosen": -24.057344436645508, "kl/ref_to_policy/mean": 32.14082336425781, "kl/ref_to_policy/rejected": 88.3389892578125, "learning_rate": 2.393275087734864e-07, "logits/chosen": -1.3505314588546753, "logits/rejected": -1.7079354524612427, "logps/chosen": -1171.177001953125, "logps/rejected": -1285.0966796875, "loss": 3.6829, "nll_loss": 0.8251689672470093, "rewards/accuracies": 0.9375, "rewards/chosen": 0.2405734360218048, "rewards/margins": 1.1239633560180664, "rewards/rejected": -0.8833897709846497, "step": 636 }, { "epoch": 0.9139167862266858, "grad_norm": 5.188084602355957, "kl/ref_to_policy/chosen": -1.6324559450149536, "kl/ref_to_policy/mean": 41.43467712402344, "kl/ref_to_policy/rejected": 84.50180053710938, "learning_rate": 2.317292440935348e-07, "logits/chosen": -1.0814932584762573, "logits/rejected": -1.2979846000671387, "logps/chosen": -2388.593994140625, "logps/rejected": -2482.431396484375, "loss": 5.573, "nll_loss": 1.2806445360183716, "rewards/accuracies": 0.9375, "rewards/chosen": 0.01632455736398697, "rewards/margins": 0.8613425493240356, "rewards/rejected": -0.8450180292129517, "step": 637 }, { "epoch": 0.9153515064562411, "grad_norm": 3.7925291061401367, "kl/ref_to_policy/chosen": -21.190759658813477, "kl/ref_to_policy/mean": 60.606170654296875, "kl/ref_to_policy/rejected": 142.40309143066406, "learning_rate": 2.242506878237538e-07, "logits/chosen": -1.9552291631698608, "logits/rejected": -2.39743971824646, "logps/chosen": -432.44989013671875, "logps/rejected": -587.209716796875, "loss": 1.6348, "nll_loss": 0.3462771475315094, "rewards/accuracies": 0.9375, "rewards/chosen": 0.21190756559371948, "rewards/margins": 1.6359386444091797, "rewards/rejected": -1.4240309000015259, "step": 638 }, { "epoch": 0.9167862266857962, "grad_norm": 5.004083156585693, "kl/ref_to_policy/chosen": 5.442426681518555, "kl/ref_to_policy/mean": 61.657718658447266, "kl/ref_to_policy/rejected": 117.87300872802734, "learning_rate": 2.1689202771482344e-07, "logits/chosen": -1.414747953414917, "logits/rejected": -1.7541085481643677, "logps/chosen": -1697.7081298828125, "logps/rejected": -1811.36328125, "loss": 4.0057, "nll_loss": 0.9066685438156128, "rewards/accuracies": 0.875, "rewards/chosen": -0.054424263536930084, "rewards/margins": 1.1243058443069458, "rewards/rejected": -1.1787301301956177, "step": 639 }, { "epoch": 0.9182209469153515, "grad_norm": 4.367313385009766, "kl/ref_to_policy/chosen": -11.486897468566895, "kl/ref_to_policy/mean": 51.31605529785156, "kl/ref_to_policy/rejected": 114.11900329589844, "learning_rate": 2.0965344850740698e-07, "logits/chosen": -1.4567674398422241, "logits/rejected": -1.806179404258728, "logps/chosen": -1540.73193359375, "logps/rejected": -1665.822265625, "loss": 3.8233, "nll_loss": 0.869227945804596, "rewards/accuracies": 1.0, "rewards/chosen": 0.11486896872520447, "rewards/margins": 1.2560590505599976, "rewards/rejected": -1.1411900520324707, "step": 640 }, { "epoch": 0.9182209469153515, "eval_kl/ref_to_policy/chosen": -17.082050323486328, "eval_kl/ref_to_policy/mean": 39.96186447143555, "eval_kl/ref_to_policy/rejected": 97.00578308105469, "eval_logits/chosen": -1.4110368490219116, "eval_logits/rejected": -1.7043132781982422, "eval_logps/chosen": -1507.154052734375, "eval_logps/rejected": -1620.7283935546875, "eval_loss": 4.123661041259766, "eval_nll_loss": 0.9372899532318115, "eval_rewards/accuracies": 0.936170220375061, "eval_rewards/chosen": 0.17082051932811737, "eval_rewards/margins": 1.140878438949585, "eval_rewards/rejected": -0.9700578451156616, "eval_runtime": 113.5648, "eval_samples_per_second": 3.311, "eval_steps_per_second": 1.655, "step": 640 }, { "epoch": 0.9196556671449068, "grad_norm": 5.346977233886719, "kl/ref_to_policy/chosen": -17.321691513061523, "kl/ref_to_policy/mean": 37.97872543334961, "kl/ref_to_policy/rejected": 93.27914428710938, "learning_rate": 2.0253513192751374e-07, "logits/chosen": -1.42568838596344, "logits/rejected": -1.7524231672286987, "logps/chosen": -1079.1080322265625, "logps/rejected": -1193.1502685546875, "loss": 3.4333, "nll_loss": 0.7609674334526062, "rewards/accuracies": 0.875, "rewards/chosen": 0.17321690917015076, "rewards/margins": 1.1060084104537964, "rewards/rejected": -0.9327914118766785, "step": 641 }, { "epoch": 0.921090387374462, "grad_norm": 7.25120735168457, "kl/ref_to_policy/chosen": -21.297283172607422, "kl/ref_to_policy/mean": 22.60654640197754, "kl/ref_to_policy/rejected": 66.5103759765625, "learning_rate": 1.9553725668193192e-07, "logits/chosen": -1.1283504962921143, "logits/rejected": -1.3915472030639648, "logps/chosen": -1028.98388671875, "logps/rejected": -1122.8853759765625, "loss": 4.6225, "nll_loss": 1.0453987121582031, "rewards/accuracies": 0.9375, "rewards/chosen": 0.21297281980514526, "rewards/margins": 0.8780765533447266, "rewards/rejected": -0.6651037335395813, "step": 642 }, { "epoch": 0.9225251076040172, "grad_norm": 43.2213020324707, "kl/ref_to_policy/chosen": -3.916275978088379, "kl/ref_to_policy/mean": 41.61859893798828, "kl/ref_to_policy/rejected": 87.15348052978516, "learning_rate": 1.8865999845374794e-07, "logits/chosen": -1.358961820602417, "logits/rejected": -1.582597017288208, "logps/chosen": -1230.505615234375, "logps/rejected": -1324.276611328125, "loss": 3.8535, "nll_loss": 0.8453524112701416, "rewards/accuracies": 0.9375, "rewards/chosen": 0.03916274756193161, "rewards/margins": 0.9106975197792053, "rewards/rejected": -0.8715346455574036, "step": 643 }, { "epoch": 0.9239598278335724, "grad_norm": 3.9759998321533203, "kl/ref_to_policy/chosen": -20.440866470336914, "kl/ref_to_policy/mean": 56.73588943481445, "kl/ref_to_policy/rejected": 133.9126434326172, "learning_rate": 1.8190352989793325e-07, "logits/chosen": -1.848856806755066, "logits/rejected": -2.2223124504089355, "logps/chosen": -944.67626953125, "logps/rejected": -1089.58056640625, "loss": 2.5057, "nll_loss": 0.5602436065673828, "rewards/accuracies": 1.0, "rewards/chosen": 0.204408660531044, "rewards/margins": 1.5435351133346558, "rewards/rejected": -1.3391265869140625, "step": 644 }, { "epoch": 0.9253945480631277, "grad_norm": 5.129319190979004, "kl/ref_to_policy/chosen": -20.787599563598633, "kl/ref_to_policy/mean": 37.250919342041016, "kl/ref_to_policy/rejected": 95.28943634033203, "learning_rate": 1.7526802063700943e-07, "logits/chosen": -1.476973533630371, "logits/rejected": -1.7508971691131592, "logps/chosen": -1862.08740234375, "logps/rejected": -1975.611572265625, "loss": 4.1827, "nll_loss": 0.9545143842697144, "rewards/accuracies": 1.0, "rewards/chosen": 0.20787599682807922, "rewards/margins": 1.1607704162597656, "rewards/rejected": -0.9528943300247192, "step": 645 }, { "epoch": 0.926829268292683, "grad_norm": 5.354647159576416, "kl/ref_to_policy/chosen": -28.084369659423828, "kl/ref_to_policy/mean": 41.9619140625, "kl/ref_to_policy/rejected": 112.00819396972656, "learning_rate": 1.6875363725679052e-07, "logits/chosen": -1.58774733543396, "logits/rejected": -2.0107932090759277, "logps/chosen": -1011.3140258789062, "logps/rejected": -1146.4202880859375, "loss": 2.86, "nll_loss": 0.6375475525856018, "rewards/accuracies": 0.875, "rewards/chosen": 0.28084370493888855, "rewards/margins": 1.400925636291504, "rewards/rejected": -1.1200820207595825, "step": 646 }, { "epoch": 0.9282639885222381, "grad_norm": 4.381620407104492, "kl/ref_to_policy/chosen": -18.939767837524414, "kl/ref_to_policy/mean": 52.46278381347656, "kl/ref_to_policy/rejected": 123.86532592773438, "learning_rate": 1.6236054330219853e-07, "logits/chosen": -1.7269765138626099, "logits/rejected": -2.1205403804779053, "logps/chosen": -1058.7945556640625, "logps/rejected": -1194.2384033203125, "loss": 3.0647, "nll_loss": 0.6925135850906372, "rewards/accuracies": 1.0, "rewards/chosen": 0.1893976926803589, "rewards/margins": 1.4280508756637573, "rewards/rejected": -1.2386531829833984, "step": 647 }, { "epoch": 0.9296987087517934, "grad_norm": 6.762674808502197, "kl/ref_to_policy/chosen": -29.452571868896484, "kl/ref_to_policy/mean": 20.691547393798828, "kl/ref_to_policy/rejected": 70.83566284179688, "learning_rate": 1.5608889927316407e-07, "logits/chosen": -1.2781147956848145, "logits/rejected": -1.5459017753601074, "logps/chosen": -1574.75830078125, "logps/rejected": -1678.8143310546875, "loss": 4.3734, "nll_loss": 0.990233302116394, "rewards/accuracies": 0.9375, "rewards/chosen": 0.294525682926178, "rewards/margins": 1.0028823614120483, "rewards/rejected": -0.7083566188812256, "step": 648 }, { "epoch": 0.9311334289813487, "grad_norm": 6.461817264556885, "kl/ref_to_policy/chosen": -30.17985725402832, "kl/ref_to_policy/mean": 26.30561065673828, "kl/ref_to_policy/rejected": 82.79107666015625, "learning_rate": 1.4993886262058833e-07, "logits/chosen": -1.2711490392684937, "logits/rejected": -1.518250584602356, "logps/chosen": -1625.890380859375, "logps/rejected": -1739.9140625, "loss": 4.3999, "nll_loss": 1.0046168565750122, "rewards/accuracies": 0.8125, "rewards/chosen": 0.301798552274704, "rewards/margins": 1.1297093629837036, "rewards/rejected": -0.8279107809066772, "step": 649 }, { "epoch": 0.9325681492109039, "grad_norm": 4.379281520843506, "kl/ref_to_policy/chosen": -17.611812591552734, "kl/ref_to_policy/mean": 52.233924865722656, "kl/ref_to_policy/rejected": 122.07966613769531, "learning_rate": 1.439105877423963e-07, "logits/chosen": -1.6416469812393188, "logits/rejected": -2.0203888416290283, "logps/chosen": -886.46630859375, "logps/rejected": -1021.7086181640625, "loss": 2.9376, "nll_loss": 0.6574876308441162, "rewards/accuracies": 0.9375, "rewards/chosen": 0.1761181354522705, "rewards/margins": 1.3969148397445679, "rewards/rejected": -1.2207965850830078, "step": 650 }, { "epoch": 0.9325681492109039, "eval_kl/ref_to_policy/chosen": -17.312767028808594, "eval_kl/ref_to_policy/mean": 39.823211669921875, "eval_kl/ref_to_policy/rejected": 96.95919036865234, "eval_logits/chosen": -1.413274884223938, "eval_logits/rejected": -1.7015637159347534, "eval_logps/chosen": -1506.9232177734375, "eval_logps/rejected": -1620.681884765625, "eval_loss": 4.126082897186279, "eval_nll_loss": 0.938210129737854, "eval_rewards/accuracies": 0.9414893388748169, "eval_rewards/chosen": 0.173127681016922, "eval_rewards/margins": 1.1427193880081177, "eval_rewards/rejected": -0.9695918560028076, "eval_runtime": 113.6469, "eval_samples_per_second": 3.308, "eval_steps_per_second": 1.654, "step": 650 }, { "epoch": 0.9340028694404591, "grad_norm": 6.732354164123535, "kl/ref_to_policy/chosen": -27.22667694091797, "kl/ref_to_policy/mean": 29.857528686523438, "kl/ref_to_policy/rejected": 86.94173431396484, "learning_rate": 1.3800422597965935e-07, "logits/chosen": -1.3850383758544922, "logits/rejected": -1.6209690570831299, "logps/chosen": -1031.546630859375, "logps/rejected": -1145.6488037109375, "loss": 3.9943, "nll_loss": 0.9054826498031616, "rewards/accuracies": 1.0, "rewards/chosen": 0.2722667455673218, "rewards/margins": 1.1416841745376587, "rewards/rejected": -0.8694173693656921, "step": 651 }, { "epoch": 0.9354375896700143, "grad_norm": 514.940185546875, "kl/ref_to_policy/chosen": -29.85129737854004, "kl/ref_to_policy/mean": 32.74906921386719, "kl/ref_to_policy/rejected": 95.34944152832031, "learning_rate": 1.322199256127943e-07, "logits/chosen": -1.6713535785675049, "logits/rejected": -1.977495551109314, "logps/chosen": -797.1799926757812, "logps/rejected": -918.2341918945312, "loss": 3.9929, "nll_loss": 0.9106427431106567, "rewards/accuracies": 0.9375, "rewards/chosen": 0.2985129654407501, "rewards/margins": 1.2520073652267456, "rewards/rejected": -0.9534944295883179, "step": 652 }, { "epoch": 0.9368723098995696, "grad_norm": 5.774999141693115, "kl/ref_to_policy/chosen": -18.5284481048584, "kl/ref_to_policy/mean": 45.243751525878906, "kl/ref_to_policy/rejected": 109.01596069335938, "learning_rate": 1.2655783185784253e-07, "logits/chosen": -1.4882879257202148, "logits/rejected": -1.8546195030212402, "logps/chosen": -1353.979248046875, "logps/rejected": -1478.761962890625, "loss": 3.5211, "nll_loss": 0.7955548763275146, "rewards/accuracies": 0.9375, "rewards/chosen": 0.18528448045253754, "rewards/margins": 1.2754439115524292, "rewards/rejected": -1.09015953540802, "step": 653 }, { "epoch": 0.9383070301291249, "grad_norm": 6.296442031860352, "kl/ref_to_policy/chosen": -14.596924781799316, "kl/ref_to_policy/mean": 9.905580520629883, "kl/ref_to_policy/rejected": 34.408084869384766, "learning_rate": 1.210180868628219e-07, "logits/chosen": -0.9288145899772644, "logits/rejected": -1.0422111749649048, "logps/chosen": -2335.590576171875, "logps/rejected": -2398.407958984375, "loss": 5.999, "nll_loss": 1.364093542098999, "rewards/accuracies": 0.875, "rewards/chosen": 0.14596925675868988, "rewards/margins": 0.4900501072406769, "rewards/rejected": -0.3440808653831482, "step": 654 }, { "epoch": 0.93974175035868, "grad_norm": 4.342623233795166, "kl/ref_to_policy/chosen": -28.498422622680664, "kl/ref_to_policy/mean": 47.55474090576172, "kl/ref_to_policy/rejected": 123.60789489746094, "learning_rate": 1.1560082970416164e-07, "logits/chosen": -1.8029528856277466, "logits/rejected": -2.2195613384246826, "logps/chosen": -731.6578369140625, "logps/rejected": -877.54541015625, "loss": 2.3148, "nll_loss": 0.5092270970344543, "rewards/accuracies": 1.0, "rewards/chosen": 0.2849842309951782, "rewards/margins": 1.521063208580017, "rewards/rejected": -1.2360789775848389, "step": 655 }, { "epoch": 0.9411764705882353, "grad_norm": 5.619664192199707, "kl/ref_to_policy/chosen": -5.836790084838867, "kl/ref_to_policy/mean": 42.58184814453125, "kl/ref_to_policy/rejected": 91.00048828125, "learning_rate": 1.1030619638320805e-07, "logits/chosen": -1.194818377494812, "logits/rejected": -1.4198092222213745, "logps/chosen": -2102.6318359375, "logps/rejected": -2206.51416015625, "loss": 4.8321, "nll_loss": 1.1009613275527954, "rewards/accuracies": 0.9375, "rewards/chosen": 0.05836791545152664, "rewards/margins": 0.9683728218078613, "rewards/rejected": -0.9100049734115601, "step": 656 }, { "epoch": 0.9426111908177905, "grad_norm": 4.1673054695129395, "kl/ref_to_policy/chosen": -24.247154235839844, "kl/ref_to_policy/mean": 52.608421325683594, "kl/ref_to_policy/rejected": 129.4639892578125, "learning_rate": 1.0513431982280997e-07, "logits/chosen": -1.9208946228027344, "logits/rejected": -2.325392007827759, "logps/chosen": -795.0419311523438, "logps/rejected": -939.9053955078125, "loss": 2.4722, "nll_loss": 0.5517598986625671, "rewards/accuracies": 1.0, "rewards/chosen": 0.24247151613235474, "rewards/margins": 1.5371114015579224, "rewards/rejected": -1.2946399450302124, "step": 657 }, { "epoch": 0.9440459110473458, "grad_norm": 4.386082172393799, "kl/ref_to_policy/chosen": -29.18897247314453, "kl/ref_to_policy/mean": 35.22969436645508, "kl/ref_to_policy/rejected": 99.64836120605469, "learning_rate": 1.0008532986398422e-07, "logits/chosen": -1.494133472442627, "logits/rejected": -1.896979808807373, "logps/chosen": -1121.25634765625, "logps/rejected": -1245.9295654296875, "loss": 3.0132, "nll_loss": 0.6694651246070862, "rewards/accuracies": 1.0, "rewards/chosen": 0.2918896973133087, "rewards/margins": 1.288373351097107, "rewards/rejected": -0.996483564376831, "step": 658 }, { "epoch": 0.945480631276901, "grad_norm": 5.787262439727783, "kl/ref_to_policy/chosen": -7.386599063873291, "kl/ref_to_policy/mean": 35.06195068359375, "kl/ref_to_policy/rejected": 77.51050567626953, "learning_rate": 9.51593532626538e-08, "logits/chosen": -1.2172625064849854, "logits/rejected": -1.4114538431167603, "logps/chosen": -1802.1290283203125, "logps/rejected": -1893.246826171875, "loss": 4.8802, "nll_loss": 1.1080282926559448, "rewards/accuracies": 1.0, "rewards/chosen": 0.07386598736047745, "rewards/margins": 0.8489710092544556, "rewards/rejected": -0.7751049399375916, "step": 659 }, { "epoch": 0.9469153515064562, "grad_norm": 5.887627601623535, "kl/ref_to_policy/chosen": -17.753751754760742, "kl/ref_to_policy/mean": 39.447845458984375, "kl/ref_to_policy/rejected": 96.64944458007812, "learning_rate": 9.035651368646647e-08, "logits/chosen": -1.4799983501434326, "logits/rejected": -1.7795032262802124, "logps/chosen": -1128.7305908203125, "logps/rejected": -1243.2899169921875, "loss": 3.4217, "nll_loss": 0.7617961168289185, "rewards/accuracies": 0.9375, "rewards/chosen": 0.17753751575946808, "rewards/margins": 1.1440320014953613, "rewards/rejected": -0.9664943814277649, "step": 660 }, { "epoch": 0.9469153515064562, "eval_kl/ref_to_policy/chosen": -17.39002227783203, "eval_kl/ref_to_policy/mean": 39.77736282348633, "eval_kl/ref_to_policy/rejected": 96.94474792480469, "eval_logits/chosen": -1.4118534326553345, "eval_logits/rejected": -1.7004995346069336, "eval_logps/chosen": -1506.8460693359375, "eval_logps/rejected": -1620.6673583984375, "eval_loss": 4.12286901473999, "eval_nll_loss": 0.9374497532844543, "eval_rewards/accuracies": 0.9414893388748169, "eval_rewards/chosen": 0.17390023171901703, "eval_rewards/margins": 1.1433477401733398, "eval_rewards/rejected": -0.9694474339485168, "eval_runtime": 113.5236, "eval_samples_per_second": 3.312, "eval_steps_per_second": 1.656, "step": 660 }, { "epoch": 0.9483500717360115, "grad_norm": 53.745235443115234, "kl/ref_to_policy/chosen": -15.611062049865723, "kl/ref_to_policy/mean": 22.179828643798828, "kl/ref_to_policy/rejected": 59.97071838378906, "learning_rate": 8.567693171168956e-08, "logits/chosen": -1.308556079864502, "logits/rejected": -1.5579252243041992, "logps/chosen": -1490.943603515625, "logps/rejected": -1570.251953125, "loss": 5.6726, "nll_loss": 1.3000826835632324, "rewards/accuracies": 0.875, "rewards/chosen": 0.15611062943935394, "rewards/margins": 0.7558177709579468, "rewards/rejected": -0.5997071862220764, "step": 661 }, { "epoch": 0.9497847919655668, "grad_norm": 4.1089653968811035, "kl/ref_to_policy/chosen": -14.031821250915527, "kl/ref_to_policy/mean": 55.61558532714844, "kl/ref_to_policy/rejected": 125.26300048828125, "learning_rate": 8.11207248201834e-08, "logits/chosen": -1.663710117340088, "logits/rejected": -2.0044970512390137, "logps/chosen": -1300.030517578125, "logps/rejected": -1434.5970458984375, "loss": 3.3615, "nll_loss": 0.7630646824836731, "rewards/accuracies": 0.9375, "rewards/chosen": 0.14031822979450226, "rewards/margins": 1.3929482698440552, "rewards/rejected": -1.2526299953460693, "step": 662 }, { "epoch": 0.9512195121951219, "grad_norm": 4.263807773590088, "kl/ref_to_policy/chosen": -9.42828369140625, "kl/ref_to_policy/mean": 49.496360778808594, "kl/ref_to_policy/rejected": 108.42101287841797, "learning_rate": 7.6688007396451e-08, "logits/chosen": -1.426565170288086, "logits/rejected": -1.721841812133789, "logps/chosen": -1897.98583984375, "logps/rejected": -2012.421142578125, "loss": 3.802, "nll_loss": 0.8608169555664062, "rewards/accuracies": 1.0, "rewards/chosen": 0.0942828357219696, "rewards/margins": 1.1784930229187012, "rewards/rejected": -1.0842101573944092, "step": 663 }, { "epoch": 0.9526542324246772, "grad_norm": 5.020418643951416, "kl/ref_to_policy/chosen": -22.8299560546875, "kl/ref_to_policy/mean": 45.618080139160156, "kl/ref_to_policy/rejected": 114.06610870361328, "learning_rate": 7.237889072476856e-08, "logits/chosen": -1.6496504545211792, "logits/rejected": -2.0777525901794434, "logps/chosen": -1300.6778564453125, "logps/rejected": -1435.8668212890625, "loss": 2.9159, "nll_loss": 0.649838924407959, "rewards/accuracies": 0.9375, "rewards/chosen": 0.22829952836036682, "rewards/margins": 1.3689604997634888, "rewards/rejected": -1.1406610012054443, "step": 664 }, { "epoch": 0.9540889526542324, "grad_norm": 4.727002143859863, "kl/ref_to_policy/chosen": -15.79278564453125, "kl/ref_to_policy/mean": 41.22864532470703, "kl/ref_to_policy/rejected": 98.25007629394531, "learning_rate": 6.819348298638839e-08, "logits/chosen": -1.3708348274230957, "logits/rejected": -1.6828579902648926, "logps/chosen": -1526.355712890625, "logps/rejected": -1640.9884033203125, "loss": 3.6075, "nll_loss": 0.8087443709373474, "rewards/accuracies": 1.0, "rewards/chosen": 0.15792784094810486, "rewards/margins": 1.1404285430908203, "rewards/rejected": -0.9825007319450378, "step": 665 }, { "epoch": 0.9555236728837877, "grad_norm": 4.972632884979248, "kl/ref_to_policy/chosen": -13.769514083862305, "kl/ref_to_policy/mean": 43.25881576538086, "kl/ref_to_policy/rejected": 100.28713989257812, "learning_rate": 6.4131889256826e-08, "logits/chosen": -1.466694712638855, "logits/rejected": -1.7356538772583008, "logps/chosen": -1942.20703125, "logps/rejected": -2056.35546875, "loss": 4.084, "nll_loss": 0.9269839525222778, "rewards/accuracies": 0.9375, "rewards/chosen": 0.13769513368606567, "rewards/margins": 1.1405665874481201, "rewards/rejected": -1.0028715133666992, "step": 666 }, { "epoch": 0.9569583931133429, "grad_norm": 5.591494083404541, "kl/ref_to_policy/chosen": -12.806594848632812, "kl/ref_to_policy/mean": 39.62765884399414, "kl/ref_to_policy/rejected": 92.0619125366211, "learning_rate": 6.019421150322114e-08, "logits/chosen": -1.227940559387207, "logits/rejected": -1.5178507566452026, "logps/chosen": -1716.826171875, "logps/rejected": -1820.92919921875, "loss": 4.5424, "nll_loss": 1.0379329919815063, "rewards/accuracies": 1.0, "rewards/chosen": 0.12806594371795654, "rewards/margins": 1.048685073852539, "rewards/rejected": -0.9206191897392273, "step": 667 }, { "epoch": 0.9583931133428981, "grad_norm": 58.20410919189453, "kl/ref_to_policy/chosen": -0.315000057220459, "kl/ref_to_policy/mean": 39.90547561645508, "kl/ref_to_policy/rejected": 80.12594604492188, "learning_rate": 5.638054858177644e-08, "logits/chosen": -1.3353217840194702, "logits/rejected": -1.522354245185852, "logps/chosen": -1780.614501953125, "logps/rejected": -1866.345703125, "loss": 4.2065, "nll_loss": 0.9293645024299622, "rewards/accuracies": 0.9375, "rewards/chosen": 0.00314999558031559, "rewards/margins": 0.8044094443321228, "rewards/rejected": -0.8012595176696777, "step": 668 }, { "epoch": 0.9598278335724534, "grad_norm": 4.8534770011901855, "kl/ref_to_policy/chosen": -12.56266975402832, "kl/ref_to_policy/mean": 45.506370544433594, "kl/ref_to_policy/rejected": 103.5754165649414, "learning_rate": 5.26909962352784e-08, "logits/chosen": -1.3905683755874634, "logits/rejected": -1.6743065118789673, "logps/chosen": -1840.820068359375, "logps/rejected": -1955.223876953125, "loss": 4.0176, "nll_loss": 0.9125683307647705, "rewards/accuracies": 1.0, "rewards/chosen": 0.12562671303749084, "rewards/margins": 1.1613808870315552, "rewards/rejected": -1.0357542037963867, "step": 669 }, { "epoch": 0.9612625538020086, "grad_norm": 336.0335998535156, "kl/ref_to_policy/chosen": -27.49594497680664, "kl/ref_to_policy/mean": 34.72414016723633, "kl/ref_to_policy/rejected": 96.94422149658203, "learning_rate": 4.91256470906909e-08, "logits/chosen": -1.798980712890625, "logits/rejected": -2.053058624267578, "logps/chosen": -1186.8165283203125, "logps/rejected": -1309.4930419921875, "loss": 3.1943, "nll_loss": 0.7108975648880005, "rewards/accuracies": 1.0, "rewards/chosen": 0.2749594449996948, "rewards/margins": 1.2444016933441162, "rewards/rejected": -0.9694421887397766, "step": 670 }, { "epoch": 0.9612625538020086, "eval_kl/ref_to_policy/chosen": -17.412715911865234, "eval_kl/ref_to_policy/mean": 39.73408889770508, "eval_kl/ref_to_policy/rejected": 96.88089752197266, "eval_logits/chosen": -1.415387749671936, "eval_logits/rejected": -1.7018799781799316, "eval_logps/chosen": -1506.8233642578125, "eval_logps/rejected": -1620.6033935546875, "eval_loss": 4.123062610626221, "eval_nll_loss": 0.9374790787696838, "eval_rewards/accuracies": 0.938829779624939, "eval_rewards/chosen": 0.17412714660167694, "eval_rewards/margins": 1.1429362297058105, "eval_rewards/rejected": -0.9688089489936829, "eval_runtime": 114.461, "eval_samples_per_second": 3.285, "eval_steps_per_second": 1.642, "step": 670 }, { "epoch": 0.9626972740315638, "grad_norm": 4.491569519042969, "kl/ref_to_policy/chosen": -15.930231094360352, "kl/ref_to_policy/mean": 47.3368034362793, "kl/ref_to_policy/rejected": 110.60383605957031, "learning_rate": 4.568459065683206e-08, "logits/chosen": -1.4076008796691895, "logits/rejected": -1.7698720693588257, "logps/chosen": -1640.9368896484375, "logps/rejected": -1765.496337890625, "loss": 3.5943, "nll_loss": 0.8123351335525513, "rewards/accuracies": 1.0, "rewards/chosen": 0.15930230915546417, "rewards/margins": 1.2653406858444214, "rewards/rejected": -1.1060383319854736, "step": 671 }, { "epoch": 0.9641319942611191, "grad_norm": 4.0130462646484375, "kl/ref_to_policy/chosen": -21.377574920654297, "kl/ref_to_policy/mean": 47.42205047607422, "kl/ref_to_policy/rejected": 116.22166442871094, "learning_rate": 4.236791332212498e-08, "logits/chosen": -1.6479524374008179, "logits/rejected": -2.014951229095459, "logps/chosen": -947.9349365234375, "logps/rejected": -1083.2001953125, "loss": 2.6234, "nll_loss": 0.5768427848815918, "rewards/accuracies": 1.0, "rewards/chosen": 0.21377576887607574, "rewards/margins": 1.3759922981262207, "rewards/rejected": -1.1622166633605957, "step": 672 }, { "epoch": 0.9655667144906743, "grad_norm": 5.983315944671631, "kl/ref_to_policy/chosen": -12.010712623596191, "kl/ref_to_policy/mean": 38.6908073425293, "kl/ref_to_policy/rejected": 89.392333984375, "learning_rate": 3.917569835243107e-08, "logits/chosen": -1.2815521955490112, "logits/rejected": -1.528031826019287, "logps/chosen": -1706.6611328125, "logps/rejected": -1809.4176025390625, "loss": 4.5582, "nll_loss": 1.03798246383667, "rewards/accuracies": 1.0, "rewards/chosen": 0.12010712921619415, "rewards/margins": 1.0140303373336792, "rewards/rejected": -0.893923282623291, "step": 673 }, { "epoch": 0.9670014347202296, "grad_norm": 5.047052383422852, "kl/ref_to_policy/chosen": -20.264911651611328, "kl/ref_to_policy/mean": 35.9326057434082, "kl/ref_to_policy/rejected": 92.130126953125, "learning_rate": 3.610802588895845e-08, "logits/chosen": -1.441024661064148, "logits/rejected": -1.6948872804641724, "logps/chosen": -1313.9537353515625, "logps/rejected": -1428.1768798828125, "loss": 3.6938, "nll_loss": 0.8284791707992554, "rewards/accuracies": 0.875, "rewards/chosen": 0.20264911651611328, "rewards/margins": 1.1239503622055054, "rewards/rejected": -0.9213013052940369, "step": 674 }, { "epoch": 0.9684361549497847, "grad_norm": 5.481049060821533, "kl/ref_to_policy/chosen": -26.641353607177734, "kl/ref_to_policy/mean": 31.529796600341797, "kl/ref_to_policy/rejected": 89.7009506225586, "learning_rate": 3.316497294625132e-08, "logits/chosen": -1.5029970407485962, "logits/rejected": -1.8209326267242432, "logps/chosen": -1274.272705078125, "logps/rejected": -1389.173583984375, "loss": 3.9117, "nll_loss": 0.8866004943847656, "rewards/accuracies": 0.9375, "rewards/chosen": 0.2664135694503784, "rewards/margins": 1.1634228229522705, "rewards/rejected": -0.8970093727111816, "step": 675 }, { "epoch": 0.96987087517934, "grad_norm": 3.9459519386291504, "kl/ref_to_policy/chosen": -21.469253540039062, "kl/ref_to_policy/mean": 60.25615692138672, "kl/ref_to_policy/rejected": 141.9815673828125, "learning_rate": 3.034661341025258e-08, "logits/chosen": -1.8810155391693115, "logits/rejected": -2.304809093475342, "logps/chosen": -708.85888671875, "logps/rejected": -864.3447875976562, "loss": 2.0333, "nll_loss": 0.44615885615348816, "rewards/accuracies": 1.0, "rewards/chosen": 0.21469253301620483, "rewards/margins": 1.6345081329345703, "rewards/rejected": -1.4198156595230103, "step": 676 }, { "epoch": 0.9713055954088953, "grad_norm": 4.280660152435303, "kl/ref_to_policy/chosen": -9.444134712219238, "kl/ref_to_policy/mean": 59.516319274902344, "kl/ref_to_policy/rejected": 128.47677612304688, "learning_rate": 2.765301803645426e-08, "logits/chosen": -1.5154109001159668, "logits/rejected": -1.8709065914154053, "logps/chosen": -1353.321533203125, "logps/rejected": -1488.260009765625, "loss": 3.4105, "nll_loss": 0.7729014158248901, "rewards/accuracies": 0.875, "rewards/chosen": 0.09444133937358856, "rewards/margins": 1.3792091608047485, "rewards/rejected": -1.284767746925354, "step": 677 }, { "epoch": 0.9727403156384505, "grad_norm": 4.002831935882568, "kl/ref_to_policy/chosen": -13.88841438293457, "kl/ref_to_policy/mean": 43.08039855957031, "kl/ref_to_policy/rejected": 100.0492172241211, "learning_rate": 2.5084254448117794e-08, "logits/chosen": -1.4529579877853394, "logits/rejected": -1.7584527730941772, "logps/chosen": -1866.478759765625, "logps/rejected": -1980.74462890625, "loss": 3.5302, "nll_loss": 0.7881006002426147, "rewards/accuracies": 1.0, "rewards/chosen": 0.13888412714004517, "rewards/margins": 1.1393762826919556, "rewards/rejected": -1.0004920959472656, "step": 678 }, { "epoch": 0.9741750358680057, "grad_norm": 4.655378341674805, "kl/ref_to_policy/chosen": -14.871829986572266, "kl/ref_to_policy/mean": 48.50799560546875, "kl/ref_to_policy/rejected": 111.88782501220703, "learning_rate": 2.264038713457706e-08, "logits/chosen": -1.4382483959197998, "logits/rejected": -1.8043407201766968, "logps/chosen": -1153.0186767578125, "logps/rejected": -1277.7257080078125, "loss": 3.2304, "nll_loss": 0.7222899198532104, "rewards/accuracies": 0.9375, "rewards/chosen": 0.14871829748153687, "rewards/margins": 1.2675963640213013, "rewards/rejected": -1.1188781261444092, "step": 679 }, { "epoch": 0.975609756097561, "grad_norm": 4.985217571258545, "kl/ref_to_policy/chosen": -0.7354879379272461, "kl/ref_to_policy/mean": 49.70731735229492, "kl/ref_to_policy/rejected": 100.1501235961914, "learning_rate": 2.0321477449619098e-08, "logits/chosen": -1.1322145462036133, "logits/rejected": -1.3881739377975464, "logps/chosen": -1747.6630859375, "logps/rejected": -1851.497802734375, "loss": 4.3, "nll_loss": 0.9730894565582275, "rewards/accuracies": 1.0, "rewards/chosen": 0.007354879751801491, "rewards/margins": 1.0088560581207275, "rewards/rejected": -1.001501202583313, "step": 680 }, { "epoch": 0.975609756097561, "eval_kl/ref_to_policy/chosen": -17.367643356323242, "eval_kl/ref_to_policy/mean": 39.7983283996582, "eval_kl/ref_to_policy/rejected": 96.96429443359375, "eval_logits/chosen": -1.414766788482666, "eval_logits/rejected": -1.705306053161621, "eval_logps/chosen": -1506.868408203125, "eval_logps/rejected": -1620.6868896484375, "eval_loss": 4.123131275177002, "eval_nll_loss": 0.9375060200691223, "eval_rewards/accuracies": 0.9414893388748169, "eval_rewards/chosen": 0.17367643117904663, "eval_rewards/margins": 1.1433193683624268, "eval_rewards/rejected": -0.9696428179740906, "eval_runtime": 113.9916, "eval_samples_per_second": 3.298, "eval_steps_per_second": 1.649, "step": 680 }, { "epoch": 0.9770444763271162, "grad_norm": 5.47835636138916, "kl/ref_to_policy/chosen": -20.304222106933594, "kl/ref_to_policy/mean": 31.678247451782227, "kl/ref_to_policy/rejected": 83.66071319580078, "learning_rate": 1.8127583609945376e-08, "logits/chosen": -1.225882887840271, "logits/rejected": -1.5408653020858765, "logps/chosen": -1550.46142578125, "logps/rejected": -1654.89501953125, "loss": 4.2339, "nll_loss": 0.9600047469139099, "rewards/accuracies": 0.9375, "rewards/chosen": 0.20304220914840698, "rewards/margins": 1.0396493673324585, "rewards/rejected": -0.8366072177886963, "step": 681 }, { "epoch": 0.9784791965566715, "grad_norm": 5.110333442687988, "kl/ref_to_policy/chosen": 0.4497098922729492, "kl/ref_to_policy/mean": 50.76375961303711, "kl/ref_to_policy/rejected": 101.07779693603516, "learning_rate": 1.6058760693708487e-08, "logits/chosen": -1.1930326223373413, "logits/rejected": -1.499510645866394, "logps/chosen": -1767.30908203125, "logps/rejected": -1872.0816650390625, "loss": 4.4, "nll_loss": 0.9972098469734192, "rewards/accuracies": 1.0, "rewards/chosen": -0.00449710339307785, "rewards/margins": 1.0062808990478516, "rewards/rejected": -1.0107779502868652, "step": 682 }, { "epoch": 0.9799139167862266, "grad_norm": 6.98350191116333, "kl/ref_to_policy/chosen": -28.378677368164062, "kl/ref_to_policy/mean": 41.62483596801758, "kl/ref_to_policy/rejected": 111.62835693359375, "learning_rate": 1.411506063912882e-08, "logits/chosen": -1.5949604511260986, "logits/rejected": -1.9788986444473267, "logps/chosen": -911.433837890625, "logps/rejected": -1046.44482421875, "loss": 2.668, "nll_loss": 0.5907541513442993, "rewards/accuracies": 0.9375, "rewards/chosen": 0.283786803483963, "rewards/margins": 1.400070309638977, "rewards/rejected": -1.1162835359573364, "step": 683 }, { "epoch": 0.9813486370157819, "grad_norm": 4.416239261627197, "kl/ref_to_policy/chosen": -21.00501251220703, "kl/ref_to_policy/mean": 42.19076919555664, "kl/ref_to_policy/rejected": 105.38653564453125, "learning_rate": 1.2296532243193382e-08, "logits/chosen": -1.5455501079559326, "logits/rejected": -1.8855315446853638, "logps/chosen": -1486.830810546875, "logps/rejected": -1611.677490234375, "loss": 3.3242, "nll_loss": 0.745016872882843, "rewards/accuracies": 0.9375, "rewards/chosen": 0.21005010604858398, "rewards/margins": 1.2639155387878418, "rewards/rejected": -1.0538654327392578, "step": 684 }, { "epoch": 0.9827833572453372, "grad_norm": 5.803210258483887, "kl/ref_to_policy/chosen": -23.331083297729492, "kl/ref_to_policy/mean": 21.485971450805664, "kl/ref_to_policy/rejected": 66.30302429199219, "learning_rate": 1.0603221160429e-08, "logits/chosen": -1.1217743158340454, "logits/rejected": -1.3447320461273193, "logps/chosen": -1725.6611328125, "logps/rejected": -1819.502197265625, "loss": 5.0432, "nll_loss": 1.1521031856536865, "rewards/accuracies": 0.9375, "rewards/chosen": 0.23331083357334137, "rewards/margins": 0.89634108543396, "rewards/rejected": -0.663030207157135, "step": 685 }, { "epoch": 0.9842180774748924, "grad_norm": 3.4829864501953125, "kl/ref_to_policy/chosen": -24.075448989868164, "kl/ref_to_policy/mean": 58.58348846435547, "kl/ref_to_policy/rejected": 141.24241638183594, "learning_rate": 9.035169901754902e-09, "logits/chosen": -1.9401731491088867, "logits/rejected": -2.393833875656128, "logps/chosen": -813.877685546875, "logps/rejected": -969.8224487304688, "loss": 2.1214, "nll_loss": 0.4701695442199707, "rewards/accuracies": 1.0, "rewards/chosen": 0.24075447022914886, "rewards/margins": 1.6531786918640137, "rewards/rejected": -1.412424087524414, "step": 686 }, { "epoch": 0.9856527977044476, "grad_norm": 4.151869297027588, "kl/ref_to_policy/chosen": 11.80355453491211, "kl/ref_to_policy/mean": 80.93168640136719, "kl/ref_to_policy/rejected": 150.059814453125, "learning_rate": 7.59241783341913e-09, "logits/chosen": -1.5733388662338257, "logits/rejected": -1.975975751876831, "logps/chosen": -1337.006591796875, "logps/rejected": -1472.811767578125, "loss": 2.9299, "nll_loss": 0.6530876159667969, "rewards/accuracies": 0.9375, "rewards/chosen": -0.11803553998470306, "rewards/margins": 1.3825626373291016, "rewards/rejected": -1.5005980730056763, "step": 687 }, { "epoch": 0.9870875179340028, "grad_norm": 5.521193504333496, "kl/ref_to_policy/chosen": -15.02031135559082, "kl/ref_to_policy/mean": 21.90380096435547, "kl/ref_to_policy/rejected": 58.82791519165039, "learning_rate": 6.2750011760054355e-09, "logits/chosen": -1.142054557800293, "logits/rejected": -1.3146172761917114, "logps/chosen": -1889.5343017578125, "logps/rejected": -1971.968505859375, "loss": 4.8852, "nll_loss": 1.1017547845840454, "rewards/accuracies": 1.0, "rewards/chosen": 0.15020310878753662, "rewards/margins": 0.7384822368621826, "rewards/rejected": -0.588279128074646, "step": 688 }, { "epoch": 0.9885222381635581, "grad_norm": 5.089587688446045, "kl/ref_to_policy/chosen": -12.710577011108398, "kl/ref_to_policy/mean": 42.74605178833008, "kl/ref_to_policy/rejected": 98.20268249511719, "learning_rate": 5.082953003528457e-09, "logits/chosen": -1.4630048274993896, "logits/rejected": -1.766567349433899, "logps/chosen": -1636.718994140625, "logps/rejected": -1749.22802734375, "loss": 3.847, "nll_loss": 0.8647660613059998, "rewards/accuracies": 0.75, "rewards/chosen": 0.12710575759410858, "rewards/margins": 1.1091325283050537, "rewards/rejected": -0.9820266962051392, "step": 689 }, { "epoch": 0.9899569583931134, "grad_norm": 6.623115539550781, "kl/ref_to_policy/chosen": -27.544479370117188, "kl/ref_to_policy/mean": 34.712833404541016, "kl/ref_to_policy/rejected": 96.97013854980469, "learning_rate": 4.016303242600495e-09, "logits/chosen": -1.614179015159607, "logits/rejected": -1.8965723514556885, "logps/chosen": -920.04150390625, "logps/rejected": -1043.7806396484375, "loss": 3.1948, "nll_loss": 0.711340069770813, "rewards/accuracies": 1.0, "rewards/chosen": 0.27544477581977844, "rewards/margins": 1.2451462745666504, "rewards/rejected": -0.9697014093399048, "step": 690 }, { "epoch": 0.9899569583931134, "eval_kl/ref_to_policy/chosen": -17.500947952270508, "eval_kl/ref_to_policy/mean": 39.65900421142578, "eval_kl/ref_to_policy/rejected": 96.81896209716797, "eval_logits/chosen": -1.4129987955093384, "eval_logits/rejected": -1.7022329568862915, "eval_logps/chosen": -1506.7349853515625, "eval_logps/rejected": -1620.5413818359375, "eval_loss": 4.123695373535156, "eval_nll_loss": 0.93763667345047, "eval_rewards/accuracies": 0.9414893388748169, "eval_rewards/chosen": 0.17500948905944824, "eval_rewards/margins": 1.1431989669799805, "eval_rewards/rejected": -0.9681894779205322, "eval_runtime": 113.94, "eval_samples_per_second": 3.3, "eval_steps_per_second": 1.65, "step": 690 }, { "epoch": 0.9913916786226685, "grad_norm": 5.435235977172852, "kl/ref_to_policy/chosen": -21.60996437072754, "kl/ref_to_policy/mean": 36.62342071533203, "kl/ref_to_policy/rejected": 94.8567886352539, "learning_rate": 3.075078671682108e-09, "logits/chosen": -1.3437764644622803, "logits/rejected": -1.6864944696426392, "logps/chosen": -1485.42138671875, "logps/rejected": -1600.30419921875, "loss": 3.9351, "nll_loss": 0.8923105001449585, "rewards/accuracies": 0.9375, "rewards/chosen": 0.21609964966773987, "rewards/margins": 1.1646676063537598, "rewards/rejected": -0.9485679864883423, "step": 691 }, { "epoch": 0.9928263988522238, "grad_norm": 5.344364166259766, "kl/ref_to_policy/chosen": -20.547853469848633, "kl/ref_to_policy/mean": 22.709369659423828, "kl/ref_to_policy/rejected": 65.96659088134766, "learning_rate": 2.2593029204076578e-09, "logits/chosen": -1.147918701171875, "logits/rejected": -1.4118876457214355, "logps/chosen": -1708.7646484375, "logps/rejected": -1802.0543212890625, "loss": 4.6901, "nll_loss": 1.060681700706482, "rewards/accuracies": 0.9375, "rewards/chosen": 0.2054785192012787, "rewards/margins": 0.8651443719863892, "rewards/rejected": -0.6596659421920776, "step": 692 }, { "epoch": 0.994261119081779, "grad_norm": 6.016766548156738, "kl/ref_to_policy/chosen": -9.874515533447266, "kl/ref_to_policy/mean": 33.85459518432617, "kl/ref_to_policy/rejected": 77.58370971679688, "learning_rate": 1.5689964689935555e-09, "logits/chosen": -1.2271000146865845, "logits/rejected": -1.4486329555511475, "logps/chosen": -1554.65673828125, "logps/rejected": -1648.9136962890625, "loss": 4.3241, "nll_loss": 0.9697118997573853, "rewards/accuracies": 0.9375, "rewards/chosen": 0.09874515235424042, "rewards/margins": 0.8745821714401245, "rewards/rejected": -0.7758370637893677, "step": 693 }, { "epoch": 0.9956958393113343, "grad_norm": 7.1396074295043945, "kl/ref_to_policy/chosen": -24.473731994628906, "kl/ref_to_policy/mean": 13.312845230102539, "kl/ref_to_policy/rejected": 51.099422454833984, "learning_rate": 1.004176647724231e-09, "logits/chosen": -1.0419081449508667, "logits/rejected": -1.2204275131225586, "logps/chosen": -1575.1043701171875, "logps/rejected": -1657.3748779296875, "loss": 5.3804, "nll_loss": 1.226608157157898, "rewards/accuracies": 0.9375, "rewards/chosen": 0.24473731219768524, "rewards/margins": 0.7557315230369568, "rewards/rejected": -0.5109941959381104, "step": 694 }, { "epoch": 0.9971305595408895, "grad_norm": 5.293726921081543, "kl/ref_to_policy/chosen": -24.299362182617188, "kl/ref_to_policy/mean": 32.26121139526367, "kl/ref_to_policy/rejected": 88.82178497314453, "learning_rate": 5.648576365169245e-10, "logits/chosen": -1.4349772930145264, "logits/rejected": -1.6690207719802856, "logps/chosen": -1206.38818359375, "logps/rejected": -1320.68701171875, "loss": 3.7693, "nll_loss": 0.8479300737380981, "rewards/accuracies": 0.875, "rewards/chosen": 0.24299362301826477, "rewards/margins": 1.1312114000320435, "rewards/rejected": -0.8882178068161011, "step": 695 }, { "epoch": 0.9985652797704447, "grad_norm": 5.382726669311523, "kl/ref_to_policy/chosen": -30.678468704223633, "kl/ref_to_policy/mean": 32.78575897216797, "kl/ref_to_policy/rejected": 96.24998474121094, "learning_rate": 2.5105046456475047e-10, "logits/chosen": -1.573439121246338, "logits/rejected": -1.8992246389389038, "logps/chosen": -818.360595703125, "logps/rejected": -943.0332641601562, "loss": 3.0842, "nll_loss": 0.6855060458183289, "rewards/accuracies": 1.0, "rewards/chosen": 0.3067846894264221, "rewards/margins": 1.2692844867706299, "rewards/rejected": -0.9624997973442078, "step": 696 }, { "epoch": 1.0, "grad_norm": 4.451387405395508, "kl/ref_to_policy/chosen": -10.505434036254883, "kl/ref_to_policy/mean": 54.38639450073242, "kl/ref_to_policy/rejected": 119.2782211303711, "learning_rate": 6.276301006080676e-11, "logits/chosen": -1.4529314041137695, "logits/rejected": -1.7944130897521973, "logps/chosen": -1452.76806640625, "logps/rejected": -1577.7156982421875, "loss": 3.6946, "nll_loss": 0.840961217880249, "rewards/accuracies": 0.9375, "rewards/chosen": 0.1050543338060379, "rewards/margins": 1.2978365421295166, "rewards/rejected": -1.1927822828292847, "step": 697 }, { "epoch": 1.0, "step": 697, "total_flos": 0.0, "train_loss": 4.881054094737366, "train_runtime": 20257.6196, "train_samples_per_second": 0.551, "train_steps_per_second": 0.034 } ], "logging_steps": 1, "max_steps": 697, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 10, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }