{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 4.0, "eval_steps": 500, "global_step": 4660, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.008583690987124463, "grad_norm": 93.58179277358883, "learning_rate": 9.98068669527897e-07, "logits/chosen": 0.5796874761581421, "logits/rejected": 0.602734386920929, "logps/chosen": -425.0, "logps/rejected": -381.8500061035156, "loss": 0.6812, "rewards/accuracies": 0.34375, "rewards/chosen": -0.02446288987994194, "rewards/margins": 0.031247710809111595, "rewards/rejected": -0.05564384534955025, "step": 10 }, { "epoch": 0.017167381974248927, "grad_norm": 90.82063504727739, "learning_rate": 9.959227467811158e-07, "logits/chosen": 0.6007324457168579, "logits/rejected": 0.576464831829071, "logps/chosen": -377.29998779296875, "logps/rejected": -356.6499938964844, "loss": 0.6934, "rewards/accuracies": 0.42500001192092896, "rewards/chosen": -0.06567688286304474, "rewards/margins": 0.027753448113799095, "rewards/rejected": -0.09355469048023224, "step": 20 }, { "epoch": 0.02575107296137339, "grad_norm": 92.04574913424867, "learning_rate": 9.937768240343348e-07, "logits/chosen": 0.667724609375, "logits/rejected": 0.622265636920929, "logps/chosen": -407.70001220703125, "logps/rejected": -364.29998779296875, "loss": 0.6979, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -0.006365966983139515, "rewards/margins": 0.0382080078125, "rewards/rejected": -0.04450683668255806, "step": 30 }, { "epoch": 0.034334763948497854, "grad_norm": 86.00717227244284, "learning_rate": 9.916309012875536e-07, "logits/chosen": 0.7188476324081421, "logits/rejected": 0.6522461175918579, "logps/chosen": -376.95001220703125, "logps/rejected": -346.29998779296875, "loss": 0.6731, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.06004180759191513, "rewards/margins": 0.10652466118335724, "rewards/rejected": -0.04642028734087944, "step": 40 }, { "epoch": 0.04291845493562232, "grad_norm": 102.52232125753096, "learning_rate": 9.894849785407725e-07, "logits/chosen": 0.6292480230331421, "logits/rejected": 0.5814453363418579, "logps/chosen": -386.8500061035156, "logps/rejected": -361.1000061035156, "loss": 0.6622, "rewards/accuracies": 0.5625, "rewards/chosen": 0.008270263671875, "rewards/margins": 0.13797912001609802, "rewards/rejected": -0.12953491508960724, "step": 50 }, { "epoch": 0.05150214592274678, "grad_norm": 97.88670797009195, "learning_rate": 9.873390557939913e-07, "logits/chosen": 0.610107421875, "logits/rejected": 0.5680176019668579, "logps/chosen": -393.8500061035156, "logps/rejected": -364.8500061035156, "loss": 0.6299, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.06891174614429474, "rewards/margins": 0.23018798232078552, "rewards/rejected": -0.299072265625, "step": 60 }, { "epoch": 0.060085836909871244, "grad_norm": 111.6762572709456, "learning_rate": 9.851931330472103e-07, "logits/chosen": 0.502972424030304, "logits/rejected": 0.45930176973342896, "logps/chosen": -381.20001220703125, "logps/rejected": -350.04998779296875, "loss": 0.6547, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -0.09945373237133026, "rewards/margins": 0.20567627251148224, "rewards/rejected": -0.3045288026332855, "step": 70 }, { "epoch": 0.06866952789699571, "grad_norm": 83.08528731587334, "learning_rate": 9.830472103004291e-07, "logits/chosen": 0.4913085997104645, "logits/rejected": 0.42578125, "logps/chosen": -402.20001220703125, "logps/rejected": -380.1000061035156, "loss": 0.6411, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.2843872010707855, "rewards/margins": 0.2613769471645355, "rewards/rejected": -0.545654296875, "step": 80 }, { "epoch": 0.07725321888412018, "grad_norm": 77.92443023493503, "learning_rate": 9.80901287553648e-07, "logits/chosen": 0.4337402284145355, "logits/rejected": 0.3875488340854645, "logps/chosen": -437.6000061035156, "logps/rejected": -418.1499938964844, "loss": 0.593, "rewards/accuracies": 0.59375, "rewards/chosen": -0.42218780517578125, "rewards/margins": 0.3784728944301605, "rewards/rejected": -0.79931640625, "step": 90 }, { "epoch": 0.08583690987124463, "grad_norm": 84.45105503740035, "learning_rate": 9.78755364806867e-07, "logits/chosen": 0.42509764432907104, "logits/rejected": 0.35767823457717896, "logps/chosen": -434.5, "logps/rejected": -369.79998779296875, "loss": 0.6063, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.46063232421875, "rewards/margins": 0.3885864317417145, "rewards/rejected": -0.848925769329071, "step": 100 }, { "epoch": 0.0944206008583691, "grad_norm": 85.28182014684224, "learning_rate": 9.766094420600858e-07, "logits/chosen": 0.3869872987270355, "logits/rejected": 0.3864990174770355, "logps/chosen": -403.8500061035156, "logps/rejected": -375.79998779296875, "loss": 0.6084, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.555908203125, "rewards/margins": 0.3509521484375, "rewards/rejected": -0.9066406488418579, "step": 110 }, { "epoch": 0.10300429184549356, "grad_norm": 98.26888052663664, "learning_rate": 9.744635193133046e-07, "logits/chosen": 0.44316405057907104, "logits/rejected": 0.4260803163051605, "logps/chosen": -400.70001220703125, "logps/rejected": -383.5, "loss": 0.6264, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.602832019329071, "rewards/margins": 0.3648681640625, "rewards/rejected": -0.96728515625, "step": 120 }, { "epoch": 0.11158798283261803, "grad_norm": 85.93558544935696, "learning_rate": 9.723175965665237e-07, "logits/chosen": 0.4620605409145355, "logits/rejected": 0.42351073026657104, "logps/chosen": -399.6499938964844, "logps/rejected": -395.45001220703125, "loss": 0.5947, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.6856445074081421, "rewards/margins": 0.470458984375, "rewards/rejected": -1.156347632408142, "step": 130 }, { "epoch": 0.12017167381974249, "grad_norm": 65.1455313874359, "learning_rate": 9.701716738197425e-07, "logits/chosen": 0.4283691346645355, "logits/rejected": 0.42476195096969604, "logps/chosen": -423.3999938964844, "logps/rejected": -388.3500061035156, "loss": 0.6248, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.87890625, "rewards/margins": 0.37553709745407104, "rewards/rejected": -1.253515601158142, "step": 140 }, { "epoch": 0.12875536480686695, "grad_norm": 90.76324463885837, "learning_rate": 9.680257510729613e-07, "logits/chosen": 0.47416990995407104, "logits/rejected": 0.47572630643844604, "logps/chosen": -398.3500061035156, "logps/rejected": -406.3500061035156, "loss": 0.5682, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.4722228944301605, "rewards/margins": 0.509570300579071, "rewards/rejected": -0.9808593988418579, "step": 150 }, { "epoch": 0.13733905579399142, "grad_norm": 85.29919464778169, "learning_rate": 9.658798283261801e-07, "logits/chosen": 0.49885255098342896, "logits/rejected": 0.4849853515625, "logps/chosen": -380.6499938964844, "logps/rejected": -362.04998779296875, "loss": 0.6113, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.3292343020439148, "rewards/margins": 0.4195495545864105, "rewards/rejected": -0.749072253704071, "step": 160 }, { "epoch": 0.1459227467811159, "grad_norm": 87.7331253084213, "learning_rate": 9.637339055793992e-07, "logits/chosen": 0.4069274961948395, "logits/rejected": 0.31458741426467896, "logps/chosen": -442.79998779296875, "logps/rejected": -399.70001220703125, "loss": 0.6042, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.2218017578125, "rewards/margins": 0.4540771543979645, "rewards/rejected": -0.6753906011581421, "step": 170 }, { "epoch": 0.15450643776824036, "grad_norm": 80.20871874849743, "learning_rate": 9.61587982832618e-07, "logits/chosen": 0.3735107481479645, "logits/rejected": 0.3502197265625, "logps/chosen": -385.20001220703125, "logps/rejected": -357.79998779296875, "loss": 0.5911, "rewards/accuracies": 0.643750011920929, "rewards/chosen": 0.07313843071460724, "rewards/margins": 0.4320068359375, "rewards/rejected": -0.3587356507778168, "step": 180 }, { "epoch": 0.1630901287553648, "grad_norm": 72.15146852241043, "learning_rate": 9.594420600858368e-07, "logits/chosen": 0.42340087890625, "logits/rejected": 0.4005371034145355, "logps/chosen": -344.29998779296875, "logps/rejected": -345.3999938964844, "loss": 0.652, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.04525909572839737, "rewards/margins": 0.3015808165073395, "rewards/rejected": -0.25642091035842896, "step": 190 }, { "epoch": 0.17167381974248927, "grad_norm": 114.11041147365277, "learning_rate": 9.572961373390558e-07, "logits/chosen": 0.37065428495407104, "logits/rejected": 0.35467529296875, "logps/chosen": -412.95001220703125, "logps/rejected": -348.3999938964844, "loss": 0.6284, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.24711914360523224, "rewards/margins": 0.3831115663051605, "rewards/rejected": -0.13557739555835724, "step": 200 }, { "epoch": 0.18025751072961374, "grad_norm": 93.05749357635496, "learning_rate": 9.551502145922747e-07, "logits/chosen": 0.3744873106479645, "logits/rejected": 0.33289796113967896, "logps/chosen": -410.29998779296875, "logps/rejected": -375.6000061035156, "loss": 0.5514, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 0.3562683165073395, "rewards/margins": 0.5600341558456421, "rewards/rejected": -0.20369109511375427, "step": 210 }, { "epoch": 0.1888412017167382, "grad_norm": 109.02938555944384, "learning_rate": 9.530042918454935e-07, "logits/chosen": 0.3608764708042145, "logits/rejected": 0.33274537324905396, "logps/chosen": -441.0, "logps/rejected": -419.25, "loss": 0.6087, "rewards/accuracies": 0.643750011920929, "rewards/chosen": 0.13027343153953552, "rewards/margins": 0.5419921875, "rewards/rejected": -0.4121337831020355, "step": 220 }, { "epoch": 0.19742489270386265, "grad_norm": 71.47198963991443, "learning_rate": 9.508583690987124e-07, "logits/chosen": 0.505664050579071, "logits/rejected": 0.4896484315395355, "logps/chosen": -399.45001220703125, "logps/rejected": -369.1000061035156, "loss": 0.6613, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.011242675594985485, "rewards/margins": 0.44658660888671875, "rewards/rejected": -0.43497925996780396, "step": 230 }, { "epoch": 0.20600858369098712, "grad_norm": 73.08347967039214, "learning_rate": 9.487124463519312e-07, "logits/chosen": 0.4662109315395355, "logits/rejected": 0.3946777284145355, "logps/chosen": -401.8999938964844, "logps/rejected": -372.95001220703125, "loss": 0.6253, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": 0.19111938774585724, "rewards/margins": 0.510791003704071, "rewards/rejected": -0.3192504942417145, "step": 240 }, { "epoch": 0.2145922746781116, "grad_norm": 94.71659775987155, "learning_rate": 9.465665236051502e-07, "logits/chosen": 0.44658201932907104, "logits/rejected": 0.42170411348342896, "logps/chosen": -383.29998779296875, "logps/rejected": -337.1499938964844, "loss": 0.6041, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.26915282011032104, "rewards/margins": 0.50372314453125, "rewards/rejected": -0.234405517578125, "step": 250 }, { "epoch": 0.22317596566523606, "grad_norm": 74.91853678178387, "learning_rate": 9.444206008583691e-07, "logits/chosen": 0.4902710020542145, "logits/rejected": 0.46636962890625, "logps/chosen": -382.6000061035156, "logps/rejected": -369.1000061035156, "loss": 0.5707, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.02276916429400444, "rewards/margins": 0.610595703125, "rewards/rejected": -0.6336425542831421, "step": 260 }, { "epoch": 0.2317596566523605, "grad_norm": 102.93846011361646, "learning_rate": 9.422746781115879e-07, "logits/chosen": 0.38569945096969604, "logits/rejected": 0.38218384981155396, "logps/chosen": -400.6499938964844, "logps/rejected": -379.5, "loss": 0.5811, "rewards/accuracies": 0.65625, "rewards/chosen": -0.12243805080652237, "rewards/margins": 0.6329101324081421, "rewards/rejected": -0.755419909954071, "step": 270 }, { "epoch": 0.24034334763948498, "grad_norm": 87.89291662683146, "learning_rate": 9.401287553648068e-07, "logits/chosen": 0.4308105409145355, "logits/rejected": 0.3696044981479645, "logps/chosen": -398.1499938964844, "logps/rejected": -380.3999938964844, "loss": 0.572, "rewards/accuracies": 0.65625, "rewards/chosen": 0.03416747972369194, "rewards/margins": 0.597033679485321, "rewards/rejected": -0.5623138546943665, "step": 280 }, { "epoch": 0.24892703862660945, "grad_norm": 71.85274991388243, "learning_rate": 9.379828326180257e-07, "logits/chosen": 0.46428221464157104, "logits/rejected": 0.45774537324905396, "logps/chosen": -412.5, "logps/rejected": -395.20001220703125, "loss": 0.6401, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.02659912034869194, "rewards/margins": 0.44243162870407104, "rewards/rejected": -0.4683288633823395, "step": 290 }, { "epoch": 0.2575107296137339, "grad_norm": 74.92399512596855, "learning_rate": 9.358369098712446e-07, "logits/chosen": 0.45689696073532104, "logits/rejected": 0.3986572325229645, "logps/chosen": -391.20001220703125, "logps/rejected": -364.75, "loss": 0.653, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.080108642578125, "rewards/margins": 0.3924011290073395, "rewards/rejected": -0.4724670350551605, "step": 300 }, { "epoch": 0.26609442060085836, "grad_norm": 81.88684728384392, "learning_rate": 9.336909871244635e-07, "logits/chosen": 0.5186218023300171, "logits/rejected": 0.46978759765625, "logps/chosen": -372.5, "logps/rejected": -364.8999938964844, "loss": 0.656, "rewards/accuracies": 0.625, "rewards/chosen": -0.03024902381002903, "rewards/margins": 0.4516845643520355, "rewards/rejected": -0.4818664491176605, "step": 310 }, { "epoch": 0.27467811158798283, "grad_norm": 85.86081637775938, "learning_rate": 9.315450643776823e-07, "logits/chosen": 0.500048816204071, "logits/rejected": 0.4262939393520355, "logps/chosen": -389.45001220703125, "logps/rejected": -372.70001220703125, "loss": 0.6032, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": 0.08588866889476776, "rewards/margins": 0.588134765625, "rewards/rejected": -0.501635730266571, "step": 320 }, { "epoch": 0.2832618025751073, "grad_norm": 84.59702513236614, "learning_rate": 9.293991416309013e-07, "logits/chosen": 0.5234375, "logits/rejected": 0.49775391817092896, "logps/chosen": -347.3999938964844, "logps/rejected": -315.75, "loss": 0.6075, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.088714599609375, "rewards/margins": 0.555712878704071, "rewards/rejected": -0.46684569120407104, "step": 330 }, { "epoch": 0.2918454935622318, "grad_norm": 83.46104412953727, "learning_rate": 9.272532188841201e-07, "logits/chosen": 0.4837402403354645, "logits/rejected": 0.46240234375, "logps/chosen": -368.95001220703125, "logps/rejected": -350.6000061035156, "loss": 0.6598, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.2888732850551605, "rewards/margins": 0.5232909917831421, "rewards/rejected": -0.23493652045726776, "step": 340 }, { "epoch": 0.30042918454935624, "grad_norm": 89.49298280937151, "learning_rate": 9.25107296137339e-07, "logits/chosen": 0.5703369379043579, "logits/rejected": 0.50677490234375, "logps/chosen": -398.3500061035156, "logps/rejected": -392.1499938964844, "loss": 0.5791, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.11780395358800888, "rewards/margins": 0.5961242914199829, "rewards/rejected": -0.47832030057907104, "step": 350 }, { "epoch": 0.3090128755364807, "grad_norm": 88.14711013023205, "learning_rate": 9.22961373390558e-07, "logits/chosen": 0.4288330078125, "logits/rejected": 0.40631103515625, "logps/chosen": -318.0, "logps/rejected": -316.5, "loss": 0.6263, "rewards/accuracies": 0.643750011920929, "rewards/chosen": 0.02463378943502903, "rewards/margins": 0.5069580078125, "rewards/rejected": -0.4822021424770355, "step": 360 }, { "epoch": 0.31759656652360513, "grad_norm": 75.2523433630276, "learning_rate": 9.208154506437768e-07, "logits/chosen": 0.5197998285293579, "logits/rejected": 0.4921875, "logps/chosen": -394.6499938964844, "logps/rejected": -372.3999938964844, "loss": 0.6539, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": 0.23726806044578552, "rewards/margins": 0.48724365234375, "rewards/rejected": -0.25001221895217896, "step": 370 }, { "epoch": 0.3261802575107296, "grad_norm": 73.79841761975986, "learning_rate": 9.186695278969957e-07, "logits/chosen": 0.41350096464157104, "logits/rejected": 0.3919433653354645, "logps/chosen": -370.1499938964844, "logps/rejected": -359.20001220703125, "loss": 0.5999, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 0.6592773199081421, "rewards/margins": 0.5671631097793579, "rewards/rejected": 0.09239502251148224, "step": 380 }, { "epoch": 0.33476394849785407, "grad_norm": 82.50142241627624, "learning_rate": 9.165236051502145e-07, "logits/chosen": 0.4007568359375, "logits/rejected": 0.3743652403354645, "logps/chosen": -390.1000061035156, "logps/rejected": -345.0, "loss": 0.6465, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": 0.555712878704071, "rewards/margins": 0.447998046875, "rewards/rejected": 0.108123779296875, "step": 390 }, { "epoch": 0.34334763948497854, "grad_norm": 120.9710128669391, "learning_rate": 9.143776824034333e-07, "logits/chosen": 0.45057374238967896, "logits/rejected": 0.34833985567092896, "logps/chosen": -423.0, "logps/rejected": -386.29998779296875, "loss": 0.6609, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.388885498046875, "rewards/margins": 0.3966064453125, "rewards/rejected": -0.00815429724752903, "step": 400 }, { "epoch": 0.351931330472103, "grad_norm": 92.85523809125584, "learning_rate": 9.122317596566524e-07, "logits/chosen": 0.4139160215854645, "logits/rejected": 0.36088865995407104, "logps/chosen": -379.95001220703125, "logps/rejected": -352.20001220703125, "loss": 0.5932, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.36860960721969604, "rewards/margins": 0.5681396722793579, "rewards/rejected": -0.19972534477710724, "step": 410 }, { "epoch": 0.3605150214592275, "grad_norm": 69.49063616980095, "learning_rate": 9.100858369098712e-07, "logits/chosen": 0.3135925233364105, "logits/rejected": 0.3089355528354645, "logps/chosen": -401.70001220703125, "logps/rejected": -382.5, "loss": 0.6094, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": 0.12392578274011612, "rewards/margins": 0.45927733182907104, "rewards/rejected": -0.33513182401657104, "step": 420 }, { "epoch": 0.36909871244635195, "grad_norm": 79.77314354468054, "learning_rate": 9.079399141630901e-07, "logits/chosen": 0.3045898377895355, "logits/rejected": 0.3299316465854645, "logps/chosen": -400.45001220703125, "logps/rejected": -373.54998779296875, "loss": 0.6517, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.48017579317092896, "rewards/margins": 0.40784913301467896, "rewards/rejected": -0.88720703125, "step": 430 }, { "epoch": 0.3776824034334764, "grad_norm": 63.511021273544166, "learning_rate": 9.05793991416309e-07, "logits/chosen": 0.4227539002895355, "logits/rejected": 0.3243164122104645, "logps/chosen": -433.8999938964844, "logps/rejected": -377.95001220703125, "loss": 0.594, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.6689697504043579, "rewards/margins": 0.4835266172885895, "rewards/rejected": -1.1525390148162842, "step": 440 }, { "epoch": 0.38626609442060084, "grad_norm": 89.4275872521739, "learning_rate": 9.036480686695278e-07, "logits/chosen": 0.31220704317092896, "logits/rejected": 0.2880004942417145, "logps/chosen": -437.70001220703125, "logps/rejected": -430.3500061035156, "loss": 0.6675, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.259622186422348, "rewards/margins": 0.45366209745407104, "rewards/rejected": -0.713427722454071, "step": 450 }, { "epoch": 0.3948497854077253, "grad_norm": 93.98712919009515, "learning_rate": 9.015021459227468e-07, "logits/chosen": 0.4172607362270355, "logits/rejected": 0.37629395723342896, "logps/chosen": -378.20001220703125, "logps/rejected": -386.70001220703125, "loss": 0.6115, "rewards/accuracies": 0.59375, "rewards/chosen": 0.005955505184829235, "rewards/margins": 0.4729247987270355, "rewards/rejected": -0.46687012910842896, "step": 460 }, { "epoch": 0.4034334763948498, "grad_norm": 79.20515783559782, "learning_rate": 8.993562231759656e-07, "logits/chosen": 0.35517579317092896, "logits/rejected": 0.3569702208042145, "logps/chosen": -389.20001220703125, "logps/rejected": -374.25, "loss": 0.6195, "rewards/accuracies": 0.65625, "rewards/chosen": -0.03984985500574112, "rewards/margins": 0.4603027403354645, "rewards/rejected": -0.5005859136581421, "step": 470 }, { "epoch": 0.41201716738197425, "grad_norm": 107.32051541531689, "learning_rate": 8.972103004291846e-07, "logits/chosen": 0.45854490995407104, "logits/rejected": 0.44163209199905396, "logps/chosen": -411.25, "logps/rejected": -390.1000061035156, "loss": 0.5452, "rewards/accuracies": 0.71875, "rewards/chosen": 0.19658812880516052, "rewards/margins": 0.7081298828125, "rewards/rejected": -0.5108642578125, "step": 480 }, { "epoch": 0.4206008583690987, "grad_norm": 62.74416053988647, "learning_rate": 8.950643776824034e-07, "logits/chosen": 0.4460205137729645, "logits/rejected": 0.46357423067092896, "logps/chosen": -401.1000061035156, "logps/rejected": -360.3999938964844, "loss": 0.4966, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 0.19114379584789276, "rewards/margins": 0.881054699420929, "rewards/rejected": -0.6904296875, "step": 490 }, { "epoch": 0.4291845493562232, "grad_norm": 76.16159309664116, "learning_rate": 8.929184549356222e-07, "logits/chosen": 0.48249512910842896, "logits/rejected": 0.4447006285190582, "logps/chosen": -384.79998779296875, "logps/rejected": -377.1000061035156, "loss": 0.7097, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.04543457180261612, "rewards/margins": 0.3246398866176605, "rewards/rejected": -0.37069398164749146, "step": 500 }, { "epoch": 0.43776824034334766, "grad_norm": 95.4027652763346, "learning_rate": 8.907725321888412e-07, "logits/chosen": 0.5862666964530945, "logits/rejected": 0.550860583782196, "logps/chosen": -360.54998779296875, "logps/rejected": -344.1499938964844, "loss": 0.619, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.1464385986328125, "rewards/margins": 0.5289551019668579, "rewards/rejected": -0.6761993169784546, "step": 510 }, { "epoch": 0.44635193133047213, "grad_norm": 86.39050673277684, "learning_rate": 8.886266094420601e-07, "logits/chosen": 0.476806640625, "logits/rejected": 0.47833251953125, "logps/chosen": -384.8999938964844, "logps/rejected": -378.70001220703125, "loss": 0.5994, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.14428099989891052, "rewards/margins": 0.585278332233429, "rewards/rejected": -0.7301391363143921, "step": 520 }, { "epoch": 0.45493562231759654, "grad_norm": 89.87779139685644, "learning_rate": 8.86480686695279e-07, "logits/chosen": 0.37958985567092896, "logits/rejected": 0.3419342041015625, "logps/chosen": -381.20001220703125, "logps/rejected": -356.8500061035156, "loss": 0.65, "rewards/accuracies": 0.5625, "rewards/chosen": 0.02516784705221653, "rewards/margins": 0.41046142578125, "rewards/rejected": -0.3851318359375, "step": 530 }, { "epoch": 0.463519313304721, "grad_norm": 99.22116052214102, "learning_rate": 8.843347639484978e-07, "logits/chosen": 0.550341784954071, "logits/rejected": 0.4535156190395355, "logps/chosen": -382.79998779296875, "logps/rejected": -336.54998779296875, "loss": 0.5146, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 0.23041991889476776, "rewards/margins": 0.7979736328125, "rewards/rejected": -0.567474365234375, "step": 540 }, { "epoch": 0.4721030042918455, "grad_norm": 64.73656375784098, "learning_rate": 8.821888412017166e-07, "logits/chosen": 0.514697253704071, "logits/rejected": 0.4959960877895355, "logps/chosen": -391.95001220703125, "logps/rejected": -351.5, "loss": 0.5345, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.27595216035842896, "rewards/margins": 0.862548828125, "rewards/rejected": -0.5862151980400085, "step": 550 }, { "epoch": 0.48068669527896996, "grad_norm": 124.28447782197199, "learning_rate": 8.800429184549357e-07, "logits/chosen": 0.4894042909145355, "logits/rejected": 0.48176270723342896, "logps/chosen": -386.79998779296875, "logps/rejected": -364.25, "loss": 0.5754, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": 0.28924560546875, "rewards/margins": 0.6858154535293579, "rewards/rejected": -0.3966308534145355, "step": 560 }, { "epoch": 0.4892703862660944, "grad_norm": 78.78390348605451, "learning_rate": 8.778969957081545e-07, "logits/chosen": 0.42412108182907104, "logits/rejected": 0.448974609375, "logps/chosen": -395.1000061035156, "logps/rejected": -368.5, "loss": 0.59, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.06474609673023224, "rewards/margins": 0.7273925542831421, "rewards/rejected": -0.6622558832168579, "step": 570 }, { "epoch": 0.4978540772532189, "grad_norm": 96.57456302941547, "learning_rate": 8.757510729613734e-07, "logits/chosen": 0.488037109375, "logits/rejected": 0.48237305879592896, "logps/chosen": -400.1000061035156, "logps/rejected": -369.1499938964844, "loss": 0.6499, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.01710205152630806, "rewards/margins": 0.542651355266571, "rewards/rejected": -0.5259033441543579, "step": 580 }, { "epoch": 0.5064377682403434, "grad_norm": 80.36559533342219, "learning_rate": 8.736051502145922e-07, "logits/chosen": 0.4302734434604645, "logits/rejected": 0.379638671875, "logps/chosen": -408.20001220703125, "logps/rejected": -362.54998779296875, "loss": 0.6016, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.13620606064796448, "rewards/margins": 0.6349853277206421, "rewards/rejected": -0.49858397245407104, "step": 590 }, { "epoch": 0.5150214592274678, "grad_norm": 87.57334734477797, "learning_rate": 8.714592274678111e-07, "logits/chosen": 0.529296875, "logits/rejected": 0.4739746153354645, "logps/chosen": -382.29998779296875, "logps/rejected": -338.54998779296875, "loss": 0.6806, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.18206176161766052, "rewards/margins": 0.506213366985321, "rewards/rejected": -0.3235229551792145, "step": 600 }, { "epoch": 0.5236051502145923, "grad_norm": 66.14290009729496, "learning_rate": 8.693133047210301e-07, "logits/chosen": 0.4843505918979645, "logits/rejected": 0.4866699278354645, "logps/chosen": -397.04998779296875, "logps/rejected": -362.29998779296875, "loss": 0.5127, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 0.4958740174770355, "rewards/margins": 0.716687023639679, "rewards/rejected": -0.22125244140625, "step": 610 }, { "epoch": 0.5321888412017167, "grad_norm": 85.67950734577094, "learning_rate": 8.671673819742489e-07, "logits/chosen": 0.46858519315719604, "logits/rejected": 0.4464355409145355, "logps/chosen": -398.79998779296875, "logps/rejected": -391.54998779296875, "loss": 0.6119, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 0.4027343690395355, "rewards/margins": 0.49418336153030396, "rewards/rejected": -0.09122314304113388, "step": 620 }, { "epoch": 0.5407725321888412, "grad_norm": 89.54420184313848, "learning_rate": 8.650214592274677e-07, "logits/chosen": 0.4477783143520355, "logits/rejected": 0.41614991426467896, "logps/chosen": -413.5, "logps/rejected": -396.8500061035156, "loss": 0.6007, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 0.4101806581020355, "rewards/margins": 0.661846935749054, "rewards/rejected": -0.2515319883823395, "step": 630 }, { "epoch": 0.5493562231759657, "grad_norm": 73.75554848292853, "learning_rate": 8.628755364806867e-07, "logits/chosen": 0.4527343809604645, "logits/rejected": 0.3885742127895355, "logps/chosen": -394.54998779296875, "logps/rejected": -372.20001220703125, "loss": 0.5188, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": 0.20379638671875, "rewards/margins": 0.837353527545929, "rewards/rejected": -0.633190929889679, "step": 640 }, { "epoch": 0.5579399141630901, "grad_norm": 80.00174120709897, "learning_rate": 8.607296137339055e-07, "logits/chosen": 0.4269042909145355, "logits/rejected": 0.446737676858902, "logps/chosen": -400.3500061035156, "logps/rejected": -389.70001220703125, "loss": 0.6723, "rewards/accuracies": 0.59375, "rewards/chosen": -0.08190307766199112, "rewards/margins": 0.5954040288925171, "rewards/rejected": -0.677978515625, "step": 650 }, { "epoch": 0.5665236051502146, "grad_norm": 81.33007943598504, "learning_rate": 8.585836909871245e-07, "logits/chosen": 0.36271971464157104, "logits/rejected": 0.373291015625, "logps/chosen": -401.6499938964844, "logps/rejected": -352.04998779296875, "loss": 0.6416, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.06304015964269638, "rewards/margins": 0.5346924066543579, "rewards/rejected": -0.4721527099609375, "step": 660 }, { "epoch": 0.575107296137339, "grad_norm": 68.29310349320347, "learning_rate": 8.564377682403433e-07, "logits/chosen": 0.48261719942092896, "logits/rejected": 0.4375, "logps/chosen": -403.75, "logps/rejected": -347.0, "loss": 0.5815, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.20468750596046448, "rewards/margins": 0.607714831829071, "rewards/rejected": -0.4027160704135895, "step": 670 }, { "epoch": 0.5836909871244635, "grad_norm": 103.25247431387625, "learning_rate": 8.542918454935622e-07, "logits/chosen": 0.52508544921875, "logits/rejected": 0.4754638671875, "logps/chosen": -406.1499938964844, "logps/rejected": -406.1499938964844, "loss": 0.6699, "rewards/accuracies": 0.5625, "rewards/chosen": 0.27338868379592896, "rewards/margins": 0.3425231873989105, "rewards/rejected": -0.068145751953125, "step": 680 }, { "epoch": 0.592274678111588, "grad_norm": 100.78799359844442, "learning_rate": 8.521459227467811e-07, "logits/chosen": 0.545703113079071, "logits/rejected": 0.521728515625, "logps/chosen": -405.29998779296875, "logps/rejected": -363.1000061035156, "loss": 0.598, "rewards/accuracies": 0.643750011920929, "rewards/chosen": 0.5170043706893921, "rewards/margins": 0.656787097454071, "rewards/rejected": -0.13951416313648224, "step": 690 }, { "epoch": 0.6008583690987125, "grad_norm": 87.90233738184071, "learning_rate": 8.499999999999999e-07, "logits/chosen": 0.5794922113418579, "logits/rejected": 0.552001953125, "logps/chosen": -408.8999938964844, "logps/rejected": -386.29998779296875, "loss": 0.6168, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.2677368223667145, "rewards/margins": 0.6103881597518921, "rewards/rejected": -0.34282225370407104, "step": 700 }, { "epoch": 0.6094420600858369, "grad_norm": 69.84171683395057, "learning_rate": 8.478540772532189e-07, "logits/chosen": 0.41331785917282104, "logits/rejected": 0.44390869140625, "logps/chosen": -392.6000061035156, "logps/rejected": -360.1000061035156, "loss": 0.6834, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": 0.05913085862994194, "rewards/margins": 0.39434814453125, "rewards/rejected": -0.3347122073173523, "step": 710 }, { "epoch": 0.6180257510729614, "grad_norm": 99.78359445853926, "learning_rate": 8.457081545064378e-07, "logits/chosen": 0.384765625, "logits/rejected": 0.37431639432907104, "logps/chosen": -399.1000061035156, "logps/rejected": -362.70001220703125, "loss": 0.5726, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 0.07016601413488388, "rewards/margins": 0.543933093547821, "rewards/rejected": -0.4736389219760895, "step": 720 }, { "epoch": 0.6266094420600858, "grad_norm": 81.29381172855797, "learning_rate": 8.435622317596566e-07, "logits/chosen": 0.344970703125, "logits/rejected": 0.3149215579032898, "logps/chosen": -408.20001220703125, "logps/rejected": -396.8999938964844, "loss": 0.5767, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.19256591796875, "rewards/margins": 0.741772472858429, "rewards/rejected": -0.5497070550918579, "step": 730 }, { "epoch": 0.6351931330472103, "grad_norm": 81.32974588736727, "learning_rate": 8.414163090128755e-07, "logits/chosen": 0.4842773377895355, "logits/rejected": 0.4497924745082855, "logps/chosen": -412.79998779296875, "logps/rejected": -365.6499938964844, "loss": 0.6549, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.07790527492761612, "rewards/margins": 0.531848132610321, "rewards/rejected": -0.45338135957717896, "step": 740 }, { "epoch": 0.6437768240343348, "grad_norm": 68.83759125624144, "learning_rate": 8.392703862660943e-07, "logits/chosen": 0.32843017578125, "logits/rejected": 0.2613769471645355, "logps/chosen": -418.3999938964844, "logps/rejected": -389.45001220703125, "loss": 0.5871, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.15720215439796448, "rewards/margins": 0.6257873773574829, "rewards/rejected": -0.7826172113418579, "step": 750 }, { "epoch": 0.6523605150214592, "grad_norm": 94.9655215225264, "learning_rate": 8.371244635193134e-07, "logits/chosen": 0.3301757872104645, "logits/rejected": 0.2924133241176605, "logps/chosen": -427.79998779296875, "logps/rejected": -393.1000061035156, "loss": 0.6049, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.09816894680261612, "rewards/margins": 0.633288562297821, "rewards/rejected": -0.7317870855331421, "step": 760 }, { "epoch": 0.6609442060085837, "grad_norm": 105.2633440233459, "learning_rate": 8.349785407725322e-07, "logits/chosen": 0.37055665254592896, "logits/rejected": 0.3477233946323395, "logps/chosen": -385.1499938964844, "logps/rejected": -353.79998779296875, "loss": 0.6304, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.1292858123779297, "rewards/margins": 0.5131469964981079, "rewards/rejected": -0.642163097858429, "step": 770 }, { "epoch": 0.6695278969957081, "grad_norm": 73.67460551385861, "learning_rate": 8.32832618025751e-07, "logits/chosen": 0.4927001893520355, "logits/rejected": 0.4589477479457855, "logps/chosen": -392.70001220703125, "logps/rejected": -355.8999938964844, "loss": 0.6357, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.06964416801929474, "rewards/margins": 0.5264037847518921, "rewards/rejected": -0.596020519733429, "step": 780 }, { "epoch": 0.6781115879828327, "grad_norm": 84.58599891354547, "learning_rate": 8.306866952789699e-07, "logits/chosen": 0.4305664002895355, "logits/rejected": 0.4263671934604645, "logps/chosen": -389.54998779296875, "logps/rejected": -346.6499938964844, "loss": 0.5892, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 0.09847869724035263, "rewards/margins": 0.685534656047821, "rewards/rejected": -0.5867553949356079, "step": 790 }, { "epoch": 0.6866952789699571, "grad_norm": 78.12800187615602, "learning_rate": 8.285407725321888e-07, "logits/chosen": 0.37385255098342896, "logits/rejected": 0.35369569063186646, "logps/chosen": -386.20001220703125, "logps/rejected": -356.0, "loss": 0.5914, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": 0.07282714545726776, "rewards/margins": 0.5387939214706421, "rewards/rejected": -0.46525877714157104, "step": 800 }, { "epoch": 0.6952789699570815, "grad_norm": 81.37299122438141, "learning_rate": 8.263948497854076e-07, "logits/chosen": 0.46989744901657104, "logits/rejected": 0.39287108182907104, "logps/chosen": -419.0, "logps/rejected": -388.5, "loss": 0.5913, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 0.0289154052734375, "rewards/margins": 0.5942657589912415, "rewards/rejected": -0.564892590045929, "step": 810 }, { "epoch": 0.703862660944206, "grad_norm": 67.46246140825866, "learning_rate": 8.242489270386266e-07, "logits/chosen": 0.4120727479457855, "logits/rejected": 0.3407653868198395, "logps/chosen": -404.6000061035156, "logps/rejected": -347.6000061035156, "loss": 0.5597, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.265768438577652, "rewards/margins": 0.6657959222793579, "rewards/rejected": -0.40037840604782104, "step": 820 }, { "epoch": 0.7124463519313304, "grad_norm": 68.97405456487762, "learning_rate": 8.221030042918454e-07, "logits/chosen": 0.4730224609375, "logits/rejected": 0.4612182676792145, "logps/chosen": -381.1000061035156, "logps/rejected": -353.04998779296875, "loss": 0.672, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.11038818210363388, "rewards/margins": 0.4702087342739105, "rewards/rejected": -0.36011964082717896, "step": 830 }, { "epoch": 0.721030042918455, "grad_norm": 70.73735888645231, "learning_rate": 8.199570815450644e-07, "logits/chosen": 0.47395020723342896, "logits/rejected": 0.456298828125, "logps/chosen": -387.25, "logps/rejected": -384.8500061035156, "loss": 0.6307, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.16070556640625, "rewards/margins": 0.550976574420929, "rewards/rejected": -0.3901611268520355, "step": 840 }, { "epoch": 0.7296137339055794, "grad_norm": 60.28551837301069, "learning_rate": 8.178111587982832e-07, "logits/chosen": 0.5111083984375, "logits/rejected": 0.48583984375, "logps/chosen": -380.0, "logps/rejected": -352.95001220703125, "loss": 0.5658, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 0.3114379942417145, "rewards/margins": 0.6934570074081421, "rewards/rejected": -0.38310545682907104, "step": 850 }, { "epoch": 0.7381974248927039, "grad_norm": 96.35268033933663, "learning_rate": 8.15665236051502e-07, "logits/chosen": 0.573657214641571, "logits/rejected": 0.4818115234375, "logps/chosen": -387.20001220703125, "logps/rejected": -384.0, "loss": 0.6053, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.27497559785842896, "rewards/margins": 0.612774670124054, "rewards/rejected": -0.3378051817417145, "step": 860 }, { "epoch": 0.7467811158798283, "grad_norm": 90.61769083278993, "learning_rate": 8.13519313304721e-07, "logits/chosen": 0.41966551542282104, "logits/rejected": 0.4227539002895355, "logps/chosen": -411.8999938964844, "logps/rejected": -390.1000061035156, "loss": 0.5304, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.25092774629592896, "rewards/margins": 0.797167956829071, "rewards/rejected": -0.5449463129043579, "step": 870 }, { "epoch": 0.7553648068669528, "grad_norm": 85.04121369502657, "learning_rate": 8.113733905579399e-07, "logits/chosen": 0.4425048828125, "logits/rejected": 0.4794921875, "logps/chosen": -384.5, "logps/rejected": -386.1000061035156, "loss": 0.534, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 0.01999206468462944, "rewards/margins": 0.8163909912109375, "rewards/rejected": -0.796875, "step": 880 }, { "epoch": 0.7639484978540773, "grad_norm": 93.7342910179825, "learning_rate": 8.092274678111588e-07, "logits/chosen": 0.4049072265625, "logits/rejected": 0.40424805879592896, "logps/chosen": -411.29998779296875, "logps/rejected": -394.8999938964844, "loss": 0.6404, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.37672728300094604, "rewards/margins": 0.536724865436554, "rewards/rejected": -0.913134753704071, "step": 890 }, { "epoch": 0.7725321888412017, "grad_norm": 70.58014534236187, "learning_rate": 8.070815450643776e-07, "logits/chosen": 0.40985107421875, "logits/rejected": 0.3108276426792145, "logps/chosen": -411.3999938964844, "logps/rejected": -375.1499938964844, "loss": 0.6039, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.5054687261581421, "rewards/margins": 0.6588989496231079, "rewards/rejected": -1.164941430091858, "step": 900 }, { "epoch": 0.7811158798283262, "grad_norm": 86.07791595357675, "learning_rate": 8.049356223175964e-07, "logits/chosen": 0.3715454041957855, "logits/rejected": 0.41428834199905396, "logps/chosen": -424.8999938964844, "logps/rejected": -417.79998779296875, "loss": 0.6017, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.38605958223342896, "rewards/margins": 0.7439209222793579, "rewards/rejected": -1.129296898841858, "step": 910 }, { "epoch": 0.7896995708154506, "grad_norm": 68.70357250523489, "learning_rate": 8.027896995708155e-07, "logits/chosen": 0.4511474668979645, "logits/rejected": 0.428567498922348, "logps/chosen": -366.5, "logps/rejected": -352.29998779296875, "loss": 0.5954, "rewards/accuracies": 0.59375, "rewards/chosen": -0.28925782442092896, "rewards/margins": 0.635180652141571, "rewards/rejected": -0.924487292766571, "step": 920 }, { "epoch": 0.7982832618025751, "grad_norm": 94.23647032542529, "learning_rate": 8.006437768240343e-07, "logits/chosen": 0.41716307401657104, "logits/rejected": 0.39202880859375, "logps/chosen": -432.79998779296875, "logps/rejected": -372.95001220703125, "loss": 0.5952, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.01990966871380806, "rewards/margins": 0.6559082269668579, "rewards/rejected": -0.67626953125, "step": 930 }, { "epoch": 0.8068669527896996, "grad_norm": 67.78630078099205, "learning_rate": 7.984978540772532e-07, "logits/chosen": 0.4898262023925781, "logits/rejected": 0.48072052001953125, "logps/chosen": -398.79998779296875, "logps/rejected": -385.1499938964844, "loss": 0.6372, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.0875396728515625, "rewards/margins": 0.554125964641571, "rewards/rejected": -0.46660155057907104, "step": 940 }, { "epoch": 0.8154506437768241, "grad_norm": 85.37779881841767, "learning_rate": 7.96351931330472e-07, "logits/chosen": 0.41644287109375, "logits/rejected": 0.4005126953125, "logps/chosen": -390.8999938964844, "logps/rejected": -361.75, "loss": 0.6731, "rewards/accuracies": 0.59375, "rewards/chosen": 0.3610778748989105, "rewards/margins": 0.596081554889679, "rewards/rejected": -0.23515014350414276, "step": 950 }, { "epoch": 0.8240343347639485, "grad_norm": 69.05903515545417, "learning_rate": 7.942060085836909e-07, "logits/chosen": 0.5565429925918579, "logits/rejected": 0.526562511920929, "logps/chosen": -357.0, "logps/rejected": -324.79998779296875, "loss": 0.6469, "rewards/accuracies": 0.581250011920929, "rewards/chosen": 0.47784727811813354, "rewards/margins": 0.55218505859375, "rewards/rejected": -0.07432250678539276, "step": 960 }, { "epoch": 0.8326180257510729, "grad_norm": 91.38929880143064, "learning_rate": 7.920600858369099e-07, "logits/chosen": 0.4556823670864105, "logits/rejected": 0.4958434998989105, "logps/chosen": -366.6000061035156, "logps/rejected": -328.1000061035156, "loss": 0.6118, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.678271472454071, "rewards/margins": 0.552355945110321, "rewards/rejected": 0.12556152045726776, "step": 970 }, { "epoch": 0.8412017167381974, "grad_norm": 75.1009114136202, "learning_rate": 7.899141630901287e-07, "logits/chosen": 0.40715330839157104, "logits/rejected": 0.3968749940395355, "logps/chosen": -393.75, "logps/rejected": -357.70001220703125, "loss": 0.6949, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.42436522245407104, "rewards/margins": 0.477783203125, "rewards/rejected": -0.05361328274011612, "step": 980 }, { "epoch": 0.8497854077253219, "grad_norm": 87.41570229462448, "learning_rate": 7.877682403433477e-07, "logits/chosen": 0.4568847715854645, "logits/rejected": 0.44731444120407104, "logps/chosen": -381.29998779296875, "logps/rejected": -387.79998779296875, "loss": 0.6382, "rewards/accuracies": 0.643750011920929, "rewards/chosen": 0.4562225341796875, "rewards/margins": 0.4949707090854645, "rewards/rejected": -0.03890686109662056, "step": 990 }, { "epoch": 0.8583690987124464, "grad_norm": 69.51390496935858, "learning_rate": 7.856223175965665e-07, "logits/chosen": 0.3939208984375, "logits/rejected": 0.36286622285842896, "logps/chosen": -419.29998779296875, "logps/rejected": -374.54998779296875, "loss": 0.6738, "rewards/accuracies": 0.625, "rewards/chosen": 0.15354004502296448, "rewards/margins": 0.4848388731479645, "rewards/rejected": -0.3309265077114105, "step": 1000 }, { "epoch": 0.8669527896995708, "grad_norm": 91.17244990250124, "learning_rate": 7.834763948497853e-07, "logits/chosen": 0.42595213651657104, "logits/rejected": 0.36262816190719604, "logps/chosen": -367.3999938964844, "logps/rejected": -357.3999938964844, "loss": 0.6722, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": 0.10726623237133026, "rewards/margins": 0.453369140625, "rewards/rejected": -0.3458496034145355, "step": 1010 }, { "epoch": 0.8755364806866953, "grad_norm": 80.80032006956553, "learning_rate": 7.813304721030043e-07, "logits/chosen": 0.2964843809604645, "logits/rejected": 0.2734375, "logps/chosen": -367.3999938964844, "logps/rejected": -360.25, "loss": 0.6007, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.23255614936351776, "rewards/margins": 0.64599609375, "rewards/rejected": -0.41318970918655396, "step": 1020 }, { "epoch": 0.8841201716738197, "grad_norm": 89.12707517676105, "learning_rate": 7.791845493562232e-07, "logits/chosen": 0.3741699159145355, "logits/rejected": 0.3499206602573395, "logps/chosen": -335.6499938964844, "logps/rejected": -335.79998779296875, "loss": 0.615, "rewards/accuracies": 0.625, "rewards/chosen": 0.16214600205421448, "rewards/margins": 0.6152099370956421, "rewards/rejected": -0.45367431640625, "step": 1030 }, { "epoch": 0.8927038626609443, "grad_norm": 100.32488980210373, "learning_rate": 7.77038626609442e-07, "logits/chosen": 0.328369140625, "logits/rejected": 0.2886108458042145, "logps/chosen": -439.70001220703125, "logps/rejected": -386.20001220703125, "loss": 0.6484, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.14078979194164276, "rewards/margins": 0.49641114473342896, "rewards/rejected": -0.3555969297885895, "step": 1040 }, { "epoch": 0.9012875536480687, "grad_norm": 59.967763918848746, "learning_rate": 7.748927038626609e-07, "logits/chosen": 0.3828125, "logits/rejected": 0.33555907011032104, "logps/chosen": -407.20001220703125, "logps/rejected": -396.04998779296875, "loss": 0.5394, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.26093751192092896, "rewards/margins": 0.71142578125, "rewards/rejected": -0.45024412870407104, "step": 1050 }, { "epoch": 0.9098712446351931, "grad_norm": 88.52618698543928, "learning_rate": 7.727467811158797e-07, "logits/chosen": 0.41242676973342896, "logits/rejected": 0.3967529237270355, "logps/chosen": -412.0, "logps/rejected": -395.1499938964844, "loss": 0.6799, "rewards/accuracies": 0.606249988079071, "rewards/chosen": 0.28033447265625, "rewards/margins": 0.47071534395217896, "rewards/rejected": -0.19006958603858948, "step": 1060 }, { "epoch": 0.9184549356223176, "grad_norm": 58.09384314869585, "learning_rate": 7.706008583690988e-07, "logits/chosen": 0.3746887147426605, "logits/rejected": 0.3568878173828125, "logps/chosen": -382.6499938964844, "logps/rejected": -396.1000061035156, "loss": 0.5667, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.03712158277630806, "rewards/margins": 0.6381591558456421, "rewards/rejected": -0.600433349609375, "step": 1070 }, { "epoch": 0.927038626609442, "grad_norm": 59.06527652056125, "learning_rate": 7.684549356223176e-07, "logits/chosen": 0.31816405057907104, "logits/rejected": 0.2832092344760895, "logps/chosen": -401.25, "logps/rejected": -367.8999938964844, "loss": 0.5618, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.41768187284469604, "rewards/margins": 0.7224792242050171, "rewards/rejected": -0.30555421113967896, "step": 1080 }, { "epoch": 0.9356223175965666, "grad_norm": 102.25652603879425, "learning_rate": 7.663090128755364e-07, "logits/chosen": 0.2743774354457855, "logits/rejected": 0.246337890625, "logps/chosen": -386.8500061035156, "logps/rejected": -372.70001220703125, "loss": 0.6041, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.05422363430261612, "rewards/margins": 0.62799072265625, "rewards/rejected": -0.573486328125, "step": 1090 }, { "epoch": 0.944206008583691, "grad_norm": 106.14436463232997, "learning_rate": 7.641630901287553e-07, "logits/chosen": 0.291818231344223, "logits/rejected": 0.28303223848342896, "logps/chosen": -376.04998779296875, "logps/rejected": -366.54998779296875, "loss": 0.7456, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.3132080137729645, "rewards/margins": 0.362060546875, "rewards/rejected": -0.674877941608429, "step": 1100 }, { "epoch": 0.9527896995708155, "grad_norm": 95.48317917253908, "learning_rate": 7.620171673819742e-07, "logits/chosen": 0.20518799126148224, "logits/rejected": 0.15603026747703552, "logps/chosen": -402.70001220703125, "logps/rejected": -385.3999938964844, "loss": 0.6636, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.2752014100551605, "rewards/margins": 0.49555665254592896, "rewards/rejected": -0.76959228515625, "step": 1110 }, { "epoch": 0.9613733905579399, "grad_norm": 90.58282859400967, "learning_rate": 7.598712446351932e-07, "logits/chosen": 0.33964842557907104, "logits/rejected": 0.30929869413375854, "logps/chosen": -377.79998779296875, "logps/rejected": -321.79998779296875, "loss": 0.5913, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.03917236253619194, "rewards/margins": 0.61932373046875, "rewards/rejected": -0.6585937738418579, "step": 1120 }, { "epoch": 0.9699570815450643, "grad_norm": 87.28424558676662, "learning_rate": 7.57725321888412e-07, "logits/chosen": 0.29582518339157104, "logits/rejected": 0.2571655213832855, "logps/chosen": -407.79998779296875, "logps/rejected": -389.3999938964844, "loss": 0.5912, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.09501953423023224, "rewards/margins": 0.654223620891571, "rewards/rejected": -0.559130847454071, "step": 1130 }, { "epoch": 0.9785407725321889, "grad_norm": 85.60494718907375, "learning_rate": 7.555793991416308e-07, "logits/chosen": 0.41094666719436646, "logits/rejected": 0.41706544160842896, "logps/chosen": -382.70001220703125, "logps/rejected": -348.70001220703125, "loss": 0.5742, "rewards/accuracies": 0.65625, "rewards/chosen": 0.27069091796875, "rewards/margins": 0.5933593511581421, "rewards/rejected": -0.32249754667282104, "step": 1140 }, { "epoch": 0.9871244635193133, "grad_norm": 50.363676931367834, "learning_rate": 7.534334763948498e-07, "logits/chosen": 0.4289917051792145, "logits/rejected": 0.41267091035842896, "logps/chosen": -395.1499938964844, "logps/rejected": -361.42498779296875, "loss": 0.6254, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.371734619140625, "rewards/margins": 0.581817626953125, "rewards/rejected": -0.2099609375, "step": 1150 }, { "epoch": 0.9957081545064378, "grad_norm": 55.940598832902246, "learning_rate": 7.512875536480686e-07, "logits/chosen": 0.573437511920929, "logits/rejected": 0.5156005620956421, "logps/chosen": -425.6499938964844, "logps/rejected": -387.1499938964844, "loss": 0.5841, "rewards/accuracies": 0.6875, "rewards/chosen": 0.6337890625, "rewards/margins": 0.750927746295929, "rewards/rejected": -0.11760864406824112, "step": 1160 }, { "epoch": 1.0042918454935623, "grad_norm": 12.86409087280012, "learning_rate": 7.491416309012876e-07, "logits/chosen": 0.4547363221645355, "logits/rejected": 0.44355469942092896, "logps/chosen": -365.20001220703125, "logps/rejected": -373.70001220703125, "loss": 0.3498, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 1.1112182140350342, "rewards/margins": 1.806604027748108, "rewards/rejected": -0.6956237554550171, "step": 1170 }, { "epoch": 1.0128755364806867, "grad_norm": 11.362025096966068, "learning_rate": 7.469957081545064e-07, "logits/chosen": 0.46260833740234375, "logits/rejected": 0.4178222715854645, "logps/chosen": -369.04998779296875, "logps/rejected": -374.45001220703125, "loss": 0.0931, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.549218773841858, "rewards/margins": 3.5296874046325684, "rewards/rejected": -1.9792969226837158, "step": 1180 }, { "epoch": 1.0214592274678111, "grad_norm": 19.382088634908623, "learning_rate": 7.448497854077253e-07, "logits/chosen": 0.3198791444301605, "logits/rejected": 0.2780395448207855, "logps/chosen": -420.20001220703125, "logps/rejected": -396.20001220703125, "loss": 0.1099, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 1.037109375, "rewards/margins": 3.83984375, "rewards/rejected": -2.799609422683716, "step": 1190 }, { "epoch": 1.0300429184549356, "grad_norm": 13.968910045559367, "learning_rate": 7.427038626609442e-07, "logits/chosen": 0.1500396728515625, "logits/rejected": 0.20372924208641052, "logps/chosen": -366.8500061035156, "logps/rejected": -362.25, "loss": 0.0914, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.4399658143520355, "rewards/margins": 3.8109374046325684, "rewards/rejected": -3.374218702316284, "step": 1200 }, { "epoch": 1.0386266094420602, "grad_norm": 18.19601596527249, "learning_rate": 7.40557939914163e-07, "logits/chosen": 0.21311035752296448, "logits/rejected": 0.12650756537914276, "logps/chosen": -372.1499938964844, "logps/rejected": -376.8500061035156, "loss": 0.0692, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.01190795935690403, "rewards/margins": 4.326562404632568, "rewards/rejected": -4.3125, "step": 1210 }, { "epoch": 1.0472103004291846, "grad_norm": 12.296257211053714, "learning_rate": 7.38412017167382e-07, "logits/chosen": 0.10661010444164276, "logits/rejected": 0.16643676161766052, "logps/chosen": -400.8500061035156, "logps/rejected": -441.3999938964844, "loss": 0.099, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": 0.03742218017578125, "rewards/margins": 4.642968654632568, "rewards/rejected": -4.602343559265137, "step": 1220 }, { "epoch": 1.055793991416309, "grad_norm": 11.453489555889282, "learning_rate": 7.362660944206009e-07, "logits/chosen": 0.08863525092601776, "logits/rejected": 0.09099731594324112, "logps/chosen": -402.70001220703125, "logps/rejected": -419.6499938964844, "loss": 0.0919, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -0.18503722548484802, "rewards/margins": 4.546875, "rewards/rejected": -4.728125095367432, "step": 1230 }, { "epoch": 1.0643776824034334, "grad_norm": 32.78558014586438, "learning_rate": 7.341201716738197e-07, "logits/chosen": -0.063390351831913, "logits/rejected": -0.0585479736328125, "logps/chosen": -417.70001220703125, "logps/rejected": -422.1499938964844, "loss": 0.0782, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -0.38902586698532104, "rewards/margins": 4.600781440734863, "rewards/rejected": -4.990624904632568, "step": 1240 }, { "epoch": 1.0729613733905579, "grad_norm": 12.645538579517725, "learning_rate": 7.319742489270386e-07, "logits/chosen": 0.0847930908203125, "logits/rejected": 0.1536865234375, "logps/chosen": -380.29998779296875, "logps/rejected": -422.1499938964844, "loss": 0.0786, "rewards/accuracies": 0.96875, "rewards/chosen": -0.08945312350988388, "rewards/margins": 4.662499904632568, "rewards/rejected": -4.748437404632568, "step": 1250 }, { "epoch": 1.0815450643776825, "grad_norm": 42.59621575194187, "learning_rate": 7.298283261802574e-07, "logits/chosen": 0.01641235314309597, "logits/rejected": 0.05018310621380806, "logps/chosen": -393.0, "logps/rejected": -413.6499938964844, "loss": 0.0959, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 0.14216308295726776, "rewards/margins": 4.462500095367432, "rewards/rejected": -4.31640625, "step": 1260 }, { "epoch": 1.090128755364807, "grad_norm": 21.421349185224773, "learning_rate": 7.276824034334764e-07, "logits/chosen": 0.22298583388328552, "logits/rejected": 0.15301513671875, "logps/chosen": -412.75, "logps/rejected": -400.1499938964844, "loss": 0.0819, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.07669677585363388, "rewards/margins": 4.625781059265137, "rewards/rejected": -4.705468654632568, "step": 1270 }, { "epoch": 1.0987124463519313, "grad_norm": 11.363480162748225, "learning_rate": 7.255364806866953e-07, "logits/chosen": 0.17399902641773224, "logits/rejected": 0.1483917236328125, "logps/chosen": -451.3999938964844, "logps/rejected": -429.5, "loss": 0.0797, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.33488768339157104, "rewards/margins": 4.767187595367432, "rewards/rejected": -4.43359375, "step": 1280 }, { "epoch": 1.1072961373390557, "grad_norm": 13.194407969384184, "learning_rate": 7.233905579399141e-07, "logits/chosen": 0.16759033501148224, "logits/rejected": 0.13680724799633026, "logps/chosen": -419.75, "logps/rejected": -432.1499938964844, "loss": 0.0769, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.5582519769668579, "rewards/margins": 4.743750095367432, "rewards/rejected": -4.185937404632568, "step": 1290 }, { "epoch": 1.1158798283261802, "grad_norm": 11.115551075589273, "learning_rate": 7.21244635193133e-07, "logits/chosen": 0.13477173447608948, "logits/rejected": 0.10588989406824112, "logps/chosen": -377.1499938964844, "logps/rejected": -384.45001220703125, "loss": 0.0812, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -0.3731445372104645, "rewards/margins": 4.574999809265137, "rewards/rejected": -4.9453125, "step": 1300 }, { "epoch": 1.1244635193133048, "grad_norm": 10.329976076573034, "learning_rate": 7.190987124463519e-07, "logits/chosen": 0.02620239183306694, "logits/rejected": 0.0072692870162427425, "logps/chosen": -363.70001220703125, "logps/rejected": -389.5, "loss": 0.0877, "rewards/accuracies": 0.96875, "rewards/chosen": -0.817675769329071, "rewards/margins": 4.48046875, "rewards/rejected": -5.297656059265137, "step": 1310 }, { "epoch": 1.1330472103004292, "grad_norm": 11.581426063277247, "learning_rate": 7.169527896995708e-07, "logits/chosen": 0.05608520656824112, "logits/rejected": -0.02749023400247097, "logps/chosen": -371.25, "logps/rejected": -386.70001220703125, "loss": 0.1068, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.731372058391571, "rewards/margins": 4.567187309265137, "rewards/rejected": -5.294531345367432, "step": 1320 }, { "epoch": 1.1416309012875536, "grad_norm": 29.191572859901264, "learning_rate": 7.148068669527897e-07, "logits/chosen": -0.02387695387005806, "logits/rejected": -0.03231201320886612, "logps/chosen": -415.6000061035156, "logps/rejected": -427.8999938964844, "loss": 0.0973, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -0.857617199420929, "rewards/margins": 4.910937309265137, "rewards/rejected": -5.768750190734863, "step": 1330 }, { "epoch": 1.150214592274678, "grad_norm": 13.809950696238593, "learning_rate": 7.126609442060085e-07, "logits/chosen": 0.009387207217514515, "logits/rejected": -0.01857299730181694, "logps/chosen": -426.29998779296875, "logps/rejected": -409.04998779296875, "loss": 0.0862, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.40388792753219604, "rewards/margins": 4.817968845367432, "rewards/rejected": -5.220312595367432, "step": 1340 }, { "epoch": 1.1587982832618025, "grad_norm": 18.526632610068816, "learning_rate": 7.105150214592275e-07, "logits/chosen": 0.10792388767004013, "logits/rejected": 0.1154327392578125, "logps/chosen": -413.0, "logps/rejected": -418.20001220703125, "loss": 0.0684, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.0060791014693677425, "rewards/margins": 4.670312404632568, "rewards/rejected": -4.664843559265137, "step": 1350 }, { "epoch": 1.167381974248927, "grad_norm": 16.455392869011106, "learning_rate": 7.083690987124463e-07, "logits/chosen": 0.1435592621564865, "logits/rejected": 0.19035644829273224, "logps/chosen": -377.5, "logps/rejected": -394.20001220703125, "loss": 0.0748, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.03602294996380806, "rewards/margins": 4.449999809265137, "rewards/rejected": -4.414843559265137, "step": 1360 }, { "epoch": 1.1759656652360515, "grad_norm": 18.40457480438338, "learning_rate": 7.062231759656652e-07, "logits/chosen": 0.058013916015625, "logits/rejected": 0.08394165337085724, "logps/chosen": -419.25, "logps/rejected": -442.70001220703125, "loss": 0.0956, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -0.01825866661965847, "rewards/margins": 4.719531059265137, "rewards/rejected": -4.737500190734863, "step": 1370 }, { "epoch": 1.184549356223176, "grad_norm": 5.560024091357229, "learning_rate": 7.040772532188841e-07, "logits/chosen": 0.04558410495519638, "logits/rejected": 0.0255126953125, "logps/chosen": -408.3999938964844, "logps/rejected": -401.8500061035156, "loss": 0.0684, "rewards/accuracies": 0.96875, "rewards/chosen": 0.07662353664636612, "rewards/margins": 4.852343559265137, "rewards/rejected": -4.778906345367432, "step": 1380 }, { "epoch": 1.1931330472103003, "grad_norm": 25.240779158216196, "learning_rate": 7.01931330472103e-07, "logits/chosen": 0.068084716796875, "logits/rejected": 0.11025695502758026, "logps/chosen": -403.5, "logps/rejected": -423.3999938964844, "loss": 0.0852, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.14189453423023224, "rewards/margins": 4.704687595367432, "rewards/rejected": -4.84375, "step": 1390 }, { "epoch": 1.201716738197425, "grad_norm": 4.586509207761821, "learning_rate": 6.997854077253219e-07, "logits/chosen": 0.06058197095990181, "logits/rejected": 0.08283080905675888, "logps/chosen": -462.70001220703125, "logps/rejected": -473.29998779296875, "loss": 0.0769, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -0.07390137016773224, "rewards/margins": 5.32421875, "rewards/rejected": -5.396874904632568, "step": 1400 }, { "epoch": 1.2103004291845494, "grad_norm": 20.972462493982153, "learning_rate": 6.976394849785407e-07, "logits/chosen": 0.064605712890625, "logits/rejected": 0.06521148979663849, "logps/chosen": -428.70001220703125, "logps/rejected": -419.3999938964844, "loss": 0.0888, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.01762695237994194, "rewards/margins": 4.73828125, "rewards/rejected": -4.724999904632568, "step": 1410 }, { "epoch": 1.2188841201716738, "grad_norm": 27.196740021247024, "learning_rate": 6.954935622317595e-07, "logits/chosen": 0.02848205529153347, "logits/rejected": -0.0354156494140625, "logps/chosen": -365.8500061035156, "logps/rejected": -391.3500061035156, "loss": 0.1097, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -0.14265136420726776, "rewards/margins": 4.521874904632568, "rewards/rejected": -4.664843559265137, "step": 1420 }, { "epoch": 1.2274678111587982, "grad_norm": 11.619947853695509, "learning_rate": 6.933476394849786e-07, "logits/chosen": 0.01710815355181694, "logits/rejected": -0.010333252139389515, "logps/chosen": -376.5, "logps/rejected": -397.75, "loss": 0.0718, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.0483856201171875, "rewards/margins": 4.9609375, "rewards/rejected": -4.9140625, "step": 1430 }, { "epoch": 1.2360515021459229, "grad_norm": 22.47144216320928, "learning_rate": 6.912017167381974e-07, "logits/chosen": -0.04461059719324112, "logits/rejected": -0.04581298679113388, "logps/chosen": -354.1499938964844, "logps/rejected": -360.0, "loss": 0.1863, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -0.11118163913488388, "rewards/margins": 4.735937595367432, "rewards/rejected": -4.846875190734863, "step": 1440 }, { "epoch": 1.2446351931330473, "grad_norm": 9.627753508528414, "learning_rate": 6.890557939914162e-07, "logits/chosen": -0.03460693359375, "logits/rejected": -0.04718017578125, "logps/chosen": -416.1000061035156, "logps/rejected": -433.5, "loss": 0.0464, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.3897949159145355, "rewards/margins": 5.248437404632568, "rewards/rejected": -4.857812404632568, "step": 1450 }, { "epoch": 1.2532188841201717, "grad_norm": 11.67551054883096, "learning_rate": 6.869098712446351e-07, "logits/chosen": -0.0664520263671875, "logits/rejected": -0.07742615044116974, "logps/chosen": -386.6000061035156, "logps/rejected": -405.20001220703125, "loss": 0.0736, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.0044799805618822575, "rewards/margins": 4.876562595367432, "rewards/rejected": -4.881249904632568, "step": 1460 }, { "epoch": 1.261802575107296, "grad_norm": 14.00419406987426, "learning_rate": 6.84763948497854e-07, "logits/chosen": -0.07635803520679474, "logits/rejected": -0.11767272651195526, "logps/chosen": -387.3500061035156, "logps/rejected": -418.0, "loss": 0.0935, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -0.39404296875, "rewards/margins": 4.741406440734863, "rewards/rejected": -5.141406059265137, "step": 1470 }, { "epoch": 1.2703862660944205, "grad_norm": 25.263784583386318, "learning_rate": 6.82618025751073e-07, "logits/chosen": -0.08965606987476349, "logits/rejected": -0.12743225693702698, "logps/chosen": -421.54998779296875, "logps/rejected": -436.5, "loss": 0.0628, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -0.2547607421875, "rewards/margins": 5.034375190734863, "rewards/rejected": -5.291406154632568, "step": 1480 }, { "epoch": 1.2789699570815452, "grad_norm": 9.627059494742944, "learning_rate": 6.804721030042918e-07, "logits/chosen": -0.09942016750574112, "logits/rejected": -0.16571959853172302, "logps/chosen": -445.29998779296875, "logps/rejected": -429.79998779296875, "loss": 0.0832, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -1.098779320716858, "rewards/margins": 5.134375095367432, "rewards/rejected": -6.235937595367432, "step": 1490 }, { "epoch": 1.2875536480686696, "grad_norm": 22.821784466757595, "learning_rate": 6.783261802575106e-07, "logits/chosen": 0.05247802659869194, "logits/rejected": 0.02497558668255806, "logps/chosen": -410.8500061035156, "logps/rejected": -449.1000061035156, "loss": 0.0829, "rewards/accuracies": 0.9375, "rewards/chosen": -0.4922119081020355, "rewards/margins": 4.97265625, "rewards/rejected": -5.467187404632568, "step": 1500 }, { "epoch": 1.296137339055794, "grad_norm": 13.34846848397635, "learning_rate": 6.761802575107296e-07, "logits/chosen": -0.0579833984375, "logits/rejected": -0.068359375, "logps/chosen": -397.04998779296875, "logps/rejected": -394.70001220703125, "loss": 0.1027, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.17656250298023224, "rewards/margins": 4.676562309265137, "rewards/rejected": -4.853125095367432, "step": 1510 }, { "epoch": 1.3047210300429184, "grad_norm": 15.06676890158088, "learning_rate": 6.740343347639484e-07, "logits/chosen": 0.01079711876809597, "logits/rejected": -0.0038665770553052425, "logps/chosen": -405.20001220703125, "logps/rejected": -409.25, "loss": 0.0868, "rewards/accuracies": 0.9375, "rewards/chosen": -0.04116211086511612, "rewards/margins": 4.525000095367432, "rewards/rejected": -4.565625190734863, "step": 1520 }, { "epoch": 1.3133047210300428, "grad_norm": 33.07850961735917, "learning_rate": 6.718884120171674e-07, "logits/chosen": -0.022229766473174095, "logits/rejected": -0.0067657469771802425, "logps/chosen": -417.6000061035156, "logps/rejected": -433.79998779296875, "loss": 0.0801, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 0.22727051377296448, "rewards/margins": 5.064062595367432, "rewards/rejected": -4.834374904632568, "step": 1530 }, { "epoch": 1.3218884120171674, "grad_norm": 8.050932296629552, "learning_rate": 6.697424892703862e-07, "logits/chosen": -0.0936988815665245, "logits/rejected": -0.07673339545726776, "logps/chosen": -409.04998779296875, "logps/rejected": -425.8999938964844, "loss": 0.0708, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.17547607421875, "rewards/margins": 4.932812690734863, "rewards/rejected": -5.109375, "step": 1540 }, { "epoch": 1.3304721030042919, "grad_norm": 22.23202180237858, "learning_rate": 6.675965665236051e-07, "logits/chosen": -0.14187011122703552, "logits/rejected": -0.18663330376148224, "logps/chosen": -357.25, "logps/rejected": -403.1499938964844, "loss": 0.0707, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.35261231660842896, "rewards/margins": 5.181250095367432, "rewards/rejected": -5.53125, "step": 1550 }, { "epoch": 1.3390557939914163, "grad_norm": 32.371957635768844, "learning_rate": 6.65450643776824e-07, "logits/chosen": -0.14471435546875, "logits/rejected": -0.14298248291015625, "logps/chosen": -372.75, "logps/rejected": -381.70001220703125, "loss": 0.0993, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -0.693493664264679, "rewards/margins": 5.0390625, "rewards/rejected": -5.73046875, "step": 1560 }, { "epoch": 1.3476394849785407, "grad_norm": 18.323187124770577, "learning_rate": 6.633047210300428e-07, "logits/chosen": -0.07662658393383026, "logits/rejected": -0.085485078394413, "logps/chosen": -413.8999938964844, "logps/rejected": -432.3999938964844, "loss": 0.0425, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -0.13222655653953552, "rewards/margins": 5.266406059265137, "rewards/rejected": -5.3984375, "step": 1570 }, { "epoch": 1.356223175965665, "grad_norm": 5.413724696519472, "learning_rate": 6.611587982832619e-07, "logits/chosen": -0.11881103366613388, "logits/rejected": -0.09738769382238388, "logps/chosen": -415.5, "logps/rejected": -451.0, "loss": 0.0689, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.04794921725988388, "rewards/margins": 5.370312690734863, "rewards/rejected": -5.414843559265137, "step": 1580 }, { "epoch": 1.3648068669527897, "grad_norm": 11.14655624799024, "learning_rate": 6.590128755364807e-07, "logits/chosen": -0.10919189453125, "logits/rejected": -0.18187865614891052, "logps/chosen": -412.29998779296875, "logps/rejected": -435.6499938964844, "loss": 0.0807, "rewards/accuracies": 0.96875, "rewards/chosen": -0.18126221001148224, "rewards/margins": 4.99609375, "rewards/rejected": -5.1796875, "step": 1590 }, { "epoch": 1.3733905579399142, "grad_norm": 13.92810386134119, "learning_rate": 6.568669527896995e-07, "logits/chosen": 0.06620483100414276, "logits/rejected": 0.04072265699505806, "logps/chosen": -355.75, "logps/rejected": -369.79998779296875, "loss": 0.11, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -0.13283690810203552, "rewards/margins": 4.577343940734863, "rewards/rejected": -4.711718559265137, "step": 1600 }, { "epoch": 1.3819742489270386, "grad_norm": 40.000172216554255, "learning_rate": 6.547210300429184e-07, "logits/chosen": -0.033355712890625, "logits/rejected": -0.06947021186351776, "logps/chosen": -390.3500061035156, "logps/rejected": -381.0, "loss": 0.0916, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.07994995266199112, "rewards/margins": 4.751562595367432, "rewards/rejected": -4.825781345367432, "step": 1610 }, { "epoch": 1.3905579399141632, "grad_norm": 23.023209598691146, "learning_rate": 6.525751072961372e-07, "logits/chosen": -0.09736023098230362, "logits/rejected": -0.07124023139476776, "logps/chosen": -400.75, "logps/rejected": -416.3999938964844, "loss": 0.0789, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.35419923067092896, "rewards/margins": 5.014062404632568, "rewards/rejected": -5.3671875, "step": 1620 }, { "epoch": 1.3991416309012876, "grad_norm": 21.666273277728223, "learning_rate": 6.504291845493563e-07, "logits/chosen": -0.08389892429113388, "logits/rejected": -0.05601196363568306, "logps/chosen": -425.3500061035156, "logps/rejected": -423.8500061035156, "loss": 0.1263, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -0.32880860567092896, "rewards/margins": 5.077343940734863, "rewards/rejected": -5.403906345367432, "step": 1630 }, { "epoch": 1.407725321888412, "grad_norm": 29.170807549514347, "learning_rate": 6.482832618025751e-07, "logits/chosen": -0.08381958305835724, "logits/rejected": -0.08295287936925888, "logps/chosen": -338.3500061035156, "logps/rejected": -399.1000061035156, "loss": 0.115, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -0.3863525390625, "rewards/margins": 4.584374904632568, "rewards/rejected": -4.970312595367432, "step": 1640 }, { "epoch": 1.4163090128755365, "grad_norm": 13.293419481169717, "learning_rate": 6.461373390557939e-07, "logits/chosen": -0.086700439453125, "logits/rejected": -0.08922271430492401, "logps/chosen": -379.20001220703125, "logps/rejected": -401.04998779296875, "loss": 0.1085, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.22958984971046448, "rewards/margins": 4.620312690734863, "rewards/rejected": -4.849218845367432, "step": 1650 }, { "epoch": 1.4248927038626609, "grad_norm": 26.659643100252158, "learning_rate": 6.439914163090129e-07, "logits/chosen": -0.07275390625, "logits/rejected": -0.05431518703699112, "logps/chosen": -383.6000061035156, "logps/rejected": -396.6000061035156, "loss": 0.0807, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.3049560487270355, "rewards/margins": 4.771874904632568, "rewards/rejected": -4.466406345367432, "step": 1660 }, { "epoch": 1.4334763948497855, "grad_norm": 7.80907968515692, "learning_rate": 6.418454935622317e-07, "logits/chosen": -0.03206176683306694, "logits/rejected": -0.03000030480325222, "logps/chosen": -402.04998779296875, "logps/rejected": -399.04998779296875, "loss": 0.1304, "rewards/accuracies": 0.918749988079071, "rewards/chosen": 0.15439453721046448, "rewards/margins": 4.684374809265137, "rewards/rejected": -4.52734375, "step": 1670 }, { "epoch": 1.44206008583691, "grad_norm": 14.285528094505134, "learning_rate": 6.396995708154506e-07, "logits/chosen": -0.1271926909685135, "logits/rejected": -0.11528320610523224, "logps/chosen": -369.95001220703125, "logps/rejected": -390.79998779296875, "loss": 0.0968, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.06386718899011612, "rewards/margins": 4.568749904632568, "rewards/rejected": -4.504687309265137, "step": 1680 }, { "epoch": 1.4506437768240343, "grad_norm": 16.526300733389867, "learning_rate": 6.375536480686695e-07, "logits/chosen": -0.12454833835363388, "logits/rejected": -0.13391418755054474, "logps/chosen": -422.8999938964844, "logps/rejected": -445.20001220703125, "loss": 0.0611, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.115966796875, "rewards/margins": 5.079687595367432, "rewards/rejected": -4.966406345367432, "step": 1690 }, { "epoch": 1.4592274678111588, "grad_norm": 7.212727696085295, "learning_rate": 6.354077253218884e-07, "logits/chosen": -0.12458495795726776, "logits/rejected": -0.17696838080883026, "logps/chosen": -438.5, "logps/rejected": -453.29998779296875, "loss": 0.0555, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.119140625, "rewards/margins": 5.268750190734863, "rewards/rejected": -5.388281345367432, "step": 1700 }, { "epoch": 1.4678111587982832, "grad_norm": 15.483386323008265, "learning_rate": 6.332618025751073e-07, "logits/chosen": -0.16541747748851776, "logits/rejected": -0.18590088188648224, "logps/chosen": -395.1000061035156, "logps/rejected": -402.1499938964844, "loss": 0.0951, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -0.49030762910842896, "rewards/margins": 4.985937595367432, "rewards/rejected": -5.473437309265137, "step": 1710 }, { "epoch": 1.4763948497854078, "grad_norm": 11.63819523537556, "learning_rate": 6.311158798283261e-07, "logits/chosen": -0.19019775092601776, "logits/rejected": -0.22065429389476776, "logps/chosen": -394.70001220703125, "logps/rejected": -404.8999938964844, "loss": 0.0878, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.5693603754043579, "rewards/margins": 5.073437690734863, "rewards/rejected": -5.640625, "step": 1720 }, { "epoch": 1.4849785407725322, "grad_norm": 30.092407640497026, "learning_rate": 6.28969957081545e-07, "logits/chosen": -0.118408203125, "logits/rejected": -0.12757568061351776, "logps/chosen": -431.5, "logps/rejected": -470.29998779296875, "loss": 0.0556, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -0.31513673067092896, "rewards/margins": 5.119531154632568, "rewards/rejected": -5.432812690734863, "step": 1730 }, { "epoch": 1.4935622317596566, "grad_norm": 4.908568452728586, "learning_rate": 6.26824034334764e-07, "logits/chosen": 0.0069213868118822575, "logits/rejected": -0.015179443173110485, "logps/chosen": -401.79998779296875, "logps/rejected": -398.45001220703125, "loss": 0.082, "rewards/accuracies": 0.9375, "rewards/chosen": -0.38121336698532104, "rewards/margins": 4.657031059265137, "rewards/rejected": -5.035937309265137, "step": 1740 }, { "epoch": 1.5021459227467813, "grad_norm": 19.068747326336116, "learning_rate": 6.246781115879828e-07, "logits/chosen": -0.177215576171875, "logits/rejected": -0.13640746474266052, "logps/chosen": -379.5, "logps/rejected": -406.8500061035156, "loss": 0.0746, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.46240234375, "rewards/margins": 4.71484375, "rewards/rejected": -5.177343845367432, "step": 1750 }, { "epoch": 1.5107296137339055, "grad_norm": 26.264859230770163, "learning_rate": 6.225321888412017e-07, "logits/chosen": -0.11212158203125, "logits/rejected": -0.12115059047937393, "logps/chosen": -404.20001220703125, "logps/rejected": -440.3999938964844, "loss": 0.0743, "rewards/accuracies": 0.96875, "rewards/chosen": -0.06157226487994194, "rewards/margins": 5.42578125, "rewards/rejected": -5.485156059265137, "step": 1760 }, { "epoch": 1.51931330472103, "grad_norm": 10.195016818470808, "learning_rate": 6.203862660944205e-07, "logits/chosen": -0.07125244289636612, "logits/rejected": -0.08332214504480362, "logps/chosen": -403.3500061035156, "logps/rejected": -432.45001220703125, "loss": 0.0603, "rewards/accuracies": 0.96875, "rewards/chosen": -0.3261962831020355, "rewards/margins": 5.050000190734863, "rewards/rejected": -5.373437404632568, "step": 1770 }, { "epoch": 1.5278969957081545, "grad_norm": 27.22675164339287, "learning_rate": 6.182403433476395e-07, "logits/chosen": -0.063013456761837, "logits/rejected": -0.10613403469324112, "logps/chosen": -397.70001220703125, "logps/rejected": -433.1499938964844, "loss": 0.0754, "rewards/accuracies": 0.96875, "rewards/chosen": -0.760906994342804, "rewards/margins": 5.0625, "rewards/rejected": -5.826562404632568, "step": 1780 }, { "epoch": 1.536480686695279, "grad_norm": 3.2854844355406345, "learning_rate": 6.160944206008584e-07, "logits/chosen": -0.10878296196460724, "logits/rejected": -0.120635986328125, "logps/chosen": -370.79998779296875, "logps/rejected": -411.1000061035156, "loss": 0.0749, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.9701904058456421, "rewards/margins": 4.864062309265137, "rewards/rejected": -5.834374904632568, "step": 1790 }, { "epoch": 1.5450643776824036, "grad_norm": 13.591689742495568, "learning_rate": 6.139484978540772e-07, "logits/chosen": 0.08925781399011612, "logits/rejected": 0.017527008429169655, "logps/chosen": -458.0, "logps/rejected": -455.1000061035156, "loss": 0.063, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.4933105409145355, "rewards/margins": 5.661718845367432, "rewards/rejected": -6.159375190734863, "step": 1800 }, { "epoch": 1.5536480686695278, "grad_norm": 39.946241213570495, "learning_rate": 6.118025751072961e-07, "logits/chosen": -0.10375366359949112, "logits/rejected": -0.08781280368566513, "logps/chosen": -411.3999938964844, "logps/rejected": -475.70001220703125, "loss": 0.0684, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.106958031654358, "rewards/margins": 5.696093559265137, "rewards/rejected": -6.809374809265137, "step": 1810 }, { "epoch": 1.5622317596566524, "grad_norm": 54.238522553355295, "learning_rate": 6.09656652360515e-07, "logits/chosen": -0.2752685546875, "logits/rejected": -0.22305908799171448, "logps/chosen": -375.79998779296875, "logps/rejected": -417.1000061035156, "loss": 0.0743, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -2.1767578125, "rewards/margins": 5.084374904632568, "rewards/rejected": -7.260937690734863, "step": 1820 }, { "epoch": 1.5708154506437768, "grad_norm": 12.690071369843249, "learning_rate": 6.075107296137339e-07, "logits/chosen": -0.19184570014476776, "logits/rejected": -0.24849243462085724, "logps/chosen": -449.1499938964844, "logps/rejected": -458.3500061035156, "loss": 0.0778, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -1.778222680091858, "rewards/margins": 5.821875095367432, "rewards/rejected": -7.595312595367432, "step": 1830 }, { "epoch": 1.5793991416309012, "grad_norm": 17.31833812458355, "learning_rate": 6.053648068669528e-07, "logits/chosen": -0.04459228366613388, "logits/rejected": -0.03709716722369194, "logps/chosen": -409.6499938964844, "logps/rejected": -444.5, "loss": 0.0763, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -1.6431152820587158, "rewards/margins": 5.389062404632568, "rewards/rejected": -7.032812595367432, "step": 1840 }, { "epoch": 1.5879828326180259, "grad_norm": 11.245371622346708, "learning_rate": 6.032188841201716e-07, "logits/chosen": -0.27128905057907104, "logits/rejected": -0.21333464980125427, "logps/chosen": -420.29998779296875, "logps/rejected": -456.70001220703125, "loss": 0.0593, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -1.637597680091858, "rewards/margins": 5.55078125, "rewards/rejected": -7.189062595367432, "step": 1850 }, { "epoch": 1.59656652360515, "grad_norm": 19.708907456197664, "learning_rate": 6.010729613733906e-07, "logits/chosen": -0.11395873874425888, "logits/rejected": -0.205657958984375, "logps/chosen": -431.6000061035156, "logps/rejected": -437.79998779296875, "loss": 0.0501, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.175537109375, "rewards/margins": 5.409375190734863, "rewards/rejected": -6.582812309265137, "step": 1860 }, { "epoch": 1.6051502145922747, "grad_norm": 19.857698275582568, "learning_rate": 5.989270386266094e-07, "logits/chosen": -0.030029296875, "logits/rejected": -0.06689453125, "logps/chosen": -406.5, "logps/rejected": -442.0, "loss": 0.0774, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -0.8001464605331421, "rewards/margins": 5.721875190734863, "rewards/rejected": -6.515625, "step": 1870 }, { "epoch": 1.613733905579399, "grad_norm": 23.62599553865035, "learning_rate": 5.967811158798283e-07, "logits/chosen": -0.0384521484375, "logits/rejected": -0.09433593600988388, "logps/chosen": -374.8500061035156, "logps/rejected": -396.8500061035156, "loss": 0.0972, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -0.733691394329071, "rewards/margins": 5.012499809265137, "rewards/rejected": -5.74609375, "step": 1880 }, { "epoch": 1.6223175965665235, "grad_norm": 6.587236850023234, "learning_rate": 5.946351931330472e-07, "logits/chosen": -0.05234374850988388, "logits/rejected": -0.1018218994140625, "logps/chosen": -384.3999938964844, "logps/rejected": -432.8999938964844, "loss": 0.0657, "rewards/accuracies": 0.96875, "rewards/chosen": -0.30961912870407104, "rewards/margins": 5.324999809265137, "rewards/rejected": -5.630468845367432, "step": 1890 }, { "epoch": 1.6309012875536482, "grad_norm": 10.287745862798387, "learning_rate": 5.924892703862661e-07, "logits/chosen": -0.19701842963695526, "logits/rejected": -0.25560301542282104, "logps/chosen": -382.25, "logps/rejected": -399.3999938964844, "loss": 0.0772, "rewards/accuracies": 0.96875, "rewards/chosen": -0.671435534954071, "rewards/margins": 5.288281440734863, "rewards/rejected": -5.96875, "step": 1900 }, { "epoch": 1.6394849785407726, "grad_norm": 17.59476211994643, "learning_rate": 5.903433476394849e-07, "logits/chosen": -0.15318603813648224, "logits/rejected": -0.1458740234375, "logps/chosen": -423.1499938964844, "logps/rejected": -451.5, "loss": 0.0847, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.679736316204071, "rewards/margins": 5.6953125, "rewards/rejected": -6.375, "step": 1910 }, { "epoch": 1.648068669527897, "grad_norm": 23.23600525781261, "learning_rate": 5.881974248927038e-07, "logits/chosen": -0.11387024074792862, "logits/rejected": -0.20839843153953552, "logps/chosen": -385.75, "logps/rejected": -389.3999938964844, "loss": 0.1002, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -0.6735595464706421, "rewards/margins": 5.21484375, "rewards/rejected": -5.888281345367432, "step": 1920 }, { "epoch": 1.6566523605150214, "grad_norm": 7.455399199190586, "learning_rate": 5.860515021459227e-07, "logits/chosen": -0.119354248046875, "logits/rejected": -0.09841003268957138, "logps/chosen": -408.0, "logps/rejected": -438.29998779296875, "loss": 0.0737, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -0.9164794683456421, "rewards/margins": 5.3359375, "rewards/rejected": -6.249218940734863, "step": 1930 }, { "epoch": 1.6652360515021458, "grad_norm": 3.3815142079425784, "learning_rate": 5.839055793991417e-07, "logits/chosen": -0.112060546875, "logits/rejected": -0.12138061225414276, "logps/chosen": -409.3500061035156, "logps/rejected": -432.20001220703125, "loss": 0.0908, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -1.16497802734375, "rewards/margins": 5.279687404632568, "rewards/rejected": -6.440625190734863, "step": 1940 }, { "epoch": 1.6738197424892705, "grad_norm": 9.787814865591304, "learning_rate": 5.817596566523605e-07, "logits/chosen": -0.204193115234375, "logits/rejected": -0.20503386855125427, "logps/chosen": -414.29998779296875, "logps/rejected": -463.8999938964844, "loss": 0.0633, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -1.01123046875, "rewards/margins": 5.560156345367432, "rewards/rejected": -6.571875095367432, "step": 1950 }, { "epoch": 1.6824034334763949, "grad_norm": 14.972706969532501, "learning_rate": 5.796137339055793e-07, "logits/chosen": -0.24951171875, "logits/rejected": -0.21007080376148224, "logps/chosen": -375.0, "logps/rejected": -417.70001220703125, "loss": 0.0884, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -1.11865234375, "rewards/margins": 5.336718559265137, "rewards/rejected": -6.453125, "step": 1960 }, { "epoch": 1.6909871244635193, "grad_norm": 8.221620861299366, "learning_rate": 5.774678111587982e-07, "logits/chosen": -0.3475341796875, "logits/rejected": -0.3128662109375, "logps/chosen": -451.0, "logps/rejected": -472.29998779296875, "loss": 0.0803, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -1.289941430091858, "rewards/margins": 5.892968654632568, "rewards/rejected": -7.182812690734863, "step": 1970 }, { "epoch": 1.699570815450644, "grad_norm": 60.276216740292895, "learning_rate": 5.753218884120172e-07, "logits/chosen": -0.2579589784145355, "logits/rejected": -0.22457274794578552, "logps/chosen": -378.8999938964844, "logps/rejected": -424.45001220703125, "loss": 0.1039, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -1.2832763195037842, "rewards/margins": 5.296093940734863, "rewards/rejected": -6.582812309265137, "step": 1980 }, { "epoch": 1.7081545064377681, "grad_norm": 21.533834623627303, "learning_rate": 5.731759656652361e-07, "logits/chosen": -0.09178467094898224, "logits/rejected": -0.12762756645679474, "logps/chosen": -420.1499938964844, "logps/rejected": -423.8999938964844, "loss": 0.0508, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.063745141029358, "rewards/margins": 5.4453125, "rewards/rejected": -6.509375095367432, "step": 1990 }, { "epoch": 1.7167381974248928, "grad_norm": 50.12006756654514, "learning_rate": 5.710300429184549e-07, "logits/chosen": -0.151641845703125, "logits/rejected": -0.21804198622703552, "logps/chosen": -393.75, "logps/rejected": -403.5, "loss": 0.0812, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.907519519329071, "rewards/margins": 5.327343940734863, "rewards/rejected": -6.236718654632568, "step": 2000 }, { "epoch": 1.7253218884120172, "grad_norm": 15.540734033747048, "learning_rate": 5.688841201716737e-07, "logits/chosen": -0.09453125298023224, "logits/rejected": -0.178497314453125, "logps/chosen": -399.25, "logps/rejected": -417.3999938964844, "loss": 0.056, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.626953125, "rewards/margins": 5.239843845367432, "rewards/rejected": -5.8671875, "step": 2010 }, { "epoch": 1.7339055793991416, "grad_norm": 2.297137067192209, "learning_rate": 5.667381974248927e-07, "logits/chosen": -0.06398925930261612, "logits/rejected": -0.05792236328125, "logps/chosen": -417.1000061035156, "logps/rejected": -440.79998779296875, "loss": 0.0442, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.5635741949081421, "rewards/margins": 5.689062595367432, "rewards/rejected": -6.254687309265137, "step": 2020 }, { "epoch": 1.7424892703862662, "grad_norm": 15.895095034635652, "learning_rate": 5.645922746781116e-07, "logits/chosen": -0.06040344387292862, "logits/rejected": -0.09334716945886612, "logps/chosen": -418.70001220703125, "logps/rejected": -447.79998779296875, "loss": 0.0652, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.7054198980331421, "rewards/margins": 5.693749904632568, "rewards/rejected": -6.396874904632568, "step": 2030 }, { "epoch": 1.7510729613733904, "grad_norm": 15.358365003344073, "learning_rate": 5.624463519313305e-07, "logits/chosen": 0.028106689453125, "logits/rejected": 0.03260498121380806, "logps/chosen": -398.45001220703125, "logps/rejected": -437.6499938964844, "loss": 0.1203, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.505053699016571, "rewards/margins": 5.596093654632568, "rewards/rejected": -6.095312595367432, "step": 2040 }, { "epoch": 1.759656652360515, "grad_norm": 2.8136244938941903, "learning_rate": 5.603004291845493e-07, "logits/chosen": -0.07514037936925888, "logits/rejected": -0.08828125149011612, "logps/chosen": -413.6000061035156, "logps/rejected": -443.1499938964844, "loss": 0.0678, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -0.34874266386032104, "rewards/margins": 5.594531059265137, "rewards/rejected": -5.943749904632568, "step": 2050 }, { "epoch": 1.7682403433476395, "grad_norm": 7.694679923517573, "learning_rate": 5.581545064377682e-07, "logits/chosen": 0.11674804985523224, "logits/rejected": 0.07679291069507599, "logps/chosen": -407.79998779296875, "logps/rejected": -418.45001220703125, "loss": 0.0733, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -0.610888659954071, "rewards/margins": 5.159375190734863, "rewards/rejected": -5.770312309265137, "step": 2060 }, { "epoch": 1.7768240343347639, "grad_norm": 13.761625859622395, "learning_rate": 5.560085836909871e-07, "logits/chosen": -0.02241211012005806, "logits/rejected": -0.08964844048023224, "logps/chosen": -416.54998779296875, "logps/rejected": -413.8999938964844, "loss": 0.1121, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.596508800983429, "rewards/margins": 5.255468845367432, "rewards/rejected": -5.848437309265137, "step": 2070 }, { "epoch": 1.7854077253218885, "grad_norm": 10.500323351437277, "learning_rate": 5.53862660944206e-07, "logits/chosen": -0.06499633938074112, "logits/rejected": -0.13374023139476776, "logps/chosen": -432.79998779296875, "logps/rejected": -453.54998779296875, "loss": 0.0552, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.7290893793106079, "rewards/margins": 5.110156059265137, "rewards/rejected": -5.837500095367432, "step": 2080 }, { "epoch": 1.7939914163090127, "grad_norm": 12.49894941026603, "learning_rate": 5.517167381974248e-07, "logits/chosen": -0.14570312201976776, "logits/rejected": -0.16201171278953552, "logps/chosen": -429.1000061035156, "logps/rejected": -456.70001220703125, "loss": 0.0826, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.877246081829071, "rewards/margins": 5.534375190734863, "rewards/rejected": -6.412499904632568, "step": 2090 }, { "epoch": 1.8025751072961373, "grad_norm": 5.006309691374615, "learning_rate": 5.495708154506438e-07, "logits/chosen": -0.10888977348804474, "logits/rejected": -0.08519897609949112, "logps/chosen": -390.79998779296875, "logps/rejected": -417.0, "loss": 0.0946, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -0.799609363079071, "rewards/margins": 5.365624904632568, "rewards/rejected": -6.170312404632568, "step": 2100 }, { "epoch": 1.8111587982832618, "grad_norm": 6.570021578355053, "learning_rate": 5.474248927038626e-07, "logits/chosen": -0.14440612494945526, "logits/rejected": -0.191162109375, "logps/chosen": -409.20001220703125, "logps/rejected": -447.5, "loss": 0.1228, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -0.629138171672821, "rewards/margins": 5.650000095367432, "rewards/rejected": -6.28125, "step": 2110 }, { "epoch": 1.8197424892703862, "grad_norm": 26.331525348238113, "learning_rate": 5.452789699570815e-07, "logits/chosen": -0.14991454780101776, "logits/rejected": -0.1365203857421875, "logps/chosen": -376.1000061035156, "logps/rejected": -395.1000061035156, "loss": 0.0922, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.9908447265625, "rewards/margins": 5.134375095367432, "rewards/rejected": -6.1171875, "step": 2120 }, { "epoch": 1.8283261802575108, "grad_norm": 21.728813877364342, "learning_rate": 5.431330472103004e-07, "logits/chosen": -0.07867431640625, "logits/rejected": -0.030792236328125, "logps/chosen": -356.20001220703125, "logps/rejected": -399.6499938964844, "loss": 0.0899, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.6906982660293579, "rewards/margins": 4.944531440734863, "rewards/rejected": -5.635937690734863, "step": 2130 }, { "epoch": 1.8369098712446352, "grad_norm": 5.82008700759021, "learning_rate": 5.409871244635193e-07, "logits/chosen": -0.24013671278953552, "logits/rejected": -0.16271667182445526, "logps/chosen": -382.95001220703125, "logps/rejected": -422.29998779296875, "loss": 0.0509, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -0.5479980707168579, "rewards/margins": 5.303124904632568, "rewards/rejected": -5.853125095367432, "step": 2140 }, { "epoch": 1.8454935622317596, "grad_norm": 18.00087881312856, "learning_rate": 5.388412017167382e-07, "logits/chosen": -0.20957641303539276, "logits/rejected": -0.28947752714157104, "logps/chosen": -413.0, "logps/rejected": -425.20001220703125, "loss": 0.0825, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.868786633014679, "rewards/margins": 5.370312690734863, "rewards/rejected": -6.239062309265137, "step": 2150 }, { "epoch": 1.8540772532188843, "grad_norm": 13.067500326455535, "learning_rate": 5.36695278969957e-07, "logits/chosen": -0.15487059950828552, "logits/rejected": -0.16651611030101776, "logps/chosen": -421.1499938964844, "logps/rejected": -461.95001220703125, "loss": 0.0443, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.5853271484375, "rewards/margins": 5.911718845367432, "rewards/rejected": -6.490624904632568, "step": 2160 }, { "epoch": 1.8626609442060085, "grad_norm": 18.520572145965, "learning_rate": 5.34549356223176e-07, "logits/chosen": -0.09503173828125, "logits/rejected": -0.11440734565258026, "logps/chosen": -420.70001220703125, "logps/rejected": -441.79998779296875, "loss": 0.0727, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.42760008573532104, "rewards/margins": 5.913281440734863, "rewards/rejected": -6.339062690734863, "step": 2170 }, { "epoch": 1.871244635193133, "grad_norm": 43.43748351747219, "learning_rate": 5.324034334763949e-07, "logits/chosen": -0.21953125298023224, "logits/rejected": -0.2470703125, "logps/chosen": -418.8500061035156, "logps/rejected": -442.1000061035156, "loss": 0.0617, "rewards/accuracies": 0.96875, "rewards/chosen": -0.7735656499862671, "rewards/margins": 6.098437309265137, "rewards/rejected": -6.870312690734863, "step": 2180 }, { "epoch": 1.8798283261802575, "grad_norm": 48.046959107459834, "learning_rate": 5.302575107296137e-07, "logits/chosen": -0.12520141899585724, "logits/rejected": -0.13376006484031677, "logps/chosen": -416.8500061035156, "logps/rejected": -441.0, "loss": 0.0834, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.7791748046875, "rewards/margins": 5.712500095367432, "rewards/rejected": -6.491406440734863, "step": 2190 }, { "epoch": 1.888412017167382, "grad_norm": 8.355796931814565, "learning_rate": 5.281115879828326e-07, "logits/chosen": -0.12087402492761612, "logits/rejected": -0.19768676161766052, "logps/chosen": -408.0, "logps/rejected": -395.1000061035156, "loss": 0.0827, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -0.859570324420929, "rewards/margins": 5.175000190734863, "rewards/rejected": -6.034375190734863, "step": 2200 }, { "epoch": 1.8969957081545066, "grad_norm": 16.701142957985837, "learning_rate": 5.259656652360514e-07, "logits/chosen": -0.19086456298828125, "logits/rejected": -0.2282867431640625, "logps/chosen": -393.95001220703125, "logps/rejected": -422.29998779296875, "loss": 0.0706, "rewards/accuracies": 0.96875, "rewards/chosen": -0.7028564214706421, "rewards/margins": 5.626562595367432, "rewards/rejected": -6.328906059265137, "step": 2210 }, { "epoch": 1.9055793991416308, "grad_norm": 8.437029923355508, "learning_rate": 5.238197424892704e-07, "logits/chosen": -0.07515869289636612, "logits/rejected": -0.1065673828125, "logps/chosen": -379.70001220703125, "logps/rejected": -381.8999938964844, "loss": 0.0893, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -0.783935546875, "rewards/margins": 5.115624904632568, "rewards/rejected": -5.903124809265137, "step": 2220 }, { "epoch": 1.9141630901287554, "grad_norm": 23.411987756003665, "learning_rate": 5.216738197424893e-07, "logits/chosen": -0.2673301696777344, "logits/rejected": -0.28447264432907104, "logps/chosen": -413.3999938964844, "logps/rejected": -438.1000061035156, "loss": 0.0787, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.222070336341858, "rewards/margins": 5.346093654632568, "rewards/rejected": -6.568749904632568, "step": 2230 }, { "epoch": 1.9227467811158798, "grad_norm": 5.6913078139784385, "learning_rate": 5.195278969957081e-07, "logits/chosen": -0.2648071348667145, "logits/rejected": -0.23629455268383026, "logps/chosen": -396.95001220703125, "logps/rejected": -424.0, "loss": 0.0556, "rewards/accuracies": 0.96875, "rewards/chosen": -0.821582019329071, "rewards/margins": 5.579687595367432, "rewards/rejected": -6.40625, "step": 2240 }, { "epoch": 1.9313304721030042, "grad_norm": 11.664349987508563, "learning_rate": 5.173819742489271e-07, "logits/chosen": -0.23793944716453552, "logits/rejected": -0.25181883573532104, "logps/chosen": -424.3999938964844, "logps/rejected": -456.6000061035156, "loss": 0.0714, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.4892578125, "rewards/margins": 5.426562309265137, "rewards/rejected": -5.920312404632568, "step": 2250 }, { "epoch": 1.9399141630901289, "grad_norm": 15.081269634476735, "learning_rate": 5.152360515021459e-07, "logits/chosen": -0.09409179538488388, "logits/rejected": -0.10847167670726776, "logps/chosen": -429.5, "logps/rejected": -434.5, "loss": 0.0766, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.6043456792831421, "rewards/margins": 5.300000190734863, "rewards/rejected": -5.907031059265137, "step": 2260 }, { "epoch": 1.948497854077253, "grad_norm": 18.747117361662497, "learning_rate": 5.130901287553648e-07, "logits/chosen": -0.09161376953125, "logits/rejected": -0.13571777939796448, "logps/chosen": -357.20001220703125, "logps/rejected": -395.3999938964844, "loss": 0.0805, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.83154296875, "rewards/margins": 5.365624904632568, "rewards/rejected": -6.193749904632568, "step": 2270 }, { "epoch": 1.9570815450643777, "grad_norm": 21.67701131948868, "learning_rate": 5.109442060085836e-07, "logits/chosen": -0.20980720221996307, "logits/rejected": -0.22226563096046448, "logps/chosen": -418.45001220703125, "logps/rejected": -434.3999938964844, "loss": 0.0865, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -0.9582763910293579, "rewards/margins": 5.44140625, "rewards/rejected": -6.396093845367432, "step": 2280 }, { "epoch": 1.9656652360515021, "grad_norm": 16.502792433801712, "learning_rate": 5.087982832618026e-07, "logits/chosen": -0.18007202446460724, "logits/rejected": -0.21450194716453552, "logps/chosen": -416.95001220703125, "logps/rejected": -471.6499938964844, "loss": 0.0699, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -1.201562523841858, "rewards/margins": 5.658593654632568, "rewards/rejected": -6.857812404632568, "step": 2290 }, { "epoch": 1.9742489270386265, "grad_norm": 65.75016562396877, "learning_rate": 5.066523605150215e-07, "logits/chosen": -0.1854248046875, "logits/rejected": -0.14471435546875, "logps/chosen": -402.45001220703125, "logps/rejected": -444.20001220703125, "loss": 0.0927, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -1.775390625, "rewards/margins": 5.282031059265137, "rewards/rejected": -7.060156345367432, "step": 2300 }, { "epoch": 1.9828326180257512, "grad_norm": 22.158355847178623, "learning_rate": 5.045064377682403e-07, "logits/chosen": -0.393594354391098, "logits/rejected": -0.38462525606155396, "logps/chosen": -361.125, "logps/rejected": -406.04998779296875, "loss": 0.0808, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.8699219226837158, "rewards/margins": 5.4921875, "rewards/rejected": -7.3671875, "step": 2310 }, { "epoch": 1.9914163090128756, "grad_norm": 47.119015105275196, "learning_rate": 5.023605150214591e-07, "logits/chosen": -0.25578004121780396, "logits/rejected": -0.2618652284145355, "logps/chosen": -407.45001220703125, "logps/rejected": -442.1000061035156, "loss": 0.1013, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -1.858789086341858, "rewards/margins": 5.371874809265137, "rewards/rejected": -7.228125095367432, "step": 2320 }, { "epoch": 2.0, "grad_norm": 25.135748989869736, "learning_rate": 5.00214592274678e-07, "logits/chosen": -0.21175536513328552, "logits/rejected": -0.21538086235523224, "logps/chosen": -380.04998779296875, "logps/rejected": -433.8999938964844, "loss": 0.0759, "rewards/accuracies": 0.96875, "rewards/chosen": -1.454492211341858, "rewards/margins": 5.407812595367432, "rewards/rejected": -6.859375, "step": 2330 }, { "epoch": 2.0085836909871246, "grad_norm": 2.2449449153356738, "learning_rate": 4.98068669527897e-07, "logits/chosen": -0.278717041015625, "logits/rejected": -0.28974610567092896, "logps/chosen": -350.8500061035156, "logps/rejected": -373.45001220703125, "loss": 0.0478, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.8935546875, "rewards/margins": 6.423437595367432, "rewards/rejected": -7.318749904632568, "step": 2340 }, { "epoch": 2.017167381974249, "grad_norm": 1.9138994101712838, "learning_rate": 4.959227467811158e-07, "logits/chosen": -0.28847044706344604, "logits/rejected": -0.2662109434604645, "logps/chosen": -374.45001220703125, "logps/rejected": -418.70001220703125, "loss": 0.019, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -1.450555443763733, "rewards/margins": 7.157812595367432, "rewards/rejected": -8.610937118530273, "step": 2350 }, { "epoch": 2.0257510729613735, "grad_norm": 3.332320556453579, "learning_rate": 4.937768240343347e-07, "logits/chosen": -0.3636535704135895, "logits/rejected": -0.3927368223667145, "logps/chosen": -360.8500061035156, "logps/rejected": -422.5, "loss": 0.0334, "rewards/accuracies": 0.96875, "rewards/chosen": -2.479296922683716, "rewards/margins": 7.354687690734863, "rewards/rejected": -9.8359375, "step": 2360 }, { "epoch": 2.0343347639484977, "grad_norm": 1.900256610635695, "learning_rate": 4.916309012875537e-07, "logits/chosen": -0.38179320096969604, "logits/rejected": -0.4037231504917145, "logps/chosen": -396.20001220703125, "logps/rejected": -439.70001220703125, "loss": 0.0175, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -3.301953077316284, "rewards/margins": 7.581250190734863, "rewards/rejected": -10.875, "step": 2370 }, { "epoch": 2.0429184549356223, "grad_norm": 1.382235780136041, "learning_rate": 4.894849785407725e-07, "logits/chosen": -0.22792968153953552, "logits/rejected": -0.25251466035842896, "logps/chosen": -434.0, "logps/rejected": -464.70001220703125, "loss": 0.02, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.6640625, "rewards/margins": 8.475000381469727, "rewards/rejected": -11.139062881469727, "step": 2380 }, { "epoch": 2.051502145922747, "grad_norm": 0.3832037415085132, "learning_rate": 4.873390557939914e-07, "logits/chosen": -0.4705810546875, "logits/rejected": -0.4884277284145355, "logps/chosen": -461.8500061035156, "logps/rejected": -494.8999938964844, "loss": 0.0128, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.356738328933716, "rewards/margins": 7.889062404632568, "rewards/rejected": -10.237500190734863, "step": 2390 }, { "epoch": 2.060085836909871, "grad_norm": 0.15207589884002223, "learning_rate": 4.851931330472102e-07, "logits/chosen": -0.4476074278354645, "logits/rejected": -0.44117432832717896, "logps/chosen": -423.8500061035156, "logps/rejected": -474.25, "loss": 0.0147, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.074414014816284, "rewards/margins": 8.481249809265137, "rewards/rejected": -11.543749809265137, "step": 2400 }, { "epoch": 2.0686695278969958, "grad_norm": 3.7109145390758678, "learning_rate": 4.830472103004292e-07, "logits/chosen": -0.4293212890625, "logits/rejected": -0.45195311307907104, "logps/chosen": -428.8999938964844, "logps/rejected": -465.6000061035156, "loss": 0.0245, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.089062452316284, "rewards/margins": 8.315625190734863, "rewards/rejected": -11.409375190734863, "step": 2410 }, { "epoch": 2.0772532188841204, "grad_norm": 1.4432538419018173, "learning_rate": 4.809012875536481e-07, "logits/chosen": -0.43559569120407104, "logits/rejected": -0.40736085176467896, "logps/chosen": -383.25, "logps/rejected": -457.6499938964844, "loss": 0.0249, "rewards/accuracies": 0.96875, "rewards/chosen": -3.8902344703674316, "rewards/margins": 8.425000190734863, "rewards/rejected": -12.321874618530273, "step": 2420 }, { "epoch": 2.0858369098712446, "grad_norm": 3.7087733625775514, "learning_rate": 4.787553648068669e-07, "logits/chosen": -0.343017578125, "logits/rejected": -0.36962890625, "logps/chosen": -419.8999938964844, "logps/rejected": -460.5, "loss": 0.0404, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -3.714062452316284, "rewards/margins": 7.704687595367432, "rewards/rejected": -11.420312881469727, "step": 2430 }, { "epoch": 2.0944206008583692, "grad_norm": 0.8862412427695131, "learning_rate": 4.766094420600858e-07, "logits/chosen": -0.32939451932907104, "logits/rejected": -0.27421873807907104, "logps/chosen": -430.25, "logps/rejected": -485.79998779296875, "loss": 0.0198, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -3.671875, "rewards/margins": 8.453125, "rewards/rejected": -12.115625381469727, "step": 2440 }, { "epoch": 2.1030042918454934, "grad_norm": 1.9701713638241634, "learning_rate": 4.744635193133047e-07, "logits/chosen": -0.44526368379592896, "logits/rejected": -0.4786621034145355, "logps/chosen": -434.1000061035156, "logps/rejected": -469.5, "loss": 0.012, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.5374999046325684, "rewards/margins": 7.974999904632568, "rewards/rejected": -11.509374618530273, "step": 2450 }, { "epoch": 2.111587982832618, "grad_norm": 0.9656910235590569, "learning_rate": 4.723175965665236e-07, "logits/chosen": -0.20761413872241974, "logits/rejected": -0.17620849609375, "logps/chosen": -411.04998779296875, "logps/rejected": -446.8999938964844, "loss": 0.0387, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -2.885546922683716, "rewards/margins": 7.604687690734863, "rewards/rejected": -10.4921875, "step": 2460 }, { "epoch": 2.1201716738197427, "grad_norm": 0.616666865268536, "learning_rate": 4.701716738197425e-07, "logits/chosen": -0.37904053926467896, "logits/rejected": -0.3268188536167145, "logps/chosen": -472.1000061035156, "logps/rejected": -528.9000244140625, "loss": 0.0171, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -3.77734375, "rewards/margins": 8.534375190734863, "rewards/rejected": -12.3125, "step": 2470 }, { "epoch": 2.128755364806867, "grad_norm": 0.8818919593636391, "learning_rate": 4.6802575107296134e-07, "logits/chosen": -0.2685180604457855, "logits/rejected": -0.37749022245407104, "logps/chosen": -473.3999938964844, "logps/rejected": -516.2000122070312, "loss": 0.0148, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -4.180468559265137, "rewards/margins": 8.824999809265137, "rewards/rejected": -13.009374618530273, "step": 2480 }, { "epoch": 2.1373390557939915, "grad_norm": 4.812836431660748, "learning_rate": 4.658798283261802e-07, "logits/chosen": -0.41522216796875, "logits/rejected": -0.4581298828125, "logps/chosen": -440.0, "logps/rejected": -494.79998779296875, "loss": 0.0167, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -3.825000047683716, "rewards/margins": 10.029687881469727, "rewards/rejected": -13.856249809265137, "step": 2490 }, { "epoch": 2.1459227467811157, "grad_norm": 6.587250832401947, "learning_rate": 4.6373390557939914e-07, "logits/chosen": -0.20610351860523224, "logits/rejected": -0.2576049864292145, "logps/chosen": -391.3500061035156, "logps/rejected": -457.04998779296875, "loss": 0.0401, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -3.313037157058716, "rewards/margins": 8.642187118530273, "rewards/rejected": -11.956250190734863, "step": 2500 }, { "epoch": 2.1545064377682404, "grad_norm": 2.901384691072297, "learning_rate": 4.61587982832618e-07, "logits/chosen": -0.41926270723342896, "logits/rejected": -0.4696289002895355, "logps/chosen": -465.6000061035156, "logps/rejected": -503.3999938964844, "loss": 0.0137, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.5277342796325684, "rewards/margins": 8.504687309265137, "rewards/rejected": -12.034375190734863, "step": 2510 }, { "epoch": 2.163090128755365, "grad_norm": 1.2135526002401213, "learning_rate": 4.594420600858369e-07, "logits/chosen": -0.40642088651657104, "logits/rejected": -0.4490600526332855, "logps/chosen": -403.20001220703125, "logps/rejected": -482.3999938964844, "loss": 0.0276, "rewards/accuracies": 0.96875, "rewards/chosen": -3.7249999046325684, "rewards/margins": 8.692187309265137, "rewards/rejected": -12.412500381469727, "step": 2520 }, { "epoch": 2.171673819742489, "grad_norm": 1.4107618587987432, "learning_rate": 4.5729613733905576e-07, "logits/chosen": -0.33057862520217896, "logits/rejected": -0.3555053770542145, "logps/chosen": -436.1499938964844, "logps/rejected": -484.1000061035156, "loss": 0.0252, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.6058592796325684, "rewards/margins": 8.918749809265137, "rewards/rejected": -12.521875381469727, "step": 2530 }, { "epoch": 2.180257510729614, "grad_norm": 1.4873798874827557, "learning_rate": 4.5515021459227464e-07, "logits/chosen": -0.2203369140625, "logits/rejected": -0.26738280057907104, "logps/chosen": -431.3999938964844, "logps/rejected": -522.7000122070312, "loss": 0.033, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.634765625, "rewards/margins": 8.706250190734863, "rewards/rejected": -13.34375, "step": 2540 }, { "epoch": 2.188841201716738, "grad_norm": 0.09613919117259827, "learning_rate": 4.5300429184549357e-07, "logits/chosen": -0.346923828125, "logits/rejected": -0.37263184785842896, "logps/chosen": -449.70001220703125, "logps/rejected": -496.6000061035156, "loss": 0.0281, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -5.01171875, "rewards/margins": 9.137499809265137, "rewards/rejected": -14.134374618530273, "step": 2550 }, { "epoch": 2.1974248927038627, "grad_norm": 116.33759214374483, "learning_rate": 4.5085836909871244e-07, "logits/chosen": -0.4938720762729645, "logits/rejected": -0.49412840604782104, "logps/chosen": -429.75, "logps/rejected": -479.1000061035156, "loss": 0.0335, "rewards/accuracies": 0.96875, "rewards/chosen": -4.844531059265137, "rewards/margins": 9.196874618530273, "rewards/rejected": -14.034375190734863, "step": 2560 }, { "epoch": 2.2060085836909873, "grad_norm": 0.9272169068149476, "learning_rate": 4.4871244635193126e-07, "logits/chosen": -0.39813232421875, "logits/rejected": -0.4775390625, "logps/chosen": -455.6000061035156, "logps/rejected": -506.5, "loss": 0.0123, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.578906297683716, "rewards/margins": 8.715624809265137, "rewards/rejected": -12.296875, "step": 2570 }, { "epoch": 2.2145922746781115, "grad_norm": 2.554159103159828, "learning_rate": 4.465665236051502e-07, "logits/chosen": -0.37141114473342896, "logits/rejected": -0.4498291015625, "logps/chosen": -407.8500061035156, "logps/rejected": -459.1000061035156, "loss": 0.0135, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.3062500953674316, "rewards/margins": 8.2265625, "rewards/rejected": -11.528124809265137, "step": 2580 }, { "epoch": 2.223175965665236, "grad_norm": 2.0623059801149015, "learning_rate": 4.4442060085836906e-07, "logits/chosen": -0.3496337831020355, "logits/rejected": -0.34992676973342896, "logps/chosen": -443.20001220703125, "logps/rejected": -505.70001220703125, "loss": 0.0368, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -3.592968702316284, "rewards/margins": 8.4453125, "rewards/rejected": -12.034375190734863, "step": 2590 }, { "epoch": 2.2317596566523603, "grad_norm": 1.6356529998150269, "learning_rate": 4.42274678111588e-07, "logits/chosen": -0.49516600370407104, "logits/rejected": -0.527148425579071, "logps/chosen": -490.70001220703125, "logps/rejected": -508.6000061035156, "loss": 0.0037, "rewards/accuracies": 1.0, "rewards/chosen": -3.358593702316284, "rewards/margins": 9.0390625, "rewards/rejected": -12.399999618530273, "step": 2600 }, { "epoch": 2.240343347639485, "grad_norm": 3.2680729334606053, "learning_rate": 4.4012875536480687e-07, "logits/chosen": -0.36198729276657104, "logits/rejected": -0.35832518339157104, "logps/chosen": -471.3999938964844, "logps/rejected": -536.0999755859375, "loss": 0.021, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -3.84765625, "rewards/margins": 8.846875190734863, "rewards/rejected": -12.703125, "step": 2610 }, { "epoch": 2.2489270386266096, "grad_norm": 1.61851187044734, "learning_rate": 4.379828326180257e-07, "logits/chosen": -0.3167480528354645, "logits/rejected": -0.3385009765625, "logps/chosen": -417.79998779296875, "logps/rejected": -478.70001220703125, "loss": 0.0298, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -3.8179688453674316, "rewards/margins": 8.704687118530273, "rewards/rejected": -12.53125, "step": 2620 }, { "epoch": 2.257510729613734, "grad_norm": 0.6493919946924928, "learning_rate": 4.358369098712446e-07, "logits/chosen": -0.35246580839157104, "logits/rejected": -0.40156251192092896, "logps/chosen": -422.6499938964844, "logps/rejected": -476.3999938964844, "loss": 0.0328, "rewards/accuracies": 0.96875, "rewards/chosen": -3.856640577316284, "rewards/margins": 8.328125, "rewards/rejected": -12.178125381469727, "step": 2630 }, { "epoch": 2.2660944206008584, "grad_norm": 3.3772409045776177, "learning_rate": 4.336909871244635e-07, "logits/chosen": -0.4506973326206207, "logits/rejected": -0.553295910358429, "logps/chosen": -421.5, "logps/rejected": -462.79998779296875, "loss": 0.0083, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.617968797683716, "rewards/margins": 8.84375, "rewards/rejected": -12.462499618530273, "step": 2640 }, { "epoch": 2.274678111587983, "grad_norm": 9.458565515998549, "learning_rate": 4.315450643776824e-07, "logits/chosen": -0.45199108123779297, "logits/rejected": -0.4716796875, "logps/chosen": -393.6000061035156, "logps/rejected": -451.0, "loss": 0.0304, "rewards/accuracies": 0.96875, "rewards/chosen": -3.442187547683716, "rewards/margins": 8.462499618530273, "rewards/rejected": -11.909375190734863, "step": 2650 }, { "epoch": 2.2832618025751072, "grad_norm": 15.573249472155945, "learning_rate": 4.2939914163090124e-07, "logits/chosen": -0.37236326932907104, "logits/rejected": -0.4368652403354645, "logps/chosen": -443.3999938964844, "logps/rejected": -501.1000061035156, "loss": 0.0279, "rewards/accuracies": 0.96875, "rewards/chosen": -3.4800782203674316, "rewards/margins": 8.635937690734863, "rewards/rejected": -12.115625381469727, "step": 2660 }, { "epoch": 2.291845493562232, "grad_norm": 4.846110103568078, "learning_rate": 4.272532188841201e-07, "logits/chosen": -0.4035400450229645, "logits/rejected": -0.422119140625, "logps/chosen": -411.04998779296875, "logps/rejected": -495.3999938964844, "loss": 0.0196, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.78125, "rewards/margins": 8.600000381469727, "rewards/rejected": -12.390625, "step": 2670 }, { "epoch": 2.300429184549356, "grad_norm": 0.8587205885002226, "learning_rate": 4.2510729613733904e-07, "logits/chosen": -0.49238282442092896, "logits/rejected": -0.4830322265625, "logps/chosen": -411.8500061035156, "logps/rejected": -496.6000061035156, "loss": 0.0153, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -4.37890625, "rewards/margins": 8.53125, "rewards/rejected": -12.915624618530273, "step": 2680 }, { "epoch": 2.3090128755364807, "grad_norm": 0.47564827559110784, "learning_rate": 4.229613733905579e-07, "logits/chosen": -0.4992431700229645, "logits/rejected": -0.46098631620407104, "logps/chosen": -473.3500061035156, "logps/rejected": -511.8500061035156, "loss": 0.022, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -3.450000047683716, "rewards/margins": 8.71875, "rewards/rejected": -12.176562309265137, "step": 2690 }, { "epoch": 2.317596566523605, "grad_norm": 1.0107991720169627, "learning_rate": 4.2081545064377685e-07, "logits/chosen": -0.18307189643383026, "logits/rejected": -0.29273682832717896, "logps/chosen": -414.70001220703125, "logps/rejected": -462.0, "loss": 0.0359, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -3.36328125, "rewards/margins": 8.153124809265137, "rewards/rejected": -11.521875381469727, "step": 2700 }, { "epoch": 2.3261802575107295, "grad_norm": 0.5152537556563656, "learning_rate": 4.1866952789699567e-07, "logits/chosen": -0.4253173768520355, "logits/rejected": -0.442138671875, "logps/chosen": -431.3999938964844, "logps/rejected": -484.6000061035156, "loss": 0.0119, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.303515672683716, "rewards/margins": 8.484375, "rewards/rejected": -11.787500381469727, "step": 2710 }, { "epoch": 2.334763948497854, "grad_norm": 3.3000828241833964, "learning_rate": 4.1652360515021454e-07, "logits/chosen": -0.3816894590854645, "logits/rejected": -0.4306701719760895, "logps/chosen": -458.8999938964844, "logps/rejected": -472.1000061035156, "loss": 0.0124, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.969921827316284, "rewards/margins": 8.260937690734863, "rewards/rejected": -11.2265625, "step": 2720 }, { "epoch": 2.3433476394849784, "grad_norm": 0.8654611232340044, "learning_rate": 4.1437768240343347e-07, "logits/chosen": -0.355865478515625, "logits/rejected": -0.4540771543979645, "logps/chosen": -476.8999938964844, "logps/rejected": -494.0, "loss": 0.0149, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -3.2183594703674316, "rewards/margins": 8.565625190734863, "rewards/rejected": -11.778124809265137, "step": 2730 }, { "epoch": 2.351931330472103, "grad_norm": 0.993930405562353, "learning_rate": 4.1223175965665235e-07, "logits/chosen": -0.47687989473342896, "logits/rejected": -0.487548828125, "logps/chosen": -472.3999938964844, "logps/rejected": -541.2999877929688, "loss": 0.0129, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.655468702316284, "rewards/margins": 8.834375381469727, "rewards/rejected": -12.478124618530273, "step": 2740 }, { "epoch": 2.3605150214592276, "grad_norm": 10.880272550900882, "learning_rate": 4.100858369098713e-07, "logits/chosen": -0.38469237089157104, "logits/rejected": -0.4753662049770355, "logps/chosen": -468.29998779296875, "logps/rejected": -545.9000244140625, "loss": 0.0176, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -4.357031345367432, "rewards/margins": 8.895312309265137, "rewards/rejected": -13.237500190734863, "step": 2750 }, { "epoch": 2.369098712446352, "grad_norm": 7.177309173577127, "learning_rate": 4.079399141630901e-07, "logits/chosen": -0.4986816346645355, "logits/rejected": -0.537890613079071, "logps/chosen": -423.5, "logps/rejected": -490.5, "loss": 0.0205, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.0625, "rewards/margins": 9.051562309265137, "rewards/rejected": -13.103124618530273, "step": 2760 }, { "epoch": 2.3776824034334765, "grad_norm": 1.9639062234366875, "learning_rate": 4.0579399141630897e-07, "logits/chosen": -0.4094604551792145, "logits/rejected": -0.4169921875, "logps/chosen": -453.20001220703125, "logps/rejected": -558.5999755859375, "loss": 0.0261, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.942968845367432, "rewards/margins": 9.173437118530273, "rewards/rejected": -14.118749618530273, "step": 2770 }, { "epoch": 2.3862660944206007, "grad_norm": 0.46102443653984526, "learning_rate": 4.036480686695279e-07, "logits/chosen": -0.4019531309604645, "logits/rejected": -0.46400147676467896, "logps/chosen": -437.5, "logps/rejected": -491.79998779296875, "loss": 0.0412, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -3.977343797683716, "rewards/margins": 9.178125381469727, "rewards/rejected": -13.15625, "step": 2780 }, { "epoch": 2.3948497854077253, "grad_norm": 4.944029259173708, "learning_rate": 4.015021459227468e-07, "logits/chosen": -0.44111329317092896, "logits/rejected": -0.44707030057907104, "logps/chosen": -402.6000061035156, "logps/rejected": -473.3999938964844, "loss": 0.0231, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.6031250953674316, "rewards/margins": 8.692187309265137, "rewards/rejected": -12.300000190734863, "step": 2790 }, { "epoch": 2.40343347639485, "grad_norm": 1.0589467053175796, "learning_rate": 3.993562231759656e-07, "logits/chosen": -0.36149901151657104, "logits/rejected": -0.39661866426467896, "logps/chosen": -457.70001220703125, "logps/rejected": -494.70001220703125, "loss": 0.0218, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.3394532203674316, "rewards/margins": 8.931249618530273, "rewards/rejected": -12.271875381469727, "step": 2800 }, { "epoch": 2.412017167381974, "grad_norm": 0.5824139009263352, "learning_rate": 3.972103004291845e-07, "logits/chosen": -0.4072509706020355, "logits/rejected": -0.47856444120407104, "logps/chosen": -433.20001220703125, "logps/rejected": -467.1000061035156, "loss": 0.0285, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -3.30859375, "rewards/margins": 8.884374618530273, "rewards/rejected": -12.199999809265137, "step": 2810 }, { "epoch": 2.4206008583690988, "grad_norm": 2.2599903846283964, "learning_rate": 3.950643776824034e-07, "logits/chosen": -0.37919920682907104, "logits/rejected": -0.44111329317092896, "logps/chosen": -459.20001220703125, "logps/rejected": -509.8999938964844, "loss": 0.0224, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.2679686546325684, "rewards/margins": 8.779687881469727, "rewards/rejected": -12.046875, "step": 2820 }, { "epoch": 2.429184549356223, "grad_norm": 0.4634426355502626, "learning_rate": 3.9291845493562233e-07, "logits/chosen": -0.41181641817092896, "logits/rejected": -0.43842774629592896, "logps/chosen": -449.0, "logps/rejected": -520.0999755859375, "loss": 0.0116, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.010156154632568, "rewards/margins": 9.053125381469727, "rewards/rejected": -13.068750381469727, "step": 2830 }, { "epoch": 2.4377682403433476, "grad_norm": 1.321536507478587, "learning_rate": 3.907725321888412e-07, "logits/chosen": -0.34743040800094604, "logits/rejected": -0.4209136962890625, "logps/chosen": -453.70001220703125, "logps/rejected": -506.0, "loss": 0.0284, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.257031440734863, "rewards/margins": 8.96875, "rewards/rejected": -13.225000381469727, "step": 2840 }, { "epoch": 2.4463519313304722, "grad_norm": 5.710306728270681, "learning_rate": 3.8862660944206e-07, "logits/chosen": -0.267181396484375, "logits/rejected": -0.4113525450229645, "logps/chosen": -465.70001220703125, "logps/rejected": -522.5, "loss": 0.0247, "rewards/accuracies": 0.96875, "rewards/chosen": -4.1484375, "rewards/margins": 8.803125381469727, "rewards/rejected": -12.949999809265137, "step": 2850 }, { "epoch": 2.4549356223175964, "grad_norm": 7.884082087213311, "learning_rate": 3.8648068669527895e-07, "logits/chosen": -0.3802246153354645, "logits/rejected": -0.4784301817417145, "logps/chosen": -447.70001220703125, "logps/rejected": -498.1000061035156, "loss": 0.0207, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -3.900390625, "rewards/margins": 8.948437690734863, "rewards/rejected": -12.856249809265137, "step": 2860 }, { "epoch": 2.463519313304721, "grad_norm": 1.0436801672614922, "learning_rate": 3.8433476394849783e-07, "logits/chosen": -0.43657225370407104, "logits/rejected": -0.44697266817092896, "logps/chosen": -417.0, "logps/rejected": -495.70001220703125, "loss": 0.0113, "rewards/accuracies": 1.0, "rewards/chosen": -3.203906297683716, "rewards/margins": 9.3828125, "rewards/rejected": -12.59375, "step": 2870 }, { "epoch": 2.4721030042918457, "grad_norm": 1.1997040317034824, "learning_rate": 3.8218884120171675e-07, "logits/chosen": -0.42500001192092896, "logits/rejected": -0.45793455839157104, "logps/chosen": -446.0, "logps/rejected": -528.0, "loss": 0.0151, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.751953125, "rewards/margins": 8.634374618530273, "rewards/rejected": -12.387499809265137, "step": 2880 }, { "epoch": 2.48068669527897, "grad_norm": 0.5152934897792839, "learning_rate": 3.800429184549356e-07, "logits/chosen": -0.3827880918979645, "logits/rejected": -0.49749755859375, "logps/chosen": -434.79998779296875, "logps/rejected": -473.8999938964844, "loss": 0.0154, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.251953125, "rewards/margins": 8.7734375, "rewards/rejected": -12.028124809265137, "step": 2890 }, { "epoch": 2.4892703862660945, "grad_norm": 2.708485140496381, "learning_rate": 3.7789699570815445e-07, "logits/chosen": -0.2734375, "logits/rejected": -0.3224853575229645, "logps/chosen": -466.5, "logps/rejected": -535.7999877929688, "loss": 0.0341, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -3.1253905296325684, "rewards/margins": 8.696874618530273, "rewards/rejected": -11.828125, "step": 2900 }, { "epoch": 2.4978540772532187, "grad_norm": 0.8529710270105891, "learning_rate": 3.757510729613734e-07, "logits/chosen": -0.3951660096645355, "logits/rejected": -0.482666015625, "logps/chosen": -419.79998779296875, "logps/rejected": -466.79998779296875, "loss": 0.0143, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.3685545921325684, "rewards/margins": 9.112500190734863, "rewards/rejected": -12.478124618530273, "step": 2910 }, { "epoch": 2.5064377682403434, "grad_norm": 0.5253285275220013, "learning_rate": 3.7360515021459225e-07, "logits/chosen": -0.44218748807907104, "logits/rejected": -0.46235352754592896, "logps/chosen": -448.1000061035156, "logps/rejected": -479.79998779296875, "loss": 0.0357, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -3.325000047683716, "rewards/margins": 8.765625, "rewards/rejected": -12.09375, "step": 2920 }, { "epoch": 2.5150214592274676, "grad_norm": 2.2210380949282706, "learning_rate": 3.714592274678112e-07, "logits/chosen": -0.4555419981479645, "logits/rejected": -0.4444824159145355, "logps/chosen": -420.20001220703125, "logps/rejected": -483.8500061035156, "loss": 0.0206, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.3949217796325684, "rewards/margins": 8.267187118530273, "rewards/rejected": -11.653124809265137, "step": 2930 }, { "epoch": 2.523605150214592, "grad_norm": 1.0934063554230669, "learning_rate": 3.6931330472103e-07, "logits/chosen": -0.45341795682907104, "logits/rejected": -0.47331541776657104, "logps/chosen": -421.79998779296875, "logps/rejected": -495.6000061035156, "loss": 0.0244, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.2484374046325684, "rewards/margins": 8.834375381469727, "rewards/rejected": -12.081250190734863, "step": 2940 }, { "epoch": 2.532188841201717, "grad_norm": 1.8578596294486962, "learning_rate": 3.671673819742489e-07, "logits/chosen": -0.4161376953125, "logits/rejected": -0.544177234172821, "logps/chosen": -463.1000061035156, "logps/rejected": -510.5, "loss": 0.0258, "rewards/accuracies": 0.96875, "rewards/chosen": -3.271289110183716, "rewards/margins": 8.939062118530273, "rewards/rejected": -12.209375381469727, "step": 2950 }, { "epoch": 2.540772532188841, "grad_norm": 2.010502518355877, "learning_rate": 3.650214592274678e-07, "logits/chosen": -0.4856933653354645, "logits/rejected": -0.5157226324081421, "logps/chosen": -482.0, "logps/rejected": -498.79998779296875, "loss": 0.0159, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.145312547683716, "rewards/margins": 8.643750190734863, "rewards/rejected": -11.787500381469727, "step": 2960 }, { "epoch": 2.5493562231759657, "grad_norm": 2.798349701883747, "learning_rate": 3.628755364806867e-07, "logits/chosen": -0.5657714605331421, "logits/rejected": -0.625927746295929, "logps/chosen": -428.6000061035156, "logps/rejected": -492.6000061035156, "loss": 0.0123, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.0859375, "rewards/margins": 9.2109375, "rewards/rejected": -13.293749809265137, "step": 2970 }, { "epoch": 2.5579399141630903, "grad_norm": 1.3080155963280793, "learning_rate": 3.6072961373390556e-07, "logits/chosen": -0.526226818561554, "logits/rejected": -0.615527331829071, "logps/chosen": -429.70001220703125, "logps/rejected": -496.1000061035156, "loss": 0.0205, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.8031249046325684, "rewards/margins": 9.278124809265137, "rewards/rejected": -13.074999809265137, "step": 2980 }, { "epoch": 2.5665236051502145, "grad_norm": 0.3442917291217951, "learning_rate": 3.5858369098712443e-07, "logits/chosen": -0.6820312738418579, "logits/rejected": -0.6776367425918579, "logps/chosen": -447.6000061035156, "logps/rejected": -505.70001220703125, "loss": 0.0111, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.986328125, "rewards/margins": 9.104687690734863, "rewards/rejected": -13.096875190734863, "step": 2990 }, { "epoch": 2.575107296137339, "grad_norm": 0.5751679778519891, "learning_rate": 3.564377682403433e-07, "logits/chosen": -0.42460936307907104, "logits/rejected": -0.4182983338832855, "logps/chosen": -441.79998779296875, "logps/rejected": -497.6000061035156, "loss": 0.0422, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -3.3187499046325684, "rewards/margins": 8.762499809265137, "rewards/rejected": -12.084375381469727, "step": 3000 }, { "epoch": 2.5836909871244638, "grad_norm": 0.7194929972204092, "learning_rate": 3.5429184549356223e-07, "logits/chosen": -0.44716185331344604, "logits/rejected": -0.48039549589157104, "logps/chosen": -453.20001220703125, "logps/rejected": -497.70001220703125, "loss": 0.0191, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.603515625, "rewards/margins": 9.225000381469727, "rewards/rejected": -12.834375381469727, "step": 3010 }, { "epoch": 2.592274678111588, "grad_norm": 1.9751142134958461, "learning_rate": 3.521459227467811e-07, "logits/chosen": -0.5660644769668579, "logits/rejected": -0.5924316644668579, "logps/chosen": -402.0, "logps/rejected": -482.70001220703125, "loss": 0.0083, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.7906250953674316, "rewards/margins": 9.107812881469727, "rewards/rejected": -12.903124809265137, "step": 3020 }, { "epoch": 2.6008583690987126, "grad_norm": 0.76156426497839, "learning_rate": 3.5e-07, "logits/chosen": -0.563891589641571, "logits/rejected": -0.583691418170929, "logps/chosen": -433.5, "logps/rejected": -519.5, "loss": 0.0158, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.676953077316284, "rewards/margins": 8.981249809265137, "rewards/rejected": -12.65625, "step": 3030 }, { "epoch": 2.609442060085837, "grad_norm": 4.212681669902303, "learning_rate": 3.4785407725321886e-07, "logits/chosen": -0.524609386920929, "logits/rejected": -0.572558581829071, "logps/chosen": -476.29998779296875, "logps/rejected": -526.7999877929688, "loss": 0.0119, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.8724608421325684, "rewards/margins": 9.137499809265137, "rewards/rejected": -12.003125190734863, "step": 3040 }, { "epoch": 2.6180257510729614, "grad_norm": 1.7580166771918817, "learning_rate": 3.4570815450643773e-07, "logits/chosen": -0.4007812440395355, "logits/rejected": -0.4558959901332855, "logps/chosen": -446.8999938964844, "logps/rejected": -508.29998779296875, "loss": 0.0216, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.0191407203674316, "rewards/margins": 8.973437309265137, "rewards/rejected": -11.996874809265137, "step": 3050 }, { "epoch": 2.6266094420600856, "grad_norm": 0.6269088752407402, "learning_rate": 3.4356223175965666e-07, "logits/chosen": -0.2579406797885895, "logits/rejected": -0.3241210877895355, "logps/chosen": -399.25, "logps/rejected": -478.0, "loss": 0.019, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -2.9886717796325684, "rewards/margins": 9.024999618530273, "rewards/rejected": -12.015625, "step": 3060 }, { "epoch": 2.6351931330472103, "grad_norm": 1.5994991978920816, "learning_rate": 3.4141630901287554e-07, "logits/chosen": -0.472625732421875, "logits/rejected": -0.5118652582168579, "logps/chosen": -436.04998779296875, "logps/rejected": -497.3999938964844, "loss": 0.0189, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.9320311546325684, "rewards/margins": 9.421875, "rewards/rejected": -13.350000381469727, "step": 3070 }, { "epoch": 2.643776824034335, "grad_norm": 1.3737300866078248, "learning_rate": 3.392703862660944e-07, "logits/chosen": -0.3815673887729645, "logits/rejected": -0.46320801973342896, "logps/chosen": -453.29998779296875, "logps/rejected": -500.5, "loss": 0.028, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -3.8515625, "rewards/margins": 8.621874809265137, "rewards/rejected": -12.46875, "step": 3080 }, { "epoch": 2.652360515021459, "grad_norm": 0.9573365173434777, "learning_rate": 3.371244635193133e-07, "logits/chosen": -0.5018554925918579, "logits/rejected": -0.4856201112270355, "logps/chosen": -406.1000061035156, "logps/rejected": -497.70001220703125, "loss": 0.019, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.4808592796325684, "rewards/margins": 9.368749618530273, "rewards/rejected": -12.853124618530273, "step": 3090 }, { "epoch": 2.6609442060085837, "grad_norm": 0.43068947090759463, "learning_rate": 3.3497854077253216e-07, "logits/chosen": -0.652587890625, "logits/rejected": -0.730273425579071, "logps/chosen": -454.75, "logps/rejected": -529.4000244140625, "loss": 0.0055, "rewards/accuracies": 1.0, "rewards/chosen": -4.466406345367432, "rewards/margins": 9.870312690734863, "rewards/rejected": -14.337499618530273, "step": 3100 }, { "epoch": 2.6695278969957084, "grad_norm": 2.831895468929523, "learning_rate": 3.328326180257511e-07, "logits/chosen": -0.5657714605331421, "logits/rejected": -0.609179675579071, "logps/chosen": -426.95001220703125, "logps/rejected": -509.3999938964844, "loss": 0.0174, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -4.621874809265137, "rewards/margins": 9.550000190734863, "rewards/rejected": -14.178125381469727, "step": 3110 }, { "epoch": 2.6781115879828326, "grad_norm": 25.903304484538573, "learning_rate": 3.306866952789699e-07, "logits/chosen": -0.5397583246231079, "logits/rejected": -0.5998779535293579, "logps/chosen": -457.5, "logps/rejected": -523.9000244140625, "loss": 0.0156, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.912499904632568, "rewards/margins": 9.800000190734863, "rewards/rejected": -14.712499618530273, "step": 3120 }, { "epoch": 2.686695278969957, "grad_norm": 0.9312229570568605, "learning_rate": 3.2854077253218884e-07, "logits/chosen": -0.6064117550849915, "logits/rejected": -0.5582275390625, "logps/chosen": -460.20001220703125, "logps/rejected": -523.7999877929688, "loss": 0.0101, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -4.077734470367432, "rewards/margins": 9.420312881469727, "rewards/rejected": -13.503125190734863, "step": 3130 }, { "epoch": 2.6952789699570814, "grad_norm": 0.7541116498269944, "learning_rate": 3.263948497854077e-07, "logits/chosen": -0.46748048067092896, "logits/rejected": -0.5615234375, "logps/chosen": -454.1499938964844, "logps/rejected": -511.8999938964844, "loss": 0.0223, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.065234184265137, "rewards/margins": 9.285937309265137, "rewards/rejected": -13.359375, "step": 3140 }, { "epoch": 2.703862660944206, "grad_norm": 2.1365933012382454, "learning_rate": 3.242489270386266e-07, "logits/chosen": -0.5459228754043579, "logits/rejected": -0.514111340045929, "logps/chosen": -405.29998779296875, "logps/rejected": -478.3999938964844, "loss": 0.0166, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -4.135937690734863, "rewards/margins": 9.264062881469727, "rewards/rejected": -13.40625, "step": 3150 }, { "epoch": 2.71244635193133, "grad_norm": 0.46835316959419626, "learning_rate": 3.221030042918455e-07, "logits/chosen": -0.46577757596969604, "logits/rejected": -0.5217040777206421, "logps/chosen": -432.20001220703125, "logps/rejected": -508.5, "loss": 0.0165, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -4.857812404632568, "rewards/margins": 9.8828125, "rewards/rejected": -14.746874809265137, "step": 3160 }, { "epoch": 2.721030042918455, "grad_norm": 5.755515549466192, "learning_rate": 3.1995708154506434e-07, "logits/chosen": -0.4782470762729645, "logits/rejected": -0.5312255620956421, "logps/chosen": -447.8999938964844, "logps/rejected": -508.20001220703125, "loss": 0.0272, "rewards/accuracies": 0.96875, "rewards/chosen": -4.800000190734863, "rewards/margins": 10.121874809265137, "rewards/rejected": -14.909375190734863, "step": 3170 }, { "epoch": 2.7296137339055795, "grad_norm": 1.9235243032464384, "learning_rate": 3.1781115879828327e-07, "logits/chosen": -0.360107421875, "logits/rejected": -0.42155760526657104, "logps/chosen": -457.79998779296875, "logps/rejected": -524.0999755859375, "loss": 0.0324, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -5.361718654632568, "rewards/margins": 9.807812690734863, "rewards/rejected": -15.162500381469727, "step": 3180 }, { "epoch": 2.7381974248927037, "grad_norm": 9.124567299997006, "learning_rate": 3.1566523605150214e-07, "logits/chosen": -0.4493652284145355, "logits/rejected": -0.48712158203125, "logps/chosen": -417.45001220703125, "logps/rejected": -489.20001220703125, "loss": 0.0175, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.645312309265137, "rewards/margins": 9.375, "rewards/rejected": -14.015625, "step": 3190 }, { "epoch": 2.7467811158798283, "grad_norm": 3.9334978089155475, "learning_rate": 3.13519313304721e-07, "logits/chosen": -0.3644775450229645, "logits/rejected": -0.46135252714157104, "logps/chosen": -440.29998779296875, "logps/rejected": -493.8999938964844, "loss": 0.0217, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.557812690734863, "rewards/margins": 8.706250190734863, "rewards/rejected": -13.268750190734863, "step": 3200 }, { "epoch": 2.755364806866953, "grad_norm": 8.122871408770706, "learning_rate": 3.113733905579399e-07, "logits/chosen": -0.4884033203125, "logits/rejected": -0.47650146484375, "logps/chosen": -404.6499938964844, "logps/rejected": -480.29998779296875, "loss": 0.0235, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.010937690734863, "rewards/margins": 8.46875, "rewards/rejected": -12.475000381469727, "step": 3210 }, { "epoch": 2.763948497854077, "grad_norm": 0.8147846075382466, "learning_rate": 3.0922746781115877e-07, "logits/chosen": -0.5292114019393921, "logits/rejected": -0.598498523235321, "logps/chosen": -431.20001220703125, "logps/rejected": -499.1000061035156, "loss": 0.0141, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.8515625, "rewards/margins": 9.46875, "rewards/rejected": -13.324999809265137, "step": 3220 }, { "epoch": 2.772532188841202, "grad_norm": 2.476939317990392, "learning_rate": 3.0708154506437764e-07, "logits/chosen": -0.42548829317092896, "logits/rejected": -0.4642333984375, "logps/chosen": -447.20001220703125, "logps/rejected": -499.29998779296875, "loss": 0.0237, "rewards/accuracies": 0.96875, "rewards/chosen": -4.101953029632568, "rewards/margins": 9.509374618530273, "rewards/rejected": -13.615625381469727, "step": 3230 }, { "epoch": 2.7811158798283264, "grad_norm": 0.6219307185961367, "learning_rate": 3.0493562231759657e-07, "logits/chosen": -0.412078857421875, "logits/rejected": -0.4906982481479645, "logps/chosen": -416.1000061035156, "logps/rejected": -482.70001220703125, "loss": 0.0247, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.37890625, "rewards/margins": 9.573437690734863, "rewards/rejected": -13.956250190734863, "step": 3240 }, { "epoch": 2.7896995708154506, "grad_norm": 0.5075075394688136, "learning_rate": 3.0278969957081544e-07, "logits/chosen": -0.5274902582168579, "logits/rejected": -0.567272961139679, "logps/chosen": -446.6499938964844, "logps/rejected": -512.9000244140625, "loss": 0.0214, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.759375095367432, "rewards/margins": 9.453125, "rewards/rejected": -14.209375381469727, "step": 3250 }, { "epoch": 2.7982832618025753, "grad_norm": 1.574916886531938, "learning_rate": 3.006437768240343e-07, "logits/chosen": -0.5614258050918579, "logits/rejected": -0.587109386920929, "logps/chosen": -415.70001220703125, "logps/rejected": -491.70001220703125, "loss": 0.0182, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -4.572265625, "rewards/margins": 9.364062309265137, "rewards/rejected": -13.934374809265137, "step": 3260 }, { "epoch": 2.8068669527896994, "grad_norm": 4.317708261655983, "learning_rate": 2.984978540772532e-07, "logits/chosen": -0.4383789002895355, "logits/rejected": -0.4937500059604645, "logps/chosen": -478.20001220703125, "logps/rejected": -511.3999938964844, "loss": 0.033, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.365624904632568, "rewards/margins": 8.857812881469727, "rewards/rejected": -13.215624809265137, "step": 3270 }, { "epoch": 2.815450643776824, "grad_norm": 0.20142958166666902, "learning_rate": 2.9635193133047207e-07, "logits/chosen": -0.3328613340854645, "logits/rejected": -0.4281066954135895, "logps/chosen": -431.3999938964844, "logps/rejected": -477.79998779296875, "loss": 0.0175, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -4.322656154632568, "rewards/margins": 9.654687881469727, "rewards/rejected": -13.984375, "step": 3280 }, { "epoch": 2.8240343347639483, "grad_norm": 0.9247749811273913, "learning_rate": 2.94206008583691e-07, "logits/chosen": -0.429412841796875, "logits/rejected": -0.5338134765625, "logps/chosen": -448.8999938964844, "logps/rejected": -503.1000061035156, "loss": 0.0291, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.835546970367432, "rewards/margins": 9.346875190734863, "rewards/rejected": -14.184374809265137, "step": 3290 }, { "epoch": 2.832618025751073, "grad_norm": 1.3785882835147447, "learning_rate": 2.920600858369098e-07, "logits/chosen": -0.37956541776657104, "logits/rejected": -0.45050048828125, "logps/chosen": -435.29998779296875, "logps/rejected": -497.20001220703125, "loss": 0.0289, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -5.048437595367432, "rewards/margins": 9.615625381469727, "rewards/rejected": -14.668749809265137, "step": 3300 }, { "epoch": 2.8412017167381975, "grad_norm": 2.553872584236344, "learning_rate": 2.8991416309012875e-07, "logits/chosen": -0.33698731660842896, "logits/rejected": -0.3714599609375, "logps/chosen": -426.3500061035156, "logps/rejected": -502.20001220703125, "loss": 0.0355, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.91796875, "rewards/margins": 8.995312690734863, "rewards/rejected": -13.90625, "step": 3310 }, { "epoch": 2.8497854077253217, "grad_norm": 0.4637321539251045, "learning_rate": 2.877682403433476e-07, "logits/chosen": -0.467803955078125, "logits/rejected": -0.49345093965530396, "logps/chosen": -407.1000061035156, "logps/rejected": -482.3999938964844, "loss": 0.0263, "rewards/accuracies": 0.96875, "rewards/chosen": -4.953906059265137, "rewards/margins": 9.274999618530273, "rewards/rejected": -14.228124618530273, "step": 3320 }, { "epoch": 2.8583690987124464, "grad_norm": 0.9347881926644849, "learning_rate": 2.856223175965665e-07, "logits/chosen": -0.31312257051467896, "logits/rejected": -0.369415283203125, "logps/chosen": -414.3999938964844, "logps/rejected": -470.70001220703125, "loss": 0.0323, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.336718559265137, "rewards/margins": 8.659375190734863, "rewards/rejected": -12.993749618530273, "step": 3330 }, { "epoch": 2.866952789699571, "grad_norm": 1.404279236253599, "learning_rate": 2.834763948497854e-07, "logits/chosen": -0.529052734375, "logits/rejected": -0.549450695514679, "logps/chosen": -451.1000061035156, "logps/rejected": -515.0, "loss": 0.0191, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -4.735937595367432, "rewards/margins": 9.290624618530273, "rewards/rejected": -14.024999618530273, "step": 3340 }, { "epoch": 2.875536480686695, "grad_norm": 1.0605240706124524, "learning_rate": 2.8133047210300425e-07, "logits/chosen": -0.3851562440395355, "logits/rejected": -0.43896484375, "logps/chosen": -416.6000061035156, "logps/rejected": -464.6000061035156, "loss": 0.0252, "rewards/accuracies": 0.96875, "rewards/chosen": -4.084374904632568, "rewards/margins": 8.824999809265137, "rewards/rejected": -12.912500381469727, "step": 3350 }, { "epoch": 2.88412017167382, "grad_norm": 1.8749859109682558, "learning_rate": 2.791845493562232e-07, "logits/chosen": -0.31160277128219604, "logits/rejected": -0.43425291776657104, "logps/chosen": -467.20001220703125, "logps/rejected": -516.7999877929688, "loss": 0.0186, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.690625190734863, "rewards/margins": 9.501562118530273, "rewards/rejected": -14.1875, "step": 3360 }, { "epoch": 2.8927038626609445, "grad_norm": 1.0358733148234938, "learning_rate": 2.7703862660944205e-07, "logits/chosen": -0.40545654296875, "logits/rejected": -0.4769043028354645, "logps/chosen": -441.45001220703125, "logps/rejected": -514.0, "loss": 0.0259, "rewards/accuracies": 0.96875, "rewards/chosen": -4.541406154632568, "rewards/margins": 9.801562309265137, "rewards/rejected": -14.356249809265137, "step": 3370 }, { "epoch": 2.9012875536480687, "grad_norm": 6.09189081818798, "learning_rate": 2.748927038626609e-07, "logits/chosen": -0.34541016817092896, "logits/rejected": -0.38081663846969604, "logps/chosen": -489.3999938964844, "logps/rejected": -524.5999755859375, "loss": 0.0251, "rewards/accuracies": 0.96875, "rewards/chosen": -4.158398628234863, "rewards/margins": 9.5546875, "rewards/rejected": -13.709375381469727, "step": 3380 }, { "epoch": 2.909871244635193, "grad_norm": 1.09703005947529, "learning_rate": 2.7274678111587985e-07, "logits/chosen": -0.51922607421875, "logits/rejected": -0.523974597454071, "logps/chosen": -408.8500061035156, "logps/rejected": -518.7999877929688, "loss": 0.0116, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.985937595367432, "rewards/margins": 9.418749809265137, "rewards/rejected": -14.40625, "step": 3390 }, { "epoch": 2.9184549356223175, "grad_norm": 1.518058063700349, "learning_rate": 2.7060085836909867e-07, "logits/chosen": -0.41705322265625, "logits/rejected": -0.4603027403354645, "logps/chosen": -382.3999938964844, "logps/rejected": -465.0, "loss": 0.013, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.57421875, "rewards/margins": 8.935937881469727, "rewards/rejected": -13.503125190734863, "step": 3400 }, { "epoch": 2.927038626609442, "grad_norm": 1.1096420268440528, "learning_rate": 2.684549356223176e-07, "logits/chosen": -0.3531555235385895, "logits/rejected": -0.4030517637729645, "logps/chosen": -411.0, "logps/rejected": -460.29998779296875, "loss": 0.0262, "rewards/accuracies": 0.96875, "rewards/chosen": -3.768359422683716, "rewards/margins": 8.826562881469727, "rewards/rejected": -12.600000381469727, "step": 3410 }, { "epoch": 2.9356223175965663, "grad_norm": 0.43105577762238206, "learning_rate": 2.663090128755365e-07, "logits/chosen": -0.3514953553676605, "logits/rejected": -0.35112303495407104, "logps/chosen": -356.20001220703125, "logps/rejected": -465.0, "loss": 0.0248, "rewards/accuracies": 0.96875, "rewards/chosen": -4.106249809265137, "rewards/margins": 8.981249809265137, "rewards/rejected": -13.081250190734863, "step": 3420 }, { "epoch": 2.944206008583691, "grad_norm": 2.4506330738511704, "learning_rate": 2.6416309012875535e-07, "logits/chosen": -0.4135986268520355, "logits/rejected": -0.4825195372104645, "logps/chosen": -434.3500061035156, "logps/rejected": -513.7000122070312, "loss": 0.0225, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.378125190734863, "rewards/margins": 9.534375190734863, "rewards/rejected": -13.909375190734863, "step": 3430 }, { "epoch": 2.9527896995708156, "grad_norm": 11.635168893353395, "learning_rate": 2.620171673819742e-07, "logits/chosen": -0.38038331270217896, "logits/rejected": -0.47056275606155396, "logps/chosen": -459.79998779296875, "logps/rejected": -511.79998779296875, "loss": 0.0268, "rewards/accuracies": 0.96875, "rewards/chosen": -4.098437309265137, "rewards/margins": 9.5625, "rewards/rejected": -13.662500381469727, "step": 3440 }, { "epoch": 2.96137339055794, "grad_norm": 2.1745816890041363, "learning_rate": 2.598712446351931e-07, "logits/chosen": -0.4321227967739105, "logits/rejected": -0.45994871854782104, "logps/chosen": -453.20001220703125, "logps/rejected": -503.8999938964844, "loss": 0.0173, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -4.264843940734863, "rewards/margins": 8.990625381469727, "rewards/rejected": -13.259374618530273, "step": 3450 }, { "epoch": 2.9699570815450644, "grad_norm": 0.36904698512827694, "learning_rate": 2.5772532188841203e-07, "logits/chosen": -0.4138549864292145, "logits/rejected": -0.439453125, "logps/chosen": -445.70001220703125, "logps/rejected": -501.29998779296875, "loss": 0.0254, "rewards/accuracies": 0.96875, "rewards/chosen": -4.273046970367432, "rewards/margins": 8.978124618530273, "rewards/rejected": -13.262499809265137, "step": 3460 }, { "epoch": 2.978540772532189, "grad_norm": 3.2119672946159747, "learning_rate": 2.555793991416309e-07, "logits/chosen": -0.3733154237270355, "logits/rejected": -0.4508300721645355, "logps/chosen": -415.6000061035156, "logps/rejected": -463.5, "loss": 0.0302, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.003125190734863, "rewards/margins": 8.982812881469727, "rewards/rejected": -12.987500190734863, "step": 3470 }, { "epoch": 2.9871244635193133, "grad_norm": 0.8953633470721614, "learning_rate": 2.534334763948498e-07, "logits/chosen": -0.583740234375, "logits/rejected": -0.540295422077179, "logps/chosen": -493.8500061035156, "logps/rejected": -545.9000244140625, "loss": 0.0163, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -4.50390625, "rewards/margins": 8.787500381469727, "rewards/rejected": -13.293749809265137, "step": 3480 }, { "epoch": 2.995708154506438, "grad_norm": 0.7876062416998485, "learning_rate": 2.5128755364806865e-07, "logits/chosen": -0.4969238340854645, "logits/rejected": -0.5274658203125, "logps/chosen": -421.75, "logps/rejected": -485.8999938964844, "loss": 0.0286, "rewards/accuracies": 0.96875, "rewards/chosen": -4.277734279632568, "rewards/margins": 9.160937309265137, "rewards/rejected": -13.443750381469727, "step": 3490 }, { "epoch": 3.004291845493562, "grad_norm": 0.0948622947231037, "learning_rate": 2.4914163090128753e-07, "logits/chosen": -0.564282238483429, "logits/rejected": -0.5618896484375, "logps/chosen": -450.79998779296875, "logps/rejected": -526.2000122070312, "loss": 0.0096, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.632031440734863, "rewards/margins": 9.765625, "rewards/rejected": -14.396875381469727, "step": 3500 }, { "epoch": 3.0128755364806867, "grad_norm": 0.17970011620903617, "learning_rate": 2.4699570815450646e-07, "logits/chosen": -0.3944335877895355, "logits/rejected": -0.49378663301467896, "logps/chosen": -476.6000061035156, "logps/rejected": -525.7000122070312, "loss": 0.0178, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.237109184265137, "rewards/margins": 9.493749618530273, "rewards/rejected": -13.740625381469727, "step": 3510 }, { "epoch": 3.0214592274678114, "grad_norm": 0.07670303078102136, "learning_rate": 2.448497854077253e-07, "logits/chosen": -0.5210937261581421, "logits/rejected": -0.5685058832168579, "logps/chosen": -526.9000244140625, "logps/rejected": -575.0, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": -4.090624809265137, "rewards/margins": 10.139062881469727, "rewards/rejected": -14.228124618530273, "step": 3520 }, { "epoch": 3.0300429184549356, "grad_norm": 0.07386481879295975, "learning_rate": 2.427038626609442e-07, "logits/chosen": -0.36213380098342896, "logits/rejected": -0.408538818359375, "logps/chosen": -439.0, "logps/rejected": -520.4000244140625, "loss": 0.0175, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.143359184265137, "rewards/margins": 10.709375381469727, "rewards/rejected": -14.846875190734863, "step": 3530 }, { "epoch": 3.03862660944206, "grad_norm": 0.045093786421351076, "learning_rate": 2.405579399141631e-07, "logits/chosen": -0.43853759765625, "logits/rejected": -0.467254638671875, "logps/chosen": -432.04998779296875, "logps/rejected": -505.20001220703125, "loss": 0.0191, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.600781440734863, "rewards/margins": 10.170312881469727, "rewards/rejected": -14.78125, "step": 3540 }, { "epoch": 3.0472103004291844, "grad_norm": 0.11126657368588092, "learning_rate": 2.3841201716738196e-07, "logits/chosen": -0.47124022245407104, "logits/rejected": -0.546679675579071, "logps/chosen": -495.6000061035156, "logps/rejected": -549.0, "loss": 0.005, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -5.03125, "rewards/margins": 10.415624618530273, "rewards/rejected": -15.453125, "step": 3550 }, { "epoch": 3.055793991416309, "grad_norm": 0.28714398698263277, "learning_rate": 2.3626609442060086e-07, "logits/chosen": -0.3548828065395355, "logits/rejected": -0.428314208984375, "logps/chosen": -401.79998779296875, "logps/rejected": -478.29998779296875, "loss": 0.0435, "rewards/accuracies": 0.9375, "rewards/chosen": -4.577734470367432, "rewards/margins": 9.910937309265137, "rewards/rejected": -14.484375, "step": 3560 }, { "epoch": 3.0643776824034337, "grad_norm": 0.7082061254519743, "learning_rate": 2.3412017167381973e-07, "logits/chosen": -0.31062012910842896, "logits/rejected": -0.3959716856479645, "logps/chosen": -425.45001220703125, "logps/rejected": -511.70001220703125, "loss": 0.0287, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.392968654632568, "rewards/margins": 10.470312118530273, "rewards/rejected": -14.859375, "step": 3570 }, { "epoch": 3.072961373390558, "grad_norm": 0.1092044860387954, "learning_rate": 2.3197424892703863e-07, "logits/chosen": -0.4988769590854645, "logits/rejected": -0.4993042051792145, "logps/chosen": -440.0, "logps/rejected": -520.2999877929688, "loss": 0.0048, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -4.684374809265137, "rewards/margins": 10.949999809265137, "rewards/rejected": -15.634374618530273, "step": 3580 }, { "epoch": 3.0815450643776825, "grad_norm": 1.160330849568134, "learning_rate": 2.2982832618025748e-07, "logits/chosen": -0.5821288824081421, "logits/rejected": -0.6104491949081421, "logps/chosen": -428.29998779296875, "logps/rejected": -516.5999755859375, "loss": 0.0178, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -5.071093559265137, "rewards/margins": 10.462499618530273, "rewards/rejected": -15.534375190734863, "step": 3590 }, { "epoch": 3.0901287553648067, "grad_norm": 0.4533097249189143, "learning_rate": 2.2768240343347638e-07, "logits/chosen": -0.45213621854782104, "logits/rejected": -0.508593738079071, "logps/chosen": -411.8999938964844, "logps/rejected": -511.20001220703125, "loss": 0.0221, "rewards/accuracies": 0.96875, "rewards/chosen": -5.186718940734863, "rewards/margins": 10.356249809265137, "rewards/rejected": -15.534375190734863, "step": 3600 }, { "epoch": 3.0987124463519313, "grad_norm": 0.09628781018289802, "learning_rate": 2.2553648068669526e-07, "logits/chosen": -0.47136229276657104, "logits/rejected": -0.5036255121231079, "logps/chosen": -444.29998779296875, "logps/rejected": -534.7999877929688, "loss": 0.0235, "rewards/accuracies": 0.96875, "rewards/chosen": -5.492968559265137, "rewards/margins": 10.776562690734863, "rewards/rejected": -16.274999618530273, "step": 3610 }, { "epoch": 3.107296137339056, "grad_norm": 0.07492727013371783, "learning_rate": 2.2339055793991416e-07, "logits/chosen": -0.38398438692092896, "logits/rejected": -0.4569335877895355, "logps/chosen": -489.0, "logps/rejected": -547.9000244140625, "loss": 0.0304, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -5.182031154632568, "rewards/margins": 11.046875, "rewards/rejected": -16.228124618530273, "step": 3620 }, { "epoch": 3.11587982832618, "grad_norm": 0.062291150325349987, "learning_rate": 2.2124463519313306e-07, "logits/chosen": -0.39827269315719604, "logits/rejected": -0.44276124238967896, "logps/chosen": -418.5, "logps/rejected": -512.2999877929688, "loss": 0.0218, "rewards/accuracies": 0.96875, "rewards/chosen": -5.302343845367432, "rewards/margins": 11.059374809265137, "rewards/rejected": -16.365625381469727, "step": 3630 }, { "epoch": 3.124463519313305, "grad_norm": 0.14178103017659802, "learning_rate": 2.190987124463519e-07, "logits/chosen": -0.612353503704071, "logits/rejected": -0.668408215045929, "logps/chosen": -469.0, "logps/rejected": -539.9000244140625, "loss": 0.0088, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -5.657812595367432, "rewards/margins": 10.878125190734863, "rewards/rejected": -16.546875, "step": 3640 }, { "epoch": 3.133047210300429, "grad_norm": 0.22814936335505037, "learning_rate": 2.169527896995708e-07, "logits/chosen": -0.4649719297885895, "logits/rejected": -0.51934814453125, "logps/chosen": -452.70001220703125, "logps/rejected": -512.2999877929688, "loss": 0.0352, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -5.786718845367432, "rewards/margins": 10.646875381469727, "rewards/rejected": -16.4375, "step": 3650 }, { "epoch": 3.1416309012875536, "grad_norm": 0.3346740139892221, "learning_rate": 2.1480686695278969e-07, "logits/chosen": -0.41722410917282104, "logits/rejected": -0.452392578125, "logps/chosen": -454.8999938964844, "logps/rejected": -548.4000244140625, "loss": 0.0175, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -5.953906059265137, "rewards/margins": 11.100000381469727, "rewards/rejected": -17.053125381469727, "step": 3660 }, { "epoch": 3.1502145922746783, "grad_norm": 0.4736584913808494, "learning_rate": 2.1266094420600859e-07, "logits/chosen": -0.5252319574356079, "logits/rejected": -0.5882934331893921, "logps/chosen": -471.79998779296875, "logps/rejected": -527.0, "loss": 0.011, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -5.766406059265137, "rewards/margins": 10.546875, "rewards/rejected": -16.296875, "step": 3670 }, { "epoch": 3.1587982832618025, "grad_norm": 0.9000685653651161, "learning_rate": 2.1051502145922746e-07, "logits/chosen": -0.52252197265625, "logits/rejected": -0.5416259765625, "logps/chosen": -438.8999938964844, "logps/rejected": -541.9000244140625, "loss": 0.0047, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -5.587500095367432, "rewards/margins": 11.003125190734863, "rewards/rejected": -16.596874237060547, "step": 3680 }, { "epoch": 3.167381974248927, "grad_norm": 0.05787867197531949, "learning_rate": 2.0836909871244634e-07, "logits/chosen": -0.503082275390625, "logits/rejected": -0.46137696504592896, "logps/chosen": -416.5, "logps/rejected": -531.7000122070312, "loss": 0.0178, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -5.657031059265137, "rewards/margins": 10.846875190734863, "rewards/rejected": -16.512500762939453, "step": 3690 }, { "epoch": 3.1759656652360517, "grad_norm": 0.14373344246723235, "learning_rate": 2.062231759656652e-07, "logits/chosen": -0.38768309354782104, "logits/rejected": -0.46623533964157104, "logps/chosen": -461.1499938964844, "logps/rejected": -531.4000244140625, "loss": 0.0094, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.55859375, "rewards/margins": 11.043749809265137, "rewards/rejected": -15.612500190734863, "step": 3700 }, { "epoch": 3.184549356223176, "grad_norm": 0.15136078979913456, "learning_rate": 2.040772532188841e-07, "logits/chosen": -0.4318603575229645, "logits/rejected": -0.5757240056991577, "logps/chosen": -446.20001220703125, "logps/rejected": -530.9000244140625, "loss": 0.0218, "rewards/accuracies": 0.96875, "rewards/chosen": -5.8359375, "rewards/margins": 10.990625381469727, "rewards/rejected": -16.818750381469727, "step": 3710 }, { "epoch": 3.1931330472103006, "grad_norm": 0.35130774439939577, "learning_rate": 2.0193133047210301e-07, "logits/chosen": -0.45623779296875, "logits/rejected": -0.5042206048965454, "logps/chosen": -440.20001220703125, "logps/rejected": -518.7000122070312, "loss": 0.0183, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -5.513671875, "rewards/margins": 10.774999618530273, "rewards/rejected": -16.28125, "step": 3720 }, { "epoch": 3.2017167381974247, "grad_norm": 4.787749241847106, "learning_rate": 1.9978540772532186e-07, "logits/chosen": -0.47191160917282104, "logits/rejected": -0.504223644733429, "logps/chosen": -446.20001220703125, "logps/rejected": -567.4000244140625, "loss": 0.0143, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -6.192187309265137, "rewards/margins": 11.290624618530273, "rewards/rejected": -17.475000381469727, "step": 3730 }, { "epoch": 3.2103004291845494, "grad_norm": 3.3142480770400793, "learning_rate": 1.9763948497854076e-07, "logits/chosen": -0.45933228731155396, "logits/rejected": -0.4983764588832855, "logps/chosen": -415.29998779296875, "logps/rejected": -518.5999755859375, "loss": 0.018, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -5.857421875, "rewards/margins": 11.206250190734863, "rewards/rejected": -17.0625, "step": 3740 }, { "epoch": 3.218884120171674, "grad_norm": 0.09055091497172199, "learning_rate": 1.9549356223175964e-07, "logits/chosen": -0.3434081971645355, "logits/rejected": -0.396484375, "logps/chosen": -473.70001220703125, "logps/rejected": -558.0999755859375, "loss": 0.0353, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -5.6640625, "rewards/margins": 10.356249809265137, "rewards/rejected": -16.021875381469727, "step": 3750 }, { "epoch": 3.227467811158798, "grad_norm": 0.11207176821994755, "learning_rate": 1.9334763948497854e-07, "logits/chosen": -0.542773425579071, "logits/rejected": -0.612536609172821, "logps/chosen": -453.20001220703125, "logps/rejected": -534.5999755859375, "loss": 0.0133, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -5.262890815734863, "rewards/margins": 11.28125, "rewards/rejected": -16.543750762939453, "step": 3760 }, { "epoch": 3.236051502145923, "grad_norm": 0.3743344129238915, "learning_rate": 1.9120171673819742e-07, "logits/chosen": -0.46467286348342896, "logits/rejected": -0.5119384527206421, "logps/chosen": -478.6000061035156, "logps/rejected": -555.5, "loss": 0.0132, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -5.440625190734863, "rewards/margins": 11.193750381469727, "rewards/rejected": -16.640625, "step": 3770 }, { "epoch": 3.244635193133047, "grad_norm": 0.24296927538939966, "learning_rate": 1.890557939914163e-07, "logits/chosen": -0.505932629108429, "logits/rejected": -0.5726562738418579, "logps/chosen": -424.5, "logps/rejected": -512.9000244140625, "loss": 0.0089, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -6.279687404632568, "rewards/margins": 11.1796875, "rewards/rejected": -17.453125, "step": 3780 }, { "epoch": 3.2532188841201717, "grad_norm": 0.1681197769364639, "learning_rate": 1.869098712446352e-07, "logits/chosen": -0.544628918170929, "logits/rejected": -0.561474621295929, "logps/chosen": -478.0, "logps/rejected": -582.5999755859375, "loss": 0.0098, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -6.20703125, "rewards/margins": 11.5, "rewards/rejected": -17.706249237060547, "step": 3790 }, { "epoch": 3.2618025751072963, "grad_norm": 0.39818592783561185, "learning_rate": 1.8476394849785407e-07, "logits/chosen": -0.3221069276332855, "logits/rejected": -0.4893554747104645, "logps/chosen": -490.8999938964844, "logps/rejected": -562.4000244140625, "loss": 0.0131, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -5.591406345367432, "rewards/margins": 11.409375190734863, "rewards/rejected": -17.003124237060547, "step": 3800 }, { "epoch": 3.2703862660944205, "grad_norm": 0.8823866789831107, "learning_rate": 1.8261802575107297e-07, "logits/chosen": -0.4580322206020355, "logits/rejected": -0.5227295160293579, "logps/chosen": -468.70001220703125, "logps/rejected": -547.7000122070312, "loss": 0.0217, "rewards/accuracies": 0.96875, "rewards/chosen": -5.411718845367432, "rewards/margins": 11.078125, "rewards/rejected": -16.490625381469727, "step": 3810 }, { "epoch": 3.278969957081545, "grad_norm": 0.09959753132764793, "learning_rate": 1.8047210300429184e-07, "logits/chosen": -0.481393426656723, "logits/rejected": -0.5412842035293579, "logps/chosen": -492.79998779296875, "logps/rejected": -574.2999877929688, "loss": 0.0089, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -5.767968654632568, "rewards/margins": 11.846875190734863, "rewards/rejected": -17.621875762939453, "step": 3820 }, { "epoch": 3.2875536480686693, "grad_norm": 0.8472346712883431, "learning_rate": 1.7832618025751072e-07, "logits/chosen": -0.4970703125, "logits/rejected": -0.5432373285293579, "logps/chosen": -444.0, "logps/rejected": -482.1000061035156, "loss": 0.0176, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -6.399218559265137, "rewards/margins": 10.234375, "rewards/rejected": -16.621875762939453, "step": 3830 }, { "epoch": 3.296137339055794, "grad_norm": 0.20256682647574872, "learning_rate": 1.761802575107296e-07, "logits/chosen": -0.594433605670929, "logits/rejected": -0.6988281011581421, "logps/chosen": -467.5, "logps/rejected": -548.4000244140625, "loss": 0.0046, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -6.533593654632568, "rewards/margins": 11.206250190734863, "rewards/rejected": -17.746875762939453, "step": 3840 }, { "epoch": 3.3047210300429186, "grad_norm": 0.06399927514667011, "learning_rate": 1.740343347639485e-07, "logits/chosen": -0.37849122285842896, "logits/rejected": -0.4180664122104645, "logps/chosen": -418.20001220703125, "logps/rejected": -527.0999755859375, "loss": 0.0176, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -5.40234375, "rewards/margins": 11.0625, "rewards/rejected": -16.462499618530273, "step": 3850 }, { "epoch": 3.313304721030043, "grad_norm": 0.09034928107520022, "learning_rate": 1.7188841201716737e-07, "logits/chosen": -0.6256347894668579, "logits/rejected": -0.650146484375, "logps/chosen": -413.1000061035156, "logps/rejected": -503.79998779296875, "loss": 0.0048, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -5.874218940734863, "rewards/margins": 11.081250190734863, "rewards/rejected": -16.953125, "step": 3860 }, { "epoch": 3.3218884120171674, "grad_norm": 0.16338419534501564, "learning_rate": 1.6974248927038627e-07, "logits/chosen": -0.602124035358429, "logits/rejected": -0.555468738079071, "logps/chosen": -440.8999938964844, "logps/rejected": -501.29998779296875, "loss": 0.0089, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -5.609375, "rewards/margins": 10.9375, "rewards/rejected": -16.549999237060547, "step": 3870 }, { "epoch": 3.3304721030042916, "grad_norm": 0.9818630189181746, "learning_rate": 1.6759656652360514e-07, "logits/chosen": -0.4006103575229645, "logits/rejected": -0.42333984375, "logps/chosen": -450.79998779296875, "logps/rejected": -515.7999877929688, "loss": 0.0306, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -5.349218845367432, "rewards/margins": 10.493749618530273, "rewards/rejected": -15.834375381469727, "step": 3880 }, { "epoch": 3.3390557939914163, "grad_norm": 0.15162294246549157, "learning_rate": 1.6545064377682402e-07, "logits/chosen": -0.5174316167831421, "logits/rejected": -0.5606933832168579, "logps/chosen": -467.8999938964844, "logps/rejected": -538.0999755859375, "loss": 0.0093, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -5.903906345367432, "rewards/margins": 11.290624618530273, "rewards/rejected": -17.196874618530273, "step": 3890 }, { "epoch": 3.347639484978541, "grad_norm": 0.2641681972306145, "learning_rate": 1.6330472103004292e-07, "logits/chosen": -0.49024659395217896, "logits/rejected": -0.5806640386581421, "logps/chosen": -435.8999938964844, "logps/rejected": -523.2000122070312, "loss": 0.0396, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -5.399218559265137, "rewards/margins": 10.378125190734863, "rewards/rejected": -15.768750190734863, "step": 3900 }, { "epoch": 3.356223175965665, "grad_norm": 0.10636536565409023, "learning_rate": 1.611587982832618e-07, "logits/chosen": -0.5033203363418579, "logits/rejected": -0.53955078125, "logps/chosen": -443.5, "logps/rejected": -532.4000244140625, "loss": 0.0176, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -5.591406345367432, "rewards/margins": 11.065625190734863, "rewards/rejected": -16.662500381469727, "step": 3910 }, { "epoch": 3.3648068669527897, "grad_norm": 0.023061016231088503, "learning_rate": 1.5901287553648067e-07, "logits/chosen": -0.43548583984375, "logits/rejected": -0.4675048887729645, "logps/chosen": -437.29998779296875, "logps/rejected": -540.0999755859375, "loss": 0.0326, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -5.4140625, "rewards/margins": 10.934374809265137, "rewards/rejected": -16.34375, "step": 3920 }, { "epoch": 3.3733905579399144, "grad_norm": 0.0608932439089705, "learning_rate": 1.5686695278969955e-07, "logits/chosen": -0.4179931581020355, "logits/rejected": -0.563061535358429, "logps/chosen": -471.70001220703125, "logps/rejected": -506.1000061035156, "loss": 0.0219, "rewards/accuracies": 0.96875, "rewards/chosen": -4.826952934265137, "rewards/margins": 11.515625, "rewards/rejected": -16.353124618530273, "step": 3930 }, { "epoch": 3.3819742489270386, "grad_norm": 0.2787552503024038, "learning_rate": 1.5472103004291845e-07, "logits/chosen": -0.515429675579071, "logits/rejected": -0.5873779058456421, "logps/chosen": -454.0, "logps/rejected": -527.4000244140625, "loss": 0.0178, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -5.682031154632568, "rewards/margins": 10.934374809265137, "rewards/rejected": -16.615625381469727, "step": 3940 }, { "epoch": 3.390557939914163, "grad_norm": 0.04864564229643884, "learning_rate": 1.5257510729613735e-07, "logits/chosen": -0.46180421113967896, "logits/rejected": -0.49028319120407104, "logps/chosen": -422.29998779296875, "logps/rejected": -507.20001220703125, "loss": 0.0306, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -5.482812404632568, "rewards/margins": 10.956250190734863, "rewards/rejected": -16.431249618530273, "step": 3950 }, { "epoch": 3.3991416309012874, "grad_norm": 0.0542952709626364, "learning_rate": 1.5042918454935622e-07, "logits/chosen": -0.6314452886581421, "logits/rejected": -0.615039050579071, "logps/chosen": -453.6000061035156, "logps/rejected": -548.9000244140625, "loss": 0.0186, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -5.657031059265137, "rewards/margins": 10.981249809265137, "rewards/rejected": -16.634374618530273, "step": 3960 }, { "epoch": 3.407725321888412, "grad_norm": 0.10240672775874922, "learning_rate": 1.482832618025751e-07, "logits/chosen": -0.6039062738418579, "logits/rejected": -0.5972350835800171, "logps/chosen": -440.0, "logps/rejected": -535.4000244140625, "loss": 0.0137, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -5.745312690734863, "rewards/margins": 11.165624618530273, "rewards/rejected": -16.909374237060547, "step": 3970 }, { "epoch": 3.4163090128755362, "grad_norm": 0.28893780394866825, "learning_rate": 1.4613733905579397e-07, "logits/chosen": -0.4552246034145355, "logits/rejected": -0.5011230707168579, "logps/chosen": -437.0, "logps/rejected": -494.1000061035156, "loss": 0.0231, "rewards/accuracies": 0.96875, "rewards/chosen": -5.875781059265137, "rewards/margins": 10.337499618530273, "rewards/rejected": -16.209375381469727, "step": 3980 }, { "epoch": 3.424892703862661, "grad_norm": 0.51486595466649, "learning_rate": 1.4399141630901287e-07, "logits/chosen": -0.571972668170929, "logits/rejected": -0.613037109375, "logps/chosen": -476.29998779296875, "logps/rejected": -548.2999877929688, "loss": 0.0224, "rewards/accuracies": 0.96875, "rewards/chosen": -6.189062595367432, "rewards/margins": 11.028124809265137, "rewards/rejected": -17.209375381469727, "step": 3990 }, { "epoch": 3.4334763948497855, "grad_norm": 0.07432262250676909, "learning_rate": 1.4184549356223175e-07, "logits/chosen": -0.4578857421875, "logits/rejected": -0.556591808795929, "logps/chosen": -446.8999938964844, "logps/rejected": -507.70001220703125, "loss": 0.0218, "rewards/accuracies": 0.96875, "rewards/chosen": -5.31640625, "rewards/margins": 10.8203125, "rewards/rejected": -16.118749618530273, "step": 4000 }, { "epoch": 3.4420600858369097, "grad_norm": 0.5086195952510676, "learning_rate": 1.3969957081545065e-07, "logits/chosen": -0.43292236328125, "logits/rejected": -0.513806164264679, "logps/chosen": -455.1000061035156, "logps/rejected": -519.2999877929688, "loss": 0.0129, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -5.415625095367432, "rewards/margins": 10.678125381469727, "rewards/rejected": -16.090625762939453, "step": 4010 }, { "epoch": 3.4506437768240343, "grad_norm": 0.6811995903287839, "learning_rate": 1.375536480686695e-07, "logits/chosen": -0.523388683795929, "logits/rejected": -0.5645751953125, "logps/chosen": -467.8999938964844, "logps/rejected": -552.5, "loss": 0.0304, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -5.944531440734863, "rewards/margins": 11.287500381469727, "rewards/rejected": -17.243749618530273, "step": 4020 }, { "epoch": 3.459227467811159, "grad_norm": 0.3279705978890854, "learning_rate": 1.354077253218884e-07, "logits/chosen": -0.5240844488143921, "logits/rejected": -0.536102294921875, "logps/chosen": -442.5, "logps/rejected": -537.5999755859375, "loss": 0.0131, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -5.548437595367432, "rewards/margins": 11.256250381469727, "rewards/rejected": -16.806249618530273, "step": 4030 }, { "epoch": 3.467811158798283, "grad_norm": 0.08655858183539146, "learning_rate": 1.332618025751073e-07, "logits/chosen": -0.617480456829071, "logits/rejected": -0.6874023675918579, "logps/chosen": -460.29998779296875, "logps/rejected": -524.0999755859375, "loss": 0.0053, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -5.44140625, "rewards/margins": 10.865625381469727, "rewards/rejected": -16.309375762939453, "step": 4040 }, { "epoch": 3.476394849785408, "grad_norm": 0.38356227853979946, "learning_rate": 1.3111587982832618e-07, "logits/chosen": -0.529589831829071, "logits/rejected": -0.5992676019668579, "logps/chosen": -422.1000061035156, "logps/rejected": -475.70001220703125, "loss": 0.0268, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -5.106249809265137, "rewards/margins": 11.1875, "rewards/rejected": -16.28125, "step": 4050 }, { "epoch": 3.484978540772532, "grad_norm": 0.029307963277662337, "learning_rate": 1.2896995708154508e-07, "logits/chosen": -0.4181152284145355, "logits/rejected": -0.47149658203125, "logps/chosen": -462.8999938964844, "logps/rejected": -524.2999877929688, "loss": 0.0175, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -5.51953125, "rewards/margins": 10.9375, "rewards/rejected": -16.450000762939453, "step": 4060 }, { "epoch": 3.4935622317596566, "grad_norm": 0.2999928836736653, "learning_rate": 1.2682403433476393e-07, "logits/chosen": -0.546093761920929, "logits/rejected": -0.631152331829071, "logps/chosen": -441.3999938964844, "logps/rejected": -548.5999755859375, "loss": 0.0222, "rewards/accuracies": 0.96875, "rewards/chosen": -5.455468654632568, "rewards/margins": 10.887499809265137, "rewards/rejected": -16.346874237060547, "step": 4070 }, { "epoch": 3.5021459227467813, "grad_norm": 0.32005738664924965, "learning_rate": 1.2467811158798283e-07, "logits/chosen": -0.46868896484375, "logits/rejected": -0.543591320514679, "logps/chosen": -478.79998779296875, "logps/rejected": -538.5999755859375, "loss": 0.0225, "rewards/accuracies": 0.96875, "rewards/chosen": -5.30078125, "rewards/margins": 10.524999618530273, "rewards/rejected": -15.821874618530273, "step": 4080 }, { "epoch": 3.5107296137339055, "grad_norm": 0.40621253454325496, "learning_rate": 1.225321888412017e-07, "logits/chosen": -0.585205078125, "logits/rejected": -0.6421264410018921, "logps/chosen": -481.54998779296875, "logps/rejected": -535.5999755859375, "loss": 0.0054, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -5.811718940734863, "rewards/margins": 11.168749809265137, "rewards/rejected": -16.984375, "step": 4090 }, { "epoch": 3.51931330472103, "grad_norm": 0.12659878902602759, "learning_rate": 1.203862660944206e-07, "logits/chosen": -0.3822265565395355, "logits/rejected": -0.4558349549770355, "logps/chosen": -453.29998779296875, "logps/rejected": -541.0999755859375, "loss": 0.0089, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -6.035937309265137, "rewards/margins": 10.890625, "rewards/rejected": -16.928125381469727, "step": 4100 }, { "epoch": 3.5278969957081543, "grad_norm": 0.14057779514347973, "learning_rate": 1.1824034334763948e-07, "logits/chosen": -0.6291259527206421, "logits/rejected": -0.6538604497909546, "logps/chosen": -427.1000061035156, "logps/rejected": -536.7999877929688, "loss": 0.0091, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -6.030468940734863, "rewards/margins": 11.853124618530273, "rewards/rejected": -17.875, "step": 4110 }, { "epoch": 3.536480686695279, "grad_norm": 0.3494906375416725, "learning_rate": 1.1609442060085837e-07, "logits/chosen": -0.6259765625, "logits/rejected": -0.618847668170929, "logps/chosen": -427.29998779296875, "logps/rejected": -515.0, "loss": 0.0047, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -5.80078125, "rewards/margins": 11.274999618530273, "rewards/rejected": -17.084375381469727, "step": 4120 }, { "epoch": 3.5450643776824036, "grad_norm": 0.05830464410520989, "learning_rate": 1.1394849785407724e-07, "logits/chosen": -0.5318206548690796, "logits/rejected": -0.511462390422821, "logps/chosen": -493.29998779296875, "logps/rejected": -574.0999755859375, "loss": 0.0309, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -5.948437690734863, "rewards/margins": 10.9921875, "rewards/rejected": -16.940624237060547, "step": 4130 }, { "epoch": 3.5536480686695278, "grad_norm": 0.03733778880080496, "learning_rate": 1.1180257510729613e-07, "logits/chosen": -0.5648193359375, "logits/rejected": -0.578991711139679, "logps/chosen": -452.8500061035156, "logps/rejected": -560.5, "loss": 0.0093, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -5.825781345367432, "rewards/margins": 11.809374809265137, "rewards/rejected": -17.621875762939453, "step": 4140 }, { "epoch": 3.5622317596566524, "grad_norm": 0.1441398784469748, "learning_rate": 1.0965665236051502e-07, "logits/chosen": -0.4588623046875, "logits/rejected": -0.523974597454071, "logps/chosen": -418.0, "logps/rejected": -510.1000061035156, "loss": 0.023, "rewards/accuracies": 0.96875, "rewards/chosen": -5.213281154632568, "rewards/margins": 10.793749809265137, "rewards/rejected": -16.009374618530273, "step": 4150 }, { "epoch": 3.570815450643777, "grad_norm": 0.15071763414463477, "learning_rate": 1.0751072961373391e-07, "logits/chosen": -0.5269531011581421, "logits/rejected": -0.552685558795929, "logps/chosen": -434.1000061035156, "logps/rejected": -503.3999938964844, "loss": 0.0306, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -5.608593940734863, "rewards/margins": 10.675000190734863, "rewards/rejected": -16.284374237060547, "step": 4160 }, { "epoch": 3.5793991416309012, "grad_norm": 0.6285888816527, "learning_rate": 1.0536480686695278e-07, "logits/chosen": -0.6355956792831421, "logits/rejected": -0.6544433832168579, "logps/chosen": -475.6000061035156, "logps/rejected": -578.7999877929688, "loss": 0.0089, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -5.453125, "rewards/margins": 11.28125, "rewards/rejected": -16.740625381469727, "step": 4170 }, { "epoch": 3.587982832618026, "grad_norm": 0.038720886164161424, "learning_rate": 1.0321888412017167e-07, "logits/chosen": -0.4979003965854645, "logits/rejected": -0.5230468511581421, "logps/chosen": -449.6000061035156, "logps/rejected": -521.5, "loss": 0.0177, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -5.59375, "rewards/margins": 10.634374618530273, "rewards/rejected": -16.228124618530273, "step": 4180 }, { "epoch": 3.59656652360515, "grad_norm": 0.1638289855858865, "learning_rate": 1.0107296137339056e-07, "logits/chosen": -0.504833996295929, "logits/rejected": -0.507110595703125, "logps/chosen": -429.8999938964844, "logps/rejected": -539.5999755859375, "loss": 0.0236, "rewards/accuracies": 0.96875, "rewards/chosen": -5.196093559265137, "rewards/margins": 10.596875190734863, "rewards/rejected": -15.790624618530273, "step": 4190 }, { "epoch": 3.6051502145922747, "grad_norm": 0.28205567897830586, "learning_rate": 9.892703862660943e-08, "logits/chosen": -0.511523425579071, "logits/rejected": -0.609301745891571, "logps/chosen": -478.8999938964844, "logps/rejected": -532.4000244140625, "loss": 0.025, "rewards/accuracies": 0.96875, "rewards/chosen": -5.978125095367432, "rewards/margins": 10.40625, "rewards/rejected": -16.390625, "step": 4200 }, { "epoch": 3.613733905579399, "grad_norm": 0.07871876049466232, "learning_rate": 9.678111587982832e-08, "logits/chosen": -0.4658203125, "logits/rejected": -0.529345691204071, "logps/chosen": -449.20001220703125, "logps/rejected": -529.2999877929688, "loss": 0.0288, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -5.499218940734863, "rewards/margins": 10.631250381469727, "rewards/rejected": -16.125, "step": 4210 }, { "epoch": 3.6223175965665235, "grad_norm": 0.1304300640847237, "learning_rate": 9.46351931330472e-08, "logits/chosen": -0.515948474407196, "logits/rejected": -0.56982421875, "logps/chosen": -462.0, "logps/rejected": -541.4000244140625, "loss": 0.0135, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -5.667187690734863, "rewards/margins": 10.309374809265137, "rewards/rejected": -15.971875190734863, "step": 4220 }, { "epoch": 3.630901287553648, "grad_norm": 0.0855365534556271, "learning_rate": 9.24892703862661e-08, "logits/chosen": -0.347076416015625, "logits/rejected": -0.38670653104782104, "logps/chosen": -456.8999938964844, "logps/rejected": -544.4000244140625, "loss": 0.0311, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -5.499218940734863, "rewards/margins": 10.899999618530273, "rewards/rejected": -16.396875381469727, "step": 4230 }, { "epoch": 3.6394849785407724, "grad_norm": 0.032786261392949365, "learning_rate": 9.034334763948499e-08, "logits/chosen": -0.513671875, "logits/rejected": -0.542651355266571, "logps/chosen": -466.6000061035156, "logps/rejected": -549.5, "loss": 0.0088, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -6.111718654632568, "rewards/margins": 11.540624618530273, "rewards/rejected": -17.634374618530273, "step": 4240 }, { "epoch": 3.648068669527897, "grad_norm": 0.28212470240775284, "learning_rate": 8.819742489270386e-08, "logits/chosen": -0.5195678472518921, "logits/rejected": -0.560382068157196, "logps/chosen": -458.29998779296875, "logps/rejected": -524.4000244140625, "loss": 0.0053, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -5.817968845367432, "rewards/margins": 11.009374618530273, "rewards/rejected": -16.834375381469727, "step": 4250 }, { "epoch": 3.6566523605150216, "grad_norm": 0.6350183894320915, "learning_rate": 8.605150214592275e-08, "logits/chosen": -0.5663086175918579, "logits/rejected": -0.6140381097793579, "logps/chosen": -443.0, "logps/rejected": -544.7999877929688, "loss": 0.0175, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -6.208593845367432, "rewards/margins": 11.096875190734863, "rewards/rejected": -17.3125, "step": 4260 }, { "epoch": 3.665236051502146, "grad_norm": 0.3212409077886702, "learning_rate": 8.390557939914162e-08, "logits/chosen": -0.474365234375, "logits/rejected": -0.572827160358429, "logps/chosen": -507.8999938964844, "logps/rejected": -555.2000122070312, "loss": 0.0089, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -5.797656059265137, "rewards/margins": 11.306249618530273, "rewards/rejected": -17.090625762939453, "step": 4270 }, { "epoch": 3.6738197424892705, "grad_norm": 0.08344276838529266, "learning_rate": 8.175965665236051e-08, "logits/chosen": -0.4314331114292145, "logits/rejected": -0.4405273497104645, "logps/chosen": -440.79998779296875, "logps/rejected": -531.5, "loss": 0.0147, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -5.926562309265137, "rewards/margins": 10.573437690734863, "rewards/rejected": -16.496875762939453, "step": 4280 }, { "epoch": 3.682403433476395, "grad_norm": 0.2516445998856356, "learning_rate": 7.961373390557939e-08, "logits/chosen": -0.46809083223342896, "logits/rejected": -0.47987061738967896, "logps/chosen": -444.0, "logps/rejected": -517.0, "loss": 0.0311, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -5.275000095367432, "rewards/margins": 11.189062118530273, "rewards/rejected": -16.450000762939453, "step": 4290 }, { "epoch": 3.6909871244635193, "grad_norm": 0.15208871712690586, "learning_rate": 7.746781115879827e-08, "logits/chosen": -0.4883056581020355, "logits/rejected": -0.55767822265625, "logps/chosen": -462.70001220703125, "logps/rejected": -543.7000122070312, "loss": 0.0134, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -5.196875095367432, "rewards/margins": 11.115625381469727, "rewards/rejected": -16.309375762939453, "step": 4300 }, { "epoch": 3.699570815450644, "grad_norm": 6.50184216288345, "learning_rate": 7.532188841201718e-08, "logits/chosen": -0.544873058795929, "logits/rejected": -0.5732666254043579, "logps/chosen": -412.79998779296875, "logps/rejected": -503.0, "loss": 0.014, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -5.762499809265137, "rewards/margins": 11.134374618530273, "rewards/rejected": -16.890625, "step": 4310 }, { "epoch": 3.708154506437768, "grad_norm": 0.5078452547401664, "learning_rate": 7.317596566523605e-08, "logits/chosen": -0.489501953125, "logits/rejected": -0.6026366949081421, "logps/chosen": -461.20001220703125, "logps/rejected": -529.2999877929688, "loss": 0.0145, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -5.846875190734863, "rewards/margins": 11.337499618530273, "rewards/rejected": -17.168750762939453, "step": 4320 }, { "epoch": 3.7167381974248928, "grad_norm": 0.28679833761903295, "learning_rate": 7.103004291845494e-08, "logits/chosen": -0.35820311307907104, "logits/rejected": -0.3603271543979645, "logps/chosen": -386.1000061035156, "logps/rejected": -481.79998779296875, "loss": 0.035, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -5.62109375, "rewards/margins": 10.185937881469727, "rewards/rejected": -15.818750381469727, "step": 4330 }, { "epoch": 3.725321888412017, "grad_norm": 0.03332982022598832, "learning_rate": 6.888412017167381e-08, "logits/chosen": -0.622119128704071, "logits/rejected": -0.6522461175918579, "logps/chosen": -465.6000061035156, "logps/rejected": -542.7999877929688, "loss": 0.0183, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -6.034375190734863, "rewards/margins": 10.846875190734863, "rewards/rejected": -16.896875381469727, "step": 4340 }, { "epoch": 3.7339055793991416, "grad_norm": 0.13372302407936823, "learning_rate": 6.67381974248927e-08, "logits/chosen": -0.4971252381801605, "logits/rejected": -0.570849597454071, "logps/chosen": -438.6000061035156, "logps/rejected": -518.0, "loss": 0.0209, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -5.817968845367432, "rewards/margins": 10.871874809265137, "rewards/rejected": -16.6875, "step": 4350 }, { "epoch": 3.742489270386266, "grad_norm": 0.28310699763176267, "learning_rate": 6.459227467811158e-08, "logits/chosen": -0.5357666015625, "logits/rejected": -0.612902820110321, "logps/chosen": -461.5, "logps/rejected": -534.5999755859375, "loss": 0.0134, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -6.078125, "rewards/margins": 11.5, "rewards/rejected": -17.578125, "step": 4360 }, { "epoch": 3.7510729613733904, "grad_norm": 7.239778153097841, "learning_rate": 6.244635193133048e-08, "logits/chosen": -0.4986816346645355, "logits/rejected": -0.512377917766571, "logps/chosen": -484.79998779296875, "logps/rejected": -549.5, "loss": 0.0201, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -6.342187404632568, "rewards/margins": 10.865625381469727, "rewards/rejected": -17.203125, "step": 4370 }, { "epoch": 3.759656652360515, "grad_norm": 1.7470922276702023, "learning_rate": 6.030042918454935e-08, "logits/chosen": -0.416656494140625, "logits/rejected": -0.46937257051467896, "logps/chosen": -470.20001220703125, "logps/rejected": -539.0, "loss": 0.0263, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -5.810546875, "rewards/margins": 10.621874809265137, "rewards/rejected": -16.431249618530273, "step": 4380 }, { "epoch": 3.7682403433476397, "grad_norm": 0.033254120163290116, "learning_rate": 5.8154506437768235e-08, "logits/chosen": -0.5901855230331421, "logits/rejected": -0.663378894329071, "logps/chosen": -456.3500061035156, "logps/rejected": -512.0, "loss": 0.0105, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -5.364843845367432, "rewards/margins": 11.078125, "rewards/rejected": -16.434375762939453, "step": 4390 }, { "epoch": 3.776824034334764, "grad_norm": 0.3711236185207497, "learning_rate": 5.600858369098712e-08, "logits/chosen": -0.49454957246780396, "logits/rejected": -0.549023449420929, "logps/chosen": -491.0, "logps/rejected": -553.0, "loss": 0.0047, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -5.903906345367432, "rewards/margins": 10.662500381469727, "rewards/rejected": -16.559375762939453, "step": 4400 }, { "epoch": 3.7854077253218885, "grad_norm": 0.08197074792629062, "learning_rate": 5.386266094420601e-08, "logits/chosen": -0.4793457090854645, "logits/rejected": -0.5284668207168579, "logps/chosen": -507.3500061035156, "logps/rejected": -588.7999877929688, "loss": 0.0175, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -6.237500190734863, "rewards/margins": 11.637499809265137, "rewards/rejected": -17.878124237060547, "step": 4410 }, { "epoch": 3.7939914163090127, "grad_norm": 2.8248349656435816, "learning_rate": 5.171673819742489e-08, "logits/chosen": -0.42139893770217896, "logits/rejected": -0.4217773377895355, "logps/chosen": -426.5, "logps/rejected": -494.3999938964844, "loss": 0.027, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -5.4140625, "rewards/margins": 10.931249618530273, "rewards/rejected": -16.356250762939453, "step": 4420 }, { "epoch": 3.8025751072961373, "grad_norm": 0.1261057225402086, "learning_rate": 4.9570815450643774e-08, "logits/chosen": -0.38242799043655396, "logits/rejected": -0.5324341058731079, "logps/chosen": -483.5, "logps/rejected": -530.4000244140625, "loss": 0.0134, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -5.823437690734863, "rewards/margins": 11.184374809265137, "rewards/rejected": -16.993749618530273, "step": 4430 }, { "epoch": 3.8111587982832615, "grad_norm": 0.22438524859389974, "learning_rate": 4.7424892703862656e-08, "logits/chosen": -0.5602142214775085, "logits/rejected": -0.611499011516571, "logps/chosen": -424.5, "logps/rejected": -519.7999877929688, "loss": 0.0088, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -6.120312690734863, "rewards/margins": 11.356249809265137, "rewards/rejected": -17.465625762939453, "step": 4440 }, { "epoch": 3.819742489270386, "grad_norm": 2.7874377877905503, "learning_rate": 4.5278969957081544e-08, "logits/chosen": -0.53009033203125, "logits/rejected": -0.583264172077179, "logps/chosen": -444.20001220703125, "logps/rejected": -536.5, "loss": 0.014, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -5.39453125, "rewards/margins": 10.982812881469727, "rewards/rejected": -16.378124237060547, "step": 4450 }, { "epoch": 3.828326180257511, "grad_norm": 0.25394254153091284, "learning_rate": 4.313304721030043e-08, "logits/chosen": -0.5914306640625, "logits/rejected": -0.6375061273574829, "logps/chosen": -478.8999938964844, "logps/rejected": -537.9000244140625, "loss": 0.0089, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -5.946875095367432, "rewards/margins": 11.306249618530273, "rewards/rejected": -17.246875762939453, "step": 4460 }, { "epoch": 3.836909871244635, "grad_norm": 0.13336751883554981, "learning_rate": 4.0987124463519313e-08, "logits/chosen": -0.3928466737270355, "logits/rejected": -0.464111328125, "logps/chosen": -467.29998779296875, "logps/rejected": -558.7999877929688, "loss": 0.0218, "rewards/accuracies": 0.96875, "rewards/chosen": -5.8359375, "rewards/margins": 10.8125, "rewards/rejected": -16.653125762939453, "step": 4470 }, { "epoch": 3.8454935622317596, "grad_norm": 0.36502154729896713, "learning_rate": 3.8841201716738195e-08, "logits/chosen": -0.37457275390625, "logits/rejected": -0.4106689393520355, "logps/chosen": -483.6000061035156, "logps/rejected": -563.5, "loss": 0.0221, "rewards/accuracies": 0.96875, "rewards/chosen": -6.129687309265137, "rewards/margins": 11.154687881469727, "rewards/rejected": -17.296875, "step": 4480 }, { "epoch": 3.8540772532188843, "grad_norm": 1.6629093571739153, "learning_rate": 3.669527896995708e-08, "logits/chosen": -0.550048828125, "logits/rejected": -0.5906982421875, "logps/chosen": -455.1000061035156, "logps/rejected": -531.0, "loss": 0.0092, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -6.125, "rewards/margins": 11.246874809265137, "rewards/rejected": -17.359375, "step": 4490 }, { "epoch": 3.8626609442060085, "grad_norm": 0.023245053584796258, "learning_rate": 3.4549356223175965e-08, "logits/chosen": -0.6104980707168579, "logits/rejected": -0.6585937738418579, "logps/chosen": -432.29998779296875, "logps/rejected": -526.4000244140625, "loss": 0.0104, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -6.143750190734863, "rewards/margins": 10.940625190734863, "rewards/rejected": -17.078125, "step": 4500 }, { "epoch": 3.871244635193133, "grad_norm": 0.32482300417898036, "learning_rate": 3.2403433476394846e-08, "logits/chosen": -0.5097824335098267, "logits/rejected": -0.592572033405304, "logps/chosen": -495.0, "logps/rejected": -556.2999877929688, "loss": 0.0175, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -5.758593559265137, "rewards/margins": 11.574999809265137, "rewards/rejected": -17.328125, "step": 4510 }, { "epoch": 3.8798283261802577, "grad_norm": 0.1562992752434742, "learning_rate": 3.0257510729613734e-08, "logits/chosen": -0.4534973204135895, "logits/rejected": -0.556042492389679, "logps/chosen": -445.1000061035156, "logps/rejected": -494.3999938964844, "loss": 0.0114, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -5.795312404632568, "rewards/margins": 11.287500381469727, "rewards/rejected": -17.084375381469727, "step": 4520 }, { "epoch": 3.888412017167382, "grad_norm": 0.21652840054734515, "learning_rate": 2.8111587982832616e-08, "logits/chosen": -0.5446106195449829, "logits/rejected": -0.570751965045929, "logps/chosen": -461.29998779296875, "logps/rejected": -553.2000122070312, "loss": 0.0141, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -6.189843654632568, "rewards/margins": 10.971875190734863, "rewards/rejected": -17.171875, "step": 4530 }, { "epoch": 3.8969957081545066, "grad_norm": 0.4087016524320137, "learning_rate": 2.5965665236051504e-08, "logits/chosen": -0.4939941465854645, "logits/rejected": -0.5531982183456421, "logps/chosen": -476.79998779296875, "logps/rejected": -543.0, "loss": 0.0263, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -5.557031154632568, "rewards/margins": 11.381250381469727, "rewards/rejected": -16.931249618530273, "step": 4540 }, { "epoch": 3.9055793991416308, "grad_norm": 0.03282995868149006, "learning_rate": 2.3819742489270385e-08, "logits/chosen": -0.4865966737270355, "logits/rejected": -0.570019543170929, "logps/chosen": -422.6499938964844, "logps/rejected": -503.70001220703125, "loss": 0.0262, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -6.028906345367432, "rewards/margins": 11.431249618530273, "rewards/rejected": -17.462499618530273, "step": 4550 }, { "epoch": 3.9141630901287554, "grad_norm": 0.02253061952134738, "learning_rate": 2.167381974248927e-08, "logits/chosen": -0.70849609375, "logits/rejected": -0.760693371295929, "logps/chosen": -410.20001220703125, "logps/rejected": -510.1000061035156, "loss": 0.0177, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -6.59375, "rewards/margins": 10.746874809265137, "rewards/rejected": -17.350000381469727, "step": 4560 }, { "epoch": 3.9227467811158796, "grad_norm": 3.440036103695739, "learning_rate": 1.9527896995708155e-08, "logits/chosen": -0.4623046815395355, "logits/rejected": -0.50506591796875, "logps/chosen": -422.8999938964844, "logps/rejected": -517.0999755859375, "loss": 0.0188, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -6.108593940734863, "rewards/margins": 10.498437881469727, "rewards/rejected": -16.606250762939453, "step": 4570 }, { "epoch": 3.9313304721030042, "grad_norm": 0.6003817744848279, "learning_rate": 1.738197424892704e-08, "logits/chosen": -0.5237060785293579, "logits/rejected": -0.609814465045929, "logps/chosen": -472.1499938964844, "logps/rejected": -544.4000244140625, "loss": 0.0175, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -5.964062690734863, "rewards/margins": 11.550000190734863, "rewards/rejected": -17.515625, "step": 4580 }, { "epoch": 3.939914163090129, "grad_norm": 0.09862456129733702, "learning_rate": 1.523605150214592e-08, "logits/chosen": -0.5623534917831421, "logits/rejected": -0.6323608160018921, "logps/chosen": -448.54998779296875, "logps/rejected": -539.9000244140625, "loss": 0.0057, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -5.688281059265137, "rewards/margins": 11.699999809265137, "rewards/rejected": -17.403125762939453, "step": 4590 }, { "epoch": 3.948497854077253, "grad_norm": 0.08993341755823005, "learning_rate": 1.3090128755364806e-08, "logits/chosen": -0.3091064393520355, "logits/rejected": -0.4073242247104645, "logps/chosen": -453.5, "logps/rejected": -538.0999755859375, "loss": 0.0277, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -6.759375095367432, "rewards/margins": 10.764062881469727, "rewards/rejected": -17.528125762939453, "step": 4600 }, { "epoch": 3.9570815450643777, "grad_norm": 0.041369465865210474, "learning_rate": 1.0944206008583691e-08, "logits/chosen": -0.56268310546875, "logits/rejected": -0.655444324016571, "logps/chosen": -479.3999938964844, "logps/rejected": -534.4000244140625, "loss": 0.009, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -6.293749809265137, "rewards/margins": 11.253125190734863, "rewards/rejected": -17.549999237060547, "step": 4610 }, { "epoch": 3.9656652360515023, "grad_norm": 0.16485340618963346, "learning_rate": 8.798283261802574e-09, "logits/chosen": -0.4974365234375, "logits/rejected": -0.588305652141571, "logps/chosen": -453.1000061035156, "logps/rejected": -552.5999755859375, "loss": 0.022, "rewards/accuracies": 0.96875, "rewards/chosen": -5.832812309265137, "rewards/margins": 11.084375381469727, "rewards/rejected": -16.918750762939453, "step": 4620 }, { "epoch": 3.9742489270386265, "grad_norm": 0.056054659375505335, "learning_rate": 6.652360515021459e-09, "logits/chosen": -0.6281982660293579, "logits/rejected": -0.684741199016571, "logps/chosen": -457.8999938964844, "logps/rejected": -549.0, "loss": 0.0074, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -6.072656154632568, "rewards/margins": 11.756250381469727, "rewards/rejected": -17.84375, "step": 4630 }, { "epoch": 3.982832618025751, "grad_norm": 0.1978699497992826, "learning_rate": 4.506437768240343e-09, "logits/chosen": -0.5661255121231079, "logits/rejected": -0.597094714641571, "logps/chosen": -456.5, "logps/rejected": -530.2999877929688, "loss": 0.0175, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -6.084374904632568, "rewards/margins": 11.237500190734863, "rewards/rejected": -17.328125, "step": 4640 }, { "epoch": 3.991416309012876, "grad_norm": 0.5347896613346057, "learning_rate": 2.360515021459227e-09, "logits/chosen": -0.504504382610321, "logits/rejected": -0.576171875, "logps/chosen": -462.6000061035156, "logps/rejected": -551.0, "loss": 0.01, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -6.274218559265137, "rewards/margins": 11.190625190734863, "rewards/rejected": -17.462499618530273, "step": 4650 }, { "epoch": 4.0, "grad_norm": 0.14704352528114775, "learning_rate": 2.1459227467811156e-10, "logits/chosen": -0.4452148377895355, "logits/rejected": -0.5035644769668579, "logps/chosen": -437.5, "logps/rejected": -506.70001220703125, "loss": 0.0223, "rewards/accuracies": 0.96875, "rewards/chosen": -5.873437404632568, "rewards/margins": 10.634374618530273, "rewards/rejected": -16.503124237060547, "step": 4660 } ], "logging_steps": 10, "max_steps": 4660, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }