{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 4.0, "eval_steps": 500, "global_step": 7588, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.005271481286241434, "grad_norm": 67.49248198953677, "learning_rate": 9.988139167105956e-07, "logits/chosen": -1.0543212890625, "logits/rejected": -0.8619140386581421, "logps/chosen": -292.92498779296875, "logps/rejected": -272.57501220703125, "loss": 0.6911, "rewards/accuracies": 0.32499998807907104, "rewards/chosen": -0.02321929857134819, "rewards/margins": 0.00572967529296875, "rewards/rejected": -0.028919601812958717, "step": 10 }, { "epoch": 0.010542962572482868, "grad_norm": 65.20830906790225, "learning_rate": 9.974960463890353e-07, "logits/chosen": -1.0332763195037842, "logits/rejected": -0.863903820514679, "logps/chosen": -318.6499938964844, "logps/rejected": -305.8374938964844, "loss": 0.6978, "rewards/accuracies": 0.4124999940395355, "rewards/chosen": -0.06251907348632812, "rewards/margins": 0.0075744627974927425, "rewards/rejected": -0.07008133083581924, "step": 20 }, { "epoch": 0.0158144438587243, "grad_norm": 58.79286986950535, "learning_rate": 9.96178176067475e-07, "logits/chosen": -1.073632836341858, "logits/rejected": -0.894775390625, "logps/chosen": -317.2124938964844, "logps/rejected": -288.6625061035156, "loss": 0.671, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.02159729041159153, "rewards/margins": 0.07093505561351776, "rewards/rejected": -0.092534638941288, "step": 30 }, { "epoch": 0.021085925144965736, "grad_norm": 70.62555094964577, "learning_rate": 9.948603057459145e-07, "logits/chosen": -1.1312744617462158, "logits/rejected": -0.947460949420929, "logps/chosen": -303.6499938964844, "logps/rejected": -303.42498779296875, "loss": 0.6507, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.01400604285299778, "rewards/margins": 0.10915832221508026, "rewards/rejected": -0.0950469970703125, "step": 40 }, { "epoch": 0.02635740643120717, "grad_norm": 60.59348242787014, "learning_rate": 9.935424354243542e-07, "logits/chosen": -1.2312500476837158, "logits/rejected": -0.813061535358429, "logps/chosen": -264.8500061035156, "logps/rejected": -271.45001220703125, "loss": 0.6552, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.03845062106847763, "rewards/margins": 0.1129150390625, "rewards/rejected": -0.15146636962890625, "step": 50 }, { "epoch": 0.0316288877174486, "grad_norm": 59.79291910986725, "learning_rate": 9.922245651027939e-07, "logits/chosen": -1.2034180164337158, "logits/rejected": -0.933117687702179, "logps/chosen": -282.3500061035156, "logps/rejected": -289.73748779296875, "loss": 0.6427, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.01384887658059597, "rewards/margins": 0.16029052436351776, "rewards/rejected": -0.17397765815258026, "step": 60 }, { "epoch": 0.03690036900369004, "grad_norm": 55.10131681386421, "learning_rate": 9.909066947812334e-07, "logits/chosen": -1.250634789466858, "logits/rejected": -0.98681640625, "logps/chosen": -320.7124938964844, "logps/rejected": -313.9375, "loss": 0.6168, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": 0.0076904296875, "rewards/margins": 0.21025696396827698, "rewards/rejected": -0.20249328017234802, "step": 70 }, { "epoch": 0.04217185028993147, "grad_norm": 55.12899963766204, "learning_rate": 9.895888244596733e-07, "logits/chosen": -1.1742675304412842, "logits/rejected": -0.971667468547821, "logps/chosen": -297.7124938964844, "logps/rejected": -264.01251220703125, "loss": 0.6195, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.11827392876148224, "rewards/margins": 0.2631286680698395, "rewards/rejected": -0.14459991455078125, "step": 80 }, { "epoch": 0.047443331576172906, "grad_norm": 57.24739728721032, "learning_rate": 9.882709541381128e-07, "logits/chosen": -1.19384765625, "logits/rejected": -1.118383765220642, "logps/chosen": -308.1499938964844, "logps/rejected": -286.0375061035156, "loss": 0.5845, "rewards/accuracies": 0.65625, "rewards/chosen": 0.15993957221508026, "rewards/margins": 0.34971922636032104, "rewards/rejected": -0.18942871689796448, "step": 90 }, { "epoch": 0.05271481286241434, "grad_norm": 69.73206895492469, "learning_rate": 9.869530838165525e-07, "logits/chosen": -1.2018554210662842, "logits/rejected": -1.0310790538787842, "logps/chosen": -322.61248779296875, "logps/rejected": -316.45623779296875, "loss": 0.5782, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 0.2636169493198395, "rewards/margins": 0.38078612089157104, "rewards/rejected": -0.11694946140050888, "step": 100 }, { "epoch": 0.05798629414865577, "grad_norm": 69.02664754849202, "learning_rate": 9.85635213494992e-07, "logits/chosen": -1.149194359779358, "logits/rejected": -1.0390503406524658, "logps/chosen": -363.26251220703125, "logps/rejected": -327.73748779296875, "loss": 0.6075, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.219085693359375, "rewards/margins": 0.42970579862594604, "rewards/rejected": -0.21038207411766052, "step": 110 }, { "epoch": 0.0632577754348972, "grad_norm": 78.09766212443708, "learning_rate": 9.843173431734316e-07, "logits/chosen": -1.088769555091858, "logits/rejected": -0.9942626953125, "logps/chosen": -313.48748779296875, "logps/rejected": -294.5249938964844, "loss": 0.6536, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": 0.04244690015912056, "rewards/margins": 0.2823120057582855, "rewards/rejected": -0.23973998427391052, "step": 120 }, { "epoch": 0.06852925672113865, "grad_norm": 67.1530775459763, "learning_rate": 9.829994728518713e-07, "logits/chosen": -1.128637671470642, "logits/rejected": -0.9006103277206421, "logps/chosen": -299.0625, "logps/rejected": -279.6875, "loss": 0.5995, "rewards/accuracies": 0.59375, "rewards/chosen": 0.3755249083042145, "rewards/margins": 0.4014739990234375, "rewards/rejected": -0.02600708045065403, "step": 130 }, { "epoch": 0.07380073800738007, "grad_norm": 73.2335364704245, "learning_rate": 9.81681602530311e-07, "logits/chosen": -1.230566382408142, "logits/rejected": -0.9673816561698914, "logps/chosen": -315.6499938964844, "logps/rejected": -294.5625, "loss": 0.587, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 0.397714227437973, "rewards/margins": 0.45262449979782104, "rewards/rejected": -0.054994963109493256, "step": 140 }, { "epoch": 0.0790722192936215, "grad_norm": 67.79154872139756, "learning_rate": 9.803637322087505e-07, "logits/chosen": -1.2999999523162842, "logits/rejected": -0.994458019733429, "logps/chosen": -326.875, "logps/rejected": -307.57501220703125, "loss": 0.5349, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.2655578553676605, "rewards/margins": 0.5574585199356079, "rewards/rejected": -0.29124146699905396, "step": 150 }, { "epoch": 0.08434370057986294, "grad_norm": 68.7487423332191, "learning_rate": 9.790458618871902e-07, "logits/chosen": -1.29931640625, "logits/rejected": -1.1171386241912842, "logps/chosen": -322.6875, "logps/rejected": -288.11248779296875, "loss": 0.5739, "rewards/accuracies": 0.706250011920929, "rewards/chosen": 0.09020843356847763, "rewards/margins": 0.512420654296875, "rewards/rejected": -0.42258912324905396, "step": 160 }, { "epoch": 0.08961518186610437, "grad_norm": 60.4536212754181, "learning_rate": 9.7772799156563e-07, "logits/chosen": -1.22509765625, "logits/rejected": -1.0955321788787842, "logps/chosen": -332.8999938964844, "logps/rejected": -317.07501220703125, "loss": 0.6434, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": 0.06626281887292862, "rewards/margins": 0.40392762422561646, "rewards/rejected": -0.33818358182907104, "step": 170 }, { "epoch": 0.09488666315234581, "grad_norm": 75.97888196482029, "learning_rate": 9.764101212440694e-07, "logits/chosen": -1.2601410150527954, "logits/rejected": -1.0383727550506592, "logps/chosen": -327.6000061035156, "logps/rejected": -304.0, "loss": 0.5617, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.3026779294013977, "rewards/margins": 0.536090075969696, "rewards/rejected": -0.23297119140625, "step": 180 }, { "epoch": 0.10015814443858724, "grad_norm": 53.515071045773006, "learning_rate": 9.750922509225091e-07, "logits/chosen": -1.267919898033142, "logits/rejected": -1.1145141124725342, "logps/chosen": -305.95001220703125, "logps/rejected": -297.6625061035156, "loss": 0.5801, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 0.21826934814453125, "rewards/margins": 0.4731201231479645, "rewards/rejected": -0.254983514547348, "step": 190 }, { "epoch": 0.10542962572482868, "grad_norm": 65.8737708063444, "learning_rate": 9.737743806009488e-07, "logits/chosen": -1.24755859375, "logits/rejected": -1.2008056640625, "logps/chosen": -328.875, "logps/rejected": -296.20001220703125, "loss": 0.627, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": 0.11452331393957138, "rewards/margins": 0.3698974549770355, "rewards/rejected": -0.25492554903030396, "step": 200 }, { "epoch": 0.11070110701107011, "grad_norm": 47.01537527845389, "learning_rate": 9.724565102793885e-07, "logits/chosen": -1.34716796875, "logits/rejected": -1.01129150390625, "logps/chosen": -322.8500061035156, "logps/rejected": -316.92498779296875, "loss": 0.5399, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.35765379667282104, "rewards/margins": 0.537341296672821, "rewards/rejected": -0.17970581352710724, "step": 210 }, { "epoch": 0.11597258829731154, "grad_norm": 88.9273805392115, "learning_rate": 9.71138639957828e-07, "logits/chosen": -1.24169921875, "logits/rejected": -1.106909155845642, "logps/chosen": -354.92498779296875, "logps/rejected": -342.2124938964844, "loss": 0.6205, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.2381744384765625, "rewards/margins": 0.4032348692417145, "rewards/rejected": -0.165110781788826, "step": 220 }, { "epoch": 0.12124406958355298, "grad_norm": 69.71500187927847, "learning_rate": 9.698207696362677e-07, "logits/chosen": -1.288720726966858, "logits/rejected": -1.1188476085662842, "logps/chosen": -350.2124938964844, "logps/rejected": -343.4375, "loss": 0.6059, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.230987548828125, "rewards/margins": 0.44500732421875, "rewards/rejected": -0.21347656846046448, "step": 230 }, { "epoch": 0.1265155508697944, "grad_norm": 50.79528134926148, "learning_rate": 9.685028993147074e-07, "logits/chosen": -1.351171851158142, "logits/rejected": -1.165429711341858, "logps/chosen": -303.63751220703125, "logps/rejected": -298.5874938964844, "loss": 0.6102, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.2621398866176605, "rewards/margins": 0.47377318143844604, "rewards/rejected": -0.21159668266773224, "step": 240 }, { "epoch": 0.13178703215603585, "grad_norm": 58.541278164740085, "learning_rate": 9.671850289931471e-07, "logits/chosen": -1.321929931640625, "logits/rejected": -1.131494164466858, "logps/chosen": -333.13751220703125, "logps/rejected": -291.9624938964844, "loss": 0.5546, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 0.16593627631664276, "rewards/margins": 0.5364745855331421, "rewards/rejected": -0.370391845703125, "step": 250 }, { "epoch": 0.1370585134422773, "grad_norm": 60.89955815618724, "learning_rate": 9.658671586715866e-07, "logits/chosen": -1.2051270008087158, "logits/rejected": -1.042993187904358, "logps/chosen": -384.67498779296875, "logps/rejected": -349.1000061035156, "loss": 0.5367, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.14791259169578552, "rewards/margins": 0.60235595703125, "rewards/rejected": -0.4545044004917145, "step": 260 }, { "epoch": 0.1423299947285187, "grad_norm": 45.076879723507155, "learning_rate": 9.645492883500263e-07, "logits/chosen": -1.2556641101837158, "logits/rejected": -1.1449706554412842, "logps/chosen": -309.04998779296875, "logps/rejected": -284.76251220703125, "loss": 0.5978, "rewards/accuracies": 0.65625, "rewards/chosen": -0.0003295898495707661, "rewards/margins": 0.4715942442417145, "rewards/rejected": -0.471435546875, "step": 270 }, { "epoch": 0.14760147601476015, "grad_norm": 87.16376218591348, "learning_rate": 9.63231418028466e-07, "logits/chosen": -1.296484351158142, "logits/rejected": -1.116796851158142, "logps/chosen": -311.86248779296875, "logps/rejected": -294.07501220703125, "loss": 0.597, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 0.22005920112133026, "rewards/margins": 0.5121825933456421, "rewards/rejected": -0.2920379638671875, "step": 280 }, { "epoch": 0.1528729573010016, "grad_norm": 62.65588183306058, "learning_rate": 9.619135477069055e-07, "logits/chosen": -1.266992211341858, "logits/rejected": -1.130218505859375, "logps/chosen": -320.7124938964844, "logps/rejected": -298.625, "loss": 0.5322, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.3233642578125, "rewards/margins": 0.6823364496231079, "rewards/rejected": -0.3586669862270355, "step": 290 }, { "epoch": 0.158144438587243, "grad_norm": 59.96583232501794, "learning_rate": 9.605956773853452e-07, "logits/chosen": -1.3330078125, "logits/rejected": -1.1083495616912842, "logps/chosen": -297.4125061035156, "logps/rejected": -294.7875061035156, "loss": 0.5877, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.09271240234375, "rewards/margins": 0.552868664264679, "rewards/rejected": -0.46038818359375, "step": 300 }, { "epoch": 0.16341591987348444, "grad_norm": 66.86635483308518, "learning_rate": 9.59277807063785e-07, "logits/chosen": -1.255273461341858, "logits/rejected": -1.076025366783142, "logps/chosen": -376.0, "logps/rejected": -351.92498779296875, "loss": 0.5567, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.14058837294578552, "rewards/margins": 0.632519543170929, "rewards/rejected": -0.7732177972793579, "step": 310 }, { "epoch": 0.16868740115972589, "grad_norm": 56.69734030504065, "learning_rate": 9.579599367422246e-07, "logits/chosen": -1.214111328125, "logits/rejected": -1.0700562000274658, "logps/chosen": -351.04998779296875, "logps/rejected": -301.57501220703125, "loss": 0.5441, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.162994384765625, "rewards/margins": 0.6830810308456421, "rewards/rejected": -0.845751941204071, "step": 320 }, { "epoch": 0.17395888244596733, "grad_norm": 74.07561233213687, "learning_rate": 9.56642066420664e-07, "logits/chosen": -1.322412133216858, "logits/rejected": -1.154272437095642, "logps/chosen": -327.8374938964844, "logps/rejected": -321.6875, "loss": 0.6222, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.09601745754480362, "rewards/margins": 0.54833984375, "rewards/rejected": -0.6447509527206421, "step": 330 }, { "epoch": 0.17923036373220874, "grad_norm": 74.9222539687631, "learning_rate": 9.553241960991038e-07, "logits/chosen": -1.3166992664337158, "logits/rejected": -1.161718726158142, "logps/chosen": -309.8500061035156, "logps/rejected": -294.67498779296875, "loss": 0.5738, "rewards/accuracies": 0.643750011920929, "rewards/chosen": 0.141693115234375, "rewards/margins": 0.5922790765762329, "rewards/rejected": -0.4510498046875, "step": 340 }, { "epoch": 0.18450184501845018, "grad_norm": 51.487932449476446, "learning_rate": 9.540063257775435e-07, "logits/chosen": -1.1992676258087158, "logits/rejected": -1.0961945056915283, "logps/chosen": -327.75, "logps/rejected": -308.5, "loss": 0.5724, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.2656051516532898, "rewards/margins": 0.6284545660018921, "rewards/rejected": -0.362905889749527, "step": 350 }, { "epoch": 0.18977332630469163, "grad_norm": 65.40330508980892, "learning_rate": 9.526884554559831e-07, "logits/chosen": -1.266503930091858, "logits/rejected": -1.088720679283142, "logps/chosen": -350.6499938964844, "logps/rejected": -314.9125061035156, "loss": 0.5323, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.0787353515625, "rewards/margins": 0.685577392578125, "rewards/rejected": -0.606719970703125, "step": 360 }, { "epoch": 0.19504480759093304, "grad_norm": 79.43521930394195, "learning_rate": 9.513705851344227e-07, "logits/chosen": -1.31005859375, "logits/rejected": -1.0363006591796875, "logps/chosen": -344.57501220703125, "logps/rejected": -320.79998779296875, "loss": 0.5704, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.01855926588177681, "rewards/margins": 0.6300293207168579, "rewards/rejected": -0.6479126214981079, "step": 370 }, { "epoch": 0.20031628887717448, "grad_norm": 82.97467344021696, "learning_rate": 9.500527148128624e-07, "logits/chosen": -1.0713989734649658, "logits/rejected": -0.9591308832168579, "logps/chosen": -399.04998779296875, "logps/rejected": -318.2749938964844, "loss": 0.49, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.26087647676467896, "rewards/margins": 0.8612915277481079, "rewards/rejected": -1.122460961341858, "step": 380 }, { "epoch": 0.20558777016341592, "grad_norm": 66.47486867959257, "learning_rate": 9.487348444913021e-07, "logits/chosen": -1.231359839439392, "logits/rejected": -1.01995849609375, "logps/chosen": -337.63751220703125, "logps/rejected": -301.67498779296875, "loss": 0.5817, "rewards/accuracies": 0.6875, "rewards/chosen": -0.4416870176792145, "rewards/margins": 0.630139172077179, "rewards/rejected": -1.070715308189392, "step": 390 }, { "epoch": 0.21085925144965736, "grad_norm": 86.8942290694314, "learning_rate": 9.474169741697417e-07, "logits/chosen": -1.27099609375, "logits/rejected": -1.070703148841858, "logps/chosen": -349.2749938964844, "logps/rejected": -331.25, "loss": 0.6553, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.282736212015152, "rewards/margins": 0.4347900450229645, "rewards/rejected": -0.717547595500946, "step": 400 }, { "epoch": 0.21613073273589878, "grad_norm": 65.88436950996315, "learning_rate": 9.460991038481813e-07, "logits/chosen": -1.1693847179412842, "logits/rejected": -1.0669434070587158, "logps/chosen": -338.2749938964844, "logps/rejected": -318.2250061035156, "loss": 0.5347, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.149322509765625, "rewards/margins": 0.7467406988143921, "rewards/rejected": -0.8955322504043579, "step": 410 }, { "epoch": 0.22140221402214022, "grad_norm": 68.8872667602359, "learning_rate": 9.44781233526621e-07, "logits/chosen": -1.1773192882537842, "logits/rejected": -1.097070336341858, "logps/chosen": -353.1312561035156, "logps/rejected": -322.92498779296875, "loss": 0.5047, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.02072296105325222, "rewards/margins": 0.7635253667831421, "rewards/rejected": -0.785357654094696, "step": 420 }, { "epoch": 0.22667369530838166, "grad_norm": 50.28234481936392, "learning_rate": 9.434633632050606e-07, "logits/chosen": -1.1969726085662842, "logits/rejected": -1.085229516029358, "logps/chosen": -306.25, "logps/rejected": -312.4375, "loss": 0.5726, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.19089965522289276, "rewards/margins": 0.527301013469696, "rewards/rejected": -0.718823254108429, "step": 430 }, { "epoch": 0.23194517659462308, "grad_norm": 88.47407484543632, "learning_rate": 9.421454928835002e-07, "logits/chosen": -1.290624976158142, "logits/rejected": -1.1167480945587158, "logps/chosen": -331.86248779296875, "logps/rejected": -295.3500061035156, "loss": 0.5482, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 0.07728882133960724, "rewards/margins": 0.59033203125, "rewards/rejected": -0.512969970703125, "step": 440 }, { "epoch": 0.23721665788086452, "grad_norm": 70.60797196985364, "learning_rate": 9.408276225619399e-07, "logits/chosen": -1.241796851158142, "logits/rejected": -1.070043921470642, "logps/chosen": -357.88751220703125, "logps/rejected": -323.625, "loss": 0.502, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": 0.36439818143844604, "rewards/margins": 0.83935546875, "rewards/rejected": -0.4751648008823395, "step": 450 }, { "epoch": 0.24248813916710596, "grad_norm": 53.22138993849179, "learning_rate": 9.395097522403796e-07, "logits/chosen": -1.283203125, "logits/rejected": -1.1349608898162842, "logps/chosen": -323.625, "logps/rejected": -299.5, "loss": 0.5764, "rewards/accuracies": 0.6875, "rewards/chosen": 0.36098021268844604, "rewards/margins": 0.671557605266571, "rewards/rejected": -0.311309814453125, "step": 460 }, { "epoch": 0.2477596204533474, "grad_norm": 55.79668741844659, "learning_rate": 9.381918819188192e-07, "logits/chosen": -1.1763184070587158, "logits/rejected": -1.0784180164337158, "logps/chosen": -317.6625061035156, "logps/rejected": -311.86248779296875, "loss": 0.5598, "rewards/accuracies": 0.706250011920929, "rewards/chosen": 0.420053094625473, "rewards/margins": 0.7059081792831421, "rewards/rejected": -0.28516846895217896, "step": 470 }, { "epoch": 0.2530311017395888, "grad_norm": 83.7196363800189, "learning_rate": 9.368740115972587e-07, "logits/chosen": -1.223785400390625, "logits/rejected": -1.095727562904358, "logps/chosen": -319.8125, "logps/rejected": -309.0249938964844, "loss": 0.5844, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.4523559510707855, "rewards/margins": 0.6552734375, "rewards/rejected": -0.20275267958641052, "step": 480 }, { "epoch": 0.25830258302583026, "grad_norm": 60.185964591154665, "learning_rate": 9.355561412756983e-07, "logits/chosen": -1.203271508216858, "logits/rejected": -1.205786108970642, "logps/chosen": -381.26251220703125, "logps/rejected": -336.875, "loss": 0.5593, "rewards/accuracies": 0.6875, "rewards/chosen": 0.5760864019393921, "rewards/margins": 0.692028820514679, "rewards/rejected": -0.11561889946460724, "step": 490 }, { "epoch": 0.2635740643120717, "grad_norm": 72.66750240681664, "learning_rate": 9.34238270954138e-07, "logits/chosen": -1.133203148841858, "logits/rejected": -1.1507079601287842, "logps/chosen": -345.36248779296875, "logps/rejected": -330.7250061035156, "loss": 0.6409, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.29205626249313354, "rewards/margins": 0.5046844482421875, "rewards/rejected": -0.212677001953125, "step": 500 }, { "epoch": 0.26884554559831314, "grad_norm": 47.66073602056624, "learning_rate": 9.329204006325777e-07, "logits/chosen": -1.052026391029358, "logits/rejected": -1.0691406726837158, "logps/chosen": -345.42498779296875, "logps/rejected": -290.0, "loss": 0.5321, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.41962432861328125, "rewards/margins": 0.672515869140625, "rewards/rejected": -0.252694696187973, "step": 510 }, { "epoch": 0.2741170268845546, "grad_norm": 57.24088193848034, "learning_rate": 9.316025303110173e-07, "logits/chosen": -1.0986328125, "logits/rejected": -0.959716796875, "logps/chosen": -375.51251220703125, "logps/rejected": -328.61248779296875, "loss": 0.5841, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.2090911865234375, "rewards/margins": 0.615185558795929, "rewards/rejected": -0.405914306640625, "step": 520 }, { "epoch": 0.27938850817079597, "grad_norm": 88.73009645184125, "learning_rate": 9.30284659989457e-07, "logits/chosen": -1.1414062976837158, "logits/rejected": -0.998461902141571, "logps/chosen": -345.4624938964844, "logps/rejected": -313.32501220703125, "loss": 0.5408, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.20083312690258026, "rewards/margins": 0.7903808355331421, "rewards/rejected": -0.5893096923828125, "step": 530 }, { "epoch": 0.2846599894570374, "grad_norm": 64.45247351935589, "learning_rate": 9.289667896678966e-07, "logits/chosen": -1.167871117591858, "logits/rejected": -0.9391113519668579, "logps/chosen": -353.26251220703125, "logps/rejected": -307.07501220703125, "loss": 0.5213, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 0.31807249784469604, "rewards/margins": 0.9098602533340454, "rewards/rejected": -0.5917419195175171, "step": 540 }, { "epoch": 0.28993147074327885, "grad_norm": 53.25575610195643, "learning_rate": 9.276489193463362e-07, "logits/chosen": -1.0911133289337158, "logits/rejected": -0.880932629108429, "logps/chosen": -363.5249938964844, "logps/rejected": -340.2124938964844, "loss": 0.5061, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 0.16115722060203552, "rewards/margins": 0.914721667766571, "rewards/rejected": -0.753375232219696, "step": 550 }, { "epoch": 0.2952029520295203, "grad_norm": 72.86296184964982, "learning_rate": 9.263310490247759e-07, "logits/chosen": -1.3088867664337158, "logits/rejected": -1.145166039466858, "logps/chosen": -341.625, "logps/rejected": -305.8125, "loss": 0.5684, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.15709838271141052, "rewards/margins": 0.678619384765625, "rewards/rejected": -0.835400402545929, "step": 560 }, { "epoch": 0.30047443331576174, "grad_norm": 60.659835883133816, "learning_rate": 9.250131787032156e-07, "logits/chosen": -1.200463891029358, "logits/rejected": -1.076025366783142, "logps/chosen": -336.42498779296875, "logps/rejected": -318.9750061035156, "loss": 0.5527, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.01937255822122097, "rewards/margins": 0.718463122844696, "rewards/rejected": -0.738232433795929, "step": 570 }, { "epoch": 0.3057459146020032, "grad_norm": 61.09325819451641, "learning_rate": 9.236953083816552e-07, "logits/chosen": -1.3232910633087158, "logits/rejected": -1.1092407703399658, "logps/chosen": -315.26251220703125, "logps/rejected": -317.0249938964844, "loss": 0.5149, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.24483947455883026, "rewards/margins": 0.866503894329071, "rewards/rejected": -0.621337890625, "step": 580 }, { "epoch": 0.3110173958882446, "grad_norm": 91.21246789576706, "learning_rate": 9.223774380600948e-07, "logits/chosen": -1.405371069908142, "logits/rejected": -1.2843749523162842, "logps/chosen": -369.6499938964844, "logps/rejected": -334.25, "loss": 0.5715, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.16961821913719177, "rewards/margins": 0.7342773675918579, "rewards/rejected": -0.564471423625946, "step": 590 }, { "epoch": 0.316288877174486, "grad_norm": 108.55104881676735, "learning_rate": 9.210595677385344e-07, "logits/chosen": -1.263281226158142, "logits/rejected": -1.196533203125, "logps/chosen": -339.9750061035156, "logps/rejected": -317.7749938964844, "loss": 0.6196, "rewards/accuracies": 0.625, "rewards/chosen": 0.12999267876148224, "rewards/margins": 0.5481323003768921, "rewards/rejected": -0.4178710877895355, "step": 600 }, { "epoch": 0.32156035846072745, "grad_norm": 66.72929086064453, "learning_rate": 9.197416974169741e-07, "logits/chosen": -1.2440307140350342, "logits/rejected": -1.144384741783142, "logps/chosen": -357.9624938964844, "logps/rejected": -342.1000061035156, "loss": 0.6068, "rewards/accuracies": 0.643750011920929, "rewards/chosen": 0.15337523818016052, "rewards/margins": 0.7041351199150085, "rewards/rejected": -0.550891101360321, "step": 610 }, { "epoch": 0.3268318397469689, "grad_norm": 55.30446589706109, "learning_rate": 9.184238270954138e-07, "logits/chosen": -1.2597167491912842, "logits/rejected": -1.17645263671875, "logps/chosen": -349.625, "logps/rejected": -319.1937561035156, "loss": 0.5097, "rewards/accuracies": 0.6875, "rewards/chosen": 0.14702148735523224, "rewards/margins": 0.739990234375, "rewards/rejected": -0.5927978754043579, "step": 620 }, { "epoch": 0.33210332103321033, "grad_norm": 75.42506924564606, "learning_rate": 9.171059567738534e-07, "logits/chosen": -1.297998070716858, "logits/rejected": -1.199365258216858, "logps/chosen": -339.6000061035156, "logps/rejected": -303.20001220703125, "loss": 0.5097, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.01345214806497097, "rewards/margins": 0.761627197265625, "rewards/rejected": -0.7745116949081421, "step": 630 }, { "epoch": 0.33737480231945177, "grad_norm": 70.24037574047595, "learning_rate": 9.157880864522931e-07, "logits/chosen": -1.2614257335662842, "logits/rejected": -0.9980224370956421, "logps/chosen": -335.5625, "logps/rejected": -311.38751220703125, "loss": 0.5526, "rewards/accuracies": 0.6875, "rewards/chosen": 0.16145019233226776, "rewards/margins": 0.8254760503768921, "rewards/rejected": -0.6637939214706421, "step": 640 }, { "epoch": 0.3426462836056932, "grad_norm": 66.21928918248653, "learning_rate": 9.144702161307327e-07, "logits/chosen": -1.364160180091858, "logits/rejected": -1.137792944908142, "logps/chosen": -365.38751220703125, "logps/rejected": -347.4125061035156, "loss": 0.489, "rewards/accuracies": 0.71875, "rewards/chosen": 0.2511230409145355, "rewards/margins": 0.9194091558456421, "rewards/rejected": -0.6685150265693665, "step": 650 }, { "epoch": 0.34791776489193466, "grad_norm": 54.206669048094874, "learning_rate": 9.131523458091723e-07, "logits/chosen": -1.116790771484375, "logits/rejected": -1.11004638671875, "logps/chosen": -367.6000061035156, "logps/rejected": -331.36248779296875, "loss": 0.6171, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.10147400200366974, "rewards/margins": 0.6505126953125, "rewards/rejected": -0.751953125, "step": 660 }, { "epoch": 0.35318924617817604, "grad_norm": 70.17455422769008, "learning_rate": 9.118344754876119e-07, "logits/chosen": -1.2755858898162842, "logits/rejected": -1.100830078125, "logps/chosen": -374.6000061035156, "logps/rejected": -335.63751220703125, "loss": 0.5013, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.05473632737994194, "rewards/margins": 0.9933227300643921, "rewards/rejected": -1.046990990638733, "step": 670 }, { "epoch": 0.3584607274644175, "grad_norm": 44.80913815182738, "learning_rate": 9.105166051660517e-07, "logits/chosen": -1.3132812976837158, "logits/rejected": -1.130090355873108, "logps/chosen": -348.3125, "logps/rejected": -346.92498779296875, "loss": 0.4947, "rewards/accuracies": 0.731249988079071, "rewards/chosen": 0.24459227919578552, "rewards/margins": 1.014715552330017, "rewards/rejected": -0.7704712152481079, "step": 680 }, { "epoch": 0.3637322087506589, "grad_norm": 60.60655427955898, "learning_rate": 9.091987348444913e-07, "logits/chosen": -1.3025391101837158, "logits/rejected": -1.171594262123108, "logps/chosen": -340.3999938964844, "logps/rejected": -308.07501220703125, "loss": 0.5377, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.059173583984375, "rewards/margins": 0.8583434820175171, "rewards/rejected": -0.9173218011856079, "step": 690 }, { "epoch": 0.36900369003690037, "grad_norm": 69.95588778813242, "learning_rate": 9.078808645229309e-07, "logits/chosen": -1.397607445716858, "logits/rejected": -1.291748046875, "logps/chosen": -365.7749938964844, "logps/rejected": -323.5, "loss": 0.5263, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.24861450493335724, "rewards/margins": 0.7721191644668579, "rewards/rejected": -1.019982933998108, "step": 700 }, { "epoch": 0.3742751713231418, "grad_norm": 71.71667375151885, "learning_rate": 9.065629942013705e-07, "logits/chosen": -1.36083984375, "logits/rejected": -1.25732421875, "logps/chosen": -318.4375, "logps/rejected": -313.48748779296875, "loss": 0.5967, "rewards/accuracies": 0.65625, "rewards/chosen": -0.56549072265625, "rewards/margins": 0.639404296875, "rewards/rejected": -1.2043945789337158, "step": 710 }, { "epoch": 0.37954665260938325, "grad_norm": 45.88466351577995, "learning_rate": 9.052451238798102e-07, "logits/chosen": -1.335546851158142, "logits/rejected": -1.223046898841858, "logps/chosen": -334.0625, "logps/rejected": -306.9750061035156, "loss": 0.5645, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.419412225484848, "rewards/margins": 0.822918713092804, "rewards/rejected": -1.242285132408142, "step": 720 }, { "epoch": 0.3848181338956247, "grad_norm": 60.70188949259511, "learning_rate": 9.039272535582499e-07, "logits/chosen": -1.338769555091858, "logits/rejected": -1.221337914466858, "logps/chosen": -354.875, "logps/rejected": -337.32501220703125, "loss": 0.5368, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.3277755677700043, "rewards/margins": 0.8636718988418579, "rewards/rejected": -1.191247582435608, "step": 730 }, { "epoch": 0.3900896151818661, "grad_norm": 54.01426883313912, "learning_rate": 9.026093832366895e-07, "logits/chosen": -1.3628418445587158, "logits/rejected": -1.1244628429412842, "logps/chosen": -315.29998779296875, "logps/rejected": -314.7250061035156, "loss": 0.51, "rewards/accuracies": 0.75, "rewards/chosen": -0.23123779892921448, "rewards/margins": 0.971606433391571, "rewards/rejected": -1.202856421470642, "step": 740 }, { "epoch": 0.3953610964681075, "grad_norm": 91.64330075299394, "learning_rate": 9.012915129151291e-07, "logits/chosen": -1.2689940929412842, "logits/rejected": -1.1267578601837158, "logps/chosen": -362.42498779296875, "logps/rejected": -321.8999938964844, "loss": 0.5753, "rewards/accuracies": 0.6875, "rewards/chosen": -0.4827331602573395, "rewards/margins": 0.8346801996231079, "rewards/rejected": -1.3176758289337158, "step": 750 }, { "epoch": 0.40063257775434896, "grad_norm": 57.516033881498096, "learning_rate": 8.999736425935688e-07, "logits/chosen": -1.3933594226837158, "logits/rejected": -1.3042480945587158, "logps/chosen": -332.2250061035156, "logps/rejected": -297.92498779296875, "loss": 0.4951, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.4839416444301605, "rewards/margins": 1.1380615234375, "rewards/rejected": -1.622778296470642, "step": 760 }, { "epoch": 0.4059040590405904, "grad_norm": 64.98119699328497, "learning_rate": 8.986557722720084e-07, "logits/chosen": -1.30419921875, "logits/rejected": -1.163915991783142, "logps/chosen": -315.7250061035156, "logps/rejected": -319.1499938964844, "loss": 0.4916, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.2757812440395355, "rewards/margins": 1.1939208507537842, "rewards/rejected": -1.470068335533142, "step": 770 }, { "epoch": 0.41117554032683185, "grad_norm": 64.5633680864746, "learning_rate": 8.97337901950448e-07, "logits/chosen": -1.202880859375, "logits/rejected": -0.991992175579071, "logps/chosen": -332.2749938964844, "logps/rejected": -310.4624938964844, "loss": 0.5093, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.149577334523201, "rewards/margins": 1.069006323814392, "rewards/rejected": -1.2188720703125, "step": 780 }, { "epoch": 0.4164470216130733, "grad_norm": 74.86978509519363, "learning_rate": 8.960200316288878e-07, "logits/chosen": -1.168359398841858, "logits/rejected": -0.901532769203186, "logps/chosen": -331.0625, "logps/rejected": -334.3500061035156, "loss": 0.513, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.08330993354320526, "rewards/margins": 1.096777319908142, "rewards/rejected": -1.1807982921600342, "step": 790 }, { "epoch": 0.42171850289931473, "grad_norm": 69.80599952151367, "learning_rate": 8.947021613073274e-07, "logits/chosen": -1.2922852039337158, "logits/rejected": -1.1395263671875, "logps/chosen": -323.4125061035156, "logps/rejected": -325.29998779296875, "loss": 0.5004, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.29902952909469604, "rewards/margins": 1.006018042564392, "rewards/rejected": -0.7072509527206421, "step": 800 }, { "epoch": 0.4269899841855561, "grad_norm": 58.02341969770931, "learning_rate": 8.93384290985767e-07, "logits/chosen": -1.223535180091858, "logits/rejected": -1.1515624523162842, "logps/chosen": -339.79998779296875, "logps/rejected": -288.2124938964844, "loss": 0.5448, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.7059081792831421, "rewards/margins": 0.8739989995956421, "rewards/rejected": -0.16770020127296448, "step": 810 }, { "epoch": 0.43226146547179756, "grad_norm": 45.74594491497009, "learning_rate": 8.920664206642066e-07, "logits/chosen": -1.189111351966858, "logits/rejected": -0.9875732660293579, "logps/chosen": -336.82501220703125, "logps/rejected": -303.625, "loss": 0.5495, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.7355316281318665, "rewards/margins": 0.847851574420929, "rewards/rejected": -0.11217041313648224, "step": 820 }, { "epoch": 0.437532946758039, "grad_norm": 59.66031105865667, "learning_rate": 8.907485503426463e-07, "logits/chosen": -1.0302734375, "logits/rejected": -1.0830078125, "logps/chosen": -374.7250061035156, "logps/rejected": -338.1000061035156, "loss": 0.5094, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.6305786371231079, "rewards/margins": 0.960888683795929, "rewards/rejected": -0.3307251036167145, "step": 830 }, { "epoch": 0.44280442804428044, "grad_norm": 70.00892340225593, "learning_rate": 8.894306800210858e-07, "logits/chosen": -1.202490210533142, "logits/rejected": -1.02392578125, "logps/chosen": -303.70001220703125, "logps/rejected": -301.20001220703125, "loss": 0.616, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.05787963792681694, "rewards/margins": 0.6745849847793579, "rewards/rejected": -0.732879638671875, "step": 840 }, { "epoch": 0.4480759093305219, "grad_norm": 51.06242544943851, "learning_rate": 8.881128096995255e-07, "logits/chosen": -1.0737793445587158, "logits/rejected": -0.9889465570449829, "logps/chosen": -360.4375, "logps/rejected": -322.07501220703125, "loss": 0.516, "rewards/accuracies": 0.706250011920929, "rewards/chosen": 0.23143310844898224, "rewards/margins": 0.93402099609375, "rewards/rejected": -0.701831042766571, "step": 850 }, { "epoch": 0.4533473906167633, "grad_norm": 83.61632627174093, "learning_rate": 8.867949393779651e-07, "logits/chosen": -1.1216552257537842, "logits/rejected": -1.0041015148162842, "logps/chosen": -345.0249938964844, "logps/rejected": -341.45001220703125, "loss": 0.579, "rewards/accuracies": 0.6875, "rewards/chosen": 0.13632813096046448, "rewards/margins": 0.7856200933456421, "rewards/rejected": -0.6495727300643921, "step": 860 }, { "epoch": 0.45861887190300477, "grad_norm": 77.36591261237466, "learning_rate": 8.854770690564048e-07, "logits/chosen": -1.3022949695587158, "logits/rejected": -1.1497802734375, "logps/chosen": -320.2875061035156, "logps/rejected": -307.57501220703125, "loss": 0.4922, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.4698486328125, "rewards/margins": 0.9173218011856079, "rewards/rejected": -0.44736939668655396, "step": 870 }, { "epoch": 0.46389035318924615, "grad_norm": 60.43364442782097, "learning_rate": 8.841591987348444e-07, "logits/chosen": -1.2196533679962158, "logits/rejected": -1.0296630859375, "logps/chosen": -333.75, "logps/rejected": -322.8999938964844, "loss": 0.6052, "rewards/accuracies": 0.643750011920929, "rewards/chosen": 0.5284789800643921, "rewards/margins": 0.687268078327179, "rewards/rejected": -0.15830841660499573, "step": 880 }, { "epoch": 0.4691618344754876, "grad_norm": 63.92941898422623, "learning_rate": 8.82841328413284e-07, "logits/chosen": -1.2831542491912842, "logits/rejected": -1.176513671875, "logps/chosen": -346.9750061035156, "logps/rejected": -304.20001220703125, "loss": 0.5114, "rewards/accuracies": 0.706250011920929, "rewards/chosen": 0.751879870891571, "rewards/margins": 0.891308605670929, "rewards/rejected": -0.1395263671875, "step": 890 }, { "epoch": 0.47443331576172904, "grad_norm": 59.894420579823475, "learning_rate": 8.815234580917237e-07, "logits/chosen": -1.298437476158142, "logits/rejected": -1.193597435951233, "logps/chosen": -308.4750061035156, "logps/rejected": -302.0375061035156, "loss": 0.5528, "rewards/accuracies": 0.71875, "rewards/chosen": 0.650683581829071, "rewards/margins": 0.741436779499054, "rewards/rejected": -0.09029541164636612, "step": 900 }, { "epoch": 0.4797047970479705, "grad_norm": 37.43909868256346, "learning_rate": 8.802055877701634e-07, "logits/chosen": -1.292871117591858, "logits/rejected": -1.191308617591858, "logps/chosen": -319.88751220703125, "logps/rejected": -309.2250061035156, "loss": 0.5462, "rewards/accuracies": 0.71875, "rewards/chosen": 0.5358062982559204, "rewards/margins": 0.7776428461074829, "rewards/rejected": -0.24138183891773224, "step": 910 }, { "epoch": 0.4849762783342119, "grad_norm": 81.57548465740389, "learning_rate": 8.78887717448603e-07, "logits/chosen": -1.2252929210662842, "logits/rejected": -1.0353271961212158, "logps/chosen": -320.6312561035156, "logps/rejected": -330.3500061035156, "loss": 0.5847, "rewards/accuracies": 0.731249988079071, "rewards/chosen": 0.17603759467601776, "rewards/margins": 0.820385754108429, "rewards/rejected": -0.6442199945449829, "step": 920 }, { "epoch": 0.49024775962045336, "grad_norm": 60.28851604866474, "learning_rate": 8.775698471270426e-07, "logits/chosen": -1.259423851966858, "logits/rejected": -1.137304663658142, "logps/chosen": -282.25, "logps/rejected": -279.70001220703125, "loss": 0.5498, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 0.08215942233800888, "rewards/margins": 0.8741699457168579, "rewards/rejected": -0.7915099859237671, "step": 930 }, { "epoch": 0.4955192409066948, "grad_norm": 58.81374630858948, "learning_rate": 8.762519768054823e-07, "logits/chosen": -1.2538573741912842, "logits/rejected": -1.0584075450897217, "logps/chosen": -313.88751220703125, "logps/rejected": -310.67498779296875, "loss": 0.5511, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.2953124940395355, "rewards/margins": 0.7333984375, "rewards/rejected": -0.4382690489292145, "step": 940 }, { "epoch": 0.5007907221929362, "grad_norm": 58.11458336654666, "learning_rate": 8.749341064839219e-07, "logits/chosen": -1.2639648914337158, "logits/rejected": -1.1553955078125, "logps/chosen": -359.375, "logps/rejected": -348.2250061035156, "loss": 0.5467, "rewards/accuracies": 0.71875, "rewards/chosen": 0.21343383193016052, "rewards/margins": 0.9063720703125, "rewards/rejected": -0.693347156047821, "step": 950 }, { "epoch": 0.5060622034791776, "grad_norm": 50.352101610829266, "learning_rate": 8.736162361623616e-07, "logits/chosen": -1.165869116783142, "logits/rejected": -1.0746581554412842, "logps/chosen": -367.57501220703125, "logps/rejected": -342.04998779296875, "loss": 0.493, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.03553619235754013, "rewards/margins": 0.9364379644393921, "rewards/rejected": -0.901232898235321, "step": 960 }, { "epoch": 0.5113336847654191, "grad_norm": 65.6374022252459, "learning_rate": 8.722983658408012e-07, "logits/chosen": -1.3157227039337158, "logits/rejected": -1.2541992664337158, "logps/chosen": -321.6937561035156, "logps/rejected": -305.5249938964844, "loss": 0.5701, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.13342896103858948, "rewards/margins": 0.721240222454071, "rewards/rejected": -0.8542236089706421, "step": 970 }, { "epoch": 0.5166051660516605, "grad_norm": 49.491571958430676, "learning_rate": 8.709804955192409e-07, "logits/chosen": -1.33642578125, "logits/rejected": -1.2587890625, "logps/chosen": -335.625, "logps/rejected": -305.5874938964844, "loss": 0.528, "rewards/accuracies": 0.706250011920929, "rewards/chosen": 0.05378112941980362, "rewards/margins": 0.8909912109375, "rewards/rejected": -0.8362792730331421, "step": 980 }, { "epoch": 0.521876647337902, "grad_norm": 72.33913490205428, "learning_rate": 8.696626251976805e-07, "logits/chosen": -1.334326148033142, "logits/rejected": -1.098242163658142, "logps/chosen": -301.625, "logps/rejected": -299.1000061035156, "loss": 0.4729, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -0.02826538123190403, "rewards/margins": 1.1031982898712158, "rewards/rejected": -1.1321532726287842, "step": 990 }, { "epoch": 0.5271481286241434, "grad_norm": 65.3728788957458, "learning_rate": 8.683447548761201e-07, "logits/chosen": -1.301367163658142, "logits/rejected": -1.1139647960662842, "logps/chosen": -324.3125, "logps/rejected": -338.45001220703125, "loss": 0.5399, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.035369873046875, "rewards/margins": 0.9834839105606079, "rewards/rejected": -1.0192382335662842, "step": 1000 }, { "epoch": 0.5324196099103848, "grad_norm": 45.22544296332828, "learning_rate": 8.670268845545597e-07, "logits/chosen": -1.2396423816680908, "logits/rejected": -1.1093871593475342, "logps/chosen": -353.75, "logps/rejected": -314.79998779296875, "loss": 0.4972, "rewards/accuracies": 0.706250011920929, "rewards/chosen": 0.15750733017921448, "rewards/margins": 0.984814465045929, "rewards/rejected": -0.8274261355400085, "step": 1010 }, { "epoch": 0.5376910911966263, "grad_norm": 64.7146201565407, "learning_rate": 8.657090142329995e-07, "logits/chosen": -1.3051269054412842, "logits/rejected": -1.1543457508087158, "logps/chosen": -304.10626220703125, "logps/rejected": -275.4125061035156, "loss": 0.5383, "rewards/accuracies": 0.731249988079071, "rewards/chosen": 0.11521072685718536, "rewards/margins": 0.8188232183456421, "rewards/rejected": -0.7040863037109375, "step": 1020 }, { "epoch": 0.5429625724828677, "grad_norm": 47.87129856721423, "learning_rate": 8.643911439114391e-07, "logits/chosen": -1.285791039466858, "logits/rejected": -1.191595435142517, "logps/chosen": -356.7749938964844, "logps/rejected": -335.625, "loss": 0.535, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.02147216722369194, "rewards/margins": 0.936718761920929, "rewards/rejected": -0.9150146245956421, "step": 1030 }, { "epoch": 0.5482340537691092, "grad_norm": 59.34759996755625, "learning_rate": 8.630732735898787e-07, "logits/chosen": -1.2230346202850342, "logits/rejected": -1.0718505382537842, "logps/chosen": -321.75, "logps/rejected": -318.67498779296875, "loss": 0.5861, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.0031036376021802425, "rewards/margins": 0.88958740234375, "rewards/rejected": -0.8868514895439148, "step": 1040 }, { "epoch": 0.5535055350553506, "grad_norm": 47.102887530004665, "learning_rate": 8.617554032683183e-07, "logits/chosen": -1.412988305091858, "logits/rejected": -1.1786620616912842, "logps/chosen": -291.07501220703125, "logps/rejected": -292.29998779296875, "loss": 0.5947, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.033538818359375, "rewards/margins": 0.810375988483429, "rewards/rejected": -0.844146728515625, "step": 1050 }, { "epoch": 0.5587770163415919, "grad_norm": 79.58591964711613, "learning_rate": 8.60437532946758e-07, "logits/chosen": -1.253076195716858, "logits/rejected": -1.157958984375, "logps/chosen": -338.07501220703125, "logps/rejected": -289.98126220703125, "loss": 0.6346, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.10137633979320526, "rewards/margins": 0.7296508550643921, "rewards/rejected": -0.6286376714706421, "step": 1060 }, { "epoch": 0.5640484976278334, "grad_norm": 52.15710570358783, "learning_rate": 8.591196626251977e-07, "logits/chosen": -1.350000023841858, "logits/rejected": -1.126245141029358, "logps/chosen": -340.2250061035156, "logps/rejected": -332.86248779296875, "loss": 0.4906, "rewards/accuracies": 0.78125, "rewards/chosen": 0.18063049018383026, "rewards/margins": 1.0172851085662842, "rewards/rejected": -0.8362518548965454, "step": 1070 }, { "epoch": 0.5693199789140748, "grad_norm": 45.554204233836884, "learning_rate": 8.578017923036373e-07, "logits/chosen": -1.258056640625, "logits/rejected": -1.13134765625, "logps/chosen": -341.73748779296875, "logps/rejected": -354.4750061035156, "loss": 0.5121, "rewards/accuracies": 0.731249988079071, "rewards/chosen": 0.07855834811925888, "rewards/margins": 1.0057861804962158, "rewards/rejected": -0.9269164800643921, "step": 1080 }, { "epoch": 0.5745914602003163, "grad_norm": 47.20122525245549, "learning_rate": 8.56483921982077e-07, "logits/chosen": -1.2716796398162842, "logits/rejected": -1.1747558116912842, "logps/chosen": -309.0249938964844, "logps/rejected": -312.57501220703125, "loss": 0.5909, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.18494263291358948, "rewards/margins": 0.709704577922821, "rewards/rejected": -0.8942550420761108, "step": 1090 }, { "epoch": 0.5798629414865577, "grad_norm": 68.02059462340942, "learning_rate": 8.551660516605166e-07, "logits/chosen": -1.270849585533142, "logits/rejected": -1.0932128429412842, "logps/chosen": -342.1875, "logps/rejected": -312.38751220703125, "loss": 0.5743, "rewards/accuracies": 0.6875, "rewards/chosen": -0.15819701552391052, "rewards/margins": 0.749420166015625, "rewards/rejected": -0.908428966999054, "step": 1100 }, { "epoch": 0.5851344227727991, "grad_norm": 57.87266794997134, "learning_rate": 8.538481813389562e-07, "logits/chosen": -1.2527344226837158, "logits/rejected": -1.2020080089569092, "logps/chosen": -332.9624938964844, "logps/rejected": -300.13751220703125, "loss": 0.6102, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.078643798828125, "rewards/margins": 0.6665283441543579, "rewards/rejected": -0.7452179193496704, "step": 1110 }, { "epoch": 0.5904059040590406, "grad_norm": 70.75830642204807, "learning_rate": 8.525303110173958e-07, "logits/chosen": -1.2716796398162842, "logits/rejected": -1.136010766029358, "logps/chosen": -310.8999938964844, "logps/rejected": -301.8500061035156, "loss": 0.518, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.06049804762005806, "rewards/margins": 0.766186535358429, "rewards/rejected": -0.8269866704940796, "step": 1120 }, { "epoch": 0.595677385345282, "grad_norm": 53.09529052799547, "learning_rate": 8.512124406958356e-07, "logits/chosen": -1.078588843345642, "logits/rejected": -0.9752197265625, "logps/chosen": -348.4125061035156, "logps/rejected": -294.9125061035156, "loss": 0.6279, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.0232086181640625, "rewards/margins": 0.6390746831893921, "rewards/rejected": -0.6630859375, "step": 1130 }, { "epoch": 0.6009488666315235, "grad_norm": 54.08036927538122, "learning_rate": 8.498945703742752e-07, "logits/chosen": -1.1586425304412842, "logits/rejected": -1.116308569908142, "logps/chosen": -337.04376220703125, "logps/rejected": -326.42498779296875, "loss": 0.5775, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.17920836806297302, "rewards/margins": 0.7008606195449829, "rewards/rejected": -0.5223144292831421, "step": 1140 }, { "epoch": 0.6062203479177649, "grad_norm": 62.71299107232606, "learning_rate": 8.485767000527148e-07, "logits/chosen": -1.251953125, "logits/rejected": -1.118749976158142, "logps/chosen": -304.1000061035156, "logps/rejected": -318.70001220703125, "loss": 0.5677, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 0.15980224311351776, "rewards/margins": 0.6598755121231079, "rewards/rejected": -0.500244140625, "step": 1150 }, { "epoch": 0.6114918292040064, "grad_norm": 63.29960573318622, "learning_rate": 8.472588297311544e-07, "logits/chosen": -1.172607421875, "logits/rejected": -1.141748070716858, "logps/chosen": -362.23748779296875, "logps/rejected": -326.75, "loss": 0.5754, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.00470733642578125, "rewards/margins": 0.8082641363143921, "rewards/rejected": -0.812823474407196, "step": 1160 }, { "epoch": 0.6167633104902478, "grad_norm": 55.32628388980585, "learning_rate": 8.459409594095941e-07, "logits/chosen": -1.1879394054412842, "logits/rejected": -1.038842797279358, "logps/chosen": -331.6875, "logps/rejected": -327.5249938964844, "loss": 0.5093, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.04124755784869194, "rewards/margins": 0.950396716594696, "rewards/rejected": -0.908587634563446, "step": 1170 }, { "epoch": 0.6220347917764892, "grad_norm": 49.55626751321092, "learning_rate": 8.446230890880337e-07, "logits/chosen": -1.209204077720642, "logits/rejected": -1.0579102039337158, "logps/chosen": -334.7875061035156, "logps/rejected": -367.07501220703125, "loss": 0.5656, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.26047974824905396, "rewards/margins": 0.887133777141571, "rewards/rejected": -0.625561535358429, "step": 1180 }, { "epoch": 0.6273062730627307, "grad_norm": 80.8584887531583, "learning_rate": 8.433052187664734e-07, "logits/chosen": -1.168798804283142, "logits/rejected": -1.14501953125, "logps/chosen": -330.6000061035156, "logps/rejected": -288.45001220703125, "loss": 0.4866, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": 0.4881347715854645, "rewards/margins": 1.048883080482483, "rewards/rejected": -0.5616394281387329, "step": 1190 }, { "epoch": 0.632577754348972, "grad_norm": 76.24241983413938, "learning_rate": 8.419873484449131e-07, "logits/chosen": -1.011376976966858, "logits/rejected": -0.942675769329071, "logps/chosen": -357.4624938964844, "logps/rejected": -318.4624938964844, "loss": 0.5315, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.4569335877895355, "rewards/margins": 0.925762951374054, "rewards/rejected": -0.46925657987594604, "step": 1200 }, { "epoch": 0.6378492356352135, "grad_norm": 55.3366395126279, "learning_rate": 8.406694781233526e-07, "logits/chosen": -1.203125, "logits/rejected": -1.045922875404358, "logps/chosen": -321.45001220703125, "logps/rejected": -351.2749938964844, "loss": 0.5336, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.43558961153030396, "rewards/margins": 0.919384777545929, "rewards/rejected": -0.48408812284469604, "step": 1210 }, { "epoch": 0.6431207169214549, "grad_norm": 58.13060123399191, "learning_rate": 8.393516078017922e-07, "logits/chosen": -1.2168457508087158, "logits/rejected": -0.9294189214706421, "logps/chosen": -325.36248779296875, "logps/rejected": -309.92498779296875, "loss": 0.4879, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": 0.665637195110321, "rewards/margins": 1.090063452720642, "rewards/rejected": -0.42341917753219604, "step": 1220 }, { "epoch": 0.6483921982076963, "grad_norm": 76.26079673697735, "learning_rate": 8.380337374802318e-07, "logits/chosen": -1.16937255859375, "logits/rejected": -1.04052734375, "logps/chosen": -367.54998779296875, "logps/rejected": -324.1499938964844, "loss": 0.6036, "rewards/accuracies": 0.6875, "rewards/chosen": 0.180145263671875, "rewards/margins": 1.004724144935608, "rewards/rejected": -0.82513427734375, "step": 1230 }, { "epoch": 0.6536636794939378, "grad_norm": 63.135430523920334, "learning_rate": 8.367158671586716e-07, "logits/chosen": -1.246679663658142, "logits/rejected": -1.0513427257537842, "logps/chosen": -336.6875, "logps/rejected": -302.86248779296875, "loss": 0.553, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.06580200046300888, "rewards/margins": 0.9522949457168579, "rewards/rejected": -1.017645239830017, "step": 1240 }, { "epoch": 0.6589351607801792, "grad_norm": 71.8015028944644, "learning_rate": 8.353979968371112e-07, "logits/chosen": -1.2221190929412842, "logits/rejected": -1.015380859375, "logps/chosen": -359.63751220703125, "logps/rejected": -307.1499938964844, "loss": 0.6004, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.371054083108902, "rewards/margins": 0.7677978277206421, "rewards/rejected": -1.1385376453399658, "step": 1250 }, { "epoch": 0.6642066420664207, "grad_norm": 65.68807510021202, "learning_rate": 8.340801265155508e-07, "logits/chosen": -1.1875488758087158, "logits/rejected": -1.060693383216858, "logps/chosen": -305.38751220703125, "logps/rejected": -293.1499938964844, "loss": 0.5508, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.08203125, "rewards/margins": 0.975140392780304, "rewards/rejected": -1.0576050281524658, "step": 1260 }, { "epoch": 0.6694781233526621, "grad_norm": 55.99253591165247, "learning_rate": 8.327622561939904e-07, "logits/chosen": -1.2821776866912842, "logits/rejected": -1.088623046875, "logps/chosen": -333.01251220703125, "logps/rejected": -349.625, "loss": 0.5536, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.17212525010108948, "rewards/margins": 0.781054675579071, "rewards/rejected": -0.6090087890625, "step": 1270 }, { "epoch": 0.6747496046389035, "grad_norm": 54.210467003450134, "learning_rate": 8.314443858724301e-07, "logits/chosen": -1.22900390625, "logits/rejected": -1.053857445716858, "logps/chosen": -316.4750061035156, "logps/rejected": -320.1000061035156, "loss": 0.5535, "rewards/accuracies": 0.6875, "rewards/chosen": 0.2935638427734375, "rewards/margins": 0.76861572265625, "rewards/rejected": -0.47434693574905396, "step": 1280 }, { "epoch": 0.680021085925145, "grad_norm": 63.212529012215505, "learning_rate": 8.301265155508697e-07, "logits/chosen": -1.245361328125, "logits/rejected": -1.208593726158142, "logps/chosen": -334.8500061035156, "logps/rejected": -292.23748779296875, "loss": 0.5947, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 0.3288940489292145, "rewards/margins": 0.75482177734375, "rewards/rejected": -0.4261108338832855, "step": 1290 }, { "epoch": 0.6852925672113864, "grad_norm": 69.46659185125904, "learning_rate": 8.288086452293094e-07, "logits/chosen": -1.2550780773162842, "logits/rejected": -1.17822265625, "logps/chosen": -326.51251220703125, "logps/rejected": -300.625, "loss": 0.5186, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.3138976991176605, "rewards/margins": 0.849780261516571, "rewards/rejected": -0.5353943109512329, "step": 1300 }, { "epoch": 0.6905640484976279, "grad_norm": 65.35866971276957, "learning_rate": 8.27490774907749e-07, "logits/chosen": -1.2815430164337158, "logits/rejected": -1.178320288658142, "logps/chosen": -356.9750061035156, "logps/rejected": -324.5249938964844, "loss": 0.5493, "rewards/accuracies": 0.6875, "rewards/chosen": 0.47030335664749146, "rewards/margins": 0.791638195514679, "rewards/rejected": -0.321624755859375, "step": 1310 }, { "epoch": 0.6958355297838693, "grad_norm": 49.55858750858437, "learning_rate": 8.261729045861887e-07, "logits/chosen": -1.2809569835662842, "logits/rejected": -1.2249023914337158, "logps/chosen": -343.6875, "logps/rejected": -325.67498779296875, "loss": 0.5361, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.41257935762405396, "rewards/margins": 0.9251953363418579, "rewards/rejected": -0.512194812297821, "step": 1320 }, { "epoch": 0.7011070110701108, "grad_norm": 47.75115276319353, "learning_rate": 8.248550342646283e-07, "logits/chosen": -1.449072241783142, "logits/rejected": -1.1931641101837158, "logps/chosen": -314.9750061035156, "logps/rejected": -337.75, "loss": 0.4722, "rewards/accuracies": 0.768750011920929, "rewards/chosen": 0.283761590719223, "rewards/margins": 0.9835144281387329, "rewards/rejected": -0.6996825933456421, "step": 1330 }, { "epoch": 0.7063784923563521, "grad_norm": 44.37114041376355, "learning_rate": 8.235371639430679e-07, "logits/chosen": -1.3024413585662842, "logits/rejected": -1.1702392101287842, "logps/chosen": -357.11248779296875, "logps/rejected": -299.4750061035156, "loss": 0.4758, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": 0.1726531982421875, "rewards/margins": 0.998120129108429, "rewards/rejected": -0.825695812702179, "step": 1340 }, { "epoch": 0.7116499736425935, "grad_norm": 71.05619333072296, "learning_rate": 8.222192936215076e-07, "logits/chosen": -1.398046851158142, "logits/rejected": -1.2692382335662842, "logps/chosen": -344.6875, "logps/rejected": -317.7875061035156, "loss": 0.6614, "rewards/accuracies": 0.625, "rewards/chosen": -0.12794189155101776, "rewards/margins": 0.6993774175643921, "rewards/rejected": -0.8280426263809204, "step": 1350 }, { "epoch": 0.716921454928835, "grad_norm": 54.25885672638561, "learning_rate": 8.209014232999473e-07, "logits/chosen": -1.174707055091858, "logits/rejected": -1.0821654796600342, "logps/chosen": -325.88751220703125, "logps/rejected": -317.76251220703125, "loss": 0.4655, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.05783386155962944, "rewards/margins": 1.112524390220642, "rewards/rejected": -1.054113745689392, "step": 1360 }, { "epoch": 0.7221929362150764, "grad_norm": 71.41409441964335, "learning_rate": 8.195835529783869e-07, "logits/chosen": -1.209814429283142, "logits/rejected": -1.001440405845642, "logps/chosen": -338.4375, "logps/rejected": -338.70001220703125, "loss": 0.4581, "rewards/accuracies": 0.768750011920929, "rewards/chosen": 0.24586638808250427, "rewards/margins": 1.150415062904358, "rewards/rejected": -0.9040771722793579, "step": 1370 }, { "epoch": 0.7274644175013179, "grad_norm": 42.19588082570137, "learning_rate": 8.182656826568265e-07, "logits/chosen": -1.221166968345642, "logits/rejected": -1.164648413658142, "logps/chosen": -339.95001220703125, "logps/rejected": -313.6499938964844, "loss": 0.4386, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.3129638731479645, "rewards/margins": 1.133215308189392, "rewards/rejected": -0.8203185796737671, "step": 1380 }, { "epoch": 0.7327358987875593, "grad_norm": 53.862974008070836, "learning_rate": 8.169478123352662e-07, "logits/chosen": -1.3896484375, "logits/rejected": -1.209313988685608, "logps/chosen": -297.0, "logps/rejected": -313.2749938964844, "loss": 0.549, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.03078002855181694, "rewards/margins": 1.056616187095642, "rewards/rejected": -1.0885009765625, "step": 1390 }, { "epoch": 0.7380073800738007, "grad_norm": 57.53142722222553, "learning_rate": 8.156299420137058e-07, "logits/chosen": -1.2732422351837158, "logits/rejected": -1.051367163658142, "logps/chosen": -347.4312438964844, "logps/rejected": -331.51251220703125, "loss": 0.4337, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.004077148623764515, "rewards/margins": 1.320196509361267, "rewards/rejected": -1.3164794445037842, "step": 1400 }, { "epoch": 0.7432788613600422, "grad_norm": 56.54800253659448, "learning_rate": 8.143120716921455e-07, "logits/chosen": -1.2448241710662842, "logits/rejected": -1.1424682140350342, "logps/chosen": -380.7749938964844, "logps/rejected": -334.29998779296875, "loss": 0.5863, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.10785827785730362, "rewards/margins": 0.997631847858429, "rewards/rejected": -1.1063873767852783, "step": 1410 }, { "epoch": 0.7485503426462836, "grad_norm": 47.32247575371337, "learning_rate": 8.129942013705851e-07, "logits/chosen": -1.214086890220642, "logits/rejected": -1.211035132408142, "logps/chosen": -301.01251220703125, "logps/rejected": -263.01251220703125, "loss": 0.5128, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": 0.07247314602136612, "rewards/margins": 0.9266723394393921, "rewards/rejected": -0.854296863079071, "step": 1420 }, { "epoch": 0.7538218239325251, "grad_norm": 61.57021741610989, "learning_rate": 8.116763310490248e-07, "logits/chosen": -1.39892578125, "logits/rejected": -1.2280762195587158, "logps/chosen": -320.7250061035156, "logps/rejected": -307.92498779296875, "loss": 0.503, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.49089354276657104, "rewards/margins": 1.0941162109375, "rewards/rejected": -0.60302734375, "step": 1430 }, { "epoch": 0.7590933052187665, "grad_norm": 50.052799013423254, "learning_rate": 8.103584607274644e-07, "logits/chosen": -1.154443383216858, "logits/rejected": -1.154809594154358, "logps/chosen": -338.8374938964844, "logps/rejected": -298.4750061035156, "loss": 0.5296, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.5398803949356079, "rewards/margins": 0.9178466796875, "rewards/rejected": -0.37744140625, "step": 1440 }, { "epoch": 0.7643647865050079, "grad_norm": 80.6021724659396, "learning_rate": 8.09040590405904e-07, "logits/chosen": -1.300195336341858, "logits/rejected": -1.229711890220642, "logps/chosen": -322.4750061035156, "logps/rejected": -295.79998779296875, "loss": 0.5957, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 0.15181884169578552, "rewards/margins": 0.7458740472793579, "rewards/rejected": -0.5945068597793579, "step": 1450 }, { "epoch": 0.7696362677912494, "grad_norm": 62.1870433302889, "learning_rate": 8.077227200843436e-07, "logits/chosen": -1.2451660633087158, "logits/rejected": -1.0498046875, "logps/chosen": -359.53125, "logps/rejected": -314.17498779296875, "loss": 0.5411, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 0.03188476711511612, "rewards/margins": 1.0545654296875, "rewards/rejected": -1.023461937904358, "step": 1460 }, { "epoch": 0.7749077490774908, "grad_norm": 78.13570368118985, "learning_rate": 8.064048497627834e-07, "logits/chosen": -1.2900879383087158, "logits/rejected": -0.998242199420929, "logps/chosen": -321.1625061035156, "logps/rejected": -324.3500061035156, "loss": 0.5354, "rewards/accuracies": 0.706250011920929, "rewards/chosen": 0.00786743126809597, "rewards/margins": 0.936999499797821, "rewards/rejected": -0.9305419921875, "step": 1470 }, { "epoch": 0.7801792303637322, "grad_norm": 48.4523172871712, "learning_rate": 8.05086979441223e-07, "logits/chosen": -1.218042016029358, "logits/rejected": -1.158258080482483, "logps/chosen": -378.0, "logps/rejected": -321.92498779296875, "loss": 0.4912, "rewards/accuracies": 0.75, "rewards/chosen": 0.04661712795495987, "rewards/margins": 1.075524926185608, "rewards/rejected": -1.028222680091858, "step": 1480 }, { "epoch": 0.7854507116499736, "grad_norm": 62.010592501293296, "learning_rate": 8.037691091196626e-07, "logits/chosen": -1.393457055091858, "logits/rejected": -1.2997558116912842, "logps/chosen": -336.0625, "logps/rejected": -310.61248779296875, "loss": 0.5444, "rewards/accuracies": 0.706250011920929, "rewards/chosen": 0.07761230319738388, "rewards/margins": 0.823962390422821, "rewards/rejected": -0.7469848394393921, "step": 1490 }, { "epoch": 0.790722192936215, "grad_norm": 69.29495857199251, "learning_rate": 8.024512387981023e-07, "logits/chosen": -1.165917992591858, "logits/rejected": -1.194580078125, "logps/chosen": -373.9125061035156, "logps/rejected": -324.3999938964844, "loss": 0.5514, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 0.3360961973667145, "rewards/margins": 0.88092041015625, "rewards/rejected": -0.544848620891571, "step": 1500 }, { "epoch": 0.7959936742224565, "grad_norm": 49.763221392218576, "learning_rate": 8.011333684765419e-07, "logits/chosen": -1.343164086341858, "logits/rejected": -1.156640648841858, "logps/chosen": -339.1499938964844, "logps/rejected": -310.57501220703125, "loss": 0.5632, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 0.503814697265625, "rewards/margins": 0.826916515827179, "rewards/rejected": -0.3234924376010895, "step": 1510 }, { "epoch": 0.8012651555086979, "grad_norm": 68.78085531539818, "learning_rate": 7.998154981549815e-07, "logits/chosen": -1.3787109851837158, "logits/rejected": -1.140716552734375, "logps/chosen": -326.21875, "logps/rejected": -308.6000061035156, "loss": 0.5006, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": 0.710156261920929, "rewards/margins": 1.0765380859375, "rewards/rejected": -0.366058349609375, "step": 1520 }, { "epoch": 0.8065366367949394, "grad_norm": 58.16102543098187, "learning_rate": 7.984976278334212e-07, "logits/chosen": -1.3406493663787842, "logits/rejected": -1.136254906654358, "logps/chosen": -313.7250061035156, "logps/rejected": -293.1499938964844, "loss": 0.6111, "rewards/accuracies": 0.643750011920929, "rewards/chosen": 0.11670532077550888, "rewards/margins": 0.8188720941543579, "rewards/rejected": -0.701153576374054, "step": 1530 }, { "epoch": 0.8118081180811808, "grad_norm": 43.44531241040939, "learning_rate": 7.971797575118609e-07, "logits/chosen": -1.3424804210662842, "logits/rejected": -1.2604553699493408, "logps/chosen": -306.09375, "logps/rejected": -293.3999938964844, "loss": 0.5488, "rewards/accuracies": 0.706250011920929, "rewards/chosen": 0.14827270805835724, "rewards/margins": 0.9640747308731079, "rewards/rejected": -0.815600574016571, "step": 1540 }, { "epoch": 0.8170795993674222, "grad_norm": 55.20760897322391, "learning_rate": 7.958618871903005e-07, "logits/chosen": -1.257470726966858, "logits/rejected": -1.053381323814392, "logps/chosen": -315.625, "logps/rejected": -307.15625, "loss": 0.5888, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": 0.21961060166358948, "rewards/margins": 0.7732177972793579, "rewards/rejected": -0.553906261920929, "step": 1550 }, { "epoch": 0.8223510806536637, "grad_norm": 52.41557662310013, "learning_rate": 7.9454401686874e-07, "logits/chosen": -1.3258788585662842, "logits/rejected": -1.147705078125, "logps/chosen": -331.625, "logps/rejected": -297.13751220703125, "loss": 0.5024, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.35887449979782104, "rewards/margins": 0.957629382610321, "rewards/rejected": -0.5986328125, "step": 1560 }, { "epoch": 0.8276225619399051, "grad_norm": 72.83904594431297, "learning_rate": 7.932261465471796e-07, "logits/chosen": -1.283544898033142, "logits/rejected": -1.208276391029358, "logps/chosen": -313.3500061035156, "logps/rejected": -306.15625, "loss": 0.5168, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": 0.2723327577114105, "rewards/margins": 0.913037121295929, "rewards/rejected": -0.6398864984512329, "step": 1570 }, { "epoch": 0.8328940432261466, "grad_norm": 46.342876537780015, "learning_rate": 7.919082762256194e-07, "logits/chosen": -1.2018554210662842, "logits/rejected": -1.0409667491912842, "logps/chosen": -319.3999938964844, "logps/rejected": -307.01251220703125, "loss": 0.5351, "rewards/accuracies": 0.731249988079071, "rewards/chosen": 0.14931640028953552, "rewards/margins": 0.9189453125, "rewards/rejected": -0.7678741216659546, "step": 1580 }, { "epoch": 0.838165524512388, "grad_norm": 52.44979614364282, "learning_rate": 7.90590405904059e-07, "logits/chosen": -1.1926758289337158, "logits/rejected": -1.088598608970642, "logps/chosen": -309.75, "logps/rejected": -304.0375061035156, "loss": 0.5559, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.08772888034582138, "rewards/margins": 0.9941650629043579, "rewards/rejected": -0.905749499797821, "step": 1590 }, { "epoch": 0.8434370057986295, "grad_norm": 67.20916837990738, "learning_rate": 7.892725355824986e-07, "logits/chosen": -1.317773461341858, "logits/rejected": -1.157922387123108, "logps/chosen": -310.54998779296875, "logps/rejected": -281.86248779296875, "loss": 0.4725, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": 0.20413818955421448, "rewards/margins": 1.162744164466858, "rewards/rejected": -0.9588943719863892, "step": 1600 }, { "epoch": 0.8487084870848709, "grad_norm": 55.3242630789027, "learning_rate": 7.879546652609382e-07, "logits/chosen": -1.194677710533142, "logits/rejected": -1.10498046875, "logps/chosen": -328.8999938964844, "logps/rejected": -303.61248779296875, "loss": 0.5365, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.06099243089556694, "rewards/margins": 0.780169665813446, "rewards/rejected": -0.8412841558456421, "step": 1610 }, { "epoch": 0.8539799683711122, "grad_norm": 53.50270625646187, "learning_rate": 7.866367949393779e-07, "logits/chosen": -1.260498046875, "logits/rejected": -1.097753882408142, "logps/chosen": -330.92498779296875, "logps/rejected": -327.0249938964844, "loss": 0.5881, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.15104980766773224, "rewards/margins": 0.843798816204071, "rewards/rejected": -0.995043933391571, "step": 1620 }, { "epoch": 0.8592514496573537, "grad_norm": 50.542044475111965, "learning_rate": 7.853189246178175e-07, "logits/chosen": -1.312768578529358, "logits/rejected": -1.091333031654358, "logps/chosen": -317.98748779296875, "logps/rejected": -299.4125061035156, "loss": 0.4686, "rewards/accuracies": 0.731249988079071, "rewards/chosen": 0.06982727348804474, "rewards/margins": 1.0299072265625, "rewards/rejected": -0.9608215093612671, "step": 1630 }, { "epoch": 0.8645229309435951, "grad_norm": 46.8519121680924, "learning_rate": 7.840010542962572e-07, "logits/chosen": -1.201074242591858, "logits/rejected": -1.053808569908142, "logps/chosen": -341.7250061035156, "logps/rejected": -332.11248779296875, "loss": 0.5019, "rewards/accuracies": 0.731249988079071, "rewards/chosen": 0.19959335029125214, "rewards/margins": 1.0611572265625, "rewards/rejected": -0.8606628179550171, "step": 1640 }, { "epoch": 0.8697944122298366, "grad_norm": 53.32567133611255, "learning_rate": 7.826831839746969e-07, "logits/chosen": -1.3486328125, "logits/rejected": -1.2105712890625, "logps/chosen": -322.79998779296875, "logps/rejected": -329.82501220703125, "loss": 0.5305, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 0.10155029594898224, "rewards/margins": 0.9431396722793579, "rewards/rejected": -0.8419433832168579, "step": 1650 }, { "epoch": 0.875065893516078, "grad_norm": 63.732421189650154, "learning_rate": 7.813653136531365e-07, "logits/chosen": -1.385156273841858, "logits/rejected": -1.2333495616912842, "logps/chosen": -322.4375, "logps/rejected": -309.07501220703125, "loss": 0.5393, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.11317749321460724, "rewards/margins": 1.108300805091858, "rewards/rejected": -0.99493408203125, "step": 1660 }, { "epoch": 0.8803373748023194, "grad_norm": 41.484619379442, "learning_rate": 7.800474433315761e-07, "logits/chosen": -1.3026854991912842, "logits/rejected": -1.128625512123108, "logps/chosen": -329.3999938964844, "logps/rejected": -345.4750061035156, "loss": 0.4746, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": 0.11607971042394638, "rewards/margins": 1.0651366710662842, "rewards/rejected": -0.9496093988418579, "step": 1670 }, { "epoch": 0.8856088560885609, "grad_norm": 50.520914942861936, "learning_rate": 7.787295730100157e-07, "logits/chosen": -1.2868163585662842, "logits/rejected": -1.2056152820587158, "logps/chosen": -362.25, "logps/rejected": -328.73748779296875, "loss": 0.5309, "rewards/accuracies": 0.706250011920929, "rewards/chosen": 0.02480163611471653, "rewards/margins": 1.1927703619003296, "rewards/rejected": -1.16693115234375, "step": 1680 }, { "epoch": 0.8908803373748023, "grad_norm": 66.13900391905159, "learning_rate": 7.774117026884554e-07, "logits/chosen": -1.3745605945587158, "logits/rejected": -1.3026123046875, "logps/chosen": -330.86248779296875, "logps/rejected": -288.88751220703125, "loss": 0.467, "rewards/accuracies": 0.75, "rewards/chosen": 0.11585082858800888, "rewards/margins": 1.1682617664337158, "rewards/rejected": -1.053735375404358, "step": 1690 }, { "epoch": 0.8961518186610438, "grad_norm": 46.60165109526856, "learning_rate": 7.760938323668951e-07, "logits/chosen": -1.2632324695587158, "logits/rejected": -1.2293212413787842, "logps/chosen": -346.1499938964844, "logps/rejected": -330.79998779296875, "loss": 0.5121, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.115692138671875, "rewards/margins": 1.085717797279358, "rewards/rejected": -1.200433373451233, "step": 1700 }, { "epoch": 0.9014232999472852, "grad_norm": 62.399952922959116, "learning_rate": 7.747759620453347e-07, "logits/chosen": -1.4072265625, "logits/rejected": -1.3752930164337158, "logps/chosen": -337.2875061035156, "logps/rejected": -313.1000061035156, "loss": 0.5803, "rewards/accuracies": 0.65625, "rewards/chosen": -0.4058776795864105, "rewards/margins": 0.8641723394393921, "rewards/rejected": -1.2706024646759033, "step": 1710 }, { "epoch": 0.9066947812335266, "grad_norm": 37.97844784366482, "learning_rate": 7.734580917237743e-07, "logits/chosen": -1.3935546875, "logits/rejected": -1.1694214344024658, "logps/chosen": -314.04998779296875, "logps/rejected": -303.29998779296875, "loss": 0.4767, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.49308472871780396, "rewards/margins": 1.1453125476837158, "rewards/rejected": -1.638146996498108, "step": 1720 }, { "epoch": 0.9119662625197681, "grad_norm": 59.65001016781537, "learning_rate": 7.72140221402214e-07, "logits/chosen": -1.3466796875, "logits/rejected": -1.324121117591858, "logps/chosen": -356.17498779296875, "logps/rejected": -308.3125, "loss": 0.4473, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.3027282655239105, "rewards/margins": 1.433203101158142, "rewards/rejected": -1.734442114830017, "step": 1730 }, { "epoch": 0.9172377438060095, "grad_norm": 42.701703095241406, "learning_rate": 7.708223510806536e-07, "logits/chosen": -1.4043457508087158, "logits/rejected": -1.3674805164337158, "logps/chosen": -344.04998779296875, "logps/rejected": -315.07501220703125, "loss": 0.5182, "rewards/accuracies": 0.71875, "rewards/chosen": -0.590466320514679, "rewards/margins": 1.0505249500274658, "rewards/rejected": -1.6414062976837158, "step": 1740 }, { "epoch": 0.922509225092251, "grad_norm": 43.148394321661776, "learning_rate": 7.695044807590932e-07, "logits/chosen": -1.403710961341858, "logits/rejected": -1.1427795886993408, "logps/chosen": -309.54998779296875, "logps/rejected": -325.38751220703125, "loss": 0.5382, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.6145874261856079, "rewards/margins": 0.983837902545929, "rewards/rejected": -1.597998023033142, "step": 1750 }, { "epoch": 0.9277807063784923, "grad_norm": 63.742633207025584, "learning_rate": 7.68186610437533e-07, "logits/chosen": -1.299902319908142, "logits/rejected": -1.181249976158142, "logps/chosen": -337.4937438964844, "logps/rejected": -348.6000061035156, "loss": 0.5881, "rewards/accuracies": 0.65625, "rewards/chosen": -0.6219848394393921, "rewards/margins": 0.782275378704071, "rewards/rejected": -1.4047973155975342, "step": 1760 }, { "epoch": 0.9330521876647337, "grad_norm": 40.671976526276794, "learning_rate": 7.668687401159726e-07, "logits/chosen": -1.297460913658142, "logits/rejected": -1.113195776939392, "logps/chosen": -315.5375061035156, "logps/rejected": -306.04998779296875, "loss": 0.571, "rewards/accuracies": 0.6875, "rewards/chosen": -0.3244872987270355, "rewards/margins": 0.888073742389679, "rewards/rejected": -1.2126922607421875, "step": 1770 }, { "epoch": 0.9383236689509752, "grad_norm": 45.79349213050961, "learning_rate": 7.655508697944122e-07, "logits/chosen": -1.16357421875, "logits/rejected": -1.156640648841858, "logps/chosen": -363.29998779296875, "logps/rejected": -312.875, "loss": 0.4816, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": 0.03574066236615181, "rewards/margins": 1.0858154296875, "rewards/rejected": -1.0502135753631592, "step": 1780 }, { "epoch": 0.9435951502372166, "grad_norm": 52.88112244356676, "learning_rate": 7.642329994728518e-07, "logits/chosen": -1.362207055091858, "logits/rejected": -1.2415039539337158, "logps/chosen": -340.8125, "logps/rejected": -335.57501220703125, "loss": 0.5371, "rewards/accuracies": 0.6875, "rewards/chosen": 3.662109520519152e-05, "rewards/margins": 0.873242199420929, "rewards/rejected": -0.8724120855331421, "step": 1790 }, { "epoch": 0.9488666315234581, "grad_norm": 61.39969375049028, "learning_rate": 7.629151291512915e-07, "logits/chosen": -1.369042992591858, "logits/rejected": -1.2691650390625, "logps/chosen": -298.70001220703125, "logps/rejected": -318.26251220703125, "loss": 0.4638, "rewards/accuracies": 0.71875, "rewards/chosen": -0.04978637769818306, "rewards/margins": 1.058569312095642, "rewards/rejected": -1.108740210533142, "step": 1800 }, { "epoch": 0.9541381128096995, "grad_norm": 42.83603216548615, "learning_rate": 7.615972588297312e-07, "logits/chosen": -1.2744140625, "logits/rejected": -1.20550537109375, "logps/chosen": -318.0249938964844, "logps/rejected": -298.38751220703125, "loss": 0.5277, "rewards/accuracies": 0.731249988079071, "rewards/chosen": 0.03697509691119194, "rewards/margins": 1.052978515625, "rewards/rejected": -1.0152435302734375, "step": 1810 }, { "epoch": 0.959409594095941, "grad_norm": 60.21262615221071, "learning_rate": 7.602793885081708e-07, "logits/chosen": -1.298486351966858, "logits/rejected": -1.296875, "logps/chosen": -336.7875061035156, "logps/rejected": -314.13751220703125, "loss": 0.5317, "rewards/accuracies": 0.706250011920929, "rewards/chosen": 0.10150756686925888, "rewards/margins": 1.0389220714569092, "rewards/rejected": -0.937347412109375, "step": 1820 }, { "epoch": 0.9646810753821824, "grad_norm": 30.29083476525765, "learning_rate": 7.589615181866104e-07, "logits/chosen": -1.2199218273162842, "logits/rejected": -1.0191161632537842, "logps/chosen": -320.8999938964844, "logps/rejected": -305.6625061035156, "loss": 0.4907, "rewards/accuracies": 0.75, "rewards/chosen": 0.22072753310203552, "rewards/margins": 1.202123999595642, "rewards/rejected": -0.9807494878768921, "step": 1830 }, { "epoch": 0.9699525566684238, "grad_norm": 49.80908168240111, "learning_rate": 7.576436478650501e-07, "logits/chosen": -1.2703368663787842, "logits/rejected": -1.115136742591858, "logps/chosen": -353.04998779296875, "logps/rejected": -308.76251220703125, "loss": 0.538, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.3444457948207855, "rewards/margins": 0.876782238483429, "rewards/rejected": -0.532635509967804, "step": 1840 }, { "epoch": 0.9752240379546653, "grad_norm": 57.2950942341527, "learning_rate": 7.563257775434897e-07, "logits/chosen": -1.2339355945587158, "logits/rejected": -1.121972680091858, "logps/chosen": -334.9750061035156, "logps/rejected": -316.48748779296875, "loss": 0.5151, "rewards/accuracies": 0.731249988079071, "rewards/chosen": 0.06648559868335724, "rewards/margins": 1.22772216796875, "rewards/rejected": -1.1603882312774658, "step": 1850 }, { "epoch": 0.9804955192409067, "grad_norm": 55.10318242761449, "learning_rate": 7.550079072219293e-07, "logits/chosen": -1.252539038658142, "logits/rejected": -1.187597632408142, "logps/chosen": -322.7250061035156, "logps/rejected": -322.0375061035156, "loss": 0.5925, "rewards/accuracies": 0.71875, "rewards/chosen": 0.11102294921875, "rewards/margins": 0.9586426019668579, "rewards/rejected": -0.846752941608429, "step": 1860 }, { "epoch": 0.9857670005271482, "grad_norm": 56.0520555546714, "learning_rate": 7.53690036900369e-07, "logits/chosen": -1.1160156726837158, "logits/rejected": -1.0833008289337158, "logps/chosen": -311.04998779296875, "logps/rejected": -309.95001220703125, "loss": 0.5212, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.40374755859375, "rewards/margins": 1.025781273841858, "rewards/rejected": -0.6219848394393921, "step": 1870 }, { "epoch": 0.9910384818133896, "grad_norm": 51.09806353243728, "learning_rate": 7.523721665788087e-07, "logits/chosen": -1.28125, "logits/rejected": -1.197534203529358, "logps/chosen": -301.5625, "logps/rejected": -289.6625061035156, "loss": 0.4909, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.6838134527206421, "rewards/margins": 0.9508422613143921, "rewards/rejected": -0.2675537168979645, "step": 1880 }, { "epoch": 0.996309963099631, "grad_norm": 53.65451673673479, "learning_rate": 7.510542962572483e-07, "logits/chosen": -1.273339867591858, "logits/rejected": -0.9838317632675171, "logps/chosen": -303.45001220703125, "logps/rejected": -295.1499938964844, "loss": 0.5054, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.36207276582717896, "rewards/margins": 1.1035888195037842, "rewards/rejected": -0.7422119379043579, "step": 1890 }, { "epoch": 1.0015814443858724, "grad_norm": 27.450345142704787, "learning_rate": 7.497364259356879e-07, "logits/chosen": -1.277734398841858, "logits/rejected": -1.194604516029358, "logps/chosen": -329.48126220703125, "logps/rejected": -307.48748779296875, "loss": 0.4348, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.184906005859375, "rewards/margins": 1.453710913658142, "rewards/rejected": -1.269555687904358, "step": 1900 }, { "epoch": 1.006852925672114, "grad_norm": 19.2724793376953, "learning_rate": 7.484185556141276e-07, "logits/chosen": -1.2033202648162842, "logits/rejected": -0.958386242389679, "logps/chosen": -319.3500061035156, "logps/rejected": -335.7749938964844, "loss": 0.1396, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.756915271282196, "rewards/margins": 2.7757811546325684, "rewards/rejected": -2.017993211746216, "step": 1910 }, { "epoch": 1.0121244069583553, "grad_norm": 17.956027609267082, "learning_rate": 7.471006852925671e-07, "logits/chosen": -1.556640625, "logits/rejected": -1.3875000476837158, "logps/chosen": -305.8500061035156, "logps/rejected": -333.1875, "loss": 0.1847, "rewards/accuracies": 0.9375, "rewards/chosen": 0.8642578125, "rewards/margins": 2.551953077316284, "rewards/rejected": -1.68896484375, "step": 1920 }, { "epoch": 1.0173958882445968, "grad_norm": 13.723233623533574, "learning_rate": 7.457828149710068e-07, "logits/chosen": -1.4223144054412842, "logits/rejected": -1.331884741783142, "logps/chosen": -316.4750061035156, "logps/rejected": -325.75, "loss": 0.1805, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.64703369140625, "rewards/margins": 2.469531297683716, "rewards/rejected": -1.823144555091858, "step": 1930 }, { "epoch": 1.0226673695308381, "grad_norm": 11.32260298489655, "learning_rate": 7.444649446494464e-07, "logits/chosen": -1.4717285633087158, "logits/rejected": -1.284570336341858, "logps/chosen": -367.82501220703125, "logps/rejected": -349.875, "loss": 0.115, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": 0.859912097454071, "rewards/margins": 2.858105421066284, "rewards/rejected": -1.998388648033142, "step": 1940 }, { "epoch": 1.0279388508170797, "grad_norm": 15.59681073933607, "learning_rate": 7.431470743278861e-07, "logits/chosen": -1.5714843273162842, "logits/rejected": -1.5358397960662842, "logps/chosen": -346.9750061035156, "logps/rejected": -337.6499938964844, "loss": 0.1857, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 0.3848510682582855, "rewards/margins": 2.550097703933716, "rewards/rejected": -2.1639647483825684, "step": 1950 }, { "epoch": 1.033210332103321, "grad_norm": 29.56608368730858, "learning_rate": 7.418292040063257e-07, "logits/chosen": -1.59765625, "logits/rejected": -1.5201416015625, "logps/chosen": -327.75, "logps/rejected": -348.57501220703125, "loss": 0.192, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": 0.4413085877895355, "rewards/margins": 2.9124999046325684, "rewards/rejected": -2.473876953125, "step": 1960 }, { "epoch": 1.0384818133895624, "grad_norm": 15.709004513137025, "learning_rate": 7.405113336847653e-07, "logits/chosen": -1.6855957508087158, "logits/rejected": -1.588769555091858, "logps/chosen": -290.2749938964844, "logps/rejected": -299.82501220703125, "loss": 0.1864, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.5622528195381165, "rewards/margins": 2.478222608566284, "rewards/rejected": -1.9142334461212158, "step": 1970 }, { "epoch": 1.043753294675804, "grad_norm": 15.614701131670586, "learning_rate": 7.39193463363205e-07, "logits/chosen": -1.6047852039337158, "logits/rejected": -1.4153320789337158, "logps/chosen": -341.625, "logps/rejected": -340.04998779296875, "loss": 0.1257, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.2702697813510895, "rewards/margins": 3.078125, "rewards/rejected": -2.806640625, "step": 1980 }, { "epoch": 1.0490247759620452, "grad_norm": 20.513593047523894, "learning_rate": 7.378755930416447e-07, "logits/chosen": -1.56591796875, "logits/rejected": -1.4372069835662842, "logps/chosen": -320.79998779296875, "logps/rejected": -326.5874938964844, "loss": 0.1764, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.25871580839157104, "rewards/margins": 2.7554688453674316, "rewards/rejected": -2.497851610183716, "step": 1990 }, { "epoch": 1.0542962572482868, "grad_norm": 22.734930383543862, "learning_rate": 7.365577227200843e-07, "logits/chosen": -1.636328101158142, "logits/rejected": -1.5150878429412842, "logps/chosen": -325.2749938964844, "logps/rejected": -332.0, "loss": 0.1585, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -0.18966980278491974, "rewards/margins": 2.8797850608825684, "rewards/rejected": -3.0674805641174316, "step": 2000 }, { "epoch": 1.0595677385345281, "grad_norm": 18.095154542558898, "learning_rate": 7.352398523985239e-07, "logits/chosen": -1.7058594226837158, "logits/rejected": -1.5373046398162842, "logps/chosen": -316.8999938964844, "logps/rejected": -336.0249938964844, "loss": 0.1204, "rewards/accuracies": 0.96875, "rewards/chosen": -0.07844237983226776, "rewards/margins": 3.110156297683716, "rewards/rejected": -3.1884765625, "step": 2010 }, { "epoch": 1.0648392198207697, "grad_norm": 29.241725991176697, "learning_rate": 7.339219820769635e-07, "logits/chosen": -1.663476586341858, "logits/rejected": -1.6940429210662842, "logps/chosen": -319.95001220703125, "logps/rejected": -305.8125, "loss": 0.1436, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.08362426608800888, "rewards/margins": 2.800585985183716, "rewards/rejected": -2.717578172683716, "step": 2020 }, { "epoch": 1.070110701107011, "grad_norm": 38.09550519049994, "learning_rate": 7.326041117554032e-07, "logits/chosen": -1.6631348133087158, "logits/rejected": -1.59033203125, "logps/chosen": -324.20001220703125, "logps/rejected": -332.25, "loss": 0.1739, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 0.0911712646484375, "rewards/margins": 3.080273389816284, "rewards/rejected": -2.9892578125, "step": 2030 }, { "epoch": 1.0753821823932526, "grad_norm": 23.711402959177818, "learning_rate": 7.312862414338429e-07, "logits/chosen": -1.724755883216858, "logits/rejected": -1.672949194908142, "logps/chosen": -370.9750061035156, "logps/rejected": -322.7749938964844, "loss": 0.1855, "rewards/accuracies": 0.918749988079071, "rewards/chosen": 0.13966064155101776, "rewards/margins": 2.8490233421325684, "rewards/rejected": -2.709765672683716, "step": 2040 }, { "epoch": 1.080653663679494, "grad_norm": 68.09922866864169, "learning_rate": 7.299683711122825e-07, "logits/chosen": -1.675390601158142, "logits/rejected": -1.56591796875, "logps/chosen": -318.0249938964844, "logps/rejected": -343.3500061035156, "loss": 0.2132, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -0.1466064453125, "rewards/margins": 2.730761766433716, "rewards/rejected": -2.87744140625, "step": 2050 }, { "epoch": 1.0859251449657354, "grad_norm": 35.91856475583258, "learning_rate": 7.286505007907222e-07, "logits/chosen": -1.6916992664337158, "logits/rejected": -1.63427734375, "logps/chosen": -304.1625061035156, "logps/rejected": -305.6625061035156, "loss": 0.1833, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 0.15096434950828552, "rewards/margins": 2.7030272483825684, "rewards/rejected": -2.5516600608825684, "step": 2060 }, { "epoch": 1.0911966262519768, "grad_norm": 11.964844477598941, "learning_rate": 7.273326304691618e-07, "logits/chosen": -1.6549804210662842, "logits/rejected": -1.484472632408142, "logps/chosen": -333.45001220703125, "logps/rejected": -326.2875061035156, "loss": 0.1467, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.07001037895679474, "rewards/margins": 2.9232420921325684, "rewards/rejected": -2.853515625, "step": 2070 }, { "epoch": 1.0964681075382183, "grad_norm": 19.28709595945145, "learning_rate": 7.260147601476014e-07, "logits/chosen": -1.584570288658142, "logits/rejected": -1.41845703125, "logps/chosen": -304.5375061035156, "logps/rejected": -328.1625061035156, "loss": 0.1562, "rewards/accuracies": 0.96875, "rewards/chosen": 0.2589965760707855, "rewards/margins": 3.0302734375, "rewards/rejected": -2.772656202316284, "step": 2080 }, { "epoch": 1.1017395888244597, "grad_norm": 22.508038263970207, "learning_rate": 7.24696889826041e-07, "logits/chosen": -1.5999023914337158, "logits/rejected": -1.4655272960662842, "logps/chosen": -321.51251220703125, "logps/rejected": -300.0249938964844, "loss": 0.1763, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.25474244356155396, "rewards/margins": 2.770312547683716, "rewards/rejected": -2.514270067214966, "step": 2090 }, { "epoch": 1.1070110701107012, "grad_norm": 19.416071139425465, "learning_rate": 7.233790195044808e-07, "logits/chosen": -1.672460913658142, "logits/rejected": -1.5568358898162842, "logps/chosen": -269.8125, "logps/rejected": -288.8374938964844, "loss": 0.1429, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 0.15533141791820526, "rewards/margins": 3.0478515625, "rewards/rejected": -2.893749952316284, "step": 2100 }, { "epoch": 1.1122825513969425, "grad_norm": 11.405049592666794, "learning_rate": 7.220611491829204e-07, "logits/chosen": -1.6980469226837158, "logits/rejected": -1.5369141101837158, "logps/chosen": -287.17498779296875, "logps/rejected": -327.20001220703125, "loss": 0.1456, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 0.39279478788375854, "rewards/margins": 3.08984375, "rewards/rejected": -2.697558641433716, "step": 2110 }, { "epoch": 1.1175540326831839, "grad_norm": 25.58122934306382, "learning_rate": 7.2074327886136e-07, "logits/chosen": -1.6005859375, "logits/rejected": -1.4933593273162842, "logps/chosen": -326.13751220703125, "logps/rejected": -306.70001220703125, "loss": 0.1575, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 0.0269775390625, "rewards/margins": 2.9634766578674316, "rewards/rejected": -2.935839891433716, "step": 2120 }, { "epoch": 1.1228255139694254, "grad_norm": 30.150690256028906, "learning_rate": 7.194254085397996e-07, "logits/chosen": -1.6013672351837158, "logits/rejected": -1.509033203125, "logps/chosen": -354.7875061035156, "logps/rejected": -388.57501220703125, "loss": 0.1155, "rewards/accuracies": 0.96875, "rewards/chosen": 0.04396362230181694, "rewards/margins": 3.6025390625, "rewards/rejected": -3.5564942359924316, "step": 2130 }, { "epoch": 1.1280969952556668, "grad_norm": 22.316314835408157, "learning_rate": 7.181075382182393e-07, "logits/chosen": -1.7566406726837158, "logits/rejected": -1.628808617591858, "logps/chosen": -285.4125061035156, "logps/rejected": -292.25, "loss": 0.2164, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.3142333924770355, "rewards/margins": 2.911816358566284, "rewards/rejected": -3.2259764671325684, "step": 2140 }, { "epoch": 1.1333684765419083, "grad_norm": 22.634556994121326, "learning_rate": 7.16789667896679e-07, "logits/chosen": -1.80810546875, "logits/rejected": -1.660546898841858, "logps/chosen": -300.8999938964844, "logps/rejected": -298.1000061035156, "loss": 0.163, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 0.27772217988967896, "rewards/margins": 3.022656202316284, "rewards/rejected": -2.744335889816284, "step": 2150 }, { "epoch": 1.1386399578281496, "grad_norm": 44.92034081333781, "learning_rate": 7.154717975751186e-07, "logits/chosen": -1.70849609375, "logits/rejected": -1.62744140625, "logps/chosen": -324.5249938964844, "logps/rejected": -317.92498779296875, "loss": 0.1925, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": 0.14116820693016052, "rewards/margins": 2.96875, "rewards/rejected": -2.8248047828674316, "step": 2160 }, { "epoch": 1.1439114391143912, "grad_norm": 18.68237731139296, "learning_rate": 7.141539272535582e-07, "logits/chosen": -1.732519507408142, "logits/rejected": -1.6118652820587158, "logps/chosen": -307.5249938964844, "logps/rejected": -299.7875061035156, "loss": 0.1943, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -0.06196289137005806, "rewards/margins": 2.8324217796325684, "rewards/rejected": -2.893798828125, "step": 2170 }, { "epoch": 1.1491829204006325, "grad_norm": 24.648546825810154, "learning_rate": 7.128360569319979e-07, "logits/chosen": -1.6435546875, "logits/rejected": -1.47900390625, "logps/chosen": -354.70001220703125, "logps/rejected": -321.25, "loss": 0.1505, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -0.03455200046300888, "rewards/margins": 3.286328077316284, "rewards/rejected": -3.3203125, "step": 2180 }, { "epoch": 1.154454401686874, "grad_norm": 21.138142298589408, "learning_rate": 7.115181866104375e-07, "logits/chosen": -1.675390601158142, "logits/rejected": -1.6044921875, "logps/chosen": -363.13751220703125, "logps/rejected": -381.4375, "loss": 0.1049, "rewards/accuracies": 0.96875, "rewards/chosen": -0.37461966276168823, "rewards/margins": 4.014843940734863, "rewards/rejected": -4.390820503234863, "step": 2190 }, { "epoch": 1.1597258829731154, "grad_norm": 34.94629637697001, "learning_rate": 7.102003162888771e-07, "logits/chosen": -1.667089819908142, "logits/rejected": -1.56396484375, "logps/chosen": -338.70001220703125, "logps/rejected": -327.0249938964844, "loss": 0.1538, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -0.20408324897289276, "rewards/margins": 3.384960889816284, "rewards/rejected": -3.591015577316284, "step": 2200 }, { "epoch": 1.164997364259357, "grad_norm": 11.395525034099839, "learning_rate": 7.088824459673169e-07, "logits/chosen": -1.775781273841858, "logits/rejected": -1.502905249595642, "logps/chosen": -312.0249938964844, "logps/rejected": -342.20001220703125, "loss": 0.1725, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 0.09434203803539276, "rewards/margins": 3.2261719703674316, "rewards/rejected": -3.1307616233825684, "step": 2210 }, { "epoch": 1.1702688455455983, "grad_norm": 27.675886855250937, "learning_rate": 7.075645756457565e-07, "logits/chosen": -1.792578101158142, "logits/rejected": -1.5193359851837158, "logps/chosen": -297.40625, "logps/rejected": -317.6499938964844, "loss": 0.1706, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 0.08919067680835724, "rewards/margins": 3.0716795921325684, "rewards/rejected": -2.9830079078674316, "step": 2220 }, { "epoch": 1.1755403268318398, "grad_norm": 17.487093702712503, "learning_rate": 7.062467053241961e-07, "logits/chosen": -1.728124976158142, "logits/rejected": -1.7321045398712158, "logps/chosen": -323.11248779296875, "logps/rejected": -318.2875061035156, "loss": 0.1906, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.330474853515625, "rewards/margins": 2.908740282058716, "rewards/rejected": -2.579394578933716, "step": 2230 }, { "epoch": 1.1808118081180812, "grad_norm": 32.784606345120594, "learning_rate": 7.049288350026357e-07, "logits/chosen": -1.691015601158142, "logits/rejected": -1.6182129383087158, "logps/chosen": -343.70001220703125, "logps/rejected": -340.2124938964844, "loss": 0.1441, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.0013549805153161287, "rewards/margins": 3.081347703933716, "rewards/rejected": -3.0833983421325684, "step": 2240 }, { "epoch": 1.1860832894043227, "grad_norm": 24.367029022103033, "learning_rate": 7.036109646810754e-07, "logits/chosen": -1.6789062023162842, "logits/rejected": -1.66357421875, "logps/chosen": -340.1499938964844, "logps/rejected": -330.875, "loss": 0.1615, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -0.05232543870806694, "rewards/margins": 2.8926758766174316, "rewards/rejected": -2.9447264671325684, "step": 2250 }, { "epoch": 1.191354770690564, "grad_norm": 28.064836628329523, "learning_rate": 7.02293094359515e-07, "logits/chosen": -1.7355468273162842, "logits/rejected": -1.6281249523162842, "logps/chosen": -294.3374938964844, "logps/rejected": -297.7250061035156, "loss": 0.1597, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -0.2905029356479645, "rewards/margins": 2.8804688453674316, "rewards/rejected": -3.1698241233825684, "step": 2260 }, { "epoch": 1.1966262519768054, "grad_norm": 26.834955102494426, "learning_rate": 7.009752240379547e-07, "logits/chosen": -1.7205078601837158, "logits/rejected": -1.618749976158142, "logps/chosen": -339.875, "logps/rejected": -333.875, "loss": 0.1606, "rewards/accuracies": 0.9375, "rewards/chosen": 0.09013672173023224, "rewards/margins": 3.177539110183716, "rewards/rejected": -3.0877928733825684, "step": 2270 }, { "epoch": 1.201897733263047, "grad_norm": 22.399704119389348, "learning_rate": 6.996573537163942e-07, "logits/chosen": -1.688867211341858, "logits/rejected": -1.666015625, "logps/chosen": -333.32501220703125, "logps/rejected": -321.5, "loss": 0.1627, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.21730956435203552, "rewards/margins": 3.0455079078674316, "rewards/rejected": -2.829394578933716, "step": 2280 }, { "epoch": 1.2071692145492883, "grad_norm": 13.599751947154056, "learning_rate": 6.98339483394834e-07, "logits/chosen": -1.7104980945587158, "logits/rejected": -1.7039062976837158, "logps/chosen": -311.95001220703125, "logps/rejected": -330.125, "loss": 0.1906, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.23972168564796448, "rewards/margins": 2.9078125953674316, "rewards/rejected": -2.66552734375, "step": 2290 }, { "epoch": 1.2124406958355298, "grad_norm": 18.01313199276019, "learning_rate": 6.970216130732735e-07, "logits/chosen": -1.71533203125, "logits/rejected": -1.6040527820587158, "logps/chosen": -341.5375061035156, "logps/rejected": -337.6000061035156, "loss": 0.1305, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.2906127870082855, "rewards/margins": 3.24609375, "rewards/rejected": -2.954882860183716, "step": 2300 }, { "epoch": 1.2177121771217712, "grad_norm": 42.61155777125437, "learning_rate": 6.957037427517131e-07, "logits/chosen": -1.8397948741912842, "logits/rejected": -1.745214819908142, "logps/chosen": -318.6000061035156, "logps/rejected": -303.32501220703125, "loss": 0.1627, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 0.06589965522289276, "rewards/margins": 3.344921827316284, "rewards/rejected": -3.2789063453674316, "step": 2310 }, { "epoch": 1.2229836584080127, "grad_norm": 48.000301207887, "learning_rate": 6.943858724301529e-07, "logits/chosen": -1.7174804210662842, "logits/rejected": -1.7099609375, "logps/chosen": -350.3500061035156, "logps/rejected": -311.3374938964844, "loss": 0.1634, "rewards/accuracies": 0.9375, "rewards/chosen": -0.32502442598342896, "rewards/margins": 3.1112303733825684, "rewards/rejected": -3.4380860328674316, "step": 2320 }, { "epoch": 1.228255139694254, "grad_norm": 23.98715283768609, "learning_rate": 6.930680021085925e-07, "logits/chosen": -1.809179663658142, "logits/rejected": -1.783349633216858, "logps/chosen": -307.3374938964844, "logps/rejected": -311.36248779296875, "loss": 0.1986, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -0.5060790777206421, "rewards/margins": 3.1073241233825684, "rewards/rejected": -3.6148438453674316, "step": 2330 }, { "epoch": 1.2335266209804956, "grad_norm": 25.47880319542612, "learning_rate": 6.917501317870321e-07, "logits/chosen": -1.7521483898162842, "logits/rejected": -1.686181664466858, "logps/chosen": -328.0, "logps/rejected": -327.57501220703125, "loss": 0.1569, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -0.3171752989292145, "rewards/margins": 3.0224609375, "rewards/rejected": -3.3392577171325684, "step": 2340 }, { "epoch": 1.238798102266737, "grad_norm": 20.302375316035977, "learning_rate": 6.904322614654717e-07, "logits/chosen": -1.7087891101837158, "logits/rejected": -1.577734351158142, "logps/chosen": -340.8500061035156, "logps/rejected": -357.0249938964844, "loss": 0.1551, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -0.33513182401657104, "rewards/margins": 3.228515625, "rewards/rejected": -3.563281297683716, "step": 2350 }, { "epoch": 1.2440695835529785, "grad_norm": 21.345443556936527, "learning_rate": 6.891143911439114e-07, "logits/chosen": -1.602148413658142, "logits/rejected": -1.5812499523162842, "logps/chosen": -358.8125, "logps/rejected": -374.9375, "loss": 0.1228, "rewards/accuracies": 0.96875, "rewards/chosen": -0.26493531465530396, "rewards/margins": 3.1376953125, "rewards/rejected": -3.4019532203674316, "step": 2360 }, { "epoch": 1.2493410648392198, "grad_norm": 21.95138732115164, "learning_rate": 6.87796520822351e-07, "logits/chosen": -1.632226586341858, "logits/rejected": -1.695214867591858, "logps/chosen": -324.76251220703125, "logps/rejected": -299.3500061035156, "loss": 0.2182, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -0.29359132051467896, "rewards/margins": 2.6473631858825684, "rewards/rejected": -2.9417967796325684, "step": 2370 }, { "epoch": 1.2546125461254611, "grad_norm": 25.999019981085187, "learning_rate": 6.864786505007907e-07, "logits/chosen": -1.635107398033142, "logits/rejected": -1.5720703601837158, "logps/chosen": -386.0, "logps/rejected": -349.07501220703125, "loss": 0.1423, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.05192871019244194, "rewards/margins": 3.0650391578674316, "rewards/rejected": -3.116992235183716, "step": 2380 }, { "epoch": 1.2598840274117027, "grad_norm": 24.235279189790553, "learning_rate": 6.851607801792303e-07, "logits/chosen": -1.638574242591858, "logits/rejected": -1.609472632408142, "logps/chosen": -343.375, "logps/rejected": -357.25, "loss": 0.1397, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.20653685927391052, "rewards/margins": 2.9599609375, "rewards/rejected": -3.1673827171325684, "step": 2390 }, { "epoch": 1.2651555086979442, "grad_norm": 32.99067743198313, "learning_rate": 6.8384290985767e-07, "logits/chosen": -1.7744140625, "logits/rejected": -1.5977783203125, "logps/chosen": -287.7749938964844, "logps/rejected": -323.92498779296875, "loss": 0.1994, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -0.0710296630859375, "rewards/margins": 3.087597608566284, "rewards/rejected": -3.158203125, "step": 2400 }, { "epoch": 1.2704269899841856, "grad_norm": 45.047575402752564, "learning_rate": 6.825250395361096e-07, "logits/chosen": -1.697851538658142, "logits/rejected": -1.6975586414337158, "logps/chosen": -355.9750061035156, "logps/rejected": -349.125, "loss": 0.1433, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -0.014373779296875, "rewards/margins": 3.4037108421325684, "rewards/rejected": -3.41796875, "step": 2410 }, { "epoch": 1.275698471270427, "grad_norm": 30.14535362417959, "learning_rate": 6.812071692145492e-07, "logits/chosen": -1.7526366710662842, "logits/rejected": -1.620214819908142, "logps/chosen": -332.38751220703125, "logps/rejected": -292.6625061035156, "loss": 0.1597, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -0.19028320908546448, "rewards/margins": 2.9151368141174316, "rewards/rejected": -3.105273485183716, "step": 2420 }, { "epoch": 1.2809699525566685, "grad_norm": 27.399775976533288, "learning_rate": 6.798892988929888e-07, "logits/chosen": -1.6755859851837158, "logits/rejected": -1.509179711341858, "logps/chosen": -394.04998779296875, "logps/rejected": -389.86248779296875, "loss": 0.1095, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.1868850737810135, "rewards/margins": 3.7347655296325684, "rewards/rejected": -3.924023389816284, "step": 2430 }, { "epoch": 1.2862414338429098, "grad_norm": 24.010412007970817, "learning_rate": 6.785714285714286e-07, "logits/chosen": -1.8025391101837158, "logits/rejected": -1.615820288658142, "logps/chosen": -315.75, "logps/rejected": -326.125, "loss": 0.172, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -0.3994140625, "rewards/margins": 3.195507764816284, "rewards/rejected": -3.5912108421325684, "step": 2440 }, { "epoch": 1.2915129151291513, "grad_norm": 12.035133940067936, "learning_rate": 6.772535582498682e-07, "logits/chosen": -1.751953125, "logits/rejected": -1.7175781726837158, "logps/chosen": -313.5375061035156, "logps/rejected": -298.6625061035156, "loss": 0.1591, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.2995971739292145, "rewards/margins": 3.065624952316284, "rewards/rejected": -3.368359327316284, "step": 2450 }, { "epoch": 1.2967843964153927, "grad_norm": 16.540693116040767, "learning_rate": 6.759356879283078e-07, "logits/chosen": -1.6609375476837158, "logits/rejected": -1.5405762195587158, "logps/chosen": -354.875, "logps/rejected": -374.375, "loss": 0.1644, "rewards/accuracies": 0.9375, "rewards/chosen": -0.2694030702114105, "rewards/margins": 3.1195311546325684, "rewards/rejected": -3.389843702316284, "step": 2460 }, { "epoch": 1.3020558777016342, "grad_norm": 31.982247772109535, "learning_rate": 6.746178176067475e-07, "logits/chosen": -1.715917944908142, "logits/rejected": -1.6159179210662842, "logps/chosen": -368.42498779296875, "logps/rejected": -361.95001220703125, "loss": 0.1384, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -0.36237794160842896, "rewards/margins": 3.53515625, "rewards/rejected": -3.901171922683716, "step": 2470 }, { "epoch": 1.3073273589878756, "grad_norm": 24.112170300385273, "learning_rate": 6.732999472851871e-07, "logits/chosen": -1.7677733898162842, "logits/rejected": -1.639062523841858, "logps/chosen": -354.5, "logps/rejected": -347.7250061035156, "loss": 0.1584, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.16776123642921448, "rewards/margins": 3.3667969703674316, "rewards/rejected": -3.5345702171325684, "step": 2480 }, { "epoch": 1.312598840274117, "grad_norm": 26.84554209642503, "learning_rate": 6.719820769636268e-07, "logits/chosen": -1.6896483898162842, "logits/rejected": -1.700292944908142, "logps/chosen": -337.42498779296875, "logps/rejected": -369.0, "loss": 0.1387, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -0.4039550721645355, "rewards/margins": 3.0517578125, "rewards/rejected": -3.454882860183716, "step": 2490 }, { "epoch": 1.3178703215603584, "grad_norm": 24.57288009047927, "learning_rate": 6.706642066420664e-07, "logits/chosen": -1.8621094226837158, "logits/rejected": -1.776464819908142, "logps/chosen": -318.3999938964844, "logps/rejected": -316.57501220703125, "loss": 0.173, "rewards/accuracies": 0.9375, "rewards/chosen": -0.572674572467804, "rewards/margins": 3.0557618141174316, "rewards/rejected": -3.6298828125, "step": 2500 }, { "epoch": 1.3231418028466, "grad_norm": 17.419129437731623, "learning_rate": 6.693463363205061e-07, "logits/chosen": -1.847070336341858, "logits/rejected": -1.814355492591858, "logps/chosen": -346.1499938964844, "logps/rejected": -358.125, "loss": 0.1125, "rewards/accuracies": 0.96875, "rewards/chosen": -0.600390613079071, "rewards/margins": 3.5093750953674316, "rewards/rejected": -4.111914157867432, "step": 2510 }, { "epoch": 1.3284132841328413, "grad_norm": 22.48410165297182, "learning_rate": 6.680284659989457e-07, "logits/chosen": -1.8372070789337158, "logits/rejected": -1.720117211341858, "logps/chosen": -348.32501220703125, "logps/rejected": -394.0, "loss": 0.1508, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.754162609577179, "rewards/margins": 3.39697265625, "rewards/rejected": -4.151562690734863, "step": 2520 }, { "epoch": 1.3336847654190827, "grad_norm": 25.13782774067034, "learning_rate": 6.667105956773853e-07, "logits/chosen": -1.820703148841858, "logits/rejected": -1.8385741710662842, "logps/chosen": -405.8500061035156, "logps/rejected": -391.79998779296875, "loss": 0.1476, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -0.598583996295929, "rewards/margins": 3.397705078125, "rewards/rejected": -3.996875047683716, "step": 2530 }, { "epoch": 1.3389562467053242, "grad_norm": 22.01201832923408, "learning_rate": 6.653927253558249e-07, "logits/chosen": -1.85009765625, "logits/rejected": -1.875585913658142, "logps/chosen": -357.5, "logps/rejected": -334.6000061035156, "loss": 0.1899, "rewards/accuracies": 0.9375, "rewards/chosen": -0.9568847417831421, "rewards/margins": 3.0723633766174316, "rewards/rejected": -4.029687404632568, "step": 2540 }, { "epoch": 1.3442277279915658, "grad_norm": 12.369569974409737, "learning_rate": 6.640748550342647e-07, "logits/chosen": -1.805419921875, "logits/rejected": -1.662878394126892, "logps/chosen": -356.8999938964844, "logps/rejected": -353.92498779296875, "loss": 0.147, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.6276489496231079, "rewards/margins": 2.9974608421325684, "rewards/rejected": -3.6205077171325684, "step": 2550 }, { "epoch": 1.349499209277807, "grad_norm": 44.18289640453084, "learning_rate": 6.627569847127043e-07, "logits/chosen": -1.755273461341858, "logits/rejected": -1.7500488758087158, "logps/chosen": -380.54998779296875, "logps/rejected": -335.82501220703125, "loss": 0.1542, "rewards/accuracies": 0.9375, "rewards/chosen": -0.528759777545929, "rewards/margins": 3.180468797683716, "rewards/rejected": -3.706835985183716, "step": 2560 }, { "epoch": 1.3547706905640484, "grad_norm": 42.81471024236139, "learning_rate": 6.614391143911439e-07, "logits/chosen": -1.855078101158142, "logits/rejected": -1.9275391101837158, "logps/chosen": -350.0249938964844, "logps/rejected": -320.67498779296875, "loss": 0.1376, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.06941528618335724, "rewards/margins": 3.298046827316284, "rewards/rejected": -3.2275390625, "step": 2570 }, { "epoch": 1.36004217185029, "grad_norm": 51.51085426506088, "learning_rate": 6.601212440695835e-07, "logits/chosen": -1.8644530773162842, "logits/rejected": -1.785742163658142, "logps/chosen": -342.5375061035156, "logps/rejected": -328.73748779296875, "loss": 0.1731, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.5538696050643921, "rewards/margins": 3.0741209983825684, "rewards/rejected": -3.629101514816284, "step": 2580 }, { "epoch": 1.3653136531365313, "grad_norm": 14.030192559590104, "learning_rate": 6.588033737480232e-07, "logits/chosen": -1.818261742591858, "logits/rejected": -1.819238305091858, "logps/chosen": -354.95001220703125, "logps/rejected": -365.5249938964844, "loss": 0.1122, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -0.2649902403354645, "rewards/margins": 3.4076170921325684, "rewards/rejected": -3.673828125, "step": 2590 }, { "epoch": 1.3705851344227729, "grad_norm": 22.051994221279678, "learning_rate": 6.574855034264628e-07, "logits/chosen": -1.9609375, "logits/rejected": -1.7351562976837158, "logps/chosen": -288.875, "logps/rejected": -310.7749938964844, "loss": 0.1988, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -0.13292236626148224, "rewards/margins": 2.887451171875, "rewards/rejected": -3.0201172828674316, "step": 2600 }, { "epoch": 1.3758566157090142, "grad_norm": 22.93102013921953, "learning_rate": 6.561676331049025e-07, "logits/chosen": -1.716894507408142, "logits/rejected": -1.634179711341858, "logps/chosen": -382.75, "logps/rejected": -384.375, "loss": 0.153, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -0.03205261379480362, "rewards/margins": 3.211621046066284, "rewards/rejected": -3.2435059547424316, "step": 2610 }, { "epoch": 1.3811280969952557, "grad_norm": 11.167785317114882, "learning_rate": 6.548497627833422e-07, "logits/chosen": -1.7573730945587158, "logits/rejected": -1.749414086341858, "logps/chosen": -343.42498779296875, "logps/rejected": -340.3999938964844, "loss": 0.1459, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 0.06578369438648224, "rewards/margins": 3.353320360183716, "rewards/rejected": -3.2861328125, "step": 2620 }, { "epoch": 1.386399578281497, "grad_norm": 28.541794933539034, "learning_rate": 6.535318924617818e-07, "logits/chosen": -1.831933617591858, "logits/rejected": -1.7550780773162842, "logps/chosen": -337.17498779296875, "logps/rejected": -343.29998779296875, "loss": 0.1511, "rewards/accuracies": 0.918749988079071, "rewards/chosen": 0.11610107123851776, "rewards/margins": 3.498242139816284, "rewards/rejected": -3.3824219703674316, "step": 2630 }, { "epoch": 1.3916710595677384, "grad_norm": 30.57276111083602, "learning_rate": 6.522140221402213e-07, "logits/chosen": -1.7371094226837158, "logits/rejected": -1.5739257335662842, "logps/chosen": -355.75, "logps/rejected": -354.1499938964844, "loss": 0.1515, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.22291412949562073, "rewards/margins": 3.586718797683716, "rewards/rejected": -3.366406202316284, "step": 2640 }, { "epoch": 1.39694254085398, "grad_norm": 17.50933618481225, "learning_rate": 6.508961518186609e-07, "logits/chosen": -1.684667944908142, "logits/rejected": -1.575292944908142, "logps/chosen": -365.7250061035156, "logps/rejected": -344.9750061035156, "loss": 0.1377, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 0.4352783262729645, "rewards/margins": 3.369140625, "rewards/rejected": -2.9332032203674316, "step": 2650 }, { "epoch": 1.4022140221402215, "grad_norm": 22.73951776403932, "learning_rate": 6.495782814971007e-07, "logits/chosen": -1.885644555091858, "logits/rejected": -1.6572265625, "logps/chosen": -284.79998779296875, "logps/rejected": -303.75, "loss": 0.1175, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 0.1406707763671875, "rewards/margins": 3.5941405296325684, "rewards/rejected": -3.453906297683716, "step": 2660 }, { "epoch": 1.4074855034264628, "grad_norm": 43.7991578386952, "learning_rate": 6.482604111755403e-07, "logits/chosen": -1.96875, "logits/rejected": -1.90771484375, "logps/chosen": -330.88751220703125, "logps/rejected": -349.70001220703125, "loss": 0.1771, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.40582275390625, "rewards/margins": 3.429882764816284, "rewards/rejected": -3.8365235328674316, "step": 2670 }, { "epoch": 1.4127569847127042, "grad_norm": 11.65186658544004, "learning_rate": 6.469425408539799e-07, "logits/chosen": -1.8669922351837158, "logits/rejected": -1.9055664539337158, "logps/chosen": -347.875, "logps/rejected": -328.3500061035156, "loss": 0.1428, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.4513305723667145, "rewards/margins": 3.352734327316284, "rewards/rejected": -3.803906202316284, "step": 2680 }, { "epoch": 1.4180284659989457, "grad_norm": 42.972600868365326, "learning_rate": 6.456246705324195e-07, "logits/chosen": -1.8193359375, "logits/rejected": -1.672521948814392, "logps/chosen": -334.625, "logps/rejected": -368.54998779296875, "loss": 0.1779, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.4918456971645355, "rewards/margins": 3.350292921066284, "rewards/rejected": -3.846386671066284, "step": 2690 }, { "epoch": 1.4232999472851873, "grad_norm": 16.482020632567874, "learning_rate": 6.443068002108592e-07, "logits/chosen": -1.78466796875, "logits/rejected": -1.728417992591858, "logps/chosen": -336.375, "logps/rejected": -370.45001220703125, "loss": 0.1223, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.33983153104782104, "rewards/margins": 3.6763672828674316, "rewards/rejected": -4.016015529632568, "step": 2700 }, { "epoch": 1.4285714285714286, "grad_norm": 30.861592946123622, "learning_rate": 6.429889298892988e-07, "logits/chosen": -1.7306640148162842, "logits/rejected": -1.6711914539337158, "logps/chosen": -342.17498779296875, "logps/rejected": -356.125, "loss": 0.1409, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.3097290098667145, "rewards/margins": 3.2275390625, "rewards/rejected": -3.5390625, "step": 2710 }, { "epoch": 1.43384290985767, "grad_norm": 24.50432918960733, "learning_rate": 6.416710595677385e-07, "logits/chosen": -1.6676757335662842, "logits/rejected": -1.7345702648162842, "logps/chosen": -336.20001220703125, "logps/rejected": -324.48748779296875, "loss": 0.1764, "rewards/accuracies": 0.9375, "rewards/chosen": -0.4016357362270355, "rewards/margins": 3.1158204078674316, "rewards/rejected": -3.516406297683716, "step": 2720 }, { "epoch": 1.4391143911439115, "grad_norm": 18.898508143571018, "learning_rate": 6.403531892461781e-07, "logits/chosen": -1.601806640625, "logits/rejected": -1.611718773841858, "logps/chosen": -380.6499938964844, "logps/rejected": -357.98748779296875, "loss": 0.1728, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.18781432509422302, "rewards/margins": 3.133105516433716, "rewards/rejected": -3.320507764816284, "step": 2730 }, { "epoch": 1.4443858724301528, "grad_norm": 12.611203866710389, "learning_rate": 6.390353189246178e-07, "logits/chosen": -1.610937476158142, "logits/rejected": -1.5979492664337158, "logps/chosen": -350.38751220703125, "logps/rejected": -335.4375, "loss": 0.1608, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.0517578125, "rewards/margins": 3.0433592796325684, "rewards/rejected": -3.0947265625, "step": 2740 }, { "epoch": 1.4496573537163944, "grad_norm": 30.920865814220424, "learning_rate": 6.377174486030574e-07, "logits/chosen": -1.820898413658142, "logits/rejected": -1.617529273033142, "logps/chosen": -331.9375, "logps/rejected": -363.70001220703125, "loss": 0.172, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -0.18880005180835724, "rewards/margins": 3.21484375, "rewards/rejected": -3.4019532203674316, "step": 2750 }, { "epoch": 1.4549288350026357, "grad_norm": 21.9417371711049, "learning_rate": 6.36399578281497e-07, "logits/chosen": -1.6496093273162842, "logits/rejected": -1.5725586414337158, "logps/chosen": -360.0249938964844, "logps/rejected": -363.75, "loss": 0.1092, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -0.20637817680835724, "rewards/margins": 3.5625, "rewards/rejected": -3.767773389816284, "step": 2760 }, { "epoch": 1.4602003162888773, "grad_norm": 16.712491586560624, "learning_rate": 6.350817079599367e-07, "logits/chosen": -1.7570312023162842, "logits/rejected": -1.762109398841858, "logps/chosen": -331.1499938964844, "logps/rejected": -331.1000061035156, "loss": 0.149, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -0.34235841035842896, "rewards/margins": 3.47265625, "rewards/rejected": -3.813281297683716, "step": 2770 }, { "epoch": 1.4654717975751186, "grad_norm": 15.182348217993573, "learning_rate": 6.337638376383764e-07, "logits/chosen": -1.8205077648162842, "logits/rejected": -1.5519530773162842, "logps/chosen": -344.125, "logps/rejected": -361.5249938964844, "loss": 0.1566, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -0.6149657964706421, "rewards/margins": 3.4134764671325684, "rewards/rejected": -4.0283203125, "step": 2780 }, { "epoch": 1.47074327886136, "grad_norm": 36.837695889168074, "learning_rate": 6.32445967316816e-07, "logits/chosen": -1.7834961414337158, "logits/rejected": -1.66064453125, "logps/chosen": -354.98748779296875, "logps/rejected": -352.57501220703125, "loss": 0.1407, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -0.9634033441543579, "rewards/margins": 3.457812547683716, "rewards/rejected": -4.424218654632568, "step": 2790 }, { "epoch": 1.4760147601476015, "grad_norm": 16.074119251672748, "learning_rate": 6.311280969952556e-07, "logits/chosen": -1.72802734375, "logits/rejected": -1.598242163658142, "logps/chosen": -368.67498779296875, "logps/rejected": -369.04998779296875, "loss": 0.1327, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -1.49090576171875, "rewards/margins": 3.278515577316284, "rewards/rejected": -4.771874904632568, "step": 2800 }, { "epoch": 1.481286241433843, "grad_norm": 20.37775055759416, "learning_rate": 6.298102266736953e-07, "logits/chosen": -1.9660155773162842, "logits/rejected": -1.8359863758087158, "logps/chosen": -348.61248779296875, "logps/rejected": -367.70001220703125, "loss": 0.1754, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -1.1052367687225342, "rewards/margins": 3.342041015625, "rewards/rejected": -4.443945407867432, "step": 2810 }, { "epoch": 1.4865577227200844, "grad_norm": 23.157507588769395, "learning_rate": 6.284923563521349e-07, "logits/chosen": -1.951562523841858, "logits/rejected": -1.8914062976837158, "logps/chosen": -345.7250061035156, "logps/rejected": -338.17498779296875, "loss": 0.1295, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.23095703125, "rewards/margins": 3.162304639816284, "rewards/rejected": -4.391406059265137, "step": 2820 }, { "epoch": 1.4918292040063257, "grad_norm": 48.83793330500991, "learning_rate": 6.271744860305746e-07, "logits/chosen": -1.779296875, "logits/rejected": -1.7874023914337158, "logps/chosen": -368.29998779296875, "logps/rejected": -372.13751220703125, "loss": 0.1392, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -1.240136742591858, "rewards/margins": 3.284374952316284, "rewards/rejected": -4.525000095367432, "step": 2830 }, { "epoch": 1.4971006852925672, "grad_norm": 11.639978229917297, "learning_rate": 6.258566157090142e-07, "logits/chosen": -2.003710985183716, "logits/rejected": -1.9685547351837158, "logps/chosen": -332.125, "logps/rejected": -356.75, "loss": 0.1704, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -0.9014068841934204, "rewards/margins": 3.3646483421325684, "rewards/rejected": -4.266015529632568, "step": 2840 }, { "epoch": 1.5023721665788088, "grad_norm": 46.15917746717861, "learning_rate": 6.245387453874539e-07, "logits/chosen": -1.874609351158142, "logits/rejected": -1.8214843273162842, "logps/chosen": -368.29998779296875, "logps/rejected": -364.29998779296875, "loss": 0.1297, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.983502209186554, "rewards/margins": 3.4771485328674316, "rewards/rejected": -4.461718559265137, "step": 2850 }, { "epoch": 1.5076436478650501, "grad_norm": 30.91975572547872, "learning_rate": 6.232208750658935e-07, "logits/chosen": -1.9470703601837158, "logits/rejected": -1.866796851158142, "logps/chosen": -319.45001220703125, "logps/rejected": -329.9624938964844, "loss": 0.1909, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -0.8100341558456421, "rewards/margins": 3.2261719703674316, "rewards/rejected": -4.034570217132568, "step": 2860 }, { "epoch": 1.5129151291512914, "grad_norm": 33.48371587802421, "learning_rate": 6.219030047443331e-07, "logits/chosen": -1.858300805091858, "logits/rejected": -1.987890601158142, "logps/chosen": -353.79998779296875, "logps/rejected": -329.9750061035156, "loss": 0.131, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.0107421875, "rewards/margins": 3.256054639816284, "rewards/rejected": -4.265234470367432, "step": 2870 }, { "epoch": 1.518186610437533, "grad_norm": 29.373623242002115, "learning_rate": 6.205851344227727e-07, "logits/chosen": -1.855078101158142, "logits/rejected": -1.7451171875, "logps/chosen": -356.3374938964844, "logps/rejected": -371.0, "loss": 0.1544, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -0.7534240484237671, "rewards/margins": 3.530468702316284, "rewards/rejected": -4.280077934265137, "step": 2880 }, { "epoch": 1.5234580917237743, "grad_norm": 22.4388016485181, "learning_rate": 6.192672641012125e-07, "logits/chosen": -2.0140624046325684, "logits/rejected": -1.9767577648162842, "logps/chosen": -317.3999938964844, "logps/rejected": -334.5249938964844, "loss": 0.1886, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.740313708782196, "rewards/margins": 3.1890625953674316, "rewards/rejected": -3.9300780296325684, "step": 2890 }, { "epoch": 1.5287295730100157, "grad_norm": 24.045082635037268, "learning_rate": 6.179493937796521e-07, "logits/chosen": -1.872656226158142, "logits/rejected": -1.8039062023162842, "logps/chosen": -355.8999938964844, "logps/rejected": -350.6000061035156, "loss": 0.1369, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.6232665777206421, "rewards/margins": 3.431835889816284, "rewards/rejected": -4.055468559265137, "step": 2900 }, { "epoch": 1.5340010542962572, "grad_norm": 33.04556901324208, "learning_rate": 6.166315234580917e-07, "logits/chosen": -1.858789086341858, "logits/rejected": -1.745019555091858, "logps/chosen": -340.41876220703125, "logps/rejected": -324.45001220703125, "loss": 0.1827, "rewards/accuracies": 0.9375, "rewards/chosen": 0.02940673753619194, "rewards/margins": 3.1630859375, "rewards/rejected": -3.136914014816284, "step": 2910 }, { "epoch": 1.5392725355824988, "grad_norm": 17.837961186947936, "learning_rate": 6.153136531365314e-07, "logits/chosen": -1.8986327648162842, "logits/rejected": -1.600195288658142, "logps/chosen": -302.0249938964844, "logps/rejected": -363.7250061035156, "loss": 0.1698, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -0.09085693210363388, "rewards/margins": 3.2923827171325684, "rewards/rejected": -3.3833985328674316, "step": 2920 }, { "epoch": 1.54454401686874, "grad_norm": 30.455256519577272, "learning_rate": 6.13995782814971e-07, "logits/chosen": -1.9011719226837158, "logits/rejected": -1.7304198741912842, "logps/chosen": -340.95001220703125, "logps/rejected": -351.7124938964844, "loss": 0.1218, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.447213739156723, "rewards/margins": 3.3763670921325684, "rewards/rejected": -3.824414014816284, "step": 2930 }, { "epoch": 1.5498154981549814, "grad_norm": 18.251482605401048, "learning_rate": 6.126779124934106e-07, "logits/chosen": -1.861328125, "logits/rejected": -1.7136719226837158, "logps/chosen": -341.25, "logps/rejected": -325.32501220703125, "loss": 0.1488, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -0.41680908203125, "rewards/margins": 3.202929735183716, "rewards/rejected": -3.6224608421325684, "step": 2940 }, { "epoch": 1.555086979441223, "grad_norm": 14.721126593274295, "learning_rate": 6.113600421718503e-07, "logits/chosen": -1.768164038658142, "logits/rejected": -1.581518530845642, "logps/chosen": -356.7250061035156, "logps/rejected": -371.17498779296875, "loss": 0.1476, "rewards/accuracies": 0.9375, "rewards/chosen": -0.48244017362594604, "rewards/margins": 3.4598145484924316, "rewards/rejected": -3.94189453125, "step": 2950 }, { "epoch": 1.5603584607274645, "grad_norm": 15.383932622071223, "learning_rate": 6.1004217185029e-07, "logits/chosen": -1.668554663658142, "logits/rejected": -1.546972632408142, "logps/chosen": -329.17498779296875, "logps/rejected": -332.2124938964844, "loss": 0.1354, "rewards/accuracies": 0.9375, "rewards/chosen": -0.6695922613143921, "rewards/margins": 3.3960938453674316, "rewards/rejected": -4.065234184265137, "step": 2960 }, { "epoch": 1.5656299420137059, "grad_norm": 23.27410120844115, "learning_rate": 6.087243015287296e-07, "logits/chosen": -1.710058569908142, "logits/rejected": -1.643652319908142, "logps/chosen": -342.13751220703125, "logps/rejected": -331.5, "loss": 0.1493, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -0.4785827696323395, "rewards/margins": 3.416699171066284, "rewards/rejected": -3.896484375, "step": 2970 }, { "epoch": 1.5709014232999472, "grad_norm": 16.641566880402497, "learning_rate": 6.074064312071692e-07, "logits/chosen": -1.6398437023162842, "logits/rejected": -1.5675780773162842, "logps/chosen": -338.8500061035156, "logps/rejected": -331.5, "loss": 0.1281, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.528698742389679, "rewards/margins": 3.4515624046325684, "rewards/rejected": -3.9791016578674316, "step": 2980 }, { "epoch": 1.5761729045861887, "grad_norm": 18.496258137541197, "learning_rate": 6.060885608856087e-07, "logits/chosen": -1.816064476966858, "logits/rejected": -1.6619141101837158, "logps/chosen": -333.01251220703125, "logps/rejected": -335.79998779296875, "loss": 0.1354, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.514331042766571, "rewards/margins": 3.5787596702575684, "rewards/rejected": -4.094531059265137, "step": 2990 }, { "epoch": 1.5814443858724303, "grad_norm": 24.581001127772243, "learning_rate": 6.047706905640486e-07, "logits/chosen": -1.811132788658142, "logits/rejected": -1.587060570716858, "logps/chosen": -336.57501220703125, "logps/rejected": -380.875, "loss": 0.1309, "rewards/accuracies": 0.9375, "rewards/chosen": -0.607250988483429, "rewards/margins": 3.620312452316284, "rewards/rejected": -4.226171970367432, "step": 3000 }, { "epoch": 1.5867158671586716, "grad_norm": 15.184215790995834, "learning_rate": 6.034528202424881e-07, "logits/chosen": -1.9010741710662842, "logits/rejected": -1.9416015148162842, "logps/chosen": -350.54998779296875, "logps/rejected": -339.6000061035156, "loss": 0.1793, "rewards/accuracies": 0.90625, "rewards/chosen": -0.4512939453125, "rewards/margins": 3.27490234375, "rewards/rejected": -3.7255859375, "step": 3010 }, { "epoch": 1.591987348444913, "grad_norm": 50.054575460686024, "learning_rate": 6.021349499209277e-07, "logits/chosen": -1.849511742591858, "logits/rejected": -1.783105492591858, "logps/chosen": -338.375, "logps/rejected": -316.67498779296875, "loss": 0.1772, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.22459717094898224, "rewards/margins": 3.4287109375, "rewards/rejected": -3.652148485183716, "step": 3020 }, { "epoch": 1.5972588297311545, "grad_norm": 12.077529614856068, "learning_rate": 6.008170795993674e-07, "logits/chosen": -1.9059569835662842, "logits/rejected": -1.85400390625, "logps/chosen": -323.8374938964844, "logps/rejected": -309.9750061035156, "loss": 0.1592, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.34150391817092896, "rewards/margins": 3.328125, "rewards/rejected": -3.665820360183716, "step": 3030 }, { "epoch": 1.6025303110173958, "grad_norm": 30.710521737766882, "learning_rate": 5.99499209277807e-07, "logits/chosen": -1.9607422351837158, "logits/rejected": -1.892480492591858, "logps/chosen": -353.6000061035156, "logps/rejected": -378.54998779296875, "loss": 0.1468, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.7057129144668579, "rewards/margins": 3.4593749046325684, "rewards/rejected": -4.165234565734863, "step": 3040 }, { "epoch": 1.6078017923036372, "grad_norm": 30.222053837149364, "learning_rate": 5.981813389562466e-07, "logits/chosen": -1.9070312976837158, "logits/rejected": -1.671289086341858, "logps/chosen": -339.6000061035156, "logps/rejected": -336.3500061035156, "loss": 0.1623, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -0.719470202922821, "rewards/margins": 3.3306641578674316, "rewards/rejected": -4.049414157867432, "step": 3050 }, { "epoch": 1.6130732735898787, "grad_norm": 18.16449883261072, "learning_rate": 5.968634686346863e-07, "logits/chosen": -1.804296851158142, "logits/rejected": -1.837499976158142, "logps/chosen": -358.82501220703125, "logps/rejected": -329.2124938964844, "loss": 0.1316, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.1722290515899658, "rewards/margins": 3.357421875, "rewards/rejected": -4.528124809265137, "step": 3060 }, { "epoch": 1.6183447548761203, "grad_norm": 22.317252577283845, "learning_rate": 5.95545598313126e-07, "logits/chosen": -1.897851586341858, "logits/rejected": -1.724023461341858, "logps/chosen": -348.67498779296875, "logps/rejected": -383.0249938964844, "loss": 0.1625, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -0.923266589641571, "rewards/margins": 3.4976563453674316, "rewards/rejected": -4.420702934265137, "step": 3070 }, { "epoch": 1.6236162361623616, "grad_norm": 32.27150196467819, "learning_rate": 5.942277279915656e-07, "logits/chosen": -1.948339819908142, "logits/rejected": -1.8337891101837158, "logps/chosen": -364.1499938964844, "logps/rejected": -362.95001220703125, "loss": 0.1498, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -0.920092761516571, "rewards/margins": 3.66015625, "rewards/rejected": -4.57958984375, "step": 3080 }, { "epoch": 1.628887717448603, "grad_norm": 35.62816900982099, "learning_rate": 5.929098576700052e-07, "logits/chosen": -2.0435547828674316, "logits/rejected": -1.87451171875, "logps/chosen": -297.61248779296875, "logps/rejected": -321.3999938964844, "loss": 0.1516, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -0.9284728765487671, "rewards/margins": 3.2642579078674316, "rewards/rejected": -4.194140434265137, "step": 3090 }, { "epoch": 1.6341591987348445, "grad_norm": 18.163961598079457, "learning_rate": 5.915919873484448e-07, "logits/chosen": -1.954199194908142, "logits/rejected": -2.008984327316284, "logps/chosen": -302.2875061035156, "logps/rejected": -316.63751220703125, "loss": 0.1538, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.7022460699081421, "rewards/margins": 3.409374952316284, "rewards/rejected": -4.110742092132568, "step": 3100 }, { "epoch": 1.639430680021086, "grad_norm": 24.65091094121933, "learning_rate": 5.902741170268845e-07, "logits/chosen": -1.848291039466858, "logits/rejected": -1.8127930164337158, "logps/chosen": -316.3374938964844, "logps/rejected": -341.7749938964844, "loss": 0.1491, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -0.3839050233364105, "rewards/margins": 3.2987303733825684, "rewards/rejected": -3.6830077171325684, "step": 3110 }, { "epoch": 1.6447021613073274, "grad_norm": 32.97602676659443, "learning_rate": 5.889562467053242e-07, "logits/chosen": -1.900976538658142, "logits/rejected": -1.7580077648162842, "logps/chosen": -346.6875, "logps/rejected": -364.5, "loss": 0.1782, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -0.14965209364891052, "rewards/margins": 3.4039063453674316, "rewards/rejected": -3.5542969703674316, "step": 3120 }, { "epoch": 1.6499736425935687, "grad_norm": 15.42479865356294, "learning_rate": 5.876383763837638e-07, "logits/chosen": -1.76953125, "logits/rejected": -1.6746094226837158, "logps/chosen": -339.95001220703125, "logps/rejected": -367.76251220703125, "loss": 0.1341, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.05546874925494194, "rewards/margins": 3.7183594703674316, "rewards/rejected": -3.6636719703674316, "step": 3130 }, { "epoch": 1.6552451238798103, "grad_norm": 21.157074825285086, "learning_rate": 5.863205060622034e-07, "logits/chosen": -1.76953125, "logits/rejected": -1.546875, "logps/chosen": -319.95001220703125, "logps/rejected": -361.2875061035156, "loss": 0.1569, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -0.36439210176467896, "rewards/margins": 3.478515625, "rewards/rejected": -3.84130859375, "step": 3140 }, { "epoch": 1.6605166051660518, "grad_norm": 10.184644559350557, "learning_rate": 5.850026357406431e-07, "logits/chosen": -1.8681640625, "logits/rejected": -1.753320336341858, "logps/chosen": -345.1625061035156, "logps/rejected": -344.4750061035156, "loss": 0.1423, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -0.34809571504592896, "rewards/margins": 3.5732421875, "rewards/rejected": -3.922070264816284, "step": 3150 }, { "epoch": 1.6657880864522931, "grad_norm": 31.773594790230913, "learning_rate": 5.836847654190827e-07, "logits/chosen": -1.7111327648162842, "logits/rejected": -1.5569336414337158, "logps/chosen": -349.45001220703125, "logps/rejected": -380.2749938964844, "loss": 0.1499, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.517932116985321, "rewards/margins": 3.4322266578674316, "rewards/rejected": -3.948437452316284, "step": 3160 }, { "epoch": 1.6710595677385345, "grad_norm": 27.561388279607026, "learning_rate": 5.823668950975223e-07, "logits/chosen": -1.7594726085662842, "logits/rejected": -1.742773413658142, "logps/chosen": -355.67498779296875, "logps/rejected": -343.0375061035156, "loss": 0.1394, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -0.968994140625, "rewards/margins": 3.3814454078674316, "rewards/rejected": -4.351171970367432, "step": 3170 }, { "epoch": 1.676331049024776, "grad_norm": 26.401004526746583, "learning_rate": 5.810490247759621e-07, "logits/chosen": -1.917578101158142, "logits/rejected": -1.681640625, "logps/chosen": -320.8374938964844, "logps/rejected": -344.20001220703125, "loss": 0.1123, "rewards/accuracies": 0.96875, "rewards/chosen": -1.042724609375, "rewards/margins": 3.3687500953674316, "rewards/rejected": -4.408593654632568, "step": 3180 }, { "epoch": 1.6816025303110174, "grad_norm": 40.84846325860029, "learning_rate": 5.797311544544017e-07, "logits/chosen": -1.794335961341858, "logits/rejected": -1.780615210533142, "logps/chosen": -359.1875, "logps/rejected": -366.32501220703125, "loss": 0.1412, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -0.9248291254043579, "rewards/margins": 3.4593749046325684, "rewards/rejected": -4.383593559265137, "step": 3190 }, { "epoch": 1.6868740115972587, "grad_norm": 22.493180957403187, "learning_rate": 5.784132841328413e-07, "logits/chosen": -1.944726586341858, "logits/rejected": -1.779687523841858, "logps/chosen": -318.625, "logps/rejected": -347.9375, "loss": 0.1592, "rewards/accuracies": 0.9375, "rewards/chosen": -1.0084960460662842, "rewards/margins": 3.455371141433716, "rewards/rejected": -4.462109565734863, "step": 3200 }, { "epoch": 1.6921454928835002, "grad_norm": 25.31431403035397, "learning_rate": 5.770954138112809e-07, "logits/chosen": -2.0049805641174316, "logits/rejected": -1.792578101158142, "logps/chosen": -312.2875061035156, "logps/rejected": -342.1000061035156, "loss": 0.1274, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -1.2181396484375, "rewards/margins": 3.676562547683716, "rewards/rejected": -4.894140720367432, "step": 3210 }, { "epoch": 1.6974169741697418, "grad_norm": 46.2971895673076, "learning_rate": 5.757775434897206e-07, "logits/chosen": -1.976660132408142, "logits/rejected": -1.904687523841858, "logps/chosen": -321.7875061035156, "logps/rejected": -321.67498779296875, "loss": 0.1626, "rewards/accuracies": 0.9375, "rewards/chosen": -1.718359351158142, "rewards/margins": 3.627148389816284, "rewards/rejected": -5.344140529632568, "step": 3220 }, { "epoch": 1.7026884554559831, "grad_norm": 31.584381909523973, "learning_rate": 5.744596731681603e-07, "logits/chosen": -1.848242163658142, "logits/rejected": -1.8532226085662842, "logps/chosen": -381.4750061035156, "logps/rejected": -372.04998779296875, "loss": 0.0944, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.956518530845642, "rewards/margins": 3.9330077171325684, "rewards/rejected": -5.887499809265137, "step": 3230 }, { "epoch": 1.7079599367422245, "grad_norm": 33.818904173923194, "learning_rate": 5.731418028465999e-07, "logits/chosen": -1.9728515148162842, "logits/rejected": -1.8054687976837158, "logps/chosen": -359.98748779296875, "logps/rejected": -356.9125061035156, "loss": 0.1353, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.6968262195587158, "rewards/margins": 3.6890625953674316, "rewards/rejected": -5.38671875, "step": 3240 }, { "epoch": 1.713231418028466, "grad_norm": 42.08167385920864, "learning_rate": 5.718239325250395e-07, "logits/chosen": -1.9494140148162842, "logits/rejected": -1.828710913658142, "logps/chosen": -291.5625, "logps/rejected": -309.54998779296875, "loss": 0.1367, "rewards/accuracies": 0.96875, "rewards/chosen": -1.686254858970642, "rewards/margins": 3.592968702316284, "rewards/rejected": -5.280077934265137, "step": 3250 }, { "epoch": 1.7185028993147076, "grad_norm": 35.77838587117817, "learning_rate": 5.705060622034792e-07, "logits/chosen": -2.0224609375, "logits/rejected": -1.8781249523162842, "logps/chosen": -313.8999938964844, "logps/rejected": -346.25, "loss": 0.1573, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -1.4754149913787842, "rewards/margins": 3.7396483421325684, "rewards/rejected": -5.21484375, "step": 3260 }, { "epoch": 1.723774380600949, "grad_norm": 26.997141394986016, "learning_rate": 5.691881918819188e-07, "logits/chosen": -2.01123046875, "logits/rejected": -1.816748023033142, "logps/chosen": -340.13751220703125, "logps/rejected": -351.20001220703125, "loss": 0.1728, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -1.3629639148712158, "rewards/margins": 3.2025389671325684, "rewards/rejected": -4.564843654632568, "step": 3270 }, { "epoch": 1.7290458618871902, "grad_norm": 18.325808851882716, "learning_rate": 5.678703215603584e-07, "logits/chosen": -1.7786133289337158, "logits/rejected": -1.690332055091858, "logps/chosen": -365.7749938964844, "logps/rejected": -377.6499938964844, "loss": 0.0874, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.1450684070587158, "rewards/margins": 3.661328077316284, "rewards/rejected": -4.805859565734863, "step": 3280 }, { "epoch": 1.7343173431734318, "grad_norm": 53.623477565786956, "learning_rate": 5.665524512387981e-07, "logits/chosen": -1.8466796875, "logits/rejected": -1.770117163658142, "logps/chosen": -334.6625061035156, "logps/rejected": -377.70001220703125, "loss": 0.1822, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -1.0659576654434204, "rewards/margins": 3.520312547683716, "rewards/rejected": -4.589453220367432, "step": 3290 }, { "epoch": 1.7395888244596733, "grad_norm": 11.188746539285793, "learning_rate": 5.652345809172378e-07, "logits/chosen": -1.677636742591858, "logits/rejected": -1.7208983898162842, "logps/chosen": -436.2250061035156, "logps/rejected": -410.0, "loss": 0.1145, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -0.5780670046806335, "rewards/margins": 3.866015672683716, "rewards/rejected": -4.444531440734863, "step": 3300 }, { "epoch": 1.7448603057459144, "grad_norm": 25.555420255049473, "learning_rate": 5.639167105956774e-07, "logits/chosen": -1.894433617591858, "logits/rejected": -1.8044922351837158, "logps/chosen": -358.9375, "logps/rejected": -348.45001220703125, "loss": 0.1526, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -1.21875, "rewards/margins": 3.215039014816284, "rewards/rejected": -4.439062595367432, "step": 3310 }, { "epoch": 1.750131787032156, "grad_norm": 12.60105540187396, "learning_rate": 5.62598840274117e-07, "logits/chosen": -1.845117211341858, "logits/rejected": -1.7291991710662842, "logps/chosen": -332.26251220703125, "logps/rejected": -370.4375, "loss": 0.1167, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.3186553716659546, "rewards/margins": 3.9251952171325684, "rewards/rejected": -5.246484279632568, "step": 3320 }, { "epoch": 1.7554032683183975, "grad_norm": 62.835143224790734, "learning_rate": 5.612809699525567e-07, "logits/chosen": -2.0556640625, "logits/rejected": -1.897070288658142, "logps/chosen": -321.29998779296875, "logps/rejected": -342.79998779296875, "loss": 0.1549, "rewards/accuracies": 0.9375, "rewards/chosen": -1.236639380455017, "rewards/margins": 3.3881592750549316, "rewards/rejected": -4.625390529632568, "step": 3330 }, { "epoch": 1.7606747496046389, "grad_norm": 21.317544517070193, "learning_rate": 5.599630996309963e-07, "logits/chosen": -1.8444335460662842, "logits/rejected": -1.865820288658142, "logps/chosen": -373.20001220703125, "logps/rejected": -366.95001220703125, "loss": 0.1402, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -1.320959448814392, "rewards/margins": 3.8848633766174316, "rewards/rejected": -5.205468654632568, "step": 3340 }, { "epoch": 1.7659462308908802, "grad_norm": 15.46552007986581, "learning_rate": 5.58645229309436e-07, "logits/chosen": -1.8789551258087158, "logits/rejected": -1.8468749523162842, "logps/chosen": -356.8500061035156, "logps/rejected": -386.07501220703125, "loss": 0.1477, "rewards/accuracies": 0.9375, "rewards/chosen": -1.245642066001892, "rewards/margins": 3.6124024391174316, "rewards/rejected": -4.857031345367432, "step": 3350 }, { "epoch": 1.7712177121771218, "grad_norm": 17.932486290571752, "learning_rate": 5.573273589878755e-07, "logits/chosen": -1.788671851158142, "logits/rejected": -1.749902367591858, "logps/chosen": -390.0625, "logps/rejected": -386.375, "loss": 0.1248, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -1.364892601966858, "rewards/margins": 3.8197264671325684, "rewards/rejected": -5.186327934265137, "step": 3360 }, { "epoch": 1.7764891934633633, "grad_norm": 18.82545769747502, "learning_rate": 5.560094886663152e-07, "logits/chosen": -1.9235351085662842, "logits/rejected": -1.83056640625, "logps/chosen": -326.7124938964844, "logps/rejected": -349.375, "loss": 0.1281, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -1.308374047279358, "rewards/margins": 3.867968797683716, "rewards/rejected": -5.177343845367432, "step": 3370 }, { "epoch": 1.7817606747496046, "grad_norm": 31.18028713710813, "learning_rate": 5.546916183447548e-07, "logits/chosen": -1.8225586414337158, "logits/rejected": -1.708349585533142, "logps/chosen": -334.95001220703125, "logps/rejected": -346.5, "loss": 0.204, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -1.175048828125, "rewards/margins": 3.432812452316284, "rewards/rejected": -4.610156059265137, "step": 3380 }, { "epoch": 1.787032156035846, "grad_norm": 18.038813221819495, "learning_rate": 5.533737480231944e-07, "logits/chosen": -1.922460913658142, "logits/rejected": -1.7267577648162842, "logps/chosen": -299.86248779296875, "logps/rejected": -330.875, "loss": 0.1717, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -1.02960205078125, "rewards/margins": 3.1465821266174316, "rewards/rejected": -4.177343845367432, "step": 3390 }, { "epoch": 1.7923036373220875, "grad_norm": 20.51131353002816, "learning_rate": 5.520558777016341e-07, "logits/chosen": -1.70654296875, "logits/rejected": -1.68212890625, "logps/chosen": -380.2250061035156, "logps/rejected": -351.375, "loss": 0.119, "rewards/accuracies": 0.96875, "rewards/chosen": -0.949176013469696, "rewards/margins": 3.499218702316284, "rewards/rejected": -4.445703029632568, "step": 3400 }, { "epoch": 1.797575118608329, "grad_norm": 13.28327696149562, "learning_rate": 5.507380073800738e-07, "logits/chosen": -1.824609398841858, "logits/rejected": -1.666894555091858, "logps/chosen": -335.54998779296875, "logps/rejected": -317.95001220703125, "loss": 0.126, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -1.0531005859375, "rewards/margins": 3.6151366233825684, "rewards/rejected": -4.668359279632568, "step": 3410 }, { "epoch": 1.8028465998945704, "grad_norm": 18.74135949322565, "learning_rate": 5.494201370585134e-07, "logits/chosen": -1.8156249523162842, "logits/rejected": -1.698486328125, "logps/chosen": -358.3500061035156, "logps/rejected": -380.6000061035156, "loss": 0.1335, "rewards/accuracies": 0.96875, "rewards/chosen": -1.005621314048767, "rewards/margins": 3.618359327316284, "rewards/rejected": -4.623827934265137, "step": 3420 }, { "epoch": 1.8081180811808117, "grad_norm": 21.285549244373723, "learning_rate": 5.48102266736953e-07, "logits/chosen": -1.7921874523162842, "logits/rejected": -1.7595703601837158, "logps/chosen": -352.13751220703125, "logps/rejected": -358.38751220703125, "loss": 0.1429, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -1.23651123046875, "rewards/margins": 3.5748047828674316, "rewards/rejected": -4.810546875, "step": 3430 }, { "epoch": 1.8133895624670533, "grad_norm": 41.49603447075779, "learning_rate": 5.467843964153926e-07, "logits/chosen": -1.909570336341858, "logits/rejected": -1.8125, "logps/chosen": -314.625, "logps/rejected": -350.82501220703125, "loss": 0.1807, "rewards/accuracies": 0.9375, "rewards/chosen": -1.151391625404358, "rewards/margins": 3.3216795921325684, "rewards/rejected": -4.474609375, "step": 3440 }, { "epoch": 1.8186610437532946, "grad_norm": 23.06154971108953, "learning_rate": 5.454665260938323e-07, "logits/chosen": -1.819433569908142, "logits/rejected": -1.68603515625, "logps/chosen": -368.57501220703125, "logps/rejected": -358.79998779296875, "loss": 0.1926, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.774383544921875, "rewards/margins": 3.343456983566284, "rewards/rejected": -4.118554592132568, "step": 3450 }, { "epoch": 1.823932525039536, "grad_norm": 47.1674471918034, "learning_rate": 5.44148655772272e-07, "logits/chosen": -1.94482421875, "logits/rejected": -1.699609398841858, "logps/chosen": -310.3374938964844, "logps/rejected": -351.25, "loss": 0.1307, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.58135986328125, "rewards/margins": 3.3900389671325684, "rewards/rejected": -3.9742188453674316, "step": 3460 }, { "epoch": 1.8292040063257775, "grad_norm": 28.312879557334316, "learning_rate": 5.428307854507116e-07, "logits/chosen": -1.8376953601837158, "logits/rejected": -1.856542944908142, "logps/chosen": -324.2749938964844, "logps/rejected": -295.3125, "loss": 0.1827, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.22652587294578552, "rewards/margins": 2.9671874046325684, "rewards/rejected": -3.1958985328674316, "step": 3470 }, { "epoch": 1.834475487612019, "grad_norm": 25.635275220961457, "learning_rate": 5.415129151291513e-07, "logits/chosen": -1.893457055091858, "logits/rejected": -1.8683593273162842, "logps/chosen": -338.45001220703125, "logps/rejected": -340.4125061035156, "loss": 0.1463, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -0.1660614013671875, "rewards/margins": 3.618945360183716, "rewards/rejected": -3.7867188453674316, "step": 3480 }, { "epoch": 1.8397469688982604, "grad_norm": 26.872009498842967, "learning_rate": 5.401950448075909e-07, "logits/chosen": -1.9587891101837158, "logits/rejected": -1.7426620721817017, "logps/chosen": -310.7875061035156, "logps/rejected": -346.04998779296875, "loss": 0.1615, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.20444336533546448, "rewards/margins": 3.3402342796325684, "rewards/rejected": -3.5458984375, "step": 3490 }, { "epoch": 1.8450184501845017, "grad_norm": 29.91401007382486, "learning_rate": 5.388771744860305e-07, "logits/chosen": -1.6472656726837158, "logits/rejected": -1.554589867591858, "logps/chosen": -361.6875, "logps/rejected": -356.9750061035156, "loss": 0.1243, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.10750732570886612, "rewards/margins": 3.286328077316284, "rewards/rejected": -3.393749952316284, "step": 3500 }, { "epoch": 1.8502899314707433, "grad_norm": 14.567983791840545, "learning_rate": 5.375593041644701e-07, "logits/chosen": -1.856542944908142, "logits/rejected": -1.6218750476837158, "logps/chosen": -329.7124938964844, "logps/rejected": -363.67498779296875, "loss": 0.1303, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.585827648639679, "rewards/margins": 3.658007860183716, "rewards/rejected": -4.243750095367432, "step": 3510 }, { "epoch": 1.8555614127569848, "grad_norm": 32.619637616543166, "learning_rate": 5.362414338429099e-07, "logits/chosen": -1.9519531726837158, "logits/rejected": -1.8718750476837158, "logps/chosen": -324.25, "logps/rejected": -341.1875, "loss": 0.1607, "rewards/accuracies": 0.9375, "rewards/chosen": -0.8178650140762329, "rewards/margins": 3.281054735183716, "rewards/rejected": -4.098242282867432, "step": 3520 }, { "epoch": 1.8608328940432262, "grad_norm": 11.522606994056073, "learning_rate": 5.349235635213495e-07, "logits/chosen": -1.9441406726837158, "logits/rejected": -1.79248046875, "logps/chosen": -316.6499938964844, "logps/rejected": -333.9750061035156, "loss": 0.1503, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -0.5051025152206421, "rewards/margins": 3.4613280296325684, "rewards/rejected": -3.9654297828674316, "step": 3530 }, { "epoch": 1.8661043753294675, "grad_norm": 25.805772345693295, "learning_rate": 5.336056931997891e-07, "logits/chosen": -1.9518554210662842, "logits/rejected": -1.8317382335662842, "logps/chosen": -357.4125061035156, "logps/rejected": -357.25, "loss": 0.1215, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.99224853515625, "rewards/margins": 3.575000047683716, "rewards/rejected": -4.566796779632568, "step": 3540 }, { "epoch": 1.871375856615709, "grad_norm": 43.11563305178535, "learning_rate": 5.322878228782287e-07, "logits/chosen": -2.1001954078674316, "logits/rejected": -2.0791015625, "logps/chosen": -328.95001220703125, "logps/rejected": -333.07501220703125, "loss": 0.1885, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.8788086175918579, "rewards/margins": 3.308789014816284, "rewards/rejected": -4.190039157867432, "step": 3550 }, { "epoch": 1.8766473379019506, "grad_norm": 41.21512709583967, "learning_rate": 5.309699525566684e-07, "logits/chosen": -1.868749976158142, "logits/rejected": -1.7451171875, "logps/chosen": -362.6000061035156, "logps/rejected": -356.70001220703125, "loss": 0.1461, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -0.806469738483429, "rewards/margins": 3.4115233421325684, "rewards/rejected": -4.217968940734863, "step": 3560 }, { "epoch": 1.881918819188192, "grad_norm": 15.163674982150514, "learning_rate": 5.296520822351081e-07, "logits/chosen": -1.9601562023162842, "logits/rejected": -1.791601538658142, "logps/chosen": -313.07501220703125, "logps/rejected": -342.0249938964844, "loss": 0.1325, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -0.760986328125, "rewards/margins": 3.7535157203674316, "rewards/rejected": -4.514452934265137, "step": 3570 }, { "epoch": 1.8871903004744333, "grad_norm": 13.769219078373292, "learning_rate": 5.283342119135477e-07, "logits/chosen": -1.8747069835662842, "logits/rejected": -1.843164086341858, "logps/chosen": -391.82501220703125, "logps/rejected": -360.92498779296875, "loss": 0.1216, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -0.709185779094696, "rewards/margins": 3.5875000953674316, "rewards/rejected": -4.293359279632568, "step": 3580 }, { "epoch": 1.8924617817606748, "grad_norm": 30.40050422252705, "learning_rate": 5.270163415919874e-07, "logits/chosen": -2.095507860183716, "logits/rejected": -1.9773437976837158, "logps/chosen": -338.45001220703125, "logps/rejected": -348.92498779296875, "loss": 0.1062, "rewards/accuracies": 0.96875, "rewards/chosen": -0.918469250202179, "rewards/margins": 3.9449219703674316, "rewards/rejected": -4.864843845367432, "step": 3590 }, { "epoch": 1.8977332630469161, "grad_norm": 35.256062653894126, "learning_rate": 5.25698471270427e-07, "logits/chosen": -1.900048851966858, "logits/rejected": -1.802734375, "logps/chosen": -393.625, "logps/rejected": -393.04998779296875, "loss": 0.1349, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -0.9106079339981079, "rewards/margins": 3.810546875, "rewards/rejected": -4.720312595367432, "step": 3600 }, { "epoch": 1.9030047443331575, "grad_norm": 34.71742576663307, "learning_rate": 5.243806009488666e-07, "logits/chosen": -2.017578125, "logits/rejected": -1.906835913658142, "logps/chosen": -333.2875061035156, "logps/rejected": -390.3999938964844, "loss": 0.136, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -0.809680163860321, "rewards/margins": 3.6712889671325684, "rewards/rejected": -4.4814453125, "step": 3610 }, { "epoch": 1.908276225619399, "grad_norm": 19.709170320932504, "learning_rate": 5.230627306273062e-07, "logits/chosen": -1.79931640625, "logits/rejected": -1.865234375, "logps/chosen": -360.95001220703125, "logps/rejected": -340.125, "loss": 0.1096, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.435821533203125, "rewards/margins": 3.5863280296325684, "rewards/rejected": -4.021679878234863, "step": 3620 }, { "epoch": 1.9135477069056406, "grad_norm": 40.08768724528853, "learning_rate": 5.21744860305746e-07, "logits/chosen": -1.8821289539337158, "logits/rejected": -1.7392578125, "logps/chosen": -367.875, "logps/rejected": -372.92498779296875, "loss": 0.1618, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -0.70660400390625, "rewards/margins": 3.673828125, "rewards/rejected": -4.381445407867432, "step": 3630 }, { "epoch": 1.918819188191882, "grad_norm": 27.930893815043518, "learning_rate": 5.204269899841856e-07, "logits/chosen": -1.897851586341858, "logits/rejected": -1.699121117591858, "logps/chosen": -326.6000061035156, "logps/rejected": -349.7250061035156, "loss": 0.1635, "rewards/accuracies": 0.9375, "rewards/chosen": -0.73004150390625, "rewards/margins": 3.6982421875, "rewards/rejected": -4.427343845367432, "step": 3640 }, { "epoch": 1.9240906694781232, "grad_norm": 27.979602804908218, "learning_rate": 5.191091196626252e-07, "logits/chosen": -1.8916015625, "logits/rejected": -1.771484375, "logps/chosen": -334.1499938964844, "logps/rejected": -366.57501220703125, "loss": 0.1574, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -0.8746093511581421, "rewards/margins": 3.681835889816284, "rewards/rejected": -4.556640625, "step": 3650 }, { "epoch": 1.9293621507643648, "grad_norm": 38.650977657312524, "learning_rate": 5.177912493410648e-07, "logits/chosen": -1.896093726158142, "logits/rejected": -1.8947265148162842, "logps/chosen": -365.0249938964844, "logps/rejected": -376.42498779296875, "loss": 0.1299, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.7724243402481079, "rewards/margins": 3.642578125, "rewards/rejected": -4.416015625, "step": 3660 }, { "epoch": 1.9346336320506063, "grad_norm": 26.660761778083195, "learning_rate": 5.164733790195045e-07, "logits/chosen": -1.9748046398162842, "logits/rejected": -1.753515601158142, "logps/chosen": -314.1000061035156, "logps/rejected": -346.57501220703125, "loss": 0.1276, "rewards/accuracies": 0.96875, "rewards/chosen": -0.44671630859375, "rewards/margins": 3.53515625, "rewards/rejected": -3.981640577316284, "step": 3670 }, { "epoch": 1.9399051133368477, "grad_norm": 29.183091150722124, "learning_rate": 5.151555086979441e-07, "logits/chosen": -1.9636719226837158, "logits/rejected": -1.904687523841858, "logps/chosen": -334.4125061035156, "logps/rejected": -348.9375, "loss": 0.1618, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.14404296875, "rewards/margins": 3.341601610183716, "rewards/rejected": -3.485156297683716, "step": 3680 }, { "epoch": 1.945176594623089, "grad_norm": 15.959160948556134, "learning_rate": 5.138376383763838e-07, "logits/chosen": -2.0439453125, "logits/rejected": -1.9158203601837158, "logps/chosen": -325.07501220703125, "logps/rejected": -344.125, "loss": 0.1663, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -0.25590819120407104, "rewards/margins": 3.174023389816284, "rewards/rejected": -3.4312500953674316, "step": 3690 }, { "epoch": 1.9504480759093306, "grad_norm": 33.127443358583626, "learning_rate": 5.125197680548234e-07, "logits/chosen": -1.9070312976837158, "logits/rejected": -1.833886742591858, "logps/chosen": -274.76251220703125, "logps/rejected": -319.95001220703125, "loss": 0.1651, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -0.5917114019393921, "rewards/margins": 3.6265625953674316, "rewards/rejected": -4.216992378234863, "step": 3700 }, { "epoch": 1.9557195571955721, "grad_norm": 14.10496383249452, "learning_rate": 5.11201897733263e-07, "logits/chosen": -1.8498046398162842, "logits/rejected": -1.7048828601837158, "logps/chosen": -352.875, "logps/rejected": -383.75, "loss": 0.1302, "rewards/accuracies": 0.96875, "rewards/chosen": -0.4261535704135895, "rewards/margins": 3.465625047683716, "rewards/rejected": -3.8939452171325684, "step": 3710 }, { "epoch": 1.9609910384818134, "grad_norm": 30.488913443541676, "learning_rate": 5.098840274117026e-07, "logits/chosen": -1.8430664539337158, "logits/rejected": -1.7404296398162842, "logps/chosen": -343.67498779296875, "logps/rejected": -347.29998779296875, "loss": 0.1361, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.7483764886856079, "rewards/margins": 3.4605469703674316, "rewards/rejected": -4.210546970367432, "step": 3720 }, { "epoch": 1.9662625197680548, "grad_norm": 19.409891511614656, "learning_rate": 5.085661570901422e-07, "logits/chosen": -1.9738280773162842, "logits/rejected": -1.795312523841858, "logps/chosen": -348.2124938964844, "logps/rejected": -372.875, "loss": 0.1514, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.5690948367118835, "rewards/margins": 3.6357421875, "rewards/rejected": -4.20703125, "step": 3730 }, { "epoch": 1.9715340010542963, "grad_norm": 47.25515159413608, "learning_rate": 5.07248286768582e-07, "logits/chosen": -1.8810546398162842, "logits/rejected": -1.85791015625, "logps/chosen": -319.3999938964844, "logps/rejected": -324.3500061035156, "loss": 0.1724, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.896838366985321, "rewards/margins": 3.326171875, "rewards/rejected": -4.221484184265137, "step": 3740 }, { "epoch": 1.9768054823405377, "grad_norm": 18.78220052490528, "learning_rate": 5.059304164470216e-07, "logits/chosen": -1.896093726158142, "logits/rejected": -1.6306641101837158, "logps/chosen": -323.5, "logps/rejected": -342.2250061035156, "loss": 0.1371, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.9059387445449829, "rewards/margins": 3.3783202171325684, "rewards/rejected": -4.282422065734863, "step": 3750 }, { "epoch": 1.982076963626779, "grad_norm": 27.967408223153477, "learning_rate": 5.046125461254612e-07, "logits/chosen": -1.884033203125, "logits/rejected": -1.8751952648162842, "logps/chosen": -339.7124938964844, "logps/rejected": -330.9125061035156, "loss": 0.1196, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.621539294719696, "rewards/margins": 3.763476610183716, "rewards/rejected": -4.381249904632568, "step": 3760 }, { "epoch": 1.9873484449130205, "grad_norm": 12.059296745396594, "learning_rate": 5.032946758039008e-07, "logits/chosen": -1.9324219226837158, "logits/rejected": -1.8391602039337158, "logps/chosen": -338.6000061035156, "logps/rejected": -350.4375, "loss": 0.183, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -0.5248047113418579, "rewards/margins": 3.505664110183716, "rewards/rejected": -4.029882907867432, "step": 3770 }, { "epoch": 1.992619926199262, "grad_norm": 36.27609410526328, "learning_rate": 5.019768054823405e-07, "logits/chosen": -1.8844726085662842, "logits/rejected": -1.802343726158142, "logps/chosen": -357.75, "logps/rejected": -342.5, "loss": 0.1401, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -0.4931274354457855, "rewards/margins": 3.4844727516174316, "rewards/rejected": -3.9755859375, "step": 3780 }, { "epoch": 1.9978914074855034, "grad_norm": 34.54598842670298, "learning_rate": 5.006589351607801e-07, "logits/chosen": -1.6854979991912842, "logits/rejected": -1.7374999523162842, "logps/chosen": -395.2749938964844, "logps/rejected": -373.25, "loss": 0.1114, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -0.45887452363967896, "rewards/margins": 3.667773485183716, "rewards/rejected": -4.126953125, "step": 3790 }, { "epoch": 2.0031628887717448, "grad_norm": 5.422924345377477, "learning_rate": 4.993410648392198e-07, "logits/chosen": -1.944726586341858, "logits/rejected": -1.8572266101837158, "logps/chosen": -342.7749938964844, "logps/rejected": -356.42498779296875, "loss": 0.0757, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.4325927793979645, "rewards/margins": 3.9306640625, "rewards/rejected": -4.362890720367432, "step": 3800 }, { "epoch": 2.0084343700579863, "grad_norm": 4.858767090817545, "learning_rate": 4.980231945176594e-07, "logits/chosen": -1.995214819908142, "logits/rejected": -1.9171874523162842, "logps/chosen": -328.11248779296875, "logps/rejected": -337.04998779296875, "loss": 0.0394, "rewards/accuracies": 1.0, "rewards/chosen": -0.2713867127895355, "rewards/margins": 4.651953220367432, "rewards/rejected": -4.922656059265137, "step": 3810 }, { "epoch": 2.013705851344228, "grad_norm": 11.52575397412426, "learning_rate": 4.96705324196099e-07, "logits/chosen": -2.1260743141174316, "logits/rejected": -1.9897949695587158, "logps/chosen": -347.375, "logps/rejected": -337.79998779296875, "loss": 0.0351, "rewards/accuracies": 1.0, "rewards/chosen": -0.518298327922821, "rewards/margins": 4.662890434265137, "rewards/rejected": -5.184374809265137, "step": 3820 }, { "epoch": 2.018977332630469, "grad_norm": 8.364187693050573, "learning_rate": 4.953874538745387e-07, "logits/chosen": -1.9377930164337158, "logits/rejected": -2.0140624046325684, "logps/chosen": -393.57501220703125, "logps/rejected": -404.3999938964844, "loss": 0.0296, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.1776244640350342, "rewards/margins": 5.482812404632568, "rewards/rejected": -6.663281440734863, "step": 3830 }, { "epoch": 2.0242488139167105, "grad_norm": 6.524047006235299, "learning_rate": 4.940695835529783e-07, "logits/chosen": -2.1016602516174316, "logits/rejected": -2.045166015625, "logps/chosen": -383.6875, "logps/rejected": -390.375, "loss": 0.0494, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.470947265625, "rewards/margins": 5.258203029632568, "rewards/rejected": -6.73046875, "step": 3840 }, { "epoch": 2.029520295202952, "grad_norm": 2.815400667212462, "learning_rate": 4.92751713231418e-07, "logits/chosen": -1.956152319908142, "logits/rejected": -2.0341796875, "logps/chosen": -361.8374938964844, "logps/rejected": -400.57501220703125, "loss": 0.0337, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -0.9834839105606079, "rewards/margins": 5.283593654632568, "rewards/rejected": -6.268750190734863, "step": 3850 }, { "epoch": 2.0347917764891936, "grad_norm": 14.234417811349195, "learning_rate": 4.914338429098576e-07, "logits/chosen": -2.078418016433716, "logits/rejected": -2.280078172683716, "logps/chosen": -389.4750061035156, "logps/rejected": -364.45001220703125, "loss": 0.0317, "rewards/accuracies": 1.0, "rewards/chosen": -0.7393432855606079, "rewards/margins": 5.317578315734863, "rewards/rejected": -6.05859375, "step": 3860 }, { "epoch": 2.0400632577754347, "grad_norm": 3.6085706437415137, "learning_rate": 4.901159725882973e-07, "logits/chosen": -2.257617235183716, "logits/rejected": -2.147265672683716, "logps/chosen": -337.8125, "logps/rejected": -375.82501220703125, "loss": 0.0245, "rewards/accuracies": 1.0, "rewards/chosen": -1.2755858898162842, "rewards/margins": 5.454687595367432, "rewards/rejected": -6.728125095367432, "step": 3870 }, { "epoch": 2.0453347390616763, "grad_norm": 32.004935959078345, "learning_rate": 4.88798102266737e-07, "logits/chosen": -2.2337889671325684, "logits/rejected": -2.244140625, "logps/chosen": -363.29998779296875, "logps/rejected": -382.82501220703125, "loss": 0.0336, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.6380126476287842, "rewards/margins": 5.862500190734863, "rewards/rejected": -7.501562595367432, "step": 3880 }, { "epoch": 2.050606220347918, "grad_norm": 24.968533576517554, "learning_rate": 4.874802319451766e-07, "logits/chosen": -2.322460889816284, "logits/rejected": -2.37890625, "logps/chosen": -339.0249938964844, "logps/rejected": -352.4750061035156, "loss": 0.0427, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.51995849609375, "rewards/margins": 5.249609470367432, "rewards/rejected": -6.771874904632568, "step": 3890 }, { "epoch": 2.0558777016341594, "grad_norm": 3.2040398850317255, "learning_rate": 4.861623616236162e-07, "logits/chosen": -2.264843702316284, "logits/rejected": -2.219921827316284, "logps/chosen": -335.29998779296875, "logps/rejected": -402.20001220703125, "loss": 0.0303, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.234838843345642, "rewards/margins": 5.447265625, "rewards/rejected": -6.682812690734863, "step": 3900 }, { "epoch": 2.0611491829204005, "grad_norm": 11.446287284983597, "learning_rate": 4.848444913020559e-07, "logits/chosen": -2.4847655296325684, "logits/rejected": -2.3921875953674316, "logps/chosen": -347.2124938964844, "logps/rejected": -392.67498779296875, "loss": 0.0328, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.10107421875, "rewards/margins": 6.092187404632568, "rewards/rejected": -8.189844131469727, "step": 3910 }, { "epoch": 2.066420664206642, "grad_norm": 14.699430900624371, "learning_rate": 4.835266209804955e-07, "logits/chosen": -2.328320264816284, "logits/rejected": -2.3763670921325684, "logps/chosen": -358.48748779296875, "logps/rejected": -389.79998779296875, "loss": 0.0338, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.9624512195587158, "rewards/margins": 5.735547065734863, "rewards/rejected": -7.702343940734863, "step": 3920 }, { "epoch": 2.0716921454928836, "grad_norm": 7.39821605768357, "learning_rate": 4.822087506589351e-07, "logits/chosen": -2.3750977516174316, "logits/rejected": -2.2320313453674316, "logps/chosen": -342.29998779296875, "logps/rejected": -412.6625061035156, "loss": 0.0445, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.5088775157928467, "rewards/margins": 5.908203125, "rewards/rejected": -7.418359279632568, "step": 3930 }, { "epoch": 2.0769636267791247, "grad_norm": 9.545117314145193, "learning_rate": 4.808908803373748e-07, "logits/chosen": -2.3265624046325684, "logits/rejected": -2.258007764816284, "logps/chosen": -326.95001220703125, "logps/rejected": -367.625, "loss": 0.0327, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.8697509765625, "rewards/margins": 5.590234279632568, "rewards/rejected": -7.4609375, "step": 3940 }, { "epoch": 2.0822351080653663, "grad_norm": 0.7955500765706701, "learning_rate": 4.795730100158144e-07, "logits/chosen": -2.33203125, "logits/rejected": -2.3837890625, "logps/chosen": -361.82501220703125, "logps/rejected": -349.95001220703125, "loss": 0.0342, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.5266602039337158, "rewards/margins": 5.639843940734863, "rewards/rejected": -7.167187690734863, "step": 3950 }, { "epoch": 2.087506589351608, "grad_norm": 6.292516714780328, "learning_rate": 4.782551396942541e-07, "logits/chosen": -2.2650389671325684, "logits/rejected": -2.37890625, "logps/chosen": -380.2250061035156, "logps/rejected": -387.92498779296875, "loss": 0.0441, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -1.706201195716858, "rewards/margins": 5.528906345367432, "rewards/rejected": -7.232812404632568, "step": 3960 }, { "epoch": 2.0927780706378494, "grad_norm": 8.714412001620918, "learning_rate": 4.769372693726937e-07, "logits/chosen": -2.3017578125, "logits/rejected": -2.3140625953674316, "logps/chosen": -351.9750061035156, "logps/rejected": -368.45001220703125, "loss": 0.0437, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.981787085533142, "rewards/margins": 5.584765434265137, "rewards/rejected": -7.567968845367432, "step": 3970 }, { "epoch": 2.0980495519240905, "grad_norm": 6.087953794873433, "learning_rate": 4.756193990511334e-07, "logits/chosen": -2.1572265625, "logits/rejected": -2.2044920921325684, "logps/chosen": -335.54998779296875, "logps/rejected": -403.9750061035156, "loss": 0.0263, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.3374512195587158, "rewards/margins": 5.604687690734863, "rewards/rejected": -6.942187309265137, "step": 3980 }, { "epoch": 2.103321033210332, "grad_norm": 6.599807207471127, "learning_rate": 4.7430152872957297e-07, "logits/chosen": -2.243847608566284, "logits/rejected": -2.356640577316284, "logps/chosen": -354.5375061035156, "logps/rejected": -364.1000061035156, "loss": 0.0351, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.9986816644668579, "rewards/margins": 5.598046779632568, "rewards/rejected": -6.594531059265137, "step": 3990 }, { "epoch": 2.1085925144965736, "grad_norm": 4.642716514330909, "learning_rate": 4.729836584080126e-07, "logits/chosen": -2.2378907203674316, "logits/rejected": -2.293750047683716, "logps/chosen": -337.2749938964844, "logps/rejected": -348.17498779296875, "loss": 0.0368, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.1925780773162842, "rewards/margins": 5.466406345367432, "rewards/rejected": -6.660937309265137, "step": 4000 }, { "epoch": 2.113863995782815, "grad_norm": 3.6040867289251257, "learning_rate": 4.716657880864523e-07, "logits/chosen": -2.241992235183716, "logits/rejected": -2.2867188453674316, "logps/chosen": -374.875, "logps/rejected": -376.45001220703125, "loss": 0.0163, "rewards/accuracies": 1.0, "rewards/chosen": -1.109460473060608, "rewards/margins": 5.675000190734863, "rewards/rejected": -6.782422065734863, "step": 4010 }, { "epoch": 2.1191354770690563, "grad_norm": 3.6645845434561255, "learning_rate": 4.703479177648919e-07, "logits/chosen": -2.3617186546325684, "logits/rejected": -2.210156202316284, "logps/chosen": -356.79998779296875, "logps/rejected": -385.4750061035156, "loss": 0.0287, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.4943358898162842, "rewards/margins": 5.764843940734863, "rewards/rejected": -7.260156154632568, "step": 4020 }, { "epoch": 2.124406958355298, "grad_norm": 28.640675813399756, "learning_rate": 4.6903004744333156e-07, "logits/chosen": -2.487499952316284, "logits/rejected": -2.4683594703674316, "logps/chosen": -359.86248779296875, "logps/rejected": -400.7250061035156, "loss": 0.0466, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.793634057044983, "rewards/margins": 5.716796875, "rewards/rejected": -7.513281345367432, "step": 4030 }, { "epoch": 2.1296784396415394, "grad_norm": 16.638671164836634, "learning_rate": 4.677121771217712e-07, "logits/chosen": -2.403515577316284, "logits/rejected": -2.387500047683716, "logps/chosen": -342.57501220703125, "logps/rejected": -377.7250061035156, "loss": 0.0299, "rewards/accuracies": 1.0, "rewards/chosen": -1.9762694835662842, "rewards/margins": 5.805859565734863, "rewards/rejected": -7.783593654632568, "step": 4040 }, { "epoch": 2.134949920927781, "grad_norm": 8.439655152254598, "learning_rate": 4.6639430680021086e-07, "logits/chosen": -2.4751954078674316, "logits/rejected": -2.357617139816284, "logps/chosen": -295.4750061035156, "logps/rejected": -344.3500061035156, "loss": 0.0427, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.5130126476287842, "rewards/margins": 5.708593845367432, "rewards/rejected": -7.224218845367432, "step": 4050 }, { "epoch": 2.140221402214022, "grad_norm": 4.32375621879112, "learning_rate": 4.6507643647865045e-07, "logits/chosen": -2.3353514671325684, "logits/rejected": -2.2685546875, "logps/chosen": -350.70001220703125, "logps/rejected": -413.42498779296875, "loss": 0.0233, "rewards/accuracies": 1.0, "rewards/chosen": -1.5195801258087158, "rewards/margins": 5.989453315734863, "rewards/rejected": -7.51171875, "step": 4060 }, { "epoch": 2.1454928835002636, "grad_norm": 2.301007930982602, "learning_rate": 4.6375856615709015e-07, "logits/chosen": -2.3397459983825684, "logits/rejected": -2.197265625, "logps/chosen": -330.2124938964844, "logps/rejected": -384.95001220703125, "loss": 0.0333, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.723059058189392, "rewards/margins": 5.818749904632568, "rewards/rejected": -7.539843559265137, "step": 4070 }, { "epoch": 2.150764364786505, "grad_norm": 29.497681482200555, "learning_rate": 4.6244069583552975e-07, "logits/chosen": -2.3382811546325684, "logits/rejected": -2.3564453125, "logps/chosen": -333.8999938964844, "logps/rejected": -374.95001220703125, "loss": 0.0357, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.5887939929962158, "rewards/margins": 6.05078125, "rewards/rejected": -7.640625, "step": 4080 }, { "epoch": 2.1560358460727462, "grad_norm": 4.266264990508001, "learning_rate": 4.611228255139694e-07, "logits/chosen": -2.3978514671325684, "logits/rejected": -2.2933592796325684, "logps/chosen": -373.57501220703125, "logps/rejected": -390.1499938964844, "loss": 0.0389, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.8902466297149658, "rewards/margins": 5.891015529632568, "rewards/rejected": -7.77734375, "step": 4090 }, { "epoch": 2.161307327358988, "grad_norm": 7.152838683258718, "learning_rate": 4.5980495519240904e-07, "logits/chosen": -2.1615233421325684, "logits/rejected": -2.118945360183716, "logps/chosen": -412.79998779296875, "logps/rejected": -398.625, "loss": 0.0201, "rewards/accuracies": 1.0, "rewards/chosen": -1.802734375, "rewards/margins": 5.687109470367432, "rewards/rejected": -7.494531154632568, "step": 4100 }, { "epoch": 2.1665788086452293, "grad_norm": 9.503417883748947, "learning_rate": 4.584870848708487e-07, "logits/chosen": -2.155468702316284, "logits/rejected": -2.1136717796325684, "logps/chosen": -382.63751220703125, "logps/rejected": -397.875, "loss": 0.021, "rewards/accuracies": 1.0, "rewards/chosen": -1.4376952648162842, "rewards/margins": 5.726953029632568, "rewards/rejected": -7.1640625, "step": 4110 }, { "epoch": 2.171850289931471, "grad_norm": 28.1023664283238, "learning_rate": 4.571692145492883e-07, "logits/chosen": -2.140429735183716, "logits/rejected": -2.2044920921325684, "logps/chosen": -410.88751220703125, "logps/rejected": -398.875, "loss": 0.0389, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -1.2295653820037842, "rewards/margins": 5.348046779632568, "rewards/rejected": -6.575781345367432, "step": 4120 }, { "epoch": 2.177121771217712, "grad_norm": 3.1886463634070665, "learning_rate": 4.55851344227728e-07, "logits/chosen": -2.437695264816284, "logits/rejected": -2.20361328125, "logps/chosen": -299.25, "logps/rejected": -366.5249938964844, "loss": 0.0186, "rewards/accuracies": 1.0, "rewards/chosen": -1.2655029296875, "rewards/margins": 5.798828125, "rewards/rejected": -7.064843654632568, "step": 4130 }, { "epoch": 2.1823932525039536, "grad_norm": 10.083530008016744, "learning_rate": 4.545334739061676e-07, "logits/chosen": -2.2818360328674316, "logits/rejected": -2.5135741233825684, "logps/chosen": -379.6499938964844, "logps/rejected": -392.5, "loss": 0.0265, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.0437254905700684, "rewards/margins": 6.030859470367432, "rewards/rejected": -8.074999809265137, "step": 4140 }, { "epoch": 2.187664733790195, "grad_norm": 4.923528219502507, "learning_rate": 4.532156035846073e-07, "logits/chosen": -2.389453172683716, "logits/rejected": -2.3926758766174316, "logps/chosen": -379.3500061035156, "logps/rejected": -387.8999938964844, "loss": 0.0255, "rewards/accuracies": 1.0, "rewards/chosen": -2.157031297683716, "rewards/margins": 5.797265529632568, "rewards/rejected": -7.954687595367432, "step": 4150 }, { "epoch": 2.1929362150764367, "grad_norm": 4.4296263892818555, "learning_rate": 4.5189773326304693e-07, "logits/chosen": -2.575000047683716, "logits/rejected": -2.43359375, "logps/chosen": -329.95001220703125, "logps/rejected": -381.9750061035156, "loss": 0.0321, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.849877953529358, "rewards/margins": 5.93359375, "rewards/rejected": -7.78515625, "step": 4160 }, { "epoch": 2.1982076963626778, "grad_norm": 16.438019431085827, "learning_rate": 4.505798629414865e-07, "logits/chosen": -2.2578125, "logits/rejected": -2.2577147483825684, "logps/chosen": -370.25, "logps/rejected": -389.70001220703125, "loss": 0.0298, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.6861388683319092, "rewards/margins": 5.983984470367432, "rewards/rejected": -7.66796875, "step": 4170 }, { "epoch": 2.2034791776489193, "grad_norm": 21.495015688868037, "learning_rate": 4.492619926199262e-07, "logits/chosen": -2.39306640625, "logits/rejected": -2.314257860183716, "logps/chosen": -319.125, "logps/rejected": -374.45001220703125, "loss": 0.0343, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.3330566883087158, "rewards/margins": 5.417187690734863, "rewards/rejected": -6.751562595367432, "step": 4180 }, { "epoch": 2.208750658935161, "grad_norm": 11.094051610954303, "learning_rate": 4.479441222983658e-07, "logits/chosen": -2.4322266578674316, "logits/rejected": -2.237499952316284, "logps/chosen": -333.92498779296875, "logps/rejected": -408.625, "loss": 0.034, "rewards/accuracies": 1.0, "rewards/chosen": -1.591009497642517, "rewards/margins": 5.624218940734863, "rewards/rejected": -7.214062690734863, "step": 4190 }, { "epoch": 2.2140221402214024, "grad_norm": 23.416800048912197, "learning_rate": 4.4662625197680546e-07, "logits/chosen": -2.301562547683716, "logits/rejected": -2.2759766578674316, "logps/chosen": -336.2124938964844, "logps/rejected": -372.7250061035156, "loss": 0.0301, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.535913109779358, "rewards/margins": 5.727734565734863, "rewards/rejected": -7.261328220367432, "step": 4200 }, { "epoch": 2.2192936215076435, "grad_norm": 20.081347136392136, "learning_rate": 4.453083816552451e-07, "logits/chosen": -2.315234422683716, "logits/rejected": -2.213671922683716, "logps/chosen": -365.26251220703125, "logps/rejected": -446.20001220703125, "loss": 0.02, "rewards/accuracies": 1.0, "rewards/chosen": -2.1648192405700684, "rewards/margins": 6.013671875, "rewards/rejected": -8.174219131469727, "step": 4210 }, { "epoch": 2.224565102793885, "grad_norm": 5.993750279421568, "learning_rate": 4.4399051133368476e-07, "logits/chosen": -2.352734327316284, "logits/rejected": -2.2164063453674316, "logps/chosen": -343.5, "logps/rejected": -401.25, "loss": 0.0221, "rewards/accuracies": 1.0, "rewards/chosen": -2.469287157058716, "rewards/margins": 6.010156154632568, "rewards/rejected": -8.481249809265137, "step": 4220 }, { "epoch": 2.2298365840801266, "grad_norm": 5.67870124202582, "learning_rate": 4.4267264101212435e-07, "logits/chosen": -2.326953172683716, "logits/rejected": -2.2435545921325684, "logps/chosen": -359.5, "logps/rejected": -377.125, "loss": 0.012, "rewards/accuracies": 1.0, "rewards/chosen": -1.8247559070587158, "rewards/margins": 6.634765625, "rewards/rejected": -8.45703125, "step": 4230 }, { "epoch": 2.2351080653663677, "grad_norm": 24.687279324514403, "learning_rate": 4.4135477069056405e-07, "logits/chosen": -2.3511719703674316, "logits/rejected": -2.4012694358825684, "logps/chosen": -376.6875, "logps/rejected": -410.70001220703125, "loss": 0.0211, "rewards/accuracies": 1.0, "rewards/chosen": -1.8915526866912842, "rewards/margins": 6.283203125, "rewards/rejected": -8.176172256469727, "step": 4240 }, { "epoch": 2.2403795466526093, "grad_norm": 5.231232679524832, "learning_rate": 4.4003690036900365e-07, "logits/chosen": -2.408984422683716, "logits/rejected": -2.483593702316284, "logps/chosen": -361.79998779296875, "logps/rejected": -382.6499938964844, "loss": 0.0339, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.0583863258361816, "rewards/margins": 6.152734279632568, "rewards/rejected": -8.210156440734863, "step": 4250 }, { "epoch": 2.245651027938851, "grad_norm": 6.091954274511851, "learning_rate": 4.387190300474433e-07, "logits/chosen": -2.545117139816284, "logits/rejected": -2.5069336891174316, "logps/chosen": -352.38751220703125, "logps/rejected": -378.79998779296875, "loss": 0.0362, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.0400147438049316, "rewards/margins": 6.267578125, "rewards/rejected": -8.310937881469727, "step": 4260 }, { "epoch": 2.2509225092250924, "grad_norm": 34.16627576472246, "learning_rate": 4.3740115972588294e-07, "logits/chosen": -2.4853515625, "logits/rejected": -2.6019530296325684, "logps/chosen": -353.875, "logps/rejected": -372.70001220703125, "loss": 0.0417, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -2.608227491378784, "rewards/margins": 6.018359184265137, "rewards/rejected": -8.624218940734863, "step": 4270 }, { "epoch": 2.2561939905113335, "grad_norm": 10.27382374834233, "learning_rate": 4.360832894043226e-07, "logits/chosen": -2.4365234375, "logits/rejected": -2.33154296875, "logps/chosen": -388.7749938964844, "logps/rejected": -441.67498779296875, "loss": 0.0577, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.298168897628784, "rewards/margins": 6.203906059265137, "rewards/rejected": -8.501562118530273, "step": 4280 }, { "epoch": 2.261465471797575, "grad_norm": 15.105517305735061, "learning_rate": 4.3476541908276224e-07, "logits/chosen": -2.3900389671325684, "logits/rejected": -2.337109327316284, "logps/chosen": -385.11248779296875, "logps/rejected": -401.0, "loss": 0.0175, "rewards/accuracies": 1.0, "rewards/chosen": -1.8174622058868408, "rewards/margins": 6.185156345367432, "rewards/rejected": -8.000781059265137, "step": 4290 }, { "epoch": 2.2667369530838166, "grad_norm": 5.930984928086645, "learning_rate": 4.334475487612019e-07, "logits/chosen": -2.4166016578674316, "logits/rejected": -2.4037108421325684, "logps/chosen": -338.88751220703125, "logps/rejected": -381.5, "loss": 0.0282, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.877050757408142, "rewards/margins": 6.134375095367432, "rewards/rejected": -8.010937690734863, "step": 4300 }, { "epoch": 2.272008434370058, "grad_norm": 4.4953082599047285, "learning_rate": 4.3212967843964153e-07, "logits/chosen": -2.236621141433716, "logits/rejected": -2.293164014816284, "logps/chosen": -354.0874938964844, "logps/rejected": -396.75, "loss": 0.0571, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -2.06414794921875, "rewards/margins": 6.30859375, "rewards/rejected": -8.373437881469727, "step": 4310 }, { "epoch": 2.2772799156562993, "grad_norm": 13.4897849761825, "learning_rate": 4.3081180811808113e-07, "logits/chosen": -2.4365234375, "logits/rejected": -2.4156250953674316, "logps/chosen": -364.7124938964844, "logps/rejected": -390.0249938964844, "loss": 0.0234, "rewards/accuracies": 1.0, "rewards/chosen": -1.77264404296875, "rewards/margins": 6.051562309265137, "rewards/rejected": -7.821093559265137, "step": 4320 }, { "epoch": 2.282551396942541, "grad_norm": 7.197178570056089, "learning_rate": 4.2949393779652083e-07, "logits/chosen": -2.464062452316284, "logits/rejected": -2.532421827316284, "logps/chosen": -341.8125, "logps/rejected": -363.5375061035156, "loss": 0.0449, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.0008544921875, "rewards/margins": 5.749218940734863, "rewards/rejected": -7.75390625, "step": 4330 }, { "epoch": 2.2878228782287824, "grad_norm": 9.108320289075964, "learning_rate": 4.281760674749604e-07, "logits/chosen": -2.426953077316284, "logits/rejected": -2.487109422683716, "logps/chosen": -347.8500061035156, "logps/rejected": -384.875, "loss": 0.0365, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.330615282058716, "rewards/margins": 5.761328220367432, "rewards/rejected": -8.088281631469727, "step": 4340 }, { "epoch": 2.293094359515024, "grad_norm": 3.7401350421566764, "learning_rate": 4.268581971534001e-07, "logits/chosen": -2.490429639816284, "logits/rejected": -2.345898389816284, "logps/chosen": -342.79998779296875, "logps/rejected": -375.5, "loss": 0.0313, "rewards/accuracies": 1.0, "rewards/chosen": -2.5039305686950684, "rewards/margins": 5.787499904632568, "rewards/rejected": -8.291406631469727, "step": 4350 }, { "epoch": 2.298365840801265, "grad_norm": 4.087455456956429, "learning_rate": 4.255403268318397e-07, "logits/chosen": -2.450976610183716, "logits/rejected": -2.417285203933716, "logps/chosen": -354.1000061035156, "logps/rejected": -358.0, "loss": 0.0372, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.1112608909606934, "rewards/margins": 5.756249904632568, "rewards/rejected": -7.87109375, "step": 4360 }, { "epoch": 2.3036373220875066, "grad_norm": 11.899228760167789, "learning_rate": 4.2422245651027937e-07, "logits/chosen": -2.484570264816284, "logits/rejected": -2.486132860183716, "logps/chosen": -348.6499938964844, "logps/rejected": -399.67498779296875, "loss": 0.0206, "rewards/accuracies": 1.0, "rewards/chosen": -1.8479492664337158, "rewards/margins": 6.037109375, "rewards/rejected": -7.889062404632568, "step": 4370 }, { "epoch": 2.308908803373748, "grad_norm": 16.697768641979437, "learning_rate": 4.22904586188719e-07, "logits/chosen": -2.3304686546325684, "logits/rejected": -2.2699217796325684, "logps/chosen": -340.4750061035156, "logps/rejected": -409.8999938964844, "loss": 0.0246, "rewards/accuracies": 1.0, "rewards/chosen": -2.209399461746216, "rewards/margins": 5.844922065734863, "rewards/rejected": -8.057812690734863, "step": 4380 }, { "epoch": 2.3141802846599893, "grad_norm": 3.541670125091878, "learning_rate": 4.2158671586715866e-07, "logits/chosen": -2.3780274391174316, "logits/rejected": -2.1996092796325684, "logps/chosen": -386.7875061035156, "logps/rejected": -438.0249938964844, "loss": 0.0152, "rewards/accuracies": 1.0, "rewards/chosen": -2.024761915206909, "rewards/margins": 5.987500190734863, "rewards/rejected": -8.01171875, "step": 4390 }, { "epoch": 2.319451765946231, "grad_norm": 12.221802348671867, "learning_rate": 4.2026884554559826e-07, "logits/chosen": -2.435351610183716, "logits/rejected": -2.353710889816284, "logps/chosen": -348.8125, "logps/rejected": -379.88751220703125, "loss": 0.0315, "rewards/accuracies": 1.0, "rewards/chosen": -1.9949219226837158, "rewards/margins": 5.592187404632568, "rewards/rejected": -7.583593845367432, "step": 4400 }, { "epoch": 2.3247232472324724, "grad_norm": 5.7941812531713515, "learning_rate": 4.1895097522403796e-07, "logits/chosen": -2.341601610183716, "logits/rejected": -2.2865233421325684, "logps/chosen": -371.88751220703125, "logps/rejected": -403.9750061035156, "loss": 0.017, "rewards/accuracies": 1.0, "rewards/chosen": -2.260449171066284, "rewards/margins": 6.440625190734863, "rewards/rejected": -8.698437690734863, "step": 4410 }, { "epoch": 2.329994728518714, "grad_norm": 3.7522586519693766, "learning_rate": 4.1763310490247755e-07, "logits/chosen": -2.510937452316284, "logits/rejected": -2.3968749046325684, "logps/chosen": -359.6625061035156, "logps/rejected": -402.79998779296875, "loss": 0.0426, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -2.419726610183716, "rewards/margins": 6.23046875, "rewards/rejected": -8.649218559265137, "step": 4420 }, { "epoch": 2.335266209804955, "grad_norm": 13.01242851144274, "learning_rate": 4.163152345809172e-07, "logits/chosen": -2.4917969703674316, "logits/rejected": -2.357617139816284, "logps/chosen": -347.875, "logps/rejected": -381.5, "loss": 0.0283, "rewards/accuracies": 1.0, "rewards/chosen": -2.3701171875, "rewards/margins": 5.963281154632568, "rewards/rejected": -8.3359375, "step": 4430 }, { "epoch": 2.3405376910911966, "grad_norm": 3.5711064767270075, "learning_rate": 4.149973642593569e-07, "logits/chosen": -2.578320264816284, "logits/rejected": -2.532031297683716, "logps/chosen": -323.88751220703125, "logps/rejected": -366.70001220703125, "loss": 0.016, "rewards/accuracies": 1.0, "rewards/chosen": -2.241748094558716, "rewards/margins": 6.5390625, "rewards/rejected": -8.780077934265137, "step": 4440 }, { "epoch": 2.345809172377438, "grad_norm": 10.031254001974295, "learning_rate": 4.136794939377965e-07, "logits/chosen": -2.51220703125, "logits/rejected": -2.5093750953674316, "logps/chosen": -351.9624938964844, "logps/rejected": -406.0249938964844, "loss": 0.0172, "rewards/accuracies": 1.0, "rewards/chosen": -1.568017601966858, "rewards/margins": 6.140234470367432, "rewards/rejected": -7.706250190734863, "step": 4450 }, { "epoch": 2.3510806536636797, "grad_norm": 20.131879721316324, "learning_rate": 4.1236162361623614e-07, "logits/chosen": -2.482226610183716, "logits/rejected": -2.5492186546325684, "logps/chosen": -365.0, "logps/rejected": -405.625, "loss": 0.029, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.056567430496216, "rewards/margins": 6.541406154632568, "rewards/rejected": -8.598437309265137, "step": 4460 }, { "epoch": 2.356352134949921, "grad_norm": 15.371621267364654, "learning_rate": 4.110437532946758e-07, "logits/chosen": -2.3617186546325684, "logits/rejected": -2.462890625, "logps/chosen": -360.7250061035156, "logps/rejected": -367.82501220703125, "loss": 0.0246, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.197399854660034, "rewards/margins": 6.20703125, "rewards/rejected": -8.40234375, "step": 4470 }, { "epoch": 2.3616236162361623, "grad_norm": 22.8593632614345, "learning_rate": 4.0972588297311544e-07, "logits/chosen": -2.312695264816284, "logits/rejected": -2.408984422683716, "logps/chosen": -359.875, "logps/rejected": -355.67498779296875, "loss": 0.0207, "rewards/accuracies": 1.0, "rewards/chosen": -2.971630811691284, "rewards/margins": 6.131249904632568, "rewards/rejected": -9.1015625, "step": 4480 }, { "epoch": 2.366895097522404, "grad_norm": 37.5052998125965, "learning_rate": 4.0840801265155503e-07, "logits/chosen": -2.418164014816284, "logits/rejected": -2.380078077316284, "logps/chosen": -366.9375, "logps/rejected": -403.75, "loss": 0.0471, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -3.0733399391174316, "rewards/margins": 6.183203220367432, "rewards/rejected": -9.2578125, "step": 4490 }, { "epoch": 2.3721665788086455, "grad_norm": 7.418372944933393, "learning_rate": 4.0709014232999473e-07, "logits/chosen": -2.6498045921325684, "logits/rejected": -2.564648389816284, "logps/chosen": -312.54998779296875, "logps/rejected": -379.29998779296875, "loss": 0.0134, "rewards/accuracies": 1.0, "rewards/chosen": -3.0946288108825684, "rewards/margins": 7.017968654632568, "rewards/rejected": -10.110937118530273, "step": 4500 }, { "epoch": 2.3774380600948866, "grad_norm": 3.654054792933456, "learning_rate": 4.0577227200843433e-07, "logits/chosen": -2.3998045921325684, "logits/rejected": -2.4111328125, "logps/chosen": -361.0, "logps/rejected": -415.45001220703125, "loss": 0.0175, "rewards/accuracies": 1.0, "rewards/chosen": -2.8095703125, "rewards/margins": 6.920312404632568, "rewards/rejected": -9.732812881469727, "step": 4510 }, { "epoch": 2.382709541381128, "grad_norm": 30.844752972060423, "learning_rate": 4.0445440168687403e-07, "logits/chosen": -2.510546922683716, "logits/rejected": -2.581249952316284, "logps/chosen": -350.04998779296875, "logps/rejected": -367.4750061035156, "loss": 0.0309, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.009765625, "rewards/margins": 6.497656345367432, "rewards/rejected": -9.503125190734863, "step": 4520 }, { "epoch": 2.3879810226673697, "grad_norm": 14.774341927090022, "learning_rate": 4.031365313653136e-07, "logits/chosen": -2.432421922683716, "logits/rejected": -2.3550782203674316, "logps/chosen": -404.125, "logps/rejected": -443.42498779296875, "loss": 0.0257, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.67578125, "rewards/margins": 6.619531154632568, "rewards/rejected": -10.296093940734863, "step": 4530 }, { "epoch": 2.3932525039536108, "grad_norm": 12.460374352166212, "learning_rate": 4.0181866104375327e-07, "logits/chosen": -2.545117139816284, "logits/rejected": -2.5005860328674316, "logps/chosen": -322.625, "logps/rejected": -379.79998779296875, "loss": 0.0338, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.921130418777466, "rewards/margins": 6.614453315734863, "rewards/rejected": -9.541406631469727, "step": 4540 }, { "epoch": 2.3985239852398523, "grad_norm": 3.8343734624629424, "learning_rate": 4.005007907221929e-07, "logits/chosen": -2.34228515625, "logits/rejected": -2.4781250953674316, "logps/chosen": -385.07501220703125, "logps/rejected": -395.25, "loss": 0.0214, "rewards/accuracies": 1.0, "rewards/chosen": -2.6553711891174316, "rewards/margins": 6.383593559265137, "rewards/rejected": -9.037500381469727, "step": 4550 }, { "epoch": 2.403795466526094, "grad_norm": 7.233140507524259, "learning_rate": 3.9918292040063256e-07, "logits/chosen": -2.529296875, "logits/rejected": -2.514843702316284, "logps/chosen": -357.04998779296875, "logps/rejected": -387.4750061035156, "loss": 0.0459, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -2.226757764816284, "rewards/margins": 6.180468559265137, "rewards/rejected": -8.40625, "step": 4560 }, { "epoch": 2.4090669478123354, "grad_norm": 6.566729598200574, "learning_rate": 3.978650500790722e-07, "logits/chosen": -2.377148389816284, "logits/rejected": -2.258593797683716, "logps/chosen": -409.75, "logps/rejected": -457.54998779296875, "loss": 0.0308, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.513427734375, "rewards/margins": 6.441015720367432, "rewards/rejected": -8.958593368530273, "step": 4570 }, { "epoch": 2.4143384290985765, "grad_norm": 5.5863545089551065, "learning_rate": 3.9654717975751186e-07, "logits/chosen": -2.5238280296325684, "logits/rejected": -2.4208984375, "logps/chosen": -324.29998779296875, "logps/rejected": -405.6499938964844, "loss": 0.0308, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.329760789871216, "rewards/margins": 6.49609375, "rewards/rejected": -8.826562881469727, "step": 4580 }, { "epoch": 2.419609910384818, "grad_norm": 11.879811388385038, "learning_rate": 3.952293094359515e-07, "logits/chosen": -2.4898438453674316, "logits/rejected": -2.5146484375, "logps/chosen": -357.23748779296875, "logps/rejected": -380.875, "loss": 0.0454, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -2.5531005859375, "rewards/margins": 6.139843940734863, "rewards/rejected": -8.685937881469727, "step": 4590 }, { "epoch": 2.4248813916710596, "grad_norm": 21.325175745469878, "learning_rate": 3.939114391143911e-07, "logits/chosen": -2.3248047828674316, "logits/rejected": -2.3714842796325684, "logps/chosen": -398.875, "logps/rejected": -392.29998779296875, "loss": 0.039, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -2.45556640625, "rewards/margins": 6.412109375, "rewards/rejected": -8.867968559265137, "step": 4600 }, { "epoch": 2.4301528729573008, "grad_norm": 29.58340745150503, "learning_rate": 3.925935687928308e-07, "logits/chosen": -2.347851514816284, "logits/rejected": -2.353710889816284, "logps/chosen": -355.2250061035156, "logps/rejected": -390.1000061035156, "loss": 0.0279, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.413037061691284, "rewards/margins": 6.204687595367432, "rewards/rejected": -8.619531631469727, "step": 4610 }, { "epoch": 2.4354243542435423, "grad_norm": 13.524957087829367, "learning_rate": 3.912756984712704e-07, "logits/chosen": -2.442578077316284, "logits/rejected": -2.388867139816284, "logps/chosen": -335.76251220703125, "logps/rejected": -376.375, "loss": 0.033, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.606689453125, "rewards/margins": 5.982031345367432, "rewards/rejected": -7.589062690734863, "step": 4620 }, { "epoch": 2.440695835529784, "grad_norm": 2.5831783262696133, "learning_rate": 3.8995782814971004e-07, "logits/chosen": -2.348437547683716, "logits/rejected": -2.28125, "logps/chosen": -362.9624938964844, "logps/rejected": -372.6499938964844, "loss": 0.0176, "rewards/accuracies": 1.0, "rewards/chosen": -1.912988305091858, "rewards/margins": 6.3515625, "rewards/rejected": -8.26953125, "step": 4630 }, { "epoch": 2.4459673168160254, "grad_norm": 9.453861244681871, "learning_rate": 3.886399578281497e-07, "logits/chosen": -2.273632764816284, "logits/rejected": -2.29296875, "logps/chosen": -375.07501220703125, "logps/rejected": -399.04998779296875, "loss": 0.022, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.4274413585662842, "rewards/margins": 6.506249904632568, "rewards/rejected": -7.927343845367432, "step": 4640 }, { "epoch": 2.451238798102267, "grad_norm": 32.09285496420094, "learning_rate": 3.8732208750658934e-07, "logits/chosen": -2.408007860183716, "logits/rejected": -2.5697264671325684, "logps/chosen": -380.42498779296875, "logps/rejected": -381.3500061035156, "loss": 0.0251, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.4129881858825684, "rewards/margins": 6.339453220367432, "rewards/rejected": -8.754687309265137, "step": 4650 }, { "epoch": 2.456510279388508, "grad_norm": 13.212038515410592, "learning_rate": 3.8600421718502893e-07, "logits/chosen": -2.352734327316284, "logits/rejected": -2.367382764816284, "logps/chosen": -365.5249938964844, "logps/rejected": -406.79998779296875, "loss": 0.0239, "rewards/accuracies": 1.0, "rewards/chosen": -2.8282227516174316, "rewards/margins": 6.369531154632568, "rewards/rejected": -9.196874618530273, "step": 4660 }, { "epoch": 2.4617817606747496, "grad_norm": 2.330391076042177, "learning_rate": 3.8468634686346863e-07, "logits/chosen": -2.497265577316284, "logits/rejected": -2.4164061546325684, "logps/chosen": -356.0375061035156, "logps/rejected": -404.92498779296875, "loss": 0.0319, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.296191453933716, "rewards/margins": 6.521484375, "rewards/rejected": -9.819531440734863, "step": 4670 }, { "epoch": 2.467053241960991, "grad_norm": 3.4915748860477733, "learning_rate": 3.8336847654190823e-07, "logits/chosen": -2.468554735183716, "logits/rejected": -2.519335985183716, "logps/chosen": -351.5, "logps/rejected": -388.0249938964844, "loss": 0.0223, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.447998046875, "rewards/margins": 6.684765815734863, "rewards/rejected": -9.13671875, "step": 4680 }, { "epoch": 2.4723247232472323, "grad_norm": 6.51656472559811, "learning_rate": 3.8205060622034793e-07, "logits/chosen": -2.3046875, "logits/rejected": -2.400585889816284, "logps/chosen": -360.04998779296875, "logps/rejected": -378.1499938964844, "loss": 0.0257, "rewards/accuracies": 1.0, "rewards/chosen": -2.271728515625, "rewards/margins": 5.990234375, "rewards/rejected": -8.259374618530273, "step": 4690 }, { "epoch": 2.477596204533474, "grad_norm": 46.94441480124907, "learning_rate": 3.807327358987875e-07, "logits/chosen": -2.451367139816284, "logits/rejected": -2.456835985183716, "logps/chosen": -349.4750061035156, "logps/rejected": -385.98748779296875, "loss": 0.0404, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.25830078125, "rewards/margins": 5.951952934265137, "rewards/rejected": -8.211718559265137, "step": 4700 }, { "epoch": 2.4828676858197154, "grad_norm": 21.710564074395794, "learning_rate": 3.7941486557722717e-07, "logits/chosen": -2.434765577316284, "logits/rejected": -2.525195360183716, "logps/chosen": -350.23748779296875, "logps/rejected": -376.54998779296875, "loss": 0.023, "rewards/accuracies": 1.0, "rewards/chosen": -2.1756834983825684, "rewards/margins": 6.154296875, "rewards/rejected": -8.333984375, "step": 4710 }, { "epoch": 2.488139167105957, "grad_norm": 31.564163665298572, "learning_rate": 3.7809699525566687e-07, "logits/chosen": -2.320507764816284, "logits/rejected": -2.33984375, "logps/chosen": -375.3999938964844, "logps/rejected": -400.45001220703125, "loss": 0.0273, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.0259766578674316, "rewards/margins": 6.062890529632568, "rewards/rejected": -8.087890625, "step": 4720 }, { "epoch": 2.493410648392198, "grad_norm": 4.816976325324126, "learning_rate": 3.7677912493410647e-07, "logits/chosen": -2.5400390625, "logits/rejected": -2.59375, "logps/chosen": -361.1000061035156, "logps/rejected": -370.375, "loss": 0.0185, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.6088500022888184, "rewards/margins": 6.500781059265137, "rewards/rejected": -9.111719131469727, "step": 4730 }, { "epoch": 2.4986821296784396, "grad_norm": 16.355981825149158, "learning_rate": 3.754612546125461e-07, "logits/chosen": -2.5484375953674316, "logits/rejected": -2.5697264671325684, "logps/chosen": -315.9750061035156, "logps/rejected": -349.2749938964844, "loss": 0.0315, "rewards/accuracies": 1.0, "rewards/chosen": -3.063915967941284, "rewards/margins": 6.009765625, "rewards/rejected": -9.074999809265137, "step": 4740 }, { "epoch": 2.503953610964681, "grad_norm": 5.059324172517705, "learning_rate": 3.7414338429098576e-07, "logits/chosen": -2.5355467796325684, "logits/rejected": -2.5224609375, "logps/chosen": -390.7875061035156, "logps/rejected": -404.95001220703125, "loss": 0.0273, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.976367235183716, "rewards/margins": 6.017187595367432, "rewards/rejected": -8.9921875, "step": 4750 }, { "epoch": 2.5092250922509223, "grad_norm": 13.869955916580583, "learning_rate": 3.728255139694254e-07, "logits/chosen": -2.537304639816284, "logits/rejected": -2.5650391578674316, "logps/chosen": -361.07501220703125, "logps/rejected": -384.0249938964844, "loss": 0.0258, "rewards/accuracies": 1.0, "rewards/chosen": -2.7988524436950684, "rewards/margins": 6.403906345367432, "rewards/rejected": -9.200780868530273, "step": 4760 }, { "epoch": 2.514496573537164, "grad_norm": 59.38402455119547, "learning_rate": 3.71507643647865e-07, "logits/chosen": -2.4810547828674316, "logits/rejected": -2.4164061546325684, "logps/chosen": -369.8500061035156, "logps/rejected": -391.45001220703125, "loss": 0.0277, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.4853272438049316, "rewards/margins": 6.053124904632568, "rewards/rejected": -8.537500381469727, "step": 4770 }, { "epoch": 2.5197680548234054, "grad_norm": 14.419123002742088, "learning_rate": 3.701897733263047e-07, "logits/chosen": -2.5855469703674316, "logits/rejected": -2.560351610183716, "logps/chosen": -326.5249938964844, "logps/rejected": -371.7250061035156, "loss": 0.033, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.37939453125, "rewards/margins": 5.934374809265137, "rewards/rejected": -8.315625190734863, "step": 4780 }, { "epoch": 2.525039536109647, "grad_norm": 10.114157158662161, "learning_rate": 3.688719030047443e-07, "logits/chosen": -2.6572265625, "logits/rejected": -2.5467772483825684, "logps/chosen": -319.95001220703125, "logps/rejected": -372.25, "loss": 0.0252, "rewards/accuracies": 1.0, "rewards/chosen": -2.426562547683716, "rewards/margins": 5.912499904632568, "rewards/rejected": -8.342187881469727, "step": 4790 }, { "epoch": 2.5303110173958885, "grad_norm": 4.662008232081926, "learning_rate": 3.6755403268318395e-07, "logits/chosen": -2.310546875, "logits/rejected": -2.540234327316284, "logps/chosen": -412.1499938964844, "logps/rejected": -387.2749938964844, "loss": 0.0188, "rewards/accuracies": 1.0, "rewards/chosen": -2.5169677734375, "rewards/margins": 6.426953315734863, "rewards/rejected": -8.942968368530273, "step": 4800 }, { "epoch": 2.5355824986821296, "grad_norm": 17.81538011910818, "learning_rate": 3.662361623616236e-07, "logits/chosen": -2.521289110183716, "logits/rejected": -2.4185547828674316, "logps/chosen": -314.82501220703125, "logps/rejected": -383.4750061035156, "loss": 0.0384, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.225756883621216, "rewards/margins": 6.376172065734863, "rewards/rejected": -8.598437309265137, "step": 4810 }, { "epoch": 2.540853979968371, "grad_norm": 4.380183925308428, "learning_rate": 3.6491829204006324e-07, "logits/chosen": -2.351757764816284, "logits/rejected": -2.3275389671325684, "logps/chosen": -358.25, "logps/rejected": -413.07501220703125, "loss": 0.0228, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.25, "rewards/margins": 6.12890625, "rewards/rejected": -8.382031440734863, "step": 4820 }, { "epoch": 2.5461254612546127, "grad_norm": 4.0072892906861, "learning_rate": 3.6360042171850284e-07, "logits/chosen": -2.44482421875, "logits/rejected": -2.5248045921325684, "logps/chosen": -336.1875, "logps/rejected": -393.5, "loss": 0.0368, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.238842725753784, "rewards/margins": 6.2265625, "rewards/rejected": -8.467968940734863, "step": 4830 }, { "epoch": 2.551396942540854, "grad_norm": 9.380539448027191, "learning_rate": 3.6228255139694254e-07, "logits/chosen": -2.4315428733825684, "logits/rejected": -2.4419922828674316, "logps/chosen": -347.17498779296875, "logps/rejected": -389.29998779296875, "loss": 0.0383, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.0243287086486816, "rewards/margins": 5.893359184265137, "rewards/rejected": -7.913281440734863, "step": 4840 }, { "epoch": 2.5566684238270954, "grad_norm": 25.25385542047146, "learning_rate": 3.609646810753822e-07, "logits/chosen": -2.4253907203674316, "logits/rejected": -2.5296874046325684, "logps/chosen": -342.625, "logps/rejected": -363.875, "loss": 0.0261, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.2198243141174316, "rewards/margins": 6.241796970367432, "rewards/rejected": -8.457812309265137, "step": 4850 }, { "epoch": 2.561939905113337, "grad_norm": 34.6524944228794, "learning_rate": 3.5964681075382183e-07, "logits/chosen": -2.611328125, "logits/rejected": -2.526171922683716, "logps/chosen": -325.75, "logps/rejected": -358.95001220703125, "loss": 0.0688, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -2.835742235183716, "rewards/margins": 5.888281345367432, "rewards/rejected": -8.725781440734863, "step": 4860 }, { "epoch": 2.5672113863995785, "grad_norm": 1.7532926406868257, "learning_rate": 3.583289404322615e-07, "logits/chosen": -2.4986329078674316, "logits/rejected": -2.476367235183716, "logps/chosen": -342.125, "logps/rejected": -376.3999938964844, "loss": 0.0319, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.6187500953674316, "rewards/margins": 6.301171779632568, "rewards/rejected": -9.919530868530273, "step": 4870 }, { "epoch": 2.5724828676858196, "grad_norm": 10.775532306894341, "learning_rate": 3.570110701107011e-07, "logits/chosen": -2.5918946266174316, "logits/rejected": -2.486132860183716, "logps/chosen": -345.3999938964844, "logps/rejected": -376.95001220703125, "loss": 0.0307, "rewards/accuracies": 1.0, "rewards/chosen": -3.08740234375, "rewards/margins": 6.285937309265137, "rewards/rejected": -9.376562118530273, "step": 4880 }, { "epoch": 2.577754348972061, "grad_norm": 1.3769205874889172, "learning_rate": 3.556931997891408e-07, "logits/chosen": -2.3814454078674316, "logits/rejected": -2.565234422683716, "logps/chosen": -344.17498779296875, "logps/rejected": -382.2749938964844, "loss": 0.0258, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.032421827316284, "rewards/margins": 6.714062690734863, "rewards/rejected": -9.75, "step": 4890 }, { "epoch": 2.5830258302583027, "grad_norm": 9.407352575900404, "learning_rate": 3.5437532946758037e-07, "logits/chosen": -2.453125, "logits/rejected": -2.512890577316284, "logps/chosen": -395.95001220703125, "logps/rejected": -427.625, "loss": 0.0278, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.8564453125, "rewards/margins": 6.3203125, "rewards/rejected": -9.178906440734863, "step": 4900 }, { "epoch": 2.588297311544544, "grad_norm": 16.143086939043783, "learning_rate": 3.5305745914602e-07, "logits/chosen": -2.5072264671325684, "logits/rejected": -2.553906202316284, "logps/chosen": -368.6499938964844, "logps/rejected": -389.54998779296875, "loss": 0.0306, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.7887206077575684, "rewards/margins": 6.118359565734863, "rewards/rejected": -8.907031059265137, "step": 4910 }, { "epoch": 2.5935687928307853, "grad_norm": 8.420949902066969, "learning_rate": 3.5173958882445966e-07, "logits/chosen": -2.3519530296325684, "logits/rejected": -2.4134764671325684, "logps/chosen": -398.82501220703125, "logps/rejected": -423.42498779296875, "loss": 0.022, "rewards/accuracies": 1.0, "rewards/chosen": -3.24609375, "rewards/margins": 6.149218559265137, "rewards/rejected": -9.3984375, "step": 4920 }, { "epoch": 2.598840274117027, "grad_norm": 25.357914109659514, "learning_rate": 3.504217185028993e-07, "logits/chosen": -2.490429639816284, "logits/rejected": -2.3382811546325684, "logps/chosen": -370.2124938964844, "logps/rejected": -437.20001220703125, "loss": 0.034, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.0101561546325684, "rewards/margins": 6.321484565734863, "rewards/rejected": -9.328906059265137, "step": 4930 }, { "epoch": 2.6041117554032684, "grad_norm": 26.766101107994274, "learning_rate": 3.491038481813389e-07, "logits/chosen": -2.333203077316284, "logits/rejected": -2.3626952171325684, "logps/chosen": -383.1875, "logps/rejected": -408.75, "loss": 0.0423, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -2.873242139816284, "rewards/margins": 6.666406154632568, "rewards/rejected": -9.542187690734863, "step": 4940 }, { "epoch": 2.60938323668951, "grad_norm": 4.578326783718189, "learning_rate": 3.477859778597786e-07, "logits/chosen": -2.386914014816284, "logits/rejected": -2.5054688453674316, "logps/chosen": -385.07501220703125, "logps/rejected": -409.92498779296875, "loss": 0.0332, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.468212842941284, "rewards/margins": 6.352734565734863, "rewards/rejected": -8.823437690734863, "step": 4950 }, { "epoch": 2.614654717975751, "grad_norm": 2.636177665132458, "learning_rate": 3.464681075382182e-07, "logits/chosen": -2.5345702171325684, "logits/rejected": -2.602734327316284, "logps/chosen": -346.79998779296875, "logps/rejected": -375.8999938964844, "loss": 0.0271, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.8873047828674316, "rewards/margins": 6.605078220367432, "rewards/rejected": -9.482812881469727, "step": 4960 }, { "epoch": 2.6199261992619927, "grad_norm": 11.00656359783026, "learning_rate": 3.4515023721665785e-07, "logits/chosen": -2.4085936546325684, "logits/rejected": -2.4306640625, "logps/chosen": -349.5, "logps/rejected": -381.875, "loss": 0.0343, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.5691895484924316, "rewards/margins": 6.797656059265137, "rewards/rejected": -9.369531631469727, "step": 4970 }, { "epoch": 2.625197680548234, "grad_norm": 4.583851873091563, "learning_rate": 3.438323668950975e-07, "logits/chosen": -2.4029297828674316, "logits/rejected": -2.494335889816284, "logps/chosen": -386.67498779296875, "logps/rejected": -398.70001220703125, "loss": 0.0337, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.9105467796325684, "rewards/margins": 5.889062404632568, "rewards/rejected": -8.800000190734863, "step": 4980 }, { "epoch": 2.6304691618344753, "grad_norm": 35.06585593802394, "learning_rate": 3.4251449657353714e-07, "logits/chosen": -2.5107421875, "logits/rejected": -2.5068359375, "logps/chosen": -338.2250061035156, "logps/rejected": -380.1875, "loss": 0.0254, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.1869139671325684, "rewards/margins": 6.487109184265137, "rewards/rejected": -9.667187690734863, "step": 4990 }, { "epoch": 2.635740643120717, "grad_norm": 3.771790226346486, "learning_rate": 3.411966262519768e-07, "logits/chosen": -2.5498046875, "logits/rejected": -2.67578125, "logps/chosen": -337.9750061035156, "logps/rejected": -359.32501220703125, "loss": 0.0349, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.7564940452575684, "rewards/margins": 6.202343940734863, "rewards/rejected": -8.960156440734863, "step": 5000 }, { "epoch": 2.6410121244069584, "grad_norm": 4.652220125050645, "learning_rate": 3.3987875593041644e-07, "logits/chosen": -2.5423827171325684, "logits/rejected": -2.504687547683716, "logps/chosen": -365.0625, "logps/rejected": -430.4750061035156, "loss": 0.0238, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.976123094558716, "rewards/margins": 6.591796875, "rewards/rejected": -9.56640625, "step": 5010 }, { "epoch": 2.6462836056932, "grad_norm": 9.699933751521531, "learning_rate": 3.385608856088561e-07, "logits/chosen": -2.394335985183716, "logits/rejected": -2.5179686546325684, "logps/chosen": -399.13751220703125, "logps/rejected": -397.57501220703125, "loss": 0.0194, "rewards/accuracies": 1.0, "rewards/chosen": -2.387646436691284, "rewards/margins": 6.071875095367432, "rewards/rejected": -8.462499618530273, "step": 5020 }, { "epoch": 2.651555086979441, "grad_norm": 6.548941023534973, "learning_rate": 3.372430152872957e-07, "logits/chosen": -2.4625000953674316, "logits/rejected": -2.6849608421325684, "logps/chosen": -379.45001220703125, "logps/rejected": -391.4750061035156, "loss": 0.027, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.2099609375, "rewards/margins": 6.091406345367432, "rewards/rejected": -8.300000190734863, "step": 5030 }, { "epoch": 2.6568265682656826, "grad_norm": 5.561709988670695, "learning_rate": 3.359251449657354e-07, "logits/chosen": -2.486328125, "logits/rejected": -2.4615235328674316, "logps/chosen": -351.3500061035156, "logps/rejected": -357.625, "loss": 0.0241, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.282543897628784, "rewards/margins": 6.175000190734863, "rewards/rejected": -8.462499618530273, "step": 5040 }, { "epoch": 2.662098049551924, "grad_norm": 32.32936101425133, "learning_rate": 3.34607274644175e-07, "logits/chosen": -2.476757764816284, "logits/rejected": -2.417773485183716, "logps/chosen": -342.42498779296875, "logps/rejected": -393.92498779296875, "loss": 0.0267, "rewards/accuracies": 1.0, "rewards/chosen": -2.470752000808716, "rewards/margins": 6.349218845367432, "rewards/rejected": -8.81640625, "step": 5050 }, { "epoch": 2.6673695308381653, "grad_norm": 2.2745355762642325, "learning_rate": 3.332894043226147e-07, "logits/chosen": -2.454882860183716, "logits/rejected": -2.4296875, "logps/chosen": -386.2250061035156, "logps/rejected": -406.0, "loss": 0.0137, "rewards/accuracies": 1.0, "rewards/chosen": -2.805615186691284, "rewards/margins": 6.690625190734863, "rewards/rejected": -9.489843368530273, "step": 5060 }, { "epoch": 2.672641012124407, "grad_norm": 6.057142487879341, "learning_rate": 3.3197153400105427e-07, "logits/chosen": -2.6529297828674316, "logits/rejected": -2.593945264816284, "logps/chosen": -315.3999938964844, "logps/rejected": -358.42498779296875, "loss": 0.0378, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.937206983566284, "rewards/margins": 6.456640720367432, "rewards/rejected": -9.392187118530273, "step": 5070 }, { "epoch": 2.6779124934106484, "grad_norm": 6.934765875992084, "learning_rate": 3.306536636794939e-07, "logits/chosen": -2.4122071266174316, "logits/rejected": -2.4697265625, "logps/chosen": -375.6499938964844, "logps/rejected": -385.32501220703125, "loss": 0.0358, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.8492188453674316, "rewards/margins": 6.486328125, "rewards/rejected": -9.338281631469727, "step": 5080 }, { "epoch": 2.68318397469689, "grad_norm": 8.743181637722403, "learning_rate": 3.2933579335793357e-07, "logits/chosen": -2.43359375, "logits/rejected": -2.4390625953674316, "logps/chosen": -366.54998779296875, "logps/rejected": -388.7250061035156, "loss": 0.0363, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.72119140625, "rewards/margins": 6.283984184265137, "rewards/rejected": -9.001562118530273, "step": 5090 }, { "epoch": 2.6884554559831315, "grad_norm": 2.9503075773910528, "learning_rate": 3.280179230363732e-07, "logits/chosen": -2.504687547683716, "logits/rejected": -2.5492186546325684, "logps/chosen": -365.0249938964844, "logps/rejected": -421.3999938964844, "loss": 0.0206, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.5689454078674316, "rewards/margins": 6.568359375, "rewards/rejected": -9.137499809265137, "step": 5100 }, { "epoch": 2.6937269372693726, "grad_norm": 4.794379118947249, "learning_rate": 3.267000527148128e-07, "logits/chosen": -2.5703125, "logits/rejected": -2.6166014671325684, "logps/chosen": -316.82501220703125, "logps/rejected": -429.42498779296875, "loss": 0.0323, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.899609327316284, "rewards/margins": 6.749218940734863, "rewards/rejected": -9.646875381469727, "step": 5110 }, { "epoch": 2.698998418555614, "grad_norm": 9.384032064588489, "learning_rate": 3.253821823932525e-07, "logits/chosen": -2.4281249046325684, "logits/rejected": -2.5667967796325684, "logps/chosen": -372.36248779296875, "logps/rejected": -377.1499938964844, "loss": 0.0284, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.108062744140625, "rewards/margins": 5.996874809265137, "rewards/rejected": -9.109375, "step": 5120 }, { "epoch": 2.7042698998418553, "grad_norm": 4.478580264989858, "learning_rate": 3.2406431207169216e-07, "logits/chosen": -2.356152296066284, "logits/rejected": -2.5223631858825684, "logps/chosen": -404.2875061035156, "logps/rejected": -399.2250061035156, "loss": 0.0261, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.326733350753784, "rewards/margins": 6.592187404632568, "rewards/rejected": -8.925000190734863, "step": 5130 }, { "epoch": 2.709541381128097, "grad_norm": 8.837531741836548, "learning_rate": 3.2274644175013175e-07, "logits/chosen": -2.311328172683716, "logits/rejected": -2.379687547683716, "logps/chosen": -358.7875061035156, "logps/rejected": -380.95001220703125, "loss": 0.0397, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.729248046875, "rewards/margins": 6.311718940734863, "rewards/rejected": -9.040624618530273, "step": 5140 }, { "epoch": 2.7148128624143384, "grad_norm": 1.6745203686581558, "learning_rate": 3.2142857142857145e-07, "logits/chosen": -2.4571290016174316, "logits/rejected": -2.46484375, "logps/chosen": -350.2749938964844, "logps/rejected": -390.92498779296875, "loss": 0.0181, "rewards/accuracies": 1.0, "rewards/chosen": -2.5283203125, "rewards/margins": 6.336328029632568, "rewards/rejected": -8.858593940734863, "step": 5150 }, { "epoch": 2.72008434370058, "grad_norm": 1.9592266587638987, "learning_rate": 3.2011070110701105e-07, "logits/chosen": -2.303515672683716, "logits/rejected": -2.350781202316284, "logps/chosen": -405.20001220703125, "logps/rejected": -421.8999938964844, "loss": 0.0202, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.588427782058716, "rewards/margins": 6.757031440734863, "rewards/rejected": -9.338281631469727, "step": 5160 }, { "epoch": 2.7253558249868215, "grad_norm": 5.7560252313125995, "learning_rate": 3.187928307854507e-07, "logits/chosen": -2.5322265625, "logits/rejected": -2.4986329078674316, "logps/chosen": -353.9750061035156, "logps/rejected": -427.20001220703125, "loss": 0.018, "rewards/accuracies": 1.0, "rewards/chosen": -2.831103563308716, "rewards/margins": 6.602343559265137, "rewards/rejected": -9.428906440734863, "step": 5170 }, { "epoch": 2.7306273062730626, "grad_norm": 9.462627781489445, "learning_rate": 3.1747496046389034e-07, "logits/chosen": -2.44140625, "logits/rejected": -2.4916014671325684, "logps/chosen": -331.45001220703125, "logps/rejected": -368.20001220703125, "loss": 0.0203, "rewards/accuracies": 1.0, "rewards/chosen": -2.002270460128784, "rewards/margins": 6.550000190734863, "rewards/rejected": -8.556249618530273, "step": 5180 }, { "epoch": 2.735898787559304, "grad_norm": 23.19189142692797, "learning_rate": 3.1615709014233e-07, "logits/chosen": -2.4964842796325684, "logits/rejected": -2.5914063453674316, "logps/chosen": -376.67498779296875, "logps/rejected": -396.29998779296875, "loss": 0.0277, "rewards/accuracies": 1.0, "rewards/chosen": -2.753405809402466, "rewards/margins": 6.514843940734863, "rewards/rejected": -9.268750190734863, "step": 5190 }, { "epoch": 2.7411702688455457, "grad_norm": 2.9271158376611113, "learning_rate": 3.148392198207696e-07, "logits/chosen": -2.4775390625, "logits/rejected": -2.6146483421325684, "logps/chosen": -372.20001220703125, "logps/rejected": -370.70001220703125, "loss": 0.0303, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.5904297828674316, "rewards/margins": 6.592577934265137, "rewards/rejected": -9.18359375, "step": 5200 }, { "epoch": 2.746441750131787, "grad_norm": 9.629098129304486, "learning_rate": 3.135213494992093e-07, "logits/chosen": -2.49169921875, "logits/rejected": -2.468554735183716, "logps/chosen": -325.3125, "logps/rejected": -368.7250061035156, "loss": 0.0154, "rewards/accuracies": 1.0, "rewards/chosen": -2.935839891433716, "rewards/margins": 6.371874809265137, "rewards/rejected": -9.30859375, "step": 5210 }, { "epoch": 2.7517132314180284, "grad_norm": 23.520375769897612, "learning_rate": 3.122034791776489e-07, "logits/chosen": -2.322265625, "logits/rejected": -2.420703172683716, "logps/chosen": -399.0874938964844, "logps/rejected": -382.42498779296875, "loss": 0.0395, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.71044921875, "rewards/margins": 6.278515815734863, "rewards/rejected": -8.986719131469727, "step": 5220 }, { "epoch": 2.75698471270427, "grad_norm": 5.750344140587931, "learning_rate": 3.108856088560886e-07, "logits/chosen": -2.6371092796325684, "logits/rejected": -2.5928711891174316, "logps/chosen": -318.11248779296875, "logps/rejected": -386.875, "loss": 0.0294, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.708081007003784, "rewards/margins": 6.970312595367432, "rewards/rejected": -9.678125381469727, "step": 5230 }, { "epoch": 2.7622561939905115, "grad_norm": 3.267592458056042, "learning_rate": 3.095677385345282e-07, "logits/chosen": -2.443164110183716, "logits/rejected": -2.3978514671325684, "logps/chosen": -370.67498779296875, "logps/rejected": -411.29998779296875, "loss": 0.0269, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.732421875, "rewards/margins": 6.544921875, "rewards/rejected": -9.2734375, "step": 5240 }, { "epoch": 2.767527675276753, "grad_norm": 9.977931482610712, "learning_rate": 3.082498682129678e-07, "logits/chosen": -2.4703125953674316, "logits/rejected": -2.6001954078674316, "logps/chosen": -360.8125, "logps/rejected": -370.4750061035156, "loss": 0.0311, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.7757325172424316, "rewards/margins": 6.250781059265137, "rewards/rejected": -9.021875381469727, "step": 5250 }, { "epoch": 2.772799156562994, "grad_norm": 7.955301518076793, "learning_rate": 3.0693199789140747e-07, "logits/chosen": -2.4560546875, "logits/rejected": -2.421191453933716, "logps/chosen": -391.2250061035156, "logps/rejected": -452.75, "loss": 0.0272, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.9876952171325684, "rewards/margins": 6.793749809265137, "rewards/rejected": -9.782812118530273, "step": 5260 }, { "epoch": 2.7780706378492357, "grad_norm": 4.47037511280937, "learning_rate": 3.056141275698471e-07, "logits/chosen": -2.657031297683716, "logits/rejected": -2.5648436546325684, "logps/chosen": -369.36248779296875, "logps/rejected": -441.9750061035156, "loss": 0.0298, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.5859131813049316, "rewards/margins": 6.059374809265137, "rewards/rejected": -8.649218559265137, "step": 5270 }, { "epoch": 2.783342119135477, "grad_norm": 84.06314328340954, "learning_rate": 3.0429625724828676e-07, "logits/chosen": -2.5113282203674316, "logits/rejected": -2.6890625953674316, "logps/chosen": -366.3500061035156, "logps/rejected": -360.82501220703125, "loss": 0.0387, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.561816453933716, "rewards/margins": 6.879687309265137, "rewards/rejected": -9.443750381469727, "step": 5280 }, { "epoch": 2.7886136004217184, "grad_norm": 6.251925695079034, "learning_rate": 3.029783869267264e-07, "logits/chosen": -2.392578125, "logits/rejected": -2.471484422683716, "logps/chosen": -367.92498779296875, "logps/rejected": -408.5, "loss": 0.045, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.594482421875, "rewards/margins": 6.415820121765137, "rewards/rejected": -9.007031440734863, "step": 5290 }, { "epoch": 2.79388508170796, "grad_norm": 17.212653173542158, "learning_rate": 3.0166051660516606e-07, "logits/chosen": -2.533203125, "logits/rejected": -2.4921875, "logps/chosen": -351.7875061035156, "logps/rejected": -419.29998779296875, "loss": 0.0375, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.835681200027466, "rewards/margins": 6.307031154632568, "rewards/rejected": -9.138280868530273, "step": 5300 }, { "epoch": 2.7991565629942015, "grad_norm": 8.649047728693276, "learning_rate": 3.0034264628360565e-07, "logits/chosen": -2.4935545921325684, "logits/rejected": -2.4501953125, "logps/chosen": -379.375, "logps/rejected": -429.45001220703125, "loss": 0.0418, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -3.130566358566284, "rewards/margins": 6.165625095367432, "rewards/rejected": -9.296875, "step": 5310 }, { "epoch": 2.804428044280443, "grad_norm": 16.959847128296655, "learning_rate": 2.9902477596204535e-07, "logits/chosen": -2.5611329078674316, "logits/rejected": -2.5556640625, "logps/chosen": -367.8500061035156, "logps/rejected": -392.9750061035156, "loss": 0.0508, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.783984422683716, "rewards/margins": 6.543749809265137, "rewards/rejected": -9.324999809265137, "step": 5320 }, { "epoch": 2.809699525566684, "grad_norm": 5.35372485185501, "learning_rate": 2.9770690564048495e-07, "logits/chosen": -2.519335985183716, "logits/rejected": -2.5074219703674316, "logps/chosen": -355.54998779296875, "logps/rejected": -411.67498779296875, "loss": 0.0236, "rewards/accuracies": 1.0, "rewards/chosen": -2.843945264816284, "rewards/margins": 6.336718559265137, "rewards/rejected": -9.182031631469727, "step": 5330 }, { "epoch": 2.8149710068529257, "grad_norm": 4.724629474781166, "learning_rate": 2.963890353189246e-07, "logits/chosen": -2.349804639816284, "logits/rejected": -2.3382811546325684, "logps/chosen": -364.2250061035156, "logps/rejected": -369.375, "loss": 0.0256, "rewards/accuracies": 1.0, "rewards/chosen": -2.5552735328674316, "rewards/margins": 6.456250190734863, "rewards/rejected": -9.014062881469727, "step": 5340 }, { "epoch": 2.8202424881391672, "grad_norm": 1.2481450516240786, "learning_rate": 2.9507116499736424e-07, "logits/chosen": -2.418750047683716, "logits/rejected": -2.424023389816284, "logps/chosen": -356.32501220703125, "logps/rejected": -410.23748779296875, "loss": 0.0181, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.2862792015075684, "rewards/margins": 6.944531440734863, "rewards/rejected": -9.224218368530273, "step": 5350 }, { "epoch": 2.8255139694254083, "grad_norm": 17.580106229545315, "learning_rate": 2.937532946758039e-07, "logits/chosen": -2.667187452316284, "logits/rejected": -2.6240234375, "logps/chosen": -300.8125, "logps/rejected": -356.45001220703125, "loss": 0.0683, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.0621094703674316, "rewards/margins": 6.311132907867432, "rewards/rejected": -9.373437881469727, "step": 5360 }, { "epoch": 2.83078545071165, "grad_norm": 3.0806518226305477, "learning_rate": 2.924354243542435e-07, "logits/chosen": -2.4888672828674316, "logits/rejected": -2.466601610183716, "logps/chosen": -376.63751220703125, "logps/rejected": -430.92498779296875, "loss": 0.0158, "rewards/accuracies": 1.0, "rewards/chosen": -3.725390672683716, "rewards/margins": 6.702343940734863, "rewards/rejected": -10.435155868530273, "step": 5370 }, { "epoch": 2.8360569319978914, "grad_norm": 1.4042336995296651, "learning_rate": 2.911175540326832e-07, "logits/chosen": -2.42431640625, "logits/rejected": -2.426953077316284, "logps/chosen": -374.8999938964844, "logps/rejected": -410.8500061035156, "loss": 0.0167, "rewards/accuracies": 1.0, "rewards/chosen": -3.11328125, "rewards/margins": 6.56640625, "rewards/rejected": -9.674219131469727, "step": 5380 }, { "epoch": 2.841328413284133, "grad_norm": 26.242690713708125, "learning_rate": 2.897996837111228e-07, "logits/chosen": -2.634765625, "logits/rejected": -2.567187547683716, "logps/chosen": -357.38751220703125, "logps/rejected": -382.25, "loss": 0.0231, "rewards/accuracies": 1.0, "rewards/chosen": -2.532031297683716, "rewards/margins": 6.409375190734863, "rewards/rejected": -8.939062118530273, "step": 5390 }, { "epoch": 2.8465998945703745, "grad_norm": 49.25163139646493, "learning_rate": 2.884818133895625e-07, "logits/chosen": -2.4599609375, "logits/rejected": -2.4945311546325684, "logps/chosen": -322.6000061035156, "logps/rejected": -391.3999938964844, "loss": 0.0365, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.224926710128784, "rewards/margins": 6.502734184265137, "rewards/rejected": -8.729687690734863, "step": 5400 }, { "epoch": 2.8518713758566157, "grad_norm": 54.52533803043459, "learning_rate": 2.871639430680021e-07, "logits/chosen": -2.4546875953674316, "logits/rejected": -2.4974608421325684, "logps/chosen": -351.7749938964844, "logps/rejected": -376.0, "loss": 0.0352, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.113037109375, "rewards/margins": 6.206640720367432, "rewards/rejected": -8.32421875, "step": 5410 }, { "epoch": 2.857142857142857, "grad_norm": 9.972654857726713, "learning_rate": 2.858460727464417e-07, "logits/chosen": -2.5863280296325684, "logits/rejected": -2.5531249046325684, "logps/chosen": -336.0, "logps/rejected": -378.3999938964844, "loss": 0.0155, "rewards/accuracies": 1.0, "rewards/chosen": -2.2603759765625, "rewards/margins": 6.396874904632568, "rewards/rejected": -8.657812118530273, "step": 5420 }, { "epoch": 2.8624143384290983, "grad_norm": 3.977078023499143, "learning_rate": 2.845282024248814e-07, "logits/chosen": -2.4853515625, "logits/rejected": -2.4482421875, "logps/chosen": -372.6000061035156, "logps/rejected": -410.92498779296875, "loss": 0.0319, "rewards/accuracies": 1.0, "rewards/chosen": -2.9549803733825684, "rewards/margins": 5.90234375, "rewards/rejected": -8.853906631469727, "step": 5430 }, { "epoch": 2.86768581971534, "grad_norm": 10.212196644812408, "learning_rate": 2.83210332103321e-07, "logits/chosen": -2.526171922683716, "logits/rejected": -2.532031297683716, "logps/chosen": -374.26251220703125, "logps/rejected": -407.2250061035156, "loss": 0.0148, "rewards/accuracies": 1.0, "rewards/chosen": -2.9615235328674316, "rewards/margins": 6.668359279632568, "rewards/rejected": -9.637499809265137, "step": 5440 }, { "epoch": 2.8729573010015814, "grad_norm": 7.816127264893912, "learning_rate": 2.8189246178176067e-07, "logits/chosen": -2.517773389816284, "logits/rejected": -2.4781250953674316, "logps/chosen": -402.6499938964844, "logps/rejected": -426.32501220703125, "loss": 0.0197, "rewards/accuracies": 1.0, "rewards/chosen": -3.2711424827575684, "rewards/margins": 6.643750190734863, "rewards/rejected": -9.910937309265137, "step": 5450 }, { "epoch": 2.878228782287823, "grad_norm": 30.82970374225245, "learning_rate": 2.805745914602003e-07, "logits/chosen": -2.5931639671325684, "logits/rejected": -2.5849609375, "logps/chosen": -369.95001220703125, "logps/rejected": -421.95001220703125, "loss": 0.03, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.1900391578674316, "rewards/margins": 6.37109375, "rewards/rejected": -9.5625, "step": 5460 }, { "epoch": 2.8835002635740645, "grad_norm": 7.2119969640521955, "learning_rate": 2.7925672113863996e-07, "logits/chosen": -2.5000977516174316, "logits/rejected": -2.488476514816284, "logps/chosen": -343.95001220703125, "logps/rejected": -362.7250061035156, "loss": 0.0297, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.5213379859924316, "rewards/margins": 6.819531440734863, "rewards/rejected": -9.345312118530273, "step": 5470 }, { "epoch": 2.8887717448603056, "grad_norm": 10.86107341090755, "learning_rate": 2.7793885081707956e-07, "logits/chosen": -2.3951172828674316, "logits/rejected": -2.498242139816284, "logps/chosen": -418.82501220703125, "logps/rejected": -421.0249938964844, "loss": 0.0185, "rewards/accuracies": 1.0, "rewards/chosen": -2.8121094703674316, "rewards/margins": 6.466015815734863, "rewards/rejected": -9.283594131469727, "step": 5480 }, { "epoch": 2.894043226146547, "grad_norm": 7.641029646285157, "learning_rate": 2.7662098049551926e-07, "logits/chosen": -2.5999999046325684, "logits/rejected": -2.553906202316284, "logps/chosen": -361.36248779296875, "logps/rejected": -394.7250061035156, "loss": 0.0486, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -3.2906250953674316, "rewards/margins": 6.431640625, "rewards/rejected": -9.721875190734863, "step": 5490 }, { "epoch": 2.8993147074327887, "grad_norm": 5.537682613968269, "learning_rate": 2.7530311017395885e-07, "logits/chosen": -2.523632764816284, "logits/rejected": -2.48046875, "logps/chosen": -344.07501220703125, "logps/rejected": -392.8500061035156, "loss": 0.0293, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.2557616233825684, "rewards/margins": 6.689062595367432, "rewards/rejected": -9.947656631469727, "step": 5500 }, { "epoch": 2.90458618871903, "grad_norm": 9.624082450656916, "learning_rate": 2.739852398523985e-07, "logits/chosen": -2.466601610183716, "logits/rejected": -2.5250000953674316, "logps/chosen": -374.07501220703125, "logps/rejected": -417.7749938964844, "loss": 0.021, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.056835889816284, "rewards/margins": 7.34375, "rewards/rejected": -10.397656440734863, "step": 5510 }, { "epoch": 2.9098576700052714, "grad_norm": 19.342530505623508, "learning_rate": 2.7266736953083815e-07, "logits/chosen": -2.5943360328674316, "logits/rejected": -2.5658202171325684, "logps/chosen": -330.7749938964844, "logps/rejected": -411.70001220703125, "loss": 0.0237, "rewards/accuracies": 1.0, "rewards/chosen": -3.1201171875, "rewards/margins": 6.428515434265137, "rewards/rejected": -9.548437118530273, "step": 5520 }, { "epoch": 2.915129151291513, "grad_norm": 5.897066376371074, "learning_rate": 2.713494992092778e-07, "logits/chosen": -2.316210985183716, "logits/rejected": -2.398242235183716, "logps/chosen": -344.67498779296875, "logps/rejected": -396.70001220703125, "loss": 0.0326, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.826342821121216, "rewards/margins": 6.494921684265137, "rewards/rejected": -9.321874618530273, "step": 5530 }, { "epoch": 2.9204006325777545, "grad_norm": 4.29176816727523, "learning_rate": 2.700316288877174e-07, "logits/chosen": -2.6537108421325684, "logits/rejected": -2.5386719703674316, "logps/chosen": -348.8999938964844, "logps/rejected": -410.4750061035156, "loss": 0.0157, "rewards/accuracies": 1.0, "rewards/chosen": -3.3412108421325684, "rewards/margins": 6.512499809265137, "rewards/rejected": -9.859375, "step": 5540 }, { "epoch": 2.925672113863996, "grad_norm": 8.487113737071832, "learning_rate": 2.687137585661571e-07, "logits/chosen": -2.627148389816284, "logits/rejected": -2.5374999046325684, "logps/chosen": -343.7250061035156, "logps/rejected": -393.2749938964844, "loss": 0.0396, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.732421875, "rewards/margins": 6.287109375, "rewards/rejected": -9.017187118530273, "step": 5550 }, { "epoch": 2.930943595150237, "grad_norm": 10.30884990625693, "learning_rate": 2.6739588824459674e-07, "logits/chosen": -2.553906202316284, "logits/rejected": -2.4749999046325684, "logps/chosen": -347.0375061035156, "logps/rejected": -414.1499938964844, "loss": 0.0421, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.234057664871216, "rewards/margins": 6.585156440734863, "rewards/rejected": -8.82421875, "step": 5560 }, { "epoch": 2.9362150764364787, "grad_norm": 17.042822587743977, "learning_rate": 2.660780179230364e-07, "logits/chosen": -2.474414110183716, "logits/rejected": -2.527539014816284, "logps/chosen": -338.8999938964844, "logps/rejected": -383.82501220703125, "loss": 0.0204, "rewards/accuracies": 1.0, "rewards/chosen": -2.28662109375, "rewards/margins": 6.44921875, "rewards/rejected": -8.736719131469727, "step": 5570 }, { "epoch": 2.94148655772272, "grad_norm": 6.8474713793253175, "learning_rate": 2.6476014760147603e-07, "logits/chosen": -2.4771485328674316, "logits/rejected": -2.644335985183716, "logps/chosen": -334.3125, "logps/rejected": -352.0249938964844, "loss": 0.0272, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.699023485183716, "rewards/margins": 6.377343654632568, "rewards/rejected": -9.0703125, "step": 5580 }, { "epoch": 2.9467580390089614, "grad_norm": 4.296760074227696, "learning_rate": 2.634422772799156e-07, "logits/chosen": -2.527148485183716, "logits/rejected": -2.469531297683716, "logps/chosen": -331.2250061035156, "logps/rejected": -393.79998779296875, "loss": 0.0293, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.053906202316284, "rewards/margins": 6.338671684265137, "rewards/rejected": -9.393750190734863, "step": 5590 }, { "epoch": 2.952029520295203, "grad_norm": 7.858393015249107, "learning_rate": 2.6212440695835533e-07, "logits/chosen": -2.630078077316284, "logits/rejected": -2.4253907203674316, "logps/chosen": -328.36248779296875, "logps/rejected": -413.54998779296875, "loss": 0.0159, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.684375047683716, "rewards/margins": 6.892968654632568, "rewards/rejected": -9.573437690734863, "step": 5600 }, { "epoch": 2.9573010015814445, "grad_norm": 17.49219328188383, "learning_rate": 2.608065366367949e-07, "logits/chosen": -2.513476610183716, "logits/rejected": -2.4537110328674316, "logps/chosen": -336.3125, "logps/rejected": -400.125, "loss": 0.022, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.751415967941284, "rewards/margins": 6.760546684265137, "rewards/rejected": -9.51171875, "step": 5610 }, { "epoch": 2.962572482867686, "grad_norm": 2.337416938172493, "learning_rate": 2.5948866631523457e-07, "logits/chosen": -2.3028321266174316, "logits/rejected": -2.436718702316284, "logps/chosen": -383.82501220703125, "logps/rejected": -403.04998779296875, "loss": 0.026, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.7123045921325684, "rewards/margins": 6.254687309265137, "rewards/rejected": -8.962499618530273, "step": 5620 }, { "epoch": 2.967843964153927, "grad_norm": 7.471799241699862, "learning_rate": 2.581707959936742e-07, "logits/chosen": -2.4505858421325684, "logits/rejected": -2.4058594703674316, "logps/chosen": -355.1625061035156, "logps/rejected": -420.25, "loss": 0.0198, "rewards/accuracies": 1.0, "rewards/chosen": -2.513134717941284, "rewards/margins": 6.534375190734863, "rewards/rejected": -9.047656059265137, "step": 5630 }, { "epoch": 2.9731154454401687, "grad_norm": 6.704815462815935, "learning_rate": 2.5685292567211386e-07, "logits/chosen": -2.27734375, "logits/rejected": -2.428515672683716, "logps/chosen": -376.17498779296875, "logps/rejected": -423.5, "loss": 0.0141, "rewards/accuracies": 1.0, "rewards/chosen": -2.6126465797424316, "rewards/margins": 6.5546875, "rewards/rejected": -9.174219131469727, "step": 5640 }, { "epoch": 2.9783869267264103, "grad_norm": 44.991853312019266, "learning_rate": 2.5553505535055346e-07, "logits/chosen": -2.6742186546325684, "logits/rejected": -2.664257764816284, "logps/chosen": -299.0, "logps/rejected": -365.2749938964844, "loss": 0.0262, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.588427782058716, "rewards/margins": 6.821484565734863, "rewards/rejected": -9.405468940734863, "step": 5650 }, { "epoch": 2.9836584080126514, "grad_norm": 4.595094363427402, "learning_rate": 2.5421718502899316e-07, "logits/chosen": -2.4476561546325684, "logits/rejected": -2.4730467796325684, "logps/chosen": -351.3187561035156, "logps/rejected": -379.75, "loss": 0.0433, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.624218702316284, "rewards/margins": 6.487890720367432, "rewards/rejected": -9.114062309265137, "step": 5660 }, { "epoch": 2.988929889298893, "grad_norm": 6.742968592661905, "learning_rate": 2.5289931470743275e-07, "logits/chosen": -2.4544920921325684, "logits/rejected": -2.419726610183716, "logps/chosen": -363.07501220703125, "logps/rejected": -411.3500061035156, "loss": 0.0488, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -2.922070264816284, "rewards/margins": 6.8359375, "rewards/rejected": -9.758593559265137, "step": 5670 }, { "epoch": 2.9942013705851345, "grad_norm": 29.18400093271178, "learning_rate": 2.515814443858724e-07, "logits/chosen": -2.434375047683716, "logits/rejected": -2.406445264816284, "logps/chosen": -372.5, "logps/rejected": -440.9750061035156, "loss": 0.0421, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.1397461891174316, "rewards/margins": 6.915234565734863, "rewards/rejected": -10.053125381469727, "step": 5680 }, { "epoch": 2.999472851871376, "grad_norm": 2.4799619215642337, "learning_rate": 2.5026357406431205e-07, "logits/chosen": -2.4087891578674316, "logits/rejected": -2.3580079078674316, "logps/chosen": -368.57501220703125, "logps/rejected": -439.04998779296875, "loss": 0.0174, "rewards/accuracies": 1.0, "rewards/chosen": -3.0248045921325684, "rewards/margins": 6.678906440734863, "rewards/rejected": -9.706250190734863, "step": 5690 }, { "epoch": 3.004744333157617, "grad_norm": 11.101549265030135, "learning_rate": 2.489457037427517e-07, "logits/chosen": -2.675976514816284, "logits/rejected": -2.5654296875, "logps/chosen": -377.1499938964844, "logps/rejected": -447.9750061035156, "loss": 0.0126, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.87896728515625, "rewards/margins": 6.900000095367432, "rewards/rejected": -9.778905868530273, "step": 5700 }, { "epoch": 3.0100158144438587, "grad_norm": 2.009381923617145, "learning_rate": 2.4762783342119134e-07, "logits/chosen": -2.5576171875, "logits/rejected": -2.6392579078674316, "logps/chosen": -349.32501220703125, "logps/rejected": -405.375, "loss": 0.0055, "rewards/accuracies": 1.0, "rewards/chosen": -2.995312452316284, "rewards/margins": 7.35546875, "rewards/rejected": -10.350000381469727, "step": 5710 }, { "epoch": 3.0152872957301002, "grad_norm": 0.6722448729523763, "learning_rate": 2.46309963099631e-07, "logits/chosen": -2.524218797683716, "logits/rejected": -2.546875, "logps/chosen": -361.95001220703125, "logps/rejected": -420.625, "loss": 0.0028, "rewards/accuracies": 1.0, "rewards/chosen": -3.4834961891174316, "rewards/margins": 7.650000095367432, "rewards/rejected": -11.134374618530273, "step": 5720 }, { "epoch": 3.020558777016342, "grad_norm": 0.7779035407115772, "learning_rate": 2.4499209277807064e-07, "logits/chosen": -2.383984327316284, "logits/rejected": -2.4862303733825684, "logps/chosen": -402.3999938964844, "logps/rejected": -444.79998779296875, "loss": 0.003, "rewards/accuracies": 1.0, "rewards/chosen": -2.949902296066284, "rewards/margins": 7.876562595367432, "rewards/rejected": -10.827343940734863, "step": 5730 }, { "epoch": 3.025830258302583, "grad_norm": 17.918284332839903, "learning_rate": 2.4367422245651023e-07, "logits/chosen": -2.3125, "logits/rejected": -2.453125, "logps/chosen": -425.625, "logps/rejected": -476.375, "loss": 0.0069, "rewards/accuracies": 1.0, "rewards/chosen": -2.8995604515075684, "rewards/margins": 7.642187595367432, "rewards/rejected": -10.543749809265137, "step": 5740 }, { "epoch": 3.0311017395888245, "grad_norm": 1.208455321836374, "learning_rate": 2.423563521349499e-07, "logits/chosen": -2.582226514816284, "logits/rejected": -2.527539014816284, "logps/chosen": -378.29998779296875, "logps/rejected": -434.8999938964844, "loss": 0.0042, "rewards/accuracies": 1.0, "rewards/chosen": -2.416308641433716, "rewards/margins": 7.878125190734863, "rewards/rejected": -10.297656059265137, "step": 5750 }, { "epoch": 3.036373220875066, "grad_norm": 1.1708002661902333, "learning_rate": 2.4103848181338953e-07, "logits/chosen": -2.5091795921325684, "logits/rejected": -2.5386719703674316, "logps/chosen": -350.70001220703125, "logps/rejected": -388.29998779296875, "loss": 0.005, "rewards/accuracies": 1.0, "rewards/chosen": -2.78814697265625, "rewards/margins": 7.643750190734863, "rewards/rejected": -10.430468559265137, "step": 5760 }, { "epoch": 3.041644702161307, "grad_norm": 1.316810661813218, "learning_rate": 2.3972061149182923e-07, "logits/chosen": -2.5355467796325684, "logits/rejected": -2.5316405296325684, "logps/chosen": -376.04998779296875, "logps/rejected": -436.42498779296875, "loss": 0.0071, "rewards/accuracies": 1.0, "rewards/chosen": -3.198046922683716, "rewards/margins": 8.26171875, "rewards/rejected": -11.457812309265137, "step": 5770 }, { "epoch": 3.0469161834475487, "grad_norm": 2.0024039904189164, "learning_rate": 2.3840274117026885e-07, "logits/chosen": -2.492871046066284, "logits/rejected": -2.591601610183716, "logps/chosen": -416.57501220703125, "logps/rejected": -458.4750061035156, "loss": 0.0035, "rewards/accuracies": 1.0, "rewards/chosen": -3.8773436546325684, "rewards/margins": 8.154687881469727, "rewards/rejected": -12.034375190734863, "step": 5780 }, { "epoch": 3.05218766473379, "grad_norm": 6.736403804309742, "learning_rate": 2.370848708487085e-07, "logits/chosen": -2.5062499046325684, "logits/rejected": -2.6201171875, "logps/chosen": -357.7124938964844, "logps/rejected": -409.20001220703125, "loss": 0.004, "rewards/accuracies": 1.0, "rewards/chosen": -3.4169921875, "rewards/margins": 8.018750190734863, "rewards/rejected": -11.434374809265137, "step": 5790 }, { "epoch": 3.0574591460200318, "grad_norm": 4.905016932705703, "learning_rate": 2.3576700052714812e-07, "logits/chosen": -2.5904297828674316, "logits/rejected": -2.729687452316284, "logps/chosen": -354.875, "logps/rejected": -380.92498779296875, "loss": 0.0064, "rewards/accuracies": 1.0, "rewards/chosen": -3.1651368141174316, "rewards/margins": 7.782812595367432, "rewards/rejected": -10.9453125, "step": 5800 }, { "epoch": 3.062730627306273, "grad_norm": 1.712251831149801, "learning_rate": 2.3444913020558777e-07, "logits/chosen": -2.5830078125, "logits/rejected": -2.764453172683716, "logps/chosen": -365.8999938964844, "logps/rejected": -396.6499938964844, "loss": 0.0033, "rewards/accuracies": 1.0, "rewards/chosen": -3.185742139816284, "rewards/margins": 7.9765625, "rewards/rejected": -11.1640625, "step": 5810 }, { "epoch": 3.0680021085925144, "grad_norm": 0.6374043498509728, "learning_rate": 2.3313125988402741e-07, "logits/chosen": -2.627734422683716, "logits/rejected": -2.6732420921325684, "logps/chosen": -365.48748779296875, "logps/rejected": -414.1000061035156, "loss": 0.0069, "rewards/accuracies": 1.0, "rewards/chosen": -3.1136717796325684, "rewards/margins": 8.270312309265137, "rewards/rejected": -11.376562118530273, "step": 5820 }, { "epoch": 3.073273589878756, "grad_norm": 5.222563484385973, "learning_rate": 2.3181338956246703e-07, "logits/chosen": -2.568359375, "logits/rejected": -2.621289014816284, "logps/chosen": -334.8374938964844, "logps/rejected": -414.20001220703125, "loss": 0.0049, "rewards/accuracies": 1.0, "rewards/chosen": -3.6080079078674316, "rewards/margins": 8.065625190734863, "rewards/rejected": -11.67578125, "step": 5830 }, { "epoch": 3.0785450711649975, "grad_norm": 0.8863652301194332, "learning_rate": 2.3049551924090668e-07, "logits/chosen": -2.519335985183716, "logits/rejected": -2.6187500953674316, "logps/chosen": -371.7250061035156, "logps/rejected": -398.04998779296875, "loss": 0.0087, "rewards/accuracies": 1.0, "rewards/chosen": -3.153759717941284, "rewards/margins": 7.864843845367432, "rewards/rejected": -11.024218559265137, "step": 5840 }, { "epoch": 3.0838165524512386, "grad_norm": 0.6764171472413356, "learning_rate": 2.2917764891934633e-07, "logits/chosen": -2.6978516578674316, "logits/rejected": -2.7269530296325684, "logps/chosen": -379.0249938964844, "logps/rejected": -419.45001220703125, "loss": 0.0078, "rewards/accuracies": 1.0, "rewards/chosen": -2.7623047828674316, "rewards/margins": 8.06640625, "rewards/rejected": -10.830469131469727, "step": 5850 }, { "epoch": 3.08908803373748, "grad_norm": 2.590173547972466, "learning_rate": 2.2785977859778595e-07, "logits/chosen": -2.6439452171325684, "logits/rejected": -2.594531297683716, "logps/chosen": -364.8500061035156, "logps/rejected": -443.54998779296875, "loss": 0.0093, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.616015672683716, "rewards/margins": 8.404687881469727, "rewards/rejected": -12.024999618530273, "step": 5860 }, { "epoch": 3.0943595150237218, "grad_norm": 0.8978753652710939, "learning_rate": 2.265419082762256e-07, "logits/chosen": -2.6458983421325684, "logits/rejected": -2.603515625, "logps/chosen": -354.07501220703125, "logps/rejected": -422.6000061035156, "loss": 0.0037, "rewards/accuracies": 1.0, "rewards/chosen": -3.517285108566284, "rewards/margins": 8.223437309265137, "rewards/rejected": -11.73828125, "step": 5870 }, { "epoch": 3.0996309963099633, "grad_norm": 17.809911788066344, "learning_rate": 2.2522403795466525e-07, "logits/chosen": -2.6820311546325684, "logits/rejected": -2.712109327316284, "logps/chosen": -381.07501220703125, "logps/rejected": -440.3999938964844, "loss": 0.0065, "rewards/accuracies": 1.0, "rewards/chosen": -3.356719970703125, "rewards/margins": 8.278905868530273, "rewards/rejected": -11.642187118530273, "step": 5880 }, { "epoch": 3.1049024775962044, "grad_norm": 0.7506432406006608, "learning_rate": 2.239061676331049e-07, "logits/chosen": -2.77734375, "logits/rejected": -2.6458983421325684, "logps/chosen": -330.13751220703125, "logps/rejected": -407.5, "loss": 0.0078, "rewards/accuracies": 1.0, "rewards/chosen": -3.3441405296325684, "rewards/margins": 8.115625381469727, "rewards/rejected": -11.454687118530273, "step": 5890 }, { "epoch": 3.110173958882446, "grad_norm": 2.1465323335128588, "learning_rate": 2.2258829731154451e-07, "logits/chosen": -2.6005859375, "logits/rejected": -2.5367188453674316, "logps/chosen": -384.0874938964844, "logps/rejected": -424.875, "loss": 0.0059, "rewards/accuracies": 1.0, "rewards/chosen": -2.921875, "rewards/margins": 8.089062690734863, "rewards/rejected": -11.010937690734863, "step": 5900 }, { "epoch": 3.1154454401686875, "grad_norm": 0.9925880705773044, "learning_rate": 2.212704269899842e-07, "logits/chosen": -2.720703125, "logits/rejected": -2.6285157203674316, "logps/chosen": -384.0249938964844, "logps/rejected": -438.3500061035156, "loss": 0.004, "rewards/accuracies": 1.0, "rewards/chosen": -3.557727098464966, "rewards/margins": 8.143750190734863, "rewards/rejected": -11.698437690734863, "step": 5910 }, { "epoch": 3.1207169214549286, "grad_norm": 4.096564808037433, "learning_rate": 2.1995255666842384e-07, "logits/chosen": -2.6644530296325684, "logits/rejected": -2.5960936546325684, "logps/chosen": -374.79998779296875, "logps/rejected": -434.875, "loss": 0.0049, "rewards/accuracies": 1.0, "rewards/chosen": -3.167675733566284, "rewards/margins": 7.921093940734863, "rewards/rejected": -11.092968940734863, "step": 5920 }, { "epoch": 3.12598840274117, "grad_norm": 0.7999637491526248, "learning_rate": 2.1863468634686346e-07, "logits/chosen": -2.5025391578674316, "logits/rejected": -2.6664061546325684, "logps/chosen": -379.98748779296875, "logps/rejected": -440.5249938964844, "loss": 0.0225, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.218212842941284, "rewards/margins": 8.155468940734863, "rewards/rejected": -11.374218940734863, "step": 5930 }, { "epoch": 3.1312598840274117, "grad_norm": 3.6401782716902504, "learning_rate": 2.173168160253031e-07, "logits/chosen": -2.7115235328674316, "logits/rejected": -2.721874952316284, "logps/chosen": -361.54998779296875, "logps/rejected": -419.42498779296875, "loss": 0.0045, "rewards/accuracies": 1.0, "rewards/chosen": -3.985156297683716, "rewards/margins": 8.088671684265137, "rewards/rejected": -12.078125, "step": 5940 }, { "epoch": 3.1365313653136533, "grad_norm": 0.9981294094965233, "learning_rate": 2.1599894570374275e-07, "logits/chosen": -2.7791991233825684, "logits/rejected": -2.7318358421325684, "logps/chosen": -357.29998779296875, "logps/rejected": -448.7749938964844, "loss": 0.003, "rewards/accuracies": 1.0, "rewards/chosen": -3.905468702316284, "rewards/margins": 8.300000190734863, "rewards/rejected": -12.19921875, "step": 5950 }, { "epoch": 3.1418028465998944, "grad_norm": 4.826807551021927, "learning_rate": 2.1468107538218237e-07, "logits/chosen": -2.5111327171325684, "logits/rejected": -2.6939454078674316, "logps/chosen": -362.875, "logps/rejected": -429.2749938964844, "loss": 0.0061, "rewards/accuracies": 1.0, "rewards/chosen": -4.427343845367432, "rewards/margins": 7.91796875, "rewards/rejected": -12.342187881469727, "step": 5960 }, { "epoch": 3.147074327886136, "grad_norm": 2.3844004798412652, "learning_rate": 2.1336320506062202e-07, "logits/chosen": -2.720507860183716, "logits/rejected": -2.778125047683716, "logps/chosen": -356.5874938964844, "logps/rejected": -411.54998779296875, "loss": 0.0044, "rewards/accuracies": 1.0, "rewards/chosen": -4.277636528015137, "rewards/margins": 7.791406154632568, "rewards/rejected": -12.067187309265137, "step": 5970 }, { "epoch": 3.1523458091723775, "grad_norm": 2.023560709019241, "learning_rate": 2.1204533473906167e-07, "logits/chosen": -2.582226514816284, "logits/rejected": -2.6578125953674316, "logps/chosen": -367.8999938964844, "logps/rejected": -422.20001220703125, "loss": 0.0049, "rewards/accuracies": 1.0, "rewards/chosen": -3.2999267578125, "rewards/margins": 7.760156154632568, "rewards/rejected": -11.0625, "step": 5980 }, { "epoch": 3.157617290458619, "grad_norm": 6.2535576061519, "learning_rate": 2.1072746441750132e-07, "logits/chosen": -2.651171922683716, "logits/rejected": -2.657031297683716, "logps/chosen": -363.875, "logps/rejected": -428.5249938964844, "loss": 0.0078, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.367968797683716, "rewards/margins": 8.314062118530273, "rewards/rejected": -11.680468559265137, "step": 5990 }, { "epoch": 3.16288877174486, "grad_norm": 2.687367983425943, "learning_rate": 2.0940959409594094e-07, "logits/chosen": -2.693554639816284, "logits/rejected": -2.6083984375, "logps/chosen": -357.2250061035156, "logps/rejected": -407.67498779296875, "loss": 0.0114, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.678515672683716, "rewards/margins": 7.616406440734863, "rewards/rejected": -11.293749809265137, "step": 6000 }, { "epoch": 3.1681602530311017, "grad_norm": 1.131288778514829, "learning_rate": 2.0809172377438058e-07, "logits/chosen": -2.405468702316284, "logits/rejected": -2.5947265625, "logps/chosen": -410.92498779296875, "logps/rejected": -402.42498779296875, "loss": 0.004, "rewards/accuracies": 1.0, "rewards/chosen": -3.542773485183716, "rewards/margins": 7.640625, "rewards/rejected": -11.17578125, "step": 6010 }, { "epoch": 3.1734317343173433, "grad_norm": 3.0518117724551272, "learning_rate": 2.0677385345282023e-07, "logits/chosen": -2.7603516578674316, "logits/rejected": -2.6117186546325684, "logps/chosen": -324.26251220703125, "logps/rejected": -411.20001220703125, "loss": 0.006, "rewards/accuracies": 1.0, "rewards/chosen": -3.77392578125, "rewards/margins": 8.037500381469727, "rewards/rejected": -11.80859375, "step": 6020 }, { "epoch": 3.1787032156035844, "grad_norm": 3.8597694347154, "learning_rate": 2.0545598313125985e-07, "logits/chosen": -2.76953125, "logits/rejected": -2.700000047683716, "logps/chosen": -348.04998779296875, "logps/rejected": -424.375, "loss": 0.0056, "rewards/accuracies": 1.0, "rewards/chosen": -3.8271484375, "rewards/margins": 7.66796875, "rewards/rejected": -11.4921875, "step": 6030 }, { "epoch": 3.183974696889826, "grad_norm": 4.609764964334768, "learning_rate": 2.041381128096995e-07, "logits/chosen": -2.618945360183716, "logits/rejected": -2.635937452316284, "logps/chosen": -373.79998779296875, "logps/rejected": -413.25, "loss": 0.011, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.903515577316284, "rewards/margins": 8.31640625, "rewards/rejected": -12.217187881469727, "step": 6040 }, { "epoch": 3.1892461781760675, "grad_norm": 2.366006596363299, "learning_rate": 2.0282024248813917e-07, "logits/chosen": -2.670703172683716, "logits/rejected": -2.674999952316284, "logps/chosen": -340.54998779296875, "logps/rejected": -402.07501220703125, "loss": 0.004, "rewards/accuracies": 1.0, "rewards/chosen": -3.651562452316284, "rewards/margins": 7.837500095367432, "rewards/rejected": -11.493749618530273, "step": 6050 }, { "epoch": 3.194517659462309, "grad_norm": 14.464668767443891, "learning_rate": 2.0150237216657882e-07, "logits/chosen": -2.6302733421325684, "logits/rejected": -2.7646484375, "logps/chosen": -369.67498779296875, "logps/rejected": -406.6499938964844, "loss": 0.0038, "rewards/accuracies": 1.0, "rewards/chosen": -3.0831298828125, "rewards/margins": 8.051562309265137, "rewards/rejected": -11.138280868530273, "step": 6060 }, { "epoch": 3.19978914074855, "grad_norm": 2.624698792642485, "learning_rate": 2.0018450184501844e-07, "logits/chosen": -2.532421827316284, "logits/rejected": -2.6634764671325684, "logps/chosen": -397.7250061035156, "logps/rejected": -429.04998779296875, "loss": 0.0055, "rewards/accuracies": 1.0, "rewards/chosen": -3.8187499046325684, "rewards/margins": 8.010156631469727, "rewards/rejected": -11.818750381469727, "step": 6070 }, { "epoch": 3.2050606220347917, "grad_norm": 3.7495019279703414, "learning_rate": 1.988666315234581e-07, "logits/chosen": -2.703320264816284, "logits/rejected": -2.798828125, "logps/chosen": -385.6000061035156, "logps/rejected": -419.8999938964844, "loss": 0.0048, "rewards/accuracies": 1.0, "rewards/chosen": -3.5443358421325684, "rewards/margins": 8.1171875, "rewards/rejected": -11.663281440734863, "step": 6080 }, { "epoch": 3.2103321033210332, "grad_norm": 1.9789022241472565, "learning_rate": 1.9754876120189774e-07, "logits/chosen": -2.622851610183716, "logits/rejected": -2.6527342796325684, "logps/chosen": -363.73748779296875, "logps/rejected": -455.7250061035156, "loss": 0.0048, "rewards/accuracies": 1.0, "rewards/chosen": -3.803417921066284, "rewards/margins": 8.252344131469727, "rewards/rejected": -12.053125381469727, "step": 6090 }, { "epoch": 3.215603584607275, "grad_norm": 2.083113491920976, "learning_rate": 1.9623089088033736e-07, "logits/chosen": -2.658203125, "logits/rejected": -2.788281202316284, "logps/chosen": -336.1499938964844, "logps/rejected": -411.6499938964844, "loss": 0.0034, "rewards/accuracies": 1.0, "rewards/chosen": -3.725390672683716, "rewards/margins": 7.918749809265137, "rewards/rejected": -11.654687881469727, "step": 6100 }, { "epoch": 3.220875065893516, "grad_norm": 0.82944000925276, "learning_rate": 1.94913020558777e-07, "logits/chosen": -2.628124952316284, "logits/rejected": -2.8539061546325684, "logps/chosen": -393.54998779296875, "logps/rejected": -423.29998779296875, "loss": 0.0028, "rewards/accuracies": 1.0, "rewards/chosen": -4.218945503234863, "rewards/margins": 8.485937118530273, "rewards/rejected": -12.706250190734863, "step": 6110 }, { "epoch": 3.2261465471797575, "grad_norm": 5.219741419513698, "learning_rate": 1.9359515023721665e-07, "logits/chosen": -2.6175780296325684, "logits/rejected": -2.765429735183716, "logps/chosen": -377.45001220703125, "logps/rejected": -439.57501220703125, "loss": 0.0053, "rewards/accuracies": 1.0, "rewards/chosen": -3.680859327316284, "rewards/margins": 8.357812881469727, "rewards/rejected": -12.032812118530273, "step": 6120 }, { "epoch": 3.231418028465999, "grad_norm": 1.610342572034083, "learning_rate": 1.9227727991565628e-07, "logits/chosen": -2.4361329078674316, "logits/rejected": -2.5580077171325684, "logps/chosen": -426.54998779296875, "logps/rejected": -474.25, "loss": 0.0029, "rewards/accuracies": 1.0, "rewards/chosen": -3.7984375953674316, "rewards/margins": 9.408594131469727, "rewards/rejected": -13.207812309265137, "step": 6130 }, { "epoch": 3.2366895097522406, "grad_norm": 2.1033880562168137, "learning_rate": 1.9095940959409592e-07, "logits/chosen": -2.874218702316284, "logits/rejected": -2.947460889816284, "logps/chosen": -366.32501220703125, "logps/rejected": -419.92498779296875, "loss": 0.0076, "rewards/accuracies": 1.0, "rewards/chosen": -4.232421875, "rewards/margins": 8.834375381469727, "rewards/rejected": -13.067187309265137, "step": 6140 }, { "epoch": 3.2419609910384817, "grad_norm": 4.094493698260916, "learning_rate": 1.8964153927253557e-07, "logits/chosen": -2.6244139671325684, "logits/rejected": -2.739062547683716, "logps/chosen": -383.17498779296875, "logps/rejected": -419.75, "loss": 0.0052, "rewards/accuracies": 1.0, "rewards/chosen": -4.129492282867432, "rewards/margins": 8.154687881469727, "rewards/rejected": -12.275781631469727, "step": 6150 }, { "epoch": 3.2472324723247232, "grad_norm": 0.3607872263399442, "learning_rate": 1.8832366895097522e-07, "logits/chosen": -2.7525391578674316, "logits/rejected": -2.746875047683716, "logps/chosen": -387.25, "logps/rejected": -445.2749938964844, "loss": 0.0046, "rewards/accuracies": 1.0, "rewards/chosen": -4.239160060882568, "rewards/margins": 8.561718940734863, "rewards/rejected": -12.794530868530273, "step": 6160 }, { "epoch": 3.252503953610965, "grad_norm": 0.9709139741473578, "learning_rate": 1.8700579862941484e-07, "logits/chosen": -2.718945264816284, "logits/rejected": -2.742968797683716, "logps/chosen": -362.79998779296875, "logps/rejected": -414.7250061035156, "loss": 0.0102, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -4.012499809265137, "rewards/margins": 8.624218940734863, "rewards/rejected": -12.638280868530273, "step": 6170 }, { "epoch": 3.257775434897206, "grad_norm": 3.2521463353698143, "learning_rate": 1.856879283078545e-07, "logits/chosen": -2.7943358421325684, "logits/rejected": -2.9419922828674316, "logps/chosen": -378.8999938964844, "logps/rejected": -436.45001220703125, "loss": 0.0042, "rewards/accuracies": 1.0, "rewards/chosen": -3.998046875, "rewards/margins": 8.256250381469727, "rewards/rejected": -12.251562118530273, "step": 6180 }, { "epoch": 3.2630469161834474, "grad_norm": 13.726462974996032, "learning_rate": 1.8437005798629416e-07, "logits/chosen": -2.7607421875, "logits/rejected": -2.8443360328674316, "logps/chosen": -339.04998779296875, "logps/rejected": -397.07501220703125, "loss": 0.0053, "rewards/accuracies": 1.0, "rewards/chosen": -3.8798828125, "rewards/margins": 8.478906631469727, "rewards/rejected": -12.365625381469727, "step": 6190 }, { "epoch": 3.268318397469689, "grad_norm": 4.630217016884267, "learning_rate": 1.8305218766473378e-07, "logits/chosen": -2.681445360183716, "logits/rejected": -2.849609375, "logps/chosen": -365.4750061035156, "logps/rejected": -426.95001220703125, "loss": 0.0057, "rewards/accuracies": 1.0, "rewards/chosen": -4.3828125, "rewards/margins": 8.318750381469727, "rewards/rejected": -12.699999809265137, "step": 6200 }, { "epoch": 3.2735898787559305, "grad_norm": 1.0510172800734296, "learning_rate": 1.8173431734317343e-07, "logits/chosen": -2.6460938453674316, "logits/rejected": -2.683398485183716, "logps/chosen": -372.875, "logps/rejected": -414.8999938964844, "loss": 0.0061, "rewards/accuracies": 1.0, "rewards/chosen": -3.8804688453674316, "rewards/margins": 8.280468940734863, "rewards/rejected": -12.151562690734863, "step": 6210 }, { "epoch": 3.2788613600421717, "grad_norm": 2.4165807892369577, "learning_rate": 1.8041644702161308e-07, "logits/chosen": -2.6640625, "logits/rejected": -2.820117235183716, "logps/chosen": -379.2250061035156, "logps/rejected": -435.75, "loss": 0.0256, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.9081053733825684, "rewards/margins": 8.082812309265137, "rewards/rejected": -11.991406440734863, "step": 6220 }, { "epoch": 3.284132841328413, "grad_norm": 1.153531093338482, "learning_rate": 1.790985767000527e-07, "logits/chosen": -2.6058592796325684, "logits/rejected": -2.659960985183716, "logps/chosen": -368.17498779296875, "logps/rejected": -420.875, "loss": 0.0032, "rewards/accuracies": 1.0, "rewards/chosen": -3.553906202316284, "rewards/margins": 8.518750190734863, "rewards/rejected": -12.071874618530273, "step": 6230 }, { "epoch": 3.2894043226146548, "grad_norm": 0.8753601971488846, "learning_rate": 1.7778070637849235e-07, "logits/chosen": -2.603710889816284, "logits/rejected": -2.652148485183716, "logps/chosen": -402.2749938964844, "logps/rejected": -435.1000061035156, "loss": 0.0079, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.341992139816284, "rewards/margins": 7.974999904632568, "rewards/rejected": -11.317968368530273, "step": 6240 }, { "epoch": 3.2946758039008963, "grad_norm": 2.5262048961356607, "learning_rate": 1.76462836056932e-07, "logits/chosen": -2.726757764816284, "logits/rejected": -2.733593702316284, "logps/chosen": -372.6000061035156, "logps/rejected": -431.0, "loss": 0.0077, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.89892578125, "rewards/margins": 8.567968368530273, "rewards/rejected": -12.470312118530273, "step": 6250 }, { "epoch": 3.2999472851871374, "grad_norm": 1.343392913646804, "learning_rate": 1.7514496573537164e-07, "logits/chosen": -2.7142577171325684, "logits/rejected": -2.812304735183716, "logps/chosen": -401.8500061035156, "logps/rejected": -450.57501220703125, "loss": 0.0029, "rewards/accuracies": 1.0, "rewards/chosen": -4.272851467132568, "rewards/margins": 8.407812118530273, "rewards/rejected": -12.682031631469727, "step": 6260 }, { "epoch": 3.305218766473379, "grad_norm": 6.5612275946618235, "learning_rate": 1.7382709541381126e-07, "logits/chosen": -2.6353516578674316, "logits/rejected": -2.7376952171325684, "logps/chosen": -407.7250061035156, "logps/rejected": -418.6000061035156, "loss": 0.0091, "rewards/accuracies": 1.0, "rewards/chosen": -4.140234470367432, "rewards/margins": 8.194531440734863, "rewards/rejected": -12.342187881469727, "step": 6270 }, { "epoch": 3.3104902477596205, "grad_norm": 2.733705801782817, "learning_rate": 1.725092250922509e-07, "logits/chosen": -2.6787109375, "logits/rejected": -2.638867139816284, "logps/chosen": -365.375, "logps/rejected": -428.125, "loss": 0.0047, "rewards/accuracies": 1.0, "rewards/chosen": -4.183398246765137, "rewards/margins": 8.522656440734863, "rewards/rejected": -12.706250190734863, "step": 6280 }, { "epoch": 3.315761729045862, "grad_norm": 1.0291647512780853, "learning_rate": 1.7119135477069056e-07, "logits/chosen": -2.6435546875, "logits/rejected": -2.7642579078674316, "logps/chosen": -356.20001220703125, "logps/rejected": -432.32501220703125, "loss": 0.0095, "rewards/accuracies": 1.0, "rewards/chosen": -4.166601657867432, "rewards/margins": 8.339062690734863, "rewards/rejected": -12.5, "step": 6290 }, { "epoch": 3.321033210332103, "grad_norm": 0.9508659371010266, "learning_rate": 1.6987348444913018e-07, "logits/chosen": -2.6904296875, "logits/rejected": -2.699023485183716, "logps/chosen": -365.32501220703125, "logps/rejected": -434.70001220703125, "loss": 0.0036, "rewards/accuracies": 1.0, "rewards/chosen": -4.1689453125, "rewards/margins": 8.33984375, "rewards/rejected": -12.512499809265137, "step": 6300 }, { "epoch": 3.3263046916183447, "grad_norm": 5.1202933354797455, "learning_rate": 1.6855561412756983e-07, "logits/chosen": -2.695507764816284, "logits/rejected": -2.790234327316284, "logps/chosen": -354.29998779296875, "logps/rejected": -467.32501220703125, "loss": 0.0032, "rewards/accuracies": 1.0, "rewards/chosen": -3.798046827316284, "rewards/margins": 8.434374809265137, "rewards/rejected": -12.227343559265137, "step": 6310 }, { "epoch": 3.3315761729045863, "grad_norm": 1.120423327501567, "learning_rate": 1.6723774380600947e-07, "logits/chosen": -2.69921875, "logits/rejected": -2.7666015625, "logps/chosen": -321.13751220703125, "logps/rejected": -402.04998779296875, "loss": 0.004, "rewards/accuracies": 1.0, "rewards/chosen": -3.2696776390075684, "rewards/margins": 8.293749809265137, "rewards/rejected": -11.559374809265137, "step": 6320 }, { "epoch": 3.3368476541908274, "grad_norm": 6.177783464688428, "learning_rate": 1.6591987348444915e-07, "logits/chosen": -2.553515672683716, "logits/rejected": -2.6136717796325684, "logps/chosen": -402.7749938964844, "logps/rejected": -444.7749938964844, "loss": 0.007, "rewards/accuracies": 1.0, "rewards/chosen": -3.513671875, "rewards/margins": 8.520312309265137, "rewards/rejected": -12.033594131469727, "step": 6330 }, { "epoch": 3.342119135477069, "grad_norm": 1.0208397381301066, "learning_rate": 1.6460200316288877e-07, "logits/chosen": -2.642578125, "logits/rejected": -2.7357420921325684, "logps/chosen": -360.6000061035156, "logps/rejected": -394.625, "loss": 0.0073, "rewards/accuracies": 1.0, "rewards/chosen": -3.498278856277466, "rewards/margins": 7.723437309265137, "rewards/rejected": -11.228906631469727, "step": 6340 }, { "epoch": 3.3473906167633105, "grad_norm": 5.071182106241019, "learning_rate": 1.6328413284132842e-07, "logits/chosen": -2.704882860183716, "logits/rejected": -2.73046875, "logps/chosen": -357.625, "logps/rejected": -418.3999938964844, "loss": 0.0051, "rewards/accuracies": 1.0, "rewards/chosen": -3.584179639816284, "rewards/margins": 8.010937690734863, "rewards/rejected": -11.604687690734863, "step": 6350 }, { "epoch": 3.352662098049552, "grad_norm": 0.6878176584956237, "learning_rate": 1.6196626251976806e-07, "logits/chosen": -2.7445311546325684, "logits/rejected": -2.7060546875, "logps/chosen": -348.9125061035156, "logps/rejected": -424.32501220703125, "loss": 0.0034, "rewards/accuracies": 1.0, "rewards/chosen": -3.935351610183716, "rewards/margins": 8.064844131469727, "rewards/rejected": -12.004687309265137, "step": 6360 }, { "epoch": 3.357933579335793, "grad_norm": 1.346014848002635, "learning_rate": 1.6064839219820768e-07, "logits/chosen": -2.6195311546325684, "logits/rejected": -2.711132764816284, "logps/chosen": -408.07501220703125, "logps/rejected": -422.875, "loss": 0.0113, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.2845215797424316, "rewards/margins": 8.365625381469727, "rewards/rejected": -11.653124809265137, "step": 6370 }, { "epoch": 3.3632050606220347, "grad_norm": 0.17703496021735068, "learning_rate": 1.5933052187664733e-07, "logits/chosen": -2.587109327316284, "logits/rejected": -2.6865234375, "logps/chosen": -416.54998779296875, "logps/rejected": -446.4750061035156, "loss": 0.0033, "rewards/accuracies": 1.0, "rewards/chosen": -3.713671922683716, "rewards/margins": 8.407812118530273, "rewards/rejected": -12.112500190734863, "step": 6380 }, { "epoch": 3.3684765419082763, "grad_norm": 1.7807440875920384, "learning_rate": 1.5801265155508698e-07, "logits/chosen": -2.7652344703674316, "logits/rejected": -2.9189453125, "logps/chosen": -355.70001220703125, "logps/rejected": -412.79998779296875, "loss": 0.0028, "rewards/accuracies": 1.0, "rewards/chosen": -4.0546875, "rewards/margins": 8.274999618530273, "rewards/rejected": -12.331250190734863, "step": 6390 }, { "epoch": 3.373748023194518, "grad_norm": 1.2153574095683939, "learning_rate": 1.566947812335266e-07, "logits/chosen": -2.7867188453674316, "logits/rejected": -2.73828125, "logps/chosen": -390.625, "logps/rejected": -453.29998779296875, "loss": 0.0052, "rewards/accuracies": 1.0, "rewards/chosen": -4.229882717132568, "rewards/margins": 8.140625, "rewards/rejected": -12.368749618530273, "step": 6400 }, { "epoch": 3.379019504480759, "grad_norm": 3.7838027256650233, "learning_rate": 1.5537691091196625e-07, "logits/chosen": -2.793164014816284, "logits/rejected": -2.877148389816284, "logps/chosen": -367.8999938964844, "logps/rejected": -416.3500061035156, "loss": 0.0041, "rewards/accuracies": 1.0, "rewards/chosen": -4.891797065734863, "rewards/margins": 9.214062690734863, "rewards/rejected": -14.104687690734863, "step": 6410 }, { "epoch": 3.3842909857670005, "grad_norm": 0.9787287975892819, "learning_rate": 1.540590405904059e-07, "logits/chosen": -2.6751952171325684, "logits/rejected": -2.712890625, "logps/chosen": -386.125, "logps/rejected": -455.04998779296875, "loss": 0.003, "rewards/accuracies": 1.0, "rewards/chosen": -4.550585746765137, "rewards/margins": 8.530468940734863, "rewards/rejected": -13.076562881469727, "step": 6420 }, { "epoch": 3.389562467053242, "grad_norm": 0.9862232472222828, "learning_rate": 1.5274117026884554e-07, "logits/chosen": -2.696093797683716, "logits/rejected": -2.724414110183716, "logps/chosen": -408.36248779296875, "logps/rejected": -466.6499938964844, "loss": 0.0048, "rewards/accuracies": 1.0, "rewards/chosen": -3.854687452316284, "rewards/margins": 8.11328125, "rewards/rejected": -11.961718559265137, "step": 6430 }, { "epoch": 3.3948339483394836, "grad_norm": 1.1185582871111701, "learning_rate": 1.5142329994728516e-07, "logits/chosen": -2.6400389671325684, "logits/rejected": -2.7650389671325684, "logps/chosen": -380.8999938964844, "logps/rejected": -431.54998779296875, "loss": 0.0036, "rewards/accuracies": 1.0, "rewards/chosen": -3.802929639816284, "rewards/margins": 8.334375381469727, "rewards/rejected": -12.145312309265137, "step": 6440 }, { "epoch": 3.4001054296257247, "grad_norm": 1.8791059869467328, "learning_rate": 1.501054296257248e-07, "logits/chosen": -2.6490235328674316, "logits/rejected": -2.770703077316284, "logps/chosen": -386.20001220703125, "logps/rejected": -430.125, "loss": 0.003, "rewards/accuracies": 1.0, "rewards/chosen": -4.567773342132568, "rewards/margins": 8.037500381469727, "rewards/rejected": -12.610156059265137, "step": 6450 }, { "epoch": 3.4053769109119663, "grad_norm": 3.1841420880114373, "learning_rate": 1.4878755930416446e-07, "logits/chosen": -2.6548829078674316, "logits/rejected": -2.781445264816284, "logps/chosen": -402.82501220703125, "logps/rejected": -433.7749938964844, "loss": 0.0031, "rewards/accuracies": 1.0, "rewards/chosen": -4.149804592132568, "rewards/margins": 8.456250190734863, "rewards/rejected": -12.604687690734863, "step": 6460 }, { "epoch": 3.410648392198208, "grad_norm": 2.0873521663519883, "learning_rate": 1.474696889826041e-07, "logits/chosen": -2.8814454078674316, "logits/rejected": -2.916796922683716, "logps/chosen": -358.82501220703125, "logps/rejected": -406.8999938964844, "loss": 0.0042, "rewards/accuracies": 1.0, "rewards/chosen": -3.551953077316284, "rewards/margins": 8.12109375, "rewards/rejected": -11.678125381469727, "step": 6470 }, { "epoch": 3.415919873484449, "grad_norm": 1.8337695693186304, "learning_rate": 1.4615181866104375e-07, "logits/chosen": -2.6181640625, "logits/rejected": -2.721874952316284, "logps/chosen": -357.92498779296875, "logps/rejected": -436.2250061035156, "loss": 0.0072, "rewards/accuracies": 1.0, "rewards/chosen": -3.5929198265075684, "rewards/margins": 8.052343368530273, "rewards/rejected": -11.6484375, "step": 6480 }, { "epoch": 3.4211913547706905, "grad_norm": 0.6014454178276425, "learning_rate": 1.448339483394834e-07, "logits/chosen": -2.741015672683716, "logits/rejected": -2.8441405296325684, "logps/chosen": -375.375, "logps/rejected": -432.70001220703125, "loss": 0.0051, "rewards/accuracies": 1.0, "rewards/chosen": -3.591503858566284, "rewards/margins": 8.728906631469727, "rewards/rejected": -12.322656631469727, "step": 6490 }, { "epoch": 3.426462836056932, "grad_norm": 4.497938457046712, "learning_rate": 1.4351607801792305e-07, "logits/chosen": -2.7035155296325684, "logits/rejected": -2.8091797828674316, "logps/chosen": -365.75, "logps/rejected": -423.95001220703125, "loss": 0.0039, "rewards/accuracies": 1.0, "rewards/chosen": -3.880810499191284, "rewards/margins": 8.223437309265137, "rewards/rejected": -12.110156059265137, "step": 6500 }, { "epoch": 3.4317343173431736, "grad_norm": 0.8605470750151194, "learning_rate": 1.4219820769636267e-07, "logits/chosen": -2.550585985183716, "logits/rejected": -2.6650390625, "logps/chosen": -377.25, "logps/rejected": -432.75, "loss": 0.0058, "rewards/accuracies": 1.0, "rewards/chosen": -4.0516357421875, "rewards/margins": 8.029687881469727, "rewards/rejected": -12.078125, "step": 6510 }, { "epoch": 3.4370057986294147, "grad_norm": 1.524997781336835, "learning_rate": 1.4088033737480232e-07, "logits/chosen": -2.662890672683716, "logits/rejected": -2.8335938453674316, "logps/chosen": -364.6625061035156, "logps/rejected": -386.32501220703125, "loss": 0.0026, "rewards/accuracies": 1.0, "rewards/chosen": -3.9439454078674316, "rewards/margins": 8.803906440734863, "rewards/rejected": -12.742968559265137, "step": 6520 }, { "epoch": 3.4422772799156562, "grad_norm": 0.716964285554212, "learning_rate": 1.3956246705324197e-07, "logits/chosen": -2.822460889816284, "logits/rejected": -2.702929735183716, "logps/chosen": -358.0874938964844, "logps/rejected": -421.79998779296875, "loss": 0.003, "rewards/accuracies": 1.0, "rewards/chosen": -4.107324123382568, "rewards/margins": 8.235156059265137, "rewards/rejected": -12.337499618530273, "step": 6530 }, { "epoch": 3.447548761201898, "grad_norm": 3.3004285580059105, "learning_rate": 1.382445967316816e-07, "logits/chosen": -2.572460889816284, "logits/rejected": -2.661914110183716, "logps/chosen": -404.45001220703125, "logps/rejected": -426.42498779296875, "loss": 0.004, "rewards/accuracies": 1.0, "rewards/chosen": -3.956249952316284, "rewards/margins": 8.48046875, "rewards/rejected": -12.431249618530273, "step": 6540 }, { "epoch": 3.4528202424881393, "grad_norm": 0.6604711496177014, "learning_rate": 1.3692672641012123e-07, "logits/chosen": -2.779296875, "logits/rejected": -2.8375000953674316, "logps/chosen": -384.7250061035156, "logps/rejected": -436.0249938964844, "loss": 0.0025, "rewards/accuracies": 1.0, "rewards/chosen": -4.330859184265137, "rewards/margins": 8.746874809265137, "rewards/rejected": -13.078125, "step": 6550 }, { "epoch": 3.4580917237743805, "grad_norm": 0.7063173923648294, "learning_rate": 1.3560885608856088e-07, "logits/chosen": -2.712890625, "logits/rejected": -2.863085985183716, "logps/chosen": -366.63751220703125, "logps/rejected": -415.2749938964844, "loss": 0.0024, "rewards/accuracies": 1.0, "rewards/chosen": -4.806640625, "rewards/margins": 8.671093940734863, "rewards/rejected": -13.473437309265137, "step": 6560 }, { "epoch": 3.463363205060622, "grad_norm": 2.4281284658641615, "learning_rate": 1.342909857670005e-07, "logits/chosen": -2.562695264816284, "logits/rejected": -2.6753907203674316, "logps/chosen": -398.2875061035156, "logps/rejected": -452.42498779296875, "loss": 0.0127, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -4.554883003234863, "rewards/margins": 7.963281154632568, "rewards/rejected": -12.514062881469727, "step": 6570 }, { "epoch": 3.4686346863468636, "grad_norm": 0.9796111384128698, "learning_rate": 1.3297311544544015e-07, "logits/chosen": -2.6943359375, "logits/rejected": -2.638867139816284, "logps/chosen": -378.92498779296875, "logps/rejected": -446.7250061035156, "loss": 0.0027, "rewards/accuracies": 1.0, "rewards/chosen": -4.2548828125, "rewards/margins": 8.921875, "rewards/rejected": -13.182812690734863, "step": 6580 }, { "epoch": 3.473906167633105, "grad_norm": 4.610040390140728, "learning_rate": 1.316552451238798e-07, "logits/chosen": -2.8167967796325684, "logits/rejected": -2.975781202316284, "logps/chosen": -348.01251220703125, "logps/rejected": -408.0249938964844, "loss": 0.0046, "rewards/accuracies": 1.0, "rewards/chosen": -4.051953315734863, "rewards/margins": 8.127344131469727, "rewards/rejected": -12.177343368530273, "step": 6590 }, { "epoch": 3.479177648919346, "grad_norm": 1.26542673729054, "learning_rate": 1.3033737480231945e-07, "logits/chosen": -2.732421875, "logits/rejected": -2.78515625, "logps/chosen": -349.4750061035156, "logps/rejected": -430.92498779296875, "loss": 0.0061, "rewards/accuracies": 1.0, "rewards/chosen": -3.529589891433716, "rewards/margins": 8.2265625, "rewards/rejected": -11.752344131469727, "step": 6600 }, { "epoch": 3.4844491302055878, "grad_norm": 1.9452581935489994, "learning_rate": 1.290195044807591e-07, "logits/chosen": -2.676953077316284, "logits/rejected": -2.6371092796325684, "logps/chosen": -375.5, "logps/rejected": -415.25, "loss": 0.0034, "rewards/accuracies": 1.0, "rewards/chosen": -4.123339653015137, "rewards/margins": 8.028124809265137, "rewards/rejected": -12.150781631469727, "step": 6610 }, { "epoch": 3.4897206114918293, "grad_norm": 4.911173788930335, "learning_rate": 1.2770163415919874e-07, "logits/chosen": -2.6021485328674316, "logits/rejected": -2.607617139816284, "logps/chosen": -375.3125, "logps/rejected": -458.875, "loss": 0.0028, "rewards/accuracies": 1.0, "rewards/chosen": -3.9600586891174316, "rewards/margins": 8.686718940734863, "rewards/rejected": -12.643750190734863, "step": 6620 }, { "epoch": 3.4949920927780704, "grad_norm": 3.537149503920943, "learning_rate": 1.263837638376384e-07, "logits/chosen": -2.6792969703674316, "logits/rejected": -2.841015577316284, "logps/chosen": -390.92498779296875, "logps/rejected": -420.70001220703125, "loss": 0.0031, "rewards/accuracies": 1.0, "rewards/chosen": -4.029492378234863, "rewards/margins": 8.203906059265137, "rewards/rejected": -12.241406440734863, "step": 6630 }, { "epoch": 3.500263574064312, "grad_norm": 1.7142005634576718, "learning_rate": 1.25065893516078e-07, "logits/chosen": -2.6373047828674316, "logits/rejected": -2.766406297683716, "logps/chosen": -393.70001220703125, "logps/rejected": -446.2749938964844, "loss": 0.0049, "rewards/accuracies": 1.0, "rewards/chosen": -4.157031059265137, "rewards/margins": 8.286718368530273, "rewards/rejected": -12.440625190734863, "step": 6640 }, { "epoch": 3.5055350553505535, "grad_norm": 2.687042255532955, "learning_rate": 1.2374802319451766e-07, "logits/chosen": -2.5775389671325684, "logits/rejected": -2.6039061546325684, "logps/chosen": -402.5249938964844, "logps/rejected": -452.45001220703125, "loss": 0.0032, "rewards/accuracies": 1.0, "rewards/chosen": -3.8624510765075684, "rewards/margins": 8.2421875, "rewards/rejected": -12.092187881469727, "step": 6650 }, { "epoch": 3.510806536636795, "grad_norm": 14.818680207517357, "learning_rate": 1.224301528729573e-07, "logits/chosen": -2.7582030296325684, "logits/rejected": -2.8628907203674316, "logps/chosen": -363.73748779296875, "logps/rejected": -400.3999938964844, "loss": 0.0067, "rewards/accuracies": 1.0, "rewards/chosen": -3.8589844703674316, "rewards/margins": 8.298437118530273, "rewards/rejected": -12.162500381469727, "step": 6660 }, { "epoch": 3.5160780179230366, "grad_norm": 4.313777006697166, "learning_rate": 1.2111228255139693e-07, "logits/chosen": -2.892382860183716, "logits/rejected": -2.889843702316284, "logps/chosen": -338.88751220703125, "logps/rejected": -441.29998779296875, "loss": 0.0052, "rewards/accuracies": 1.0, "rewards/chosen": -3.7621092796325684, "rewards/margins": 8.354687690734863, "rewards/rejected": -12.122655868530273, "step": 6670 }, { "epoch": 3.5213494992092778, "grad_norm": 1.5360768338473592, "learning_rate": 1.1979441222983657e-07, "logits/chosen": -2.656054735183716, "logits/rejected": -2.757617235183716, "logps/chosen": -373.375, "logps/rejected": -430.5, "loss": 0.0134, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -4.239062309265137, "rewards/margins": 8.346875190734863, "rewards/rejected": -12.5859375, "step": 6680 }, { "epoch": 3.5266209804955193, "grad_norm": 1.4589672371123, "learning_rate": 1.1847654190827622e-07, "logits/chosen": -2.714062452316284, "logits/rejected": -2.6533203125, "logps/chosen": -355.32501220703125, "logps/rejected": -456.1499938964844, "loss": 0.0042, "rewards/accuracies": 1.0, "rewards/chosen": -4.189843654632568, "rewards/margins": 8.7421875, "rewards/rejected": -12.9296875, "step": 6690 }, { "epoch": 3.5318924617817604, "grad_norm": 1.21888210650349, "learning_rate": 1.1715867158671585e-07, "logits/chosen": -2.7376952171325684, "logits/rejected": -2.7757811546325684, "logps/chosen": -382.45001220703125, "logps/rejected": -438.0249938964844, "loss": 0.003, "rewards/accuracies": 1.0, "rewards/chosen": -4.418554782867432, "rewards/margins": 8.145312309265137, "rewards/rejected": -12.565625190734863, "step": 6700 }, { "epoch": 3.537163943068002, "grad_norm": 0.5019260143747609, "learning_rate": 1.158408012651555e-07, "logits/chosen": -2.727343797683716, "logits/rejected": -2.8265624046325684, "logps/chosen": -380.3500061035156, "logps/rejected": -430.54998779296875, "loss": 0.0033, "rewards/accuracies": 1.0, "rewards/chosen": -4.373046875, "rewards/margins": 8.336718559265137, "rewards/rejected": -12.7109375, "step": 6710 }, { "epoch": 3.5424354243542435, "grad_norm": 1.0661669151112458, "learning_rate": 1.1452293094359515e-07, "logits/chosen": -2.791210889816284, "logits/rejected": -2.8277344703674316, "logps/chosen": -351.2749938964844, "logps/rejected": -410.17498779296875, "loss": 0.0065, "rewards/accuracies": 1.0, "rewards/chosen": -4.46484375, "rewards/margins": 8.2265625, "rewards/rejected": -12.696874618530273, "step": 6720 }, { "epoch": 3.547706905640485, "grad_norm": 2.6691567846631883, "learning_rate": 1.1320506062203478e-07, "logits/chosen": -2.7201170921325684, "logits/rejected": -2.6869139671325684, "logps/chosen": -400.25, "logps/rejected": -461.5249938964844, "loss": 0.0045, "rewards/accuracies": 1.0, "rewards/chosen": -4.351171970367432, "rewards/margins": 8.196874618530273, "rewards/rejected": -12.547656059265137, "step": 6730 }, { "epoch": 3.5529783869267266, "grad_norm": 2.2599429928530617, "learning_rate": 1.1188719030047443e-07, "logits/chosen": -2.7416014671325684, "logits/rejected": -2.9078125953674316, "logps/chosen": -392.25, "logps/rejected": -430.1499938964844, "loss": 0.0031, "rewards/accuracies": 1.0, "rewards/chosen": -4.243261814117432, "rewards/margins": 8.382031440734863, "rewards/rejected": -12.618749618530273, "step": 6740 }, { "epoch": 3.5582498682129677, "grad_norm": 0.8712424618270892, "learning_rate": 1.1056931997891407e-07, "logits/chosen": -2.7640624046325684, "logits/rejected": -2.868945360183716, "logps/chosen": -370.57501220703125, "logps/rejected": -435.79998779296875, "loss": 0.0109, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -4.724902153015137, "rewards/margins": 8.903905868530273, "rewards/rejected": -13.623437881469727, "step": 6750 }, { "epoch": 3.5635213494992093, "grad_norm": 0.3223259551579918, "learning_rate": 1.0925144965735371e-07, "logits/chosen": -2.706835985183716, "logits/rejected": -2.750781297683716, "logps/chosen": -392.8125, "logps/rejected": -459.3500061035156, "loss": 0.0033, "rewards/accuracies": 1.0, "rewards/chosen": -4.3955078125, "rewards/margins": 8.786718368530273, "rewards/rejected": -13.189062118530273, "step": 6760 }, { "epoch": 3.568792830785451, "grad_norm": 0.4288627759433479, "learning_rate": 1.0793357933579335e-07, "logits/chosen": -2.5562500953674316, "logits/rejected": -2.7740235328674316, "logps/chosen": -350.2875061035156, "logps/rejected": -421.375, "loss": 0.0051, "rewards/accuracies": 1.0, "rewards/chosen": -4.106835842132568, "rewards/margins": 8.95703125, "rewards/rejected": -13.059374809265137, "step": 6770 }, { "epoch": 3.574064312071692, "grad_norm": 1.0952909926376144, "learning_rate": 1.0661570901423298e-07, "logits/chosen": -2.5814452171325684, "logits/rejected": -2.702929735183716, "logps/chosen": -441.17498779296875, "logps/rejected": -478.95001220703125, "loss": 0.0035, "rewards/accuracies": 1.0, "rewards/chosen": -4.166015625, "rewards/margins": 8.770312309265137, "rewards/rejected": -12.935937881469727, "step": 6780 }, { "epoch": 3.5793357933579335, "grad_norm": 4.189988321797498, "learning_rate": 1.0529783869267264e-07, "logits/chosen": -2.7574219703674316, "logits/rejected": -2.7925782203674316, "logps/chosen": -378.7749938964844, "logps/rejected": -420.0249938964844, "loss": 0.0039, "rewards/accuracies": 1.0, "rewards/chosen": -4.763379096984863, "rewards/margins": 8.307031631469727, "rewards/rejected": -13.071874618530273, "step": 6790 }, { "epoch": 3.584607274644175, "grad_norm": 0.46516883073967086, "learning_rate": 1.0397996837111228e-07, "logits/chosen": -2.7300782203674316, "logits/rejected": -2.7777342796325684, "logps/chosen": -369.92498779296875, "logps/rejected": -446.57501220703125, "loss": 0.0028, "rewards/accuracies": 1.0, "rewards/chosen": -4.484375, "rewards/margins": 8.57421875, "rewards/rejected": -13.060937881469727, "step": 6800 }, { "epoch": 3.5898787559304166, "grad_norm": 4.8538070152002, "learning_rate": 1.0266209804955192e-07, "logits/chosen": -2.7640624046325684, "logits/rejected": -2.923828125, "logps/chosen": -375.9750061035156, "logps/rejected": -412.5249938964844, "loss": 0.004, "rewards/accuracies": 1.0, "rewards/chosen": -4.334765434265137, "rewards/margins": 8.3046875, "rewards/rejected": -12.646875381469727, "step": 6810 }, { "epoch": 3.5951502372166577, "grad_norm": 0.7427995544033867, "learning_rate": 1.0134422772799156e-07, "logits/chosen": -2.7445311546325684, "logits/rejected": -2.76171875, "logps/chosen": -414.54998779296875, "logps/rejected": -442.1000061035156, "loss": 0.0047, "rewards/accuracies": 1.0, "rewards/chosen": -4.5537109375, "rewards/margins": 8.1875, "rewards/rejected": -12.740625381469727, "step": 6820 }, { "epoch": 3.6004217185028993, "grad_norm": 1.0839162437111198, "learning_rate": 1.000263574064312e-07, "logits/chosen": -2.7396483421325684, "logits/rejected": -2.867382764816284, "logps/chosen": -368.6499938964844, "logps/rejected": -410.875, "loss": 0.0057, "rewards/accuracies": 1.0, "rewards/chosen": -4.386914253234863, "rewards/margins": 8.546875, "rewards/rejected": -12.939062118530273, "step": 6830 }, { "epoch": 3.605693199789141, "grad_norm": 0.5562908934643365, "learning_rate": 9.870848708487084e-08, "logits/chosen": -2.7671875953674316, "logits/rejected": -2.734570264816284, "logps/chosen": -343.375, "logps/rejected": -438.75, "loss": 0.01, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -4.624609470367432, "rewards/margins": 8.870312690734863, "rewards/rejected": -13.487500190734863, "step": 6840 }, { "epoch": 3.610964681075382, "grad_norm": 2.915284483489061, "learning_rate": 9.739061676331048e-08, "logits/chosen": -2.6712889671325684, "logits/rejected": -2.817578077316284, "logps/chosen": -377.9750061035156, "logps/rejected": -430.45001220703125, "loss": 0.0035, "rewards/accuracies": 1.0, "rewards/chosen": -4.364453315734863, "rewards/margins": 9.039843559265137, "rewards/rejected": -13.3984375, "step": 6850 }, { "epoch": 3.6162361623616235, "grad_norm": 2.5482765659833175, "learning_rate": 9.607274644175014e-08, "logits/chosen": -2.681445360183716, "logits/rejected": -2.806445360183716, "logps/chosen": -387.29998779296875, "logps/rejected": -431.17498779296875, "loss": 0.0122, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -4.260937690734863, "rewards/margins": 9.007031440734863, "rewards/rejected": -13.262499809265137, "step": 6860 }, { "epoch": 3.621507643647865, "grad_norm": 6.935061006759977, "learning_rate": 9.475487612018977e-08, "logits/chosen": -2.6439452171325684, "logits/rejected": -2.684375047683716, "logps/chosen": -387.57501220703125, "logps/rejected": -434.20001220703125, "loss": 0.0056, "rewards/accuracies": 1.0, "rewards/chosen": -4.316796779632568, "rewards/margins": 8.939062118530273, "rewards/rejected": -13.25, "step": 6870 }, { "epoch": 3.6267791249341066, "grad_norm": 1.3511772221772596, "learning_rate": 9.343700579862942e-08, "logits/chosen": -2.692578077316284, "logits/rejected": -2.7076172828674316, "logps/chosen": -366.82501220703125, "logps/rejected": -444.6000061035156, "loss": 0.0031, "rewards/accuracies": 1.0, "rewards/chosen": -3.852343797683716, "rewards/margins": 8.807031631469727, "rewards/rejected": -12.665624618530273, "step": 6880 }, { "epoch": 3.632050606220348, "grad_norm": 1.597544005021637, "learning_rate": 9.211913547706905e-08, "logits/chosen": -2.663281202316284, "logits/rejected": -2.904101610183716, "logps/chosen": -377.92498779296875, "logps/rejected": -425.42498779296875, "loss": 0.0037, "rewards/accuracies": 1.0, "rewards/chosen": -3.760937452316284, "rewards/margins": 8.5546875, "rewards/rejected": -12.315625190734863, "step": 6890 }, { "epoch": 3.6373220875065893, "grad_norm": 14.575513519100822, "learning_rate": 9.080126515550869e-08, "logits/chosen": -2.674023389816284, "logits/rejected": -2.704882860183716, "logps/chosen": -365.57501220703125, "logps/rejected": -435.95001220703125, "loss": 0.0075, "rewards/accuracies": 1.0, "rewards/chosen": -4.36328125, "rewards/margins": 8.458593368530273, "rewards/rejected": -12.826562881469727, "step": 6900 }, { "epoch": 3.642593568792831, "grad_norm": 0.6670869040889003, "learning_rate": 8.948339483394833e-08, "logits/chosen": -2.6201171875, "logits/rejected": -2.853515625, "logps/chosen": -377.875, "logps/rejected": -409.875, "loss": 0.0044, "rewards/accuracies": 1.0, "rewards/chosen": -4.119433403015137, "rewards/margins": 8.358593940734863, "rewards/rejected": -12.484375, "step": 6910 }, { "epoch": 3.6478650500790724, "grad_norm": 0.10938297847825244, "learning_rate": 8.816552451238797e-08, "logits/chosen": -2.617382764816284, "logits/rejected": -2.708984375, "logps/chosen": -387.29998779296875, "logps/rejected": -432.95001220703125, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": -3.875781297683716, "rewards/margins": 8.541406631469727, "rewards/rejected": -12.413281440734863, "step": 6920 }, { "epoch": 3.6531365313653135, "grad_norm": 2.7583124983751675, "learning_rate": 8.684765419082763e-08, "logits/chosen": -2.552929639816284, "logits/rejected": -2.629101514816284, "logps/chosen": -382.3999938964844, "logps/rejected": -439.07501220703125, "loss": 0.0078, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.947070360183716, "rewards/margins": 8.092968940734863, "rewards/rejected": -12.034375190734863, "step": 6930 }, { "epoch": 3.658408012651555, "grad_norm": 2.083610933812202, "learning_rate": 8.552978386926726e-08, "logits/chosen": -2.8802733421325684, "logits/rejected": -2.7236328125, "logps/chosen": -345.4624938964844, "logps/rejected": -439.4750061035156, "loss": 0.003, "rewards/accuracies": 1.0, "rewards/chosen": -4.384667873382568, "rewards/margins": 9.520312309265137, "rewards/rejected": -13.910937309265137, "step": 6940 }, { "epoch": 3.6636794939377966, "grad_norm": 0.7392430874064359, "learning_rate": 8.42119135477069e-08, "logits/chosen": -2.573046922683716, "logits/rejected": -2.7470703125, "logps/chosen": -389.67498779296875, "logps/rejected": -453.1000061035156, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": -4.196533203125, "rewards/margins": 9.123437881469727, "rewards/rejected": -13.314062118530273, "step": 6950 }, { "epoch": 3.668950975224038, "grad_norm": 0.42931454744087894, "learning_rate": 8.289404322614655e-08, "logits/chosen": -2.719921827316284, "logits/rejected": -2.9156250953674316, "logps/chosen": -378.2749938964844, "logps/rejected": -443.2749938964844, "loss": 0.0077, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -4.0224609375, "rewards/margins": 8.2734375, "rewards/rejected": -12.295312881469727, "step": 6960 }, { "epoch": 3.6742224565102792, "grad_norm": 4.407319367432339, "learning_rate": 8.157617290458618e-08, "logits/chosen": -2.789843797683716, "logits/rejected": -2.9449219703674316, "logps/chosen": -353.6499938964844, "logps/rejected": -410.75, "loss": 0.0103, "rewards/accuracies": 1.0, "rewards/chosen": -4.364648342132568, "rewards/margins": 7.590624809265137, "rewards/rejected": -11.960156440734863, "step": 6970 }, { "epoch": 3.679493937796521, "grad_norm": 1.140202492881011, "learning_rate": 8.025830258302583e-08, "logits/chosen": -2.685546875, "logits/rejected": -2.780468702316284, "logps/chosen": -389.57501220703125, "logps/rejected": -426.32501220703125, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": -4.804101467132568, "rewards/margins": 8.497655868530273, "rewards/rejected": -13.298437118530273, "step": 6980 }, { "epoch": 3.6847654190827623, "grad_norm": 7.354329597434688, "learning_rate": 7.894043226146546e-08, "logits/chosen": -2.5152344703674316, "logits/rejected": -2.6591796875, "logps/chosen": -391.6499938964844, "logps/rejected": -455.2749938964844, "loss": 0.002, "rewards/accuracies": 1.0, "rewards/chosen": -3.84765625, "rewards/margins": 9.271875381469727, "rewards/rejected": -13.118749618530273, "step": 6990 }, { "epoch": 3.6900369003690034, "grad_norm": 0.2762400549670457, "learning_rate": 7.762256193990511e-08, "logits/chosen": -2.55078125, "logits/rejected": -2.763867139816284, "logps/chosen": -406.95001220703125, "logps/rejected": -439.95001220703125, "loss": 0.0028, "rewards/accuracies": 1.0, "rewards/chosen": -4.1611328125, "rewards/margins": 9.175000190734863, "rewards/rejected": -13.3359375, "step": 7000 }, { "epoch": 3.695308381655245, "grad_norm": 7.232476323573055, "learning_rate": 7.630469161834476e-08, "logits/chosen": -2.7740235328674316, "logits/rejected": -2.7945313453674316, "logps/chosen": -359.4750061035156, "logps/rejected": -430.5249938964844, "loss": 0.0032, "rewards/accuracies": 1.0, "rewards/chosen": -3.8755860328674316, "rewards/margins": 8.568750381469727, "rewards/rejected": -12.442187309265137, "step": 7010 }, { "epoch": 3.7005798629414866, "grad_norm": 0.2937735784518899, "learning_rate": 7.498682129678439e-08, "logits/chosen": -2.6851563453674316, "logits/rejected": -2.6615233421325684, "logps/chosen": -408.6499938964844, "logps/rejected": -474.45001220703125, "loss": 0.0041, "rewards/accuracies": 1.0, "rewards/chosen": -4.242285251617432, "rewards/margins": 8.227343559265137, "rewards/rejected": -12.475000381469727, "step": 7020 }, { "epoch": 3.705851344227728, "grad_norm": 1.8585419728152612, "learning_rate": 7.366895097522404e-08, "logits/chosen": -2.798632860183716, "logits/rejected": -2.826171875, "logps/chosen": -371.0375061035156, "logps/rejected": -444.375, "loss": 0.0111, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.8792967796325684, "rewards/margins": 8.704687118530273, "rewards/rejected": -12.584375381469727, "step": 7030 }, { "epoch": 3.7111228255139697, "grad_norm": 0.7978531223335172, "learning_rate": 7.235108065366367e-08, "logits/chosen": -2.736523389816284, "logits/rejected": -2.932421922683716, "logps/chosen": -362.5874938964844, "logps/rejected": -423.3999938964844, "loss": 0.0055, "rewards/accuracies": 1.0, "rewards/chosen": -3.826855421066284, "rewards/margins": 8.953906059265137, "rewards/rejected": -12.778124809265137, "step": 7040 }, { "epoch": 3.7163943068002108, "grad_norm": 2.722264493492235, "learning_rate": 7.103321033210331e-08, "logits/chosen": -2.726367235183716, "logits/rejected": -2.865429639816284, "logps/chosen": -354.32501220703125, "logps/rejected": -402.6499938964844, "loss": 0.0076, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -4.602734565734863, "rewards/margins": 8.323437690734863, "rewards/rejected": -12.9296875, "step": 7050 }, { "epoch": 3.7216657880864523, "grad_norm": 1.4546523466882098, "learning_rate": 6.971534001054295e-08, "logits/chosen": -2.6576170921325684, "logits/rejected": -2.6927733421325684, "logps/chosen": -422.04998779296875, "logps/rejected": -464.95001220703125, "loss": 0.0034, "rewards/accuracies": 1.0, "rewards/chosen": -4.030957221984863, "rewards/margins": 8.228906631469727, "rewards/rejected": -12.253125190734863, "step": 7060 }, { "epoch": 3.726937269372694, "grad_norm": 14.731995594051247, "learning_rate": 6.83974696889826e-08, "logits/chosen": -2.7464842796325684, "logits/rejected": -2.903125047683716, "logps/chosen": -373.4125061035156, "logps/rejected": -415.875, "loss": 0.0073, "rewards/accuracies": 1.0, "rewards/chosen": -4.051953315734863, "rewards/margins": 8.430468559265137, "rewards/rejected": -12.482030868530273, "step": 7070 }, { "epoch": 3.732208750658935, "grad_norm": 2.2327440882713625, "learning_rate": 6.707959936742225e-08, "logits/chosen": -2.9214844703674316, "logits/rejected": -2.864453077316284, "logps/chosen": -341.6625061035156, "logps/rejected": -420.32501220703125, "loss": 0.0045, "rewards/accuracies": 1.0, "rewards/chosen": -4.285058498382568, "rewards/margins": 8.8125, "rewards/rejected": -13.096094131469727, "step": 7080 }, { "epoch": 3.7374802319451765, "grad_norm": 0.6418372167369418, "learning_rate": 6.576172904586188e-08, "logits/chosen": -2.7210936546325684, "logits/rejected": -2.8408203125, "logps/chosen": -343.125, "logps/rejected": -397.1000061035156, "loss": 0.0081, "rewards/accuracies": 1.0, "rewards/chosen": -3.692822217941284, "rewards/margins": 8.185155868530273, "rewards/rejected": -11.875, "step": 7090 }, { "epoch": 3.742751713231418, "grad_norm": 11.853554160525096, "learning_rate": 6.444385872430153e-08, "logits/chosen": -2.675488233566284, "logits/rejected": -2.724804639816284, "logps/chosen": -371.3500061035156, "logps/rejected": -438.8500061035156, "loss": 0.0067, "rewards/accuracies": 1.0, "rewards/chosen": -4.223340034484863, "rewards/margins": 8.465624809265137, "rewards/rejected": -12.692187309265137, "step": 7100 }, { "epoch": 3.7480231945176596, "grad_norm": 9.884425551457364, "learning_rate": 6.312598840274117e-08, "logits/chosen": -2.796679735183716, "logits/rejected": -2.896679639816284, "logps/chosen": -344.8125, "logps/rejected": -419.5, "loss": 0.0041, "rewards/accuracies": 1.0, "rewards/chosen": -4.175097465515137, "rewards/margins": 9.389062881469727, "rewards/rejected": -13.557812690734863, "step": 7110 }, { "epoch": 3.7532946758039007, "grad_norm": 12.980080479671415, "learning_rate": 6.180811808118081e-08, "logits/chosen": -2.860546827316284, "logits/rejected": -2.9439454078674316, "logps/chosen": -374.625, "logps/rejected": -455.1000061035156, "loss": 0.0041, "rewards/accuracies": 1.0, "rewards/chosen": -4.627734184265137, "rewards/margins": 8.707812309265137, "rewards/rejected": -13.340624809265137, "step": 7120 }, { "epoch": 3.7585661570901423, "grad_norm": 7.303450740513086, "learning_rate": 6.049024775962045e-08, "logits/chosen": -2.66796875, "logits/rejected": -2.759570360183716, "logps/chosen": -387.7749938964844, "logps/rejected": -424.0, "loss": 0.0039, "rewards/accuracies": 1.0, "rewards/chosen": -4.058203220367432, "rewards/margins": 8.627344131469727, "rewards/rejected": -12.682031631469727, "step": 7130 }, { "epoch": 3.763837638376384, "grad_norm": 1.6162507673051283, "learning_rate": 5.917237743806009e-08, "logits/chosen": -2.7744140625, "logits/rejected": -2.8765625953674316, "logps/chosen": -353.29998779296875, "logps/rejected": -414.70001220703125, "loss": 0.0093, "rewards/accuracies": 1.0, "rewards/chosen": -4.326855659484863, "rewards/margins": 8.485156059265137, "rewards/rejected": -12.814062118530273, "step": 7140 }, { "epoch": 3.769109119662625, "grad_norm": 1.7556760371286406, "learning_rate": 5.7854507116499736e-08, "logits/chosen": -2.5494141578674316, "logits/rejected": -2.7138671875, "logps/chosen": -404.375, "logps/rejected": -438.5, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": -4.38818359375, "rewards/margins": 8.706250190734863, "rewards/rejected": -13.09375, "step": 7150 }, { "epoch": 3.7743806009488665, "grad_norm": 6.137604750950886, "learning_rate": 5.653663679493938e-08, "logits/chosen": -2.765625, "logits/rejected": -2.8580079078674316, "logps/chosen": -346.51251220703125, "logps/rejected": -430.95001220703125, "loss": 0.0025, "rewards/accuracies": 1.0, "rewards/chosen": -4.160546779632568, "rewards/margins": 8.9609375, "rewards/rejected": -13.1171875, "step": 7160 }, { "epoch": 3.779652082235108, "grad_norm": 0.8070959008499156, "learning_rate": 5.521876647337902e-08, "logits/chosen": -2.617382764816284, "logits/rejected": -2.71484375, "logps/chosen": -393.3999938964844, "logps/rejected": -444.2250061035156, "loss": 0.0099, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.978320360183716, "rewards/margins": 8.884374618530273, "rewards/rejected": -12.859375, "step": 7170 }, { "epoch": 3.7849235635213496, "grad_norm": 2.307267285870182, "learning_rate": 5.390089615181866e-08, "logits/chosen": -2.638867139816284, "logits/rejected": -2.6412110328674316, "logps/chosen": -396.17498779296875, "logps/rejected": -451.67498779296875, "loss": 0.0057, "rewards/accuracies": 1.0, "rewards/chosen": -4.162304878234863, "rewards/margins": 8.7109375, "rewards/rejected": -12.873437881469727, "step": 7180 }, { "epoch": 3.790195044807591, "grad_norm": 2.147171562074873, "learning_rate": 5.25830258302583e-08, "logits/chosen": -2.6460938453674316, "logits/rejected": -2.671093702316284, "logps/chosen": -397.42498779296875, "logps/rejected": -439.79998779296875, "loss": 0.0021, "rewards/accuracies": 1.0, "rewards/chosen": -4.080273628234863, "rewards/margins": 8.853124618530273, "rewards/rejected": -12.932812690734863, "step": 7190 }, { "epoch": 3.7954665260938323, "grad_norm": 0.3757892425137318, "learning_rate": 5.126515550869794e-08, "logits/chosen": -2.8076171875, "logits/rejected": -2.7505860328674316, "logps/chosen": -355.7124938964844, "logps/rejected": -443.4750061035156, "loss": 0.0047, "rewards/accuracies": 1.0, "rewards/chosen": -4.611718654632568, "rewards/margins": 9.041406631469727, "rewards/rejected": -13.653124809265137, "step": 7200 }, { "epoch": 3.800738007380074, "grad_norm": 0.9979018397430885, "learning_rate": 4.994728518713758e-08, "logits/chosen": -2.732226610183716, "logits/rejected": -2.768749952316284, "logps/chosen": -386.75, "logps/rejected": -469.8500061035156, "loss": 0.0028, "rewards/accuracies": 1.0, "rewards/chosen": -4.142187595367432, "rewards/margins": 9.035937309265137, "rewards/rejected": -13.181249618530273, "step": 7210 }, { "epoch": 3.8060094886663154, "grad_norm": 3.3028787267486766, "learning_rate": 4.862941486557722e-08, "logits/chosen": -2.5277342796325684, "logits/rejected": -2.6109375953674316, "logps/chosen": -415.8500061035156, "logps/rejected": -451.3500061035156, "loss": 0.0042, "rewards/accuracies": 1.0, "rewards/chosen": -4.6162109375, "rewards/margins": 8.49609375, "rewards/rejected": -13.106249809265137, "step": 7220 }, { "epoch": 3.8112809699525565, "grad_norm": 4.77973977940579, "learning_rate": 4.731154454401687e-08, "logits/chosen": -2.625781297683716, "logits/rejected": -2.7064452171325684, "logps/chosen": -352.1000061035156, "logps/rejected": -459.54998779296875, "loss": 0.0047, "rewards/accuracies": 1.0, "rewards/chosen": -3.9330077171325684, "rewards/margins": 9.081250190734863, "rewards/rejected": -13.010937690734863, "step": 7230 }, { "epoch": 3.816552451238798, "grad_norm": 0.3909183772594211, "learning_rate": 4.5993674222456505e-08, "logits/chosen": -2.7408204078674316, "logits/rejected": -2.7113280296325684, "logps/chosen": -343.95001220703125, "logps/rejected": -436.2749938964844, "loss": 0.0064, "rewards/accuracies": 1.0, "rewards/chosen": -4.251172065734863, "rewards/margins": 8.842968940734863, "rewards/rejected": -13.087499618530273, "step": 7240 }, { "epoch": 3.8218239325250396, "grad_norm": 2.085506998702091, "learning_rate": 4.4675803900896145e-08, "logits/chosen": -2.802929639816284, "logits/rejected": -2.857226610183716, "logps/chosen": -350.45001220703125, "logps/rejected": -400.1000061035156, "loss": 0.0111, "rewards/accuracies": 1.0, "rewards/chosen": -4.199804782867432, "rewards/margins": 8.234375, "rewards/rejected": -12.428906440734863, "step": 7250 }, { "epoch": 3.827095413811281, "grad_norm": 3.0182548789785413, "learning_rate": 4.335793357933579e-08, "logits/chosen": -2.719921827316284, "logits/rejected": -2.795117139816284, "logps/chosen": -381.6000061035156, "logps/rejected": -459.3500061035156, "loss": 0.0029, "rewards/accuracies": 1.0, "rewards/chosen": -4.4638671875, "rewards/margins": 8.627344131469727, "rewards/rejected": -13.090624809265137, "step": 7260 }, { "epoch": 3.8323668950975223, "grad_norm": 0.49326266632480426, "learning_rate": 4.2040063257775434e-08, "logits/chosen": -2.7359375953674316, "logits/rejected": -2.7890625, "logps/chosen": -373.4750061035156, "logps/rejected": -455.1000061035156, "loss": 0.0028, "rewards/accuracies": 1.0, "rewards/chosen": -3.92333984375, "rewards/margins": 8.892969131469727, "rewards/rejected": -12.814062118530273, "step": 7270 }, { "epoch": 3.837638376383764, "grad_norm": 1.2956992220768337, "learning_rate": 4.0722192936215075e-08, "logits/chosen": -2.7876954078674316, "logits/rejected": -2.8570313453674316, "logps/chosen": -338.67498779296875, "logps/rejected": -413.17498779296875, "loss": 0.0037, "rewards/accuracies": 1.0, "rewards/chosen": -4.179296970367432, "rewards/margins": 8.833593368530273, "rewards/rejected": -13.015625, "step": 7280 }, { "epoch": 3.8429098576700054, "grad_norm": 2.3885852419851803, "learning_rate": 3.9404322614654716e-08, "logits/chosen": -2.7132811546325684, "logits/rejected": -2.8832030296325684, "logps/chosen": -381.13751220703125, "logps/rejected": -447.2250061035156, "loss": 0.002, "rewards/accuracies": 1.0, "rewards/chosen": -4.346484184265137, "rewards/margins": 9.088281631469727, "rewards/rejected": -13.4375, "step": 7290 }, { "epoch": 3.8481813389562465, "grad_norm": 1.1093778670810281, "learning_rate": 3.808645229309436e-08, "logits/chosen": -2.843945264816284, "logits/rejected": -2.906054735183716, "logps/chosen": -359.79998779296875, "logps/rejected": -424.70001220703125, "loss": 0.0061, "rewards/accuracies": 1.0, "rewards/chosen": -4.18701171875, "rewards/margins": 9.142969131469727, "rewards/rejected": -13.324999809265137, "step": 7300 }, { "epoch": 3.853452820242488, "grad_norm": 1.873919917569966, "learning_rate": 3.6768581971534e-08, "logits/chosen": -2.55078125, "logits/rejected": -2.740429639816284, "logps/chosen": -404.6000061035156, "logps/rejected": -428.04998779296875, "loss": 0.0076, "rewards/accuracies": 1.0, "rewards/chosen": -4.342968940734863, "rewards/margins": 8.416406631469727, "rewards/rejected": -12.7578125, "step": 7310 }, { "epoch": 3.8587243015287296, "grad_norm": 1.3913335095666044, "learning_rate": 3.545071164997364e-08, "logits/chosen": -2.6187500953674316, "logits/rejected": -2.725781202316284, "logps/chosen": -380.625, "logps/rejected": -450.5, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": -3.6910157203674316, "rewards/margins": 8.750781059265137, "rewards/rejected": -12.434374809265137, "step": 7320 }, { "epoch": 3.863995782814971, "grad_norm": 0.5986190097128637, "learning_rate": 3.4132841328413286e-08, "logits/chosen": -2.5582032203674316, "logits/rejected": -2.8921875953674316, "logps/chosen": -387.70001220703125, "logps/rejected": -404.8999938964844, "loss": 0.0063, "rewards/accuracies": 1.0, "rewards/chosen": -3.881640672683716, "rewards/margins": 8.403124809265137, "rewards/rejected": -12.2890625, "step": 7330 }, { "epoch": 3.8692672641012127, "grad_norm": 4.161850129082392, "learning_rate": 3.281497100685293e-08, "logits/chosen": -2.7132811546325684, "logits/rejected": -2.791015625, "logps/chosen": -399.2749938964844, "logps/rejected": -442.9750061035156, "loss": 0.0085, "rewards/accuracies": 1.0, "rewards/chosen": -4.108593940734863, "rewards/margins": 8.532812118530273, "rewards/rejected": -12.645312309265137, "step": 7340 }, { "epoch": 3.874538745387454, "grad_norm": 1.3684217545428459, "learning_rate": 3.149710068529256e-08, "logits/chosen": -2.7388672828674316, "logits/rejected": -2.797168016433716, "logps/chosen": -357.9312438964844, "logps/rejected": -414.6499938964844, "loss": 0.0046, "rewards/accuracies": 1.0, "rewards/chosen": -4.05322265625, "rewards/margins": 8.728906631469727, "rewards/rejected": -12.775781631469727, "step": 7350 }, { "epoch": 3.8798102266736954, "grad_norm": 0.6334615890693832, "learning_rate": 3.017923036373221e-08, "logits/chosen": -2.6283202171325684, "logits/rejected": -2.7134766578674316, "logps/chosen": -396.4750061035156, "logps/rejected": -486.17498779296875, "loss": 0.0036, "rewards/accuracies": 1.0, "rewards/chosen": -4.060742378234863, "rewards/margins": 9.015625, "rewards/rejected": -13.06640625, "step": 7360 }, { "epoch": 3.885081707959937, "grad_norm": 1.0145147840997315, "learning_rate": 2.886136004217185e-08, "logits/chosen": -2.6128907203674316, "logits/rejected": -2.797070264816284, "logps/chosen": -380.6499938964844, "logps/rejected": -421.5249938964844, "loss": 0.0045, "rewards/accuracies": 1.0, "rewards/chosen": -4.3797607421875, "rewards/margins": 8.466405868530273, "rewards/rejected": -12.844531059265137, "step": 7370 }, { "epoch": 3.890353189246178, "grad_norm": 0.5496435915929363, "learning_rate": 2.754348972061149e-08, "logits/chosen": -2.610546827316284, "logits/rejected": -2.7542967796325684, "logps/chosen": -372.9750061035156, "logps/rejected": -429.29998779296875, "loss": 0.003, "rewards/accuracies": 1.0, "rewards/chosen": -3.909960985183716, "rewards/margins": 8.758593559265137, "rewards/rejected": -12.6640625, "step": 7380 }, { "epoch": 3.8956246705324196, "grad_norm": 2.590247239564011, "learning_rate": 2.6225619399051132e-08, "logits/chosen": -2.850781202316284, "logits/rejected": -2.924999952316284, "logps/chosen": -311.3999938964844, "logps/rejected": -377.8999938964844, "loss": 0.0048, "rewards/accuracies": 1.0, "rewards/chosen": -3.758984327316284, "rewards/margins": 8.532031059265137, "rewards/rejected": -12.29296875, "step": 7390 }, { "epoch": 3.900896151818661, "grad_norm": 1.8942465395955903, "learning_rate": 2.4907749077490773e-08, "logits/chosen": -2.741406202316284, "logits/rejected": -2.828320264816284, "logps/chosen": -338.0249938964844, "logps/rejected": -379.67498779296875, "loss": 0.0035, "rewards/accuracies": 1.0, "rewards/chosen": -3.8382811546325684, "rewards/margins": 8.514843940734863, "rewards/rejected": -12.35546875, "step": 7400 }, { "epoch": 3.9061676331049027, "grad_norm": 9.718758910319432, "learning_rate": 2.3589878755930417e-08, "logits/chosen": -2.5455079078674316, "logits/rejected": -2.807812452316284, "logps/chosen": -395.25, "logps/rejected": -422.07501220703125, "loss": 0.0036, "rewards/accuracies": 1.0, "rewards/chosen": -3.772656202316284, "rewards/margins": 8.83984375, "rewards/rejected": -12.611719131469727, "step": 7410 }, { "epoch": 3.911439114391144, "grad_norm": 0.49899690415720277, "learning_rate": 2.2272008434370054e-08, "logits/chosen": -2.524609327316284, "logits/rejected": -2.6143555641174316, "logps/chosen": -420.8500061035156, "logps/rejected": -474.0, "loss": 0.0042, "rewards/accuracies": 1.0, "rewards/chosen": -3.674023389816284, "rewards/margins": 8.694531440734863, "rewards/rejected": -12.370312690734863, "step": 7420 }, { "epoch": 3.9167105956773853, "grad_norm": 2.3693955100410653, "learning_rate": 2.09541381128097e-08, "logits/chosen": -2.759960889816284, "logits/rejected": -2.7828125953674316, "logps/chosen": -352.9125061035156, "logps/rejected": -418.5249938964844, "loss": 0.0048, "rewards/accuracies": 1.0, "rewards/chosen": -4.198046684265137, "rewards/margins": 8.114843368530273, "rewards/rejected": -12.315625190734863, "step": 7430 }, { "epoch": 3.921982076963627, "grad_norm": 0.9735230279211368, "learning_rate": 1.9636267791249343e-08, "logits/chosen": -2.5580077171325684, "logits/rejected": -2.6431641578674316, "logps/chosen": -394.0249938964844, "logps/rejected": -442.75, "loss": 0.004, "rewards/accuracies": 1.0, "rewards/chosen": -4.080273628234863, "rewards/margins": 8.430468559265137, "rewards/rejected": -12.514062881469727, "step": 7440 }, { "epoch": 3.927253558249868, "grad_norm": 1.3568274842789538, "learning_rate": 1.831839746968898e-08, "logits/chosen": -2.732226610183716, "logits/rejected": -2.8060545921325684, "logps/chosen": -338.75, "logps/rejected": -405.625, "loss": 0.0027, "rewards/accuracies": 1.0, "rewards/chosen": -3.964306592941284, "rewards/margins": 9.21484375, "rewards/rejected": -13.178906440734863, "step": 7450 }, { "epoch": 3.9325250395361095, "grad_norm": 0.44166121186206236, "learning_rate": 1.7000527148128625e-08, "logits/chosen": -2.7818360328674316, "logits/rejected": -2.918750047683716, "logps/chosen": -354.5, "logps/rejected": -393.875, "loss": 0.0058, "rewards/accuracies": 1.0, "rewards/chosen": -4.197949409484863, "rewards/margins": 8.391406059265137, "rewards/rejected": -12.590624809265137, "step": 7460 }, { "epoch": 3.937796520822351, "grad_norm": 0.6496131533502929, "learning_rate": 1.5682656826568266e-08, "logits/chosen": -2.6689453125, "logits/rejected": -2.6519532203674316, "logps/chosen": -370.17498779296875, "logps/rejected": -469.9750061035156, "loss": 0.0039, "rewards/accuracies": 1.0, "rewards/chosen": -4.266992092132568, "rewards/margins": 8.83984375, "rewards/rejected": -13.110937118530273, "step": 7470 }, { "epoch": 3.9430680021085927, "grad_norm": 1.1737069721088722, "learning_rate": 1.4364786505007907e-08, "logits/chosen": -2.695019483566284, "logits/rejected": -2.737109422683716, "logps/chosen": -367.82501220703125, "logps/rejected": -450.3999938964844, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": -4.283984184265137, "rewards/margins": 9.176562309265137, "rewards/rejected": -13.462499618530273, "step": 7480 }, { "epoch": 3.948339483394834, "grad_norm": 2.29267938136487, "learning_rate": 1.304691618344755e-08, "logits/chosen": -2.7056641578674316, "logits/rejected": -2.8931641578674316, "logps/chosen": -376.04998779296875, "logps/rejected": -394.7250061035156, "loss": 0.0039, "rewards/accuracies": 1.0, "rewards/chosen": -4.016406059265137, "rewards/margins": 8.063281059265137, "rewards/rejected": -12.082812309265137, "step": 7490 }, { "epoch": 3.9536109646810753, "grad_norm": 1.322818084566656, "learning_rate": 1.172904586188719e-08, "logits/chosen": -2.561328172683716, "logits/rejected": -2.6373047828674316, "logps/chosen": -368.0249938964844, "logps/rejected": -459.8500061035156, "loss": 0.0076, "rewards/accuracies": 1.0, "rewards/chosen": -3.884228467941284, "rewards/margins": 9.139062881469727, "rewards/rejected": -13.0234375, "step": 7500 }, { "epoch": 3.958882445967317, "grad_norm": 2.347518276696728, "learning_rate": 1.0411175540326831e-08, "logits/chosen": -2.724609375, "logits/rejected": -2.7935547828674316, "logps/chosen": -353.8500061035156, "logps/rejected": -406.29998779296875, "loss": 0.003, "rewards/accuracies": 1.0, "rewards/chosen": -4.252148628234863, "rewards/margins": 8.518750190734863, "rewards/rejected": -12.774999618530273, "step": 7510 }, { "epoch": 3.964153927253558, "grad_norm": 1.649905388557332, "learning_rate": 9.093305218766472e-09, "logits/chosen": -2.741992235183716, "logits/rejected": -2.75390625, "logps/chosen": -349.45001220703125, "logps/rejected": -442.57501220703125, "loss": 0.0027, "rewards/accuracies": 1.0, "rewards/chosen": -3.6849608421325684, "rewards/margins": 9.125781059265137, "rewards/rejected": -12.806249618530273, "step": 7520 }, { "epoch": 3.9694254085397995, "grad_norm": 0.899321087065884, "learning_rate": 7.775434897206115e-09, "logits/chosen": -2.7750000953674316, "logits/rejected": -2.701171875, "logps/chosen": -346.375, "logps/rejected": -445.75, "loss": 0.0322, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -4.167187690734863, "rewards/margins": 8.963281631469727, "rewards/rejected": -13.128125190734863, "step": 7530 }, { "epoch": 3.974696889826041, "grad_norm": 1.0448348236010903, "learning_rate": 6.457564575645756e-09, "logits/chosen": -2.755859375, "logits/rejected": -2.7386717796325684, "logps/chosen": -351.61248779296875, "logps/rejected": -446.5, "loss": 0.0028, "rewards/accuracies": 1.0, "rewards/chosen": -3.905468702316284, "rewards/margins": 8.83203125, "rewards/rejected": -12.740625381469727, "step": 7540 }, { "epoch": 3.9799683711122826, "grad_norm": 2.7083189523623847, "learning_rate": 5.139694254085398e-09, "logits/chosen": -2.682421922683716, "logits/rejected": -2.760546922683716, "logps/chosen": -366.2250061035156, "logps/rejected": -419.07501220703125, "loss": 0.0056, "rewards/accuracies": 1.0, "rewards/chosen": -3.851855516433716, "rewards/margins": 8.507031440734863, "rewards/rejected": -12.359375, "step": 7550 }, { "epoch": 3.985239852398524, "grad_norm": 1.7404990782142684, "learning_rate": 3.82182393252504e-09, "logits/chosen": -2.808789014816284, "logits/rejected": -2.80078125, "logps/chosen": -340.375, "logps/rejected": -410.8999938964844, "loss": 0.0043, "rewards/accuracies": 1.0, "rewards/chosen": -4.146093845367432, "rewards/margins": 8.41015625, "rewards/rejected": -12.556249618530273, "step": 7560 }, { "epoch": 3.9905113336847653, "grad_norm": 0.2480598063192072, "learning_rate": 2.5039536109646808e-09, "logits/chosen": -2.634960889816284, "logits/rejected": -2.785351514816284, "logps/chosen": -394.8500061035156, "logps/rejected": -453.3999938964844, "loss": 0.0027, "rewards/accuracies": 1.0, "rewards/chosen": -4.262499809265137, "rewards/margins": 8.850781440734863, "rewards/rejected": -13.1171875, "step": 7570 }, { "epoch": 3.995782814971007, "grad_norm": 6.502570157880342, "learning_rate": 1.1860832894043225e-09, "logits/chosen": -2.6683592796325684, "logits/rejected": -2.822265625, "logps/chosen": -381.5, "logps/rejected": -403.625, "loss": 0.0038, "rewards/accuracies": 1.0, "rewards/chosen": -3.98046875, "rewards/margins": 8.227343559265137, "rewards/rejected": -12.206250190734863, "step": 7580 } ], "logging_steps": 10, "max_steps": 7588, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }