{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 4.0, "eval_steps": 500, "global_step": 7588, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.005271481286241434, "grad_norm": 110.28200596640947, "learning_rate": 9.988139167105956e-07, "logits/chosen": -2.000781297683716, "logits/rejected": -1.941796898841858, "logps/chosen": -362.3999938964844, "logps/rejected": -406.3500061035156, "loss": 0.6905, "rewards/accuracies": 0.375, "rewards/chosen": -0.01507568359375, "rewards/margins": 0.013448333367705345, "rewards/rejected": -0.02853088453412056, "step": 10 }, { "epoch": 0.010542962572482868, "grad_norm": 119.45402188965186, "learning_rate": 9.974960463890353e-07, "logits/chosen": -2.14453125, "logits/rejected": -2.037109375, "logps/chosen": -300.29998779296875, "logps/rejected": -363.6499938964844, "loss": 0.6931, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.06747741997241974, "rewards/margins": 0.02422485314309597, "rewards/rejected": -0.09177856147289276, "step": 20 }, { "epoch": 0.0158144438587243, "grad_norm": 179.7845722829342, "learning_rate": 9.96178176067475e-07, "logits/chosen": -1.9988281726837158, "logits/rejected": -1.948828101158142, "logps/chosen": -354.0, "logps/rejected": -412.79998779296875, "loss": 0.6522, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.12734833359718323, "rewards/margins": 0.13385620713233948, "rewards/rejected": -0.2610107362270355, "step": 30 }, { "epoch": 0.021085925144965736, "grad_norm": 129.1252162608785, "learning_rate": 9.948603057459145e-07, "logits/chosen": -1.9421875476837158, "logits/rejected": -1.874609351158142, "logps/chosen": -330.7749938964844, "logps/rejected": -402.9750061035156, "loss": 0.6803, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.143707275390625, "rewards/margins": 0.09183044731616974, "rewards/rejected": -0.23576660454273224, "step": 40 }, { "epoch": 0.02635740643120717, "grad_norm": 144.97744130172296, "learning_rate": 9.935424354243542e-07, "logits/chosen": -2.0210938453674316, "logits/rejected": -1.91015625, "logps/chosen": -316.1499938964844, "logps/rejected": -377.95001220703125, "loss": 0.634, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": 0.015411376953125, "rewards/margins": 0.20098266005516052, "rewards/rejected": -0.18582764267921448, "step": 50 }, { "epoch": 0.0316288877174486, "grad_norm": 129.77604831548703, "learning_rate": 9.922245651027939e-07, "logits/chosen": -1.990625023841858, "logits/rejected": -1.9929687976837158, "logps/chosen": -365.70001220703125, "logps/rejected": -419.3500061035156, "loss": 0.7082, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -0.16517333686351776, "rewards/margins": 0.05855713039636612, "rewards/rejected": -0.22370605170726776, "step": 60 }, { "epoch": 0.03690036900369004, "grad_norm": 98.31113344412375, "learning_rate": 9.909066947812334e-07, "logits/chosen": -1.999609351158142, "logits/rejected": -1.958593726158142, "logps/chosen": -366.54998779296875, "logps/rejected": -463.0, "loss": 0.6684, "rewards/accuracies": 0.53125, "rewards/chosen": -0.08564148098230362, "rewards/margins": 0.139739990234375, "rewards/rejected": -0.225433349609375, "step": 70 }, { "epoch": 0.04217185028993147, "grad_norm": 103.87935008292261, "learning_rate": 9.895888244596733e-07, "logits/chosen": -2.037109375, "logits/rejected": -1.9949219226837158, "logps/chosen": -347.8500061035156, "logps/rejected": -389.20001220703125, "loss": 0.6375, "rewards/accuracies": 0.53125, "rewards/chosen": 0.03866882249712944, "rewards/margins": 0.1797744780778885, "rewards/rejected": -0.14116211235523224, "step": 80 }, { "epoch": 0.047443331576172906, "grad_norm": 128.8031455727838, "learning_rate": 9.882709541381128e-07, "logits/chosen": -1.9988281726837158, "logits/rejected": -1.8984375, "logps/chosen": -364.29998779296875, "logps/rejected": -457.6000061035156, "loss": 0.6096, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.05935363844037056, "rewards/margins": 0.30646973848342896, "rewards/rejected": -0.246978759765625, "step": 90 }, { "epoch": 0.05271481286241434, "grad_norm": 104.50969717685936, "learning_rate": 9.869530838165525e-07, "logits/chosen": -2.0562500953674316, "logits/rejected": -1.9484374523162842, "logps/chosen": -329.8500061035156, "logps/rejected": -437.70001220703125, "loss": 0.6076, "rewards/accuracies": 0.606249988079071, "rewards/chosen": 0.20646819472312927, "rewards/margins": 0.363037109375, "rewards/rejected": -0.15631408989429474, "step": 100 }, { "epoch": 0.05798629414865577, "grad_norm": 148.45920963134182, "learning_rate": 9.85635213494992e-07, "logits/chosen": -1.972265601158142, "logits/rejected": -1.968359351158142, "logps/chosen": -369.8999938964844, "logps/rejected": -425.04998779296875, "loss": 0.6586, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.2965331971645355, "rewards/margins": 0.27690428495407104, "rewards/rejected": 0.01972045935690403, "step": 110 }, { "epoch": 0.0632577754348972, "grad_norm": 102.49370369295339, "learning_rate": 9.843173431734316e-07, "logits/chosen": -2.133593797683716, "logits/rejected": -2.080859422683716, "logps/chosen": -321.6000061035156, "logps/rejected": -414.79998779296875, "loss": 0.6211, "rewards/accuracies": 0.643750011920929, "rewards/chosen": 0.42908936738967896, "rewards/margins": 0.3643554747104645, "rewards/rejected": 0.06484069675207138, "step": 120 }, { "epoch": 0.06852925672113865, "grad_norm": 118.18963727378305, "learning_rate": 9.829994728518713e-07, "logits/chosen": -2.0269532203674316, "logits/rejected": -1.986718773841858, "logps/chosen": -303.79998779296875, "logps/rejected": -398.1499938964844, "loss": 0.5924, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.02681579627096653, "rewards/margins": 0.4505615234375, "rewards/rejected": -0.477294921875, "step": 130 }, { "epoch": 0.07380073800738007, "grad_norm": 108.44333619865189, "learning_rate": 9.81681602530311e-07, "logits/chosen": -2.025390625, "logits/rejected": -2.020312547683716, "logps/chosen": -312.54998779296875, "logps/rejected": -362.75, "loss": 0.624, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.06540527194738388, "rewards/margins": 0.3535308837890625, "rewards/rejected": -0.2879226803779602, "step": 140 }, { "epoch": 0.0790722192936215, "grad_norm": 104.14742125920928, "learning_rate": 9.803637322087505e-07, "logits/chosen": -2.091015577316284, "logits/rejected": -2.0140624046325684, "logps/chosen": -358.54998779296875, "logps/rejected": -444.8999938964844, "loss": 0.6758, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.14210204780101776, "rewards/margins": 0.26971435546875, "rewards/rejected": -0.12808838486671448, "step": 150 }, { "epoch": 0.08434370057986294, "grad_norm": 120.87311935801125, "learning_rate": 9.790458618871902e-07, "logits/chosen": -2.106250047683716, "logits/rejected": -1.933984398841858, "logps/chosen": -310.3500061035156, "logps/rejected": -398.45001220703125, "loss": 0.6003, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.19640807807445526, "rewards/margins": 0.448486328125, "rewards/rejected": -0.2518676817417145, "step": 160 }, { "epoch": 0.08961518186610437, "grad_norm": 131.20527647030755, "learning_rate": 9.7772799156563e-07, "logits/chosen": -2.0960936546325684, "logits/rejected": -2.0562500953674316, "logps/chosen": -319.1000061035156, "logps/rejected": -360.45001220703125, "loss": 0.6332, "rewards/accuracies": 0.59375, "rewards/chosen": 0.2948364317417145, "rewards/margins": 0.32587891817092896, "rewards/rejected": -0.03121337853372097, "step": 170 }, { "epoch": 0.09488666315234581, "grad_norm": 102.00945430593657, "learning_rate": 9.764101212440694e-07, "logits/chosen": -2.0328125953674316, "logits/rejected": -1.9191405773162842, "logps/chosen": -320.29998779296875, "logps/rejected": -446.1000061035156, "loss": 0.5367, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.2515014708042145, "rewards/margins": 0.5937744379043579, "rewards/rejected": -0.3422485291957855, "step": 180 }, { "epoch": 0.10015814443858724, "grad_norm": 132.66374394045843, "learning_rate": 9.750922509225091e-07, "logits/chosen": -2.055468797683716, "logits/rejected": -2.071093797683716, "logps/chosen": -309.25, "logps/rejected": -371.29998779296875, "loss": 0.6523, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.06910095363855362, "rewards/margins": 0.37224119901657104, "rewards/rejected": -0.30424803495407104, "step": 190 }, { "epoch": 0.10542962572482868, "grad_norm": 138.12360818166133, "learning_rate": 9.737743806009488e-07, "logits/chosen": -2.0289063453674316, "logits/rejected": -2.08203125, "logps/chosen": -320.8999938964844, "logps/rejected": -381.75, "loss": 0.609, "rewards/accuracies": 0.606249988079071, "rewards/chosen": 0.27110594511032104, "rewards/margins": 0.45515137910842896, "rewards/rejected": -0.18457336723804474, "step": 200 }, { "epoch": 0.11070110701107011, "grad_norm": 136.3848319233592, "learning_rate": 9.724565102793885e-07, "logits/chosen": -2.132031202316284, "logits/rejected": -1.9910156726837158, "logps/chosen": -309.1499938964844, "logps/rejected": -427.70001220703125, "loss": 0.6119, "rewards/accuracies": 0.643750011920929, "rewards/chosen": 0.6709350347518921, "rewards/margins": 0.545703113079071, "rewards/rejected": 0.125396728515625, "step": 210 }, { "epoch": 0.11597258829731154, "grad_norm": 111.0421745492455, "learning_rate": 9.71138639957828e-07, "logits/chosen": -2.0023436546325684, "logits/rejected": -2.047656297683716, "logps/chosen": -316.1499938964844, "logps/rejected": -404.70001220703125, "loss": 0.6188, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.576123058795929, "rewards/margins": 0.4097534120082855, "rewards/rejected": 0.16595458984375, "step": 220 }, { "epoch": 0.12124406958355298, "grad_norm": 151.14573567582264, "learning_rate": 9.698207696362677e-07, "logits/chosen": -2.137890577316284, "logits/rejected": -2.0777344703674316, "logps/chosen": -311.45001220703125, "logps/rejected": -378.25, "loss": 0.613, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": 0.2848877012729645, "rewards/margins": 0.46009522676467896, "rewards/rejected": -0.17474365234375, "step": 230 }, { "epoch": 0.1265155508697944, "grad_norm": 122.98499689433284, "learning_rate": 9.685028993147074e-07, "logits/chosen": -2.069140672683716, "logits/rejected": -2.0101561546325684, "logps/chosen": -331.8999938964844, "logps/rejected": -375.54998779296875, "loss": 0.6312, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.10131225734949112, "rewards/margins": 0.37434083223342896, "rewards/rejected": -0.4754882752895355, "step": 240 }, { "epoch": 0.13178703215603585, "grad_norm": 95.45237678882735, "learning_rate": 9.671850289931471e-07, "logits/chosen": -1.9617187976837158, "logits/rejected": -1.966796875, "logps/chosen": -316.20001220703125, "logps/rejected": -379.70001220703125, "loss": 0.5431, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 0.04619140550494194, "rewards/margins": 0.66455078125, "rewards/rejected": -0.618701159954071, "step": 250 }, { "epoch": 0.1370585134422773, "grad_norm": 110.79858631666913, "learning_rate": 9.658671586715866e-07, "logits/chosen": -2.1167969703674316, "logits/rejected": -2.088671922683716, "logps/chosen": -315.95001220703125, "logps/rejected": -381.95001220703125, "loss": 0.5849, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 0.0006225585821084678, "rewards/margins": 0.554821789264679, "rewards/rejected": -0.554186999797821, "step": 260 }, { "epoch": 0.1423299947285187, "grad_norm": 132.08320573517426, "learning_rate": 9.645492883500263e-07, "logits/chosen": -2.1273436546325684, "logits/rejected": -2.0078125, "logps/chosen": -341.75, "logps/rejected": -427.3999938964844, "loss": 0.6184, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.020355224609375, "rewards/margins": 0.535266101360321, "rewards/rejected": -0.5548950433731079, "step": 270 }, { "epoch": 0.14760147601476015, "grad_norm": 131.9089189507848, "learning_rate": 9.63231418028466e-07, "logits/chosen": -2.068359375, "logits/rejected": -1.982031226158142, "logps/chosen": -341.04998779296875, "logps/rejected": -345.1000061035156, "loss": 0.7317, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.475515753030777, "rewards/margins": 0.20308837294578552, "rewards/rejected": 0.2728637754917145, "step": 280 }, { "epoch": 0.1528729573010016, "grad_norm": 115.53783405260934, "learning_rate": 9.619135477069055e-07, "logits/chosen": -2.1187500953674316, "logits/rejected": -2.037890672683716, "logps/chosen": -374.8999938964844, "logps/rejected": -457.1499938964844, "loss": 0.5269, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.86328125, "rewards/margins": 0.6351867914199829, "rewards/rejected": 0.22807617485523224, "step": 290 }, { "epoch": 0.158144438587243, "grad_norm": 103.32606945777815, "learning_rate": 9.605956773853452e-07, "logits/chosen": -1.925390601158142, "logits/rejected": -1.826562523841858, "logps/chosen": -275.04998779296875, "logps/rejected": -366.8500061035156, "loss": 0.544, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 0.919238269329071, "rewards/margins": 0.605664074420929, "rewards/rejected": 0.313629150390625, "step": 300 }, { "epoch": 0.16341591987348444, "grad_norm": 117.23623208459513, "learning_rate": 9.59277807063785e-07, "logits/chosen": -1.964453101158142, "logits/rejected": -1.88671875, "logps/chosen": -351.25, "logps/rejected": -430.25, "loss": 0.5554, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.785351574420929, "rewards/margins": 0.5738464593887329, "rewards/rejected": 0.21110840141773224, "step": 310 }, { "epoch": 0.16868740115972589, "grad_norm": 105.6660318573469, "learning_rate": 9.579599367422246e-07, "logits/chosen": -1.9453125, "logits/rejected": -1.9406249523162842, "logps/chosen": -342.1000061035156, "logps/rejected": -386.45001220703125, "loss": 0.649, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.5643554925918579, "rewards/margins": 0.4151611328125, "rewards/rejected": 0.14984130859375, "step": 320 }, { "epoch": 0.17395888244596733, "grad_norm": 118.20709048204702, "learning_rate": 9.56642066420664e-07, "logits/chosen": -2.0699219703674316, "logits/rejected": -1.9777343273162842, "logps/chosen": -360.45001220703125, "logps/rejected": -483.54998779296875, "loss": 0.5179, "rewards/accuracies": 0.6875, "rewards/chosen": 0.547515869140625, "rewards/margins": 0.684527575969696, "rewards/rejected": -0.13732299208641052, "step": 330 }, { "epoch": 0.17923036373220874, "grad_norm": 124.30454003799358, "learning_rate": 9.553241960991038e-07, "logits/chosen": -1.9816405773162842, "logits/rejected": -1.960546851158142, "logps/chosen": -332.1499938964844, "logps/rejected": -363.95001220703125, "loss": 0.6514, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": 0.4232421815395355, "rewards/margins": 0.519482433795929, "rewards/rejected": -0.09555663913488388, "step": 340 }, { "epoch": 0.18450184501845018, "grad_norm": 99.85879571289819, "learning_rate": 9.540063257775435e-07, "logits/chosen": -2.014453172683716, "logits/rejected": -1.992578148841858, "logps/chosen": -331.3500061035156, "logps/rejected": -420.70001220703125, "loss": 0.6015, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": 0.59124755859375, "rewards/margins": 0.527453601360321, "rewards/rejected": 0.06422118842601776, "step": 350 }, { "epoch": 0.18977332630469163, "grad_norm": 82.74072605310414, "learning_rate": 9.526884554559831e-07, "logits/chosen": -1.989843726158142, "logits/rejected": -1.958984375, "logps/chosen": -313.45001220703125, "logps/rejected": -392.0, "loss": 0.5981, "rewards/accuracies": 0.625, "rewards/chosen": 0.713427722454071, "rewards/margins": 0.541552722454071, "rewards/rejected": 0.1710205078125, "step": 360 }, { "epoch": 0.19504480759093304, "grad_norm": 162.5270793917395, "learning_rate": 9.513705851344227e-07, "logits/chosen": -1.9609375, "logits/rejected": -1.919531226158142, "logps/chosen": -358.8500061035156, "logps/rejected": -421.29998779296875, "loss": 0.6143, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.523791491985321, "rewards/margins": 0.5987548828125, "rewards/rejected": -0.07493896782398224, "step": 370 }, { "epoch": 0.20031628887717448, "grad_norm": 107.2684477041618, "learning_rate": 9.500527148128624e-07, "logits/chosen": -1.990625023841858, "logits/rejected": -1.976171851158142, "logps/chosen": -298.82501220703125, "logps/rejected": -387.29998779296875, "loss": 0.5295, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 0.4948669373989105, "rewards/margins": 0.820849597454071, "rewards/rejected": -0.3270263671875, "step": 380 }, { "epoch": 0.20558777016341592, "grad_norm": 97.29241559451673, "learning_rate": 9.487348444913021e-07, "logits/chosen": -2.1605467796325684, "logits/rejected": -2.1421875953674316, "logps/chosen": -310.6499938964844, "logps/rejected": -385.25, "loss": 0.6117, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.3321594297885895, "rewards/margins": 0.5486816167831421, "rewards/rejected": -0.21665649116039276, "step": 390 }, { "epoch": 0.21085925144965736, "grad_norm": 130.15982651508153, "learning_rate": 9.474169741697417e-07, "logits/chosen": -1.965234398841858, "logits/rejected": -1.937109351158142, "logps/chosen": -296.8500061035156, "logps/rejected": -389.20001220703125, "loss": 0.5444, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 0.503222644329071, "rewards/margins": 0.7860473394393921, "rewards/rejected": -0.2833496034145355, "step": 400 }, { "epoch": 0.21613073273589878, "grad_norm": 98.45248999727757, "learning_rate": 9.460991038481813e-07, "logits/chosen": -2.051953077316284, "logits/rejected": -2.094531297683716, "logps/chosen": -344.3999938964844, "logps/rejected": -406.79998779296875, "loss": 0.5857, "rewards/accuracies": 0.706250011920929, "rewards/chosen": 0.3736938536167145, "rewards/margins": 0.639575183391571, "rewards/rejected": -0.2657226622104645, "step": 410 }, { "epoch": 0.22140221402214022, "grad_norm": 96.96438727840093, "learning_rate": 9.44781233526621e-07, "logits/chosen": -2.15234375, "logits/rejected": -2.032421827316284, "logps/chosen": -315.29998779296875, "logps/rejected": -406.04998779296875, "loss": 0.617, "rewards/accuracies": 0.65625, "rewards/chosen": 0.42631834745407104, "rewards/margins": 0.597277820110321, "rewards/rejected": -0.17075805366039276, "step": 420 }, { "epoch": 0.22667369530838166, "grad_norm": 91.64510667428438, "learning_rate": 9.434633632050606e-07, "logits/chosen": -1.9968750476837158, "logits/rejected": -1.964453101158142, "logps/chosen": -352.1000061035156, "logps/rejected": -418.79998779296875, "loss": 0.5404, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 0.3512939512729645, "rewards/margins": 0.7843261957168579, "rewards/rejected": -0.43361204862594604, "step": 430 }, { "epoch": 0.23194517659462308, "grad_norm": 85.13112630548243, "learning_rate": 9.421454928835002e-07, "logits/chosen": -1.9792969226837158, "logits/rejected": -1.91796875, "logps/chosen": -279.8500061035156, "logps/rejected": -390.0, "loss": 0.548, "rewards/accuracies": 0.6875, "rewards/chosen": 0.7587890625, "rewards/margins": 0.645263671875, "rewards/rejected": 0.11361084133386612, "step": 440 }, { "epoch": 0.23721665788086452, "grad_norm": 106.517587907862, "learning_rate": 9.408276225619399e-07, "logits/chosen": -1.98828125, "logits/rejected": -1.904687523841858, "logps/chosen": -323.8500061035156, "logps/rejected": -412.3500061035156, "loss": 0.631, "rewards/accuracies": 0.65625, "rewards/chosen": 0.9169067144393921, "rewards/margins": 0.5174804925918579, "rewards/rejected": 0.39887696504592896, "step": 450 }, { "epoch": 0.24248813916710596, "grad_norm": 92.94484207757097, "learning_rate": 9.395097522403796e-07, "logits/chosen": -2.00390625, "logits/rejected": -2.002734422683716, "logps/chosen": -300.25, "logps/rejected": -349.0, "loss": 0.6198, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 1.0193359851837158, "rewards/margins": 0.5384277105331421, "rewards/rejected": 0.48066407442092896, "step": 460 }, { "epoch": 0.2477596204533474, "grad_norm": 102.5177370826966, "learning_rate": 9.381918819188192e-07, "logits/chosen": -1.9582030773162842, "logits/rejected": -2.0054688453674316, "logps/chosen": -305.79998779296875, "logps/rejected": -361.8500061035156, "loss": 0.5884, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 1.1041991710662842, "rewards/margins": 0.6160644292831421, "rewards/rejected": 0.48798829317092896, "step": 470 }, { "epoch": 0.2530311017395888, "grad_norm": 106.31077355218017, "learning_rate": 9.368740115972587e-07, "logits/chosen": -1.8562500476837158, "logits/rejected": -1.86328125, "logps/chosen": -335.20001220703125, "logps/rejected": -384.1499938964844, "loss": 0.5911, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 1.015234351158142, "rewards/margins": 0.549267590045929, "rewards/rejected": 0.46593016386032104, "step": 480 }, { "epoch": 0.25830258302583026, "grad_norm": 113.47682685480567, "learning_rate": 9.355561412756983e-07, "logits/chosen": -1.931640625, "logits/rejected": -1.87109375, "logps/chosen": -306.8999938964844, "logps/rejected": -362.0, "loss": 0.5916, "rewards/accuracies": 0.625, "rewards/chosen": 0.6687682867050171, "rewards/margins": 0.7141357660293579, "rewards/rejected": -0.04491577297449112, "step": 490 }, { "epoch": 0.2635740643120717, "grad_norm": 109.61507291046053, "learning_rate": 9.34238270954138e-07, "logits/chosen": -1.9992187023162842, "logits/rejected": -1.9230468273162842, "logps/chosen": -302.8999938964844, "logps/rejected": -392.8500061035156, "loss": 0.5694, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 0.09088134765625, "rewards/margins": 0.6865234375, "rewards/rejected": -0.596240222454071, "step": 500 }, { "epoch": 0.26884554559831314, "grad_norm": 114.4135424098023, "learning_rate": 9.329204006325777e-07, "logits/chosen": -1.9832031726837158, "logits/rejected": -1.8957030773162842, "logps/chosen": -298.8500061035156, "logps/rejected": -369.54998779296875, "loss": 0.5936, "rewards/accuracies": 0.625, "rewards/chosen": -0.097442626953125, "rewards/margins": 0.5915771722793579, "rewards/rejected": -0.6891235113143921, "step": 510 }, { "epoch": 0.2741170268845546, "grad_norm": 142.98626259736014, "learning_rate": 9.316025303110173e-07, "logits/chosen": -2.0394530296325684, "logits/rejected": -2.0132813453674316, "logps/chosen": -400.0, "logps/rejected": -470.8500061035156, "loss": 0.5735, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.18652954697608948, "rewards/margins": 0.7271484136581421, "rewards/rejected": -0.914599597454071, "step": 520 }, { "epoch": 0.27938850817079597, "grad_norm": 98.41996614960672, "learning_rate": 9.30284659989457e-07, "logits/chosen": -1.9972655773162842, "logits/rejected": -1.963281273841858, "logps/chosen": -369.04998779296875, "logps/rejected": -434.0, "loss": 0.503, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.02504882775247097, "rewards/margins": 0.8104492425918579, "rewards/rejected": -0.7848144769668579, "step": 530 }, { "epoch": 0.2846599894570374, "grad_norm": 137.724991555228, "learning_rate": 9.289667896678966e-07, "logits/chosen": -2.007031202316284, "logits/rejected": -1.993749976158142, "logps/chosen": -339.54998779296875, "logps/rejected": -403.6499938964844, "loss": 0.6356, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.01933593675494194, "rewards/margins": 0.617358386516571, "rewards/rejected": -0.636425793170929, "step": 540 }, { "epoch": 0.28993147074327885, "grad_norm": 87.06901678784463, "learning_rate": 9.276489193463362e-07, "logits/chosen": -1.978124976158142, "logits/rejected": -1.9894530773162842, "logps/chosen": -354.45001220703125, "logps/rejected": -409.45001220703125, "loss": 0.5894, "rewards/accuracies": 0.643750011920929, "rewards/chosen": 0.251181036233902, "rewards/margins": 0.6878417730331421, "rewards/rejected": -0.4364013671875, "step": 550 }, { "epoch": 0.2952029520295203, "grad_norm": 97.16358117373538, "learning_rate": 9.263310490247759e-07, "logits/chosen": -2.04296875, "logits/rejected": -2.0484375953674316, "logps/chosen": -340.79998779296875, "logps/rejected": -367.45001220703125, "loss": 0.6729, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": 0.3115600645542145, "rewards/margins": 0.40521240234375, "rewards/rejected": -0.09353027492761612, "step": 560 }, { "epoch": 0.30047443331576174, "grad_norm": 78.51914925360175, "learning_rate": 9.250131787032156e-07, "logits/chosen": -2.0042967796325684, "logits/rejected": -1.9617187976837158, "logps/chosen": -304.1499938964844, "logps/rejected": -373.04998779296875, "loss": 0.62, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.33323365449905396, "rewards/margins": 0.8343261480331421, "rewards/rejected": -0.5006469488143921, "step": 570 }, { "epoch": 0.3057459146020032, "grad_norm": 84.88536497265127, "learning_rate": 9.236953083816552e-07, "logits/chosen": -1.970312476158142, "logits/rejected": -1.904687523841858, "logps/chosen": -312.6499938964844, "logps/rejected": -362.5, "loss": 0.6177, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": 0.028350830078125, "rewards/margins": 0.5628662109375, "rewards/rejected": -0.53466796875, "step": 580 }, { "epoch": 0.3110173958882446, "grad_norm": 104.17885208614341, "learning_rate": 9.223774380600948e-07, "logits/chosen": -2.0152344703674316, "logits/rejected": -1.932031273841858, "logps/chosen": -324.1000061035156, "logps/rejected": -385.79998779296875, "loss": 0.6033, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.08749695122241974, "rewards/margins": 0.544726550579071, "rewards/rejected": -0.4577392637729645, "step": 590 }, { "epoch": 0.316288877174486, "grad_norm": 100.54071588179156, "learning_rate": 9.210595677385344e-07, "logits/chosen": -2.0199217796325684, "logits/rejected": -2.0269532203674316, "logps/chosen": -354.20001220703125, "logps/rejected": -401.6000061035156, "loss": 0.6158, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": 0.04094848781824112, "rewards/margins": 0.524829089641571, "rewards/rejected": -0.48335570096969604, "step": 600 }, { "epoch": 0.32156035846072745, "grad_norm": 86.36314998579896, "learning_rate": 9.197416974169741e-07, "logits/chosen": -2.0875000953674316, "logits/rejected": -2.010937452316284, "logps/chosen": -296.3999938964844, "logps/rejected": -416.95001220703125, "loss": 0.5575, "rewards/accuracies": 0.71875, "rewards/chosen": 0.20690307021141052, "rewards/margins": 0.783886730670929, "rewards/rejected": -0.5778747797012329, "step": 610 }, { "epoch": 0.3268318397469689, "grad_norm": 73.7154518362987, "learning_rate": 9.184238270954138e-07, "logits/chosen": -1.990625023841858, "logits/rejected": -1.894921898841858, "logps/chosen": -314.5, "logps/rejected": -402.3500061035156, "loss": 0.5317, "rewards/accuracies": 0.731249988079071, "rewards/chosen": 0.47008055448532104, "rewards/margins": 0.8702636957168579, "rewards/rejected": -0.40043944120407104, "step": 620 }, { "epoch": 0.33210332103321033, "grad_norm": 111.88728762623147, "learning_rate": 9.171059567738534e-07, "logits/chosen": -2.040234327316284, "logits/rejected": -1.955078125, "logps/chosen": -320.20001220703125, "logps/rejected": -394.70001220703125, "loss": 0.5693, "rewards/accuracies": 0.768750011920929, "rewards/chosen": 0.28287965059280396, "rewards/margins": 0.8678222894668579, "rewards/rejected": -0.5846191644668579, "step": 630 }, { "epoch": 0.33737480231945177, "grad_norm": 126.20277273290164, "learning_rate": 9.157880864522931e-07, "logits/chosen": -2.043750047683716, "logits/rejected": -2.0355467796325684, "logps/chosen": -380.3500061035156, "logps/rejected": -467.8999938964844, "loss": 0.571, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.0045532225631177425, "rewards/margins": 0.782519519329071, "rewards/rejected": -0.7864990234375, "step": 640 }, { "epoch": 0.3426462836056932, "grad_norm": 91.83701421157004, "learning_rate": 9.144702161307327e-07, "logits/chosen": -1.8976562023162842, "logits/rejected": -1.9070312976837158, "logps/chosen": -301.20001220703125, "logps/rejected": -326.5, "loss": 0.6066, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.07406006008386612, "rewards/margins": 0.561474621295929, "rewards/rejected": -0.4874267578125, "step": 650 }, { "epoch": 0.34791776489193466, "grad_norm": 112.33735669326646, "learning_rate": 9.131523458091723e-07, "logits/chosen": -1.972265601158142, "logits/rejected": -1.937109351158142, "logps/chosen": -340.04998779296875, "logps/rejected": -423.70001220703125, "loss": 0.6571, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.2785888612270355, "rewards/margins": 0.593029797077179, "rewards/rejected": -0.8721679449081421, "step": 660 }, { "epoch": 0.35318924617817604, "grad_norm": 93.89169525086463, "learning_rate": 9.118344754876119e-07, "logits/chosen": -2.049609422683716, "logits/rejected": -1.900390625, "logps/chosen": -322.1000061035156, "logps/rejected": -409.79998779296875, "loss": 0.6378, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.10554198920726776, "rewards/margins": 0.590991199016571, "rewards/rejected": -0.696362316608429, "step": 670 }, { "epoch": 0.3584607274644175, "grad_norm": 100.63789271607763, "learning_rate": 9.105166051660517e-07, "logits/chosen": -2.0875000953674316, "logits/rejected": -2.043750047683716, "logps/chosen": -352.0, "logps/rejected": -453.0, "loss": 0.5119, "rewards/accuracies": 0.731249988079071, "rewards/chosen": 0.068206787109375, "rewards/margins": 0.8328613042831421, "rewards/rejected": -0.765209972858429, "step": 680 }, { "epoch": 0.3637322087506589, "grad_norm": 84.54932559822319, "learning_rate": 9.091987348444913e-07, "logits/chosen": -1.9660155773162842, "logits/rejected": -1.921875, "logps/chosen": -290.6499938964844, "logps/rejected": -381.3999938964844, "loss": 0.6366, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": 0.12758788466453552, "rewards/margins": 0.58978271484375, "rewards/rejected": -0.4613281190395355, "step": 690 }, { "epoch": 0.36900369003690037, "grad_norm": 130.2375315250302, "learning_rate": 9.078808645229309e-07, "logits/chosen": -2.022265672683716, "logits/rejected": -2.0390625, "logps/chosen": -345.25, "logps/rejected": -396.45001220703125, "loss": 0.5488, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.46160888671875, "rewards/margins": 0.767773449420929, "rewards/rejected": -0.30653077363967896, "step": 700 }, { "epoch": 0.3742751713231418, "grad_norm": 190.9898355535957, "learning_rate": 9.065629942013705e-07, "logits/chosen": -2.0621094703674316, "logits/rejected": -1.9832031726837158, "logps/chosen": -300.54998779296875, "logps/rejected": -355.6000061035156, "loss": 0.6877, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.46812742948532104, "rewards/margins": 0.5099853277206421, "rewards/rejected": -0.04106445237994194, "step": 710 }, { "epoch": 0.37954665260938325, "grad_norm": 125.49776190173313, "learning_rate": 9.052451238798102e-07, "logits/chosen": -2.0765624046325684, "logits/rejected": -2.0367188453674316, "logps/chosen": -350.8999938964844, "logps/rejected": -408.8500061035156, "loss": 0.6169, "rewards/accuracies": 0.65625, "rewards/chosen": 0.7671142816543579, "rewards/margins": 0.672314465045929, "rewards/rejected": 0.09436645358800888, "step": 720 }, { "epoch": 0.3848181338956247, "grad_norm": 121.34764704099258, "learning_rate": 9.039272535582499e-07, "logits/chosen": -2.0562500953674316, "logits/rejected": -1.966796875, "logps/chosen": -337.1000061035156, "logps/rejected": -428.5, "loss": 0.5972, "rewards/accuracies": 0.65625, "rewards/chosen": 0.755126953125, "rewards/margins": 0.6612793207168579, "rewards/rejected": 0.09420166164636612, "step": 730 }, { "epoch": 0.3900896151818661, "grad_norm": 97.4103929630674, "learning_rate": 9.026093832366895e-07, "logits/chosen": -1.847265601158142, "logits/rejected": -1.917578101158142, "logps/chosen": -370.8500061035156, "logps/rejected": -387.1000061035156, "loss": 0.6497, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.735607922077179, "rewards/margins": 0.50616455078125, "rewards/rejected": 0.22950439155101776, "step": 740 }, { "epoch": 0.3953610964681075, "grad_norm": 125.93701222194377, "learning_rate": 9.012915129151291e-07, "logits/chosen": -2.0, "logits/rejected": -1.898828148841858, "logps/chosen": -316.3500061035156, "logps/rejected": -436.1000061035156, "loss": 0.5449, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.7518066167831421, "rewards/margins": 0.8001464605331421, "rewards/rejected": -0.04887695237994194, "step": 750 }, { "epoch": 0.40063257775434896, "grad_norm": 97.41982588096855, "learning_rate": 8.999736425935688e-07, "logits/chosen": -2.022656202316284, "logits/rejected": -1.947265625, "logps/chosen": -271.54998779296875, "logps/rejected": -367.29998779296875, "loss": 0.5656, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.9366210699081421, "rewards/margins": 0.590258777141571, "rewards/rejected": 0.3467651307582855, "step": 760 }, { "epoch": 0.4059040590405904, "grad_norm": 61.45983056947825, "learning_rate": 8.986557722720084e-07, "logits/chosen": -1.974218726158142, "logits/rejected": -1.986328125, "logps/chosen": -311.75, "logps/rejected": -381.1000061035156, "loss": 0.5645, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.814648449420929, "rewards/margins": 0.768157958984375, "rewards/rejected": 0.04627685621380806, "step": 770 }, { "epoch": 0.41117554032683185, "grad_norm": 125.14965396950289, "learning_rate": 8.97337901950448e-07, "logits/chosen": -2.018749952316284, "logits/rejected": -1.8738281726837158, "logps/chosen": -309.8999938964844, "logps/rejected": -405.1000061035156, "loss": 0.5422, "rewards/accuracies": 0.706250011920929, "rewards/chosen": 0.3410400450229645, "rewards/margins": 0.754345715045929, "rewards/rejected": -0.4133667051792145, "step": 780 }, { "epoch": 0.4164470216130733, "grad_norm": 109.16459294158221, "learning_rate": 8.960200316288878e-07, "logits/chosen": -2.126171827316284, "logits/rejected": -2.065234422683716, "logps/chosen": -331.70001220703125, "logps/rejected": -403.6000061035156, "loss": 0.6036, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.131134033203125, "rewards/margins": 0.755053699016571, "rewards/rejected": -0.885546863079071, "step": 790 }, { "epoch": 0.42171850289931473, "grad_norm": 75.39232165894488, "learning_rate": 8.947021613073274e-07, "logits/chosen": -2.010937452316284, "logits/rejected": -1.9933593273162842, "logps/chosen": -319.70001220703125, "logps/rejected": -354.70001220703125, "loss": 0.5283, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.2709716856479645, "rewards/margins": 0.789599597454071, "rewards/rejected": -0.5177246332168579, "step": 800 }, { "epoch": 0.4269899841855561, "grad_norm": 92.82810565396575, "learning_rate": 8.93384290985767e-07, "logits/chosen": -1.912500023841858, "logits/rejected": -1.9363281726837158, "logps/chosen": -328.625, "logps/rejected": -403.3500061035156, "loss": 0.5666, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.28115540742874146, "rewards/margins": 0.9566406011581421, "rewards/rejected": -0.6754150390625, "step": 810 }, { "epoch": 0.43226146547179756, "grad_norm": 104.8313101896086, "learning_rate": 8.920664206642066e-07, "logits/chosen": -2.048046827316284, "logits/rejected": -2.021484375, "logps/chosen": -308.6000061035156, "logps/rejected": -413.5, "loss": 0.5877, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.40632325410842896, "rewards/margins": 0.764294445514679, "rewards/rejected": -0.3574585020542145, "step": 820 }, { "epoch": 0.437532946758039, "grad_norm": 110.53029700608015, "learning_rate": 8.907485503426463e-07, "logits/chosen": -1.9968750476837158, "logits/rejected": -1.976953148841858, "logps/chosen": -344.6499938964844, "logps/rejected": -410.6000061035156, "loss": 0.6289, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.3232177793979645, "rewards/margins": 0.6004883050918579, "rewards/rejected": -0.27794188261032104, "step": 830 }, { "epoch": 0.44280442804428044, "grad_norm": 120.8985802197319, "learning_rate": 8.894306800210858e-07, "logits/chosen": -2.0121092796325684, "logits/rejected": -2.082812547683716, "logps/chosen": -327.04998779296875, "logps/rejected": -360.6499938964844, "loss": 0.6841, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.474853515625, "rewards/margins": 0.566967785358429, "rewards/rejected": -0.09184570610523224, "step": 840 }, { "epoch": 0.4480759093305219, "grad_norm": 92.9989393943493, "learning_rate": 8.881128096995255e-07, "logits/chosen": -1.851953148841858, "logits/rejected": -1.898828148841858, "logps/chosen": -338.29998779296875, "logps/rejected": -387.54998779296875, "loss": 0.6369, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": 0.70556640625, "rewards/margins": 0.6152588129043579, "rewards/rejected": 0.09073486179113388, "step": 850 }, { "epoch": 0.4533473906167633, "grad_norm": 107.49702067097238, "learning_rate": 8.867949393779651e-07, "logits/chosen": -1.992578148841858, "logits/rejected": -2.0074219703674316, "logps/chosen": -288.25, "logps/rejected": -354.29998779296875, "loss": 0.6415, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": 0.8790038824081421, "rewards/margins": 0.501232922077179, "rewards/rejected": 0.3776001036167145, "step": 860 }, { "epoch": 0.45861887190300477, "grad_norm": 119.00460749798071, "learning_rate": 8.854770690564048e-07, "logits/chosen": -2.0335936546325684, "logits/rejected": -2.005078077316284, "logps/chosen": -338.5, "logps/rejected": -413.95001220703125, "loss": 0.5527, "rewards/accuracies": 0.706250011920929, "rewards/chosen": 0.994335949420929, "rewards/margins": 0.786938488483429, "rewards/rejected": 0.20623168349266052, "step": 870 }, { "epoch": 0.46389035318924615, "grad_norm": 96.81624577758286, "learning_rate": 8.841591987348444e-07, "logits/chosen": -2.061718702316284, "logits/rejected": -2.024218797683716, "logps/chosen": -345.1499938964844, "logps/rejected": -379.5, "loss": 0.6212, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.829785168170929, "rewards/margins": 0.4728149473667145, "rewards/rejected": 0.35698240995407104, "step": 880 }, { "epoch": 0.4691618344754876, "grad_norm": 175.57757498062307, "learning_rate": 8.82841328413284e-07, "logits/chosen": -2.127734422683716, "logits/rejected": -2.072265625, "logps/chosen": -330.04998779296875, "logps/rejected": -443.6000061035156, "loss": 0.6017, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.36311036348342896, "rewards/margins": 0.570813000202179, "rewards/rejected": -0.20772095024585724, "step": 890 }, { "epoch": 0.47443331576172904, "grad_norm": 116.28765533631079, "learning_rate": 8.815234580917237e-07, "logits/chosen": -2.129687547683716, "logits/rejected": -1.991796851158142, "logps/chosen": -329.6000061035156, "logps/rejected": -386.8500061035156, "loss": 0.6025, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.103759765625, "rewards/margins": 0.5985107421875, "rewards/rejected": -0.494873046875, "step": 900 }, { "epoch": 0.4797047970479705, "grad_norm": 85.34827585606295, "learning_rate": 8.802055877701634e-07, "logits/chosen": -2.0269532203674316, "logits/rejected": -1.970312476158142, "logps/chosen": -351.29998779296875, "logps/rejected": -431.75, "loss": 0.5521, "rewards/accuracies": 0.706250011920929, "rewards/chosen": 0.01578369177877903, "rewards/margins": 0.7275635004043579, "rewards/rejected": -0.71148681640625, "step": 910 }, { "epoch": 0.4849762783342119, "grad_norm": 105.80536466980202, "learning_rate": 8.78887717448603e-07, "logits/chosen": -2.0296874046325684, "logits/rejected": -1.948828101158142, "logps/chosen": -382.25, "logps/rejected": -414.45001220703125, "loss": 0.6486, "rewards/accuracies": 0.65625, "rewards/chosen": 0.0042968750931322575, "rewards/margins": 0.573486328125, "rewards/rejected": -0.5692383050918579, "step": 920 }, { "epoch": 0.49024775962045336, "grad_norm": 107.8111061874468, "learning_rate": 8.775698471270426e-07, "logits/chosen": -1.87109375, "logits/rejected": -1.9249999523162842, "logps/chosen": -341.1000061035156, "logps/rejected": -376.54998779296875, "loss": 0.6121, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.01925048790872097, "rewards/margins": 0.7153564691543579, "rewards/rejected": -0.6962890625, "step": 930 }, { "epoch": 0.4955192409066948, "grad_norm": 97.42545069557768, "learning_rate": 8.762519768054823e-07, "logits/chosen": -2.0054688453674316, "logits/rejected": -2.018749952316284, "logps/chosen": -325.1000061035156, "logps/rejected": -394.95001220703125, "loss": 0.5867, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.07335510104894638, "rewards/margins": 0.741058349609375, "rewards/rejected": -0.66827392578125, "step": 940 }, { "epoch": 0.5007907221929362, "grad_norm": 127.09246757688649, "learning_rate": 8.749341064839219e-07, "logits/chosen": -2.040234327316284, "logits/rejected": -2.0484375953674316, "logps/chosen": -347.0, "logps/rejected": -414.45001220703125, "loss": 0.5683, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.05980835109949112, "rewards/margins": 0.840686023235321, "rewards/rejected": -0.7814697027206421, "step": 950 }, { "epoch": 0.5060622034791776, "grad_norm": 82.46814506013281, "learning_rate": 8.736162361623616e-07, "logits/chosen": -2.0277342796325684, "logits/rejected": -2.0160155296325684, "logps/chosen": -282.6000061035156, "logps/rejected": -376.1000061035156, "loss": 0.5762, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 0.5047851800918579, "rewards/margins": 0.718554675579071, "rewards/rejected": -0.21333007514476776, "step": 960 }, { "epoch": 0.5113336847654191, "grad_norm": 126.10996277348774, "learning_rate": 8.722983658408012e-07, "logits/chosen": -2.0746092796325684, "logits/rejected": -2.05859375, "logps/chosen": -388.8500061035156, "logps/rejected": -387.6499938964844, "loss": 0.6567, "rewards/accuracies": 0.581250011920929, "rewards/chosen": 0.704174816608429, "rewards/margins": 0.40092772245407104, "rewards/rejected": 0.3035644590854645, "step": 970 }, { "epoch": 0.5166051660516605, "grad_norm": 133.81908802294237, "learning_rate": 8.709804955192409e-07, "logits/chosen": -2.010546922683716, "logits/rejected": -1.94921875, "logps/chosen": -317.3999938964844, "logps/rejected": -359.6499938964844, "loss": 0.5841, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 1.0034668445587158, "rewards/margins": 0.5387939214706421, "rewards/rejected": 0.464019775390625, "step": 980 }, { "epoch": 0.521876647337902, "grad_norm": 138.45336217629753, "learning_rate": 8.696626251976805e-07, "logits/chosen": -1.9070312976837158, "logits/rejected": -1.974609375, "logps/chosen": -345.79998779296875, "logps/rejected": -401.8500061035156, "loss": 0.5617, "rewards/accuracies": 0.6875, "rewards/chosen": 0.7447509765625, "rewards/margins": 0.648938000202179, "rewards/rejected": 0.09566040337085724, "step": 990 }, { "epoch": 0.5271481286241434, "grad_norm": 108.99583654570881, "learning_rate": 8.683447548761201e-07, "logits/chosen": -1.9921875, "logits/rejected": -1.947656273841858, "logps/chosen": -323.0249938964844, "logps/rejected": -389.25, "loss": 0.5551, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.662670910358429, "rewards/margins": 0.76806640625, "rewards/rejected": -0.10554198920726776, "step": 1000 }, { "epoch": 0.5324196099103848, "grad_norm": 89.1859621826139, "learning_rate": 8.670268845545597e-07, "logits/chosen": -2.03125, "logits/rejected": -2.0433592796325684, "logps/chosen": -295.6000061035156, "logps/rejected": -359.79998779296875, "loss": 0.5702, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.5596679449081421, "rewards/margins": 0.7015014886856079, "rewards/rejected": -0.14128418266773224, "step": 1010 }, { "epoch": 0.5376910911966263, "grad_norm": 78.05467978454385, "learning_rate": 8.657090142329995e-07, "logits/chosen": -2.075390577316284, "logits/rejected": -2.100781202316284, "logps/chosen": -386.3999938964844, "logps/rejected": -412.3999938964844, "loss": 0.5265, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.4942260682582855, "rewards/margins": 0.8224121332168579, "rewards/rejected": -0.32843017578125, "step": 1020 }, { "epoch": 0.5429625724828677, "grad_norm": 87.44295181727979, "learning_rate": 8.643911439114391e-07, "logits/chosen": -2.014453172683716, "logits/rejected": -2.01171875, "logps/chosen": -318.4750061035156, "logps/rejected": -418.0, "loss": 0.5145, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.654833972454071, "rewards/margins": 0.9366210699081421, "rewards/rejected": -0.28251951932907104, "step": 1030 }, { "epoch": 0.5482340537691092, "grad_norm": 100.23929232866955, "learning_rate": 8.630732735898787e-07, "logits/chosen": -2.007031202316284, "logits/rejected": -2.013671875, "logps/chosen": -283.75, "logps/rejected": -374.6499938964844, "loss": 0.5591, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 0.801953136920929, "rewards/margins": 0.718518078327179, "rewards/rejected": 0.08266296237707138, "step": 1040 }, { "epoch": 0.5535055350553506, "grad_norm": 108.95575166261794, "learning_rate": 8.617554032683183e-07, "logits/chosen": -1.962890625, "logits/rejected": -1.9265625476837158, "logps/chosen": -292.57501220703125, "logps/rejected": -352.8500061035156, "loss": 0.6089, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.878710925579071, "rewards/margins": 0.7952514886856079, "rewards/rejected": 0.08334960788488388, "step": 1050 }, { "epoch": 0.5587770163415919, "grad_norm": 105.85671468746617, "learning_rate": 8.60437532946758e-07, "logits/chosen": -2.065234422683716, "logits/rejected": -2.0835938453674316, "logps/chosen": -319.3999938964844, "logps/rejected": -359.6499938964844, "loss": 0.6015, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.675048828125, "rewards/margins": 0.5667968988418579, "rewards/rejected": 0.10922851413488388, "step": 1060 }, { "epoch": 0.5640484976278334, "grad_norm": 105.64186170732084, "learning_rate": 8.591196626251977e-07, "logits/chosen": -1.956640601158142, "logits/rejected": -1.9402344226837158, "logps/chosen": -303.54998779296875, "logps/rejected": -359.8500061035156, "loss": 0.6936, "rewards/accuracies": 0.606249988079071, "rewards/chosen": 0.808947741985321, "rewards/margins": 0.4760986268520355, "rewards/rejected": 0.33220213651657104, "step": 1070 }, { "epoch": 0.5693199789140748, "grad_norm": 87.76900573136048, "learning_rate": 8.578017923036373e-07, "logits/chosen": -2.0230469703674316, "logits/rejected": -2.0132813453674316, "logps/chosen": -319.1499938964844, "logps/rejected": -397.20001220703125, "loss": 0.5213, "rewards/accuracies": 0.706250011920929, "rewards/chosen": 0.983203113079071, "rewards/margins": 0.92510986328125, "rewards/rejected": 0.05921630933880806, "step": 1080 }, { "epoch": 0.5745914602003163, "grad_norm": 124.91653540340118, "learning_rate": 8.56483921982077e-07, "logits/chosen": -2.033984422683716, "logits/rejected": -2.049609422683716, "logps/chosen": -299.1499938964844, "logps/rejected": -341.3500061035156, "loss": 0.6149, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": 0.6735809445381165, "rewards/margins": 0.682177722454071, "rewards/rejected": -0.00800170935690403, "step": 1090 }, { "epoch": 0.5798629414865577, "grad_norm": 103.36003292184715, "learning_rate": 8.551660516605166e-07, "logits/chosen": -1.984375, "logits/rejected": -1.928125023841858, "logps/chosen": -307.04998779296875, "logps/rejected": -375.95001220703125, "loss": 0.5512, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 0.7284179925918579, "rewards/margins": 0.8141113519668579, "rewards/rejected": -0.08619384467601776, "step": 1100 }, { "epoch": 0.5851344227727991, "grad_norm": 80.82027091406354, "learning_rate": 8.538481813389562e-07, "logits/chosen": -2.071093797683716, "logits/rejected": -1.9890625476837158, "logps/chosen": -292.95001220703125, "logps/rejected": -336.0, "loss": 0.5993, "rewards/accuracies": 0.625, "rewards/chosen": 1.088476538658142, "rewards/margins": 0.6120849847793579, "rewards/rejected": 0.475555419921875, "step": 1110 }, { "epoch": 0.5904059040590406, "grad_norm": 119.73894305180849, "learning_rate": 8.525303110173958e-07, "logits/chosen": -2.120312452316284, "logits/rejected": -2.0804686546325684, "logps/chosen": -349.29998779296875, "logps/rejected": -417.25, "loss": 0.6174, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 1.135290503501892, "rewards/margins": 0.6104736328125, "rewards/rejected": 0.5245605707168579, "step": 1120 }, { "epoch": 0.595677385345282, "grad_norm": 85.74337734300799, "learning_rate": 8.512124406958356e-07, "logits/chosen": -2.0433592796325684, "logits/rejected": -2.038281202316284, "logps/chosen": -349.79998779296875, "logps/rejected": -431.79998779296875, "loss": 0.5842, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 1.137792944908142, "rewards/margins": 0.7589355707168579, "rewards/rejected": 0.37849122285842896, "step": 1130 }, { "epoch": 0.6009488666315235, "grad_norm": 151.1400519751203, "learning_rate": 8.498945703742752e-07, "logits/chosen": -2.126953125, "logits/rejected": -2.094531297683716, "logps/chosen": -397.0, "logps/rejected": -440.8999938964844, "loss": 0.6103, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.946093738079071, "rewards/margins": 0.664440929889679, "rewards/rejected": 0.2817749083042145, "step": 1140 }, { "epoch": 0.6062203479177649, "grad_norm": 142.63752350388125, "learning_rate": 8.485767000527148e-07, "logits/chosen": -2.0257811546325684, "logits/rejected": -2.024609327316284, "logps/chosen": -351.17498779296875, "logps/rejected": -417.8999938964844, "loss": 0.5306, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 0.9375365972518921, "rewards/margins": 0.963513195514679, "rewards/rejected": -0.02556152269244194, "step": 1150 }, { "epoch": 0.6114918292040064, "grad_norm": 79.82786167058988, "learning_rate": 8.472588297311544e-07, "logits/chosen": -2.0015625953674316, "logits/rejected": -1.937890648841858, "logps/chosen": -340.6499938964844, "logps/rejected": -389.25, "loss": 0.5855, "rewards/accuracies": 0.71875, "rewards/chosen": 0.690356433391571, "rewards/margins": 0.795825183391571, "rewards/rejected": -0.10526733100414276, "step": 1160 }, { "epoch": 0.6167633104902478, "grad_norm": 86.86244435861752, "learning_rate": 8.459409594095941e-07, "logits/chosen": -2.06640625, "logits/rejected": -2.0679688453674316, "logps/chosen": -331.95001220703125, "logps/rejected": -397.70001220703125, "loss": 0.5361, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 0.4412475526332855, "rewards/margins": 0.800097644329071, "rewards/rejected": -0.3589721620082855, "step": 1170 }, { "epoch": 0.6220347917764892, "grad_norm": 73.06107335916981, "learning_rate": 8.446230890880337e-07, "logits/chosen": -2.053515672683716, "logits/rejected": -1.995703101158142, "logps/chosen": -349.04998779296875, "logps/rejected": -435.29998779296875, "loss": 0.5456, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.0010253905784338713, "rewards/margins": 0.8890136480331421, "rewards/rejected": -0.890332043170929, "step": 1180 }, { "epoch": 0.6273062730627307, "grad_norm": 137.58357548083754, "learning_rate": 8.433052187664734e-07, "logits/chosen": -2.073437452316284, "logits/rejected": -2.120312452316284, "logps/chosen": -309.29998779296875, "logps/rejected": -401.6000061035156, "loss": 0.5575, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 0.10422363132238388, "rewards/margins": 0.783447265625, "rewards/rejected": -0.6797424554824829, "step": 1190 }, { "epoch": 0.632577754348972, "grad_norm": 85.58840960915659, "learning_rate": 8.419873484449131e-07, "logits/chosen": -2.046875, "logits/rejected": -1.9640624523162842, "logps/chosen": -329.1499938964844, "logps/rejected": -403.29998779296875, "loss": 0.5994, "rewards/accuracies": 0.643750011920929, "rewards/chosen": 0.3576904237270355, "rewards/margins": 0.699047863483429, "rewards/rejected": -0.3411621153354645, "step": 1200 }, { "epoch": 0.6378492356352135, "grad_norm": 167.02710754266025, "learning_rate": 8.406694781233526e-07, "logits/chosen": -2.0921874046325684, "logits/rejected": -2.059375047683716, "logps/chosen": -338.375, "logps/rejected": -437.8500061035156, "loss": 0.5824, "rewards/accuracies": 0.6875, "rewards/chosen": 0.6830810308456421, "rewards/margins": 0.7319091558456421, "rewards/rejected": -0.05036621168255806, "step": 1210 }, { "epoch": 0.6431207169214549, "grad_norm": 101.18862541515375, "learning_rate": 8.393516078017922e-07, "logits/chosen": -2.120312452316284, "logits/rejected": -2.1195311546325684, "logps/chosen": -337.5, "logps/rejected": -411.45001220703125, "loss": 0.6015, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.835205078125, "rewards/margins": 0.645062267780304, "rewards/rejected": 0.19064942002296448, "step": 1220 }, { "epoch": 0.6483921982076963, "grad_norm": 108.06224320280167, "learning_rate": 8.380337374802318e-07, "logits/chosen": -2.1800780296325684, "logits/rejected": -2.170703172683716, "logps/chosen": -344.3500061035156, "logps/rejected": -400.8999938964844, "loss": 0.575, "rewards/accuracies": 0.6875, "rewards/chosen": 0.8465820550918579, "rewards/margins": 0.687084972858429, "rewards/rejected": 0.15937499701976776, "step": 1230 }, { "epoch": 0.6536636794939378, "grad_norm": 132.74184602191545, "learning_rate": 8.367158671586716e-07, "logits/chosen": -2.078906297683716, "logits/rejected": -2.030078172683716, "logps/chosen": -388.95001220703125, "logps/rejected": -454.5, "loss": 0.5835, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.42576295137405396, "rewards/margins": 0.697155773639679, "rewards/rejected": -0.27086180448532104, "step": 1240 }, { "epoch": 0.6589351607801792, "grad_norm": 114.59972736810592, "learning_rate": 8.353979968371112e-07, "logits/chosen": -2.009765625, "logits/rejected": -1.9375, "logps/chosen": -326.45001220703125, "logps/rejected": -418.75, "loss": 0.5552, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.5306152105331421, "rewards/margins": 0.839111328125, "rewards/rejected": -0.30792236328125, "step": 1250 }, { "epoch": 0.6642066420664207, "grad_norm": 139.51293882892705, "learning_rate": 8.340801265155508e-07, "logits/chosen": -1.944921851158142, "logits/rejected": -1.982421875, "logps/chosen": -349.82501220703125, "logps/rejected": -420.04998779296875, "loss": 0.565, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 0.5362304449081421, "rewards/margins": 0.844531238079071, "rewards/rejected": -0.308013916015625, "step": 1260 }, { "epoch": 0.6694781233526621, "grad_norm": 106.59152454418005, "learning_rate": 8.327622561939904e-07, "logits/chosen": -2.0054688453674316, "logits/rejected": -1.9523437023162842, "logps/chosen": -321.29998779296875, "logps/rejected": -425.45001220703125, "loss": 0.6043, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 0.6126464605331421, "rewards/margins": 0.6460937261581421, "rewards/rejected": -0.03377075120806694, "step": 1270 }, { "epoch": 0.6747496046389035, "grad_norm": 96.02176842423708, "learning_rate": 8.314443858724301e-07, "logits/chosen": -2.065234422683716, "logits/rejected": -2.0335936546325684, "logps/chosen": -314.0, "logps/rejected": -376.5, "loss": 0.6279, "rewards/accuracies": 0.65625, "rewards/chosen": 0.6671386957168579, "rewards/margins": 0.6113525629043579, "rewards/rejected": 0.055908203125, "step": 1280 }, { "epoch": 0.680021085925145, "grad_norm": 66.52007601045193, "learning_rate": 8.301265155508697e-07, "logits/chosen": -2.0492186546325684, "logits/rejected": -2.020312547683716, "logps/chosen": -329.0, "logps/rejected": -381.6000061035156, "loss": 0.6161, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.92803955078125, "rewards/margins": 0.579052746295929, "rewards/rejected": 0.34882813692092896, "step": 1290 }, { "epoch": 0.6852925672113864, "grad_norm": 133.30902882583553, "learning_rate": 8.288086452293094e-07, "logits/chosen": -2.0894532203674316, "logits/rejected": -2.0511717796325684, "logps/chosen": -350.70001220703125, "logps/rejected": -436.0, "loss": 0.4596, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 1.0424315929412842, "rewards/margins": 1.039160132408142, "rewards/rejected": 0.0038085938431322575, "step": 1300 }, { "epoch": 0.6905640484976279, "grad_norm": 143.35726911639213, "learning_rate": 8.27490774907749e-07, "logits/chosen": -1.994531273841858, "logits/rejected": -2.076171875, "logps/chosen": -339.1499938964844, "logps/rejected": -377.0, "loss": 0.636, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": 0.9165283441543579, "rewards/margins": 0.647814929485321, "rewards/rejected": 0.2684082090854645, "step": 1310 }, { "epoch": 0.6958355297838693, "grad_norm": 132.4370570925169, "learning_rate": 8.261729045861887e-07, "logits/chosen": -2.037109375, "logits/rejected": -2.025390625, "logps/chosen": -290.8999938964844, "logps/rejected": -358.6000061035156, "loss": 0.5335, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 0.8968750238418579, "rewards/margins": 0.853759765625, "rewards/rejected": 0.04276733472943306, "step": 1320 }, { "epoch": 0.7011070110701108, "grad_norm": 110.65003219960973, "learning_rate": 8.248550342646283e-07, "logits/chosen": -2.032421827316284, "logits/rejected": -1.9914062023162842, "logps/chosen": -353.20001220703125, "logps/rejected": -429.75, "loss": 0.5762, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.553356945514679, "rewards/margins": 0.8409057855606079, "rewards/rejected": -0.2880004942417145, "step": 1330 }, { "epoch": 0.7063784923563521, "grad_norm": 101.27303694325262, "learning_rate": 8.235371639430679e-07, "logits/chosen": -1.997656226158142, "logits/rejected": -1.947265625, "logps/chosen": -297.29998779296875, "logps/rejected": -382.1000061035156, "loss": 0.5728, "rewards/accuracies": 0.625, "rewards/chosen": 0.6056976318359375, "rewards/margins": 0.834716796875, "rewards/rejected": -0.22929687798023224, "step": 1340 }, { "epoch": 0.7116499736425935, "grad_norm": 109.42760109215067, "learning_rate": 8.222192936215076e-07, "logits/chosen": -2.076171875, "logits/rejected": -2.0621094703674316, "logps/chosen": -339.95001220703125, "logps/rejected": -414.8999938964844, "loss": 0.52, "rewards/accuracies": 0.71875, "rewards/chosen": 0.5984252691268921, "rewards/margins": 1.001562476158142, "rewards/rejected": -0.40431517362594604, "step": 1350 }, { "epoch": 0.716921454928835, "grad_norm": 83.74307351076158, "learning_rate": 8.209014232999473e-07, "logits/chosen": -2.059375047683716, "logits/rejected": -2.048828125, "logps/chosen": -325.9750061035156, "logps/rejected": -393.0, "loss": 0.5965, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 0.583056628704071, "rewards/margins": 0.7152343988418579, "rewards/rejected": -0.13205567002296448, "step": 1360 }, { "epoch": 0.7221929362150764, "grad_norm": 104.50544468173007, "learning_rate": 8.195835529783869e-07, "logits/chosen": -2.1351561546325684, "logits/rejected": -2.1226563453674316, "logps/chosen": -377.20001220703125, "logps/rejected": -456.3999938964844, "loss": 0.5958, "rewards/accuracies": 0.6875, "rewards/chosen": 0.6120849847793579, "rewards/margins": 0.69677734375, "rewards/rejected": -0.08448486030101776, "step": 1370 }, { "epoch": 0.7274644175013179, "grad_norm": 77.11133874138584, "learning_rate": 8.182656826568265e-07, "logits/chosen": -2.0859375, "logits/rejected": -1.966406226158142, "logps/chosen": -370.6000061035156, "logps/rejected": -441.20001220703125, "loss": 0.5833, "rewards/accuracies": 0.65625, "rewards/chosen": 0.65771484375, "rewards/margins": 0.6843506097793579, "rewards/rejected": -0.02721557579934597, "step": 1380 }, { "epoch": 0.7327358987875593, "grad_norm": 133.20546679837773, "learning_rate": 8.169478123352662e-07, "logits/chosen": -2.1097655296325684, "logits/rejected": -2.119140625, "logps/chosen": -333.95001220703125, "logps/rejected": -428.29998779296875, "loss": 0.5888, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 0.5220702886581421, "rewards/margins": 0.7606445550918579, "rewards/rejected": -0.23820801079273224, "step": 1390 }, { "epoch": 0.7380073800738007, "grad_norm": 107.845125267086, "learning_rate": 8.156299420137058e-07, "logits/chosen": -1.965234398841858, "logits/rejected": -2.0015625953674316, "logps/chosen": -320.0249938964844, "logps/rejected": -379.20001220703125, "loss": 0.5702, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 0.8310912847518921, "rewards/margins": 0.7427734136581421, "rewards/rejected": 0.08966674655675888, "step": 1400 }, { "epoch": 0.7432788613600422, "grad_norm": 83.18068738417722, "learning_rate": 8.143120716921455e-07, "logits/chosen": -2.167187452316284, "logits/rejected": -2.0121092796325684, "logps/chosen": -337.67498779296875, "logps/rejected": -420.8999938964844, "loss": 0.5068, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": 0.7772461175918579, "rewards/margins": 1.005273461341858, "rewards/rejected": -0.22768554091453552, "step": 1410 }, { "epoch": 0.7485503426462836, "grad_norm": 94.12978306108211, "learning_rate": 8.129942013705851e-07, "logits/chosen": -2.0894532203674316, "logits/rejected": -2.0218749046325684, "logps/chosen": -288.79998779296875, "logps/rejected": -372.3999938964844, "loss": 0.5988, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 0.5522094964981079, "rewards/margins": 0.8575439453125, "rewards/rejected": -0.3055419921875, "step": 1420 }, { "epoch": 0.7538218239325251, "grad_norm": 116.10255143110088, "learning_rate": 8.116763310490248e-07, "logits/chosen": -2.040234327316284, "logits/rejected": -2.018359422683716, "logps/chosen": -373.75, "logps/rejected": -424.8999938964844, "loss": 0.6151, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 0.45228272676467896, "rewards/margins": 0.6942138671875, "rewards/rejected": -0.24082031846046448, "step": 1430 }, { "epoch": 0.7590933052187665, "grad_norm": 104.44703845403521, "learning_rate": 8.103584607274644e-07, "logits/chosen": -2.001171827316284, "logits/rejected": -1.876953125, "logps/chosen": -331.04998779296875, "logps/rejected": -387.04998779296875, "loss": 0.5784, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 0.6857665777206421, "rewards/margins": 0.920166015625, "rewards/rejected": -0.23395995795726776, "step": 1440 }, { "epoch": 0.7643647865050079, "grad_norm": 100.67821545170753, "learning_rate": 8.09040590405904e-07, "logits/chosen": -1.965234398841858, "logits/rejected": -1.88671875, "logps/chosen": -324.3500061035156, "logps/rejected": -416.3500061035156, "loss": 0.5442, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 0.8708251714706421, "rewards/margins": 0.9588378667831421, "rewards/rejected": -0.08785400539636612, "step": 1450 }, { "epoch": 0.7696362677912494, "grad_norm": 77.84496594018333, "learning_rate": 8.077227200843436e-07, "logits/chosen": -1.8894531726837158, "logits/rejected": -1.871484398841858, "logps/chosen": -310.0, "logps/rejected": -360.6000061035156, "loss": 0.5005, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 1.111914038658142, "rewards/margins": 0.926831066608429, "rewards/rejected": 0.185821533203125, "step": 1460 }, { "epoch": 0.7749077490774908, "grad_norm": 65.38086163416301, "learning_rate": 8.064048497627834e-07, "logits/chosen": -1.943750023841858, "logits/rejected": -1.869531273841858, "logps/chosen": -296.1000061035156, "logps/rejected": -393.0, "loss": 0.54, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 0.966796875, "rewards/margins": 1.02685546875, "rewards/rejected": -0.06040038913488388, "step": 1470 }, { "epoch": 0.7801792303637322, "grad_norm": 78.80863643503365, "learning_rate": 8.05086979441223e-07, "logits/chosen": -2.0179686546325684, "logits/rejected": -1.9578125476837158, "logps/chosen": -355.1000061035156, "logps/rejected": -414.20001220703125, "loss": 0.5667, "rewards/accuracies": 0.706250011920929, "rewards/chosen": 0.837963879108429, "rewards/margins": 0.7451171875, "rewards/rejected": 0.09210205078125, "step": 1480 }, { "epoch": 0.7854507116499736, "grad_norm": 109.96074460165426, "learning_rate": 8.037691091196626e-07, "logits/chosen": -1.9796874523162842, "logits/rejected": -1.9386718273162842, "logps/chosen": -308.8999938964844, "logps/rejected": -370.45001220703125, "loss": 0.5773, "rewards/accuracies": 0.71875, "rewards/chosen": 0.9875732660293579, "rewards/margins": 0.7423095703125, "rewards/rejected": 0.24583740532398224, "step": 1490 }, { "epoch": 0.790722192936215, "grad_norm": 108.42671260750653, "learning_rate": 8.024512387981023e-07, "logits/chosen": -2.067578077316284, "logits/rejected": -1.9910156726837158, "logps/chosen": -304.67498779296875, "logps/rejected": -372.54998779296875, "loss": 0.6728, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 1.046484351158142, "rewards/margins": 0.44465333223342896, "rewards/rejected": 0.602038562297821, "step": 1500 }, { "epoch": 0.7959936742224565, "grad_norm": 90.89398651643454, "learning_rate": 8.011333684765419e-07, "logits/chosen": -1.9539062976837158, "logits/rejected": -1.8917968273162842, "logps/chosen": -316.5, "logps/rejected": -439.3500061035156, "loss": 0.5503, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 0.594805896282196, "rewards/margins": 0.9115234613418579, "rewards/rejected": -0.31593626737594604, "step": 1510 }, { "epoch": 0.8012651555086979, "grad_norm": 106.01945871925513, "learning_rate": 7.998154981549815e-07, "logits/chosen": -2.092578172683716, "logits/rejected": -2.0394530296325684, "logps/chosen": -347.75, "logps/rejected": -396.95001220703125, "loss": 0.5801, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.6507018804550171, "rewards/margins": 0.9134277105331421, "rewards/rejected": -0.263092041015625, "step": 1520 }, { "epoch": 0.8065366367949394, "grad_norm": 134.13212073426084, "learning_rate": 7.984976278334212e-07, "logits/chosen": -1.900781273841858, "logits/rejected": -1.964453101158142, "logps/chosen": -329.0249938964844, "logps/rejected": -380.29998779296875, "loss": 0.5799, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 0.864636242389679, "rewards/margins": 0.750927746295929, "rewards/rejected": 0.1142578125, "step": 1530 }, { "epoch": 0.8118081180811808, "grad_norm": 109.18550884808079, "learning_rate": 7.971797575118609e-07, "logits/chosen": -2.003124952316284, "logits/rejected": -2.0121092796325684, "logps/chosen": -296.5, "logps/rejected": -389.20001220703125, "loss": 0.597, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.885302722454071, "rewards/margins": 0.6842041015625, "rewards/rejected": 0.2008056640625, "step": 1540 }, { "epoch": 0.8170795993674222, "grad_norm": 108.03949595929659, "learning_rate": 7.958618871903005e-07, "logits/chosen": -1.966796875, "logits/rejected": -1.935937523841858, "logps/chosen": -348.20001220703125, "logps/rejected": -398.95001220703125, "loss": 0.5539, "rewards/accuracies": 0.6875, "rewards/chosen": 0.864208996295929, "rewards/margins": 0.8916015625, "rewards/rejected": -0.02680664137005806, "step": 1550 }, { "epoch": 0.8223510806536637, "grad_norm": 138.77894930973844, "learning_rate": 7.9454401686874e-07, "logits/chosen": -1.969140648841858, "logits/rejected": -1.921875, "logps/chosen": -266.70001220703125, "logps/rejected": -345.95001220703125, "loss": 0.5903, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 0.833251953125, "rewards/margins": 0.7995361089706421, "rewards/rejected": 0.03402099758386612, "step": 1560 }, { "epoch": 0.8276225619399051, "grad_norm": 127.8243155027599, "learning_rate": 7.932261465471796e-07, "logits/chosen": -2.03515625, "logits/rejected": -1.91796875, "logps/chosen": -318.8999938964844, "logps/rejected": -384.75, "loss": 0.6009, "rewards/accuracies": 0.65625, "rewards/chosen": 0.8472656011581421, "rewards/margins": 0.764892578125, "rewards/rejected": 0.08311767876148224, "step": 1570 }, { "epoch": 0.8328940432261466, "grad_norm": 117.42385503625259, "learning_rate": 7.919082762256194e-07, "logits/chosen": -2.026562452316284, "logits/rejected": -1.927343726158142, "logps/chosen": -349.75, "logps/rejected": -424.20001220703125, "loss": 0.5644, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 1.0546143054962158, "rewards/margins": 0.901171863079071, "rewards/rejected": 0.15296630561351776, "step": 1580 }, { "epoch": 0.838165524512388, "grad_norm": 103.05286137007681, "learning_rate": 7.90590405904059e-07, "logits/chosen": -1.964453101158142, "logits/rejected": -1.953515648841858, "logps/chosen": -342.3500061035156, "logps/rejected": -436.1000061035156, "loss": 0.5412, "rewards/accuracies": 0.6875, "rewards/chosen": 1.044677734375, "rewards/margins": 0.8407226800918579, "rewards/rejected": 0.203125, "step": 1590 }, { "epoch": 0.8434370057986295, "grad_norm": 98.88530764663383, "learning_rate": 7.892725355824986e-07, "logits/chosen": -2.039843797683716, "logits/rejected": -2.0960936546325684, "logps/chosen": -314.8500061035156, "logps/rejected": -425.25, "loss": 0.5232, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": 0.863720715045929, "rewards/margins": 0.928149402141571, "rewards/rejected": -0.06477050483226776, "step": 1600 }, { "epoch": 0.8487084870848709, "grad_norm": 99.8047153759348, "learning_rate": 7.879546652609382e-07, "logits/chosen": -2.158203125, "logits/rejected": -2.116406202316284, "logps/chosen": -305.7749938964844, "logps/rejected": -367.8500061035156, "loss": 0.6646, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": 0.7154541015625, "rewards/margins": 0.5922302007675171, "rewards/rejected": 0.12327881157398224, "step": 1610 }, { "epoch": 0.8539799683711122, "grad_norm": 118.52137832951159, "learning_rate": 7.866367949393779e-07, "logits/chosen": -2.0999999046325684, "logits/rejected": -2.07421875, "logps/chosen": -345.70001220703125, "logps/rejected": -410.25, "loss": 0.6237, "rewards/accuracies": 0.6875, "rewards/chosen": 0.692309558391571, "rewards/margins": 0.594281017780304, "rewards/rejected": 0.09797362983226776, "step": 1620 }, { "epoch": 0.8592514496573537, "grad_norm": 99.98810822458712, "learning_rate": 7.853189246178175e-07, "logits/chosen": -2.2144532203674316, "logits/rejected": -2.1957030296325684, "logps/chosen": -308.7250061035156, "logps/rejected": -361.5, "loss": 0.506, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.934814453125, "rewards/margins": 0.861560046672821, "rewards/rejected": 0.07305908203125, "step": 1630 }, { "epoch": 0.8645229309435951, "grad_norm": 78.15628664181878, "learning_rate": 7.840010542962572e-07, "logits/chosen": -2.140625, "logits/rejected": -2.053515672683716, "logps/chosen": -311.5, "logps/rejected": -397.95001220703125, "loss": 0.6358, "rewards/accuracies": 0.625, "rewards/chosen": 0.5571533441543579, "rewards/margins": 0.622314453125, "rewards/rejected": -0.06495971977710724, "step": 1640 }, { "epoch": 0.8697944122298366, "grad_norm": 71.8579645191711, "learning_rate": 7.826831839746969e-07, "logits/chosen": -1.9074218273162842, "logits/rejected": -1.9093749523162842, "logps/chosen": -319.625, "logps/rejected": -390.6000061035156, "loss": 0.5309, "rewards/accuracies": 0.6875, "rewards/chosen": 0.641162097454071, "rewards/margins": 0.926025390625, "rewards/rejected": -0.2849975526332855, "step": 1650 }, { "epoch": 0.875065893516078, "grad_norm": 116.88296195986695, "learning_rate": 7.813653136531365e-07, "logits/chosen": -2.1253905296325684, "logits/rejected": -2.0160155296325684, "logps/chosen": -326.45001220703125, "logps/rejected": -429.6000061035156, "loss": 0.5863, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 0.4993652403354645, "rewards/margins": 0.8038330078125, "rewards/rejected": -0.30485230684280396, "step": 1660 }, { "epoch": 0.8803373748023194, "grad_norm": 82.86187932020965, "learning_rate": 7.800474433315761e-07, "logits/chosen": -2.104687452316284, "logits/rejected": -2.110546827316284, "logps/chosen": -293.9750061035156, "logps/rejected": -357.5, "loss": 0.5092, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": 0.726318359375, "rewards/margins": 0.8506835699081421, "rewards/rejected": -0.1241455078125, "step": 1670 }, { "epoch": 0.8856088560885609, "grad_norm": 124.43899888018004, "learning_rate": 7.787295730100157e-07, "logits/chosen": -2.0374999046325684, "logits/rejected": -2.1011719703674316, "logps/chosen": -314.45001220703125, "logps/rejected": -365.79998779296875, "loss": 0.6277, "rewards/accuracies": 0.65625, "rewards/chosen": 0.633105456829071, "rewards/margins": 0.6467345952987671, "rewards/rejected": -0.014294433407485485, "step": 1680 }, { "epoch": 0.8908803373748023, "grad_norm": 114.94696465434112, "learning_rate": 7.774117026884554e-07, "logits/chosen": -2.0757813453674316, "logits/rejected": -2.1171875, "logps/chosen": -355.6000061035156, "logps/rejected": -440.79998779296875, "loss": 0.5661, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.431884765625, "rewards/margins": 0.962646484375, "rewards/rejected": -0.530322253704071, "step": 1690 }, { "epoch": 0.8961518186610438, "grad_norm": 118.6929315957764, "learning_rate": 7.760938323668951e-07, "logits/chosen": -1.983984351158142, "logits/rejected": -2.026562452316284, "logps/chosen": -378.3500061035156, "logps/rejected": -431.95001220703125, "loss": 0.616, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": 0.37553709745407104, "rewards/margins": 0.799121081829071, "rewards/rejected": -0.42425233125686646, "step": 1700 }, { "epoch": 0.9014232999472852, "grad_norm": 137.03883312358334, "learning_rate": 7.747759620453347e-07, "logits/chosen": -2.0999999046325684, "logits/rejected": -2.049609422683716, "logps/chosen": -325.04998779296875, "logps/rejected": -389.54998779296875, "loss": 0.605, "rewards/accuracies": 0.6875, "rewards/chosen": 0.45159912109375, "rewards/margins": 0.791259765625, "rewards/rejected": -0.3397216796875, "step": 1710 }, { "epoch": 0.9066947812335266, "grad_norm": 113.42543449057213, "learning_rate": 7.734580917237743e-07, "logits/chosen": -1.955078125, "logits/rejected": -1.9914062023162842, "logps/chosen": -369.8500061035156, "logps/rejected": -422.8999938964844, "loss": 0.6313, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.41752928495407104, "rewards/margins": 0.722705066204071, "rewards/rejected": -0.3052734434604645, "step": 1720 }, { "epoch": 0.9119662625197681, "grad_norm": 105.60199015159802, "learning_rate": 7.72140221402214e-07, "logits/chosen": -2.076953172683716, "logits/rejected": -2.015625, "logps/chosen": -348.3999938964844, "logps/rejected": -411.8999938964844, "loss": 0.5981, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 0.7818603515625, "rewards/margins": 0.6983398199081421, "rewards/rejected": 0.08333130180835724, "step": 1730 }, { "epoch": 0.9172377438060095, "grad_norm": 81.36664758870008, "learning_rate": 7.708223510806536e-07, "logits/chosen": -2.069531202316284, "logits/rejected": -2.0132813453674316, "logps/chosen": -317.20001220703125, "logps/rejected": -395.45001220703125, "loss": 0.5603, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 0.912353515625, "rewards/margins": 0.65362548828125, "rewards/rejected": 0.25822752714157104, "step": 1740 }, { "epoch": 0.922509225092251, "grad_norm": 98.60145087911184, "learning_rate": 7.695044807590932e-07, "logits/chosen": -2.1597657203674316, "logits/rejected": -2.1273436546325684, "logps/chosen": -367.79998779296875, "logps/rejected": -380.45001220703125, "loss": 0.5646, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.6644347906112671, "rewards/margins": 0.757092297077179, "rewards/rejected": -0.09322509914636612, "step": 1750 }, { "epoch": 0.9277807063784923, "grad_norm": 105.65515209599086, "learning_rate": 7.68186610437533e-07, "logits/chosen": -2.127734422683716, "logits/rejected": -1.991796851158142, "logps/chosen": -346.6499938964844, "logps/rejected": -416.79998779296875, "loss": 0.6731, "rewards/accuracies": 0.625, "rewards/chosen": 0.43168944120407104, "rewards/margins": 0.44316405057907104, "rewards/rejected": -0.01113281212747097, "step": 1760 }, { "epoch": 0.9330521876647337, "grad_norm": 80.97999914154865, "learning_rate": 7.668687401159726e-07, "logits/chosen": -2.098828077316284, "logits/rejected": -2.0374999046325684, "logps/chosen": -328.5, "logps/rejected": -378.1000061035156, "loss": 0.6428, "rewards/accuracies": 0.65625, "rewards/chosen": 0.53729248046875, "rewards/margins": 0.576403796672821, "rewards/rejected": -0.039093017578125, "step": 1770 }, { "epoch": 0.9383236689509752, "grad_norm": 97.85319855484059, "learning_rate": 7.655508697944122e-07, "logits/chosen": -2.0621094703674316, "logits/rejected": -2.0367188453674316, "logps/chosen": -337.70001220703125, "logps/rejected": -365.1499938964844, "loss": 0.5994, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.5945800542831421, "rewards/margins": 0.7104247808456421, "rewards/rejected": -0.11535034328699112, "step": 1780 }, { "epoch": 0.9435951502372166, "grad_norm": 88.00982400561796, "learning_rate": 7.642329994728518e-07, "logits/chosen": -2.077343702316284, "logits/rejected": -2.05859375, "logps/chosen": -340.1499938964844, "logps/rejected": -398.3999938964844, "loss": 0.5094, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.734179675579071, "rewards/margins": 0.836865246295929, "rewards/rejected": -0.10235595703125, "step": 1790 }, { "epoch": 0.9488666315234581, "grad_norm": 86.69055438705648, "learning_rate": 7.629151291512915e-07, "logits/chosen": -2.0296874046325684, "logits/rejected": -2.0218749046325684, "logps/chosen": -347.67498779296875, "logps/rejected": -380.54998779296875, "loss": 0.6, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 0.63018798828125, "rewards/margins": 0.697265625, "rewards/rejected": -0.06746826320886612, "step": 1800 }, { "epoch": 0.9541381128096995, "grad_norm": 93.20928521489407, "learning_rate": 7.615972588297312e-07, "logits/chosen": -2.002734422683716, "logits/rejected": -1.9425780773162842, "logps/chosen": -327.3500061035156, "logps/rejected": -389.95001220703125, "loss": 0.5859, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.634228527545929, "rewards/margins": 0.605883777141571, "rewards/rejected": 0.02839355543255806, "step": 1810 }, { "epoch": 0.959409594095941, "grad_norm": 121.34649580738896, "learning_rate": 7.602793885081708e-07, "logits/chosen": -1.9753906726837158, "logits/rejected": -1.976171851158142, "logps/chosen": -299.70001220703125, "logps/rejected": -368.5, "loss": 0.5601, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 0.7747802734375, "rewards/margins": 0.758251965045929, "rewards/rejected": 0.01716308668255806, "step": 1820 }, { "epoch": 0.9646810753821824, "grad_norm": 77.06835129625065, "learning_rate": 7.589615181866104e-07, "logits/chosen": -2.0023436546325684, "logits/rejected": -1.869140625, "logps/chosen": -312.3500061035156, "logps/rejected": -418.54998779296875, "loss": 0.5911, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 0.530468761920929, "rewards/margins": 0.713977038860321, "rewards/rejected": -0.18430176377296448, "step": 1830 }, { "epoch": 0.9699525566684238, "grad_norm": 94.86262304242426, "learning_rate": 7.576436478650501e-07, "logits/chosen": -1.9308593273162842, "logits/rejected": -1.8585937023162842, "logps/chosen": -265.625, "logps/rejected": -350.6000061035156, "loss": 0.5951, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.5582275390625, "rewards/margins": 0.838946521282196, "rewards/rejected": -0.280679315328598, "step": 1840 }, { "epoch": 0.9752240379546653, "grad_norm": 73.6377294961965, "learning_rate": 7.563257775434897e-07, "logits/chosen": -1.9929687976837158, "logits/rejected": -1.982421875, "logps/chosen": -310.54998779296875, "logps/rejected": -408.3999938964844, "loss": 0.5177, "rewards/accuracies": 0.71875, "rewards/chosen": 0.38884276151657104, "rewards/margins": 0.954418957233429, "rewards/rejected": -0.5650390386581421, "step": 1850 }, { "epoch": 0.9804955192409067, "grad_norm": 139.93277964980226, "learning_rate": 7.550079072219293e-07, "logits/chosen": -2.089062452316284, "logits/rejected": -2.0546875, "logps/chosen": -383.29998779296875, "logps/rejected": -415.6499938964844, "loss": 0.5865, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.10722656548023224, "rewards/margins": 0.663891613483429, "rewards/rejected": -0.5562957525253296, "step": 1860 }, { "epoch": 0.9857670005271482, "grad_norm": 109.07608613653656, "learning_rate": 7.53690036900369e-07, "logits/chosen": -2.119921922683716, "logits/rejected": -2.0785155296325684, "logps/chosen": -320.2250061035156, "logps/rejected": -370.04998779296875, "loss": 0.6398, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.5160278081893921, "rewards/margins": 0.4950927793979645, "rewards/rejected": 0.02058105543255806, "step": 1870 }, { "epoch": 0.9910384818133896, "grad_norm": 103.62713426599905, "learning_rate": 7.523721665788087e-07, "logits/chosen": -2.100781202316284, "logits/rejected": -2.0796875953674316, "logps/chosen": -306.1499938964844, "logps/rejected": -398.8999938964844, "loss": 0.5294, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 0.7283935546875, "rewards/margins": 0.8792968988418579, "rewards/rejected": -0.14984130859375, "step": 1880 }, { "epoch": 0.996309963099631, "grad_norm": 122.27530680008337, "learning_rate": 7.510542962572483e-07, "logits/chosen": -2.0835938453674316, "logits/rejected": -2.00390625, "logps/chosen": -332.54998779296875, "logps/rejected": -406.79998779296875, "loss": 0.537, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": 0.6012817621231079, "rewards/margins": 0.832080066204071, "rewards/rejected": -0.23064574599266052, "step": 1890 }, { "epoch": 1.0015814443858724, "grad_norm": 35.499703476365994, "learning_rate": 7.497364259356879e-07, "logits/chosen": -2.065234422683716, "logits/rejected": -2.0589842796325684, "logps/chosen": -298.45001220703125, "logps/rejected": -363.8500061035156, "loss": 0.4238, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": 0.81634122133255, "rewards/margins": 1.312841773033142, "rewards/rejected": -0.49609375, "step": 1900 }, { "epoch": 1.006852925672114, "grad_norm": 41.85609467575387, "learning_rate": 7.484185556141276e-07, "logits/chosen": -2.162109375, "logits/rejected": -2.0804686546325684, "logps/chosen": -341.95001220703125, "logps/rejected": -435.0, "loss": 0.1926, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 1.4636719226837158, "rewards/margins": 2.388671875, "rewards/rejected": -0.923431396484375, "step": 1910 }, { "epoch": 1.0121244069583553, "grad_norm": 37.913391248354145, "learning_rate": 7.471006852925671e-07, "logits/chosen": -2.0511717796325684, "logits/rejected": -2.083203077316284, "logps/chosen": -298.70001220703125, "logps/rejected": -412.75, "loss": 0.2011, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 1.048828125, "rewards/margins": 2.4281249046325684, "rewards/rejected": -1.378393530845642, "step": 1920 }, { "epoch": 1.0173958882445968, "grad_norm": 31.83459414539412, "learning_rate": 7.457828149710068e-07, "logits/chosen": -2.130859375, "logits/rejected": -1.9714844226837158, "logps/chosen": -334.79998779296875, "logps/rejected": -436.45001220703125, "loss": 0.2101, "rewards/accuracies": 0.918749988079071, "rewards/chosen": 0.44647216796875, "rewards/margins": 2.5328125953674316, "rewards/rejected": -2.0863280296325684, "step": 1930 }, { "epoch": 1.0226673695308381, "grad_norm": 49.88787157081552, "learning_rate": 7.444649446494464e-07, "logits/chosen": -2.185546875, "logits/rejected": -2.114453077316284, "logps/chosen": -343.7749938964844, "logps/rejected": -410.6499938964844, "loss": 0.1768, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.639331042766571, "rewards/margins": 2.5335936546325684, "rewards/rejected": -1.8953125476837158, "step": 1940 }, { "epoch": 1.0279388508170797, "grad_norm": 24.376454525794088, "learning_rate": 7.431470743278861e-07, "logits/chosen": -2.117968797683716, "logits/rejected": -2.100781202316284, "logps/chosen": -283.04998779296875, "logps/rejected": -371.75, "loss": 0.2007, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 0.777050793170929, "rewards/margins": 2.569140672683716, "rewards/rejected": -1.795019507408142, "step": 1950 }, { "epoch": 1.033210332103321, "grad_norm": 64.13779553652809, "learning_rate": 7.418292040063257e-07, "logits/chosen": -2.2125000953674316, "logits/rejected": -2.186718702316284, "logps/chosen": -308.0, "logps/rejected": -405.3500061035156, "loss": 0.2015, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 0.8943847417831421, "rewards/margins": 2.5042967796325684, "rewards/rejected": -1.6100585460662842, "step": 1960 }, { "epoch": 1.0384818133895624, "grad_norm": 54.50987768318855, "learning_rate": 7.405113336847653e-07, "logits/chosen": -2.1285157203674316, "logits/rejected": -2.0492186546325684, "logps/chosen": -335.5, "logps/rejected": -407.3999938964844, "loss": 0.1667, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 0.866406261920929, "rewards/margins": 2.8218750953674316, "rewards/rejected": -1.9563477039337158, "step": 1970 }, { "epoch": 1.043753294675804, "grad_norm": 41.62701904002389, "learning_rate": 7.39193463363205e-07, "logits/chosen": -2.2054686546325684, "logits/rejected": -2.173046827316284, "logps/chosen": -342.5, "logps/rejected": -473.8500061035156, "loss": 0.1754, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.8367919921875, "rewards/margins": 3.171875, "rewards/rejected": -2.338085889816284, "step": 1980 }, { "epoch": 1.0490247759620452, "grad_norm": 52.01700177824062, "learning_rate": 7.378755930416447e-07, "logits/chosen": -2.348437547683716, "logits/rejected": -2.2503905296325684, "logps/chosen": -320.375, "logps/rejected": -381.95001220703125, "loss": 0.1874, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.9670654535293579, "rewards/margins": 2.5777344703674316, "rewards/rejected": -1.6116821765899658, "step": 1990 }, { "epoch": 1.0542962572482868, "grad_norm": 35.97136992787203, "learning_rate": 7.365577227200843e-07, "logits/chosen": -2.4007811546325684, "logits/rejected": -2.3734374046325684, "logps/chosen": -353.0, "logps/rejected": -415.6000061035156, "loss": 0.215, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.5303955078125, "rewards/margins": 2.689453125, "rewards/rejected": -2.158203125, "step": 2000 }, { "epoch": 1.0595677385345281, "grad_norm": 31.500851867031713, "learning_rate": 7.352398523985239e-07, "logits/chosen": -2.2671875953674316, "logits/rejected": -2.1539063453674316, "logps/chosen": -270.29998779296875, "logps/rejected": -389.45001220703125, "loss": 0.1906, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.685137927532196, "rewards/margins": 2.602343797683716, "rewards/rejected": -1.91796875, "step": 2010 }, { "epoch": 1.0648392198207697, "grad_norm": 70.66856501567634, "learning_rate": 7.339219820769635e-07, "logits/chosen": -2.2710938453674316, "logits/rejected": -2.1859374046325684, "logps/chosen": -317.70001220703125, "logps/rejected": -429.5, "loss": 0.2051, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": 0.4110961854457855, "rewards/margins": 2.673046827316284, "rewards/rejected": -2.261523485183716, "step": 2020 }, { "epoch": 1.070110701107011, "grad_norm": 54.237259083654, "learning_rate": 7.326041117554032e-07, "logits/chosen": -2.298046827316284, "logits/rejected": -2.176953077316284, "logps/chosen": -322.6499938964844, "logps/rejected": -436.54998779296875, "loss": 0.1786, "rewards/accuracies": 0.9375, "rewards/chosen": -0.04555664211511612, "rewards/margins": 2.778515577316284, "rewards/rejected": -2.8246092796325684, "step": 2030 }, { "epoch": 1.0753821823932526, "grad_norm": 44.531574174128174, "learning_rate": 7.312862414338429e-07, "logits/chosen": -2.268359422683716, "logits/rejected": -2.3199219703674316, "logps/chosen": -371.1000061035156, "logps/rejected": -453.29998779296875, "loss": 0.1923, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -0.0150146484375, "rewards/margins": 2.788281202316284, "rewards/rejected": -2.803906202316284, "step": 2040 }, { "epoch": 1.080653663679494, "grad_norm": 49.01383494968416, "learning_rate": 7.299683711122825e-07, "logits/chosen": -2.1226563453674316, "logits/rejected": -2.064453125, "logps/chosen": -314.75, "logps/rejected": -422.29998779296875, "loss": 0.2062, "rewards/accuracies": 0.918749988079071, "rewards/chosen": 0.36663818359375, "rewards/margins": 2.9007811546325684, "rewards/rejected": -2.5335936546325684, "step": 2050 }, { "epoch": 1.0859251449657354, "grad_norm": 48.78892372160223, "learning_rate": 7.286505007907222e-07, "logits/chosen": -2.247265577316284, "logits/rejected": -2.221484422683716, "logps/chosen": -331.0249938964844, "logps/rejected": -393.8999938964844, "loss": 0.184, "rewards/accuracies": 0.9375, "rewards/chosen": 0.5553222894668579, "rewards/margins": 2.647265672683716, "rewards/rejected": -2.0894532203674316, "step": 2060 }, { "epoch": 1.0911966262519768, "grad_norm": 32.84588202457111, "learning_rate": 7.273326304691618e-07, "logits/chosen": -2.292187452316284, "logits/rejected": -2.1917967796325684, "logps/chosen": -342.75, "logps/rejected": -443.5, "loss": 0.1723, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.923144519329071, "rewards/margins": 2.8218750953674316, "rewards/rejected": -1.8976562023162842, "step": 2070 }, { "epoch": 1.0964681075382183, "grad_norm": 39.83766249514471, "learning_rate": 7.260147601476014e-07, "logits/chosen": -2.266796827316284, "logits/rejected": -2.2847657203674316, "logps/chosen": -332.8500061035156, "logps/rejected": -412.75, "loss": 0.1537, "rewards/accuracies": 0.96875, "rewards/chosen": 1.176629662513733, "rewards/margins": 2.801562547683716, "rewards/rejected": -1.6255614757537842, "step": 2080 }, { "epoch": 1.1017395888244597, "grad_norm": 36.56162948763821, "learning_rate": 7.24696889826041e-07, "logits/chosen": -2.2554688453674316, "logits/rejected": -2.182421922683716, "logps/chosen": -326.75, "logps/rejected": -411.25, "loss": 0.2078, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.998242199420929, "rewards/margins": 2.5386719703674316, "rewards/rejected": -1.539648413658142, "step": 2090 }, { "epoch": 1.1070110701107012, "grad_norm": 39.51536578069774, "learning_rate": 7.233790195044808e-07, "logits/chosen": -2.231250047683716, "logits/rejected": -2.197265625, "logps/chosen": -357.75, "logps/rejected": -434.79998779296875, "loss": 0.2238, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": 0.7587890625, "rewards/margins": 2.8531250953674316, "rewards/rejected": -2.0923829078674316, "step": 2100 }, { "epoch": 1.1122825513969425, "grad_norm": 27.571668228579988, "learning_rate": 7.220611491829204e-07, "logits/chosen": -2.147656202316284, "logits/rejected": -2.1070313453674316, "logps/chosen": -333.54998779296875, "logps/rejected": -422.8999938964844, "loss": 0.1492, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.854931652545929, "rewards/margins": 2.953125, "rewards/rejected": -2.0982422828674316, "step": 2110 }, { "epoch": 1.1175540326831839, "grad_norm": 38.008164134049515, "learning_rate": 7.2074327886136e-07, "logits/chosen": -2.369140625, "logits/rejected": -2.2894530296325684, "logps/chosen": -296.8500061035156, "logps/rejected": -409.54998779296875, "loss": 0.1926, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.046289086341858, "rewards/margins": 2.714062452316284, "rewards/rejected": -1.667578101158142, "step": 2120 }, { "epoch": 1.1228255139694254, "grad_norm": 76.68917241793791, "learning_rate": 7.194254085397996e-07, "logits/chosen": -2.313671827316284, "logits/rejected": -2.220703125, "logps/chosen": -315.1499938964844, "logps/rejected": -433.25, "loss": 0.1454, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 0.9251953363418579, "rewards/margins": 3.0433592796325684, "rewards/rejected": -2.1167969703674316, "step": 2130 }, { "epoch": 1.1280969952556668, "grad_norm": 51.55597649648496, "learning_rate": 7.181075382182393e-07, "logits/chosen": -2.3656249046325684, "logits/rejected": -2.3199219703674316, "logps/chosen": -286.8500061035156, "logps/rejected": -383.0, "loss": 0.2208, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 0.7372802495956421, "rewards/margins": 2.689453125, "rewards/rejected": -1.9496581554412842, "step": 2140 }, { "epoch": 1.1333684765419083, "grad_norm": 94.47541510890645, "learning_rate": 7.16789667896679e-07, "logits/chosen": -2.267578125, "logits/rejected": -2.225781202316284, "logps/chosen": -331.20001220703125, "logps/rejected": -430.95001220703125, "loss": 0.193, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.295257568359375, "rewards/margins": 2.9384765625, "rewards/rejected": -2.6441407203674316, "step": 2150 }, { "epoch": 1.1386399578281496, "grad_norm": 39.02767272186435, "learning_rate": 7.154717975751186e-07, "logits/chosen": -2.34765625, "logits/rejected": -2.3550782203674316, "logps/chosen": -331.0, "logps/rejected": -407.75, "loss": 0.201, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": 0.3296875059604645, "rewards/margins": 2.80078125, "rewards/rejected": -2.471874952316284, "step": 2160 }, { "epoch": 1.1439114391143912, "grad_norm": 71.91178353557336, "learning_rate": 7.141539272535582e-07, "logits/chosen": -2.37109375, "logits/rejected": -2.274218797683716, "logps/chosen": -314.20001220703125, "logps/rejected": -390.0, "loss": 0.2209, "rewards/accuracies": 0.918749988079071, "rewards/chosen": 0.596728503704071, "rewards/margins": 2.7054686546325684, "rewards/rejected": -2.1083006858825684, "step": 2170 }, { "epoch": 1.1491829204006325, "grad_norm": 28.369267609998772, "learning_rate": 7.128360569319979e-07, "logits/chosen": -2.3671875, "logits/rejected": -2.3109374046325684, "logps/chosen": -261.25, "logps/rejected": -329.79998779296875, "loss": 0.2215, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": 0.965624988079071, "rewards/margins": 2.268749952316284, "rewards/rejected": -1.300195336341858, "step": 2180 }, { "epoch": 1.154454401686874, "grad_norm": 48.0421145764738, "learning_rate": 7.115181866104375e-07, "logits/chosen": -2.186328172683716, "logits/rejected": -2.16015625, "logps/chosen": -289.1499938964844, "logps/rejected": -406.04998779296875, "loss": 0.1863, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 1.043310523033142, "rewards/margins": 2.8707032203674316, "rewards/rejected": -1.828222632408142, "step": 2190 }, { "epoch": 1.1597258829731154, "grad_norm": 43.1759818501818, "learning_rate": 7.102003162888771e-07, "logits/chosen": -2.2347655296325684, "logits/rejected": -2.178515672683716, "logps/chosen": -303.29998779296875, "logps/rejected": -371.25, "loss": 0.2348, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.9415038824081421, "rewards/margins": 2.5582032203674316, "rewards/rejected": -1.615136742591858, "step": 2200 }, { "epoch": 1.164997364259357, "grad_norm": 47.68280374470817, "learning_rate": 7.088824459673169e-07, "logits/chosen": -2.35546875, "logits/rejected": -2.3382811546325684, "logps/chosen": -356.07501220703125, "logps/rejected": -416.45001220703125, "loss": 0.2047, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": 0.69598388671875, "rewards/margins": 2.8570313453674316, "rewards/rejected": -2.160205125808716, "step": 2210 }, { "epoch": 1.1702688455455983, "grad_norm": 40.17228098181883, "learning_rate": 7.075645756457565e-07, "logits/chosen": -2.335156202316284, "logits/rejected": -2.3046875, "logps/chosen": -311.75, "logps/rejected": -356.1000061035156, "loss": 0.2463, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.604736328125, "rewards/margins": 2.292187452316284, "rewards/rejected": -1.6875, "step": 2220 }, { "epoch": 1.1755403268318398, "grad_norm": 33.83183761440477, "learning_rate": 7.062467053241961e-07, "logits/chosen": -2.313281297683716, "logits/rejected": -2.2730469703674316, "logps/chosen": -343.3999938964844, "logps/rejected": -453.3999938964844, "loss": 0.161, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.33203125, "rewards/margins": 3.028515577316284, "rewards/rejected": -2.6957030296325684, "step": 2230 }, { "epoch": 1.1808118081180812, "grad_norm": 34.44709217205047, "learning_rate": 7.049288350026357e-07, "logits/chosen": -2.3578124046325684, "logits/rejected": -2.321484327316284, "logps/chosen": -349.6000061035156, "logps/rejected": -462.79998779296875, "loss": 0.1942, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -0.010388183407485485, "rewards/margins": 3.055859327316284, "rewards/rejected": -3.0660157203674316, "step": 2240 }, { "epoch": 1.1860832894043227, "grad_norm": 83.23258150530064, "learning_rate": 7.036109646810754e-07, "logits/chosen": -2.399609327316284, "logits/rejected": -2.3539061546325684, "logps/chosen": -347.79998779296875, "logps/rejected": -403.29998779296875, "loss": 0.189, "rewards/accuracies": 0.9375, "rewards/chosen": 0.14739379286766052, "rewards/margins": 2.7953124046325684, "rewards/rejected": -2.651562452316284, "step": 2250 }, { "epoch": 1.191354770690564, "grad_norm": 47.516403396768375, "learning_rate": 7.02293094359515e-07, "logits/chosen": -2.295703172683716, "logits/rejected": -2.2925782203674316, "logps/chosen": -348.6499938964844, "logps/rejected": -409.70001220703125, "loss": 0.1987, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.07304076850414276, "rewards/margins": 2.867968797683716, "rewards/rejected": -2.9398436546325684, "step": 2260 }, { "epoch": 1.1966262519768054, "grad_norm": 60.1549819255733, "learning_rate": 7.009752240379547e-07, "logits/chosen": -2.404296875, "logits/rejected": -2.353515625, "logps/chosen": -309.1499938964844, "logps/rejected": -384.6000061035156, "loss": 0.1862, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 0.10422973334789276, "rewards/margins": 2.75390625, "rewards/rejected": -2.6519532203674316, "step": 2270 }, { "epoch": 1.201897733263047, "grad_norm": 46.99056635628824, "learning_rate": 6.996573537163942e-07, "logits/chosen": -2.354687452316284, "logits/rejected": -2.2679686546325684, "logps/chosen": -354.3999938964844, "logps/rejected": -439.8999938964844, "loss": 0.1846, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.05256347730755806, "rewards/margins": 2.807812452316284, "rewards/rejected": -2.860546827316284, "step": 2280 }, { "epoch": 1.2071692145492883, "grad_norm": 31.281143108630392, "learning_rate": 6.98339483394834e-07, "logits/chosen": -2.387890577316284, "logits/rejected": -2.36328125, "logps/chosen": -356.95001220703125, "logps/rejected": -471.79998779296875, "loss": 0.1758, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.28455811738967896, "rewards/margins": 3.132031202316284, "rewards/rejected": -3.416015625, "step": 2290 }, { "epoch": 1.2124406958355298, "grad_norm": 54.97857646694864, "learning_rate": 6.970216130732735e-07, "logits/chosen": -2.3828125, "logits/rejected": -2.3558592796325684, "logps/chosen": -342.29998779296875, "logps/rejected": -431.54998779296875, "loss": 0.219, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.15681762993335724, "rewards/margins": 2.714062452316284, "rewards/rejected": -2.867968797683716, "step": 2300 }, { "epoch": 1.2177121771217712, "grad_norm": 73.0598123888874, "learning_rate": 6.957037427517131e-07, "logits/chosen": -2.3656249046325684, "logits/rejected": -2.326171875, "logps/chosen": -345.1499938964844, "logps/rejected": -463.79998779296875, "loss": 0.1683, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 0.19688110053539276, "rewards/margins": 3.1421875953674316, "rewards/rejected": -2.947265625, "step": 2310 }, { "epoch": 1.2229836584080127, "grad_norm": 54.993151337017075, "learning_rate": 6.943858724301529e-07, "logits/chosen": -2.342578172683716, "logits/rejected": -2.235546827316284, "logps/chosen": -342.6000061035156, "logps/rejected": -432.0, "loss": 0.1739, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 0.2845214903354645, "rewards/margins": 2.973828077316284, "rewards/rejected": -2.6888670921325684, "step": 2320 }, { "epoch": 1.228255139694254, "grad_norm": 68.3176409267328, "learning_rate": 6.930680021085925e-07, "logits/chosen": -2.1953125, "logits/rejected": -2.235156297683716, "logps/chosen": -364.04998779296875, "logps/rejected": -468.20001220703125, "loss": 0.2161, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": 0.4458374083042145, "rewards/margins": 2.9144530296325684, "rewards/rejected": -2.468554735183716, "step": 2330 }, { "epoch": 1.2335266209804956, "grad_norm": 36.057487588652705, "learning_rate": 6.917501317870321e-07, "logits/chosen": -2.3804688453674316, "logits/rejected": -2.264843702316284, "logps/chosen": -331.6499938964844, "logps/rejected": -437.8999938964844, "loss": 0.1738, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 0.577789306640625, "rewards/margins": 3.033984422683716, "rewards/rejected": -2.455078125, "step": 2340 }, { "epoch": 1.238798102266737, "grad_norm": 42.25192699014723, "learning_rate": 6.904322614654717e-07, "logits/chosen": -2.3375000953674316, "logits/rejected": -2.405468702316284, "logps/chosen": -352.6499938964844, "logps/rejected": -461.5, "loss": 0.1336, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.6073242425918579, "rewards/margins": 3.26953125, "rewards/rejected": -2.6644530296325684, "step": 2350 }, { "epoch": 1.2440695835529785, "grad_norm": 54.37872356207909, "learning_rate": 6.891143911439114e-07, "logits/chosen": -2.296875, "logits/rejected": -2.32421875, "logps/chosen": -327.5, "logps/rejected": -413.8500061035156, "loss": 0.1562, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 0.36485594511032104, "rewards/margins": 2.928906202316284, "rewards/rejected": -2.559765577316284, "step": 2360 }, { "epoch": 1.2493410648392198, "grad_norm": 66.31691438980612, "learning_rate": 6.87796520822351e-07, "logits/chosen": -2.3257813453674316, "logits/rejected": -2.275390625, "logps/chosen": -359.5, "logps/rejected": -441.79998779296875, "loss": 0.1777, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.3321533203125, "rewards/margins": 2.755859375, "rewards/rejected": -2.424609422683716, "step": 2370 }, { "epoch": 1.2546125461254611, "grad_norm": 32.869445817099674, "learning_rate": 6.864786505007907e-07, "logits/chosen": -2.342578172683716, "logits/rejected": -2.3394532203674316, "logps/chosen": -376.70001220703125, "logps/rejected": -459.3999938964844, "loss": 0.1649, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.34510499238967896, "rewards/margins": 3.241406202316284, "rewards/rejected": -2.896484375, "step": 2380 }, { "epoch": 1.2598840274117027, "grad_norm": 53.17024930172298, "learning_rate": 6.851607801792303e-07, "logits/chosen": -2.342968702316284, "logits/rejected": -2.391796827316284, "logps/chosen": -373.8999938964844, "logps/rejected": -430.0, "loss": 0.1605, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 0.03339843824505806, "rewards/margins": 3.0054688453674316, "rewards/rejected": -2.9683594703674316, "step": 2390 }, { "epoch": 1.2651555086979442, "grad_norm": 49.928151889430126, "learning_rate": 6.8384290985767e-07, "logits/chosen": -2.3359375, "logits/rejected": -2.3160157203674316, "logps/chosen": -335.95001220703125, "logps/rejected": -384.6000061035156, "loss": 0.1963, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.12416992336511612, "rewards/margins": 2.912890672683716, "rewards/rejected": -3.041015625, "step": 2400 }, { "epoch": 1.2704269899841856, "grad_norm": 41.267372182741, "learning_rate": 6.825250395361096e-07, "logits/chosen": -2.353515625, "logits/rejected": -2.2632813453674316, "logps/chosen": -339.3500061035156, "logps/rejected": -431.3500061035156, "loss": 0.1765, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -0.04427490383386612, "rewards/margins": 2.8863282203674316, "rewards/rejected": -2.926562547683716, "step": 2410 }, { "epoch": 1.275698471270427, "grad_norm": 52.03327882023692, "learning_rate": 6.812071692145492e-07, "logits/chosen": -2.2718749046325684, "logits/rejected": -2.28515625, "logps/chosen": -315.04998779296875, "logps/rejected": -381.0, "loss": 0.2179, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": 0.5525878667831421, "rewards/margins": 2.8783202171325684, "rewards/rejected": -2.3228516578674316, "step": 2420 }, { "epoch": 1.2809699525566685, "grad_norm": 86.00752985666938, "learning_rate": 6.798892988929888e-07, "logits/chosen": -2.341796875, "logits/rejected": -2.3003907203674316, "logps/chosen": -326.57501220703125, "logps/rejected": -436.70001220703125, "loss": 0.1931, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": 0.974560558795929, "rewards/margins": 2.876953125, "rewards/rejected": -1.9025390148162842, "step": 2430 }, { "epoch": 1.2862414338429098, "grad_norm": 39.80714107831882, "learning_rate": 6.785714285714286e-07, "logits/chosen": -2.388671875, "logits/rejected": -2.318359375, "logps/chosen": -335.45001220703125, "logps/rejected": -455.29998779296875, "loss": 0.1881, "rewards/accuracies": 0.9375, "rewards/chosen": 0.9750000238418579, "rewards/margins": 2.982421875, "rewards/rejected": -2.0068359375, "step": 2440 }, { "epoch": 1.2915129151291513, "grad_norm": 52.75683468498168, "learning_rate": 6.772535582498682e-07, "logits/chosen": -2.3843750953674316, "logits/rejected": -2.3003907203674316, "logps/chosen": -348.3999938964844, "logps/rejected": -411.54998779296875, "loss": 0.2228, "rewards/accuracies": 0.90625, "rewards/chosen": 1.0093262195587158, "rewards/margins": 2.794921875, "rewards/rejected": -1.785546898841858, "step": 2450 }, { "epoch": 1.2967843964153927, "grad_norm": 49.55564158352592, "learning_rate": 6.759356879283078e-07, "logits/chosen": -2.2496094703674316, "logits/rejected": -2.2171874046325684, "logps/chosen": -330.1499938964844, "logps/rejected": -427.20001220703125, "loss": 0.1863, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": 0.7317870855331421, "rewards/margins": 3.0191407203674316, "rewards/rejected": -2.2867188453674316, "step": 2460 }, { "epoch": 1.3020558777016342, "grad_norm": 64.397953691408, "learning_rate": 6.746178176067475e-07, "logits/chosen": -2.1695313453674316, "logits/rejected": -2.235546827316284, "logps/chosen": -349.9750061035156, "logps/rejected": -455.29998779296875, "loss": 0.1937, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 0.4492553770542145, "rewards/margins": 3.20703125, "rewards/rejected": -2.758593797683716, "step": 2470 }, { "epoch": 1.3073273589878756, "grad_norm": 45.80157499863999, "learning_rate": 6.732999472851871e-07, "logits/chosen": -2.30078125, "logits/rejected": -2.3160157203674316, "logps/chosen": -316.20001220703125, "logps/rejected": -414.3999938964844, "loss": 0.1871, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": 0.542309582233429, "rewards/margins": 3.055859327316284, "rewards/rejected": -2.5132813453674316, "step": 2480 }, { "epoch": 1.312598840274117, "grad_norm": 61.2162233682351, "learning_rate": 6.719820769636268e-07, "logits/chosen": -2.267578125, "logits/rejected": -2.2847657203674316, "logps/chosen": -317.54998779296875, "logps/rejected": -415.70001220703125, "loss": 0.1969, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": 0.40703123807907104, "rewards/margins": 2.899218797683716, "rewards/rejected": -2.492382764816284, "step": 2490 }, { "epoch": 1.3178703215603584, "grad_norm": 63.26101194444503, "learning_rate": 6.706642066420664e-07, "logits/chosen": -2.23828125, "logits/rejected": -2.1664061546325684, "logps/chosen": -318.70001220703125, "logps/rejected": -462.70001220703125, "loss": 0.1955, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 0.2971252501010895, "rewards/margins": 3.0582032203674316, "rewards/rejected": -2.759765625, "step": 2500 }, { "epoch": 1.3231418028466, "grad_norm": 64.16276727216831, "learning_rate": 6.693463363205061e-07, "logits/chosen": -2.302734375, "logits/rejected": -2.229687452316284, "logps/chosen": -344.1499938964844, "logps/rejected": -434.5, "loss": 0.2259, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 0.22026367485523224, "rewards/margins": 2.7103514671325684, "rewards/rejected": -2.4883790016174316, "step": 2510 }, { "epoch": 1.3284132841328413, "grad_norm": 37.47039645629102, "learning_rate": 6.680284659989457e-07, "logits/chosen": -2.3460936546325684, "logits/rejected": -2.2242188453674316, "logps/chosen": -263.4750061035156, "logps/rejected": -389.95001220703125, "loss": 0.2127, "rewards/accuracies": 0.918749988079071, "rewards/chosen": 0.608154296875, "rewards/margins": 2.662890672683716, "rewards/rejected": -2.052734375, "step": 2520 }, { "epoch": 1.3336847654190827, "grad_norm": 33.718025528010216, "learning_rate": 6.667105956773853e-07, "logits/chosen": -2.274218797683716, "logits/rejected": -2.227734327316284, "logps/chosen": -339.29998779296875, "logps/rejected": -440.3999938964844, "loss": 0.1956, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.31170654296875, "rewards/margins": 2.6957030296325684, "rewards/rejected": -2.3841795921325684, "step": 2530 }, { "epoch": 1.3389562467053242, "grad_norm": 41.00735304788115, "learning_rate": 6.653927253558249e-07, "logits/chosen": -2.284374952316284, "logits/rejected": -2.2269530296325684, "logps/chosen": -343.8500061035156, "logps/rejected": -456.3999938964844, "loss": 0.1771, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 0.520550549030304, "rewards/margins": 3.103515625, "rewards/rejected": -2.579882860183716, "step": 2540 }, { "epoch": 1.3442277279915658, "grad_norm": 32.9766038275084, "learning_rate": 6.640748550342647e-07, "logits/chosen": -2.3148436546325684, "logits/rejected": -2.2734375, "logps/chosen": -307.98748779296875, "logps/rejected": -427.6000061035156, "loss": 0.1893, "rewards/accuracies": 0.9375, "rewards/chosen": 0.5020751953125, "rewards/margins": 2.9105467796325684, "rewards/rejected": -2.4097657203674316, "step": 2550 }, { "epoch": 1.349499209277807, "grad_norm": 49.39829526291435, "learning_rate": 6.627569847127043e-07, "logits/chosen": -2.3441405296325684, "logits/rejected": -2.3902344703674316, "logps/chosen": -375.5, "logps/rejected": -486.5, "loss": 0.1568, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 0.3656249940395355, "rewards/margins": 3.581249952316284, "rewards/rejected": -3.213671922683716, "step": 2560 }, { "epoch": 1.3547706905640484, "grad_norm": 83.11268559218462, "learning_rate": 6.614391143911439e-07, "logits/chosen": -2.391406297683716, "logits/rejected": -2.270703077316284, "logps/chosen": -320.25, "logps/rejected": -413.20001220703125, "loss": 0.2188, "rewards/accuracies": 0.90625, "rewards/chosen": 0.3728271424770355, "rewards/margins": 2.8257813453674316, "rewards/rejected": -2.4505858421325684, "step": 2570 }, { "epoch": 1.36004217185029, "grad_norm": 40.315228005988736, "learning_rate": 6.601212440695835e-07, "logits/chosen": -2.422656297683716, "logits/rejected": -2.413281202316284, "logps/chosen": -334.5, "logps/rejected": -402.4750061035156, "loss": 0.1918, "rewards/accuracies": 0.9375, "rewards/chosen": 0.33623045682907104, "rewards/margins": 2.962109327316284, "rewards/rejected": -2.623242139816284, "step": 2580 }, { "epoch": 1.3653136531365313, "grad_norm": 45.00998934384981, "learning_rate": 6.588033737480232e-07, "logits/chosen": -2.400390625, "logits/rejected": -2.434375047683716, "logps/chosen": -321.75, "logps/rejected": -398.8999938964844, "loss": 0.1633, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.16410522162914276, "rewards/margins": 2.9195313453674316, "rewards/rejected": -2.756640672683716, "step": 2590 }, { "epoch": 1.3705851344227729, "grad_norm": 45.65785115516571, "learning_rate": 6.574855034264628e-07, "logits/chosen": -2.3921875953674316, "logits/rejected": -2.3746094703674316, "logps/chosen": -336.1000061035156, "logps/rejected": -387.79998779296875, "loss": 0.2023, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": 0.1976318359375, "rewards/margins": 2.8394532203674316, "rewards/rejected": -2.641406297683716, "step": 2600 }, { "epoch": 1.3758566157090142, "grad_norm": 48.988295041059715, "learning_rate": 6.561676331049025e-07, "logits/chosen": -2.3570313453674316, "logits/rejected": -2.400390625, "logps/chosen": -336.54998779296875, "logps/rejected": -418.75, "loss": 0.1932, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": 0.5531250238418579, "rewards/margins": 2.983203172683716, "rewards/rejected": -2.4291014671325684, "step": 2610 }, { "epoch": 1.3811280969952557, "grad_norm": 54.322259296616146, "learning_rate": 6.548497627833422e-07, "logits/chosen": -2.278515577316284, "logits/rejected": -2.27734375, "logps/chosen": -307.32501220703125, "logps/rejected": -391.20001220703125, "loss": 0.1937, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 0.7589966058731079, "rewards/margins": 2.819531202316284, "rewards/rejected": -2.0609374046325684, "step": 2620 }, { "epoch": 1.386399578281497, "grad_norm": 55.39424574974257, "learning_rate": 6.535318924617818e-07, "logits/chosen": -2.542187452316284, "logits/rejected": -2.399218797683716, "logps/chosen": -343.3500061035156, "logps/rejected": -433.8999938964844, "loss": 0.2332, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.560864269733429, "rewards/margins": 2.5414061546325684, "rewards/rejected": -1.9792969226837158, "step": 2630 }, { "epoch": 1.3916710595677384, "grad_norm": 44.051947777871995, "learning_rate": 6.522140221402213e-07, "logits/chosen": -2.4351563453674316, "logits/rejected": -2.403125047683716, "logps/chosen": -371.45001220703125, "logps/rejected": -446.0, "loss": 0.1657, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 0.5406554937362671, "rewards/margins": 3.127734422683716, "rewards/rejected": -2.5902342796325684, "step": 2640 }, { "epoch": 1.39694254085398, "grad_norm": 28.525369346655403, "learning_rate": 6.508961518186609e-07, "logits/chosen": -2.397265672683716, "logits/rejected": -2.3148436546325684, "logps/chosen": -311.92498779296875, "logps/rejected": -414.0, "loss": 0.1584, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 0.47813111543655396, "rewards/margins": 3.000781297683716, "rewards/rejected": -2.522265672683716, "step": 2650 }, { "epoch": 1.4022140221402215, "grad_norm": 28.584004987995563, "learning_rate": 6.495782814971007e-07, "logits/chosen": -2.381640672683716, "logits/rejected": -2.3414063453674316, "logps/chosen": -285.75, "logps/rejected": -369.8500061035156, "loss": 0.207, "rewards/accuracies": 0.90625, "rewards/chosen": 0.558673083782196, "rewards/margins": 2.807812452316284, "rewards/rejected": -2.249218702316284, "step": 2660 }, { "epoch": 1.4074855034264628, "grad_norm": 49.95226353263052, "learning_rate": 6.482604111755403e-07, "logits/chosen": -2.352734327316284, "logits/rejected": -2.390625, "logps/chosen": -343.5, "logps/rejected": -401.70001220703125, "loss": 0.163, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.31428831815719604, "rewards/margins": 3.0648436546325684, "rewards/rejected": -2.7476563453674316, "step": 2670 }, { "epoch": 1.4127569847127042, "grad_norm": 31.74211547965931, "learning_rate": 6.469425408539799e-07, "logits/chosen": -2.4164061546325684, "logits/rejected": -2.396484375, "logps/chosen": -313.54998779296875, "logps/rejected": -403.3999938964844, "loss": 0.2142, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.10651855170726776, "rewards/margins": 2.82421875, "rewards/rejected": -2.719531297683716, "step": 2680 }, { "epoch": 1.4180284659989457, "grad_norm": 66.9896850612038, "learning_rate": 6.456246705324195e-07, "logits/chosen": -2.4976563453674316, "logits/rejected": -2.3531250953674316, "logps/chosen": -297.04998779296875, "logps/rejected": -403.25, "loss": 0.1934, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": 0.146759033203125, "rewards/margins": 2.928515672683716, "rewards/rejected": -2.780566453933716, "step": 2690 }, { "epoch": 1.4232999472851873, "grad_norm": 35.602300584926724, "learning_rate": 6.443068002108592e-07, "logits/chosen": -2.4765625, "logits/rejected": -2.46875, "logps/chosen": -342.1000061035156, "logps/rejected": -423.95001220703125, "loss": 0.1661, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.2555175721645355, "rewards/margins": 2.901171922683716, "rewards/rejected": -2.643359422683716, "step": 2700 }, { "epoch": 1.4285714285714286, "grad_norm": 24.075976220133523, "learning_rate": 6.429889298892988e-07, "logits/chosen": -2.383593797683716, "logits/rejected": -2.37109375, "logps/chosen": -337.20001220703125, "logps/rejected": -398.1000061035156, "loss": 0.1607, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": 0.22272948920726776, "rewards/margins": 3.038281202316284, "rewards/rejected": -2.813281297683716, "step": 2710 }, { "epoch": 1.43384290985767, "grad_norm": 46.91148189881905, "learning_rate": 6.416710595677385e-07, "logits/chosen": -2.383593797683716, "logits/rejected": -2.3890624046325684, "logps/chosen": -336.8500061035156, "logps/rejected": -422.04998779296875, "loss": 0.1917, "rewards/accuracies": 0.9375, "rewards/chosen": 0.35361939668655396, "rewards/margins": 2.840625047683716, "rewards/rejected": -2.48828125, "step": 2720 }, { "epoch": 1.4391143911439115, "grad_norm": 70.64208462379104, "learning_rate": 6.403531892461781e-07, "logits/chosen": -2.426562547683716, "logits/rejected": -2.5210938453674316, "logps/chosen": -366.54998779296875, "logps/rejected": -420.6000061035156, "loss": 0.1853, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 0.07466430962085724, "rewards/margins": 2.7769532203674316, "rewards/rejected": -2.703906297683716, "step": 2730 }, { "epoch": 1.4443858724301528, "grad_norm": 36.393183775466305, "learning_rate": 6.390353189246178e-07, "logits/chosen": -2.4453125, "logits/rejected": -2.411328077316284, "logps/chosen": -297.82501220703125, "logps/rejected": -396.1499938964844, "loss": 0.1684, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": 0.11459960788488388, "rewards/margins": 3.29296875, "rewards/rejected": -3.177734375, "step": 2740 }, { "epoch": 1.4496573537163944, "grad_norm": 47.47785904904805, "learning_rate": 6.377174486030574e-07, "logits/chosen": -2.3656249046325684, "logits/rejected": -2.3667969703674316, "logps/chosen": -331.20001220703125, "logps/rejected": -410.5, "loss": 0.212, "rewards/accuracies": 0.90625, "rewards/chosen": 0.23582153022289276, "rewards/margins": 3.1058592796325684, "rewards/rejected": -2.872265577316284, "step": 2750 }, { "epoch": 1.4549288350026357, "grad_norm": 36.44508410725486, "learning_rate": 6.36399578281497e-07, "logits/chosen": -2.418750047683716, "logits/rejected": -2.3882813453674316, "logps/chosen": -292.8999938964844, "logps/rejected": -396.79998779296875, "loss": 0.1854, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 0.2549194395542145, "rewards/margins": 3.001953125, "rewards/rejected": -2.74609375, "step": 2760 }, { "epoch": 1.4602003162888773, "grad_norm": 47.45657610353401, "learning_rate": 6.350817079599367e-07, "logits/chosen": -2.430859327316284, "logits/rejected": -2.3765625953674316, "logps/chosen": -342.79998779296875, "logps/rejected": -438.3500061035156, "loss": 0.1856, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": 0.1707000732421875, "rewards/margins": 3.3375000953674316, "rewards/rejected": -3.1675782203674316, "step": 2770 }, { "epoch": 1.4654717975751186, "grad_norm": 42.46111413162928, "learning_rate": 6.337638376383764e-07, "logits/chosen": -2.3285155296325684, "logits/rejected": -2.253124952316284, "logps/chosen": -301.8500061035156, "logps/rejected": -419.29998779296875, "loss": 0.1735, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 0.3357788026332855, "rewards/margins": 2.905468702316284, "rewards/rejected": -2.5687499046325684, "step": 2780 }, { "epoch": 1.47074327886136, "grad_norm": 43.75838368525629, "learning_rate": 6.32445967316816e-07, "logits/chosen": -2.3734374046325684, "logits/rejected": -2.336718797683716, "logps/chosen": -312.75, "logps/rejected": -420.0, "loss": 0.1574, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 0.597399890422821, "rewards/margins": 3.520312547683716, "rewards/rejected": -2.9222655296325684, "step": 2790 }, { "epoch": 1.4760147601476015, "grad_norm": 38.1945073574433, "learning_rate": 6.311280969952556e-07, "logits/chosen": -2.263671875, "logits/rejected": -2.2578125, "logps/chosen": -377.0, "logps/rejected": -449.8999938964844, "loss": 0.1594, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 0.4945434629917145, "rewards/margins": 3.371875047683716, "rewards/rejected": -2.879687547683716, "step": 2800 }, { "epoch": 1.481286241433843, "grad_norm": 53.75045523815727, "learning_rate": 6.298102266736953e-07, "logits/chosen": -2.411328077316284, "logits/rejected": -2.411328077316284, "logps/chosen": -324.3500061035156, "logps/rejected": -414.3999938964844, "loss": 0.1997, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 0.3314453065395355, "rewards/margins": 2.9378905296325684, "rewards/rejected": -2.6103515625, "step": 2810 }, { "epoch": 1.4865577227200844, "grad_norm": 50.79174120782522, "learning_rate": 6.284923563521349e-07, "logits/chosen": -2.421093702316284, "logits/rejected": -2.353515625, "logps/chosen": -315.54998779296875, "logps/rejected": -450.79998779296875, "loss": 0.1497, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.20234374701976776, "rewards/margins": 3.1429686546325684, "rewards/rejected": -2.9390625953674316, "step": 2820 }, { "epoch": 1.4918292040063257, "grad_norm": 31.614547040510978, "learning_rate": 6.271744860305746e-07, "logits/chosen": -2.287109375, "logits/rejected": -2.2992186546325684, "logps/chosen": -352.8999938964844, "logps/rejected": -409.1499938964844, "loss": 0.2165, "rewards/accuracies": 0.90625, "rewards/chosen": 0.01911010779440403, "rewards/margins": 2.8519530296325684, "rewards/rejected": -2.8345704078674316, "step": 2830 }, { "epoch": 1.4971006852925672, "grad_norm": 42.59917604967003, "learning_rate": 6.258566157090142e-07, "logits/chosen": -2.4234375953674316, "logits/rejected": -2.46484375, "logps/chosen": -350.04998779296875, "logps/rejected": -444.25, "loss": 0.1236, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 0.22495117783546448, "rewards/margins": 3.5999999046325684, "rewards/rejected": -3.373046875, "step": 2840 }, { "epoch": 1.5023721665788088, "grad_norm": 57.99195223899894, "learning_rate": 6.245387453874539e-07, "logits/chosen": -2.360546827316284, "logits/rejected": -2.383593797683716, "logps/chosen": -354.25, "logps/rejected": -421.25, "loss": 0.2251, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 0.05558471754193306, "rewards/margins": 3.126953125, "rewards/rejected": -3.0726561546325684, "step": 2850 }, { "epoch": 1.5076436478650501, "grad_norm": 94.844954476543, "learning_rate": 6.232208750658935e-07, "logits/chosen": -2.283203125, "logits/rejected": -2.2398438453674316, "logps/chosen": -309.3500061035156, "logps/rejected": -413.20001220703125, "loss": 0.2363, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.4422607421875, "rewards/margins": 2.873046875, "rewards/rejected": -2.4332032203674316, "step": 2860 }, { "epoch": 1.5129151291512914, "grad_norm": 38.61033426980324, "learning_rate": 6.219030047443331e-07, "logits/chosen": -2.412890672683716, "logits/rejected": -2.303515672683716, "logps/chosen": -357.70001220703125, "logps/rejected": -460.6499938964844, "loss": 0.1896, "rewards/accuracies": 0.90625, "rewards/chosen": 0.17156982421875, "rewards/margins": 3.16796875, "rewards/rejected": -2.9957032203674316, "step": 2870 }, { "epoch": 1.518186610437533, "grad_norm": 48.038175753194295, "learning_rate": 6.205851344227727e-07, "logits/chosen": -2.45703125, "logits/rejected": -2.423828125, "logps/chosen": -298.25, "logps/rejected": -435.95001220703125, "loss": 0.1521, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.27567750215530396, "rewards/margins": 3.1382813453674316, "rewards/rejected": -2.8626952171325684, "step": 2880 }, { "epoch": 1.5234580917237743, "grad_norm": 63.03351374798294, "learning_rate": 6.192672641012125e-07, "logits/chosen": -2.5093750953674316, "logits/rejected": -2.424999952316284, "logps/chosen": -331.1499938964844, "logps/rejected": -419.70001220703125, "loss": 0.1914, "rewards/accuracies": 0.9375, "rewards/chosen": 0.12769165635108948, "rewards/margins": 3.08203125, "rewards/rejected": -2.9556641578674316, "step": 2890 }, { "epoch": 1.5287295730100157, "grad_norm": 62.78502607235668, "learning_rate": 6.179493937796521e-07, "logits/chosen": -2.4546875953674316, "logits/rejected": -2.44140625, "logps/chosen": -337.8500061035156, "logps/rejected": -400.54998779296875, "loss": 0.2027, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 0.2590087950229645, "rewards/margins": 2.940234422683716, "rewards/rejected": -2.680468797683716, "step": 2900 }, { "epoch": 1.5340010542962572, "grad_norm": 54.05569196821833, "learning_rate": 6.166315234580917e-07, "logits/chosen": -2.4488282203674316, "logits/rejected": -2.443359375, "logps/chosen": -365.6000061035156, "logps/rejected": -424.8999938964844, "loss": 0.1949, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 0.008129882626235485, "rewards/margins": 3.080859422683716, "rewards/rejected": -3.0746092796325684, "step": 2910 }, { "epoch": 1.5392725355824988, "grad_norm": 50.738271840705096, "learning_rate": 6.153136531365314e-07, "logits/chosen": -2.393359422683716, "logits/rejected": -2.2710938453674316, "logps/chosen": -344.25, "logps/rejected": -422.70001220703125, "loss": 0.2089, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 0.05434570461511612, "rewards/margins": 2.9652342796325684, "rewards/rejected": -2.9136719703674316, "step": 2920 }, { "epoch": 1.54454401686874, "grad_norm": 60.59545474435105, "learning_rate": 6.13995782814971e-07, "logits/chosen": -2.4554686546325684, "logits/rejected": -2.4429688453674316, "logps/chosen": -327.6000061035156, "logps/rejected": -410.1000061035156, "loss": 0.2376, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -0.01245727576315403, "rewards/margins": 2.649218797683716, "rewards/rejected": -2.6634764671325684, "step": 2930 }, { "epoch": 1.5498154981549814, "grad_norm": 63.04740382604312, "learning_rate": 6.126779124934106e-07, "logits/chosen": -2.465625047683716, "logits/rejected": -2.48828125, "logps/chosen": -381.79998779296875, "logps/rejected": -455.45001220703125, "loss": 0.1572, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -0.1185302734375, "rewards/margins": 3.242968797683716, "rewards/rejected": -3.36328125, "step": 2940 }, { "epoch": 1.555086979441223, "grad_norm": 49.595905700203915, "learning_rate": 6.113600421718503e-07, "logits/chosen": -2.2582030296325684, "logits/rejected": -2.232421875, "logps/chosen": -278.1000061035156, "logps/rejected": -406.29998779296875, "loss": 0.1948, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -0.04080810397863388, "rewards/margins": 3.1195311546325684, "rewards/rejected": -3.159374952316284, "step": 2950 }, { "epoch": 1.5603584607274645, "grad_norm": 26.159680580732402, "learning_rate": 6.1004217185029e-07, "logits/chosen": -2.4027342796325684, "logits/rejected": -2.311328172683716, "logps/chosen": -342.0, "logps/rejected": -429.95001220703125, "loss": 0.163, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": 0.16435547173023224, "rewards/margins": 3.212890625, "rewards/rejected": -3.048046827316284, "step": 2960 }, { "epoch": 1.5656299420137059, "grad_norm": 59.91853499383317, "learning_rate": 6.087243015287296e-07, "logits/chosen": -2.2796874046325684, "logits/rejected": -2.204296827316284, "logps/chosen": -340.75, "logps/rejected": -420.3999938964844, "loss": 0.1683, "rewards/accuracies": 0.918749988079071, "rewards/chosen": 0.30085450410842896, "rewards/margins": 3.1695313453674316, "rewards/rejected": -2.8671875, "step": 2970 }, { "epoch": 1.5709014232999472, "grad_norm": 52.00775486184884, "learning_rate": 6.074064312071692e-07, "logits/chosen": -2.401562452316284, "logits/rejected": -2.397265672683716, "logps/chosen": -337.6499938964844, "logps/rejected": -433.8500061035156, "loss": 0.1455, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.686474621295929, "rewards/margins": 3.124218702316284, "rewards/rejected": -2.4380860328674316, "step": 2980 }, { "epoch": 1.5761729045861887, "grad_norm": 49.21434852765544, "learning_rate": 6.060885608856087e-07, "logits/chosen": -2.21875, "logits/rejected": -2.3003907203674316, "logps/chosen": -363.79998779296875, "logps/rejected": -439.20001220703125, "loss": 0.2069, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 0.40313720703125, "rewards/margins": 3.215625047683716, "rewards/rejected": -2.8128905296325684, "step": 2990 }, { "epoch": 1.5814443858724303, "grad_norm": 36.51797358173777, "learning_rate": 6.047706905640486e-07, "logits/chosen": -2.432812452316284, "logits/rejected": -2.389843702316284, "logps/chosen": -339.79998779296875, "logps/rejected": -455.5, "loss": 0.1674, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.43059080839157104, "rewards/margins": 3.3570313453674316, "rewards/rejected": -2.9283204078674316, "step": 3000 }, { "epoch": 1.5867158671586716, "grad_norm": 36.46796822991459, "learning_rate": 6.034528202424881e-07, "logits/chosen": -2.3726563453674316, "logits/rejected": -2.2874999046325684, "logps/chosen": -297.8500061035156, "logps/rejected": -402.45001220703125, "loss": 0.1867, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": 0.4769043028354645, "rewards/margins": 3.206249952316284, "rewards/rejected": -2.728710889816284, "step": 3010 }, { "epoch": 1.591987348444913, "grad_norm": 41.59082550695852, "learning_rate": 6.021349499209277e-07, "logits/chosen": -2.3636717796325684, "logits/rejected": -2.3140625953674316, "logps/chosen": -319.8999938964844, "logps/rejected": -385.04998779296875, "loss": 0.2018, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 0.6031738519668579, "rewards/margins": 2.9058594703674316, "rewards/rejected": -2.303906202316284, "step": 3020 }, { "epoch": 1.5972588297311545, "grad_norm": 43.950386924986844, "learning_rate": 6.008170795993674e-07, "logits/chosen": -2.389453172683716, "logits/rejected": -2.325000047683716, "logps/chosen": -330.54998779296875, "logps/rejected": -398.3500061035156, "loss": 0.2142, "rewards/accuracies": 0.918749988079071, "rewards/chosen": 0.6677795648574829, "rewards/margins": 2.967578172683716, "rewards/rejected": -2.299609422683716, "step": 3030 }, { "epoch": 1.6025303110173958, "grad_norm": 41.3344097283446, "learning_rate": 5.99499209277807e-07, "logits/chosen": -2.5, "logits/rejected": -2.446093797683716, "logps/chosen": -333.7749938964844, "logps/rejected": -425.29998779296875, "loss": 0.1766, "rewards/accuracies": 0.9375, "rewards/chosen": 0.664715588092804, "rewards/margins": 2.8492188453674316, "rewards/rejected": -2.182910203933716, "step": 3040 }, { "epoch": 1.6078017923036372, "grad_norm": 38.85167404687259, "learning_rate": 5.981813389562466e-07, "logits/chosen": -2.443359375, "logits/rejected": -2.419140577316284, "logps/chosen": -338.7749938964844, "logps/rejected": -415.6499938964844, "loss": 0.1859, "rewards/accuracies": 0.9375, "rewards/chosen": 0.698046863079071, "rewards/margins": 2.805859327316284, "rewards/rejected": -2.105664014816284, "step": 3050 }, { "epoch": 1.6130732735898787, "grad_norm": 38.67297292528384, "learning_rate": 5.968634686346863e-07, "logits/chosen": -2.330078125, "logits/rejected": -2.377734422683716, "logps/chosen": -363.70001220703125, "logps/rejected": -461.70001220703125, "loss": 0.1688, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": 0.6231933832168579, "rewards/margins": 3.1480469703674316, "rewards/rejected": -2.5257811546325684, "step": 3060 }, { "epoch": 1.6183447548761203, "grad_norm": 39.04057435843742, "learning_rate": 5.95545598313126e-07, "logits/chosen": -2.514843702316284, "logits/rejected": -2.392578125, "logps/chosen": -318.6000061035156, "logps/rejected": -415.3999938964844, "loss": 0.1717, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": 0.661938488483429, "rewards/margins": 3.1156249046325684, "rewards/rejected": -2.455078125, "step": 3070 }, { "epoch": 1.6236162361623616, "grad_norm": 41.46070507942899, "learning_rate": 5.942277279915656e-07, "logits/chosen": -2.5078125, "logits/rejected": -2.3812499046325684, "logps/chosen": -300.0, "logps/rejected": -421.95001220703125, "loss": 0.1951, "rewards/accuracies": 0.9375, "rewards/chosen": 0.5619140863418579, "rewards/margins": 2.8648438453674316, "rewards/rejected": -2.301464796066284, "step": 3080 }, { "epoch": 1.628887717448603, "grad_norm": 33.662733468911775, "learning_rate": 5.929098576700052e-07, "logits/chosen": -2.499218702316284, "logits/rejected": -2.4320311546325684, "logps/chosen": -339.6499938964844, "logps/rejected": -451.1499938964844, "loss": 0.1578, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.6418182253837585, "rewards/margins": 3.434765577316284, "rewards/rejected": -2.794921875, "step": 3090 }, { "epoch": 1.6341591987348445, "grad_norm": 71.57671196073079, "learning_rate": 5.915919873484448e-07, "logits/chosen": -2.530468702316284, "logits/rejected": -2.410937547683716, "logps/chosen": -329.875, "logps/rejected": -426.04998779296875, "loss": 0.204, "rewards/accuracies": 0.90625, "rewards/chosen": 0.4160217344760895, "rewards/margins": 2.962109327316284, "rewards/rejected": -2.5453124046325684, "step": 3100 }, { "epoch": 1.639430680021086, "grad_norm": 42.40480333983954, "learning_rate": 5.902741170268845e-07, "logits/chosen": -2.524218797683716, "logits/rejected": -2.474609375, "logps/chosen": -392.1000061035156, "logps/rejected": -485.25, "loss": 0.161, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.44343262910842896, "rewards/margins": 3.3304686546325684, "rewards/rejected": -2.88671875, "step": 3110 }, { "epoch": 1.6447021613073274, "grad_norm": 54.97252216702515, "learning_rate": 5.889562467053242e-07, "logits/chosen": -2.361328125, "logits/rejected": -2.3277344703674316, "logps/chosen": -297.79998779296875, "logps/rejected": -379.1499938964844, "loss": 0.2175, "rewards/accuracies": 0.893750011920929, "rewards/chosen": 0.751757800579071, "rewards/margins": 2.787890672683716, "rewards/rejected": -2.034374952316284, "step": 3120 }, { "epoch": 1.6499736425935687, "grad_norm": 53.33041396513802, "learning_rate": 5.876383763837638e-07, "logits/chosen": -2.473828077316284, "logits/rejected": -2.323046922683716, "logps/chosen": -284.625, "logps/rejected": -433.70001220703125, "loss": 0.1777, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 0.597705066204071, "rewards/margins": 3.0531249046325684, "rewards/rejected": -2.4603514671325684, "step": 3130 }, { "epoch": 1.6552451238798103, "grad_norm": 58.768993173947585, "learning_rate": 5.863205060622034e-07, "logits/chosen": -2.4683594703674316, "logits/rejected": -2.3460936546325684, "logps/chosen": -300.8999938964844, "logps/rejected": -483.5, "loss": 0.1824, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 0.5074462890625, "rewards/margins": 3.640625, "rewards/rejected": -3.133984327316284, "step": 3140 }, { "epoch": 1.6605166051660518, "grad_norm": 51.15103543519565, "learning_rate": 5.850026357406431e-07, "logits/chosen": -2.4847655296325684, "logits/rejected": -2.362499952316284, "logps/chosen": -310.54998779296875, "logps/rejected": -428.04998779296875, "loss": 0.1744, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 0.22939452528953552, "rewards/margins": 3.2320313453674316, "rewards/rejected": -3.005859375, "step": 3150 }, { "epoch": 1.6657880864522931, "grad_norm": 46.55671977199873, "learning_rate": 5.836847654190827e-07, "logits/chosen": -2.43359375, "logits/rejected": -2.3636717796325684, "logps/chosen": -355.25, "logps/rejected": -417.0, "loss": 0.1866, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.12258300930261612, "rewards/margins": 2.9898438453674316, "rewards/rejected": -2.8675780296325684, "step": 3160 }, { "epoch": 1.6710595677385345, "grad_norm": 33.46016771170251, "learning_rate": 5.823668950975223e-07, "logits/chosen": -2.4609375, "logits/rejected": -2.432812452316284, "logps/chosen": -319.3999938964844, "logps/rejected": -427.3500061035156, "loss": 0.1891, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.34527587890625, "rewards/margins": 2.996875047683716, "rewards/rejected": -2.6507811546325684, "step": 3170 }, { "epoch": 1.676331049024776, "grad_norm": 45.35357098343959, "learning_rate": 5.810490247759621e-07, "logits/chosen": -2.4476561546325684, "logits/rejected": -2.42578125, "logps/chosen": -292.75, "logps/rejected": -403.95001220703125, "loss": 0.1767, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 0.16948242485523224, "rewards/margins": 3.0074219703674316, "rewards/rejected": -2.8394532203674316, "step": 3180 }, { "epoch": 1.6816025303110174, "grad_norm": 77.975393625257, "learning_rate": 5.797311544544017e-07, "logits/chosen": -2.515625, "logits/rejected": -2.455859422683716, "logps/chosen": -314.25, "logps/rejected": -397.5, "loss": 0.1694, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 0.15976563096046448, "rewards/margins": 3.4683594703674316, "rewards/rejected": -3.3062500953674316, "step": 3190 }, { "epoch": 1.6868740115972587, "grad_norm": 59.05000999834404, "learning_rate": 5.784132841328413e-07, "logits/chosen": -2.4124999046325684, "logits/rejected": -2.430468797683716, "logps/chosen": -319.95001220703125, "logps/rejected": -432.95001220703125, "loss": 0.1808, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.37474364042282104, "rewards/margins": 3.2562499046325684, "rewards/rejected": -2.877734422683716, "step": 3200 }, { "epoch": 1.6921454928835002, "grad_norm": 35.14109448830097, "learning_rate": 5.770954138112809e-07, "logits/chosen": -2.3355469703674316, "logits/rejected": -2.4140625, "logps/chosen": -290.25, "logps/rejected": -390.6000061035156, "loss": 0.156, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 0.3988403379917145, "rewards/margins": 3.303906202316284, "rewards/rejected": -2.904296875, "step": 3210 }, { "epoch": 1.6974169741697418, "grad_norm": 43.501912095699275, "learning_rate": 5.757775434897206e-07, "logits/chosen": -2.5062499046325684, "logits/rejected": -2.423046827316284, "logps/chosen": -303.0249938964844, "logps/rejected": -448.79998779296875, "loss": 0.1452, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.2846435606479645, "rewards/margins": 3.2796874046325684, "rewards/rejected": -2.99609375, "step": 3220 }, { "epoch": 1.7026884554559831, "grad_norm": 52.463551357887276, "learning_rate": 5.744596731681603e-07, "logits/chosen": -2.3414063453674316, "logits/rejected": -2.4039063453674316, "logps/chosen": -322.04998779296875, "logps/rejected": -435.95001220703125, "loss": 0.1373, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 0.09230957180261612, "rewards/margins": 3.5914063453674316, "rewards/rejected": -3.4976563453674316, "step": 3230 }, { "epoch": 1.7079599367422245, "grad_norm": 42.996824560220034, "learning_rate": 5.731418028465999e-07, "logits/chosen": -2.4164061546325684, "logits/rejected": -2.3167967796325684, "logps/chosen": -315.29998779296875, "logps/rejected": -442.5, "loss": 0.1665, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 0.0882568359375, "rewards/margins": 3.24609375, "rewards/rejected": -3.159374952316284, "step": 3240 }, { "epoch": 1.713231418028466, "grad_norm": 38.1063636094783, "learning_rate": 5.718239325250395e-07, "logits/chosen": -2.450000047683716, "logits/rejected": -2.3355469703674316, "logps/chosen": -320.6000061035156, "logps/rejected": -404.95001220703125, "loss": 0.1572, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 0.20361328125, "rewards/margins": 3.278125047683716, "rewards/rejected": -3.072265625, "step": 3250 }, { "epoch": 1.7185028993147076, "grad_norm": 24.433718739307466, "learning_rate": 5.705060622034792e-07, "logits/chosen": -2.464062452316284, "logits/rejected": -2.489062547683716, "logps/chosen": -315.8999938964844, "logps/rejected": -381.04998779296875, "loss": 0.1638, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.17706298828125, "rewards/margins": 3.236328125, "rewards/rejected": -3.059375047683716, "step": 3260 }, { "epoch": 1.723774380600949, "grad_norm": 37.04190626877952, "learning_rate": 5.691881918819188e-07, "logits/chosen": -2.453125, "logits/rejected": -2.424999952316284, "logps/chosen": -328.45001220703125, "logps/rejected": -451.29998779296875, "loss": 0.1831, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.07441405951976776, "rewards/margins": 3.559375047683716, "rewards/rejected": -3.485546827316284, "step": 3270 }, { "epoch": 1.7290458618871902, "grad_norm": 29.16843272496697, "learning_rate": 5.678703215603584e-07, "logits/chosen": -2.373828172683716, "logits/rejected": -2.396484375, "logps/chosen": -289.45001220703125, "logps/rejected": -392.0, "loss": 0.1943, "rewards/accuracies": 0.90625, "rewards/chosen": 0.24352416396141052, "rewards/margins": 2.9808592796325684, "rewards/rejected": -2.7396483421325684, "step": 3280 }, { "epoch": 1.7343173431734318, "grad_norm": 55.475737877926356, "learning_rate": 5.665524512387981e-07, "logits/chosen": -2.3871092796325684, "logits/rejected": -2.415234327316284, "logps/chosen": -289.5, "logps/rejected": -421.6499938964844, "loss": 0.2152, "rewards/accuracies": 0.918749988079071, "rewards/chosen": 0.4345336854457855, "rewards/margins": 3.3539061546325684, "rewards/rejected": -2.916210889816284, "step": 3290 }, { "epoch": 1.7395888244596733, "grad_norm": 43.18829083384744, "learning_rate": 5.652345809172378e-07, "logits/chosen": -2.438671827316284, "logits/rejected": -2.395312547683716, "logps/chosen": -318.5, "logps/rejected": -424.0, "loss": 0.2032, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 0.48491209745407104, "rewards/margins": 3.119140625, "rewards/rejected": -2.632031202316284, "step": 3300 }, { "epoch": 1.7448603057459144, "grad_norm": 32.79387345355609, "learning_rate": 5.639167105956774e-07, "logits/chosen": -2.2847657203674316, "logits/rejected": -2.296875, "logps/chosen": -328.6000061035156, "logps/rejected": -405.04998779296875, "loss": 0.1847, "rewards/accuracies": 0.9375, "rewards/chosen": 0.4406982362270355, "rewards/margins": 3.15625, "rewards/rejected": -2.712890625, "step": 3310 }, { "epoch": 1.750131787032156, "grad_norm": 35.360641461879155, "learning_rate": 5.62598840274117e-07, "logits/chosen": -2.37109375, "logits/rejected": -2.330078125, "logps/chosen": -337.5, "logps/rejected": -403.45001220703125, "loss": 0.1722, "rewards/accuracies": 0.9375, "rewards/chosen": 0.23635253310203552, "rewards/margins": 3.2757811546325684, "rewards/rejected": -3.039843797683716, "step": 3320 }, { "epoch": 1.7554032683183975, "grad_norm": 42.014215138713176, "learning_rate": 5.612809699525567e-07, "logits/chosen": -2.424999952316284, "logits/rejected": -2.395312547683716, "logps/chosen": -356.3500061035156, "logps/rejected": -428.6000061035156, "loss": 0.1649, "rewards/accuracies": 0.9375, "rewards/chosen": 0.1033935546875, "rewards/margins": 3.078906297683716, "rewards/rejected": -2.978515625, "step": 3330 }, { "epoch": 1.7606747496046389, "grad_norm": 65.00519605182392, "learning_rate": 5.599630996309963e-07, "logits/chosen": -2.424999952316284, "logits/rejected": -2.380078077316284, "logps/chosen": -308.79998779296875, "logps/rejected": -391.0, "loss": 0.1498, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.26945799589157104, "rewards/margins": 3.4066405296325684, "rewards/rejected": -3.133984327316284, "step": 3340 }, { "epoch": 1.7659462308908802, "grad_norm": 73.22899347760043, "learning_rate": 5.58645229309436e-07, "logits/chosen": -2.466015577316284, "logits/rejected": -2.4765625, "logps/chosen": -332.3500061035156, "logps/rejected": -434.8500061035156, "loss": 0.1874, "rewards/accuracies": 0.9375, "rewards/chosen": 0.05340576171875, "rewards/margins": 3.512500047683716, "rewards/rejected": -3.4609375, "step": 3350 }, { "epoch": 1.7712177121771218, "grad_norm": 30.949044226537648, "learning_rate": 5.573273589878755e-07, "logits/chosen": -2.55859375, "logits/rejected": -2.4625000953674316, "logps/chosen": -332.45001220703125, "logps/rejected": -446.8999938964844, "loss": 0.1564, "rewards/accuracies": 0.9375, "rewards/chosen": 0.3915771543979645, "rewards/margins": 3.337109327316284, "rewards/rejected": -2.9488282203674316, "step": 3360 }, { "epoch": 1.7764891934633633, "grad_norm": 57.88781380702448, "learning_rate": 5.560094886663152e-07, "logits/chosen": -2.4117188453674316, "logits/rejected": -2.435546875, "logps/chosen": -313.6499938964844, "logps/rejected": -446.3999938964844, "loss": 0.1704, "rewards/accuracies": 0.918749988079071, "rewards/chosen": 0.49135130643844604, "rewards/margins": 3.842968702316284, "rewards/rejected": -3.3511719703674316, "step": 3370 }, { "epoch": 1.7817606747496046, "grad_norm": 55.7810376002888, "learning_rate": 5.546916183447548e-07, "logits/chosen": -2.5277342796325684, "logits/rejected": -2.572265625, "logps/chosen": -347.8500061035156, "logps/rejected": -432.8999938964844, "loss": 0.2034, "rewards/accuracies": 0.918749988079071, "rewards/chosen": 0.4986328184604645, "rewards/margins": 3.0816407203674316, "rewards/rejected": -2.583789110183716, "step": 3380 }, { "epoch": 1.787032156035846, "grad_norm": 43.33943274570778, "learning_rate": 5.533737480231944e-07, "logits/chosen": -2.532031297683716, "logits/rejected": -2.461718797683716, "logps/chosen": -365.5, "logps/rejected": -458.1000061035156, "loss": 0.1897, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 0.6274169683456421, "rewards/margins": 3.0238280296325684, "rewards/rejected": -2.396484375, "step": 3390 }, { "epoch": 1.7923036373220875, "grad_norm": 84.22213987561273, "learning_rate": 5.520558777016341e-07, "logits/chosen": -2.4749999046325684, "logits/rejected": -2.434375047683716, "logps/chosen": -322.1000061035156, "logps/rejected": -432.8999938964844, "loss": 0.2025, "rewards/accuracies": 0.9375, "rewards/chosen": 0.6998535394668579, "rewards/margins": 3.266406297683716, "rewards/rejected": -2.569531202316284, "step": 3400 }, { "epoch": 1.797575118608329, "grad_norm": 112.68243817765345, "learning_rate": 5.507380073800738e-07, "logits/chosen": -2.461718797683716, "logits/rejected": -2.4097657203674316, "logps/chosen": -322.8999938964844, "logps/rejected": -433.3999938964844, "loss": 0.1845, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 0.7296508550643921, "rewards/margins": 3.108203172683716, "rewards/rejected": -2.37890625, "step": 3410 }, { "epoch": 1.8028465998945704, "grad_norm": 49.60342748363444, "learning_rate": 5.494201370585134e-07, "logits/chosen": -2.37890625, "logits/rejected": -2.3394532203674316, "logps/chosen": -309.0, "logps/rejected": -395.75, "loss": 0.1939, "rewards/accuracies": 0.96875, "rewards/chosen": 0.5239502191543579, "rewards/margins": 3.1246094703674316, "rewards/rejected": -2.6011719703674316, "step": 3420 }, { "epoch": 1.8081180811808117, "grad_norm": 53.69376613632286, "learning_rate": 5.48102266736953e-07, "logits/chosen": -2.403125047683716, "logits/rejected": -2.367968797683716, "logps/chosen": -333.1499938964844, "logps/rejected": -459.3500061035156, "loss": 0.1795, "rewards/accuracies": 0.9375, "rewards/chosen": 0.2804199159145355, "rewards/margins": 3.278125047683716, "rewards/rejected": -3.0015625953674316, "step": 3430 }, { "epoch": 1.8133895624670533, "grad_norm": 45.39029902360145, "learning_rate": 5.467843964153926e-07, "logits/chosen": -2.4593749046325684, "logits/rejected": -2.390625, "logps/chosen": -318.79998779296875, "logps/rejected": -422.79998779296875, "loss": 0.1566, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 0.3256988525390625, "rewards/margins": 3.053906202316284, "rewards/rejected": -2.7269530296325684, "step": 3440 }, { "epoch": 1.8186610437532946, "grad_norm": 43.043333541509334, "learning_rate": 5.454665260938323e-07, "logits/chosen": -2.416796922683716, "logits/rejected": -2.426953077316284, "logps/chosen": -331.6000061035156, "logps/rejected": -412.79998779296875, "loss": 0.1741, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -0.11845092475414276, "rewards/margins": 3.046875, "rewards/rejected": -3.163281202316284, "step": 3450 }, { "epoch": 1.823932525039536, "grad_norm": 69.15493018807909, "learning_rate": 5.44148655772272e-07, "logits/chosen": -2.346484422683716, "logits/rejected": -2.4468750953674316, "logps/chosen": -322.20001220703125, "logps/rejected": -406.6000061035156, "loss": 0.1575, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.15230712294578552, "rewards/margins": 3.317187547683716, "rewards/rejected": -3.471484422683716, "step": 3460 }, { "epoch": 1.8292040063257775, "grad_norm": 44.00639484486801, "learning_rate": 5.428307854507116e-07, "logits/chosen": -2.4019532203674316, "logits/rejected": -2.4105467796325684, "logps/chosen": -318.45001220703125, "logps/rejected": -415.3999938964844, "loss": 0.1645, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.17689208686351776, "rewards/margins": 3.375781297683716, "rewards/rejected": -3.555468797683716, "step": 3470 }, { "epoch": 1.834475487612019, "grad_norm": 65.42084729965917, "learning_rate": 5.415129151291513e-07, "logits/chosen": -2.528125047683716, "logits/rejected": -2.4710936546325684, "logps/chosen": -319.8500061035156, "logps/rejected": -401.70001220703125, "loss": 0.1671, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.0299072265625, "rewards/margins": 3.328906297683716, "rewards/rejected": -3.359179735183716, "step": 3480 }, { "epoch": 1.8397469688982604, "grad_norm": 53.85711685912332, "learning_rate": 5.401950448075909e-07, "logits/chosen": -2.4945311546325684, "logits/rejected": -2.4859375953674316, "logps/chosen": -359.1000061035156, "logps/rejected": -419.45001220703125, "loss": 0.1887, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.16448363661766052, "rewards/margins": 3.30859375, "rewards/rejected": -3.471484422683716, "step": 3490 }, { "epoch": 1.8450184501845017, "grad_norm": 27.97355794258741, "learning_rate": 5.388771744860305e-07, "logits/chosen": -2.4410157203674316, "logits/rejected": -2.4066405296325684, "logps/chosen": -368.45001220703125, "logps/rejected": -431.04998779296875, "loss": 0.1966, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -0.07366943359375, "rewards/margins": 3.272265672683716, "rewards/rejected": -3.34765625, "step": 3500 }, { "epoch": 1.8502899314707433, "grad_norm": 27.35735934149921, "learning_rate": 5.375593041644701e-07, "logits/chosen": -2.444140672683716, "logits/rejected": -2.307812452316284, "logps/chosen": -360.6499938964844, "logps/rejected": -465.1000061035156, "loss": 0.1942, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 0.02283935621380806, "rewards/margins": 3.2593750953674316, "rewards/rejected": -3.2347655296325684, "step": 3510 }, { "epoch": 1.8555614127569848, "grad_norm": 26.27400260695994, "learning_rate": 5.362414338429099e-07, "logits/chosen": -2.3949217796325684, "logits/rejected": -2.4105467796325684, "logps/chosen": -331.3500061035156, "logps/rejected": -440.70001220703125, "loss": 0.1719, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 0.4793945252895355, "rewards/margins": 3.4273438453674316, "rewards/rejected": -2.94921875, "step": 3520 }, { "epoch": 1.8608328940432262, "grad_norm": 55.945929320899275, "learning_rate": 5.349235635213495e-07, "logits/chosen": -2.4945311546325684, "logits/rejected": -2.442578077316284, "logps/chosen": -333.8999938964844, "logps/rejected": -432.70001220703125, "loss": 0.1761, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 0.34711915254592896, "rewards/margins": 3.1910157203674316, "rewards/rejected": -2.842968702316284, "step": 3530 }, { "epoch": 1.8661043753294675, "grad_norm": 22.377586625973766, "learning_rate": 5.336056931997891e-07, "logits/chosen": -2.4839844703674316, "logits/rejected": -2.491015672683716, "logps/chosen": -349.42498779296875, "logps/rejected": -459.1499938964844, "loss": 0.1823, "rewards/accuracies": 0.9375, "rewards/chosen": 0.42924803495407104, "rewards/margins": 3.2886719703674316, "rewards/rejected": -2.861523389816284, "step": 3540 }, { "epoch": 1.871375856615709, "grad_norm": 46.337687572738794, "learning_rate": 5.322878228782287e-07, "logits/chosen": -2.431640625, "logits/rejected": -2.4351563453674316, "logps/chosen": -345.20001220703125, "logps/rejected": -443.8999938964844, "loss": 0.1854, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": 0.3210815489292145, "rewards/margins": 3.06640625, "rewards/rejected": -2.74609375, "step": 3550 }, { "epoch": 1.8766473379019506, "grad_norm": 26.610084233646134, "learning_rate": 5.309699525566684e-07, "logits/chosen": -2.3828125, "logits/rejected": -2.3003907203674316, "logps/chosen": -309.2250061035156, "logps/rejected": -420.3500061035156, "loss": 0.17, "rewards/accuracies": 0.9375, "rewards/chosen": 0.38554686307907104, "rewards/margins": 3.184765577316284, "rewards/rejected": -2.8023438453674316, "step": 3560 }, { "epoch": 1.881918819188192, "grad_norm": 56.422654795341515, "learning_rate": 5.296520822351081e-07, "logits/chosen": -2.4085936546325684, "logits/rejected": -2.3359375, "logps/chosen": -334.8999938964844, "logps/rejected": -429.95001220703125, "loss": 0.1829, "rewards/accuracies": 0.918749988079071, "rewards/chosen": 0.31093138456344604, "rewards/margins": 3.352343797683716, "rewards/rejected": -3.0414061546325684, "step": 3570 }, { "epoch": 1.8871903004744333, "grad_norm": 34.69256473889352, "learning_rate": 5.283342119135477e-07, "logits/chosen": -2.3148436546325684, "logits/rejected": -2.3304686546325684, "logps/chosen": -380.1000061035156, "logps/rejected": -464.3500061035156, "loss": 0.1515, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 0.3628173768520355, "rewards/margins": 3.502734422683716, "rewards/rejected": -3.134765625, "step": 3580 }, { "epoch": 1.8924617817606748, "grad_norm": 21.09549073537172, "learning_rate": 5.270163415919874e-07, "logits/chosen": -2.4496092796325684, "logits/rejected": -2.3785157203674316, "logps/chosen": -328.1499938964844, "logps/rejected": -488.6499938964844, "loss": 0.1723, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.26814574003219604, "rewards/margins": 3.551562547683716, "rewards/rejected": -3.2847657203674316, "step": 3590 }, { "epoch": 1.8977332630469161, "grad_norm": 43.179053071955686, "learning_rate": 5.25698471270427e-07, "logits/chosen": -2.4449219703674316, "logits/rejected": -2.41015625, "logps/chosen": -304.95001220703125, "logps/rejected": -406.1000061035156, "loss": 0.2013, "rewards/accuracies": 0.9375, "rewards/chosen": -0.004443359561264515, "rewards/margins": 3.254687547683716, "rewards/rejected": -3.2582030296325684, "step": 3600 }, { "epoch": 1.9030047443331575, "grad_norm": 89.53395374854689, "learning_rate": 5.243806009488666e-07, "logits/chosen": -2.520312547683716, "logits/rejected": -2.5132813453674316, "logps/chosen": -331.5, "logps/rejected": -412.1000061035156, "loss": 0.1834, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 0.2490234375, "rewards/margins": 3.2515625953674316, "rewards/rejected": -3.001953125, "step": 3610 }, { "epoch": 1.908276225619399, "grad_norm": 124.8903034551957, "learning_rate": 5.230627306273062e-07, "logits/chosen": -2.477343797683716, "logits/rejected": -2.424999952316284, "logps/chosen": -303.8999938964844, "logps/rejected": -390.70001220703125, "loss": 0.1972, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 0.31461793184280396, "rewards/margins": 2.973828077316284, "rewards/rejected": -2.661328077316284, "step": 3620 }, { "epoch": 1.9135477069056406, "grad_norm": 99.56769930636159, "learning_rate": 5.21744860305746e-07, "logits/chosen": -2.4429688453674316, "logits/rejected": -2.3960938453674316, "logps/chosen": -392.20001220703125, "logps/rejected": -452.0, "loss": 0.206, "rewards/accuracies": 0.918749988079071, "rewards/chosen": 0.023468017578125, "rewards/margins": 3.0433592796325684, "rewards/rejected": -3.0199217796325684, "step": 3630 }, { "epoch": 1.918819188191882, "grad_norm": 48.19317360441356, "learning_rate": 5.204269899841856e-07, "logits/chosen": -2.4078125953674316, "logits/rejected": -2.440624952316284, "logps/chosen": -335.6000061035156, "logps/rejected": -389.79998779296875, "loss": 0.1818, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": 0.0017578124534338713, "rewards/margins": 2.890625, "rewards/rejected": -2.889843702316284, "step": 3640 }, { "epoch": 1.9240906694781232, "grad_norm": 78.68180952913308, "learning_rate": 5.191091196626252e-07, "logits/chosen": -2.471874952316284, "logits/rejected": -2.460156202316284, "logps/chosen": -384.6499938964844, "logps/rejected": -396.54998779296875, "loss": 0.1701, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.20903320610523224, "rewards/margins": 3.077343702316284, "rewards/rejected": -2.8648438453674316, "step": 3650 }, { "epoch": 1.9293621507643648, "grad_norm": 45.21083400053915, "learning_rate": 5.177912493410648e-07, "logits/chosen": -2.4609375, "logits/rejected": -2.4453125, "logps/chosen": -363.04998779296875, "logps/rejected": -464.1499938964844, "loss": 0.1916, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -0.24817505478858948, "rewards/margins": 3.207812547683716, "rewards/rejected": -3.456249952316284, "step": 3660 }, { "epoch": 1.9346336320506063, "grad_norm": 48.24893455110899, "learning_rate": 5.164733790195045e-07, "logits/chosen": -2.3910155296325684, "logits/rejected": -2.3402342796325684, "logps/chosen": -341.8999938964844, "logps/rejected": -438.79998779296875, "loss": 0.1581, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -0.14455565810203552, "rewards/margins": 3.301562547683716, "rewards/rejected": -3.4476561546325684, "step": 3670 }, { "epoch": 1.9399051133368477, "grad_norm": 50.89539448353681, "learning_rate": 5.151555086979441e-07, "logits/chosen": -2.4703125953674316, "logits/rejected": -2.4398436546325684, "logps/chosen": -336.0, "logps/rejected": -456.20001220703125, "loss": 0.1395, "rewards/accuracies": 0.96875, "rewards/chosen": -0.12802734971046448, "rewards/margins": 3.338671922683716, "rewards/rejected": -3.466015577316284, "step": 3680 }, { "epoch": 1.945176594623089, "grad_norm": 47.0155313549499, "learning_rate": 5.138376383763838e-07, "logits/chosen": -2.514843702316284, "logits/rejected": -2.5679688453674316, "logps/chosen": -334.7749938964844, "logps/rejected": -398.29998779296875, "loss": 0.1583, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.09335479885339737, "rewards/margins": 3.4468750953674316, "rewards/rejected": -3.541015625, "step": 3690 }, { "epoch": 1.9504480759093306, "grad_norm": 41.154429631833764, "learning_rate": 5.125197680548234e-07, "logits/chosen": -2.4554686546325684, "logits/rejected": -2.3492188453674316, "logps/chosen": -353.5, "logps/rejected": -479.20001220703125, "loss": 0.1858, "rewards/accuracies": 0.9375, "rewards/chosen": -0.23794326186180115, "rewards/margins": 3.5640625953674316, "rewards/rejected": -3.801562547683716, "step": 3700 }, { "epoch": 1.9557195571955721, "grad_norm": 34.98316935275387, "learning_rate": 5.11201897733263e-07, "logits/chosen": -2.481640577316284, "logits/rejected": -2.5015625953674316, "logps/chosen": -313.6000061035156, "logps/rejected": -395.1000061035156, "loss": 0.1873, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.04155273362994194, "rewards/margins": 3.106250047683716, "rewards/rejected": -3.149218797683716, "step": 3710 }, { "epoch": 1.9609910384818134, "grad_norm": 93.09078717465003, "learning_rate": 5.098840274117026e-07, "logits/chosen": -2.376953125, "logits/rejected": -2.4371094703674316, "logps/chosen": -362.25, "logps/rejected": -426.79998779296875, "loss": 0.1966, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -0.109039306640625, "rewards/margins": 3.2445311546325684, "rewards/rejected": -3.354687452316284, "step": 3720 }, { "epoch": 1.9662625197680548, "grad_norm": 13.359462712785602, "learning_rate": 5.085661570901422e-07, "logits/chosen": -2.496875047683716, "logits/rejected": -2.4749999046325684, "logps/chosen": -342.1000061035156, "logps/rejected": -469.0, "loss": 0.1762, "rewards/accuracies": 0.918749988079071, "rewards/chosen": 0.04301147535443306, "rewards/margins": 3.474609375, "rewards/rejected": -3.428906202316284, "step": 3730 }, { "epoch": 1.9715340010542963, "grad_norm": 36.733469112316016, "learning_rate": 5.07248286768582e-07, "logits/chosen": -2.451171875, "logits/rejected": -2.4242186546325684, "logps/chosen": -317.20001220703125, "logps/rejected": -451.1499938964844, "loss": 0.1648, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -0.06588134914636612, "rewards/margins": 3.396484375, "rewards/rejected": -3.461718797683716, "step": 3740 }, { "epoch": 1.9768054823405377, "grad_norm": 57.68728826522842, "learning_rate": 5.059304164470216e-07, "logits/chosen": -2.589062452316284, "logits/rejected": -2.42578125, "logps/chosen": -324.0, "logps/rejected": -447.32501220703125, "loss": 0.1606, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -0.08944244682788849, "rewards/margins": 3.3441405296325684, "rewards/rejected": -3.43359375, "step": 3750 }, { "epoch": 1.982076963626779, "grad_norm": 90.04811009668377, "learning_rate": 5.046125461254612e-07, "logits/chosen": -2.5765624046325684, "logits/rejected": -2.48046875, "logps/chosen": -356.8500061035156, "logps/rejected": -457.75, "loss": 0.1725, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -0.173828125, "rewards/margins": 3.759765625, "rewards/rejected": -3.9320311546325684, "step": 3760 }, { "epoch": 1.9873484449130205, "grad_norm": 80.19524585765123, "learning_rate": 5.032946758039008e-07, "logits/chosen": -2.418750047683716, "logits/rejected": -2.4320311546325684, "logps/chosen": -373.70001220703125, "logps/rejected": -433.1000061035156, "loss": 0.2072, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -0.525634765625, "rewards/margins": 3.180468797683716, "rewards/rejected": -3.7046875953674316, "step": 3770 }, { "epoch": 1.992619926199262, "grad_norm": 74.74151546226221, "learning_rate": 5.019768054823405e-07, "logits/chosen": -2.5218749046325684, "logits/rejected": -2.4765625, "logps/chosen": -325.04998779296875, "logps/rejected": -385.6499938964844, "loss": 0.2047, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.3072265684604645, "rewards/margins": 2.938281297683716, "rewards/rejected": -3.24609375, "step": 3780 }, { "epoch": 1.9978914074855034, "grad_norm": 53.27794788204781, "learning_rate": 5.006589351607801e-07, "logits/chosen": -2.4453125, "logits/rejected": -2.491406202316284, "logps/chosen": -343.1000061035156, "logps/rejected": -399.1499938964844, "loss": 0.2304, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -0.49099427461624146, "rewards/margins": 2.8414063453674316, "rewards/rejected": -3.333984375, "step": 3790 }, { "epoch": 2.0031628887717448, "grad_norm": 13.573182507089502, "learning_rate": 4.993410648392198e-07, "logits/chosen": -2.53515625, "logits/rejected": -2.503124952316284, "logps/chosen": -330.3999938964844, "logps/rejected": -465.70001220703125, "loss": 0.0881, "rewards/accuracies": 0.9854167103767395, "rewards/chosen": 0.02149658277630806, "rewards/margins": 3.8335938453674316, "rewards/rejected": -3.813671827316284, "step": 3800 }, { "epoch": 2.0084343700579863, "grad_norm": 25.590655178485694, "learning_rate": 4.980231945176594e-07, "logits/chosen": -2.58203125, "logits/rejected": -2.563281297683716, "logps/chosen": -330.95001220703125, "logps/rejected": -412.8999938964844, "loss": 0.0697, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": 0.15981444716453552, "rewards/margins": 4.153124809265137, "rewards/rejected": -3.99609375, "step": 3810 }, { "epoch": 2.013705851344228, "grad_norm": 25.33704081336845, "learning_rate": 4.96705324196099e-07, "logits/chosen": -2.439453125, "logits/rejected": -2.403125047683716, "logps/chosen": -360.1499938964844, "logps/rejected": -490.79998779296875, "loss": 0.0442, "rewards/accuracies": 1.0, "rewards/chosen": -0.10511474311351776, "rewards/margins": 4.586718559265137, "rewards/rejected": -4.689843654632568, "step": 3820 }, { "epoch": 2.018977332630469, "grad_norm": 13.64949241485214, "learning_rate": 4.953874538745387e-07, "logits/chosen": -2.5648436546325684, "logits/rejected": -2.5015625953674316, "logps/chosen": -327.45001220703125, "logps/rejected": -434.5, "loss": 0.0505, "rewards/accuracies": 1.0, "rewards/chosen": 0.09931640326976776, "rewards/margins": 4.694531440734863, "rewards/rejected": -4.592187404632568, "step": 3830 }, { "epoch": 2.0242488139167105, "grad_norm": 21.75653893690208, "learning_rate": 4.940695835529783e-07, "logits/chosen": -2.444531202316284, "logits/rejected": -2.4593749046325684, "logps/chosen": -332.1000061035156, "logps/rejected": -458.8999938964844, "loss": 0.0487, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -0.02080078050494194, "rewards/margins": 4.672656059265137, "rewards/rejected": -4.69140625, "step": 3840 }, { "epoch": 2.029520295202952, "grad_norm": 19.50542957537481, "learning_rate": 4.92751713231418e-07, "logits/chosen": -2.698437452316284, "logits/rejected": -2.578125, "logps/chosen": -296.25, "logps/rejected": -404.6000061035156, "loss": 0.0754, "rewards/accuracies": 0.96875, "rewards/chosen": -0.03049316443502903, "rewards/margins": 4.317968845367432, "rewards/rejected": -4.344922065734863, "step": 3850 }, { "epoch": 2.0347917764891936, "grad_norm": 21.698526897266206, "learning_rate": 4.914338429098576e-07, "logits/chosen": -2.6484375, "logits/rejected": -2.5328125953674316, "logps/chosen": -300.1499938964844, "logps/rejected": -417.8500061035156, "loss": 0.0578, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.29851073026657104, "rewards/margins": 4.763281345367432, "rewards/rejected": -5.0625, "step": 3860 }, { "epoch": 2.0400632577754347, "grad_norm": 11.758100770283113, "learning_rate": 4.901159725882973e-07, "logits/chosen": -2.57421875, "logits/rejected": -2.5875000953674316, "logps/chosen": -288.3500061035156, "logps/rejected": -408.25, "loss": 0.0703, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.4469238221645355, "rewards/margins": 4.302343845367432, "rewards/rejected": -4.750781059265137, "step": 3870 }, { "epoch": 2.0453347390616763, "grad_norm": 10.503714189128056, "learning_rate": 4.88798102266737e-07, "logits/chosen": -2.651562452316284, "logits/rejected": -2.5453124046325684, "logps/chosen": -315.04998779296875, "logps/rejected": -440.29998779296875, "loss": 0.0562, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -0.4388671815395355, "rewards/margins": 4.908593654632568, "rewards/rejected": -5.345312595367432, "step": 3880 }, { "epoch": 2.050606220347918, "grad_norm": 17.87455400863963, "learning_rate": 4.874802319451766e-07, "logits/chosen": -2.6109375953674316, "logits/rejected": -2.600781202316284, "logps/chosen": -307.82501220703125, "logps/rejected": -425.6000061035156, "loss": 0.056, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.793872058391571, "rewards/margins": 4.834374904632568, "rewards/rejected": -5.632031440734863, "step": 3890 }, { "epoch": 2.0558777016341594, "grad_norm": 14.205742350252725, "learning_rate": 4.861623616236162e-07, "logits/chosen": -2.6578125953674316, "logits/rejected": -2.6468749046325684, "logps/chosen": -327.75, "logps/rejected": -439.1000061035156, "loss": 0.0529, "rewards/accuracies": 1.0, "rewards/chosen": -0.8280029296875, "rewards/margins": 4.514843940734863, "rewards/rejected": -5.33984375, "step": 3900 }, { "epoch": 2.0611491829204005, "grad_norm": 12.158734576202562, "learning_rate": 4.848444913020559e-07, "logits/chosen": -2.6484375, "logits/rejected": -2.6312499046325684, "logps/chosen": -368.1000061035156, "logps/rejected": -446.0, "loss": 0.0514, "rewards/accuracies": 1.0, "rewards/chosen": -0.7828124761581421, "rewards/margins": 4.559374809265137, "rewards/rejected": -5.345312595367432, "step": 3910 }, { "epoch": 2.066420664206642, "grad_norm": 6.042845072882287, "learning_rate": 4.835266209804955e-07, "logits/chosen": -2.719531297683716, "logits/rejected": -2.6773438453674316, "logps/chosen": -326.29998779296875, "logps/rejected": -439.3500061035156, "loss": 0.039, "rewards/accuracies": 1.0, "rewards/chosen": -0.3814697265625, "rewards/margins": 4.862500190734863, "rewards/rejected": -5.243750095367432, "step": 3920 }, { "epoch": 2.0716921454928836, "grad_norm": 49.79518845272951, "learning_rate": 4.822087506589351e-07, "logits/chosen": -2.702343702316284, "logits/rejected": -2.645312547683716, "logps/chosen": -352.1499938964844, "logps/rejected": -437.5, "loss": 0.0589, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -0.188923642039299, "rewards/margins": 4.609375, "rewards/rejected": -4.791406154632568, "step": 3930 }, { "epoch": 2.0769636267791247, "grad_norm": 8.451043542982958, "learning_rate": 4.808908803373748e-07, "logits/chosen": -2.6167969703674316, "logits/rejected": -2.680468797683716, "logps/chosen": -385.5, "logps/rejected": -499.8999938964844, "loss": 0.0338, "rewards/accuracies": 1.0, "rewards/chosen": -0.525585949420929, "rewards/margins": 5.081250190734863, "rewards/rejected": -5.603906154632568, "step": 3940 }, { "epoch": 2.0822351080653663, "grad_norm": 14.31842668015912, "learning_rate": 4.795730100158144e-07, "logits/chosen": -2.741406202316284, "logits/rejected": -2.671093702316284, "logps/chosen": -367.3999938964844, "logps/rejected": -475.70001220703125, "loss": 0.0507, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.1451904773712158, "rewards/margins": 4.84765625, "rewards/rejected": -5.990624904632568, "step": 3950 }, { "epoch": 2.087506589351608, "grad_norm": 7.359388876283682, "learning_rate": 4.782551396942541e-07, "logits/chosen": -2.624218702316284, "logits/rejected": -2.639843702316284, "logps/chosen": -353.04998779296875, "logps/rejected": -468.3999938964844, "loss": 0.0643, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.373925805091858, "rewards/margins": 5.150000095367432, "rewards/rejected": -6.521874904632568, "step": 3960 }, { "epoch": 2.0927780706378494, "grad_norm": 28.09882734229531, "learning_rate": 4.769372693726937e-07, "logits/chosen": -2.7015624046325684, "logits/rejected": -2.633593797683716, "logps/chosen": -347.25, "logps/rejected": -443.70001220703125, "loss": 0.0715, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.1779296398162842, "rewards/margins": 4.862500190734863, "rewards/rejected": -6.046875, "step": 3970 }, { "epoch": 2.0980495519240905, "grad_norm": 20.954415336696304, "learning_rate": 4.756193990511334e-07, "logits/chosen": -2.633593797683716, "logits/rejected": -2.7054686546325684, "logps/chosen": -370.3500061035156, "logps/rejected": -429.75, "loss": 0.0397, "rewards/accuracies": 1.0, "rewards/chosen": -0.97247314453125, "rewards/margins": 4.784375190734863, "rewards/rejected": -5.754687309265137, "step": 3980 }, { "epoch": 2.103321033210332, "grad_norm": 45.67088757672762, "learning_rate": 4.7430152872957297e-07, "logits/chosen": -2.567187547683716, "logits/rejected": -2.514453172683716, "logps/chosen": -339.54998779296875, "logps/rejected": -450.79998779296875, "loss": 0.0505, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -1.0781738758087158, "rewards/margins": 5.278124809265137, "rewards/rejected": -6.349999904632568, "step": 3990 }, { "epoch": 2.1085925144965736, "grad_norm": 24.166444864522937, "learning_rate": 4.729836584080126e-07, "logits/chosen": -2.692187547683716, "logits/rejected": -2.624218702316284, "logps/chosen": -323.0, "logps/rejected": -450.6000061035156, "loss": 0.0463, "rewards/accuracies": 1.0, "rewards/chosen": -0.8589843511581421, "rewards/margins": 5.028906345367432, "rewards/rejected": -5.893750190734863, "step": 4000 }, { "epoch": 2.113863995782815, "grad_norm": 27.40467794569332, "learning_rate": 4.716657880864523e-07, "logits/chosen": -2.640625, "logits/rejected": -2.672656297683716, "logps/chosen": -401.95001220703125, "logps/rejected": -505.29998779296875, "loss": 0.062, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.041748046875, "rewards/margins": 5.204687595367432, "rewards/rejected": -6.246874809265137, "step": 4010 }, { "epoch": 2.1191354770690563, "grad_norm": 15.270450522602951, "learning_rate": 4.703479177648919e-07, "logits/chosen": -2.5765624046325684, "logits/rejected": -2.5718750953674316, "logps/chosen": -352.79998779296875, "logps/rejected": -485.20001220703125, "loss": 0.0424, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.0308105945587158, "rewards/margins": 4.903124809265137, "rewards/rejected": -5.939062595367432, "step": 4020 }, { "epoch": 2.124406958355298, "grad_norm": 76.01765360886786, "learning_rate": 4.6903004744333156e-07, "logits/chosen": -2.604687452316284, "logits/rejected": -2.6640625, "logps/chosen": -382.95001220703125, "logps/rejected": -487.20001220703125, "loss": 0.0541, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.0697021484375, "rewards/margins": 4.971875190734863, "rewards/rejected": -6.046875, "step": 4030 }, { "epoch": 2.1296784396415394, "grad_norm": 19.519995626649596, "learning_rate": 4.677121771217712e-07, "logits/chosen": -2.5757813453674316, "logits/rejected": -2.698437452316284, "logps/chosen": -392.8999938964844, "logps/rejected": -421.3999938964844, "loss": 0.0608, "rewards/accuracies": 0.96875, "rewards/chosen": -1.260986328125, "rewards/margins": 4.685937404632568, "rewards/rejected": -5.946875095367432, "step": 4040 }, { "epoch": 2.134949920927781, "grad_norm": 39.27089502115964, "learning_rate": 4.6639430680021086e-07, "logits/chosen": -2.637500047683716, "logits/rejected": -2.75, "logps/chosen": -360.3999938964844, "logps/rejected": -469.70001220703125, "loss": 0.0504, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.455664038658142, "rewards/margins": 5.180468559265137, "rewards/rejected": -6.637499809265137, "step": 4050 }, { "epoch": 2.140221402214022, "grad_norm": 46.51615749632793, "learning_rate": 4.6507643647865045e-07, "logits/chosen": -2.7578125, "logits/rejected": -2.69140625, "logps/chosen": -373.75, "logps/rejected": -496.1000061035156, "loss": 0.0536, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.602929711341858, "rewards/margins": 5.303906440734863, "rewards/rejected": -6.903906345367432, "step": 4060 }, { "epoch": 2.1454928835002636, "grad_norm": 11.723069519501015, "learning_rate": 4.6375856615709015e-07, "logits/chosen": -2.5914063453674316, "logits/rejected": -2.5843749046325684, "logps/chosen": -351.875, "logps/rejected": -492.5, "loss": 0.0382, "rewards/accuracies": 1.0, "rewards/chosen": -1.133886694908142, "rewards/margins": 5.295312404632568, "rewards/rejected": -6.424218654632568, "step": 4070 }, { "epoch": 2.150764364786505, "grad_norm": 15.193707046740506, "learning_rate": 4.6244069583552975e-07, "logits/chosen": -2.606250047683716, "logits/rejected": -2.5542969703674316, "logps/chosen": -371.0, "logps/rejected": -507.5, "loss": 0.0363, "rewards/accuracies": 1.0, "rewards/chosen": -0.834765613079071, "rewards/margins": 5.071093559265137, "rewards/rejected": -5.907812595367432, "step": 4080 }, { "epoch": 2.1560358460727462, "grad_norm": 46.091188081589976, "learning_rate": 4.611228255139694e-07, "logits/chosen": -2.6968750953674316, "logits/rejected": -2.745312452316284, "logps/chosen": -324.25, "logps/rejected": -441.6000061035156, "loss": 0.0616, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.769702136516571, "rewards/margins": 4.949999809265137, "rewards/rejected": -5.724999904632568, "step": 4090 }, { "epoch": 2.161307327358988, "grad_norm": 8.484468266315355, "learning_rate": 4.5980495519240904e-07, "logits/chosen": -2.799999952316284, "logits/rejected": -2.7890625, "logps/chosen": -357.0, "logps/rejected": -465.54998779296875, "loss": 0.0484, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -1.2565796375274658, "rewards/margins": 5.142968654632568, "rewards/rejected": -6.400000095367432, "step": 4100 }, { "epoch": 2.1665788086452293, "grad_norm": 31.70388488811083, "learning_rate": 4.584870848708487e-07, "logits/chosen": -2.7679686546325684, "logits/rejected": -2.862499952316284, "logps/chosen": -369.45001220703125, "logps/rejected": -493.6000061035156, "loss": 0.0429, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -0.9603027105331421, "rewards/margins": 5.126562595367432, "rewards/rejected": -6.082812309265137, "step": 4110 }, { "epoch": 2.171850289931471, "grad_norm": 21.27735591653322, "learning_rate": 4.571692145492883e-07, "logits/chosen": -2.6031250953674316, "logits/rejected": -2.686718702316284, "logps/chosen": -372.79998779296875, "logps/rejected": -466.0, "loss": 0.0676, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.700665295124054, "rewards/margins": 5.151562690734863, "rewards/rejected": -5.8515625, "step": 4120 }, { "epoch": 2.177121771217712, "grad_norm": 12.471469924168833, "learning_rate": 4.55851344227728e-07, "logits/chosen": -2.7914061546325684, "logits/rejected": -2.745312452316284, "logps/chosen": -328.1000061035156, "logps/rejected": -429.1499938964844, "loss": 0.061, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -0.6201416254043579, "rewards/margins": 5.055468559265137, "rewards/rejected": -5.674218654632568, "step": 4130 }, { "epoch": 2.1823932525039536, "grad_norm": 12.764989045891706, "learning_rate": 4.545334739061676e-07, "logits/chosen": -2.6898436546325684, "logits/rejected": -2.734375, "logps/chosen": -363.32501220703125, "logps/rejected": -450.0, "loss": 0.0786, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.0113036632537842, "rewards/margins": 4.581250190734863, "rewards/rejected": -5.592968940734863, "step": 4140 }, { "epoch": 2.187664733790195, "grad_norm": 32.61852516577979, "learning_rate": 4.532156035846073e-07, "logits/chosen": -2.641406297683716, "logits/rejected": -2.713671922683716, "logps/chosen": -362.45001220703125, "logps/rejected": -471.8999938964844, "loss": 0.0693, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -0.97607421875, "rewards/margins": 4.671093940734863, "rewards/rejected": -5.645312309265137, "step": 4150 }, { "epoch": 2.1929362150764367, "grad_norm": 26.623808477384525, "learning_rate": 4.5189773326304693e-07, "logits/chosen": -2.709765672683716, "logits/rejected": -2.665234327316284, "logps/chosen": -308.54998779296875, "logps/rejected": -431.1499938964844, "loss": 0.0576, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.9995971918106079, "rewards/margins": 4.78125, "rewards/rejected": -5.78125, "step": 4160 }, { "epoch": 2.1982076963626778, "grad_norm": 21.697919691250984, "learning_rate": 4.505798629414865e-07, "logits/chosen": -2.731250047683716, "logits/rejected": -2.71875, "logps/chosen": -376.6000061035156, "logps/rejected": -496.3999938964844, "loss": 0.041, "rewards/accuracies": 1.0, "rewards/chosen": -1.121484398841858, "rewards/margins": 4.940625190734863, "rewards/rejected": -6.064062595367432, "step": 4170 }, { "epoch": 2.2034791776489193, "grad_norm": 33.23472816671202, "learning_rate": 4.492619926199262e-07, "logits/chosen": -2.860156297683716, "logits/rejected": -2.852343797683716, "logps/chosen": -307.0, "logps/rejected": -453.1000061035156, "loss": 0.032, "rewards/accuracies": 1.0, "rewards/chosen": -0.856152355670929, "rewards/margins": 5.4765625, "rewards/rejected": -6.335156440734863, "step": 4180 }, { "epoch": 2.208750658935161, "grad_norm": 2.696261683513029, "learning_rate": 4.479441222983658e-07, "logits/chosen": -2.80078125, "logits/rejected": -2.6976561546325684, "logps/chosen": -354.1499938964844, "logps/rejected": -459.5, "loss": 0.036, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.543066382408142, "rewards/margins": 5.2734375, "rewards/rejected": -6.8125, "step": 4190 }, { "epoch": 2.2140221402214024, "grad_norm": 3.8257865359549172, "learning_rate": 4.4662625197680546e-07, "logits/chosen": -2.8203125, "logits/rejected": -2.741406202316284, "logps/chosen": -341.0, "logps/rejected": -472.3999938964844, "loss": 0.0371, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.6695556640625, "rewards/margins": 5.460156440734863, "rewards/rejected": -7.126562595367432, "step": 4200 }, { "epoch": 2.2192936215076435, "grad_norm": 42.35353216264341, "learning_rate": 4.453083816552451e-07, "logits/chosen": -2.7367186546325684, "logits/rejected": -2.753124952316284, "logps/chosen": -325.42498779296875, "logps/rejected": -425.3500061035156, "loss": 0.0608, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.672265648841858, "rewards/margins": 4.91796875, "rewards/rejected": -6.590624809265137, "step": 4210 }, { "epoch": 2.224565102793885, "grad_norm": 14.516465304040647, "learning_rate": 4.4399051133368476e-07, "logits/chosen": -2.682812452316284, "logits/rejected": -2.7265625, "logps/chosen": -370.54998779296875, "logps/rejected": -513.2000122070312, "loss": 0.0416, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.8517577648162842, "rewards/margins": 5.684374809265137, "rewards/rejected": -7.532812595367432, "step": 4220 }, { "epoch": 2.2298365840801266, "grad_norm": 16.50691482781967, "learning_rate": 4.4267264101212435e-07, "logits/chosen": -2.782031297683716, "logits/rejected": -2.6148438453674316, "logps/chosen": -359.1499938964844, "logps/rejected": -498.1000061035156, "loss": 0.0355, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.6378173828125, "rewards/margins": 5.454687595367432, "rewards/rejected": -7.096875190734863, "step": 4230 }, { "epoch": 2.2351080653663677, "grad_norm": 18.749733765689463, "learning_rate": 4.4135477069056405e-07, "logits/chosen": -2.701953172683716, "logits/rejected": -2.586718797683716, "logps/chosen": -377.375, "logps/rejected": -514.2000122070312, "loss": 0.0497, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.1965820789337158, "rewards/margins": 5.763281345367432, "rewards/rejected": -6.963281154632568, "step": 4240 }, { "epoch": 2.2403795466526093, "grad_norm": 8.747835587068046, "learning_rate": 4.4003690036900365e-07, "logits/chosen": -2.7554688453674316, "logits/rejected": -2.67578125, "logps/chosen": -331.3500061035156, "logps/rejected": -476.3999938964844, "loss": 0.065, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.208459496498108, "rewards/margins": 5.194531440734863, "rewards/rejected": -6.401562690734863, "step": 4250 }, { "epoch": 2.245651027938851, "grad_norm": 31.734089760947697, "learning_rate": 4.387190300474433e-07, "logits/chosen": -2.758593797683716, "logits/rejected": -2.8031249046325684, "logps/chosen": -328.1499938964844, "logps/rejected": -418.79998779296875, "loss": 0.0554, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.160058617591858, "rewards/margins": 4.967187404632568, "rewards/rejected": -6.127343654632568, "step": 4260 }, { "epoch": 2.2509225092250924, "grad_norm": 39.18423633623952, "learning_rate": 4.3740115972588294e-07, "logits/chosen": -2.7984375953674316, "logits/rejected": -2.7710938453674316, "logps/chosen": -342.3500061035156, "logps/rejected": -455.1000061035156, "loss": 0.0413, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.187158226966858, "rewards/margins": 5.206250190734863, "rewards/rejected": -6.390625, "step": 4270 }, { "epoch": 2.2561939905113335, "grad_norm": 16.443526559786957, "learning_rate": 4.360832894043226e-07, "logits/chosen": -2.632031202316284, "logits/rejected": -2.703125, "logps/chosen": -356.70001220703125, "logps/rejected": -441.25, "loss": 0.0445, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.2688477039337158, "rewards/margins": 5.050000190734863, "rewards/rejected": -6.317187309265137, "step": 4280 }, { "epoch": 2.261465471797575, "grad_norm": 22.276921810145318, "learning_rate": 4.3476541908276224e-07, "logits/chosen": -2.64453125, "logits/rejected": -2.646484375, "logps/chosen": -316.2250061035156, "logps/rejected": -451.3999938964844, "loss": 0.0445, "rewards/accuracies": 1.0, "rewards/chosen": -1.350732445716858, "rewards/margins": 5.301562309265137, "rewards/rejected": -6.654687404632568, "step": 4290 }, { "epoch": 2.2667369530838166, "grad_norm": 18.476750351601485, "learning_rate": 4.334475487612019e-07, "logits/chosen": -2.671875, "logits/rejected": -2.66796875, "logps/chosen": -320.1000061035156, "logps/rejected": -419.8999938964844, "loss": 0.0525, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.9908447265625, "rewards/margins": 5.434374809265137, "rewards/rejected": -6.423437595367432, "step": 4300 }, { "epoch": 2.272008434370058, "grad_norm": 21.59700191447698, "learning_rate": 4.3212967843964153e-07, "logits/chosen": -2.6820311546325684, "logits/rejected": -2.6929688453674316, "logps/chosen": -321.54998779296875, "logps/rejected": -452.29998779296875, "loss": 0.0477, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.963775634765625, "rewards/margins": 5.3515625, "rewards/rejected": -6.317187309265137, "step": 4310 }, { "epoch": 2.2772799156562993, "grad_norm": 13.490865243687852, "learning_rate": 4.3081180811808113e-07, "logits/chosen": -2.684375047683716, "logits/rejected": -2.63671875, "logps/chosen": -334.70001220703125, "logps/rejected": -473.79998779296875, "loss": 0.0624, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.294287085533142, "rewards/margins": 5.510937690734863, "rewards/rejected": -6.810937404632568, "step": 4320 }, { "epoch": 2.282551396942541, "grad_norm": 37.596582502599, "learning_rate": 4.2949393779652083e-07, "logits/chosen": -2.694531202316284, "logits/rejected": -2.6328125, "logps/chosen": -374.8500061035156, "logps/rejected": -446.45001220703125, "loss": 0.0566, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -1.367040991783142, "rewards/margins": 5.516406059265137, "rewards/rejected": -6.878125190734863, "step": 4330 }, { "epoch": 2.2878228782287824, "grad_norm": 77.79026187472981, "learning_rate": 4.281760674749604e-07, "logits/chosen": -2.530468702316284, "logits/rejected": -2.494140625, "logps/chosen": -340.75, "logps/rejected": -462.20001220703125, "loss": 0.0452, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.198144555091858, "rewards/margins": 5.469531059265137, "rewards/rejected": -6.665625095367432, "step": 4340 }, { "epoch": 2.293094359515024, "grad_norm": 12.65949235069371, "learning_rate": 4.268581971534001e-07, "logits/chosen": -2.715625047683716, "logits/rejected": -2.6929688453674316, "logps/chosen": -391.29998779296875, "logps/rejected": -473.8500061035156, "loss": 0.046, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -1.029687523841858, "rewards/margins": 5.285937309265137, "rewards/rejected": -6.315625190734863, "step": 4350 }, { "epoch": 2.298365840801265, "grad_norm": 4.559015007800447, "learning_rate": 4.255403268318397e-07, "logits/chosen": -2.700000047683716, "logits/rejected": -2.7210936546325684, "logps/chosen": -343.29998779296875, "logps/rejected": -504.0, "loss": 0.035, "rewards/accuracies": 1.0, "rewards/chosen": -0.98382568359375, "rewards/margins": 5.485937595367432, "rewards/rejected": -6.470312595367432, "step": 4360 }, { "epoch": 2.3036373220875066, "grad_norm": 12.991619593604392, "learning_rate": 4.2422245651027937e-07, "logits/chosen": -2.7789063453674316, "logits/rejected": -2.707812547683716, "logps/chosen": -345.8999938964844, "logps/rejected": -440.75, "loss": 0.0464, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -1.408203125, "rewards/margins": 5.254687309265137, "rewards/rejected": -6.665625095367432, "step": 4370 }, { "epoch": 2.308908803373748, "grad_norm": 35.09712210755187, "learning_rate": 4.22904586188719e-07, "logits/chosen": -2.581249952316284, "logits/rejected": -2.612499952316284, "logps/chosen": -340.79998779296875, "logps/rejected": -437.70001220703125, "loss": 0.0642, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -1.411993384361267, "rewards/margins": 5.375, "rewards/rejected": -6.7890625, "step": 4380 }, { "epoch": 2.3141802846599893, "grad_norm": 42.85095989589048, "learning_rate": 4.2158671586715866e-07, "logits/chosen": -2.7437500953674316, "logits/rejected": -2.629687547683716, "logps/chosen": -301.8999938964844, "logps/rejected": -510.8999938964844, "loss": 0.0722, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.352636694908142, "rewards/margins": 5.684374809265137, "rewards/rejected": -7.0390625, "step": 4390 }, { "epoch": 2.319451765946231, "grad_norm": 84.07069287589054, "learning_rate": 4.2026884554559826e-07, "logits/chosen": -2.715625047683716, "logits/rejected": -2.7945313453674316, "logps/chosen": -371.3500061035156, "logps/rejected": -449.95001220703125, "loss": 0.0493, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.0964844226837158, "rewards/margins": 5.53125, "rewards/rejected": -6.622656345367432, "step": 4400 }, { "epoch": 2.3247232472324724, "grad_norm": 14.931192549475593, "learning_rate": 4.1895097522403796e-07, "logits/chosen": -2.8890624046325684, "logits/rejected": -2.784374952316284, "logps/chosen": -350.95001220703125, "logps/rejected": -503.79998779296875, "loss": 0.0656, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.4851562976837158, "rewards/margins": 5.620312690734863, "rewards/rejected": -7.103125095367432, "step": 4410 }, { "epoch": 2.329994728518714, "grad_norm": 85.69718796940019, "learning_rate": 4.1763310490247755e-07, "logits/chosen": -2.7640624046325684, "logits/rejected": -2.8203125, "logps/chosen": -347.6499938964844, "logps/rejected": -456.8999938964844, "loss": 0.0785, "rewards/accuracies": 0.96875, "rewards/chosen": -1.52392578125, "rewards/margins": 5.122656345367432, "rewards/rejected": -6.6484375, "step": 4420 }, { "epoch": 2.335266209804955, "grad_norm": 60.85010572479069, "learning_rate": 4.163152345809172e-07, "logits/chosen": -2.754687547683716, "logits/rejected": -2.6484375, "logps/chosen": -278.6000061035156, "logps/rejected": -436.8500061035156, "loss": 0.0474, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.0475585460662842, "rewards/margins": 5.778124809265137, "rewards/rejected": -6.828906059265137, "step": 4430 }, { "epoch": 2.3405376910911966, "grad_norm": 41.71088013418291, "learning_rate": 4.149973642593569e-07, "logits/chosen": -2.7757811546325684, "logits/rejected": -2.72265625, "logps/chosen": -332.6000061035156, "logps/rejected": -442.8999938964844, "loss": 0.0576, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.8545257449150085, "rewards/margins": 5.175000190734863, "rewards/rejected": -6.032031059265137, "step": 4440 }, { "epoch": 2.345809172377438, "grad_norm": 18.95859338856421, "learning_rate": 4.136794939377965e-07, "logits/chosen": -2.807812452316284, "logits/rejected": -2.775390625, "logps/chosen": -334.17498779296875, "logps/rejected": -462.8500061035156, "loss": 0.0283, "rewards/accuracies": 1.0, "rewards/chosen": -1.4170653820037842, "rewards/margins": 5.668749809265137, "rewards/rejected": -7.087500095367432, "step": 4450 }, { "epoch": 2.3510806536636797, "grad_norm": 17.845771994819287, "learning_rate": 4.1236162361623614e-07, "logits/chosen": -2.813281297683716, "logits/rejected": -2.753124952316284, "logps/chosen": -338.1499938964844, "logps/rejected": -508.3500061035156, "loss": 0.037, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.828125, "rewards/margins": 5.739062309265137, "rewards/rejected": -7.565625190734863, "step": 4460 }, { "epoch": 2.356352134949921, "grad_norm": 8.84603096253509, "learning_rate": 4.110437532946758e-07, "logits/chosen": -2.617968797683716, "logits/rejected": -2.6859374046325684, "logps/chosen": -357.25, "logps/rejected": -477.20001220703125, "loss": 0.0538, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.7726562023162842, "rewards/margins": 5.409375190734863, "rewards/rejected": -7.182812690734863, "step": 4470 }, { "epoch": 2.3616236162361623, "grad_norm": 64.85330543101293, "learning_rate": 4.0972588297311544e-07, "logits/chosen": -2.832812547683716, "logits/rejected": -2.7953124046325684, "logps/chosen": -360.1000061035156, "logps/rejected": -507.1000061035156, "loss": 0.0472, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -1.4984862804412842, "rewards/margins": 5.909375190734863, "rewards/rejected": -7.412499904632568, "step": 4480 }, { "epoch": 2.366895097522404, "grad_norm": 11.809168195407258, "learning_rate": 4.0840801265155503e-07, "logits/chosen": -2.783203125, "logits/rejected": -2.785937547683716, "logps/chosen": -333.875, "logps/rejected": -464.6000061035156, "loss": 0.0428, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.658471703529358, "rewards/margins": 5.309374809265137, "rewards/rejected": -6.970312595367432, "step": 4490 }, { "epoch": 2.3721665788086455, "grad_norm": 19.843954638856175, "learning_rate": 4.0709014232999473e-07, "logits/chosen": -2.715625047683716, "logits/rejected": -2.739062547683716, "logps/chosen": -341.79998779296875, "logps/rejected": -444.3999938964844, "loss": 0.0516, "rewards/accuracies": 1.0, "rewards/chosen": -1.4958984851837158, "rewards/margins": 5.40625, "rewards/rejected": -6.8984375, "step": 4500 }, { "epoch": 2.3774380600948866, "grad_norm": 55.09911811935526, "learning_rate": 4.0577227200843433e-07, "logits/chosen": -2.6820311546325684, "logits/rejected": -2.625781297683716, "logps/chosen": -340.5, "logps/rejected": -444.5, "loss": 0.0585, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -1.10791015625, "rewards/margins": 5.344531059265137, "rewards/rejected": -6.453125, "step": 4510 }, { "epoch": 2.382709541381128, "grad_norm": 13.861026187353996, "learning_rate": 4.0445440168687403e-07, "logits/chosen": -2.7015624046325684, "logits/rejected": -2.7359375953674316, "logps/chosen": -333.54998779296875, "logps/rejected": -452.04998779296875, "loss": 0.0429, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -1.27392578125, "rewards/margins": 5.221875190734863, "rewards/rejected": -6.496874809265137, "step": 4520 }, { "epoch": 2.3879810226673697, "grad_norm": 11.078470955095368, "learning_rate": 4.031365313653136e-07, "logits/chosen": -2.7300782203674316, "logits/rejected": -2.719531297683716, "logps/chosen": -358.75, "logps/rejected": -513.2000122070312, "loss": 0.0401, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.7580077648162842, "rewards/margins": 5.684374809265137, "rewards/rejected": -7.443749904632568, "step": 4530 }, { "epoch": 2.3932525039536108, "grad_norm": 7.702724668949863, "learning_rate": 4.0181866104375327e-07, "logits/chosen": -2.6937499046325684, "logits/rejected": -2.7320313453674316, "logps/chosen": -447.8999938964844, "logps/rejected": -523.2999877929688, "loss": 0.0501, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -1.962011694908142, "rewards/margins": 5.599218845367432, "rewards/rejected": -7.557812690734863, "step": 4540 }, { "epoch": 2.3985239852398523, "grad_norm": 41.74789410995279, "learning_rate": 4.005007907221929e-07, "logits/chosen": -2.7953124046325684, "logits/rejected": -2.792187452316284, "logps/chosen": -343.6000061035156, "logps/rejected": -480.6000061035156, "loss": 0.0468, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.9384765625, "rewards/margins": 5.79296875, "rewards/rejected": -7.720312595367432, "step": 4550 }, { "epoch": 2.403795466526094, "grad_norm": 27.638262437955493, "learning_rate": 3.9918292040063256e-07, "logits/chosen": -2.7828125953674316, "logits/rejected": -2.74609375, "logps/chosen": -344.04998779296875, "logps/rejected": -468.70001220703125, "loss": 0.0607, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.4800293445587158, "rewards/margins": 5.072656154632568, "rewards/rejected": -6.551562309265137, "step": 4560 }, { "epoch": 2.4090669478123354, "grad_norm": 21.907232153292796, "learning_rate": 3.978650500790722e-07, "logits/chosen": -2.8226561546325684, "logits/rejected": -2.75, "logps/chosen": -348.8500061035156, "logps/rejected": -506.1000061035156, "loss": 0.0476, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.436193823814392, "rewards/margins": 5.732031345367432, "rewards/rejected": -7.168749809265137, "step": 4570 }, { "epoch": 2.4143384290985765, "grad_norm": 35.31669116101055, "learning_rate": 3.9654717975751186e-07, "logits/chosen": -2.7007813453674316, "logits/rejected": -2.788281202316284, "logps/chosen": -334.95001220703125, "logps/rejected": -408.3500061035156, "loss": 0.0389, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.19873046875, "rewards/margins": 5.235937595367432, "rewards/rejected": -6.431250095367432, "step": 4580 }, { "epoch": 2.419609910384818, "grad_norm": 27.875266580008386, "learning_rate": 3.952293094359515e-07, "logits/chosen": -2.930468797683716, "logits/rejected": -2.764843702316284, "logps/chosen": -356.125, "logps/rejected": -484.1000061035156, "loss": 0.0361, "rewards/accuracies": 1.0, "rewards/chosen": -1.14324951171875, "rewards/margins": 5.561718940734863, "rewards/rejected": -6.703125, "step": 4590 }, { "epoch": 2.4248813916710596, "grad_norm": 15.216235026898278, "learning_rate": 3.939114391143911e-07, "logits/chosen": -2.7265625, "logits/rejected": -2.7171874046325684, "logps/chosen": -362.95001220703125, "logps/rejected": -444.20001220703125, "loss": 0.062, "rewards/accuracies": 0.96875, "rewards/chosen": -1.164038062095642, "rewards/margins": 5.116406440734863, "rewards/rejected": -6.284375190734863, "step": 4600 }, { "epoch": 2.4301528729573008, "grad_norm": 14.933010196971102, "learning_rate": 3.925935687928308e-07, "logits/chosen": -2.7562499046325684, "logits/rejected": -2.73828125, "logps/chosen": -348.75, "logps/rejected": -464.5, "loss": 0.0645, "rewards/accuracies": 0.96875, "rewards/chosen": -1.2326171398162842, "rewards/margins": 5.457812309265137, "rewards/rejected": -6.6953125, "step": 4610 }, { "epoch": 2.4354243542435423, "grad_norm": 20.860560503492003, "learning_rate": 3.912756984712704e-07, "logits/chosen": -2.8882813453674316, "logits/rejected": -2.846874952316284, "logps/chosen": -348.20001220703125, "logps/rejected": -443.5, "loss": 0.0673, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.01397705078125, "rewards/margins": 4.989843845367432, "rewards/rejected": -6.004687309265137, "step": 4620 }, { "epoch": 2.440695835529784, "grad_norm": 44.50595452610347, "learning_rate": 3.8995782814971004e-07, "logits/chosen": -2.7109375, "logits/rejected": -2.72265625, "logps/chosen": -356.6499938964844, "logps/rejected": -461.6000061035156, "loss": 0.0407, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.0065429210662842, "rewards/margins": 5.327343940734863, "rewards/rejected": -6.342187404632568, "step": 4630 }, { "epoch": 2.4459673168160254, "grad_norm": 12.333835921550758, "learning_rate": 3.886399578281497e-07, "logits/chosen": -2.6976561546325684, "logits/rejected": -2.7347655296325684, "logps/chosen": -362.20001220703125, "logps/rejected": -456.0, "loss": 0.0384, "rewards/accuracies": 1.0, "rewards/chosen": -0.906005859375, "rewards/margins": 5.21484375, "rewards/rejected": -6.120312690734863, "step": 4640 }, { "epoch": 2.451238798102267, "grad_norm": 53.70214293100868, "learning_rate": 3.8732208750658934e-07, "logits/chosen": -2.70703125, "logits/rejected": -2.69140625, "logps/chosen": -378.8500061035156, "logps/rejected": -491.8999938964844, "loss": 0.0434, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -0.957324206829071, "rewards/margins": 5.292187690734863, "rewards/rejected": -6.251562595367432, "step": 4650 }, { "epoch": 2.456510279388508, "grad_norm": 45.632591766271176, "learning_rate": 3.8600421718502893e-07, "logits/chosen": -2.831249952316284, "logits/rejected": -2.83203125, "logps/chosen": -314.3500061035156, "logps/rejected": -433.5, "loss": 0.059, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.2849609851837158, "rewards/margins": 5.036718845367432, "rewards/rejected": -6.319531440734863, "step": 4660 }, { "epoch": 2.4617817606747496, "grad_norm": 15.608248239196016, "learning_rate": 3.8468634686346863e-07, "logits/chosen": -2.7359375953674316, "logits/rejected": -2.7750000953674316, "logps/chosen": -373.79998779296875, "logps/rejected": -464.95001220703125, "loss": 0.034, "rewards/accuracies": 1.0, "rewards/chosen": -1.610742211341858, "rewards/margins": 5.260156154632568, "rewards/rejected": -6.878125190734863, "step": 4670 }, { "epoch": 2.467053241960991, "grad_norm": 9.243378656861372, "learning_rate": 3.8336847654190823e-07, "logits/chosen": -2.78515625, "logits/rejected": -2.83984375, "logps/chosen": -364.45001220703125, "logps/rejected": -474.6000061035156, "loss": 0.0556, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -1.2380859851837158, "rewards/margins": 5.052343845367432, "rewards/rejected": -6.290625095367432, "step": 4680 }, { "epoch": 2.4723247232472323, "grad_norm": 76.99901049980457, "learning_rate": 3.8205060622034793e-07, "logits/chosen": -2.764843702316284, "logits/rejected": -2.711718797683716, "logps/chosen": -363.8500061035156, "logps/rejected": -503.3999938964844, "loss": 0.0821, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -1.9093749523162842, "rewards/margins": 5.479687690734863, "rewards/rejected": -7.384375095367432, "step": 4690 }, { "epoch": 2.477596204533474, "grad_norm": 36.06895924240457, "learning_rate": 3.807327358987875e-07, "logits/chosen": -2.832812547683716, "logits/rejected": -2.776562452316284, "logps/chosen": -305.75, "logps/rejected": -428.8500061035156, "loss": 0.0614, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.6395995616912842, "rewards/margins": 5.036718845367432, "rewards/rejected": -6.675000190734863, "step": 4700 }, { "epoch": 2.4828676858197154, "grad_norm": 3.50501554734538, "learning_rate": 3.7941486557722717e-07, "logits/chosen": -2.792187452316284, "logits/rejected": -2.7789063453674316, "logps/chosen": -342.75, "logps/rejected": -457.29998779296875, "loss": 0.0536, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -1.5686767101287842, "rewards/margins": 5.609375, "rewards/rejected": -7.182812690734863, "step": 4710 }, { "epoch": 2.488139167105957, "grad_norm": 5.966322756446567, "learning_rate": 3.7809699525566687e-07, "logits/chosen": -2.6820311546325684, "logits/rejected": -2.803906202316284, "logps/chosen": -354.70001220703125, "logps/rejected": -452.3999938964844, "loss": 0.0395, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.5675780773162842, "rewards/margins": 5.332812309265137, "rewards/rejected": -6.8984375, "step": 4720 }, { "epoch": 2.493410648392198, "grad_norm": 32.83027763584583, "learning_rate": 3.7677912493410647e-07, "logits/chosen": -2.7828125953674316, "logits/rejected": -2.770312547683716, "logps/chosen": -356.25, "logps/rejected": -470.8999938964844, "loss": 0.0527, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.5945312976837158, "rewards/margins": 5.424218654632568, "rewards/rejected": -7.018750190734863, "step": 4730 }, { "epoch": 2.4986821296784396, "grad_norm": 10.148522243461937, "learning_rate": 3.754612546125461e-07, "logits/chosen": -2.7593750953674316, "logits/rejected": -2.7242188453674316, "logps/chosen": -341.1000061035156, "logps/rejected": -504.6000061035156, "loss": 0.0617, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -1.261376976966858, "rewards/margins": 5.515625, "rewards/rejected": -6.783593654632568, "step": 4740 }, { "epoch": 2.503953610964681, "grad_norm": 14.940814994506402, "learning_rate": 3.7414338429098576e-07, "logits/chosen": -2.7484374046325684, "logits/rejected": -2.76953125, "logps/chosen": -338.29998779296875, "logps/rejected": -464.0, "loss": 0.0422, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.33349609375, "rewards/margins": 5.357812404632568, "rewards/rejected": -6.6875, "step": 4750 }, { "epoch": 2.5092250922509223, "grad_norm": 13.12520515455798, "learning_rate": 3.728255139694254e-07, "logits/chosen": -2.6695313453674316, "logits/rejected": -2.6988282203674316, "logps/chosen": -363.70001220703125, "logps/rejected": -453.6499938964844, "loss": 0.047, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.270349144935608, "rewards/margins": 5.249218940734863, "rewards/rejected": -6.520312309265137, "step": 4760 }, { "epoch": 2.514496573537164, "grad_norm": 12.957257670617983, "learning_rate": 3.71507643647865e-07, "logits/chosen": -2.862499952316284, "logits/rejected": -2.72265625, "logps/chosen": -345.1000061035156, "logps/rejected": -434.0, "loss": 0.0592, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.3479797840118408, "rewards/margins": 5.19140625, "rewards/rejected": -6.545312404632568, "step": 4770 }, { "epoch": 2.5197680548234054, "grad_norm": 24.564942675570983, "learning_rate": 3.701897733263047e-07, "logits/chosen": -2.74609375, "logits/rejected": -2.65234375, "logps/chosen": -325.29998779296875, "logps/rejected": -456.1499938964844, "loss": 0.0572, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.1919434070587158, "rewards/margins": 5.157812595367432, "rewards/rejected": -6.349999904632568, "step": 4780 }, { "epoch": 2.525039536109647, "grad_norm": 25.605539787117237, "learning_rate": 3.688719030047443e-07, "logits/chosen": -2.765625, "logits/rejected": -2.7515625953674316, "logps/chosen": -375.29998779296875, "logps/rejected": -503.1000061035156, "loss": 0.0537, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.347802758216858, "rewards/margins": 5.3828125, "rewards/rejected": -6.728125095367432, "step": 4790 }, { "epoch": 2.5303110173958885, "grad_norm": 67.1290412686009, "learning_rate": 3.6755403268318395e-07, "logits/chosen": -2.8062500953674316, "logits/rejected": -2.8101563453674316, "logps/chosen": -364.5, "logps/rejected": -438.1000061035156, "loss": 0.0435, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.424951195716858, "rewards/margins": 5.154687404632568, "rewards/rejected": -6.572656154632568, "step": 4800 }, { "epoch": 2.5355824986821296, "grad_norm": 24.314667306942322, "learning_rate": 3.662361623616236e-07, "logits/chosen": -2.81640625, "logits/rejected": -2.792187452316284, "logps/chosen": -352.54998779296875, "logps/rejected": -465.20001220703125, "loss": 0.0415, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.5646483898162842, "rewards/margins": 5.396874904632568, "rewards/rejected": -6.962500095367432, "step": 4810 }, { "epoch": 2.540853979968371, "grad_norm": 3.5506065741346466, "learning_rate": 3.6491829204006324e-07, "logits/chosen": -2.733593702316284, "logits/rejected": -2.7210936546325684, "logps/chosen": -344.375, "logps/rejected": -481.6499938964844, "loss": 0.0515, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -1.567285180091858, "rewards/margins": 5.999218940734863, "rewards/rejected": -7.564062595367432, "step": 4820 }, { "epoch": 2.5461254612546127, "grad_norm": 35.60243522923207, "learning_rate": 3.6360042171850284e-07, "logits/chosen": -2.6937499046325684, "logits/rejected": -2.75, "logps/chosen": -318.70001220703125, "logps/rejected": -428.29998779296875, "loss": 0.0405, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.29052734375, "rewards/margins": 5.401562690734863, "rewards/rejected": -6.693749904632568, "step": 4830 }, { "epoch": 2.551396942540854, "grad_norm": 83.06488352710592, "learning_rate": 3.6228255139694254e-07, "logits/chosen": -2.6832032203674316, "logits/rejected": -2.680468797683716, "logps/chosen": -325.29998779296875, "logps/rejected": -409.1000061035156, "loss": 0.0449, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.30810546875, "rewards/margins": 5.529687404632568, "rewards/rejected": -6.8359375, "step": 4840 }, { "epoch": 2.5566684238270954, "grad_norm": 23.365016094391528, "learning_rate": 3.609646810753822e-07, "logits/chosen": -2.7265625, "logits/rejected": -2.714062452316284, "logps/chosen": -341.54998779296875, "logps/rejected": -465.1499938964844, "loss": 0.0607, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -1.647851586341858, "rewards/margins": 5.278124809265137, "rewards/rejected": -6.928124904632568, "step": 4850 }, { "epoch": 2.561939905113337, "grad_norm": 25.925772330037876, "learning_rate": 3.5964681075382183e-07, "logits/chosen": -2.7359375953674316, "logits/rejected": -2.753124952316284, "logps/chosen": -361.75, "logps/rejected": -472.8999938964844, "loss": 0.049, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.7190430164337158, "rewards/margins": 5.8984375, "rewards/rejected": -7.6171875, "step": 4860 }, { "epoch": 2.5672113863995785, "grad_norm": 16.240346299202812, "learning_rate": 3.583289404322615e-07, "logits/chosen": -2.698046922683716, "logits/rejected": -2.6929688453674316, "logps/chosen": -386.125, "logps/rejected": -461.70001220703125, "loss": 0.0731, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -1.4542968273162842, "rewards/margins": 5.049218654632568, "rewards/rejected": -6.5, "step": 4870 }, { "epoch": 2.5724828676858196, "grad_norm": 13.922183632347817, "learning_rate": 3.570110701107011e-07, "logits/chosen": -2.748046875, "logits/rejected": -2.840625047683716, "logps/chosen": -341.8500061035156, "logps/rejected": -423.6000061035156, "loss": 0.0831, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -1.625097632408142, "rewards/margins": 4.877343654632568, "rewards/rejected": -6.506249904632568, "step": 4880 }, { "epoch": 2.577754348972061, "grad_norm": 14.520597037625842, "learning_rate": 3.556931997891408e-07, "logits/chosen": -2.758593797683716, "logits/rejected": -2.832812547683716, "logps/chosen": -321.95001220703125, "logps/rejected": -391.75, "loss": 0.067, "rewards/accuracies": 0.96875, "rewards/chosen": -1.26904296875, "rewards/margins": 5.2109375, "rewards/rejected": -6.474999904632568, "step": 4890 }, { "epoch": 2.5830258302583027, "grad_norm": 9.4909683777504, "learning_rate": 3.5437532946758037e-07, "logits/chosen": -2.7867188453674316, "logits/rejected": -2.7249999046325684, "logps/chosen": -320.5, "logps/rejected": -454.3999938964844, "loss": 0.0369, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.842675805091858, "rewards/margins": 5.48828125, "rewards/rejected": -7.337500095367432, "step": 4900 }, { "epoch": 2.588297311544544, "grad_norm": 8.971686885440647, "learning_rate": 3.5305745914602e-07, "logits/chosen": -2.7406249046325684, "logits/rejected": -2.766406297683716, "logps/chosen": -396.54998779296875, "logps/rejected": -458.6000061035156, "loss": 0.0549, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.4072387218475342, "rewards/margins": 5.14453125, "rewards/rejected": -6.552343845367432, "step": 4910 }, { "epoch": 2.5935687928307853, "grad_norm": 44.97646746609553, "learning_rate": 3.5173958882445966e-07, "logits/chosen": -2.727734327316284, "logits/rejected": -2.77734375, "logps/chosen": -318.3500061035156, "logps/rejected": -465.79998779296875, "loss": 0.0453, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.2430908679962158, "rewards/margins": 5.453125, "rewards/rejected": -6.6953125, "step": 4920 }, { "epoch": 2.598840274117027, "grad_norm": 42.44256959516209, "learning_rate": 3.504217185028993e-07, "logits/chosen": -2.7593750953674316, "logits/rejected": -2.7734375, "logps/chosen": -342.45001220703125, "logps/rejected": -463.1000061035156, "loss": 0.0634, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.366583228111267, "rewards/margins": 5.536718845367432, "rewards/rejected": -6.901562690734863, "step": 4930 }, { "epoch": 2.6041117554032684, "grad_norm": 11.08660353189701, "learning_rate": 3.491038481813389e-07, "logits/chosen": -2.684765577316284, "logits/rejected": -2.71875, "logps/chosen": -343.6499938964844, "logps/rejected": -445.95001220703125, "loss": 0.037, "rewards/accuracies": 1.0, "rewards/chosen": -1.266210913658142, "rewards/margins": 5.560156345367432, "rewards/rejected": -6.826562404632568, "step": 4940 }, { "epoch": 2.60938323668951, "grad_norm": 39.94258173071973, "learning_rate": 3.477859778597786e-07, "logits/chosen": -2.805468797683716, "logits/rejected": -2.7632813453674316, "logps/chosen": -321.70001220703125, "logps/rejected": -473.79998779296875, "loss": 0.0528, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.4453613758087158, "rewards/margins": 5.768750190734863, "rewards/rejected": -7.217187404632568, "step": 4950 }, { "epoch": 2.614654717975751, "grad_norm": 42.92271004934685, "learning_rate": 3.464681075382182e-07, "logits/chosen": -2.746875047683716, "logits/rejected": -2.7367186546325684, "logps/chosen": -351.1000061035156, "logps/rejected": -488.5, "loss": 0.0613, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.6325194835662842, "rewards/margins": 5.708593845367432, "rewards/rejected": -7.339062690734863, "step": 4960 }, { "epoch": 2.6199261992619927, "grad_norm": 14.725577031435815, "learning_rate": 3.4515023721665785e-07, "logits/chosen": -2.7542967796325684, "logits/rejected": -2.764843702316284, "logps/chosen": -323.20001220703125, "logps/rejected": -437.1000061035156, "loss": 0.0455, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.459570288658142, "rewards/margins": 5.327343940734863, "rewards/rejected": -6.784375190734863, "step": 4970 }, { "epoch": 2.625197680548234, "grad_norm": 67.65533922456822, "learning_rate": 3.438323668950975e-07, "logits/chosen": -2.741406202316284, "logits/rejected": -2.734375, "logps/chosen": -351.1499938964844, "logps/rejected": -491.79998779296875, "loss": 0.0475, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.0550780296325684, "rewards/margins": 5.16015625, "rewards/rejected": -7.2109375, "step": 4980 }, { "epoch": 2.6304691618344753, "grad_norm": 7.2401782589647175, "learning_rate": 3.4251449657353714e-07, "logits/chosen": -2.842968702316284, "logits/rejected": -2.8226561546325684, "logps/chosen": -330.95001220703125, "logps/rejected": -445.70001220703125, "loss": 0.052, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.7470703125, "rewards/margins": 5.329687595367432, "rewards/rejected": -7.079687595367432, "step": 4990 }, { "epoch": 2.635740643120717, "grad_norm": 5.623898379079398, "learning_rate": 3.411966262519768e-07, "logits/chosen": -2.7867188453674316, "logits/rejected": -2.762500047683716, "logps/chosen": -373.8500061035156, "logps/rejected": -488.20001220703125, "loss": 0.0564, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.80029296875, "rewards/margins": 4.992968559265137, "rewards/rejected": -6.793749809265137, "step": 5000 }, { "epoch": 2.6410121244069584, "grad_norm": 45.65639525736731, "learning_rate": 3.3987875593041644e-07, "logits/chosen": -2.8382811546325684, "logits/rejected": -2.828906297683716, "logps/chosen": -359.0, "logps/rejected": -499.1000061035156, "loss": 0.0414, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.773339867591858, "rewards/margins": 5.675000190734863, "rewards/rejected": -7.456250190734863, "step": 5010 }, { "epoch": 2.6462836056932, "grad_norm": 22.122961512612928, "learning_rate": 3.385608856088561e-07, "logits/chosen": -2.809375047683716, "logits/rejected": -2.731250047683716, "logps/chosen": -384.3500061035156, "logps/rejected": -520.7999877929688, "loss": 0.037, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.8474609851837158, "rewards/margins": 5.739843845367432, "rewards/rejected": -7.581250190734863, "step": 5020 }, { "epoch": 2.651555086979441, "grad_norm": 44.35539447893372, "learning_rate": 3.372430152872957e-07, "logits/chosen": -2.8148436546325684, "logits/rejected": -2.7007813453674316, "logps/chosen": -336.5, "logps/rejected": -504.20001220703125, "loss": 0.0424, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.7267577648162842, "rewards/margins": 5.543749809265137, "rewards/rejected": -7.264062404632568, "step": 5030 }, { "epoch": 2.6568265682656826, "grad_norm": 26.666784121023408, "learning_rate": 3.359251449657354e-07, "logits/chosen": -2.8062500953674316, "logits/rejected": -2.788281202316284, "logps/chosen": -393.29998779296875, "logps/rejected": -457.04998779296875, "loss": 0.0744, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.6349608898162842, "rewards/margins": 5.224999904632568, "rewards/rejected": -6.862500190734863, "step": 5040 }, { "epoch": 2.662098049551924, "grad_norm": 89.46010924779519, "learning_rate": 3.34607274644175e-07, "logits/chosen": -2.7125000953674316, "logits/rejected": -2.75, "logps/chosen": -357.3999938964844, "logps/rejected": -470.8999938964844, "loss": 0.0637, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -1.404394507408142, "rewards/margins": 5.34375, "rewards/rejected": -6.748437404632568, "step": 5050 }, { "epoch": 2.6673695308381653, "grad_norm": 83.07072661614811, "learning_rate": 3.332894043226147e-07, "logits/chosen": -2.8804688453674316, "logits/rejected": -2.7867188453674316, "logps/chosen": -338.3500061035156, "logps/rejected": -482.3999938964844, "loss": 0.05, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.1723144054412842, "rewards/margins": 5.632031440734863, "rewards/rejected": -6.806250095367432, "step": 5060 }, { "epoch": 2.672641012124407, "grad_norm": 13.677554297603393, "learning_rate": 3.3197153400105427e-07, "logits/chosen": -2.6898436546325684, "logits/rejected": -2.71484375, "logps/chosen": -382.45001220703125, "logps/rejected": -472.54998779296875, "loss": 0.0535, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.8511536121368408, "rewards/margins": 5.496874809265137, "rewards/rejected": -7.3515625, "step": 5070 }, { "epoch": 2.6779124934106484, "grad_norm": 13.380100468438659, "learning_rate": 3.306536636794939e-07, "logits/chosen": -2.82421875, "logits/rejected": -2.733593702316284, "logps/chosen": -323.1000061035156, "logps/rejected": -447.79998779296875, "loss": 0.0438, "rewards/accuracies": 1.0, "rewards/chosen": -1.439916968345642, "rewards/margins": 5.555468559265137, "rewards/rejected": -6.993750095367432, "step": 5080 }, { "epoch": 2.68318397469689, "grad_norm": 60.265792989882144, "learning_rate": 3.2933579335793357e-07, "logits/chosen": -2.6351561546325684, "logits/rejected": -2.733593702316284, "logps/chosen": -317.0, "logps/rejected": -432.3500061035156, "loss": 0.0936, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -1.582177758216858, "rewards/margins": 5.043749809265137, "rewards/rejected": -6.623437404632568, "step": 5090 }, { "epoch": 2.6884554559831315, "grad_norm": 34.79669891459443, "learning_rate": 3.280179230363732e-07, "logits/chosen": -2.608593702316284, "logits/rejected": -2.671875, "logps/chosen": -374.3999938964844, "logps/rejected": -491.1000061035156, "loss": 0.0641, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -2.0859375, "rewards/margins": 5.557812690734863, "rewards/rejected": -7.642187595367432, "step": 5100 }, { "epoch": 2.6937269372693726, "grad_norm": 22.17329485588596, "learning_rate": 3.267000527148128e-07, "logits/chosen": -2.7476563453674316, "logits/rejected": -2.7085938453674316, "logps/chosen": -368.54998779296875, "logps/rejected": -502.8999938964844, "loss": 0.0349, "rewards/accuracies": 1.0, "rewards/chosen": -2.4488282203674316, "rewards/margins": 5.587500095367432, "rewards/rejected": -8.035937309265137, "step": 5110 }, { "epoch": 2.698998418555614, "grad_norm": 27.797366803779234, "learning_rate": 3.253821823932525e-07, "logits/chosen": -2.6976561546325684, "logits/rejected": -2.6507811546325684, "logps/chosen": -379.45001220703125, "logps/rejected": -509.29998779296875, "loss": 0.0601, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.356640577316284, "rewards/margins": 5.278906345367432, "rewards/rejected": -7.639062404632568, "step": 5120 }, { "epoch": 2.7042698998418553, "grad_norm": 61.15507840630094, "learning_rate": 3.2406431207169216e-07, "logits/chosen": -2.6742186546325684, "logits/rejected": -2.7164063453674316, "logps/chosen": -320.79998779296875, "logps/rejected": -462.5, "loss": 0.0579, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -2.155078172683716, "rewards/margins": 5.224999904632568, "rewards/rejected": -7.378125190734863, "step": 5130 }, { "epoch": 2.709541381128097, "grad_norm": 10.625021699107187, "learning_rate": 3.2274644175013175e-07, "logits/chosen": -2.6640625, "logits/rejected": -2.717968702316284, "logps/chosen": -299.6499938964844, "logps/rejected": -404.8500061035156, "loss": 0.058, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -1.75927734375, "rewards/margins": 5.167187690734863, "rewards/rejected": -6.921875, "step": 5140 }, { "epoch": 2.7148128624143384, "grad_norm": 18.90957447767202, "learning_rate": 3.2142857142857145e-07, "logits/chosen": -2.7125000953674316, "logits/rejected": -2.817187547683716, "logps/chosen": -336.70001220703125, "logps/rejected": -446.70001220703125, "loss": 0.0531, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.0277342796325684, "rewards/margins": 5.581250190734863, "rewards/rejected": -7.604687690734863, "step": 5150 }, { "epoch": 2.72008434370058, "grad_norm": 49.32590770968461, "learning_rate": 3.2011070110701105e-07, "logits/chosen": -2.6976561546325684, "logits/rejected": -2.6953125, "logps/chosen": -341.95001220703125, "logps/rejected": -449.29998779296875, "loss": 0.0739, "rewards/accuracies": 0.96875, "rewards/chosen": -1.748925805091858, "rewards/margins": 5.1875, "rewards/rejected": -6.935937404632568, "step": 5160 }, { "epoch": 2.7253558249868215, "grad_norm": 50.77056367583863, "learning_rate": 3.187928307854507e-07, "logits/chosen": -2.860156297683716, "logits/rejected": -2.8421874046325684, "logps/chosen": -329.95001220703125, "logps/rejected": -456.70001220703125, "loss": 0.068, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.073046922683716, "rewards/margins": 5.303124904632568, "rewards/rejected": -7.384375095367432, "step": 5170 }, { "epoch": 2.7306273062730626, "grad_norm": 15.237247828126689, "learning_rate": 3.1747496046389034e-07, "logits/chosen": -2.7007813453674316, "logits/rejected": -2.6695313453674316, "logps/chosen": -319.6000061035156, "logps/rejected": -468.79998779296875, "loss": 0.0335, "rewards/accuracies": 1.0, "rewards/chosen": -1.77880859375, "rewards/margins": 5.896874904632568, "rewards/rejected": -7.667187690734863, "step": 5180 }, { "epoch": 2.735898787559304, "grad_norm": 20.463645579617804, "learning_rate": 3.1615709014233e-07, "logits/chosen": -2.961718797683716, "logits/rejected": -2.8046875, "logps/chosen": -355.0, "logps/rejected": -481.8999938964844, "loss": 0.0449, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.0609374046325684, "rewards/margins": 5.053906440734863, "rewards/rejected": -7.115624904632568, "step": 5190 }, { "epoch": 2.7411702688455457, "grad_norm": 17.850471168160915, "learning_rate": 3.148392198207696e-07, "logits/chosen": -2.8617186546325684, "logits/rejected": -2.8804688453674316, "logps/chosen": -348.3500061035156, "logps/rejected": -454.29998779296875, "loss": 0.038, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.922265648841858, "rewards/margins": 5.3984375, "rewards/rejected": -7.321875095367432, "step": 5200 }, { "epoch": 2.746441750131787, "grad_norm": 19.45384664434194, "learning_rate": 3.135213494992093e-07, "logits/chosen": -2.90234375, "logits/rejected": -2.893749952316284, "logps/chosen": -332.95001220703125, "logps/rejected": -451.5, "loss": 0.0502, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.715234398841858, "rewards/margins": 5.503125190734863, "rewards/rejected": -7.217187404632568, "step": 5210 }, { "epoch": 2.7517132314180284, "grad_norm": 50.65310834545226, "learning_rate": 3.122034791776489e-07, "logits/chosen": -2.68359375, "logits/rejected": -2.76171875, "logps/chosen": -341.04998779296875, "logps/rejected": -446.95001220703125, "loss": 0.0467, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.673242211341858, "rewards/margins": 5.391406059265137, "rewards/rejected": -7.067187309265137, "step": 5220 }, { "epoch": 2.75698471270427, "grad_norm": 35.41127642273708, "learning_rate": 3.108856088560886e-07, "logits/chosen": -2.766406297683716, "logits/rejected": -2.835156202316284, "logps/chosen": -368.1499938964844, "logps/rejected": -476.20001220703125, "loss": 0.0407, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.921484351158142, "rewards/margins": 5.4375, "rewards/rejected": -7.362500190734863, "step": 5230 }, { "epoch": 2.7622561939905115, "grad_norm": 50.75461339082073, "learning_rate": 3.095677385345282e-07, "logits/chosen": -2.8851561546325684, "logits/rejected": -2.772656202316284, "logps/chosen": -347.45001220703125, "logps/rejected": -456.6000061035156, "loss": 0.058, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -1.9386718273162842, "rewards/margins": 5.599999904632568, "rewards/rejected": -7.535937309265137, "step": 5240 }, { "epoch": 2.767527675276753, "grad_norm": 35.22875838009829, "learning_rate": 3.082498682129678e-07, "logits/chosen": -2.83984375, "logits/rejected": -2.835156202316284, "logps/chosen": -352.6499938964844, "logps/rejected": -483.20001220703125, "loss": 0.0653, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.56201171875, "rewards/margins": 5.4609375, "rewards/rejected": -7.025000095367432, "step": 5250 }, { "epoch": 2.772799156562994, "grad_norm": 28.63000459713162, "learning_rate": 3.0693199789140747e-07, "logits/chosen": -2.7953124046325684, "logits/rejected": -2.8179688453674316, "logps/chosen": -348.04998779296875, "logps/rejected": -475.0, "loss": 0.07, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -1.819055199623108, "rewards/margins": 5.448437690734863, "rewards/rejected": -7.260156154632568, "step": 5260 }, { "epoch": 2.7780706378492357, "grad_norm": 43.425599562367125, "learning_rate": 3.056141275698471e-07, "logits/chosen": -2.7640624046325684, "logits/rejected": -2.7593750953674316, "logps/chosen": -394.6000061035156, "logps/rejected": -497.1000061035156, "loss": 0.0741, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.787695288658142, "rewards/margins": 5.107031345367432, "rewards/rejected": -6.896874904632568, "step": 5270 }, { "epoch": 2.783342119135477, "grad_norm": 12.382065661790724, "learning_rate": 3.0429625724828676e-07, "logits/chosen": -2.7867188453674316, "logits/rejected": -2.737499952316284, "logps/chosen": -312.2749938964844, "logps/rejected": -465.8999938964844, "loss": 0.0404, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.6085205078125, "rewards/margins": 5.16796875, "rewards/rejected": -6.778124809265137, "step": 5280 }, { "epoch": 2.7886136004217184, "grad_norm": 27.234234416326103, "learning_rate": 3.029783869267264e-07, "logits/chosen": -2.828906297683716, "logits/rejected": -2.854687452316284, "logps/chosen": -323.0, "logps/rejected": -421.1000061035156, "loss": 0.0575, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.3591735363006592, "rewards/margins": 4.90625, "rewards/rejected": -6.264062404632568, "step": 5290 }, { "epoch": 2.79388508170796, "grad_norm": 47.628935115149794, "learning_rate": 3.0166051660516606e-07, "logits/chosen": -2.7953124046325684, "logits/rejected": -2.789843797683716, "logps/chosen": -341.32501220703125, "logps/rejected": -473.8999938964844, "loss": 0.045, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.275390625, "rewards/margins": 5.474999904632568, "rewards/rejected": -6.75, "step": 5300 }, { "epoch": 2.7991565629942015, "grad_norm": 14.953734571581364, "learning_rate": 3.0034264628360565e-07, "logits/chosen": -2.7789063453674316, "logits/rejected": -2.807812452316284, "logps/chosen": -295.70001220703125, "logps/rejected": -422.79998779296875, "loss": 0.0426, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.479882836341858, "rewards/margins": 5.194531440734863, "rewards/rejected": -6.667187690734863, "step": 5310 }, { "epoch": 2.804428044280443, "grad_norm": 18.543477753135118, "learning_rate": 2.9902477596204535e-07, "logits/chosen": -2.7578125, "logits/rejected": -2.789843797683716, "logps/chosen": -361.6000061035156, "logps/rejected": -455.70001220703125, "loss": 0.0416, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.374609351158142, "rewards/margins": 5.3984375, "rewards/rejected": -6.775000095367432, "step": 5320 }, { "epoch": 2.809699525566684, "grad_norm": 60.209548097780626, "learning_rate": 2.9770690564048495e-07, "logits/chosen": -2.8031249046325684, "logits/rejected": -2.7835936546325684, "logps/chosen": -326.79998779296875, "logps/rejected": -430.70001220703125, "loss": 0.0662, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.2931029796600342, "rewards/margins": 5.046875, "rewards/rejected": -6.348437309265137, "step": 5330 }, { "epoch": 2.8149710068529257, "grad_norm": 86.63289135849611, "learning_rate": 2.963890353189246e-07, "logits/chosen": -2.8335938453674316, "logits/rejected": -2.752734422683716, "logps/chosen": -320.8999938964844, "logps/rejected": -427.8999938964844, "loss": 0.0847, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -1.5079224109649658, "rewards/margins": 5.106249809265137, "rewards/rejected": -6.615624904632568, "step": 5340 }, { "epoch": 2.8202424881391672, "grad_norm": 16.288712793189113, "learning_rate": 2.9507116499736424e-07, "logits/chosen": -2.895312547683716, "logits/rejected": -2.8296875953674316, "logps/chosen": -298.25, "logps/rejected": -423.6000061035156, "loss": 0.0724, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.4730956554412842, "rewards/margins": 4.94921875, "rewards/rejected": -6.4296875, "step": 5350 }, { "epoch": 2.8255139694254083, "grad_norm": 3.909342686290173, "learning_rate": 2.937532946758039e-07, "logits/chosen": -2.8929686546325684, "logits/rejected": -2.875781297683716, "logps/chosen": -393.20001220703125, "logps/rejected": -489.3500061035156, "loss": 0.0309, "rewards/accuracies": 1.0, "rewards/chosen": -1.2591552734375, "rewards/margins": 5.300000190734863, "rewards/rejected": -6.556250095367432, "step": 5360 }, { "epoch": 2.83078545071165, "grad_norm": 41.466014252341, "learning_rate": 2.924354243542435e-07, "logits/chosen": -2.866406202316284, "logits/rejected": -2.8218750953674316, "logps/chosen": -369.1000061035156, "logps/rejected": -503.79998779296875, "loss": 0.0449, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -1.7771484851837158, "rewards/margins": 5.487500190734863, "rewards/rejected": -7.265625, "step": 5370 }, { "epoch": 2.8360569319978914, "grad_norm": 6.848787532007887, "learning_rate": 2.911175540326832e-07, "logits/chosen": -2.782421827316284, "logits/rejected": -2.7640624046325684, "logps/chosen": -353.6499938964844, "logps/rejected": -441.20001220703125, "loss": 0.0785, "rewards/accuracies": 0.96875, "rewards/chosen": -1.659765601158142, "rewards/margins": 5.0625, "rewards/rejected": -6.721875190734863, "step": 5380 }, { "epoch": 2.841328413284133, "grad_norm": 13.861634411805793, "learning_rate": 2.897996837111228e-07, "logits/chosen": -2.8101563453674316, "logits/rejected": -2.836718797683716, "logps/chosen": -321.1499938964844, "logps/rejected": -427.29998779296875, "loss": 0.0446, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.70849609375, "rewards/margins": 5.3359375, "rewards/rejected": -7.048437595367432, "step": 5390 }, { "epoch": 2.8465998945703745, "grad_norm": 17.036574291016507, "learning_rate": 2.884818133895625e-07, "logits/chosen": -2.7671875953674316, "logits/rejected": -2.766406297683716, "logps/chosen": -385.1000061035156, "logps/rejected": -456.8500061035156, "loss": 0.0722, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -2.0111327171325684, "rewards/margins": 5.033593654632568, "rewards/rejected": -7.050000190734863, "step": 5400 }, { "epoch": 2.8518713758566157, "grad_norm": 47.346864838562595, "learning_rate": 2.871639430680021e-07, "logits/chosen": -2.764843702316284, "logits/rejected": -2.6226563453674316, "logps/chosen": -377.1499938964844, "logps/rejected": -541.2000122070312, "loss": 0.0418, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.9197266101837158, "rewards/margins": 6.103125095367432, "rewards/rejected": -8.015625, "step": 5410 }, { "epoch": 2.857142857142857, "grad_norm": 16.315907192464334, "learning_rate": 2.858460727464417e-07, "logits/chosen": -2.8031249046325684, "logits/rejected": -2.7484374046325684, "logps/chosen": -403.95001220703125, "logps/rejected": -512.4000244140625, "loss": 0.0487, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -1.9660155773162842, "rewards/margins": 5.568749904632568, "rewards/rejected": -7.540625095367432, "step": 5420 }, { "epoch": 2.8624143384290983, "grad_norm": 17.923243990627824, "learning_rate": 2.845282024248814e-07, "logits/chosen": -2.7109375, "logits/rejected": -2.7632813453674316, "logps/chosen": -331.8999938964844, "logps/rejected": -467.5, "loss": 0.045, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.74462890625, "rewards/margins": 5.622656345367432, "rewards/rejected": -7.362500190734863, "step": 5430 }, { "epoch": 2.86768581971534, "grad_norm": 15.164948550526848, "learning_rate": 2.83210332103321e-07, "logits/chosen": -2.9000000953674316, "logits/rejected": -2.9359374046325684, "logps/chosen": -376.29998779296875, "logps/rejected": -458.45001220703125, "loss": 0.0459, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.5576660633087158, "rewards/margins": 5.47265625, "rewards/rejected": -7.026562690734863, "step": 5440 }, { "epoch": 2.8729573010015814, "grad_norm": 4.140145074846663, "learning_rate": 2.8189246178176067e-07, "logits/chosen": -2.78125, "logits/rejected": -2.7632813453674316, "logps/chosen": -370.29998779296875, "logps/rejected": -517.0999755859375, "loss": 0.045, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.4609253406524658, "rewards/margins": 5.598437309265137, "rewards/rejected": -7.0625, "step": 5450 }, { "epoch": 2.878228782287823, "grad_norm": 17.2986171293935, "learning_rate": 2.805745914602003e-07, "logits/chosen": -2.8648438453674316, "logits/rejected": -2.7867188453674316, "logps/chosen": -345.75, "logps/rejected": -461.75, "loss": 0.0475, "rewards/accuracies": 1.0, "rewards/chosen": -1.426171898841858, "rewards/margins": 5.298437595367432, "rewards/rejected": -6.729687690734863, "step": 5460 }, { "epoch": 2.8835002635740645, "grad_norm": 22.23525832456, "learning_rate": 2.7925672113863996e-07, "logits/chosen": -2.73046875, "logits/rejected": -2.796093702316284, "logps/chosen": -392.04998779296875, "logps/rejected": -500.79998779296875, "loss": 0.043, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.3575196266174316, "rewards/margins": 5.442968845367432, "rewards/rejected": -7.803124904632568, "step": 5470 }, { "epoch": 2.8887717448603056, "grad_norm": 36.40366133426469, "learning_rate": 2.7793885081707956e-07, "logits/chosen": -2.952343702316284, "logits/rejected": -2.8343749046325684, "logps/chosen": -326.04998779296875, "logps/rejected": -438.3999938964844, "loss": 0.1173, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -1.827734351158142, "rewards/margins": 5.22265625, "rewards/rejected": -7.052343845367432, "step": 5480 }, { "epoch": 2.894043226146547, "grad_norm": 18.11686779133397, "learning_rate": 2.7662098049551926e-07, "logits/chosen": -2.8890624046325684, "logits/rejected": -2.852343797683716, "logps/chosen": -340.6000061035156, "logps/rejected": -476.79998779296875, "loss": 0.0532, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -1.7267577648162842, "rewards/margins": 5.5546875, "rewards/rejected": -7.282812595367432, "step": 5490 }, { "epoch": 2.8993147074327887, "grad_norm": 33.933760897992606, "learning_rate": 2.7530311017395885e-07, "logits/chosen": -2.789843797683716, "logits/rejected": -2.76953125, "logps/chosen": -316.1000061035156, "logps/rejected": -452.79998779296875, "loss": 0.0358, "rewards/accuracies": 1.0, "rewards/chosen": -1.4248046875, "rewards/margins": 5.34765625, "rewards/rejected": -6.771874904632568, "step": 5500 }, { "epoch": 2.90458618871903, "grad_norm": 42.11610098725784, "learning_rate": 2.739852398523985e-07, "logits/chosen": -2.78125, "logits/rejected": -2.827343702316284, "logps/chosen": -365.5, "logps/rejected": -476.79998779296875, "loss": 0.0496, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.7512695789337158, "rewards/margins": 5.810156345367432, "rewards/rejected": -7.5546875, "step": 5510 }, { "epoch": 2.9098576700052714, "grad_norm": 30.891286673217504, "learning_rate": 2.7266736953083815e-07, "logits/chosen": -2.8515625, "logits/rejected": -2.832812547683716, "logps/chosen": -322.6499938964844, "logps/rejected": -438.5, "loss": 0.0394, "rewards/accuracies": 1.0, "rewards/chosen": -1.4208984375, "rewards/margins": 5.235937595367432, "rewards/rejected": -6.657812595367432, "step": 5520 }, { "epoch": 2.915129151291513, "grad_norm": 28.07298361749741, "learning_rate": 2.713494992092778e-07, "logits/chosen": -2.9046874046325684, "logits/rejected": -2.9117188453674316, "logps/chosen": -362.1000061035156, "logps/rejected": -454.1499938964844, "loss": 0.0423, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.2936522960662842, "rewards/margins": 5.361718654632568, "rewards/rejected": -6.660937309265137, "step": 5530 }, { "epoch": 2.9204006325777545, "grad_norm": 49.33357539747382, "learning_rate": 2.700316288877174e-07, "logits/chosen": -2.784374952316284, "logits/rejected": -2.760937452316284, "logps/chosen": -353.45001220703125, "logps/rejected": -466.8999938964844, "loss": 0.0491, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -1.720605492591858, "rewards/margins": 5.129687309265137, "rewards/rejected": -6.846875190734863, "step": 5540 }, { "epoch": 2.925672113863996, "grad_norm": 10.107077951978281, "learning_rate": 2.687137585661571e-07, "logits/chosen": -2.75, "logits/rejected": -2.8031249046325684, "logps/chosen": -367.6000061035156, "logps/rejected": -499.1499938964844, "loss": 0.0446, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.6804687976837158, "rewards/margins": 5.5546875, "rewards/rejected": -7.235937595367432, "step": 5550 }, { "epoch": 2.930943595150237, "grad_norm": 14.271012577425303, "learning_rate": 2.6739588824459674e-07, "logits/chosen": -2.8179688453674316, "logits/rejected": -2.7671875953674316, "logps/chosen": -331.54998779296875, "logps/rejected": -447.5, "loss": 0.0549, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.803857445716858, "rewards/margins": 5.6875, "rewards/rejected": -7.487500190734863, "step": 5560 }, { "epoch": 2.9362150764364787, "grad_norm": 26.698148266104443, "learning_rate": 2.660780179230364e-07, "logits/chosen": -2.8070311546325684, "logits/rejected": -2.7984375953674316, "logps/chosen": -294.95001220703125, "logps/rejected": -399.3999938964844, "loss": 0.07, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.558203101158142, "rewards/margins": 4.657031059265137, "rewards/rejected": -6.214062690734863, "step": 5570 }, { "epoch": 2.94148655772272, "grad_norm": 17.623923554603845, "learning_rate": 2.6476014760147603e-07, "logits/chosen": -2.801562547683716, "logits/rejected": -2.696093797683716, "logps/chosen": -318.29998779296875, "logps/rejected": -420.0, "loss": 0.0672, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -1.407617211341858, "rewards/margins": 4.981249809265137, "rewards/rejected": -6.384375095367432, "step": 5580 }, { "epoch": 2.9467580390089614, "grad_norm": 22.26095020605877, "learning_rate": 2.634422772799156e-07, "logits/chosen": -2.782031297683716, "logits/rejected": -2.784374952316284, "logps/chosen": -362.3999938964844, "logps/rejected": -460.6000061035156, "loss": 0.0607, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.7392578125, "rewards/margins": 4.737500190734863, "rewards/rejected": -6.478906154632568, "step": 5590 }, { "epoch": 2.952029520295203, "grad_norm": 24.399794177158917, "learning_rate": 2.6212440695835533e-07, "logits/chosen": -2.8257813453674316, "logits/rejected": -2.828906297683716, "logps/chosen": -317.04998779296875, "logps/rejected": -436.3999938964844, "loss": 0.0494, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.3290283679962158, "rewards/margins": 5.771874904632568, "rewards/rejected": -7.099999904632568, "step": 5600 }, { "epoch": 2.9573010015814445, "grad_norm": 7.656564645669014, "learning_rate": 2.608065366367949e-07, "logits/chosen": -2.846874952316284, "logits/rejected": -2.8218750953674316, "logps/chosen": -403.3999938964844, "logps/rejected": -468.0, "loss": 0.0412, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.36962890625, "rewards/margins": 5.215624809265137, "rewards/rejected": -6.587500095367432, "step": 5610 }, { "epoch": 2.962572482867686, "grad_norm": 17.584907813283262, "learning_rate": 2.5948866631523457e-07, "logits/chosen": -2.745312452316284, "logits/rejected": -2.7421875, "logps/chosen": -325.79998779296875, "logps/rejected": -450.0, "loss": 0.0372, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.5496094226837158, "rewards/margins": 5.56640625, "rewards/rejected": -7.121874809265137, "step": 5620 }, { "epoch": 2.967843964153927, "grad_norm": 56.82786903415386, "learning_rate": 2.581707959936742e-07, "logits/chosen": -2.817187547683716, "logits/rejected": -2.828906297683716, "logps/chosen": -404.79998779296875, "logps/rejected": -496.20001220703125, "loss": 0.0493, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -1.7392578125, "rewards/margins": 5.57421875, "rewards/rejected": -7.315625190734863, "step": 5630 }, { "epoch": 2.9731154454401687, "grad_norm": 33.95158889142447, "learning_rate": 2.5685292567211386e-07, "logits/chosen": -2.813281297683716, "logits/rejected": -2.797656297683716, "logps/chosen": -407.5, "logps/rejected": -510.20001220703125, "loss": 0.0396, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.39453125, "rewards/margins": 5.879687309265137, "rewards/rejected": -8.274999618530273, "step": 5640 }, { "epoch": 2.9783869267264103, "grad_norm": 31.92053856979509, "learning_rate": 2.5553505535055346e-07, "logits/chosen": -2.76953125, "logits/rejected": -2.8578124046325684, "logps/chosen": -375.2250061035156, "logps/rejected": -458.29998779296875, "loss": 0.0502, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.660937547683716, "rewards/margins": 5.666406154632568, "rewards/rejected": -8.323437690734863, "step": 5650 }, { "epoch": 2.9836584080126514, "grad_norm": 23.652255735038747, "learning_rate": 2.5421718502899316e-07, "logits/chosen": -2.8062500953674316, "logits/rejected": -2.7796874046325684, "logps/chosen": -345.3999938964844, "logps/rejected": -499.70001220703125, "loss": 0.0499, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.2542967796325684, "rewards/margins": 5.5, "rewards/rejected": -7.754687309265137, "step": 5660 }, { "epoch": 2.988929889298893, "grad_norm": 5.635800318458864, "learning_rate": 2.5289931470743275e-07, "logits/chosen": -2.844531297683716, "logits/rejected": -2.7984375953674316, "logps/chosen": -353.8500061035156, "logps/rejected": -499.1000061035156, "loss": 0.0492, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.951171875, "rewards/margins": 5.5, "rewards/rejected": -7.443749904632568, "step": 5670 }, { "epoch": 2.9942013705851345, "grad_norm": 33.8264056118505, "learning_rate": 2.515814443858724e-07, "logits/chosen": -2.7210936546325684, "logits/rejected": -2.723437547683716, "logps/chosen": -431.70001220703125, "logps/rejected": -575.0, "loss": 0.0308, "rewards/accuracies": 1.0, "rewards/chosen": -2.257031202316284, "rewards/margins": 6.151562690734863, "rewards/rejected": -8.407812118530273, "step": 5680 }, { "epoch": 2.999472851871376, "grad_norm": 52.91255743694317, "learning_rate": 2.5026357406431205e-07, "logits/chosen": -2.90234375, "logits/rejected": -2.76953125, "logps/chosen": -360.6499938964844, "logps/rejected": -473.6499938964844, "loss": 0.0469, "rewards/accuracies": 1.0, "rewards/chosen": -2.1878905296325684, "rewards/margins": 5.658593654632568, "rewards/rejected": -7.845312595367432, "step": 5690 }, { "epoch": 3.004744333157617, "grad_norm": 9.173004645332952, "learning_rate": 2.489457037427517e-07, "logits/chosen": -2.815624952316284, "logits/rejected": -2.666015625, "logps/chosen": -327.3999938964844, "logps/rejected": -434.45001220703125, "loss": 0.0298, "rewards/accuracies": 0.9916666746139526, "rewards/chosen": -1.5446288585662842, "rewards/margins": 5.803124904632568, "rewards/rejected": -7.3515625, "step": 5700 }, { "epoch": 3.0100158144438587, "grad_norm": 2.9688680672545247, "learning_rate": 2.4762783342119134e-07, "logits/chosen": -2.8109374046325684, "logits/rejected": -2.76953125, "logps/chosen": -357.6499938964844, "logps/rejected": -468.79998779296875, "loss": 0.0138, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.680761694908142, "rewards/margins": 6.006249904632568, "rewards/rejected": -7.684374809265137, "step": 5710 }, { "epoch": 3.0152872957301002, "grad_norm": 3.436259490272926, "learning_rate": 2.46309963099631e-07, "logits/chosen": -2.832812547683716, "logits/rejected": -2.84765625, "logps/chosen": -353.79998779296875, "logps/rejected": -488.5, "loss": 0.0102, "rewards/accuracies": 1.0, "rewards/chosen": -1.769921898841858, "rewards/margins": 6.153124809265137, "rewards/rejected": -7.920312404632568, "step": 5720 }, { "epoch": 3.020558777016342, "grad_norm": 10.631869301416693, "learning_rate": 2.4499209277807064e-07, "logits/chosen": -2.7554688453674316, "logits/rejected": -2.700000047683716, "logps/chosen": -352.75, "logps/rejected": -484.8999938964844, "loss": 0.0299, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.89453125, "rewards/margins": 6.165625095367432, "rewards/rejected": -8.059374809265137, "step": 5730 }, { "epoch": 3.025830258302583, "grad_norm": 4.294548402919998, "learning_rate": 2.4367422245651023e-07, "logits/chosen": -2.875781297683716, "logits/rejected": -2.8656249046325684, "logps/chosen": -406.8999938964844, "logps/rejected": -522.4000244140625, "loss": 0.0155, "rewards/accuracies": 1.0, "rewards/chosen": -2.342334032058716, "rewards/margins": 6.4140625, "rewards/rejected": -8.770312309265137, "step": 5740 }, { "epoch": 3.0311017395888245, "grad_norm": 4.2218433878369765, "learning_rate": 2.423563521349499e-07, "logits/chosen": -2.848437547683716, "logits/rejected": -2.864062547683716, "logps/chosen": -324.79998779296875, "logps/rejected": -427.25, "loss": 0.011, "rewards/accuracies": 1.0, "rewards/chosen": -1.979101538658142, "rewards/margins": 6.301562309265137, "rewards/rejected": -8.276562690734863, "step": 5750 }, { "epoch": 3.036373220875066, "grad_norm": 8.155637642879888, "learning_rate": 2.4103848181338953e-07, "logits/chosen": -2.836718797683716, "logits/rejected": -2.74609375, "logps/chosen": -331.29998779296875, "logps/rejected": -484.25, "loss": 0.018, "rewards/accuracies": 1.0, "rewards/chosen": -1.6994140148162842, "rewards/margins": 6.926562309265137, "rewards/rejected": -8.623437881469727, "step": 5760 }, { "epoch": 3.041644702161307, "grad_norm": 4.295034165643975, "learning_rate": 2.3972061149182923e-07, "logits/chosen": -2.890625, "logits/rejected": -2.907031297683716, "logps/chosen": -339.6499938964844, "logps/rejected": -503.8999938964844, "loss": 0.0139, "rewards/accuracies": 1.0, "rewards/chosen": -2.091796875, "rewards/margins": 6.84375, "rewards/rejected": -8.931249618530273, "step": 5770 }, { "epoch": 3.0469161834475487, "grad_norm": 4.6960785265644605, "learning_rate": 2.3840274117026885e-07, "logits/chosen": -2.807812452316284, "logits/rejected": -2.8031249046325684, "logps/chosen": -373.875, "logps/rejected": -482.6000061035156, "loss": 0.0167, "rewards/accuracies": 1.0, "rewards/chosen": -2.7398438453674316, "rewards/margins": 6.6015625, "rewards/rejected": -9.3359375, "step": 5780 }, { "epoch": 3.05218766473379, "grad_norm": 6.742039614646932, "learning_rate": 2.370848708487085e-07, "logits/chosen": -2.913281202316284, "logits/rejected": -2.890625, "logps/chosen": -394.1499938964844, "logps/rejected": -501.70001220703125, "loss": 0.0173, "rewards/accuracies": 1.0, "rewards/chosen": -2.7710938453674316, "rewards/margins": 6.535937309265137, "rewards/rejected": -9.303125381469727, "step": 5790 }, { "epoch": 3.0574591460200318, "grad_norm": 7.404061558870575, "learning_rate": 2.3576700052714812e-07, "logits/chosen": -2.882031202316284, "logits/rejected": -2.90234375, "logps/chosen": -364.54998779296875, "logps/rejected": -483.3999938964844, "loss": 0.012, "rewards/accuracies": 1.0, "rewards/chosen": -2.7007813453674316, "rewards/margins": 6.546875, "rewards/rejected": -9.243749618530273, "step": 5800 }, { "epoch": 3.062730627306273, "grad_norm": 7.339305568683978, "learning_rate": 2.3444913020558777e-07, "logits/chosen": -2.828125, "logits/rejected": -2.8187499046325684, "logps/chosen": -339.1499938964844, "logps/rejected": -478.3999938964844, "loss": 0.024, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.4556641578674316, "rewards/margins": 6.620312690734863, "rewards/rejected": -9.078125, "step": 5810 }, { "epoch": 3.0680021085925144, "grad_norm": 5.263050807789349, "learning_rate": 2.3313125988402741e-07, "logits/chosen": -2.8394532203674316, "logits/rejected": -2.862499952316284, "logps/chosen": -349.0, "logps/rejected": -511.8999938964844, "loss": 0.0172, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.458789110183716, "rewards/margins": 6.8359375, "rewards/rejected": -9.292187690734863, "step": 5820 }, { "epoch": 3.073273589878756, "grad_norm": 3.9847997369149577, "learning_rate": 2.3181338956246703e-07, "logits/chosen": -2.8492188453674316, "logits/rejected": -2.890625, "logps/chosen": -398.92498779296875, "logps/rejected": -538.0999755859375, "loss": 0.0208, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.880078077316284, "rewards/margins": 7.5625, "rewards/rejected": -10.4375, "step": 5830 }, { "epoch": 3.0785450711649975, "grad_norm": 42.24400085574194, "learning_rate": 2.3049551924090668e-07, "logits/chosen": -2.825000047683716, "logits/rejected": -2.83984375, "logps/chosen": -338.5, "logps/rejected": -472.1000061035156, "loss": 0.0157, "rewards/accuracies": 1.0, "rewards/chosen": -2.653125047683716, "rewards/margins": 7.114062309265137, "rewards/rejected": -9.765625, "step": 5840 }, { "epoch": 3.0838165524512386, "grad_norm": 22.568243990457315, "learning_rate": 2.2917764891934633e-07, "logits/chosen": -2.88671875, "logits/rejected": -2.940624952316284, "logps/chosen": -384.8999938964844, "logps/rejected": -501.79998779296875, "loss": 0.0135, "rewards/accuracies": 1.0, "rewards/chosen": -2.5628905296325684, "rewards/margins": 6.984375, "rewards/rejected": -9.550000190734863, "step": 5850 }, { "epoch": 3.08908803373748, "grad_norm": 2.2803989320967006, "learning_rate": 2.2785977859778595e-07, "logits/chosen": -2.984375, "logits/rejected": -3.0078125, "logps/chosen": -383.8999938964844, "logps/rejected": -517.7999877929688, "loss": 0.0109, "rewards/accuracies": 1.0, "rewards/chosen": -2.9097657203674316, "rewards/margins": 7.046875, "rewards/rejected": -9.957812309265137, "step": 5860 }, { "epoch": 3.0943595150237218, "grad_norm": 12.388169048803084, "learning_rate": 2.265419082762256e-07, "logits/chosen": -2.8125, "logits/rejected": -2.8843750953674316, "logps/chosen": -401.95001220703125, "logps/rejected": -545.0999755859375, "loss": 0.0129, "rewards/accuracies": 1.0, "rewards/chosen": -2.913281202316284, "rewards/margins": 7.025000095367432, "rewards/rejected": -9.939062118530273, "step": 5870 }, { "epoch": 3.0996309963099633, "grad_norm": 4.026129943983278, "learning_rate": 2.2522403795466525e-07, "logits/chosen": -2.9140625, "logits/rejected": -2.934375047683716, "logps/chosen": -333.6000061035156, "logps/rejected": -449.0, "loss": 0.0189, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.764453172683716, "rewards/margins": 7.110937595367432, "rewards/rejected": -9.878125190734863, "step": 5880 }, { "epoch": 3.1049024775962044, "grad_norm": 11.557413384063675, "learning_rate": 2.239061676331049e-07, "logits/chosen": -2.868359327316284, "logits/rejected": -2.9234375953674316, "logps/chosen": -366.3999938964844, "logps/rejected": -523.4000244140625, "loss": 0.0168, "rewards/accuracies": 1.0, "rewards/chosen": -3.2808594703674316, "rewards/margins": 6.8671875, "rewards/rejected": -10.149999618530273, "step": 5890 }, { "epoch": 3.110173958882446, "grad_norm": 9.842766930197202, "learning_rate": 2.2258829731154451e-07, "logits/chosen": -2.9281249046325684, "logits/rejected": -2.9156250953674316, "logps/chosen": -371.04998779296875, "logps/rejected": -513.5999755859375, "loss": 0.0132, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.229687452316284, "rewards/margins": 6.8203125, "rewards/rejected": -10.053125381469727, "step": 5900 }, { "epoch": 3.1154454401686875, "grad_norm": 14.711195146547324, "learning_rate": 2.212704269899842e-07, "logits/chosen": -2.9312500953674316, "logits/rejected": -2.8671875, "logps/chosen": -344.04998779296875, "logps/rejected": -498.79998779296875, "loss": 0.0229, "rewards/accuracies": 1.0, "rewards/chosen": -3.173828125, "rewards/margins": 6.589062690734863, "rewards/rejected": -9.7578125, "step": 5910 }, { "epoch": 3.1207169214549286, "grad_norm": 4.955137440972616, "learning_rate": 2.1995255666842384e-07, "logits/chosen": -2.7835936546325684, "logits/rejected": -2.864062547683716, "logps/chosen": -346.25, "logps/rejected": -506.0, "loss": 0.0306, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.705078125, "rewards/margins": 6.6796875, "rewards/rejected": -9.384374618530273, "step": 5920 }, { "epoch": 3.12598840274117, "grad_norm": 3.849957788127257, "learning_rate": 2.1863468634686346e-07, "logits/chosen": -2.8984375, "logits/rejected": -2.8726563453674316, "logps/chosen": -359.29998779296875, "logps/rejected": -482.0, "loss": 0.0104, "rewards/accuracies": 1.0, "rewards/chosen": -2.3499999046325684, "rewards/margins": 6.520312309265137, "rewards/rejected": -8.871874809265137, "step": 5930 }, { "epoch": 3.1312598840274117, "grad_norm": 14.384980113478665, "learning_rate": 2.173168160253031e-07, "logits/chosen": -2.885937452316284, "logits/rejected": -2.848828077316284, "logps/chosen": -378.07501220703125, "logps/rejected": -498.79998779296875, "loss": 0.0129, "rewards/accuracies": 1.0, "rewards/chosen": -2.6265625953674316, "rewards/margins": 6.6484375, "rewards/rejected": -9.274999618530273, "step": 5940 }, { "epoch": 3.1365313653136533, "grad_norm": 12.675004828088419, "learning_rate": 2.1599894570374275e-07, "logits/chosen": -2.946093797683716, "logits/rejected": -2.8984375, "logps/chosen": -356.8999938964844, "logps/rejected": -481.6000061035156, "loss": 0.0154, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.5648436546325684, "rewards/margins": 6.849999904632568, "rewards/rejected": -9.415624618530273, "step": 5950 }, { "epoch": 3.1418028465998944, "grad_norm": 51.432345104113246, "learning_rate": 2.1468107538218237e-07, "logits/chosen": -2.850781202316284, "logits/rejected": -2.8296875953674316, "logps/chosen": -375.3500061035156, "logps/rejected": -474.3999938964844, "loss": 0.0242, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.331249952316284, "rewards/margins": 6.209374904632568, "rewards/rejected": -8.543749809265137, "step": 5960 }, { "epoch": 3.147074327886136, "grad_norm": 10.261968140952867, "learning_rate": 2.1336320506062202e-07, "logits/chosen": -2.9398436546325684, "logits/rejected": -2.936718702316284, "logps/chosen": -333.6000061035156, "logps/rejected": -481.3500061035156, "loss": 0.0089, "rewards/accuracies": 1.0, "rewards/chosen": -2.4478516578674316, "rewards/margins": 6.645312309265137, "rewards/rejected": -9.095312118530273, "step": 5970 }, { "epoch": 3.1523458091723775, "grad_norm": 3.958145330763275, "learning_rate": 2.1204533473906167e-07, "logits/chosen": -2.8890624046325684, "logits/rejected": -2.8023438453674316, "logps/chosen": -342.20001220703125, "logps/rejected": -474.6499938964844, "loss": 0.0147, "rewards/accuracies": 1.0, "rewards/chosen": -2.560546875, "rewards/margins": 6.349999904632568, "rewards/rejected": -8.9140625, "step": 5980 }, { "epoch": 3.157617290458619, "grad_norm": 7.147606253268531, "learning_rate": 2.1072746441750132e-07, "logits/chosen": -2.9273438453674316, "logits/rejected": -2.84375, "logps/chosen": -357.6000061035156, "logps/rejected": -508.20001220703125, "loss": 0.0156, "rewards/accuracies": 1.0, "rewards/chosen": -3.0738282203674316, "rewards/margins": 6.760937690734863, "rewards/rejected": -9.834375381469727, "step": 5990 }, { "epoch": 3.16288877174486, "grad_norm": 2.2565118705204896, "learning_rate": 2.0940959409594094e-07, "logits/chosen": -2.8609375953674316, "logits/rejected": -2.8687500953674316, "logps/chosen": -369.6499938964844, "logps/rejected": -506.3999938964844, "loss": 0.0089, "rewards/accuracies": 1.0, "rewards/chosen": -2.9634766578674316, "rewards/margins": 7.295312404632568, "rewards/rejected": -10.260937690734863, "step": 6000 }, { "epoch": 3.1681602530311017, "grad_norm": 23.80352388965675, "learning_rate": 2.0809172377438058e-07, "logits/chosen": -2.8804688453674316, "logits/rejected": -2.8460936546325684, "logps/chosen": -339.3999938964844, "logps/rejected": -473.8999938964844, "loss": 0.0195, "rewards/accuracies": 1.0, "rewards/chosen": -3.2347655296325684, "rewards/margins": 6.584374904632568, "rewards/rejected": -9.8203125, "step": 6010 }, { "epoch": 3.1734317343173433, "grad_norm": 7.535964732039914, "learning_rate": 2.0677385345282023e-07, "logits/chosen": -2.9453125, "logits/rejected": -2.971874952316284, "logps/chosen": -379.5, "logps/rejected": -503.6000061035156, "loss": 0.0111, "rewards/accuracies": 1.0, "rewards/chosen": -3.255859375, "rewards/margins": 6.703125, "rewards/rejected": -9.964062690734863, "step": 6020 }, { "epoch": 3.1787032156035844, "grad_norm": 4.879846090607543, "learning_rate": 2.0545598313125985e-07, "logits/chosen": -2.8812499046325684, "logits/rejected": -2.907031297683716, "logps/chosen": -395.8999938964844, "logps/rejected": -491.3999938964844, "loss": 0.0125, "rewards/accuracies": 1.0, "rewards/chosen": -2.755078077316284, "rewards/margins": 6.490624904632568, "rewards/rejected": -9.243749618530273, "step": 6030 }, { "epoch": 3.183974696889826, "grad_norm": 22.33403282334566, "learning_rate": 2.041381128096995e-07, "logits/chosen": -2.8515625, "logits/rejected": -2.867968797683716, "logps/chosen": -356.25, "logps/rejected": -492.04998779296875, "loss": 0.0249, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.7552733421325684, "rewards/margins": 6.7578125, "rewards/rejected": -9.512499809265137, "step": 6040 }, { "epoch": 3.1892461781760675, "grad_norm": 5.080417488730043, "learning_rate": 2.0282024248813917e-07, "logits/chosen": -2.8101563453674316, "logits/rejected": -2.7796874046325684, "logps/chosen": -330.70001220703125, "logps/rejected": -469.45001220703125, "loss": 0.0083, "rewards/accuracies": 1.0, "rewards/chosen": -2.70703125, "rewards/margins": 6.875, "rewards/rejected": -9.5859375, "step": 6050 }, { "epoch": 3.194517659462309, "grad_norm": 9.29526480246406, "learning_rate": 2.0150237216657882e-07, "logits/chosen": -2.9140625, "logits/rejected": -2.871875047683716, "logps/chosen": -377.8999938964844, "logps/rejected": -505.20001220703125, "loss": 0.016, "rewards/accuracies": 1.0, "rewards/chosen": -3.083203077316284, "rewards/margins": 6.581250190734863, "rewards/rejected": -9.667187690734863, "step": 6060 }, { "epoch": 3.19978914074855, "grad_norm": 15.443497893414918, "learning_rate": 2.0018450184501844e-07, "logits/chosen": -2.8203125, "logits/rejected": -2.8921875953674316, "logps/chosen": -372.8999938964844, "logps/rejected": -501.25, "loss": 0.0183, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.9039063453674316, "rewards/margins": 6.967187404632568, "rewards/rejected": -9.871874809265137, "step": 6070 }, { "epoch": 3.2050606220347917, "grad_norm": 18.687376613876665, "learning_rate": 1.988666315234581e-07, "logits/chosen": -2.7906250953674316, "logits/rejected": -2.8296875953674316, "logps/chosen": -382.04998779296875, "logps/rejected": -465.20001220703125, "loss": 0.0179, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.42578125, "rewards/margins": 6.71875, "rewards/rejected": -9.135937690734863, "step": 6080 }, { "epoch": 3.2103321033210332, "grad_norm": 2.6054612956674617, "learning_rate": 1.9754876120189774e-07, "logits/chosen": -2.8671875, "logits/rejected": -2.8492188453674316, "logps/chosen": -368.04998779296875, "logps/rejected": -509.29998779296875, "loss": 0.0166, "rewards/accuracies": 1.0, "rewards/chosen": -2.829296827316284, "rewards/margins": 6.904687404632568, "rewards/rejected": -9.735937118530273, "step": 6090 }, { "epoch": 3.215603584607275, "grad_norm": 30.12982856160416, "learning_rate": 1.9623089088033736e-07, "logits/chosen": -2.92578125, "logits/rejected": -2.99609375, "logps/chosen": -356.0, "logps/rejected": -492.0, "loss": 0.0118, "rewards/accuracies": 1.0, "rewards/chosen": -2.984375, "rewards/margins": 6.735937595367432, "rewards/rejected": -9.717187881469727, "step": 6100 }, { "epoch": 3.220875065893516, "grad_norm": 2.7614867907842737, "learning_rate": 1.94913020558777e-07, "logits/chosen": -2.917187452316284, "logits/rejected": -2.9742188453674316, "logps/chosen": -383.79998779296875, "logps/rejected": -496.8999938964844, "loss": 0.0205, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.0947265625, "rewards/margins": 6.785937309265137, "rewards/rejected": -9.879687309265137, "step": 6110 }, { "epoch": 3.2261465471797575, "grad_norm": 2.559533723246061, "learning_rate": 1.9359515023721665e-07, "logits/chosen": -2.8960938453674316, "logits/rejected": -2.932812452316284, "logps/chosen": -380.1000061035156, "logps/rejected": -526.0999755859375, "loss": 0.0214, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.9273438453674316, "rewards/margins": 7.215624809265137, "rewards/rejected": -10.146875381469727, "step": 6120 }, { "epoch": 3.231418028465999, "grad_norm": 5.62204755991924, "learning_rate": 1.9227727991565628e-07, "logits/chosen": -2.716015577316284, "logits/rejected": -2.90234375, "logps/chosen": -368.8500061035156, "logps/rejected": -488.6000061035156, "loss": 0.0127, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.642773389816284, "rewards/margins": 7.4453125, "rewards/rejected": -10.090624809265137, "step": 6130 }, { "epoch": 3.2366895097522406, "grad_norm": 7.191908893271037, "learning_rate": 1.9095940959409592e-07, "logits/chosen": -2.938281297683716, "logits/rejected": -2.9820313453674316, "logps/chosen": -394.04998779296875, "logps/rejected": -493.20001220703125, "loss": 0.0076, "rewards/accuracies": 1.0, "rewards/chosen": -2.717968702316284, "rewards/margins": 6.801562309265137, "rewards/rejected": -9.520312309265137, "step": 6140 }, { "epoch": 3.2419609910384817, "grad_norm": 4.934840469013626, "learning_rate": 1.8964153927253557e-07, "logits/chosen": -2.92578125, "logits/rejected": -2.885937452316284, "logps/chosen": -325.6499938964844, "logps/rejected": -454.29998779296875, "loss": 0.0168, "rewards/accuracies": 1.0, "rewards/chosen": -2.903515577316284, "rewards/margins": 7.009375095367432, "rewards/rejected": -9.904687881469727, "step": 6150 }, { "epoch": 3.2472324723247232, "grad_norm": 4.133610279621868, "learning_rate": 1.8832366895097522e-07, "logits/chosen": -2.8179688453674316, "logits/rejected": -2.8609375953674316, "logps/chosen": -344.70001220703125, "logps/rejected": -476.29998779296875, "loss": 0.0106, "rewards/accuracies": 1.0, "rewards/chosen": -2.6449217796325684, "rewards/margins": 6.846875190734863, "rewards/rejected": -9.496874809265137, "step": 6160 }, { "epoch": 3.252503953610965, "grad_norm": 3.998370352740276, "learning_rate": 1.8700579862941484e-07, "logits/chosen": -2.817187547683716, "logits/rejected": -2.8890624046325684, "logps/chosen": -352.1499938964844, "logps/rejected": -545.7999877929688, "loss": 0.0156, "rewards/accuracies": 1.0, "rewards/chosen": -2.9839844703674316, "rewards/margins": 7.503125190734863, "rewards/rejected": -10.490625381469727, "step": 6170 }, { "epoch": 3.257775434897206, "grad_norm": 5.392457181164391, "learning_rate": 1.856879283078545e-07, "logits/chosen": -2.7890625, "logits/rejected": -2.7914061546325684, "logps/chosen": -366.3999938964844, "logps/rejected": -509.8500061035156, "loss": 0.0086, "rewards/accuracies": 1.0, "rewards/chosen": -2.977343797683716, "rewards/margins": 7.131249904632568, "rewards/rejected": -10.107812881469727, "step": 6180 }, { "epoch": 3.2630469161834474, "grad_norm": 2.67129886804871, "learning_rate": 1.8437005798629416e-07, "logits/chosen": -2.831249952316284, "logits/rejected": -2.8687500953674316, "logps/chosen": -373.0, "logps/rejected": -498.79998779296875, "loss": 0.009, "rewards/accuracies": 1.0, "rewards/chosen": -3.102343797683716, "rewards/margins": 7.098437309265137, "rewards/rejected": -10.198437690734863, "step": 6190 }, { "epoch": 3.268318397469689, "grad_norm": 6.658619498652495, "learning_rate": 1.8305218766473378e-07, "logits/chosen": -2.7789063453674316, "logits/rejected": -2.842968702316284, "logps/chosen": -370.82501220703125, "logps/rejected": -521.5499877929688, "loss": 0.0082, "rewards/accuracies": 1.0, "rewards/chosen": -2.941601514816284, "rewards/margins": 7.115624904632568, "rewards/rejected": -10.056249618530273, "step": 6200 }, { "epoch": 3.2735898787559305, "grad_norm": 5.188932067575655, "learning_rate": 1.8173431734317343e-07, "logits/chosen": -3.035937547683716, "logits/rejected": -2.926562547683716, "logps/chosen": -385.8500061035156, "logps/rejected": -523.5999755859375, "loss": 0.0321, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.150390625, "rewards/margins": 6.754687309265137, "rewards/rejected": -9.899999618530273, "step": 6210 }, { "epoch": 3.2788613600421717, "grad_norm": 67.68264356181432, "learning_rate": 1.8041644702161308e-07, "logits/chosen": -2.867968797683716, "logits/rejected": -2.918750047683716, "logps/chosen": -357.5, "logps/rejected": -496.29998779296875, "loss": 0.0305, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.89453125, "rewards/margins": 6.896874904632568, "rewards/rejected": -9.785937309265137, "step": 6220 }, { "epoch": 3.284132841328413, "grad_norm": 13.86297694216105, "learning_rate": 1.790985767000527e-07, "logits/chosen": -2.96484375, "logits/rejected": -2.9007811546325684, "logps/chosen": -371.29998779296875, "logps/rejected": -452.5, "loss": 0.0174, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.842578172683716, "rewards/margins": 6.503125190734863, "rewards/rejected": -9.348437309265137, "step": 6230 }, { "epoch": 3.2894043226146548, "grad_norm": 8.136979073634459, "learning_rate": 1.7778070637849235e-07, "logits/chosen": -2.9117188453674316, "logits/rejected": -2.948437452316284, "logps/chosen": -329.95001220703125, "logps/rejected": -444.8999938964844, "loss": 0.0085, "rewards/accuracies": 1.0, "rewards/chosen": -2.8021483421325684, "rewards/margins": 6.629687309265137, "rewards/rejected": -9.432812690734863, "step": 6240 }, { "epoch": 3.2946758039008963, "grad_norm": 28.85652125292597, "learning_rate": 1.76462836056932e-07, "logits/chosen": -2.928906202316284, "logits/rejected": -2.858593702316284, "logps/chosen": -353.70001220703125, "logps/rejected": -504.5, "loss": 0.0181, "rewards/accuracies": 1.0, "rewards/chosen": -3.106640577316284, "rewards/margins": 6.878125190734863, "rewards/rejected": -9.982812881469727, "step": 6250 }, { "epoch": 3.2999472851871374, "grad_norm": 4.011383908786923, "learning_rate": 1.7514496573537164e-07, "logits/chosen": -2.90625, "logits/rejected": -2.901562452316284, "logps/chosen": -376.20001220703125, "logps/rejected": -521.9000244140625, "loss": 0.01, "rewards/accuracies": 1.0, "rewards/chosen": -3.05078125, "rewards/margins": 7.184374809265137, "rewards/rejected": -10.235937118530273, "step": 6260 }, { "epoch": 3.305218766473379, "grad_norm": 5.400147054772792, "learning_rate": 1.7382709541381126e-07, "logits/chosen": -2.801562547683716, "logits/rejected": -2.84765625, "logps/chosen": -363.75, "logps/rejected": -517.2999877929688, "loss": 0.0105, "rewards/accuracies": 1.0, "rewards/chosen": -2.789843797683716, "rewards/margins": 6.7890625, "rewards/rejected": -9.579687118530273, "step": 6270 }, { "epoch": 3.3104902477596205, "grad_norm": 6.946340027021249, "learning_rate": 1.725092250922509e-07, "logits/chosen": -2.78125, "logits/rejected": -2.7828125953674316, "logps/chosen": -393.1499938964844, "logps/rejected": -502.29998779296875, "loss": 0.0098, "rewards/accuracies": 1.0, "rewards/chosen": -2.8497557640075684, "rewards/margins": 6.778124809265137, "rewards/rejected": -9.631250381469727, "step": 6280 }, { "epoch": 3.315761729045862, "grad_norm": 17.19929132284622, "learning_rate": 1.7119135477069056e-07, "logits/chosen": -2.856250047683716, "logits/rejected": -2.907031297683716, "logps/chosen": -349.54998779296875, "logps/rejected": -465.79998779296875, "loss": 0.0096, "rewards/accuracies": 1.0, "rewards/chosen": -3.1328125, "rewards/margins": 6.862500190734863, "rewards/rejected": -9.996874809265137, "step": 6290 }, { "epoch": 3.321033210332103, "grad_norm": 2.0029767549963915, "learning_rate": 1.6987348444913018e-07, "logits/chosen": -2.977343797683716, "logits/rejected": -2.859375, "logps/chosen": -328.6499938964844, "logps/rejected": -522.2000122070312, "loss": 0.0102, "rewards/accuracies": 1.0, "rewards/chosen": -2.934765577316284, "rewards/margins": 7.3515625, "rewards/rejected": -10.279687881469727, "step": 6300 }, { "epoch": 3.3263046916183447, "grad_norm": 10.658399820997216, "learning_rate": 1.6855561412756983e-07, "logits/chosen": -2.895312547683716, "logits/rejected": -2.9281249046325684, "logps/chosen": -366.6000061035156, "logps/rejected": -469.75, "loss": 0.0134, "rewards/accuracies": 1.0, "rewards/chosen": -2.624755859375, "rewards/margins": 6.735937595367432, "rewards/rejected": -9.362500190734863, "step": 6310 }, { "epoch": 3.3315761729045863, "grad_norm": 1.4705171638601, "learning_rate": 1.6723774380600947e-07, "logits/chosen": -2.871875047683716, "logits/rejected": -2.9124999046325684, "logps/chosen": -403.54998779296875, "logps/rejected": -528.0, "loss": 0.0084, "rewards/accuracies": 1.0, "rewards/chosen": -3.3648438453674316, "rewards/margins": 7.4453125, "rewards/rejected": -10.8125, "step": 6320 }, { "epoch": 3.3368476541908274, "grad_norm": 8.442813284413049, "learning_rate": 1.6591987348444915e-07, "logits/chosen": -2.950000047683716, "logits/rejected": -2.8617186546325684, "logps/chosen": -353.6000061035156, "logps/rejected": -490.1000061035156, "loss": 0.0139, "rewards/accuracies": 1.0, "rewards/chosen": -2.792187452316284, "rewards/margins": 6.728125095367432, "rewards/rejected": -9.517187118530273, "step": 6330 }, { "epoch": 3.342119135477069, "grad_norm": 7.948724816227337, "learning_rate": 1.6460200316288877e-07, "logits/chosen": -2.9312500953674316, "logits/rejected": -2.967968702316284, "logps/chosen": -388.79998779296875, "logps/rejected": -521.0999755859375, "loss": 0.018, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.2515625953674316, "rewards/margins": 6.892187595367432, "rewards/rejected": -10.140625, "step": 6340 }, { "epoch": 3.3473906167633105, "grad_norm": 56.80756767959709, "learning_rate": 1.6328413284132842e-07, "logits/chosen": -2.932812452316284, "logits/rejected": -2.9195313453674316, "logps/chosen": -352.70001220703125, "logps/rejected": -465.6000061035156, "loss": 0.0198, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.031445264816284, "rewards/margins": 7.104687690734863, "rewards/rejected": -10.135937690734863, "step": 6350 }, { "epoch": 3.352662098049552, "grad_norm": 5.730441315393223, "learning_rate": 1.6196626251976806e-07, "logits/chosen": -2.912109375, "logits/rejected": -2.901562452316284, "logps/chosen": -377.6499938964844, "logps/rejected": -526.0, "loss": 0.0146, "rewards/accuracies": 1.0, "rewards/chosen": -3.26171875, "rewards/margins": 7.092187404632568, "rewards/rejected": -10.353124618530273, "step": 6360 }, { "epoch": 3.357933579335793, "grad_norm": 3.9253391024289006, "learning_rate": 1.6064839219820768e-07, "logits/chosen": -3.01171875, "logits/rejected": -2.9984374046325684, "logps/chosen": -371.3999938964844, "logps/rejected": -516.0999755859375, "loss": 0.0144, "rewards/accuracies": 1.0, "rewards/chosen": -3.3902344703674316, "rewards/margins": 7.003125190734863, "rewards/rejected": -10.395312309265137, "step": 6370 }, { "epoch": 3.3632050606220347, "grad_norm": 8.301465493265505, "learning_rate": 1.5933052187664733e-07, "logits/chosen": -2.784374952316284, "logits/rejected": -2.8968749046325684, "logps/chosen": -376.6000061035156, "logps/rejected": -504.20001220703125, "loss": 0.019, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.294921875, "rewards/margins": 7.167187690734863, "rewards/rejected": -10.465624809265137, "step": 6380 }, { "epoch": 3.3684765419082763, "grad_norm": 21.157722176047898, "learning_rate": 1.5801265155508698e-07, "logits/chosen": -2.905468702316284, "logits/rejected": -2.895312547683716, "logps/chosen": -365.75, "logps/rejected": -494.5, "loss": 0.0132, "rewards/accuracies": 1.0, "rewards/chosen": -3.0033202171325684, "rewards/margins": 6.934374809265137, "rewards/rejected": -9.934374809265137, "step": 6390 }, { "epoch": 3.373748023194518, "grad_norm": 15.992168486511332, "learning_rate": 1.566947812335266e-07, "logits/chosen": -2.895312547683716, "logits/rejected": -2.991406202316284, "logps/chosen": -364.04998779296875, "logps/rejected": -496.3999938964844, "loss": 0.0109, "rewards/accuracies": 1.0, "rewards/chosen": -3.283203125, "rewards/margins": 7.457812309265137, "rewards/rejected": -10.737500190734863, "step": 6400 }, { "epoch": 3.379019504480759, "grad_norm": 5.421154135323722, "learning_rate": 1.5537691091196625e-07, "logits/chosen": -3.0054688453674316, "logits/rejected": -2.938281297683716, "logps/chosen": -368.6000061035156, "logps/rejected": -473.1499938964844, "loss": 0.013, "rewards/accuracies": 1.0, "rewards/chosen": -3.369140625, "rewards/margins": 6.792187690734863, "rewards/rejected": -10.159375190734863, "step": 6410 }, { "epoch": 3.3842909857670005, "grad_norm": 5.487773767750452, "learning_rate": 1.540590405904059e-07, "logits/chosen": -2.8515625, "logits/rejected": -2.991406202316284, "logps/chosen": -416.04998779296875, "logps/rejected": -493.1000061035156, "loss": 0.0108, "rewards/accuracies": 1.0, "rewards/chosen": -3.37890625, "rewards/margins": 6.729687690734863, "rewards/rejected": -10.100000381469727, "step": 6420 }, { "epoch": 3.389562467053242, "grad_norm": 32.654570165549615, "learning_rate": 1.5274117026884554e-07, "logits/chosen": -2.8187499046325684, "logits/rejected": -2.9000000953674316, "logps/chosen": -387.45001220703125, "logps/rejected": -539.7999877929688, "loss": 0.0162, "rewards/accuracies": 1.0, "rewards/chosen": -3.196093797683716, "rewards/margins": 6.821875095367432, "rewards/rejected": -10.015625, "step": 6430 }, { "epoch": 3.3948339483394836, "grad_norm": 11.697640096507577, "learning_rate": 1.5142329994728516e-07, "logits/chosen": -2.940624952316284, "logits/rejected": -2.9437499046325684, "logps/chosen": -370.8999938964844, "logps/rejected": -502.0, "loss": 0.019, "rewards/accuracies": 1.0, "rewards/chosen": -3.5240235328674316, "rewards/margins": 6.650000095367432, "rewards/rejected": -10.178125381469727, "step": 6440 }, { "epoch": 3.4001054296257247, "grad_norm": 7.15574113117011, "learning_rate": 1.501054296257248e-07, "logits/chosen": -2.897656202316284, "logits/rejected": -2.905468702316284, "logps/chosen": -358.25, "logps/rejected": -484.6499938964844, "loss": 0.014, "rewards/accuracies": 1.0, "rewards/chosen": -2.826171875, "rewards/margins": 6.650000095367432, "rewards/rejected": -9.478124618530273, "step": 6450 }, { "epoch": 3.4053769109119663, "grad_norm": 3.455507249436526, "learning_rate": 1.4878755930416446e-07, "logits/chosen": -2.967968702316284, "logits/rejected": -2.95703125, "logps/chosen": -310.20001220703125, "logps/rejected": -443.1000061035156, "loss": 0.0172, "rewards/accuracies": 1.0, "rewards/chosen": -2.640625, "rewards/margins": 6.887499809265137, "rewards/rejected": -9.537500381469727, "step": 6460 }, { "epoch": 3.410648392198208, "grad_norm": 2.403421047994701, "learning_rate": 1.474696889826041e-07, "logits/chosen": -2.9976563453674316, "logits/rejected": -3.043750047683716, "logps/chosen": -365.5, "logps/rejected": -523.5999755859375, "loss": 0.0111, "rewards/accuracies": 1.0, "rewards/chosen": -2.9800782203674316, "rewards/margins": 7.323437690734863, "rewards/rejected": -10.301562309265137, "step": 6470 }, { "epoch": 3.415919873484449, "grad_norm": 4.22760156320338, "learning_rate": 1.4615181866104375e-07, "logits/chosen": -2.8226561546325684, "logits/rejected": -2.832812547683716, "logps/chosen": -311.8999938964844, "logps/rejected": -478.6000061035156, "loss": 0.0114, "rewards/accuracies": 1.0, "rewards/chosen": -2.893359422683716, "rewards/margins": 7.204687595367432, "rewards/rejected": -10.09375, "step": 6480 }, { "epoch": 3.4211913547706905, "grad_norm": 13.209484396379436, "learning_rate": 1.448339483394834e-07, "logits/chosen": -2.957812547683716, "logits/rejected": -2.93359375, "logps/chosen": -411.54998779296875, "logps/rejected": -525.7000122070312, "loss": 0.0152, "rewards/accuracies": 1.0, "rewards/chosen": -3.2308592796325684, "rewards/margins": 6.712500095367432, "rewards/rejected": -9.949999809265137, "step": 6490 }, { "epoch": 3.426462836056932, "grad_norm": 3.069577619009565, "learning_rate": 1.4351607801792305e-07, "logits/chosen": -3.055468797683716, "logits/rejected": -2.9820313453674316, "logps/chosen": -354.6499938964844, "logps/rejected": -483.0, "loss": 0.0254, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.9964842796325684, "rewards/margins": 6.649218559265137, "rewards/rejected": -9.646875381469727, "step": 6500 }, { "epoch": 3.4317343173431736, "grad_norm": 8.757387719683846, "learning_rate": 1.4219820769636267e-07, "logits/chosen": -2.905468702316284, "logits/rejected": -2.90234375, "logps/chosen": -376.54998779296875, "logps/rejected": -529.2000122070312, "loss": 0.0202, "rewards/accuracies": 1.0, "rewards/chosen": -2.8218750953674316, "rewards/margins": 6.984375, "rewards/rejected": -9.807812690734863, "step": 6510 }, { "epoch": 3.4370057986294147, "grad_norm": 4.131342149606525, "learning_rate": 1.4088033737480232e-07, "logits/chosen": -2.975781202316284, "logits/rejected": -2.94921875, "logps/chosen": -413.3500061035156, "logps/rejected": -478.6000061035156, "loss": 0.0076, "rewards/accuracies": 1.0, "rewards/chosen": -2.912890672683716, "rewards/margins": 6.709374904632568, "rewards/rejected": -9.626562118530273, "step": 6520 }, { "epoch": 3.4422772799156562, "grad_norm": 3.899754333944389, "learning_rate": 1.3956246705324197e-07, "logits/chosen": -2.8804688453674316, "logits/rejected": -2.883593797683716, "logps/chosen": -331.75, "logps/rejected": -463.0, "loss": 0.0094, "rewards/accuracies": 1.0, "rewards/chosen": -2.7406249046325684, "rewards/margins": 6.826562404632568, "rewards/rejected": -9.567187309265137, "step": 6530 }, { "epoch": 3.447548761201898, "grad_norm": 20.687642595447496, "learning_rate": 1.382445967316816e-07, "logits/chosen": -2.8960938453674316, "logits/rejected": -2.8921875953674316, "logps/chosen": -370.6499938964844, "logps/rejected": -497.0, "loss": 0.0138, "rewards/accuracies": 1.0, "rewards/chosen": -3.3828125, "rewards/margins": 6.946875095367432, "rewards/rejected": -10.337499618530273, "step": 6540 }, { "epoch": 3.4528202424881393, "grad_norm": 4.87446904928106, "learning_rate": 1.3692672641012123e-07, "logits/chosen": -2.8617186546325684, "logits/rejected": -2.897656202316284, "logps/chosen": -405.79998779296875, "logps/rejected": -542.0999755859375, "loss": 0.0135, "rewards/accuracies": 1.0, "rewards/chosen": -3.8714842796325684, "rewards/margins": 6.956250190734863, "rewards/rejected": -10.834375381469727, "step": 6550 }, { "epoch": 3.4580917237743805, "grad_norm": 23.657729310940784, "learning_rate": 1.3560885608856088e-07, "logits/chosen": -2.864062547683716, "logits/rejected": -2.864062547683716, "logps/chosen": -363.75, "logps/rejected": -496.1000061035156, "loss": 0.0123, "rewards/accuracies": 1.0, "rewards/chosen": -3.073046922683716, "rewards/margins": 7.139062404632568, "rewards/rejected": -10.217187881469727, "step": 6560 }, { "epoch": 3.463363205060622, "grad_norm": 23.53017811577447, "learning_rate": 1.342909857670005e-07, "logits/chosen": -2.924999952316284, "logits/rejected": -2.965625047683716, "logps/chosen": -338.95001220703125, "logps/rejected": -456.20001220703125, "loss": 0.0199, "rewards/accuracies": 1.0, "rewards/chosen": -2.8340821266174316, "rewards/margins": 7.014062404632568, "rewards/rejected": -9.845312118530273, "step": 6570 }, { "epoch": 3.4686346863468636, "grad_norm": 5.4966341655293744, "learning_rate": 1.3297311544544015e-07, "logits/chosen": -2.8726563453674316, "logits/rejected": -2.80078125, "logps/chosen": -310.8999938964844, "logps/rejected": -478.75, "loss": 0.0253, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.959765672683716, "rewards/margins": 7.134375095367432, "rewards/rejected": -10.095312118530273, "step": 6580 }, { "epoch": 3.473906167633105, "grad_norm": 4.313568650637865, "learning_rate": 1.316552451238798e-07, "logits/chosen": -2.917187452316284, "logits/rejected": -2.9632811546325684, "logps/chosen": -399.8999938964844, "logps/rejected": -532.7000122070312, "loss": 0.0158, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.986328125, "rewards/margins": 7.015625, "rewards/rejected": -10.004687309265137, "step": 6590 }, { "epoch": 3.479177648919346, "grad_norm": 6.987091030769324, "learning_rate": 1.3033737480231945e-07, "logits/chosen": -2.9156250953674316, "logits/rejected": -2.995312452316284, "logps/chosen": -363.1499938964844, "logps/rejected": -472.70001220703125, "loss": 0.0115, "rewards/accuracies": 1.0, "rewards/chosen": -3.009765625, "rewards/margins": 7.0390625, "rewards/rejected": -10.045312881469727, "step": 6600 }, { "epoch": 3.4844491302055878, "grad_norm": 13.133272210828725, "learning_rate": 1.290195044807591e-07, "logits/chosen": -3.0179686546325684, "logits/rejected": -2.953125, "logps/chosen": -383.8999938964844, "logps/rejected": -493.79998779296875, "loss": 0.013, "rewards/accuracies": 1.0, "rewards/chosen": -2.678906202316284, "rewards/margins": 6.845312595367432, "rewards/rejected": -9.520312309265137, "step": 6610 }, { "epoch": 3.4897206114918293, "grad_norm": 4.108931629928708, "learning_rate": 1.2770163415919874e-07, "logits/chosen": -3.0296874046325684, "logits/rejected": -3.026562452316284, "logps/chosen": -366.8999938964844, "logps/rejected": -488.3999938964844, "loss": 0.0099, "rewards/accuracies": 1.0, "rewards/chosen": -2.9629883766174316, "rewards/margins": 7.207812309265137, "rewards/rejected": -10.176562309265137, "step": 6620 }, { "epoch": 3.4949920927780704, "grad_norm": 16.88299883491673, "learning_rate": 1.263837638376384e-07, "logits/chosen": -2.918750047683716, "logits/rejected": -3.0023436546325684, "logps/chosen": -418.8999938964844, "logps/rejected": -538.7000122070312, "loss": 0.0094, "rewards/accuracies": 1.0, "rewards/chosen": -3.2818360328674316, "rewards/margins": 7.550000190734863, "rewards/rejected": -10.842187881469727, "step": 6630 }, { "epoch": 3.500263574064312, "grad_norm": 13.814222148225154, "learning_rate": 1.25065893516078e-07, "logits/chosen": -2.850781202316284, "logits/rejected": -2.9765625, "logps/chosen": -399.25, "logps/rejected": -530.2999877929688, "loss": 0.021, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.2158203125, "rewards/margins": 6.809374809265137, "rewards/rejected": -10.028124809265137, "step": 6640 }, { "epoch": 3.5055350553505535, "grad_norm": 20.912335179905643, "learning_rate": 1.2374802319451766e-07, "logits/chosen": -2.984375, "logits/rejected": -2.9859375953674316, "logps/chosen": -345.75, "logps/rejected": -499.3999938964844, "loss": 0.0171, "rewards/accuracies": 1.0, "rewards/chosen": -2.735546827316284, "rewards/margins": 7.034375190734863, "rewards/rejected": -9.778124809265137, "step": 6650 }, { "epoch": 3.510806536636795, "grad_norm": 8.388258125138783, "learning_rate": 1.224301528729573e-07, "logits/chosen": -2.8515625, "logits/rejected": -2.903125047683716, "logps/chosen": -389.95001220703125, "logps/rejected": -548.7000122070312, "loss": 0.047, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.060546875, "rewards/margins": 7.462500095367432, "rewards/rejected": -10.524999618530273, "step": 6660 }, { "epoch": 3.5160780179230366, "grad_norm": 1.3962212215793388, "learning_rate": 1.2111228255139693e-07, "logits/chosen": -2.9117188453674316, "logits/rejected": -2.8968749046325684, "logps/chosen": -363.7250061035156, "logps/rejected": -537.0, "loss": 0.0098, "rewards/accuracies": 1.0, "rewards/chosen": -3.203906297683716, "rewards/margins": 7.548437595367432, "rewards/rejected": -10.751562118530273, "step": 6670 }, { "epoch": 3.5213494992092778, "grad_norm": 6.092724476569233, "learning_rate": 1.1979441222983657e-07, "logits/chosen": -2.832812547683716, "logits/rejected": -2.914843797683716, "logps/chosen": -320.8500061035156, "logps/rejected": -463.8999938964844, "loss": 0.0121, "rewards/accuracies": 1.0, "rewards/chosen": -3.056640625, "rewards/margins": 7.09375, "rewards/rejected": -10.140625, "step": 6680 }, { "epoch": 3.5266209804955193, "grad_norm": 11.305062856114228, "learning_rate": 1.1847654190827622e-07, "logits/chosen": -2.9820313453674316, "logits/rejected": -2.907031297683716, "logps/chosen": -376.29998779296875, "logps/rejected": -493.29998779296875, "loss": 0.0141, "rewards/accuracies": 1.0, "rewards/chosen": -3.494921922683716, "rewards/margins": 6.684374809265137, "rewards/rejected": -10.185937881469727, "step": 6690 }, { "epoch": 3.5318924617817604, "grad_norm": 45.2973998653223, "learning_rate": 1.1715867158671585e-07, "logits/chosen": -2.944531202316284, "logits/rejected": -2.889843702316284, "logps/chosen": -380.54998779296875, "logps/rejected": -520.7999877929688, "loss": 0.0204, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.965625047683716, "rewards/margins": 6.923437595367432, "rewards/rejected": -9.890625, "step": 6700 }, { "epoch": 3.537163943068002, "grad_norm": 8.175355056869249, "learning_rate": 1.158408012651555e-07, "logits/chosen": -2.9515624046325684, "logits/rejected": -2.9375, "logps/chosen": -383.45001220703125, "logps/rejected": -531.2000122070312, "loss": 0.0095, "rewards/accuracies": 1.0, "rewards/chosen": -3.2738280296325684, "rewards/margins": 7.354687690734863, "rewards/rejected": -10.625, "step": 6710 }, { "epoch": 3.5424354243542435, "grad_norm": 7.649409963637889, "learning_rate": 1.1452293094359515e-07, "logits/chosen": -2.9945311546325684, "logits/rejected": -2.973437547683716, "logps/chosen": -378.1000061035156, "logps/rejected": -525.0, "loss": 0.0102, "rewards/accuracies": 1.0, "rewards/chosen": -3.111328125, "rewards/margins": 7.040625095367432, "rewards/rejected": -10.151562690734863, "step": 6720 }, { "epoch": 3.547706905640485, "grad_norm": 6.552843354395644, "learning_rate": 1.1320506062203478e-07, "logits/chosen": -2.9765625, "logits/rejected": -3.0101561546325684, "logps/chosen": -327.79998779296875, "logps/rejected": -436.29998779296875, "loss": 0.0216, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.920703172683716, "rewards/margins": 7.190625190734863, "rewards/rejected": -10.104687690734863, "step": 6730 }, { "epoch": 3.5529783869267266, "grad_norm": 5.48843898429323, "learning_rate": 1.1188719030047443e-07, "logits/chosen": -2.8125, "logits/rejected": -2.94921875, "logps/chosen": -422.1000061035156, "logps/rejected": -536.4000244140625, "loss": 0.0142, "rewards/accuracies": 1.0, "rewards/chosen": -3.696484327316284, "rewards/margins": 7.381249904632568, "rewards/rejected": -11.078125, "step": 6740 }, { "epoch": 3.5582498682129677, "grad_norm": 3.8576622954094892, "learning_rate": 1.1056931997891407e-07, "logits/chosen": -2.897656202316284, "logits/rejected": -3.0250000953674316, "logps/chosen": -405.29998779296875, "logps/rejected": -491.3500061035156, "loss": 0.0225, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.4554686546325684, "rewards/margins": 6.832812309265137, "rewards/rejected": -10.284375190734863, "step": 6750 }, { "epoch": 3.5635213494992093, "grad_norm": 9.787102991177829, "learning_rate": 1.0925144965735371e-07, "logits/chosen": -2.922656297683716, "logits/rejected": -2.96875, "logps/chosen": -379.6499938964844, "logps/rejected": -486.79998779296875, "loss": 0.0151, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.5250000953674316, "rewards/margins": 6.698437690734863, "rewards/rejected": -10.220312118530273, "step": 6760 }, { "epoch": 3.568792830785451, "grad_norm": 2.029734807956018, "learning_rate": 1.0793357933579335e-07, "logits/chosen": -2.91796875, "logits/rejected": -2.96875, "logps/chosen": -398.29998779296875, "logps/rejected": -552.2999877929688, "loss": 0.0126, "rewards/accuracies": 1.0, "rewards/chosen": -3.5921874046325684, "rewards/margins": 7.296875, "rewards/rejected": -10.890625, "step": 6770 }, { "epoch": 3.574064312071692, "grad_norm": 1.3425265080980688, "learning_rate": 1.0661570901423298e-07, "logits/chosen": -2.903125047683716, "logits/rejected": -2.9671874046325684, "logps/chosen": -359.0, "logps/rejected": -488.70001220703125, "loss": 0.0167, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.5601563453674316, "rewards/margins": 7.420312404632568, "rewards/rejected": -10.978124618530273, "step": 6780 }, { "epoch": 3.5793357933579335, "grad_norm": 1.5828221674888125, "learning_rate": 1.0529783869267264e-07, "logits/chosen": -2.98046875, "logits/rejected": -2.98828125, "logps/chosen": -330.29998779296875, "logps/rejected": -514.0, "loss": 0.0103, "rewards/accuracies": 1.0, "rewards/chosen": -3.323437452316284, "rewards/margins": 7.795312404632568, "rewards/rejected": -11.1171875, "step": 6790 }, { "epoch": 3.584607274644175, "grad_norm": 10.0517309469929, "learning_rate": 1.0397996837111228e-07, "logits/chosen": -2.9078125953674316, "logits/rejected": -2.903125047683716, "logps/chosen": -346.1000061035156, "logps/rejected": -475.70001220703125, "loss": 0.0099, "rewards/accuracies": 1.0, "rewards/chosen": -2.967578172683716, "rewards/margins": 7.078125, "rewards/rejected": -10.046875, "step": 6800 }, { "epoch": 3.5898787559304166, "grad_norm": 5.48503991268565, "learning_rate": 1.0266209804955192e-07, "logits/chosen": -2.9437499046325684, "logits/rejected": -2.858203172683716, "logps/chosen": -322.6000061035156, "logps/rejected": -451.1000061035156, "loss": 0.0208, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.682812452316284, "rewards/margins": 7.067187309265137, "rewards/rejected": -9.737500190734863, "step": 6810 }, { "epoch": 3.5951502372166577, "grad_norm": 4.479900480495104, "learning_rate": 1.0134422772799156e-07, "logits/chosen": -2.913281202316284, "logits/rejected": -2.874218702316284, "logps/chosen": -404.8500061035156, "logps/rejected": -518.0999755859375, "loss": 0.0172, "rewards/accuracies": 1.0, "rewards/chosen": -3.114453077316284, "rewards/margins": 7.057812690734863, "rewards/rejected": -10.176562309265137, "step": 6820 }, { "epoch": 3.6004217185028993, "grad_norm": 0.9288778576554191, "learning_rate": 1.000263574064312e-07, "logits/chosen": -2.803906202316284, "logits/rejected": -2.9906249046325684, "logps/chosen": -382.6499938964844, "logps/rejected": -527.9000244140625, "loss": 0.0107, "rewards/accuracies": 1.0, "rewards/chosen": -3.0250000953674316, "rewards/margins": 8.198437690734863, "rewards/rejected": -11.21875, "step": 6830 }, { "epoch": 3.605693199789141, "grad_norm": 10.231993696203496, "learning_rate": 9.870848708487084e-08, "logits/chosen": -2.8851561546325684, "logits/rejected": -2.889843702316284, "logps/chosen": -344.20001220703125, "logps/rejected": -483.3999938964844, "loss": 0.0115, "rewards/accuracies": 1.0, "rewards/chosen": -2.60546875, "rewards/margins": 6.892187595367432, "rewards/rejected": -9.498437881469727, "step": 6840 }, { "epoch": 3.610964681075382, "grad_norm": 14.369635568712434, "learning_rate": 9.739061676331048e-08, "logits/chosen": -2.864062547683716, "logits/rejected": -2.885937452316284, "logps/chosen": -336.1499938964844, "logps/rejected": -480.70001220703125, "loss": 0.0141, "rewards/accuracies": 1.0, "rewards/chosen": -2.663891553878784, "rewards/margins": 7.503125190734863, "rewards/rejected": -10.178125381469727, "step": 6850 }, { "epoch": 3.6162361623616235, "grad_norm": 11.350794943165207, "learning_rate": 9.607274644175014e-08, "logits/chosen": -3.008593797683716, "logits/rejected": -3.078906297683716, "logps/chosen": -382.25, "logps/rejected": -516.5999755859375, "loss": 0.0169, "rewards/accuracies": 1.0, "rewards/chosen": -3.421875, "rewards/margins": 7.360937595367432, "rewards/rejected": -10.7890625, "step": 6860 }, { "epoch": 3.621507643647865, "grad_norm": 13.51225694260375, "learning_rate": 9.475487612018977e-08, "logits/chosen": -2.881640672683716, "logits/rejected": -2.9359374046325684, "logps/chosen": -345.1499938964844, "logps/rejected": -470.3999938964844, "loss": 0.016, "rewards/accuracies": 1.0, "rewards/chosen": -2.8101563453674316, "rewards/margins": 7.0859375, "rewards/rejected": -9.90625, "step": 6870 }, { "epoch": 3.6267791249341066, "grad_norm": 7.586450840582755, "learning_rate": 9.343700579862942e-08, "logits/chosen": -2.8515625, "logits/rejected": -2.9984374046325684, "logps/chosen": -372.04998779296875, "logps/rejected": -470.70001220703125, "loss": 0.0166, "rewards/accuracies": 1.0, "rewards/chosen": -2.9769530296325684, "rewards/margins": 6.890625, "rewards/rejected": -9.871874809265137, "step": 6880 }, { "epoch": 3.632050606220348, "grad_norm": 4.818098388252842, "learning_rate": 9.211913547706905e-08, "logits/chosen": -2.9789061546325684, "logits/rejected": -2.9296875, "logps/chosen": -358.8999938964844, "logps/rejected": -477.5, "loss": 0.0165, "rewards/accuracies": 1.0, "rewards/chosen": -3.212890625, "rewards/margins": 6.78125, "rewards/rejected": -9.990625381469727, "step": 6890 }, { "epoch": 3.6373220875065893, "grad_norm": 35.552247485624115, "learning_rate": 9.080126515550869e-08, "logits/chosen": -2.9000000953674316, "logits/rejected": -2.9867186546325684, "logps/chosen": -323.79998779296875, "logps/rejected": -457.0, "loss": 0.0188, "rewards/accuracies": 1.0, "rewards/chosen": -2.9375, "rewards/margins": 6.917187690734863, "rewards/rejected": -9.856249809265137, "step": 6900 }, { "epoch": 3.642593568792831, "grad_norm": 70.66863421507811, "learning_rate": 8.948339483394833e-08, "logits/chosen": -3.0601563453674316, "logits/rejected": -3.04296875, "logps/chosen": -382.6499938964844, "logps/rejected": -507.79998779296875, "loss": 0.0486, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.1410155296325684, "rewards/margins": 7.035937309265137, "rewards/rejected": -10.1875, "step": 6910 }, { "epoch": 3.6478650500790724, "grad_norm": 35.692028773876956, "learning_rate": 8.816552451238797e-08, "logits/chosen": -2.8746094703674316, "logits/rejected": -2.8359375, "logps/chosen": -333.92498779296875, "logps/rejected": -528.2999877929688, "loss": 0.0185, "rewards/accuracies": 1.0, "rewards/chosen": -2.9144530296325684, "rewards/margins": 7.349999904632568, "rewards/rejected": -10.267187118530273, "step": 6920 }, { "epoch": 3.6531365313653135, "grad_norm": 5.2549342942087, "learning_rate": 8.684765419082763e-08, "logits/chosen": -2.9203124046325684, "logits/rejected": -2.92578125, "logps/chosen": -366.6499938964844, "logps/rejected": -537.4000244140625, "loss": 0.0176, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.3785157203674316, "rewards/margins": 7.28125, "rewards/rejected": -10.657812118530273, "step": 6930 }, { "epoch": 3.658408012651555, "grad_norm": 6.555502763065069, "learning_rate": 8.552978386926726e-08, "logits/chosen": -2.9906249046325684, "logits/rejected": -3.03515625, "logps/chosen": -348.0, "logps/rejected": -489.5, "loss": 0.0091, "rewards/accuracies": 1.0, "rewards/chosen": -2.992504835128784, "rewards/margins": 7.595312595367432, "rewards/rejected": -10.587499618530273, "step": 6940 }, { "epoch": 3.6636794939377966, "grad_norm": 12.355063474008547, "learning_rate": 8.42119135477069e-08, "logits/chosen": -3.03515625, "logits/rejected": -3.0210938453674316, "logps/chosen": -374.5, "logps/rejected": -488.20001220703125, "loss": 0.011, "rewards/accuracies": 1.0, "rewards/chosen": -3.0748047828674316, "rewards/margins": 6.550000190734863, "rewards/rejected": -9.628125190734863, "step": 6950 }, { "epoch": 3.668950975224038, "grad_norm": 6.432248824355135, "learning_rate": 8.289404322614655e-08, "logits/chosen": -2.9476561546325684, "logits/rejected": -3.0171875953674316, "logps/chosen": -366.1499938964844, "logps/rejected": -485.20001220703125, "loss": 0.0145, "rewards/accuracies": 1.0, "rewards/chosen": -3.2119140625, "rewards/margins": 6.856249809265137, "rewards/rejected": -10.067187309265137, "step": 6960 }, { "epoch": 3.6742224565102792, "grad_norm": 11.306259963535922, "learning_rate": 8.157617290458618e-08, "logits/chosen": -2.909374952316284, "logits/rejected": -3.0234375, "logps/chosen": -357.6000061035156, "logps/rejected": -482.5, "loss": 0.0115, "rewards/accuracies": 1.0, "rewards/chosen": -3.2730469703674316, "rewards/margins": 6.8828125, "rewards/rejected": -10.151562690734863, "step": 6970 }, { "epoch": 3.679493937796521, "grad_norm": 3.039411012202053, "learning_rate": 8.025830258302583e-08, "logits/chosen": -2.91796875, "logits/rejected": -2.9203124046325684, "logps/chosen": -365.54998779296875, "logps/rejected": -497.8999938964844, "loss": 0.0127, "rewards/accuracies": 1.0, "rewards/chosen": -3.2484374046325684, "rewards/margins": 6.939062595367432, "rewards/rejected": -10.181249618530273, "step": 6980 }, { "epoch": 3.6847654190827623, "grad_norm": 37.5166680432612, "learning_rate": 7.894043226146546e-08, "logits/chosen": -2.91796875, "logits/rejected": -2.844531297683716, "logps/chosen": -366.29998779296875, "logps/rejected": -518.5, "loss": 0.0143, "rewards/accuracies": 1.0, "rewards/chosen": -3.1109375953674316, "rewards/margins": 7.556250095367432, "rewards/rejected": -10.673437118530273, "step": 6990 }, { "epoch": 3.6900369003690034, "grad_norm": 3.6485675201635877, "learning_rate": 7.762256193990511e-08, "logits/chosen": -2.9671874046325684, "logits/rejected": -2.905468702316284, "logps/chosen": -360.45001220703125, "logps/rejected": -544.4000244140625, "loss": 0.0083, "rewards/accuracies": 1.0, "rewards/chosen": -3.184765577316284, "rewards/margins": 7.7265625, "rewards/rejected": -10.909375190734863, "step": 7000 }, { "epoch": 3.695308381655245, "grad_norm": 13.978375802156704, "learning_rate": 7.630469161834476e-08, "logits/chosen": -2.836718797683716, "logits/rejected": -2.996875047683716, "logps/chosen": -353.6499938964844, "logps/rejected": -463.3999938964844, "loss": 0.0217, "rewards/accuracies": 1.0, "rewards/chosen": -3.4488282203674316, "rewards/margins": 6.989062309265137, "rewards/rejected": -10.4375, "step": 7010 }, { "epoch": 3.7005798629414866, "grad_norm": 22.63413915435003, "learning_rate": 7.498682129678439e-08, "logits/chosen": -2.9375, "logits/rejected": -2.9164061546325684, "logps/chosen": -401.75, "logps/rejected": -546.2999877929688, "loss": 0.0131, "rewards/accuracies": 1.0, "rewards/chosen": -3.5531249046325684, "rewards/margins": 7.467187404632568, "rewards/rejected": -11.021875381469727, "step": 7020 }, { "epoch": 3.705851344227728, "grad_norm": 13.683423221980334, "learning_rate": 7.366895097522404e-08, "logits/chosen": -2.8421874046325684, "logits/rejected": -2.9078125953674316, "logps/chosen": -389.1499938964844, "logps/rejected": -555.7999877929688, "loss": 0.0128, "rewards/accuracies": 1.0, "rewards/chosen": -3.592578172683716, "rewards/margins": 7.723437309265137, "rewards/rejected": -11.300000190734863, "step": 7030 }, { "epoch": 3.7111228255139697, "grad_norm": 9.814444139994167, "learning_rate": 7.235108065366367e-08, "logits/chosen": -2.98828125, "logits/rejected": -3.0484375953674316, "logps/chosen": -356.5, "logps/rejected": -476.6000061035156, "loss": 0.0098, "rewards/accuracies": 1.0, "rewards/chosen": -3.151171922683716, "rewards/margins": 7.051562309265137, "rewards/rejected": -10.203125, "step": 7040 }, { "epoch": 3.7163943068002108, "grad_norm": 9.826364269184037, "learning_rate": 7.103321033210331e-08, "logits/chosen": -3.00390625, "logits/rejected": -2.9671874046325684, "logps/chosen": -341.79998779296875, "logps/rejected": -471.3500061035156, "loss": 0.0146, "rewards/accuracies": 1.0, "rewards/chosen": -3.426953077316284, "rewards/margins": 7.015625, "rewards/rejected": -10.435937881469727, "step": 7050 }, { "epoch": 3.7216657880864523, "grad_norm": 6.824493372695757, "learning_rate": 6.971534001054295e-08, "logits/chosen": -3.0093750953674316, "logits/rejected": -2.948437452316284, "logps/chosen": -353.1499938964844, "logps/rejected": -491.0, "loss": 0.0142, "rewards/accuracies": 1.0, "rewards/chosen": -3.073046922683716, "rewards/margins": 6.842187404632568, "rewards/rejected": -9.910937309265137, "step": 7060 }, { "epoch": 3.726937269372694, "grad_norm": 12.050291459160054, "learning_rate": 6.83974696889826e-08, "logits/chosen": -2.97265625, "logits/rejected": -2.9085936546325684, "logps/chosen": -376.1000061035156, "logps/rejected": -479.8999938964844, "loss": 0.016, "rewards/accuracies": 1.0, "rewards/chosen": -3.309375047683716, "rewards/margins": 6.918749809265137, "rewards/rejected": -10.228124618530273, "step": 7070 }, { "epoch": 3.732208750658935, "grad_norm": 11.94356810772103, "learning_rate": 6.707959936742225e-08, "logits/chosen": -2.96875, "logits/rejected": -2.914843797683716, "logps/chosen": -342.45001220703125, "logps/rejected": -489.8999938964844, "loss": 0.0171, "rewards/accuracies": 1.0, "rewards/chosen": -3.5386719703674316, "rewards/margins": 7.282812595367432, "rewards/rejected": -10.821874618530273, "step": 7080 }, { "epoch": 3.7374802319451765, "grad_norm": 3.811528499775366, "learning_rate": 6.576172904586188e-08, "logits/chosen": -2.950000047683716, "logits/rejected": -3.0445313453674316, "logps/chosen": -384.6000061035156, "logps/rejected": -495.6000061035156, "loss": 0.0115, "rewards/accuracies": 1.0, "rewards/chosen": -2.7953124046325684, "rewards/margins": 7.284375190734863, "rewards/rejected": -10.079687118530273, "step": 7090 }, { "epoch": 3.742751713231418, "grad_norm": 3.503139536048655, "learning_rate": 6.444385872430153e-08, "logits/chosen": -2.953125, "logits/rejected": -2.91796875, "logps/chosen": -372.3999938964844, "logps/rejected": -527.2999877929688, "loss": 0.0158, "rewards/accuracies": 1.0, "rewards/chosen": -3.260937452316284, "rewards/margins": 7.165625095367432, "rewards/rejected": -10.431249618530273, "step": 7100 }, { "epoch": 3.7480231945176596, "grad_norm": 1.9461962647519977, "learning_rate": 6.312598840274117e-08, "logits/chosen": -2.975781202316284, "logits/rejected": -3.0078125, "logps/chosen": -384.70001220703125, "logps/rejected": -513.5999755859375, "loss": 0.0109, "rewards/accuracies": 1.0, "rewards/chosen": -3.5990233421325684, "rewards/margins": 7.800000190734863, "rewards/rejected": -11.404687881469727, "step": 7110 }, { "epoch": 3.7532946758039007, "grad_norm": 6.697495618687715, "learning_rate": 6.180811808118081e-08, "logits/chosen": -2.889843702316284, "logits/rejected": -2.934375047683716, "logps/chosen": -367.8999938964844, "logps/rejected": -499.45001220703125, "loss": 0.0087, "rewards/accuracies": 1.0, "rewards/chosen": -3.4867186546325684, "rewards/margins": 7.248437404632568, "rewards/rejected": -10.731249809265137, "step": 7120 }, { "epoch": 3.7585661570901423, "grad_norm": 9.783184258238064, "learning_rate": 6.049024775962045e-08, "logits/chosen": -2.921875, "logits/rejected": -2.9359374046325684, "logps/chosen": -342.1000061035156, "logps/rejected": -497.79998779296875, "loss": 0.0232, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.589062452316284, "rewards/margins": 7.498437404632568, "rewards/rejected": -11.09375, "step": 7130 }, { "epoch": 3.763837638376384, "grad_norm": 17.982956462023356, "learning_rate": 5.917237743806009e-08, "logits/chosen": -3.020312547683716, "logits/rejected": -3.0367188453674316, "logps/chosen": -390.1000061035156, "logps/rejected": -508.8999938964844, "loss": 0.023, "rewards/accuracies": 1.0, "rewards/chosen": -3.136523485183716, "rewards/margins": 7.099999904632568, "rewards/rejected": -10.240625381469727, "step": 7140 }, { "epoch": 3.769109119662625, "grad_norm": 28.07888075753915, "learning_rate": 5.7854507116499736e-08, "logits/chosen": -2.907031297683716, "logits/rejected": -2.890625, "logps/chosen": -376.04998779296875, "logps/rejected": -479.29998779296875, "loss": 0.0129, "rewards/accuracies": 1.0, "rewards/chosen": -3.0140624046325684, "rewards/margins": 6.925000190734863, "rewards/rejected": -9.9296875, "step": 7150 }, { "epoch": 3.7743806009488665, "grad_norm": 1.135241721756799, "learning_rate": 5.653663679493938e-08, "logits/chosen": -2.985156297683716, "logits/rejected": -2.964062452316284, "logps/chosen": -358.0, "logps/rejected": -464.79998779296875, "loss": 0.0127, "rewards/accuracies": 1.0, "rewards/chosen": -3.399609327316284, "rewards/margins": 6.912499904632568, "rewards/rejected": -10.310937881469727, "step": 7160 }, { "epoch": 3.779652082235108, "grad_norm": 33.46617284859401, "learning_rate": 5.521876647337902e-08, "logits/chosen": -2.828906297683716, "logits/rejected": -2.909374952316284, "logps/chosen": -357.8999938964844, "logps/rejected": -493.29998779296875, "loss": 0.0189, "rewards/accuracies": 1.0, "rewards/chosen": -3.5679688453674316, "rewards/margins": 7.170312404632568, "rewards/rejected": -10.731249809265137, "step": 7170 }, { "epoch": 3.7849235635213496, "grad_norm": 1.8922383364404236, "learning_rate": 5.390089615181866e-08, "logits/chosen": -2.91796875, "logits/rejected": -2.88671875, "logps/chosen": -324.8500061035156, "logps/rejected": -458.70001220703125, "loss": 0.0125, "rewards/accuracies": 1.0, "rewards/chosen": -2.929882764816284, "rewards/margins": 7.128125190734863, "rewards/rejected": -10.057812690734863, "step": 7180 }, { "epoch": 3.790195044807591, "grad_norm": 4.610330265672739, "learning_rate": 5.25830258302583e-08, "logits/chosen": -2.889843702316284, "logits/rejected": -2.866406202316284, "logps/chosen": -335.8500061035156, "logps/rejected": -505.79998779296875, "loss": 0.0225, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.1167969703674316, "rewards/margins": 7.239062309265137, "rewards/rejected": -10.359375, "step": 7190 }, { "epoch": 3.7954665260938323, "grad_norm": 82.32034159867696, "learning_rate": 5.126515550869794e-08, "logits/chosen": -3.022656202316284, "logits/rejected": -3.0445313453674316, "logps/chosen": -392.95001220703125, "logps/rejected": -485.3999938964844, "loss": 0.0249, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.3460936546325684, "rewards/margins": 6.432812690734863, "rewards/rejected": -9.784375190734863, "step": 7200 }, { "epoch": 3.800738007380074, "grad_norm": 15.563219697078564, "learning_rate": 4.994728518713758e-08, "logits/chosen": -3.02734375, "logits/rejected": -3.0804686546325684, "logps/chosen": -388.6499938964844, "logps/rejected": -529.7999877929688, "loss": 0.0216, "rewards/accuracies": 1.0, "rewards/chosen": -3.62109375, "rewards/margins": 7.193749904632568, "rewards/rejected": -10.8125, "step": 7210 }, { "epoch": 3.8060094886663154, "grad_norm": 2.7056795146698995, "learning_rate": 4.862941486557722e-08, "logits/chosen": -2.975781202316284, "logits/rejected": -2.94140625, "logps/chosen": -422.5, "logps/rejected": -542.7000122070312, "loss": 0.0139, "rewards/accuracies": 1.0, "rewards/chosen": -3.4048829078674316, "rewards/margins": 7.175000190734863, "rewards/rejected": -10.581250190734863, "step": 7220 }, { "epoch": 3.8112809699525565, "grad_norm": 2.0326847500593144, "learning_rate": 4.731154454401687e-08, "logits/chosen": -2.966796875, "logits/rejected": -2.9453125, "logps/chosen": -391.5, "logps/rejected": -488.5, "loss": 0.0208, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.390869140625, "rewards/margins": 6.854687690734863, "rewards/rejected": -10.243749618530273, "step": 7230 }, { "epoch": 3.816552451238798, "grad_norm": 44.11040823342203, "learning_rate": 4.5993674222456505e-08, "logits/chosen": -2.8843750953674316, "logits/rejected": -2.9398436546325684, "logps/chosen": -390.25, "logps/rejected": -511.5, "loss": 0.0188, "rewards/accuracies": 1.0, "rewards/chosen": -3.477343797683716, "rewards/margins": 7.532812595367432, "rewards/rejected": -11.006250381469727, "step": 7240 }, { "epoch": 3.8218239325250396, "grad_norm": 10.835586459603652, "learning_rate": 4.4675803900896145e-08, "logits/chosen": -3.012500047683716, "logits/rejected": -3.1070313453674316, "logps/chosen": -334.1000061035156, "logps/rejected": -490.5, "loss": 0.008, "rewards/accuracies": 1.0, "rewards/chosen": -3.052539110183716, "rewards/margins": 7.478125095367432, "rewards/rejected": -10.526562690734863, "step": 7250 }, { "epoch": 3.827095413811281, "grad_norm": 3.790205762601594, "learning_rate": 4.335793357933579e-08, "logits/chosen": -2.97265625, "logits/rejected": -2.930468797683716, "logps/chosen": -369.25, "logps/rejected": -516.5, "loss": 0.0117, "rewards/accuracies": 1.0, "rewards/chosen": -3.2679686546325684, "rewards/margins": 7.603125095367432, "rewards/rejected": -10.875, "step": 7260 }, { "epoch": 3.8323668950975223, "grad_norm": 13.795783393037786, "learning_rate": 4.2040063257775434e-08, "logits/chosen": -3.004687547683716, "logits/rejected": -2.987499952316284, "logps/chosen": -364.29998779296875, "logps/rejected": -482.70001220703125, "loss": 0.0151, "rewards/accuracies": 1.0, "rewards/chosen": -3.749218702316284, "rewards/margins": 6.856249809265137, "rewards/rejected": -10.609375, "step": 7270 }, { "epoch": 3.837638376383764, "grad_norm": 19.56977839505198, "learning_rate": 4.0722192936215075e-08, "logits/chosen": -3.0, "logits/rejected": -2.9632811546325684, "logps/chosen": -365.75, "logps/rejected": -489.79998779296875, "loss": 0.0159, "rewards/accuracies": 1.0, "rewards/chosen": -3.664843797683716, "rewards/margins": 6.642187595367432, "rewards/rejected": -10.3125, "step": 7280 }, { "epoch": 3.8429098576700054, "grad_norm": 49.40936100313088, "learning_rate": 3.9404322614654716e-08, "logits/chosen": -3.00390625, "logits/rejected": -2.8929686546325684, "logps/chosen": -406.3500061035156, "logps/rejected": -557.0999755859375, "loss": 0.016, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.5863280296325684, "rewards/margins": 7.428124904632568, "rewards/rejected": -11.021875381469727, "step": 7290 }, { "epoch": 3.8481813389562465, "grad_norm": 6.1496875737479595, "learning_rate": 3.808645229309436e-08, "logits/chosen": -2.917187452316284, "logits/rejected": -2.9320311546325684, "logps/chosen": -371.79998779296875, "logps/rejected": -501.8999938964844, "loss": 0.0072, "rewards/accuracies": 1.0, "rewards/chosen": -3.229296922683716, "rewards/margins": 7.176562309265137, "rewards/rejected": -10.40625, "step": 7300 }, { "epoch": 3.853452820242488, "grad_norm": 10.310731861361019, "learning_rate": 3.6768581971534e-08, "logits/chosen": -2.8062500953674316, "logits/rejected": -2.8636717796325684, "logps/chosen": -364.3500061035156, "logps/rejected": -478.1000061035156, "loss": 0.0155, "rewards/accuracies": 1.0, "rewards/chosen": -3.3550782203674316, "rewards/margins": 7.076562404632568, "rewards/rejected": -10.425000190734863, "step": 7310 }, { "epoch": 3.8587243015287296, "grad_norm": 7.718802457959731, "learning_rate": 3.545071164997364e-08, "logits/chosen": -2.9476561546325684, "logits/rejected": -2.9312500953674316, "logps/chosen": -342.79998779296875, "logps/rejected": -459.6000061035156, "loss": 0.0101, "rewards/accuracies": 1.0, "rewards/chosen": -3.323437452316284, "rewards/margins": 6.651562690734863, "rewards/rejected": -9.978124618530273, "step": 7320 }, { "epoch": 3.863995782814971, "grad_norm": 35.537839526660946, "learning_rate": 3.4132841328413286e-08, "logits/chosen": -3.0101561546325684, "logits/rejected": -2.944531202316284, "logps/chosen": -322.1499938964844, "logps/rejected": -473.5, "loss": 0.0166, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.965625047683716, "rewards/margins": 7.331250190734863, "rewards/rejected": -10.296875, "step": 7330 }, { "epoch": 3.8692672641012127, "grad_norm": 11.934307838394634, "learning_rate": 3.281497100685293e-08, "logits/chosen": -2.981250047683716, "logits/rejected": -2.9351563453674316, "logps/chosen": -337.4750061035156, "logps/rejected": -491.5, "loss": 0.0092, "rewards/accuracies": 1.0, "rewards/chosen": -2.866015672683716, "rewards/margins": 7.412499904632568, "rewards/rejected": -10.278124809265137, "step": 7340 }, { "epoch": 3.874538745387454, "grad_norm": 8.293297971253054, "learning_rate": 3.149710068529256e-08, "logits/chosen": -3.00390625, "logits/rejected": -2.9625000953674316, "logps/chosen": -341.1000061035156, "logps/rejected": -480.5, "loss": 0.0159, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.830859422683716, "rewards/margins": 6.840624809265137, "rewards/rejected": -9.673437118530273, "step": 7350 }, { "epoch": 3.8798102266736954, "grad_norm": 17.909144234703323, "learning_rate": 3.017923036373221e-08, "logits/chosen": -2.852343797683716, "logits/rejected": -3.0171875953674316, "logps/chosen": -399.5, "logps/rejected": -484.20001220703125, "loss": 0.0174, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.16015625, "rewards/margins": 6.865624904632568, "rewards/rejected": -10.0234375, "step": 7360 }, { "epoch": 3.885081707959937, "grad_norm": 23.645985191644133, "learning_rate": 2.886136004217185e-08, "logits/chosen": -3.12109375, "logits/rejected": -3.0328125953674316, "logps/chosen": -351.79998779296875, "logps/rejected": -510.1000061035156, "loss": 0.0241, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.176562547683716, "rewards/margins": 7.167187690734863, "rewards/rejected": -10.340624809265137, "step": 7370 }, { "epoch": 3.890353189246178, "grad_norm": 3.545348870123865, "learning_rate": 2.754348972061149e-08, "logits/chosen": -2.9453125, "logits/rejected": -2.8773436546325684, "logps/chosen": -373.25, "logps/rejected": -507.29998779296875, "loss": 0.0059, "rewards/accuracies": 1.0, "rewards/chosen": -3.171093702316284, "rewards/margins": 7.518750190734863, "rewards/rejected": -10.692187309265137, "step": 7380 }, { "epoch": 3.8956246705324196, "grad_norm": 1.5854541284718062, "learning_rate": 2.6225619399051132e-08, "logits/chosen": -2.9124999046325684, "logits/rejected": -2.983593702316284, "logps/chosen": -385.25, "logps/rejected": -549.7999877929688, "loss": 0.0068, "rewards/accuracies": 1.0, "rewards/chosen": -3.186718702316284, "rewards/margins": 7.628125190734863, "rewards/rejected": -10.803125381469727, "step": 7390 }, { "epoch": 3.900896151818661, "grad_norm": 2.9894207756758027, "learning_rate": 2.4907749077490773e-08, "logits/chosen": -2.969531297683716, "logits/rejected": -3.0367188453674316, "logps/chosen": -396.70001220703125, "logps/rejected": -479.20001220703125, "loss": 0.0113, "rewards/accuracies": 1.0, "rewards/chosen": -3.313281297683716, "rewards/margins": 7.235937595367432, "rewards/rejected": -10.548437118530273, "step": 7400 }, { "epoch": 3.9061676331049027, "grad_norm": 59.17063355431289, "learning_rate": 2.3589878755930417e-08, "logits/chosen": -2.848437547683716, "logits/rejected": -2.91015625, "logps/chosen": -345.25, "logps/rejected": -503.0, "loss": 0.0208, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.356640577316284, "rewards/margins": 7.651562690734863, "rewards/rejected": -11.010937690734863, "step": 7410 }, { "epoch": 3.911439114391144, "grad_norm": 37.64262660561514, "learning_rate": 2.2272008434370054e-08, "logits/chosen": -2.973437547683716, "logits/rejected": -2.995312452316284, "logps/chosen": -345.29998779296875, "logps/rejected": -520.2000122070312, "loss": 0.0101, "rewards/accuracies": 1.0, "rewards/chosen": -3.171679735183716, "rewards/margins": 7.154687404632568, "rewards/rejected": -10.323437690734863, "step": 7420 }, { "epoch": 3.9167105956773853, "grad_norm": 21.808339897793044, "learning_rate": 2.09541381128097e-08, "logits/chosen": -2.9453125, "logits/rejected": -2.97265625, "logps/chosen": -339.6499938964844, "logps/rejected": -477.5, "loss": 0.0104, "rewards/accuracies": 1.0, "rewards/chosen": -2.891406297683716, "rewards/margins": 7.457812309265137, "rewards/rejected": -10.353124618530273, "step": 7430 }, { "epoch": 3.921982076963627, "grad_norm": 5.865365222133183, "learning_rate": 1.9636267791249343e-08, "logits/chosen": -2.9156250953674316, "logits/rejected": -2.944531202316284, "logps/chosen": -368.3500061035156, "logps/rejected": -513.5, "loss": 0.0087, "rewards/accuracies": 1.0, "rewards/chosen": -3.028125047683716, "rewards/margins": 7.243750095367432, "rewards/rejected": -10.276562690734863, "step": 7440 }, { "epoch": 3.927253558249868, "grad_norm": 3.066079743533668, "learning_rate": 1.831839746968898e-08, "logits/chosen": -2.99609375, "logits/rejected": -2.9859375953674316, "logps/chosen": -355.70001220703125, "logps/rejected": -516.2999877929688, "loss": 0.0059, "rewards/accuracies": 1.0, "rewards/chosen": -3.3843750953674316, "rewards/margins": 7.6640625, "rewards/rejected": -11.046875, "step": 7450 }, { "epoch": 3.9325250395361095, "grad_norm": 3.1988165453837047, "learning_rate": 1.7000527148128625e-08, "logits/chosen": -2.8765625953674316, "logits/rejected": -2.94921875, "logps/chosen": -369.8999938964844, "logps/rejected": -507.3999938964844, "loss": 0.0157, "rewards/accuracies": 1.0, "rewards/chosen": -3.112109422683716, "rewards/margins": 7.035937309265137, "rewards/rejected": -10.140625, "step": 7460 }, { "epoch": 3.937796520822351, "grad_norm": 2.839884125684661, "learning_rate": 1.5682656826568266e-08, "logits/chosen": -2.9156250953674316, "logits/rejected": -2.893749952316284, "logps/chosen": -326.0, "logps/rejected": -452.5, "loss": 0.0139, "rewards/accuracies": 1.0, "rewards/chosen": -2.98828125, "rewards/margins": 7.059374809265137, "rewards/rejected": -10.048437118530273, "step": 7470 }, { "epoch": 3.9430680021085927, "grad_norm": 2.482906905152781, "learning_rate": 1.4364786505007907e-08, "logits/chosen": -2.81640625, "logits/rejected": -2.922656297683716, "logps/chosen": -327.54998779296875, "logps/rejected": -467.1000061035156, "loss": 0.0121, "rewards/accuracies": 1.0, "rewards/chosen": -2.876953125, "rewards/margins": 7.293749809265137, "rewards/rejected": -10.162500381469727, "step": 7480 }, { "epoch": 3.948339483394834, "grad_norm": 5.640848201241061, "learning_rate": 1.304691618344755e-08, "logits/chosen": -3.0289063453674316, "logits/rejected": -3.0328125953674316, "logps/chosen": -367.6499938964844, "logps/rejected": -515.7999877929688, "loss": 0.0157, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.131640672683716, "rewards/margins": 7.471875190734863, "rewards/rejected": -10.609375, "step": 7490 }, { "epoch": 3.9536109646810753, "grad_norm": 2.737453535807709, "learning_rate": 1.172904586188719e-08, "logits/chosen": -2.831249952316284, "logits/rejected": -2.907031297683716, "logps/chosen": -380.57501220703125, "logps/rejected": -504.0, "loss": 0.0075, "rewards/accuracies": 1.0, "rewards/chosen": -3.263867139816284, "rewards/margins": 7.215624809265137, "rewards/rejected": -10.481249809265137, "step": 7500 }, { "epoch": 3.958882445967317, "grad_norm": 7.145820618789193, "learning_rate": 1.0411175540326831e-08, "logits/chosen": -2.8960938453674316, "logits/rejected": -2.948437452316284, "logps/chosen": -358.6499938964844, "logps/rejected": -506.29998779296875, "loss": 0.0167, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.197265625, "rewards/margins": 7.209374904632568, "rewards/rejected": -10.407812118530273, "step": 7510 }, { "epoch": 3.964153927253558, "grad_norm": 18.714618716376552, "learning_rate": 9.093305218766472e-09, "logits/chosen": -2.9945311546325684, "logits/rejected": -2.8148436546325684, "logps/chosen": -342.5, "logps/rejected": -516.5999755859375, "loss": 0.0176, "rewards/accuracies": 1.0, "rewards/chosen": -3.165820360183716, "rewards/margins": 7.5703125, "rewards/rejected": -10.737500190734863, "step": 7520 }, { "epoch": 3.9694254085397995, "grad_norm": 4.986820010223568, "learning_rate": 7.775434897206115e-09, "logits/chosen": -2.9976563453674316, "logits/rejected": -3.0570311546325684, "logps/chosen": -357.25, "logps/rejected": -497.29998779296875, "loss": 0.01, "rewards/accuracies": 1.0, "rewards/chosen": -2.8363280296325684, "rewards/margins": 6.953125, "rewards/rejected": -9.7890625, "step": 7530 }, { "epoch": 3.974696889826041, "grad_norm": 3.5699829959232128, "learning_rate": 6.457564575645756e-09, "logits/chosen": -2.8671875, "logits/rejected": -2.864062547683716, "logps/chosen": -388.8999938964844, "logps/rejected": -522.2000122070312, "loss": 0.0145, "rewards/accuracies": 1.0, "rewards/chosen": -3.1546874046325684, "rewards/margins": 7.532812595367432, "rewards/rejected": -10.693750381469727, "step": 7540 }, { "epoch": 3.9799683711122826, "grad_norm": 15.214586715991395, "learning_rate": 5.139694254085398e-09, "logits/chosen": -2.852343797683716, "logits/rejected": -2.88671875, "logps/chosen": -338.45001220703125, "logps/rejected": -470.79998779296875, "loss": 0.0138, "rewards/accuracies": 1.0, "rewards/chosen": -3.13671875, "rewards/margins": 7.028124809265137, "rewards/rejected": -10.162500381469727, "step": 7550 }, { "epoch": 3.985239852398524, "grad_norm": 4.61544875661831, "learning_rate": 3.82182393252504e-09, "logits/chosen": -2.8609375953674316, "logits/rejected": -2.917187452316284, "logps/chosen": -383.8500061035156, "logps/rejected": -505.79998779296875, "loss": 0.0085, "rewards/accuracies": 1.0, "rewards/chosen": -3.360156297683716, "rewards/margins": 6.923437595367432, "rewards/rejected": -10.284375190734863, "step": 7560 }, { "epoch": 3.9905113336847653, "grad_norm": 4.441693610448272, "learning_rate": 2.5039536109646808e-09, "logits/chosen": -2.9749999046325684, "logits/rejected": -3.024218797683716, "logps/chosen": -363.25, "logps/rejected": -463.3999938964844, "loss": 0.018, "rewards/accuracies": 1.0, "rewards/chosen": -3.026562452316284, "rewards/margins": 6.965624809265137, "rewards/rejected": -9.993749618530273, "step": 7570 }, { "epoch": 3.995782814971007, "grad_norm": 2.390116385200875, "learning_rate": 1.1860832894043225e-09, "logits/chosen": -2.9781250953674316, "logits/rejected": -2.9359374046325684, "logps/chosen": -350.75, "logps/rejected": -504.20001220703125, "loss": 0.017, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.0992188453674316, "rewards/margins": 7.456250190734863, "rewards/rejected": -10.550000190734863, "step": 7580 } ], "logging_steps": 10, "max_steps": 7588, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }