{ "best_metric": 0.2292831838130951, "best_model_checkpoint": "models/llama3.2-3b-orpo-finegrained-2e/checkpoint-10000", "epoch": 1.8046632498375803, "eval_steps": 5000, "global_step": 25000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 7.218652999350322e-05, "grad_norm": 39.183845052779965, "learning_rate": 8e-08, "logits/chosen": -0.98046875, "logits/rejected": -0.3984375, "logps/chosen": -268.0, "logps/rejected": -250.0, "loss": 0.6914, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.0007218652999350321, "grad_norm": 31.076870179342087, "learning_rate": 8e-07, "logits/chosen": -0.8125, "logits/rejected": -0.67578125, "logps/chosen": -270.0, "logps/rejected": -236.0, "loss": 0.6893, "rewards/accuracies": 0.2222222238779068, "rewards/chosen": -0.0006866455078125, "rewards/margins": -0.007659912109375, "rewards/rejected": 0.0069580078125, "step": 10 }, { "epoch": 0.0014437305998700643, "grad_norm": 21.175963614017803, "learning_rate": 1.6e-06, "logits/chosen": -1.078125, "logits/rejected": -0.890625, "logps/chosen": -266.0, "logps/rejected": -224.0, "loss": 0.6261, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": 0.443359375, "rewards/margins": 0.1865234375, "rewards/rejected": 0.2578125, "step": 20 }, { "epoch": 0.0021655958998050965, "grad_norm": 21.775231172767626, "learning_rate": 2.4e-06, "logits/chosen": -1.2109375, "logits/rejected": -0.95703125, "logps/chosen": -250.0, "logps/rejected": -234.0, "loss": 0.5626, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 1.1328125, "rewards/margins": 0.59765625, "rewards/rejected": 0.53515625, "step": 30 }, { "epoch": 0.0028874611997401285, "grad_norm": 21.635928820533405, "learning_rate": 3.2e-06, "logits/chosen": -1.1953125, "logits/rejected": -0.890625, "logps/chosen": -266.0, "logps/rejected": -252.0, "loss": 0.5247, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": 1.5546875, "rewards/margins": 1.1015625, "rewards/rejected": 0.455078125, "step": 40 }, { "epoch": 0.0036093264996751606, "grad_norm": 22.959695040966608, "learning_rate": 4e-06, "logits/chosen": -1.0234375, "logits/rejected": -0.78515625, "logps/chosen": -272.0, "logps/rejected": -237.0, "loss": 0.4994, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 1.3203125, "rewards/margins": 1.1875, "rewards/rejected": 0.13671875, "step": 50 }, { "epoch": 0.004331191799610193, "grad_norm": 19.68420693687621, "learning_rate": 4.8e-06, "logits/chosen": -0.984375, "logits/rejected": -0.75, "logps/chosen": -266.0, "logps/rejected": -231.0, "loss": 0.4508, "rewards/accuracies": 0.793749988079071, "rewards/chosen": 1.109375, "rewards/margins": 1.3203125, "rewards/rejected": -0.2099609375, "step": 60 }, { "epoch": 0.005053057099545225, "grad_norm": 19.906762834400418, "learning_rate": 5.6e-06, "logits/chosen": -1.140625, "logits/rejected": -0.79296875, "logps/chosen": -250.0, "logps/rejected": -248.0, "loss": 0.4582, "rewards/accuracies": 0.731249988079071, "rewards/chosen": 1.1953125, "rewards/margins": 1.4765625, "rewards/rejected": -0.28125, "step": 70 }, { "epoch": 0.005774922399480257, "grad_norm": 24.82142599643133, "learning_rate": 6.4e-06, "logits/chosen": -1.1484375, "logits/rejected": -0.92578125, "logps/chosen": -278.0, "logps/rejected": -248.0, "loss": 0.4917, "rewards/accuracies": 0.768750011920929, "rewards/chosen": 1.1875, "rewards/margins": 1.8046875, "rewards/rejected": -0.62109375, "step": 80 }, { "epoch": 0.006496787699415289, "grad_norm": 23.102131491448425, "learning_rate": 7.2e-06, "logits/chosen": -1.078125, "logits/rejected": -0.75390625, "logps/chosen": -292.0, "logps/rejected": -260.0, "loss": 0.448, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.5859375, "rewards/margins": 2.109375, "rewards/rejected": -1.53125, "step": 90 }, { "epoch": 0.007218652999350321, "grad_norm": 18.725478638112424, "learning_rate": 8e-06, "logits/chosen": -1.0390625, "logits/rejected": -0.671875, "logps/chosen": -294.0, "logps/rejected": -276.0, "loss": 0.4193, "rewards/accuracies": 0.84375, "rewards/chosen": -0.1318359375, "rewards/margins": 2.1875, "rewards/rejected": -2.3125, "step": 100 }, { "epoch": 0.007940518299285354, "grad_norm": 13.800941717440335, "learning_rate": 7.627700713964738e-06, "logits/chosen": -0.9375, "logits/rejected": -0.54296875, "logps/chosen": -282.0, "logps/rejected": -268.0, "loss": 0.4233, "rewards/accuracies": 0.84375, "rewards/chosen": -1.515625, "rewards/margins": 2.5625, "rewards/rejected": -4.0625, "step": 110 }, { "epoch": 0.008662383599220386, "grad_norm": 29.46849814377739, "learning_rate": 7.3029674334022146e-06, "logits/chosen": -0.8984375, "logits/rejected": -0.5859375, "logps/chosen": -270.0, "logps/rejected": -260.0, "loss": 0.4609, "rewards/accuracies": 0.84375, "rewards/chosen": -1.2578125, "rewards/margins": 2.171875, "rewards/rejected": -3.421875, "step": 120 }, { "epoch": 0.009384248899155418, "grad_norm": 27.079304956420785, "learning_rate": 7.016464154456233e-06, "logits/chosen": -0.99609375, "logits/rejected": -0.71484375, "logps/chosen": -276.0, "logps/rejected": -276.0, "loss": 0.5053, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -0.82421875, "rewards/margins": 2.234375, "rewards/rejected": -3.046875, "step": 130 }, { "epoch": 0.01010611419909045, "grad_norm": 16.54343754139313, "learning_rate": 6.7612340378281325e-06, "logits/chosen": -0.640625, "logits/rejected": -0.435546875, "logps/chosen": -308.0, "logps/rejected": -292.0, "loss": 0.3899, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.859375, "rewards/margins": 2.5625, "rewards/rejected": -4.40625, "step": 140 }, { "epoch": 0.010827979499025482, "grad_norm": 24.363162726167648, "learning_rate": 6.531972647421809e-06, "logits/chosen": -0.486328125, "logits/rejected": -0.228515625, "logps/chosen": -324.0, "logps/rejected": -306.0, "loss": 0.4729, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.3125, "rewards/margins": 3.03125, "rewards/rejected": -6.34375, "step": 150 }, { "epoch": 0.011549844798960514, "grad_norm": 16.226244062364913, "learning_rate": 6.3245553203367575e-06, "logits/chosen": -0.515625, "logits/rejected": -0.14453125, "logps/chosen": -304.0, "logps/rejected": -310.0, "loss": 0.4506, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -2.421875, "rewards/margins": 3.296875, "rewards/rejected": -5.71875, "step": 160 }, { "epoch": 0.012271710098895546, "grad_norm": 14.144043147626373, "learning_rate": 6.135719910778963e-06, "logits/chosen": -0.58203125, "logits/rejected": -0.26171875, "logps/chosen": -302.0, "logps/rejected": -284.0, "loss": 0.381, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -2.234375, "rewards/margins": 2.46875, "rewards/rejected": -4.71875, "step": 170 }, { "epoch": 0.012993575398830578, "grad_norm": 17.791124440519322, "learning_rate": 5.962847939999439e-06, "logits/chosen": -0.431640625, "logits/rejected": -0.1806640625, "logps/chosen": -348.0, "logps/rejected": -316.0, "loss": 0.4342, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -3.9375, "rewards/margins": 2.15625, "rewards/rejected": -6.09375, "step": 180 }, { "epoch": 0.01371544069876561, "grad_norm": 17.146267516727967, "learning_rate": 5.803810000880094e-06, "logits/chosen": -0.48828125, "logits/rejected": -0.1826171875, "logps/chosen": -326.0, "logps/rejected": -306.0, "loss": 0.4248, "rewards/accuracies": 0.875, "rewards/chosen": -4.0625, "rewards/margins": 2.71875, "rewards/rejected": -6.78125, "step": 190 }, { "epoch": 0.014437305998700642, "grad_norm": 15.643229303728845, "learning_rate": 5.65685424949238e-06, "logits/chosen": -0.361328125, "logits/rejected": -0.046875, "logps/chosen": -310.0, "logps/rejected": -304.0, "loss": 0.4142, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -4.75, "rewards/margins": 2.953125, "rewards/rejected": -7.71875, "step": 200 }, { "epoch": 0.015159171298635674, "grad_norm": 15.623550125710054, "learning_rate": 5.5205244747388325e-06, "logits/chosen": -0.47265625, "logits/rejected": -0.169921875, "logps/chosen": -318.0, "logps/rejected": -316.0, "loss": 0.3604, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -4.71875, "rewards/margins": 3.125, "rewards/rejected": -7.84375, "step": 210 }, { "epoch": 0.015881036598570708, "grad_norm": 14.039298510886779, "learning_rate": 5.393598899705936e-06, "logits/chosen": -0.76953125, "logits/rejected": -0.3125, "logps/chosen": -328.0, "logps/rejected": -334.0, "loss": 0.3764, "rewards/accuracies": 0.8125, "rewards/chosen": -4.34375, "rewards/margins": 3.140625, "rewards/rejected": -7.5, "step": 220 }, { "epoch": 0.01660290189850574, "grad_norm": 15.472304985252848, "learning_rate": 5.275043787166296e-06, "logits/chosen": -0.5703125, "logits/rejected": -0.2294921875, "logps/chosen": -338.0, "logps/rejected": -320.0, "loss": 0.4345, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -5.0, "rewards/margins": 2.6875, "rewards/rejected": -7.6875, "step": 230 }, { "epoch": 0.017324767198440772, "grad_norm": 12.094721951332676, "learning_rate": 5.163977794943223e-06, "logits/chosen": -0.64453125, "logits/rejected": -0.376953125, "logps/chosen": -326.0, "logps/rejected": -320.0, "loss": 0.39, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -3.6875, "rewards/margins": 3.203125, "rewards/rejected": -6.875, "step": 240 }, { "epoch": 0.018046632498375802, "grad_norm": 17.818324566636008, "learning_rate": 5.059644256269407e-06, "logits/chosen": -0.6171875, "logits/rejected": -0.33984375, "logps/chosen": -368.0, "logps/rejected": -368.0, "loss": 0.3609, "rewards/accuracies": 0.84375, "rewards/chosen": -3.984375, "rewards/margins": 3.953125, "rewards/rejected": -7.9375, "step": 250 }, { "epoch": 0.018768497798310836, "grad_norm": 16.92689786227541, "learning_rate": 4.961389383568338e-06, "logits/chosen": -0.48046875, "logits/rejected": -0.07275390625, "logps/chosen": -310.0, "logps/rejected": -338.0, "loss": 0.37, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -4.125, "rewards/margins": 3.78125, "rewards/rejected": -7.90625, "step": 260 }, { "epoch": 0.019490363098245866, "grad_norm": 16.802511811771378, "learning_rate": 4.8686449556014755e-06, "logits/chosen": -0.43359375, "logits/rejected": 0.00830078125, "logps/chosen": -338.0, "logps/rejected": -324.0, "loss": 0.3996, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -4.53125, "rewards/margins": 3.078125, "rewards/rejected": -7.625, "step": 270 }, { "epoch": 0.0202122283981809, "grad_norm": 15.207255170129951, "learning_rate": 4.780914437337574e-06, "logits/chosen": -0.416015625, "logits/rejected": 0.05908203125, "logps/chosen": -316.0, "logps/rejected": -332.0, "loss": 0.3749, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -3.59375, "rewards/margins": 3.6875, "rewards/rejected": -7.28125, "step": 280 }, { "epoch": 0.02093409369811593, "grad_norm": 19.420890259778123, "learning_rate": 4.697761756117627e-06, "logits/chosen": -0.3828125, "logits/rejected": -0.08203125, "logps/chosen": -324.0, "logps/rejected": -314.0, "loss": 0.3824, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -3.46875, "rewards/margins": 3.140625, "rewards/rejected": -6.625, "step": 290 }, { "epoch": 0.021655958998050964, "grad_norm": 13.813910389327559, "learning_rate": 4.618802153517006e-06, "logits/chosen": -0.203125, "logits/rejected": 0.041259765625, "logps/chosen": -324.0, "logps/rejected": -342.0, "loss": 0.3636, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -4.40625, "rewards/margins": 3.09375, "rewards/rejected": -7.5, "step": 300 }, { "epoch": 0.022377824297985995, "grad_norm": 14.632752537013134, "learning_rate": 4.543694673976518e-06, "logits/chosen": -0.408203125, "logits/rejected": -0.06494140625, "logps/chosen": -322.0, "logps/rejected": -334.0, "loss": 0.3152, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -5.59375, "rewards/margins": 3.734375, "rewards/rejected": -9.3125, "step": 310 }, { "epoch": 0.02309968959792103, "grad_norm": 14.78236854045537, "learning_rate": 4.472135954999579e-06, "logits/chosen": -0.5703125, "logits/rejected": -0.1357421875, "logps/chosen": -330.0, "logps/rejected": -312.0, "loss": 0.3726, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -4.28125, "rewards/margins": 3.359375, "rewards/rejected": -7.625, "step": 320 }, { "epoch": 0.02382155489785606, "grad_norm": 13.705393113490159, "learning_rate": 4.403855060505443e-06, "logits/chosen": -0.578125, "logits/rejected": -0.060302734375, "logps/chosen": -324.0, "logps/rejected": -318.0, "loss": 0.3589, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -4.0625, "rewards/margins": 4.03125, "rewards/rejected": -8.0625, "step": 330 }, { "epoch": 0.024543420197791092, "grad_norm": 14.187410527686973, "learning_rate": 4.338609156373123e-06, "logits/chosen": -0.546875, "logits/rejected": -0.2578125, "logps/chosen": -328.0, "logps/rejected": -348.0, "loss": 0.3757, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -4.9375, "rewards/margins": 3.859375, "rewards/rejected": -8.8125, "step": 340 }, { "epoch": 0.025265285497726123, "grad_norm": 20.528523929299055, "learning_rate": 4.27617987059879e-06, "logits/chosen": -0.4453125, "logits/rejected": -0.09375, "logps/chosen": -346.0, "logps/rejected": -320.0, "loss": 0.3922, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -5.0625, "rewards/margins": 3.5, "rewards/rejected": -8.5625, "step": 350 }, { "epoch": 0.025987150797661156, "grad_norm": 11.490954219307202, "learning_rate": 4.216370213557839e-06, "logits/chosen": -0.279296875, "logits/rejected": 0.08740234375, "logps/chosen": -360.0, "logps/rejected": -346.0, "loss": 0.3944, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -6.875, "rewards/margins": 2.78125, "rewards/rejected": -9.6875, "step": 360 }, { "epoch": 0.02670901609759619, "grad_norm": 14.376197388692082, "learning_rate": 4.15900195928029e-06, "logits/chosen": -0.212890625, "logits/rejected": 0.04345703125, "logps/chosen": -326.0, "logps/rejected": -344.0, "loss": 0.3212, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -6.4375, "rewards/margins": 4.0625, "rewards/rejected": -10.5, "step": 370 }, { "epoch": 0.02743088139753122, "grad_norm": 10.697701401279515, "learning_rate": 4.103913408340617e-06, "logits/chosen": -0.373046875, "logits/rejected": 0.16796875, "logps/chosen": -360.0, "logps/rejected": -366.0, "loss": 0.3368, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -7.4375, "rewards/margins": 4.90625, "rewards/rejected": -12.375, "step": 380 }, { "epoch": 0.028152746697466254, "grad_norm": 22.29766266513623, "learning_rate": 4.050957468334666e-06, "logits/chosen": -0.197265625, "logits/rejected": 0.2099609375, "logps/chosen": -322.0, "logps/rejected": -334.0, "loss": 0.3197, "rewards/accuracies": 0.875, "rewards/chosen": -5.75, "rewards/margins": 4.71875, "rewards/rejected": -10.4375, "step": 390 }, { "epoch": 0.028874611997401285, "grad_norm": 19.793469923160668, "learning_rate": 4e-06, "logits/chosen": -0.03173828125, "logits/rejected": 0.353515625, "logps/chosen": -390.0, "logps/rejected": -392.0, "loss": 0.3157, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -9.1875, "rewards/margins": 4.40625, "rewards/rejected": -13.625, "step": 400 }, { "epoch": 0.02959647729733632, "grad_norm": 13.159768240559597, "learning_rate": 3.950918386598359e-06, "logits/chosen": 0.037353515625, "logits/rejected": 0.27734375, "logps/chosen": -374.0, "logps/rejected": -376.0, "loss": 0.3326, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -10.4375, "rewards/margins": 4.65625, "rewards/rejected": -15.0625, "step": 410 }, { "epoch": 0.03031834259727135, "grad_norm": 15.397323459285412, "learning_rate": 3.903600291794132e-06, "logits/chosen": -0.03173828125, "logits/rejected": 0.404296875, "logps/chosen": -380.0, "logps/rejected": -366.0, "loss": 0.3061, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -8.5625, "rewards/margins": 4.40625, "rewards/rejected": -12.9375, "step": 420 }, { "epoch": 0.031040207897206382, "grad_norm": 12.243430321801844, "learning_rate": 3.857942577363297e-06, "logits/chosen": -0.1494140625, "logits/rejected": 0.357421875, "logps/chosen": -360.0, "logps/rejected": -348.0, "loss": 0.3693, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -6.71875, "rewards/margins": 4.34375, "rewards/rejected": -11.0625, "step": 430 }, { "epoch": 0.031762073197141416, "grad_norm": 12.189712190640137, "learning_rate": 3.813850356982369e-06, "logits/chosen": 0.07421875, "logits/rejected": 0.3359375, "logps/chosen": -334.0, "logps/rejected": -344.0, "loss": 0.3726, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -6.34375, "rewards/margins": 4.15625, "rewards/rejected": -10.5, "step": 440 }, { "epoch": 0.032483938497076446, "grad_norm": 16.78797893415546, "learning_rate": 3.7712361663282537e-06, "logits/chosen": 0.00701904296875, "logits/rejected": 0.419921875, "logps/chosen": -342.0, "logps/rejected": -348.0, "loss": 0.3689, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -7.21875, "rewards/margins": 3.96875, "rewards/rejected": -11.1875, "step": 450 }, { "epoch": 0.03320580379701148, "grad_norm": 8.961299095782637, "learning_rate": 3.730019232961255e-06, "logits/chosen": -0.04638671875, "logits/rejected": 0.369140625, "logps/chosen": -356.0, "logps/rejected": -356.0, "loss": 0.3412, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -7.28125, "rewards/margins": 4.5, "rewards/rejected": -11.8125, "step": 460 }, { "epoch": 0.03392766909694651, "grad_norm": 14.992664870296256, "learning_rate": 3.6901248321155403e-06, "logits/chosen": -0.1806640625, "logits/rejected": 0.3203125, "logps/chosen": -370.0, "logps/rejected": -386.0, "loss": 0.2873, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -6.96875, "rewards/margins": 5.1875, "rewards/rejected": -12.125, "step": 470 }, { "epoch": 0.034649534396881544, "grad_norm": 13.482933430674874, "learning_rate": 3.6514837167011073e-06, "logits/chosen": -0.11181640625, "logits/rejected": 0.244140625, "logps/chosen": -340.0, "logps/rejected": -342.0, "loss": 0.3375, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -5.84375, "rewards/margins": 4.46875, "rewards/rejected": -10.3125, "step": 480 }, { "epoch": 0.035371399696816574, "grad_norm": 14.381739856390974, "learning_rate": 3.6140316116210052e-06, "logits/chosen": -0.08447265625, "logits/rejected": 0.26953125, "logps/chosen": -350.0, "logps/rejected": -356.0, "loss": 0.3181, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -7.8125, "rewards/margins": 4.28125, "rewards/rejected": -12.125, "step": 490 }, { "epoch": 0.036093264996751605, "grad_norm": 13.058132045977628, "learning_rate": 3.5777087639996634e-06, "logits/chosen": -0.1474609375, "logits/rejected": 0.333984375, "logps/chosen": -366.0, "logps/rejected": -382.0, "loss": 0.309, "rewards/accuracies": 0.84375, "rewards/chosen": -8.1875, "rewards/margins": 4.5, "rewards/rejected": -12.6875, "step": 500 }, { "epoch": 0.036815130296686635, "grad_norm": 10.19256374291624, "learning_rate": 3.5424595421603814e-06, "logits/chosen": -0.1904296875, "logits/rejected": 0.29296875, "logps/chosen": -352.0, "logps/rejected": -360.0, "loss": 0.3023, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -7.5625, "rewards/margins": 5.375, "rewards/rejected": -12.9375, "step": 510 }, { "epoch": 0.03753699559662167, "grad_norm": 18.94194804238697, "learning_rate": 3.5082320772281165e-06, "logits/chosen": -0.126953125, "logits/rejected": 0.275390625, "logps/chosen": -360.0, "logps/rejected": -378.0, "loss": 0.3131, "rewards/accuracies": 0.875, "rewards/chosen": -9.0625, "rewards/margins": 5.15625, "rewards/rejected": -14.25, "step": 520 }, { "epoch": 0.0382588608965567, "grad_norm": 11.032650225015894, "learning_rate": 3.474977942104555e-06, "logits/chosen": 0.0693359375, "logits/rejected": 0.484375, "logps/chosen": -360.0, "logps/rejected": -374.0, "loss": 0.2757, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -8.75, "rewards/margins": 4.15625, "rewards/rejected": -12.875, "step": 530 }, { "epoch": 0.03898072619649173, "grad_norm": 13.473720018918133, "learning_rate": 3.442651863295481e-06, "logits/chosen": -0.302734375, "logits/rejected": 0.25390625, "logps/chosen": -346.0, "logps/rejected": -372.0, "loss": 0.3406, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -6.59375, "rewards/margins": 4.8125, "rewards/rejected": -11.4375, "step": 540 }, { "epoch": 0.03970259149642677, "grad_norm": 10.317412100683466, "learning_rate": 3.4112114616897665e-06, "logits/chosen": -0.2099609375, "logits/rejected": 0.205078125, "logps/chosen": -360.0, "logps/rejected": -346.0, "loss": 0.3006, "rewards/accuracies": 0.9375, "rewards/chosen": -6.1875, "rewards/margins": 4.8125, "rewards/rejected": -11.0, "step": 550 }, { "epoch": 0.0404244567963618, "grad_norm": 12.696703961135151, "learning_rate": 3.3806170189140663e-06, "logits/chosen": -0.032958984375, "logits/rejected": 0.43359375, "logps/chosen": -358.0, "logps/rejected": -372.0, "loss": 0.3173, "rewards/accuracies": 0.90625, "rewards/chosen": -7.0, "rewards/margins": 4.84375, "rewards/rejected": -11.8125, "step": 560 }, { "epoch": 0.04114632209629683, "grad_norm": 14.44465556922375, "learning_rate": 3.350831266333564e-06, "logits/chosen": 0.1689453125, "logits/rejected": 0.5546875, "logps/chosen": -370.0, "logps/rejected": -364.0, "loss": 0.3179, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -8.125, "rewards/margins": 4.6875, "rewards/rejected": -12.8125, "step": 570 }, { "epoch": 0.04186818739623186, "grad_norm": 13.182724081544478, "learning_rate": 3.3218191941495984e-06, "logits/chosen": 0.033935546875, "logits/rejected": 0.46484375, "logps/chosen": -374.0, "logps/rejected": -370.0, "loss": 0.349, "rewards/accuracies": 0.875, "rewards/chosen": -8.625, "rewards/margins": 4.09375, "rewards/rejected": -12.75, "step": 580 }, { "epoch": 0.0425900526961669, "grad_norm": 15.703115079369434, "learning_rate": 3.293547878370473e-06, "logits/chosen": 0.036376953125, "logits/rejected": 0.357421875, "logps/chosen": -356.0, "logps/rejected": -372.0, "loss": 0.2565, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -7.75, "rewards/margins": 4.5625, "rewards/rejected": -12.375, "step": 590 }, { "epoch": 0.04331191799610193, "grad_norm": 13.791519926514919, "learning_rate": 3.2659863237109044e-06, "logits/chosen": 0.0966796875, "logits/rejected": 0.5390625, "logps/chosen": -342.0, "logps/rejected": -334.0, "loss": 0.3133, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -7.0625, "rewards/margins": 3.4375, "rewards/rejected": -10.5, "step": 600 }, { "epoch": 0.04403378329603696, "grad_norm": 15.936816956712514, "learning_rate": 3.239105320715664e-06, "logits/chosen": 0.1708984375, "logits/rejected": 0.48828125, "logps/chosen": -346.0, "logps/rejected": -358.0, "loss": 0.3286, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -6.0, "rewards/margins": 4.59375, "rewards/rejected": -10.625, "step": 610 }, { "epoch": 0.04475564859597199, "grad_norm": 12.117537621010397, "learning_rate": 3.2128773156099956e-06, "logits/chosen": -0.09765625, "logits/rejected": 0.55859375, "logps/chosen": -372.0, "logps/rejected": -364.0, "loss": 0.3219, "rewards/accuracies": 0.875, "rewards/chosen": -8.5, "rewards/margins": 4.34375, "rewards/rejected": -12.8125, "step": 620 }, { "epoch": 0.045477513895907026, "grad_norm": 13.00731546631491, "learning_rate": 3.187276291558383e-06, "logits/chosen": 0.10986328125, "logits/rejected": 0.4609375, "logps/chosen": -352.0, "logps/rejected": -356.0, "loss": 0.3231, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -6.65625, "rewards/margins": 4.65625, "rewards/rejected": -11.3125, "step": 630 }, { "epoch": 0.04619937919584206, "grad_norm": 13.8777799550721, "learning_rate": 3.1622776601683788e-06, "logits/chosen": 0.0947265625, "logits/rejected": 0.40625, "logps/chosen": -360.0, "logps/rejected": -362.0, "loss": 0.2794, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -6.8125, "rewards/margins": 4.125, "rewards/rejected": -10.9375, "step": 640 }, { "epoch": 0.04692124449577709, "grad_norm": 14.274981854608033, "learning_rate": 3.1378581622109444e-06, "logits/chosen": 0.03515625, "logits/rejected": 0.458984375, "logps/chosen": -368.0, "logps/rejected": -384.0, "loss": 0.3138, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -9.0, "rewards/margins": 4.5625, "rewards/rejected": -13.5625, "step": 650 }, { "epoch": 0.04764310979571212, "grad_norm": 13.524622981357124, "learning_rate": 3.113995776646092e-06, "logits/chosen": 0.062255859375, "logits/rejected": 0.408203125, "logps/chosen": -376.0, "logps/rejected": -386.0, "loss": 0.3217, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -9.6875, "rewards/margins": 5.03125, "rewards/rejected": -14.75, "step": 660 }, { "epoch": 0.048364975095647154, "grad_norm": 14.156245791776398, "learning_rate": 3.090669637145023e-06, "logits/chosen": 0.2158203125, "logits/rejected": 0.484375, "logps/chosen": -346.0, "logps/rejected": -364.0, "loss": 0.3375, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -8.25, "rewards/margins": 4.21875, "rewards/rejected": -12.5, "step": 670 }, { "epoch": 0.049086840395582185, "grad_norm": 11.590738595098415, "learning_rate": 3.0678599553894814e-06, "logits/chosen": 0.10546875, "logits/rejected": 0.423828125, "logps/chosen": -366.0, "logps/rejected": -376.0, "loss": 0.3022, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -9.8125, "rewards/margins": 4.46875, "rewards/rejected": -14.25, "step": 680 }, { "epoch": 0.049808705695517215, "grad_norm": 12.555011116041074, "learning_rate": 3.0455479505075235e-06, "logits/chosen": 0.2392578125, "logits/rejected": 0.44140625, "logps/chosen": -388.0, "logps/rejected": -394.0, "loss": 0.2891, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -9.4375, "rewards/margins": 4.65625, "rewards/rejected": -14.125, "step": 690 }, { "epoch": 0.050530570995452245, "grad_norm": 12.78207323236996, "learning_rate": 3.0237157840738173e-06, "logits/chosen": 0.07763671875, "logits/rejected": 0.46875, "logps/chosen": -368.0, "logps/rejected": -384.0, "loss": 0.312, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -9.0, "rewards/margins": 4.71875, "rewards/rejected": -13.6875, "step": 700 }, { "epoch": 0.05125243629538728, "grad_norm": 11.249891196168837, "learning_rate": 3.002346500163206e-06, "logits/chosen": -0.08544921875, "logits/rejected": 0.3125, "logps/chosen": -360.0, "logps/rejected": -382.0, "loss": 0.3137, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -8.625, "rewards/margins": 4.875, "rewards/rejected": -13.5, "step": 710 }, { "epoch": 0.05197430159532231, "grad_norm": 19.382681579438334, "learning_rate": 2.9814239699997195e-06, "logits/chosen": -0.12060546875, "logits/rejected": 0.2314453125, "logps/chosen": -382.0, "logps/rejected": -396.0, "loss": 0.3045, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -10.1875, "rewards/margins": 4.4375, "rewards/rejected": -14.625, "step": 720 }, { "epoch": 0.05269616689525734, "grad_norm": 12.622526822059625, "learning_rate": 2.9609328407904207e-06, "logits/chosen": -0.0849609375, "logits/rejected": 0.4140625, "logps/chosen": -368.0, "logps/rejected": -390.0, "loss": 0.3198, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -7.8125, "rewards/margins": 4.9375, "rewards/rejected": -12.75, "step": 730 }, { "epoch": 0.05341803219519238, "grad_norm": 14.466539702911536, "learning_rate": 2.940858488375231e-06, "logits/chosen": 0.03955078125, "logits/rejected": 0.365234375, "logps/chosen": -364.0, "logps/rejected": -362.0, "loss": 0.3097, "rewards/accuracies": 0.875, "rewards/chosen": -7.8125, "rewards/margins": 4.1875, "rewards/rejected": -12.0, "step": 740 }, { "epoch": 0.05413989749512741, "grad_norm": 11.749702099867502, "learning_rate": 2.9211869733608857e-06, "logits/chosen": 0.05029296875, "logits/rejected": 0.490234375, "logps/chosen": -360.0, "logps/rejected": -370.0, "loss": 0.278, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -7.78125, "rewards/margins": 4.84375, "rewards/rejected": -12.625, "step": 750 }, { "epoch": 0.05486176279506244, "grad_norm": 15.115240943848852, "learning_rate": 2.901905000440047e-06, "logits/chosen": 0.158203125, "logits/rejected": 0.498046875, "logps/chosen": -366.0, "logps/rejected": -370.0, "loss": 0.2785, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -8.8125, "rewards/margins": 4.90625, "rewards/rejected": -13.6875, "step": 760 }, { "epoch": 0.05558362809499747, "grad_norm": 18.549488851968622, "learning_rate": 2.8829998806257885e-06, "logits/chosen": 0.09619140625, "logits/rejected": 0.439453125, "logps/chosen": -364.0, "logps/rejected": -392.0, "loss": 0.3079, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -8.9375, "rewards/margins": 6.46875, "rewards/rejected": -15.375, "step": 770 }, { "epoch": 0.05630549339493251, "grad_norm": 15.332428485904607, "learning_rate": 2.8644594961577314e-06, "logits/chosen": 0.05419921875, "logits/rejected": 0.4296875, "logps/chosen": -380.0, "logps/rejected": -414.0, "loss": 0.2993, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -10.125, "rewards/margins": 5.21875, "rewards/rejected": -15.375, "step": 780 }, { "epoch": 0.05702735869486754, "grad_norm": 18.350676671924248, "learning_rate": 2.84627226785928e-06, "logits/chosen": 0.2392578125, "logits/rejected": 0.6171875, "logps/chosen": -386.0, "logps/rejected": -410.0, "loss": 0.2664, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -10.6875, "rewards/margins": 4.6875, "rewards/rejected": -15.375, "step": 790 }, { "epoch": 0.05774922399480257, "grad_norm": 13.130110828377047, "learning_rate": 2.82842712474619e-06, "logits/chosen": 0.2109375, "logits/rejected": 0.62890625, "logps/chosen": -372.0, "logps/rejected": -368.0, "loss": 0.2526, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -9.375, "rewards/margins": 4.90625, "rewards/rejected": -14.25, "step": 800 }, { "epoch": 0.0584710892947376, "grad_norm": 9.289424030660065, "learning_rate": 2.810913475705226e-06, "logits/chosen": 0.1689453125, "logits/rejected": 0.57421875, "logps/chosen": -368.0, "logps/rejected": -396.0, "loss": 0.2398, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -10.875, "rewards/margins": 4.46875, "rewards/rejected": -15.375, "step": 810 }, { "epoch": 0.05919295459467264, "grad_norm": 9.358292860884875, "learning_rate": 2.7937211830783128e-06, "logits/chosen": 0.341796875, "logits/rejected": 0.64453125, "logps/chosen": -366.0, "logps/rejected": -404.0, "loss": 0.2961, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -9.625, "rewards/margins": 5.40625, "rewards/rejected": -15.0, "step": 820 }, { "epoch": 0.05991481989460767, "grad_norm": 11.729693712040229, "learning_rate": 2.776840538002493e-06, "logits/chosen": 0.224609375, "logits/rejected": 0.75, "logps/chosen": -380.0, "logps/rejected": -398.0, "loss": 0.3123, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -9.5625, "rewards/margins": 5.21875, "rewards/rejected": -14.8125, "step": 830 }, { "epoch": 0.0606366851945427, "grad_norm": 10.256838448444945, "learning_rate": 2.7602622373694163e-06, "logits/chosen": 0.267578125, "logits/rejected": 0.66796875, "logps/chosen": -362.0, "logps/rejected": -392.0, "loss": 0.3157, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -9.8125, "rewards/margins": 4.90625, "rewards/rejected": -14.6875, "step": 840 }, { "epoch": 0.06135855049447773, "grad_norm": 11.352254075920321, "learning_rate": 2.743977362280141e-06, "logits/chosen": 0.279296875, "logits/rejected": 0.73046875, "logps/chosen": -372.0, "logps/rejected": -404.0, "loss": 0.278, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -9.125, "rewards/margins": 4.8125, "rewards/rejected": -13.875, "step": 850 }, { "epoch": 0.062080415794412765, "grad_norm": 10.642971744992057, "learning_rate": 2.7279773578818937e-06, "logits/chosen": 0.1748046875, "logits/rejected": 0.56640625, "logps/chosen": -372.0, "logps/rejected": -386.0, "loss": 0.3031, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -10.0625, "rewards/margins": 4.5625, "rewards/rejected": -14.625, "step": 860 }, { "epoch": 0.0628022810943478, "grad_norm": 14.106879709505675, "learning_rate": 2.7122540144832417e-06, "logits/chosen": 0.369140625, "logits/rejected": 0.66015625, "logps/chosen": -372.0, "logps/rejected": -392.0, "loss": 0.258, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -9.9375, "rewards/margins": 4.46875, "rewards/rejected": -14.4375, "step": 870 }, { "epoch": 0.06352414639428283, "grad_norm": 15.430924834656496, "learning_rate": 2.696799449852968e-06, "logits/chosen": 0.28125, "logits/rejected": 0.65234375, "logps/chosen": -346.0, "logps/rejected": -362.0, "loss": 0.2794, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -8.0, "rewards/margins": 5.21875, "rewards/rejected": -13.25, "step": 880 }, { "epoch": 0.06424601169421786, "grad_norm": 17.985696358572667, "learning_rate": 2.6816060926159636e-06, "logits/chosen": 0.32421875, "logits/rejected": 0.6328125, "logps/chosen": -392.0, "logps/rejected": -392.0, "loss": 0.2617, "rewards/accuracies": 0.90625, "rewards/chosen": -10.9375, "rewards/margins": 4.59375, "rewards/rejected": -15.5, "step": 890 }, { "epoch": 0.06496787699415289, "grad_norm": 11.021231708528743, "learning_rate": 2.6666666666666664e-06, "logits/chosen": -0.00897216796875, "logits/rejected": 0.49609375, "logps/chosen": -370.0, "logps/rejected": -386.0, "loss": 0.2406, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -8.5625, "rewards/margins": 6.4375, "rewards/rejected": -15.0, "step": 900 }, { "epoch": 0.06568974229408793, "grad_norm": 14.220810050981346, "learning_rate": 2.6519741765271837e-06, "logits/chosen": 0.08984375, "logits/rejected": 0.5625, "logps/chosen": -372.0, "logps/rejected": -400.0, "loss": 0.2817, "rewards/accuracies": 0.90625, "rewards/chosen": -9.3125, "rewards/margins": 5.09375, "rewards/rejected": -14.4375, "step": 910 }, { "epoch": 0.06641160759402295, "grad_norm": 11.058946956173958, "learning_rate": 2.637521893583148e-06, "logits/chosen": 0.06396484375, "logits/rejected": 0.5078125, "logps/chosen": -396.0, "logps/rejected": -400.0, "loss": 0.2357, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -9.6875, "rewards/margins": 5.65625, "rewards/rejected": -15.3125, "step": 920 }, { "epoch": 0.06713347289395799, "grad_norm": 13.115462137673646, "learning_rate": 2.6233033431358115e-06, "logits/chosen": 0.240234375, "logits/rejected": 0.5625, "logps/chosen": -392.0, "logps/rejected": -398.0, "loss": 0.244, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -10.5625, "rewards/margins": 5.40625, "rewards/rejected": -16.0, "step": 930 }, { "epoch": 0.06785533819389301, "grad_norm": 11.67287072403859, "learning_rate": 2.6093122922137685e-06, "logits/chosen": 0.306640625, "logits/rejected": 0.65234375, "logps/chosen": -376.0, "logps/rejected": -400.0, "loss": 0.2336, "rewards/accuracies": 0.90625, "rewards/chosen": -10.25, "rewards/margins": 5.5, "rewards/rejected": -15.75, "step": 940 }, { "epoch": 0.06857720349382805, "grad_norm": 11.61123932469845, "learning_rate": 2.5955427380922006e-06, "logits/chosen": 0.34765625, "logits/rejected": 0.7109375, "logps/chosen": -364.0, "logps/rejected": -368.0, "loss": 0.3037, "rewards/accuracies": 0.90625, "rewards/chosen": -9.375, "rewards/margins": 4.625, "rewards/rejected": -14.0, "step": 950 }, { "epoch": 0.06929906879376309, "grad_norm": 13.86521871974538, "learning_rate": 2.5819888974716113e-06, "logits/chosen": 0.22265625, "logits/rejected": 0.6640625, "logps/chosen": -384.0, "logps/rejected": -392.0, "loss": 0.2813, "rewards/accuracies": 0.875, "rewards/chosen": -10.0625, "rewards/margins": 4.65625, "rewards/rejected": -14.6875, "step": 960 }, { "epoch": 0.07002093409369811, "grad_norm": 13.176721177171645, "learning_rate": 2.5686451962717425e-06, "logits/chosen": 0.2109375, "logits/rejected": 0.5234375, "logps/chosen": -358.0, "logps/rejected": -384.0, "loss": 0.257, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -8.9375, "rewards/margins": 6.0, "rewards/rejected": -14.9375, "step": 970 }, { "epoch": 0.07074279939363315, "grad_norm": 13.880897503951262, "learning_rate": 2.5555062599997596e-06, "logits/chosen": 0.1611328125, "logits/rejected": 0.482421875, "logps/chosen": -378.0, "logps/rejected": -398.0, "loss": 0.2851, "rewards/accuracies": 0.90625, "rewards/chosen": -10.125, "rewards/margins": 5.09375, "rewards/rejected": -15.1875, "step": 980 }, { "epoch": 0.07146466469356819, "grad_norm": 16.04674661688896, "learning_rate": 2.5425669046549126e-06, "logits/chosen": 0.1240234375, "logits/rejected": 0.5390625, "logps/chosen": -376.0, "logps/rejected": -368.0, "loss": 0.2414, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -8.9375, "rewards/margins": 4.6875, "rewards/rejected": -13.625, "step": 990 }, { "epoch": 0.07218652999350321, "grad_norm": 13.795077897318079, "learning_rate": 2.5298221281347034e-06, "logits/chosen": 0.1982421875, "logits/rejected": 0.439453125, "logps/chosen": -384.0, "logps/rejected": -398.0, "loss": 0.2158, "rewards/accuracies": 0.90625, "rewards/chosen": -9.25, "rewards/margins": 5.0, "rewards/rejected": -14.1875, "step": 1000 }, { "epoch": 0.07290839529343825, "grad_norm": 11.59793816283185, "learning_rate": 2.5172671021102103e-06, "logits/chosen": 0.2216796875, "logits/rejected": 0.4609375, "logps/chosen": -390.0, "logps/rejected": -424.0, "loss": 0.2606, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -11.375, "rewards/margins": 5.4375, "rewards/rejected": -16.875, "step": 1010 }, { "epoch": 0.07363026059337327, "grad_norm": 12.537095203029393, "learning_rate": 2.504897164340598e-06, "logits/chosen": 0.12109375, "logits/rejected": 0.4453125, "logps/chosen": -382.0, "logps/rejected": -422.0, "loss": 0.2839, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -10.875, "rewards/margins": 5.40625, "rewards/rejected": -16.25, "step": 1020 }, { "epoch": 0.07435212589330831, "grad_norm": 11.111428759986184, "learning_rate": 2.492707811399023e-06, "logits/chosen": 0.26171875, "logits/rejected": 0.5234375, "logps/chosen": -336.0, "logps/rejected": -380.0, "loss": 0.2724, "rewards/accuracies": 0.90625, "rewards/chosen": -8.1875, "rewards/margins": 4.75, "rewards/rejected": -12.9375, "step": 1030 }, { "epoch": 0.07507399119324334, "grad_norm": 11.149817843527142, "learning_rate": 2.480694691784169e-06, "logits/chosen": 0.3046875, "logits/rejected": 0.64453125, "logps/chosen": -382.0, "logps/rejected": -404.0, "loss": 0.3042, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -11.375, "rewards/margins": 4.6875, "rewards/rejected": -16.0, "step": 1040 }, { "epoch": 0.07579585649317837, "grad_norm": 22.802061352310425, "learning_rate": 2.4688535993934706e-06, "logits/chosen": 0.263671875, "logits/rejected": 0.6015625, "logps/chosen": -382.0, "logps/rejected": -432.0, "loss": 0.3077, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -10.875, "rewards/margins": 5.1875, "rewards/rejected": -16.0, "step": 1050 }, { "epoch": 0.0765177217931134, "grad_norm": 10.748527798717818, "learning_rate": 2.457180467335805e-06, "logits/chosen": 0.1943359375, "logits/rejected": 0.51171875, "logps/chosen": -376.0, "logps/rejected": -386.0, "loss": 0.2522, "rewards/accuracies": 0.875, "rewards/chosen": -9.5, "rewards/margins": 4.59375, "rewards/rejected": -14.125, "step": 1060 }, { "epoch": 0.07723958709304844, "grad_norm": 11.231271361197551, "learning_rate": 2.4456713620629725e-06, "logits/chosen": 0.134765625, "logits/rejected": 0.52734375, "logps/chosen": -370.0, "logps/rejected": -382.0, "loss": 0.2799, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -8.75, "rewards/margins": 4.75, "rewards/rejected": -13.5, "step": 1070 }, { "epoch": 0.07796145239298347, "grad_norm": 13.832434878883664, "learning_rate": 2.4343224778007378e-06, "logits/chosen": -0.05419921875, "logits/rejected": 0.55078125, "logps/chosen": -380.0, "logps/rejected": -380.0, "loss": 0.2159, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -9.6875, "rewards/margins": 5.1875, "rewards/rejected": -14.875, "step": 1080 }, { "epoch": 0.0786833176929185, "grad_norm": 10.910256624556416, "learning_rate": 2.4231301312615306e-06, "logits/chosen": 0.0167236328125, "logits/rejected": 0.50390625, "logps/chosen": -396.0, "logps/rejected": -396.0, "loss": 0.27, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -10.75, "rewards/margins": 5.09375, "rewards/rejected": -15.8125, "step": 1090 }, { "epoch": 0.07940518299285354, "grad_norm": 10.47724748620521, "learning_rate": 2.412090756622109e-06, "logits/chosen": 0.11328125, "logits/rejected": 0.45703125, "logps/chosen": -384.0, "logps/rejected": -410.0, "loss": 0.2475, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -11.25, "rewards/margins": 4.65625, "rewards/rejected": -15.9375, "step": 1100 }, { "epoch": 0.08012704829278856, "grad_norm": 18.711604343905545, "learning_rate": 2.401200900750657e-06, "logits/chosen": 0.07666015625, "logits/rejected": 0.56640625, "logps/chosen": -390.0, "logps/rejected": -426.0, "loss": 0.3231, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -11.5625, "rewards/margins": 6.09375, "rewards/rejected": -17.625, "step": 1110 }, { "epoch": 0.0808489135927236, "grad_norm": 14.99303146958539, "learning_rate": 2.390457218668787e-06, "logits/chosen": 0.2060546875, "logits/rejected": 0.55859375, "logps/chosen": -386.0, "logps/rejected": -428.0, "loss": 0.231, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -11.1875, "rewards/margins": 5.5, "rewards/rejected": -16.625, "step": 1120 }, { "epoch": 0.08157077889265862, "grad_norm": 8.541437755712654, "learning_rate": 2.379856469234918e-06, "logits/chosen": 0.2001953125, "logits/rejected": 0.5234375, "logps/chosen": -410.0, "logps/rejected": -430.0, "loss": 0.274, "rewards/accuracies": 0.9375, "rewards/chosen": -11.8125, "rewards/margins": 5.1875, "rewards/rejected": -17.0, "step": 1130 }, { "epoch": 0.08229264419259366, "grad_norm": 8.862767452266308, "learning_rate": 2.369395511036369e-06, "logits/chosen": 0.1708984375, "logits/rejected": 0.5546875, "logps/chosen": -404.0, "logps/rejected": -418.0, "loss": 0.2382, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -12.875, "rewards/margins": 5.0625, "rewards/rejected": -17.875, "step": 1140 }, { "epoch": 0.0830145094925287, "grad_norm": 15.403443773507826, "learning_rate": 2.359071298478354e-06, "logits/chosen": 0.02392578125, "logits/rejected": 0.41796875, "logps/chosen": -422.0, "logps/rejected": -428.0, "loss": 0.2307, "rewards/accuracies": 0.90625, "rewards/chosen": -11.6875, "rewards/margins": 5.5625, "rewards/rejected": -17.25, "step": 1150 }, { "epoch": 0.08373637479246372, "grad_norm": 14.83682027229091, "learning_rate": 2.3488808780588137e-06, "logits/chosen": 0.298828125, "logits/rejected": 0.63671875, "logps/chosen": -378.0, "logps/rejected": -418.0, "loss": 0.2821, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -12.3125, "rewards/margins": 4.84375, "rewards/rejected": -17.125, "step": 1160 }, { "epoch": 0.08445824009239876, "grad_norm": 10.017670039542773, "learning_rate": 2.3388213848187446e-06, "logits/chosen": 0.2158203125, "logits/rejected": 0.486328125, "logps/chosen": -384.0, "logps/rejected": -412.0, "loss": 0.2595, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -10.9375, "rewards/margins": 5.25, "rewards/rejected": -16.125, "step": 1170 }, { "epoch": 0.0851801053923338, "grad_norm": 15.06082080695069, "learning_rate": 2.328890038958328e-06, "logits/chosen": -0.10986328125, "logits/rejected": 0.51171875, "logps/chosen": -376.0, "logps/rejected": -392.0, "loss": 0.2792, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -10.5625, "rewards/margins": 5.53125, "rewards/rejected": -16.125, "step": 1180 }, { "epoch": 0.08590197069226882, "grad_norm": 11.180556317210138, "learning_rate": 2.3190841426097937e-06, "logits/chosen": 0.052001953125, "logits/rejected": 0.56640625, "logps/chosen": -378.0, "logps/rejected": -396.0, "loss": 0.2176, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -10.75, "rewards/margins": 5.75, "rewards/rejected": -16.5, "step": 1190 }, { "epoch": 0.08662383599220386, "grad_norm": 11.673020269938165, "learning_rate": 2.309401076758503e-06, "logits/chosen": 0.134765625, "logits/rejected": 0.396484375, "logps/chosen": -400.0, "logps/rejected": -418.0, "loss": 0.2385, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -11.0625, "rewards/margins": 4.625, "rewards/rejected": -15.6875, "step": 1200 }, { "epoch": 0.08734570129213888, "grad_norm": 10.313173452487495, "learning_rate": 2.299838298304276e-06, "logits/chosen": 0.0257568359375, "logits/rejected": 0.43359375, "logps/chosen": -364.0, "logps/rejected": -408.0, "loss": 0.2283, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -11.125, "rewards/margins": 7.0, "rewards/rejected": -18.125, "step": 1210 }, { "epoch": 0.08806756659207392, "grad_norm": 11.682844751439422, "learning_rate": 2.2903933372554728e-06, "logits/chosen": -0.2099609375, "logits/rejected": 0.4609375, "logps/chosen": -374.0, "logps/rejected": -384.0, "loss": 0.2478, "rewards/accuracies": 0.90625, "rewards/chosen": -8.875, "rewards/margins": 5.375, "rewards/rejected": -14.25, "step": 1220 }, { "epoch": 0.08878943189200895, "grad_norm": 12.4386165557421, "learning_rate": 2.281063794048804e-06, "logits/chosen": -0.1669921875, "logits/rejected": 0.205078125, "logps/chosen": -354.0, "logps/rejected": -380.0, "loss": 0.2517, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -8.75, "rewards/margins": 5.53125, "rewards/rejected": -14.3125, "step": 1230 }, { "epoch": 0.08951129719194398, "grad_norm": 10.31051409480371, "learning_rate": 2.271847336988259e-06, "logits/chosen": -0.1396484375, "logits/rejected": 0.33203125, "logps/chosen": -424.0, "logps/rejected": -442.0, "loss": 0.2608, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -12.625, "rewards/margins": 5.4375, "rewards/rejected": -18.125, "step": 1240 }, { "epoch": 0.09023316249187902, "grad_norm": 15.441434871520878, "learning_rate": 2.262741699796952e-06, "logits/chosen": 0.09326171875, "logits/rejected": 0.359375, "logps/chosen": -394.0, "logps/rejected": -416.0, "loss": 0.2259, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -12.875, "rewards/margins": 4.9375, "rewards/rejected": -17.75, "step": 1250 }, { "epoch": 0.09095502779181405, "grad_norm": 15.420745204667519, "learning_rate": 2.253744679276044e-06, "logits/chosen": -0.005126953125, "logits/rejected": 0.484375, "logps/chosen": -394.0, "logps/rejected": -420.0, "loss": 0.2461, "rewards/accuracies": 0.90625, "rewards/chosen": -12.5625, "rewards/margins": 5.9375, "rewards/rejected": -18.5, "step": 1260 }, { "epoch": 0.09167689309174908, "grad_norm": 16.454359760518187, "learning_rate": 2.244854133065255e-06, "logits/chosen": -0.05078125, "logits/rejected": 0.2255859375, "logps/chosen": -388.0, "logps/rejected": -414.0, "loss": 0.2533, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -10.5625, "rewards/margins": 5.71875, "rewards/rejected": -16.375, "step": 1270 }, { "epoch": 0.09239875839168411, "grad_norm": 11.132932256804878, "learning_rate": 2.2360679774997895e-06, "logits/chosen": 0.052734375, "logits/rejected": 0.46875, "logps/chosen": -434.0, "logps/rejected": -434.0, "loss": 0.4294, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -13.125, "rewards/margins": 5.28125, "rewards/rejected": -18.375, "step": 1280 }, { "epoch": 0.09312062369161915, "grad_norm": 14.254041006994026, "learning_rate": 2.2273841855588183e-06, "logits/chosen": 0.09716796875, "logits/rejected": 0.578125, "logps/chosen": -406.0, "logps/rejected": -432.0, "loss": 0.2386, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -13.5, "rewards/margins": 5.59375, "rewards/rejected": -19.125, "step": 1290 }, { "epoch": 0.09384248899155417, "grad_norm": 10.40537466304724, "learning_rate": 2.2188007849009167e-06, "logits/chosen": -0.0220947265625, "logits/rejected": 0.361328125, "logps/chosen": -404.0, "logps/rejected": -424.0, "loss": 0.2656, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -11.625, "rewards/margins": 5.90625, "rewards/rejected": -17.5, "step": 1300 }, { "epoch": 0.09456435429148921, "grad_norm": 12.782259978119182, "learning_rate": 2.2103158559821502e-06, "logits/chosen": 0.1474609375, "logits/rejected": 0.484375, "logps/chosen": -388.0, "logps/rejected": -410.0, "loss": 0.2452, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -11.0, "rewards/margins": 5.3125, "rewards/rejected": -16.25, "step": 1310 }, { "epoch": 0.09528621959142423, "grad_norm": 12.670135677961259, "learning_rate": 2.2019275302527213e-06, "logits/chosen": -0.00701904296875, "logits/rejected": 0.470703125, "logps/chosen": -386.0, "logps/rejected": -398.0, "loss": 0.2163, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -10.6875, "rewards/margins": 4.8125, "rewards/rejected": -15.5, "step": 1320 }, { "epoch": 0.09600808489135927, "grad_norm": 11.033595940222902, "learning_rate": 2.193633988428327e-06, "logits/chosen": 0.134765625, "logits/rejected": 0.51953125, "logps/chosen": -374.0, "logps/rejected": -412.0, "loss": 0.2386, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -11.3125, "rewards/margins": 6.1875, "rewards/rejected": -17.5, "step": 1330 }, { "epoch": 0.09672995019129431, "grad_norm": 13.72752236541224, "learning_rate": 2.185433458832612e-06, "logits/chosen": 0.083984375, "logits/rejected": 0.369140625, "logps/chosen": -394.0, "logps/rejected": -426.0, "loss": 0.2818, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -11.5625, "rewards/margins": 5.15625, "rewards/rejected": -16.75, "step": 1340 }, { "epoch": 0.09745181549122933, "grad_norm": 11.285270968387689, "learning_rate": 2.177324215807269e-06, "logits/chosen": 0.048828125, "logits/rejected": 0.546875, "logps/chosen": -442.0, "logps/rejected": -440.0, "loss": 0.2558, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -13.375, "rewards/margins": 5.65625, "rewards/rejected": -19.0, "step": 1350 }, { "epoch": 0.09817368079116437, "grad_norm": 12.424391492312013, "learning_rate": 2.1693045781865616e-06, "logits/chosen": 0.035400390625, "logits/rejected": 0.48046875, "logps/chosen": -442.0, "logps/rejected": -468.0, "loss": 0.2977, "rewards/accuracies": 0.9375, "rewards/chosen": -14.6875, "rewards/margins": 6.0, "rewards/rejected": -20.625, "step": 1360 }, { "epoch": 0.0988955460910994, "grad_norm": 11.35277883973332, "learning_rate": 2.1613729078331965e-06, "logits/chosen": 0.259765625, "logits/rejected": 0.65234375, "logps/chosen": -416.0, "logps/rejected": -412.0, "loss": 0.2321, "rewards/accuracies": 0.90625, "rewards/chosen": -13.5625, "rewards/margins": 5.09375, "rewards/rejected": -18.625, "step": 1370 }, { "epoch": 0.09961741139103443, "grad_norm": 15.757560440543193, "learning_rate": 2.1535276082326617e-06, "logits/chosen": 0.1396484375, "logits/rejected": 0.5234375, "logps/chosen": -432.0, "logps/rejected": -448.0, "loss": 0.2423, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -13.3125, "rewards/margins": 5.4375, "rewards/rejected": -18.75, "step": 1380 }, { "epoch": 0.10033927669096947, "grad_norm": 13.345328984157922, "learning_rate": 2.14576712314328e-06, "logits/chosen": 0.2392578125, "logits/rejected": 0.703125, "logps/chosen": -440.0, "logps/rejected": -448.0, "loss": 0.2082, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -15.375, "rewards/margins": 5.375, "rewards/rejected": -20.75, "step": 1390 }, { "epoch": 0.10106114199090449, "grad_norm": 11.021569365002641, "learning_rate": 2.138089935299395e-06, "logits/chosen": 0.2578125, "logits/rejected": 0.70703125, "logps/chosen": -370.0, "logps/rejected": -398.0, "loss": 0.2331, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -11.75, "rewards/margins": 5.15625, "rewards/rejected": -17.0, "step": 1400 }, { "epoch": 0.10178300729083953, "grad_norm": 9.696867339430037, "learning_rate": 2.1304945651652297e-06, "logits/chosen": 0.16796875, "logits/rejected": 0.62890625, "logps/chosen": -398.0, "logps/rejected": -414.0, "loss": 0.2765, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -10.9375, "rewards/margins": 5.71875, "rewards/rejected": -16.625, "step": 1410 }, { "epoch": 0.10250487259077457, "grad_norm": 11.852921019387706, "learning_rate": 2.122979569737101e-06, "logits/chosen": 0.1572265625, "logits/rejected": 0.482421875, "logps/chosen": -422.0, "logps/rejected": -460.0, "loss": 0.2569, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -12.5625, "rewards/margins": 5.09375, "rewards/rejected": -17.625, "step": 1420 }, { "epoch": 0.10322673789070959, "grad_norm": 10.219931896303242, "learning_rate": 2.11554354139178e-06, "logits/chosen": 0.30078125, "logits/rejected": 0.61328125, "logps/chosen": -370.0, "logps/rejected": -410.0, "loss": 0.2733, "rewards/accuracies": 0.875, "rewards/chosen": -11.1875, "rewards/margins": 5.1875, "rewards/rejected": -16.375, "step": 1430 }, { "epoch": 0.10394860319064463, "grad_norm": 13.524301887449147, "learning_rate": 2.1081851067789196e-06, "logits/chosen": 0.3984375, "logits/rejected": 0.734375, "logps/chosen": -366.0, "logps/rejected": -394.0, "loss": 0.2731, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -11.3125, "rewards/margins": 5.21875, "rewards/rejected": -16.5, "step": 1440 }, { "epoch": 0.10467046849057966, "grad_norm": 8.931764655138245, "learning_rate": 2.1009029257555606e-06, "logits/chosen": 0.0771484375, "logits/rejected": 0.51953125, "logps/chosen": -410.0, "logps/rejected": -452.0, "loss": 0.2086, "rewards/accuracies": 0.90625, "rewards/chosen": -12.375, "rewards/margins": 4.9375, "rewards/rejected": -17.25, "step": 1450 }, { "epoch": 0.10539233379051469, "grad_norm": 11.450977533297955, "learning_rate": 2.0936956903608545e-06, "logits/chosen": 0.1953125, "logits/rejected": 0.5859375, "logps/chosen": -432.0, "logps/rejected": -442.0, "loss": 0.2471, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -13.125, "rewards/margins": 5.96875, "rewards/rejected": -19.125, "step": 1460 }, { "epoch": 0.10611419909044972, "grad_norm": 14.84982981672084, "learning_rate": 2.0865621238292046e-06, "logits/chosen": 0.13671875, "logits/rejected": 0.63671875, "logps/chosen": -390.0, "logps/rejected": -378.0, "loss": 0.2422, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -11.0, "rewards/margins": 5.0, "rewards/rejected": -16.0, "step": 1470 }, { "epoch": 0.10683606439038476, "grad_norm": 10.346449857240232, "learning_rate": 2.079500979640145e-06, "logits/chosen": 0.2734375, "logits/rejected": 0.56640625, "logps/chosen": -412.0, "logps/rejected": -414.0, "loss": 0.2421, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -11.5, "rewards/margins": 5.625, "rewards/rejected": -17.125, "step": 1480 }, { "epoch": 0.10755792969031978, "grad_norm": 8.326346350054795, "learning_rate": 2.072511040603359e-06, "logits/chosen": 0.3046875, "logits/rejected": 0.6640625, "logps/chosen": -384.0, "logps/rejected": -416.0, "loss": 0.2209, "rewards/accuracies": 0.9375, "rewards/chosen": -10.875, "rewards/margins": 6.5, "rewards/rejected": -17.375, "step": 1490 }, { "epoch": 0.10827979499025482, "grad_norm": 12.62882443156198, "learning_rate": 2.065591117977289e-06, "logits/chosen": 0.2109375, "logits/rejected": 0.6484375, "logps/chosen": -400.0, "logps/rejected": -416.0, "loss": 0.2162, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -11.0625, "rewards/margins": 5.875, "rewards/rejected": -17.0, "step": 1500 }, { "epoch": 0.10900166029018984, "grad_norm": 14.007350949897122, "learning_rate": 2.058740050619915e-06, "logits/chosen": 0.171875, "logits/rejected": 0.625, "logps/chosen": -410.0, "logps/rejected": -406.0, "loss": 0.2146, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -10.75, "rewards/margins": 5.5625, "rewards/rejected": -16.375, "step": 1510 }, { "epoch": 0.10972352559012488, "grad_norm": 13.541722380113178, "learning_rate": 2.0519567041703083e-06, "logits/chosen": 0.4453125, "logits/rejected": 0.76171875, "logps/chosen": -402.0, "logps/rejected": -428.0, "loss": 0.2502, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -13.375, "rewards/margins": 5.5, "rewards/rejected": -18.875, "step": 1520 }, { "epoch": 0.11044539089005992, "grad_norm": 11.130773108050988, "learning_rate": 2.0452399702596544e-06, "logits/chosen": 0.322265625, "logits/rejected": 0.7890625, "logps/chosen": -414.0, "logps/rejected": -424.0, "loss": 0.2088, "rewards/accuracies": 0.90625, "rewards/chosen": -12.875, "rewards/margins": 6.125, "rewards/rejected": -19.0, "step": 1530 }, { "epoch": 0.11116725618999494, "grad_norm": 10.54373937197833, "learning_rate": 2.0385887657505017e-06, "logits/chosen": 0.36328125, "logits/rejected": 0.578125, "logps/chosen": -394.0, "logps/rejected": -426.0, "loss": 0.2078, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -12.0625, "rewards/margins": 5.6875, "rewards/rejected": -17.75, "step": 1540 }, { "epoch": 0.11188912148992998, "grad_norm": 7.969881291985247, "learning_rate": 2.032002032003048e-06, "logits/chosen": 0.35546875, "logits/rejected": 0.7421875, "logps/chosen": -396.0, "logps/rejected": -410.0, "loss": 0.2461, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -12.5625, "rewards/margins": 5.03125, "rewards/rejected": -17.625, "step": 1550 }, { "epoch": 0.11261098678986502, "grad_norm": 9.430166932925735, "learning_rate": 2.025478734167333e-06, "logits/chosen": 0.359375, "logits/rejected": 0.78515625, "logps/chosen": -416.0, "logps/rejected": -430.0, "loss": 0.2203, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -12.6875, "rewards/margins": 6.40625, "rewards/rejected": -19.0, "step": 1560 }, { "epoch": 0.11333285208980004, "grad_norm": 15.673180486042241, "learning_rate": 2.0190178605002747e-06, "logits/chosen": 0.341796875, "logits/rejected": 0.6796875, "logps/chosen": -408.0, "logps/rejected": -424.0, "loss": 0.2356, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -12.6875, "rewards/margins": 5.625, "rewards/rejected": -18.375, "step": 1570 }, { "epoch": 0.11405471738973508, "grad_norm": 11.446024106497036, "learning_rate": 2.0126184217065104e-06, "logits/chosen": 0.2451171875, "logits/rejected": 0.4765625, "logps/chosen": -384.0, "logps/rejected": -394.0, "loss": 0.2262, "rewards/accuracies": 0.90625, "rewards/chosen": -10.0625, "rewards/margins": 5.15625, "rewards/rejected": -15.1875, "step": 1580 }, { "epoch": 0.1147765826896701, "grad_norm": 8.761511241132519, "learning_rate": 2.0062794503020765e-06, "logits/chosen": 0.365234375, "logits/rejected": 0.65234375, "logps/chosen": -332.0, "logps/rejected": -362.0, "loss": 0.1939, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -8.5625, "rewards/margins": 4.9375, "rewards/rejected": -13.5, "step": 1590 }, { "epoch": 0.11549844798960514, "grad_norm": 13.006922041419442, "learning_rate": 2e-06, "logits/chosen": 0.2431640625, "logits/rejected": 0.52734375, "logps/chosen": -362.0, "logps/rejected": -398.0, "loss": 0.2108, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -9.625, "rewards/margins": 6.21875, "rewards/rejected": -15.875, "step": 1600 }, { "epoch": 0.11622031328954018, "grad_norm": 9.656197689064037, "learning_rate": 1.993779145116907e-06, "logits/chosen": 0.3203125, "logits/rejected": 0.84375, "logps/chosen": -400.0, "logps/rejected": -426.0, "loss": 0.2459, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -12.5625, "rewards/margins": 6.0, "rewards/rejected": -18.5, "step": 1610 }, { "epoch": 0.1169421785894752, "grad_norm": 8.733065211337397, "learning_rate": 1.987615979999813e-06, "logits/chosen": 0.2431640625, "logits/rejected": 0.859375, "logps/chosen": -436.0, "logps/rejected": -454.0, "loss": 0.2293, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -13.875, "rewards/margins": 6.8125, "rewards/rejected": -20.625, "step": 1620 }, { "epoch": 0.11766404388941024, "grad_norm": 10.741016037829644, "learning_rate": 1.9815096184722797e-06, "logits/chosen": 0.357421875, "logits/rejected": 0.6640625, "logps/chosen": -374.0, "logps/rejected": -406.0, "loss": 0.2307, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -10.625, "rewards/margins": 5.4375, "rewards/rejected": -16.0, "step": 1630 }, { "epoch": 0.11838590918934527, "grad_norm": 12.857795445823685, "learning_rate": 1.9754591932991793e-06, "logits/chosen": 0.2578125, "logits/rejected": 0.61328125, "logps/chosen": -420.0, "logps/rejected": -436.0, "loss": 0.225, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -12.5, "rewards/margins": 5.46875, "rewards/rejected": -18.0, "step": 1640 }, { "epoch": 0.1191077744892803, "grad_norm": 9.211941219185555, "learning_rate": 1.9694638556693235e-06, "logits/chosen": 0.2177734375, "logits/rejected": 0.59765625, "logps/chosen": -424.0, "logps/rejected": -456.0, "loss": 0.2285, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -12.8125, "rewards/margins": 7.5625, "rewards/rejected": -20.375, "step": 1650 }, { "epoch": 0.11982963978921533, "grad_norm": 11.645818100444602, "learning_rate": 1.963522774695264e-06, "logits/chosen": 0.310546875, "logits/rejected": 0.65234375, "logps/chosen": -416.0, "logps/rejected": -412.0, "loss": 0.2309, "rewards/accuracies": 0.875, "rewards/chosen": -13.125, "rewards/margins": 5.15625, "rewards/rejected": -18.375, "step": 1660 }, { "epoch": 0.12055150508915037, "grad_norm": 12.472908996926925, "learning_rate": 1.9576351369295853e-06, "logits/chosen": 0.263671875, "logits/rejected": 0.6171875, "logps/chosen": -364.0, "logps/rejected": -384.0, "loss": 0.2015, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -9.9375, "rewards/margins": 4.90625, "rewards/rejected": -14.875, "step": 1670 }, { "epoch": 0.1212733703890854, "grad_norm": 6.762482029780577, "learning_rate": 1.951800145897066e-06, "logits/chosen": 0.19140625, "logits/rejected": 0.6015625, "logps/chosen": -382.0, "logps/rejected": -416.0, "loss": 0.2329, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -10.6875, "rewards/margins": 6.125, "rewards/rejected": -16.875, "step": 1680 }, { "epoch": 0.12199523568902043, "grad_norm": 9.138325880236248, "learning_rate": 1.9460170216420797e-06, "logits/chosen": 0.2578125, "logits/rejected": 0.6953125, "logps/chosen": -384.0, "logps/rejected": -402.0, "loss": 0.2194, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -11.5625, "rewards/margins": 5.8125, "rewards/rejected": -17.375, "step": 1690 }, { "epoch": 0.12271710098895545, "grad_norm": 10.759074261444505, "learning_rate": 1.9402850002906637e-06, "logits/chosen": 0.365234375, "logits/rejected": 0.66015625, "logps/chosen": -402.0, "logps/rejected": -440.0, "loss": 0.2694, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -12.25, "rewards/margins": 5.375, "rewards/rejected": -17.625, "step": 1700 }, { "epoch": 0.12343896628889049, "grad_norm": 10.041893360294631, "learning_rate": 1.9346033336266974e-06, "logits/chosen": 0.30859375, "logits/rejected": 0.6875, "logps/chosen": -420.0, "logps/rejected": -446.0, "loss": 0.2493, "rewards/accuracies": 0.90625, "rewards/chosen": -12.875, "rewards/margins": 5.21875, "rewards/rejected": -18.125, "step": 1710 }, { "epoch": 0.12416083158882553, "grad_norm": 10.442707017685455, "learning_rate": 1.9289712886816486e-06, "logits/chosen": 0.333984375, "logits/rejected": 0.70703125, "logps/chosen": -394.0, "logps/rejected": -420.0, "loss": 0.155, "rewards/accuracies": 0.9375, "rewards/chosen": -12.875, "rewards/margins": 5.8125, "rewards/rejected": -18.75, "step": 1720 }, { "epoch": 0.12488269688876055, "grad_norm": 10.333733862343534, "learning_rate": 1.92338814733738e-06, "logits/chosen": 0.06494140625, "logits/rejected": 0.494140625, "logps/chosen": -380.0, "logps/rejected": -400.0, "loss": 0.2558, "rewards/accuracies": 0.90625, "rewards/chosen": -10.625, "rewards/margins": 5.90625, "rewards/rejected": -16.5, "step": 1730 }, { "epoch": 0.1256045621886956, "grad_norm": 8.645785201223179, "learning_rate": 1.9178532059415367e-06, "logits/chosen": 0.2275390625, "logits/rejected": 0.5234375, "logps/chosen": -362.0, "logps/rejected": -392.0, "loss": 0.2242, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -9.25, "rewards/margins": 5.40625, "rewards/rejected": -14.6875, "step": 1740 }, { "epoch": 0.12632642748863063, "grad_norm": 13.145566082622368, "learning_rate": 1.9123657749350298e-06, "logits/chosen": 0.02392578125, "logits/rejected": 0.5625, "logps/chosen": -404.0, "logps/rejected": -414.0, "loss": 0.2202, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -11.125, "rewards/margins": 5.78125, "rewards/rejected": -16.875, "step": 1750 }, { "epoch": 0.12704829278856566, "grad_norm": 9.059152232677294, "learning_rate": 1.9069251784911844e-06, "logits/chosen": 0.18359375, "logits/rejected": 0.609375, "logps/chosen": -424.0, "logps/rejected": -442.0, "loss": 0.221, "rewards/accuracies": 0.9375, "rewards/chosen": -12.875, "rewards/margins": 6.09375, "rewards/rejected": -19.0, "step": 1760 }, { "epoch": 0.12777015808850067, "grad_norm": 13.96772224378229, "learning_rate": 1.9015307541661132e-06, "logits/chosen": 0.1953125, "logits/rejected": 0.56640625, "logps/chosen": -394.0, "logps/rejected": -440.0, "loss": 0.228, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -13.5, "rewards/margins": 6.0625, "rewards/rejected": -19.5, "step": 1770 }, { "epoch": 0.1284920233884357, "grad_norm": 14.357343397786414, "learning_rate": 1.8961818525599089e-06, "logits/chosen": 0.220703125, "logits/rejected": 0.7265625, "logps/chosen": -416.0, "logps/rejected": -442.0, "loss": 0.2482, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -13.25, "rewards/margins": 6.53125, "rewards/rejected": -19.75, "step": 1780 }, { "epoch": 0.12921388868837075, "grad_norm": 11.930779676118881, "learning_rate": 1.890877836988262e-06, "logits/chosen": 0.146484375, "logits/rejected": 0.546875, "logps/chosen": -422.0, "logps/rejected": -454.0, "loss": 0.2255, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -12.9375, "rewards/margins": 6.0625, "rewards/rejected": -19.0, "step": 1790 }, { "epoch": 0.12993575398830579, "grad_norm": 11.190016587887087, "learning_rate": 1.8856180831641269e-06, "logits/chosen": 0.330078125, "logits/rejected": 0.71875, "logps/chosen": -402.0, "logps/rejected": -442.0, "loss": 0.2247, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -13.5, "rewards/margins": 5.46875, "rewards/rejected": -19.0, "step": 1800 }, { "epoch": 0.13065761928824082, "grad_norm": 5.9538991282997165, "learning_rate": 1.8804019788890737e-06, "logits/chosen": 0.158203125, "logits/rejected": 0.61328125, "logps/chosen": -400.0, "logps/rejected": -408.0, "loss": 0.1732, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -12.1875, "rewards/margins": 5.8125, "rewards/rejected": -18.0, "step": 1810 }, { "epoch": 0.13137948458817586, "grad_norm": 11.728379694447028, "learning_rate": 1.8752289237539816e-06, "logits/chosen": 0.1572265625, "logits/rejected": 0.59765625, "logps/chosen": -396.0, "logps/rejected": -442.0, "loss": 0.2102, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -12.1875, "rewards/margins": 6.59375, "rewards/rejected": -18.75, "step": 1820 }, { "epoch": 0.13210134988811087, "grad_norm": 13.541588907739696, "learning_rate": 1.8700983288487376e-06, "logits/chosen": 0.07763671875, "logits/rejected": 0.5234375, "logps/chosen": -392.0, "logps/rejected": -430.0, "loss": 0.241, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -11.0625, "rewards/margins": 6.0625, "rewards/rejected": -17.125, "step": 1830 }, { "epoch": 0.1328232151880459, "grad_norm": 9.276841018311883, "learning_rate": 1.8650096164806275e-06, "logits/chosen": 0.16015625, "logits/rejected": 0.51953125, "logps/chosen": -366.0, "logps/rejected": -382.0, "loss": 0.2302, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -11.0, "rewards/margins": 5.40625, "rewards/rejected": -16.375, "step": 1840 }, { "epoch": 0.13354508048798094, "grad_norm": 14.26831153390648, "learning_rate": 1.8599622199011084e-06, "logits/chosen": -0.068359375, "logits/rejected": 0.5234375, "logps/chosen": -402.0, "logps/rejected": -436.0, "loss": 0.239, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -12.5, "rewards/margins": 6.46875, "rewards/rejected": -19.0, "step": 1850 }, { "epoch": 0.13426694578791598, "grad_norm": 13.871813622230386, "learning_rate": 1.854955583040673e-06, "logits/chosen": 0.107421875, "logits/rejected": 0.55078125, "logps/chosen": -418.0, "logps/rejected": -452.0, "loss": 0.2265, "rewards/accuracies": 0.9375, "rewards/chosen": -13.4375, "rewards/margins": 6.21875, "rewards/rejected": -19.625, "step": 1860 }, { "epoch": 0.13498881108785102, "grad_norm": 13.782369719550509, "learning_rate": 1.849989160251521e-06, "logits/chosen": 0.08251953125, "logits/rejected": 0.68359375, "logps/chosen": -398.0, "logps/rejected": -438.0, "loss": 0.1886, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -12.5625, "rewards/margins": 6.84375, "rewards/rejected": -19.375, "step": 1870 }, { "epoch": 0.13571067638778603, "grad_norm": 12.01760116344873, "learning_rate": 1.8450624160577701e-06, "logits/chosen": -0.01422119140625, "logits/rejected": 0.484375, "logps/chosen": -400.0, "logps/rejected": -400.0, "loss": 0.2055, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -10.75, "rewards/margins": 6.03125, "rewards/rejected": -16.75, "step": 1880 }, { "epoch": 0.13643254168772107, "grad_norm": 11.225970353954569, "learning_rate": 1.8401748249129445e-06, "logits/chosen": 0.1953125, "logits/rejected": 0.61328125, "logps/chosen": -402.0, "logps/rejected": -426.0, "loss": 0.1885, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -12.875, "rewards/margins": 6.0, "rewards/rejected": -18.875, "step": 1890 }, { "epoch": 0.1371544069876561, "grad_norm": 6.697409573871043, "learning_rate": 1.8353258709644938e-06, "logits/chosen": 0.083984375, "logits/rejected": 0.55859375, "logps/chosen": -404.0, "logps/rejected": -416.0, "loss": 0.2241, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -12.0625, "rewards/margins": 6.3125, "rewards/rejected": -18.375, "step": 1900 }, { "epoch": 0.13787627228759114, "grad_norm": 8.553737922607906, "learning_rate": 1.830515047825102e-06, "logits/chosen": 0.099609375, "logits/rejected": 0.56640625, "logps/chosen": -400.0, "logps/rejected": -418.0, "loss": 0.2272, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -12.625, "rewards/margins": 5.34375, "rewards/rejected": -18.0, "step": 1910 }, { "epoch": 0.13859813758752618, "grad_norm": 11.479215826415656, "learning_rate": 1.8257418583505536e-06, "logits/chosen": 0.17578125, "logits/rejected": 0.5703125, "logps/chosen": -426.0, "logps/rejected": -450.0, "loss": 0.1865, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -13.1875, "rewards/margins": 6.21875, "rewards/rejected": -19.375, "step": 1920 }, { "epoch": 0.1393200028874612, "grad_norm": 9.524934939553313, "learning_rate": 1.8210058144239416e-06, "logits/chosen": 0.279296875, "logits/rejected": 0.58984375, "logps/chosen": -414.0, "logps/rejected": -426.0, "loss": 0.2444, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -12.3125, "rewards/margins": 5.78125, "rewards/rejected": -18.125, "step": 1930 }, { "epoch": 0.14004186818739622, "grad_norm": 8.360345534211046, "learning_rate": 1.816306436745999e-06, "logits/chosen": 0.365234375, "logits/rejected": 0.72265625, "logps/chosen": -388.0, "logps/rejected": -422.0, "loss": 0.1912, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -13.375, "rewards/margins": 5.875, "rewards/rejected": -19.25, "step": 1940 }, { "epoch": 0.14076373348733126, "grad_norm": 10.785869963286851, "learning_rate": 1.8116432546313529e-06, "logits/chosen": 0.26171875, "logits/rejected": 0.69140625, "logps/chosen": -444.0, "logps/rejected": -470.0, "loss": 0.187, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -14.8125, "rewards/margins": 6.5, "rewards/rejected": -21.25, "step": 1950 }, { "epoch": 0.1414855987872663, "grad_norm": 11.453683403180497, "learning_rate": 1.8070158058105026e-06, "logits/chosen": 0.166015625, "logits/rejected": 0.62109375, "logps/chosen": -394.0, "logps/rejected": -422.0, "loss": 0.233, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -12.5, "rewards/margins": 5.78125, "rewards/rejected": -18.25, "step": 1960 }, { "epoch": 0.14220746408720134, "grad_norm": 13.062365096114274, "learning_rate": 1.8024236362373315e-06, "logits/chosen": 0.23828125, "logits/rejected": 0.578125, "logps/chosen": -398.0, "logps/rejected": -432.0, "loss": 0.268, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -10.6875, "rewards/margins": 5.6875, "rewards/rejected": -16.375, "step": 1970 }, { "epoch": 0.14292932938713637, "grad_norm": 10.624019550535927, "learning_rate": 1.7978662999019787e-06, "logits/chosen": 0.37109375, "logits/rejected": 0.63671875, "logps/chosen": -410.0, "logps/rejected": -438.0, "loss": 0.1878, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -12.5625, "rewards/margins": 5.5625, "rewards/rejected": -18.125, "step": 1980 }, { "epoch": 0.14365119468707138, "grad_norm": 10.74458009808042, "learning_rate": 1.793343358648881e-06, "logits/chosen": 0.2001953125, "logits/rejected": 0.61328125, "logps/chosen": -400.0, "logps/rejected": -414.0, "loss": 0.2297, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -12.5, "rewards/margins": 5.71875, "rewards/rejected": -18.125, "step": 1990 }, { "epoch": 0.14437305998700642, "grad_norm": 12.282973223667438, "learning_rate": 1.7888543819998317e-06, "logits/chosen": 0.265625, "logits/rejected": 0.609375, "logps/chosen": -400.0, "logps/rejected": -460.0, "loss": 0.1806, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -11.625, "rewards/margins": 6.71875, "rewards/rejected": -18.375, "step": 2000 }, { "epoch": 0.14509492528694146, "grad_norm": 15.204546234117293, "learning_rate": 1.7843989469818819e-06, "logits/chosen": 0.2412109375, "logits/rejected": 0.5234375, "logps/chosen": -406.0, "logps/rejected": -464.0, "loss": 0.1954, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -12.75, "rewards/margins": 7.84375, "rewards/rejected": -20.625, "step": 2010 }, { "epoch": 0.1458167905868765, "grad_norm": 12.534594184426462, "learning_rate": 1.779976637959939e-06, "logits/chosen": 0.1533203125, "logits/rejected": 0.67578125, "logps/chosen": -398.0, "logps/rejected": -428.0, "loss": 0.2128, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -12.625, "rewards/margins": 6.90625, "rewards/rejected": -19.5, "step": 2020 }, { "epoch": 0.14653865588681153, "grad_norm": 14.021855277501333, "learning_rate": 1.7755870464739012e-06, "logits/chosen": 0.33203125, "logits/rejected": 0.65625, "logps/chosen": -382.0, "logps/rejected": -408.0, "loss": 0.2105, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -13.3125, "rewards/margins": 5.6875, "rewards/rejected": -19.0, "step": 2030 }, { "epoch": 0.14726052118674654, "grad_norm": 15.709831743443994, "learning_rate": 1.7712297710801907e-06, "logits/chosen": 0.212890625, "logits/rejected": 0.56640625, "logps/chosen": -406.0, "logps/rejected": -496.0, "loss": 0.2, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -15.0625, "rewards/margins": 8.4375, "rewards/rejected": -23.5, "step": 2040 }, { "epoch": 0.14798238648668158, "grad_norm": 13.295218847675114, "learning_rate": 1.7669044171975444e-06, "logits/chosen": 0.23828125, "logits/rejected": 0.6640625, "logps/chosen": -408.0, "logps/rejected": -432.0, "loss": 0.2206, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -14.5625, "rewards/margins": 5.40625, "rewards/rejected": -19.875, "step": 2050 }, { "epoch": 0.14870425178661661, "grad_norm": 9.409166778465407, "learning_rate": 1.7626105969569268e-06, "logits/chosen": 0.140625, "logits/rejected": 0.45703125, "logps/chosen": -406.0, "logps/rejected": -452.0, "loss": 0.2162, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -11.625, "rewards/margins": 5.9375, "rewards/rejected": -17.5, "step": 2060 }, { "epoch": 0.14942611708655165, "grad_norm": 9.132218588835698, "learning_rate": 1.758347929055432e-06, "logits/chosen": 0.203125, "logits/rejected": 0.58203125, "logps/chosen": -390.0, "logps/rejected": -424.0, "loss": 0.1736, "rewards/accuracies": 0.9375, "rewards/chosen": -11.625, "rewards/margins": 5.71875, "rewards/rejected": -17.375, "step": 2070 }, { "epoch": 0.1501479823864867, "grad_norm": 14.007755567962002, "learning_rate": 1.7541160386140582e-06, "logits/chosen": 0.1748046875, "logits/rejected": 0.6015625, "logps/chosen": -424.0, "logps/rejected": -444.0, "loss": 0.2392, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -14.8125, "rewards/margins": 5.46875, "rewards/rejected": -20.25, "step": 2080 }, { "epoch": 0.15086984768642173, "grad_norm": 12.054828319802324, "learning_rate": 1.7499145570392284e-06, "logits/chosen": 0.014892578125, "logits/rejected": 0.625, "logps/chosen": -454.0, "logps/rejected": -480.0, "loss": 0.2321, "rewards/accuracies": 0.90625, "rewards/chosen": -16.375, "rewards/margins": 7.125, "rewards/rejected": -23.5, "step": 2090 }, { "epoch": 0.15159171298635674, "grad_norm": 16.350239081193916, "learning_rate": 1.745743121887939e-06, "logits/chosen": 0.064453125, "logits/rejected": 0.40625, "logps/chosen": -448.0, "logps/rejected": -448.0, "loss": 0.2174, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -14.75, "rewards/margins": 5.78125, "rewards/rejected": -20.5, "step": 2100 }, { "epoch": 0.15231357828629177, "grad_norm": 15.313416651009947, "learning_rate": 1.7416013767364324e-06, "logits/chosen": 0.1865234375, "logits/rejected": 0.515625, "logps/chosen": -402.0, "logps/rejected": -418.0, "loss": 0.2481, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -13.9375, "rewards/margins": 5.03125, "rewards/rejected": -19.0, "step": 2110 }, { "epoch": 0.1530354435862268, "grad_norm": 9.922509352163086, "learning_rate": 1.7374889710522776e-06, "logits/chosen": 0.115234375, "logits/rejected": 0.625, "logps/chosen": -434.0, "logps/rejected": -456.0, "loss": 0.1782, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -14.25, "rewards/margins": 6.34375, "rewards/rejected": -20.5, "step": 2120 }, { "epoch": 0.15375730888616185, "grad_norm": 9.644735054211296, "learning_rate": 1.7334055600697579e-06, "logits/chosen": 0.10986328125, "logits/rejected": 0.58984375, "logps/chosen": -430.0, "logps/rejected": -468.0, "loss": 0.2259, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -14.875, "rewards/margins": 6.5625, "rewards/rejected": -21.5, "step": 2130 }, { "epoch": 0.15447917418609688, "grad_norm": 11.01066558661573, "learning_rate": 1.7293508046684678e-06, "logits/chosen": 0.173828125, "logits/rejected": 0.5859375, "logps/chosen": -460.0, "logps/rejected": -472.0, "loss": 0.1838, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -15.5, "rewards/margins": 6.53125, "rewards/rejected": -22.125, "step": 2140 }, { "epoch": 0.1552010394860319, "grad_norm": 12.044157818855963, "learning_rate": 1.7253243712550145e-06, "logits/chosen": 0.21484375, "logits/rejected": 0.6171875, "logps/chosen": -404.0, "logps/rejected": -430.0, "loss": 0.2021, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -13.625, "rewards/margins": 5.46875, "rewards/rejected": -19.125, "step": 2150 }, { "epoch": 0.15592290478596693, "grad_norm": 9.914671007771878, "learning_rate": 1.7213259316477406e-06, "logits/chosen": 0.076171875, "logits/rejected": 0.439453125, "logps/chosen": -412.0, "logps/rejected": -434.0, "loss": 0.1868, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -12.4375, "rewards/margins": 5.78125, "rewards/rejected": -18.25, "step": 2160 }, { "epoch": 0.15664477008590197, "grad_norm": 12.060103707295902, "learning_rate": 1.7173551629643674e-06, "logits/chosen": 0.1669921875, "logits/rejected": 0.546875, "logps/chosen": -386.0, "logps/rejected": -404.0, "loss": 0.2364, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -10.8125, "rewards/margins": 6.125, "rewards/rejected": -16.875, "step": 2170 }, { "epoch": 0.157366635385837, "grad_norm": 9.966243982503787, "learning_rate": 1.713411747512477e-06, "logits/chosen": 0.185546875, "logits/rejected": 0.412109375, "logps/chosen": -394.0, "logps/rejected": -424.0, "loss": 0.1849, "rewards/accuracies": 0.90625, "rewards/chosen": -11.3125, "rewards/margins": 5.3125, "rewards/rejected": -16.625, "step": 2180 }, { "epoch": 0.15808850068577204, "grad_norm": 9.152836463229223, "learning_rate": 1.709495372682753e-06, "logits/chosen": 0.1826171875, "logits/rejected": 0.5859375, "logps/chosen": -408.0, "logps/rejected": -420.0, "loss": 0.2077, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -11.9375, "rewards/margins": 5.5, "rewards/rejected": -17.375, "step": 2190 }, { "epoch": 0.15881036598570708, "grad_norm": 10.149154570750877, "learning_rate": 1.7056057308448832e-06, "logits/chosen": 0.162109375, "logits/rejected": 0.69921875, "logps/chosen": -410.0, "logps/rejected": -424.0, "loss": 0.205, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -11.4375, "rewards/margins": 6.4375, "rewards/rejected": -17.875, "step": 2200 }, { "epoch": 0.1595322312856421, "grad_norm": 8.431360747791258, "learning_rate": 1.701742519246068e-06, "logits/chosen": 0.1416015625, "logits/rejected": 0.56640625, "logps/chosen": -410.0, "logps/rejected": -444.0, "loss": 0.2018, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -12.375, "rewards/margins": 6.21875, "rewards/rejected": -18.625, "step": 2210 }, { "epoch": 0.16025409658557713, "grad_norm": 55.234357967881564, "learning_rate": 1.6979054399120355e-06, "logits/chosen": 0.09521484375, "logits/rejected": 0.62890625, "logps/chosen": -414.0, "logps/rejected": -416.0, "loss": 0.2934, "rewards/accuracies": 0.90625, "rewards/chosen": -12.125, "rewards/margins": 4.9375, "rewards/rejected": -17.125, "step": 2220 }, { "epoch": 0.16097596188551216, "grad_norm": 8.934500804692604, "learning_rate": 1.6940941995505069e-06, "logits/chosen": 0.1650390625, "logits/rejected": 0.50390625, "logps/chosen": -398.0, "logps/rejected": -432.0, "loss": 0.1755, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -11.1875, "rewards/margins": 6.375, "rewards/rejected": -17.5, "step": 2230 }, { "epoch": 0.1616978271854472, "grad_norm": 10.052595872186448, "learning_rate": 1.6903085094570331e-06, "logits/chosen": 0.13671875, "logits/rejected": 0.6484375, "logps/chosen": -418.0, "logps/rejected": -436.0, "loss": 0.1884, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -13.25, "rewards/margins": 6.71875, "rewards/rejected": -20.0, "step": 2240 }, { "epoch": 0.16241969248538224, "grad_norm": 8.831774484386823, "learning_rate": 1.6865480854231356e-06, "logits/chosen": 0.09326171875, "logits/rejected": 0.4453125, "logps/chosen": -438.0, "logps/rejected": -444.0, "loss": 0.1955, "rewards/accuracies": 0.90625, "rewards/chosen": -14.1875, "rewards/margins": 5.875, "rewards/rejected": -20.0, "step": 2250 }, { "epoch": 0.16314155778531725, "grad_norm": 5.5636179377576696, "learning_rate": 1.682812647646685e-06, "logits/chosen": 0.006134033203125, "logits/rejected": 0.52734375, "logps/chosen": -402.0, "logps/rejected": -422.0, "loss": 0.1935, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -12.5, "rewards/margins": 5.9375, "rewards/rejected": -18.5, "step": 2260 }, { "epoch": 0.16386342308525229, "grad_norm": 10.415868156441604, "learning_rate": 1.6791019206444541e-06, "logits/chosen": 0.1494140625, "logits/rejected": 0.53515625, "logps/chosen": -414.0, "logps/rejected": -420.0, "loss": 0.2061, "rewards/accuracies": 0.90625, "rewards/chosen": -12.6875, "rewards/margins": 5.6875, "rewards/rejected": -18.375, "step": 2270 }, { "epoch": 0.16458528838518732, "grad_norm": 14.417139545330683, "learning_rate": 1.675415633166782e-06, "logits/chosen": 0.212890625, "logits/rejected": 0.53515625, "logps/chosen": -398.0, "logps/rejected": -428.0, "loss": 0.2311, "rewards/accuracies": 0.90625, "rewards/chosen": -12.375, "rewards/margins": 6.1875, "rewards/rejected": -18.5, "step": 2280 }, { "epoch": 0.16530715368512236, "grad_norm": 8.975780689269767, "learning_rate": 1.6717535181142914e-06, "logits/chosen": 0.216796875, "logits/rejected": 0.609375, "logps/chosen": -430.0, "logps/rejected": -454.0, "loss": 0.1959, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -14.1875, "rewards/margins": 6.34375, "rewards/rejected": -20.5, "step": 2290 }, { "epoch": 0.1660290189850574, "grad_norm": 9.749818904849564, "learning_rate": 1.668115312456598e-06, "logits/chosen": 0.337890625, "logits/rejected": 0.66796875, "logps/chosen": -424.0, "logps/rejected": -472.0, "loss": 0.2021, "rewards/accuracies": 0.9375, "rewards/chosen": -15.5625, "rewards/margins": 5.96875, "rewards/rejected": -21.5, "step": 2300 }, { "epoch": 0.1667508842849924, "grad_norm": 11.868753870661774, "learning_rate": 1.6645007571529578e-06, "logits/chosen": 0.390625, "logits/rejected": 0.74609375, "logps/chosen": -406.0, "logps/rejected": -436.0, "loss": 0.1785, "rewards/accuracies": 0.90625, "rewards/chosen": -14.5625, "rewards/margins": 5.25, "rewards/rejected": -19.875, "step": 2310 }, { "epoch": 0.16747274958492744, "grad_norm": 10.65954452814044, "learning_rate": 1.6609095970747992e-06, "logits/chosen": 0.181640625, "logits/rejected": 0.63671875, "logps/chosen": -426.0, "logps/rejected": -464.0, "loss": 0.2683, "rewards/accuracies": 0.90625, "rewards/chosen": -13.75, "rewards/margins": 6.25, "rewards/rejected": -20.0, "step": 2320 }, { "epoch": 0.16819461488486248, "grad_norm": 11.707709690937815, "learning_rate": 1.6573415809300833e-06, "logits/chosen": 0.2041015625, "logits/rejected": 0.6484375, "logps/chosen": -440.0, "logps/rejected": -432.0, "loss": 0.1807, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -13.25, "rewards/margins": 5.90625, "rewards/rejected": -19.125, "step": 2330 }, { "epoch": 0.16891648018479752, "grad_norm": 12.917039927509615, "learning_rate": 1.6537964611894462e-06, "logits/chosen": 0.2734375, "logits/rejected": 0.65625, "logps/chosen": -412.0, "logps/rejected": -454.0, "loss": 0.2093, "rewards/accuracies": 0.9375, "rewards/chosen": -14.6875, "rewards/margins": 6.15625, "rewards/rejected": -20.875, "step": 2340 }, { "epoch": 0.16963834548473256, "grad_norm": 13.109265734601914, "learning_rate": 1.6502739940140692e-06, "logits/chosen": 0.2275390625, "logits/rejected": 0.6015625, "logps/chosen": -406.0, "logps/rejected": -416.0, "loss": 0.1983, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -12.6875, "rewards/margins": 6.1875, "rewards/rejected": -18.875, "step": 2350 }, { "epoch": 0.1703602107846676, "grad_norm": 12.7852010670046, "learning_rate": 1.6467739391852364e-06, "logits/chosen": 0.2314453125, "logits/rejected": 0.5703125, "logps/chosen": -426.0, "logps/rejected": -452.0, "loss": 0.2182, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -14.75, "rewards/margins": 5.5, "rewards/rejected": -20.25, "step": 2360 }, { "epoch": 0.1710820760846026, "grad_norm": 12.006913338677231, "learning_rate": 1.6432960600355221e-06, "logits/chosen": 0.392578125, "logits/rejected": 0.76953125, "logps/chosen": -418.0, "logps/rejected": -448.0, "loss": 0.1858, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -15.375, "rewards/margins": 5.96875, "rewards/rejected": -21.375, "step": 2370 }, { "epoch": 0.17180394138453764, "grad_norm": 10.449935295754848, "learning_rate": 1.6398401233815756e-06, "logits/chosen": 0.265625, "logits/rejected": 0.67578125, "logps/chosen": -468.0, "logps/rejected": -486.0, "loss": 0.2286, "rewards/accuracies": 0.9375, "rewards/chosen": -16.75, "rewards/margins": 6.78125, "rewards/rejected": -23.625, "step": 2380 }, { "epoch": 0.17252580668447268, "grad_norm": 9.107480351815683, "learning_rate": 1.6364058994584524e-06, "logits/chosen": 0.28125, "logits/rejected": 0.73046875, "logps/chosen": -428.0, "logps/rejected": -482.0, "loss": 0.2075, "rewards/accuracies": 0.96875, "rewards/chosen": -15.3125, "rewards/margins": 7.6875, "rewards/rejected": -23.0, "step": 2390 }, { "epoch": 0.17324767198440771, "grad_norm": 11.598515386975478, "learning_rate": 1.6329931618554522e-06, "logits/chosen": 0.4375, "logits/rejected": 0.82421875, "logps/chosen": -420.0, "logps/rejected": -456.0, "loss": 0.2015, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -15.3125, "rewards/margins": 6.46875, "rewards/rejected": -21.75, "step": 2400 }, { "epoch": 0.17396953728434275, "grad_norm": 15.004127054254583, "learning_rate": 1.6296016874534209e-06, "logits/chosen": 0.287109375, "logits/rejected": 0.6953125, "logps/chosen": -432.0, "logps/rejected": -454.0, "loss": 0.1972, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -14.25, "rewards/margins": 6.125, "rewards/rejected": -20.375, "step": 2410 }, { "epoch": 0.17469140258427776, "grad_norm": 7.425961683260218, "learning_rate": 1.6262312563634835e-06, "logits/chosen": 0.310546875, "logits/rejected": 0.73046875, "logps/chosen": -438.0, "logps/rejected": -472.0, "loss": 0.1841, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -15.1875, "rewards/margins": 5.875, "rewards/rejected": -21.0, "step": 2420 }, { "epoch": 0.1754132678842128, "grad_norm": 11.234521988399736, "learning_rate": 1.6228816518671587e-06, "logits/chosen": 0.27734375, "logits/rejected": 0.63671875, "logps/chosen": -460.0, "logps/rejected": -488.0, "loss": 0.1965, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -15.6875, "rewards/margins": 5.84375, "rewards/rejected": -21.5, "step": 2430 }, { "epoch": 0.17613513318414784, "grad_norm": 11.359892253488683, "learning_rate": 1.619552660357832e-06, "logits/chosen": 0.46484375, "logits/rejected": 0.80859375, "logps/chosen": -426.0, "logps/rejected": -464.0, "loss": 0.2204, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -16.125, "rewards/margins": 6.09375, "rewards/rejected": -22.25, "step": 2440 }, { "epoch": 0.17685699848408287, "grad_norm": 12.492864396070615, "learning_rate": 1.616244071283537e-06, "logits/chosen": 0.44140625, "logits/rejected": 0.796875, "logps/chosen": -424.0, "logps/rejected": -446.0, "loss": 0.1792, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -13.9375, "rewards/margins": 5.6875, "rewards/rejected": -19.625, "step": 2450 }, { "epoch": 0.1775788637840179, "grad_norm": 11.280239518464997, "learning_rate": 1.6129556770910235e-06, "logits/chosen": 0.271484375, "logits/rejected": 0.73046875, "logps/chosen": -416.0, "logps/rejected": -442.0, "loss": 0.1866, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -13.4375, "rewards/margins": 6.25, "rewards/rejected": -19.625, "step": 2460 }, { "epoch": 0.17830072908395295, "grad_norm": 12.824739516975816, "learning_rate": 1.6096872731710673e-06, "logits/chosen": 0.1826171875, "logits/rejected": 0.5859375, "logps/chosen": -432.0, "logps/rejected": -472.0, "loss": 0.1907, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -14.5625, "rewards/margins": 6.84375, "rewards/rejected": -21.375, "step": 2470 }, { "epoch": 0.17902259438388796, "grad_norm": 8.606581441159705, "learning_rate": 1.6064386578049978e-06, "logits/chosen": 0.255859375, "logits/rejected": 0.640625, "logps/chosen": -470.0, "logps/rejected": -482.0, "loss": 0.1949, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -16.375, "rewards/margins": 7.0625, "rewards/rejected": -23.375, "step": 2480 }, { "epoch": 0.179744459683823, "grad_norm": 8.20022200654204, "learning_rate": 1.6032096321124046e-06, "logits/chosen": 0.1904296875, "logits/rejected": 0.7421875, "logps/chosen": -438.0, "logps/rejected": -448.0, "loss": 0.2032, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -15.1875, "rewards/margins": 6.0625, "rewards/rejected": -21.25, "step": 2490 }, { "epoch": 0.18046632498375803, "grad_norm": 12.300881569940305, "learning_rate": 1.6e-06, "logits/chosen": 0.349609375, "logits/rejected": 0.7890625, "logps/chosen": -398.0, "logps/rejected": -420.0, "loss": 0.198, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -13.625, "rewards/margins": 5.96875, "rewards/rejected": -19.625, "step": 2500 }, { "epoch": 0.18118819028369307, "grad_norm": 14.422231881861714, "learning_rate": 1.5968095681115984e-06, "logits/chosen": 0.265625, "logits/rejected": 0.6328125, "logps/chosen": -408.0, "logps/rejected": -432.0, "loss": 0.1862, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -12.5, "rewards/margins": 6.46875, "rewards/rejected": -19.0, "step": 2510 }, { "epoch": 0.1819100555836281, "grad_norm": 16.10477264070319, "learning_rate": 1.5936381457791914e-06, "logits/chosen": 0.26171875, "logits/rejected": 0.578125, "logps/chosen": -408.0, "logps/rejected": -432.0, "loss": 0.1909, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -12.5625, "rewards/margins": 6.21875, "rewards/rejected": -18.75, "step": 2520 }, { "epoch": 0.18263192088356311, "grad_norm": 10.335228415701346, "learning_rate": 1.590485544975088e-06, "logits/chosen": 0.041748046875, "logits/rejected": 0.5234375, "logps/chosen": -416.0, "logps/rejected": -460.0, "loss": 0.2408, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -13.0, "rewards/margins": 6.5, "rewards/rejected": -19.5, "step": 2530 }, { "epoch": 0.18335378618349815, "grad_norm": 12.80454958102725, "learning_rate": 1.5873515802650901e-06, "logits/chosen": 0.1787109375, "logits/rejected": 0.625, "logps/chosen": -418.0, "logps/rejected": -436.0, "loss": 0.1904, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -13.5, "rewards/margins": 6.125, "rewards/rejected": -19.625, "step": 2540 }, { "epoch": 0.1840756514834332, "grad_norm": 9.840395629814845, "learning_rate": 1.584236068762679e-06, "logits/chosen": 0.2080078125, "logits/rejected": 0.58203125, "logps/chosen": -428.0, "logps/rejected": -458.0, "loss": 0.1802, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -14.5, "rewards/margins": 6.0625, "rewards/rejected": -20.5, "step": 2550 }, { "epoch": 0.18479751678336823, "grad_norm": 14.835595756588026, "learning_rate": 1.5811388300841894e-06, "logits/chosen": 0.1162109375, "logits/rejected": 0.70703125, "logps/chosen": -430.0, "logps/rejected": -450.0, "loss": 0.2243, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -15.125, "rewards/margins": 5.9375, "rewards/rejected": -21.125, "step": 2560 }, { "epoch": 0.18551938208330326, "grad_norm": 10.202276538934665, "learning_rate": 1.5780596863049431e-06, "logits/chosen": 0.1474609375, "logits/rejected": 0.625, "logps/chosen": -422.0, "logps/rejected": -454.0, "loss": 0.1745, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -14.0, "rewards/margins": 6.78125, "rewards/rejected": -20.75, "step": 2570 }, { "epoch": 0.1862412473832383, "grad_norm": 8.667430710729915, "learning_rate": 1.5749984619163156e-06, "logits/chosen": 0.130859375, "logits/rejected": 0.55078125, "logps/chosen": -428.0, "logps/rejected": -476.0, "loss": 0.1383, "rewards/accuracies": 0.9375, "rewards/chosen": -14.625, "rewards/margins": 6.5, "rewards/rejected": -21.125, "step": 2580 }, { "epoch": 0.1869631126831733, "grad_norm": 12.651999186200433, "learning_rate": 1.5719549837837187e-06, "logits/chosen": 0.1298828125, "logits/rejected": 0.64453125, "logps/chosen": -420.0, "logps/rejected": -470.0, "loss": 0.2232, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -14.3125, "rewards/margins": 6.28125, "rewards/rejected": -20.625, "step": 2590 }, { "epoch": 0.18768497798310835, "grad_norm": 13.18574050384436, "learning_rate": 1.5689290811054722e-06, "logits/chosen": 0.251953125, "logits/rejected": 0.66015625, "logps/chosen": -432.0, "logps/rejected": -470.0, "loss": 0.1888, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -14.875, "rewards/margins": 6.09375, "rewards/rejected": -21.0, "step": 2600 }, { "epoch": 0.18840684328304338, "grad_norm": 7.448537282036098, "learning_rate": 1.5659205853725426e-06, "logits/chosen": 0.11328125, "logits/rejected": 0.57421875, "logps/chosen": -444.0, "logps/rejected": -462.0, "loss": 0.2051, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -15.0625, "rewards/margins": 5.25, "rewards/rejected": -20.25, "step": 2610 }, { "epoch": 0.18912870858297842, "grad_norm": 9.810287344467847, "learning_rate": 1.562929330329127e-06, "logits/chosen": 0.29296875, "logits/rejected": 0.765625, "logps/chosen": -390.0, "logps/rejected": -446.0, "loss": 0.1911, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -13.25, "rewards/margins": 6.0625, "rewards/rejected": -19.375, "step": 2620 }, { "epoch": 0.18985057388291346, "grad_norm": 15.492985829180554, "learning_rate": 1.5599551519340636e-06, "logits/chosen": 0.2216796875, "logits/rejected": 0.56640625, "logps/chosen": -422.0, "logps/rejected": -474.0, "loss": 0.252, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -13.9375, "rewards/margins": 6.375, "rewards/rejected": -20.25, "step": 2630 }, { "epoch": 0.19057243918284847, "grad_norm": 9.866600585028282, "learning_rate": 1.556997888323046e-06, "logits/chosen": 0.34765625, "logits/rejected": 0.84765625, "logps/chosen": -438.0, "logps/rejected": -472.0, "loss": 0.1799, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -15.75, "rewards/margins": 6.8125, "rewards/rejected": -22.5, "step": 2640 }, { "epoch": 0.1912943044827835, "grad_norm": 14.288291846556952, "learning_rate": 1.5540573797716226e-06, "logits/chosen": 0.1513671875, "logits/rejected": 0.54296875, "logps/chosen": -472.0, "logps/rejected": -492.0, "loss": 0.1516, "rewards/accuracies": 0.9375, "rewards/chosen": -16.5, "rewards/margins": 6.625, "rewards/rejected": -23.0, "step": 2650 }, { "epoch": 0.19201616978271854, "grad_norm": 13.952970345689721, "learning_rate": 1.5511334686589623e-06, "logits/chosen": 0.31640625, "logits/rejected": 0.71484375, "logps/chosen": -424.0, "logps/rejected": -460.0, "loss": 0.1752, "rewards/accuracies": 0.9375, "rewards/chosen": -14.375, "rewards/margins": 6.4375, "rewards/rejected": -20.875, "step": 2660 }, { "epoch": 0.19273803508265358, "grad_norm": 11.428289231888769, "learning_rate": 1.548225999432367e-06, "logits/chosen": 0.16796875, "logits/rejected": 0.6640625, "logps/chosen": -376.0, "logps/rejected": -406.0, "loss": 0.2034, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -12.0, "rewards/margins": 6.125, "rewards/rejected": -18.125, "step": 2670 }, { "epoch": 0.19345990038258862, "grad_norm": 13.893667945894439, "learning_rate": 1.5453348185725114e-06, "logits/chosen": 0.06982421875, "logits/rejected": 0.4765625, "logps/chosen": -388.0, "logps/rejected": -438.0, "loss": 0.2028, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -11.5625, "rewards/margins": 6.1875, "rewards/rejected": -17.75, "step": 2680 }, { "epoch": 0.19418176568252365, "grad_norm": 11.343952617550604, "learning_rate": 1.542459774559398e-06, "logits/chosen": 0.234375, "logits/rejected": 0.58984375, "logps/chosen": -388.0, "logps/rejected": -404.0, "loss": 0.197, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -10.8125, "rewards/margins": 6.15625, "rewards/rejected": -17.0, "step": 2690 }, { "epoch": 0.19490363098245866, "grad_norm": 10.676684711448711, "learning_rate": 1.539600717839002e-06, "logits/chosen": 0.08154296875, "logits/rejected": 0.58984375, "logps/chosen": -396.0, "logps/rejected": -432.0, "loss": 0.2267, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -12.25, "rewards/margins": 6.5, "rewards/rejected": -18.75, "step": 2700 }, { "epoch": 0.1956254962823937, "grad_norm": 14.048097366108951, "learning_rate": 1.536757500790597e-06, "logits/chosen": 0.22265625, "logits/rejected": 0.74609375, "logps/chosen": -408.0, "logps/rejected": -422.0, "loss": 0.1807, "rewards/accuracies": 0.90625, "rewards/chosen": -13.25, "rewards/margins": 6.3125, "rewards/rejected": -19.5, "step": 2710 }, { "epoch": 0.19634736158232874, "grad_norm": 9.70528016289558, "learning_rate": 1.5339299776947407e-06, "logits/chosen": 0.28125, "logits/rejected": 0.7109375, "logps/chosen": -424.0, "logps/rejected": -454.0, "loss": 0.1618, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -14.125, "rewards/margins": 7.09375, "rewards/rejected": -21.25, "step": 2720 }, { "epoch": 0.19706922688226378, "grad_norm": 12.404384958106617, "learning_rate": 1.5311180047019054e-06, "logits/chosen": 0.158203125, "logits/rejected": 0.61328125, "logps/chosen": -434.0, "logps/rejected": -468.0, "loss": 0.2529, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -13.5, "rewards/margins": 5.84375, "rewards/rejected": -19.375, "step": 2730 }, { "epoch": 0.1977910921821988, "grad_norm": 11.505071076793424, "learning_rate": 1.5283214398017402e-06, "logits/chosen": 0.345703125, "logits/rejected": 0.6875, "logps/chosen": -418.0, "logps/rejected": -458.0, "loss": 0.1659, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -13.875, "rewards/margins": 6.84375, "rewards/rejected": -20.75, "step": 2740 }, { "epoch": 0.19851295748213382, "grad_norm": 6.501972730303094, "learning_rate": 1.5255401427929477e-06, "logits/chosen": 0.419921875, "logits/rejected": 0.82421875, "logps/chosen": -414.0, "logps/rejected": -438.0, "loss": 0.2053, "rewards/accuracies": 0.9375, "rewards/chosen": -15.0625, "rewards/margins": 6.25, "rewards/rejected": -21.25, "step": 2750 }, { "epoch": 0.19923482278206886, "grad_norm": 12.69238138816897, "learning_rate": 1.5227739752537617e-06, "logits/chosen": 0.2265625, "logits/rejected": 0.6328125, "logps/chosen": -426.0, "logps/rejected": -462.0, "loss": 0.1697, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -13.75, "rewards/margins": 6.28125, "rewards/rejected": -20.0, "step": 2760 }, { "epoch": 0.1999566880820039, "grad_norm": 13.619669811297522, "learning_rate": 1.5200228005130127e-06, "logits/chosen": 0.376953125, "logits/rejected": 0.8984375, "logps/chosen": -412.0, "logps/rejected": -440.0, "loss": 0.17, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -13.25, "rewards/margins": 6.34375, "rewards/rejected": -19.625, "step": 2770 }, { "epoch": 0.20067855338193893, "grad_norm": 12.131751634305132, "learning_rate": 1.5172864836217631e-06, "logits/chosen": 0.287109375, "logits/rejected": 0.8203125, "logps/chosen": -400.0, "logps/rejected": -418.0, "loss": 0.1569, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -13.0625, "rewards/margins": 6.40625, "rewards/rejected": -19.5, "step": 2780 }, { "epoch": 0.20140041868187397, "grad_norm": 7.6373809150214695, "learning_rate": 1.514564891325506e-06, "logits/chosen": 0.11083984375, "logits/rejected": 0.65234375, "logps/chosen": -440.0, "logps/rejected": -482.0, "loss": 0.1525, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -14.4375, "rewards/margins": 7.3125, "rewards/rejected": -21.75, "step": 2790 }, { "epoch": 0.20212228398180898, "grad_norm": 14.300010569539554, "learning_rate": 1.5118578920369086e-06, "logits/chosen": 0.26953125, "logits/rejected": 0.7421875, "logps/chosen": -424.0, "logps/rejected": -500.0, "loss": 0.1546, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -14.0625, "rewards/margins": 7.25, "rewards/rejected": -21.375, "step": 2800 }, { "epoch": 0.20284414928174402, "grad_norm": 12.272183938004055, "learning_rate": 1.5091653558090898e-06, "logits/chosen": 0.265625, "logits/rejected": 0.70703125, "logps/chosen": -434.0, "logps/rejected": -462.0, "loss": 0.1906, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -14.5625, "rewards/margins": 6.46875, "rewards/rejected": -21.0, "step": 2810 }, { "epoch": 0.20356601458167906, "grad_norm": 16.38928501194481, "learning_rate": 1.506487154309419e-06, "logits/chosen": 0.11669921875, "logits/rejected": 0.6953125, "logps/chosen": -420.0, "logps/rejected": -442.0, "loss": 0.207, "rewards/accuracies": 0.9375, "rewards/chosen": -13.75, "rewards/margins": 6.6875, "rewards/rejected": -20.5, "step": 2820 }, { "epoch": 0.2042878798816141, "grad_norm": 13.222458502818446, "learning_rate": 1.5038231607938247e-06, "logits/chosen": 0.1845703125, "logits/rejected": 0.484375, "logps/chosen": -404.0, "logps/rejected": -446.0, "loss": 0.1861, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -13.625, "rewards/margins": 6.5625, "rewards/rejected": -20.125, "step": 2830 }, { "epoch": 0.20500974518154913, "grad_norm": 9.982854161862525, "learning_rate": 1.501173250081603e-06, "logits/chosen": 0.0296630859375, "logits/rejected": 0.6953125, "logps/chosen": -420.0, "logps/rejected": -438.0, "loss": 0.1849, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -12.8125, "rewards/margins": 6.5, "rewards/rejected": -19.25, "step": 2840 }, { "epoch": 0.20573161048148417, "grad_norm": 11.460440211823183, "learning_rate": 1.4985372985307103e-06, "logits/chosen": 0.1513671875, "logits/rejected": 0.54296875, "logps/chosen": -438.0, "logps/rejected": -450.0, "loss": 0.181, "rewards/accuracies": 0.9375, "rewards/chosen": -14.0625, "rewards/margins": 6.40625, "rewards/rejected": -20.5, "step": 2850 }, { "epoch": 0.20645347578141918, "grad_norm": 9.79672924484836, "learning_rate": 1.4959151840135313e-06, "logits/chosen": 0.25, "logits/rejected": 0.671875, "logps/chosen": -422.0, "logps/rejected": -454.0, "loss": 0.1722, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -13.875, "rewards/margins": 6.59375, "rewards/rejected": -20.375, "step": 2860 }, { "epoch": 0.20717534108135421, "grad_norm": 11.28878246527859, "learning_rate": 1.4933067858931148e-06, "logits/chosen": 0.126953125, "logits/rejected": 0.69140625, "logps/chosen": -416.0, "logps/rejected": -442.0, "loss": 0.1883, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -13.25, "rewards/margins": 6.6875, "rewards/rejected": -19.875, "step": 2870 }, { "epoch": 0.20789720638128925, "grad_norm": 14.664223066987828, "learning_rate": 1.4907119849998597e-06, "logits/chosen": 0.30078125, "logits/rejected": 0.5390625, "logps/chosen": -414.0, "logps/rejected": -460.0, "loss": 0.1915, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -13.9375, "rewards/margins": 6.59375, "rewards/rejected": -20.5, "step": 2880 }, { "epoch": 0.2086190716812243, "grad_norm": 10.691903086475453, "learning_rate": 1.488130663608649e-06, "logits/chosen": 0.2041015625, "logits/rejected": 0.58984375, "logps/chosen": -394.0, "logps/rejected": -418.0, "loss": 0.1719, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -11.875, "rewards/margins": 6.625, "rewards/rejected": -18.5, "step": 2890 }, { "epoch": 0.20934093698115933, "grad_norm": 11.428211364757644, "learning_rate": 1.4855627054164149e-06, "logits/chosen": 0.08642578125, "logits/rejected": 0.439453125, "logps/chosen": -410.0, "logps/rejected": -436.0, "loss": 0.1694, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -12.25, "rewards/margins": 6.09375, "rewards/rejected": -18.375, "step": 2900 }, { "epoch": 0.21006280228109434, "grad_norm": 9.14566341117851, "learning_rate": 1.4830079955201294e-06, "logits/chosen": 0.025634765625, "logits/rejected": 0.494140625, "logps/chosen": -422.0, "logps/rejected": -430.0, "loss": 0.156, "rewards/accuracies": 0.96875, "rewards/chosen": -12.1875, "rewards/margins": 6.5625, "rewards/rejected": -18.75, "step": 2910 }, { "epoch": 0.21078466758102937, "grad_norm": 9.44623018335635, "learning_rate": 1.4804664203952103e-06, "logits/chosen": 0.138671875, "logits/rejected": 0.6171875, "logps/chosen": -414.0, "logps/rejected": -458.0, "loss": 0.1584, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -13.0625, "rewards/margins": 6.125, "rewards/rejected": -19.25, "step": 2920 }, { "epoch": 0.2115065328809644, "grad_norm": 9.405280142170081, "learning_rate": 1.4779378678743327e-06, "logits/chosen": 0.059814453125, "logits/rejected": 0.55859375, "logps/chosen": -420.0, "logps/rejected": -478.0, "loss": 0.2158, "rewards/accuracies": 0.9375, "rewards/chosen": -13.875, "rewards/margins": 8.75, "rewards/rejected": -22.625, "step": 2930 }, { "epoch": 0.21222839818089945, "grad_norm": 6.142082717026958, "learning_rate": 1.4754222271266348e-06, "logits/chosen": 0.318359375, "logits/rejected": 0.73046875, "logps/chosen": -422.0, "logps/rejected": -462.0, "loss": 0.193, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -15.25, "rewards/margins": 5.75, "rewards/rejected": -21.0, "step": 2940 }, { "epoch": 0.21295026348083448, "grad_norm": 10.358986622378714, "learning_rate": 1.4729193886373175e-06, "logits/chosen": 0.1669921875, "logits/rejected": 0.640625, "logps/chosen": -426.0, "logps/rejected": -444.0, "loss": 0.169, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -15.125, "rewards/margins": 6.625, "rewards/rejected": -21.75, "step": 2950 }, { "epoch": 0.21367212878076952, "grad_norm": 12.873055511012865, "learning_rate": 1.4704292441876156e-06, "logits/chosen": 0.2158203125, "logits/rejected": 0.546875, "logps/chosen": -442.0, "logps/rejected": -464.0, "loss": 0.1918, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -14.75, "rewards/margins": 6.1875, "rewards/rejected": -21.0, "step": 2960 }, { "epoch": 0.21439399408070453, "grad_norm": 10.452255742182105, "learning_rate": 1.4679516868351474e-06, "logits/chosen": 0.3515625, "logits/rejected": 0.73046875, "logps/chosen": -434.0, "logps/rejected": -454.0, "loss": 0.1901, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -15.375, "rewards/margins": 5.84375, "rewards/rejected": -21.25, "step": 2970 }, { "epoch": 0.21511585938063957, "grad_norm": 17.840944077323023, "learning_rate": 1.4654866108946234e-06, "logits/chosen": 0.212890625, "logits/rejected": 0.703125, "logps/chosen": -448.0, "logps/rejected": -466.0, "loss": 0.1854, "rewards/accuracies": 0.9375, "rewards/chosen": -16.25, "rewards/margins": 6.59375, "rewards/rejected": -22.75, "step": 2980 }, { "epoch": 0.2158377246805746, "grad_norm": 11.482199402328927, "learning_rate": 1.4630339119189101e-06, "logits/chosen": 0.203125, "logits/rejected": 0.703125, "logps/chosen": -436.0, "logps/rejected": -472.0, "loss": 0.1662, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -15.875, "rewards/margins": 6.59375, "rewards/rejected": -22.5, "step": 2990 }, { "epoch": 0.21655958998050964, "grad_norm": 9.463141196734142, "learning_rate": 1.4605934866804429e-06, "logits/chosen": 0.05712890625, "logits/rejected": 0.65234375, "logps/chosen": -452.0, "logps/rejected": -476.0, "loss": 0.1919, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -16.25, "rewards/margins": 7.28125, "rewards/rejected": -23.625, "step": 3000 }, { "epoch": 0.21728145528044468, "grad_norm": 12.984124336299544, "learning_rate": 1.4581652331529784e-06, "logits/chosen": 0.2392578125, "logits/rejected": 0.5703125, "logps/chosen": -446.0, "logps/rejected": -482.0, "loss": 0.1598, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -16.125, "rewards/margins": 6.40625, "rewards/rejected": -22.5, "step": 3010 }, { "epoch": 0.2180033205803797, "grad_norm": 10.979645732499227, "learning_rate": 1.4557490504936778e-06, "logits/chosen": 0.248046875, "logits/rejected": 0.6875, "logps/chosen": -430.0, "logps/rejected": -464.0, "loss": 0.216, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -15.75, "rewards/margins": 6.6875, "rewards/rejected": -22.5, "step": 3020 }, { "epoch": 0.21872518588031473, "grad_norm": 12.208067276945314, "learning_rate": 1.453344839025519e-06, "logits/chosen": 0.251953125, "logits/rejected": 0.546875, "logps/chosen": -428.0, "logps/rejected": -474.0, "loss": 0.1551, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -14.8125, "rewards/margins": 6.8125, "rewards/rejected": -21.625, "step": 3030 }, { "epoch": 0.21944705118024976, "grad_norm": 9.20970951491161, "learning_rate": 1.4509525002200234e-06, "logits/chosen": 0.125, "logits/rejected": 0.6484375, "logps/chosen": -410.0, "logps/rejected": -426.0, "loss": 0.154, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -13.0, "rewards/margins": 6.8125, "rewards/rejected": -19.75, "step": 3040 }, { "epoch": 0.2201689164801848, "grad_norm": 10.122483022617127, "learning_rate": 1.4485719366802965e-06, "logits/chosen": 0.263671875, "logits/rejected": 0.73046875, "logps/chosen": -418.0, "logps/rejected": -434.0, "loss": 0.1944, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -13.5625, "rewards/margins": 6.0625, "rewards/rejected": -19.625, "step": 3050 }, { "epoch": 0.22089078178011984, "grad_norm": 11.650142604340479, "learning_rate": 1.4462030521243742e-06, "logits/chosen": 0.26171875, "logits/rejected": 0.66796875, "logps/chosen": -430.0, "logps/rejected": -458.0, "loss": 0.1669, "rewards/accuracies": 0.9375, "rewards/chosen": -14.4375, "rewards/margins": 6.6875, "rewards/rejected": -21.125, "step": 3060 }, { "epoch": 0.22161264708005488, "grad_norm": 14.718243709006993, "learning_rate": 1.443845751368867e-06, "logits/chosen": 0.265625, "logits/rejected": 0.6953125, "logps/chosen": -434.0, "logps/rejected": -466.0, "loss": 0.202, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -16.625, "rewards/margins": 6.1875, "rewards/rejected": -22.75, "step": 3070 }, { "epoch": 0.22233451237998988, "grad_norm": 8.88844746809707, "learning_rate": 1.4414999403128943e-06, "logits/chosen": 0.283203125, "logits/rejected": 0.66015625, "logps/chosen": -458.0, "logps/rejected": -478.0, "loss": 0.2999, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -17.625, "rewards/margins": 6.6875, "rewards/rejected": -24.25, "step": 3080 }, { "epoch": 0.22305637767992492, "grad_norm": 12.791800334112644, "learning_rate": 1.439165525922309e-06, "logits/chosen": 0.328125, "logits/rejected": 0.6484375, "logps/chosen": -452.0, "logps/rejected": -472.0, "loss": 0.1662, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -16.5, "rewards/margins": 5.875, "rewards/rejected": -22.375, "step": 3090 }, { "epoch": 0.22377824297985996, "grad_norm": 10.697198900355902, "learning_rate": 1.4368424162141992e-06, "logits/chosen": 0.0159912109375, "logits/rejected": 0.5859375, "logps/chosen": -446.0, "logps/rejected": -466.0, "loss": 0.1564, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -15.375, "rewards/margins": 6.625, "rewards/rejected": -22.0, "step": 3100 }, { "epoch": 0.224500108279795, "grad_norm": 10.869291708737759, "learning_rate": 1.434530520241665e-06, "logits/chosen": 0.06298828125, "logits/rejected": 0.5390625, "logps/chosen": -456.0, "logps/rejected": -488.0, "loss": 0.1766, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -16.25, "rewards/margins": 6.5625, "rewards/rejected": -22.875, "step": 3110 }, { "epoch": 0.22522197357973003, "grad_norm": 9.80013440421795, "learning_rate": 1.4322297480788657e-06, "logits/chosen": 0.25390625, "logits/rejected": 0.640625, "logps/chosen": -474.0, "logps/rejected": -512.0, "loss": 0.167, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -16.75, "rewards/margins": 7.4375, "rewards/rejected": -24.125, "step": 3120 }, { "epoch": 0.22594383887966504, "grad_norm": 8.383308066021378, "learning_rate": 1.4299400108063247e-06, "logits/chosen": 0.1611328125, "logits/rejected": 0.55078125, "logps/chosen": -446.0, "logps/rejected": -460.0, "loss": 0.1766, "rewards/accuracies": 0.9375, "rewards/chosen": -15.8125, "rewards/margins": 6.53125, "rewards/rejected": -22.375, "step": 3130 }, { "epoch": 0.22666570417960008, "grad_norm": 11.068422790846917, "learning_rate": 1.4276612204964992e-06, "logits/chosen": 0.294921875, "logits/rejected": 0.66015625, "logps/chosen": -430.0, "logps/rejected": -450.0, "loss": 0.184, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -15.4375, "rewards/margins": 5.3125, "rewards/rejected": -20.75, "step": 3140 }, { "epoch": 0.22738756947953512, "grad_norm": 11.317336773935823, "learning_rate": 1.4253932901995967e-06, "logits/chosen": 0.197265625, "logits/rejected": 0.7265625, "logps/chosen": -394.0, "logps/rejected": -432.0, "loss": 0.2263, "rewards/accuracies": 0.9375, "rewards/chosen": -13.5, "rewards/margins": 6.125, "rewards/rejected": -19.625, "step": 3150 }, { "epoch": 0.22810943477947015, "grad_norm": 9.318224767631092, "learning_rate": 1.42313613392964e-06, "logits/chosen": 0.18359375, "logits/rejected": 0.69921875, "logps/chosen": -406.0, "logps/rejected": -434.0, "loss": 0.1486, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -13.25, "rewards/margins": 6.9375, "rewards/rejected": -20.25, "step": 3160 }, { "epoch": 0.2288313000794052, "grad_norm": 10.297893356193828, "learning_rate": 1.4208896666507756e-06, "logits/chosen": 0.1259765625, "logits/rejected": 0.5390625, "logps/chosen": -406.0, "logps/rejected": -454.0, "loss": 0.1565, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -13.6875, "rewards/margins": 6.28125, "rewards/rejected": -20.0, "step": 3170 }, { "epoch": 0.2295531653793402, "grad_norm": 13.66499542838159, "learning_rate": 1.4186538042638173e-06, "logits/chosen": 0.232421875, "logits/rejected": 0.55078125, "logps/chosen": -450.0, "logps/rejected": -484.0, "loss": 0.1806, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -15.625, "rewards/margins": 6.875, "rewards/rejected": -22.5, "step": 3180 }, { "epoch": 0.23027503067927524, "grad_norm": 8.379785845370755, "learning_rate": 1.416428463593022e-06, "logits/chosen": 0.193359375, "logits/rejected": 0.5703125, "logps/chosen": -444.0, "logps/rejected": -472.0, "loss": 0.2071, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -14.625, "rewards/margins": 6.71875, "rewards/rejected": -21.375, "step": 3190 }, { "epoch": 0.23099689597921028, "grad_norm": 8.024197550793282, "learning_rate": 1.414213562373095e-06, "logits/chosen": 0.322265625, "logits/rejected": 0.51171875, "logps/chosen": -412.0, "logps/rejected": -450.0, "loss": 0.1974, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -14.0625, "rewards/margins": 6.125, "rewards/rejected": -20.125, "step": 3200 }, { "epoch": 0.2317187612791453, "grad_norm": 15.352706059597429, "learning_rate": 1.4120090192364154e-06, "logits/chosen": 0.12890625, "logits/rejected": 0.546875, "logps/chosen": -446.0, "logps/rejected": -462.0, "loss": 0.1734, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -15.0625, "rewards/margins": 6.125, "rewards/rejected": -21.125, "step": 3210 }, { "epoch": 0.23244062657908035, "grad_norm": 10.888403992538748, "learning_rate": 1.4098147537004828e-06, "logits/chosen": 0.169921875, "logits/rejected": 0.640625, "logps/chosen": -436.0, "logps/rejected": -472.0, "loss": 0.1569, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -15.1875, "rewards/margins": 7.03125, "rewards/rejected": -22.25, "step": 3220 }, { "epoch": 0.2331624918790154, "grad_norm": 11.04664988642541, "learning_rate": 1.4076306861555735e-06, "logits/chosen": 0.220703125, "logits/rejected": 0.58984375, "logps/chosen": -440.0, "logps/rejected": -484.0, "loss": 0.1688, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -15.5625, "rewards/margins": 7.1875, "rewards/rejected": -22.75, "step": 3230 }, { "epoch": 0.2338843571789504, "grad_norm": 11.621498763070004, "learning_rate": 1.405456737852613e-06, "logits/chosen": 0.1640625, "logits/rejected": 0.6640625, "logps/chosen": -418.0, "logps/rejected": -446.0, "loss": 0.1598, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -14.375, "rewards/margins": 6.15625, "rewards/rejected": -20.5, "step": 3240 }, { "epoch": 0.23460622247888543, "grad_norm": 8.848990953741367, "learning_rate": 1.4032928308912468e-06, "logits/chosen": 0.14453125, "logits/rejected": 0.7421875, "logps/chosen": -404.0, "logps/rejected": -436.0, "loss": 0.211, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -14.1875, "rewards/margins": 6.71875, "rewards/rejected": -20.875, "step": 3250 }, { "epoch": 0.23532808777882047, "grad_norm": 8.594152929860261, "learning_rate": 1.4011388882081175e-06, "logits/chosen": 0.251953125, "logits/rejected": 0.66015625, "logps/chosen": -412.0, "logps/rejected": -440.0, "loss": 0.1772, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -13.875, "rewards/margins": 5.84375, "rewards/rejected": -19.75, "step": 3260 }, { "epoch": 0.2360499530787555, "grad_norm": 8.962724357781346, "learning_rate": 1.3989948335653378e-06, "logits/chosen": 0.2412109375, "logits/rejected": 0.546875, "logps/chosen": -384.0, "logps/rejected": -440.0, "loss": 0.1393, "rewards/accuracies": 0.90625, "rewards/chosen": -14.1875, "rewards/margins": 5.625, "rewards/rejected": -19.875, "step": 3270 }, { "epoch": 0.23677181837869055, "grad_norm": 11.122940339943122, "learning_rate": 1.3968605915391564e-06, "logits/chosen": 0.2890625, "logits/rejected": 0.6953125, "logps/chosen": -432.0, "logps/rejected": -456.0, "loss": 0.1935, "rewards/accuracies": 0.9375, "rewards/chosen": -14.6875, "rewards/margins": 6.71875, "rewards/rejected": -21.375, "step": 3280 }, { "epoch": 0.23749368367862556, "grad_norm": 7.457179954744636, "learning_rate": 1.3947360875088132e-06, "logits/chosen": 0.10595703125, "logits/rejected": 0.59765625, "logps/chosen": -450.0, "logps/rejected": -466.0, "loss": 0.168, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -15.4375, "rewards/margins": 6.53125, "rewards/rejected": -22.0, "step": 3290 }, { "epoch": 0.2382155489785606, "grad_norm": 10.639798037149706, "learning_rate": 1.3926212476455828e-06, "logits/chosen": 0.08837890625, "logits/rejected": 0.70703125, "logps/chosen": -452.0, "logps/rejected": -472.0, "loss": 0.138, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -16.25, "rewards/margins": 7.0, "rewards/rejected": -23.25, "step": 3300 }, { "epoch": 0.23893741427849563, "grad_norm": 10.621151526497814, "learning_rate": 1.3905159989019964e-06, "logits/chosen": 0.2578125, "logits/rejected": 0.8671875, "logps/chosen": -434.0, "logps/rejected": -458.0, "loss": 0.1631, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -17.0, "rewards/margins": 6.6875, "rewards/rejected": -23.75, "step": 3310 }, { "epoch": 0.23965927957843067, "grad_norm": 9.274817744814136, "learning_rate": 1.3884202690012465e-06, "logits/chosen": 0.298828125, "logits/rejected": 0.703125, "logps/chosen": -430.0, "logps/rejected": -466.0, "loss": 0.1767, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -16.125, "rewards/margins": 6.3125, "rewards/rejected": -22.375, "step": 3320 }, { "epoch": 0.2403811448783657, "grad_norm": 8.410684127442419, "learning_rate": 1.3863339864267636e-06, "logits/chosen": 0.026611328125, "logits/rejected": 0.53125, "logps/chosen": -438.0, "logps/rejected": -448.0, "loss": 0.1869, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -14.125, "rewards/margins": 6.1875, "rewards/rejected": -20.25, "step": 3330 }, { "epoch": 0.24110301017830074, "grad_norm": 9.872142097864787, "learning_rate": 1.3842570804119655e-06, "logits/chosen": 0.1640625, "logits/rejected": 0.61328125, "logps/chosen": -422.0, "logps/rejected": -452.0, "loss": 0.1229, "rewards/accuracies": 0.9375, "rewards/chosen": -13.3125, "rewards/margins": 6.3125, "rewards/rejected": -19.625, "step": 3340 }, { "epoch": 0.24182487547823575, "grad_norm": 11.134401480520259, "learning_rate": 1.3821894809301763e-06, "logits/chosen": 0.12353515625, "logits/rejected": 0.6015625, "logps/chosen": -426.0, "logps/rejected": -436.0, "loss": 0.1516, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -13.75, "rewards/margins": 6.71875, "rewards/rejected": -20.375, "step": 3350 }, { "epoch": 0.2425467407781708, "grad_norm": 11.73523881606576, "learning_rate": 1.3801311186847081e-06, "logits/chosen": 0.1826171875, "logits/rejected": 0.734375, "logps/chosen": -420.0, "logps/rejected": -438.0, "loss": 0.1518, "rewards/accuracies": 0.9375, "rewards/chosen": -14.625, "rewards/margins": 7.0, "rewards/rejected": -21.625, "step": 3360 }, { "epoch": 0.24326860607810583, "grad_norm": 12.140123544161735, "learning_rate": 1.378081925099109e-06, "logits/chosen": 0.173828125, "logits/rejected": 0.61328125, "logps/chosen": -402.0, "logps/rejected": -440.0, "loss": 0.1579, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -13.25, "rewards/margins": 6.71875, "rewards/rejected": -20.0, "step": 3370 }, { "epoch": 0.24399047137804086, "grad_norm": 14.250214162436123, "learning_rate": 1.376041832307563e-06, "logits/chosen": 0.0771484375, "logits/rejected": 0.6171875, "logps/chosen": -424.0, "logps/rejected": -450.0, "loss": 0.1996, "rewards/accuracies": 0.90625, "rewards/chosen": -14.5, "rewards/margins": 6.1875, "rewards/rejected": -20.75, "step": 3380 }, { "epoch": 0.2447123366779759, "grad_norm": 8.798298243718165, "learning_rate": 1.3740107731454524e-06, "logits/chosen": 0.23046875, "logits/rejected": 0.64453125, "logps/chosen": -420.0, "logps/rejected": -462.0, "loss": 0.1758, "rewards/accuracies": 0.9375, "rewards/chosen": -14.6875, "rewards/margins": 6.9375, "rewards/rejected": -21.625, "step": 3390 }, { "epoch": 0.2454342019779109, "grad_norm": 10.307328324248399, "learning_rate": 1.3719886811400705e-06, "logits/chosen": 0.1318359375, "logits/rejected": 0.5703125, "logps/chosen": -416.0, "logps/rejected": -458.0, "loss": 0.1556, "rewards/accuracies": 0.90625, "rewards/chosen": -13.75, "rewards/margins": 6.5, "rewards/rejected": -20.25, "step": 3400 }, { "epoch": 0.24615606727784595, "grad_norm": 13.430279712511194, "learning_rate": 1.3699754905014834e-06, "logits/chosen": 0.22265625, "logits/rejected": 0.625, "logps/chosen": -432.0, "logps/rejected": -446.0, "loss": 0.1599, "rewards/accuracies": 0.9375, "rewards/chosen": -14.4375, "rewards/margins": 6.28125, "rewards/rejected": -20.75, "step": 3410 }, { "epoch": 0.24687793257778098, "grad_norm": 11.548590723173112, "learning_rate": 1.3679711361135388e-06, "logits/chosen": 0.2099609375, "logits/rejected": 0.5859375, "logps/chosen": -426.0, "logps/rejected": -464.0, "loss": 0.1665, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -15.3125, "rewards/margins": 6.625, "rewards/rejected": -21.875, "step": 3420 }, { "epoch": 0.24759979787771602, "grad_norm": 13.914743218458442, "learning_rate": 1.3659755535250212e-06, "logits/chosen": 0.185546875, "logits/rejected": 0.59765625, "logps/chosen": -436.0, "logps/rejected": -472.0, "loss": 0.1779, "rewards/accuracies": 0.9375, "rewards/chosen": -15.4375, "rewards/margins": 6.5, "rewards/rejected": -21.875, "step": 3430 }, { "epoch": 0.24832166317765106, "grad_norm": 10.479274748957614, "learning_rate": 1.3639886789409469e-06, "logits/chosen": 0.20703125, "logits/rejected": 0.5859375, "logps/chosen": -412.0, "logps/rejected": -452.0, "loss": 0.1605, "rewards/accuracies": 0.9375, "rewards/chosen": -14.625, "rewards/margins": 6.6875, "rewards/rejected": -21.25, "step": 3440 }, { "epoch": 0.2490435284775861, "grad_norm": 10.416842951408132, "learning_rate": 1.3620104492139977e-06, "logits/chosen": 0.002685546875, "logits/rejected": 0.482421875, "logps/chosen": -422.0, "logps/rejected": -444.0, "loss": 0.1684, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -13.4375, "rewards/margins": 6.5625, "rewards/rejected": -20.0, "step": 3450 }, { "epoch": 0.2497653937775211, "grad_norm": 11.109398413238043, "learning_rate": 1.3600408018360918e-06, "logits/chosen": -0.02978515625, "logits/rejected": 0.5234375, "logps/chosen": -418.0, "logps/rejected": -432.0, "loss": 0.1322, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -12.8125, "rewards/margins": 7.15625, "rewards/rejected": -20.0, "step": 3460 }, { "epoch": 0.25048725907745617, "grad_norm": 15.306419959524419, "learning_rate": 1.3580796749300878e-06, "logits/chosen": 0.09130859375, "logits/rejected": 0.48828125, "logps/chosen": -452.0, "logps/rejected": -488.0, "loss": 0.1955, "rewards/accuracies": 0.90625, "rewards/chosen": -16.125, "rewards/margins": 7.5, "rewards/rejected": -23.625, "step": 3470 }, { "epoch": 0.2512091243773912, "grad_norm": 13.190741957650921, "learning_rate": 1.3561270072416209e-06, "logits/chosen": 0.181640625, "logits/rejected": 0.44921875, "logps/chosen": -442.0, "logps/rejected": -466.0, "loss": 0.1954, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -15.875, "rewards/margins": 5.875, "rewards/rejected": -21.75, "step": 3480 }, { "epoch": 0.2519309896773262, "grad_norm": 5.888418725485683, "learning_rate": 1.3541827381310652e-06, "logits/chosen": 0.1259765625, "logits/rejected": 0.578125, "logps/chosen": -438.0, "logps/rejected": -468.0, "loss": 0.1582, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -15.1875, "rewards/margins": 7.15625, "rewards/rejected": -22.375, "step": 3490 }, { "epoch": 0.25265285497726125, "grad_norm": 6.039664926369914, "learning_rate": 1.3522468075656264e-06, "logits/chosen": 0.08251953125, "logits/rejected": 0.578125, "logps/chosen": -434.0, "logps/rejected": -472.0, "loss": 0.153, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -14.5625, "rewards/margins": 7.5625, "rewards/rejected": -22.125, "step": 3500 }, { "epoch": 0.25337472027719626, "grad_norm": 14.93829948652733, "learning_rate": 1.3503191561115553e-06, "logits/chosen": 0.1396484375, "logits/rejected": 0.66015625, "logps/chosen": -424.0, "logps/rejected": -444.0, "loss": 0.156, "rewards/accuracies": 0.9375, "rewards/chosen": -14.75, "rewards/margins": 6.09375, "rewards/rejected": -20.875, "step": 3510 }, { "epoch": 0.25409658557713133, "grad_norm": 14.017374396197571, "learning_rate": 1.348399724926484e-06, "logits/chosen": 0.07861328125, "logits/rejected": 0.416015625, "logps/chosen": -426.0, "logps/rejected": -458.0, "loss": 0.1416, "rewards/accuracies": 0.9375, "rewards/chosen": -14.625, "rewards/margins": 6.65625, "rewards/rejected": -21.25, "step": 3520 }, { "epoch": 0.25481845087706634, "grad_norm": 23.03232070896239, "learning_rate": 1.346488455751882e-06, "logits/chosen": 0.002410888671875, "logits/rejected": 0.51171875, "logps/chosen": -422.0, "logps/rejected": -462.0, "loss": 0.1639, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -13.875, "rewards/margins": 6.8125, "rewards/rejected": -20.75, "step": 3530 }, { "epoch": 0.25554031617700135, "grad_norm": 14.996210043414337, "learning_rate": 1.3445852909056286e-06, "logits/chosen": 0.04345703125, "logits/rejected": 0.427734375, "logps/chosen": -414.0, "logps/rejected": -434.0, "loss": 0.1928, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -12.875, "rewards/margins": 6.4375, "rewards/rejected": -19.375, "step": 3540 }, { "epoch": 0.2562621814769364, "grad_norm": 7.282390646918073, "learning_rate": 1.3426901732747024e-06, "logits/chosen": 0.18359375, "logits/rejected": 0.50390625, "logps/chosen": -388.0, "logps/rejected": -424.0, "loss": 0.165, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -11.5625, "rewards/margins": 6.40625, "rewards/rejected": -17.875, "step": 3550 }, { "epoch": 0.2569840467768714, "grad_norm": 8.274606095318536, "learning_rate": 1.3408030463079818e-06, "logits/chosen": 0.1357421875, "logits/rejected": 0.53515625, "logps/chosen": -436.0, "logps/rejected": -492.0, "loss": 0.1689, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -14.75, "rewards/margins": 6.46875, "rewards/rejected": -21.25, "step": 3560 }, { "epoch": 0.2577059120768065, "grad_norm": 8.363363762802987, "learning_rate": 1.3389238540091568e-06, "logits/chosen": 0.19140625, "logits/rejected": 0.5546875, "logps/chosen": -440.0, "logps/rejected": -488.0, "loss": 0.195, "rewards/accuracies": 0.90625, "rewards/chosen": -16.75, "rewards/margins": 6.9375, "rewards/rejected": -23.75, "step": 3570 }, { "epoch": 0.2584277773767415, "grad_norm": 10.730742313884443, "learning_rate": 1.337052540929751e-06, "logits/chosen": 0.064453125, "logits/rejected": 0.58203125, "logps/chosen": -468.0, "logps/rejected": -498.0, "loss": 0.1864, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -17.75, "rewards/margins": 7.15625, "rewards/rejected": -24.875, "step": 3580 }, { "epoch": 0.2591496426766765, "grad_norm": 7.4965081031301475, "learning_rate": 1.33518905216225e-06, "logits/chosen": 0.169921875, "logits/rejected": 0.671875, "logps/chosen": -450.0, "logps/rejected": -486.0, "loss": 0.1783, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -17.125, "rewards/margins": 6.375, "rewards/rejected": -23.5, "step": 3590 }, { "epoch": 0.25987150797661157, "grad_norm": 12.747735039515131, "learning_rate": 1.3333333333333332e-06, "logits/chosen": 0.1494140625, "logits/rejected": 0.6640625, "logps/chosen": -420.0, "logps/rejected": -448.0, "loss": 0.1789, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -14.1875, "rewards/margins": 6.40625, "rewards/rejected": -20.625, "step": 3600 }, { "epoch": 0.2605933732765466, "grad_norm": 7.546301337399981, "learning_rate": 1.3314853305972122e-06, "logits/chosen": 0.0859375, "logits/rejected": 0.6171875, "logps/chosen": -416.0, "logps/rejected": -436.0, "loss": 0.1467, "rewards/accuracies": 0.9375, "rewards/chosen": -13.1875, "rewards/margins": 6.4375, "rewards/rejected": -19.625, "step": 3610 }, { "epoch": 0.26131523857648165, "grad_norm": 7.261696407347176, "learning_rate": 1.3296449906290671e-06, "logits/chosen": 0.1201171875, "logits/rejected": 0.6171875, "logps/chosen": -432.0, "logps/rejected": -460.0, "loss": 0.173, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -14.9375, "rewards/margins": 6.875, "rewards/rejected": -21.75, "step": 3620 }, { "epoch": 0.26203710387641665, "grad_norm": 5.706909740285448, "learning_rate": 1.3278122606185844e-06, "logits/chosen": -0.05517578125, "logits/rejected": 0.64453125, "logps/chosen": -474.0, "logps/rejected": -500.0, "loss": 0.1666, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -17.5, "rewards/margins": 7.34375, "rewards/rejected": -24.875, "step": 3630 }, { "epoch": 0.2627589691763517, "grad_norm": 12.504943605439774, "learning_rate": 1.3259870882635918e-06, "logits/chosen": 0.162109375, "logits/rejected": 0.62109375, "logps/chosen": -434.0, "logps/rejected": -498.0, "loss": 0.1442, "rewards/accuracies": 0.96875, "rewards/chosen": -15.25, "rewards/margins": 7.03125, "rewards/rejected": -22.375, "step": 3640 }, { "epoch": 0.26348083447628673, "grad_norm": 11.277451040303212, "learning_rate": 1.3241694217637886e-06, "logits/chosen": 0.0311279296875, "logits/rejected": 0.5078125, "logps/chosen": -424.0, "logps/rejected": -472.0, "loss": 0.1718, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -15.5, "rewards/margins": 7.0, "rewards/rejected": -22.5, "step": 3650 }, { "epoch": 0.26420269977622174, "grad_norm": 8.977366853127899, "learning_rate": 1.3223592098145723e-06, "logits/chosen": 0.271484375, "logits/rejected": 0.6953125, "logps/chosen": -398.0, "logps/rejected": -436.0, "loss": 0.1535, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -14.5625, "rewards/margins": 6.5625, "rewards/rejected": -21.125, "step": 3660 }, { "epoch": 0.2649245650761568, "grad_norm": 16.811103495836914, "learning_rate": 1.3205564016009555e-06, "logits/chosen": 0.0185546875, "logits/rejected": 0.54296875, "logps/chosen": -416.0, "logps/rejected": -434.0, "loss": 0.1969, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -13.75, "rewards/margins": 7.28125, "rewards/rejected": -21.0, "step": 3670 }, { "epoch": 0.2656464303760918, "grad_norm": 13.7586357705908, "learning_rate": 1.318760946791574e-06, "logits/chosen": 0.07177734375, "logits/rejected": 0.5234375, "logps/chosen": -428.0, "logps/rejected": -452.0, "loss": 0.1999, "rewards/accuracies": 0.9375, "rewards/chosen": -15.1875, "rewards/margins": 6.5, "rewards/rejected": -21.625, "step": 3680 }, { "epoch": 0.2663682956760269, "grad_norm": 9.820353684999723, "learning_rate": 1.316972795532786e-06, "logits/chosen": 0.09765625, "logits/rejected": 0.671875, "logps/chosen": -406.0, "logps/rejected": -440.0, "loss": 0.1661, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -13.1875, "rewards/margins": 7.1875, "rewards/rejected": -20.375, "step": 3690 }, { "epoch": 0.2670901609759619, "grad_norm": 8.398502038545217, "learning_rate": 1.3151918984428582e-06, "logits/chosen": -0.0908203125, "logits/rejected": 0.4375, "logps/chosen": -404.0, "logps/rejected": -440.0, "loss": 0.1615, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -13.0625, "rewards/margins": 6.0625, "rewards/rejected": -19.125, "step": 3700 }, { "epoch": 0.2678120262758969, "grad_norm": 9.077706019886692, "learning_rate": 1.313418206606237e-06, "logits/chosen": -0.1669921875, "logits/rejected": 0.359375, "logps/chosen": -412.0, "logps/rejected": -452.0, "loss": 0.1581, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -14.125, "rewards/margins": 6.59375, "rewards/rejected": -20.75, "step": 3710 }, { "epoch": 0.26853389157583196, "grad_norm": 6.131804004421976, "learning_rate": 1.3116516715679057e-06, "logits/chosen": -0.01324462890625, "logits/rejected": 0.337890625, "logps/chosen": -390.0, "logps/rejected": -432.0, "loss": 0.1994, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -13.625, "rewards/margins": 6.0625, "rewards/rejected": -19.75, "step": 3720 }, { "epoch": 0.26925575687576697, "grad_norm": 7.387017274424251, "learning_rate": 1.3098922453278258e-06, "logits/chosen": -0.032958984375, "logits/rejected": 0.451171875, "logps/chosen": -436.0, "logps/rejected": -456.0, "loss": 0.1444, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -13.6875, "rewards/margins": 6.40625, "rewards/rejected": -20.0, "step": 3730 }, { "epoch": 0.26997762217570204, "grad_norm": 10.590666204427261, "learning_rate": 1.3081398803354573e-06, "logits/chosen": 0.0751953125, "logits/rejected": 0.53515625, "logps/chosen": -414.0, "logps/rejected": -430.0, "loss": 0.1371, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -13.0, "rewards/margins": 7.03125, "rewards/rejected": -20.0, "step": 3740 }, { "epoch": 0.27069948747563705, "grad_norm": 11.209195936576956, "learning_rate": 1.3063945294843617e-06, "logits/chosen": 0.05419921875, "logits/rejected": 0.5390625, "logps/chosen": -424.0, "logps/rejected": -460.0, "loss": 0.1349, "rewards/accuracies": 0.96875, "rewards/chosen": -14.25, "rewards/margins": 7.46875, "rewards/rejected": -21.75, "step": 3750 }, { "epoch": 0.27142135277557206, "grad_norm": 12.862969667445496, "learning_rate": 1.3046561461068843e-06, "logits/chosen": 0.09130859375, "logits/rejected": 0.47265625, "logps/chosen": -420.0, "logps/rejected": -460.0, "loss": 0.1392, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -14.4375, "rewards/margins": 7.25, "rewards/rejected": -21.625, "step": 3760 }, { "epoch": 0.2721432180755071, "grad_norm": 10.088842248676368, "learning_rate": 1.3029246839689124e-06, "logits/chosen": 0.1630859375, "logits/rejected": 0.49609375, "logps/chosen": -466.0, "logps/rejected": -480.0, "loss": 0.1562, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -16.5, "rewards/margins": 7.84375, "rewards/rejected": -24.375, "step": 3770 }, { "epoch": 0.27286508337544213, "grad_norm": 9.075968214866379, "learning_rate": 1.3012000972647109e-06, "logits/chosen": 0.005126953125, "logits/rejected": 0.4375, "logps/chosen": -446.0, "logps/rejected": -496.0, "loss": 0.1687, "rewards/accuracies": 0.9375, "rewards/chosen": -15.8125, "rewards/margins": 7.46875, "rewards/rejected": -23.25, "step": 3780 }, { "epoch": 0.2735869486753772, "grad_norm": 10.862455837196782, "learning_rate": 1.299482340611832e-06, "logits/chosen": 0.045166015625, "logits/rejected": 0.3984375, "logps/chosen": -432.0, "logps/rejected": -450.0, "loss": 0.1668, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -14.125, "rewards/margins": 6.65625, "rewards/rejected": -20.75, "step": 3790 }, { "epoch": 0.2743088139753122, "grad_norm": 9.041152712987431, "learning_rate": 1.2977713690461003e-06, "logits/chosen": 0.1923828125, "logits/rejected": 0.494140625, "logps/chosen": -420.0, "logps/rejected": -456.0, "loss": 0.133, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -13.6875, "rewards/margins": 6.46875, "rewards/rejected": -20.125, "step": 3800 }, { "epoch": 0.2750306792752472, "grad_norm": 8.487714145098858, "learning_rate": 1.296067138016669e-06, "logits/chosen": 0.0458984375, "logits/rejected": 0.5546875, "logps/chosen": -418.0, "logps/rejected": -420.0, "loss": 0.1432, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -13.0625, "rewards/margins": 6.40625, "rewards/rejected": -19.5, "step": 3810 }, { "epoch": 0.2757525445751823, "grad_norm": 7.923968010711494, "learning_rate": 1.294369603381147e-06, "logits/chosen": 0.0595703125, "logits/rejected": 0.5, "logps/chosen": -444.0, "logps/rejected": -476.0, "loss": 0.1532, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -15.9375, "rewards/margins": 7.59375, "rewards/rejected": -23.5, "step": 3820 }, { "epoch": 0.2764744098751173, "grad_norm": 9.866185439268147, "learning_rate": 1.2926787214007981e-06, "logits/chosen": 0.33203125, "logits/rejected": 0.65234375, "logps/chosen": -428.0, "logps/rejected": -452.0, "loss": 0.1358, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -15.5, "rewards/margins": 6.78125, "rewards/rejected": -22.25, "step": 3830 }, { "epoch": 0.27719627517505235, "grad_norm": 11.22158931137104, "learning_rate": 1.2909944487358056e-06, "logits/chosen": 0.12890625, "logits/rejected": 0.578125, "logps/chosen": -426.0, "logps/rejected": -474.0, "loss": 0.1863, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -15.75, "rewards/margins": 6.0625, "rewards/rejected": -21.75, "step": 3840 }, { "epoch": 0.27791814047498736, "grad_norm": 9.773616018959997, "learning_rate": 1.2893167424406084e-06, "logits/chosen": 0.26171875, "logits/rejected": 0.5703125, "logps/chosen": -428.0, "logps/rejected": -476.0, "loss": 0.1449, "rewards/accuracies": 0.9375, "rewards/chosen": -16.25, "rewards/margins": 7.03125, "rewards/rejected": -23.25, "step": 3850 }, { "epoch": 0.2786400057749224, "grad_norm": 8.845696672061733, "learning_rate": 1.2876455599593008e-06, "logits/chosen": 0.08251953125, "logits/rejected": 0.62109375, "logps/chosen": -412.0, "logps/rejected": -448.0, "loss": 0.1429, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -15.4375, "rewards/margins": 6.15625, "rewards/rejected": -21.625, "step": 3860 }, { "epoch": 0.27936187107485744, "grad_norm": 12.414370670608047, "learning_rate": 1.285980859121099e-06, "logits/chosen": 0.11767578125, "logits/rejected": 0.609375, "logps/chosen": -478.0, "logps/rejected": -498.0, "loss": 0.1813, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -16.875, "rewards/margins": 7.09375, "rewards/rejected": -24.0, "step": 3870 }, { "epoch": 0.28008373637479245, "grad_norm": 11.039218998719619, "learning_rate": 1.2843225981358712e-06, "logits/chosen": 0.0654296875, "logits/rejected": 0.6640625, "logps/chosen": -430.0, "logps/rejected": -456.0, "loss": 0.1495, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -16.0, "rewards/margins": 7.28125, "rewards/rejected": -23.375, "step": 3880 }, { "epoch": 0.2808056016747275, "grad_norm": 13.376262597651257, "learning_rate": 1.2826707355897317e-06, "logits/chosen": 0.193359375, "logits/rejected": 0.49609375, "logps/chosen": -426.0, "logps/rejected": -464.0, "loss": 0.225, "rewards/accuracies": 0.9375, "rewards/chosen": -15.3125, "rewards/margins": 6.96875, "rewards/rejected": -22.25, "step": 3890 }, { "epoch": 0.2815274669746625, "grad_norm": 6.031759687666924, "learning_rate": 1.281025230440697e-06, "logits/chosen": 0.0240478515625, "logits/rejected": 0.546875, "logps/chosen": -436.0, "logps/rejected": -474.0, "loss": 0.1303, "rewards/accuracies": 0.96875, "rewards/chosen": -15.5, "rewards/margins": 7.21875, "rewards/rejected": -22.75, "step": 3900 }, { "epoch": 0.2822493322745976, "grad_norm": 9.311470562673849, "learning_rate": 1.2793860420144025e-06, "logits/chosen": 0.1494140625, "logits/rejected": 0.6953125, "logps/chosen": -404.0, "logps/rejected": -454.0, "loss": 0.1544, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -15.375, "rewards/margins": 7.25, "rewards/rejected": -22.625, "step": 3910 }, { "epoch": 0.2829711975745326, "grad_norm": 14.172681819223003, "learning_rate": 1.2777531299998798e-06, "logits/chosen": 0.0019989013671875, "logits/rejected": 0.451171875, "logps/chosen": -444.0, "logps/rejected": -478.0, "loss": 0.1113, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -15.1875, "rewards/margins": 7.40625, "rewards/rejected": -22.5, "step": 3920 }, { "epoch": 0.2836930628744676, "grad_norm": 8.582963321127755, "learning_rate": 1.2761264544453928e-06, "logits/chosen": 0.05712890625, "logits/rejected": 0.58984375, "logps/chosen": -474.0, "logps/rejected": -512.0, "loss": 0.1669, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -18.375, "rewards/margins": 7.84375, "rewards/rejected": -26.25, "step": 3930 }, { "epoch": 0.28441492817440267, "grad_norm": 15.576992688736702, "learning_rate": 1.2745059757543324e-06, "logits/chosen": 0.11572265625, "logits/rejected": 0.470703125, "logps/chosen": -468.0, "logps/rejected": -510.0, "loss": 0.1514, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -17.5, "rewards/margins": 7.59375, "rewards/rejected": -25.125, "step": 3940 }, { "epoch": 0.2851367934743377, "grad_norm": 12.194994328373397, "learning_rate": 1.272891654681168e-06, "logits/chosen": 0.142578125, "logits/rejected": 0.44140625, "logps/chosen": -438.0, "logps/rejected": -520.0, "loss": 0.1607, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -16.0, "rewards/margins": 7.40625, "rewards/rejected": -23.5, "step": 3950 }, { "epoch": 0.28585865877427274, "grad_norm": 10.033570141015378, "learning_rate": 1.2712834523274563e-06, "logits/chosen": 0.091796875, "logits/rejected": 0.66015625, "logps/chosen": -432.0, "logps/rejected": -458.0, "loss": 0.1317, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -15.5, "rewards/margins": 7.09375, "rewards/rejected": -22.625, "step": 3960 }, { "epoch": 0.28658052407420775, "grad_norm": 6.100990665854184, "learning_rate": 1.2696813301379032e-06, "logits/chosen": 0.0244140625, "logits/rejected": 0.43359375, "logps/chosen": -482.0, "logps/rejected": -512.0, "loss": 0.1436, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -17.75, "rewards/margins": 7.15625, "rewards/rejected": -24.875, "step": 3970 }, { "epoch": 0.28730238937414276, "grad_norm": 10.661859400485692, "learning_rate": 1.2680852498964829e-06, "logits/chosen": 0.01239013671875, "logits/rejected": 0.63671875, "logps/chosen": -454.0, "logps/rejected": -496.0, "loss": 0.1388, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -16.5, "rewards/margins": 7.21875, "rewards/rejected": -23.75, "step": 3980 }, { "epoch": 0.28802425467407783, "grad_norm": 10.62571479387531, "learning_rate": 1.266495173722607e-06, "logits/chosen": 0.09423828125, "logits/rejected": 0.5390625, "logps/chosen": -422.0, "logps/rejected": -452.0, "loss": 0.1377, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -15.375, "rewards/margins": 7.09375, "rewards/rejected": -22.5, "step": 3990 }, { "epoch": 0.28874611997401284, "grad_norm": 11.673124788387879, "learning_rate": 1.2649110640673517e-06, "logits/chosen": 0.04541015625, "logits/rejected": 0.546875, "logps/chosen": -424.0, "logps/rejected": -468.0, "loss": 0.163, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -15.25, "rewards/margins": 7.1875, "rewards/rejected": -22.5, "step": 4000 }, { "epoch": 0.2894679852739479, "grad_norm": 13.124832576695715, "learning_rate": 1.2633328837097308e-06, "logits/chosen": 0.029296875, "logits/rejected": 0.61328125, "logps/chosen": -450.0, "logps/rejected": -464.0, "loss": 0.1642, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -15.9375, "rewards/margins": 6.875, "rewards/rejected": -22.875, "step": 4010 }, { "epoch": 0.2901898505738829, "grad_norm": 9.914838503417519, "learning_rate": 1.2617605957530233e-06, "logits/chosen": 0.208984375, "logits/rejected": 0.6875, "logps/chosen": -426.0, "logps/rejected": -450.0, "loss": 0.1564, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -14.4375, "rewards/margins": 7.34375, "rewards/rejected": -21.75, "step": 4020 }, { "epoch": 0.2909117158738179, "grad_norm": 12.358494115565545, "learning_rate": 1.2601941636211516e-06, "logits/chosen": 0.06298828125, "logits/rejected": 0.62109375, "logps/chosen": -414.0, "logps/rejected": -458.0, "loss": 0.1598, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -14.125, "rewards/margins": 7.21875, "rewards/rejected": -21.375, "step": 4030 }, { "epoch": 0.291633581173753, "grad_norm": 12.20316551767981, "learning_rate": 1.2586335510551052e-06, "logits/chosen": -0.05615234375, "logits/rejected": 0.53125, "logps/chosen": -432.0, "logps/rejected": -472.0, "loss": 0.1782, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -14.5, "rewards/margins": 6.65625, "rewards/rejected": -21.25, "step": 4040 }, { "epoch": 0.292355446473688, "grad_norm": 14.03807850545456, "learning_rate": 1.2570787221094177e-06, "logits/chosen": 0.07666015625, "logits/rejected": 0.52734375, "logps/chosen": -404.0, "logps/rejected": -444.0, "loss": 0.1858, "rewards/accuracies": 0.9375, "rewards/chosen": -14.625, "rewards/margins": 6.53125, "rewards/rejected": -21.25, "step": 4050 }, { "epoch": 0.29307731177362306, "grad_norm": 9.76885566440202, "learning_rate": 1.255529641148689e-06, "logits/chosen": 0.353515625, "logits/rejected": 0.498046875, "logps/chosen": -418.0, "logps/rejected": -458.0, "loss": 0.1421, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -14.875, "rewards/margins": 5.78125, "rewards/rejected": -20.625, "step": 4060 }, { "epoch": 0.29379917707355807, "grad_norm": 5.4912472500558716, "learning_rate": 1.2539862728441536e-06, "logits/chosen": 0.13671875, "logits/rejected": 0.609375, "logps/chosen": -436.0, "logps/rejected": -486.0, "loss": 0.1498, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -17.25, "rewards/margins": 7.3125, "rewards/rejected": -24.5, "step": 4070 }, { "epoch": 0.2945210423734931, "grad_norm": 11.491609490443375, "learning_rate": 1.252448582170299e-06, "logits/chosen": 0.08056640625, "logits/rejected": 0.59375, "logps/chosen": -460.0, "logps/rejected": -502.0, "loss": 0.1474, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -18.0, "rewards/margins": 8.625, "rewards/rejected": -26.625, "step": 4080 }, { "epoch": 0.29524290767342815, "grad_norm": 13.41566586972795, "learning_rate": 1.2509165344015243e-06, "logits/chosen": -0.01324462890625, "logits/rejected": 0.5390625, "logps/chosen": -450.0, "logps/rejected": -484.0, "loss": 0.1585, "rewards/accuracies": 0.9375, "rewards/chosen": -17.375, "rewards/margins": 7.3125, "rewards/rejected": -24.625, "step": 4090 }, { "epoch": 0.29596477297336315, "grad_norm": 11.762351732052611, "learning_rate": 1.2493900951088486e-06, "logits/chosen": 0.041259765625, "logits/rejected": 0.466796875, "logps/chosen": -436.0, "logps/rejected": -486.0, "loss": 0.1346, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -16.25, "rewards/margins": 8.5625, "rewards/rejected": -24.75, "step": 4100 }, { "epoch": 0.2966866382732982, "grad_norm": 13.24259404598737, "learning_rate": 1.2478692301566601e-06, "logits/chosen": 0.023681640625, "logits/rejected": 0.49609375, "logps/chosen": -456.0, "logps/rejected": -474.0, "loss": 0.1601, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -15.625, "rewards/margins": 7.4375, "rewards/rejected": -23.0, "step": 4110 }, { "epoch": 0.29740850357323323, "grad_norm": 9.877013136394865, "learning_rate": 1.2463539056995116e-06, "logits/chosen": 0.0869140625, "logits/rejected": 0.44921875, "logps/chosen": -438.0, "logps/rejected": -464.0, "loss": 0.1533, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -15.5625, "rewards/margins": 6.625, "rewards/rejected": -22.25, "step": 4120 }, { "epoch": 0.2981303688731683, "grad_norm": 7.949652565393485, "learning_rate": 1.2448440881789541e-06, "logits/chosen": 0.1455078125, "logits/rejected": 0.51171875, "logps/chosen": -472.0, "logps/rejected": -494.0, "loss": 0.1704, "rewards/accuracies": 0.96875, "rewards/chosen": -18.25, "rewards/margins": 6.78125, "rewards/rejected": -25.0, "step": 4130 }, { "epoch": 0.2988522341731033, "grad_norm": 13.521573312085025, "learning_rate": 1.2433397443204184e-06, "logits/chosen": 0.19140625, "logits/rejected": 0.59765625, "logps/chosen": -462.0, "logps/rejected": -486.0, "loss": 0.1322, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -18.5, "rewards/margins": 6.46875, "rewards/rejected": -24.875, "step": 4140 }, { "epoch": 0.2995740994730383, "grad_norm": 11.553072835028662, "learning_rate": 1.2418408411301324e-06, "logits/chosen": 0.0830078125, "logits/rejected": 0.625, "logps/chosen": -432.0, "logps/rejected": -496.0, "loss": 0.1644, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -15.9375, "rewards/margins": 7.375, "rewards/rejected": -23.375, "step": 4150 }, { "epoch": 0.3002959647729734, "grad_norm": 9.802506538512336, "learning_rate": 1.2403473458920844e-06, "logits/chosen": 0.00494384765625, "logits/rejected": 0.5234375, "logps/chosen": -458.0, "logps/rejected": -484.0, "loss": 0.1906, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -14.8125, "rewards/margins": 7.40625, "rewards/rejected": -22.25, "step": 4160 }, { "epoch": 0.3010178300729084, "grad_norm": 6.945724296418889, "learning_rate": 1.2388592261650217e-06, "logits/chosen": -0.061767578125, "logits/rejected": 0.375, "logps/chosen": -436.0, "logps/rejected": -476.0, "loss": 0.1793, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -14.875, "rewards/margins": 7.3125, "rewards/rejected": -22.25, "step": 4170 }, { "epoch": 0.30173969537284345, "grad_norm": 13.110587965303756, "learning_rate": 1.2373764497794918e-06, "logits/chosen": 0.11376953125, "logits/rejected": 0.5546875, "logps/chosen": -432.0, "logps/rejected": -474.0, "loss": 0.1475, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -15.8125, "rewards/margins": 6.3125, "rewards/rejected": -22.125, "step": 4180 }, { "epoch": 0.30246156067277846, "grad_norm": 11.869421003575003, "learning_rate": 1.2358989848349217e-06, "logits/chosen": 0.10888671875, "logits/rejected": 0.4921875, "logps/chosen": -448.0, "logps/rejected": -464.0, "loss": 0.1871, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -15.25, "rewards/margins": 6.65625, "rewards/rejected": -21.875, "step": 4190 }, { "epoch": 0.30318342597271347, "grad_norm": 9.842196081596166, "learning_rate": 1.2344267996967353e-06, "logits/chosen": 0.181640625, "logits/rejected": 0.5078125, "logps/chosen": -414.0, "logps/rejected": -460.0, "loss": 0.1509, "rewards/accuracies": 0.9375, "rewards/chosen": -14.3125, "rewards/margins": 6.5625, "rewards/rejected": -20.875, "step": 4200 }, { "epoch": 0.30390529127264854, "grad_norm": 9.047608397457779, "learning_rate": 1.2329598629935076e-06, "logits/chosen": 0.162109375, "logits/rejected": 0.57421875, "logps/chosen": -408.0, "logps/rejected": -442.0, "loss": 0.1541, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -13.9375, "rewards/margins": 6.59375, "rewards/rejected": -20.5, "step": 4210 }, { "epoch": 0.30462715657258355, "grad_norm": 10.929670838399204, "learning_rate": 1.2314981436141583e-06, "logits/chosen": 0.185546875, "logits/rejected": 0.57421875, "logps/chosen": -400.0, "logps/rejected": -460.0, "loss": 0.1681, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -15.0625, "rewards/margins": 7.125, "rewards/rejected": -22.125, "step": 4220 }, { "epoch": 0.3053490218725186, "grad_norm": 9.703768153409133, "learning_rate": 1.2300416107051802e-06, "logits/chosen": 0.2041015625, "logits/rejected": 0.62109375, "logps/chosen": -420.0, "logps/rejected": -448.0, "loss": 0.1465, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -14.1875, "rewards/margins": 6.90625, "rewards/rejected": -21.125, "step": 4230 }, { "epoch": 0.3060708871724536, "grad_norm": 7.625902291654867, "learning_rate": 1.2285902336679024e-06, "logits/chosen": 0.06396484375, "logits/rejected": 0.46484375, "logps/chosen": -410.0, "logps/rejected": -434.0, "loss": 0.167, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -13.9375, "rewards/margins": 6.0, "rewards/rejected": -20.0, "step": 4240 }, { "epoch": 0.30679275247238863, "grad_norm": 4.212683351078669, "learning_rate": 1.2271439821557926e-06, "logits/chosen": 0.1435546875, "logits/rejected": 0.490234375, "logps/chosen": -408.0, "logps/rejected": -446.0, "loss": 0.1306, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -14.0625, "rewards/margins": 6.25, "rewards/rejected": -20.375, "step": 4250 }, { "epoch": 0.3075146177723237, "grad_norm": 8.265386639684317, "learning_rate": 1.225702826071791e-06, "logits/chosen": -0.0791015625, "logits/rejected": 0.5, "logps/chosen": -432.0, "logps/rejected": -470.0, "loss": 0.1352, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -15.3125, "rewards/margins": 7.625, "rewards/rejected": -22.875, "step": 4260 }, { "epoch": 0.3082364830722587, "grad_norm": 9.657307858477607, "learning_rate": 1.2242667355656797e-06, "logits/chosen": 0.0177001953125, "logits/rejected": 0.59375, "logps/chosen": -422.0, "logps/rejected": -460.0, "loss": 0.1722, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -15.0, "rewards/margins": 7.21875, "rewards/rejected": -22.25, "step": 4270 }, { "epoch": 0.30895834837219377, "grad_norm": 8.504105633790308, "learning_rate": 1.2228356810314862e-06, "logits/chosen": -0.103515625, "logits/rejected": 0.40625, "logps/chosen": -416.0, "logps/rejected": -452.0, "loss": 0.1671, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -12.6875, "rewards/margins": 7.28125, "rewards/rejected": -20.0, "step": 4280 }, { "epoch": 0.3096802136721288, "grad_norm": 9.907681132546637, "learning_rate": 1.2214096331049186e-06, "logits/chosen": -0.0244140625, "logits/rejected": 0.384765625, "logps/chosen": -382.0, "logps/rejected": -430.0, "loss": 0.1532, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -12.3125, "rewards/margins": 7.0, "rewards/rejected": -19.375, "step": 4290 }, { "epoch": 0.3104020789720638, "grad_norm": 13.481582098205285, "learning_rate": 1.2199885626608373e-06, "logits/chosen": 0.171875, "logits/rejected": 0.50390625, "logps/chosen": -414.0, "logps/rejected": -436.0, "loss": 0.1752, "rewards/accuracies": 0.90625, "rewards/chosen": -13.5, "rewards/margins": 6.375, "rewards/rejected": -19.875, "step": 4300 }, { "epoch": 0.31112394427199885, "grad_norm": 9.441502288302814, "learning_rate": 1.2185724408107546e-06, "logits/chosen": -0.02783203125, "logits/rejected": 0.50390625, "logps/chosen": -428.0, "logps/rejected": -442.0, "loss": 0.1693, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -14.125, "rewards/margins": 6.71875, "rewards/rejected": -20.875, "step": 4310 }, { "epoch": 0.31184580957193386, "grad_norm": 8.92815233860186, "learning_rate": 1.2171612389003689e-06, "logits/chosen": -0.014404296875, "logits/rejected": 0.51171875, "logps/chosen": -438.0, "logps/rejected": -458.0, "loss": 0.1526, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -15.875, "rewards/margins": 7.0625, "rewards/rejected": -22.875, "step": 4320 }, { "epoch": 0.31256767487186893, "grad_norm": 8.27719332439587, "learning_rate": 1.2157549285071297e-06, "logits/chosen": 0.06103515625, "logits/rejected": 0.51953125, "logps/chosen": -446.0, "logps/rejected": -486.0, "loss": 0.1409, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -15.6875, "rewards/margins": 7.53125, "rewards/rejected": -23.125, "step": 4330 }, { "epoch": 0.31328954017180394, "grad_norm": 8.744468204949463, "learning_rate": 1.2143534814378327e-06, "logits/chosen": -0.037109375, "logits/rejected": 0.337890625, "logps/chosen": -442.0, "logps/rejected": -492.0, "loss": 0.1471, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -15.8125, "rewards/margins": 6.875, "rewards/rejected": -22.75, "step": 4340 }, { "epoch": 0.31401140547173895, "grad_norm": 10.821830717281609, "learning_rate": 1.2129568697262454e-06, "logits/chosen": 0.003326416015625, "logits/rejected": 0.466796875, "logps/chosen": -414.0, "logps/rejected": -436.0, "loss": 0.126, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -13.625, "rewards/margins": 6.59375, "rewards/rejected": -20.25, "step": 4350 }, { "epoch": 0.314733270771674, "grad_norm": 9.835185132036324, "learning_rate": 1.2115650656307653e-06, "logits/chosen": 0.1240234375, "logits/rejected": 0.578125, "logps/chosen": -424.0, "logps/rejected": -458.0, "loss": 0.1356, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -13.875, "rewards/margins": 6.65625, "rewards/rejected": -20.5, "step": 4360 }, { "epoch": 0.315455136071609, "grad_norm": 8.98182039414085, "learning_rate": 1.210178041632103e-06, "logits/chosen": -0.0255126953125, "logits/rejected": 0.447265625, "logps/chosen": -418.0, "logps/rejected": -452.0, "loss": 0.1578, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -13.5625, "rewards/margins": 7.0, "rewards/rejected": -20.625, "step": 4370 }, { "epoch": 0.3161770013715441, "grad_norm": 7.026394060610229, "learning_rate": 1.2087957704309988e-06, "logits/chosen": 0.173828125, "logits/rejected": 0.56640625, "logps/chosen": -400.0, "logps/rejected": -436.0, "loss": 0.1332, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -14.0, "rewards/margins": 7.09375, "rewards/rejected": -21.125, "step": 4380 }, { "epoch": 0.3168988666714791, "grad_norm": 10.898164072087859, "learning_rate": 1.2074182249459642e-06, "logits/chosen": 0.142578125, "logits/rejected": 0.423828125, "logps/chosen": -420.0, "logps/rejected": -456.0, "loss": 0.1351, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -14.375, "rewards/margins": 6.40625, "rewards/rejected": -20.75, "step": 4390 }, { "epoch": 0.31762073197141416, "grad_norm": 10.620867714481594, "learning_rate": 1.2060453783110545e-06, "logits/chosen": 0.091796875, "logits/rejected": 0.54296875, "logps/chosen": -412.0, "logps/rejected": -440.0, "loss": 0.1691, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -13.3125, "rewards/margins": 7.15625, "rewards/rejected": -20.5, "step": 4400 }, { "epoch": 0.31834259727134917, "grad_norm": 8.222067087866256, "learning_rate": 1.2046772038736682e-06, "logits/chosen": 0.052734375, "logits/rejected": 0.4140625, "logps/chosen": -422.0, "logps/rejected": -460.0, "loss": 0.1474, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -14.375, "rewards/margins": 6.84375, "rewards/rejected": -21.25, "step": 4410 }, { "epoch": 0.3190644625712842, "grad_norm": 10.160708477411525, "learning_rate": 1.2033136751923736e-06, "logits/chosen": 0.011962890625, "logits/rejected": 0.423828125, "logps/chosen": -414.0, "logps/rejected": -476.0, "loss": 0.1669, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -13.0625, "rewards/margins": 7.3125, "rewards/rejected": -20.375, "step": 4420 }, { "epoch": 0.31978632787121924, "grad_norm": 8.784187435713232, "learning_rate": 1.201954766034762e-06, "logits/chosen": -0.0634765625, "logits/rejected": 0.404296875, "logps/chosen": -428.0, "logps/rejected": -434.0, "loss": 0.1615, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -12.3125, "rewards/margins": 7.28125, "rewards/rejected": -19.625, "step": 4430 }, { "epoch": 0.32050819317115425, "grad_norm": 11.105600406434885, "learning_rate": 1.2006004503753285e-06, "logits/chosen": 0.09765625, "logits/rejected": 0.48046875, "logps/chosen": -422.0, "logps/rejected": -442.0, "loss": 0.1475, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -13.25, "rewards/margins": 6.5625, "rewards/rejected": -19.875, "step": 4440 }, { "epoch": 0.3212300584710893, "grad_norm": 10.495284167056672, "learning_rate": 1.1992507023933782e-06, "logits/chosen": 0.1435546875, "logits/rejected": 0.470703125, "logps/chosen": -418.0, "logps/rejected": -436.0, "loss": 0.1472, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -12.6875, "rewards/margins": 6.84375, "rewards/rejected": -19.5, "step": 4450 }, { "epoch": 0.32195192377102433, "grad_norm": 12.73172899925232, "learning_rate": 1.1979054964709597e-06, "logits/chosen": 0.0179443359375, "logits/rejected": 0.5390625, "logps/chosen": -414.0, "logps/rejected": -416.0, "loss": 0.1181, "rewards/accuracies": 0.96875, "rewards/chosen": -12.0, "rewards/margins": 6.90625, "rewards/rejected": -18.875, "step": 4460 }, { "epoch": 0.32267378907095934, "grad_norm": 11.279886387868295, "learning_rate": 1.1965648071908207e-06, "logits/chosen": -0.07470703125, "logits/rejected": 0.353515625, "logps/chosen": -442.0, "logps/rejected": -462.0, "loss": 0.1277, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -15.0, "rewards/margins": 7.125, "rewards/rejected": -22.125, "step": 4470 }, { "epoch": 0.3233956543708944, "grad_norm": 8.778140467922572, "learning_rate": 1.1952286093343935e-06, "logits/chosen": -0.04931640625, "logits/rejected": 0.5, "logps/chosen": -416.0, "logps/rejected": -456.0, "loss": 0.1411, "rewards/accuracies": 0.9375, "rewards/chosen": -12.9375, "rewards/margins": 7.0, "rewards/rejected": -20.0, "step": 4480 }, { "epoch": 0.3241175196708294, "grad_norm": 13.435638195545799, "learning_rate": 1.1938968778798005e-06, "logits/chosen": 0.0537109375, "logits/rejected": 0.46484375, "logps/chosen": -436.0, "logps/rejected": -456.0, "loss": 0.1759, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -14.625, "rewards/margins": 7.15625, "rewards/rejected": -21.75, "step": 4490 }, { "epoch": 0.3248393849707645, "grad_norm": 10.006094090046073, "learning_rate": 1.1925695879998878e-06, "logits/chosen": 0.037841796875, "logits/rejected": 0.37890625, "logps/chosen": -444.0, "logps/rejected": -474.0, "loss": 0.1696, "rewards/accuracies": 0.9375, "rewards/chosen": -14.5625, "rewards/margins": 7.59375, "rewards/rejected": -22.125, "step": 4500 }, { "epoch": 0.3255612502706995, "grad_norm": 8.125828824269984, "learning_rate": 1.1912467150602794e-06, "logits/chosen": -0.0634765625, "logits/rejected": 0.54296875, "logps/chosen": -428.0, "logps/rejected": -456.0, "loss": 0.144, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -15.75, "rewards/margins": 7.21875, "rewards/rejected": -23.0, "step": 4510 }, { "epoch": 0.3262831155706345, "grad_norm": 11.768418668731188, "learning_rate": 1.189928234617459e-06, "logits/chosen": 0.234375, "logits/rejected": 0.458984375, "logps/chosen": -418.0, "logps/rejected": -468.0, "loss": 0.1411, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -14.75, "rewards/margins": 7.40625, "rewards/rejected": -22.125, "step": 4520 }, { "epoch": 0.32700498087056956, "grad_norm": 12.683868698856102, "learning_rate": 1.1886141224168716e-06, "logits/chosen": 0.0830078125, "logits/rejected": 0.33984375, "logps/chosen": -426.0, "logps/rejected": -448.0, "loss": 0.1402, "rewards/accuracies": 0.90625, "rewards/chosen": -14.5, "rewards/margins": 6.125, "rewards/rejected": -20.625, "step": 4530 }, { "epoch": 0.32772684617050457, "grad_norm": 6.666273635895802, "learning_rate": 1.1873043543910495e-06, "logits/chosen": -0.01031494140625, "logits/rejected": 0.357421875, "logps/chosen": -412.0, "logps/rejected": -466.0, "loss": 0.1444, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -13.1875, "rewards/margins": 6.8125, "rewards/rejected": -20.0, "step": 4540 }, { "epoch": 0.32844871147043964, "grad_norm": 9.591897199545985, "learning_rate": 1.1859989066577617e-06, "logits/chosen": 0.0751953125, "logits/rejected": 0.4453125, "logps/chosen": -412.0, "logps/rejected": -456.0, "loss": 0.2029, "rewards/accuracies": 0.96875, "rewards/chosen": -14.5, "rewards/margins": 7.3125, "rewards/rejected": -21.875, "step": 4550 }, { "epoch": 0.32917057677037465, "grad_norm": 5.536218504264083, "learning_rate": 1.1846977555181846e-06, "logits/chosen": 0.2080078125, "logits/rejected": 0.55078125, "logps/chosen": -418.0, "logps/rejected": -458.0, "loss": 0.1434, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -15.6875, "rewards/margins": 6.75, "rewards/rejected": -22.375, "step": 4560 }, { "epoch": 0.32989244207030965, "grad_norm": 8.941013570350204, "learning_rate": 1.1834008774550946e-06, "logits/chosen": 0.1650390625, "logits/rejected": 0.49609375, "logps/chosen": -422.0, "logps/rejected": -432.0, "loss": 0.1639, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -14.5, "rewards/margins": 6.25, "rewards/rejected": -20.75, "step": 4570 }, { "epoch": 0.3306143073702447, "grad_norm": 14.303724625446705, "learning_rate": 1.1821082491310835e-06, "logits/chosen": 0.01141357421875, "logits/rejected": 0.404296875, "logps/chosen": -428.0, "logps/rejected": -440.0, "loss": 0.1559, "rewards/accuracies": 0.9375, "rewards/chosen": -13.9375, "rewards/margins": 6.5625, "rewards/rejected": -20.5, "step": 4580 }, { "epoch": 0.33133617267017973, "grad_norm": 7.264841331655194, "learning_rate": 1.1808198473867937e-06, "logits/chosen": 0.0269775390625, "logits/rejected": 0.419921875, "logps/chosen": -408.0, "logps/rejected": -436.0, "loss": 0.144, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -13.25, "rewards/margins": 6.40625, "rewards/rejected": -19.625, "step": 4590 }, { "epoch": 0.3320580379701148, "grad_norm": 9.732300760523689, "learning_rate": 1.179535649239177e-06, "logits/chosen": 0.048828125, "logits/rejected": 0.44921875, "logps/chosen": -404.0, "logps/rejected": -430.0, "loss": 0.1437, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -13.625, "rewards/margins": 6.6875, "rewards/rejected": -20.25, "step": 4600 }, { "epoch": 0.3327799032700498, "grad_norm": 12.028473355754437, "learning_rate": 1.178255631879771e-06, "logits/chosen": -0.10205078125, "logits/rejected": 0.35546875, "logps/chosen": -402.0, "logps/rejected": -446.0, "loss": 0.171, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -12.75, "rewards/margins": 6.84375, "rewards/rejected": -19.5, "step": 4610 }, { "epoch": 0.3335017685699848, "grad_norm": 10.674405645321041, "learning_rate": 1.1769797726729992e-06, "logits/chosen": 0.0556640625, "logits/rejected": 0.388671875, "logps/chosen": -394.0, "logps/rejected": -418.0, "loss": 0.1543, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -11.25, "rewards/margins": 7.15625, "rewards/rejected": -18.375, "step": 4620 }, { "epoch": 0.3342236338699199, "grad_norm": 14.982042355168739, "learning_rate": 1.1757080491544881e-06, "logits/chosen": 0.049560546875, "logits/rejected": 0.390625, "logps/chosen": -388.0, "logps/rejected": -420.0, "loss": 0.1581, "rewards/accuracies": 0.9375, "rewards/chosen": -11.75, "rewards/margins": 6.28125, "rewards/rejected": -18.0, "step": 4630 }, { "epoch": 0.3349454991698549, "grad_norm": 8.214668253868481, "learning_rate": 1.1744404390294068e-06, "logits/chosen": 0.08154296875, "logits/rejected": 0.4375, "logps/chosen": -410.0, "logps/rejected": -454.0, "loss": 0.1638, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -13.8125, "rewards/margins": 7.25, "rewards/rejected": -21.125, "step": 4640 }, { "epoch": 0.33566736446978995, "grad_norm": 8.53560026743475, "learning_rate": 1.1731769201708264e-06, "logits/chosen": 0.0341796875, "logits/rejected": 0.37109375, "logps/chosen": -446.0, "logps/rejected": -458.0, "loss": 0.1942, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -13.8125, "rewards/margins": 6.9375, "rewards/rejected": -20.75, "step": 4650 }, { "epoch": 0.33638922976972496, "grad_norm": 8.605837737962405, "learning_rate": 1.1719174706180952e-06, "logits/chosen": 0.11474609375, "logits/rejected": 0.392578125, "logps/chosen": -386.0, "logps/rejected": -444.0, "loss": 0.1537, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -12.8125, "rewards/margins": 6.25, "rewards/rejected": -19.0, "step": 4660 }, { "epoch": 0.33711109506966, "grad_norm": 7.059335392549087, "learning_rate": 1.1706620685752386e-06, "logits/chosen": 0.040283203125, "logits/rejected": 0.41015625, "logps/chosen": -390.0, "logps/rejected": -436.0, "loss": 0.141, "rewards/accuracies": 0.96875, "rewards/chosen": -12.375, "rewards/margins": 7.09375, "rewards/rejected": -19.5, "step": 4670 }, { "epoch": 0.33783296036959504, "grad_norm": 7.6957513031269675, "learning_rate": 1.1694106924093723e-06, "logits/chosen": 0.015625, "logits/rejected": 0.404296875, "logps/chosen": -410.0, "logps/rejected": -434.0, "loss": 0.1588, "rewards/accuracies": 0.9375, "rewards/chosen": -13.8125, "rewards/margins": 6.5, "rewards/rejected": -20.25, "step": 4680 }, { "epoch": 0.33855482566953005, "grad_norm": 10.420792536625791, "learning_rate": 1.1681633206491381e-06, "logits/chosen": 0.095703125, "logits/rejected": 0.462890625, "logps/chosen": -418.0, "logps/rejected": -466.0, "loss": 0.1211, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -15.3125, "rewards/margins": 6.84375, "rewards/rejected": -22.125, "step": 4690 }, { "epoch": 0.3392766909694651, "grad_norm": 15.061375284439185, "learning_rate": 1.1669199319831564e-06, "logits/chosen": 0.0235595703125, "logits/rejected": 0.54296875, "logps/chosen": -428.0, "logps/rejected": -432.0, "loss": 0.169, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -14.375, "rewards/margins": 6.8125, "rewards/rejected": -21.25, "step": 4700 }, { "epoch": 0.3399985562694001, "grad_norm": 11.308931467631304, "learning_rate": 1.1656805052584958e-06, "logits/chosen": -0.0167236328125, "logits/rejected": 0.392578125, "logps/chosen": -434.0, "logps/rejected": -482.0, "loss": 0.1474, "rewards/accuracies": 0.9375, "rewards/chosen": -14.625, "rewards/margins": 8.1875, "rewards/rejected": -22.875, "step": 4710 }, { "epoch": 0.3407204215693352, "grad_norm": 9.482416335711111, "learning_rate": 1.164445019479164e-06, "logits/chosen": 0.0703125, "logits/rejected": 0.4375, "logps/chosen": -416.0, "logps/rejected": -442.0, "loss": 0.1459, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -13.875, "rewards/margins": 6.125, "rewards/rejected": -20.0, "step": 4720 }, { "epoch": 0.3414422868692702, "grad_norm": 9.99200105367312, "learning_rate": 1.1632134538046105e-06, "logits/chosen": -0.1669921875, "logits/rejected": 0.39453125, "logps/chosen": -426.0, "logps/rejected": -454.0, "loss": 0.1401, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -13.5, "rewards/margins": 7.84375, "rewards/rejected": -21.375, "step": 4730 }, { "epoch": 0.3421641521692052, "grad_norm": 10.381949160350848, "learning_rate": 1.1619857875482536e-06, "logits/chosen": 0.0947265625, "logits/rejected": 0.466796875, "logps/chosen": -398.0, "logps/rejected": -458.0, "loss": 0.1658, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -12.375, "rewards/margins": 7.28125, "rewards/rejected": -19.75, "step": 4740 }, { "epoch": 0.34288601746914027, "grad_norm": 11.148269602629368, "learning_rate": 1.1607620001760185e-06, "logits/chosen": 0.053955078125, "logits/rejected": 0.416015625, "logps/chosen": -404.0, "logps/rejected": -424.0, "loss": 0.1413, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -12.4375, "rewards/margins": 7.09375, "rewards/rejected": -19.5, "step": 4750 }, { "epoch": 0.3436078827690753, "grad_norm": 10.967961305068238, "learning_rate": 1.1595420713048968e-06, "logits/chosen": 0.197265625, "logits/rejected": 0.55859375, "logps/chosen": -400.0, "logps/rejected": -442.0, "loss": 0.1582, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -14.5625, "rewards/margins": 6.21875, "rewards/rejected": -20.75, "step": 4760 }, { "epoch": 0.34432974806901034, "grad_norm": 7.918978302594898, "learning_rate": 1.1583259807015182e-06, "logits/chosen": -0.0556640625, "logits/rejected": 0.322265625, "logps/chosen": -412.0, "logps/rejected": -456.0, "loss": 0.1242, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -14.5, "rewards/margins": 6.96875, "rewards/rejected": -21.5, "step": 4770 }, { "epoch": 0.34505161336894535, "grad_norm": 12.317276913458779, "learning_rate": 1.1571137082807434e-06, "logits/chosen": 0.1103515625, "logits/rejected": 0.49609375, "logps/chosen": -440.0, "logps/rejected": -460.0, "loss": 0.1399, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -14.125, "rewards/margins": 7.09375, "rewards/rejected": -21.25, "step": 4780 }, { "epoch": 0.34577347866888036, "grad_norm": 12.520852416416862, "learning_rate": 1.155905234104269e-06, "logits/chosen": 0.197265625, "logits/rejected": 0.43359375, "logps/chosen": -436.0, "logps/rejected": -462.0, "loss": 0.1236, "rewards/accuracies": 0.9375, "rewards/chosen": -14.25, "rewards/margins": 7.5, "rewards/rejected": -21.75, "step": 4790 }, { "epoch": 0.34649534396881543, "grad_norm": 10.360951925984837, "learning_rate": 1.1547005383792516e-06, "logits/chosen": 0.0791015625, "logits/rejected": 0.4453125, "logps/chosen": -422.0, "logps/rejected": -466.0, "loss": 0.1446, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -13.75, "rewards/margins": 7.96875, "rewards/rejected": -21.75, "step": 4800 }, { "epoch": 0.34721720926875044, "grad_norm": 15.506608792532662, "learning_rate": 1.1534996014569446e-06, "logits/chosen": -0.0947265625, "logits/rejected": 0.3203125, "logps/chosen": -426.0, "logps/rejected": -456.0, "loss": 0.169, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -14.375, "rewards/margins": 7.5625, "rewards/rejected": -22.0, "step": 4810 }, { "epoch": 0.3479390745686855, "grad_norm": 10.77682337233514, "learning_rate": 1.1523024038313547e-06, "logits/chosen": 0.08203125, "logits/rejected": 0.484375, "logps/chosen": -416.0, "logps/rejected": -448.0, "loss": 0.1392, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -13.5, "rewards/margins": 7.15625, "rewards/rejected": -20.625, "step": 4820 }, { "epoch": 0.3486609398686205, "grad_norm": 7.610635882029855, "learning_rate": 1.1511089261379083e-06, "logits/chosen": -0.10107421875, "logits/rejected": 0.259765625, "logps/chosen": -426.0, "logps/rejected": -440.0, "loss": 0.1482, "rewards/accuracies": 0.9375, "rewards/chosen": -12.5, "rewards/margins": 6.71875, "rewards/rejected": -19.25, "step": 4830 }, { "epoch": 0.3493828051685555, "grad_norm": 7.934145227397568, "learning_rate": 1.149919149152138e-06, "logits/chosen": -0.01361083984375, "logits/rejected": 0.38671875, "logps/chosen": -404.0, "logps/rejected": -444.0, "loss": 0.1401, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -12.1875, "rewards/margins": 7.5625, "rewards/rejected": -19.75, "step": 4840 }, { "epoch": 0.3501046704684906, "grad_norm": 9.935833681658748, "learning_rate": 1.148733053788381e-06, "logits/chosen": 0.0230712890625, "logits/rejected": 0.4375, "logps/chosen": -416.0, "logps/rejected": -464.0, "loss": 0.1408, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -14.5625, "rewards/margins": 7.46875, "rewards/rejected": -22.0, "step": 4850 }, { "epoch": 0.3508265357684256, "grad_norm": 7.576340545486862, "learning_rate": 1.1475506210984938e-06, "logits/chosen": 0.107421875, "logits/rejected": 0.5625, "logps/chosen": -426.0, "logps/rejected": -472.0, "loss": 0.151, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -16.125, "rewards/margins": 7.15625, "rewards/rejected": -23.25, "step": 4860 }, { "epoch": 0.35154840106836066, "grad_norm": 9.75294449425741, "learning_rate": 1.1463718322705807e-06, "logits/chosen": 0.036865234375, "logits/rejected": 0.4921875, "logps/chosen": -414.0, "logps/rejected": -450.0, "loss": 0.1634, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -13.6875, "rewards/margins": 6.96875, "rewards/rejected": -20.625, "step": 4870 }, { "epoch": 0.35227026636829567, "grad_norm": 8.274816366002526, "learning_rate": 1.1451966686277364e-06, "logits/chosen": 0.01123046875, "logits/rejected": 0.40625, "logps/chosen": -414.0, "logps/rejected": -436.0, "loss": 0.1635, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -13.125, "rewards/margins": 6.53125, "rewards/rejected": -19.625, "step": 4880 }, { "epoch": 0.35299213166823074, "grad_norm": 19.63436997469731, "learning_rate": 1.1440251116268034e-06, "logits/chosen": 0.00775146484375, "logits/rejected": 0.41796875, "logps/chosen": -414.0, "logps/rejected": -448.0, "loss": 0.1513, "rewards/accuracies": 0.96875, "rewards/chosen": -14.25, "rewards/margins": 6.96875, "rewards/rejected": -21.25, "step": 4890 }, { "epoch": 0.35371399696816574, "grad_norm": 9.725105631815483, "learning_rate": 1.1428571428571428e-06, "logits/chosen": -0.1396484375, "logits/rejected": 0.39453125, "logps/chosen": -456.0, "logps/rejected": -502.0, "loss": 0.1478, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -16.125, "rewards/margins": 7.15625, "rewards/rejected": -23.25, "step": 4900 }, { "epoch": 0.35443586226810075, "grad_norm": 10.820588133574837, "learning_rate": 1.14169274403942e-06, "logits/chosen": -0.08349609375, "logits/rejected": 0.5234375, "logps/chosen": -434.0, "logps/rejected": -452.0, "loss": 0.1433, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -15.375, "rewards/margins": 6.65625, "rewards/rejected": -22.0, "step": 4910 }, { "epoch": 0.3551577275680358, "grad_norm": 11.069211836836908, "learning_rate": 1.140531897024402e-06, "logits/chosen": -0.02197265625, "logits/rejected": 0.337890625, "logps/chosen": -426.0, "logps/rejected": -466.0, "loss": 0.1636, "rewards/accuracies": 0.9375, "rewards/chosen": -13.1875, "rewards/margins": 7.1875, "rewards/rejected": -20.375, "step": 4920 }, { "epoch": 0.35587959286797083, "grad_norm": 9.455244623523766, "learning_rate": 1.13937458379177e-06, "logits/chosen": 0.06494140625, "logits/rejected": 0.515625, "logps/chosen": -400.0, "logps/rejected": -436.0, "loss": 0.1375, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -13.125, "rewards/margins": 7.625, "rewards/rejected": -20.75, "step": 4930 }, { "epoch": 0.3566014581679059, "grad_norm": 9.176240341505327, "learning_rate": 1.1382207864489444e-06, "logits/chosen": -0.03759765625, "logits/rejected": 0.40625, "logps/chosen": -408.0, "logps/rejected": -438.0, "loss": 0.1614, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -12.4375, "rewards/margins": 6.90625, "rewards/rejected": -19.375, "step": 4940 }, { "epoch": 0.3573233234678409, "grad_norm": 11.70902769391576, "learning_rate": 1.1370704872299223e-06, "logits/chosen": 0.0751953125, "logits/rejected": 0.3984375, "logps/chosen": -424.0, "logps/rejected": -454.0, "loss": 0.1501, "rewards/accuracies": 0.9375, "rewards/chosen": -14.25, "rewards/margins": 6.5625, "rewards/rejected": -20.875, "step": 4950 }, { "epoch": 0.3580451887677759, "grad_norm": 12.047967312408748, "learning_rate": 1.1359236684941295e-06, "logits/chosen": -0.01220703125, "logits/rejected": 0.41796875, "logps/chosen": -432.0, "logps/rejected": -462.0, "loss": 0.1449, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -13.0625, "rewards/margins": 7.6875, "rewards/rejected": -20.75, "step": 4960 }, { "epoch": 0.358767054067711, "grad_norm": 9.703335298093146, "learning_rate": 1.1347803127252839e-06, "logits/chosen": 0.043212890625, "logits/rejected": 0.443359375, "logps/chosen": -418.0, "logps/rejected": -440.0, "loss": 0.1492, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -13.5, "rewards/margins": 7.09375, "rewards/rejected": -20.625, "step": 4970 }, { "epoch": 0.359488919367646, "grad_norm": 11.325198076659609, "learning_rate": 1.1336404025302715e-06, "logits/chosen": -0.01263427734375, "logits/rejected": 0.404296875, "logps/chosen": -402.0, "logps/rejected": -420.0, "loss": 0.1517, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -11.75, "rewards/margins": 6.59375, "rewards/rejected": -18.375, "step": 4980 }, { "epoch": 0.36021078466758105, "grad_norm": 12.06646016866154, "learning_rate": 1.1325039206380352e-06, "logits/chosen": 0.042236328125, "logits/rejected": 0.3671875, "logps/chosen": -384.0, "logps/rejected": -446.0, "loss": 0.1421, "rewards/accuracies": 0.96875, "rewards/chosen": -11.5625, "rewards/margins": 7.375, "rewards/rejected": -18.875, "step": 4990 }, { "epoch": 0.36093264996751606, "grad_norm": 6.965136087713262, "learning_rate": 1.131370849898476e-06, "logits/chosen": -0.08642578125, "logits/rejected": 0.2275390625, "logps/chosen": -398.0, "logps/rejected": -450.0, "loss": 0.1293, "rewards/accuracies": 0.9375, "rewards/chosen": -12.3125, "rewards/margins": 7.28125, "rewards/rejected": -19.625, "step": 5000 }, { "epoch": 0.36093264996751606, "eval_logits/chosen": -0.032958984375, "eval_logits/rejected": 0.35546875, "eval_logps/chosen": -408.0, "eval_logps/rejected": -430.0, "eval_loss": 0.22958673536777496, "eval_rewards/accuracies": 0.9074675440788269, "eval_rewards/chosen": -12.625, "eval_rewards/margins": 6.0, "eval_rewards/rejected": -18.625, "eval_runtime": 2856.4248, "eval_samples_per_second": 34.484, "eval_steps_per_second": 0.539, "step": 5000 }, { "epoch": 0.36165451526745107, "grad_norm": 11.848193695858857, "learning_rate": 1.130241173281366e-06, "logits/chosen": 0.0281982421875, "logits/rejected": 0.46484375, "logps/chosen": -380.0, "logps/rejected": -406.0, "loss": 0.1425, "rewards/accuracies": 0.96875, "rewards/chosen": -11.5625, "rewards/margins": 6.8125, "rewards/rejected": -18.375, "step": 5010 }, { "epoch": 0.36237638056738614, "grad_norm": 13.393672114042658, "learning_rate": 1.1291148738752732e-06, "logits/chosen": -0.06787109375, "logits/rejected": 0.271484375, "logps/chosen": -448.0, "logps/rejected": -472.0, "loss": 0.1268, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -13.0, "rewards/margins": 7.78125, "rewards/rejected": -20.75, "step": 5020 }, { "epoch": 0.36309824586732115, "grad_norm": 9.53479127441827, "learning_rate": 1.1279919348864981e-06, "logits/chosen": -0.015625, "logits/rejected": 0.369140625, "logps/chosen": -426.0, "logps/rejected": -468.0, "loss": 0.1447, "rewards/accuracies": 0.9375, "rewards/chosen": -14.6875, "rewards/margins": 7.25, "rewards/rejected": -22.0, "step": 5030 }, { "epoch": 0.3638201111672562, "grad_norm": 11.152572394412019, "learning_rate": 1.126872339638022e-06, "logits/chosen": -0.0625, "logits/rejected": 0.435546875, "logps/chosen": -444.0, "logps/rejected": -490.0, "loss": 0.131, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -15.5, "rewards/margins": 7.375, "rewards/rejected": -22.875, "step": 5040 }, { "epoch": 0.3645419764671912, "grad_norm": 9.301087113697477, "learning_rate": 1.1257560715684668e-06, "logits/chosen": 0.06494140625, "logits/rejected": 0.423828125, "logps/chosen": -420.0, "logps/rejected": -484.0, "loss": 0.1092, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -14.875, "rewards/margins": 7.34375, "rewards/rejected": -22.25, "step": 5050 }, { "epoch": 0.36526384176712623, "grad_norm": 8.003095689246399, "learning_rate": 1.1246431142310665e-06, "logits/chosen": 0.0966796875, "logits/rejected": 0.44921875, "logps/chosen": -412.0, "logps/rejected": -460.0, "loss": 0.1345, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -13.75, "rewards/margins": 7.3125, "rewards/rejected": -21.0, "step": 5060 }, { "epoch": 0.3659857070670613, "grad_norm": 7.183499519534575, "learning_rate": 1.1235334512926484e-06, "logits/chosen": 0.006927490234375, "logits/rejected": 0.2490234375, "logps/chosen": -410.0, "logps/rejected": -482.0, "loss": 0.13, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -13.0625, "rewards/margins": 6.875, "rewards/rejected": -20.0, "step": 5070 }, { "epoch": 0.3667075723669963, "grad_norm": 5.7739253619343875, "learning_rate": 1.1224270665326274e-06, "logits/chosen": 0.0615234375, "logits/rejected": 0.396484375, "logps/chosen": -434.0, "logps/rejected": -462.0, "loss": 0.1436, "rewards/accuracies": 0.90625, "rewards/chosen": -13.9375, "rewards/margins": 6.4375, "rewards/rejected": -20.375, "step": 5080 }, { "epoch": 0.36742943766693137, "grad_norm": 13.001833608596108, "learning_rate": 1.12132394384201e-06, "logits/chosen": 0.21484375, "logits/rejected": 0.48828125, "logps/chosen": -406.0, "logps/rejected": -450.0, "loss": 0.1217, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -13.5, "rewards/margins": 6.125, "rewards/rejected": -19.625, "step": 5090 }, { "epoch": 0.3681513029668664, "grad_norm": 9.195049361769259, "learning_rate": 1.1202240672224076e-06, "logits/chosen": 0.103515625, "logits/rejected": 0.46484375, "logps/chosen": -436.0, "logps/rejected": -448.0, "loss": 0.1781, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -14.25, "rewards/margins": 6.90625, "rewards/rejected": -21.25, "step": 5100 }, { "epoch": 0.3688731682668014, "grad_norm": 11.391872233453675, "learning_rate": 1.1191274207850654e-06, "logits/chosen": -0.0693359375, "logits/rejected": 0.380859375, "logps/chosen": -398.0, "logps/rejected": -462.0, "loss": 0.1694, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -14.0, "rewards/margins": 7.0, "rewards/rejected": -21.0, "step": 5110 }, { "epoch": 0.36959503356673645, "grad_norm": 7.5461916197657635, "learning_rate": 1.1180339887498948e-06, "logits/chosen": -0.05322265625, "logits/rejected": 0.478515625, "logps/chosen": -416.0, "logps/rejected": -442.0, "loss": 0.1228, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -12.8125, "rewards/margins": 6.84375, "rewards/rejected": -19.625, "step": 5120 }, { "epoch": 0.37031689886667146, "grad_norm": 12.194994374751813, "learning_rate": 1.1169437554445213e-06, "logits/chosen": 0.04541015625, "logits/rejected": 0.52734375, "logps/chosen": -414.0, "logps/rejected": -446.0, "loss": 0.1274, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -12.125, "rewards/margins": 7.65625, "rewards/rejected": -19.75, "step": 5130 }, { "epoch": 0.3710387641666065, "grad_norm": 11.846532125221406, "learning_rate": 1.1158567053033413e-06, "logits/chosen": 0.02587890625, "logits/rejected": 0.32421875, "logps/chosen": -394.0, "logps/rejected": -438.0, "loss": 0.1273, "rewards/accuracies": 0.9375, "rewards/chosen": -11.9375, "rewards/margins": 6.65625, "rewards/rejected": -18.625, "step": 5140 }, { "epoch": 0.37176062946654154, "grad_norm": 9.646772020391154, "learning_rate": 1.1147728228665882e-06, "logits/chosen": 0.005615234375, "logits/rejected": 0.36328125, "logps/chosen": -410.0, "logps/rejected": -454.0, "loss": 0.1414, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -12.875, "rewards/margins": 6.875, "rewards/rejected": -19.75, "step": 5150 }, { "epoch": 0.3724824947664766, "grad_norm": 9.559653039160379, "learning_rate": 1.1136920927794092e-06, "logits/chosen": -0.06982421875, "logits/rejected": 0.2490234375, "logps/chosen": -408.0, "logps/rejected": -440.0, "loss": 0.1494, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -12.8125, "rewards/margins": 6.8125, "rewards/rejected": -19.625, "step": 5160 }, { "epoch": 0.3732043600664116, "grad_norm": 8.599822914289671, "learning_rate": 1.1126144997909508e-06, "logits/chosen": 0.004547119140625, "logits/rejected": 0.36328125, "logps/chosen": -396.0, "logps/rejected": -422.0, "loss": 0.1402, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -11.3125, "rewards/margins": 6.375, "rewards/rejected": -17.625, "step": 5170 }, { "epoch": 0.3739262253663466, "grad_norm": 8.062258322503556, "learning_rate": 1.1115400287534568e-06, "logits/chosen": 0.040771484375, "logits/rejected": 0.326171875, "logps/chosen": -394.0, "logps/rejected": -404.0, "loss": 0.1304, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -10.75, "rewards/margins": 6.21875, "rewards/rejected": -17.0, "step": 5180 }, { "epoch": 0.3746480906662817, "grad_norm": 11.220328983356872, "learning_rate": 1.110468664621372e-06, "logits/chosen": 0.00799560546875, "logits/rejected": 0.396484375, "logps/chosen": -400.0, "logps/rejected": -434.0, "loss": 0.1478, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -12.3125, "rewards/margins": 6.8125, "rewards/rejected": -19.125, "step": 5190 }, { "epoch": 0.3753699559662167, "grad_norm": 10.850671096332228, "learning_rate": 1.1094003924504583e-06, "logits/chosen": -0.05419921875, "logits/rejected": 0.4609375, "logps/chosen": -402.0, "logps/rejected": -434.0, "loss": 0.156, "rewards/accuracies": 0.96875, "rewards/chosen": -13.125, "rewards/margins": 6.78125, "rewards/rejected": -19.875, "step": 5200 }, { "epoch": 0.37609182126615176, "grad_norm": 9.53806609257017, "learning_rate": 1.1083351973969191e-06, "logits/chosen": -0.1708984375, "logits/rejected": 0.25390625, "logps/chosen": -428.0, "logps/rejected": -444.0, "loss": 0.1331, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -13.1875, "rewards/margins": 6.1875, "rewards/rejected": -19.375, "step": 5210 }, { "epoch": 0.37681368656608677, "grad_norm": 9.3765539157815, "learning_rate": 1.107273064716533e-06, "logits/chosen": -0.053955078125, "logits/rejected": 0.376953125, "logps/chosen": -400.0, "logps/rejected": -422.0, "loss": 0.1332, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -13.1875, "rewards/margins": 6.78125, "rewards/rejected": -20.0, "step": 5220 }, { "epoch": 0.3775355518660218, "grad_norm": 12.206922925359015, "learning_rate": 1.1062139797637962e-06, "logits/chosen": -0.01226806640625, "logits/rejected": 0.435546875, "logps/chosen": -402.0, "logps/rejected": -452.0, "loss": 0.1798, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -13.0625, "rewards/margins": 7.125, "rewards/rejected": -20.25, "step": 5230 }, { "epoch": 0.37825741716595684, "grad_norm": 9.328753229134062, "learning_rate": 1.1051579279910751e-06, "logits/chosen": 0.08056640625, "logits/rejected": 0.44921875, "logps/chosen": -404.0, "logps/rejected": -454.0, "loss": 0.1242, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -13.125, "rewards/margins": 7.4375, "rewards/rejected": -20.5, "step": 5240 }, { "epoch": 0.37897928246589185, "grad_norm": 10.341473876102198, "learning_rate": 1.1041048949477667e-06, "logits/chosen": 0.06591796875, "logits/rejected": 0.3359375, "logps/chosen": -414.0, "logps/rejected": -460.0, "loss": 0.1491, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -13.3125, "rewards/margins": 6.78125, "rewards/rejected": -20.125, "step": 5250 }, { "epoch": 0.3797011477658269, "grad_norm": 11.253772399096984, "learning_rate": 1.1030548662794673e-06, "logits/chosen": -0.004425048828125, "logits/rejected": 0.42578125, "logps/chosen": -404.0, "logps/rejected": -454.0, "loss": 0.1487, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -12.9375, "rewards/margins": 6.96875, "rewards/rejected": -19.875, "step": 5260 }, { "epoch": 0.38042301306576193, "grad_norm": 10.903183562940525, "learning_rate": 1.102007827727152e-06, "logits/chosen": -0.1806640625, "logits/rejected": 0.375, "logps/chosen": -406.0, "logps/rejected": -448.0, "loss": 0.1438, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -13.8125, "rewards/margins": 7.40625, "rewards/rejected": -21.25, "step": 5270 }, { "epoch": 0.38114487836569694, "grad_norm": 13.776914127576061, "learning_rate": 1.1009637651263607e-06, "logits/chosen": -0.032958984375, "logits/rejected": 0.4609375, "logps/chosen": -420.0, "logps/rejected": -430.0, "loss": 0.1403, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -13.0625, "rewards/margins": 7.1875, "rewards/rejected": -20.25, "step": 5280 }, { "epoch": 0.381866743665632, "grad_norm": 9.228918291070988, "learning_rate": 1.0999226644063927e-06, "logits/chosen": 0.0712890625, "logits/rejected": 0.404296875, "logps/chosen": -412.0, "logps/rejected": -452.0, "loss": 0.152, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -12.5, "rewards/margins": 6.78125, "rewards/rejected": -19.25, "step": 5290 }, { "epoch": 0.382588608965567, "grad_norm": 5.8975801975637205, "learning_rate": 1.0988845115895123e-06, "logits/chosen": -0.09326171875, "logits/rejected": 0.369140625, "logps/chosen": -422.0, "logps/rejected": -480.0, "loss": 0.1362, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -14.875, "rewards/margins": 7.28125, "rewards/rejected": -22.125, "step": 5300 }, { "epoch": 0.3833104742655021, "grad_norm": 9.686481424793918, "learning_rate": 1.0978492927901574e-06, "logits/chosen": -0.212890625, "logits/rejected": 0.283203125, "logps/chosen": -484.0, "logps/rejected": -508.0, "loss": 0.1354, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -15.0, "rewards/margins": 8.5, "rewards/rejected": -23.5, "step": 5310 }, { "epoch": 0.3840323395654371, "grad_norm": 7.359700097068131, "learning_rate": 1.0968169942141634e-06, "logits/chosen": 0.04150390625, "logits/rejected": 0.4609375, "logps/chosen": -456.0, "logps/rejected": -458.0, "loss": 0.1375, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -16.25, "rewards/margins": 6.375, "rewards/rejected": -22.625, "step": 5320 }, { "epoch": 0.3847542048653721, "grad_norm": 15.141825512350811, "learning_rate": 1.0957876021579874e-06, "logits/chosen": -0.2294921875, "logits/rejected": 0.2099609375, "logps/chosen": -440.0, "logps/rejected": -498.0, "loss": 0.1705, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -14.0, "rewards/margins": 8.3125, "rewards/rejected": -22.375, "step": 5330 }, { "epoch": 0.38547607016530716, "grad_norm": 7.244188267668933, "learning_rate": 1.0947611030079466e-06, "logits/chosen": -0.0306396484375, "logits/rejected": 0.38671875, "logps/chosen": -424.0, "logps/rejected": -454.0, "loss": 0.1209, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -13.0625, "rewards/margins": 7.34375, "rewards/rejected": -20.375, "step": 5340 }, { "epoch": 0.38619793546524217, "grad_norm": 11.27372984174605, "learning_rate": 1.0937374832394612e-06, "logits/chosen": -0.08447265625, "logits/rejected": 0.298828125, "logps/chosen": -406.0, "logps/rejected": -440.0, "loss": 0.1286, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -12.5625, "rewards/margins": 7.15625, "rewards/rejected": -19.75, "step": 5350 }, { "epoch": 0.38691980076517724, "grad_norm": 14.478259596797194, "learning_rate": 1.092716729416306e-06, "logits/chosen": -0.055908203125, "logits/rejected": 0.41796875, "logps/chosen": -412.0, "logps/rejected": -446.0, "loss": 0.1252, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -12.125, "rewards/margins": 7.0, "rewards/rejected": -19.125, "step": 5360 }, { "epoch": 0.38764166606511224, "grad_norm": 13.079876947413483, "learning_rate": 1.0916988281898703e-06, "logits/chosen": -0.0203857421875, "logits/rejected": 0.328125, "logps/chosen": -420.0, "logps/rejected": -458.0, "loss": 0.1442, "rewards/accuracies": 0.9375, "rewards/chosen": -12.875, "rewards/margins": 7.125, "rewards/rejected": -20.0, "step": 5370 }, { "epoch": 0.3883635313650473, "grad_norm": 8.5570649263147, "learning_rate": 1.0906837662984237e-06, "logits/chosen": -0.060791015625, "logits/rejected": 0.392578125, "logps/chosen": -414.0, "logps/rejected": -452.0, "loss": 0.1137, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -13.8125, "rewards/margins": 7.34375, "rewards/rejected": -21.125, "step": 5380 }, { "epoch": 0.3890853966649823, "grad_norm": 8.60516506650713, "learning_rate": 1.089671530566391e-06, "logits/chosen": 0.0181884765625, "logits/rejected": 0.47265625, "logps/chosen": -410.0, "logps/rejected": -434.0, "loss": 0.1317, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -14.375, "rewards/margins": 6.84375, "rewards/rejected": -21.25, "step": 5390 }, { "epoch": 0.38980726196491733, "grad_norm": 6.847588708793033, "learning_rate": 1.0886621079036346e-06, "logits/chosen": 0.01446533203125, "logits/rejected": 0.392578125, "logps/chosen": -390.0, "logps/rejected": -426.0, "loss": 0.1382, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -12.9375, "rewards/margins": 7.1875, "rewards/rejected": -20.125, "step": 5400 }, { "epoch": 0.3905291272648524, "grad_norm": 9.231598970791062, "learning_rate": 1.0876554853047417e-06, "logits/chosen": -0.1220703125, "logits/rejected": 0.357421875, "logps/chosen": -400.0, "logps/rejected": -436.0, "loss": 0.1487, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -13.625, "rewards/margins": 6.71875, "rewards/rejected": -20.375, "step": 5410 }, { "epoch": 0.3912509925647874, "grad_norm": 8.231209755693264, "learning_rate": 1.0866516498483225e-06, "logits/chosen": -0.08642578125, "logits/rejected": 0.412109375, "logps/chosen": -402.0, "logps/rejected": -438.0, "loss": 0.1566, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -12.5625, "rewards/margins": 7.1875, "rewards/rejected": -19.75, "step": 5420 }, { "epoch": 0.39197285786472247, "grad_norm": 12.0357525564595, "learning_rate": 1.0856505886963116e-06, "logits/chosen": 0.0294189453125, "logits/rejected": 0.30078125, "logps/chosen": -408.0, "logps/rejected": -448.0, "loss": 0.1728, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -13.125, "rewards/margins": 6.21875, "rewards/rejected": -19.375, "step": 5430 }, { "epoch": 0.3926947231646575, "grad_norm": 8.744782215765676, "learning_rate": 1.0846522890932808e-06, "logits/chosen": -0.11474609375, "logits/rejected": 0.3125, "logps/chosen": -380.0, "logps/rejected": -406.0, "loss": 0.1436, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -10.5625, "rewards/margins": 6.21875, "rewards/rejected": -16.75, "step": 5440 }, { "epoch": 0.3934165884645925, "grad_norm": 12.911459197399518, "learning_rate": 1.0836567383657542e-06, "logits/chosen": 0.031982421875, "logits/rejected": 0.38671875, "logps/chosen": -412.0, "logps/rejected": -480.0, "loss": 0.1258, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -13.5625, "rewards/margins": 7.8125, "rewards/rejected": -21.375, "step": 5450 }, { "epoch": 0.39413845376452755, "grad_norm": 12.397596309091062, "learning_rate": 1.0826639239215334e-06, "logits/chosen": -0.1630859375, "logits/rejected": 0.32421875, "logps/chosen": -424.0, "logps/rejected": -444.0, "loss": 0.1238, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -12.6875, "rewards/margins": 6.9375, "rewards/rejected": -19.625, "step": 5460 }, { "epoch": 0.39486031906446256, "grad_norm": 6.498734270205332, "learning_rate": 1.0816738332490292e-06, "logits/chosen": -0.142578125, "logits/rejected": 0.380859375, "logps/chosen": -434.0, "logps/rejected": -478.0, "loss": 0.1196, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -13.0625, "rewards/margins": 7.125, "rewards/rejected": -20.125, "step": 5470 }, { "epoch": 0.3955821843643976, "grad_norm": 7.23594247467827, "learning_rate": 1.0806864539165982e-06, "logits/chosen": -0.1552734375, "logits/rejected": 0.2099609375, "logps/chosen": -400.0, "logps/rejected": -440.0, "loss": 0.1364, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -11.875, "rewards/margins": 7.125, "rewards/rejected": -19.0, "step": 5480 }, { "epoch": 0.39630404966433264, "grad_norm": 9.677544844316229, "learning_rate": 1.0797017735718878e-06, "logits/chosen": -0.12890625, "logits/rejected": 0.291015625, "logps/chosen": -402.0, "logps/rejected": -452.0, "loss": 0.1555, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -13.125, "rewards/margins": 7.71875, "rewards/rejected": -20.875, "step": 5490 }, { "epoch": 0.39702591496426765, "grad_norm": 11.103371600106193, "learning_rate": 1.0787197799411874e-06, "logits/chosen": -0.09619140625, "logits/rejected": 0.27734375, "logps/chosen": -402.0, "logps/rejected": -446.0, "loss": 0.129, "rewards/accuracies": 0.96875, "rewards/chosen": -12.625, "rewards/margins": 7.15625, "rewards/rejected": -19.75, "step": 5500 }, { "epoch": 0.3977477802642027, "grad_norm": 10.448830468523125, "learning_rate": 1.0777404608287846e-06, "logits/chosen": -0.06884765625, "logits/rejected": 0.455078125, "logps/chosen": -414.0, "logps/rejected": -444.0, "loss": 0.1185, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -13.25, "rewards/margins": 7.59375, "rewards/rejected": -20.75, "step": 5510 }, { "epoch": 0.3984696455641377, "grad_norm": 11.228684167531085, "learning_rate": 1.0767638041163309e-06, "logits/chosen": -0.19921875, "logits/rejected": 0.3359375, "logps/chosen": -430.0, "logps/rejected": -480.0, "loss": 0.1648, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -14.1875, "rewards/margins": 8.0, "rewards/rejected": -22.25, "step": 5520 }, { "epoch": 0.3991915108640728, "grad_norm": 9.444460846979656, "learning_rate": 1.0757897977622107e-06, "logits/chosen": 0.07421875, "logits/rejected": 0.400390625, "logps/chosen": -442.0, "logps/rejected": -464.0, "loss": 0.1292, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -14.9375, "rewards/margins": 7.5, "rewards/rejected": -22.375, "step": 5530 }, { "epoch": 0.3999133761640078, "grad_norm": 10.215574857128491, "learning_rate": 1.074818429800918e-06, "logits/chosen": 0.0830078125, "logits/rejected": 0.470703125, "logps/chosen": -424.0, "logps/rejected": -480.0, "loss": 0.1233, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -15.375, "rewards/margins": 7.34375, "rewards/rejected": -22.75, "step": 5540 }, { "epoch": 0.4006352414639428, "grad_norm": 21.557664064534055, "learning_rate": 1.073849688342439e-06, "logits/chosen": 0.0115966796875, "logits/rejected": 0.42578125, "logps/chosen": -416.0, "logps/rejected": -448.0, "loss": 0.1365, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -13.75, "rewards/margins": 7.15625, "rewards/rejected": -20.875, "step": 5550 }, { "epoch": 0.40135710676387787, "grad_norm": 6.36479964038878, "learning_rate": 1.07288356157164e-06, "logits/chosen": 0.0189208984375, "logits/rejected": 0.4765625, "logps/chosen": -412.0, "logps/rejected": -454.0, "loss": 0.1299, "rewards/accuracies": 0.9375, "rewards/chosen": -14.5, "rewards/margins": 7.28125, "rewards/rejected": -21.875, "step": 5560 }, { "epoch": 0.4020789720638129, "grad_norm": 6.87933140905106, "learning_rate": 1.0719200377476648e-06, "logits/chosen": 0.07421875, "logits/rejected": 0.486328125, "logps/chosen": -452.0, "logps/rejected": -476.0, "loss": 0.1221, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -15.9375, "rewards/margins": 7.78125, "rewards/rejected": -23.75, "step": 5570 }, { "epoch": 0.40280083736374794, "grad_norm": 6.697297019978146, "learning_rate": 1.0709591052033317e-06, "logits/chosen": 0.03759765625, "logits/rejected": 0.443359375, "logps/chosen": -430.0, "logps/rejected": -482.0, "loss": 0.1302, "rewards/accuracies": 0.96875, "rewards/chosen": -16.375, "rewards/margins": 7.96875, "rewards/rejected": -24.375, "step": 5580 }, { "epoch": 0.40352270266368295, "grad_norm": 7.048138268311776, "learning_rate": 1.0700007523445435e-06, "logits/chosen": 0.09130859375, "logits/rejected": 0.498046875, "logps/chosen": -458.0, "logps/rejected": -482.0, "loss": 0.1172, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -16.625, "rewards/margins": 7.03125, "rewards/rejected": -23.625, "step": 5590 }, { "epoch": 0.40424456796361796, "grad_norm": 9.545919240671585, "learning_rate": 1.0690449676496976e-06, "logits/chosen": 0.2236328125, "logits/rejected": 0.609375, "logps/chosen": -422.0, "logps/rejected": -472.0, "loss": 0.1546, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -15.8125, "rewards/margins": 7.3125, "rewards/rejected": -23.125, "step": 5600 }, { "epoch": 0.404966433263553, "grad_norm": 15.381337702269557, "learning_rate": 1.0680917396691054e-06, "logits/chosen": 0.052978515625, "logits/rejected": 0.453125, "logps/chosen": -450.0, "logps/rejected": -472.0, "loss": 0.1458, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -16.625, "rewards/margins": 7.09375, "rewards/rejected": -23.75, "step": 5610 }, { "epoch": 0.40568829856348804, "grad_norm": 9.004907587578497, "learning_rate": 1.0671410570244164e-06, "logits/chosen": 0.0751953125, "logits/rejected": 0.61328125, "logps/chosen": -462.0, "logps/rejected": -476.0, "loss": 0.1552, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -17.25, "rewards/margins": 7.46875, "rewards/rejected": -24.75, "step": 5620 }, { "epoch": 0.4064101638634231, "grad_norm": 6.982598752318088, "learning_rate": 1.0661929084080466e-06, "logits/chosen": 0.267578125, "logits/rejected": 0.5546875, "logps/chosen": -430.0, "logps/rejected": -466.0, "loss": 0.1253, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -16.875, "rewards/margins": 7.40625, "rewards/rejected": -24.25, "step": 5630 }, { "epoch": 0.4071320291633581, "grad_norm": 12.500686625999359, "learning_rate": 1.0652472825826149e-06, "logits/chosen": 0.0498046875, "logits/rejected": 0.390625, "logps/chosen": -456.0, "logps/rejected": -494.0, "loss": 0.1359, "rewards/accuracies": 0.96875, "rewards/chosen": -17.125, "rewards/margins": 7.34375, "rewards/rejected": -24.5, "step": 5640 }, { "epoch": 0.4078538944632932, "grad_norm": 9.551189412493981, "learning_rate": 1.0643041683803828e-06, "logits/chosen": 0.0260009765625, "logits/rejected": 0.5390625, "logps/chosen": -450.0, "logps/rejected": -490.0, "loss": 0.1338, "rewards/accuracies": 0.9375, "rewards/chosen": -16.625, "rewards/margins": 7.40625, "rewards/rejected": -24.0, "step": 5650 }, { "epoch": 0.4085757597632282, "grad_norm": 7.832546812610588, "learning_rate": 1.063363554702701e-06, "logits/chosen": 0.08544921875, "logits/rejected": 0.55078125, "logps/chosen": -432.0, "logps/rejected": -472.0, "loss": 0.1324, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -16.125, "rewards/margins": 7.25, "rewards/rejected": -23.375, "step": 5660 }, { "epoch": 0.4092976250631632, "grad_norm": 9.213219963849948, "learning_rate": 1.0624254305194609e-06, "logits/chosen": 0.0615234375, "logits/rejected": 0.578125, "logps/chosen": -432.0, "logps/rejected": -476.0, "loss": 0.1399, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -15.3125, "rewards/margins": 6.9375, "rewards/rejected": -22.25, "step": 5670 }, { "epoch": 0.41001949036309826, "grad_norm": 10.857639140414895, "learning_rate": 1.0614897848685505e-06, "logits/chosen": 0.09375, "logits/rejected": 0.5390625, "logps/chosen": -432.0, "logps/rejected": -462.0, "loss": 0.1455, "rewards/accuracies": 0.96875, "rewards/chosen": -14.375, "rewards/margins": 7.34375, "rewards/rejected": -21.75, "step": 5680 }, { "epoch": 0.41074135566303327, "grad_norm": 15.129739552578735, "learning_rate": 1.0605566068553173e-06, "logits/chosen": -0.04150390625, "logits/rejected": 0.408203125, "logps/chosen": -418.0, "logps/rejected": -462.0, "loss": 0.1383, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -14.875, "rewards/margins": 7.0, "rewards/rejected": -21.875, "step": 5690 }, { "epoch": 0.41146322096296833, "grad_norm": 12.10115922842981, "learning_rate": 1.0596258856520351e-06, "logits/chosen": -0.02978515625, "logits/rejected": 0.44140625, "logps/chosen": -414.0, "logps/rejected": -458.0, "loss": 0.1315, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -13.3125, "rewards/margins": 7.6875, "rewards/rejected": -21.0, "step": 5700 }, { "epoch": 0.41218508626290334, "grad_norm": 12.46264737248812, "learning_rate": 1.0586976104973764e-06, "logits/chosen": -0.03955078125, "logits/rejected": 0.439453125, "logps/chosen": -428.0, "logps/rejected": -460.0, "loss": 0.1253, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -15.6875, "rewards/margins": 7.0, "rewards/rejected": -22.75, "step": 5710 }, { "epoch": 0.41290695156283835, "grad_norm": 7.648192876798088, "learning_rate": 1.05777177069589e-06, "logits/chosen": -0.0235595703125, "logits/rejected": 0.54296875, "logps/chosen": -434.0, "logps/rejected": -478.0, "loss": 0.1173, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -15.125, "rewards/margins": 8.125, "rewards/rejected": -23.25, "step": 5720 }, { "epoch": 0.4136288168627734, "grad_norm": 7.835369755746516, "learning_rate": 1.0568483556174834e-06, "logits/chosen": -0.10986328125, "logits/rejected": 0.341796875, "logps/chosen": -452.0, "logps/rejected": -476.0, "loss": 0.1443, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -15.1875, "rewards/margins": 7.25, "rewards/rejected": -22.375, "step": 5730 }, { "epoch": 0.41435068216270843, "grad_norm": 9.299003374691727, "learning_rate": 1.055927354696909e-06, "logits/chosen": 0.00811767578125, "logits/rejected": 0.357421875, "logps/chosen": -420.0, "logps/rejected": -458.0, "loss": 0.1472, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -14.0625, "rewards/margins": 7.0625, "rewards/rejected": -21.125, "step": 5740 }, { "epoch": 0.4150725474626435, "grad_norm": 6.554583324102338, "learning_rate": 1.0550087574332592e-06, "logits/chosen": -0.1220703125, "logits/rejected": 0.318359375, "logps/chosen": -408.0, "logps/rejected": -442.0, "loss": 0.1055, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -12.3125, "rewards/margins": 7.53125, "rewards/rejected": -19.875, "step": 5750 }, { "epoch": 0.4157944127625785, "grad_norm": 9.289259909555076, "learning_rate": 1.0540925533894598e-06, "logits/chosen": -0.07373046875, "logits/rejected": 0.291015625, "logps/chosen": -402.0, "logps/rejected": -440.0, "loss": 0.1654, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -12.75, "rewards/margins": 7.15625, "rewards/rejected": -19.875, "step": 5760 }, { "epoch": 0.4165162780625135, "grad_norm": 8.69640163422982, "learning_rate": 1.053178732191775e-06, "logits/chosen": -0.0269775390625, "logits/rejected": 0.376953125, "logps/chosen": -424.0, "logps/rejected": -466.0, "loss": 0.1333, "rewards/accuracies": 0.96875, "rewards/chosen": -14.8125, "rewards/margins": 7.21875, "rewards/rejected": -22.0, "step": 5770 }, { "epoch": 0.4172381433624486, "grad_norm": 6.970427682052613, "learning_rate": 1.0522672835293127e-06, "logits/chosen": -0.0517578125, "logits/rejected": 0.361328125, "logps/chosen": -404.0, "logps/rejected": -446.0, "loss": 0.1059, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -14.125, "rewards/margins": 6.78125, "rewards/rejected": -20.875, "step": 5780 }, { "epoch": 0.4179600086623836, "grad_norm": 10.580498018718197, "learning_rate": 1.0513581971535365e-06, "logits/chosen": -0.08984375, "logits/rejected": 0.359375, "logps/chosen": -436.0, "logps/rejected": -496.0, "loss": 0.1377, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -14.25, "rewards/margins": 7.625, "rewards/rejected": -21.875, "step": 5790 }, { "epoch": 0.41868187396231865, "grad_norm": 13.027773127413209, "learning_rate": 1.0504514628777803e-06, "logits/chosen": 0.00055694580078125, "logits/rejected": 0.412109375, "logps/chosen": -422.0, "logps/rejected": -470.0, "loss": 0.1511, "rewards/accuracies": 0.90625, "rewards/chosen": -14.25, "rewards/margins": 6.625, "rewards/rejected": -20.875, "step": 5800 }, { "epoch": 0.41940373926225366, "grad_norm": 13.177545387038354, "learning_rate": 1.0495470705767713e-06, "logits/chosen": -0.050537109375, "logits/rejected": 0.45703125, "logps/chosen": -414.0, "logps/rejected": -468.0, "loss": 0.1113, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -14.6875, "rewards/margins": 7.21875, "rewards/rejected": -21.875, "step": 5810 }, { "epoch": 0.42012560456218867, "grad_norm": 8.810124890738125, "learning_rate": 1.0486450101861527e-06, "logits/chosen": 0.0047607421875, "logits/rejected": 0.44921875, "logps/chosen": -418.0, "logps/rejected": -458.0, "loss": 0.1483, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -14.8125, "rewards/margins": 7.0, "rewards/rejected": -21.875, "step": 5820 }, { "epoch": 0.42084746986212374, "grad_norm": 6.833290091044046, "learning_rate": 1.0477452717020143e-06, "logits/chosen": 0.004241943359375, "logits/rejected": 0.408203125, "logps/chosen": -416.0, "logps/rejected": -470.0, "loss": 0.1315, "rewards/accuracies": 0.96875, "rewards/chosen": -15.125, "rewards/margins": 7.40625, "rewards/rejected": -22.5, "step": 5830 }, { "epoch": 0.42156933516205874, "grad_norm": 9.176660481935917, "learning_rate": 1.0468478451804272e-06, "logits/chosen": -0.197265625, "logits/rejected": 0.333984375, "logps/chosen": -430.0, "logps/rejected": -498.0, "loss": 0.1461, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -14.0, "rewards/margins": 8.125, "rewards/rejected": -22.125, "step": 5840 }, { "epoch": 0.4222912004619938, "grad_norm": 5.078996481826852, "learning_rate": 1.0459527207369814e-06, "logits/chosen": -0.0556640625, "logits/rejected": 0.2890625, "logps/chosen": -430.0, "logps/rejected": -478.0, "loss": 0.1536, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -15.125, "rewards/margins": 7.09375, "rewards/rejected": -22.25, "step": 5850 }, { "epoch": 0.4230130657619288, "grad_norm": 13.158635491888669, "learning_rate": 1.0450598885463281e-06, "logits/chosen": 0.087890625, "logits/rejected": 0.5625, "logps/chosen": -420.0, "logps/rejected": -452.0, "loss": 0.163, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -14.8125, "rewards/margins": 7.0, "rewards/rejected": -21.75, "step": 5860 }, { "epoch": 0.42373493106186383, "grad_norm": 12.632213626394666, "learning_rate": 1.0441693388417282e-06, "logits/chosen": 0.0003814697265625, "logits/rejected": 0.3984375, "logps/chosen": -432.0, "logps/rejected": -468.0, "loss": 0.1361, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -15.1875, "rewards/margins": 6.78125, "rewards/rejected": -22.0, "step": 5870 }, { "epoch": 0.4244567963617989, "grad_norm": 8.901598237692983, "learning_rate": 1.0432810619146023e-06, "logits/chosen": -0.1923828125, "logits/rejected": 0.291015625, "logps/chosen": -434.0, "logps/rejected": -474.0, "loss": 0.1393, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -14.1875, "rewards/margins": 7.59375, "rewards/rejected": -21.75, "step": 5880 }, { "epoch": 0.4251786616617339, "grad_norm": 9.123329027666674, "learning_rate": 1.042395048114086e-06, "logits/chosen": -0.11962890625, "logits/rejected": 0.30859375, "logps/chosen": -442.0, "logps/rejected": -472.0, "loss": 0.1388, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -13.8125, "rewards/margins": 6.9375, "rewards/rejected": -20.75, "step": 5890 }, { "epoch": 0.42590052696166897, "grad_norm": 7.488892017264469, "learning_rate": 1.041511287846591e-06, "logits/chosen": -0.03857421875, "logits/rejected": 0.51171875, "logps/chosen": -430.0, "logps/rejected": -464.0, "loss": 0.1518, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -14.1875, "rewards/margins": 7.125, "rewards/rejected": -21.25, "step": 5900 }, { "epoch": 0.426622392261604, "grad_norm": 6.182434069534256, "learning_rate": 1.0406297715753674e-06, "logits/chosen": -0.00183868408203125, "logits/rejected": 0.369140625, "logps/chosen": -386.0, "logps/rejected": -422.0, "loss": 0.1262, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -11.4375, "rewards/margins": 6.78125, "rewards/rejected": -18.25, "step": 5910 }, { "epoch": 0.42734425756153904, "grad_norm": 9.950270507935956, "learning_rate": 1.0397504898200726e-06, "logits/chosen": 0.06298828125, "logits/rejected": 0.50390625, "logps/chosen": -382.0, "logps/rejected": -410.0, "loss": 0.1514, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -10.375, "rewards/margins": 6.8125, "rewards/rejected": -17.125, "step": 5920 }, { "epoch": 0.42806612286147405, "grad_norm": 9.066587978820134, "learning_rate": 1.0388734331563415e-06, "logits/chosen": 0.054931640625, "logits/rejected": 0.453125, "logps/chosen": -418.0, "logps/rejected": -420.0, "loss": 0.1256, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -12.4375, "rewards/margins": 6.5625, "rewards/rejected": -19.0, "step": 5930 }, { "epoch": 0.42878798816140906, "grad_norm": 11.393175283099012, "learning_rate": 1.037998592215364e-06, "logits/chosen": -0.0272216796875, "logits/rejected": 0.38671875, "logps/chosen": -416.0, "logps/rejected": -448.0, "loss": 0.1155, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -13.3125, "rewards/margins": 6.6875, "rewards/rejected": -20.0, "step": 5940 }, { "epoch": 0.4295098534613441, "grad_norm": 8.472350399344819, "learning_rate": 1.037125957683463e-06, "logits/chosen": 0.130859375, "logits/rejected": 0.5703125, "logps/chosen": -426.0, "logps/rejected": -470.0, "loss": 0.1281, "rewards/accuracies": 0.96875, "rewards/chosen": -13.6875, "rewards/margins": 7.90625, "rewards/rejected": -21.5, "step": 5950 }, { "epoch": 0.43023171876127914, "grad_norm": 10.737545577109845, "learning_rate": 1.0362555203016794e-06, "logits/chosen": 0.037109375, "logits/rejected": 0.4609375, "logps/chosen": -416.0, "logps/rejected": -442.0, "loss": 0.1106, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -14.375, "rewards/margins": 7.375, "rewards/rejected": -21.75, "step": 5960 }, { "epoch": 0.4309535840612142, "grad_norm": 12.291974637230366, "learning_rate": 1.035387270865359e-06, "logits/chosen": 0.039794921875, "logits/rejected": 0.546875, "logps/chosen": -430.0, "logps/rejected": -490.0, "loss": 0.1398, "rewards/accuracies": 0.96875, "rewards/chosen": -16.0, "rewards/margins": 7.59375, "rewards/rejected": -23.625, "step": 5970 }, { "epoch": 0.4316754493611492, "grad_norm": 8.457158343629382, "learning_rate": 1.0345212002237434e-06, "logits/chosen": 0.05908203125, "logits/rejected": 0.47265625, "logps/chosen": -442.0, "logps/rejected": -476.0, "loss": 0.1249, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -14.625, "rewards/margins": 7.28125, "rewards/rejected": -21.875, "step": 5980 }, { "epoch": 0.4323973146610842, "grad_norm": 13.855875213020568, "learning_rate": 1.0336572992795644e-06, "logits/chosen": 0.054443359375, "logits/rejected": 0.486328125, "logps/chosen": -420.0, "logps/rejected": -454.0, "loss": 0.1235, "rewards/accuracies": 0.96875, "rewards/chosen": -15.375, "rewards/margins": 7.4375, "rewards/rejected": -22.875, "step": 5990 }, { "epoch": 0.4331191799610193, "grad_norm": 16.025214741642063, "learning_rate": 1.0327955589886444e-06, "logits/chosen": -0.0247802734375, "logits/rejected": 0.421875, "logps/chosen": -454.0, "logps/rejected": -492.0, "loss": 0.1288, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -16.25, "rewards/margins": 7.65625, "rewards/rejected": -23.875, "step": 6000 }, { "epoch": 0.4338410452609543, "grad_norm": 9.089734175846722, "learning_rate": 1.0319359703594971e-06, "logits/chosen": 0.10791015625, "logits/rejected": 0.5546875, "logps/chosen": -436.0, "logps/rejected": -448.0, "loss": 0.1208, "rewards/accuracies": 0.96875, "rewards/chosen": -16.0, "rewards/margins": 7.5, "rewards/rejected": -23.5, "step": 6010 }, { "epoch": 0.43456291056088936, "grad_norm": 7.526418205269345, "learning_rate": 1.0310785244529341e-06, "logits/chosen": 0.0537109375, "logits/rejected": 0.50390625, "logps/chosen": -450.0, "logps/rejected": -478.0, "loss": 0.1144, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -16.25, "rewards/margins": 7.125, "rewards/rejected": -23.375, "step": 6020 }, { "epoch": 0.43528477586082437, "grad_norm": 8.125711586467368, "learning_rate": 1.0302232123816746e-06, "logits/chosen": 0.1748046875, "logits/rejected": 0.703125, "logps/chosen": -406.0, "logps/rejected": -456.0, "loss": 0.115, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -14.5625, "rewards/margins": 7.8125, "rewards/rejected": -22.375, "step": 6030 }, { "epoch": 0.4360066411607594, "grad_norm": 8.855964097789311, "learning_rate": 1.0293700253099576e-06, "logits/chosen": -0.01458740234375, "logits/rejected": 0.55078125, "logps/chosen": -428.0, "logps/rejected": -472.0, "loss": 0.1343, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -16.0, "rewards/margins": 7.40625, "rewards/rejected": -23.375, "step": 6040 }, { "epoch": 0.43672850646069444, "grad_norm": 5.0225601381364395, "learning_rate": 1.02851895445316e-06, "logits/chosen": 0.058349609375, "logits/rejected": 0.30078125, "logps/chosen": -436.0, "logps/rejected": -502.0, "loss": 0.1116, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -15.75, "rewards/margins": 7.84375, "rewards/rejected": -23.625, "step": 6050 }, { "epoch": 0.43745037176062945, "grad_norm": 9.547828881075086, "learning_rate": 1.0276699910774159e-06, "logits/chosen": 0.04541015625, "logits/rejected": 0.400390625, "logps/chosen": -460.0, "logps/rejected": -486.0, "loss": 0.1515, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -15.6875, "rewards/margins": 8.5, "rewards/rejected": -24.25, "step": 6060 }, { "epoch": 0.4381722370605645, "grad_norm": 9.184482294761866, "learning_rate": 1.0268231264992398e-06, "logits/chosen": -0.1171875, "logits/rejected": 0.384765625, "logps/chosen": -450.0, "logps/rejected": -482.0, "loss": 0.1197, "rewards/accuracies": 0.96875, "rewards/chosen": -16.125, "rewards/margins": 7.75, "rewards/rejected": -23.875, "step": 6070 }, { "epoch": 0.4388941023604995, "grad_norm": 13.929506571494615, "learning_rate": 1.0259783520851542e-06, "logits/chosen": -0.12109375, "logits/rejected": 0.326171875, "logps/chosen": -450.0, "logps/rejected": -502.0, "loss": 0.1637, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -16.25, "rewards/margins": 8.125, "rewards/rejected": -24.375, "step": 6080 }, { "epoch": 0.43961596766043454, "grad_norm": 9.127103461612679, "learning_rate": 1.0251356592513193e-06, "logits/chosen": 0.03173828125, "logits/rejected": 0.490234375, "logps/chosen": -416.0, "logps/rejected": -464.0, "loss": 0.1221, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -15.25, "rewards/margins": 7.59375, "rewards/rejected": -22.75, "step": 6090 }, { "epoch": 0.4403378329603696, "grad_norm": 9.323407574183049, "learning_rate": 1.0242950394631678e-06, "logits/chosen": 0.08837890625, "logits/rejected": 0.52734375, "logps/chosen": -420.0, "logps/rejected": -434.0, "loss": 0.1364, "rewards/accuracies": 0.96875, "rewards/chosen": -14.0, "rewards/margins": 7.3125, "rewards/rejected": -21.375, "step": 6100 }, { "epoch": 0.4410596982603046, "grad_norm": 8.161642149242747, "learning_rate": 1.0234564842350404e-06, "logits/chosen": -0.08544921875, "logits/rejected": 0.298828125, "logps/chosen": -398.0, "logps/rejected": -438.0, "loss": 0.1363, "rewards/accuracies": 0.96875, "rewards/chosen": -12.875, "rewards/margins": 7.5625, "rewards/rejected": -20.5, "step": 6110 }, { "epoch": 0.4417815635602397, "grad_norm": 8.01239320612575, "learning_rate": 1.0226199851298272e-06, "logits/chosen": 0.0751953125, "logits/rejected": 0.44140625, "logps/chosen": -422.0, "logps/rejected": -444.0, "loss": 0.1313, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -13.4375, "rewards/margins": 6.5, "rewards/rejected": -20.0, "step": 6120 }, { "epoch": 0.4425034288601747, "grad_norm": 8.831267216579741, "learning_rate": 1.0217855337586106e-06, "logits/chosen": -0.0184326171875, "logits/rejected": 0.373046875, "logps/chosen": -404.0, "logps/rejected": -450.0, "loss": 0.1239, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -12.75, "rewards/margins": 7.65625, "rewards/rejected": -20.375, "step": 6130 }, { "epoch": 0.44322529416010975, "grad_norm": 11.658983830831058, "learning_rate": 1.0209531217803119e-06, "logits/chosen": -0.06640625, "logits/rejected": 0.408203125, "logps/chosen": -406.0, "logps/rejected": -456.0, "loss": 0.1381, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -13.9375, "rewards/margins": 7.6875, "rewards/rejected": -21.625, "step": 6140 }, { "epoch": 0.44394715946004476, "grad_norm": 10.872503368112605, "learning_rate": 1.0201227409013412e-06, "logits/chosen": 0.11767578125, "logits/rejected": 0.48828125, "logps/chosen": -432.0, "logps/rejected": -478.0, "loss": 0.1334, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -15.0625, "rewards/margins": 7.375, "rewards/rejected": -22.5, "step": 6150 }, { "epoch": 0.44466902475997977, "grad_norm": 6.776506073446041, "learning_rate": 1.0192943828752509e-06, "logits/chosen": -0.00019550323486328125, "logits/rejected": 0.5390625, "logps/chosen": -452.0, "logps/rejected": -512.0, "loss": 0.1334, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -17.0, "rewards/margins": 8.375, "rewards/rejected": -25.5, "step": 6160 }, { "epoch": 0.44539089005991483, "grad_norm": 12.197776696048779, "learning_rate": 1.0184680395023912e-06, "logits/chosen": 0.028076171875, "logits/rejected": 0.466796875, "logps/chosen": -438.0, "logps/rejected": -466.0, "loss": 0.1447, "rewards/accuracies": 0.9375, "rewards/chosen": -16.0, "rewards/margins": 7.0625, "rewards/rejected": -23.0, "step": 6170 }, { "epoch": 0.44611275535984984, "grad_norm": 5.490268724670604, "learning_rate": 1.0176437026295688e-06, "logits/chosen": 0.032958984375, "logits/rejected": 0.462890625, "logps/chosen": -432.0, "logps/rejected": -476.0, "loss": 0.143, "rewards/accuracies": 0.9375, "rewards/chosen": -14.75, "rewards/margins": 7.40625, "rewards/rejected": -22.125, "step": 6180 }, { "epoch": 0.4468346206597849, "grad_norm": 11.127447626487752, "learning_rate": 1.0168213641497094e-06, "logits/chosen": -0.0751953125, "logits/rejected": 0.353515625, "logps/chosen": -432.0, "logps/rejected": -470.0, "loss": 0.1559, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -14.5625, "rewards/margins": 7.21875, "rewards/rejected": -21.75, "step": 6190 }, { "epoch": 0.4475564859597199, "grad_norm": 9.935561356082172, "learning_rate": 1.016001016001524e-06, "logits/chosen": -0.07666015625, "logits/rejected": 0.45703125, "logps/chosen": -430.0, "logps/rejected": -456.0, "loss": 0.1181, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -14.1875, "rewards/margins": 6.9375, "rewards/rejected": -21.125, "step": 6200 }, { "epoch": 0.44827835125965493, "grad_norm": 10.887255412214364, "learning_rate": 1.0151826501691747e-06, "logits/chosen": -0.142578125, "logits/rejected": 0.302734375, "logps/chosen": -436.0, "logps/rejected": -460.0, "loss": 0.145, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -14.125, "rewards/margins": 6.90625, "rewards/rejected": -21.0, "step": 6210 }, { "epoch": 0.44900021655959, "grad_norm": 9.943102823368944, "learning_rate": 1.0143662586819475e-06, "logits/chosen": -0.0089111328125, "logits/rejected": 0.388671875, "logps/chosen": -448.0, "logps/rejected": -470.0, "loss": 0.1433, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -15.6875, "rewards/margins": 7.28125, "rewards/rejected": -23.0, "step": 6220 }, { "epoch": 0.449722081859525, "grad_norm": 15.369831404486488, "learning_rate": 1.0135518336139257e-06, "logits/chosen": -0.111328125, "logits/rejected": 0.38671875, "logps/chosen": -458.0, "logps/rejected": -490.0, "loss": 0.1477, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -17.5, "rewards/margins": 7.21875, "rewards/rejected": -24.75, "step": 6230 }, { "epoch": 0.45044394715946007, "grad_norm": 9.585074361913176, "learning_rate": 1.0127393670836666e-06, "logits/chosen": -0.09375, "logits/rejected": 0.486328125, "logps/chosen": -446.0, "logps/rejected": -520.0, "loss": 0.1512, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -17.75, "rewards/margins": 8.375, "rewards/rejected": -26.125, "step": 6240 }, { "epoch": 0.4511658124593951, "grad_norm": 9.638125583724857, "learning_rate": 1.0119288512538813e-06, "logits/chosen": 0.177734375, "logits/rejected": 0.53125, "logps/chosen": -470.0, "logps/rejected": -488.0, "loss": 0.1182, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -18.75, "rewards/margins": 6.84375, "rewards/rejected": -25.5, "step": 6250 }, { "epoch": 0.4518876777593301, "grad_norm": 11.236702146354423, "learning_rate": 1.0111202783311173e-06, "logits/chosen": 0.091796875, "logits/rejected": 0.6875, "logps/chosen": -464.0, "logps/rejected": -498.0, "loss": 0.1351, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -18.25, "rewards/margins": 7.96875, "rewards/rejected": -26.125, "step": 6260 }, { "epoch": 0.45260954305926515, "grad_norm": 9.995216692895422, "learning_rate": 1.010313640565443e-06, "logits/chosen": 0.038818359375, "logits/rejected": 0.4453125, "logps/chosen": -444.0, "logps/rejected": -472.0, "loss": 0.1044, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -15.75, "rewards/margins": 7.59375, "rewards/rejected": -23.375, "step": 6270 }, { "epoch": 0.45333140835920016, "grad_norm": 8.997661895984741, "learning_rate": 1.0095089302501373e-06, "logits/chosen": -0.052001953125, "logits/rejected": 0.396484375, "logps/chosen": -462.0, "logps/rejected": -498.0, "loss": 0.1589, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -16.75, "rewards/margins": 8.25, "rewards/rejected": -25.0, "step": 6280 }, { "epoch": 0.4540532736591352, "grad_norm": 10.051307885900684, "learning_rate": 1.0087061397213787e-06, "logits/chosen": 0.022705078125, "logits/rejected": 0.447265625, "logps/chosen": -428.0, "logps/rejected": -464.0, "loss": 0.152, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -15.375, "rewards/margins": 7.71875, "rewards/rejected": -23.125, "step": 6290 }, { "epoch": 0.45477513895907024, "grad_norm": 9.59342742311766, "learning_rate": 1.007905261357939e-06, "logits/chosen": -0.0166015625, "logits/rejected": 0.365234375, "logps/chosen": -422.0, "logps/rejected": -464.0, "loss": 0.1158, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -14.5, "rewards/margins": 7.375, "rewards/rejected": -21.875, "step": 6300 }, { "epoch": 0.45549700425900524, "grad_norm": 13.790719523100297, "learning_rate": 1.0071062875808811e-06, "logits/chosen": -0.07373046875, "logits/rejected": 0.302734375, "logps/chosen": -456.0, "logps/rejected": -492.0, "loss": 0.1342, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -14.6875, "rewards/margins": 7.71875, "rewards/rejected": -22.375, "step": 6310 }, { "epoch": 0.4562188695589403, "grad_norm": 12.357256001133027, "learning_rate": 1.0063092108532552e-06, "logits/chosen": 0.0849609375, "logits/rejected": 0.494140625, "logps/chosen": -428.0, "logps/rejected": -460.0, "loss": 0.1223, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -15.875, "rewards/margins": 7.0625, "rewards/rejected": -23.0, "step": 6320 }, { "epoch": 0.4569407348588753, "grad_norm": 8.772561815900866, "learning_rate": 1.005514023679802e-06, "logits/chosen": 0.034423828125, "logits/rejected": 0.392578125, "logps/chosen": -442.0, "logps/rejected": -506.0, "loss": 0.1424, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -16.875, "rewards/margins": 8.125, "rewards/rejected": -25.0, "step": 6330 }, { "epoch": 0.4576626001588104, "grad_norm": 13.855930347569188, "learning_rate": 1.0047207186066567e-06, "logits/chosen": 0.0927734375, "logits/rejected": 0.4609375, "logps/chosen": -440.0, "logps/rejected": -490.0, "loss": 0.1209, "rewards/accuracies": 0.9375, "rewards/chosen": -17.25, "rewards/margins": 7.25, "rewards/rejected": -24.5, "step": 6340 }, { "epoch": 0.4583844654587454, "grad_norm": 18.15720338084319, "learning_rate": 1.0039292882210538e-06, "logits/chosen": -0.142578125, "logits/rejected": 0.52734375, "logps/chosen": -432.0, "logps/rejected": -474.0, "loss": 0.1533, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -15.5625, "rewards/margins": 8.125, "rewards/rejected": -23.625, "step": 6350 }, { "epoch": 0.4591063307586804, "grad_norm": 11.641819865493076, "learning_rate": 1.0031397251510382e-06, "logits/chosen": -0.07958984375, "logits/rejected": 0.353515625, "logps/chosen": -450.0, "logps/rejected": -488.0, "loss": 0.1191, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -16.125, "rewards/margins": 7.25, "rewards/rejected": -23.375, "step": 6360 }, { "epoch": 0.45982819605861547, "grad_norm": 9.484875589842503, "learning_rate": 1.0023520220651762e-06, "logits/chosen": 0.047119140625, "logits/rejected": 0.51953125, "logps/chosen": -448.0, "logps/rejected": -494.0, "loss": 0.1145, "rewards/accuracies": 0.96875, "rewards/chosen": -15.875, "rewards/margins": 7.8125, "rewards/rejected": -23.625, "step": 6370 }, { "epoch": 0.4605500613585505, "grad_norm": 15.110387267750944, "learning_rate": 1.0015661716722687e-06, "logits/chosen": -0.0244140625, "logits/rejected": 0.490234375, "logps/chosen": -450.0, "logps/rejected": -494.0, "loss": 0.1282, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -15.9375, "rewards/margins": 8.3125, "rewards/rejected": -24.25, "step": 6380 }, { "epoch": 0.46127192665848554, "grad_norm": 9.923390773660724, "learning_rate": 1.0007821667210687e-06, "logits/chosen": 0.010498046875, "logits/rejected": 0.34765625, "logps/chosen": -416.0, "logps/rejected": -458.0, "loss": 0.1141, "rewards/accuracies": 0.96875, "rewards/chosen": -13.375, "rewards/margins": 7.4375, "rewards/rejected": -20.75, "step": 6390 }, { "epoch": 0.46199379195842055, "grad_norm": 11.683463639189517, "learning_rate": 1e-06, "logits/chosen": -0.03662109375, "logits/rejected": 0.4375, "logps/chosen": -450.0, "logps/rejected": -464.0, "loss": 0.1476, "rewards/accuracies": 0.9375, "rewards/chosen": -14.125, "rewards/margins": 7.375, "rewards/rejected": -21.5, "step": 6400 }, { "epoch": 0.4627156572583556, "grad_norm": 6.717587487400935, "learning_rate": 9.992196643368784e-07, "logits/chosen": 0.057861328125, "logits/rejected": 0.484375, "logps/chosen": -424.0, "logps/rejected": -444.0, "loss": 0.1319, "rewards/accuracies": 0.96875, "rewards/chosen": -14.6875, "rewards/margins": 6.8125, "rewards/rejected": -21.5, "step": 6410 }, { "epoch": 0.4634375225582906, "grad_norm": 6.686766410857775, "learning_rate": 9.984411525986355e-07, "logits/chosen": -0.09619140625, "logits/rejected": 0.498046875, "logps/chosen": -452.0, "logps/rejected": -470.0, "loss": 0.1265, "rewards/accuracies": 0.96875, "rewards/chosen": -14.25, "rewards/margins": 8.25, "rewards/rejected": -22.5, "step": 6420 }, { "epoch": 0.46415938785822564, "grad_norm": 11.041618803733178, "learning_rate": 9.97664457691046e-07, "logits/chosen": 0.0615234375, "logits/rejected": 0.51953125, "logps/chosen": -406.0, "logps/rejected": -452.0, "loss": 0.1334, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -13.9375, "rewards/margins": 7.3125, "rewards/rejected": -21.25, "step": 6430 }, { "epoch": 0.4648812531581607, "grad_norm": 12.331643225124495, "learning_rate": 9.968895725584535e-07, "logits/chosen": 0.09228515625, "logits/rejected": 0.392578125, "logps/chosen": -442.0, "logps/rejected": -460.0, "loss": 0.1435, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -14.375, "rewards/margins": 7.3125, "rewards/rejected": -21.625, "step": 6440 }, { "epoch": 0.4656031184580957, "grad_norm": 10.688476283118797, "learning_rate": 9.961164901835046e-07, "logits/chosen": 0.119140625, "logits/rejected": 0.45703125, "logps/chosen": -428.0, "logps/rejected": -464.0, "loss": 0.1204, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -14.5625, "rewards/margins": 7.4375, "rewards/rejected": -22.0, "step": 6450 }, { "epoch": 0.4663249837580308, "grad_norm": 9.359629746360646, "learning_rate": 9.95345203586879e-07, "logits/chosen": 0.0208740234375, "logits/rejected": 0.455078125, "logps/chosen": -434.0, "logps/rejected": -472.0, "loss": 0.1249, "rewards/accuracies": 0.96875, "rewards/chosen": -15.875, "rewards/margins": 7.15625, "rewards/rejected": -23.0, "step": 6460 }, { "epoch": 0.4670468490579658, "grad_norm": 12.366597066373776, "learning_rate": 9.94575705827027e-07, "logits/chosen": -0.1259765625, "logits/rejected": 0.41015625, "logps/chosen": -462.0, "logps/rejected": -476.0, "loss": 0.1193, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -15.8125, "rewards/margins": 8.0625, "rewards/rejected": -23.875, "step": 6470 }, { "epoch": 0.4677687143579008, "grad_norm": 6.153616485994291, "learning_rate": 9.938079899999065e-07, "logits/chosen": 0.07763671875, "logits/rejected": 0.47265625, "logps/chosen": -442.0, "logps/rejected": -480.0, "loss": 0.1224, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -16.25, "rewards/margins": 8.0625, "rewards/rejected": -24.375, "step": 6480 }, { "epoch": 0.46849057965783586, "grad_norm": 15.290010771367905, "learning_rate": 9.930420492387219e-07, "logits/chosen": -0.0419921875, "logits/rejected": 0.478515625, "logps/chosen": -446.0, "logps/rejected": -468.0, "loss": 0.1296, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -16.75, "rewards/margins": 7.53125, "rewards/rejected": -24.25, "step": 6490 }, { "epoch": 0.46921244495777087, "grad_norm": 7.911491932232342, "learning_rate": 9.922778767136676e-07, "logits/chosen": 0.10400390625, "logits/rejected": 0.41015625, "logps/chosen": -444.0, "logps/rejected": -468.0, "loss": 0.117, "rewards/accuracies": 0.96875, "rewards/chosen": -15.625, "rewards/margins": 7.5625, "rewards/rejected": -23.25, "step": 6500 }, { "epoch": 0.46993431025770593, "grad_norm": 8.325424834951, "learning_rate": 9.915154656316713e-07, "logits/chosen": -0.0194091796875, "logits/rejected": 0.427734375, "logps/chosen": -410.0, "logps/rejected": -456.0, "loss": 0.1319, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -13.8125, "rewards/margins": 7.625, "rewards/rejected": -21.5, "step": 6510 }, { "epoch": 0.47065617555764094, "grad_norm": 13.016307095764912, "learning_rate": 9.907548092361398e-07, "logits/chosen": -0.054931640625, "logits/rejected": 0.41015625, "logps/chosen": -420.0, "logps/rejected": -460.0, "loss": 0.1376, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -14.0625, "rewards/margins": 7.5625, "rewards/rejected": -21.625, "step": 6520 }, { "epoch": 0.47137804085757595, "grad_norm": 10.757467975326454, "learning_rate": 9.899959008067097e-07, "logits/chosen": -0.03857421875, "logits/rejected": 0.310546875, "logps/chosen": -438.0, "logps/rejected": -502.0, "loss": 0.1472, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -15.6875, "rewards/margins": 8.375, "rewards/rejected": -24.0, "step": 6530 }, { "epoch": 0.472099906157511, "grad_norm": 11.289257514256704, "learning_rate": 9.892387336589959e-07, "logits/chosen": 0.0262451171875, "logits/rejected": 0.455078125, "logps/chosen": -400.0, "logps/rejected": -468.0, "loss": 0.1452, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -13.9375, "rewards/margins": 7.6875, "rewards/rejected": -21.625, "step": 6540 }, { "epoch": 0.472821771457446, "grad_norm": 14.490863175175916, "learning_rate": 9.884833011443446e-07, "logits/chosen": -0.0026397705078125, "logits/rejected": 0.33984375, "logps/chosen": -438.0, "logps/rejected": -486.0, "loss": 0.1074, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -15.5, "rewards/margins": 8.6875, "rewards/rejected": -24.125, "step": 6550 }, { "epoch": 0.4735436367573811, "grad_norm": 10.237284554154284, "learning_rate": 9.877295966495897e-07, "logits/chosen": -0.006591796875, "logits/rejected": 0.43359375, "logps/chosen": -440.0, "logps/rejected": -492.0, "loss": 0.1162, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -16.125, "rewards/margins": 7.59375, "rewards/rejected": -23.75, "step": 6560 }, { "epoch": 0.4742655020573161, "grad_norm": 8.117055546727196, "learning_rate": 9.86977613596807e-07, "logits/chosen": 0.003875732421875, "logits/rejected": 0.376953125, "logps/chosen": -428.0, "logps/rejected": -460.0, "loss": 0.1133, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -14.6875, "rewards/margins": 7.46875, "rewards/rejected": -22.125, "step": 6570 }, { "epoch": 0.4749873673572511, "grad_norm": 7.791301781561814, "learning_rate": 9.862273454430757e-07, "logits/chosen": -0.041748046875, "logits/rejected": 0.44921875, "logps/chosen": -444.0, "logps/rejected": -482.0, "loss": 0.1197, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -14.9375, "rewards/margins": 8.0, "rewards/rejected": -23.0, "step": 6580 }, { "epoch": 0.4757092326571862, "grad_norm": 8.789911263427769, "learning_rate": 9.85478785680238e-07, "logits/chosen": 0.1650390625, "logits/rejected": 0.51171875, "logps/chosen": -446.0, "logps/rejected": -482.0, "loss": 0.1288, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -15.875, "rewards/margins": 8.125, "rewards/rejected": -24.0, "step": 6590 }, { "epoch": 0.4764310979571212, "grad_norm": 12.471122255076226, "learning_rate": 9.847319278346618e-07, "logits/chosen": 0.1201171875, "logits/rejected": 0.546875, "logps/chosen": -466.0, "logps/rejected": -486.0, "loss": 0.1045, "rewards/accuracies": 0.96875, "rewards/chosen": -17.0, "rewards/margins": 7.625, "rewards/rejected": -24.625, "step": 6600 }, { "epoch": 0.47715296325705625, "grad_norm": 6.411206820383281, "learning_rate": 9.839867654670063e-07, "logits/chosen": 0.11767578125, "logits/rejected": 0.4140625, "logps/chosen": -482.0, "logps/rejected": -528.0, "loss": 0.1176, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -18.75, "rewards/margins": 7.84375, "rewards/rejected": -26.625, "step": 6610 }, { "epoch": 0.47787482855699126, "grad_norm": 5.368068524629328, "learning_rate": 9.832432921719876e-07, "logits/chosen": 0.0732421875, "logits/rejected": 0.484375, "logps/chosen": -444.0, "logps/rejected": -494.0, "loss": 0.1072, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -18.5, "rewards/margins": 7.125, "rewards/rejected": -25.625, "step": 6620 }, { "epoch": 0.4785966938569263, "grad_norm": 7.705824825396082, "learning_rate": 9.825015015781493e-07, "logits/chosen": 0.056640625, "logits/rejected": 0.466796875, "logps/chosen": -444.0, "logps/rejected": -516.0, "loss": 0.1477, "rewards/accuracies": 0.9375, "rewards/chosen": -17.875, "rewards/margins": 8.75, "rewards/rejected": -26.625, "step": 6630 }, { "epoch": 0.47931855915686133, "grad_norm": 5.629063797440907, "learning_rate": 9.81761387347632e-07, "logits/chosen": 0.049560546875, "logits/rejected": 0.4375, "logps/chosen": -446.0, "logps/rejected": -508.0, "loss": 0.1009, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -17.625, "rewards/margins": 8.0, "rewards/rejected": -25.625, "step": 6640 }, { "epoch": 0.48004042445679634, "grad_norm": 11.475159249730643, "learning_rate": 9.810229431759452e-07, "logits/chosen": -0.028564453125, "logits/rejected": 0.546875, "logps/chosen": -486.0, "logps/rejected": -500.0, "loss": 0.1202, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -19.125, "rewards/margins": 7.84375, "rewards/rejected": -27.0, "step": 6650 }, { "epoch": 0.4807622897567314, "grad_norm": 9.860154897761035, "learning_rate": 9.802861627917437e-07, "logits/chosen": 0.0751953125, "logits/rejected": 0.5390625, "logps/chosen": -458.0, "logps/rejected": -490.0, "loss": 0.1271, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -18.875, "rewards/margins": 7.3125, "rewards/rejected": -26.125, "step": 6660 }, { "epoch": 0.4814841550566664, "grad_norm": 10.09836931314959, "learning_rate": 9.795510399566016e-07, "logits/chosen": -0.01141357421875, "logits/rejected": 0.498046875, "logps/chosen": -462.0, "logps/rejected": -500.0, "loss": 0.1233, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -17.5, "rewards/margins": 8.1875, "rewards/rejected": -25.625, "step": 6670 }, { "epoch": 0.4822060203566015, "grad_norm": 10.2309936763669, "learning_rate": 9.788175684647926e-07, "logits/chosen": -0.099609375, "logits/rejected": 0.318359375, "logps/chosen": -454.0, "logps/rejected": -504.0, "loss": 0.1056, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -16.75, "rewards/margins": 8.3125, "rewards/rejected": -25.125, "step": 6680 }, { "epoch": 0.4829278856565365, "grad_norm": 6.4320578263711115, "learning_rate": 9.780857421430687e-07, "logits/chosen": -0.10986328125, "logits/rejected": 0.259765625, "logps/chosen": -444.0, "logps/rejected": -472.0, "loss": 0.1251, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -15.3125, "rewards/margins": 7.59375, "rewards/rejected": -22.875, "step": 6690 }, { "epoch": 0.4836497509564715, "grad_norm": 6.232454418026802, "learning_rate": 9.773555548504417e-07, "logits/chosen": 0.04248046875, "logits/rejected": 0.52734375, "logps/chosen": -468.0, "logps/rejected": -494.0, "loss": 0.1326, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -18.0, "rewards/margins": 7.8125, "rewards/rejected": -25.875, "step": 6700 }, { "epoch": 0.48437161625640657, "grad_norm": 9.313378866885854, "learning_rate": 9.76627000477968e-07, "logits/chosen": 0.057373046875, "logits/rejected": 0.515625, "logps/chosen": -442.0, "logps/rejected": -496.0, "loss": 0.1441, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -17.0, "rewards/margins": 7.71875, "rewards/rejected": -24.75, "step": 6710 }, { "epoch": 0.4850934815563416, "grad_norm": 11.539996186473717, "learning_rate": 9.75900072948533e-07, "logits/chosen": 0.0517578125, "logits/rejected": 0.486328125, "logps/chosen": -472.0, "logps/rejected": -528.0, "loss": 0.1452, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -19.625, "rewards/margins": 8.1875, "rewards/rejected": -27.75, "step": 6720 }, { "epoch": 0.48581534685627664, "grad_norm": 10.151949729888942, "learning_rate": 9.751747662166388e-07, "logits/chosen": -0.00019550323486328125, "logits/rejected": 0.5, "logps/chosen": -454.0, "logps/rejected": -496.0, "loss": 0.1288, "rewards/accuracies": 0.96875, "rewards/chosen": -17.75, "rewards/margins": 8.3125, "rewards/rejected": -26.0, "step": 6730 }, { "epoch": 0.48653721215621165, "grad_norm": 9.247492884907441, "learning_rate": 9.744510742681917e-07, "logits/chosen": 0.09130859375, "logits/rejected": 0.408203125, "logps/chosen": -486.0, "logps/rejected": -516.0, "loss": 0.1361, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -18.625, "rewards/margins": 7.5, "rewards/rejected": -26.0, "step": 6740 }, { "epoch": 0.48725907745614666, "grad_norm": 8.394316984940911, "learning_rate": 9.737289911202953e-07, "logits/chosen": -0.008544921875, "logits/rejected": 0.35546875, "logps/chosen": -468.0, "logps/rejected": -520.0, "loss": 0.1184, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -17.5, "rewards/margins": 7.75, "rewards/rejected": -25.25, "step": 6750 }, { "epoch": 0.4879809427560817, "grad_norm": 13.994663402178546, "learning_rate": 9.730085108210398e-07, "logits/chosen": 0.0830078125, "logits/rejected": 0.4765625, "logps/chosen": -436.0, "logps/rejected": -490.0, "loss": 0.1055, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -16.875, "rewards/margins": 7.40625, "rewards/rejected": -24.25, "step": 6760 }, { "epoch": 0.48870280805601674, "grad_norm": 8.198907482567309, "learning_rate": 9.72289627449298e-07, "logits/chosen": -0.06884765625, "logits/rejected": 0.43359375, "logps/chosen": -448.0, "logps/rejected": -482.0, "loss": 0.1087, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -17.125, "rewards/margins": 7.96875, "rewards/rejected": -25.125, "step": 6770 }, { "epoch": 0.4894246733559518, "grad_norm": 8.890173383516705, "learning_rate": 9.715723351145206e-07, "logits/chosen": 0.004913330078125, "logits/rejected": 0.53515625, "logps/chosen": -432.0, "logps/rejected": -494.0, "loss": 0.1078, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -17.375, "rewards/margins": 8.25, "rewards/rejected": -25.625, "step": 6780 }, { "epoch": 0.4901465386558868, "grad_norm": 8.031856642652002, "learning_rate": 9.70856627956532e-07, "logits/chosen": -0.078125, "logits/rejected": 0.443359375, "logps/chosen": -494.0, "logps/rejected": -528.0, "loss": 0.1181, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -18.625, "rewards/margins": 7.6875, "rewards/rejected": -26.25, "step": 6790 }, { "epoch": 0.4908684039558218, "grad_norm": 10.99222698573809, "learning_rate": 9.701425001453318e-07, "logits/chosen": -0.0294189453125, "logits/rejected": 0.453125, "logps/chosen": -446.0, "logps/rejected": -482.0, "loss": 0.1114, "rewards/accuracies": 0.96875, "rewards/chosen": -18.0, "rewards/margins": 7.5, "rewards/rejected": -25.375, "step": 6800 }, { "epoch": 0.4915902692557569, "grad_norm": 6.865174186647278, "learning_rate": 9.694299458808932e-07, "logits/chosen": -0.060302734375, "logits/rejected": 0.33984375, "logps/chosen": -454.0, "logps/rejected": -494.0, "loss": 0.0943, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -17.5, "rewards/margins": 7.875, "rewards/rejected": -25.375, "step": 6810 }, { "epoch": 0.4923121345556919, "grad_norm": 12.076632310836107, "learning_rate": 9.687189593929655e-07, "logits/chosen": 0.0277099609375, "logits/rejected": 0.52734375, "logps/chosen": -428.0, "logps/rejected": -492.0, "loss": 0.1372, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -16.75, "rewards/margins": 8.0, "rewards/rejected": -24.875, "step": 6820 }, { "epoch": 0.49303399985562696, "grad_norm": 6.869606554030821, "learning_rate": 9.680095349408789e-07, "logits/chosen": -0.04931640625, "logits/rejected": 0.408203125, "logps/chosen": -450.0, "logps/rejected": -490.0, "loss": 0.1189, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -16.625, "rewards/margins": 8.125, "rewards/rejected": -24.75, "step": 6830 }, { "epoch": 0.49375586515556197, "grad_norm": 11.703688546617208, "learning_rate": 9.673016668133487e-07, "logits/chosen": 0.1259765625, "logits/rejected": 0.5390625, "logps/chosen": -458.0, "logps/rejected": -498.0, "loss": 0.123, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -17.75, "rewards/margins": 7.84375, "rewards/rejected": -25.625, "step": 6840 }, { "epoch": 0.494477730455497, "grad_norm": 13.141188491672171, "learning_rate": 9.66595349328283e-07, "logits/chosen": 0.0179443359375, "logits/rejected": 0.470703125, "logps/chosen": -456.0, "logps/rejected": -512.0, "loss": 0.1182, "rewards/accuracies": 0.96875, "rewards/chosen": -19.0, "rewards/margins": 7.1875, "rewards/rejected": -26.25, "step": 6850 }, { "epoch": 0.49519959575543204, "grad_norm": 7.1467886476748985, "learning_rate": 9.658905768325902e-07, "logits/chosen": 0.22265625, "logits/rejected": 0.54296875, "logps/chosen": -416.0, "logps/rejected": -484.0, "loss": 0.1281, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -17.25, "rewards/margins": 7.34375, "rewards/rejected": -24.625, "step": 6860 }, { "epoch": 0.49592146105536705, "grad_norm": 6.605742562573282, "learning_rate": 9.651873437019902e-07, "logits/chosen": 0.166015625, "logits/rejected": 0.546875, "logps/chosen": -462.0, "logps/rejected": -496.0, "loss": 0.1321, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -18.625, "rewards/margins": 6.875, "rewards/rejected": -25.5, "step": 6870 }, { "epoch": 0.4966433263553021, "grad_norm": 10.898743310971378, "learning_rate": 9.644856443408243e-07, "logits/chosen": 0.08203125, "logits/rejected": 0.54296875, "logps/chosen": -450.0, "logps/rejected": -488.0, "loss": 0.1172, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -17.75, "rewards/margins": 6.71875, "rewards/rejected": -24.5, "step": 6880 }, { "epoch": 0.4973651916552371, "grad_norm": 9.857605361228915, "learning_rate": 9.637854731818697e-07, "logits/chosen": 0.07373046875, "logits/rejected": 0.361328125, "logps/chosen": -438.0, "logps/rejected": -464.0, "loss": 0.1178, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -15.125, "rewards/margins": 6.9375, "rewards/rejected": -22.0, "step": 6890 }, { "epoch": 0.4980870569551722, "grad_norm": 7.74326966777158, "learning_rate": 9.630868246861536e-07, "logits/chosen": -0.053466796875, "logits/rejected": 0.416015625, "logps/chosen": -440.0, "logps/rejected": -474.0, "loss": 0.0971, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -14.8125, "rewards/margins": 7.59375, "rewards/rejected": -22.375, "step": 6900 }, { "epoch": 0.4988089222551072, "grad_norm": 7.471581291183305, "learning_rate": 9.623896933427685e-07, "logits/chosen": 0.01239013671875, "logits/rejected": 0.4453125, "logps/chosen": -466.0, "logps/rejected": -502.0, "loss": 0.1313, "rewards/accuracies": 0.96875, "rewards/chosen": -17.375, "rewards/margins": 7.6875, "rewards/rejected": -25.125, "step": 6910 }, { "epoch": 0.4995307875550422, "grad_norm": 7.179265335692618, "learning_rate": 9.6169407366869e-07, "logits/chosen": 0.1533203125, "logits/rejected": 0.478515625, "logps/chosen": -452.0, "logps/rejected": -502.0, "loss": 0.1291, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -18.125, "rewards/margins": 7.6875, "rewards/rejected": -25.875, "step": 6920 }, { "epoch": 0.5002526528549772, "grad_norm": 9.2206099355183, "learning_rate": 9.609999602085963e-07, "logits/chosen": 0.072265625, "logits/rejected": 0.5234375, "logps/chosen": -450.0, "logps/rejected": -496.0, "loss": 0.1172, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -16.25, "rewards/margins": 7.90625, "rewards/rejected": -24.125, "step": 6930 }, { "epoch": 0.5009745181549123, "grad_norm": 8.579113069738648, "learning_rate": 9.603073475346872e-07, "logits/chosen": -0.06884765625, "logits/rejected": 0.4296875, "logps/chosen": -430.0, "logps/rejected": -458.0, "loss": 0.1081, "rewards/accuracies": 0.96875, "rewards/chosen": -14.75, "rewards/margins": 7.375, "rewards/rejected": -22.125, "step": 6940 }, { "epoch": 0.5016963834548473, "grad_norm": 6.865944313469777, "learning_rate": 9.596162302465074e-07, "logits/chosen": -0.049072265625, "logits/rejected": 0.37890625, "logps/chosen": -428.0, "logps/rejected": -472.0, "loss": 0.1132, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -15.8125, "rewards/margins": 7.21875, "rewards/rejected": -23.0, "step": 6950 }, { "epoch": 0.5024182487547824, "grad_norm": 9.514615255888284, "learning_rate": 9.589266029707683e-07, "logits/chosen": -0.0869140625, "logits/rejected": 0.478515625, "logps/chosen": -410.0, "logps/rejected": -446.0, "loss": 0.1242, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -13.25, "rewards/margins": 7.5625, "rewards/rejected": -20.875, "step": 6960 }, { "epoch": 0.5031401140547174, "grad_norm": 6.402344590811435, "learning_rate": 9.582384603611731e-07, "logits/chosen": -0.06884765625, "logits/rejected": 0.330078125, "logps/chosen": -442.0, "logps/rejected": -490.0, "loss": 0.1223, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -14.125, "rewards/margins": 7.5625, "rewards/rejected": -21.75, "step": 6970 }, { "epoch": 0.5038619793546524, "grad_norm": 8.71573856350605, "learning_rate": 9.575517970982428e-07, "logits/chosen": -0.054443359375, "logits/rejected": 0.41015625, "logps/chosen": -424.0, "logps/rejected": -468.0, "loss": 0.1423, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -14.875, "rewards/margins": 7.34375, "rewards/rejected": -22.25, "step": 6980 }, { "epoch": 0.5045838446545875, "grad_norm": 10.388133643513887, "learning_rate": 9.568666078891436e-07, "logits/chosen": 0.03515625, "logits/rejected": 0.365234375, "logps/chosen": -440.0, "logps/rejected": -484.0, "loss": 0.1448, "rewards/accuracies": 0.9375, "rewards/chosen": -15.875, "rewards/margins": 7.375, "rewards/rejected": -23.25, "step": 6990 }, { "epoch": 0.5053057099545225, "grad_norm": 9.592409317643273, "learning_rate": 9.561828874675149e-07, "logits/chosen": -0.057861328125, "logits/rejected": 0.4296875, "logps/chosen": -410.0, "logps/rejected": -470.0, "loss": 0.1063, "rewards/accuracies": 0.9375, "rewards/chosen": -14.625, "rewards/margins": 7.1875, "rewards/rejected": -21.875, "step": 7000 }, { "epoch": 0.5060275752544575, "grad_norm": 8.445176037313836, "learning_rate": 9.555006305933e-07, "logits/chosen": 0.0703125, "logits/rejected": 0.404296875, "logps/chosen": -456.0, "logps/rejected": -476.0, "loss": 0.1205, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -15.5, "rewards/margins": 7.03125, "rewards/rejected": -22.5, "step": 7010 }, { "epoch": 0.5067494405543925, "grad_norm": 10.348612100884166, "learning_rate": 9.548198320525771e-07, "logits/chosen": -0.216796875, "logits/rejected": 0.478515625, "logps/chosen": -434.0, "logps/rejected": -480.0, "loss": 0.1061, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -17.125, "rewards/margins": 7.8125, "rewards/rejected": -25.0, "step": 7020 }, { "epoch": 0.5074713058543275, "grad_norm": 10.029088016721456, "learning_rate": 9.54140486657392e-07, "logits/chosen": 0.0179443359375, "logits/rejected": 0.384765625, "logps/chosen": -432.0, "logps/rejected": -462.0, "loss": 0.136, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -15.75, "rewards/margins": 6.71875, "rewards/rejected": -22.375, "step": 7030 }, { "epoch": 0.5081931711542627, "grad_norm": 6.843517343838633, "learning_rate": 9.534625892455922e-07, "logits/chosen": -0.2041015625, "logits/rejected": 0.470703125, "logps/chosen": -444.0, "logps/rejected": -476.0, "loss": 0.1057, "rewards/accuracies": 0.96875, "rewards/chosen": -15.5, "rewards/margins": 8.1875, "rewards/rejected": -23.75, "step": 7040 }, { "epoch": 0.5089150364541977, "grad_norm": 9.607610774656777, "learning_rate": 9.527861346806618e-07, "logits/chosen": -0.00185394287109375, "logits/rejected": 0.515625, "logps/chosen": -450.0, "logps/rejected": -472.0, "loss": 0.1093, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -16.875, "rewards/margins": 7.59375, "rewards/rejected": -24.5, "step": 7050 }, { "epoch": 0.5096369017541327, "grad_norm": 9.66981412779997, "learning_rate": 9.521111178515582e-07, "logits/chosen": -0.00732421875, "logits/rejected": 0.482421875, "logps/chosen": -450.0, "logps/rejected": -506.0, "loss": 0.1165, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -18.125, "rewards/margins": 7.8125, "rewards/rejected": -25.875, "step": 7060 }, { "epoch": 0.5103587670540677, "grad_norm": 16.557708763744856, "learning_rate": 9.514375336725502e-07, "logits/chosen": -0.115234375, "logits/rejected": 0.318359375, "logps/chosen": -448.0, "logps/rejected": -482.0, "loss": 0.1228, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -16.5, "rewards/margins": 7.4375, "rewards/rejected": -23.875, "step": 7070 }, { "epoch": 0.5110806323540027, "grad_norm": 6.343220892720317, "learning_rate": 9.507653770830566e-07, "logits/chosen": -0.1806640625, "logits/rejected": 0.419921875, "logps/chosen": -426.0, "logps/rejected": -462.0, "loss": 0.1176, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -14.8125, "rewards/margins": 7.34375, "rewards/rejected": -22.125, "step": 7080 }, { "epoch": 0.5118024976539378, "grad_norm": 5.856650529357383, "learning_rate": 9.500946430474869e-07, "logits/chosen": 0.058349609375, "logits/rejected": 0.44921875, "logps/chosen": -420.0, "logps/rejected": -452.0, "loss": 0.1329, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -14.375, "rewards/margins": 7.21875, "rewards/rejected": -21.625, "step": 7090 }, { "epoch": 0.5125243629538728, "grad_norm": 10.842706457146976, "learning_rate": 9.494253265550825e-07, "logits/chosen": 0.037353515625, "logits/rejected": 0.46484375, "logps/chosen": -452.0, "logps/rejected": -482.0, "loss": 0.098, "rewards/accuracies": 0.96875, "rewards/chosen": -16.875, "rewards/margins": 7.8125, "rewards/rejected": -24.75, "step": 7100 }, { "epoch": 0.5132462282538078, "grad_norm": 8.555937624645903, "learning_rate": 9.4875742261976e-07, "logits/chosen": 0.0098876953125, "logits/rejected": 0.416015625, "logps/chosen": -466.0, "logps/rejected": -502.0, "loss": 0.11, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -17.0, "rewards/margins": 7.65625, "rewards/rejected": -24.625, "step": 7110 }, { "epoch": 0.5139680935537428, "grad_norm": 10.869245118524521, "learning_rate": 9.480909262799544e-07, "logits/chosen": -0.01080322265625, "logits/rejected": 0.390625, "logps/chosen": -442.0, "logps/rejected": -506.0, "loss": 0.1531, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -15.5, "rewards/margins": 7.875, "rewards/rejected": -23.375, "step": 7120 }, { "epoch": 0.5146899588536779, "grad_norm": 8.829351596268475, "learning_rate": 9.47425832598465e-07, "logits/chosen": -0.08203125, "logits/rejected": 0.4453125, "logps/chosen": -418.0, "logps/rejected": -440.0, "loss": 0.1247, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -14.25, "rewards/margins": 7.53125, "rewards/rejected": -21.75, "step": 7130 }, { "epoch": 0.515411824153613, "grad_norm": 5.283249732839096, "learning_rate": 9.467621366623017e-07, "logits/chosen": 0.03076171875, "logits/rejected": 0.47265625, "logps/chosen": -442.0, "logps/rejected": -494.0, "loss": 0.1356, "rewards/accuracies": 0.96875, "rewards/chosen": -15.625, "rewards/margins": 7.5625, "rewards/rejected": -23.125, "step": 7140 }, { "epoch": 0.516133689453548, "grad_norm": 8.703290723391637, "learning_rate": 9.46099833582532e-07, "logits/chosen": -0.11376953125, "logits/rejected": 0.412109375, "logps/chosen": -408.0, "logps/rejected": -448.0, "loss": 0.1265, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -13.375, "rewards/margins": 7.625, "rewards/rejected": -21.0, "step": 7150 }, { "epoch": 0.516855554753483, "grad_norm": 6.446979128449574, "learning_rate": 9.45438918494131e-07, "logits/chosen": -0.0242919921875, "logits/rejected": 0.484375, "logps/chosen": -436.0, "logps/rejected": -468.0, "loss": 0.1221, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -15.4375, "rewards/margins": 7.84375, "rewards/rejected": -23.25, "step": 7160 }, { "epoch": 0.517577420053418, "grad_norm": 12.96473624373637, "learning_rate": 9.447793865558291e-07, "logits/chosen": -0.010009765625, "logits/rejected": 0.482421875, "logps/chosen": -432.0, "logps/rejected": -482.0, "loss": 0.136, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -15.25, "rewards/margins": 7.96875, "rewards/rejected": -23.25, "step": 7170 }, { "epoch": 0.518299285353353, "grad_norm": 6.71206202082161, "learning_rate": 9.441212329499659e-07, "logits/chosen": 0.08154296875, "logits/rejected": 0.50390625, "logps/chosen": -410.0, "logps/rejected": -464.0, "loss": 0.122, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -15.625, "rewards/margins": 7.625, "rewards/rejected": -23.25, "step": 7180 }, { "epoch": 0.5190211506532881, "grad_norm": 6.2143675621343375, "learning_rate": 9.434644528823399e-07, "logits/chosen": 0.06884765625, "logits/rejected": 0.51171875, "logps/chosen": -424.0, "logps/rejected": -476.0, "loss": 0.1455, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -15.9375, "rewards/margins": 6.8125, "rewards/rejected": -22.75, "step": 7190 }, { "epoch": 0.5197430159532231, "grad_norm": 8.678312682553202, "learning_rate": 9.428090415820634e-07, "logits/chosen": -0.142578125, "logits/rejected": 0.318359375, "logps/chosen": -456.0, "logps/rejected": -466.0, "loss": 0.1298, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -15.0, "rewards/margins": 7.75, "rewards/rejected": -22.75, "step": 7200 }, { "epoch": 0.5204648812531582, "grad_norm": 7.542135251260137, "learning_rate": 9.42154994301416e-07, "logits/chosen": -0.0126953125, "logits/rejected": 0.431640625, "logps/chosen": -430.0, "logps/rejected": -474.0, "loss": 0.1242, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -15.375, "rewards/margins": 7.125, "rewards/rejected": -22.5, "step": 7210 }, { "epoch": 0.5211867465530932, "grad_norm": 13.039175090611629, "learning_rate": 9.415023063157008e-07, "logits/chosen": 0.1279296875, "logits/rejected": 0.435546875, "logps/chosen": -430.0, "logps/rejected": -454.0, "loss": 0.1281, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -15.5, "rewards/margins": 6.40625, "rewards/rejected": -22.0, "step": 7220 }, { "epoch": 0.5219086118530282, "grad_norm": 12.227759639389859, "learning_rate": 9.408509729231009e-07, "logits/chosen": 0.0615234375, "logits/rejected": 0.47265625, "logps/chosen": -452.0, "logps/rejected": -476.0, "loss": 0.1162, "rewards/accuracies": 0.9375, "rewards/chosen": -17.25, "rewards/margins": 7.0, "rewards/rejected": -24.125, "step": 7230 }, { "epoch": 0.5226304771529633, "grad_norm": 10.653229916236183, "learning_rate": 9.402009894445369e-07, "logits/chosen": 0.1201171875, "logits/rejected": 0.462890625, "logps/chosen": -474.0, "logps/rejected": -506.0, "loss": 0.1468, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -16.75, "rewards/margins": 6.625, "rewards/rejected": -23.375, "step": 7240 }, { "epoch": 0.5233523424528983, "grad_norm": 5.440290756586325, "learning_rate": 9.395523512235255e-07, "logits/chosen": 0.08935546875, "logits/rejected": 0.58203125, "logps/chosen": -436.0, "logps/rejected": -482.0, "loss": 0.1161, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -16.25, "rewards/margins": 8.25, "rewards/rejected": -24.5, "step": 7250 }, { "epoch": 0.5240742077528333, "grad_norm": 5.86160695619629, "learning_rate": 9.389050536260404e-07, "logits/chosen": 0.1806640625, "logits/rejected": 0.6484375, "logps/chosen": -452.0, "logps/rejected": -502.0, "loss": 0.128, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -17.5, "rewards/margins": 6.90625, "rewards/rejected": -24.375, "step": 7260 }, { "epoch": 0.5247960730527683, "grad_norm": 8.05669323991899, "learning_rate": 9.382590920403722e-07, "logits/chosen": 0.08544921875, "logits/rejected": 0.59375, "logps/chosen": -440.0, "logps/rejected": -478.0, "loss": 0.1156, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -16.625, "rewards/margins": 7.90625, "rewards/rejected": -24.5, "step": 7270 }, { "epoch": 0.5255179383527034, "grad_norm": 10.263919877896315, "learning_rate": 9.376144618769908e-07, "logits/chosen": 0.2119140625, "logits/rejected": 0.5390625, "logps/chosen": -438.0, "logps/rejected": -478.0, "loss": 0.1491, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -15.6875, "rewards/margins": 7.5625, "rewards/rejected": -23.25, "step": 7280 }, { "epoch": 0.5262398036526384, "grad_norm": 10.854293494168164, "learning_rate": 9.369711585684086e-07, "logits/chosen": 0.2001953125, "logits/rejected": 0.392578125, "logps/chosen": -438.0, "logps/rejected": -494.0, "loss": 0.1226, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -15.8125, "rewards/margins": 7.09375, "rewards/rejected": -22.875, "step": 7290 }, { "epoch": 0.5269616689525735, "grad_norm": 12.473341598416672, "learning_rate": 9.363291775690445e-07, "logits/chosen": 0.1572265625, "logits/rejected": 0.51953125, "logps/chosen": -420.0, "logps/rejected": -442.0, "loss": 0.1034, "rewards/accuracies": 0.96875, "rewards/chosen": -15.3125, "rewards/margins": 7.09375, "rewards/rejected": -22.375, "step": 7300 }, { "epoch": 0.5276835342525085, "grad_norm": 10.54026245776836, "learning_rate": 9.356885143550886e-07, "logits/chosen": 0.10791015625, "logits/rejected": 0.43359375, "logps/chosen": -440.0, "logps/rejected": -504.0, "loss": 0.1009, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -17.0, "rewards/margins": 7.3125, "rewards/rejected": -24.25, "step": 7310 }, { "epoch": 0.5284053995524435, "grad_norm": 8.700567367520138, "learning_rate": 9.350491644243688e-07, "logits/chosen": 0.08154296875, "logits/rejected": 0.51953125, "logps/chosen": -462.0, "logps/rejected": -506.0, "loss": 0.1217, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -18.0, "rewards/margins": 7.40625, "rewards/rejected": -25.375, "step": 7320 }, { "epoch": 0.5291272648523786, "grad_norm": 8.002940865278196, "learning_rate": 9.344111232962179e-07, "logits/chosen": 0.19140625, "logits/rejected": 0.65625, "logps/chosen": -448.0, "logps/rejected": -506.0, "loss": 0.1083, "rewards/accuracies": 0.96875, "rewards/chosen": -17.625, "rewards/margins": 7.71875, "rewards/rejected": -25.375, "step": 7330 }, { "epoch": 0.5298491301523136, "grad_norm": 7.7323669208286265, "learning_rate": 9.337743865113415e-07, "logits/chosen": 0.10546875, "logits/rejected": 0.58203125, "logps/chosen": -452.0, "logps/rejected": -492.0, "loss": 0.1085, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -18.0, "rewards/margins": 8.125, "rewards/rejected": -26.125, "step": 7340 }, { "epoch": 0.5305709954522486, "grad_norm": 10.434151688650106, "learning_rate": 9.331389496316868e-07, "logits/chosen": -0.034912109375, "logits/rejected": 0.404296875, "logps/chosen": -466.0, "logps/rejected": -504.0, "loss": 0.0956, "rewards/accuracies": 0.9375, "rewards/chosen": -17.25, "rewards/margins": 8.6875, "rewards/rejected": -25.875, "step": 7350 }, { "epoch": 0.5312928607521836, "grad_norm": 11.56305422456938, "learning_rate": 9.325048082403138e-07, "logits/chosen": 0.044677734375, "logits/rejected": 0.41796875, "logps/chosen": -444.0, "logps/rejected": -492.0, "loss": 0.125, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -15.875, "rewards/margins": 7.96875, "rewards/rejected": -23.75, "step": 7360 }, { "epoch": 0.5320147260521186, "grad_norm": 5.343016168539854, "learning_rate": 9.318719579412648e-07, "logits/chosen": 0.006103515625, "logits/rejected": 0.51171875, "logps/chosen": -456.0, "logps/rejected": -506.0, "loss": 0.1115, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -18.0, "rewards/margins": 8.125, "rewards/rejected": -26.125, "step": 7370 }, { "epoch": 0.5327365913520538, "grad_norm": 6.206679087969331, "learning_rate": 9.312403943594374e-07, "logits/chosen": 0.142578125, "logits/rejected": 0.453125, "logps/chosen": -444.0, "logps/rejected": -520.0, "loss": 0.1122, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -18.5, "rewards/margins": 7.8125, "rewards/rejected": -26.25, "step": 7380 }, { "epoch": 0.5334584566519888, "grad_norm": 10.783349942935942, "learning_rate": 9.306101131404582e-07, "logits/chosen": 0.1416015625, "logits/rejected": 0.62890625, "logps/chosen": -490.0, "logps/rejected": -520.0, "loss": 0.1377, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -19.75, "rewards/margins": 7.59375, "rewards/rejected": -27.375, "step": 7390 }, { "epoch": 0.5341803219519238, "grad_norm": 8.538628778250464, "learning_rate": 9.299811099505542e-07, "logits/chosen": -0.0096435546875, "logits/rejected": 0.470703125, "logps/chosen": -474.0, "logps/rejected": -502.0, "loss": 0.0992, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -18.5, "rewards/margins": 8.0625, "rewards/rejected": -26.5, "step": 7400 }, { "epoch": 0.5349021872518588, "grad_norm": 10.57459403807199, "learning_rate": 9.293533804764305e-07, "logits/chosen": 0.0947265625, "logits/rejected": 0.439453125, "logps/chosen": -458.0, "logps/rejected": -512.0, "loss": 0.1094, "rewards/accuracies": 0.96875, "rewards/chosen": -18.5, "rewards/margins": 7.96875, "rewards/rejected": -26.5, "step": 7410 }, { "epoch": 0.5356240525517938, "grad_norm": 8.962981633898275, "learning_rate": 9.28726920425144e-07, "logits/chosen": 0.057861328125, "logits/rejected": 0.578125, "logps/chosen": -448.0, "logps/rejected": -508.0, "loss": 0.1069, "rewards/accuracies": 0.96875, "rewards/chosen": -18.125, "rewards/margins": 7.53125, "rewards/rejected": -25.625, "step": 7420 }, { "epoch": 0.5363459178517289, "grad_norm": 9.088209525292106, "learning_rate": 9.281017255239815e-07, "logits/chosen": 0.0203857421875, "logits/rejected": 0.546875, "logps/chosen": -432.0, "logps/rejected": -464.0, "loss": 0.1105, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -15.625, "rewards/margins": 7.84375, "rewards/rejected": -23.5, "step": 7430 }, { "epoch": 0.5370677831516639, "grad_norm": 26.622783477340057, "learning_rate": 9.274777915203365e-07, "logits/chosen": 0.030029296875, "logits/rejected": 0.50390625, "logps/chosen": -440.0, "logps/rejected": -478.0, "loss": 0.1275, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -15.75, "rewards/margins": 7.34375, "rewards/rejected": -23.125, "step": 7440 }, { "epoch": 0.5377896484515989, "grad_norm": 12.751031060292956, "learning_rate": 9.268551141815875e-07, "logits/chosen": 0.1357421875, "logits/rejected": 0.4453125, "logps/chosen": -456.0, "logps/rejected": -496.0, "loss": 0.0941, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -17.125, "rewards/margins": 7.34375, "rewards/rejected": -24.5, "step": 7450 }, { "epoch": 0.5385115137515339, "grad_norm": 10.849880352877529, "learning_rate": 9.262336892949784e-07, "logits/chosen": 0.1962890625, "logits/rejected": 0.5859375, "logps/chosen": -444.0, "logps/rejected": -496.0, "loss": 0.1081, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -16.5, "rewards/margins": 7.9375, "rewards/rejected": -24.375, "step": 7460 }, { "epoch": 0.539233379051469, "grad_norm": 6.68873863711809, "learning_rate": 9.256135126674977e-07, "logits/chosen": 0.08544921875, "logits/rejected": 0.515625, "logps/chosen": -434.0, "logps/rejected": -472.0, "loss": 0.1197, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -16.125, "rewards/margins": 6.8125, "rewards/rejected": -23.0, "step": 7470 }, { "epoch": 0.5399552443514041, "grad_norm": 12.41792722412904, "learning_rate": 9.249945801257605e-07, "logits/chosen": 0.10693359375, "logits/rejected": 0.52734375, "logps/chosen": -442.0, "logps/rejected": -500.0, "loss": 0.1015, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -16.5, "rewards/margins": 8.375, "rewards/rejected": -24.875, "step": 7480 }, { "epoch": 0.5406771096513391, "grad_norm": 9.730122454455731, "learning_rate": 9.243768875158902e-07, "logits/chosen": 0.11767578125, "logits/rejected": 0.4921875, "logps/chosen": -430.0, "logps/rejected": -464.0, "loss": 0.1124, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -16.125, "rewards/margins": 7.5, "rewards/rejected": -23.625, "step": 7490 }, { "epoch": 0.5413989749512741, "grad_norm": 7.52129656646379, "learning_rate": 9.23760430703401e-07, "logits/chosen": 0.06494140625, "logits/rejected": 0.50390625, "logps/chosen": -440.0, "logps/rejected": -494.0, "loss": 0.0963, "rewards/accuracies": 0.96875, "rewards/chosen": -16.5, "rewards/margins": 8.5, "rewards/rejected": -25.0, "step": 7500 }, { "epoch": 0.5421208402512091, "grad_norm": 8.561117958643628, "learning_rate": 9.231452055730832e-07, "logits/chosen": -0.00390625, "logits/rejected": 0.494140625, "logps/chosen": -430.0, "logps/rejected": -474.0, "loss": 0.1023, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -15.375, "rewards/margins": 8.125, "rewards/rejected": -23.5, "step": 7510 }, { "epoch": 0.5428427055511441, "grad_norm": 9.904687666825609, "learning_rate": 9.225312080288851e-07, "logits/chosen": 0.0791015625, "logits/rejected": 0.52734375, "logps/chosen": -464.0, "logps/rejected": -520.0, "loss": 0.1001, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -18.375, "rewards/margins": 8.5625, "rewards/rejected": -27.0, "step": 7520 }, { "epoch": 0.5435645708510792, "grad_norm": 9.624128704781832, "learning_rate": 9.219184339938013e-07, "logits/chosen": 0.1357421875, "logits/rejected": 0.7265625, "logps/chosen": -462.0, "logps/rejected": -502.0, "loss": 0.1252, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -18.75, "rewards/margins": 7.875, "rewards/rejected": -26.625, "step": 7530 }, { "epoch": 0.5442864361510142, "grad_norm": 9.6611978023934, "learning_rate": 9.213068794097574e-07, "logits/chosen": 0.0067138671875, "logits/rejected": 0.5703125, "logps/chosen": -440.0, "logps/rejected": -466.0, "loss": 0.1006, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -17.125, "rewards/margins": 7.09375, "rewards/rejected": -24.125, "step": 7540 }, { "epoch": 0.5450083014509493, "grad_norm": 6.905813328818327, "learning_rate": 9.206965402374975e-07, "logits/chosen": 0.107421875, "logits/rejected": 0.57421875, "logps/chosen": -458.0, "logps/rejected": -516.0, "loss": 0.0878, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -17.375, "rewards/margins": 7.9375, "rewards/rejected": -25.375, "step": 7550 }, { "epoch": 0.5457301667508843, "grad_norm": 8.020061860862722, "learning_rate": 9.200874124564723e-07, "logits/chosen": 0.09130859375, "logits/rejected": 0.5390625, "logps/chosen": -446.0, "logps/rejected": -486.0, "loss": 0.1181, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -16.875, "rewards/margins": 8.3125, "rewards/rejected": -25.25, "step": 7560 }, { "epoch": 0.5464520320508193, "grad_norm": 10.761059628904128, "learning_rate": 9.194794920647274e-07, "logits/chosen": 0.1396484375, "logits/rejected": 0.55859375, "logps/chosen": -444.0, "logps/rejected": -512.0, "loss": 0.1144, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -17.125, "rewards/margins": 8.0625, "rewards/rejected": -25.125, "step": 7570 }, { "epoch": 0.5471738973507544, "grad_norm": 8.789871638025732, "learning_rate": 9.188727750787932e-07, "logits/chosen": 0.1513671875, "logits/rejected": 0.671875, "logps/chosen": -448.0, "logps/rejected": -484.0, "loss": 0.1207, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -17.0, "rewards/margins": 7.90625, "rewards/rejected": -24.875, "step": 7580 }, { "epoch": 0.5478957626506894, "grad_norm": 10.048735421942776, "learning_rate": 9.182672575335757e-07, "logits/chosen": 0.031005859375, "logits/rejected": 0.4609375, "logps/chosen": -464.0, "logps/rejected": -498.0, "loss": 0.1282, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -17.0, "rewards/margins": 7.4375, "rewards/rejected": -24.375, "step": 7590 }, { "epoch": 0.5486176279506244, "grad_norm": 10.691861727559765, "learning_rate": 9.176629354822469e-07, "logits/chosen": 0.06982421875, "logits/rejected": 0.62109375, "logps/chosen": -416.0, "logps/rejected": -462.0, "loss": 0.1146, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -15.1875, "rewards/margins": 8.5, "rewards/rejected": -23.75, "step": 7600 }, { "epoch": 0.5493394932505594, "grad_norm": 6.952775215271427, "learning_rate": 9.170598049961371e-07, "logits/chosen": 0.1640625, "logits/rejected": 0.55078125, "logps/chosen": -438.0, "logps/rejected": -482.0, "loss": 0.1205, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -16.75, "rewards/margins": 7.3125, "rewards/rejected": -24.0, "step": 7610 }, { "epoch": 0.5500613585504944, "grad_norm": 11.24545129225844, "learning_rate": 9.164578621646276e-07, "logits/chosen": 0.0159912109375, "logits/rejected": 0.58984375, "logps/chosen": -428.0, "logps/rejected": -486.0, "loss": 0.0989, "rewards/accuracies": 0.96875, "rewards/chosen": -16.75, "rewards/margins": 8.25, "rewards/rejected": -25.0, "step": 7620 }, { "epoch": 0.5507832238504295, "grad_norm": 9.55093611883023, "learning_rate": 9.15857103095044e-07, "logits/chosen": -0.0286865234375, "logits/rejected": 0.5234375, "logps/chosen": -450.0, "logps/rejected": -484.0, "loss": 0.1104, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -16.625, "rewards/margins": 7.8125, "rewards/rejected": -24.375, "step": 7630 }, { "epoch": 0.5515050891503646, "grad_norm": 8.403431247957226, "learning_rate": 9.15257523912551e-07, "logits/chosen": 0.04248046875, "logits/rejected": 0.490234375, "logps/chosen": -438.0, "logps/rejected": -506.0, "loss": 0.1008, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -16.75, "rewards/margins": 8.5, "rewards/rejected": -25.25, "step": 7640 }, { "epoch": 0.5522269544502996, "grad_norm": 11.298056612782833, "learning_rate": 9.146591207600472e-07, "logits/chosen": 0.08984375, "logits/rejected": 0.5078125, "logps/chosen": -428.0, "logps/rejected": -470.0, "loss": 0.1035, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -15.625, "rewards/margins": 7.0625, "rewards/rejected": -22.75, "step": 7650 }, { "epoch": 0.5529488197502346, "grad_norm": 7.875082800797229, "learning_rate": 9.140618897980601e-07, "logits/chosen": 0.169921875, "logits/rejected": 0.51953125, "logps/chosen": -434.0, "logps/rejected": -480.0, "loss": 0.0949, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -15.625, "rewards/margins": 7.75, "rewards/rejected": -23.375, "step": 7660 }, { "epoch": 0.5536706850501696, "grad_norm": 13.931874986379995, "learning_rate": 9.134658272046442e-07, "logits/chosen": -0.08349609375, "logits/rejected": 0.396484375, "logps/chosen": -454.0, "logps/rejected": -498.0, "loss": 0.1295, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -15.9375, "rewards/margins": 8.5625, "rewards/rejected": -24.5, "step": 7670 }, { "epoch": 0.5543925503501047, "grad_norm": 11.175656289325133, "learning_rate": 9.128709291752768e-07, "logits/chosen": 0.0712890625, "logits/rejected": 0.4765625, "logps/chosen": -440.0, "logps/rejected": -482.0, "loss": 0.1398, "rewards/accuracies": 0.9375, "rewards/chosen": -16.25, "rewards/margins": 7.875, "rewards/rejected": -24.125, "step": 7680 }, { "epoch": 0.5551144156500397, "grad_norm": 17.922575476827948, "learning_rate": 9.122771919227568e-07, "logits/chosen": 0.1435546875, "logits/rejected": 0.5, "logps/chosen": -464.0, "logps/rejected": -508.0, "loss": 0.1125, "rewards/accuracies": 0.96875, "rewards/chosen": -17.625, "rewards/margins": 8.125, "rewards/rejected": -25.75, "step": 7690 }, { "epoch": 0.5558362809499747, "grad_norm": 5.779055513487392, "learning_rate": 9.116846116771035e-07, "logits/chosen": 0.14453125, "logits/rejected": 0.5078125, "logps/chosen": -444.0, "logps/rejected": -492.0, "loss": 0.1186, "rewards/accuracies": 0.96875, "rewards/chosen": -17.375, "rewards/margins": 7.84375, "rewards/rejected": -25.125, "step": 7700 }, { "epoch": 0.5565581462499097, "grad_norm": 12.687110004455661, "learning_rate": 9.110931846854553e-07, "logits/chosen": -0.08447265625, "logits/rejected": 0.34765625, "logps/chosen": -428.0, "logps/rejected": -492.0, "loss": 0.1174, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -15.5625, "rewards/margins": 7.4375, "rewards/rejected": -23.0, "step": 7710 }, { "epoch": 0.5572800115498447, "grad_norm": 3.739105046376622, "learning_rate": 9.105029072119708e-07, "logits/chosen": 0.076171875, "logits/rejected": 0.578125, "logps/chosen": -424.0, "logps/rejected": -464.0, "loss": 0.1066, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -15.3125, "rewards/margins": 7.6875, "rewards/rejected": -23.0, "step": 7720 }, { "epoch": 0.5580018768497799, "grad_norm": 8.065193733565154, "learning_rate": 9.099137755377291e-07, "logits/chosen": -0.080078125, "logits/rejected": 0.474609375, "logps/chosen": -436.0, "logps/rejected": -508.0, "loss": 0.0799, "rewards/accuracies": 0.96875, "rewards/chosen": -15.25, "rewards/margins": 8.3125, "rewards/rejected": -23.5, "step": 7730 }, { "epoch": 0.5587237421497149, "grad_norm": 10.367979112966422, "learning_rate": 9.093257859606311e-07, "logits/chosen": -0.00201416015625, "logits/rejected": 0.50390625, "logps/chosen": -398.0, "logps/rejected": -474.0, "loss": 0.1013, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -14.875, "rewards/margins": 8.0625, "rewards/rejected": -23.0, "step": 7740 }, { "epoch": 0.5594456074496499, "grad_norm": 11.324883885826779, "learning_rate": 9.087389347953037e-07, "logits/chosen": -0.0732421875, "logits/rejected": 0.373046875, "logps/chosen": -450.0, "logps/rejected": -494.0, "loss": 0.1061, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -17.125, "rewards/margins": 7.3125, "rewards/rejected": -24.5, "step": 7750 }, { "epoch": 0.5601674727495849, "grad_norm": 6.234462112907084, "learning_rate": 9.081532183729995e-07, "logits/chosen": 0.046875, "logits/rejected": 0.5078125, "logps/chosen": -446.0, "logps/rejected": -506.0, "loss": 0.1123, "rewards/accuracies": 0.96875, "rewards/chosen": -16.25, "rewards/margins": 8.75, "rewards/rejected": -25.0, "step": 7760 }, { "epoch": 0.56088933804952, "grad_norm": 13.774418810579453, "learning_rate": 9.075686330415037e-07, "logits/chosen": 0.02587890625, "logits/rejected": 0.53125, "logps/chosen": -452.0, "logps/rejected": -480.0, "loss": 0.1151, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -15.8125, "rewards/margins": 8.125, "rewards/rejected": -23.875, "step": 7770 }, { "epoch": 0.561611203349455, "grad_norm": 9.15282124099708, "learning_rate": 9.069851751650364e-07, "logits/chosen": 0.0517578125, "logits/rejected": 0.515625, "logps/chosen": -434.0, "logps/rejected": -488.0, "loss": 0.1179, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -16.75, "rewards/margins": 7.78125, "rewards/rejected": -24.5, "step": 7780 }, { "epoch": 0.56233306864939, "grad_norm": 5.884802503971195, "learning_rate": 9.064028411241582e-07, "logits/chosen": 0.08056640625, "logits/rejected": 0.5078125, "logps/chosen": -452.0, "logps/rejected": -498.0, "loss": 0.1078, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -16.25, "rewards/margins": 8.8125, "rewards/rejected": -25.125, "step": 7790 }, { "epoch": 0.563054933949325, "grad_norm": 9.46668864498544, "learning_rate": 9.058216273156764e-07, "logits/chosen": -0.166015625, "logits/rejected": 0.51171875, "logps/chosen": -464.0, "logps/rejected": -500.0, "loss": 0.1256, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -16.625, "rewards/margins": 7.875, "rewards/rejected": -24.5, "step": 7800 }, { "epoch": 0.56377679924926, "grad_norm": 10.201991312939246, "learning_rate": 9.052415301525511e-07, "logits/chosen": 0.04443359375, "logits/rejected": 0.490234375, "logps/chosen": -440.0, "logps/rejected": -492.0, "loss": 0.1213, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -16.125, "rewards/margins": 7.6875, "rewards/rejected": -23.875, "step": 7810 }, { "epoch": 0.5644986645491952, "grad_norm": 7.796701425713423, "learning_rate": 9.046625460638012e-07, "logits/chosen": 0.01080322265625, "logits/rejected": 0.392578125, "logps/chosen": -458.0, "logps/rejected": -480.0, "loss": 0.1159, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -16.5, "rewards/margins": 6.9375, "rewards/rejected": -23.375, "step": 7820 }, { "epoch": 0.5652205298491302, "grad_norm": 10.937865583369078, "learning_rate": 9.040846714944138e-07, "logits/chosen": 0.0179443359375, "logits/rejected": 0.5234375, "logps/chosen": -434.0, "logps/rejected": -486.0, "loss": 0.1253, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -14.9375, "rewards/margins": 7.6875, "rewards/rejected": -22.625, "step": 7830 }, { "epoch": 0.5659423951490652, "grad_norm": 8.23616971570522, "learning_rate": 9.035079029052513e-07, "logits/chosen": 0.0172119140625, "logits/rejected": 0.54296875, "logps/chosen": -432.0, "logps/rejected": -460.0, "loss": 0.1229, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -15.8125, "rewards/margins": 7.0, "rewards/rejected": -22.875, "step": 7840 }, { "epoch": 0.5666642604490002, "grad_norm": 11.141005738112527, "learning_rate": 9.029322367729605e-07, "logits/chosen": 0.12353515625, "logits/rejected": 0.55078125, "logps/chosen": -456.0, "logps/rejected": -492.0, "loss": 0.1052, "rewards/accuracies": 0.9375, "rewards/chosen": -17.5, "rewards/margins": 7.40625, "rewards/rejected": -24.875, "step": 7850 }, { "epoch": 0.5673861257489352, "grad_norm": 12.150098653657428, "learning_rate": 9.02357669589883e-07, "logits/chosen": -0.0302734375, "logits/rejected": 0.380859375, "logps/chosen": -446.0, "logps/rejected": -496.0, "loss": 0.1438, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -17.75, "rewards/margins": 8.75, "rewards/rejected": -26.5, "step": 7860 }, { "epoch": 0.5681079910488703, "grad_norm": 14.262507771775944, "learning_rate": 9.017841978639643e-07, "logits/chosen": 0.003082275390625, "logits/rejected": 0.56640625, "logps/chosen": -448.0, "logps/rejected": -480.0, "loss": 0.1095, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -16.5, "rewards/margins": 7.84375, "rewards/rejected": -24.375, "step": 7870 }, { "epoch": 0.5688298563488053, "grad_norm": 6.235064119585286, "learning_rate": 9.012118181186658e-07, "logits/chosen": 0.015869140625, "logits/rejected": 0.4609375, "logps/chosen": -450.0, "logps/rejected": -468.0, "loss": 0.1196, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -16.5, "rewards/margins": 7.25, "rewards/rejected": -23.75, "step": 7880 }, { "epoch": 0.5695517216487404, "grad_norm": 11.791077119956995, "learning_rate": 9.00640526892875e-07, "logits/chosen": -0.010986328125, "logits/rejected": 0.29296875, "logps/chosen": -418.0, "logps/rejected": -470.0, "loss": 0.1238, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -15.4375, "rewards/margins": 6.875, "rewards/rejected": -22.25, "step": 7890 }, { "epoch": 0.5702735869486754, "grad_norm": 7.4392039055001025, "learning_rate": 9.000703207408191e-07, "logits/chosen": 0.057861328125, "logits/rejected": 0.4921875, "logps/chosen": -398.0, "logps/rejected": -438.0, "loss": 0.1151, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -13.5625, "rewards/margins": 6.59375, "rewards/rejected": -20.125, "step": 7900 }, { "epoch": 0.5709954522486104, "grad_norm": 11.120085479973381, "learning_rate": 8.995011962319761e-07, "logits/chosen": -0.07861328125, "logits/rejected": 0.28515625, "logps/chosen": -424.0, "logps/rejected": -456.0, "loss": 0.0984, "rewards/accuracies": 0.96875, "rewards/chosen": -13.6875, "rewards/margins": 7.5, "rewards/rejected": -21.25, "step": 7910 }, { "epoch": 0.5717173175485455, "grad_norm": 8.939227672048501, "learning_rate": 8.989331499509894e-07, "logits/chosen": -0.033447265625, "logits/rejected": 0.443359375, "logps/chosen": -400.0, "logps/rejected": -454.0, "loss": 0.107, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -13.4375, "rewards/margins": 7.875, "rewards/rejected": -21.375, "step": 7920 }, { "epoch": 0.5724391828484805, "grad_norm": 10.10834391671841, "learning_rate": 8.983661784975812e-07, "logits/chosen": -0.032958984375, "logits/rejected": 0.3828125, "logps/chosen": -428.0, "logps/rejected": -468.0, "loss": 0.1232, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -14.125, "rewards/margins": 7.625, "rewards/rejected": -21.75, "step": 7930 }, { "epoch": 0.5731610481484155, "grad_norm": 7.856168666978974, "learning_rate": 8.97800278486467e-07, "logits/chosen": 0.06298828125, "logits/rejected": 0.48828125, "logps/chosen": -410.0, "logps/rejected": -454.0, "loss": 0.1143, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -14.0625, "rewards/margins": 7.5625, "rewards/rejected": -21.625, "step": 7940 }, { "epoch": 0.5738829134483505, "grad_norm": 6.041169994609, "learning_rate": 8.972354465472708e-07, "logits/chosen": -0.00469970703125, "logits/rejected": 0.486328125, "logps/chosen": -452.0, "logps/rejected": -476.0, "loss": 0.0946, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -16.0, "rewards/margins": 7.78125, "rewards/rejected": -23.75, "step": 7950 }, { "epoch": 0.5746047787482855, "grad_norm": 7.31690394836129, "learning_rate": 8.966716793244405e-07, "logits/chosen": 0.119140625, "logits/rejected": 0.49609375, "logps/chosen": -472.0, "logps/rejected": -504.0, "loss": 0.0865, "rewards/accuracies": 0.96875, "rewards/chosen": -18.25, "rewards/margins": 7.59375, "rewards/rejected": -25.75, "step": 7960 }, { "epoch": 0.5753266440482206, "grad_norm": 8.92054230752511, "learning_rate": 8.96108973477165e-07, "logits/chosen": 0.138671875, "logits/rejected": 0.458984375, "logps/chosen": -474.0, "logps/rejected": -524.0, "loss": 0.1275, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -19.875, "rewards/margins": 7.9375, "rewards/rejected": -27.75, "step": 7970 }, { "epoch": 0.5760485093481557, "grad_norm": 10.030130094921233, "learning_rate": 8.955473256792899e-07, "logits/chosen": 0.029541015625, "logits/rejected": 0.52734375, "logps/chosen": -490.0, "logps/rejected": -510.0, "loss": 0.1092, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -18.75, "rewards/margins": 8.375, "rewards/rejected": -27.125, "step": 7980 }, { "epoch": 0.5767703746480907, "grad_norm": 11.591131050649457, "learning_rate": 8.949867326192358e-07, "logits/chosen": 0.1103515625, "logits/rejected": 0.49609375, "logps/chosen": -468.0, "logps/rejected": -502.0, "loss": 0.0697, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -18.0, "rewards/margins": 7.59375, "rewards/rejected": -25.625, "step": 7990 }, { "epoch": 0.5774922399480257, "grad_norm": 10.46431068451458, "learning_rate": 8.944271909999158e-07, "logits/chosen": 0.09716796875, "logits/rejected": 0.55078125, "logps/chosen": -452.0, "logps/rejected": -506.0, "loss": 0.1051, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -17.875, "rewards/margins": 8.875, "rewards/rejected": -26.75, "step": 8000 }, { "epoch": 0.5782141052479607, "grad_norm": 5.116786797786776, "learning_rate": 8.938686975386545e-07, "logits/chosen": 0.04150390625, "logits/rejected": 0.5390625, "logps/chosen": -450.0, "logps/rejected": -496.0, "loss": 0.0977, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -17.0, "rewards/margins": 8.625, "rewards/rejected": -25.75, "step": 8010 }, { "epoch": 0.5789359705478958, "grad_norm": 11.802885180999462, "learning_rate": 8.933112489671067e-07, "logits/chosen": 0.08837890625, "logits/rejected": 0.447265625, "logps/chosen": -450.0, "logps/rejected": -484.0, "loss": 0.1333, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -17.0, "rewards/margins": 7.5625, "rewards/rejected": -24.5, "step": 8020 }, { "epoch": 0.5796578358478308, "grad_norm": 10.936627323419225, "learning_rate": 8.927548420311771e-07, "logits/chosen": 0.11083984375, "logits/rejected": 0.380859375, "logps/chosen": -452.0, "logps/rejected": -516.0, "loss": 0.1001, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -17.25, "rewards/margins": 8.4375, "rewards/rejected": -25.625, "step": 8030 }, { "epoch": 0.5803797011477658, "grad_norm": 9.717113877801665, "learning_rate": 8.921994734909409e-07, "logits/chosen": 0.0849609375, "logits/rejected": 0.3828125, "logps/chosen": -434.0, "logps/rejected": -486.0, "loss": 0.1194, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -16.0, "rewards/margins": 7.9375, "rewards/rejected": -24.0, "step": 8040 }, { "epoch": 0.5811015664477008, "grad_norm": 8.312198658583203, "learning_rate": 8.916451401205645e-07, "logits/chosen": 0.046630859375, "logits/rejected": 0.404296875, "logps/chosen": -448.0, "logps/rejected": -480.0, "loss": 0.1407, "rewards/accuracies": 0.90625, "rewards/chosen": -17.0, "rewards/margins": 7.125, "rewards/rejected": -24.125, "step": 8050 }, { "epoch": 0.5818234317476358, "grad_norm": 5.012711241047577, "learning_rate": 8.91091838708226e-07, "logits/chosen": 0.0186767578125, "logits/rejected": 0.4375, "logps/chosen": -430.0, "logps/rejected": -490.0, "loss": 0.0836, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -15.4375, "rewards/margins": 7.96875, "rewards/rejected": -23.375, "step": 8060 }, { "epoch": 0.582545297047571, "grad_norm": 6.123242882528667, "learning_rate": 8.905395660560378e-07, "logits/chosen": 0.1044921875, "logits/rejected": 0.46875, "logps/chosen": -414.0, "logps/rejected": -482.0, "loss": 0.0924, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -16.25, "rewards/margins": 7.75, "rewards/rejected": -23.875, "step": 8070 }, { "epoch": 0.583267162347506, "grad_norm": 4.291050244219149, "learning_rate": 8.899883189799695e-07, "logits/chosen": -0.1640625, "logits/rejected": 0.337890625, "logps/chosen": -478.0, "logps/rejected": -528.0, "loss": 0.0905, "rewards/accuracies": 0.96875, "rewards/chosen": -17.0, "rewards/margins": 8.8125, "rewards/rejected": -25.875, "step": 8080 }, { "epoch": 0.583989027647441, "grad_norm": 9.76113269579368, "learning_rate": 8.894380943097694e-07, "logits/chosen": 0.0140380859375, "logits/rejected": 0.486328125, "logps/chosen": -464.0, "logps/rejected": -504.0, "loss": 0.1329, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -17.125, "rewards/margins": 8.4375, "rewards/rejected": -25.625, "step": 8090 }, { "epoch": 0.584710892947376, "grad_norm": 6.203243530453647, "learning_rate": 8.888888888888888e-07, "logits/chosen": -0.06591796875, "logits/rejected": 0.44921875, "logps/chosen": -442.0, "logps/rejected": -472.0, "loss": 0.1126, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -17.125, "rewards/margins": 8.375, "rewards/rejected": -25.5, "step": 8100 }, { "epoch": 0.585432758247311, "grad_norm": 10.78067582215743, "learning_rate": 8.883406995744061e-07, "logits/chosen": 0.00072479248046875, "logits/rejected": 0.462890625, "logps/chosen": -486.0, "logps/rejected": -520.0, "loss": 0.0871, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -20.5, "rewards/margins": 8.0, "rewards/rejected": -28.5, "step": 8110 }, { "epoch": 0.5861546235472461, "grad_norm": 10.090422377139445, "learning_rate": 8.877935232369506e-07, "logits/chosen": -0.08056640625, "logits/rejected": 0.474609375, "logps/chosen": -482.0, "logps/rejected": -516.0, "loss": 0.099, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -20.0, "rewards/margins": 7.6875, "rewards/rejected": -27.625, "step": 8120 }, { "epoch": 0.5868764888471811, "grad_norm": 9.163828173837649, "learning_rate": 8.872473567606276e-07, "logits/chosen": -0.07421875, "logits/rejected": 0.39453125, "logps/chosen": -458.0, "logps/rejected": -512.0, "loss": 0.0941, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -18.875, "rewards/margins": 8.375, "rewards/rejected": -27.25, "step": 8130 }, { "epoch": 0.5875983541471161, "grad_norm": 11.57500015195801, "learning_rate": 8.867021970429453e-07, "logits/chosen": -0.0419921875, "logits/rejected": 0.4609375, "logps/chosen": -464.0, "logps/rejected": -506.0, "loss": 0.1203, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -18.125, "rewards/margins": 8.375, "rewards/rejected": -26.5, "step": 8140 }, { "epoch": 0.5883202194470512, "grad_norm": 7.163207747760915, "learning_rate": 8.86158040994738e-07, "logits/chosen": -0.11376953125, "logits/rejected": 0.3671875, "logps/chosen": -452.0, "logps/rejected": -506.0, "loss": 0.1049, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -16.75, "rewards/margins": 8.5, "rewards/rejected": -25.25, "step": 8150 }, { "epoch": 0.5890420847469862, "grad_norm": 7.574259987420087, "learning_rate": 8.856148855400954e-07, "logits/chosen": -0.11474609375, "logits/rejected": 0.310546875, "logps/chosen": -454.0, "logps/rejected": -500.0, "loss": 0.1185, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -17.75, "rewards/margins": 7.875, "rewards/rejected": -25.625, "step": 8160 }, { "epoch": 0.5897639500469213, "grad_norm": 9.654139371358802, "learning_rate": 8.850727276162873e-07, "logits/chosen": -0.06396484375, "logits/rejected": 0.453125, "logps/chosen": -458.0, "logps/rejected": -516.0, "loss": 0.0994, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -17.625, "rewards/margins": 10.5, "rewards/rejected": -28.0, "step": 8170 }, { "epoch": 0.5904858153468563, "grad_norm": 10.800157360845885, "learning_rate": 8.845315641736929e-07, "logits/chosen": -0.00994873046875, "logits/rejected": 0.3671875, "logps/chosen": -474.0, "logps/rejected": -500.0, "loss": 0.1139, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -17.0, "rewards/margins": 7.59375, "rewards/rejected": -24.625, "step": 8180 }, { "epoch": 0.5912076806467913, "grad_norm": 11.115517165464615, "learning_rate": 8.839913921757278e-07, "logits/chosen": -0.2490234375, "logits/rejected": 0.310546875, "logps/chosen": -460.0, "logps/rejected": -492.0, "loss": 0.1131, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -17.375, "rewards/margins": 7.4375, "rewards/rejected": -24.875, "step": 8190 }, { "epoch": 0.5919295459467263, "grad_norm": 10.576450266313495, "learning_rate": 8.834522085987722e-07, "logits/chosen": 0.01202392578125, "logits/rejected": 0.44921875, "logps/chosen": -432.0, "logps/rejected": -480.0, "loss": 0.1233, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -16.5, "rewards/margins": 7.59375, "rewards/rejected": -24.0, "step": 8200 }, { "epoch": 0.5926514112466613, "grad_norm": 8.810017307376466, "learning_rate": 8.829140104321008e-07, "logits/chosen": -0.05224609375, "logits/rejected": 0.4375, "logps/chosen": -412.0, "logps/rejected": -496.0, "loss": 0.0943, "rewards/accuracies": 0.96875, "rewards/chosen": -15.0625, "rewards/margins": 7.5, "rewards/rejected": -22.5, "step": 8210 }, { "epoch": 0.5933732765465964, "grad_norm": 10.866593084042634, "learning_rate": 8.82376794677811e-07, "logits/chosen": -0.18359375, "logits/rejected": 0.3125, "logps/chosen": -432.0, "logps/rejected": -490.0, "loss": 0.0948, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -15.5, "rewards/margins": 7.4375, "rewards/rejected": -22.875, "step": 8220 }, { "epoch": 0.5940951418465314, "grad_norm": 4.9381720899811, "learning_rate": 8.818405583507537e-07, "logits/chosen": -0.08056640625, "logits/rejected": 0.34375, "logps/chosen": -434.0, "logps/rejected": -486.0, "loss": 0.0953, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -14.3125, "rewards/margins": 9.0625, "rewards/rejected": -23.375, "step": 8230 }, { "epoch": 0.5948170071464665, "grad_norm": 7.763795530942771, "learning_rate": 8.813052984784634e-07, "logits/chosen": -0.055419921875, "logits/rejected": 0.396484375, "logps/chosen": -420.0, "logps/rejected": -444.0, "loss": 0.1034, "rewards/accuracies": 0.96875, "rewards/chosen": -13.8125, "rewards/margins": 7.875, "rewards/rejected": -21.75, "step": 8240 }, { "epoch": 0.5955388724464015, "grad_norm": 11.47506221118772, "learning_rate": 8.807710121010885e-07, "logits/chosen": -0.2109375, "logits/rejected": 0.34765625, "logps/chosen": -462.0, "logps/rejected": -478.0, "loss": 0.1058, "rewards/accuracies": 0.9375, "rewards/chosen": -14.4375, "rewards/margins": 8.4375, "rewards/rejected": -22.875, "step": 8250 }, { "epoch": 0.5962607377463366, "grad_norm": 7.57421257710474, "learning_rate": 8.802376962713231e-07, "logits/chosen": 0.0732421875, "logits/rejected": 0.458984375, "logps/chosen": -422.0, "logps/rejected": -448.0, "loss": 0.1139, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -14.6875, "rewards/margins": 7.21875, "rewards/rejected": -21.875, "step": 8260 }, { "epoch": 0.5969826030462716, "grad_norm": 9.89817033964841, "learning_rate": 8.797053480543386e-07, "logits/chosen": -0.0216064453125, "logits/rejected": 0.361328125, "logps/chosen": -422.0, "logps/rejected": -490.0, "loss": 0.1287, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -14.3125, "rewards/margins": 8.0625, "rewards/rejected": -22.375, "step": 8270 }, { "epoch": 0.5977044683462066, "grad_norm": 11.888242596971308, "learning_rate": 8.79173964527716e-07, "logits/chosen": 0.026123046875, "logits/rejected": 0.431640625, "logps/chosen": -422.0, "logps/rejected": -470.0, "loss": 0.0982, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -14.5625, "rewards/margins": 7.84375, "rewards/rejected": -22.375, "step": 8280 }, { "epoch": 0.5984263336461416, "grad_norm": 7.747242477502637, "learning_rate": 8.78643542781378e-07, "logits/chosen": -0.1728515625, "logits/rejected": 0.255859375, "logps/chosen": -460.0, "logps/rejected": -494.0, "loss": 0.1277, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -15.4375, "rewards/margins": 7.09375, "rewards/rejected": -22.5, "step": 8290 }, { "epoch": 0.5991481989460766, "grad_norm": 7.767954747512575, "learning_rate": 8.781140799175228e-07, "logits/chosen": -0.0927734375, "logits/rejected": 0.388671875, "logps/chosen": -426.0, "logps/rejected": -468.0, "loss": 0.1296, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -15.1875, "rewards/margins": 8.0, "rewards/rejected": -23.25, "step": 8300 }, { "epoch": 0.5998700642460117, "grad_norm": 5.746152461521208, "learning_rate": 8.775855730505568e-07, "logits/chosen": 0.036865234375, "logits/rejected": 0.380859375, "logps/chosen": -416.0, "logps/rejected": -460.0, "loss": 0.1089, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -15.0625, "rewards/margins": 7.1875, "rewards/rejected": -22.25, "step": 8310 }, { "epoch": 0.6005919295459468, "grad_norm": 9.854437558860582, "learning_rate": 8.770580193070291e-07, "logits/chosen": -0.1748046875, "logits/rejected": 0.3203125, "logps/chosen": -450.0, "logps/rejected": -476.0, "loss": 0.0782, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -15.4375, "rewards/margins": 7.375, "rewards/rejected": -22.875, "step": 8320 }, { "epoch": 0.6013137948458818, "grad_norm": 9.103325654814043, "learning_rate": 8.765314158255661e-07, "logits/chosen": 0.0234375, "logits/rejected": 0.3203125, "logps/chosen": -404.0, "logps/rejected": -480.0, "loss": 0.1109, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -15.3125, "rewards/margins": 8.1875, "rewards/rejected": -23.5, "step": 8330 }, { "epoch": 0.6020356601458168, "grad_norm": 9.10614500855838, "learning_rate": 8.760057597568057e-07, "logits/chosen": -0.1435546875, "logits/rejected": 0.31640625, "logps/chosen": -442.0, "logps/rejected": -504.0, "loss": 0.1027, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -16.25, "rewards/margins": 7.90625, "rewards/rejected": -24.25, "step": 8340 }, { "epoch": 0.6027575254457518, "grad_norm": 10.217009608518193, "learning_rate": 8.754810482633324e-07, "logits/chosen": -0.01177978515625, "logits/rejected": 0.416015625, "logps/chosen": -436.0, "logps/rejected": -488.0, "loss": 0.0973, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -16.125, "rewards/margins": 7.75, "rewards/rejected": -23.875, "step": 8350 }, { "epoch": 0.6034793907456869, "grad_norm": 8.668258891557132, "learning_rate": 8.749572785196142e-07, "logits/chosen": -0.142578125, "logits/rejected": 0.3671875, "logps/chosen": -418.0, "logps/rejected": -476.0, "loss": 0.1079, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -14.75, "rewards/margins": 7.90625, "rewards/rejected": -22.625, "step": 8360 }, { "epoch": 0.6042012560456219, "grad_norm": 6.311806276954964, "learning_rate": 8.744344477119373e-07, "logits/chosen": -0.0003261566162109375, "logits/rejected": 0.5, "logps/chosen": -430.0, "logps/rejected": -470.0, "loss": 0.087, "rewards/accuracies": 0.96875, "rewards/chosen": -15.0, "rewards/margins": 7.78125, "rewards/rejected": -22.75, "step": 8370 }, { "epoch": 0.6049231213455569, "grad_norm": 3.9671752930993027, "learning_rate": 8.739125530383433e-07, "logits/chosen": -0.0169677734375, "logits/rejected": 0.47265625, "logps/chosen": -442.0, "logps/rejected": -476.0, "loss": 0.0978, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -15.625, "rewards/margins": 7.46875, "rewards/rejected": -23.125, "step": 8380 }, { "epoch": 0.6056449866454919, "grad_norm": 6.167579054559879, "learning_rate": 8.733915917085661e-07, "logits/chosen": -0.12060546875, "logits/rejected": 0.3203125, "logps/chosen": -460.0, "logps/rejected": -474.0, "loss": 0.1084, "rewards/accuracies": 0.96875, "rewards/chosen": -16.0, "rewards/margins": 8.125, "rewards/rejected": -24.125, "step": 8390 }, { "epoch": 0.6063668519454269, "grad_norm": 6.5267703335888525, "learning_rate": 8.728715609439695e-07, "logits/chosen": -0.01226806640625, "logits/rejected": 0.416015625, "logps/chosen": -452.0, "logps/rejected": -508.0, "loss": 0.0951, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -18.0, "rewards/margins": 7.46875, "rewards/rejected": -25.375, "step": 8400 }, { "epoch": 0.6070887172453621, "grad_norm": 9.19413823115347, "learning_rate": 8.72352457977484e-07, "logits/chosen": -0.06494140625, "logits/rejected": 0.283203125, "logps/chosen": -456.0, "logps/rejected": -528.0, "loss": 0.1138, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -17.5, "rewards/margins": 7.71875, "rewards/rejected": -25.25, "step": 8410 }, { "epoch": 0.6078105825452971, "grad_norm": 8.560089994084162, "learning_rate": 8.718342800535456e-07, "logits/chosen": -0.05029296875, "logits/rejected": 0.373046875, "logps/chosen": -418.0, "logps/rejected": -470.0, "loss": 0.0903, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -15.3125, "rewards/margins": 8.0, "rewards/rejected": -23.375, "step": 8420 }, { "epoch": 0.6085324478452321, "grad_norm": 7.817764773733883, "learning_rate": 8.713170244280353e-07, "logits/chosen": -0.0634765625, "logits/rejected": 0.32421875, "logps/chosen": -424.0, "logps/rejected": -472.0, "loss": 0.1285, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -14.75, "rewards/margins": 7.375, "rewards/rejected": -22.125, "step": 8430 }, { "epoch": 0.6092543131451671, "grad_norm": 5.707011616813797, "learning_rate": 8.708006883682162e-07, "logits/chosen": -0.15625, "logits/rejected": 0.271484375, "logps/chosen": -436.0, "logps/rejected": -462.0, "loss": 0.1154, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -14.25, "rewards/margins": 7.15625, "rewards/rejected": -21.375, "step": 8440 }, { "epoch": 0.6099761784451021, "grad_norm": 6.524303285007003, "learning_rate": 8.702852691526739e-07, "logits/chosen": -0.0001220703125, "logits/rejected": 0.287109375, "logps/chosen": -446.0, "logps/rejected": -502.0, "loss": 0.0802, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -15.75, "rewards/margins": 7.6875, "rewards/rejected": -23.5, "step": 8450 }, { "epoch": 0.6106980437450372, "grad_norm": 7.316526823284807, "learning_rate": 8.697707640712562e-07, "logits/chosen": -0.126953125, "logits/rejected": 0.375, "logps/chosen": -460.0, "logps/rejected": -494.0, "loss": 0.1175, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -16.625, "rewards/margins": 8.25, "rewards/rejected": -24.875, "step": 8460 }, { "epoch": 0.6114199090449722, "grad_norm": 6.066974432772934, "learning_rate": 8.692571704250135e-07, "logits/chosen": -0.203125, "logits/rejected": 0.314453125, "logps/chosen": -448.0, "logps/rejected": -468.0, "loss": 0.0849, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -16.125, "rewards/margins": 7.6875, "rewards/rejected": -23.875, "step": 8470 }, { "epoch": 0.6121417743449072, "grad_norm": 3.192983487815939, "learning_rate": 8.687444855261388e-07, "logits/chosen": -0.018310546875, "logits/rejected": 0.337890625, "logps/chosen": -432.0, "logps/rejected": -484.0, "loss": 0.0965, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -16.0, "rewards/margins": 8.125, "rewards/rejected": -24.25, "step": 8480 }, { "epoch": 0.6128636396448423, "grad_norm": 7.55395893333132, "learning_rate": 8.682327066979084e-07, "logits/chosen": -0.232421875, "logits/rejected": 0.20703125, "logps/chosen": -426.0, "logps/rejected": -466.0, "loss": 0.1122, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -14.75, "rewards/margins": 7.46875, "rewards/rejected": -22.25, "step": 8490 }, { "epoch": 0.6135855049447773, "grad_norm": 5.110644826243347, "learning_rate": 8.677218312746247e-07, "logits/chosen": -0.041259765625, "logits/rejected": 0.375, "logps/chosen": -438.0, "logps/rejected": -488.0, "loss": 0.1151, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -16.625, "rewards/margins": 7.53125, "rewards/rejected": -24.125, "step": 8500 }, { "epoch": 0.6143073702447124, "grad_norm": 6.730717194017024, "learning_rate": 8.672118566015558e-07, "logits/chosen": -0.09423828125, "logits/rejected": 0.404296875, "logps/chosen": -424.0, "logps/rejected": -490.0, "loss": 0.0823, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -15.0625, "rewards/margins": 8.125, "rewards/rejected": -23.25, "step": 8510 }, { "epoch": 0.6150292355446474, "grad_norm": 9.330139718864622, "learning_rate": 8.667027800348789e-07, "logits/chosen": -0.0341796875, "logits/rejected": 0.33984375, "logps/chosen": -426.0, "logps/rejected": -448.0, "loss": 0.1134, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -14.0625, "rewards/margins": 7.65625, "rewards/rejected": -21.75, "step": 8520 }, { "epoch": 0.6157511008445824, "grad_norm": 5.907081484494198, "learning_rate": 8.661945989416229e-07, "logits/chosen": -0.359375, "logits/rejected": 0.173828125, "logps/chosen": -452.0, "logps/rejected": -478.0, "loss": 0.0991, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -16.125, "rewards/margins": 8.1875, "rewards/rejected": -24.375, "step": 8530 }, { "epoch": 0.6164729661445174, "grad_norm": 5.873711429753785, "learning_rate": 8.6568731069961e-07, "logits/chosen": -0.1904296875, "logits/rejected": 0.2236328125, "logps/chosen": -442.0, "logps/rejected": -492.0, "loss": 0.1114, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -16.0, "rewards/margins": 8.625, "rewards/rejected": -24.625, "step": 8540 }, { "epoch": 0.6171948314444524, "grad_norm": 10.490212964998134, "learning_rate": 8.651809126974002e-07, "logits/chosen": -0.3359375, "logits/rejected": 0.162109375, "logps/chosen": -434.0, "logps/rejected": -486.0, "loss": 0.1062, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -15.25, "rewards/margins": 8.6875, "rewards/rejected": -24.0, "step": 8550 }, { "epoch": 0.6179166967443875, "grad_norm": 6.357426919957944, "learning_rate": 8.646754023342339e-07, "logits/chosen": -0.10546875, "logits/rejected": 0.248046875, "logps/chosen": -432.0, "logps/rejected": -476.0, "loss": 0.0923, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -16.75, "rewards/margins": 8.0625, "rewards/rejected": -24.75, "step": 8560 }, { "epoch": 0.6186385620443225, "grad_norm": 7.322016819311883, "learning_rate": 8.64170777019976e-07, "logits/chosen": -0.21484375, "logits/rejected": 0.1904296875, "logps/chosen": -438.0, "logps/rejected": -486.0, "loss": 0.1121, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -15.5625, "rewards/margins": 7.5625, "rewards/rejected": -23.125, "step": 8570 }, { "epoch": 0.6193604273442576, "grad_norm": 12.083943792955624, "learning_rate": 8.636670341750609e-07, "logits/chosen": -0.01458740234375, "logits/rejected": 0.443359375, "logps/chosen": -412.0, "logps/rejected": -448.0, "loss": 0.135, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -15.4375, "rewards/margins": 7.125, "rewards/rejected": -22.5, "step": 8580 }, { "epoch": 0.6200822926441926, "grad_norm": 8.059610120815014, "learning_rate": 8.631641712304359e-07, "logits/chosen": -0.2431640625, "logits/rejected": 0.28515625, "logps/chosen": -440.0, "logps/rejected": -476.0, "loss": 0.0991, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -16.25, "rewards/margins": 8.0, "rewards/rejected": -24.125, "step": 8590 }, { "epoch": 0.6208041579441276, "grad_norm": 7.795033360877695, "learning_rate": 8.626621856275073e-07, "logits/chosen": -0.1298828125, "logits/rejected": 0.33984375, "logps/chosen": -444.0, "logps/rejected": -502.0, "loss": 0.1233, "rewards/accuracies": 0.96875, "rewards/chosen": -16.875, "rewards/margins": 8.25, "rewards/rejected": -25.125, "step": 8600 }, { "epoch": 0.6215260232440627, "grad_norm": 9.924736892261457, "learning_rate": 8.621610748180847e-07, "logits/chosen": -0.25390625, "logits/rejected": 0.240234375, "logps/chosen": -446.0, "logps/rejected": -492.0, "loss": 0.1011, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -16.875, "rewards/margins": 8.125, "rewards/rejected": -25.0, "step": 8610 }, { "epoch": 0.6222478885439977, "grad_norm": 6.6813557862760575, "learning_rate": 8.616608362643274e-07, "logits/chosen": -0.1630859375, "logits/rejected": 0.24609375, "logps/chosen": -466.0, "logps/rejected": -504.0, "loss": 0.1099, "rewards/accuracies": 0.96875, "rewards/chosen": -17.25, "rewards/margins": 8.4375, "rewards/rejected": -25.625, "step": 8620 }, { "epoch": 0.6229697538439327, "grad_norm": 14.358410550576233, "learning_rate": 8.611614674386904e-07, "logits/chosen": -0.146484375, "logits/rejected": 0.3046875, "logps/chosen": -426.0, "logps/rejected": -482.0, "loss": 0.1035, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -16.125, "rewards/margins": 8.3125, "rewards/rejected": -24.5, "step": 8630 }, { "epoch": 0.6236916191438677, "grad_norm": 8.421075263879258, "learning_rate": 8.606629658238703e-07, "logits/chosen": -0.1875, "logits/rejected": 0.359375, "logps/chosen": -464.0, "logps/rejected": -496.0, "loss": 0.1095, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -17.375, "rewards/margins": 7.75, "rewards/rejected": -25.125, "step": 8640 }, { "epoch": 0.6244134844438027, "grad_norm": 9.984658852522196, "learning_rate": 8.601653289127525e-07, "logits/chosen": -0.17578125, "logits/rejected": 0.24609375, "logps/chosen": -454.0, "logps/rejected": -480.0, "loss": 0.1116, "rewards/accuracies": 0.96875, "rewards/chosen": -16.75, "rewards/margins": 7.8125, "rewards/rejected": -24.625, "step": 8650 }, { "epoch": 0.6251353497437379, "grad_norm": 5.470398267830087, "learning_rate": 8.596685542083577e-07, "logits/chosen": -0.173828125, "logits/rejected": 0.294921875, "logps/chosen": -442.0, "logps/rejected": -494.0, "loss": 0.1089, "rewards/accuracies": 0.9375, "rewards/chosen": -16.625, "rewards/margins": 8.0, "rewards/rejected": -24.625, "step": 8660 }, { "epoch": 0.6258572150436729, "grad_norm": 10.167234836541152, "learning_rate": 8.591726392237899e-07, "logits/chosen": -0.1474609375, "logits/rejected": 0.322265625, "logps/chosen": -458.0, "logps/rejected": -500.0, "loss": 0.1367, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -18.0, "rewards/margins": 8.0, "rewards/rejected": -26.0, "step": 8670 }, { "epoch": 0.6265790803436079, "grad_norm": 6.244982895241238, "learning_rate": 8.586775814821837e-07, "logits/chosen": -0.271484375, "logits/rejected": 0.30859375, "logps/chosen": -484.0, "logps/rejected": -516.0, "loss": 0.1108, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -18.125, "rewards/margins": 8.4375, "rewards/rejected": -26.5, "step": 8680 }, { "epoch": 0.6273009456435429, "grad_norm": 10.682088996879331, "learning_rate": 8.58183378516652e-07, "logits/chosen": 0.00958251953125, "logits/rejected": 0.40234375, "logps/chosen": -446.0, "logps/rejected": -502.0, "loss": 0.1002, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -19.375, "rewards/margins": 7.96875, "rewards/rejected": -27.375, "step": 8690 }, { "epoch": 0.6280228109434779, "grad_norm": 9.067208935234188, "learning_rate": 8.576900278702358e-07, "logits/chosen": -0.0634765625, "logits/rejected": 0.220703125, "logps/chosen": -452.0, "logps/rejected": -516.0, "loss": 0.1027, "rewards/accuracies": 0.96875, "rewards/chosen": -17.875, "rewards/margins": 8.125, "rewards/rejected": -26.0, "step": 8700 }, { "epoch": 0.628744676243413, "grad_norm": 6.306160159475809, "learning_rate": 8.57197527095851e-07, "logits/chosen": -0.17578125, "logits/rejected": 0.2021484375, "logps/chosen": -448.0, "logps/rejected": -508.0, "loss": 0.1106, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -16.875, "rewards/margins": 7.625, "rewards/rejected": -24.5, "step": 8710 }, { "epoch": 0.629466541543348, "grad_norm": 11.55494787016399, "learning_rate": 8.567058737562385e-07, "logits/chosen": -0.25, "logits/rejected": 0.28515625, "logps/chosen": -422.0, "logps/rejected": -460.0, "loss": 0.1005, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -14.9375, "rewards/margins": 7.8125, "rewards/rejected": -22.75, "step": 8720 }, { "epoch": 0.630188406843283, "grad_norm": 14.598555799441899, "learning_rate": 8.562150654239141e-07, "logits/chosen": -0.298828125, "logits/rejected": 0.2177734375, "logps/chosen": -438.0, "logps/rejected": -500.0, "loss": 0.1248, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -15.125, "rewards/margins": 8.125, "rewards/rejected": -23.25, "step": 8730 }, { "epoch": 0.630910272143218, "grad_norm": 8.204732549755766, "learning_rate": 8.55725099681116e-07, "logits/chosen": -0.287109375, "logits/rejected": 0.2373046875, "logps/chosen": -432.0, "logps/rejected": -462.0, "loss": 0.1004, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -14.6875, "rewards/margins": 7.71875, "rewards/rejected": -22.375, "step": 8740 }, { "epoch": 0.631632137443153, "grad_norm": 6.736448338961849, "learning_rate": 8.552359741197579e-07, "logits/chosen": -0.283203125, "logits/rejected": 0.2177734375, "logps/chosen": -400.0, "logps/rejected": -446.0, "loss": 0.0945, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -12.4375, "rewards/margins": 7.84375, "rewards/rejected": -20.25, "step": 8750 }, { "epoch": 0.6323540027430882, "grad_norm": 6.882278578608632, "learning_rate": 8.547476863413765e-07, "logits/chosen": -0.1845703125, "logits/rejected": 0.1357421875, "logps/chosen": -426.0, "logps/rejected": -466.0, "loss": 0.1031, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -13.8125, "rewards/margins": 7.46875, "rewards/rejected": -21.25, "step": 8760 }, { "epoch": 0.6330758680430232, "grad_norm": 13.682294386938674, "learning_rate": 8.54260233957083e-07, "logits/chosen": -0.0625, "logits/rejected": 0.3125, "logps/chosen": -406.0, "logps/rejected": -446.0, "loss": 0.1046, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -13.625, "rewards/margins": 7.40625, "rewards/rejected": -21.0, "step": 8770 }, { "epoch": 0.6337977333429582, "grad_norm": 10.428542347792531, "learning_rate": 8.537736145875154e-07, "logits/chosen": -0.2451171875, "logits/rejected": 0.158203125, "logps/chosen": -440.0, "logps/rejected": -488.0, "loss": 0.1078, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -14.25, "rewards/margins": 7.71875, "rewards/rejected": -22.0, "step": 8780 }, { "epoch": 0.6345195986428932, "grad_norm": 10.863660836153027, "learning_rate": 8.532878258627874e-07, "logits/chosen": -0.185546875, "logits/rejected": 0.212890625, "logps/chosen": -442.0, "logps/rejected": -484.0, "loss": 0.1374, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -15.375, "rewards/margins": 7.65625, "rewards/rejected": -23.0, "step": 8790 }, { "epoch": 0.6352414639428283, "grad_norm": 10.522170499235482, "learning_rate": 8.528028654224416e-07, "logits/chosen": -0.18359375, "logits/rejected": 0.16015625, "logps/chosen": -410.0, "logps/rejected": -450.0, "loss": 0.1, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -14.3125, "rewards/margins": 6.875, "rewards/rejected": -21.25, "step": 8800 }, { "epoch": 0.6359633292427633, "grad_norm": 3.4316152693000093, "learning_rate": 8.523187309154008e-07, "logits/chosen": -0.166015625, "logits/rejected": 0.287109375, "logps/chosen": -420.0, "logps/rejected": -456.0, "loss": 0.0766, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -14.375, "rewards/margins": 7.3125, "rewards/rejected": -21.75, "step": 8810 }, { "epoch": 0.6366851945426983, "grad_norm": 10.414612429715696, "learning_rate": 8.518354199999198e-07, "logits/chosen": -0.10693359375, "logits/rejected": 0.349609375, "logps/chosen": -414.0, "logps/rejected": -472.0, "loss": 0.0983, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -14.875, "rewards/margins": 7.75, "rewards/rejected": -22.625, "step": 8820 }, { "epoch": 0.6374070598426334, "grad_norm": 5.968973308066829, "learning_rate": 8.513529303435386e-07, "logits/chosen": -0.234375, "logits/rejected": 0.267578125, "logps/chosen": -426.0, "logps/rejected": -472.0, "loss": 0.1047, "rewards/accuracies": 0.96875, "rewards/chosen": -15.0625, "rewards/margins": 7.90625, "rewards/rejected": -23.0, "step": 8830 }, { "epoch": 0.6381289251425684, "grad_norm": 12.118103636872336, "learning_rate": 8.50871259623034e-07, "logits/chosen": -0.1611328125, "logits/rejected": 0.259765625, "logps/chosen": -416.0, "logps/rejected": -458.0, "loss": 0.102, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -14.8125, "rewards/margins": 7.8125, "rewards/rejected": -22.625, "step": 8840 }, { "epoch": 0.6388507904425035, "grad_norm": 10.258249929456122, "learning_rate": 8.503904055243742e-07, "logits/chosen": -0.1142578125, "logits/rejected": 0.330078125, "logps/chosen": -396.0, "logps/rejected": -444.0, "loss": 0.1343, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -14.5, "rewards/margins": 6.5625, "rewards/rejected": -21.125, "step": 8850 }, { "epoch": 0.6395726557424385, "grad_norm": 8.15364512707675, "learning_rate": 8.499103657426704e-07, "logits/chosen": -0.140625, "logits/rejected": 0.302734375, "logps/chosen": -436.0, "logps/rejected": -472.0, "loss": 0.1114, "rewards/accuracies": 0.96875, "rewards/chosen": -15.0, "rewards/margins": 7.71875, "rewards/rejected": -22.75, "step": 8860 }, { "epoch": 0.6402945210423735, "grad_norm": 8.026641024333546, "learning_rate": 8.494311379821314e-07, "logits/chosen": -0.23046875, "logits/rejected": 0.283203125, "logps/chosen": -442.0, "logps/rejected": -484.0, "loss": 0.1024, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -15.625, "rewards/margins": 8.125, "rewards/rejected": -23.75, "step": 8870 }, { "epoch": 0.6410163863423085, "grad_norm": 12.16726977146951, "learning_rate": 8.489527199560178e-07, "logits/chosen": -0.11767578125, "logits/rejected": 0.236328125, "logps/chosen": -444.0, "logps/rejected": -510.0, "loss": 0.0757, "rewards/accuracies": 0.96875, "rewards/chosen": -16.125, "rewards/margins": 8.1875, "rewards/rejected": -24.375, "step": 8880 }, { "epoch": 0.6417382516422435, "grad_norm": 9.567969215036921, "learning_rate": 8.484751093865948e-07, "logits/chosen": -0.2734375, "logits/rejected": 0.287109375, "logps/chosen": -500.0, "logps/rejected": -532.0, "loss": 0.11, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -18.5, "rewards/margins": 8.0625, "rewards/rejected": -26.625, "step": 8890 }, { "epoch": 0.6424601169421786, "grad_norm": 8.125837349026241, "learning_rate": 8.47998304005088e-07, "logits/chosen": -0.2578125, "logits/rejected": 0.31640625, "logps/chosen": -452.0, "logps/rejected": -496.0, "loss": 0.1004, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -16.25, "rewards/margins": 8.3125, "rewards/rejected": -24.5, "step": 8900 }, { "epoch": 0.6431819822421136, "grad_norm": 11.08567913254251, "learning_rate": 8.475223015516377e-07, "logits/chosen": -0.23046875, "logits/rejected": 0.2001953125, "logps/chosen": -430.0, "logps/rejected": -486.0, "loss": 0.1192, "rewards/accuracies": 0.96875, "rewards/chosen": -16.0, "rewards/margins": 8.625, "rewards/rejected": -24.625, "step": 8910 }, { "epoch": 0.6439038475420487, "grad_norm": 10.923931389530916, "learning_rate": 8.470470997752534e-07, "logits/chosen": -0.1298828125, "logits/rejected": 0.2236328125, "logps/chosen": -438.0, "logps/rejected": -474.0, "loss": 0.1254, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -14.4375, "rewards/margins": 8.6875, "rewards/rejected": -23.125, "step": 8920 }, { "epoch": 0.6446257128419837, "grad_norm": 8.973425264347869, "learning_rate": 8.465726964337702e-07, "logits/chosen": -0.107421875, "logits/rejected": 0.263671875, "logps/chosen": -458.0, "logps/rejected": -498.0, "loss": 0.095, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -15.875, "rewards/margins": 7.84375, "rewards/rejected": -23.75, "step": 8930 }, { "epoch": 0.6453475781419187, "grad_norm": 6.846661314643279, "learning_rate": 8.460990892938031e-07, "logits/chosen": -0.1162109375, "logits/rejected": 0.302734375, "logps/chosen": -424.0, "logps/rejected": -458.0, "loss": 0.0972, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -15.8125, "rewards/margins": 7.3125, "rewards/rejected": -23.125, "step": 8940 }, { "epoch": 0.6460694434418538, "grad_norm": 11.308594166516675, "learning_rate": 8.456262761307038e-07, "logits/chosen": -0.1162109375, "logits/rejected": 0.251953125, "logps/chosen": -432.0, "logps/rejected": -492.0, "loss": 0.0974, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -16.75, "rewards/margins": 8.0, "rewards/rejected": -24.75, "step": 8950 }, { "epoch": 0.6467913087417888, "grad_norm": 8.818016489154145, "learning_rate": 8.451542547285166e-07, "logits/chosen": -0.1826171875, "logits/rejected": 0.2275390625, "logps/chosen": -468.0, "logps/rejected": -532.0, "loss": 0.1047, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -18.875, "rewards/margins": 8.375, "rewards/rejected": -27.25, "step": 8960 }, { "epoch": 0.6475131740417238, "grad_norm": 8.863375120626564, "learning_rate": 8.44683022879934e-07, "logits/chosen": -0.158203125, "logits/rejected": 0.423828125, "logps/chosen": -458.0, "logps/rejected": -482.0, "loss": 0.1117, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -17.375, "rewards/margins": 8.0, "rewards/rejected": -25.375, "step": 8970 }, { "epoch": 0.6482350393416588, "grad_norm": 8.082402931279965, "learning_rate": 8.442125783862544e-07, "logits/chosen": -0.181640625, "logits/rejected": 0.330078125, "logps/chosen": -444.0, "logps/rejected": -480.0, "loss": 0.0786, "rewards/accuracies": 0.96875, "rewards/chosen": -16.75, "rewards/margins": 7.8125, "rewards/rejected": -24.625, "step": 8980 }, { "epoch": 0.6489569046415938, "grad_norm": 11.96267269520446, "learning_rate": 8.437429190573388e-07, "logits/chosen": -0.1875, "logits/rejected": 0.359375, "logps/chosen": -444.0, "logps/rejected": -478.0, "loss": 0.1292, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -15.875, "rewards/margins": 8.4375, "rewards/rejected": -24.25, "step": 8990 }, { "epoch": 0.649678769941529, "grad_norm": 13.699566598398453, "learning_rate": 8.432740427115678e-07, "logits/chosen": -0.08544921875, "logits/rejected": 0.3125, "logps/chosen": -448.0, "logps/rejected": -504.0, "loss": 0.111, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -16.625, "rewards/margins": 8.0625, "rewards/rejected": -24.75, "step": 9000 }, { "epoch": 0.650400635241464, "grad_norm": 11.793357988917398, "learning_rate": 8.428059471757984e-07, "logits/chosen": -0.150390625, "logits/rejected": 0.2119140625, "logps/chosen": -480.0, "logps/rejected": -520.0, "loss": 0.117, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -18.0, "rewards/margins": 7.5625, "rewards/rejected": -25.625, "step": 9010 }, { "epoch": 0.651122500541399, "grad_norm": 13.264369214444114, "learning_rate": 8.423386302853226e-07, "logits/chosen": -0.044189453125, "logits/rejected": 0.390625, "logps/chosen": -478.0, "logps/rejected": -524.0, "loss": 0.0978, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -18.875, "rewards/margins": 9.0, "rewards/rejected": -27.875, "step": 9020 }, { "epoch": 0.651844365841334, "grad_norm": 12.286637204623283, "learning_rate": 8.418720898838254e-07, "logits/chosen": -0.09521484375, "logits/rejected": 0.4140625, "logps/chosen": -438.0, "logps/rejected": -496.0, "loss": 0.0969, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -17.875, "rewards/margins": 7.9375, "rewards/rejected": -25.75, "step": 9030 }, { "epoch": 0.652566231141269, "grad_norm": 11.437209673963846, "learning_rate": 8.414063238233425e-07, "logits/chosen": -0.1103515625, "logits/rejected": 0.240234375, "logps/chosen": -464.0, "logps/rejected": -528.0, "loss": 0.1062, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -17.375, "rewards/margins": 9.125, "rewards/rejected": -26.5, "step": 9040 }, { "epoch": 0.6532880964412041, "grad_norm": 6.299975682487468, "learning_rate": 8.409413299642188e-07, "logits/chosen": -0.1484375, "logits/rejected": 0.35546875, "logps/chosen": -420.0, "logps/rejected": -480.0, "loss": 0.1165, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -15.3125, "rewards/margins": 8.125, "rewards/rejected": -23.375, "step": 9050 }, { "epoch": 0.6540099617411391, "grad_norm": 10.52710811298542, "learning_rate": 8.404771061750672e-07, "logits/chosen": -0.2177734375, "logits/rejected": 0.3125, "logps/chosen": -418.0, "logps/rejected": -460.0, "loss": 0.1158, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -14.5, "rewards/margins": 8.25, "rewards/rejected": -22.75, "step": 9060 }, { "epoch": 0.6547318270410741, "grad_norm": 7.694682785117149, "learning_rate": 8.400136503327277e-07, "logits/chosen": -0.32421875, "logits/rejected": 0.2392578125, "logps/chosen": -412.0, "logps/rejected": -452.0, "loss": 0.0963, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -14.375, "rewards/margins": 7.9375, "rewards/rejected": -22.375, "step": 9070 }, { "epoch": 0.6554536923410091, "grad_norm": 13.098941583557345, "learning_rate": 8.395509603222271e-07, "logits/chosen": -0.22265625, "logits/rejected": 0.228515625, "logps/chosen": -446.0, "logps/rejected": -492.0, "loss": 0.1024, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -15.9375, "rewards/margins": 7.78125, "rewards/rejected": -23.75, "step": 9080 }, { "epoch": 0.6561755576409442, "grad_norm": 7.894979857159869, "learning_rate": 8.390890340367368e-07, "logits/chosen": -0.0830078125, "logits/rejected": 0.447265625, "logps/chosen": -440.0, "logps/rejected": -474.0, "loss": 0.1115, "rewards/accuracies": 0.96875, "rewards/chosen": -16.875, "rewards/margins": 8.125, "rewards/rejected": -25.0, "step": 9090 }, { "epoch": 0.6568974229408793, "grad_norm": 8.812958633382424, "learning_rate": 8.386278693775346e-07, "logits/chosen": -0.2578125, "logits/rejected": 0.2734375, "logps/chosen": -426.0, "logps/rejected": -464.0, "loss": 0.0892, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -15.4375, "rewards/margins": 8.25, "rewards/rejected": -23.625, "step": 9100 }, { "epoch": 0.6576192882408143, "grad_norm": 8.965412228066146, "learning_rate": 8.381674642539632e-07, "logits/chosen": -0.21875, "logits/rejected": 0.2451171875, "logps/chosen": -444.0, "logps/rejected": -470.0, "loss": 0.114, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -15.5625, "rewards/margins": 8.0625, "rewards/rejected": -23.625, "step": 9110 }, { "epoch": 0.6583411535407493, "grad_norm": 8.770928059308742, "learning_rate": 8.37707816583391e-07, "logits/chosen": -0.298828125, "logits/rejected": 0.3046875, "logps/chosen": -446.0, "logps/rejected": -472.0, "loss": 0.0821, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -15.3125, "rewards/margins": 8.1875, "rewards/rejected": -23.5, "step": 9120 }, { "epoch": 0.6590630188406843, "grad_norm": 13.622198165775531, "learning_rate": 8.372489242911724e-07, "logits/chosen": -0.310546875, "logits/rejected": 0.3046875, "logps/chosen": -410.0, "logps/rejected": -484.0, "loss": 0.1069, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -14.3125, "rewards/margins": 9.1875, "rewards/rejected": -23.5, "step": 9130 }, { "epoch": 0.6597848841406193, "grad_norm": 19.98241474176171, "learning_rate": 8.367907853106078e-07, "logits/chosen": -0.451171875, "logits/rejected": 0.1259765625, "logps/chosen": -420.0, "logps/rejected": -456.0, "loss": 0.1129, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -14.0625, "rewards/margins": 7.75, "rewards/rejected": -21.875, "step": 9140 }, { "epoch": 0.6605067494405544, "grad_norm": 6.67895902887688, "learning_rate": 8.363333975829066e-07, "logits/chosen": -0.322265625, "logits/rejected": 0.1962890625, "logps/chosen": -426.0, "logps/rejected": -468.0, "loss": 0.0961, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -15.0, "rewards/margins": 8.0625, "rewards/rejected": -23.125, "step": 9150 }, { "epoch": 0.6612286147404894, "grad_norm": 12.278968275042093, "learning_rate": 8.358767590571457e-07, "logits/chosen": -0.1611328125, "logits/rejected": 0.2734375, "logps/chosen": -432.0, "logps/rejected": -466.0, "loss": 0.1249, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -15.3125, "rewards/margins": 7.9375, "rewards/rejected": -23.25, "step": 9160 }, { "epoch": 0.6619504800404244, "grad_norm": 8.58788094757818, "learning_rate": 8.354208676902326e-07, "logits/chosen": -0.28125, "logits/rejected": 0.2021484375, "logps/chosen": -450.0, "logps/rejected": -484.0, "loss": 0.0927, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -15.75, "rewards/margins": 8.375, "rewards/rejected": -24.125, "step": 9170 }, { "epoch": 0.6626723453403595, "grad_norm": 6.937227005322704, "learning_rate": 8.349657214468659e-07, "logits/chosen": -0.1513671875, "logits/rejected": 0.232421875, "logps/chosen": -476.0, "logps/rejected": -532.0, "loss": 0.1273, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -17.875, "rewards/margins": 8.6875, "rewards/rejected": -26.5, "step": 9180 }, { "epoch": 0.6633942106402945, "grad_norm": 6.592859285256643, "learning_rate": 8.345113182994988e-07, "logits/chosen": -0.39453125, "logits/rejected": 0.255859375, "logps/chosen": -450.0, "logps/rejected": -492.0, "loss": 0.0923, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -16.375, "rewards/margins": 8.25, "rewards/rejected": -24.625, "step": 9190 }, { "epoch": 0.6641160759402296, "grad_norm": 4.859116034240575, "learning_rate": 8.34057656228299e-07, "logits/chosen": -0.27734375, "logits/rejected": 0.10107421875, "logps/chosen": -462.0, "logps/rejected": -508.0, "loss": 0.0817, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -17.5, "rewards/margins": 8.3125, "rewards/rejected": -25.875, "step": 9200 }, { "epoch": 0.6648379412401646, "grad_norm": 8.29523981800585, "learning_rate": 8.336047332211128e-07, "logits/chosen": -0.296875, "logits/rejected": 0.169921875, "logps/chosen": -440.0, "logps/rejected": -496.0, "loss": 0.085, "rewards/accuracies": 0.9375, "rewards/chosen": -17.375, "rewards/margins": 8.0, "rewards/rejected": -25.375, "step": 9210 }, { "epoch": 0.6655598065400996, "grad_norm": 9.686486046236542, "learning_rate": 8.331525472734267e-07, "logits/chosen": -0.37109375, "logits/rejected": 0.203125, "logps/chosen": -452.0, "logps/rejected": -500.0, "loss": 0.1217, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -15.375, "rewards/margins": 9.1875, "rewards/rejected": -24.5, "step": 9220 }, { "epoch": 0.6662816718400346, "grad_norm": 10.105685032471307, "learning_rate": 8.327010963883302e-07, "logits/chosen": -0.318359375, "logits/rejected": 0.1904296875, "logps/chosen": -448.0, "logps/rejected": -512.0, "loss": 0.0853, "rewards/accuracies": 0.96875, "rewards/chosen": -15.0625, "rewards/margins": 8.3125, "rewards/rejected": -23.375, "step": 9230 }, { "epoch": 0.6670035371399696, "grad_norm": 7.459008962945386, "learning_rate": 8.322503785764789e-07, "logits/chosen": -0.384765625, "logits/rejected": 0.232421875, "logps/chosen": -464.0, "logps/rejected": -516.0, "loss": 0.1132, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -16.875, "rewards/margins": 8.6875, "rewards/rejected": -25.625, "step": 9240 }, { "epoch": 0.6677254024399047, "grad_norm": 21.042056543683028, "learning_rate": 8.318003918560583e-07, "logits/chosen": -0.244140625, "logits/rejected": 0.328125, "logps/chosen": -452.0, "logps/rejected": -496.0, "loss": 0.1026, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -16.625, "rewards/margins": 9.25, "rewards/rejected": -25.875, "step": 9250 }, { "epoch": 0.6684472677398398, "grad_norm": 11.590755819239105, "learning_rate": 8.313511342527453e-07, "logits/chosen": -0.314453125, "logits/rejected": 0.16796875, "logps/chosen": -440.0, "logps/rejected": -490.0, "loss": 0.084, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -15.125, "rewards/margins": 8.8125, "rewards/rejected": -24.0, "step": 9260 }, { "epoch": 0.6691691330397748, "grad_norm": 9.401281034572019, "learning_rate": 8.309026037996745e-07, "logits/chosen": -0.314453125, "logits/rejected": 0.228515625, "logps/chosen": -414.0, "logps/rejected": -462.0, "loss": 0.1046, "rewards/accuracies": 0.96875, "rewards/chosen": -15.0, "rewards/margins": 7.9375, "rewards/rejected": -23.0, "step": 9270 }, { "epoch": 0.6698909983397098, "grad_norm": 6.793765887794199, "learning_rate": 8.304547985373996e-07, "logits/chosen": -0.2734375, "logits/rejected": 0.240234375, "logps/chosen": -440.0, "logps/rejected": -492.0, "loss": 0.0883, "rewards/accuracies": 0.96875, "rewards/chosen": -15.625, "rewards/margins": 8.5625, "rewards/rejected": -24.125, "step": 9280 }, { "epoch": 0.6706128636396449, "grad_norm": 6.890187416367899, "learning_rate": 8.300077165138592e-07, "logits/chosen": -0.33984375, "logits/rejected": 0.26953125, "logps/chosen": -426.0, "logps/rejected": -476.0, "loss": 0.0966, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -15.5625, "rewards/margins": 8.625, "rewards/rejected": -24.125, "step": 9290 }, { "epoch": 0.6713347289395799, "grad_norm": 5.9425348156978925, "learning_rate": 8.295613557843402e-07, "logits/chosen": -0.2197265625, "logits/rejected": 0.27734375, "logps/chosen": -428.0, "logps/rejected": -482.0, "loss": 0.0903, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -15.3125, "rewards/margins": 7.90625, "rewards/rejected": -23.25, "step": 9300 }, { "epoch": 0.6720565942395149, "grad_norm": 10.648816039316303, "learning_rate": 8.291157144114419e-07, "logits/chosen": -0.28515625, "logits/rejected": 0.2421875, "logps/chosen": -438.0, "logps/rejected": -482.0, "loss": 0.1129, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -14.9375, "rewards/margins": 8.125, "rewards/rejected": -23.0, "step": 9310 }, { "epoch": 0.6727784595394499, "grad_norm": 4.899336161587677, "learning_rate": 8.286707904650417e-07, "logits/chosen": -0.33984375, "logits/rejected": 0.1328125, "logps/chosen": -442.0, "logps/rejected": -476.0, "loss": 0.1189, "rewards/accuracies": 0.96875, "rewards/chosen": -15.5, "rewards/margins": 8.0, "rewards/rejected": -23.5, "step": 9320 }, { "epoch": 0.6735003248393849, "grad_norm": 9.337913602405884, "learning_rate": 8.282265820222593e-07, "logits/chosen": -0.1875, "logits/rejected": 0.28125, "logps/chosen": -442.0, "logps/rejected": -480.0, "loss": 0.1021, "rewards/accuracies": 0.96875, "rewards/chosen": -16.625, "rewards/margins": 7.71875, "rewards/rejected": -24.375, "step": 9330 }, { "epoch": 0.67422219013932, "grad_norm": 12.882184582399901, "learning_rate": 8.277830871674222e-07, "logits/chosen": -0.19921875, "logits/rejected": 0.1474609375, "logps/chosen": -444.0, "logps/rejected": -484.0, "loss": 0.1058, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -15.875, "rewards/margins": 8.4375, "rewards/rejected": -24.25, "step": 9340 }, { "epoch": 0.6749440554392551, "grad_norm": 7.6618376877490855, "learning_rate": 8.273403039920306e-07, "logits/chosen": -0.15234375, "logits/rejected": 0.357421875, "logps/chosen": -444.0, "logps/rejected": -476.0, "loss": 0.0867, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -16.625, "rewards/margins": 8.125, "rewards/rejected": -24.75, "step": 9350 }, { "epoch": 0.6756659207391901, "grad_norm": 7.296475607200089, "learning_rate": 8.268982305947231e-07, "logits/chosen": -0.236328125, "logits/rejected": 0.2412109375, "logps/chosen": -462.0, "logps/rejected": -516.0, "loss": 0.0835, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -16.625, "rewards/margins": 9.125, "rewards/rejected": -25.75, "step": 9360 }, { "epoch": 0.6763877860391251, "grad_norm": 11.299955853929967, "learning_rate": 8.264568650812423e-07, "logits/chosen": -0.359375, "logits/rejected": 0.2177734375, "logps/chosen": -446.0, "logps/rejected": -504.0, "loss": 0.1035, "rewards/accuracies": 0.96875, "rewards/chosen": -17.0, "rewards/margins": 8.625, "rewards/rejected": -25.625, "step": 9370 }, { "epoch": 0.6771096513390601, "grad_norm": 10.102639452221426, "learning_rate": 8.26016205564401e-07, "logits/chosen": -0.1640625, "logits/rejected": 0.29296875, "logps/chosen": -440.0, "logps/rejected": -508.0, "loss": 0.0984, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -15.25, "rewards/margins": 9.1875, "rewards/rejected": -24.375, "step": 9380 }, { "epoch": 0.6778315166389952, "grad_norm": 12.724789142640342, "learning_rate": 8.25576250164048e-07, "logits/chosen": -0.251953125, "logits/rejected": 0.291015625, "logps/chosen": -440.0, "logps/rejected": -478.0, "loss": 0.1196, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -14.9375, "rewards/margins": 8.1875, "rewards/rejected": -23.125, "step": 9390 }, { "epoch": 0.6785533819389302, "grad_norm": 7.65558421052079, "learning_rate": 8.251369970070346e-07, "logits/chosen": -0.23828125, "logits/rejected": 0.259765625, "logps/chosen": -412.0, "logps/rejected": -450.0, "loss": 0.1184, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -13.5625, "rewards/margins": 7.84375, "rewards/rejected": -21.375, "step": 9400 }, { "epoch": 0.6792752472388652, "grad_norm": 5.734137126869067, "learning_rate": 8.246984442271813e-07, "logits/chosen": -0.177734375, "logits/rejected": 0.2490234375, "logps/chosen": -418.0, "logps/rejected": -464.0, "loss": 0.1013, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -13.625, "rewards/margins": 7.875, "rewards/rejected": -21.625, "step": 9410 }, { "epoch": 0.6799971125388002, "grad_norm": 9.667622159596144, "learning_rate": 8.242605899652435e-07, "logits/chosen": -0.146484375, "logits/rejected": 0.333984375, "logps/chosen": -420.0, "logps/rejected": -474.0, "loss": 0.0988, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -14.875, "rewards/margins": 8.1875, "rewards/rejected": -23.125, "step": 9420 }, { "epoch": 0.6807189778387353, "grad_norm": 10.529039696290567, "learning_rate": 8.238234323688798e-07, "logits/chosen": -0.349609375, "logits/rejected": 0.236328125, "logps/chosen": -438.0, "logps/rejected": -470.0, "loss": 0.1088, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -15.875, "rewards/margins": 7.75, "rewards/rejected": -23.625, "step": 9430 }, { "epoch": 0.6814408431386704, "grad_norm": 6.888162948128841, "learning_rate": 8.233869695926182e-07, "logits/chosen": -0.296875, "logits/rejected": 0.283203125, "logps/chosen": -434.0, "logps/rejected": -476.0, "loss": 0.1086, "rewards/accuracies": 0.96875, "rewards/chosen": -16.125, "rewards/margins": 8.375, "rewards/rejected": -24.5, "step": 9440 }, { "epoch": 0.6821627084386054, "grad_norm": 11.496177447620928, "learning_rate": 8.229511997978235e-07, "logits/chosen": -0.216796875, "logits/rejected": 0.29296875, "logps/chosen": -454.0, "logps/rejected": -528.0, "loss": 0.0853, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -17.125, "rewards/margins": 8.5, "rewards/rejected": -25.625, "step": 9450 }, { "epoch": 0.6828845737385404, "grad_norm": 8.806829807729667, "learning_rate": 8.22516121152665e-07, "logits/chosen": -0.11474609375, "logits/rejected": 0.380859375, "logps/chosen": -432.0, "logps/rejected": -500.0, "loss": 0.085, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -17.125, "rewards/margins": 7.78125, "rewards/rejected": -25.0, "step": 9460 }, { "epoch": 0.6836064390384754, "grad_norm": 13.977388550253519, "learning_rate": 8.220817318320836e-07, "logits/chosen": -0.171875, "logits/rejected": 0.318359375, "logps/chosen": -440.0, "logps/rejected": -474.0, "loss": 0.1029, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -15.4375, "rewards/margins": 8.125, "rewards/rejected": -23.5, "step": 9470 }, { "epoch": 0.6843283043384104, "grad_norm": 8.796395696628545, "learning_rate": 8.216480300177611e-07, "logits/chosen": -0.12060546875, "logits/rejected": 0.375, "logps/chosen": -454.0, "logps/rejected": -488.0, "loss": 0.089, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -16.5, "rewards/margins": 7.9375, "rewards/rejected": -24.5, "step": 9480 }, { "epoch": 0.6850501696383455, "grad_norm": 9.270921059429318, "learning_rate": 8.212150138980857e-07, "logits/chosen": -0.044677734375, "logits/rejected": 0.330078125, "logps/chosen": -426.0, "logps/rejected": -472.0, "loss": 0.1056, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -16.5, "rewards/margins": 7.71875, "rewards/rejected": -24.125, "step": 9490 }, { "epoch": 0.6857720349382805, "grad_norm": 8.635539824549442, "learning_rate": 8.207826816681233e-07, "logits/chosen": -0.330078125, "logits/rejected": 0.275390625, "logps/chosen": -464.0, "logps/rejected": -508.0, "loss": 0.0972, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -16.25, "rewards/margins": 9.4375, "rewards/rejected": -25.625, "step": 9500 }, { "epoch": 0.6864939002382155, "grad_norm": 6.010495892436107, "learning_rate": 8.203510315295829e-07, "logits/chosen": -0.1259765625, "logits/rejected": 0.3515625, "logps/chosen": -446.0, "logps/rejected": -508.0, "loss": 0.0957, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -17.875, "rewards/margins": 8.9375, "rewards/rejected": -26.75, "step": 9510 }, { "epoch": 0.6872157655381506, "grad_norm": 6.9237735654050345, "learning_rate": 8.199200616907878e-07, "logits/chosen": -0.181640625, "logits/rejected": 0.40625, "logps/chosen": -468.0, "logps/rejected": -504.0, "loss": 0.096, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -17.0, "rewards/margins": 8.75, "rewards/rejected": -25.625, "step": 9520 }, { "epoch": 0.6879376308380856, "grad_norm": 7.451384896742146, "learning_rate": 8.194897703666421e-07, "logits/chosen": -0.0458984375, "logits/rejected": 0.421875, "logps/chosen": -462.0, "logps/rejected": -516.0, "loss": 0.1094, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -18.25, "rewards/margins": 9.125, "rewards/rejected": -27.375, "step": 9530 }, { "epoch": 0.6886594961380207, "grad_norm": 4.671951154113432, "learning_rate": 8.190601557786015e-07, "logits/chosen": -0.224609375, "logits/rejected": 0.353515625, "logps/chosen": -446.0, "logps/rejected": -496.0, "loss": 0.1136, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -16.625, "rewards/margins": 8.5, "rewards/rejected": -25.25, "step": 9540 }, { "epoch": 0.6893813614379557, "grad_norm": 14.52188323993986, "learning_rate": 8.186312161546413e-07, "logits/chosen": -0.224609375, "logits/rejected": 0.33984375, "logps/chosen": -438.0, "logps/rejected": -486.0, "loss": 0.1066, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -16.375, "rewards/margins": 8.5625, "rewards/rejected": -25.0, "step": 9550 }, { "epoch": 0.6901032267378907, "grad_norm": 7.035712175626363, "learning_rate": 8.182029497292262e-07, "logits/chosen": -0.205078125, "logits/rejected": 0.388671875, "logps/chosen": -434.0, "logps/rejected": -502.0, "loss": 0.0849, "rewards/accuracies": 0.96875, "rewards/chosen": -16.125, "rewards/margins": 8.8125, "rewards/rejected": -24.875, "step": 9560 }, { "epoch": 0.6908250920378257, "grad_norm": 10.48954664162727, "learning_rate": 8.177753547432792e-07, "logits/chosen": -0.185546875, "logits/rejected": 0.2451171875, "logps/chosen": -446.0, "logps/rejected": -496.0, "loss": 0.0935, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -15.5625, "rewards/margins": 8.875, "rewards/rejected": -24.5, "step": 9570 }, { "epoch": 0.6915469573377607, "grad_norm": 8.05525388899689, "learning_rate": 8.173484294441524e-07, "logits/chosen": -0.2177734375, "logits/rejected": 0.2314453125, "logps/chosen": -470.0, "logps/rejected": -540.0, "loss": 0.0917, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -18.875, "rewards/margins": 8.5, "rewards/rejected": -27.375, "step": 9580 }, { "epoch": 0.6922688226376958, "grad_norm": 7.368938216264777, "learning_rate": 8.169221720855952e-07, "logits/chosen": -0.162109375, "logits/rejected": 0.322265625, "logps/chosen": -462.0, "logps/rejected": -532.0, "loss": 0.0838, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -17.5, "rewards/margins": 8.4375, "rewards/rejected": -25.875, "step": 9590 }, { "epoch": 0.6929906879376309, "grad_norm": 12.66806323705313, "learning_rate": 8.164965809277261e-07, "logits/chosen": -0.1064453125, "logits/rejected": 0.4140625, "logps/chosen": -444.0, "logps/rejected": -490.0, "loss": 0.1181, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -17.625, "rewards/margins": 8.375, "rewards/rejected": -26.0, "step": 9600 }, { "epoch": 0.6937125532375659, "grad_norm": 9.241222958435916, "learning_rate": 8.160716542370011e-07, "logits/chosen": -0.14453125, "logits/rejected": 0.314453125, "logps/chosen": -472.0, "logps/rejected": -508.0, "loss": 0.0932, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -17.875, "rewards/margins": 8.1875, "rewards/rejected": -26.0, "step": 9610 }, { "epoch": 0.6944344185375009, "grad_norm": 11.858078992147373, "learning_rate": 8.156473902861856e-07, "logits/chosen": -0.25390625, "logits/rejected": 0.255859375, "logps/chosen": -436.0, "logps/rejected": -480.0, "loss": 0.1235, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -15.6875, "rewards/margins": 8.125, "rewards/rejected": -23.75, "step": 9620 }, { "epoch": 0.6951562838374359, "grad_norm": 6.903718269875224, "learning_rate": 8.152237873543241e-07, "logits/chosen": -0.022705078125, "logits/rejected": 0.33203125, "logps/chosen": -446.0, "logps/rejected": -488.0, "loss": 0.1056, "rewards/accuracies": 0.90625, "rewards/chosen": -17.5, "rewards/margins": 7.25, "rewards/rejected": -24.75, "step": 9630 }, { "epoch": 0.695878149137371, "grad_norm": 11.094802867524306, "learning_rate": 8.148008437267104e-07, "logits/chosen": -0.193359375, "logits/rejected": 0.41015625, "logps/chosen": -418.0, "logps/rejected": -472.0, "loss": 0.1131, "rewards/accuracies": 0.9375, "rewards/chosen": -16.5, "rewards/margins": 8.0, "rewards/rejected": -24.5, "step": 9640 }, { "epoch": 0.696600014437306, "grad_norm": 11.473125925368489, "learning_rate": 8.143785576948602e-07, "logits/chosen": -0.283203125, "logits/rejected": 0.28515625, "logps/chosen": -456.0, "logps/rejected": -484.0, "loss": 0.1086, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -16.625, "rewards/margins": 8.0, "rewards/rejected": -24.625, "step": 9650 }, { "epoch": 0.697321879737241, "grad_norm": 7.198376778390441, "learning_rate": 8.139569275564796e-07, "logits/chosen": -0.19921875, "logits/rejected": 0.46484375, "logps/chosen": -464.0, "logps/rejected": -498.0, "loss": 0.1068, "rewards/accuracies": 0.9375, "rewards/chosen": -17.375, "rewards/margins": 7.84375, "rewards/rejected": -25.25, "step": 9660 }, { "epoch": 0.698043745037176, "grad_norm": 11.849627903882368, "learning_rate": 8.135359516154388e-07, "logits/chosen": -0.28125, "logits/rejected": 0.3046875, "logps/chosen": -450.0, "logps/rejected": -492.0, "loss": 0.1113, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -16.375, "rewards/margins": 8.75, "rewards/rejected": -25.125, "step": 9670 }, { "epoch": 0.698765610337111, "grad_norm": 8.622492238630745, "learning_rate": 8.131156281817418e-07, "logits/chosen": -0.1904296875, "logits/rejected": 0.3125, "logps/chosen": -440.0, "logps/rejected": -490.0, "loss": 0.0845, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -17.0, "rewards/margins": 8.5625, "rewards/rejected": -25.5, "step": 9680 }, { "epoch": 0.6994874756370462, "grad_norm": 10.265068384272304, "learning_rate": 8.126959555714979e-07, "logits/chosen": -0.298828125, "logits/rejected": 0.2314453125, "logps/chosen": -444.0, "logps/rejected": -476.0, "loss": 0.109, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -16.25, "rewards/margins": 7.625, "rewards/rejected": -23.875, "step": 9690 }, { "epoch": 0.7002093409369812, "grad_norm": 6.364120375723211, "learning_rate": 8.122769321068952e-07, "logits/chosen": -0.357421875, "logits/rejected": 0.3125, "logps/chosen": -438.0, "logps/rejected": -482.0, "loss": 0.1014, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -15.75, "rewards/margins": 8.5, "rewards/rejected": -24.25, "step": 9700 }, { "epoch": 0.7009312062369162, "grad_norm": 7.5035012190128985, "learning_rate": 8.118585561161698e-07, "logits/chosen": -0.3203125, "logits/rejected": 0.2578125, "logps/chosen": -416.0, "logps/rejected": -468.0, "loss": 0.109, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -14.0, "rewards/margins": 8.625, "rewards/rejected": -22.625, "step": 9710 }, { "epoch": 0.7016530715368512, "grad_norm": 7.937356098374378, "learning_rate": 8.114408259335793e-07, "logits/chosen": -0.375, "logits/rejected": 0.1953125, "logps/chosen": -436.0, "logps/rejected": -474.0, "loss": 0.1042, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -14.8125, "rewards/margins": 8.8125, "rewards/rejected": -23.625, "step": 9720 }, { "epoch": 0.7023749368367862, "grad_norm": 9.799175431364079, "learning_rate": 8.110237398993754e-07, "logits/chosen": -0.3359375, "logits/rejected": 0.1943359375, "logps/chosen": -426.0, "logps/rejected": -460.0, "loss": 0.0828, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -13.4375, "rewards/margins": 8.0, "rewards/rejected": -21.5, "step": 9730 }, { "epoch": 0.7030968021367213, "grad_norm": 9.689351124921574, "learning_rate": 8.106072963597751e-07, "logits/chosen": -0.1953125, "logits/rejected": 0.2294921875, "logps/chosen": -414.0, "logps/rejected": -462.0, "loss": 0.0923, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -14.5, "rewards/margins": 7.5625, "rewards/rejected": -22.0, "step": 9740 }, { "epoch": 0.7038186674366563, "grad_norm": 9.040502324249857, "learning_rate": 8.101914936669332e-07, "logits/chosen": -0.212890625, "logits/rejected": 0.37109375, "logps/chosen": -446.0, "logps/rejected": -492.0, "loss": 0.0747, "rewards/accuracies": 0.96875, "rewards/chosen": -15.8125, "rewards/margins": 8.5, "rewards/rejected": -24.25, "step": 9750 }, { "epoch": 0.7045405327365913, "grad_norm": 6.315883906545072, "learning_rate": 8.09776330178916e-07, "logits/chosen": -0.09130859375, "logits/rejected": 0.3828125, "logps/chosen": -434.0, "logps/rejected": -492.0, "loss": 0.0901, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -16.125, "rewards/margins": 8.5625, "rewards/rejected": -24.625, "step": 9760 }, { "epoch": 0.7052623980365264, "grad_norm": 10.646540388599803, "learning_rate": 8.093618042596727e-07, "logits/chosen": -0.15234375, "logits/rejected": 0.376953125, "logps/chosen": -456.0, "logps/rejected": -520.0, "loss": 0.1062, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -18.625, "rewards/margins": 8.25, "rewards/rejected": -26.875, "step": 9770 }, { "epoch": 0.7059842633364615, "grad_norm": 6.67230476529519, "learning_rate": 8.089479142790095e-07, "logits/chosen": 0.0220947265625, "logits/rejected": 0.447265625, "logps/chosen": -446.0, "logps/rejected": -508.0, "loss": 0.0909, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -17.25, "rewards/margins": 8.4375, "rewards/rejected": -25.75, "step": 9780 }, { "epoch": 0.7067061286363965, "grad_norm": 11.691475687303088, "learning_rate": 8.085346586125621e-07, "logits/chosen": -0.21875, "logits/rejected": 0.30859375, "logps/chosen": -466.0, "logps/rejected": -496.0, "loss": 0.0927, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -17.625, "rewards/margins": 8.0, "rewards/rejected": -25.625, "step": 9790 }, { "epoch": 0.7074279939363315, "grad_norm": 9.957012784971107, "learning_rate": 8.081220356417685e-07, "logits/chosen": -0.10205078125, "logits/rejected": 0.318359375, "logps/chosen": -456.0, "logps/rejected": -498.0, "loss": 0.1119, "rewards/accuracies": 0.96875, "rewards/chosen": -18.0, "rewards/margins": 8.4375, "rewards/rejected": -26.375, "step": 9800 }, { "epoch": 0.7081498592362665, "grad_norm": 10.660439785999385, "learning_rate": 8.077100437538435e-07, "logits/chosen": -0.1630859375, "logits/rejected": 0.349609375, "logps/chosen": -440.0, "logps/rejected": -484.0, "loss": 0.1349, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -16.625, "rewards/margins": 8.3125, "rewards/rejected": -25.0, "step": 9810 }, { "epoch": 0.7088717245362015, "grad_norm": 10.479248839679444, "learning_rate": 8.072986813417512e-07, "logits/chosen": 0.04931640625, "logits/rejected": 0.4453125, "logps/chosen": -434.0, "logps/rejected": -474.0, "loss": 0.1108, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -16.375, "rewards/margins": 8.25, "rewards/rejected": -24.625, "step": 9820 }, { "epoch": 0.7095935898361366, "grad_norm": 10.561294596004377, "learning_rate": 8.068879468041791e-07, "logits/chosen": 0.00167083740234375, "logits/rejected": 0.3125, "logps/chosen": -456.0, "logps/rejected": -516.0, "loss": 0.0967, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -17.75, "rewards/margins": 8.25, "rewards/rejected": -25.875, "step": 9830 }, { "epoch": 0.7103154551360716, "grad_norm": 8.95646762861454, "learning_rate": 8.064778385455118e-07, "logits/chosen": -0.0751953125, "logits/rejected": 0.40625, "logps/chosen": -450.0, "logps/rejected": -498.0, "loss": 0.0794, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -17.125, "rewards/margins": 8.875, "rewards/rejected": -26.0, "step": 9840 }, { "epoch": 0.7110373204360066, "grad_norm": 9.46250000011388, "learning_rate": 8.060683549758054e-07, "logits/chosen": -0.08544921875, "logits/rejected": 0.396484375, "logps/chosen": -490.0, "logps/rejected": -528.0, "loss": 0.1005, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -18.75, "rewards/margins": 8.5, "rewards/rejected": -27.25, "step": 9850 }, { "epoch": 0.7117591857359417, "grad_norm": 4.401176153424422, "learning_rate": 8.056594945107608e-07, "logits/chosen": -0.1904296875, "logits/rejected": 0.4375, "logps/chosen": -454.0, "logps/rejected": -492.0, "loss": 0.0939, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -18.0, "rewards/margins": 8.5, "rewards/rejected": -26.5, "step": 9860 }, { "epoch": 0.7124810510358767, "grad_norm": 7.816613275503176, "learning_rate": 8.052512555716987e-07, "logits/chosen": -0.142578125, "logits/rejected": 0.353515625, "logps/chosen": -438.0, "logps/rejected": -508.0, "loss": 0.1016, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -16.625, "rewards/margins": 8.25, "rewards/rejected": -24.875, "step": 9870 }, { "epoch": 0.7132029163358118, "grad_norm": 6.669337091051921, "learning_rate": 8.048436365855337e-07, "logits/chosen": -0.11083984375, "logits/rejected": 0.328125, "logps/chosen": -458.0, "logps/rejected": -490.0, "loss": 0.0684, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -16.625, "rewards/margins": 8.1875, "rewards/rejected": -24.875, "step": 9880 }, { "epoch": 0.7139247816357468, "grad_norm": 6.313045906195397, "learning_rate": 8.044366359847486e-07, "logits/chosen": -0.119140625, "logits/rejected": 0.353515625, "logps/chosen": -452.0, "logps/rejected": -498.0, "loss": 0.1048, "rewards/accuracies": 0.9375, "rewards/chosen": -16.5, "rewards/margins": 8.625, "rewards/rejected": -25.125, "step": 9890 }, { "epoch": 0.7146466469356818, "grad_norm": 9.180997319146117, "learning_rate": 8.040302522073696e-07, "logits/chosen": -0.07470703125, "logits/rejected": 0.3046875, "logps/chosen": -432.0, "logps/rejected": -490.0, "loss": 0.0956, "rewards/accuracies": 0.96875, "rewards/chosen": -15.3125, "rewards/margins": 9.0, "rewards/rejected": -24.25, "step": 9900 }, { "epoch": 0.7153685122356168, "grad_norm": 9.311481387821575, "learning_rate": 8.036244836969407e-07, "logits/chosen": -0.08251953125, "logits/rejected": 0.31640625, "logps/chosen": -438.0, "logps/rejected": -500.0, "loss": 0.1058, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -16.25, "rewards/margins": 8.1875, "rewards/rejected": -24.5, "step": 9910 }, { "epoch": 0.7160903775355518, "grad_norm": 6.5675433374797985, "learning_rate": 8.032193289024989e-07, "logits/chosen": -0.05859375, "logits/rejected": 0.392578125, "logps/chosen": -462.0, "logps/rejected": -528.0, "loss": 0.0773, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -17.875, "rewards/margins": 9.3125, "rewards/rejected": -27.25, "step": 9920 }, { "epoch": 0.716812242835487, "grad_norm": 5.268980835595922, "learning_rate": 8.02814786278549e-07, "logits/chosen": 0.1318359375, "logits/rejected": 0.59765625, "logps/chosen": -466.0, "logps/rejected": -516.0, "loss": 0.0861, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -19.25, "rewards/margins": 8.8125, "rewards/rejected": -28.0, "step": 9930 }, { "epoch": 0.717534108135422, "grad_norm": 7.934617938523506, "learning_rate": 8.024108542850394e-07, "logits/chosen": 0.06640625, "logits/rejected": 0.478515625, "logps/chosen": -480.0, "logps/rejected": -536.0, "loss": 0.0867, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -19.875, "rewards/margins": 8.8125, "rewards/rejected": -28.75, "step": 9940 }, { "epoch": 0.718255973435357, "grad_norm": 7.074420890977353, "learning_rate": 8.020075313873367e-07, "logits/chosen": -0.10595703125, "logits/rejected": 0.416015625, "logps/chosen": -452.0, "logps/rejected": -500.0, "loss": 0.0918, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -17.5, "rewards/margins": 8.25, "rewards/rejected": -25.75, "step": 9950 }, { "epoch": 0.718977838735292, "grad_norm": 13.187974767576348, "learning_rate": 8.016048160562023e-07, "logits/chosen": -0.11865234375, "logits/rejected": 0.36328125, "logps/chosen": -470.0, "logps/rejected": -502.0, "loss": 0.0885, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -17.25, "rewards/margins": 8.25, "rewards/rejected": -25.5, "step": 9960 }, { "epoch": 0.719699704035227, "grad_norm": 7.403045116194495, "learning_rate": 8.012027067677667e-07, "logits/chosen": -0.1259765625, "logits/rejected": 0.419921875, "logps/chosen": -484.0, "logps/rejected": -512.0, "loss": 0.0786, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -19.625, "rewards/margins": 8.4375, "rewards/rejected": -28.0, "step": 9970 }, { "epoch": 0.7204215693351621, "grad_norm": 7.756832497856835, "learning_rate": 8.008012020035062e-07, "logits/chosen": -0.021484375, "logits/rejected": 0.41796875, "logps/chosen": -460.0, "logps/rejected": -512.0, "loss": 0.0863, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -19.375, "rewards/margins": 8.4375, "rewards/rejected": -27.75, "step": 9980 }, { "epoch": 0.7211434346350971, "grad_norm": 9.566750965486527, "learning_rate": 8.004003002502188e-07, "logits/chosen": -0.0859375, "logits/rejected": 0.41796875, "logps/chosen": -470.0, "logps/rejected": -532.0, "loss": 0.0806, "rewards/accuracies": 0.9375, "rewards/chosen": -20.0, "rewards/margins": 9.3125, "rewards/rejected": -29.375, "step": 9990 }, { "epoch": 0.7218652999350321, "grad_norm": 12.847161898544035, "learning_rate": 8e-07, "logits/chosen": -0.039306640625, "logits/rejected": 0.375, "logps/chosen": -458.0, "logps/rejected": -516.0, "loss": 0.0766, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -18.75, "rewards/margins": 8.5, "rewards/rejected": -27.25, "step": 10000 }, { "epoch": 0.7218652999350321, "eval_logits/chosen": -0.10498046875, "eval_logits/rejected": 0.365234375, "eval_logps/chosen": -476.0, "eval_logps/rejected": -512.0, "eval_loss": 0.2292831838130951, "eval_rewards/accuracies": 0.9140422344207764, "eval_rewards/chosen": -19.5, "eval_rewards/margins": 7.5, "eval_rewards/rejected": -27.0, "eval_runtime": 2855.2107, "eval_samples_per_second": 34.499, "eval_steps_per_second": 0.539, "step": 10000 }, { "epoch": 0.7225871652349671, "grad_norm": 6.262170943295461, "learning_rate": 7.996002997502185e-07, "logits/chosen": -0.11865234375, "logits/rejected": 0.404296875, "logps/chosen": -476.0, "logps/rejected": -536.0, "loss": 0.1045, "rewards/accuracies": 0.96875, "rewards/chosen": -18.875, "rewards/margins": 9.125, "rewards/rejected": -28.0, "step": 10010 }, { "epoch": 0.7233090305349021, "grad_norm": 11.333701175250617, "learning_rate": 7.992011980034937e-07, "logits/chosen": -0.25, "logits/rejected": 0.36328125, "logps/chosen": -448.0, "logps/rejected": -500.0, "loss": 0.0845, "rewards/accuracies": 0.96875, "rewards/chosen": -17.125, "rewards/margins": 8.1875, "rewards/rejected": -25.25, "step": 10020 }, { "epoch": 0.7240308958348373, "grad_norm": 10.391253317357606, "learning_rate": 7.98802693267671e-07, "logits/chosen": -0.050537109375, "logits/rejected": 0.376953125, "logps/chosen": -440.0, "logps/rejected": -488.0, "loss": 0.1033, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -16.125, "rewards/margins": 8.5, "rewards/rejected": -24.625, "step": 10030 }, { "epoch": 0.7247527611347723, "grad_norm": 11.579307373099335, "learning_rate": 7.984047840557992e-07, "logits/chosen": -0.3359375, "logits/rejected": 0.30859375, "logps/chosen": -466.0, "logps/rejected": -520.0, "loss": 0.1, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -17.625, "rewards/margins": 9.4375, "rewards/rejected": -27.0, "step": 10040 }, { "epoch": 0.7254746264347073, "grad_norm": 9.769028584216803, "learning_rate": 7.980074688861063e-07, "logits/chosen": -0.1923828125, "logits/rejected": 0.306640625, "logps/chosen": -412.0, "logps/rejected": -468.0, "loss": 0.1106, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -14.9375, "rewards/margins": 8.375, "rewards/rejected": -23.375, "step": 10050 }, { "epoch": 0.7261964917346423, "grad_norm": 6.1582130751528945, "learning_rate": 7.976107462819775e-07, "logits/chosen": -0.083984375, "logits/rejected": 0.380859375, "logps/chosen": -458.0, "logps/rejected": -510.0, "loss": 0.0829, "rewards/accuracies": 0.96875, "rewards/chosen": -17.375, "rewards/margins": 7.78125, "rewards/rejected": -25.125, "step": 10060 }, { "epoch": 0.7269183570345773, "grad_norm": 7.838048408215388, "learning_rate": 7.97214614771931e-07, "logits/chosen": -0.18359375, "logits/rejected": 0.392578125, "logps/chosen": -426.0, "logps/rejected": -494.0, "loss": 0.0834, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -15.9375, "rewards/margins": 9.5, "rewards/rejected": -25.375, "step": 10070 }, { "epoch": 0.7276402223345124, "grad_norm": 8.22727265937406, "learning_rate": 7.968190728895957e-07, "logits/chosen": -0.298828125, "logits/rejected": 0.283203125, "logps/chosen": -458.0, "logps/rejected": -492.0, "loss": 0.0893, "rewards/accuracies": 0.96875, "rewards/chosen": -16.875, "rewards/margins": 8.5, "rewards/rejected": -25.375, "step": 10080 }, { "epoch": 0.7283620876344474, "grad_norm": 15.42760048465008, "learning_rate": 7.964241191736886e-07, "logits/chosen": -0.09521484375, "logits/rejected": 0.314453125, "logps/chosen": -434.0, "logps/rejected": -498.0, "loss": 0.1014, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -15.875, "rewards/margins": 8.5, "rewards/rejected": -24.375, "step": 10090 }, { "epoch": 0.7290839529343824, "grad_norm": 12.371264131507344, "learning_rate": 7.960297521679913e-07, "logits/chosen": -0.302734375, "logits/rejected": 0.30859375, "logps/chosen": -478.0, "logps/rejected": -502.0, "loss": 0.0843, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -16.875, "rewards/margins": 9.0625, "rewards/rejected": -26.0, "step": 10100 }, { "epoch": 0.7298058182343174, "grad_norm": 14.555726203330611, "learning_rate": 7.956359704213283e-07, "logits/chosen": -0.2119140625, "logits/rejected": 0.162109375, "logps/chosen": -430.0, "logps/rejected": -494.0, "loss": 0.1033, "rewards/accuracies": 0.96875, "rewards/chosen": -15.9375, "rewards/margins": 8.3125, "rewards/rejected": -24.25, "step": 10110 }, { "epoch": 0.7305276835342525, "grad_norm": 9.658337039242703, "learning_rate": 7.95242772487544e-07, "logits/chosen": -0.12060546875, "logits/rejected": 0.33984375, "logps/chosen": -436.0, "logps/rejected": -496.0, "loss": 0.0922, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -16.0, "rewards/margins": 8.25, "rewards/rejected": -24.25, "step": 10120 }, { "epoch": 0.7312495488341876, "grad_norm": 10.202119250693007, "learning_rate": 7.94850156925481e-07, "logits/chosen": -0.26171875, "logits/rejected": 0.294921875, "logps/chosen": -474.0, "logps/rejected": -512.0, "loss": 0.0738, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -17.125, "rewards/margins": 8.875, "rewards/rejected": -25.875, "step": 10130 }, { "epoch": 0.7319714141341226, "grad_norm": 7.089937420193079, "learning_rate": 7.944581222989574e-07, "logits/chosen": -0.1044921875, "logits/rejected": 0.283203125, "logps/chosen": -460.0, "logps/rejected": -500.0, "loss": 0.104, "rewards/accuracies": 0.96875, "rewards/chosen": -17.5, "rewards/margins": 8.375, "rewards/rejected": -25.875, "step": 10140 }, { "epoch": 0.7326932794340576, "grad_norm": 6.855771558369355, "learning_rate": 7.940666671767441e-07, "logits/chosen": -0.0279541015625, "logits/rejected": 0.4140625, "logps/chosen": -452.0, "logps/rejected": -516.0, "loss": 0.1418, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -16.25, "rewards/margins": 8.625, "rewards/rejected": -24.875, "step": 10150 }, { "epoch": 0.7334151447339926, "grad_norm": 13.38266684236867, "learning_rate": 7.936757901325451e-07, "logits/chosen": 0.0458984375, "logits/rejected": 0.4609375, "logps/chosen": -440.0, "logps/rejected": -492.0, "loss": 0.0991, "rewards/accuracies": 0.96875, "rewards/chosen": -16.875, "rewards/margins": 9.0, "rewards/rejected": -25.875, "step": 10160 }, { "epoch": 0.7341370100339276, "grad_norm": 13.287506242145927, "learning_rate": 7.932854897449727e-07, "logits/chosen": -0.0791015625, "logits/rejected": 0.26953125, "logps/chosen": -448.0, "logps/rejected": -512.0, "loss": 0.1037, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -16.625, "rewards/margins": 8.0, "rewards/rejected": -24.5, "step": 10170 }, { "epoch": 0.7348588753338627, "grad_norm": 6.933815687060413, "learning_rate": 7.928957645975286e-07, "logits/chosen": -0.11328125, "logits/rejected": 0.400390625, "logps/chosen": -436.0, "logps/rejected": -474.0, "loss": 0.0811, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -15.5625, "rewards/margins": 8.125, "rewards/rejected": -23.75, "step": 10180 }, { "epoch": 0.7355807406337977, "grad_norm": 8.760027408404188, "learning_rate": 7.925066132785799e-07, "logits/chosen": -0.1416015625, "logits/rejected": 0.40234375, "logps/chosen": -432.0, "logps/rejected": -478.0, "loss": 0.0881, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -15.5625, "rewards/margins": 8.3125, "rewards/rejected": -23.875, "step": 10190 }, { "epoch": 0.7363026059337328, "grad_norm": 8.759809375485291, "learning_rate": 7.921180343813395e-07, "logits/chosen": -0.07373046875, "logits/rejected": 0.306640625, "logps/chosen": -426.0, "logps/rejected": -490.0, "loss": 0.079, "rewards/accuracies": 0.96875, "rewards/chosen": -14.8125, "rewards/margins": 8.25, "rewards/rejected": -23.125, "step": 10200 }, { "epoch": 0.7370244712336678, "grad_norm": 2.9039106271241217, "learning_rate": 7.917300265038436e-07, "logits/chosen": -0.10595703125, "logits/rejected": 0.490234375, "logps/chosen": -464.0, "logps/rejected": -480.0, "loss": 0.0853, "rewards/accuracies": 0.96875, "rewards/chosen": -15.5, "rewards/margins": 9.0625, "rewards/rejected": -24.5, "step": 10210 }, { "epoch": 0.7377463365336028, "grad_norm": 7.94423099361392, "learning_rate": 7.913425882489307e-07, "logits/chosen": -0.1669921875, "logits/rejected": 0.35546875, "logps/chosen": -458.0, "logps/rejected": -500.0, "loss": 0.0716, "rewards/accuracies": 0.9375, "rewards/chosen": -16.875, "rewards/margins": 9.0, "rewards/rejected": -25.875, "step": 10220 }, { "epoch": 0.7384682018335379, "grad_norm": 9.396013525826245, "learning_rate": 7.909557182242211e-07, "logits/chosen": -0.1328125, "logits/rejected": 0.248046875, "logps/chosen": -450.0, "logps/rejected": -520.0, "loss": 0.0924, "rewards/accuracies": 0.96875, "rewards/chosen": -17.125, "rewards/margins": 8.0, "rewards/rejected": -25.125, "step": 10230 }, { "epoch": 0.7391900671334729, "grad_norm": 11.456108195209524, "learning_rate": 7.905694150420947e-07, "logits/chosen": -0.205078125, "logits/rejected": 0.380859375, "logps/chosen": -436.0, "logps/rejected": -464.0, "loss": 0.1277, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -15.4375, "rewards/margins": 8.0, "rewards/rejected": -23.5, "step": 10240 }, { "epoch": 0.7399119324334079, "grad_norm": 6.091158416706511, "learning_rate": 7.901836773196717e-07, "logits/chosen": -0.189453125, "logits/rejected": 0.26953125, "logps/chosen": -442.0, "logps/rejected": -482.0, "loss": 0.1021, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -15.75, "rewards/margins": 8.625, "rewards/rejected": -24.375, "step": 10250 }, { "epoch": 0.7406337977333429, "grad_norm": 9.757093961803783, "learning_rate": 7.897985036787898e-07, "logits/chosen": -0.287109375, "logits/rejected": 0.224609375, "logps/chosen": -440.0, "logps/rejected": -486.0, "loss": 0.0953, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -15.3125, "rewards/margins": 8.0625, "rewards/rejected": -23.375, "step": 10260 }, { "epoch": 0.741355663033278, "grad_norm": 8.802990335841114, "learning_rate": 7.894138927459853e-07, "logits/chosen": -0.10693359375, "logits/rejected": 0.404296875, "logps/chosen": -412.0, "logps/rejected": -454.0, "loss": 0.1094, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -14.4375, "rewards/margins": 7.78125, "rewards/rejected": -22.25, "step": 10270 }, { "epoch": 0.742077528333213, "grad_norm": 5.920022858423469, "learning_rate": 7.890298431524716e-07, "logits/chosen": -0.1708984375, "logits/rejected": 0.310546875, "logps/chosen": -434.0, "logps/rejected": -470.0, "loss": 0.112, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -15.9375, "rewards/margins": 8.25, "rewards/rejected": -24.125, "step": 10280 }, { "epoch": 0.7427993936331481, "grad_norm": 4.23418233589528, "learning_rate": 7.88646353534119e-07, "logits/chosen": -0.19140625, "logits/rejected": 0.1650390625, "logps/chosen": -442.0, "logps/rejected": -474.0, "loss": 0.0819, "rewards/accuracies": 1.0, "rewards/chosen": -14.75, "rewards/margins": 8.625, "rewards/rejected": -23.375, "step": 10290 }, { "epoch": 0.7435212589330831, "grad_norm": 6.82485870730506, "learning_rate": 7.882634225314345e-07, "logits/chosen": -0.2294921875, "logits/rejected": 0.1650390625, "logps/chosen": -408.0, "logps/rejected": -472.0, "loss": 0.0955, "rewards/accuracies": 0.96875, "rewards/chosen": -13.6875, "rewards/margins": 8.25, "rewards/rejected": -21.875, "step": 10300 }, { "epoch": 0.7442431242330181, "grad_norm": 9.291910329706937, "learning_rate": 7.87881048789541e-07, "logits/chosen": -0.1328125, "logits/rejected": 0.29296875, "logps/chosen": -400.0, "logps/rejected": -470.0, "loss": 0.1114, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -13.875, "rewards/margins": 8.0, "rewards/rejected": -21.875, "step": 10310 }, { "epoch": 0.7449649895329532, "grad_norm": 7.029523315264257, "learning_rate": 7.874992309581578e-07, "logits/chosen": -0.142578125, "logits/rejected": 0.30078125, "logps/chosen": -410.0, "logps/rejected": -480.0, "loss": 0.0882, "rewards/accuracies": 0.96875, "rewards/chosen": -14.125, "rewards/margins": 8.0, "rewards/rejected": -22.125, "step": 10320 }, { "epoch": 0.7456868548328882, "grad_norm": 9.103792232893964, "learning_rate": 7.871179676915801e-07, "logits/chosen": -0.212890625, "logits/rejected": 0.38671875, "logps/chosen": -448.0, "logps/rejected": -484.0, "loss": 0.1051, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -16.25, "rewards/margins": 8.75, "rewards/rejected": -25.0, "step": 10330 }, { "epoch": 0.7464087201328232, "grad_norm": 5.936025246652528, "learning_rate": 7.867372576486597e-07, "logits/chosen": -0.16796875, "logits/rejected": 0.345703125, "logps/chosen": -420.0, "logps/rejected": -478.0, "loss": 0.1116, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -14.9375, "rewards/margins": 7.53125, "rewards/rejected": -22.375, "step": 10340 }, { "epoch": 0.7471305854327582, "grad_norm": 10.247691786745145, "learning_rate": 7.863570994927847e-07, "logits/chosen": -0.119140625, "logits/rejected": 0.30859375, "logps/chosen": -438.0, "logps/rejected": -490.0, "loss": 0.0827, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -16.375, "rewards/margins": 7.90625, "rewards/rejected": -24.375, "step": 10350 }, { "epoch": 0.7478524507326932, "grad_norm": 14.264877346898617, "learning_rate": 7.859774918918594e-07, "logits/chosen": -0.302734375, "logits/rejected": 0.22265625, "logps/chosen": -448.0, "logps/rejected": -500.0, "loss": 0.099, "rewards/accuracies": 0.96875, "rewards/chosen": -16.375, "rewards/margins": 8.1875, "rewards/rejected": -24.625, "step": 10360 }, { "epoch": 0.7485743160326284, "grad_norm": 5.51549396779447, "learning_rate": 7.855984335182852e-07, "logits/chosen": -0.07470703125, "logits/rejected": 0.314453125, "logps/chosen": -482.0, "logps/rejected": -532.0, "loss": 0.078, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -18.625, "rewards/margins": 8.625, "rewards/rejected": -27.25, "step": 10370 }, { "epoch": 0.7492961813325634, "grad_norm": 14.116123549612539, "learning_rate": 7.852199230489422e-07, "logits/chosen": -0.111328125, "logits/rejected": 0.39453125, "logps/chosen": -454.0, "logps/rejected": -508.0, "loss": 0.1122, "rewards/accuracies": 0.96875, "rewards/chosen": -18.25, "rewards/margins": 8.125, "rewards/rejected": -26.375, "step": 10380 }, { "epoch": 0.7500180466324984, "grad_norm": 11.141038342721034, "learning_rate": 7.848419591651668e-07, "logits/chosen": -0.1865234375, "logits/rejected": 0.302734375, "logps/chosen": -478.0, "logps/rejected": -536.0, "loss": 0.0812, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -18.625, "rewards/margins": 9.0, "rewards/rejected": -27.625, "step": 10390 }, { "epoch": 0.7507399119324334, "grad_norm": 9.306097435046318, "learning_rate": 7.844645405527361e-07, "logits/chosen": -0.06494140625, "logits/rejected": 0.31640625, "logps/chosen": -458.0, "logps/rejected": -504.0, "loss": 0.0768, "rewards/accuracies": 0.9375, "rewards/chosen": -17.875, "rewards/margins": 8.125, "rewards/rejected": -26.0, "step": 10400 }, { "epoch": 0.7514617772323684, "grad_norm": 10.22086123285554, "learning_rate": 7.840876659018457e-07, "logits/chosen": -0.11328125, "logits/rejected": 0.330078125, "logps/chosen": -458.0, "logps/rejected": -524.0, "loss": 0.0867, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -17.0, "rewards/margins": 8.6875, "rewards/rejected": -25.75, "step": 10410 }, { "epoch": 0.7521836425323035, "grad_norm": 8.066213761055305, "learning_rate": 7.837113339070922e-07, "logits/chosen": -0.216796875, "logits/rejected": 0.232421875, "logps/chosen": -438.0, "logps/rejected": -458.0, "loss": 0.071, "rewards/accuracies": 0.96875, "rewards/chosen": -15.3125, "rewards/margins": 7.75, "rewards/rejected": -23.125, "step": 10420 }, { "epoch": 0.7529055078322385, "grad_norm": 9.46266382655412, "learning_rate": 7.833355432674538e-07, "logits/chosen": -0.37109375, "logits/rejected": 0.1669921875, "logps/chosen": -446.0, "logps/rejected": -486.0, "loss": 0.0824, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -15.5, "rewards/margins": 8.5625, "rewards/rejected": -24.0, "step": 10430 }, { "epoch": 0.7536273731321735, "grad_norm": 8.92723035540253, "learning_rate": 7.829602926862713e-07, "logits/chosen": -0.296875, "logits/rejected": 0.19921875, "logps/chosen": -418.0, "logps/rejected": -468.0, "loss": 0.1066, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -13.875, "rewards/margins": 8.8125, "rewards/rejected": -22.75, "step": 10440 }, { "epoch": 0.7543492384321085, "grad_norm": 11.897967839893859, "learning_rate": 7.825855808712296e-07, "logits/chosen": -0.298828125, "logits/rejected": 0.248046875, "logps/chosen": -400.0, "logps/rejected": -448.0, "loss": 0.091, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -12.75, "rewards/margins": 9.3125, "rewards/rejected": -22.0, "step": 10450 }, { "epoch": 0.7550711037320436, "grad_norm": 6.544594308081151, "learning_rate": 7.822114065343386e-07, "logits/chosen": -0.326171875, "logits/rejected": 0.1328125, "logps/chosen": -410.0, "logps/rejected": -462.0, "loss": 0.0993, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -12.375, "rewards/margins": 7.875, "rewards/rejected": -20.25, "step": 10460 }, { "epoch": 0.7557929690319787, "grad_norm": 9.405534462383082, "learning_rate": 7.818377683919149e-07, "logits/chosen": -0.3203125, "logits/rejected": 0.275390625, "logps/chosen": -408.0, "logps/rejected": -438.0, "loss": 0.0683, "rewards/accuracies": 1.0, "rewards/chosen": -11.6875, "rewards/margins": 8.6875, "rewards/rejected": -20.375, "step": 10470 }, { "epoch": 0.7565148343319137, "grad_norm": 9.976170308493366, "learning_rate": 7.814646651645635e-07, "logits/chosen": -0.369140625, "logits/rejected": 0.17578125, "logps/chosen": -404.0, "logps/rejected": -460.0, "loss": 0.0971, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -11.875, "rewards/margins": 8.375, "rewards/rejected": -20.25, "step": 10480 }, { "epoch": 0.7572366996318487, "grad_norm": 4.67033083201407, "learning_rate": 7.810920955771586e-07, "logits/chosen": -0.283203125, "logits/rejected": 0.251953125, "logps/chosen": -404.0, "logps/rejected": -432.0, "loss": 0.1065, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -13.125, "rewards/margins": 7.625, "rewards/rejected": -20.75, "step": 10490 }, { "epoch": 0.7579585649317837, "grad_norm": 7.757767276824581, "learning_rate": 7.807200583588266e-07, "logits/chosen": -0.197265625, "logits/rejected": 0.216796875, "logps/chosen": -406.0, "logps/rejected": -468.0, "loss": 0.0924, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -13.5625, "rewards/margins": 7.53125, "rewards/rejected": -21.125, "step": 10500 }, { "epoch": 0.7586804302317187, "grad_norm": 6.748981336385312, "learning_rate": 7.803485522429261e-07, "logits/chosen": -0.2734375, "logits/rejected": 0.138671875, "logps/chosen": -412.0, "logps/rejected": -460.0, "loss": 0.0848, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -13.5, "rewards/margins": 8.625, "rewards/rejected": -22.125, "step": 10510 }, { "epoch": 0.7594022955316538, "grad_norm": 8.335922961920595, "learning_rate": 7.799775759670318e-07, "logits/chosen": -0.287109375, "logits/rejected": 0.1513671875, "logps/chosen": -444.0, "logps/rejected": -484.0, "loss": 0.1021, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -14.75, "rewards/margins": 8.875, "rewards/rejected": -23.625, "step": 10520 }, { "epoch": 0.7601241608315888, "grad_norm": 9.652665993658141, "learning_rate": 7.796071282729149e-07, "logits/chosen": -0.09765625, "logits/rejected": 0.29296875, "logps/chosen": -442.0, "logps/rejected": -492.0, "loss": 0.0852, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -16.5, "rewards/margins": 8.4375, "rewards/rejected": -25.0, "step": 10530 }, { "epoch": 0.7608460261315239, "grad_norm": 6.386861946616202, "learning_rate": 7.792372079065259e-07, "logits/chosen": -0.080078125, "logits/rejected": 0.3359375, "logps/chosen": -444.0, "logps/rejected": -506.0, "loss": 0.0817, "rewards/accuracies": 1.0, "rewards/chosen": -16.75, "rewards/margins": 8.875, "rewards/rejected": -25.625, "step": 10540 }, { "epoch": 0.7615678914314589, "grad_norm": 11.497288770362193, "learning_rate": 7.788678136179767e-07, "logits/chosen": -0.1474609375, "logits/rejected": 0.26953125, "logps/chosen": -442.0, "logps/rejected": -516.0, "loss": 0.0985, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -17.0, "rewards/margins": 9.125, "rewards/rejected": -26.0, "step": 10550 }, { "epoch": 0.7622897567313939, "grad_norm": 11.82268904646955, "learning_rate": 7.78498944161523e-07, "logits/chosen": -0.326171875, "logits/rejected": 0.28515625, "logps/chosen": -448.0, "logps/rejected": -502.0, "loss": 0.0803, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -15.4375, "rewards/margins": 9.0625, "rewards/rejected": -24.5, "step": 10560 }, { "epoch": 0.763011622031329, "grad_norm": 9.467027921451338, "learning_rate": 7.781305982955459e-07, "logits/chosen": -0.091796875, "logits/rejected": 0.345703125, "logps/chosen": -432.0, "logps/rejected": -468.0, "loss": 0.1063, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -13.875, "rewards/margins": 7.9375, "rewards/rejected": -21.875, "step": 10570 }, { "epoch": 0.763733487331264, "grad_norm": 8.716456938838405, "learning_rate": 7.777627747825355e-07, "logits/chosen": -0.3125, "logits/rejected": 0.2314453125, "logps/chosen": -398.0, "logps/rejected": -458.0, "loss": 0.1004, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -12.625, "rewards/margins": 8.375, "rewards/rejected": -21.0, "step": 10580 }, { "epoch": 0.764455352631199, "grad_norm": 11.116488589305511, "learning_rate": 7.773954723890725e-07, "logits/chosen": -0.10986328125, "logits/rejected": 0.2490234375, "logps/chosen": -432.0, "logps/rejected": -498.0, "loss": 0.0879, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -14.25, "rewards/margins": 8.5625, "rewards/rejected": -22.875, "step": 10590 }, { "epoch": 0.765177217931134, "grad_norm": 11.157681716134856, "learning_rate": 7.770286898858113e-07, "logits/chosen": -0.08935546875, "logits/rejected": 0.29296875, "logps/chosen": -422.0, "logps/rejected": -466.0, "loss": 0.1026, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -13.8125, "rewards/margins": 8.5625, "rewards/rejected": -22.375, "step": 10600 }, { "epoch": 0.765899083231069, "grad_norm": 4.635775075575135, "learning_rate": 7.766624260474625e-07, "logits/chosen": -0.18359375, "logits/rejected": 0.326171875, "logps/chosen": -396.0, "logps/rejected": -442.0, "loss": 0.0574, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -12.6875, "rewards/margins": 8.3125, "rewards/rejected": -21.0, "step": 10610 }, { "epoch": 0.7666209485310042, "grad_norm": 12.354797181279366, "learning_rate": 7.762966796527759e-07, "logits/chosen": -0.1884765625, "logits/rejected": 0.30859375, "logps/chosen": -422.0, "logps/rejected": -464.0, "loss": 0.107, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -14.5625, "rewards/margins": 8.0625, "rewards/rejected": -22.5, "step": 10620 }, { "epoch": 0.7673428138309392, "grad_norm": 13.169306446993499, "learning_rate": 7.759314494845234e-07, "logits/chosen": -0.2490234375, "logits/rejected": 0.11962890625, "logps/chosen": -428.0, "logps/rejected": -474.0, "loss": 0.1037, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -15.125, "rewards/margins": 8.1875, "rewards/rejected": -23.25, "step": 10630 }, { "epoch": 0.7680646791308742, "grad_norm": 7.739551157311822, "learning_rate": 7.755667343294812e-07, "logits/chosen": -0.193359375, "logits/rejected": 0.244140625, "logps/chosen": -438.0, "logps/rejected": -494.0, "loss": 0.1049, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -15.5, "rewards/margins": 8.375, "rewards/rejected": -23.875, "step": 10640 }, { "epoch": 0.7687865444308092, "grad_norm": 11.827658147472318, "learning_rate": 7.752025329784146e-07, "logits/chosen": -0.2333984375, "logits/rejected": 0.33203125, "logps/chosen": -436.0, "logps/rejected": -488.0, "loss": 0.0956, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -15.3125, "rewards/margins": 8.1875, "rewards/rejected": -23.5, "step": 10650 }, { "epoch": 0.7695084097307442, "grad_norm": 9.972642054676342, "learning_rate": 7.748388442260596e-07, "logits/chosen": -0.2490234375, "logits/rejected": 0.384765625, "logps/chosen": -414.0, "logps/rejected": -450.0, "loss": 0.1029, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -14.25, "rewards/margins": 8.75, "rewards/rejected": -23.0, "step": 10660 }, { "epoch": 0.7702302750306793, "grad_norm": 8.108387546599499, "learning_rate": 7.744756668711065e-07, "logits/chosen": -0.1884765625, "logits/rejected": 0.3359375, "logps/chosen": -440.0, "logps/rejected": -478.0, "loss": 0.0827, "rewards/accuracies": 0.96875, "rewards/chosen": -15.625, "rewards/margins": 8.4375, "rewards/rejected": -24.0, "step": 10670 }, { "epoch": 0.7709521403306143, "grad_norm": 8.154178440717448, "learning_rate": 7.741129997161835e-07, "logits/chosen": -0.1923828125, "logits/rejected": 0.140625, "logps/chosen": -436.0, "logps/rejected": -468.0, "loss": 0.0847, "rewards/accuracies": 0.96875, "rewards/chosen": -14.9375, "rewards/margins": 8.3125, "rewards/rejected": -23.25, "step": 10680 }, { "epoch": 0.7716740056305493, "grad_norm": 6.62123243391713, "learning_rate": 7.737508415678403e-07, "logits/chosen": -0.2353515625, "logits/rejected": 0.1484375, "logps/chosen": -426.0, "logps/rejected": -506.0, "loss": 0.0781, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -15.75, "rewards/margins": 8.875, "rewards/rejected": -24.625, "step": 10690 }, { "epoch": 0.7723958709304843, "grad_norm": 9.26914294496602, "learning_rate": 7.733891912365308e-07, "logits/chosen": -0.2490234375, "logits/rejected": 0.25, "logps/chosen": -446.0, "logps/rejected": -482.0, "loss": 0.0812, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -15.5, "rewards/margins": 8.5, "rewards/rejected": -24.0, "step": 10700 }, { "epoch": 0.7731177362304194, "grad_norm": 12.971927891547399, "learning_rate": 7.730280475365979e-07, "logits/chosen": -0.18359375, "logits/rejected": 0.333984375, "logps/chosen": -472.0, "logps/rejected": -504.0, "loss": 0.113, "rewards/accuracies": 0.96875, "rewards/chosen": -14.3125, "rewards/margins": 8.75, "rewards/rejected": -23.125, "step": 10710 }, { "epoch": 0.7738396015303545, "grad_norm": 8.417440435328082, "learning_rate": 7.726674092862557e-07, "logits/chosen": -0.173828125, "logits/rejected": 0.228515625, "logps/chosen": -404.0, "logps/rejected": -458.0, "loss": 0.0869, "rewards/accuracies": 0.96875, "rewards/chosen": -13.5, "rewards/margins": 8.6875, "rewards/rejected": -22.25, "step": 10720 }, { "epoch": 0.7745614668302895, "grad_norm": 10.822644186087635, "learning_rate": 7.723072753075748e-07, "logits/chosen": -0.2490234375, "logits/rejected": 0.1806640625, "logps/chosen": -406.0, "logps/rejected": -448.0, "loss": 0.0979, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -13.0625, "rewards/margins": 8.0, "rewards/rejected": -21.0, "step": 10730 }, { "epoch": 0.7752833321302245, "grad_norm": 4.771006516595872, "learning_rate": 7.719476444264649e-07, "logits/chosen": -0.353515625, "logits/rejected": 0.2392578125, "logps/chosen": -456.0, "logps/rejected": -480.0, "loss": 0.0982, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -15.1875, "rewards/margins": 8.8125, "rewards/rejected": -24.0, "step": 10740 }, { "epoch": 0.7760051974301595, "grad_norm": 10.759970118254055, "learning_rate": 7.715885154726593e-07, "logits/chosen": -0.3046875, "logits/rejected": 0.1865234375, "logps/chosen": -430.0, "logps/rejected": -466.0, "loss": 0.1014, "rewards/accuracies": 0.9375, "rewards/chosen": -14.0, "rewards/margins": 8.1875, "rewards/rejected": -22.125, "step": 10750 }, { "epoch": 0.7767270627300946, "grad_norm": 5.472600678427675, "learning_rate": 7.71229887279699e-07, "logits/chosen": -0.275390625, "logits/rejected": 0.310546875, "logps/chosen": -428.0, "logps/rejected": -464.0, "loss": 0.0913, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -14.1875, "rewards/margins": 8.3125, "rewards/rejected": -22.5, "step": 10760 }, { "epoch": 0.7774489280300296, "grad_norm": 8.21824512484107, "learning_rate": 7.708717586849164e-07, "logits/chosen": -0.267578125, "logits/rejected": 0.3046875, "logps/chosen": -402.0, "logps/rejected": -456.0, "loss": 0.0822, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -14.5, "rewards/margins": 8.0, "rewards/rejected": -22.5, "step": 10770 }, { "epoch": 0.7781707933299646, "grad_norm": 3.2646158614834757, "learning_rate": 7.705141285294196e-07, "logits/chosen": -0.267578125, "logits/rejected": 0.26171875, "logps/chosen": -436.0, "logps/rejected": -458.0, "loss": 0.065, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -13.3125, "rewards/margins": 8.3125, "rewards/rejected": -21.625, "step": 10780 }, { "epoch": 0.7788926586298996, "grad_norm": 12.377497794363093, "learning_rate": 7.701569956580767e-07, "logits/chosen": -0.298828125, "logits/rejected": 0.287109375, "logps/chosen": -446.0, "logps/rejected": -466.0, "loss": 0.086, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -15.0625, "rewards/margins": 8.25, "rewards/rejected": -23.25, "step": 10790 }, { "epoch": 0.7796145239298347, "grad_norm": 8.726131890538158, "learning_rate": 7.69800358919501e-07, "logits/chosen": -0.1474609375, "logits/rejected": 0.36328125, "logps/chosen": -428.0, "logps/rejected": -486.0, "loss": 0.1142, "rewards/accuracies": 0.96875, "rewards/chosen": -15.3125, "rewards/margins": 9.125, "rewards/rejected": -24.375, "step": 10800 }, { "epoch": 0.7803363892297698, "grad_norm": 8.895794480628, "learning_rate": 7.694442171660333e-07, "logits/chosen": -0.06494140625, "logits/rejected": 0.2890625, "logps/chosen": -424.0, "logps/rejected": -454.0, "loss": 0.1017, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -14.5625, "rewards/margins": 7.96875, "rewards/rejected": -22.5, "step": 10810 }, { "epoch": 0.7810582545297048, "grad_norm": 7.249652171600769, "learning_rate": 7.690885692537282e-07, "logits/chosen": -0.1396484375, "logits/rejected": 0.203125, "logps/chosen": -448.0, "logps/rejected": -480.0, "loss": 0.0691, "rewards/accuracies": 0.96875, "rewards/chosen": -17.0, "rewards/margins": 7.8125, "rewards/rejected": -24.875, "step": 10820 }, { "epoch": 0.7817801198296398, "grad_norm": 11.483681869046327, "learning_rate": 7.687334140423383e-07, "logits/chosen": -0.193359375, "logits/rejected": 0.224609375, "logps/chosen": -428.0, "logps/rejected": -490.0, "loss": 0.1069, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -15.0, "rewards/margins": 8.375, "rewards/rejected": -23.375, "step": 10830 }, { "epoch": 0.7825019851295748, "grad_norm": 7.937442513463118, "learning_rate": 7.683787503952985e-07, "logits/chosen": -0.208984375, "logits/rejected": 0.326171875, "logps/chosen": -434.0, "logps/rejected": -488.0, "loss": 0.069, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -16.0, "rewards/margins": 8.4375, "rewards/rejected": -24.5, "step": 10840 }, { "epoch": 0.7832238504295098, "grad_norm": 6.754124210962345, "learning_rate": 7.680245771797108e-07, "logits/chosen": -0.1591796875, "logits/rejected": 0.302734375, "logps/chosen": -458.0, "logps/rejected": -484.0, "loss": 0.0862, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -16.375, "rewards/margins": 8.9375, "rewards/rejected": -25.375, "step": 10850 }, { "epoch": 0.7839457157294449, "grad_norm": 7.819867149054748, "learning_rate": 7.676708932663293e-07, "logits/chosen": -0.2353515625, "logits/rejected": 0.2080078125, "logps/chosen": -416.0, "logps/rejected": -492.0, "loss": 0.1025, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -13.75, "rewards/margins": 9.9375, "rewards/rejected": -23.625, "step": 10860 }, { "epoch": 0.78466758102938, "grad_norm": 5.4107255863285895, "learning_rate": 7.67317697529545e-07, "logits/chosen": -0.28515625, "logits/rejected": 0.201171875, "logps/chosen": -424.0, "logps/rejected": -464.0, "loss": 0.0692, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -13.75, "rewards/margins": 8.0, "rewards/rejected": -21.75, "step": 10870 }, { "epoch": 0.785389446329315, "grad_norm": 6.118134872007548, "learning_rate": 7.669649888473704e-07, "logits/chosen": -0.3515625, "logits/rejected": 0.11181640625, "logps/chosen": -426.0, "logps/rejected": -450.0, "loss": 0.0938, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -13.25, "rewards/margins": 8.125, "rewards/rejected": -21.375, "step": 10880 }, { "epoch": 0.78611131162925, "grad_norm": 8.882379468077405, "learning_rate": 7.666127661014253e-07, "logits/chosen": -0.263671875, "logits/rejected": 0.255859375, "logps/chosen": -392.0, "logps/rejected": -464.0, "loss": 0.0822, "rewards/accuracies": 0.96875, "rewards/chosen": -13.375, "rewards/margins": 8.1875, "rewards/rejected": -21.625, "step": 10890 }, { "epoch": 0.786833176929185, "grad_norm": 8.178875220698325, "learning_rate": 7.662610281769211e-07, "logits/chosen": -0.28515625, "logits/rejected": 0.1337890625, "logps/chosen": -444.0, "logps/rejected": -488.0, "loss": 0.0824, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -14.875, "rewards/margins": 8.5625, "rewards/rejected": -23.5, "step": 10900 }, { "epoch": 0.7875550422291201, "grad_norm": 10.131580238809324, "learning_rate": 7.659097739626465e-07, "logits/chosen": -0.248046875, "logits/rejected": 0.1630859375, "logps/chosen": -420.0, "logps/rejected": -464.0, "loss": 0.0876, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -13.875, "rewards/margins": 8.3125, "rewards/rejected": -22.25, "step": 10910 }, { "epoch": 0.7882769075290551, "grad_norm": 7.832522677446272, "learning_rate": 7.655590023509527e-07, "logits/chosen": -0.107421875, "logits/rejected": 0.376953125, "logps/chosen": -428.0, "logps/rejected": -490.0, "loss": 0.0698, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -15.0625, "rewards/margins": 9.125, "rewards/rejected": -24.25, "step": 10920 }, { "epoch": 0.7889987728289901, "grad_norm": 8.256617750151781, "learning_rate": 7.652087122377384e-07, "logits/chosen": -0.291015625, "logits/rejected": 0.2890625, "logps/chosen": -428.0, "logps/rejected": -464.0, "loss": 0.0809, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -14.1875, "rewards/margins": 8.5, "rewards/rejected": -22.625, "step": 10930 }, { "epoch": 0.7897206381289251, "grad_norm": 8.495925762603013, "learning_rate": 7.648589025224355e-07, "logits/chosen": -0.1708984375, "logits/rejected": 0.1591796875, "logps/chosen": -412.0, "logps/rejected": -476.0, "loss": 0.0815, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -14.6875, "rewards/margins": 8.125, "rewards/rejected": -22.875, "step": 10940 }, { "epoch": 0.7904425034288601, "grad_norm": 11.380510012804129, "learning_rate": 7.645095721079945e-07, "logits/chosen": -0.208984375, "logits/rejected": 0.24609375, "logps/chosen": -400.0, "logps/rejected": -456.0, "loss": 0.1201, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -14.375, "rewards/margins": 8.25, "rewards/rejected": -22.625, "step": 10950 }, { "epoch": 0.7911643687287953, "grad_norm": 6.7523251972561695, "learning_rate": 7.641607199008701e-07, "logits/chosen": -0.3046875, "logits/rejected": 0.1708984375, "logps/chosen": -450.0, "logps/rejected": -480.0, "loss": 0.0695, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -15.4375, "rewards/margins": 7.875, "rewards/rejected": -23.375, "step": 10960 }, { "epoch": 0.7918862340287303, "grad_norm": 13.694241612018695, "learning_rate": 7.638123448110066e-07, "logits/chosen": -0.33984375, "logits/rejected": 0.1162109375, "logps/chosen": -430.0, "logps/rejected": -476.0, "loss": 0.1087, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -14.6875, "rewards/margins": 8.3125, "rewards/rejected": -23.0, "step": 10970 }, { "epoch": 0.7926080993286653, "grad_norm": 7.643368489458751, "learning_rate": 7.634644457518242e-07, "logits/chosen": -0.314453125, "logits/rejected": 0.2158203125, "logps/chosen": -452.0, "logps/rejected": -488.0, "loss": 0.0709, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -15.0625, "rewards/margins": 8.75, "rewards/rejected": -23.875, "step": 10980 }, { "epoch": 0.7933299646286003, "grad_norm": 3.1132851159636954, "learning_rate": 7.631170216402039e-07, "logits/chosen": -0.384765625, "logits/rejected": 0.0299072265625, "logps/chosen": -422.0, "logps/rejected": -464.0, "loss": 0.0762, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -14.5625, "rewards/margins": 7.9375, "rewards/rejected": -22.5, "step": 10990 }, { "epoch": 0.7940518299285353, "grad_norm": 10.644135500509968, "learning_rate": 7.627700713964739e-07, "logits/chosen": -0.2412109375, "logits/rejected": 0.162109375, "logps/chosen": -418.0, "logps/rejected": -472.0, "loss": 0.0962, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -14.6875, "rewards/margins": 9.125, "rewards/rejected": -23.875, "step": 11000 }, { "epoch": 0.7947736952284704, "grad_norm": 8.621568813725244, "learning_rate": 7.624235939443953e-07, "logits/chosen": -0.435546875, "logits/rejected": 0.265625, "logps/chosen": -426.0, "logps/rejected": -468.0, "loss": 0.0975, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -14.875, "rewards/margins": 8.4375, "rewards/rejected": -23.25, "step": 11010 }, { "epoch": 0.7954955605284054, "grad_norm": 7.916782366981212, "learning_rate": 7.620775882111482e-07, "logits/chosen": -0.375, "logits/rejected": 0.1640625, "logps/chosen": -420.0, "logps/rejected": -482.0, "loss": 0.0934, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -14.875, "rewards/margins": 8.125, "rewards/rejected": -23.0, "step": 11020 }, { "epoch": 0.7962174258283404, "grad_norm": 13.457394150040164, "learning_rate": 7.617320531273181e-07, "logits/chosen": -0.353515625, "logits/rejected": 0.3125, "logps/chosen": -418.0, "logps/rejected": -466.0, "loss": 0.0772, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -14.5625, "rewards/margins": 9.25, "rewards/rejected": -23.875, "step": 11030 }, { "epoch": 0.7969392911282754, "grad_norm": 8.749330675489764, "learning_rate": 7.613869876268809e-07, "logits/chosen": -0.375, "logits/rejected": 0.1875, "logps/chosen": -412.0, "logps/rejected": -464.0, "loss": 0.0918, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -14.4375, "rewards/margins": 8.625, "rewards/rejected": -23.125, "step": 11040 }, { "epoch": 0.7976611564282104, "grad_norm": 11.263784947140397, "learning_rate": 7.610423906471905e-07, "logits/chosen": -0.396484375, "logits/rejected": 0.091796875, "logps/chosen": -428.0, "logps/rejected": -480.0, "loss": 0.0702, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -15.0, "rewards/margins": 8.25, "rewards/rejected": -23.25, "step": 11050 }, { "epoch": 0.7983830217281456, "grad_norm": 8.817480029990136, "learning_rate": 7.606982611289639e-07, "logits/chosen": -0.265625, "logits/rejected": 0.26171875, "logps/chosen": -452.0, "logps/rejected": -492.0, "loss": 0.0894, "rewards/accuracies": 0.96875, "rewards/chosen": -15.8125, "rewards/margins": 9.0, "rewards/rejected": -24.75, "step": 11060 }, { "epoch": 0.7991048870280806, "grad_norm": 6.911004827233596, "learning_rate": 7.60354598016268e-07, "logits/chosen": -0.3125, "logits/rejected": 0.140625, "logps/chosen": -454.0, "logps/rejected": -490.0, "loss": 0.0832, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -15.875, "rewards/margins": 8.6875, "rewards/rejected": -24.5, "step": 11070 }, { "epoch": 0.7998267523280156, "grad_norm": 9.196763640349275, "learning_rate": 7.600114002565063e-07, "logits/chosen": -0.28515625, "logits/rejected": 0.2890625, "logps/chosen": -452.0, "logps/rejected": -508.0, "loss": 0.0732, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -17.125, "rewards/margins": 9.0, "rewards/rejected": -26.125, "step": 11080 }, { "epoch": 0.8005486176279506, "grad_norm": 9.654798892913139, "learning_rate": 7.596686668004049e-07, "logits/chosen": -0.291015625, "logits/rejected": 0.138671875, "logps/chosen": -440.0, "logps/rejected": -472.0, "loss": 0.0763, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -15.625, "rewards/margins": 8.375, "rewards/rejected": -24.0, "step": 11090 }, { "epoch": 0.8012704829278856, "grad_norm": 9.927164010432259, "learning_rate": 7.593263966019991e-07, "logits/chosen": -0.40625, "logits/rejected": 0.19921875, "logps/chosen": -418.0, "logps/rejected": -486.0, "loss": 0.1124, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -15.0, "rewards/margins": 8.6875, "rewards/rejected": -23.75, "step": 11100 }, { "epoch": 0.8019923482278207, "grad_norm": 11.154923787326965, "learning_rate": 7.589845886186201e-07, "logits/chosen": -0.265625, "logits/rejected": 0.158203125, "logps/chosen": -442.0, "logps/rejected": -488.0, "loss": 0.1008, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -16.25, "rewards/margins": 8.8125, "rewards/rejected": -25.0, "step": 11110 }, { "epoch": 0.8027142135277557, "grad_norm": 12.485348608692934, "learning_rate": 7.586432418108816e-07, "logits/chosen": -0.328125, "logits/rejected": 0.11865234375, "logps/chosen": -450.0, "logps/rejected": -480.0, "loss": 0.0762, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -15.125, "rewards/margins": 8.25, "rewards/rejected": -23.375, "step": 11120 }, { "epoch": 0.8034360788276907, "grad_norm": 10.010768762384323, "learning_rate": 7.583023551426664e-07, "logits/chosen": -0.3046875, "logits/rejected": 0.212890625, "logps/chosen": -420.0, "logps/rejected": -474.0, "loss": 0.0948, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -14.375, "rewards/margins": 9.375, "rewards/rejected": -23.75, "step": 11130 }, { "epoch": 0.8041579441276258, "grad_norm": 11.445945630193068, "learning_rate": 7.579619275811138e-07, "logits/chosen": -0.322265625, "logits/rejected": 0.189453125, "logps/chosen": -440.0, "logps/rejected": -474.0, "loss": 0.0911, "rewards/accuracies": 0.96875, "rewards/chosen": -14.6875, "rewards/margins": 8.375, "rewards/rejected": -23.0, "step": 11140 }, { "epoch": 0.8048798094275608, "grad_norm": 14.197321395864547, "learning_rate": 7.576219580966055e-07, "logits/chosen": -0.181640625, "logits/rejected": 0.2421875, "logps/chosen": -404.0, "logps/rejected": -474.0, "loss": 0.0692, "rewards/accuracies": 0.96875, "rewards/chosen": -14.5625, "rewards/margins": 8.75, "rewards/rejected": -23.25, "step": 11150 }, { "epoch": 0.8056016747274959, "grad_norm": 5.3677035843598455, "learning_rate": 7.57282445662753e-07, "logits/chosen": -0.271484375, "logits/rejected": 0.2412109375, "logps/chosen": -408.0, "logps/rejected": -470.0, "loss": 0.0965, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -15.0, "rewards/margins": 8.0, "rewards/rejected": -23.0, "step": 11160 }, { "epoch": 0.8063235400274309, "grad_norm": 12.724092980868754, "learning_rate": 7.569433892563852e-07, "logits/chosen": -0.2177734375, "logits/rejected": 0.28125, "logps/chosen": -428.0, "logps/rejected": -492.0, "loss": 0.0919, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -14.0, "rewards/margins": 9.1875, "rewards/rejected": -23.25, "step": 11170 }, { "epoch": 0.8070454053273659, "grad_norm": 4.131280000391168, "learning_rate": 7.566047878575343e-07, "logits/chosen": -0.271484375, "logits/rejected": 0.17578125, "logps/chosen": -400.0, "logps/rejected": -468.0, "loss": 0.0965, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -12.8125, "rewards/margins": 8.3125, "rewards/rejected": -21.125, "step": 11180 }, { "epoch": 0.8077672706273009, "grad_norm": 10.27392701879549, "learning_rate": 7.562666404494236e-07, "logits/chosen": -0.255859375, "logits/rejected": 0.275390625, "logps/chosen": -428.0, "logps/rejected": -464.0, "loss": 0.0723, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -14.0, "rewards/margins": 8.4375, "rewards/rejected": -22.5, "step": 11190 }, { "epoch": 0.8084891359272359, "grad_norm": 6.415927806647793, "learning_rate": 7.559289460184543e-07, "logits/chosen": -0.2119140625, "logits/rejected": 0.296875, "logps/chosen": -434.0, "logps/rejected": -476.0, "loss": 0.0774, "rewards/accuracies": 0.96875, "rewards/chosen": -14.875, "rewards/margins": 8.625, "rewards/rejected": -23.5, "step": 11200 }, { "epoch": 0.809211001227171, "grad_norm": 7.000849649624034, "learning_rate": 7.555917035541937e-07, "logits/chosen": -0.212890625, "logits/rejected": 0.24609375, "logps/chosen": -406.0, "logps/rejected": -450.0, "loss": 0.0813, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -14.1875, "rewards/margins": 8.125, "rewards/rejected": -22.375, "step": 11210 }, { "epoch": 0.809932866527106, "grad_norm": 5.051127011612099, "learning_rate": 7.552549120493609e-07, "logits/chosen": -0.205078125, "logits/rejected": 0.1943359375, "logps/chosen": -420.0, "logps/rejected": -452.0, "loss": 0.0664, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -14.375, "rewards/margins": 8.125, "rewards/rejected": -22.5, "step": 11220 }, { "epoch": 0.8106547318270411, "grad_norm": 7.122710572840244, "learning_rate": 7.549185704998158e-07, "logits/chosen": -0.171875, "logits/rejected": 0.2392578125, "logps/chosen": -414.0, "logps/rejected": -460.0, "loss": 0.089, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -15.0, "rewards/margins": 7.84375, "rewards/rejected": -22.875, "step": 11230 }, { "epoch": 0.8113765971269761, "grad_norm": 9.380252008462143, "learning_rate": 7.545826779045449e-07, "logits/chosen": -0.1259765625, "logits/rejected": 0.291015625, "logps/chosen": -432.0, "logps/rejected": -482.0, "loss": 0.112, "rewards/accuracies": 0.9375, "rewards/chosen": -15.875, "rewards/margins": 7.59375, "rewards/rejected": -23.5, "step": 11240 }, { "epoch": 0.8120984624269111, "grad_norm": 10.371874310085431, "learning_rate": 7.542472332656506e-07, "logits/chosen": -0.169921875, "logits/rejected": 0.185546875, "logps/chosen": -414.0, "logps/rejected": -490.0, "loss": 0.1031, "rewards/accuracies": 0.96875, "rewards/chosen": -13.6875, "rewards/margins": 8.3125, "rewards/rejected": -22.0, "step": 11250 }, { "epoch": 0.8128203277268462, "grad_norm": 10.953653316700672, "learning_rate": 7.539122355883373e-07, "logits/chosen": -0.236328125, "logits/rejected": 0.26953125, "logps/chosen": -414.0, "logps/rejected": -458.0, "loss": 0.1004, "rewards/accuracies": 0.96875, "rewards/chosen": -14.625, "rewards/margins": 7.96875, "rewards/rejected": -22.625, "step": 11260 }, { "epoch": 0.8135421930267812, "grad_norm": 8.793812066555885, "learning_rate": 7.535776838808995e-07, "logits/chosen": -0.322265625, "logits/rejected": 0.11865234375, "logps/chosen": -422.0, "logps/rejected": -464.0, "loss": 0.0845, "rewards/accuracies": 0.96875, "rewards/chosen": -14.0625, "rewards/margins": 8.3125, "rewards/rejected": -22.375, "step": 11270 }, { "epoch": 0.8142640583267162, "grad_norm": 9.077884294369802, "learning_rate": 7.532435771547094e-07, "logits/chosen": -0.1533203125, "logits/rejected": 0.271484375, "logps/chosen": -408.0, "logps/rejected": -462.0, "loss": 0.0875, "rewards/accuracies": 0.96875, "rewards/chosen": -13.3125, "rewards/margins": 8.1875, "rewards/rejected": -21.5, "step": 11280 }, { "epoch": 0.8149859236266512, "grad_norm": 5.901434491035247, "learning_rate": 7.52909914424205e-07, "logits/chosen": -0.310546875, "logits/rejected": 0.1240234375, "logps/chosen": -440.0, "logps/rejected": -476.0, "loss": 0.0756, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -14.3125, "rewards/margins": 7.875, "rewards/rejected": -22.125, "step": 11290 }, { "epoch": 0.8157077889265864, "grad_norm": 5.316155040830778, "learning_rate": 7.525766947068777e-07, "logits/chosen": -0.130859375, "logits/rejected": 0.25390625, "logps/chosen": -408.0, "logps/rejected": -482.0, "loss": 0.0779, "rewards/accuracies": 0.96875, "rewards/chosen": -14.0625, "rewards/margins": 8.3125, "rewards/rejected": -22.375, "step": 11300 }, { "epoch": 0.8164296542265214, "grad_norm": 6.0764085040259355, "learning_rate": 7.522439170232598e-07, "logits/chosen": -0.125, "logits/rejected": 0.443359375, "logps/chosen": -414.0, "logps/rejected": -464.0, "loss": 0.0843, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -15.8125, "rewards/margins": 8.625, "rewards/rejected": -24.375, "step": 11310 }, { "epoch": 0.8171515195264564, "grad_norm": 6.118457695456505, "learning_rate": 7.519115803969124e-07, "logits/chosen": -0.169921875, "logits/rejected": 0.2314453125, "logps/chosen": -426.0, "logps/rejected": -476.0, "loss": 0.099, "rewards/accuracies": 0.96875, "rewards/chosen": -15.6875, "rewards/margins": 8.0625, "rewards/rejected": -23.75, "step": 11320 }, { "epoch": 0.8178733848263914, "grad_norm": 6.303094889271294, "learning_rate": 7.515796838544139e-07, "logits/chosen": -0.306640625, "logits/rejected": 0.380859375, "logps/chosen": -442.0, "logps/rejected": -492.0, "loss": 0.0811, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -16.375, "rewards/margins": 9.5625, "rewards/rejected": -26.0, "step": 11330 }, { "epoch": 0.8185952501263264, "grad_norm": 14.442467546926125, "learning_rate": 7.51248226425348e-07, "logits/chosen": -0.15234375, "logits/rejected": 0.275390625, "logps/chosen": -442.0, "logps/rejected": -506.0, "loss": 0.0854, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -16.875, "rewards/margins": 8.5625, "rewards/rejected": -25.375, "step": 11340 }, { "epoch": 0.8193171154262615, "grad_norm": 4.890280674603113, "learning_rate": 7.509172071422913e-07, "logits/chosen": -0.390625, "logits/rejected": 0.14453125, "logps/chosen": -466.0, "logps/rejected": -500.0, "loss": 0.0727, "rewards/accuracies": 1.0, "rewards/chosen": -16.125, "rewards/margins": 9.3125, "rewards/rejected": -25.375, "step": 11350 }, { "epoch": 0.8200389807261965, "grad_norm": 13.406013504074718, "learning_rate": 7.505866250408015e-07, "logits/chosen": -0.173828125, "logits/rejected": 0.318359375, "logps/chosen": -448.0, "logps/rejected": -488.0, "loss": 0.0961, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -16.125, "rewards/margins": 8.0, "rewards/rejected": -24.25, "step": 11360 }, { "epoch": 0.8207608460261315, "grad_norm": 7.416946615030462, "learning_rate": 7.50256479159406e-07, "logits/chosen": -0.1650390625, "logits/rejected": 0.33203125, "logps/chosen": -394.0, "logps/rejected": -448.0, "loss": 0.079, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -14.125, "rewards/margins": 7.71875, "rewards/rejected": -21.875, "step": 11370 }, { "epoch": 0.8214827113260665, "grad_norm": 10.453328574243224, "learning_rate": 7.499267685395902e-07, "logits/chosen": -0.173828125, "logits/rejected": 0.232421875, "logps/chosen": -434.0, "logps/rejected": -476.0, "loss": 0.1364, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -14.75, "rewards/margins": 8.25, "rewards/rejected": -23.0, "step": 11380 }, { "epoch": 0.8222045766260015, "grad_norm": 12.340761656308382, "learning_rate": 7.495974922257845e-07, "logits/chosen": -0.2060546875, "logits/rejected": 0.26171875, "logps/chosen": -466.0, "logps/rejected": -480.0, "loss": 0.0829, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -16.25, "rewards/margins": 7.46875, "rewards/rejected": -23.75, "step": 11390 }, { "epoch": 0.8229264419259367, "grad_norm": 8.875417728896911, "learning_rate": 7.492686492653552e-07, "logits/chosen": -0.1767578125, "logits/rejected": 0.265625, "logps/chosen": -420.0, "logps/rejected": -482.0, "loss": 0.0926, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -16.125, "rewards/margins": 7.46875, "rewards/rejected": -23.5, "step": 11400 }, { "epoch": 0.8236483072258717, "grad_norm": 7.408107349597455, "learning_rate": 7.489402387085902e-07, "logits/chosen": -0.205078125, "logits/rejected": 0.34765625, "logps/chosen": -414.0, "logps/rejected": -466.0, "loss": 0.0757, "rewards/accuracies": 0.96875, "rewards/chosen": -15.25, "rewards/margins": 8.625, "rewards/rejected": -23.875, "step": 11410 }, { "epoch": 0.8243701725258067, "grad_norm": 9.506330250150752, "learning_rate": 7.486122596086891e-07, "logits/chosen": -0.3828125, "logits/rejected": 0.2255859375, "logps/chosen": -470.0, "logps/rejected": -532.0, "loss": 0.1293, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -17.25, "rewards/margins": 9.375, "rewards/rejected": -26.625, "step": 11420 }, { "epoch": 0.8250920378257417, "grad_norm": 11.587353384655874, "learning_rate": 7.482847110217516e-07, "logits/chosen": -0.193359375, "logits/rejected": 0.28125, "logps/chosen": -432.0, "logps/rejected": -486.0, "loss": 0.0937, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -15.6875, "rewards/margins": 8.75, "rewards/rejected": -24.5, "step": 11430 }, { "epoch": 0.8258139031256767, "grad_norm": 7.5727209541788145, "learning_rate": 7.479575920067657e-07, "logits/chosen": -0.10791015625, "logits/rejected": 0.2734375, "logps/chosen": -430.0, "logps/rejected": -480.0, "loss": 0.0885, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -16.125, "rewards/margins": 8.0, "rewards/rejected": -24.125, "step": 11440 }, { "epoch": 0.8265357684256118, "grad_norm": 7.984492267581647, "learning_rate": 7.476309016255964e-07, "logits/chosen": -0.32421875, "logits/rejected": 0.263671875, "logps/chosen": -454.0, "logps/rejected": -486.0, "loss": 0.0908, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -14.9375, "rewards/margins": 9.4375, "rewards/rejected": -24.375, "step": 11450 }, { "epoch": 0.8272576337255468, "grad_norm": 11.509484310168613, "learning_rate": 7.473046389429744e-07, "logits/chosen": -0.1552734375, "logits/rejected": 0.337890625, "logps/chosen": -432.0, "logps/rejected": -486.0, "loss": 0.0834, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -15.5, "rewards/margins": 8.9375, "rewards/rejected": -24.375, "step": 11460 }, { "epoch": 0.8279794990254818, "grad_norm": 8.757179674349313, "learning_rate": 7.469788030264852e-07, "logits/chosen": -0.1474609375, "logits/rejected": 0.2578125, "logps/chosen": -460.0, "logps/rejected": -490.0, "loss": 0.0843, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -16.375, "rewards/margins": 8.1875, "rewards/rejected": -24.5, "step": 11470 }, { "epoch": 0.8287013643254169, "grad_norm": 5.923595915954257, "learning_rate": 7.466533929465574e-07, "logits/chosen": -0.1767578125, "logits/rejected": 0.2578125, "logps/chosen": -470.0, "logps/rejected": -492.0, "loss": 0.0879, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -18.25, "rewards/margins": 8.1875, "rewards/rejected": -26.375, "step": 11480 }, { "epoch": 0.8294232296253519, "grad_norm": 6.481971345655886, "learning_rate": 7.463284077764519e-07, "logits/chosen": -0.1572265625, "logits/rejected": 0.302734375, "logps/chosen": -460.0, "logps/rejected": -516.0, "loss": 0.0861, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -17.125, "rewards/margins": 9.1875, "rewards/rejected": -26.25, "step": 11490 }, { "epoch": 0.830145094925287, "grad_norm": 10.157894525440891, "learning_rate": 7.460038465922511e-07, "logits/chosen": -0.0634765625, "logits/rejected": 0.326171875, "logps/chosen": -460.0, "logps/rejected": -498.0, "loss": 0.0924, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -17.5, "rewards/margins": 8.125, "rewards/rejected": -25.625, "step": 11500 }, { "epoch": 0.830866960225222, "grad_norm": 10.692255195619332, "learning_rate": 7.456797084728466e-07, "logits/chosen": -0.016357421875, "logits/rejected": 0.384765625, "logps/chosen": -436.0, "logps/rejected": -516.0, "loss": 0.0707, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -17.375, "rewards/margins": 9.125, "rewards/rejected": -26.625, "step": 11510 }, { "epoch": 0.831588825525157, "grad_norm": 7.053514740280467, "learning_rate": 7.453559924999299e-07, "logits/chosen": -0.040283203125, "logits/rejected": 0.318359375, "logps/chosen": -432.0, "logps/rejected": -506.0, "loss": 0.0863, "rewards/accuracies": 0.96875, "rewards/chosen": -16.625, "rewards/margins": 9.125, "rewards/rejected": -25.75, "step": 11520 }, { "epoch": 0.832310690825092, "grad_norm": 11.259246415222528, "learning_rate": 7.450326977579804e-07, "logits/chosen": -0.310546875, "logits/rejected": 0.1572265625, "logps/chosen": -454.0, "logps/rejected": -516.0, "loss": 0.1101, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -16.875, "rewards/margins": 8.8125, "rewards/rejected": -25.625, "step": 11530 }, { "epoch": 0.833032556125027, "grad_norm": 8.971300878382278, "learning_rate": 7.447098233342549e-07, "logits/chosen": -0.03076171875, "logits/rejected": 0.3046875, "logps/chosen": -458.0, "logps/rejected": -520.0, "loss": 0.0707, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -16.125, "rewards/margins": 9.4375, "rewards/rejected": -25.625, "step": 11540 }, { "epoch": 0.8337544214249621, "grad_norm": 10.019084995419709, "learning_rate": 7.443873683187767e-07, "logits/chosen": -0.1748046875, "logits/rejected": 0.29296875, "logps/chosen": -442.0, "logps/rejected": -500.0, "loss": 0.0802, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -17.5, "rewards/margins": 9.1875, "rewards/rejected": -26.625, "step": 11550 }, { "epoch": 0.8344762867248972, "grad_norm": 3.567454455291716, "learning_rate": 7.440653318043245e-07, "logits/chosen": -0.283203125, "logits/rejected": 0.26953125, "logps/chosen": -482.0, "logps/rejected": -498.0, "loss": 0.0842, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -18.25, "rewards/margins": 8.3125, "rewards/rejected": -26.5, "step": 11560 }, { "epoch": 0.8351981520248322, "grad_norm": 7.669759674587947, "learning_rate": 7.437437128864224e-07, "logits/chosen": -0.25, "logits/rejected": 0.23046875, "logps/chosen": -464.0, "logps/rejected": -520.0, "loss": 0.068, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -17.875, "rewards/margins": 9.0, "rewards/rejected": -26.875, "step": 11570 }, { "epoch": 0.8359200173247672, "grad_norm": 8.952073339476055, "learning_rate": 7.434225106633287e-07, "logits/chosen": -0.33984375, "logits/rejected": 0.212890625, "logps/chosen": -462.0, "logps/rejected": -500.0, "loss": 0.0908, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -17.5, "rewards/margins": 9.5625, "rewards/rejected": -27.0, "step": 11580 }, { "epoch": 0.8366418826247022, "grad_norm": 10.826254335739078, "learning_rate": 7.431017242360253e-07, "logits/chosen": -0.236328125, "logits/rejected": 0.263671875, "logps/chosen": -466.0, "logps/rejected": -524.0, "loss": 0.0798, "rewards/accuracies": 0.96875, "rewards/chosen": -18.625, "rewards/margins": 8.75, "rewards/rejected": -27.375, "step": 11590 }, { "epoch": 0.8373637479246373, "grad_norm": 8.82618489913904, "learning_rate": 7.427813527082074e-07, "logits/chosen": -0.251953125, "logits/rejected": 0.37890625, "logps/chosen": -454.0, "logps/rejected": -506.0, "loss": 0.0944, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -18.5, "rewards/margins": 9.0625, "rewards/rejected": -27.625, "step": 11600 }, { "epoch": 0.8380856132245723, "grad_norm": 8.533118652392847, "learning_rate": 7.424613951862727e-07, "logits/chosen": -0.12158203125, "logits/rejected": 0.421875, "logps/chosen": -426.0, "logps/rejected": -470.0, "loss": 0.072, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -17.0, "rewards/margins": 8.5625, "rewards/rejected": -25.625, "step": 11610 }, { "epoch": 0.8388074785245073, "grad_norm": 4.562239474060393, "learning_rate": 7.42141850779311e-07, "logits/chosen": -0.2119140625, "logits/rejected": 0.314453125, "logps/chosen": -442.0, "logps/rejected": -502.0, "loss": 0.0926, "rewards/accuracies": 0.96875, "rewards/chosen": -18.125, "rewards/margins": 8.75, "rewards/rejected": -26.875, "step": 11620 }, { "epoch": 0.8395293438244423, "grad_norm": 8.241861177258324, "learning_rate": 7.418227185990941e-07, "logits/chosen": -0.2314453125, "logits/rejected": 0.25, "logps/chosen": -468.0, "logps/rejected": -516.0, "loss": 0.1062, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -19.375, "rewards/margins": 8.4375, "rewards/rejected": -27.875, "step": 11630 }, { "epoch": 0.8402512091243773, "grad_norm": 8.253956561994894, "learning_rate": 7.415039977600647e-07, "logits/chosen": -0.23046875, "logits/rejected": 0.419921875, "logps/chosen": -472.0, "logps/rejected": -520.0, "loss": 0.0981, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -20.25, "rewards/margins": 9.1875, "rewards/rejected": -29.375, "step": 11640 }, { "epoch": 0.8409730744243125, "grad_norm": 7.615820576392514, "learning_rate": 7.411856873793271e-07, "logits/chosen": -0.0947265625, "logits/rejected": 0.37890625, "logps/chosen": -496.0, "logps/rejected": -548.0, "loss": 0.1104, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -20.125, "rewards/margins": 8.5625, "rewards/rejected": -28.75, "step": 11650 }, { "epoch": 0.8416949397242475, "grad_norm": 6.841933127431233, "learning_rate": 7.408677865766361e-07, "logits/chosen": -0.19140625, "logits/rejected": 0.23046875, "logps/chosen": -450.0, "logps/rejected": -516.0, "loss": 0.0836, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -18.375, "rewards/margins": 8.5625, "rewards/rejected": -26.875, "step": 11660 }, { "epoch": 0.8424168050241825, "grad_norm": 8.101742108005885, "learning_rate": 7.405502944743868e-07, "logits/chosen": -0.306640625, "logits/rejected": 0.30078125, "logps/chosen": -460.0, "logps/rejected": -516.0, "loss": 0.0778, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -18.25, "rewards/margins": 8.375, "rewards/rejected": -26.5, "step": 11670 }, { "epoch": 0.8431386703241175, "grad_norm": 5.342364099582754, "learning_rate": 7.402332101976052e-07, "logits/chosen": -0.328125, "logits/rejected": 0.126953125, "logps/chosen": -476.0, "logps/rejected": -516.0, "loss": 0.1073, "rewards/accuracies": 0.9375, "rewards/chosen": -18.625, "rewards/margins": 7.46875, "rewards/rejected": -26.125, "step": 11680 }, { "epoch": 0.8438605356240525, "grad_norm": 5.16495497915405, "learning_rate": 7.399165328739372e-07, "logits/chosen": -0.326171875, "logits/rejected": 0.1611328125, "logps/chosen": -454.0, "logps/rejected": -506.0, "loss": 0.1156, "rewards/accuracies": 0.96875, "rewards/chosen": -16.25, "rewards/margins": 8.0625, "rewards/rejected": -24.25, "step": 11690 }, { "epoch": 0.8445824009239876, "grad_norm": 10.147907311579816, "learning_rate": 7.396002616336387e-07, "logits/chosen": -0.359375, "logits/rejected": 0.201171875, "logps/chosen": -428.0, "logps/rejected": -464.0, "loss": 0.0931, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -14.9375, "rewards/margins": 7.90625, "rewards/rejected": -22.875, "step": 11700 }, { "epoch": 0.8453042662239226, "grad_norm": 10.296448261582464, "learning_rate": 7.392843956095663e-07, "logits/chosen": -0.2177734375, "logits/rejected": 0.22265625, "logps/chosen": -414.0, "logps/rejected": -486.0, "loss": 0.0842, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -15.75, "rewards/margins": 8.5625, "rewards/rejected": -24.375, "step": 11710 }, { "epoch": 0.8460261315238576, "grad_norm": 11.742358831157073, "learning_rate": 7.389689339371664e-07, "logits/chosen": -0.2060546875, "logits/rejected": 0.20703125, "logps/chosen": -418.0, "logps/rejected": -478.0, "loss": 0.0752, "rewards/accuracies": 0.96875, "rewards/chosen": -15.875, "rewards/margins": 7.78125, "rewards/rejected": -23.625, "step": 11720 }, { "epoch": 0.8467479968237926, "grad_norm": 9.178644081207475, "learning_rate": 7.386538757544653e-07, "logits/chosen": -0.388671875, "logits/rejected": 0.16796875, "logps/chosen": -422.0, "logps/rejected": -488.0, "loss": 0.109, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -15.125, "rewards/margins": 8.875, "rewards/rejected": -24.0, "step": 11730 }, { "epoch": 0.8474698621237277, "grad_norm": 7.355974357615239, "learning_rate": 7.3833922020206e-07, "logits/chosen": -0.3359375, "logits/rejected": 0.0157470703125, "logps/chosen": -474.0, "logps/rejected": -516.0, "loss": 0.0939, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -17.125, "rewards/margins": 8.125, "rewards/rejected": -25.25, "step": 11740 }, { "epoch": 0.8481917274236628, "grad_norm": 6.671505523119575, "learning_rate": 7.38024966423108e-07, "logits/chosen": -0.31640625, "logits/rejected": 0.087890625, "logps/chosen": -440.0, "logps/rejected": -508.0, "loss": 0.0873, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -17.625, "rewards/margins": 8.5625, "rewards/rejected": -26.125, "step": 11750 }, { "epoch": 0.8489135927235978, "grad_norm": 13.48409676527448, "learning_rate": 7.377111135633174e-07, "logits/chosen": -0.205078125, "logits/rejected": 0.236328125, "logps/chosen": -436.0, "logps/rejected": -486.0, "loss": 0.08, "rewards/accuracies": 0.96875, "rewards/chosen": -17.0, "rewards/margins": 7.71875, "rewards/rejected": -24.75, "step": 11760 }, { "epoch": 0.8496354580235328, "grad_norm": 12.220559372198059, "learning_rate": 7.373976607709372e-07, "logits/chosen": -0.2314453125, "logits/rejected": 0.12451171875, "logps/chosen": -462.0, "logps/rejected": -544.0, "loss": 0.1175, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -18.25, "rewards/margins": 7.75, "rewards/rejected": -26.0, "step": 11770 }, { "epoch": 0.8503573233234678, "grad_norm": 9.940583378877355, "learning_rate": 7.370846071967476e-07, "logits/chosen": -0.33984375, "logits/rejected": 0.09912109375, "logps/chosen": -462.0, "logps/rejected": -516.0, "loss": 0.0915, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -16.75, "rewards/margins": 8.625, "rewards/rejected": -25.375, "step": 11780 }, { "epoch": 0.8510791886234029, "grad_norm": 10.6992444674567, "learning_rate": 7.367719519940501e-07, "logits/chosen": -0.1259765625, "logits/rejected": 0.1005859375, "logps/chosen": -460.0, "logps/rejected": -488.0, "loss": 0.0729, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -16.5, "rewards/margins": 7.78125, "rewards/rejected": -24.375, "step": 11790 }, { "epoch": 0.8518010539233379, "grad_norm": 6.99265566070554, "learning_rate": 7.364596943186587e-07, "logits/chosen": -0.3203125, "logits/rejected": 0.171875, "logps/chosen": -438.0, "logps/rejected": -468.0, "loss": 0.0914, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -16.125, "rewards/margins": 7.84375, "rewards/rejected": -24.0, "step": 11800 }, { "epoch": 0.852522919223273, "grad_norm": 11.989899213951977, "learning_rate": 7.36147833328889e-07, "logits/chosen": -0.263671875, "logits/rejected": 0.1650390625, "logps/chosen": -474.0, "logps/rejected": -540.0, "loss": 0.1042, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -17.5, "rewards/margins": 8.375, "rewards/rejected": -25.875, "step": 11810 }, { "epoch": 0.853244784523208, "grad_norm": 9.944241208616818, "learning_rate": 7.358363681855503e-07, "logits/chosen": -0.291015625, "logits/rejected": 0.306640625, "logps/chosen": -448.0, "logps/rejected": -472.0, "loss": 0.1007, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -16.5, "rewards/margins": 8.5, "rewards/rejected": -25.0, "step": 11820 }, { "epoch": 0.853966649823143, "grad_norm": 12.31399340515142, "learning_rate": 7.355252980519345e-07, "logits/chosen": -0.068359375, "logits/rejected": 0.271484375, "logps/chosen": -440.0, "logps/rejected": -506.0, "loss": 0.071, "rewards/accuracies": 0.96875, "rewards/chosen": -16.375, "rewards/margins": 8.8125, "rewards/rejected": -25.25, "step": 11830 }, { "epoch": 0.8546885151230781, "grad_norm": 11.141901987895709, "learning_rate": 7.352146220938078e-07, "logits/chosen": -0.2109375, "logits/rejected": 0.3359375, "logps/chosen": -426.0, "logps/rejected": -474.0, "loss": 0.0768, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -15.0625, "rewards/margins": 8.6875, "rewards/rejected": -23.75, "step": 11840 }, { "epoch": 0.8554103804230131, "grad_norm": 5.303659421001808, "learning_rate": 7.349043394794005e-07, "logits/chosen": -0.259765625, "logits/rejected": 0.2890625, "logps/chosen": -440.0, "logps/rejected": -482.0, "loss": 0.0644, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -16.25, "rewards/margins": 9.0, "rewards/rejected": -25.25, "step": 11850 }, { "epoch": 0.8561322457229481, "grad_norm": 5.6883870428894925, "learning_rate": 7.345944493793987e-07, "logits/chosen": -0.2314453125, "logits/rejected": 0.19140625, "logps/chosen": -462.0, "logps/rejected": -510.0, "loss": 0.0803, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -17.0, "rewards/margins": 9.25, "rewards/rejected": -26.25, "step": 11860 }, { "epoch": 0.8568541110228831, "grad_norm": 11.286278924873974, "learning_rate": 7.342849509669337e-07, "logits/chosen": -0.416015625, "logits/rejected": 0.240234375, "logps/chosen": -422.0, "logps/rejected": -482.0, "loss": 0.0679, "rewards/accuracies": 0.96875, "rewards/chosen": -15.25, "rewards/margins": 8.75, "rewards/rejected": -24.0, "step": 11870 }, { "epoch": 0.8575759763228181, "grad_norm": 13.35944190895779, "learning_rate": 7.339758434175737e-07, "logits/chosen": -0.2138671875, "logits/rejected": 0.1591796875, "logps/chosen": -448.0, "logps/rejected": -512.0, "loss": 0.0794, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -16.75, "rewards/margins": 8.5, "rewards/rejected": -25.25, "step": 11880 }, { "epoch": 0.8582978416227532, "grad_norm": 5.382676114948697, "learning_rate": 7.336671259093143e-07, "logits/chosen": -0.166015625, "logits/rejected": 0.328125, "logps/chosen": -454.0, "logps/rejected": -524.0, "loss": 0.0826, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -17.125, "rewards/margins": 9.1875, "rewards/rejected": -26.25, "step": 11890 }, { "epoch": 0.8590197069226883, "grad_norm": 7.471130926415352, "learning_rate": 7.33358797622569e-07, "logits/chosen": -0.2412109375, "logits/rejected": 0.2412109375, "logps/chosen": -452.0, "logps/rejected": -492.0, "loss": 0.0987, "rewards/accuracies": 0.96875, "rewards/chosen": -16.5, "rewards/margins": 8.375, "rewards/rejected": -24.875, "step": 11900 }, { "epoch": 0.8597415722226233, "grad_norm": 11.717146198278368, "learning_rate": 7.330508577401606e-07, "logits/chosen": -0.326171875, "logits/rejected": 0.1953125, "logps/chosen": -438.0, "logps/rejected": -486.0, "loss": 0.0829, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -14.75, "rewards/margins": 9.0, "rewards/rejected": -23.75, "step": 11910 }, { "epoch": 0.8604634375225583, "grad_norm": 7.834345727312983, "learning_rate": 7.327433054473117e-07, "logits/chosen": -0.20703125, "logits/rejected": 0.24609375, "logps/chosen": -470.0, "logps/rejected": -516.0, "loss": 0.0908, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -18.25, "rewards/margins": 8.5625, "rewards/rejected": -26.75, "step": 11920 }, { "epoch": 0.8611853028224933, "grad_norm": 9.30330106438001, "learning_rate": 7.324361399316357e-07, "logits/chosen": -0.287109375, "logits/rejected": 0.1884765625, "logps/chosen": -452.0, "logps/rejected": -504.0, "loss": 0.0907, "rewards/accuracies": 0.96875, "rewards/chosen": -17.0, "rewards/margins": 8.5, "rewards/rejected": -25.625, "step": 11930 }, { "epoch": 0.8619071681224284, "grad_norm": 6.6627836713529565, "learning_rate": 7.321293603831281e-07, "logits/chosen": -0.181640625, "logits/rejected": 0.1357421875, "logps/chosen": -460.0, "logps/rejected": -520.0, "loss": 0.0657, "rewards/accuracies": 0.96875, "rewards/chosen": -17.625, "rewards/margins": 9.1875, "rewards/rejected": -26.875, "step": 11940 }, { "epoch": 0.8626290334223634, "grad_norm": 6.913912583411662, "learning_rate": 7.318229659941572e-07, "logits/chosen": -0.169921875, "logits/rejected": 0.25390625, "logps/chosen": -490.0, "logps/rejected": -540.0, "loss": 0.078, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -19.125, "rewards/margins": 9.1875, "rewards/rejected": -28.375, "step": 11950 }, { "epoch": 0.8633508987222984, "grad_norm": 12.981963675111853, "learning_rate": 7.315169559594551e-07, "logits/chosen": -0.193359375, "logits/rejected": 0.3203125, "logps/chosen": -454.0, "logps/rejected": -508.0, "loss": 0.0975, "rewards/accuracies": 0.96875, "rewards/chosen": -19.375, "rewards/margins": 8.6875, "rewards/rejected": -28.125, "step": 11960 }, { "epoch": 0.8640727640222334, "grad_norm": 10.94082834293095, "learning_rate": 7.31211329476109e-07, "logits/chosen": -0.36328125, "logits/rejected": 0.18359375, "logps/chosen": -500.0, "logps/rejected": -532.0, "loss": 0.079, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -18.875, "rewards/margins": 8.9375, "rewards/rejected": -27.75, "step": 11970 }, { "epoch": 0.8647946293221684, "grad_norm": 11.702297047396875, "learning_rate": 7.309060857435526e-07, "logits/chosen": -0.3515625, "logits/rejected": 0.1064453125, "logps/chosen": -458.0, "logps/rejected": -512.0, "loss": 0.0808, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -17.625, "rewards/margins": 8.4375, "rewards/rejected": -26.0, "step": 11980 }, { "epoch": 0.8655164946221036, "grad_norm": 13.568998030034129, "learning_rate": 7.30601223963557e-07, "logits/chosen": -0.21484375, "logits/rejected": 0.16015625, "logps/chosen": -442.0, "logps/rejected": -520.0, "loss": 0.0893, "rewards/accuracies": 0.9375, "rewards/chosen": -17.375, "rewards/margins": 8.1875, "rewards/rejected": -25.5, "step": 11990 }, { "epoch": 0.8662383599220386, "grad_norm": 9.214553318065748, "learning_rate": 7.302967433402214e-07, "logits/chosen": -0.328125, "logits/rejected": 0.2109375, "logps/chosen": -444.0, "logps/rejected": -480.0, "loss": 0.0891, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -16.5, "rewards/margins": 8.6875, "rewards/rejected": -25.25, "step": 12000 }, { "epoch": 0.8669602252219736, "grad_norm": 10.011954855653801, "learning_rate": 7.299926430799657e-07, "logits/chosen": -0.310546875, "logits/rejected": 0.1748046875, "logps/chosen": -426.0, "logps/rejected": -488.0, "loss": 0.09, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -15.625, "rewards/margins": 7.5625, "rewards/rejected": -23.25, "step": 12010 }, { "epoch": 0.8676820905219086, "grad_norm": 9.865147790277799, "learning_rate": 7.296889223915205e-07, "logits/chosen": -0.232421875, "logits/rejected": 0.328125, "logps/chosen": -444.0, "logps/rejected": -502.0, "loss": 0.0987, "rewards/accuracies": 0.96875, "rewards/chosen": -16.375, "rewards/margins": 8.8125, "rewards/rejected": -25.25, "step": 12020 }, { "epoch": 0.8684039558218436, "grad_norm": 7.0228866710313325, "learning_rate": 7.293855804859192e-07, "logits/chosen": -0.234375, "logits/rejected": 0.2080078125, "logps/chosen": -410.0, "logps/rejected": -468.0, "loss": 0.0796, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -13.9375, "rewards/margins": 8.5625, "rewards/rejected": -22.5, "step": 12030 }, { "epoch": 0.8691258211217787, "grad_norm": 7.684333419828546, "learning_rate": 7.290826165764892e-07, "logits/chosen": -0.294921875, "logits/rejected": 0.1474609375, "logps/chosen": -420.0, "logps/rejected": -472.0, "loss": 0.0975, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -15.0, "rewards/margins": 8.3125, "rewards/rejected": -23.25, "step": 12040 }, { "epoch": 0.8698476864217137, "grad_norm": 12.678237663346309, "learning_rate": 7.28780029878843e-07, "logits/chosen": -0.28515625, "logits/rejected": 0.185546875, "logps/chosen": -474.0, "logps/rejected": -480.0, "loss": 0.0973, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -15.75, "rewards/margins": 8.5625, "rewards/rejected": -24.375, "step": 12050 }, { "epoch": 0.8705695517216487, "grad_norm": 6.203669328755798, "learning_rate": 7.284778196108706e-07, "logits/chosen": -0.2890625, "logits/rejected": 0.1669921875, "logps/chosen": -448.0, "logps/rejected": -488.0, "loss": 0.0555, "rewards/accuracies": 0.96875, "rewards/chosen": -15.5, "rewards/margins": 8.125, "rewards/rejected": -23.625, "step": 12060 }, { "epoch": 0.8712914170215837, "grad_norm": 8.511105211850087, "learning_rate": 7.281759849927299e-07, "logits/chosen": -0.330078125, "logits/rejected": 0.283203125, "logps/chosen": -434.0, "logps/rejected": -524.0, "loss": 0.1065, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -15.75, "rewards/margins": 8.625, "rewards/rejected": -24.25, "step": 12070 }, { "epoch": 0.8720132823215188, "grad_norm": 9.865753701739965, "learning_rate": 7.278745252468389e-07, "logits/chosen": -0.333984375, "logits/rejected": 0.15625, "logps/chosen": -434.0, "logps/rejected": -508.0, "loss": 0.0944, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -16.125, "rewards/margins": 9.0, "rewards/rejected": -25.125, "step": 12080 }, { "epoch": 0.8727351476214539, "grad_norm": 9.522017806329815, "learning_rate": 7.275734395978672e-07, "logits/chosen": -0.369140625, "logits/rejected": 0.1259765625, "logps/chosen": -428.0, "logps/rejected": -480.0, "loss": 0.0702, "rewards/accuracies": 0.96875, "rewards/chosen": -15.375, "rewards/margins": 7.84375, "rewards/rejected": -23.25, "step": 12090 }, { "epoch": 0.8734570129213889, "grad_norm": 4.287703366103392, "learning_rate": 7.272727272727272e-07, "logits/chosen": -0.2734375, "logits/rejected": 0.1611328125, "logps/chosen": -418.0, "logps/rejected": -472.0, "loss": 0.0824, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -15.3125, "rewards/margins": 8.6875, "rewards/rejected": -24.0, "step": 12100 }, { "epoch": 0.8741788782213239, "grad_norm": 7.6427089993224895, "learning_rate": 7.269723875005668e-07, "logits/chosen": -0.3046875, "logits/rejected": 0.2431640625, "logps/chosen": -444.0, "logps/rejected": -498.0, "loss": 0.0705, "rewards/accuracies": 0.96875, "rewards/chosen": -16.5, "rewards/margins": 8.4375, "rewards/rejected": -24.875, "step": 12110 }, { "epoch": 0.8749007435212589, "grad_norm": 10.002017996585428, "learning_rate": 7.266724195127595e-07, "logits/chosen": -0.1845703125, "logits/rejected": 0.251953125, "logps/chosen": -464.0, "logps/rejected": -506.0, "loss": 0.1011, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -17.25, "rewards/margins": 8.5, "rewards/rejected": -25.75, "step": 12120 }, { "epoch": 0.8756226088211939, "grad_norm": 12.12654412714052, "learning_rate": 7.26372822542898e-07, "logits/chosen": -0.2138671875, "logits/rejected": 0.1982421875, "logps/chosen": -414.0, "logps/rejected": -446.0, "loss": 0.086, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -14.75, "rewards/margins": 8.4375, "rewards/rejected": -23.125, "step": 12130 }, { "epoch": 0.876344474121129, "grad_norm": 7.543488174697192, "learning_rate": 7.260735958267845e-07, "logits/chosen": -0.2373046875, "logits/rejected": 0.25390625, "logps/chosen": -442.0, "logps/rejected": -482.0, "loss": 0.0679, "rewards/accuracies": 0.96875, "rewards/chosen": -14.8125, "rewards/margins": 8.375, "rewards/rejected": -23.25, "step": 12140 }, { "epoch": 0.877066339421064, "grad_norm": 12.467696826308307, "learning_rate": 7.257747386024231e-07, "logits/chosen": -0.3359375, "logits/rejected": 0.244140625, "logps/chosen": -430.0, "logps/rejected": -504.0, "loss": 0.075, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -15.875, "rewards/margins": 9.625, "rewards/rejected": -25.5, "step": 12150 }, { "epoch": 0.877788204720999, "grad_norm": 8.153482546239454, "learning_rate": 7.254762501100117e-07, "logits/chosen": -0.28125, "logits/rejected": 0.29296875, "logps/chosen": -448.0, "logps/rejected": -508.0, "loss": 0.0846, "rewards/accuracies": 0.96875, "rewards/chosen": -17.0, "rewards/margins": 8.4375, "rewards/rejected": -25.5, "step": 12160 }, { "epoch": 0.8785100700209341, "grad_norm": 7.0470227863753045, "learning_rate": 7.251781295919335e-07, "logits/chosen": -0.25390625, "logits/rejected": 0.228515625, "logps/chosen": -442.0, "logps/rejected": -476.0, "loss": 0.1032, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -15.5, "rewards/margins": 8.1875, "rewards/rejected": -23.75, "step": 12170 }, { "epoch": 0.8792319353208691, "grad_norm": 7.818425049688634, "learning_rate": 7.248803762927498e-07, "logits/chosen": -0.2412109375, "logits/rejected": 0.2353515625, "logps/chosen": -454.0, "logps/rejected": -480.0, "loss": 0.0834, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -16.25, "rewards/margins": 8.0625, "rewards/rejected": -24.25, "step": 12180 }, { "epoch": 0.8799538006208042, "grad_norm": 4.97658587287996, "learning_rate": 7.245829894591907e-07, "logits/chosen": -0.2236328125, "logits/rejected": 0.2177734375, "logps/chosen": -434.0, "logps/rejected": -482.0, "loss": 0.0944, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -16.125, "rewards/margins": 7.875, "rewards/rejected": -24.0, "step": 12190 }, { "epoch": 0.8806756659207392, "grad_norm": 7.959232600172129, "learning_rate": 7.242859683401482e-07, "logits/chosen": -0.2578125, "logits/rejected": 0.2392578125, "logps/chosen": -438.0, "logps/rejected": -488.0, "loss": 0.0788, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -15.6875, "rewards/margins": 8.75, "rewards/rejected": -24.375, "step": 12200 }, { "epoch": 0.8813975312206742, "grad_norm": 12.10204702821982, "learning_rate": 7.239893121866677e-07, "logits/chosen": -0.1484375, "logits/rejected": 0.2216796875, "logps/chosen": -448.0, "logps/rejected": -488.0, "loss": 0.0954, "rewards/accuracies": 0.96875, "rewards/chosen": -15.4375, "rewards/margins": 8.3125, "rewards/rejected": -23.75, "step": 12210 }, { "epoch": 0.8821193965206092, "grad_norm": 10.673367524320987, "learning_rate": 7.236930202519399e-07, "logits/chosen": -0.134765625, "logits/rejected": 0.353515625, "logps/chosen": -440.0, "logps/rejected": -506.0, "loss": 0.0922, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -16.875, "rewards/margins": 9.1875, "rewards/rejected": -26.125, "step": 12220 }, { "epoch": 0.8828412618205442, "grad_norm": 2.630447999455839, "learning_rate": 7.233970917912936e-07, "logits/chosen": -0.38671875, "logits/rejected": 0.1884765625, "logps/chosen": -424.0, "logps/rejected": -478.0, "loss": 0.084, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -15.8125, "rewards/margins": 8.8125, "rewards/rejected": -24.625, "step": 12230 }, { "epoch": 0.8835631271204794, "grad_norm": 9.896684651201012, "learning_rate": 7.231015260621871e-07, "logits/chosen": -0.1455078125, "logits/rejected": 0.1845703125, "logps/chosen": -444.0, "logps/rejected": -520.0, "loss": 0.0845, "rewards/accuracies": 0.96875, "rewards/chosen": -16.75, "rewards/margins": 9.0625, "rewards/rejected": -25.875, "step": 12240 }, { "epoch": 0.8842849924204144, "grad_norm": 8.136114928132173, "learning_rate": 7.228063223242011e-07, "logits/chosen": -0.2470703125, "logits/rejected": 0.15234375, "logps/chosen": -452.0, "logps/rejected": -520.0, "loss": 0.0918, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -16.75, "rewards/margins": 8.75, "rewards/rejected": -25.5, "step": 12250 }, { "epoch": 0.8850068577203494, "grad_norm": 12.227669689975192, "learning_rate": 7.225114798390295e-07, "logits/chosen": -0.24609375, "logits/rejected": 0.33203125, "logps/chosen": -442.0, "logps/rejected": -508.0, "loss": 0.1088, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -18.0, "rewards/margins": 8.3125, "rewards/rejected": -26.375, "step": 12260 }, { "epoch": 0.8857287230202844, "grad_norm": 10.383407261430895, "learning_rate": 7.222169978704737e-07, "logits/chosen": -0.2314453125, "logits/rejected": 0.30078125, "logps/chosen": -458.0, "logps/rejected": -496.0, "loss": 0.103, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -17.25, "rewards/margins": 7.9375, "rewards/rejected": -25.25, "step": 12270 }, { "epoch": 0.8864505883202195, "grad_norm": 9.551851081457954, "learning_rate": 7.219228756844335e-07, "logits/chosen": -0.16015625, "logits/rejected": 0.3359375, "logps/chosen": -468.0, "logps/rejected": -512.0, "loss": 0.0759, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -18.625, "rewards/margins": 8.375, "rewards/rejected": -27.0, "step": 12280 }, { "epoch": 0.8871724536201545, "grad_norm": 9.107113581565198, "learning_rate": 7.216291125488994e-07, "logits/chosen": -0.310546875, "logits/rejected": 0.25, "logps/chosen": -452.0, "logps/rejected": -512.0, "loss": 0.0903, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -17.75, "rewards/margins": 9.0, "rewards/rejected": -26.75, "step": 12290 }, { "epoch": 0.8878943189200895, "grad_norm": 6.434541577247974, "learning_rate": 7.213357077339458e-07, "logits/chosen": -0.271484375, "logits/rejected": 0.212890625, "logps/chosen": -480.0, "logps/rejected": -502.0, "loss": 0.0759, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -17.0, "rewards/margins": 8.625, "rewards/rejected": -25.625, "step": 12300 }, { "epoch": 0.8886161842200245, "grad_norm": 10.57570746207905, "learning_rate": 7.210426605117224e-07, "logits/chosen": -0.32421875, "logits/rejected": 0.126953125, "logps/chosen": -446.0, "logps/rejected": -506.0, "loss": 0.0975, "rewards/accuracies": 0.96875, "rewards/chosen": -17.125, "rewards/margins": 8.25, "rewards/rejected": -25.375, "step": 12310 }, { "epoch": 0.8893380495199595, "grad_norm": 12.332393041118431, "learning_rate": 7.207499701564471e-07, "logits/chosen": -0.279296875, "logits/rejected": 0.1455078125, "logps/chosen": -468.0, "logps/rejected": -496.0, "loss": 0.0982, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -17.25, "rewards/margins": 7.9375, "rewards/rejected": -25.25, "step": 12320 }, { "epoch": 0.8900599148198947, "grad_norm": 9.966467591581166, "learning_rate": 7.204576359443989e-07, "logits/chosen": -0.486328125, "logits/rejected": 0.0615234375, "logps/chosen": -444.0, "logps/rejected": -488.0, "loss": 0.0904, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -16.875, "rewards/margins": 8.4375, "rewards/rejected": -25.375, "step": 12330 }, { "epoch": 0.8907817801198297, "grad_norm": 10.80238279868622, "learning_rate": 7.201656571539094e-07, "logits/chosen": -0.1513671875, "logits/rejected": 0.2392578125, "logps/chosen": -456.0, "logps/rejected": -524.0, "loss": 0.0887, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -17.625, "rewards/margins": 8.875, "rewards/rejected": -26.5, "step": 12340 }, { "epoch": 0.8915036454197647, "grad_norm": 11.511965511252603, "learning_rate": 7.19874033065356e-07, "logits/chosen": -0.3515625, "logits/rejected": 0.25390625, "logps/chosen": -476.0, "logps/rejected": -528.0, "loss": 0.0895, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -18.125, "rewards/margins": 8.9375, "rewards/rejected": -27.125, "step": 12350 }, { "epoch": 0.8922255107196997, "grad_norm": 7.740910653897622, "learning_rate": 7.195827629611545e-07, "logits/chosen": -0.232421875, "logits/rejected": 0.236328125, "logps/chosen": -466.0, "logps/rejected": -540.0, "loss": 0.0743, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -19.625, "rewards/margins": 8.375, "rewards/rejected": -28.0, "step": 12360 }, { "epoch": 0.8929473760196347, "grad_norm": 11.537914387466923, "learning_rate": 7.19291846125751e-07, "logits/chosen": -0.16796875, "logits/rejected": 0.2578125, "logps/chosen": -476.0, "logps/rejected": -540.0, "loss": 0.0693, "rewards/accuracies": 0.96875, "rewards/chosen": -19.875, "rewards/margins": 9.5, "rewards/rejected": -29.5, "step": 12370 }, { "epoch": 0.8936692413195698, "grad_norm": 12.509225588993463, "learning_rate": 7.190012818456154e-07, "logits/chosen": -0.283203125, "logits/rejected": 0.1953125, "logps/chosen": -482.0, "logps/rejected": -516.0, "loss": 0.0739, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -18.75, "rewards/margins": 8.5625, "rewards/rejected": -27.375, "step": 12380 }, { "epoch": 0.8943911066195048, "grad_norm": 8.852583787329202, "learning_rate": 7.187110694092334e-07, "logits/chosen": -0.15625, "logits/rejected": 0.2578125, "logps/chosen": -468.0, "logps/rejected": -516.0, "loss": 0.0706, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -19.375, "rewards/margins": 9.1875, "rewards/rejected": -28.5, "step": 12390 }, { "epoch": 0.8951129719194398, "grad_norm": 11.005678039239788, "learning_rate": 7.184212081070996e-07, "logits/chosen": -0.171875, "logits/rejected": 0.28515625, "logps/chosen": -484.0, "logps/rejected": -548.0, "loss": 0.0907, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -20.125, "rewards/margins": 9.125, "rewards/rejected": -29.25, "step": 12400 }, { "epoch": 0.8958348372193748, "grad_norm": 8.383197510336206, "learning_rate": 7.181316972317097e-07, "logits/chosen": -0.283203125, "logits/rejected": 0.232421875, "logps/chosen": -482.0, "logps/rejected": -516.0, "loss": 0.076, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -19.25, "rewards/margins": 8.5625, "rewards/rejected": -27.75, "step": 12410 }, { "epoch": 0.8965567025193099, "grad_norm": 12.03796398987186, "learning_rate": 7.17842536077554e-07, "logits/chosen": -0.1904296875, "logits/rejected": 0.189453125, "logps/chosen": -454.0, "logps/rejected": -506.0, "loss": 0.0984, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -17.25, "rewards/margins": 8.0, "rewards/rejected": -25.25, "step": 12420 }, { "epoch": 0.897278567819245, "grad_norm": 10.188957712749245, "learning_rate": 7.175537239411094e-07, "logits/chosen": -0.2373046875, "logits/rejected": 0.23046875, "logps/chosen": -462.0, "logps/rejected": -512.0, "loss": 0.0928, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -18.875, "rewards/margins": 8.0625, "rewards/rejected": -27.0, "step": 12430 }, { "epoch": 0.89800043311918, "grad_norm": 6.045289041293544, "learning_rate": 7.172652601208325e-07, "logits/chosen": -0.177734375, "logits/rejected": 0.279296875, "logps/chosen": -444.0, "logps/rejected": -506.0, "loss": 0.0863, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -16.875, "rewards/margins": 8.3125, "rewards/rejected": -25.125, "step": 12440 }, { "epoch": 0.898722298419115, "grad_norm": 6.5398663384377, "learning_rate": 7.169771439171534e-07, "logits/chosen": -0.1865234375, "logits/rejected": 0.25390625, "logps/chosen": -472.0, "logps/rejected": -520.0, "loss": 0.066, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -16.625, "rewards/margins": 8.6875, "rewards/rejected": -25.25, "step": 12450 }, { "epoch": 0.89944416371905, "grad_norm": 9.565537330486501, "learning_rate": 7.166893746324661e-07, "logits/chosen": -0.341796875, "logits/rejected": 0.162109375, "logps/chosen": -448.0, "logps/rejected": -484.0, "loss": 0.0986, "rewards/accuracies": 0.96875, "rewards/chosen": -16.25, "rewards/margins": 8.375, "rewards/rejected": -24.625, "step": 12460 }, { "epoch": 0.900166029018985, "grad_norm": 7.638384296812464, "learning_rate": 7.164019515711245e-07, "logits/chosen": -0.2333984375, "logits/rejected": 0.224609375, "logps/chosen": -466.0, "logps/rejected": -510.0, "loss": 0.1026, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -17.5, "rewards/margins": 8.0625, "rewards/rejected": -25.625, "step": 12470 }, { "epoch": 0.9008878943189201, "grad_norm": 6.063599484303692, "learning_rate": 7.161148740394328e-07, "logits/chosen": -0.2216796875, "logits/rejected": 0.2734375, "logps/chosen": -448.0, "logps/rejected": -496.0, "loss": 0.091, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -17.375, "rewards/margins": 8.3125, "rewards/rejected": -25.75, "step": 12480 }, { "epoch": 0.9016097596188551, "grad_norm": 12.994507204343982, "learning_rate": 7.158281413456402e-07, "logits/chosen": -0.37890625, "logits/rejected": 0.061279296875, "logps/chosen": -464.0, "logps/rejected": -510.0, "loss": 0.1097, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -17.875, "rewards/margins": 7.875, "rewards/rejected": -25.75, "step": 12490 }, { "epoch": 0.9023316249187902, "grad_norm": 12.282439044511072, "learning_rate": 7.155417527999326e-07, "logits/chosen": -0.1767578125, "logits/rejected": 0.228515625, "logps/chosen": -442.0, "logps/rejected": -490.0, "loss": 0.084, "rewards/accuracies": 0.96875, "rewards/chosen": -16.625, "rewards/margins": 7.65625, "rewards/rejected": -24.25, "step": 12500 }, { "epoch": 0.9030534902187252, "grad_norm": 7.441211087510537, "learning_rate": 7.152557077144268e-07, "logits/chosen": -0.2265625, "logits/rejected": 0.1806640625, "logps/chosen": -436.0, "logps/rejected": -494.0, "loss": 0.0695, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -17.0, "rewards/margins": 7.875, "rewards/rejected": -25.0, "step": 12510 }, { "epoch": 0.9037753555186602, "grad_norm": 24.650791291803984, "learning_rate": 7.149700054031623e-07, "logits/chosen": -0.1376953125, "logits/rejected": 0.302734375, "logps/chosen": -442.0, "logps/rejected": -492.0, "loss": 0.0897, "rewards/accuracies": 0.96875, "rewards/chosen": -16.875, "rewards/margins": 8.3125, "rewards/rejected": -25.25, "step": 12520 }, { "epoch": 0.9044972208185953, "grad_norm": 8.737334309249306, "learning_rate": 7.146846451820958e-07, "logits/chosen": -0.16015625, "logits/rejected": 0.251953125, "logps/chosen": -458.0, "logps/rejected": -508.0, "loss": 0.0714, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -16.625, "rewards/margins": 8.375, "rewards/rejected": -25.0, "step": 12530 }, { "epoch": 0.9052190861185303, "grad_norm": 9.896982847468763, "learning_rate": 7.14399626369093e-07, "logits/chosen": -0.30859375, "logits/rejected": 0.2177734375, "logps/chosen": -462.0, "logps/rejected": -480.0, "loss": 0.0918, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -17.25, "rewards/margins": 7.75, "rewards/rejected": -25.0, "step": 12540 }, { "epoch": 0.9059409514184653, "grad_norm": 11.23353301492825, "learning_rate": 7.141149482839228e-07, "logits/chosen": -0.185546875, "logits/rejected": 0.189453125, "logps/chosen": -446.0, "logps/rejected": -516.0, "loss": 0.0968, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -17.125, "rewards/margins": 9.375, "rewards/rejected": -26.5, "step": 12550 }, { "epoch": 0.9066628167184003, "grad_norm": 7.786389344483044, "learning_rate": 7.138306102482496e-07, "logits/chosen": -0.25390625, "logits/rejected": 0.22265625, "logps/chosen": -460.0, "logps/rejected": -512.0, "loss": 0.0633, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -17.625, "rewards/margins": 8.5, "rewards/rejected": -26.125, "step": 12560 }, { "epoch": 0.9073846820183353, "grad_norm": 11.351204465062752, "learning_rate": 7.135466115856274e-07, "logits/chosen": -0.26953125, "logits/rejected": 0.2490234375, "logps/chosen": -442.0, "logps/rejected": -516.0, "loss": 0.0932, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -15.9375, "rewards/margins": 8.5625, "rewards/rejected": -24.5, "step": 12570 }, { "epoch": 0.9081065473182705, "grad_norm": 14.711984371550884, "learning_rate": 7.13262951621492e-07, "logits/chosen": -0.2314453125, "logits/rejected": 0.267578125, "logps/chosen": -432.0, "logps/rejected": -484.0, "loss": 0.0967, "rewards/accuracies": 0.9375, "rewards/chosen": -16.375, "rewards/margins": 8.5, "rewards/rejected": -24.875, "step": 12580 }, { "epoch": 0.9088284126182055, "grad_norm": 9.477697843619023, "learning_rate": 7.129796296831554e-07, "logits/chosen": -0.2265625, "logits/rejected": 0.1708984375, "logps/chosen": -444.0, "logps/rejected": -486.0, "loss": 0.0789, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -15.0, "rewards/margins": 8.5, "rewards/rejected": -23.5, "step": 12590 }, { "epoch": 0.9095502779181405, "grad_norm": 8.885657264280319, "learning_rate": 7.126966450997984e-07, "logits/chosen": -0.28515625, "logits/rejected": 0.1396484375, "logps/chosen": -476.0, "logps/rejected": -528.0, "loss": 0.0698, "rewards/accuracies": 0.96875, "rewards/chosen": -16.375, "rewards/margins": 8.1875, "rewards/rejected": -24.5, "step": 12600 }, { "epoch": 0.9102721432180755, "grad_norm": 3.063652665131735, "learning_rate": 7.124139972024637e-07, "logits/chosen": -0.294921875, "logits/rejected": 0.267578125, "logps/chosen": -436.0, "logps/rejected": -486.0, "loss": 0.0896, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -16.125, "rewards/margins": 8.75, "rewards/rejected": -24.875, "step": 12610 }, { "epoch": 0.9109940085180105, "grad_norm": 6.285474087806684, "learning_rate": 7.121316853240503e-07, "logits/chosen": -0.43359375, "logits/rejected": 0.126953125, "logps/chosen": -414.0, "logps/rejected": -512.0, "loss": 0.0819, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -15.3125, "rewards/margins": 9.0625, "rewards/rejected": -24.375, "step": 12620 }, { "epoch": 0.9117158738179456, "grad_norm": 12.114791129150808, "learning_rate": 7.118497087993057e-07, "logits/chosen": -0.15234375, "logits/rejected": 0.32421875, "logps/chosen": -448.0, "logps/rejected": -490.0, "loss": 0.0966, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -17.25, "rewards/margins": 9.0, "rewards/rejected": -26.25, "step": 12630 }, { "epoch": 0.9124377391178806, "grad_norm": 13.132064310565022, "learning_rate": 7.1156806696482e-07, "logits/chosen": -0.34375, "logits/rejected": 0.1416015625, "logps/chosen": -410.0, "logps/rejected": -484.0, "loss": 0.0768, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -13.9375, "rewards/margins": 8.75, "rewards/rejected": -22.625, "step": 12640 }, { "epoch": 0.9131596044178156, "grad_norm": 6.477427379897674, "learning_rate": 7.112867591590192e-07, "logits/chosen": -0.3359375, "logits/rejected": 0.228515625, "logps/chosen": -468.0, "logps/rejected": -528.0, "loss": 0.0851, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -17.5, "rewards/margins": 9.25, "rewards/rejected": -26.75, "step": 12650 }, { "epoch": 0.9138814697177506, "grad_norm": 9.954478988710086, "learning_rate": 7.110057847221588e-07, "logits/chosen": -0.37890625, "logits/rejected": 0.1630859375, "logps/chosen": -468.0, "logps/rejected": -536.0, "loss": 0.0845, "rewards/accuracies": 0.96875, "rewards/chosen": -18.25, "rewards/margins": 9.0625, "rewards/rejected": -27.25, "step": 12660 }, { "epoch": 0.9146033350176856, "grad_norm": 8.253764129186237, "learning_rate": 7.107251429963166e-07, "logits/chosen": -0.279296875, "logits/rejected": 0.162109375, "logps/chosen": -458.0, "logps/rejected": -516.0, "loss": 0.0819, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -16.875, "rewards/margins": 8.75, "rewards/rejected": -25.625, "step": 12670 }, { "epoch": 0.9153252003176208, "grad_norm": 8.793281980883004, "learning_rate": 7.104448333253878e-07, "logits/chosen": -0.330078125, "logits/rejected": 0.1318359375, "logps/chosen": -438.0, "logps/rejected": -484.0, "loss": 0.0816, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -15.4375, "rewards/margins": 8.3125, "rewards/rejected": -23.75, "step": 12680 }, { "epoch": 0.9160470656175558, "grad_norm": 8.943578007464493, "learning_rate": 7.101648550550766e-07, "logits/chosen": -0.22265625, "logits/rejected": 0.10595703125, "logps/chosen": -464.0, "logps/rejected": -524.0, "loss": 0.0783, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -18.5, "rewards/margins": 8.6875, "rewards/rejected": -27.125, "step": 12690 }, { "epoch": 0.9167689309174908, "grad_norm": 7.963946529586731, "learning_rate": 7.098852075328911e-07, "logits/chosen": -0.2021484375, "logits/rejected": 0.1982421875, "logps/chosen": -444.0, "logps/rejected": -496.0, "loss": 0.0729, "rewards/accuracies": 0.96875, "rewards/chosen": -16.25, "rewards/margins": 8.9375, "rewards/rejected": -25.125, "step": 12700 }, { "epoch": 0.9174907962174258, "grad_norm": 11.089402779013176, "learning_rate": 7.096058901081364e-07, "logits/chosen": -0.275390625, "logits/rejected": 0.125, "logps/chosen": -454.0, "logps/rejected": -508.0, "loss": 0.0966, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -17.125, "rewards/margins": 8.75, "rewards/rejected": -25.875, "step": 12710 }, { "epoch": 0.9182126615173608, "grad_norm": 4.986450701097274, "learning_rate": 7.093269021319087e-07, "logits/chosen": -0.2890625, "logits/rejected": 0.2431640625, "logps/chosen": -444.0, "logps/rejected": -488.0, "loss": 0.0885, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -17.0, "rewards/margins": 8.625, "rewards/rejected": -25.625, "step": 12720 }, { "epoch": 0.9189345268172959, "grad_norm": 12.621071320154511, "learning_rate": 7.090482429570884e-07, "logits/chosen": -0.173828125, "logits/rejected": 0.2734375, "logps/chosen": -436.0, "logps/rejected": -504.0, "loss": 0.0694, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -16.125, "rewards/margins": 9.1875, "rewards/rejected": -25.375, "step": 12730 }, { "epoch": 0.9196563921172309, "grad_norm": 9.034349407291788, "learning_rate": 7.087699119383339e-07, "logits/chosen": -0.275390625, "logits/rejected": 0.2314453125, "logps/chosen": -436.0, "logps/rejected": -488.0, "loss": 0.0863, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -16.625, "rewards/margins": 8.0625, "rewards/rejected": -24.625, "step": 12740 }, { "epoch": 0.920378257417166, "grad_norm": 10.631554200670887, "learning_rate": 7.084919084320762e-07, "logits/chosen": -0.2216796875, "logits/rejected": 0.22265625, "logps/chosen": -434.0, "logps/rejected": -480.0, "loss": 0.091, "rewards/accuracies": 0.96875, "rewards/chosen": -15.5625, "rewards/margins": 8.3125, "rewards/rejected": -23.875, "step": 12750 }, { "epoch": 0.921100122717101, "grad_norm": 9.34233808239901, "learning_rate": 7.08214231796511e-07, "logits/chosen": -0.255859375, "logits/rejected": 0.12060546875, "logps/chosen": -442.0, "logps/rejected": -502.0, "loss": 0.085, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -17.25, "rewards/margins": 8.0, "rewards/rejected": -25.25, "step": 12760 }, { "epoch": 0.9218219880170361, "grad_norm": 10.134718060643891, "learning_rate": 7.079368813915939e-07, "logits/chosen": -0.228515625, "logits/rejected": 0.2177734375, "logps/chosen": -424.0, "logps/rejected": -504.0, "loss": 0.0915, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -15.75, "rewards/margins": 8.6875, "rewards/rejected": -24.375, "step": 12770 }, { "epoch": 0.9225438533169711, "grad_norm": 7.933011551850086, "learning_rate": 7.076598565790337e-07, "logits/chosen": -0.19140625, "logits/rejected": 0.279296875, "logps/chosen": -452.0, "logps/rejected": -494.0, "loss": 0.0879, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -16.75, "rewards/margins": 8.75, "rewards/rejected": -25.5, "step": 12780 }, { "epoch": 0.9232657186169061, "grad_norm": 9.564758094824342, "learning_rate": 7.073831567222859e-07, "logits/chosen": -0.26953125, "logits/rejected": 0.1591796875, "logps/chosen": -454.0, "logps/rejected": -512.0, "loss": 0.1105, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -17.375, "rewards/margins": 8.25, "rewards/rejected": -25.625, "step": 12790 }, { "epoch": 0.9239875839168411, "grad_norm": 5.645285067071935, "learning_rate": 7.071067811865475e-07, "logits/chosen": -0.185546875, "logits/rejected": 0.240234375, "logps/chosen": -436.0, "logps/rejected": -500.0, "loss": 0.0824, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -17.0, "rewards/margins": 8.625, "rewards/rejected": -25.625, "step": 12800 }, { "epoch": 0.9247094492167761, "grad_norm": 5.5843719806148675, "learning_rate": 7.068307293387497e-07, "logits/chosen": -0.328125, "logits/rejected": 0.150390625, "logps/chosen": -472.0, "logps/rejected": -520.0, "loss": 0.0737, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -18.25, "rewards/margins": 8.625, "rewards/rejected": -26.875, "step": 12810 }, { "epoch": 0.9254313145167112, "grad_norm": 9.714699395517002, "learning_rate": 7.065550005475526e-07, "logits/chosen": -0.2197265625, "logits/rejected": 0.255859375, "logps/chosen": -466.0, "logps/rejected": -536.0, "loss": 0.0737, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -17.375, "rewards/margins": 10.375, "rewards/rejected": -27.75, "step": 12820 }, { "epoch": 0.9261531798166462, "grad_norm": 5.9970736739298305, "learning_rate": 7.062795941833388e-07, "logits/chosen": -0.228515625, "logits/rejected": 0.2470703125, "logps/chosen": -472.0, "logps/rejected": -532.0, "loss": 0.0875, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -19.875, "rewards/margins": 8.4375, "rewards/rejected": -28.375, "step": 12830 }, { "epoch": 0.9268750451165813, "grad_norm": 9.300861888837126, "learning_rate": 7.060045096182077e-07, "logits/chosen": -0.34765625, "logits/rejected": 0.224609375, "logps/chosen": -456.0, "logps/rejected": -516.0, "loss": 0.0633, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -17.5, "rewards/margins": 10.0, "rewards/rejected": -27.5, "step": 12840 }, { "epoch": 0.9275969104165163, "grad_norm": 13.383952140972747, "learning_rate": 7.057297462259693e-07, "logits/chosen": -0.2314453125, "logits/rejected": 0.298828125, "logps/chosen": -466.0, "logps/rejected": -496.0, "loss": 0.1053, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -17.75, "rewards/margins": 8.6875, "rewards/rejected": -26.375, "step": 12850 }, { "epoch": 0.9283187757164513, "grad_norm": 13.145302983573048, "learning_rate": 7.05455303382138e-07, "logits/chosen": -0.189453125, "logits/rejected": 0.322265625, "logps/chosen": -476.0, "logps/rejected": -536.0, "loss": 0.092, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -17.5, "rewards/margins": 9.0625, "rewards/rejected": -26.625, "step": 12860 }, { "epoch": 0.9290406410163864, "grad_norm": 7.4484640768601444, "learning_rate": 7.051811804639268e-07, "logits/chosen": -0.142578125, "logits/rejected": 0.26171875, "logps/chosen": -434.0, "logps/rejected": -468.0, "loss": 0.0711, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -16.125, "rewards/margins": 8.0, "rewards/rejected": -24.125, "step": 12870 }, { "epoch": 0.9297625063163214, "grad_norm": 8.044935087713478, "learning_rate": 7.049073768502414e-07, "logits/chosen": -0.162109375, "logits/rejected": 0.2890625, "logps/chosen": -452.0, "logps/rejected": -498.0, "loss": 0.0709, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -17.125, "rewards/margins": 8.125, "rewards/rejected": -25.25, "step": 12880 }, { "epoch": 0.9304843716162564, "grad_norm": 5.491878490698801, "learning_rate": 7.046338919216742e-07, "logits/chosen": -0.23828125, "logits/rejected": 0.2578125, "logps/chosen": -474.0, "logps/rejected": -498.0, "loss": 0.0893, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -17.625, "rewards/margins": 9.0625, "rewards/rejected": -26.625, "step": 12890 }, { "epoch": 0.9312062369161914, "grad_norm": 11.5993480856352, "learning_rate": 7.04360725060499e-07, "logits/chosen": -0.1845703125, "logits/rejected": 0.208984375, "logps/chosen": -494.0, "logps/rejected": -536.0, "loss": 0.0891, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -20.125, "rewards/margins": 8.625, "rewards/rejected": -28.875, "step": 12900 }, { "epoch": 0.9319281022161264, "grad_norm": 10.182879626241082, "learning_rate": 7.040878756506639e-07, "logits/chosen": -0.158203125, "logits/rejected": 0.2255859375, "logps/chosen": -466.0, "logps/rejected": -524.0, "loss": 0.0964, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -19.875, "rewards/margins": 8.5, "rewards/rejected": -28.375, "step": 12910 }, { "epoch": 0.9326499675160616, "grad_norm": 4.829031993303522, "learning_rate": 7.038153430777867e-07, "logits/chosen": -0.408203125, "logits/rejected": 0.08984375, "logps/chosen": -480.0, "logps/rejected": -552.0, "loss": 0.0571, "rewards/accuracies": 0.96875, "rewards/chosen": -18.0, "rewards/margins": 8.8125, "rewards/rejected": -26.75, "step": 12920 }, { "epoch": 0.9333718328159966, "grad_norm": 11.277875568398258, "learning_rate": 7.035431267291484e-07, "logits/chosen": -0.330078125, "logits/rejected": 0.255859375, "logps/chosen": -446.0, "logps/rejected": -502.0, "loss": 0.1042, "rewards/accuracies": 0.96875, "rewards/chosen": -17.875, "rewards/margins": 9.5, "rewards/rejected": -27.375, "step": 12930 }, { "epoch": 0.9340936981159316, "grad_norm": 7.583365328804073, "learning_rate": 7.032712259936877e-07, "logits/chosen": -0.38671875, "logits/rejected": 0.1171875, "logps/chosen": -460.0, "logps/rejected": -506.0, "loss": 0.0794, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -17.125, "rewards/margins": 8.9375, "rewards/rejected": -26.0, "step": 12940 }, { "epoch": 0.9348155634158666, "grad_norm": 4.869138578617633, "learning_rate": 7.029996402619949e-07, "logits/chosen": -0.470703125, "logits/rejected": 0.11376953125, "logps/chosen": -428.0, "logps/rejected": -478.0, "loss": 0.0794, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -15.5, "rewards/margins": 8.8125, "rewards/rejected": -24.25, "step": 12950 }, { "epoch": 0.9355374287158016, "grad_norm": 8.304919751996067, "learning_rate": 7.027283689263065e-07, "logits/chosen": -0.287109375, "logits/rejected": 0.1357421875, "logps/chosen": -438.0, "logps/rejected": -492.0, "loss": 0.0847, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -16.5, "rewards/margins": 8.3125, "rewards/rejected": -24.75, "step": 12960 }, { "epoch": 0.9362592940157367, "grad_norm": 6.46271538389965, "learning_rate": 7.024574113804996e-07, "logits/chosen": -0.1083984375, "logits/rejected": 0.1611328125, "logps/chosen": -440.0, "logps/rejected": -504.0, "loss": 0.0684, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -17.125, "rewards/margins": 8.75, "rewards/rejected": -25.875, "step": 12970 }, { "epoch": 0.9369811593156717, "grad_norm": 9.391966179199818, "learning_rate": 7.021867670200857e-07, "logits/chosen": -0.1904296875, "logits/rejected": 0.328125, "logps/chosen": -448.0, "logps/rejected": -510.0, "loss": 0.0783, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -17.875, "rewards/margins": 8.875, "rewards/rejected": -26.75, "step": 12980 }, { "epoch": 0.9377030246156067, "grad_norm": 17.138694840967208, "learning_rate": 7.019164352422057e-07, "logits/chosen": -0.1875, "logits/rejected": 0.3125, "logps/chosen": -474.0, "logps/rejected": -504.0, "loss": 0.0772, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -19.375, "rewards/margins": 8.375, "rewards/rejected": -27.75, "step": 12990 }, { "epoch": 0.9384248899155417, "grad_norm": 10.306296063444178, "learning_rate": 7.016464154456234e-07, "logits/chosen": -0.23828125, "logits/rejected": 0.30078125, "logps/chosen": -448.0, "logps/rejected": -498.0, "loss": 0.0719, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -17.75, "rewards/margins": 8.9375, "rewards/rejected": -26.625, "step": 13000 }, { "epoch": 0.9391467552154767, "grad_norm": 10.787763783272537, "learning_rate": 7.013767070307207e-07, "logits/chosen": -0.330078125, "logits/rejected": 0.2099609375, "logps/chosen": -458.0, "logps/rejected": -520.0, "loss": 0.0852, "rewards/accuracies": 0.96875, "rewards/chosen": -17.875, "rewards/margins": 8.625, "rewards/rejected": -26.5, "step": 13010 }, { "epoch": 0.9398686205154119, "grad_norm": 9.735782677247954, "learning_rate": 7.011073093994919e-07, "logits/chosen": -0.30859375, "logits/rejected": 0.2177734375, "logps/chosen": -460.0, "logps/rejected": -506.0, "loss": 0.1024, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -17.5, "rewards/margins": 9.0625, "rewards/rejected": -26.625, "step": 13020 }, { "epoch": 0.9405904858153469, "grad_norm": 8.460311014348951, "learning_rate": 7.008382219555372e-07, "logits/chosen": -0.224609375, "logits/rejected": 0.25, "logps/chosen": -476.0, "logps/rejected": -544.0, "loss": 0.0776, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -20.25, "rewards/margins": 9.0, "rewards/rejected": -29.25, "step": 13030 }, { "epoch": 0.9413123511152819, "grad_norm": 8.426911061394383, "learning_rate": 7.005694441040588e-07, "logits/chosen": -0.3046875, "logits/rejected": 0.1650390625, "logps/chosen": -484.0, "logps/rejected": -556.0, "loss": 0.0658, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -19.75, "rewards/margins": 9.375, "rewards/rejected": -29.25, "step": 13040 }, { "epoch": 0.9420342164152169, "grad_norm": 4.848021600983768, "learning_rate": 7.003009752518536e-07, "logits/chosen": -0.279296875, "logits/rejected": 0.361328125, "logps/chosen": -496.0, "logps/rejected": -536.0, "loss": 0.065, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -20.375, "rewards/margins": 9.6875, "rewards/rejected": -30.0, "step": 13050 }, { "epoch": 0.9427560817151519, "grad_norm": 7.453897783000677, "learning_rate": 7.000328148073091e-07, "logits/chosen": -0.19140625, "logits/rejected": 0.29296875, "logps/chosen": -474.0, "logps/rejected": -528.0, "loss": 0.084, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -18.625, "rewards/margins": 8.6875, "rewards/rejected": -27.25, "step": 13060 }, { "epoch": 0.943477947015087, "grad_norm": 5.49741861092138, "learning_rate": 6.997649621803973e-07, "logits/chosen": -0.30078125, "logits/rejected": 0.059814453125, "logps/chosen": -484.0, "logps/rejected": -524.0, "loss": 0.0787, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -18.5, "rewards/margins": 8.5625, "rewards/rejected": -27.125, "step": 13070 }, { "epoch": 0.944199812315022, "grad_norm": 4.9470123347249, "learning_rate": 6.994974167826689e-07, "logits/chosen": -0.25, "logits/rejected": 0.16015625, "logps/chosen": -438.0, "logps/rejected": -494.0, "loss": 0.06, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -16.25, "rewards/margins": 8.5625, "rewards/rejected": -24.875, "step": 13080 }, { "epoch": 0.944921677614957, "grad_norm": 6.747212625998914, "learning_rate": 6.99230178027249e-07, "logits/chosen": -0.13671875, "logits/rejected": 0.326171875, "logps/chosen": -452.0, "logps/rejected": -504.0, "loss": 0.073, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -17.625, "rewards/margins": 8.875, "rewards/rejected": -26.5, "step": 13090 }, { "epoch": 0.945643542914892, "grad_norm": 7.7104599959608455, "learning_rate": 6.989632453288303e-07, "logits/chosen": -0.267578125, "logits/rejected": 0.177734375, "logps/chosen": -470.0, "logps/rejected": -532.0, "loss": 0.0937, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -17.375, "rewards/margins": 8.8125, "rewards/rejected": -26.125, "step": 13100 }, { "epoch": 0.9463654082148271, "grad_norm": 18.369910589502318, "learning_rate": 6.98696618103669e-07, "logits/chosen": -0.3203125, "logits/rejected": 0.1767578125, "logps/chosen": -428.0, "logps/rejected": -474.0, "loss": 0.0987, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -15.6875, "rewards/margins": 8.125, "rewards/rejected": -23.875, "step": 13110 }, { "epoch": 0.9470872735147622, "grad_norm": 10.62917652183618, "learning_rate": 6.984302957695782e-07, "logits/chosen": -0.32421875, "logits/rejected": 0.12451171875, "logps/chosen": -458.0, "logps/rejected": -502.0, "loss": 0.0991, "rewards/accuracies": 0.96875, "rewards/chosen": -15.875, "rewards/margins": 9.1875, "rewards/rejected": -25.0, "step": 13120 }, { "epoch": 0.9478091388146972, "grad_norm": 14.067249023206639, "learning_rate": 6.981642777459237e-07, "logits/chosen": -0.322265625, "logits/rejected": 0.1650390625, "logps/chosen": -438.0, "logps/rejected": -486.0, "loss": 0.0737, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -15.625, "rewards/margins": 9.25, "rewards/rejected": -24.875, "step": 13130 }, { "epoch": 0.9485310041146322, "grad_norm": 10.49633482497925, "learning_rate": 6.978985634536182e-07, "logits/chosen": -0.400390625, "logits/rejected": 0.060546875, "logps/chosen": -430.0, "logps/rejected": -488.0, "loss": 0.0703, "rewards/accuracies": 0.96875, "rewards/chosen": -15.125, "rewards/margins": 8.625, "rewards/rejected": -23.75, "step": 13140 }, { "epoch": 0.9492528694145672, "grad_norm": 4.474855530907365, "learning_rate": 6.976331523151157e-07, "logits/chosen": -0.2890625, "logits/rejected": 0.08447265625, "logps/chosen": -448.0, "logps/rejected": -486.0, "loss": 0.0749, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -16.0, "rewards/margins": 9.1875, "rewards/rejected": -25.25, "step": 13150 }, { "epoch": 0.9499747347145022, "grad_norm": 8.189919031369769, "learning_rate": 6.973680437544066e-07, "logits/chosen": -0.50390625, "logits/rejected": 0.1064453125, "logps/chosen": -438.0, "logps/rejected": -490.0, "loss": 0.0882, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -15.9375, "rewards/margins": 9.375, "rewards/rejected": -25.25, "step": 13160 }, { "epoch": 0.9506966000144373, "grad_norm": 5.165790538217877, "learning_rate": 6.971032371970126e-07, "logits/chosen": -0.47265625, "logits/rejected": 0.146484375, "logps/chosen": -446.0, "logps/rejected": -480.0, "loss": 0.0723, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -15.9375, "rewards/margins": 9.5, "rewards/rejected": -25.5, "step": 13170 }, { "epoch": 0.9514184653143724, "grad_norm": 8.950854365660128, "learning_rate": 6.968387320699806e-07, "logits/chosen": -0.494140625, "logits/rejected": 0.0859375, "logps/chosen": -420.0, "logps/rejected": -474.0, "loss": 0.0751, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -14.4375, "rewards/margins": 9.375, "rewards/rejected": -23.875, "step": 13180 }, { "epoch": 0.9521403306143074, "grad_norm": 9.019632062829743, "learning_rate": 6.965745278018791e-07, "logits/chosen": -0.470703125, "logits/rejected": 0.048828125, "logps/chosen": -412.0, "logps/rejected": -468.0, "loss": 0.0698, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -14.5625, "rewards/margins": 9.25, "rewards/rejected": -23.75, "step": 13190 }, { "epoch": 0.9528621959142424, "grad_norm": 9.406744758502915, "learning_rate": 6.963106238227914e-07, "logits/chosen": -0.46875, "logits/rejected": -0.1015625, "logps/chosen": -440.0, "logps/rejected": -520.0, "loss": 0.0803, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -15.3125, "rewards/margins": 11.25, "rewards/rejected": -26.5, "step": 13200 }, { "epoch": 0.9535840612141774, "grad_norm": 16.840127451624806, "learning_rate": 6.96047019564311e-07, "logits/chosen": -0.34765625, "logits/rejected": 0.173828125, "logps/chosen": -446.0, "logps/rejected": -504.0, "loss": 0.1008, "rewards/accuracies": 0.9375, "rewards/chosen": -16.75, "rewards/margins": 8.9375, "rewards/rejected": -25.75, "step": 13210 }, { "epoch": 0.9543059265141125, "grad_norm": 9.691460233300555, "learning_rate": 6.957837144595368e-07, "logits/chosen": -0.40234375, "logits/rejected": 0.111328125, "logps/chosen": -456.0, "logps/rejected": -520.0, "loss": 0.079, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -16.0, "rewards/margins": 10.5625, "rewards/rejected": -26.625, "step": 13220 }, { "epoch": 0.9550277918140475, "grad_norm": 4.911007570655939, "learning_rate": 6.955207079430681e-07, "logits/chosen": -0.37890625, "logits/rejected": 0.2431640625, "logps/chosen": -428.0, "logps/rejected": -482.0, "loss": 0.0885, "rewards/accuracies": 0.96875, "rewards/chosen": -15.8125, "rewards/margins": 8.8125, "rewards/rejected": -24.625, "step": 13230 }, { "epoch": 0.9557496571139825, "grad_norm": 11.331879546670448, "learning_rate": 6.952579994509982e-07, "logits/chosen": -0.357421875, "logits/rejected": 0.2041015625, "logps/chosen": -468.0, "logps/rejected": -500.0, "loss": 0.0938, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -16.125, "rewards/margins": 8.875, "rewards/rejected": -25.0, "step": 13240 }, { "epoch": 0.9564715224139175, "grad_norm": 5.79940971097897, "learning_rate": 6.94995588420911e-07, "logits/chosen": -0.400390625, "logits/rejected": 0.0478515625, "logps/chosen": -456.0, "logps/rejected": -502.0, "loss": 0.0658, "rewards/accuracies": 0.96875, "rewards/chosen": -15.75, "rewards/margins": 9.625, "rewards/rejected": -25.375, "step": 13250 }, { "epoch": 0.9571933877138527, "grad_norm": 24.075974541728804, "learning_rate": 6.947334742918749e-07, "logits/chosen": -0.33984375, "logits/rejected": 0.16015625, "logps/chosen": -466.0, "logps/rejected": -516.0, "loss": 0.0815, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -18.375, "rewards/margins": 9.1875, "rewards/rejected": -27.625, "step": 13260 }, { "epoch": 0.9579152530137877, "grad_norm": 8.538014986741407, "learning_rate": 6.94471656504438e-07, "logits/chosen": -0.419921875, "logits/rejected": 0.12353515625, "logps/chosen": -446.0, "logps/rejected": -512.0, "loss": 0.0808, "rewards/accuracies": 0.96875, "rewards/chosen": -16.625, "rewards/margins": 8.875, "rewards/rejected": -25.5, "step": 13270 }, { "epoch": 0.9586371183137227, "grad_norm": 9.24005643981387, "learning_rate": 6.942101345006232e-07, "logits/chosen": -0.3359375, "logits/rejected": 0.2373046875, "logps/chosen": -440.0, "logps/rejected": -486.0, "loss": 0.0608, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -16.25, "rewards/margins": 9.5, "rewards/rejected": -25.625, "step": 13280 }, { "epoch": 0.9593589836136577, "grad_norm": 5.478189387469531, "learning_rate": 6.939489077239235e-07, "logits/chosen": -0.392578125, "logits/rejected": 0.1875, "logps/chosen": -450.0, "logps/rejected": -502.0, "loss": 0.0956, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -16.75, "rewards/margins": 9.875, "rewards/rejected": -26.75, "step": 13290 }, { "epoch": 0.9600808489135927, "grad_norm": 6.71598511470423, "learning_rate": 6.936879756192959e-07, "logits/chosen": -0.345703125, "logits/rejected": 0.05419921875, "logps/chosen": -434.0, "logps/rejected": -496.0, "loss": 0.0821, "rewards/accuracies": 0.96875, "rewards/chosen": -15.875, "rewards/margins": 9.4375, "rewards/rejected": -25.375, "step": 13300 }, { "epoch": 0.9608027142135278, "grad_norm": 8.711672638617449, "learning_rate": 6.934273376331579e-07, "logits/chosen": -0.24609375, "logits/rejected": 0.12158203125, "logps/chosen": -426.0, "logps/rejected": -502.0, "loss": 0.0954, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -15.25, "rewards/margins": 9.3125, "rewards/rejected": -24.625, "step": 13310 }, { "epoch": 0.9615245795134628, "grad_norm": 10.585850994530396, "learning_rate": 6.931669932133818e-07, "logits/chosen": -0.314453125, "logits/rejected": 0.2080078125, "logps/chosen": -446.0, "logps/rejected": -506.0, "loss": 0.0878, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -16.375, "rewards/margins": 9.5, "rewards/rejected": -25.875, "step": 13320 }, { "epoch": 0.9622464448133978, "grad_norm": 9.345147191176736, "learning_rate": 6.929069418092892e-07, "logits/chosen": -0.36328125, "logits/rejected": 0.06640625, "logps/chosen": -440.0, "logps/rejected": -486.0, "loss": 0.0765, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -16.375, "rewards/margins": 8.8125, "rewards/rejected": -25.25, "step": 13330 }, { "epoch": 0.9629683101133328, "grad_norm": 10.910436584375871, "learning_rate": 6.926471828716478e-07, "logits/chosen": -0.208984375, "logits/rejected": 0.1044921875, "logps/chosen": -450.0, "logps/rejected": -520.0, "loss": 0.0693, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -16.375, "rewards/margins": 9.0, "rewards/rejected": -25.25, "step": 13340 }, { "epoch": 0.9636901754132678, "grad_norm": 5.547706579036159, "learning_rate": 6.923877158526646e-07, "logits/chosen": -0.322265625, "logits/rejected": 0.2265625, "logps/chosen": -438.0, "logps/rejected": -498.0, "loss": 0.0838, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -16.375, "rewards/margins": 8.875, "rewards/rejected": -25.25, "step": 13350 }, { "epoch": 0.964412040713203, "grad_norm": 7.362293953584876, "learning_rate": 6.921285402059827e-07, "logits/chosen": -0.2197265625, "logits/rejected": 0.1845703125, "logps/chosen": -434.0, "logps/rejected": -498.0, "loss": 0.0878, "rewards/accuracies": 0.96875, "rewards/chosen": -16.125, "rewards/margins": 7.96875, "rewards/rejected": -24.125, "step": 13360 }, { "epoch": 0.965133906013138, "grad_norm": 7.796736956442941, "learning_rate": 6.918696553866751e-07, "logits/chosen": -0.361328125, "logits/rejected": 0.076171875, "logps/chosen": -438.0, "logps/rejected": -490.0, "loss": 0.0869, "rewards/accuracies": 0.96875, "rewards/chosen": -16.125, "rewards/margins": 8.9375, "rewards/rejected": -25.0, "step": 13370 }, { "epoch": 0.965855771313073, "grad_norm": 6.367091294109095, "learning_rate": 6.916110608512408e-07, "logits/chosen": -0.34375, "logits/rejected": 0.1826171875, "logps/chosen": -480.0, "logps/rejected": -508.0, "loss": 0.0868, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -17.25, "rewards/margins": 8.3125, "rewards/rejected": -25.5, "step": 13380 }, { "epoch": 0.966577636613008, "grad_norm": 16.343337991155092, "learning_rate": 6.913527560575998e-07, "logits/chosen": -0.36328125, "logits/rejected": 0.1455078125, "logps/chosen": -462.0, "logps/rejected": -492.0, "loss": 0.1111, "rewards/accuracies": 0.96875, "rewards/chosen": -17.5, "rewards/margins": 7.90625, "rewards/rejected": -25.375, "step": 13390 }, { "epoch": 0.967299501912943, "grad_norm": 10.198835638593199, "learning_rate": 6.910947404650881e-07, "logits/chosen": -0.3359375, "logits/rejected": 0.080078125, "logps/chosen": -468.0, "logps/rejected": -500.0, "loss": 0.075, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -17.375, "rewards/margins": 8.3125, "rewards/rejected": -25.75, "step": 13400 }, { "epoch": 0.9680213672128781, "grad_norm": 8.408879722742531, "learning_rate": 6.90837013534453e-07, "logits/chosen": -0.2001953125, "logits/rejected": 0.205078125, "logps/chosen": -450.0, "logps/rejected": -512.0, "loss": 0.0741, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -18.375, "rewards/margins": 8.1875, "rewards/rejected": -26.625, "step": 13410 }, { "epoch": 0.9687432325128131, "grad_norm": 8.056868758878213, "learning_rate": 6.905795747278488e-07, "logits/chosen": -0.4140625, "logits/rejected": 0.154296875, "logps/chosen": -462.0, "logps/rejected": -504.0, "loss": 0.094, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -16.75, "rewards/margins": 9.125, "rewards/rejected": -25.875, "step": 13420 }, { "epoch": 0.9694650978127481, "grad_norm": 4.001663001750351, "learning_rate": 6.903224235088314e-07, "logits/chosen": -0.359375, "logits/rejected": -0.0830078125, "logps/chosen": -428.0, "logps/rejected": -476.0, "loss": 0.0902, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -15.4375, "rewards/margins": 7.9375, "rewards/rejected": -23.375, "step": 13430 }, { "epoch": 0.9701869631126832, "grad_norm": 9.707382959252408, "learning_rate": 6.900655593423541e-07, "logits/chosen": -0.419921875, "logits/rejected": -0.12255859375, "logps/chosen": -422.0, "logps/rejected": -454.0, "loss": 0.1193, "rewards/accuracies": 0.96875, "rewards/chosen": -12.9375, "rewards/margins": 8.0, "rewards/rejected": -21.0, "step": 13440 }, { "epoch": 0.9709088284126182, "grad_norm": 4.611806119868308, "learning_rate": 6.89808981694763e-07, "logits/chosen": -0.40625, "logits/rejected": 0.12451171875, "logps/chosen": -424.0, "logps/rejected": -508.0, "loss": 0.0876, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -13.6875, "rewards/margins": 8.8125, "rewards/rejected": -22.5, "step": 13450 }, { "epoch": 0.9716306937125533, "grad_norm": 9.02190096973129, "learning_rate": 6.895526900337915e-07, "logits/chosen": -0.36328125, "logits/rejected": 0.07177734375, "logps/chosen": -402.0, "logps/rejected": -434.0, "loss": 0.1113, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -13.75, "rewards/margins": 8.0, "rewards/rejected": -21.75, "step": 13460 }, { "epoch": 0.9723525590124883, "grad_norm": 7.9049107113054875, "learning_rate": 6.892966838285567e-07, "logits/chosen": -0.349609375, "logits/rejected": 0.1650390625, "logps/chosen": -426.0, "logps/rejected": -484.0, "loss": 0.0842, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -15.625, "rewards/margins": 8.8125, "rewards/rejected": -24.375, "step": 13470 }, { "epoch": 0.9730744243124233, "grad_norm": 6.787380287893767, "learning_rate": 6.890409625495545e-07, "logits/chosen": -0.1298828125, "logits/rejected": 0.2578125, "logps/chosen": -428.0, "logps/rejected": -488.0, "loss": 0.0696, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -14.6875, "rewards/margins": 8.8125, "rewards/rejected": -23.5, "step": 13480 }, { "epoch": 0.9737962896123583, "grad_norm": 7.117717105931689, "learning_rate": 6.887855256686546e-07, "logits/chosen": -0.193359375, "logits/rejected": 0.1650390625, "logps/chosen": -446.0, "logps/rejected": -528.0, "loss": 0.0922, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -17.375, "rewards/margins": 8.4375, "rewards/rejected": -25.875, "step": 13490 }, { "epoch": 0.9745181549122933, "grad_norm": 3.772338953782398, "learning_rate": 6.885303726590963e-07, "logits/chosen": -0.43359375, "logits/rejected": 0.10205078125, "logps/chosen": -444.0, "logps/rejected": -490.0, "loss": 0.0612, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -16.625, "rewards/margins": 9.4375, "rewards/rejected": -26.0, "step": 13500 }, { "epoch": 0.9752400202122284, "grad_norm": 11.189506969691044, "learning_rate": 6.882755029954837e-07, "logits/chosen": -0.365234375, "logits/rejected": 0.1259765625, "logps/chosen": -456.0, "logps/rejected": -506.0, "loss": 0.0863, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -15.75, "rewards/margins": 9.5625, "rewards/rejected": -25.375, "step": 13510 }, { "epoch": 0.9759618855121635, "grad_norm": 10.310528289571751, "learning_rate": 6.880209161537815e-07, "logits/chosen": -0.3203125, "logits/rejected": 0.07861328125, "logps/chosen": -442.0, "logps/rejected": -500.0, "loss": 0.0882, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -17.0, "rewards/margins": 9.0, "rewards/rejected": -26.0, "step": 13520 }, { "epoch": 0.9766837508120985, "grad_norm": 3.409936194205413, "learning_rate": 6.877666116113097e-07, "logits/chosen": -0.39453125, "logits/rejected": 0.0869140625, "logps/chosen": -468.0, "logps/rejected": -502.0, "loss": 0.083, "rewards/accuracies": 0.96875, "rewards/chosen": -16.625, "rewards/margins": 8.625, "rewards/rejected": -25.375, "step": 13530 }, { "epoch": 0.9774056161120335, "grad_norm": 11.842488018285799, "learning_rate": 6.875125888467405e-07, "logits/chosen": -0.37890625, "logits/rejected": 0.1884765625, "logps/chosen": -436.0, "logps/rejected": -480.0, "loss": 0.073, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -15.375, "rewards/margins": 9.1875, "rewards/rejected": -24.5, "step": 13540 }, { "epoch": 0.9781274814119685, "grad_norm": 8.301485363385503, "learning_rate": 6.872588473400923e-07, "logits/chosen": -0.470703125, "logits/rejected": 0.05029296875, "logps/chosen": -424.0, "logps/rejected": -476.0, "loss": 0.079, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -15.1875, "rewards/margins": 8.9375, "rewards/rejected": -24.125, "step": 13550 }, { "epoch": 0.9788493467119036, "grad_norm": 9.90473478482429, "learning_rate": 6.870053865727262e-07, "logits/chosen": -0.40234375, "logits/rejected": 0.055908203125, "logps/chosen": -436.0, "logps/rejected": -494.0, "loss": 0.0941, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -15.125, "rewards/margins": 8.5625, "rewards/rejected": -23.75, "step": 13560 }, { "epoch": 0.9795712120118386, "grad_norm": 7.880914978562655, "learning_rate": 6.867522060273408e-07, "logits/chosen": -0.38671875, "logits/rejected": 0.1337890625, "logps/chosen": -428.0, "logps/rejected": -484.0, "loss": 0.0723, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -16.375, "rewards/margins": 8.875, "rewards/rejected": -25.25, "step": 13570 }, { "epoch": 0.9802930773117736, "grad_norm": 7.00662199353278, "learning_rate": 6.864993051879688e-07, "logits/chosen": -0.4765625, "logits/rejected": 0.0712890625, "logps/chosen": -426.0, "logps/rejected": -480.0, "loss": 0.0927, "rewards/accuracies": 0.96875, "rewards/chosen": -15.5625, "rewards/margins": 8.5625, "rewards/rejected": -24.125, "step": 13580 }, { "epoch": 0.9810149426117086, "grad_norm": 6.236751182955256, "learning_rate": 6.862466835399716e-07, "logits/chosen": -0.302734375, "logits/rejected": 0.1318359375, "logps/chosen": -432.0, "logps/rejected": -474.0, "loss": 0.0824, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -16.625, "rewards/margins": 7.9375, "rewards/rejected": -24.5, "step": 13590 }, { "epoch": 0.9817368079116436, "grad_norm": 9.574912357884877, "learning_rate": 6.859943405700353e-07, "logits/chosen": -0.482421875, "logits/rejected": 0.024658203125, "logps/chosen": -458.0, "logps/rejected": -520.0, "loss": 0.0885, "rewards/accuracies": 0.96875, "rewards/chosen": -16.25, "rewards/margins": 8.8125, "rewards/rejected": -25.0, "step": 13600 }, { "epoch": 0.9824586732115788, "grad_norm": 7.241892972207727, "learning_rate": 6.857422757661664e-07, "logits/chosen": -0.416015625, "logits/rejected": 0.08349609375, "logps/chosen": -448.0, "logps/rejected": -484.0, "loss": 0.0716, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -15.375, "rewards/margins": 8.125, "rewards/rejected": -23.5, "step": 13610 }, { "epoch": 0.9831805385115138, "grad_norm": 8.957811611665893, "learning_rate": 6.854904886176873e-07, "logits/chosen": -0.337890625, "logits/rejected": 0.1474609375, "logps/chosen": -456.0, "logps/rejected": -496.0, "loss": 0.0873, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -16.625, "rewards/margins": 8.375, "rewards/rejected": -25.0, "step": 13620 }, { "epoch": 0.9839024038114488, "grad_norm": 7.9917522884300185, "learning_rate": 6.85238978615232e-07, "logits/chosen": -0.451171875, "logits/rejected": -0.1396484375, "logps/chosen": -452.0, "logps/rejected": -496.0, "loss": 0.093, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -16.375, "rewards/margins": 8.5625, "rewards/rejected": -24.875, "step": 13630 }, { "epoch": 0.9846242691113838, "grad_norm": 11.436309974808266, "learning_rate": 6.849877452507417e-07, "logits/chosen": -0.37109375, "logits/rejected": 0.1845703125, "logps/chosen": -444.0, "logps/rejected": -496.0, "loss": 0.0658, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -16.375, "rewards/margins": 8.8125, "rewards/rejected": -25.125, "step": 13640 }, { "epoch": 0.9853461344113188, "grad_norm": 18.226116319214277, "learning_rate": 6.847367880174605e-07, "logits/chosen": -0.48828125, "logits/rejected": -0.00185394287109375, "logps/chosen": -468.0, "logps/rejected": -536.0, "loss": 0.0959, "rewards/accuracies": 0.96875, "rewards/chosen": -17.125, "rewards/margins": 8.9375, "rewards/rejected": -26.125, "step": 13650 }, { "epoch": 0.9860679997112539, "grad_norm": 6.450180419615528, "learning_rate": 6.844861064099317e-07, "logits/chosen": -0.46484375, "logits/rejected": 0.09521484375, "logps/chosen": -448.0, "logps/rejected": -510.0, "loss": 0.0713, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -17.875, "rewards/margins": 9.5, "rewards/rejected": -27.375, "step": 13660 }, { "epoch": 0.9867898650111889, "grad_norm": 8.539311110514603, "learning_rate": 6.842356999239922e-07, "logits/chosen": -0.390625, "logits/rejected": 0.0537109375, "logps/chosen": -442.0, "logps/rejected": -516.0, "loss": 0.0741, "rewards/accuracies": 0.96875, "rewards/chosen": -18.25, "rewards/margins": 8.9375, "rewards/rejected": -27.125, "step": 13670 }, { "epoch": 0.9875117303111239, "grad_norm": 4.471235793221726, "learning_rate": 6.839855680567694e-07, "logits/chosen": -0.408203125, "logits/rejected": 0.16015625, "logps/chosen": -438.0, "logps/rejected": -504.0, "loss": 0.0797, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -17.125, "rewards/margins": 9.4375, "rewards/rejected": -26.5, "step": 13680 }, { "epoch": 0.988233595611059, "grad_norm": 11.471766108173055, "learning_rate": 6.837357103066766e-07, "logits/chosen": -0.373046875, "logits/rejected": 0.2021484375, "logps/chosen": -472.0, "logps/rejected": -510.0, "loss": 0.0691, "rewards/accuracies": 0.96875, "rewards/chosen": -17.875, "rewards/margins": 10.125, "rewards/rejected": -28.0, "step": 13690 }, { "epoch": 0.988955460910994, "grad_norm": 3.2827022769108285, "learning_rate": 6.834861261734087e-07, "logits/chosen": -0.3359375, "logits/rejected": 0.1865234375, "logps/chosen": -472.0, "logps/rejected": -520.0, "loss": 0.0714, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -18.75, "rewards/margins": 9.125, "rewards/rejected": -27.875, "step": 13700 }, { "epoch": 0.9896773262109291, "grad_norm": 9.045481235997434, "learning_rate": 6.832368151579382e-07, "logits/chosen": -0.58203125, "logits/rejected": 0.0859375, "logps/chosen": -476.0, "logps/rejected": -528.0, "loss": 0.0738, "rewards/accuracies": 0.96875, "rewards/chosen": -18.5, "rewards/margins": 9.5, "rewards/rejected": -28.0, "step": 13710 }, { "epoch": 0.9903991915108641, "grad_norm": 9.617480856030397, "learning_rate": 6.829877767625106e-07, "logits/chosen": -0.373046875, "logits/rejected": 0.10986328125, "logps/chosen": -470.0, "logps/rejected": -492.0, "loss": 0.0826, "rewards/accuracies": 0.96875, "rewards/chosen": -17.25, "rewards/margins": 9.375, "rewards/rejected": -26.625, "step": 13720 }, { "epoch": 0.9911210568107991, "grad_norm": 5.993937485388265, "learning_rate": 6.827390104906408e-07, "logits/chosen": -0.484375, "logits/rejected": 0.1318359375, "logps/chosen": -442.0, "logps/rejected": -512.0, "loss": 0.0878, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -17.375, "rewards/margins": 9.5, "rewards/rejected": -26.875, "step": 13730 }, { "epoch": 0.9918429221107341, "grad_norm": 14.884711697972985, "learning_rate": 6.824905158471082e-07, "logits/chosen": -0.453125, "logits/rejected": 0.07568359375, "logps/chosen": -448.0, "logps/rejected": -510.0, "loss": 0.0824, "rewards/accuracies": 0.96875, "rewards/chosen": -16.75, "rewards/margins": 8.75, "rewards/rejected": -25.5, "step": 13740 }, { "epoch": 0.9925647874106691, "grad_norm": 8.614946379663799, "learning_rate": 6.822422923379532e-07, "logits/chosen": -0.3984375, "logits/rejected": -0.013671875, "logps/chosen": -416.0, "logps/rejected": -466.0, "loss": 0.0697, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -13.375, "rewards/margins": 9.5, "rewards/rejected": -22.875, "step": 13750 }, { "epoch": 0.9932866527106042, "grad_norm": 5.533888437666742, "learning_rate": 6.819943394704734e-07, "logits/chosen": -0.47265625, "logits/rejected": -0.06640625, "logps/chosen": -438.0, "logps/rejected": -490.0, "loss": 0.0736, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -15.9375, "rewards/margins": 9.25, "rewards/rejected": -25.25, "step": 13760 }, { "epoch": 0.9940085180105392, "grad_norm": 6.304034274985279, "learning_rate": 6.817466567532181e-07, "logits/chosen": -0.451171875, "logits/rejected": -0.034423828125, "logps/chosen": -432.0, "logps/rejected": -480.0, "loss": 0.0588, "rewards/accuracies": 0.96875, "rewards/chosen": -15.375, "rewards/margins": 9.0, "rewards/rejected": -24.375, "step": 13770 }, { "epoch": 0.9947303833104743, "grad_norm": 12.186586866451217, "learning_rate": 6.814992436959855e-07, "logits/chosen": -0.490234375, "logits/rejected": 0.1025390625, "logps/chosen": -464.0, "logps/rejected": -524.0, "loss": 0.0913, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -17.875, "rewards/margins": 9.25, "rewards/rejected": -27.125, "step": 13780 }, { "epoch": 0.9954522486104093, "grad_norm": 7.679812856839088, "learning_rate": 6.812520998098182e-07, "logits/chosen": -0.546875, "logits/rejected": -0.130859375, "logps/chosen": -468.0, "logps/rejected": -544.0, "loss": 0.0501, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -18.625, "rewards/margins": 9.4375, "rewards/rejected": -28.0, "step": 13790 }, { "epoch": 0.9961741139103444, "grad_norm": 4.502796785236198, "learning_rate": 6.810052246069989e-07, "logits/chosen": -0.494140625, "logits/rejected": 0.015380859375, "logps/chosen": -454.0, "logps/rejected": -512.0, "loss": 0.0902, "rewards/accuracies": 0.96875, "rewards/chosen": -17.125, "rewards/margins": 8.8125, "rewards/rejected": -26.0, "step": 13800 }, { "epoch": 0.9968959792102794, "grad_norm": 3.4790599416031025, "learning_rate": 6.807586176010469e-07, "logits/chosen": -0.5, "logits/rejected": 0.03271484375, "logps/chosen": -476.0, "logps/rejected": -520.0, "loss": 0.0823, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -17.5, "rewards/margins": 9.9375, "rewards/rejected": -27.375, "step": 13810 }, { "epoch": 0.9976178445102144, "grad_norm": 7.236224769443775, "learning_rate": 6.805122783067135e-07, "logits/chosen": -0.59375, "logits/rejected": -0.013916015625, "logps/chosen": -444.0, "logps/rejected": -510.0, "loss": 0.0841, "rewards/accuracies": 0.96875, "rewards/chosen": -17.5, "rewards/margins": 9.6875, "rewards/rejected": -27.25, "step": 13820 }, { "epoch": 0.9983397098101494, "grad_norm": 7.146653360093775, "learning_rate": 6.802662062399785e-07, "logits/chosen": -0.55078125, "logits/rejected": -0.020263671875, "logps/chosen": -466.0, "logps/rejected": -508.0, "loss": 0.0708, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -17.5, "rewards/margins": 9.0, "rewards/rejected": -26.5, "step": 13830 }, { "epoch": 0.9990615751100844, "grad_norm": 9.033364009709832, "learning_rate": 6.800204009180459e-07, "logits/chosen": -0.279296875, "logits/rejected": 0.09228515625, "logps/chosen": -456.0, "logps/rejected": -536.0, "loss": 0.08, "rewards/accuracies": 0.96875, "rewards/chosen": -16.75, "rewards/margins": 9.5625, "rewards/rejected": -26.375, "step": 13840 }, { "epoch": 0.9997834404100195, "grad_norm": 7.36013473597634, "learning_rate": 6.797748618593397e-07, "logits/chosen": -0.5546875, "logits/rejected": -0.046875, "logps/chosen": -460.0, "logps/rejected": -516.0, "loss": 0.0712, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -16.75, "rewards/margins": 9.0625, "rewards/rejected": -25.75, "step": 13850 }, { "epoch": 1.0005053057099544, "grad_norm": 4.980123916128394, "learning_rate": 6.795295885835008e-07, "logits/chosen": -0.408203125, "logits/rejected": 0.10693359375, "logps/chosen": -420.0, "logps/rejected": -498.0, "loss": 0.0444, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -14.9375, "rewards/margins": 9.3125, "rewards/rejected": -24.25, "step": 13860 }, { "epoch": 1.0012271710098894, "grad_norm": 3.6279160863310156, "learning_rate": 6.792845806113816e-07, "logits/chosen": -0.5390625, "logits/rejected": -0.2001953125, "logps/chosen": -450.0, "logps/rejected": -490.0, "loss": 0.0337, "rewards/accuracies": 1.0, "rewards/chosen": -15.1875, "rewards/margins": 9.1875, "rewards/rejected": -24.375, "step": 13870 }, { "epoch": 1.0019490363098247, "grad_norm": 5.009253615520719, "learning_rate": 6.790398374650439e-07, "logits/chosen": -0.515625, "logits/rejected": -0.031982421875, "logps/chosen": -446.0, "logps/rejected": -484.0, "loss": 0.0245, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -14.875, "rewards/margins": 9.0625, "rewards/rejected": -24.0, "step": 13880 }, { "epoch": 1.0026709016097597, "grad_norm": 6.640496030133301, "learning_rate": 6.787953586677534e-07, "logits/chosen": -0.60546875, "logits/rejected": -0.07080078125, "logps/chosen": -464.0, "logps/rejected": -516.0, "loss": 0.0379, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -16.75, "rewards/margins": 9.625, "rewards/rejected": -26.375, "step": 13890 }, { "epoch": 1.0033927669096947, "grad_norm": 2.6825680670854575, "learning_rate": 6.785511437439767e-07, "logits/chosen": -0.6328125, "logits/rejected": -0.045166015625, "logps/chosen": -466.0, "logps/rejected": -528.0, "loss": 0.0246, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -17.25, "rewards/margins": 10.5, "rewards/rejected": -27.75, "step": 13900 }, { "epoch": 1.0041146322096297, "grad_norm": 4.6383720589146415, "learning_rate": 6.78307192219377e-07, "logits/chosen": -0.59375, "logits/rejected": -0.02490234375, "logps/chosen": -468.0, "logps/rejected": -532.0, "loss": 0.0398, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -18.25, "rewards/margins": 9.9375, "rewards/rejected": -28.25, "step": 13910 }, { "epoch": 1.0048364975095647, "grad_norm": 5.566311104901634, "learning_rate": 6.780635036208104e-07, "logits/chosen": -0.65234375, "logits/rejected": 0.09716796875, "logps/chosen": -452.0, "logps/rejected": -544.0, "loss": 0.0346, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -16.75, "rewards/margins": 11.0, "rewards/rejected": -27.75, "step": 13920 }, { "epoch": 1.0055583628094997, "grad_norm": 4.10070904205242, "learning_rate": 6.77820077476322e-07, "logits/chosen": -0.5859375, "logits/rejected": -0.06787109375, "logps/chosen": -444.0, "logps/rejected": -516.0, "loss": 0.0474, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -17.75, "rewards/margins": 10.4375, "rewards/rejected": -28.125, "step": 13930 }, { "epoch": 1.0062802281094347, "grad_norm": 3.3892761837597103, "learning_rate": 6.775769133151421e-07, "logits/chosen": -0.62890625, "logits/rejected": -0.054931640625, "logps/chosen": -462.0, "logps/rejected": -520.0, "loss": 0.0413, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -18.5, "rewards/margins": 10.5, "rewards/rejected": -29.0, "step": 13940 }, { "epoch": 1.0070020934093697, "grad_norm": 7.573541759296532, "learning_rate": 6.773340106676827e-07, "logits/chosen": -0.52734375, "logits/rejected": -0.0595703125, "logps/chosen": -462.0, "logps/rejected": -532.0, "loss": 0.0382, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -18.625, "rewards/margins": 10.25, "rewards/rejected": -28.875, "step": 13950 }, { "epoch": 1.0077239587093048, "grad_norm": 3.149358204086945, "learning_rate": 6.770913690655326e-07, "logits/chosen": -0.625, "logits/rejected": 0.016845703125, "logps/chosen": -478.0, "logps/rejected": -528.0, "loss": 0.0375, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -19.125, "rewards/margins": 10.25, "rewards/rejected": -29.25, "step": 13960 }, { "epoch": 1.0084458240092398, "grad_norm": 3.6283874576671313, "learning_rate": 6.768489880414549e-07, "logits/chosen": -0.57421875, "logits/rejected": -0.03125, "logps/chosen": -476.0, "logps/rejected": -532.0, "loss": 0.0396, "rewards/accuracies": 0.96875, "rewards/chosen": -18.125, "rewards/margins": 10.5625, "rewards/rejected": -28.625, "step": 13970 }, { "epoch": 1.009167689309175, "grad_norm": 3.189943986128951, "learning_rate": 6.766068671293825e-07, "logits/chosen": -0.37890625, "logits/rejected": 0.056884765625, "logps/chosen": -474.0, "logps/rejected": -532.0, "loss": 0.027, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -18.375, "rewards/margins": 11.0625, "rewards/rejected": -29.375, "step": 13980 }, { "epoch": 1.00988955460911, "grad_norm": 4.501930499455324, "learning_rate": 6.763650058644149e-07, "logits/chosen": -0.58203125, "logits/rejected": -0.15234375, "logps/chosen": -418.0, "logps/rejected": -504.0, "loss": 0.0346, "rewards/accuracies": 0.96875, "rewards/chosen": -16.0, "rewards/margins": 9.9375, "rewards/rejected": -26.0, "step": 13990 }, { "epoch": 1.010611419909045, "grad_norm": 7.038625790836817, "learning_rate": 6.761234037828132e-07, "logits/chosen": -0.546875, "logits/rejected": -0.04248046875, "logps/chosen": -466.0, "logps/rejected": -516.0, "loss": 0.0322, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -18.375, "rewards/margins": 10.625, "rewards/rejected": -29.0, "step": 14000 }, { "epoch": 1.01133328520898, "grad_norm": 5.7525574545449505, "learning_rate": 6.758820604219981e-07, "logits/chosen": -0.5, "logits/rejected": -0.0791015625, "logps/chosen": -472.0, "logps/rejected": -544.0, "loss": 0.0479, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -18.25, "rewards/margins": 10.25, "rewards/rejected": -28.5, "step": 14010 }, { "epoch": 1.012055150508915, "grad_norm": 13.672083423768704, "learning_rate": 6.756409753205447e-07, "logits/chosen": -0.69140625, "logits/rejected": -0.09228515625, "logps/chosen": -478.0, "logps/rejected": -544.0, "loss": 0.0414, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -18.375, "rewards/margins": 10.375, "rewards/rejected": -28.75, "step": 14020 }, { "epoch": 1.01277701580885, "grad_norm": 4.285810857447892, "learning_rate": 6.754001480181798e-07, "logits/chosen": -0.6171875, "logits/rejected": -0.01019287109375, "logps/chosen": -464.0, "logps/rejected": -540.0, "loss": 0.0342, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -17.625, "rewards/margins": 10.9375, "rewards/rejected": -28.625, "step": 14030 }, { "epoch": 1.013498881108785, "grad_norm": 6.612768005565578, "learning_rate": 6.751595780557777e-07, "logits/chosen": -0.609375, "logits/rejected": 0.001739501953125, "logps/chosen": -480.0, "logps/rejected": -544.0, "loss": 0.037, "rewards/accuracies": 1.0, "rewards/chosen": -17.875, "rewards/margins": 10.375, "rewards/rejected": -28.25, "step": 14040 }, { "epoch": 1.01422074640872, "grad_norm": 3.117207285677455, "learning_rate": 6.749192649753564e-07, "logits/chosen": -0.74609375, "logits/rejected": -0.1572265625, "logps/chosen": -472.0, "logps/rejected": -528.0, "loss": 0.0413, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -16.875, "rewards/margins": 10.875, "rewards/rejected": -27.75, "step": 14050 }, { "epoch": 1.014942611708655, "grad_norm": 6.510099852038428, "learning_rate": 6.746792083200745e-07, "logits/chosen": -0.54296875, "logits/rejected": -0.029296875, "logps/chosen": -454.0, "logps/rejected": -540.0, "loss": 0.0484, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -18.25, "rewards/margins": 11.25, "rewards/rejected": -29.5, "step": 14060 }, { "epoch": 1.0156644770085903, "grad_norm": 6.58498866709647, "learning_rate": 6.74439407634227e-07, "logits/chosen": -0.734375, "logits/rejected": -0.146484375, "logps/chosen": -482.0, "logps/rejected": -576.0, "loss": 0.0311, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -19.5, "rewards/margins": 11.8125, "rewards/rejected": -31.375, "step": 14070 }, { "epoch": 1.0163863423085253, "grad_norm": 3.4875173618806095, "learning_rate": 6.74199862463242e-07, "logits/chosen": -0.703125, "logits/rejected": -0.2431640625, "logps/chosen": -464.0, "logps/rejected": -544.0, "loss": 0.0221, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -17.375, "rewards/margins": 11.6875, "rewards/rejected": -29.125, "step": 14080 }, { "epoch": 1.0171082076084603, "grad_norm": 3.0247925999636736, "learning_rate": 6.739605723536771e-07, "logits/chosen": -0.64453125, "logits/rejected": -0.1923828125, "logps/chosen": -476.0, "logps/rejected": -544.0, "loss": 0.034, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -17.625, "rewards/margins": 10.25, "rewards/rejected": -27.875, "step": 14090 }, { "epoch": 1.0178300729083953, "grad_norm": 7.661627921045524, "learning_rate": 6.737215368532152e-07, "logits/chosen": -0.67578125, "logits/rejected": -0.03173828125, "logps/chosen": -456.0, "logps/rejected": -516.0, "loss": 0.0555, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -16.875, "rewards/margins": 10.75, "rewards/rejected": -27.625, "step": 14100 }, { "epoch": 1.0185519382083303, "grad_norm": 1.0840425000400435, "learning_rate": 6.734827555106618e-07, "logits/chosen": -0.61328125, "logits/rejected": -0.095703125, "logps/chosen": -464.0, "logps/rejected": -520.0, "loss": 0.0364, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -17.0, "rewards/margins": 10.375, "rewards/rejected": -27.375, "step": 14110 }, { "epoch": 1.0192738035082654, "grad_norm": 6.799197822303818, "learning_rate": 6.73244227875941e-07, "logits/chosen": -0.84765625, "logits/rejected": -0.09716796875, "logps/chosen": -460.0, "logps/rejected": -532.0, "loss": 0.0415, "rewards/accuracies": 1.0, "rewards/chosen": -17.875, "rewards/margins": 10.9375, "rewards/rejected": -28.75, "step": 14120 }, { "epoch": 1.0199956688082004, "grad_norm": 4.250262799184485, "learning_rate": 6.730059535000916e-07, "logits/chosen": -0.75390625, "logits/rejected": -0.267578125, "logps/chosen": -472.0, "logps/rejected": -544.0, "loss": 0.0303, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -18.75, "rewards/margins": 10.5, "rewards/rejected": -29.25, "step": 14130 }, { "epoch": 1.0207175341081354, "grad_norm": 6.5905868147710125, "learning_rate": 6.727679319352644e-07, "logits/chosen": -0.5703125, "logits/rejected": -0.036865234375, "logps/chosen": -480.0, "logps/rejected": -528.0, "loss": 0.0433, "rewards/accuracies": 1.0, "rewards/chosen": -20.25, "rewards/margins": 9.75, "rewards/rejected": -29.875, "step": 14140 }, { "epoch": 1.0214393994080704, "grad_norm": 6.4742587485425585, "learning_rate": 6.725301627347177e-07, "logits/chosen": -0.62109375, "logits/rejected": 0.0908203125, "logps/chosen": -482.0, "logps/rejected": -536.0, "loss": 0.0439, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -19.125, "rewards/margins": 11.0, "rewards/rejected": -30.25, "step": 14150 }, { "epoch": 1.0221612647080054, "grad_norm": 9.731467541285447, "learning_rate": 6.722926454528143e-07, "logits/chosen": -0.53515625, "logits/rejected": -0.026123046875, "logps/chosen": -480.0, "logps/rejected": -552.0, "loss": 0.0286, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -18.875, "rewards/margins": 10.3125, "rewards/rejected": -29.25, "step": 14160 }, { "epoch": 1.0228831300079406, "grad_norm": 9.069333031869968, "learning_rate": 6.720553796450181e-07, "logits/chosen": -0.6328125, "logits/rejected": -0.12060546875, "logps/chosen": -462.0, "logps/rejected": -544.0, "loss": 0.0423, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -17.375, "rewards/margins": 11.4375, "rewards/rejected": -28.75, "step": 14170 }, { "epoch": 1.0236049953078756, "grad_norm": 7.250180185531442, "learning_rate": 6.718183648678903e-07, "logits/chosen": -0.69140625, "logits/rejected": -0.0478515625, "logps/chosen": -452.0, "logps/rejected": -510.0, "loss": 0.054, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -17.625, "rewards/margins": 10.3125, "rewards/rejected": -27.875, "step": 14180 }, { "epoch": 1.0243268606078106, "grad_norm": 5.161778819899752, "learning_rate": 6.715816006790862e-07, "logits/chosen": -0.68359375, "logits/rejected": -0.2001953125, "logps/chosen": -442.0, "logps/rejected": -500.0, "loss": 0.042, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -15.875, "rewards/margins": 10.5625, "rewards/rejected": -26.5, "step": 14190 }, { "epoch": 1.0250487259077457, "grad_norm": 8.71443042703087, "learning_rate": 6.713450866373512e-07, "logits/chosen": -0.68359375, "logits/rejected": -0.0966796875, "logps/chosen": -448.0, "logps/rejected": -500.0, "loss": 0.0412, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -16.875, "rewards/margins": 9.8125, "rewards/rejected": -26.625, "step": 14200 }, { "epoch": 1.0257705912076807, "grad_norm": 0.6282207871413441, "learning_rate": 6.711088223025183e-07, "logits/chosen": -0.62890625, "logits/rejected": -0.13671875, "logps/chosen": -448.0, "logps/rejected": -506.0, "loss": 0.0358, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -17.25, "rewards/margins": 9.75, "rewards/rejected": -27.0, "step": 14210 }, { "epoch": 1.0264924565076157, "grad_norm": 4.091182250902407, "learning_rate": 6.708728072355036e-07, "logits/chosen": -0.65234375, "logits/rejected": 0.04443359375, "logps/chosen": -464.0, "logps/rejected": -548.0, "loss": 0.0263, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -19.25, "rewards/margins": 10.8125, "rewards/rejected": -30.0, "step": 14220 }, { "epoch": 1.0272143218075507, "grad_norm": 1.8536144974398239, "learning_rate": 6.706370409983032e-07, "logits/chosen": -0.6953125, "logits/rejected": -0.1015625, "logps/chosen": -434.0, "logps/rejected": -512.0, "loss": 0.0394, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -17.375, "rewards/margins": 10.1875, "rewards/rejected": -27.5, "step": 14230 }, { "epoch": 1.0279361871074857, "grad_norm": 11.166996581345497, "learning_rate": 6.704015231539909e-07, "logits/chosen": -0.8125, "logits/rejected": -0.14453125, "logps/chosen": -466.0, "logps/rejected": -532.0, "loss": 0.0267, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -18.0, "rewards/margins": 10.5, "rewards/rejected": -28.5, "step": 14240 }, { "epoch": 1.0286580524074207, "grad_norm": 4.320353834373834, "learning_rate": 6.701662532667127e-07, "logits/chosen": -0.8359375, "logits/rejected": -0.1982421875, "logps/chosen": -496.0, "logps/rejected": -540.0, "loss": 0.0299, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -17.875, "rewards/margins": 11.5, "rewards/rejected": -29.375, "step": 14250 }, { "epoch": 1.0293799177073557, "grad_norm": 5.863827604565047, "learning_rate": 6.699312309016855e-07, "logits/chosen": -0.79296875, "logits/rejected": -0.0849609375, "logps/chosen": -466.0, "logps/rejected": -524.0, "loss": 0.0321, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -18.125, "rewards/margins": 10.5, "rewards/rejected": -28.75, "step": 14260 }, { "epoch": 1.030101783007291, "grad_norm": 2.2422519337280504, "learning_rate": 6.696964556251918e-07, "logits/chosen": -0.65234375, "logits/rejected": 0.06591796875, "logps/chosen": -452.0, "logps/rejected": -564.0, "loss": 0.031, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -19.0, "rewards/margins": 10.625, "rewards/rejected": -29.625, "step": 14270 }, { "epoch": 1.030823648307226, "grad_norm": 8.01113808911681, "learning_rate": 6.694619270045784e-07, "logits/chosen": -0.76171875, "logits/rejected": -0.08837890625, "logps/chosen": -470.0, "logps/rejected": -544.0, "loss": 0.0293, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -20.0, "rewards/margins": 10.6875, "rewards/rejected": -30.625, "step": 14280 }, { "epoch": 1.031545513607161, "grad_norm": 8.094707594536153, "learning_rate": 6.69227644608251e-07, "logits/chosen": -0.6015625, "logits/rejected": -0.043212890625, "logps/chosen": -480.0, "logps/rejected": -556.0, "loss": 0.0462, "rewards/accuracies": 0.96875, "rewards/chosen": -20.375, "rewards/margins": 10.3125, "rewards/rejected": -30.75, "step": 14290 }, { "epoch": 1.032267378907096, "grad_norm": 9.227927649515173, "learning_rate": 6.689936080056727e-07, "logits/chosen": -0.6953125, "logits/rejected": -0.041748046875, "logps/chosen": -506.0, "logps/rejected": -576.0, "loss": 0.0281, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -21.625, "rewards/margins": 10.75, "rewards/rejected": -32.5, "step": 14300 }, { "epoch": 1.032989244207031, "grad_norm": 2.4726832890881902, "learning_rate": 6.687598167673588e-07, "logits/chosen": -0.75, "logits/rejected": -0.11865234375, "logps/chosen": -506.0, "logps/rejected": -564.0, "loss": 0.0311, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -21.25, "rewards/margins": 9.875, "rewards/rejected": -31.125, "step": 14310 }, { "epoch": 1.033711109506966, "grad_norm": 2.8401874769025603, "learning_rate": 6.685262704648755e-07, "logits/chosen": -0.73828125, "logits/rejected": -0.045654296875, "logps/chosen": -480.0, "logps/rejected": -552.0, "loss": 0.0346, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -20.5, "rewards/margins": 10.125, "rewards/rejected": -30.625, "step": 14320 }, { "epoch": 1.034432974806901, "grad_norm": 5.339895505827722, "learning_rate": 6.682929686708352e-07, "logits/chosen": -0.6953125, "logits/rejected": -0.1494140625, "logps/chosen": -466.0, "logps/rejected": -544.0, "loss": 0.0293, "rewards/accuracies": 1.0, "rewards/chosen": -19.0, "rewards/margins": 11.125, "rewards/rejected": -30.125, "step": 14330 }, { "epoch": 1.035154840106836, "grad_norm": 14.080145579218092, "learning_rate": 6.680599109588932e-07, "logits/chosen": -0.66796875, "logits/rejected": -0.09033203125, "logps/chosen": -480.0, "logps/rejected": -540.0, "loss": 0.0396, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -19.375, "rewards/margins": 10.5625, "rewards/rejected": -29.875, "step": 14340 }, { "epoch": 1.035876705406771, "grad_norm": 6.850964070488285, "learning_rate": 6.678270969037457e-07, "logits/chosen": -0.68359375, "logits/rejected": -0.08837890625, "logps/chosen": -462.0, "logps/rejected": -544.0, "loss": 0.0426, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -19.625, "rewards/margins": 9.8125, "rewards/rejected": -29.375, "step": 14350 }, { "epoch": 1.036598570706706, "grad_norm": 3.6411970440547083, "learning_rate": 6.67594526081125e-07, "logits/chosen": -0.71484375, "logits/rejected": -0.2216796875, "logps/chosen": -506.0, "logps/rejected": -580.0, "loss": 0.0583, "rewards/accuracies": 1.0, "rewards/chosen": -19.625, "rewards/margins": 11.5, "rewards/rejected": -31.125, "step": 14360 }, { "epoch": 1.0373204360066413, "grad_norm": 10.554749992018749, "learning_rate": 6.673621980677971e-07, "logits/chosen": -0.86328125, "logits/rejected": -0.146484375, "logps/chosen": -520.0, "logps/rejected": -568.0, "loss": 0.0281, "rewards/accuracies": 1.0, "rewards/chosen": -20.375, "rewards/margins": 10.875, "rewards/rejected": -31.25, "step": 14370 }, { "epoch": 1.0380423013065763, "grad_norm": 7.6337890305697735, "learning_rate": 6.671301124415585e-07, "logits/chosen": -0.67578125, "logits/rejected": -0.1689453125, "logps/chosen": -482.0, "logps/rejected": -556.0, "loss": 0.0309, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -19.375, "rewards/margins": 10.875, "rewards/rejected": -30.25, "step": 14380 }, { "epoch": 1.0387641666065113, "grad_norm": 7.867668140889722, "learning_rate": 6.668982687812326e-07, "logits/chosen": -0.8125, "logits/rejected": -0.19140625, "logps/chosen": -458.0, "logps/rejected": -520.0, "loss": 0.0293, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -17.125, "rewards/margins": 10.5, "rewards/rejected": -27.625, "step": 14390 }, { "epoch": 1.0394860319064463, "grad_norm": 8.441680667011251, "learning_rate": 6.666666666666666e-07, "logits/chosen": -0.7578125, "logits/rejected": -0.1015625, "logps/chosen": -450.0, "logps/rejected": -528.0, "loss": 0.0512, "rewards/accuracies": 0.96875, "rewards/chosen": -17.125, "rewards/margins": 10.0625, "rewards/rejected": -27.25, "step": 14400 }, { "epoch": 1.0402078972063813, "grad_norm": 3.1816534710863973, "learning_rate": 6.664353056787287e-07, "logits/chosen": -0.7109375, "logits/rejected": -0.220703125, "logps/chosen": -456.0, "logps/rejected": -532.0, "loss": 0.0389, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -17.75, "rewards/margins": 10.875, "rewards/rejected": -28.75, "step": 14410 }, { "epoch": 1.0409297625063163, "grad_norm": 2.2645957333707045, "learning_rate": 6.662041853993043e-07, "logits/chosen": -0.72265625, "logits/rejected": -0.2021484375, "logps/chosen": -474.0, "logps/rejected": -524.0, "loss": 0.0392, "rewards/accuracies": 0.96875, "rewards/chosen": -17.875, "rewards/margins": 9.0, "rewards/rejected": -26.75, "step": 14420 }, { "epoch": 1.0416516278062513, "grad_norm": 6.388379284660044, "learning_rate": 6.659733054112932e-07, "logits/chosen": -0.63671875, "logits/rejected": -0.0791015625, "logps/chosen": -442.0, "logps/rejected": -506.0, "loss": 0.0358, "rewards/accuracies": 0.96875, "rewards/chosen": -17.75, "rewards/margins": 9.875, "rewards/rejected": -27.625, "step": 14430 }, { "epoch": 1.0423734931061863, "grad_norm": 6.009448617810182, "learning_rate": 6.657426652986061e-07, "logits/chosen": -0.765625, "logits/rejected": -0.2275390625, "logps/chosen": -460.0, "logps/rejected": -548.0, "loss": 0.0354, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -18.375, "rewards/margins": 9.9375, "rewards/rejected": -28.25, "step": 14440 }, { "epoch": 1.0430953584061213, "grad_norm": 7.4064219994022436, "learning_rate": 6.655122646461624e-07, "logits/chosen": -0.80859375, "logits/rejected": -0.2099609375, "logps/chosen": -462.0, "logps/rejected": -516.0, "loss": 0.0506, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -17.875, "rewards/margins": 10.3125, "rewards/rejected": -28.25, "step": 14450 }, { "epoch": 1.0438172237060566, "grad_norm": 6.329863004099862, "learning_rate": 6.652821030398855e-07, "logits/chosen": -0.8359375, "logits/rejected": -0.376953125, "logps/chosen": -464.0, "logps/rejected": -512.0, "loss": 0.0553, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -17.25, "rewards/margins": 9.875, "rewards/rejected": -27.125, "step": 14460 }, { "epoch": 1.0445390890059916, "grad_norm": 8.681972957737457, "learning_rate": 6.650521800667013e-07, "logits/chosen": -0.75390625, "logits/rejected": -0.09228515625, "logps/chosen": -458.0, "logps/rejected": -528.0, "loss": 0.028, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -17.875, "rewards/margins": 10.5625, "rewards/rejected": -28.5, "step": 14470 }, { "epoch": 1.0452609543059266, "grad_norm": 3.891931853528603, "learning_rate": 6.648224953145336e-07, "logits/chosen": -0.82421875, "logits/rejected": -0.1337890625, "logps/chosen": -452.0, "logps/rejected": -536.0, "loss": 0.0433, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -17.875, "rewards/margins": 10.625, "rewards/rejected": -28.5, "step": 14480 }, { "epoch": 1.0459828196058616, "grad_norm": 5.028562959089505, "learning_rate": 6.645930483723025e-07, "logits/chosen": -0.84375, "logits/rejected": -0.14453125, "logps/chosen": -476.0, "logps/rejected": -552.0, "loss": 0.0321, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -19.375, "rewards/margins": 11.0625, "rewards/rejected": -30.375, "step": 14490 }, { "epoch": 1.0467046849057966, "grad_norm": 9.156939535343296, "learning_rate": 6.643638388299198e-07, "logits/chosen": -0.6953125, "logits/rejected": -0.09619140625, "logps/chosen": -468.0, "logps/rejected": -532.0, "loss": 0.0453, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -19.0, "rewards/margins": 10.625, "rewards/rejected": -29.625, "step": 14500 }, { "epoch": 1.0474265502057316, "grad_norm": 7.2417235209137685, "learning_rate": 6.64134866278287e-07, "logits/chosen": -0.71484375, "logits/rejected": -0.07373046875, "logps/chosen": -482.0, "logps/rejected": -536.0, "loss": 0.0222, "rewards/accuracies": 1.0, "rewards/chosen": -19.875, "rewards/margins": 10.875, "rewards/rejected": -30.75, "step": 14510 }, { "epoch": 1.0481484155056666, "grad_norm": 1.699977324850441, "learning_rate": 6.639061303092922e-07, "logits/chosen": -0.6953125, "logits/rejected": -0.115234375, "logps/chosen": -486.0, "logps/rejected": -572.0, "loss": 0.0391, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -20.875, "rewards/margins": 10.8125, "rewards/rejected": -31.75, "step": 14520 }, { "epoch": 1.0488702808056016, "grad_norm": 4.08961189631735, "learning_rate": 6.636776305158062e-07, "logits/chosen": -0.73828125, "logits/rejected": -0.1318359375, "logps/chosen": -524.0, "logps/rejected": -612.0, "loss": 0.0473, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -22.25, "rewards/margins": 12.25, "rewards/rejected": -34.5, "step": 14530 }, { "epoch": 1.0495921461055366, "grad_norm": 6.256145130522096, "learning_rate": 6.634493664916802e-07, "logits/chosen": -0.75390625, "logits/rejected": -0.2734375, "logps/chosen": -486.0, "logps/rejected": -548.0, "loss": 0.0404, "rewards/accuracies": 1.0, "rewards/chosen": -20.125, "rewards/margins": 10.5625, "rewards/rejected": -30.625, "step": 14540 }, { "epoch": 1.0503140114054716, "grad_norm": 13.534743486884542, "learning_rate": 6.632213378317426e-07, "logits/chosen": -0.96875, "logits/rejected": -0.19140625, "logps/chosen": -460.0, "logps/rejected": -528.0, "loss": 0.0514, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -18.5, "rewards/margins": 11.1875, "rewards/rejected": -29.75, "step": 14550 }, { "epoch": 1.0510358767054067, "grad_norm": 6.464449659219985, "learning_rate": 6.629935441317959e-07, "logits/chosen": -0.82421875, "logits/rejected": -0.185546875, "logps/chosen": -462.0, "logps/rejected": -560.0, "loss": 0.02, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -19.75, "rewards/margins": 10.875, "rewards/rejected": -30.75, "step": 14560 }, { "epoch": 1.051757742005342, "grad_norm": 8.820194141548754, "learning_rate": 6.627659849886136e-07, "logits/chosen": -0.82421875, "logits/rejected": -0.240234375, "logps/chosen": -504.0, "logps/rejected": -564.0, "loss": 0.0305, "rewards/accuracies": 1.0, "rewards/chosen": -20.625, "rewards/margins": 9.9375, "rewards/rejected": -30.5, "step": 14570 }, { "epoch": 1.052479607305277, "grad_norm": 6.933048189380386, "learning_rate": 6.625386599999376e-07, "logits/chosen": -0.88671875, "logits/rejected": -0.30078125, "logps/chosen": -472.0, "logps/rejected": -556.0, "loss": 0.0464, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -18.625, "rewards/margins": 11.1875, "rewards/rejected": -29.75, "step": 14580 }, { "epoch": 1.053201472605212, "grad_norm": 7.170951345495441, "learning_rate": 6.623115687644749e-07, "logits/chosen": -0.81640625, "logits/rejected": -0.09033203125, "logps/chosen": -474.0, "logps/rejected": -532.0, "loss": 0.0383, "rewards/accuracies": 0.96875, "rewards/chosen": -19.5, "rewards/margins": 10.4375, "rewards/rejected": -29.875, "step": 14590 }, { "epoch": 1.053923337905147, "grad_norm": 7.633759691341581, "learning_rate": 6.620847108818943e-07, "logits/chosen": -0.82421875, "logits/rejected": -0.185546875, "logps/chosen": -464.0, "logps/rejected": -508.0, "loss": 0.0393, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -18.625, "rewards/margins": 9.8125, "rewards/rejected": -28.5, "step": 14600 }, { "epoch": 1.054645203205082, "grad_norm": 5.702863187083997, "learning_rate": 6.618580859528244e-07, "logits/chosen": -0.8125, "logits/rejected": -0.2412109375, "logps/chosen": -446.0, "logps/rejected": -504.0, "loss": 0.0415, "rewards/accuracies": 1.0, "rewards/chosen": -17.25, "rewards/margins": 9.8125, "rewards/rejected": -27.125, "step": 14610 }, { "epoch": 1.055367068505017, "grad_norm": 4.061125471373368, "learning_rate": 6.616316935788494e-07, "logits/chosen": -0.69921875, "logits/rejected": -0.142578125, "logps/chosen": -478.0, "logps/rejected": -540.0, "loss": 0.0281, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -19.625, "rewards/margins": 10.0625, "rewards/rejected": -29.625, "step": 14620 }, { "epoch": 1.056088933804952, "grad_norm": 3.7359680293052064, "learning_rate": 6.614055333625071e-07, "logits/chosen": -0.83203125, "logits/rejected": -0.1650390625, "logps/chosen": -472.0, "logps/rejected": -564.0, "loss": 0.0406, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -19.625, "rewards/margins": 11.625, "rewards/rejected": -31.25, "step": 14630 }, { "epoch": 1.056810799104887, "grad_norm": 5.687237365650945, "learning_rate": 6.611796049072861e-07, "logits/chosen": -0.66015625, "logits/rejected": -0.09033203125, "logps/chosen": -458.0, "logps/rejected": -536.0, "loss": 0.0283, "rewards/accuracies": 1.0, "rewards/chosen": -18.875, "rewards/margins": 10.4375, "rewards/rejected": -29.25, "step": 14640 }, { "epoch": 1.057532664404822, "grad_norm": 3.946736540906871, "learning_rate": 6.60953907817622e-07, "logits/chosen": -0.69921875, "logits/rejected": -0.1611328125, "logps/chosen": -466.0, "logps/rejected": -540.0, "loss": 0.0299, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -20.125, "rewards/margins": 10.625, "rewards/rejected": -30.75, "step": 14650 }, { "epoch": 1.0582545297047572, "grad_norm": 2.947008004928722, "learning_rate": 6.607284416988948e-07, "logits/chosen": -0.6796875, "logits/rejected": -0.1640625, "logps/chosen": -472.0, "logps/rejected": -536.0, "loss": 0.0471, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -18.75, "rewards/margins": 10.5, "rewards/rejected": -29.25, "step": 14660 }, { "epoch": 1.0589763950046922, "grad_norm": 6.523257456086955, "learning_rate": 6.605032061574266e-07, "logits/chosen": -0.83984375, "logits/rejected": -0.1845703125, "logps/chosen": -488.0, "logps/rejected": -548.0, "loss": 0.0359, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -19.875, "rewards/margins": 10.6875, "rewards/rejected": -30.5, "step": 14670 }, { "epoch": 1.0596982603046272, "grad_norm": 6.197019720914456, "learning_rate": 6.602782008004778e-07, "logits/chosen": -0.8046875, "logits/rejected": -0.25, "logps/chosen": -496.0, "logps/rejected": -572.0, "loss": 0.0457, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -21.25, "rewards/margins": 10.5625, "rewards/rejected": -31.875, "step": 14680 }, { "epoch": 1.0604201256045622, "grad_norm": 9.571428461169102, "learning_rate": 6.600534252362451e-07, "logits/chosen": -0.7890625, "logits/rejected": -0.22265625, "logps/chosen": -494.0, "logps/rejected": -568.0, "loss": 0.0494, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -20.125, "rewards/margins": 10.625, "rewards/rejected": -30.75, "step": 14690 }, { "epoch": 1.0611419909044972, "grad_norm": 7.113083720827658, "learning_rate": 6.59828879073858e-07, "logits/chosen": -0.64453125, "logits/rejected": -0.07275390625, "logps/chosen": -488.0, "logps/rejected": -572.0, "loss": 0.0411, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -20.0, "rewards/margins": 10.6875, "rewards/rejected": -30.625, "step": 14700 }, { "epoch": 1.0618638562044322, "grad_norm": 4.264550467830831, "learning_rate": 6.596045619233764e-07, "logits/chosen": -0.6796875, "logits/rejected": -0.154296875, "logps/chosen": -494.0, "logps/rejected": -560.0, "loss": 0.0357, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -20.25, "rewards/margins": 10.875, "rewards/rejected": -31.125, "step": 14710 }, { "epoch": 1.0625857215043673, "grad_norm": 3.6878600843772444, "learning_rate": 6.59380473395787e-07, "logits/chosen": -0.6953125, "logits/rejected": -0.0179443359375, "logps/chosen": -466.0, "logps/rejected": -544.0, "loss": 0.0218, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -19.625, "rewards/margins": 11.1875, "rewards/rejected": -30.75, "step": 14720 }, { "epoch": 1.0633075868043023, "grad_norm": 9.54069824077645, "learning_rate": 6.591566131030017e-07, "logits/chosen": -0.71875, "logits/rejected": -0.189453125, "logps/chosen": -482.0, "logps/rejected": -548.0, "loss": 0.0331, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -20.0, "rewards/margins": 10.0, "rewards/rejected": -30.0, "step": 14730 }, { "epoch": 1.0640294521042373, "grad_norm": 5.115032639283987, "learning_rate": 6.589329806578535e-07, "logits/chosen": -0.71484375, "logits/rejected": -0.04931640625, "logps/chosen": -494.0, "logps/rejected": -540.0, "loss": 0.0293, "rewards/accuracies": 1.0, "rewards/chosen": -20.5, "rewards/margins": 10.5, "rewards/rejected": -31.0, "step": 14740 }, { "epoch": 1.0647513174041723, "grad_norm": 4.972928091112957, "learning_rate": 6.587095756740946e-07, "logits/chosen": -0.7578125, "logits/rejected": -0.1328125, "logps/chosen": -452.0, "logps/rejected": -536.0, "loss": 0.0291, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -17.5, "rewards/margins": 11.6875, "rewards/rejected": -29.25, "step": 14750 }, { "epoch": 1.0654731827041075, "grad_norm": 9.892473595770326, "learning_rate": 6.58486397766393e-07, "logits/chosen": -0.79296875, "logits/rejected": -0.2451171875, "logps/chosen": -492.0, "logps/rejected": -544.0, "loss": 0.0448, "rewards/accuracies": 0.96875, "rewards/chosen": -20.75, "rewards/margins": 9.4375, "rewards/rejected": -30.25, "step": 14760 }, { "epoch": 1.0661950480040425, "grad_norm": 8.87345383754722, "learning_rate": 6.582634465503303e-07, "logits/chosen": -0.78125, "logits/rejected": -0.3046875, "logps/chosen": -470.0, "logps/rejected": -528.0, "loss": 0.0272, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -18.25, "rewards/margins": 10.4375, "rewards/rejected": -28.75, "step": 14770 }, { "epoch": 1.0669169133039775, "grad_norm": 4.732030230349395, "learning_rate": 6.580407216423983e-07, "logits/chosen": -0.8515625, "logits/rejected": -0.173828125, "logps/chosen": -434.0, "logps/rejected": -512.0, "loss": 0.0363, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -17.25, "rewards/margins": 10.125, "rewards/rejected": -27.375, "step": 14780 }, { "epoch": 1.0676387786039125, "grad_norm": 5.600162670071713, "learning_rate": 6.578182226599962e-07, "logits/chosen": -0.73046875, "logits/rejected": -0.008544921875, "logps/chosen": -452.0, "logps/rejected": -536.0, "loss": 0.0344, "rewards/accuracies": 1.0, "rewards/chosen": -19.0, "rewards/margins": 11.0625, "rewards/rejected": -30.0, "step": 14790 }, { "epoch": 1.0683606439038476, "grad_norm": 9.589016488251145, "learning_rate": 6.575959492214291e-07, "logits/chosen": -0.71484375, "logits/rejected": -0.05029296875, "logps/chosen": -492.0, "logps/rejected": -544.0, "loss": 0.0513, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -20.5, "rewards/margins": 10.4375, "rewards/rejected": -31.0, "step": 14800 }, { "epoch": 1.0690825092037826, "grad_norm": 7.256864532484522, "learning_rate": 6.573739009459035e-07, "logits/chosen": -0.67578125, "logits/rejected": 0.007720947265625, "logps/chosen": -496.0, "logps/rejected": -568.0, "loss": 0.0448, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -20.375, "rewards/margins": 10.375, "rewards/rejected": -30.75, "step": 14810 }, { "epoch": 1.0698043745037176, "grad_norm": 3.913048009228486, "learning_rate": 6.571520774535256e-07, "logits/chosen": -0.8203125, "logits/rejected": -0.1875, "logps/chosen": -498.0, "logps/rejected": -540.0, "loss": 0.0239, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -21.0, "rewards/margins": 9.875, "rewards/rejected": -30.875, "step": 14820 }, { "epoch": 1.0705262398036526, "grad_norm": 5.215561510017924, "learning_rate": 6.569304783652981e-07, "logits/chosen": -0.61328125, "logits/rejected": -0.1220703125, "logps/chosen": -468.0, "logps/rejected": -532.0, "loss": 0.0539, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -19.25, "rewards/margins": 10.5, "rewards/rejected": -29.75, "step": 14830 }, { "epoch": 1.0712481051035876, "grad_norm": 7.794870096070178, "learning_rate": 6.567091033031185e-07, "logits/chosen": -0.72265625, "logits/rejected": -0.06787109375, "logps/chosen": -450.0, "logps/rejected": -552.0, "loss": 0.0332, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -18.25, "rewards/margins": 10.5, "rewards/rejected": -28.75, "step": 14840 }, { "epoch": 1.0719699704035226, "grad_norm": 0.5472826986737926, "learning_rate": 6.564879518897744e-07, "logits/chosen": -0.85546875, "logits/rejected": -0.2041015625, "logps/chosen": -432.0, "logps/rejected": -500.0, "loss": 0.0288, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -16.5, "rewards/margins": 10.0625, "rewards/rejected": -26.625, "step": 14850 }, { "epoch": 1.0726918357034578, "grad_norm": 7.307199350885506, "learning_rate": 6.562670237489432e-07, "logits/chosen": -0.68359375, "logits/rejected": -0.06884765625, "logps/chosen": -470.0, "logps/rejected": -552.0, "loss": 0.0295, "rewards/accuracies": 1.0, "rewards/chosen": -18.5, "rewards/margins": 11.125, "rewards/rejected": -29.625, "step": 14860 }, { "epoch": 1.0734137010033928, "grad_norm": 3.3009720424182003, "learning_rate": 6.560463185051874e-07, "logits/chosen": -0.69140625, "logits/rejected": -0.1943359375, "logps/chosen": -458.0, "logps/rejected": -544.0, "loss": 0.0299, "rewards/accuracies": 0.96875, "rewards/chosen": -19.125, "rewards/margins": 9.8125, "rewards/rejected": -29.0, "step": 14870 }, { "epoch": 1.0741355663033278, "grad_norm": 3.2917164115666098, "learning_rate": 6.558258357839529e-07, "logits/chosen": -0.609375, "logits/rejected": -0.05908203125, "logps/chosen": -470.0, "logps/rejected": -528.0, "loss": 0.0383, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -18.125, "rewards/margins": 10.875, "rewards/rejected": -29.125, "step": 14880 }, { "epoch": 1.0748574316032629, "grad_norm": 6.463454143240353, "learning_rate": 6.556055752115664e-07, "logits/chosen": -0.75390625, "logits/rejected": -0.08984375, "logps/chosen": -470.0, "logps/rejected": -532.0, "loss": 0.0575, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -19.25, "rewards/margins": 10.375, "rewards/rejected": -29.625, "step": 14890 }, { "epoch": 1.0755792969031979, "grad_norm": 7.16581651821228, "learning_rate": 6.553855364152324e-07, "logits/chosen": -0.75390625, "logits/rejected": -0.2216796875, "logps/chosen": -486.0, "logps/rejected": -548.0, "loss": 0.0252, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -19.0, "rewards/margins": 10.375, "rewards/rejected": -29.375, "step": 14900 }, { "epoch": 1.0763011622031329, "grad_norm": 5.946225593684132, "learning_rate": 6.551657190230304e-07, "logits/chosen": -0.65234375, "logits/rejected": -0.154296875, "logps/chosen": -478.0, "logps/rejected": -540.0, "loss": 0.0288, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -18.25, "rewards/margins": 10.125, "rewards/rejected": -28.375, "step": 14910 }, { "epoch": 1.0770230275030679, "grad_norm": 7.6281139279588475, "learning_rate": 6.549461226639129e-07, "logits/chosen": -0.6875, "logits/rejected": -0.1279296875, "logps/chosen": -452.0, "logps/rejected": -532.0, "loss": 0.0434, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -17.875, "rewards/margins": 10.375, "rewards/rejected": -28.25, "step": 14920 }, { "epoch": 1.077744892803003, "grad_norm": 8.824018153454377, "learning_rate": 6.547267469677022e-07, "logits/chosen": -0.69921875, "logits/rejected": -0.10205078125, "logps/chosen": -482.0, "logps/rejected": -552.0, "loss": 0.0429, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -18.25, "rewards/margins": 11.5625, "rewards/rejected": -29.875, "step": 14930 }, { "epoch": 1.078466758102938, "grad_norm": 5.996912599303431, "learning_rate": 6.545075915650879e-07, "logits/chosen": -0.54296875, "logits/rejected": 0.06103515625, "logps/chosen": -500.0, "logps/rejected": -540.0, "loss": 0.0388, "rewards/accuracies": 1.0, "rewards/chosen": -19.25, "rewards/margins": 11.0, "rewards/rejected": -30.25, "step": 14940 }, { "epoch": 1.0791886234028731, "grad_norm": 5.7558165279415645, "learning_rate": 6.542886560876246e-07, "logits/chosen": -0.796875, "logits/rejected": -0.314453125, "logps/chosen": -474.0, "logps/rejected": -568.0, "loss": 0.0365, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -19.0, "rewards/margins": 10.875, "rewards/rejected": -29.875, "step": 14950 }, { "epoch": 1.0799104887028081, "grad_norm": 9.25667323840582, "learning_rate": 6.540699401677286e-07, "logits/chosen": -0.7734375, "logits/rejected": -0.0751953125, "logps/chosen": -480.0, "logps/rejected": -556.0, "loss": 0.03, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -20.25, "rewards/margins": 11.5, "rewards/rejected": -31.75, "step": 14960 }, { "epoch": 1.0806323540027432, "grad_norm": 5.486056139122968, "learning_rate": 6.538514434386762e-07, "logits/chosen": -0.5625, "logits/rejected": -0.1630859375, "logps/chosen": -478.0, "logps/rejected": -524.0, "loss": 0.0513, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -19.5, "rewards/margins": 9.5625, "rewards/rejected": -29.0, "step": 14970 }, { "epoch": 1.0813542193026782, "grad_norm": 6.346531997703292, "learning_rate": 6.536331655346004e-07, "logits/chosen": -0.91015625, "logits/rejected": -0.1357421875, "logps/chosen": -468.0, "logps/rejected": -524.0, "loss": 0.0284, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -17.75, "rewards/margins": 11.0, "rewards/rejected": -28.75, "step": 14980 }, { "epoch": 1.0820760846026132, "grad_norm": 2.7035809605242447, "learning_rate": 6.534151060904888e-07, "logits/chosen": -0.8515625, "logits/rejected": -0.2099609375, "logps/chosen": -438.0, "logps/rejected": -528.0, "loss": 0.0513, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -16.5, "rewards/margins": 10.25, "rewards/rejected": -26.75, "step": 14990 }, { "epoch": 1.0827979499025482, "grad_norm": 3.55197439769088, "learning_rate": 6.531972647421808e-07, "logits/chosen": -0.67578125, "logits/rejected": -0.057861328125, "logps/chosen": -460.0, "logps/rejected": -528.0, "loss": 0.0359, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -17.5, "rewards/margins": 10.6875, "rewards/rejected": -28.25, "step": 15000 }, { "epoch": 1.0827979499025482, "eval_logits/chosen": -0.73046875, "eval_logits/rejected": -0.18359375, "eval_logps/chosen": -472.0, "eval_logps/rejected": -516.0, "eval_loss": 0.24471142888069153, "eval_rewards/accuracies": 0.9158279299736023, "eval_rewards/chosen": -19.0, "eval_rewards/margins": 8.375, "eval_rewards/rejected": -27.375, "eval_runtime": 2854.084, "eval_samples_per_second": 34.513, "eval_steps_per_second": 0.54, "step": 15000 }, { "epoch": 1.0835198152024832, "grad_norm": 7.155599502103401, "learning_rate": 6.529796411263649e-07, "logits/chosen": -0.498046875, "logits/rejected": -0.003753662109375, "logps/chosen": -448.0, "logps/rejected": -508.0, "loss": 0.0504, "rewards/accuracies": 0.96875, "rewards/chosen": -18.125, "rewards/margins": 10.0625, "rewards/rejected": -28.25, "step": 15010 }, { "epoch": 1.0842416805024182, "grad_norm": 4.065879116181823, "learning_rate": 6.527622348805765e-07, "logits/chosen": -0.6953125, "logits/rejected": -0.1591796875, "logps/chosen": -484.0, "logps/rejected": -544.0, "loss": 0.0228, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -17.875, "rewards/margins": 10.5, "rewards/rejected": -28.375, "step": 15020 }, { "epoch": 1.0849635458023532, "grad_norm": 2.714105153759878, "learning_rate": 6.525450456431951e-07, "logits/chosen": -0.70703125, "logits/rejected": -0.2021484375, "logps/chosen": -462.0, "logps/rejected": -528.0, "loss": 0.0417, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -17.125, "rewards/margins": 10.75, "rewards/rejected": -27.875, "step": 15030 }, { "epoch": 1.0856854111022882, "grad_norm": 7.828615281356051, "learning_rate": 6.523280730534421e-07, "logits/chosen": -0.77734375, "logits/rejected": -0.232421875, "logps/chosen": -462.0, "logps/rejected": -532.0, "loss": 0.044, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -17.875, "rewards/margins": 10.1875, "rewards/rejected": -28.125, "step": 15040 }, { "epoch": 1.0864072764022232, "grad_norm": 2.8241434822061935, "learning_rate": 6.521113167513779e-07, "logits/chosen": -0.71484375, "logits/rejected": -0.1865234375, "logps/chosen": -432.0, "logps/rejected": -508.0, "loss": 0.026, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -16.375, "rewards/margins": 10.25, "rewards/rejected": -26.625, "step": 15050 }, { "epoch": 1.0871291417021585, "grad_norm": 5.9518077756558005, "learning_rate": 6.518947763778994e-07, "logits/chosen": -0.8515625, "logits/rejected": -0.3359375, "logps/chosen": -458.0, "logps/rejected": -540.0, "loss": 0.0386, "rewards/accuracies": 1.0, "rewards/chosen": -17.25, "rewards/margins": 10.8125, "rewards/rejected": -28.125, "step": 15060 }, { "epoch": 1.0878510070020935, "grad_norm": 6.906571055611576, "learning_rate": 6.516784515747379e-07, "logits/chosen": -0.7890625, "logits/rejected": -0.28125, "logps/chosen": -472.0, "logps/rejected": -528.0, "loss": 0.0389, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -17.875, "rewards/margins": 10.5, "rewards/rejected": -28.375, "step": 15070 }, { "epoch": 1.0885728723020285, "grad_norm": 5.569612012033611, "learning_rate": 6.514623419844562e-07, "logits/chosen": -0.76171875, "logits/rejected": -0.1494140625, "logps/chosen": -450.0, "logps/rejected": -524.0, "loss": 0.0341, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -17.875, "rewards/margins": 10.125, "rewards/rejected": -28.0, "step": 15080 }, { "epoch": 1.0892947376019635, "grad_norm": 7.177293361863773, "learning_rate": 6.512464472504465e-07, "logits/chosen": -0.7890625, "logits/rejected": -0.1845703125, "logps/chosen": -452.0, "logps/rejected": -508.0, "loss": 0.0297, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -17.5, "rewards/margins": 9.9375, "rewards/rejected": -27.5, "step": 15090 }, { "epoch": 1.0900166029018985, "grad_norm": 3.4923625477478293, "learning_rate": 6.510307670169275e-07, "logits/chosen": -0.875, "logits/rejected": -0.21484375, "logps/chosen": -450.0, "logps/rejected": -524.0, "loss": 0.0185, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -17.125, "rewards/margins": 10.5, "rewards/rejected": -27.625, "step": 15100 }, { "epoch": 1.0907384682018335, "grad_norm": 5.780382758276598, "learning_rate": 6.508153009289422e-07, "logits/chosen": -0.71875, "logits/rejected": -0.17578125, "logps/chosen": -448.0, "logps/rejected": -510.0, "loss": 0.0431, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -17.625, "rewards/margins": 9.75, "rewards/rejected": -27.375, "step": 15110 }, { "epoch": 1.0914603335017685, "grad_norm": 1.1615688427890534, "learning_rate": 6.506000486323554e-07, "logits/chosen": -0.890625, "logits/rejected": -0.263671875, "logps/chosen": -462.0, "logps/rejected": -544.0, "loss": 0.0353, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -18.25, "rewards/margins": 10.8125, "rewards/rejected": -29.0, "step": 15120 }, { "epoch": 1.0921821988017035, "grad_norm": 5.763730403551177, "learning_rate": 6.503850097738512e-07, "logits/chosen": -0.734375, "logits/rejected": -0.169921875, "logps/chosen": -444.0, "logps/rejected": -544.0, "loss": 0.0292, "rewards/accuracies": 1.0, "rewards/chosen": -17.125, "rewards/margins": 10.9375, "rewards/rejected": -28.0, "step": 15130 }, { "epoch": 1.0929040641016385, "grad_norm": 4.708686122103629, "learning_rate": 6.50170184000931e-07, "logits/chosen": -0.6796875, "logits/rejected": -0.228515625, "logps/chosen": -452.0, "logps/rejected": -524.0, "loss": 0.0482, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -17.0, "rewards/margins": 9.9375, "rewards/rejected": -26.875, "step": 15140 }, { "epoch": 1.0936259294015738, "grad_norm": 6.010859975674747, "learning_rate": 6.4995557096191e-07, "logits/chosen": -0.8125, "logits/rejected": -0.09912109375, "logps/chosen": -444.0, "logps/rejected": -510.0, "loss": 0.0385, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -16.125, "rewards/margins": 10.0, "rewards/rejected": -26.125, "step": 15150 }, { "epoch": 1.0943477947015088, "grad_norm": 13.918877991634414, "learning_rate": 6.49741170305916e-07, "logits/chosen": -0.80859375, "logits/rejected": -0.248046875, "logps/chosen": -432.0, "logps/rejected": -516.0, "loss": 0.0483, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -16.25, "rewards/margins": 10.75, "rewards/rejected": -27.0, "step": 15160 }, { "epoch": 1.0950696600014438, "grad_norm": 5.0863863830784215, "learning_rate": 6.495269816828862e-07, "logits/chosen": -0.79296875, "logits/rejected": -0.263671875, "logps/chosen": -450.0, "logps/rejected": -512.0, "loss": 0.0442, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -15.875, "rewards/margins": 10.3125, "rewards/rejected": -26.125, "step": 15170 }, { "epoch": 1.0957915253013788, "grad_norm": 9.114810218365943, "learning_rate": 6.493130047435654e-07, "logits/chosen": -0.7578125, "logits/rejected": -0.1982421875, "logps/chosen": -450.0, "logps/rejected": -540.0, "loss": 0.0406, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -17.875, "rewards/margins": 10.5, "rewards/rejected": -28.375, "step": 15180 }, { "epoch": 1.0965133906013138, "grad_norm": 7.252367642519483, "learning_rate": 6.490992391395026e-07, "logits/chosen": -0.61328125, "logits/rejected": -0.07666015625, "logps/chosen": -462.0, "logps/rejected": -540.0, "loss": 0.0376, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -18.875, "rewards/margins": 9.875, "rewards/rejected": -28.75, "step": 15190 }, { "epoch": 1.0972352559012488, "grad_norm": 5.316512169098011, "learning_rate": 6.488856845230501e-07, "logits/chosen": -0.77734375, "logits/rejected": -0.0654296875, "logps/chosen": -478.0, "logps/rejected": -532.0, "loss": 0.0354, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -19.0, "rewards/margins": 11.75, "rewards/rejected": -30.75, "step": 15200 }, { "epoch": 1.0979571212011838, "grad_norm": 8.763086974741245, "learning_rate": 6.486723405473599e-07, "logits/chosen": -0.71484375, "logits/rejected": -0.11474609375, "logps/chosen": -454.0, "logps/rejected": -556.0, "loss": 0.0431, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -19.5, "rewards/margins": 11.3125, "rewards/rejected": -30.875, "step": 15210 }, { "epoch": 1.0986789865011188, "grad_norm": 12.826631312295746, "learning_rate": 6.484592068663816e-07, "logits/chosen": -0.69140625, "logits/rejected": -0.055908203125, "logps/chosen": -488.0, "logps/rejected": -536.0, "loss": 0.026, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -19.625, "rewards/margins": 11.0625, "rewards/rejected": -30.625, "step": 15220 }, { "epoch": 1.0994008518010538, "grad_norm": 6.961892847195886, "learning_rate": 6.482462831348603e-07, "logits/chosen": -0.92578125, "logits/rejected": -0.15234375, "logps/chosen": -468.0, "logps/rejected": -532.0, "loss": 0.0364, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -19.0, "rewards/margins": 11.125, "rewards/rejected": -30.125, "step": 15230 }, { "epoch": 1.1001227171009889, "grad_norm": 13.015351844413074, "learning_rate": 6.480335690083345e-07, "logits/chosen": -0.80859375, "logits/rejected": -0.1103515625, "logps/chosen": -424.0, "logps/rejected": -498.0, "loss": 0.0355, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -15.5, "rewards/margins": 11.125, "rewards/rejected": -26.625, "step": 15240 }, { "epoch": 1.100844582400924, "grad_norm": 8.247481528986407, "learning_rate": 6.478210641431327e-07, "logits/chosen": -0.77734375, "logits/rejected": -0.04833984375, "logps/chosen": -432.0, "logps/rejected": -512.0, "loss": 0.0381, "rewards/accuracies": 1.0, "rewards/chosen": -15.8125, "rewards/margins": 11.25, "rewards/rejected": -27.0, "step": 15250 }, { "epoch": 1.101566447700859, "grad_norm": 5.335087127252629, "learning_rate": 6.476087681963725e-07, "logits/chosen": -0.78515625, "logits/rejected": -0.267578125, "logps/chosen": -440.0, "logps/rejected": -532.0, "loss": 0.0372, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -15.75, "rewards/margins": 11.0625, "rewards/rejected": -26.875, "step": 15260 }, { "epoch": 1.102288313000794, "grad_norm": 2.048541689826953, "learning_rate": 6.473966808259572e-07, "logits/chosen": -0.84765625, "logits/rejected": -0.29296875, "logps/chosen": -442.0, "logps/rejected": -500.0, "loss": 0.0326, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -16.625, "rewards/margins": 10.0625, "rewards/rejected": -26.625, "step": 15270 }, { "epoch": 1.1030101783007291, "grad_norm": 8.458453207437769, "learning_rate": 6.471848016905735e-07, "logits/chosen": -0.78515625, "logits/rejected": -0.2255859375, "logps/chosen": -450.0, "logps/rejected": -516.0, "loss": 0.0374, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -18.0, "rewards/margins": 10.1875, "rewards/rejected": -28.125, "step": 15280 }, { "epoch": 1.1037320436006641, "grad_norm": 11.316669103473195, "learning_rate": 6.469731304496901e-07, "logits/chosen": -0.76953125, "logits/rejected": -0.1279296875, "logps/chosen": -476.0, "logps/rejected": -552.0, "loss": 0.0466, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -19.0, "rewards/margins": 11.625, "rewards/rejected": -30.625, "step": 15290 }, { "epoch": 1.1044539089005991, "grad_norm": 9.472620865665574, "learning_rate": 6.467616667635546e-07, "logits/chosen": -0.76171875, "logits/rejected": -0.2333984375, "logps/chosen": -474.0, "logps/rejected": -528.0, "loss": 0.039, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -17.0, "rewards/margins": 10.8125, "rewards/rejected": -27.75, "step": 15300 }, { "epoch": 1.1051757742005341, "grad_norm": 2.5020838856541694, "learning_rate": 6.465504102931912e-07, "logits/chosen": -0.8984375, "logits/rejected": -0.26953125, "logps/chosen": -454.0, "logps/rejected": -536.0, "loss": 0.0384, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -18.375, "rewards/margins": 10.375, "rewards/rejected": -28.875, "step": 15310 }, { "epoch": 1.1058976395004692, "grad_norm": 3.956615894501751, "learning_rate": 6.463393607003991e-07, "logits/chosen": -0.796875, "logits/rejected": -0.294921875, "logps/chosen": -450.0, "logps/rejected": -536.0, "loss": 0.0353, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -19.0, "rewards/margins": 10.875, "rewards/rejected": -29.875, "step": 15320 }, { "epoch": 1.1066195048004042, "grad_norm": 8.823906691999703, "learning_rate": 6.461285176477491e-07, "logits/chosen": -0.7421875, "logits/rejected": -0.283203125, "logps/chosen": -460.0, "logps/rejected": -584.0, "loss": 0.0544, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -19.5, "rewards/margins": 10.625, "rewards/rejected": -30.0, "step": 15330 }, { "epoch": 1.1073413701003392, "grad_norm": 5.589081632645653, "learning_rate": 6.45917880798583e-07, "logits/chosen": -0.8359375, "logits/rejected": -0.2119140625, "logps/chosen": -438.0, "logps/rejected": -536.0, "loss": 0.0531, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -18.375, "rewards/margins": 9.875, "rewards/rejected": -28.125, "step": 15340 }, { "epoch": 1.1080632354002744, "grad_norm": 11.339745413694821, "learning_rate": 6.457074498170094e-07, "logits/chosen": -0.75390625, "logits/rejected": -0.2490234375, "logps/chosen": -450.0, "logps/rejected": -528.0, "loss": 0.04, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -17.5, "rewards/margins": 10.8125, "rewards/rejected": -28.25, "step": 15350 }, { "epoch": 1.1087851007002094, "grad_norm": 6.336795845766386, "learning_rate": 6.454972243679028e-07, "logits/chosen": -0.72265625, "logits/rejected": -0.0303955078125, "logps/chosen": -456.0, "logps/rejected": -520.0, "loss": 0.0491, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -19.625, "rewards/margins": 9.8125, "rewards/rejected": -29.5, "step": 15360 }, { "epoch": 1.1095069660001444, "grad_norm": 4.038332500103891, "learning_rate": 6.452872041169011e-07, "logits/chosen": -0.7109375, "logits/rejected": -0.1748046875, "logps/chosen": -470.0, "logps/rejected": -544.0, "loss": 0.0336, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -19.0, "rewards/margins": 10.9375, "rewards/rejected": -30.0, "step": 15370 }, { "epoch": 1.1102288313000794, "grad_norm": 1.759795117340761, "learning_rate": 6.450773887304029e-07, "logits/chosen": -0.82421875, "logits/rejected": -0.32421875, "logps/chosen": -496.0, "logps/rejected": -552.0, "loss": 0.0363, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -18.375, "rewards/margins": 10.5625, "rewards/rejected": -29.0, "step": 15380 }, { "epoch": 1.1109506966000144, "grad_norm": 5.8194071037038855, "learning_rate": 6.448677778755658e-07, "logits/chosen": -0.80859375, "logits/rejected": -0.1474609375, "logps/chosen": -468.0, "logps/rejected": -528.0, "loss": 0.0264, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -19.25, "rewards/margins": 10.5625, "rewards/rejected": -29.75, "step": 15390 }, { "epoch": 1.1116725618999495, "grad_norm": 8.504920752380531, "learning_rate": 6.446583712203042e-07, "logits/chosen": -0.8046875, "logits/rejected": -0.203125, "logps/chosen": -496.0, "logps/rejected": -544.0, "loss": 0.0394, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -19.375, "rewards/margins": 10.9375, "rewards/rejected": -30.375, "step": 15400 }, { "epoch": 1.1123944271998845, "grad_norm": 1.1263363815387553, "learning_rate": 6.444491684332863e-07, "logits/chosen": -0.76171875, "logits/rejected": -0.138671875, "logps/chosen": -472.0, "logps/rejected": -528.0, "loss": 0.0464, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -19.0, "rewards/margins": 10.0625, "rewards/rejected": -29.0, "step": 15410 }, { "epoch": 1.1131162924998195, "grad_norm": 4.310951298610594, "learning_rate": 6.442401691839331e-07, "logits/chosen": -0.8671875, "logits/rejected": -0.208984375, "logps/chosen": -424.0, "logps/rejected": -502.0, "loss": 0.0329, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -16.25, "rewards/margins": 10.5625, "rewards/rejected": -26.875, "step": 15420 }, { "epoch": 1.1138381577997545, "grad_norm": 3.9606011038992053, "learning_rate": 6.440313731424148e-07, "logits/chosen": -0.97265625, "logits/rejected": -0.263671875, "logps/chosen": -478.0, "logps/rejected": -560.0, "loss": 0.0513, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -18.625, "rewards/margins": 11.0, "rewards/rejected": -29.625, "step": 15430 }, { "epoch": 1.1145600230996897, "grad_norm": 9.746356986105823, "learning_rate": 6.438227799796504e-07, "logits/chosen": -0.60546875, "logits/rejected": -0.140625, "logps/chosen": -490.0, "logps/rejected": -548.0, "loss": 0.0459, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -19.625, "rewards/margins": 10.75, "rewards/rejected": -30.375, "step": 15440 }, { "epoch": 1.1152818883996247, "grad_norm": 11.186733694980838, "learning_rate": 6.436143893673037e-07, "logits/chosen": -0.80859375, "logits/rejected": -0.318359375, "logps/chosen": -464.0, "logps/rejected": -524.0, "loss": 0.0361, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -18.875, "rewards/margins": 10.625, "rewards/rejected": -29.5, "step": 15450 }, { "epoch": 1.1160037536995597, "grad_norm": 3.1853258025908833, "learning_rate": 6.434062009777823e-07, "logits/chosen": -0.734375, "logits/rejected": -0.2451171875, "logps/chosen": -472.0, "logps/rejected": -552.0, "loss": 0.0242, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -19.75, "rewards/margins": 10.0625, "rewards/rejected": -29.875, "step": 15460 }, { "epoch": 1.1167256189994947, "grad_norm": 9.102503934404657, "learning_rate": 6.431982144842349e-07, "logits/chosen": -0.91796875, "logits/rejected": -0.30859375, "logps/chosen": -474.0, "logps/rejected": -536.0, "loss": 0.0413, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -18.25, "rewards/margins": 10.75, "rewards/rejected": -29.0, "step": 15470 }, { "epoch": 1.1174474842994297, "grad_norm": 7.8292906016539465, "learning_rate": 6.429904295605495e-07, "logits/chosen": -0.953125, "logits/rejected": -0.28515625, "logps/chosen": -470.0, "logps/rejected": -528.0, "loss": 0.0501, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -18.5, "rewards/margins": 11.0625, "rewards/rejected": -29.5, "step": 15480 }, { "epoch": 1.1181693495993648, "grad_norm": 13.881212150715628, "learning_rate": 6.42782845881351e-07, "logits/chosen": -0.7734375, "logits/rejected": -0.2080078125, "logps/chosen": -476.0, "logps/rejected": -560.0, "loss": 0.0409, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -19.5, "rewards/margins": 10.8125, "rewards/rejected": -30.25, "step": 15490 }, { "epoch": 1.1188912148992998, "grad_norm": 10.079788284965744, "learning_rate": 6.42575463121999e-07, "logits/chosen": -0.9375, "logits/rejected": -0.29296875, "logps/chosen": -460.0, "logps/rejected": -544.0, "loss": 0.0417, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -17.625, "rewards/margins": 10.375, "rewards/rejected": -28.0, "step": 15500 }, { "epoch": 1.1196130801992348, "grad_norm": 10.32091615214319, "learning_rate": 6.423682809585863e-07, "logits/chosen": -0.8203125, "logits/rejected": -0.205078125, "logps/chosen": -468.0, "logps/rejected": -544.0, "loss": 0.0339, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -19.25, "rewards/margins": 11.0625, "rewards/rejected": -30.375, "step": 15510 }, { "epoch": 1.1203349454991698, "grad_norm": 6.621880032361976, "learning_rate": 6.421612990679356e-07, "logits/chosen": -0.828125, "logits/rejected": -0.0849609375, "logps/chosen": -446.0, "logps/rejected": -528.0, "loss": 0.0505, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -17.75, "rewards/margins": 11.0, "rewards/rejected": -28.75, "step": 15520 }, { "epoch": 1.1210568107991048, "grad_norm": 5.076000888045765, "learning_rate": 6.419545171275983e-07, "logits/chosen": -0.96875, "logits/rejected": -0.2099609375, "logps/chosen": -436.0, "logps/rejected": -506.0, "loss": 0.0285, "rewards/accuracies": 0.96875, "rewards/chosen": -16.875, "rewards/margins": 10.9375, "rewards/rejected": -27.75, "step": 15530 }, { "epoch": 1.1217786760990398, "grad_norm": 4.342850861921313, "learning_rate": 6.417479348158526e-07, "logits/chosen": -0.97265625, "logits/rejected": -0.30078125, "logps/chosen": -466.0, "logps/rejected": -528.0, "loss": 0.0236, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -17.0, "rewards/margins": 10.125, "rewards/rejected": -27.0, "step": 15540 }, { "epoch": 1.122500541398975, "grad_norm": 19.075697687853644, "learning_rate": 6.415415518117003e-07, "logits/chosen": -0.9921875, "logits/rejected": -0.255859375, "logps/chosen": -464.0, "logps/rejected": -520.0, "loss": 0.0388, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -18.5, "rewards/margins": 9.875, "rewards/rejected": -28.375, "step": 15550 }, { "epoch": 1.12322240669891, "grad_norm": 1.484677473525545, "learning_rate": 6.413353677948659e-07, "logits/chosen": -0.8671875, "logits/rejected": -0.2421875, "logps/chosen": -480.0, "logps/rejected": -540.0, "loss": 0.0421, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -17.875, "rewards/margins": 10.4375, "rewards/rejected": -28.25, "step": 15560 }, { "epoch": 1.123944271998845, "grad_norm": 11.911434224428302, "learning_rate": 6.411293824457933e-07, "logits/chosen": -0.9140625, "logits/rejected": -0.3125, "logps/chosen": -434.0, "logps/rejected": -532.0, "loss": 0.0341, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -16.375, "rewards/margins": 11.125, "rewards/rejected": -27.5, "step": 15570 }, { "epoch": 1.12466613729878, "grad_norm": 12.398975835771914, "learning_rate": 6.409235954456451e-07, "logits/chosen": -0.79296875, "logits/rejected": -0.302734375, "logps/chosen": -456.0, "logps/rejected": -544.0, "loss": 0.03, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -18.25, "rewards/margins": 10.4375, "rewards/rejected": -28.625, "step": 15580 }, { "epoch": 1.125388002598715, "grad_norm": 3.5739553384782554, "learning_rate": 6.407180064762995e-07, "logits/chosen": -0.84375, "logits/rejected": -0.2470703125, "logps/chosen": -450.0, "logps/rejected": -524.0, "loss": 0.0471, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -17.125, "rewards/margins": 11.25, "rewards/rejected": -28.375, "step": 15590 }, { "epoch": 1.12610986789865, "grad_norm": 8.534804217513935, "learning_rate": 6.405126152203485e-07, "logits/chosen": -0.7421875, "logits/rejected": -0.271484375, "logps/chosen": -470.0, "logps/rejected": -532.0, "loss": 0.0379, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -17.75, "rewards/margins": 11.0, "rewards/rejected": -28.75, "step": 15600 }, { "epoch": 1.126831733198585, "grad_norm": 6.444321411681982, "learning_rate": 6.403074213610959e-07, "logits/chosen": -0.91796875, "logits/rejected": -0.427734375, "logps/chosen": -454.0, "logps/rejected": -532.0, "loss": 0.0316, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -17.625, "rewards/margins": 11.0, "rewards/rejected": -28.625, "step": 15610 }, { "epoch": 1.12755359849852, "grad_norm": 0.8245319875776387, "learning_rate": 6.401024245825555e-07, "logits/chosen": -0.8359375, "logits/rejected": -0.318359375, "logps/chosen": -464.0, "logps/rejected": -548.0, "loss": 0.0245, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -18.0, "rewards/margins": 10.9375, "rewards/rejected": -29.0, "step": 15620 }, { "epoch": 1.1282754637984551, "grad_norm": 3.1703957280931183, "learning_rate": 6.398976245694481e-07, "logits/chosen": -0.84375, "logits/rejected": -0.36328125, "logps/chosen": -458.0, "logps/rejected": -524.0, "loss": 0.0314, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -18.125, "rewards/margins": 10.625, "rewards/rejected": -28.75, "step": 15630 }, { "epoch": 1.1289973290983903, "grad_norm": 3.8192196541834837, "learning_rate": 6.396930210072012e-07, "logits/chosen": -1.0546875, "logits/rejected": -0.2041015625, "logps/chosen": -470.0, "logps/rejected": -540.0, "loss": 0.0583, "rewards/accuracies": 0.96875, "rewards/chosen": -18.5, "rewards/margins": 11.4375, "rewards/rejected": -30.0, "step": 15640 }, { "epoch": 1.1297191943983254, "grad_norm": 10.747928143099282, "learning_rate": 6.394886135819452e-07, "logits/chosen": -0.86328125, "logits/rejected": -0.306640625, "logps/chosen": -458.0, "logps/rejected": -556.0, "loss": 0.0392, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -18.375, "rewards/margins": 11.125, "rewards/rejected": -29.5, "step": 15650 }, { "epoch": 1.1304410596982604, "grad_norm": 10.908268806707865, "learning_rate": 6.392844019805121e-07, "logits/chosen": -0.73828125, "logits/rejected": -0.162109375, "logps/chosen": -466.0, "logps/rejected": -580.0, "loss": 0.0524, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -19.875, "rewards/margins": 10.9375, "rewards/rejected": -30.75, "step": 15660 }, { "epoch": 1.1311629249981954, "grad_norm": 10.492974961024434, "learning_rate": 6.39080385890434e-07, "logits/chosen": -0.99609375, "logits/rejected": -0.359375, "logps/chosen": -468.0, "logps/rejected": -544.0, "loss": 0.0441, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -18.625, "rewards/margins": 9.8125, "rewards/rejected": -28.375, "step": 15670 }, { "epoch": 1.1318847902981304, "grad_norm": 7.885986336684668, "learning_rate": 6.388765649999399e-07, "logits/chosen": -0.85546875, "logits/rejected": -0.318359375, "logps/chosen": -444.0, "logps/rejected": -488.0, "loss": 0.0558, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -16.5, "rewards/margins": 10.3125, "rewards/rejected": -26.75, "step": 15680 }, { "epoch": 1.1326066555980654, "grad_norm": 5.763711424318099, "learning_rate": 6.38672938997955e-07, "logits/chosen": -0.87109375, "logits/rejected": -0.23828125, "logps/chosen": -458.0, "logps/rejected": -544.0, "loss": 0.0287, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -18.0, "rewards/margins": 10.9375, "rewards/rejected": -29.0, "step": 15690 }, { "epoch": 1.1333285208980004, "grad_norm": 6.720993196049985, "learning_rate": 6.384695075740977e-07, "logits/chosen": -0.8984375, "logits/rejected": -0.427734375, "logps/chosen": -386.0, "logps/rejected": -478.0, "loss": 0.0377, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -15.5625, "rewards/margins": 10.0, "rewards/rejected": -25.625, "step": 15700 }, { "epoch": 1.1340503861979354, "grad_norm": 7.838857302948272, "learning_rate": 6.382662704186784e-07, "logits/chosen": -1.015625, "logits/rejected": -0.4453125, "logps/chosen": -458.0, "logps/rejected": -532.0, "loss": 0.045, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -17.375, "rewards/margins": 10.1875, "rewards/rejected": -27.5, "step": 15710 }, { "epoch": 1.1347722514978704, "grad_norm": 3.0414424091680368, "learning_rate": 6.380632272226964e-07, "logits/chosen": -0.7578125, "logits/rejected": -0.232421875, "logps/chosen": -460.0, "logps/rejected": -528.0, "loss": 0.0303, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -17.5, "rewards/margins": 10.5625, "rewards/rejected": -28.0, "step": 15720 }, { "epoch": 1.1354941167978057, "grad_norm": 5.2639853515909625, "learning_rate": 6.378603776778394e-07, "logits/chosen": -0.97265625, "logits/rejected": -0.3828125, "logps/chosen": -426.0, "logps/rejected": -494.0, "loss": 0.0459, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -14.75, "rewards/margins": 10.1875, "rewards/rejected": -25.0, "step": 15730 }, { "epoch": 1.1362159820977404, "grad_norm": 1.9037769629233636, "learning_rate": 6.376577214764806e-07, "logits/chosen": -1.03125, "logits/rejected": -0.58203125, "logps/chosen": -416.0, "logps/rejected": -492.0, "loss": 0.0414, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -14.0, "rewards/margins": 10.5625, "rewards/rejected": -24.625, "step": 15740 }, { "epoch": 1.1369378473976757, "grad_norm": 4.577656885524853, "learning_rate": 6.374552583116766e-07, "logits/chosen": -0.91796875, "logits/rejected": -0.404296875, "logps/chosen": -432.0, "logps/rejected": -520.0, "loss": 0.0427, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -14.875, "rewards/margins": 10.1875, "rewards/rejected": -25.125, "step": 15750 }, { "epoch": 1.1376597126976107, "grad_norm": 10.101474018451375, "learning_rate": 6.372529878771662e-07, "logits/chosen": -0.95703125, "logits/rejected": -0.48046875, "logps/chosen": -454.0, "logps/rejected": -528.0, "loss": 0.0401, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -15.875, "rewards/margins": 10.875, "rewards/rejected": -26.75, "step": 15760 }, { "epoch": 1.1383815779975457, "grad_norm": 2.855813364269649, "learning_rate": 6.370509098673676e-07, "logits/chosen": -0.7265625, "logits/rejected": -0.1982421875, "logps/chosen": -452.0, "logps/rejected": -540.0, "loss": 0.0342, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -18.0, "rewards/margins": 11.3125, "rewards/rejected": -29.25, "step": 15770 }, { "epoch": 1.1391034432974807, "grad_norm": 10.928986378339037, "learning_rate": 6.368490239773771e-07, "logits/chosen": -0.765625, "logits/rejected": -0.212890625, "logps/chosen": -492.0, "logps/rejected": -548.0, "loss": 0.0282, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -20.0, "rewards/margins": 10.25, "rewards/rejected": -30.125, "step": 15780 }, { "epoch": 1.1398253085974157, "grad_norm": 12.55537417015435, "learning_rate": 6.366473299029672e-07, "logits/chosen": -0.94140625, "logits/rejected": -0.3359375, "logps/chosen": -462.0, "logps/rejected": -536.0, "loss": 0.0386, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -18.875, "rewards/margins": 10.75, "rewards/rejected": -29.625, "step": 15790 }, { "epoch": 1.1405471738973507, "grad_norm": 8.093040452294339, "learning_rate": 6.36445827340584e-07, "logits/chosen": -0.83984375, "logits/rejected": -0.2021484375, "logps/chosen": -488.0, "logps/rejected": -572.0, "loss": 0.0412, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -19.375, "rewards/margins": 12.125, "rewards/rejected": -31.5, "step": 15800 }, { "epoch": 1.1412690391972857, "grad_norm": 8.672829359208558, "learning_rate": 6.362445159873459e-07, "logits/chosen": -0.84375, "logits/rejected": -0.16796875, "logps/chosen": -492.0, "logps/rejected": -576.0, "loss": 0.0328, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -20.875, "rewards/margins": 11.6875, "rewards/rejected": -32.5, "step": 15810 }, { "epoch": 1.1419909044972207, "grad_norm": 14.853057002047015, "learning_rate": 6.360433955410417e-07, "logits/chosen": -0.88671875, "logits/rejected": -0.2451171875, "logps/chosen": -474.0, "logps/rejected": -544.0, "loss": 0.0362, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -19.125, "rewards/margins": 11.3125, "rewards/rejected": -30.5, "step": 15820 }, { "epoch": 1.1427127697971557, "grad_norm": 4.784784665501337, "learning_rate": 6.35842465700128e-07, "logits/chosen": -0.953125, "logits/rejected": -0.28125, "logps/chosen": -500.0, "logps/rejected": -548.0, "loss": 0.0482, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -19.875, "rewards/margins": 11.375, "rewards/rejected": -31.25, "step": 15830 }, { "epoch": 1.143434635097091, "grad_norm": 3.7478282747169467, "learning_rate": 6.356417261637281e-07, "logits/chosen": -0.71484375, "logits/rejected": -0.1474609375, "logps/chosen": -464.0, "logps/rejected": -536.0, "loss": 0.0298, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -19.0, "rewards/margins": 11.1875, "rewards/rejected": -30.25, "step": 15840 }, { "epoch": 1.144156500397026, "grad_norm": 7.227823373045497, "learning_rate": 6.354411766316301e-07, "logits/chosen": -0.953125, "logits/rejected": -0.3515625, "logps/chosen": -440.0, "logps/rejected": -540.0, "loss": 0.0349, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -17.875, "rewards/margins": 10.6875, "rewards/rejected": -28.5, "step": 15850 }, { "epoch": 1.144878365696961, "grad_norm": 6.01012677730855, "learning_rate": 6.35240816804284e-07, "logits/chosen": -0.78515625, "logits/rejected": -0.212890625, "logps/chosen": -454.0, "logps/rejected": -540.0, "loss": 0.0379, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -18.375, "rewards/margins": 10.8125, "rewards/rejected": -29.125, "step": 15860 }, { "epoch": 1.145600230996896, "grad_norm": 10.102892037823274, "learning_rate": 6.350406463828013e-07, "logits/chosen": -0.8203125, "logits/rejected": -0.1279296875, "logps/chosen": -472.0, "logps/rejected": -548.0, "loss": 0.0355, "rewards/accuracies": 1.0, "rewards/chosen": -19.125, "rewards/margins": 10.25, "rewards/rejected": -29.375, "step": 15870 }, { "epoch": 1.146322096296831, "grad_norm": 3.153338482776112, "learning_rate": 6.348406650689516e-07, "logits/chosen": -0.921875, "logits/rejected": -0.275390625, "logps/chosen": -490.0, "logps/rejected": -556.0, "loss": 0.0365, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -19.875, "rewards/margins": 10.5625, "rewards/rejected": -30.5, "step": 15880 }, { "epoch": 1.147043961596766, "grad_norm": 11.096594756673857, "learning_rate": 6.346408725651623e-07, "logits/chosen": -0.9453125, "logits/rejected": -0.310546875, "logps/chosen": -474.0, "logps/rejected": -536.0, "loss": 0.0436, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -17.75, "rewards/margins": 10.25, "rewards/rejected": -28.0, "step": 15890 }, { "epoch": 1.147765826896701, "grad_norm": 10.594527746209232, "learning_rate": 6.344412685745153e-07, "logits/chosen": -0.9765625, "logits/rejected": -0.2431640625, "logps/chosen": -450.0, "logps/rejected": -528.0, "loss": 0.0339, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -18.25, "rewards/margins": 10.625, "rewards/rejected": -28.875, "step": 15900 }, { "epoch": 1.148487692196636, "grad_norm": 6.309650729783451, "learning_rate": 6.342418528007461e-07, "logits/chosen": -0.8046875, "logits/rejected": -0.25, "logps/chosen": -464.0, "logps/rejected": -516.0, "loss": 0.0318, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -18.125, "rewards/margins": 10.3125, "rewards/rejected": -28.375, "step": 15910 }, { "epoch": 1.149209557496571, "grad_norm": 2.921766569344226, "learning_rate": 6.340426249482414e-07, "logits/chosen": -0.93359375, "logits/rejected": -0.240234375, "logps/chosen": -482.0, "logps/rejected": -548.0, "loss": 0.0422, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -19.625, "rewards/margins": 11.25, "rewards/rejected": -30.875, "step": 15920 }, { "epoch": 1.1499314227965063, "grad_norm": 3.17062475015496, "learning_rate": 6.338435847220378e-07, "logits/chosen": -1.03125, "logits/rejected": -0.345703125, "logps/chosen": -476.0, "logps/rejected": -536.0, "loss": 0.0378, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -18.625, "rewards/margins": 10.375, "rewards/rejected": -29.0, "step": 15930 }, { "epoch": 1.1506532880964413, "grad_norm": 8.666991004214431, "learning_rate": 6.336447318278194e-07, "logits/chosen": -0.87890625, "logits/rejected": -0.337890625, "logps/chosen": -466.0, "logps/rejected": -536.0, "loss": 0.0448, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -19.125, "rewards/margins": 10.5, "rewards/rejected": -29.625, "step": 15940 }, { "epoch": 1.1513751533963763, "grad_norm": 4.410362108935011, "learning_rate": 6.334460659719166e-07, "logits/chosen": -0.82421875, "logits/rejected": -0.337890625, "logps/chosen": -462.0, "logps/rejected": -540.0, "loss": 0.0503, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -18.875, "rewards/margins": 10.1875, "rewards/rejected": -29.0, "step": 15950 }, { "epoch": 1.1520970186963113, "grad_norm": 10.290105701477415, "learning_rate": 6.332475868613035e-07, "logits/chosen": -0.796875, "logits/rejected": -0.369140625, "logps/chosen": -448.0, "logps/rejected": -502.0, "loss": 0.0512, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -17.0, "rewards/margins": 10.0, "rewards/rejected": -26.875, "step": 15960 }, { "epoch": 1.1528188839962463, "grad_norm": 4.621139915626799, "learning_rate": 6.330492942035967e-07, "logits/chosen": -0.90625, "logits/rejected": -0.1923828125, "logps/chosen": -474.0, "logps/rejected": -520.0, "loss": 0.0437, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -18.75, "rewards/margins": 10.4375, "rewards/rejected": -29.25, "step": 15970 }, { "epoch": 1.1535407492961813, "grad_norm": 7.51846203097976, "learning_rate": 6.32851187707053e-07, "logits/chosen": -0.84765625, "logits/rejected": -0.26171875, "logps/chosen": -448.0, "logps/rejected": -508.0, "loss": 0.0499, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -15.9375, "rewards/margins": 10.75, "rewards/rejected": -26.75, "step": 15980 }, { "epoch": 1.1542626145961163, "grad_norm": 2.5687084792343837, "learning_rate": 6.326532670805687e-07, "logits/chosen": -0.90625, "logits/rejected": -0.251953125, "logps/chosen": -456.0, "logps/rejected": -528.0, "loss": 0.0392, "rewards/accuracies": 1.0, "rewards/chosen": -17.0, "rewards/margins": 10.625, "rewards/rejected": -27.625, "step": 15990 }, { "epoch": 1.1549844798960514, "grad_norm": 1.2319847219709414, "learning_rate": 6.324555320336758e-07, "logits/chosen": -0.921875, "logits/rejected": -0.37109375, "logps/chosen": -466.0, "logps/rejected": -528.0, "loss": 0.0348, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -17.0, "rewards/margins": 10.5, "rewards/rejected": -27.5, "step": 16000 }, { "epoch": 1.1557063451959864, "grad_norm": 4.975003737526406, "learning_rate": 6.322579822765425e-07, "logits/chosen": -0.68359375, "logits/rejected": -0.2041015625, "logps/chosen": -436.0, "logps/rejected": -536.0, "loss": 0.0316, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -17.75, "rewards/margins": 10.25, "rewards/rejected": -28.0, "step": 16010 }, { "epoch": 1.1564282104959214, "grad_norm": 0.878335918338948, "learning_rate": 6.320606175199696e-07, "logits/chosen": -0.8046875, "logits/rejected": -0.263671875, "logps/chosen": -462.0, "logps/rejected": -544.0, "loss": 0.029, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -18.875, "rewards/margins": 10.5, "rewards/rejected": -29.5, "step": 16020 }, { "epoch": 1.1571500757958564, "grad_norm": 4.535014522775993, "learning_rate": 6.318634374753898e-07, "logits/chosen": -0.765625, "logits/rejected": -0.050048828125, "logps/chosen": -474.0, "logps/rejected": -540.0, "loss": 0.0517, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -19.625, "rewards/margins": 11.125, "rewards/rejected": -30.75, "step": 16030 }, { "epoch": 1.1578719410957916, "grad_norm": 5.7848777763590125, "learning_rate": 6.316664418548654e-07, "logits/chosen": -0.83984375, "logits/rejected": -0.26953125, "logps/chosen": -466.0, "logps/rejected": -536.0, "loss": 0.0298, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -19.0, "rewards/margins": 10.375, "rewards/rejected": -29.25, "step": 16040 }, { "epoch": 1.1585938063957266, "grad_norm": 4.121255448804228, "learning_rate": 6.314696303710867e-07, "logits/chosen": -0.890625, "logits/rejected": -0.30859375, "logps/chosen": -506.0, "logps/rejected": -588.0, "loss": 0.0424, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -20.0, "rewards/margins": 11.75, "rewards/rejected": -31.75, "step": 16050 }, { "epoch": 1.1593156716956616, "grad_norm": 9.86363007237048, "learning_rate": 6.312730027373703e-07, "logits/chosen": -0.81640625, "logits/rejected": -0.11328125, "logps/chosen": -472.0, "logps/rejected": -552.0, "loss": 0.0432, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -18.75, "rewards/margins": 11.4375, "rewards/rejected": -30.25, "step": 16060 }, { "epoch": 1.1600375369955966, "grad_norm": 8.276515833826462, "learning_rate": 6.310765586676574e-07, "logits/chosen": -0.91015625, "logits/rejected": -0.2138671875, "logps/chosen": -468.0, "logps/rejected": -536.0, "loss": 0.0494, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -18.625, "rewards/margins": 11.3125, "rewards/rejected": -30.0, "step": 16070 }, { "epoch": 1.1607594022955317, "grad_norm": 2.4241516069560887, "learning_rate": 6.308802978765117e-07, "logits/chosen": -0.92578125, "logits/rejected": -0.3671875, "logps/chosen": -438.0, "logps/rejected": -498.0, "loss": 0.0354, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -15.875, "rewards/margins": 11.0625, "rewards/rejected": -26.875, "step": 16080 }, { "epoch": 1.1614812675954667, "grad_norm": 1.54873781872518, "learning_rate": 6.306842200791181e-07, "logits/chosen": -0.8671875, "logits/rejected": -0.330078125, "logps/chosen": -474.0, "logps/rejected": -576.0, "loss": 0.03, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -16.75, "rewards/margins": 12.75, "rewards/rejected": -29.5, "step": 16090 }, { "epoch": 1.1622031328954017, "grad_norm": 5.585241119434081, "learning_rate": 6.304883249912805e-07, "logits/chosen": -0.80859375, "logits/rejected": -0.326171875, "logps/chosen": -452.0, "logps/rejected": -528.0, "loss": 0.0282, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -16.75, "rewards/margins": 10.375, "rewards/rejected": -27.125, "step": 16100 }, { "epoch": 1.1629249981953367, "grad_norm": 4.003131355008722, "learning_rate": 6.302926123294205e-07, "logits/chosen": -0.87890625, "logits/rejected": -0.314453125, "logps/chosen": -452.0, "logps/rejected": -512.0, "loss": 0.0333, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -17.5, "rewards/margins": 10.6875, "rewards/rejected": -28.25, "step": 16110 }, { "epoch": 1.1636468634952717, "grad_norm": 4.989548331646427, "learning_rate": 6.300970818105758e-07, "logits/chosen": -0.8125, "logits/rejected": -0.275390625, "logps/chosen": -462.0, "logps/rejected": -528.0, "loss": 0.036, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -18.75, "rewards/margins": 10.5, "rewards/rejected": -29.25, "step": 16120 }, { "epoch": 1.164368728795207, "grad_norm": 8.835490712771032, "learning_rate": 6.299017331523976e-07, "logits/chosen": -0.77734375, "logits/rejected": -0.21875, "logps/chosen": -452.0, "logps/rejected": -532.0, "loss": 0.0348, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -18.75, "rewards/margins": 10.875, "rewards/rejected": -29.625, "step": 16130 }, { "epoch": 1.165090594095142, "grad_norm": 3.3324130696018117, "learning_rate": 6.297065660731497e-07, "logits/chosen": -0.8125, "logits/rejected": -0.279296875, "logps/chosen": -468.0, "logps/rejected": -544.0, "loss": 0.0488, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -18.625, "rewards/margins": 11.0, "rewards/rejected": -29.625, "step": 16140 }, { "epoch": 1.165812459395077, "grad_norm": 4.221270872859508, "learning_rate": 6.29511580291707e-07, "logits/chosen": -0.75390625, "logits/rejected": -0.2021484375, "logps/chosen": -458.0, "logps/rejected": -548.0, "loss": 0.0392, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -19.25, "rewards/margins": 11.25, "rewards/rejected": -30.5, "step": 16150 }, { "epoch": 1.166534324695012, "grad_norm": 6.446701210554025, "learning_rate": 6.293167755275526e-07, "logits/chosen": -0.82421875, "logits/rejected": -0.205078125, "logps/chosen": -470.0, "logps/rejected": -552.0, "loss": 0.0287, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -20.25, "rewards/margins": 10.8125, "rewards/rejected": -31.125, "step": 16160 }, { "epoch": 1.167256189994947, "grad_norm": 0.8945606805124204, "learning_rate": 6.291221515007776e-07, "logits/chosen": -0.85546875, "logits/rejected": -0.2109375, "logps/chosen": -446.0, "logps/rejected": -524.0, "loss": 0.0337, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -18.625, "rewards/margins": 11.5625, "rewards/rejected": -30.25, "step": 16170 }, { "epoch": 1.167978055294882, "grad_norm": 8.668747243173794, "learning_rate": 6.28927707932078e-07, "logits/chosen": -0.671875, "logits/rejected": -0.3125, "logps/chosen": -494.0, "logps/rejected": -572.0, "loss": 0.0485, "rewards/accuracies": 0.96875, "rewards/chosen": -21.375, "rewards/margins": 10.625, "rewards/rejected": -32.0, "step": 16180 }, { "epoch": 1.168699920594817, "grad_norm": 1.9163852138799065, "learning_rate": 6.287334445427542e-07, "logits/chosen": -0.77734375, "logits/rejected": -0.15234375, "logps/chosen": -506.0, "logps/rejected": -572.0, "loss": 0.0277, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -22.5, "rewards/margins": 11.125, "rewards/rejected": -33.5, "step": 16190 }, { "epoch": 1.169421785894752, "grad_norm": 5.908977968325368, "learning_rate": 6.285393610547089e-07, "logits/chosen": -0.80859375, "logits/rejected": -0.177734375, "logps/chosen": -506.0, "logps/rejected": -576.0, "loss": 0.0572, "rewards/accuracies": 0.96875, "rewards/chosen": -21.625, "rewards/margins": 10.25, "rewards/rejected": -31.875, "step": 16200 }, { "epoch": 1.170143651194687, "grad_norm": 4.9749369367796845, "learning_rate": 6.28345457190445e-07, "logits/chosen": -0.81640625, "logits/rejected": -0.263671875, "logps/chosen": -462.0, "logps/rejected": -556.0, "loss": 0.0346, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -19.125, "rewards/margins": 11.5, "rewards/rejected": -30.625, "step": 16210 }, { "epoch": 1.1708655164946222, "grad_norm": 1.504191755717927, "learning_rate": 6.281517326730642e-07, "logits/chosen": -0.7734375, "logits/rejected": -0.373046875, "logps/chosen": -456.0, "logps/rejected": -544.0, "loss": 0.0256, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -18.875, "rewards/margins": 10.75, "rewards/rejected": -29.625, "step": 16220 }, { "epoch": 1.171587381794557, "grad_norm": 10.229542542062052, "learning_rate": 6.279581872262657e-07, "logits/chosen": -0.8828125, "logits/rejected": -0.234375, "logps/chosen": -496.0, "logps/rejected": -568.0, "loss": 0.0373, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -20.0, "rewards/margins": 11.8125, "rewards/rejected": -31.75, "step": 16230 }, { "epoch": 1.1723092470944922, "grad_norm": 8.764285450945527, "learning_rate": 6.277648205743445e-07, "logits/chosen": -0.87890625, "logits/rejected": -0.330078125, "logps/chosen": -476.0, "logps/rejected": -540.0, "loss": 0.0585, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -18.75, "rewards/margins": 10.5625, "rewards/rejected": -29.25, "step": 16240 }, { "epoch": 1.1730311123944273, "grad_norm": 5.938365746286655, "learning_rate": 6.275716324421889e-07, "logits/chosen": -0.8828125, "logits/rejected": -0.310546875, "logps/chosen": -484.0, "logps/rejected": -568.0, "loss": 0.0301, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -19.25, "rewards/margins": 11.5, "rewards/rejected": -30.75, "step": 16250 }, { "epoch": 1.1737529776943623, "grad_norm": 13.20751956383299, "learning_rate": 6.273786225552799e-07, "logits/chosen": -0.86328125, "logits/rejected": -0.2431640625, "logps/chosen": -466.0, "logps/rejected": -524.0, "loss": 0.0478, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -19.625, "rewards/margins": 10.8125, "rewards/rejected": -30.375, "step": 16260 }, { "epoch": 1.1744748429942973, "grad_norm": 8.557213670765101, "learning_rate": 6.271857906396891e-07, "logits/chosen": -0.921875, "logits/rejected": -0.275390625, "logps/chosen": -472.0, "logps/rejected": -556.0, "loss": 0.0232, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -20.125, "rewards/margins": 11.5, "rewards/rejected": -31.625, "step": 16270 }, { "epoch": 1.1751967082942323, "grad_norm": 11.451204176166376, "learning_rate": 6.269931364220768e-07, "logits/chosen": -0.83984375, "logits/rejected": -0.3203125, "logps/chosen": -462.0, "logps/rejected": -524.0, "loss": 0.0419, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -18.375, "rewards/margins": 11.4375, "rewards/rejected": -29.75, "step": 16280 }, { "epoch": 1.1759185735941673, "grad_norm": 4.898296705287314, "learning_rate": 6.268006596296912e-07, "logits/chosen": -1.0234375, "logits/rejected": -0.314453125, "logps/chosen": -454.0, "logps/rejected": -520.0, "loss": 0.0299, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -17.5, "rewards/margins": 11.1875, "rewards/rejected": -28.625, "step": 16290 }, { "epoch": 1.1766404388941023, "grad_norm": 7.691689329513782, "learning_rate": 6.266083599903659e-07, "logits/chosen": -0.69140625, "logits/rejected": -0.18359375, "logps/chosen": -450.0, "logps/rejected": -536.0, "loss": 0.045, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -19.125, "rewards/margins": 10.1875, "rewards/rejected": -29.25, "step": 16300 }, { "epoch": 1.1773623041940373, "grad_norm": 5.065457365447701, "learning_rate": 6.264162372325185e-07, "logits/chosen": -0.83203125, "logits/rejected": -0.296875, "logps/chosen": -474.0, "logps/rejected": -560.0, "loss": 0.0158, "rewards/accuracies": 1.0, "rewards/chosen": -19.0, "rewards/margins": 11.125, "rewards/rejected": -30.125, "step": 16310 }, { "epoch": 1.1780841694939723, "grad_norm": 6.53112758322987, "learning_rate": 6.262242910851495e-07, "logits/chosen": -0.8828125, "logits/rejected": -0.275390625, "logps/chosen": -456.0, "logps/rejected": -536.0, "loss": 0.0408, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -19.375, "rewards/margins": 10.4375, "rewards/rejected": -29.875, "step": 16320 }, { "epoch": 1.1788060347939076, "grad_norm": 3.2427785284727926, "learning_rate": 6.2603252127784e-07, "logits/chosen": -0.6953125, "logits/rejected": -0.23046875, "logps/chosen": -458.0, "logps/rejected": -560.0, "loss": 0.0318, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -20.75, "rewards/margins": 10.5625, "rewards/rejected": -31.375, "step": 16330 }, { "epoch": 1.1795279000938426, "grad_norm": 7.379081271641469, "learning_rate": 6.258409275407508e-07, "logits/chosen": -1.0390625, "logits/rejected": -0.259765625, "logps/chosen": -484.0, "logps/rejected": -556.0, "loss": 0.0492, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -21.625, "rewards/margins": 11.125, "rewards/rejected": -32.75, "step": 16340 }, { "epoch": 1.1802497653937776, "grad_norm": 8.940841656237664, "learning_rate": 6.256495096046201e-07, "logits/chosen": -0.8828125, "logits/rejected": -0.3984375, "logps/chosen": -474.0, "logps/rejected": -516.0, "loss": 0.051, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -18.375, "rewards/margins": 10.5, "rewards/rejected": -28.875, "step": 16350 }, { "epoch": 1.1809716306937126, "grad_norm": 5.159681775345592, "learning_rate": 6.254582672007622e-07, "logits/chosen": -0.890625, "logits/rejected": -0.484375, "logps/chosen": -464.0, "logps/rejected": -544.0, "loss": 0.0613, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -19.125, "rewards/margins": 9.875, "rewards/rejected": -29.0, "step": 16360 }, { "epoch": 1.1816934959936476, "grad_norm": 2.9852588939613787, "learning_rate": 6.25267200061066e-07, "logits/chosen": -0.9140625, "logits/rejected": -0.345703125, "logps/chosen": -454.0, "logps/rejected": -536.0, "loss": 0.0445, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -18.5, "rewards/margins": 11.125, "rewards/rejected": -29.5, "step": 16370 }, { "epoch": 1.1824153612935826, "grad_norm": 7.793416290635037, "learning_rate": 6.250763079179939e-07, "logits/chosen": -0.875, "logits/rejected": -0.310546875, "logps/chosen": -448.0, "logps/rejected": -544.0, "loss": 0.0452, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -17.0, "rewards/margins": 10.75, "rewards/rejected": -27.75, "step": 16380 }, { "epoch": 1.1831372265935176, "grad_norm": 9.253788816355357, "learning_rate": 6.248855905045789e-07, "logits/chosen": -0.88671875, "logits/rejected": -0.287109375, "logps/chosen": -474.0, "logps/rejected": -532.0, "loss": 0.0409, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -17.75, "rewards/margins": 10.1875, "rewards/rejected": -27.875, "step": 16390 }, { "epoch": 1.1838590918934526, "grad_norm": 11.750122297344586, "learning_rate": 6.246950475544243e-07, "logits/chosen": -0.91796875, "logits/rejected": -0.3671875, "logps/chosen": -460.0, "logps/rejected": -560.0, "loss": 0.0365, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -18.0, "rewards/margins": 11.25, "rewards/rejected": -29.25, "step": 16400 }, { "epoch": 1.1845809571933876, "grad_norm": 8.47568622326397, "learning_rate": 6.245046788017015e-07, "logits/chosen": -0.828125, "logits/rejected": -0.330078125, "logps/chosen": -484.0, "logps/rejected": -544.0, "loss": 0.036, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -18.0, "rewards/margins": 10.5625, "rewards/rejected": -28.5, "step": 16410 }, { "epoch": 1.1853028224933229, "grad_norm": 2.9746897574394797, "learning_rate": 6.243144839811487e-07, "logits/chosen": -0.9609375, "logits/rejected": -0.31640625, "logps/chosen": -460.0, "logps/rejected": -532.0, "loss": 0.044, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -17.25, "rewards/margins": 10.8125, "rewards/rejected": -28.125, "step": 16420 }, { "epoch": 1.1860246877932579, "grad_norm": 5.718279852446565, "learning_rate": 6.241244628280692e-07, "logits/chosen": -0.87890625, "logits/rejected": -0.21484375, "logps/chosen": -464.0, "logps/rejected": -540.0, "loss": 0.0277, "rewards/accuracies": 1.0, "rewards/chosen": -17.375, "rewards/margins": 10.75, "rewards/rejected": -28.125, "step": 16430 }, { "epoch": 1.1867465530931929, "grad_norm": 6.935398141423478, "learning_rate": 6.239346150783301e-07, "logits/chosen": -0.69921875, "logits/rejected": -0.216796875, "logps/chosen": -464.0, "logps/rejected": -540.0, "loss": 0.0509, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -19.25, "rewards/margins": 10.75, "rewards/rejected": -30.0, "step": 16440 }, { "epoch": 1.187468418393128, "grad_norm": 7.366468870169417, "learning_rate": 6.237449404683603e-07, "logits/chosen": -0.8125, "logits/rejected": -0.35546875, "logps/chosen": -488.0, "logps/rejected": -572.0, "loss": 0.0253, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -21.25, "rewards/margins": 11.25, "rewards/rejected": -32.5, "step": 16450 }, { "epoch": 1.188190283693063, "grad_norm": 8.839317464138714, "learning_rate": 6.235554387351494e-07, "logits/chosen": -0.90234375, "logits/rejected": -0.37109375, "logps/chosen": -478.0, "logps/rejected": -552.0, "loss": 0.0348, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -19.375, "rewards/margins": 11.375, "rewards/rejected": -30.75, "step": 16460 }, { "epoch": 1.188912148992998, "grad_norm": 8.335447497629639, "learning_rate": 6.233661096162459e-07, "logits/chosen": -0.77734375, "logits/rejected": -0.21484375, "logps/chosen": -450.0, "logps/rejected": -520.0, "loss": 0.0368, "rewards/accuracies": 1.0, "rewards/chosen": -17.875, "rewards/margins": 10.8125, "rewards/rejected": -28.75, "step": 16470 }, { "epoch": 1.189634014292933, "grad_norm": 6.659481292794523, "learning_rate": 6.231769528497558e-07, "logits/chosen": -0.953125, "logits/rejected": -0.34375, "logps/chosen": -452.0, "logps/rejected": -536.0, "loss": 0.0405, "rewards/accuracies": 0.96875, "rewards/chosen": -18.375, "rewards/margins": 10.8125, "rewards/rejected": -29.125, "step": 16480 }, { "epoch": 1.190355879592868, "grad_norm": 13.389237095809086, "learning_rate": 6.229879681743411e-07, "logits/chosen": -1.0078125, "logits/rejected": -0.283203125, "logps/chosen": -456.0, "logps/rejected": -548.0, "loss": 0.0448, "rewards/accuracies": 1.0, "rewards/chosen": -17.75, "rewards/margins": 11.375, "rewards/rejected": -29.125, "step": 16490 }, { "epoch": 1.191077744892803, "grad_norm": 6.407193004751358, "learning_rate": 6.227991553292183e-07, "logits/chosen": -0.86328125, "logits/rejected": -0.2080078125, "logps/chosen": -484.0, "logps/rejected": -544.0, "loss": 0.0215, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -18.625, "rewards/margins": 11.1875, "rewards/rejected": -29.75, "step": 16500 }, { "epoch": 1.191799610192738, "grad_norm": 6.227540312862041, "learning_rate": 6.226105140541567e-07, "logits/chosen": -0.86328125, "logits/rejected": -0.35546875, "logps/chosen": -466.0, "logps/rejected": -552.0, "loss": 0.046, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -19.375, "rewards/margins": 11.3125, "rewards/rejected": -30.75, "step": 16510 }, { "epoch": 1.192521475492673, "grad_norm": 2.4349318806916083, "learning_rate": 6.224220440894771e-07, "logits/chosen": -0.85546875, "logits/rejected": -0.234375, "logps/chosen": -468.0, "logps/rejected": -536.0, "loss": 0.034, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -20.125, "rewards/margins": 10.3125, "rewards/rejected": -30.5, "step": 16520 }, { "epoch": 1.1932433407926082, "grad_norm": 3.614346679922726, "learning_rate": 6.222337451760501e-07, "logits/chosen": -0.7421875, "logits/rejected": -0.326171875, "logps/chosen": -494.0, "logps/rejected": -552.0, "loss": 0.0222, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -21.0, "rewards/margins": 10.375, "rewards/rejected": -31.25, "step": 16530 }, { "epoch": 1.1939652060925432, "grad_norm": 2.2417625695507435, "learning_rate": 6.220456170552948e-07, "logits/chosen": -0.93359375, "logits/rejected": -0.400390625, "logps/chosen": -470.0, "logps/rejected": -536.0, "loss": 0.0325, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -20.0, "rewards/margins": 11.3125, "rewards/rejected": -31.25, "step": 16540 }, { "epoch": 1.1946870713924782, "grad_norm": 1.2916325053578706, "learning_rate": 6.218576594691773e-07, "logits/chosen": -1.046875, "logits/rejected": -0.34765625, "logps/chosen": -458.0, "logps/rejected": -532.0, "loss": 0.0369, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -17.875, "rewards/margins": 11.5625, "rewards/rejected": -29.375, "step": 16550 }, { "epoch": 1.1954089366924132, "grad_norm": 2.9985850816460733, "learning_rate": 6.216698721602092e-07, "logits/chosen": -0.90234375, "logits/rejected": -0.400390625, "logps/chosen": -454.0, "logps/rejected": -528.0, "loss": 0.0439, "rewards/accuracies": 1.0, "rewards/chosen": -18.25, "rewards/margins": 10.4375, "rewards/rejected": -28.75, "step": 16560 }, { "epoch": 1.1961308019923482, "grad_norm": 11.14458295896513, "learning_rate": 6.214822548714457e-07, "logits/chosen": -0.99609375, "logits/rejected": -0.4375, "logps/chosen": -450.0, "logps/rejected": -548.0, "loss": 0.0406, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -18.5, "rewards/margins": 11.3125, "rewards/rejected": -29.75, "step": 16570 }, { "epoch": 1.1968526672922832, "grad_norm": 4.934265062726373, "learning_rate": 6.212948073464848e-07, "logits/chosen": -1.0859375, "logits/rejected": -0.5, "logps/chosen": -464.0, "logps/rejected": -540.0, "loss": 0.0358, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -18.125, "rewards/margins": 11.125, "rewards/rejected": -29.375, "step": 16580 }, { "epoch": 1.1975745325922182, "grad_norm": 7.592891881466667, "learning_rate": 6.211075293294654e-07, "logits/chosen": -1.0859375, "logits/rejected": -0.490234375, "logps/chosen": -466.0, "logps/rejected": -544.0, "loss": 0.031, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -18.5, "rewards/margins": 10.75, "rewards/rejected": -29.25, "step": 16590 }, { "epoch": 1.1982963978921533, "grad_norm": 8.723068444631231, "learning_rate": 6.209204205650662e-07, "logits/chosen": -1.0859375, "logits/rejected": -0.37109375, "logps/chosen": -466.0, "logps/rejected": -552.0, "loss": 0.0272, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -18.25, "rewards/margins": 11.8125, "rewards/rejected": -30.125, "step": 16600 }, { "epoch": 1.1990182631920883, "grad_norm": 7.333986471523812, "learning_rate": 6.207334807985037e-07, "logits/chosen": -0.91015625, "logits/rejected": -0.380859375, "logps/chosen": -466.0, "logps/rejected": -544.0, "loss": 0.0408, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -18.625, "rewards/margins": 11.9375, "rewards/rejected": -30.5, "step": 16610 }, { "epoch": 1.1997401284920235, "grad_norm": 2.477829818633876, "learning_rate": 6.205467097755309e-07, "logits/chosen": -1.140625, "logits/rejected": -0.34375, "logps/chosen": -466.0, "logps/rejected": -560.0, "loss": 0.0375, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -19.125, "rewards/margins": 11.6875, "rewards/rejected": -30.75, "step": 16620 }, { "epoch": 1.2004619937919585, "grad_norm": 9.524958808537184, "learning_rate": 6.203601072424364e-07, "logits/chosen": -1.0390625, "logits/rejected": -0.34765625, "logps/chosen": -486.0, "logps/rejected": -560.0, "loss": 0.0432, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -18.875, "rewards/margins": 11.6875, "rewards/rejected": -30.625, "step": 16630 }, { "epoch": 1.2011838590918935, "grad_norm": 3.437651981161375, "learning_rate": 6.201736729460422e-07, "logits/chosen": -1.1015625, "logits/rejected": -0.51171875, "logps/chosen": -476.0, "logps/rejected": -544.0, "loss": 0.0375, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -18.375, "rewards/margins": 10.5625, "rewards/rejected": -29.0, "step": 16640 }, { "epoch": 1.2019057243918285, "grad_norm": 5.768189067507049, "learning_rate": 6.199874066337029e-07, "logits/chosen": -0.99609375, "logits/rejected": -0.345703125, "logps/chosen": -488.0, "logps/rejected": -552.0, "loss": 0.053, "rewards/accuracies": 1.0, "rewards/chosen": -19.125, "rewards/margins": 11.1875, "rewards/rejected": -30.375, "step": 16650 }, { "epoch": 1.2026275896917635, "grad_norm": 6.485341860946756, "learning_rate": 6.198013080533033e-07, "logits/chosen": -1.03125, "logits/rejected": -0.31640625, "logps/chosen": -468.0, "logps/rejected": -528.0, "loss": 0.05, "rewards/accuracies": 1.0, "rewards/chosen": -19.125, "rewards/margins": 11.375, "rewards/rejected": -30.5, "step": 16660 }, { "epoch": 1.2033494549916985, "grad_norm": 9.938901516890079, "learning_rate": 6.196153769532584e-07, "logits/chosen": -0.89453125, "logits/rejected": -0.41015625, "logps/chosen": -452.0, "logps/rejected": -520.0, "loss": 0.0405, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -18.125, "rewards/margins": 10.75, "rewards/rejected": -29.0, "step": 16670 }, { "epoch": 1.2040713202916336, "grad_norm": 4.413806081995455, "learning_rate": 6.194296130825109e-07, "logits/chosen": -1.0, "logits/rejected": -0.462890625, "logps/chosen": -494.0, "logps/rejected": -560.0, "loss": 0.0392, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -19.375, "rewards/margins": 11.0625, "rewards/rejected": -30.5, "step": 16680 }, { "epoch": 1.2047931855915686, "grad_norm": 4.911437825190286, "learning_rate": 6.192440161905297e-07, "logits/chosen": -1.0, "logits/rejected": -0.408203125, "logps/chosen": -460.0, "logps/rejected": -536.0, "loss": 0.0484, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -18.125, "rewards/margins": 10.9375, "rewards/rejected": -29.0, "step": 16690 }, { "epoch": 1.2055150508915036, "grad_norm": 4.756065651081247, "learning_rate": 6.190585860273094e-07, "logits/chosen": -1.015625, "logits/rejected": -0.33203125, "logps/chosen": -454.0, "logps/rejected": -540.0, "loss": 0.0357, "rewards/accuracies": 1.0, "rewards/chosen": -17.75, "rewards/margins": 11.5625, "rewards/rejected": -29.375, "step": 16700 }, { "epoch": 1.2062369161914388, "grad_norm": 7.2450072874076, "learning_rate": 6.18873322343368e-07, "logits/chosen": -0.9140625, "logits/rejected": -0.37109375, "logps/chosen": -436.0, "logps/rejected": -524.0, "loss": 0.0264, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -17.125, "rewards/margins": 11.375, "rewards/rejected": -28.5, "step": 16710 }, { "epoch": 1.2069587814913736, "grad_norm": 10.414436163190626, "learning_rate": 6.186882248897459e-07, "logits/chosen": -1.046875, "logits/rejected": -0.36328125, "logps/chosen": -460.0, "logps/rejected": -532.0, "loss": 0.0511, "rewards/accuracies": 0.96875, "rewards/chosen": -18.75, "rewards/margins": 11.0625, "rewards/rejected": -29.75, "step": 16720 }, { "epoch": 1.2076806467913088, "grad_norm": 4.548527464309389, "learning_rate": 6.185032934180045e-07, "logits/chosen": -0.88671875, "logits/rejected": -0.271484375, "logps/chosen": -456.0, "logps/rejected": -528.0, "loss": 0.0277, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -18.75, "rewards/margins": 10.1875, "rewards/rejected": -28.875, "step": 16730 }, { "epoch": 1.2084025120912438, "grad_norm": 3.385461259842268, "learning_rate": 6.183185276802243e-07, "logits/chosen": -0.87109375, "logits/rejected": -0.34375, "logps/chosen": -474.0, "logps/rejected": -544.0, "loss": 0.0359, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -20.375, "rewards/margins": 10.8125, "rewards/rejected": -31.25, "step": 16740 }, { "epoch": 1.2091243773911788, "grad_norm": 10.188058058455056, "learning_rate": 6.181339274290046e-07, "logits/chosen": -1.046875, "logits/rejected": -0.333984375, "logps/chosen": -464.0, "logps/rejected": -532.0, "loss": 0.032, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -18.5, "rewards/margins": 11.625, "rewards/rejected": -30.125, "step": 16750 }, { "epoch": 1.2098462426911138, "grad_norm": 7.622403149841904, "learning_rate": 6.179494924174608e-07, "logits/chosen": -1.015625, "logits/rejected": -0.4765625, "logps/chosen": -462.0, "logps/rejected": -556.0, "loss": 0.0291, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -18.75, "rewards/margins": 11.1875, "rewards/rejected": -30.0, "step": 16760 }, { "epoch": 1.2105681079910489, "grad_norm": 5.8883549440138285, "learning_rate": 6.177652223992242e-07, "logits/chosen": -1.0546875, "logits/rejected": -0.56640625, "logps/chosen": -456.0, "logps/rejected": -532.0, "loss": 0.0431, "rewards/accuracies": 0.96875, "rewards/chosen": -18.75, "rewards/margins": 10.375, "rewards/rejected": -29.125, "step": 16770 }, { "epoch": 1.2112899732909839, "grad_norm": 8.45941898212079, "learning_rate": 6.175811171284396e-07, "logits/chosen": -1.25, "logits/rejected": -0.4609375, "logps/chosen": -484.0, "logps/rejected": -548.0, "loss": 0.0356, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -18.875, "rewards/margins": 10.875, "rewards/rejected": -29.75, "step": 16780 }, { "epoch": 1.2120118385909189, "grad_norm": 7.204708240757095, "learning_rate": 6.173971763597644e-07, "logits/chosen": -1.171875, "logits/rejected": -0.439453125, "logps/chosen": -482.0, "logps/rejected": -552.0, "loss": 0.0402, "rewards/accuracies": 1.0, "rewards/chosen": -18.375, "rewards/margins": 11.75, "rewards/rejected": -30.125, "step": 16790 }, { "epoch": 1.2127337038908539, "grad_norm": 10.390006503224395, "learning_rate": 6.172133998483677e-07, "logits/chosen": -0.9921875, "logits/rejected": -0.40234375, "logps/chosen": -458.0, "logps/rejected": -532.0, "loss": 0.0402, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -18.375, "rewards/margins": 10.875, "rewards/rejected": -29.25, "step": 16800 }, { "epoch": 1.213455569190789, "grad_norm": 2.960257525959071, "learning_rate": 6.170297873499277e-07, "logits/chosen": -1.03125, "logits/rejected": -0.42578125, "logps/chosen": -468.0, "logps/rejected": -544.0, "loss": 0.0434, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -19.875, "rewards/margins": 10.8125, "rewards/rejected": -30.75, "step": 16810 }, { "epoch": 1.2141774344907241, "grad_norm": 3.198466464720858, "learning_rate": 6.168463386206317e-07, "logits/chosen": -0.875, "logits/rejected": -0.392578125, "logps/chosen": -496.0, "logps/rejected": -564.0, "loss": 0.0316, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -20.5, "rewards/margins": 11.5625, "rewards/rejected": -32.0, "step": 16820 }, { "epoch": 1.2148992997906591, "grad_norm": 3.7095576395475764, "learning_rate": 6.166630534171737e-07, "logits/chosen": -0.9921875, "logits/rejected": -0.326171875, "logps/chosen": -482.0, "logps/rejected": -576.0, "loss": 0.0315, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -19.5, "rewards/margins": 11.625, "rewards/rejected": -31.125, "step": 16830 }, { "epoch": 1.2156211650905941, "grad_norm": 4.563660747469963, "learning_rate": 6.164799314967538e-07, "logits/chosen": -1.1328125, "logits/rejected": -0.40234375, "logps/chosen": -468.0, "logps/rejected": -536.0, "loss": 0.0298, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -19.625, "rewards/margins": 10.5625, "rewards/rejected": -30.125, "step": 16840 }, { "epoch": 1.2163430303905292, "grad_norm": 6.072471966249578, "learning_rate": 6.162969726170763e-07, "logits/chosen": -1.1171875, "logits/rejected": -0.455078125, "logps/chosen": -458.0, "logps/rejected": -502.0, "loss": 0.0434, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -17.5, "rewards/margins": 11.0, "rewards/rejected": -28.5, "step": 16850 }, { "epoch": 1.2170648956904642, "grad_norm": 3.2977567657631575, "learning_rate": 6.161141765363486e-07, "logits/chosen": -1.109375, "logits/rejected": -0.28125, "logps/chosen": -458.0, "logps/rejected": -524.0, "loss": 0.0256, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -17.875, "rewards/margins": 11.375, "rewards/rejected": -29.25, "step": 16860 }, { "epoch": 1.2177867609903992, "grad_norm": 6.201153324394959, "learning_rate": 6.159315430132796e-07, "logits/chosen": -0.859375, "logits/rejected": -0.3671875, "logps/chosen": -486.0, "logps/rejected": -568.0, "loss": 0.0336, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -20.875, "rewards/margins": 10.5625, "rewards/rejected": -31.375, "step": 16870 }, { "epoch": 1.2185086262903342, "grad_norm": 8.882351004428308, "learning_rate": 6.157490718070792e-07, "logits/chosen": -0.92578125, "logits/rejected": -0.384765625, "logps/chosen": -460.0, "logps/rejected": -540.0, "loss": 0.0451, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -19.375, "rewards/margins": 10.375, "rewards/rejected": -29.625, "step": 16880 }, { "epoch": 1.2192304915902692, "grad_norm": 7.578840817370983, "learning_rate": 6.155667626774557e-07, "logits/chosen": -1.078125, "logits/rejected": -0.291015625, "logps/chosen": -492.0, "logps/rejected": -564.0, "loss": 0.0398, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -17.375, "rewards/margins": 11.8125, "rewards/rejected": -29.125, "step": 16890 }, { "epoch": 1.2199523568902042, "grad_norm": 7.582449553031142, "learning_rate": 6.153846153846154e-07, "logits/chosen": -1.1171875, "logits/rejected": -0.478515625, "logps/chosen": -466.0, "logps/rejected": -540.0, "loss": 0.0441, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -18.25, "rewards/margins": 11.1875, "rewards/rejected": -29.375, "step": 16900 }, { "epoch": 1.2206742221901394, "grad_norm": 2.8218412443016114, "learning_rate": 6.152026296892608e-07, "logits/chosen": -1.203125, "logits/rejected": -0.53515625, "logps/chosen": -452.0, "logps/rejected": -540.0, "loss": 0.0309, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -17.125, "rewards/margins": 10.875, "rewards/rejected": -28.0, "step": 16910 }, { "epoch": 1.2213960874900744, "grad_norm": 2.267324227973284, "learning_rate": 6.150208053525901e-07, "logits/chosen": -0.8828125, "logits/rejected": -0.337890625, "logps/chosen": -462.0, "logps/rejected": -528.0, "loss": 0.0488, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -18.25, "rewards/margins": 9.9375, "rewards/rejected": -28.25, "step": 16920 }, { "epoch": 1.2221179527900095, "grad_norm": 6.216007322957484, "learning_rate": 6.148391421362942e-07, "logits/chosen": -0.87890625, "logits/rejected": -0.3515625, "logps/chosen": -480.0, "logps/rejected": -544.0, "loss": 0.042, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -19.25, "rewards/margins": 10.4375, "rewards/rejected": -29.75, "step": 16930 }, { "epoch": 1.2228398180899445, "grad_norm": 10.56640964686367, "learning_rate": 6.146576398025575e-07, "logits/chosen": -1.0625, "logits/rejected": -0.228515625, "logps/chosen": -498.0, "logps/rejected": -560.0, "loss": 0.0371, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -21.0, "rewards/margins": 11.0625, "rewards/rejected": -32.0, "step": 16940 }, { "epoch": 1.2235616833898795, "grad_norm": 10.920585322312682, "learning_rate": 6.144762981140549e-07, "logits/chosen": -0.9765625, "logits/rejected": -0.32421875, "logps/chosen": -468.0, "logps/rejected": -540.0, "loss": 0.0317, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -20.75, "rewards/margins": 10.75, "rewards/rejected": -31.5, "step": 16950 }, { "epoch": 1.2242835486898145, "grad_norm": 9.034447067523288, "learning_rate": 6.142951168339512e-07, "logits/chosen": -1.0, "logits/rejected": -0.388671875, "logps/chosen": -460.0, "logps/rejected": -552.0, "loss": 0.0379, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -19.125, "rewards/margins": 11.125, "rewards/rejected": -30.25, "step": 16960 }, { "epoch": 1.2250054139897495, "grad_norm": 9.674423050351862, "learning_rate": 6.141140957259e-07, "logits/chosen": -0.95703125, "logits/rejected": -0.2734375, "logps/chosen": -504.0, "logps/rejected": -576.0, "loss": 0.0491, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -21.375, "rewards/margins": 11.25, "rewards/rejected": -32.75, "step": 16970 }, { "epoch": 1.2257272792896845, "grad_norm": 5.845447613346872, "learning_rate": 6.139332345540418e-07, "logits/chosen": -1.0234375, "logits/rejected": -0.3203125, "logps/chosen": -472.0, "logps/rejected": -560.0, "loss": 0.039, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -20.125, "rewards/margins": 10.625, "rewards/rejected": -30.75, "step": 16980 }, { "epoch": 1.2264491445896195, "grad_norm": 8.97015175810971, "learning_rate": 6.137525330830035e-07, "logits/chosen": -1.0078125, "logits/rejected": -0.248046875, "logps/chosen": -496.0, "logps/rejected": -552.0, "loss": 0.0393, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -21.875, "rewards/margins": 10.875, "rewards/rejected": -32.75, "step": 16990 }, { "epoch": 1.2271710098895545, "grad_norm": 7.78135577246136, "learning_rate": 6.135719910778963e-07, "logits/chosen": -0.9609375, "logits/rejected": -0.35546875, "logps/chosen": -498.0, "logps/rejected": -572.0, "loss": 0.0592, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -21.375, "rewards/margins": 10.875, "rewards/rejected": -32.25, "step": 17000 }, { "epoch": 1.2278928751894895, "grad_norm": 10.941647093637997, "learning_rate": 6.133916083043149e-07, "logits/chosen": -0.9296875, "logits/rejected": -0.34765625, "logps/chosen": -496.0, "logps/rejected": -588.0, "loss": 0.0407, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -22.0, "rewards/margins": 11.375, "rewards/rejected": -33.25, "step": 17010 }, { "epoch": 1.2286147404894248, "grad_norm": 10.87592395513016, "learning_rate": 6.132113845283359e-07, "logits/chosen": -1.015625, "logits/rejected": -0.349609375, "logps/chosen": -496.0, "logps/rejected": -588.0, "loss": 0.0451, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -22.0, "rewards/margins": 11.125, "rewards/rejected": -33.25, "step": 17020 }, { "epoch": 1.2293366057893598, "grad_norm": 6.74293448363458, "learning_rate": 6.13031319516517e-07, "logits/chosen": -0.8671875, "logits/rejected": -0.2138671875, "logps/chosen": -492.0, "logps/rejected": -544.0, "loss": 0.048, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -20.75, "rewards/margins": 10.625, "rewards/rejected": -31.375, "step": 17030 }, { "epoch": 1.2300584710892948, "grad_norm": 3.6369071214408626, "learning_rate": 6.128514130358955e-07, "logits/chosen": -1.0234375, "logits/rejected": -0.40625, "logps/chosen": -476.0, "logps/rejected": -552.0, "loss": 0.0323, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -19.375, "rewards/margins": 10.5, "rewards/rejected": -29.875, "step": 17040 }, { "epoch": 1.2307803363892298, "grad_norm": 3.6342081042601193, "learning_rate": 6.126716648539868e-07, "logits/chosen": -1.0, "logits/rejected": -0.3671875, "logps/chosen": -478.0, "logps/rejected": -568.0, "loss": 0.0373, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -20.75, "rewards/margins": 11.75, "rewards/rejected": -32.5, "step": 17050 }, { "epoch": 1.2315022016891648, "grad_norm": 11.605489153368204, "learning_rate": 6.124920747387834e-07, "logits/chosen": -0.90625, "logits/rejected": -0.333984375, "logps/chosen": -484.0, "logps/rejected": -560.0, "loss": 0.0286, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -21.0, "rewards/margins": 11.1875, "rewards/rejected": -32.25, "step": 17060 }, { "epoch": 1.2322240669890998, "grad_norm": 8.505007574476814, "learning_rate": 6.123126424587535e-07, "logits/chosen": -1.0234375, "logits/rejected": -0.240234375, "logps/chosen": -476.0, "logps/rejected": -556.0, "loss": 0.0386, "rewards/accuracies": 1.0, "rewards/chosen": -21.375, "rewards/margins": 11.125, "rewards/rejected": -32.5, "step": 17070 }, { "epoch": 1.2329459322890348, "grad_norm": 1.3862418768480966, "learning_rate": 6.121333677828399e-07, "logits/chosen": -0.921875, "logits/rejected": -0.416015625, "logps/chosen": -516.0, "logps/rejected": -568.0, "loss": 0.0265, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -21.375, "rewards/margins": 11.0625, "rewards/rejected": -32.5, "step": 17080 }, { "epoch": 1.2336677975889698, "grad_norm": 5.36194594104562, "learning_rate": 6.119542504804587e-07, "logits/chosen": -0.94921875, "logits/rejected": -0.48046875, "logps/chosen": -478.0, "logps/rejected": -556.0, "loss": 0.0418, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -19.25, "rewards/margins": 11.125, "rewards/rejected": -30.375, "step": 17090 }, { "epoch": 1.2343896628889048, "grad_norm": 5.630334960836123, "learning_rate": 6.11775290321498e-07, "logits/chosen": -1.125, "logits/rejected": -0.51171875, "logps/chosen": -512.0, "logps/rejected": -580.0, "loss": 0.0321, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -21.375, "rewards/margins": 11.6875, "rewards/rejected": -33.0, "step": 17100 }, { "epoch": 1.23511152818884, "grad_norm": 6.745071700485224, "learning_rate": 6.115964870763166e-07, "logits/chosen": -0.91015625, "logits/rejected": -0.349609375, "logps/chosen": -490.0, "logps/rejected": -556.0, "loss": 0.0425, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -21.25, "rewards/margins": 10.4375, "rewards/rejected": -31.625, "step": 17110 }, { "epoch": 1.235833393488775, "grad_norm": 2.6080771674754377, "learning_rate": 6.114178405157431e-07, "logits/chosen": -0.84375, "logits/rejected": -0.21875, "logps/chosen": -520.0, "logps/rejected": -568.0, "loss": 0.0318, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -22.5, "rewards/margins": 10.75, "rewards/rejected": -33.25, "step": 17120 }, { "epoch": 1.23655525878871, "grad_norm": 5.4076545547826695, "learning_rate": 6.112393504110738e-07, "logits/chosen": -0.98828125, "logits/rejected": -0.28125, "logps/chosen": -512.0, "logps/rejected": -580.0, "loss": 0.0291, "rewards/accuracies": 0.96875, "rewards/chosen": -22.25, "rewards/margins": 11.4375, "rewards/rejected": -33.75, "step": 17130 }, { "epoch": 1.237277124088645, "grad_norm": 9.674212310821261, "learning_rate": 6.110610165340729e-07, "logits/chosen": -1.0390625, "logits/rejected": -0.37890625, "logps/chosen": -502.0, "logps/rejected": -556.0, "loss": 0.0487, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -21.5, "rewards/margins": 11.0, "rewards/rejected": -32.5, "step": 17140 }, { "epoch": 1.23799898938858, "grad_norm": 3.3996122189740157, "learning_rate": 6.1088283865697e-07, "logits/chosen": -0.953125, "logits/rejected": -0.40625, "logps/chosen": -510.0, "logps/rejected": -592.0, "loss": 0.0252, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -21.375, "rewards/margins": 11.8125, "rewards/rejected": -33.25, "step": 17150 }, { "epoch": 1.2387208546885151, "grad_norm": 12.202647539739793, "learning_rate": 6.107048165524593e-07, "logits/chosen": -1.140625, "logits/rejected": -0.482421875, "logps/chosen": -492.0, "logps/rejected": -568.0, "loss": 0.0373, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -20.5, "rewards/margins": 11.75, "rewards/rejected": -32.25, "step": 17160 }, { "epoch": 1.2394427199884501, "grad_norm": 10.35437286911651, "learning_rate": 6.105269499936986e-07, "logits/chosen": -1.078125, "logits/rejected": -0.306640625, "logps/chosen": -500.0, "logps/rejected": -584.0, "loss": 0.0366, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -21.125, "rewards/margins": 12.125, "rewards/rejected": -33.25, "step": 17170 }, { "epoch": 1.2401645852883851, "grad_norm": 3.3434918284324313, "learning_rate": 6.103492387543075e-07, "logits/chosen": -1.078125, "logits/rejected": -0.455078125, "logps/chosen": -516.0, "logps/rejected": -568.0, "loss": 0.0372, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -21.875, "rewards/margins": 10.875, "rewards/rejected": -32.75, "step": 17180 }, { "epoch": 1.2408864505883201, "grad_norm": 2.8462566851952205, "learning_rate": 6.101716826083674e-07, "logits/chosen": -0.96484375, "logits/rejected": -0.369140625, "logps/chosen": -458.0, "logps/rejected": -540.0, "loss": 0.048, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -20.625, "rewards/margins": 10.6875, "rewards/rejected": -31.25, "step": 17190 }, { "epoch": 1.2416083158882554, "grad_norm": 4.261480925060918, "learning_rate": 6.099942813304186e-07, "logits/chosen": -0.9765625, "logits/rejected": -0.37109375, "logps/chosen": -490.0, "logps/rejected": -584.0, "loss": 0.0349, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -21.25, "rewards/margins": 11.3125, "rewards/rejected": -32.5, "step": 17200 }, { "epoch": 1.2423301811881902, "grad_norm": 8.05697478069948, "learning_rate": 6.098170346954607e-07, "logits/chosen": -1.140625, "logits/rejected": -0.43359375, "logps/chosen": -486.0, "logps/rejected": -552.0, "loss": 0.0404, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -20.75, "rewards/margins": 10.9375, "rewards/rejected": -31.75, "step": 17210 }, { "epoch": 1.2430520464881254, "grad_norm": 6.030233544614565, "learning_rate": 6.0963994247895e-07, "logits/chosen": -1.15625, "logits/rejected": -0.37109375, "logps/chosen": -480.0, "logps/rejected": -552.0, "loss": 0.0474, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -19.0, "rewards/margins": 11.4375, "rewards/rejected": -30.5, "step": 17220 }, { "epoch": 1.2437739117880604, "grad_norm": 4.035955199336376, "learning_rate": 6.094630044567996e-07, "logits/chosen": -0.97265625, "logits/rejected": -0.4140625, "logps/chosen": -502.0, "logps/rejected": -580.0, "loss": 0.034, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -21.125, "rewards/margins": 10.9375, "rewards/rejected": -32.0, "step": 17230 }, { "epoch": 1.2444957770879954, "grad_norm": 5.775246362104368, "learning_rate": 6.092862204053773e-07, "logits/chosen": -0.97265625, "logits/rejected": -0.353515625, "logps/chosen": -502.0, "logps/rejected": -564.0, "loss": 0.0552, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -20.75, "rewards/margins": 10.875, "rewards/rejected": -31.75, "step": 17240 }, { "epoch": 1.2452176423879304, "grad_norm": 6.126978501438753, "learning_rate": 6.091095901015048e-07, "logits/chosen": -0.859375, "logits/rejected": -0.287109375, "logps/chosen": -488.0, "logps/rejected": -580.0, "loss": 0.0329, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -20.25, "rewards/margins": 11.4375, "rewards/rejected": -31.625, "step": 17250 }, { "epoch": 1.2459395076878654, "grad_norm": 2.0597270911310708, "learning_rate": 6.089331133224562e-07, "logits/chosen": -0.94140625, "logits/rejected": -0.39453125, "logps/chosen": -504.0, "logps/rejected": -584.0, "loss": 0.0322, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -22.25, "rewards/margins": 12.125, "rewards/rejected": -34.5, "step": 17260 }, { "epoch": 1.2466613729878004, "grad_norm": 7.075797017804865, "learning_rate": 6.087567898459576e-07, "logits/chosen": -0.96875, "logits/rejected": -0.46484375, "logps/chosen": -498.0, "logps/rejected": -580.0, "loss": 0.0318, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -20.75, "rewards/margins": 11.9375, "rewards/rejected": -32.5, "step": 17270 }, { "epoch": 1.2473832382877355, "grad_norm": 6.297514654704189, "learning_rate": 6.085806194501844e-07, "logits/chosen": -0.98046875, "logits/rejected": -0.373046875, "logps/chosen": -492.0, "logps/rejected": -580.0, "loss": 0.046, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -20.875, "rewards/margins": 11.875, "rewards/rejected": -32.75, "step": 17280 }, { "epoch": 1.2481051035876705, "grad_norm": 1.949680476979119, "learning_rate": 6.084046019137626e-07, "logits/chosen": -1.015625, "logits/rejected": -0.419921875, "logps/chosen": -456.0, "logps/rejected": -536.0, "loss": 0.0243, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -19.25, "rewards/margins": 11.375, "rewards/rejected": -30.625, "step": 17290 }, { "epoch": 1.2488269688876055, "grad_norm": 1.4543075703241581, "learning_rate": 6.082287370157644e-07, "logits/chosen": -1.0625, "logits/rejected": -0.3984375, "logps/chosen": -478.0, "logps/rejected": -580.0, "loss": 0.0374, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -19.75, "rewards/margins": 12.0, "rewards/rejected": -31.75, "step": 17300 }, { "epoch": 1.2495488341875407, "grad_norm": 11.733465521940417, "learning_rate": 6.0805302453571e-07, "logits/chosen": -1.0390625, "logits/rejected": -0.478515625, "logps/chosen": -464.0, "logps/rejected": -560.0, "loss": 0.057, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -19.75, "rewards/margins": 11.3125, "rewards/rejected": -31.0, "step": 17310 }, { "epoch": 1.2502706994874757, "grad_norm": 6.0235891048849854, "learning_rate": 6.078774642535648e-07, "logits/chosen": -0.984375, "logits/rejected": -0.3359375, "logps/chosen": -506.0, "logps/rejected": -584.0, "loss": 0.0308, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -20.75, "rewards/margins": 11.6875, "rewards/rejected": -32.5, "step": 17320 }, { "epoch": 1.2509925647874107, "grad_norm": 10.225756805522048, "learning_rate": 6.077020559497388e-07, "logits/chosen": -1.0078125, "logits/rejected": -0.36328125, "logps/chosen": -478.0, "logps/rejected": -572.0, "loss": 0.0329, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -20.75, "rewards/margins": 11.4375, "rewards/rejected": -32.25, "step": 17330 }, { "epoch": 1.2517144300873457, "grad_norm": 7.949007049337855, "learning_rate": 6.07526799405085e-07, "logits/chosen": -0.9921875, "logits/rejected": -0.349609375, "logps/chosen": -462.0, "logps/rejected": -540.0, "loss": 0.0535, "rewards/accuracies": 0.96875, "rewards/chosen": -19.5, "rewards/margins": 11.0, "rewards/rejected": -30.5, "step": 17340 }, { "epoch": 1.2524362953872807, "grad_norm": 5.108804468500013, "learning_rate": 6.073516944008986e-07, "logits/chosen": -1.015625, "logits/rejected": -0.408203125, "logps/chosen": -506.0, "logps/rejected": -572.0, "loss": 0.0413, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -20.625, "rewards/margins": 11.9375, "rewards/rejected": -32.5, "step": 17350 }, { "epoch": 1.2531581606872157, "grad_norm": 10.001498856449444, "learning_rate": 6.071767407189163e-07, "logits/chosen": -0.9921875, "logits/rejected": -0.4609375, "logps/chosen": -456.0, "logps/rejected": -560.0, "loss": 0.0328, "rewards/accuracies": 0.96875, "rewards/chosen": -19.25, "rewards/margins": 10.8125, "rewards/rejected": -30.125, "step": 17360 }, { "epoch": 1.2538800259871508, "grad_norm": 6.618398398386329, "learning_rate": 6.070019381413139e-07, "logits/chosen": -1.046875, "logits/rejected": -0.52734375, "logps/chosen": -484.0, "logps/rejected": -536.0, "loss": 0.0539, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -20.25, "rewards/margins": 9.625, "rewards/rejected": -29.875, "step": 17370 }, { "epoch": 1.2546018912870858, "grad_norm": 2.7328628316146486, "learning_rate": 6.068272864507064e-07, "logits/chosen": -0.921875, "logits/rejected": -0.28515625, "logps/chosen": -428.0, "logps/rejected": -516.0, "loss": 0.0399, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -18.125, "rewards/margins": 11.1875, "rewards/rejected": -29.25, "step": 17380 }, { "epoch": 1.2553237565870208, "grad_norm": 5.287519989737241, "learning_rate": 6.066527854301463e-07, "logits/chosen": -1.0703125, "logits/rejected": -0.416015625, "logps/chosen": -480.0, "logps/rejected": -556.0, "loss": 0.0446, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -20.0, "rewards/margins": 11.375, "rewards/rejected": -31.5, "step": 17390 }, { "epoch": 1.256045621886956, "grad_norm": 1.579526855809976, "learning_rate": 6.064784348631227e-07, "logits/chosen": -0.84375, "logits/rejected": -0.435546875, "logps/chosen": -488.0, "logps/rejected": -548.0, "loss": 0.0307, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -19.625, "rewards/margins": 10.375, "rewards/rejected": -30.0, "step": 17400 }, { "epoch": 1.2567674871868908, "grad_norm": 5.022516538180256, "learning_rate": 6.063042345335596e-07, "logits/chosen": -0.87109375, "logits/rejected": -0.365234375, "logps/chosen": -466.0, "logps/rejected": -544.0, "loss": 0.0342, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -19.0, "rewards/margins": 10.4375, "rewards/rejected": -29.5, "step": 17410 }, { "epoch": 1.257489352486826, "grad_norm": 8.804677639349574, "learning_rate": 6.061301842258155e-07, "logits/chosen": -0.8515625, "logits/rejected": -0.251953125, "logps/chosen": -474.0, "logps/rejected": -560.0, "loss": 0.0327, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -19.25, "rewards/margins": 11.5, "rewards/rejected": -30.75, "step": 17420 }, { "epoch": 1.258211217786761, "grad_norm": 10.840013685172423, "learning_rate": 6.059562837246821e-07, "logits/chosen": -0.90625, "logits/rejected": -0.314453125, "logps/chosen": -490.0, "logps/rejected": -564.0, "loss": 0.0542, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -20.125, "rewards/margins": 11.75, "rewards/rejected": -31.875, "step": 17430 }, { "epoch": 1.258933083086696, "grad_norm": 15.496437193001004, "learning_rate": 6.057825328153826e-07, "logits/chosen": -0.98828125, "logits/rejected": -0.333984375, "logps/chosen": -456.0, "logps/rejected": -544.0, "loss": 0.0511, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -19.625, "rewards/margins": 10.9375, "rewards/rejected": -30.625, "step": 17440 }, { "epoch": 1.259654948386631, "grad_norm": 2.52860211228586, "learning_rate": 6.056089312835716e-07, "logits/chosen": -0.9765625, "logits/rejected": -0.27734375, "logps/chosen": -472.0, "logps/rejected": -540.0, "loss": 0.0451, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -19.0, "rewards/margins": 11.125, "rewards/rejected": -30.125, "step": 17450 }, { "epoch": 1.260376813686566, "grad_norm": 3.7436168293121446, "learning_rate": 6.054354789153331e-07, "logits/chosen": -0.73828125, "logits/rejected": -0.189453125, "logps/chosen": -478.0, "logps/rejected": -552.0, "loss": 0.0442, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -20.0, "rewards/margins": 10.75, "rewards/rejected": -30.75, "step": 17460 }, { "epoch": 1.261098678986501, "grad_norm": 7.22167343819474, "learning_rate": 6.052621754971796e-07, "logits/chosen": -0.9765625, "logits/rejected": -0.349609375, "logps/chosen": -516.0, "logps/rejected": -596.0, "loss": 0.0308, "rewards/accuracies": 1.0, "rewards/chosen": -22.375, "rewards/margins": 11.375, "rewards/rejected": -33.75, "step": 17470 }, { "epoch": 1.261820544286436, "grad_norm": 3.9513150591737904, "learning_rate": 6.050890208160515e-07, "logits/chosen": -0.95703125, "logits/rejected": -0.390625, "logps/chosen": -520.0, "logps/rejected": -568.0, "loss": 0.0667, "rewards/accuracies": 0.96875, "rewards/chosen": -22.0, "rewards/margins": 11.0625, "rewards/rejected": -33.0, "step": 17480 }, { "epoch": 1.2625424095863713, "grad_norm": 12.323322147715684, "learning_rate": 6.049160146593155e-07, "logits/chosen": -0.890625, "logits/rejected": -0.388671875, "logps/chosen": -490.0, "logps/rejected": -596.0, "loss": 0.0549, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -21.5, "rewards/margins": 11.875, "rewards/rejected": -33.5, "step": 17490 }, { "epoch": 1.263264274886306, "grad_norm": 2.4948285739976823, "learning_rate": 6.047431568147635e-07, "logits/chosen": -0.73828125, "logits/rejected": -0.18359375, "logps/chosen": -484.0, "logps/rejected": -544.0, "loss": 0.0458, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -20.75, "rewards/margins": 10.125, "rewards/rejected": -30.875, "step": 17500 }, { "epoch": 1.2639861401862413, "grad_norm": 5.619602453211708, "learning_rate": 6.045704470706117e-07, "logits/chosen": -0.8046875, "logits/rejected": -0.2060546875, "logps/chosen": -472.0, "logps/rejected": -552.0, "loss": 0.0456, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -20.25, "rewards/margins": 11.4375, "rewards/rejected": -31.625, "step": 17510 }, { "epoch": 1.2647080054861763, "grad_norm": 9.64347537014279, "learning_rate": 6.043978852154994e-07, "logits/chosen": -0.77734375, "logits/rejected": -0.33984375, "logps/chosen": -520.0, "logps/rejected": -600.0, "loss": 0.0327, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -21.75, "rewards/margins": 11.8125, "rewards/rejected": -33.5, "step": 17520 }, { "epoch": 1.2654298707861114, "grad_norm": 9.819342980614657, "learning_rate": 6.04225471038488e-07, "logits/chosen": -0.88671875, "logits/rejected": -0.1826171875, "logps/chosen": -478.0, "logps/rejected": -548.0, "loss": 0.04, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -21.625, "rewards/margins": 10.0625, "rewards/rejected": -31.75, "step": 17530 }, { "epoch": 1.2661517360860464, "grad_norm": 10.073751654227273, "learning_rate": 6.040532043290601e-07, "logits/chosen": -1.0, "logits/rejected": -0.267578125, "logps/chosen": -482.0, "logps/rejected": -548.0, "loss": 0.0561, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -20.0, "rewards/margins": 11.3125, "rewards/rejected": -31.375, "step": 17540 }, { "epoch": 1.2668736013859814, "grad_norm": 3.0679652366921943, "learning_rate": 6.038810848771178e-07, "logits/chosen": -0.890625, "logits/rejected": -0.259765625, "logps/chosen": -470.0, "logps/rejected": -540.0, "loss": 0.0384, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -19.375, "rewards/margins": 10.125, "rewards/rejected": -29.5, "step": 17550 }, { "epoch": 1.2675954666859164, "grad_norm": 6.331535620559754, "learning_rate": 6.037091124729821e-07, "logits/chosen": -1.0078125, "logits/rejected": -0.271484375, "logps/chosen": -490.0, "logps/rejected": -536.0, "loss": 0.0452, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -20.875, "rewards/margins": 10.75, "rewards/rejected": -31.625, "step": 17560 }, { "epoch": 1.2683173319858514, "grad_norm": 6.800267295529355, "learning_rate": 6.03537286907392e-07, "logits/chosen": -0.96484375, "logits/rejected": -0.400390625, "logps/chosen": -488.0, "logps/rejected": -572.0, "loss": 0.0568, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -20.25, "rewards/margins": 10.9375, "rewards/rejected": -31.25, "step": 17570 }, { "epoch": 1.2690391972857864, "grad_norm": 7.161015904293474, "learning_rate": 6.033656079715029e-07, "logits/chosen": -0.72265625, "logits/rejected": -0.1884765625, "logps/chosen": -484.0, "logps/rejected": -560.0, "loss": 0.036, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -20.875, "rewards/margins": 10.6875, "rewards/rejected": -31.625, "step": 17580 }, { "epoch": 1.2697610625857214, "grad_norm": 4.602643452128709, "learning_rate": 6.031940754568862e-07, "logits/chosen": -0.9609375, "logits/rejected": -0.35546875, "logps/chosen": -454.0, "logps/rejected": -524.0, "loss": 0.0506, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -18.375, "rewards/margins": 10.5, "rewards/rejected": -28.875, "step": 17590 }, { "epoch": 1.2704829278856566, "grad_norm": 7.0641625287722505, "learning_rate": 6.030226891555272e-07, "logits/chosen": -1.03125, "logits/rejected": -0.41015625, "logps/chosen": -448.0, "logps/rejected": -520.0, "loss": 0.0503, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -17.875, "rewards/margins": 10.1875, "rewards/rejected": -28.0, "step": 17600 }, { "epoch": 1.2712047931855914, "grad_norm": 4.439492173023385, "learning_rate": 6.028514488598253e-07, "logits/chosen": -0.921875, "logits/rejected": -0.419921875, "logps/chosen": -462.0, "logps/rejected": -540.0, "loss": 0.026, "rewards/accuracies": 1.0, "rewards/chosen": -19.0, "rewards/margins": 10.5, "rewards/rejected": -29.5, "step": 17610 }, { "epoch": 1.2719266584855267, "grad_norm": 6.611737392434757, "learning_rate": 6.026803543625922e-07, "logits/chosen": -1.1640625, "logits/rejected": -0.53515625, "logps/chosen": -448.0, "logps/rejected": -532.0, "loss": 0.0349, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -16.875, "rewards/margins": 11.125, "rewards/rejected": -28.0, "step": 17620 }, { "epoch": 1.2726485237854617, "grad_norm": 3.5979673916668595, "learning_rate": 6.025094054570507e-07, "logits/chosen": -1.09375, "logits/rejected": -0.4453125, "logps/chosen": -456.0, "logps/rejected": -524.0, "loss": 0.0444, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -18.375, "rewards/margins": 10.8125, "rewards/rejected": -29.25, "step": 17630 }, { "epoch": 1.2733703890853967, "grad_norm": 11.00946735594366, "learning_rate": 6.023386019368341e-07, "logits/chosen": -1.1328125, "logits/rejected": -0.56640625, "logps/chosen": -492.0, "logps/rejected": -564.0, "loss": 0.0396, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -21.0, "rewards/margins": 10.625, "rewards/rejected": -31.625, "step": 17640 }, { "epoch": 1.2740922543853317, "grad_norm": 5.235686619752142, "learning_rate": 6.021679435959851e-07, "logits/chosen": -0.95703125, "logits/rejected": -0.3671875, "logps/chosen": -462.0, "logps/rejected": -544.0, "loss": 0.0285, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -18.625, "rewards/margins": 10.8125, "rewards/rejected": -29.375, "step": 17650 }, { "epoch": 1.2748141196852667, "grad_norm": 6.197341417706041, "learning_rate": 6.019974302289545e-07, "logits/chosen": -1.0703125, "logits/rejected": -0.42578125, "logps/chosen": -472.0, "logps/rejected": -548.0, "loss": 0.0359, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -19.125, "rewards/margins": 10.75, "rewards/rejected": -29.875, "step": 17660 }, { "epoch": 1.2755359849852017, "grad_norm": 2.4666989407603355, "learning_rate": 6.018270616306004e-07, "logits/chosen": -0.87109375, "logits/rejected": -0.33203125, "logps/chosen": -474.0, "logps/rejected": -556.0, "loss": 0.0368, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -18.875, "rewards/margins": 10.4375, "rewards/rejected": -29.25, "step": 17670 }, { "epoch": 1.2762578502851367, "grad_norm": 7.2662202584803675, "learning_rate": 6.016568375961868e-07, "logits/chosen": -0.84765625, "logits/rejected": -0.390625, "logps/chosen": -468.0, "logps/rejected": -540.0, "loss": 0.0295, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -18.75, "rewards/margins": 10.6875, "rewards/rejected": -29.375, "step": 17680 }, { "epoch": 1.276979715585072, "grad_norm": 3.365977084588401, "learning_rate": 6.014867579213833e-07, "logits/chosen": -0.98828125, "logits/rejected": -0.275390625, "logps/chosen": -434.0, "logps/rejected": -536.0, "loss": 0.0359, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -19.0, "rewards/margins": 10.75, "rewards/rejected": -29.75, "step": 17690 }, { "epoch": 1.2777015808850067, "grad_norm": 8.652390488407743, "learning_rate": 6.013168224022631e-07, "logits/chosen": -1.0078125, "logits/rejected": -0.482421875, "logps/chosen": -498.0, "logps/rejected": -560.0, "loss": 0.0404, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -20.125, "rewards/margins": 10.8125, "rewards/rejected": -31.0, "step": 17700 }, { "epoch": 1.278423446184942, "grad_norm": 59.46058479461573, "learning_rate": 6.011470308353028e-07, "logits/chosen": -1.0, "logits/rejected": -0.37109375, "logps/chosen": -498.0, "logps/rejected": -560.0, "loss": 0.0596, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -21.5, "rewards/margins": 10.25, "rewards/rejected": -31.75, "step": 17710 }, { "epoch": 1.279145311484877, "grad_norm": 4.445126019045369, "learning_rate": 6.00977383017381e-07, "logits/chosen": -0.86328125, "logits/rejected": -0.1630859375, "logps/chosen": -488.0, "logps/rejected": -572.0, "loss": 0.0418, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -21.0, "rewards/margins": 11.75, "rewards/rejected": -32.75, "step": 17720 }, { "epoch": 1.279867176784812, "grad_norm": 5.517126098329073, "learning_rate": 6.008078787457772e-07, "logits/chosen": -0.96484375, "logits/rejected": -0.302734375, "logps/chosen": -506.0, "logps/rejected": -584.0, "loss": 0.0338, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -21.875, "rewards/margins": 11.4375, "rewards/rejected": -33.25, "step": 17730 }, { "epoch": 1.280589042084747, "grad_norm": 9.9891993139709, "learning_rate": 6.006385178181711e-07, "logits/chosen": -0.97265625, "logits/rejected": -0.337890625, "logps/chosen": -488.0, "logps/rejected": -540.0, "loss": 0.0441, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -19.625, "rewards/margins": 10.625, "rewards/rejected": -30.25, "step": 17740 }, { "epoch": 1.281310907384682, "grad_norm": 2.736220501585007, "learning_rate": 6.004693000326412e-07, "logits/chosen": -1.0078125, "logits/rejected": -0.380859375, "logps/chosen": -456.0, "logps/rejected": -552.0, "loss": 0.0295, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -18.5, "rewards/margins": 12.0625, "rewards/rejected": -30.625, "step": 17750 }, { "epoch": 1.282032772684617, "grad_norm": 10.428779226942751, "learning_rate": 6.003002251876642e-07, "logits/chosen": -1.015625, "logits/rejected": -0.306640625, "logps/chosen": -472.0, "logps/rejected": -536.0, "loss": 0.04, "rewards/accuracies": 0.96875, "rewards/chosen": -18.375, "rewards/margins": 11.0, "rewards/rejected": -29.375, "step": 17760 }, { "epoch": 1.282754637984552, "grad_norm": 11.26203480766092, "learning_rate": 6.001312930821136e-07, "logits/chosen": -0.77734375, "logits/rejected": -0.310546875, "logps/chosen": -482.0, "logps/rejected": -552.0, "loss": 0.0533, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -20.5, "rewards/margins": 9.5625, "rewards/rejected": -30.0, "step": 17770 }, { "epoch": 1.283476503284487, "grad_norm": 4.413724500993193, "learning_rate": 5.999625035152588e-07, "logits/chosen": -0.9609375, "logits/rejected": -0.224609375, "logps/chosen": -486.0, "logps/rejected": -552.0, "loss": 0.0447, "rewards/accuracies": 0.96875, "rewards/chosen": -20.75, "rewards/margins": 10.4375, "rewards/rejected": -31.125, "step": 17780 }, { "epoch": 1.284198368584422, "grad_norm": 5.481402503714921, "learning_rate": 5.997938562867645e-07, "logits/chosen": -0.828125, "logits/rejected": -0.3671875, "logps/chosen": -476.0, "logps/rejected": -580.0, "loss": 0.0482, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -18.875, "rewards/margins": 10.625, "rewards/rejected": -29.5, "step": 17790 }, { "epoch": 1.2849202338843573, "grad_norm": 5.274228193950034, "learning_rate": 5.996253511966891e-07, "logits/chosen": -1.0234375, "logits/rejected": -0.482421875, "logps/chosen": -462.0, "logps/rejected": -528.0, "loss": 0.0331, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -17.25, "rewards/margins": 11.0, "rewards/rejected": -28.25, "step": 17800 }, { "epoch": 1.2856420991842923, "grad_norm": 7.079076195099126, "learning_rate": 5.994569880454842e-07, "logits/chosen": -0.91796875, "logits/rejected": -0.361328125, "logps/chosen": -466.0, "logps/rejected": -516.0, "loss": 0.0521, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -17.5, "rewards/margins": 11.0625, "rewards/rejected": -28.625, "step": 17810 }, { "epoch": 1.2863639644842273, "grad_norm": 2.939633221472232, "learning_rate": 5.99288766633993e-07, "logits/chosen": -0.78125, "logits/rejected": -0.1982421875, "logps/chosen": -476.0, "logps/rejected": -532.0, "loss": 0.0318, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -18.25, "rewards/margins": 10.8125, "rewards/rejected": -29.125, "step": 17820 }, { "epoch": 1.2870858297841623, "grad_norm": 6.323094695563326, "learning_rate": 5.991206867634499e-07, "logits/chosen": -0.921875, "logits/rejected": -0.38671875, "logps/chosen": -462.0, "logps/rejected": -552.0, "loss": 0.0473, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -19.25, "rewards/margins": 10.75, "rewards/rejected": -30.0, "step": 17830 }, { "epoch": 1.2878076950840973, "grad_norm": 12.827841941483378, "learning_rate": 5.989527482354798e-07, "logits/chosen": -0.90625, "logits/rejected": -0.21875, "logps/chosen": -470.0, "logps/rejected": -544.0, "loss": 0.0374, "rewards/accuracies": 1.0, "rewards/chosen": -19.75, "rewards/margins": 11.25, "rewards/rejected": -30.875, "step": 17840 }, { "epoch": 1.2885295603840323, "grad_norm": 5.034985975390114, "learning_rate": 5.987849508520958e-07, "logits/chosen": -1.0234375, "logits/rejected": -0.330078125, "logps/chosen": -474.0, "logps/rejected": -556.0, "loss": 0.0337, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -18.75, "rewards/margins": 11.0625, "rewards/rejected": -29.75, "step": 17850 }, { "epoch": 1.2892514256839673, "grad_norm": 5.112156801019341, "learning_rate": 5.986172944156994e-07, "logits/chosen": -1.0, "logits/rejected": -0.4296875, "logps/chosen": -454.0, "logps/rejected": -528.0, "loss": 0.0322, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -17.75, "rewards/margins": 10.25, "rewards/rejected": -28.0, "step": 17860 }, { "epoch": 1.2899732909839023, "grad_norm": 4.674217674046798, "learning_rate": 5.984497787290794e-07, "logits/chosen": -0.91015625, "logits/rejected": -0.375, "logps/chosen": -460.0, "logps/rejected": -552.0, "loss": 0.0291, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -18.875, "rewards/margins": 10.875, "rewards/rejected": -29.75, "step": 17870 }, { "epoch": 1.2906951562838374, "grad_norm": 2.7533032208796127, "learning_rate": 5.982824035954103e-07, "logits/chosen": -0.90625, "logits/rejected": -0.279296875, "logps/chosen": -466.0, "logps/rejected": -528.0, "loss": 0.037, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -18.75, "rewards/margins": 10.625, "rewards/rejected": -29.375, "step": 17880 }, { "epoch": 1.2914170215837726, "grad_norm": 5.946356211603844, "learning_rate": 5.98115168818252e-07, "logits/chosen": -0.98046875, "logits/rejected": -0.29296875, "logps/chosen": -472.0, "logps/rejected": -568.0, "loss": 0.0393, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -20.25, "rewards/margins": 11.25, "rewards/rejected": -31.5, "step": 17890 }, { "epoch": 1.2921388868837074, "grad_norm": 3.5301674835011325, "learning_rate": 5.979480742015487e-07, "logits/chosen": -1.2109375, "logits/rejected": -0.55859375, "logps/chosen": -492.0, "logps/rejected": -564.0, "loss": 0.0315, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -19.5, "rewards/margins": 10.4375, "rewards/rejected": -29.875, "step": 17900 }, { "epoch": 1.2928607521836426, "grad_norm": 10.384266266064648, "learning_rate": 5.977811195496272e-07, "logits/chosen": -0.96484375, "logits/rejected": -0.361328125, "logps/chosen": -484.0, "logps/rejected": -576.0, "loss": 0.049, "rewards/accuracies": 0.96875, "rewards/chosen": -20.5, "rewards/margins": 11.0625, "rewards/rejected": -31.5, "step": 17910 }, { "epoch": 1.2935826174835776, "grad_norm": 1.8772722107081334, "learning_rate": 5.976143046671967e-07, "logits/chosen": -0.90234375, "logits/rejected": -0.451171875, "logps/chosen": -494.0, "logps/rejected": -544.0, "loss": 0.0353, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -19.875, "rewards/margins": 11.0, "rewards/rejected": -30.75, "step": 17920 }, { "epoch": 1.2943044827835126, "grad_norm": 2.7601041446885604, "learning_rate": 5.974476293593484e-07, "logits/chosen": -0.9921875, "logits/rejected": -0.357421875, "logps/chosen": -500.0, "logps/rejected": -584.0, "loss": 0.0359, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -20.25, "rewards/margins": 11.6875, "rewards/rejected": -32.0, "step": 17930 }, { "epoch": 1.2950263480834476, "grad_norm": 9.435720517523771, "learning_rate": 5.97281093431553e-07, "logits/chosen": -0.9375, "logits/rejected": -0.328125, "logps/chosen": -476.0, "logps/rejected": -564.0, "loss": 0.052, "rewards/accuracies": 1.0, "rewards/chosen": -18.875, "rewards/margins": 12.5625, "rewards/rejected": -31.375, "step": 17940 }, { "epoch": 1.2957482133833826, "grad_norm": 12.46034351494444, "learning_rate": 5.971146966896607e-07, "logits/chosen": -1.1953125, "logits/rejected": -0.427734375, "logps/chosen": -468.0, "logps/rejected": -532.0, "loss": 0.0422, "rewards/accuracies": 1.0, "rewards/chosen": -18.75, "rewards/margins": 11.625, "rewards/rejected": -30.375, "step": 17950 }, { "epoch": 1.2964700786833177, "grad_norm": 3.4456170523353826, "learning_rate": 5.969484389399003e-07, "logits/chosen": -1.0546875, "logits/rejected": -0.4765625, "logps/chosen": -450.0, "logps/rejected": -520.0, "loss": 0.0255, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -18.0, "rewards/margins": 11.0, "rewards/rejected": -29.0, "step": 17960 }, { "epoch": 1.2971919439832527, "grad_norm": 5.12255799762917, "learning_rate": 5.967823199888781e-07, "logits/chosen": -1.0703125, "logits/rejected": -0.498046875, "logps/chosen": -462.0, "logps/rejected": -528.0, "loss": 0.0248, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -18.375, "rewards/margins": 10.875, "rewards/rejected": -29.25, "step": 17970 }, { "epoch": 1.297913809283188, "grad_norm": 8.005891432981691, "learning_rate": 5.966163396435766e-07, "logits/chosen": -1.1484375, "logits/rejected": -0.51171875, "logps/chosen": -478.0, "logps/rejected": -552.0, "loss": 0.0386, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -18.125, "rewards/margins": 10.9375, "rewards/rejected": -29.0, "step": 17980 }, { "epoch": 1.2986356745831227, "grad_norm": 5.57423285980183, "learning_rate": 5.964504977113543e-07, "logits/chosen": -0.96875, "logits/rejected": -0.44921875, "logps/chosen": -490.0, "logps/rejected": -584.0, "loss": 0.0259, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -21.375, "rewards/margins": 11.4375, "rewards/rejected": -32.75, "step": 17990 }, { "epoch": 1.299357539883058, "grad_norm": 6.74486453600757, "learning_rate": 5.962847939999439e-07, "logits/chosen": -0.921875, "logits/rejected": -0.333984375, "logps/chosen": -500.0, "logps/rejected": -568.0, "loss": 0.0288, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -20.625, "rewards/margins": 11.6875, "rewards/rejected": -32.25, "step": 18000 }, { "epoch": 1.300079405182993, "grad_norm": 3.4225642397982807, "learning_rate": 5.961192283174521e-07, "logits/chosen": -0.84375, "logits/rejected": -0.34765625, "logps/chosen": -462.0, "logps/rejected": -580.0, "loss": 0.0428, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -19.375, "rewards/margins": 10.5625, "rewards/rejected": -30.0, "step": 18010 }, { "epoch": 1.300801270482928, "grad_norm": 5.165748013704686, "learning_rate": 5.959538004723581e-07, "logits/chosen": -0.9140625, "logits/rejected": -0.337890625, "logps/chosen": -460.0, "logps/rejected": -532.0, "loss": 0.0409, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -17.375, "rewards/margins": 10.5, "rewards/rejected": -27.875, "step": 18020 }, { "epoch": 1.301523135782863, "grad_norm": 7.828237623630413, "learning_rate": 5.957885102735133e-07, "logits/chosen": -1.0, "logits/rejected": -0.306640625, "logps/chosen": -482.0, "logps/rejected": -544.0, "loss": 0.0287, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -18.25, "rewards/margins": 11.5, "rewards/rejected": -29.75, "step": 18030 }, { "epoch": 1.302245001082798, "grad_norm": 10.652103863954423, "learning_rate": 5.956233575301397e-07, "logits/chosen": -1.046875, "logits/rejected": -0.408203125, "logps/chosen": -490.0, "logps/rejected": -532.0, "loss": 0.0421, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -18.5, "rewards/margins": 10.9375, "rewards/rejected": -29.5, "step": 18040 }, { "epoch": 1.302966866382733, "grad_norm": 1.580760709607706, "learning_rate": 5.954583420518295e-07, "logits/chosen": -1.1640625, "logits/rejected": -0.62109375, "logps/chosen": -480.0, "logps/rejected": -560.0, "loss": 0.0464, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -17.625, "rewards/margins": 12.625, "rewards/rejected": -30.25, "step": 18050 }, { "epoch": 1.303688731682668, "grad_norm": 7.234610135140837, "learning_rate": 5.952934636485436e-07, "logits/chosen": -1.0234375, "logits/rejected": -0.474609375, "logps/chosen": -446.0, "logps/rejected": -498.0, "loss": 0.0332, "rewards/accuracies": 0.96875, "rewards/chosen": -18.25, "rewards/margins": 9.5, "rewards/rejected": -27.75, "step": 18060 }, { "epoch": 1.304410596982603, "grad_norm": 5.350876031093355, "learning_rate": 5.951287221306115e-07, "logits/chosen": -0.875, "logits/rejected": -0.365234375, "logps/chosen": -448.0, "logps/rejected": -552.0, "loss": 0.0324, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -18.875, "rewards/margins": 10.9375, "rewards/rejected": -29.75, "step": 18070 }, { "epoch": 1.305132462282538, "grad_norm": 2.773969073367958, "learning_rate": 5.949641173087295e-07, "logits/chosen": -0.80859375, "logits/rejected": -0.29296875, "logps/chosen": -456.0, "logps/rejected": -540.0, "loss": 0.0477, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -18.5, "rewards/margins": 11.125, "rewards/rejected": -29.625, "step": 18080 }, { "epoch": 1.3058543275824732, "grad_norm": 13.100789447547747, "learning_rate": 5.947996489939607e-07, "logits/chosen": -0.96875, "logits/rejected": -0.28515625, "logps/chosen": -456.0, "logps/rejected": -520.0, "loss": 0.0443, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -18.0, "rewards/margins": 10.6875, "rewards/rejected": -28.75, "step": 18090 }, { "epoch": 1.306576192882408, "grad_norm": 11.624443659218985, "learning_rate": 5.94635316997733e-07, "logits/chosen": -1.0546875, "logits/rejected": -0.443359375, "logps/chosen": -472.0, "logps/rejected": -524.0, "loss": 0.0517, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -18.125, "rewards/margins": 11.6875, "rewards/rejected": -29.75, "step": 18100 }, { "epoch": 1.3072980581823432, "grad_norm": 4.137191791092678, "learning_rate": 5.944711211318392e-07, "logits/chosen": -0.9765625, "logits/rejected": -0.431640625, "logps/chosen": -460.0, "logps/rejected": -524.0, "loss": 0.045, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -17.5, "rewards/margins": 10.625, "rewards/rejected": -28.25, "step": 18110 }, { "epoch": 1.3080199234822782, "grad_norm": 5.818218025594527, "learning_rate": 5.943070612084358e-07, "logits/chosen": -0.92578125, "logits/rejected": -0.52734375, "logps/chosen": -438.0, "logps/rejected": -536.0, "loss": 0.0243, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -18.375, "rewards/margins": 10.5, "rewards/rejected": -28.75, "step": 18120 }, { "epoch": 1.3087417887822133, "grad_norm": 4.394446567896311, "learning_rate": 5.941431370400415e-07, "logits/chosen": -1.1015625, "logits/rejected": -0.5859375, "logps/chosen": -476.0, "logps/rejected": -548.0, "loss": 0.0383, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -18.875, "rewards/margins": 11.8125, "rewards/rejected": -30.75, "step": 18130 }, { "epoch": 1.3094636540821483, "grad_norm": 8.37170008670797, "learning_rate": 5.939793484395371e-07, "logits/chosen": -1.0390625, "logits/rejected": -0.484375, "logps/chosen": -468.0, "logps/rejected": -552.0, "loss": 0.048, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -19.125, "rewards/margins": 11.0, "rewards/rejected": -30.125, "step": 18140 }, { "epoch": 1.3101855193820833, "grad_norm": 7.936153052659773, "learning_rate": 5.938156952201644e-07, "logits/chosen": -0.9921875, "logits/rejected": -0.4609375, "logps/chosen": -444.0, "logps/rejected": -524.0, "loss": 0.0428, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -18.0, "rewards/margins": 11.4375, "rewards/rejected": -29.375, "step": 18150 }, { "epoch": 1.3109073846820183, "grad_norm": 1.4502757227219656, "learning_rate": 5.936521771955247e-07, "logits/chosen": -0.98828125, "logits/rejected": -0.5703125, "logps/chosen": -494.0, "logps/rejected": -556.0, "loss": 0.0332, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -20.625, "rewards/margins": 10.3125, "rewards/rejected": -31.0, "step": 18160 }, { "epoch": 1.3116292499819533, "grad_norm": 4.946273323939672, "learning_rate": 5.934887941795792e-07, "logits/chosen": -1.03125, "logits/rejected": -0.3203125, "logps/chosen": -460.0, "logps/rejected": -532.0, "loss": 0.023, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -17.75, "rewards/margins": 12.1875, "rewards/rejected": -30.0, "step": 18170 }, { "epoch": 1.3123511152818885, "grad_norm": 11.064763474623902, "learning_rate": 5.933255459866463e-07, "logits/chosen": -1.0234375, "logits/rejected": -0.435546875, "logps/chosen": -502.0, "logps/rejected": -572.0, "loss": 0.0521, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -19.125, "rewards/margins": 12.0, "rewards/rejected": -31.125, "step": 18180 }, { "epoch": 1.3130729805818233, "grad_norm": 7.926179986749982, "learning_rate": 5.931624324314028e-07, "logits/chosen": -0.96484375, "logits/rejected": -0.41015625, "logps/chosen": -492.0, "logps/rejected": -564.0, "loss": 0.0267, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -21.25, "rewards/margins": 11.1875, "rewards/rejected": -32.5, "step": 18190 }, { "epoch": 1.3137948458817585, "grad_norm": 11.92910559807403, "learning_rate": 5.929994533288809e-07, "logits/chosen": -0.9453125, "logits/rejected": -0.4140625, "logps/chosen": -496.0, "logps/rejected": -568.0, "loss": 0.0501, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -20.375, "rewards/margins": 11.5625, "rewards/rejected": -31.875, "step": 18200 }, { "epoch": 1.3145167111816936, "grad_norm": 12.2787796533799, "learning_rate": 5.928366084944692e-07, "logits/chosen": -0.93359375, "logits/rejected": -0.373046875, "logps/chosen": -492.0, "logps/rejected": -580.0, "loss": 0.0464, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -21.0, "rewards/margins": 11.9375, "rewards/rejected": -33.0, "step": 18210 }, { "epoch": 1.3152385764816286, "grad_norm": 1.9025362800431016, "learning_rate": 5.926738977439106e-07, "logits/chosen": -0.92578125, "logits/rejected": -0.3125, "logps/chosen": -474.0, "logps/rejected": -572.0, "loss": 0.0349, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -20.75, "rewards/margins": 11.875, "rewards/rejected": -32.5, "step": 18220 }, { "epoch": 1.3159604417815636, "grad_norm": 4.174095831548478, "learning_rate": 5.925113208933017e-07, "logits/chosen": -0.84765625, "logits/rejected": -0.33984375, "logps/chosen": -466.0, "logps/rejected": -548.0, "loss": 0.0339, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -19.25, "rewards/margins": 11.0, "rewards/rejected": -30.25, "step": 18230 }, { "epoch": 1.3166823070814986, "grad_norm": 3.193354962484017, "learning_rate": 5.923488777590923e-07, "logits/chosen": -1.0625, "logits/rejected": -0.453125, "logps/chosen": -458.0, "logps/rejected": -548.0, "loss": 0.045, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -18.75, "rewards/margins": 12.25, "rewards/rejected": -31.0, "step": 18240 }, { "epoch": 1.3174041723814336, "grad_norm": 3.4462289514362747, "learning_rate": 5.921865681580842e-07, "logits/chosen": -0.85546875, "logits/rejected": -0.291015625, "logps/chosen": -502.0, "logps/rejected": -564.0, "loss": 0.0378, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -21.125, "rewards/margins": 11.0625, "rewards/rejected": -32.0, "step": 18250 }, { "epoch": 1.3181260376813686, "grad_norm": 4.409285259610101, "learning_rate": 5.920243919074303e-07, "logits/chosen": -0.96875, "logits/rejected": -0.322265625, "logps/chosen": -504.0, "logps/rejected": -584.0, "loss": 0.0328, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -20.75, "rewards/margins": 12.0625, "rewards/rejected": -32.75, "step": 18260 }, { "epoch": 1.3188479029813036, "grad_norm": 5.621671479645623, "learning_rate": 5.918623488246337e-07, "logits/chosen": -1.015625, "logits/rejected": -0.375, "logps/chosen": -458.0, "logps/rejected": -556.0, "loss": 0.0375, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -19.125, "rewards/margins": 11.625, "rewards/rejected": -30.625, "step": 18270 }, { "epoch": 1.3195697682812386, "grad_norm": 7.133045278818336, "learning_rate": 5.917004387275473e-07, "logits/chosen": -1.09375, "logits/rejected": -0.25390625, "logps/chosen": -488.0, "logps/rejected": -520.0, "loss": 0.0424, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -19.25, "rewards/margins": 11.375, "rewards/rejected": -30.75, "step": 18280 }, { "epoch": 1.3202916335811739, "grad_norm": 12.748502903972742, "learning_rate": 5.915386614343725e-07, "logits/chosen": -0.89453125, "logits/rejected": -0.1943359375, "logps/chosen": -480.0, "logps/rejected": -556.0, "loss": 0.062, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -19.375, "rewards/margins": 11.9375, "rewards/rejected": -31.25, "step": 18290 }, { "epoch": 1.3210134988811089, "grad_norm": 9.524431917179674, "learning_rate": 5.913770167636582e-07, "logits/chosen": -0.9296875, "logits/rejected": -0.443359375, "logps/chosen": -502.0, "logps/rejected": -584.0, "loss": 0.0288, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -20.125, "rewards/margins": 11.5, "rewards/rejected": -31.625, "step": 18300 }, { "epoch": 1.3217353641810439, "grad_norm": 8.997896277471618, "learning_rate": 5.912155045343007e-07, "logits/chosen": -0.63671875, "logits/rejected": -0.2119140625, "logps/chosen": -476.0, "logps/rejected": -548.0, "loss": 0.029, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -19.75, "rewards/margins": 11.5625, "rewards/rejected": -31.375, "step": 18310 }, { "epoch": 1.3224572294809789, "grad_norm": 6.103336457325073, "learning_rate": 5.910541245655417e-07, "logits/chosen": -1.125, "logits/rejected": -0.44921875, "logps/chosen": -480.0, "logps/rejected": -544.0, "loss": 0.0383, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -19.875, "rewards/margins": 11.3125, "rewards/rejected": -31.25, "step": 18320 }, { "epoch": 1.323179094780914, "grad_norm": 10.92694610512437, "learning_rate": 5.908928766769686e-07, "logits/chosen": -0.765625, "logits/rejected": -0.322265625, "logps/chosen": -466.0, "logps/rejected": -532.0, "loss": 0.0498, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -20.25, "rewards/margins": 10.875, "rewards/rejected": -31.125, "step": 18330 }, { "epoch": 1.323900960080849, "grad_norm": 3.1326588106214737, "learning_rate": 5.907317606885129e-07, "logits/chosen": -0.93359375, "logits/rejected": -0.34765625, "logps/chosen": -462.0, "logps/rejected": -560.0, "loss": 0.0444, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -18.75, "rewards/margins": 11.875, "rewards/rejected": -30.625, "step": 18340 }, { "epoch": 1.324622825380784, "grad_norm": 9.995815908477859, "learning_rate": 5.905707764204498e-07, "logits/chosen": -1.0234375, "logits/rejected": -0.30078125, "logps/chosen": -492.0, "logps/rejected": -568.0, "loss": 0.0356, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -20.0, "rewards/margins": 11.375, "rewards/rejected": -31.375, "step": 18350 }, { "epoch": 1.325344690680719, "grad_norm": 9.213365500999295, "learning_rate": 5.904099236933968e-07, "logits/chosen": -0.96875, "logits/rejected": -0.439453125, "logps/chosen": -470.0, "logps/rejected": -536.0, "loss": 0.0413, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -19.375, "rewards/margins": 10.625, "rewards/rejected": -30.0, "step": 18360 }, { "epoch": 1.326066555980654, "grad_norm": 5.54184710784043, "learning_rate": 5.902492023283137e-07, "logits/chosen": -0.88671875, "logits/rejected": -0.40234375, "logps/chosen": -494.0, "logps/rejected": -568.0, "loss": 0.0381, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -20.625, "rewards/margins": 11.0, "rewards/rejected": -31.75, "step": 18370 }, { "epoch": 1.3267884212805892, "grad_norm": 8.813093921064496, "learning_rate": 5.90088612146501e-07, "logits/chosen": -0.9609375, "logits/rejected": -0.34765625, "logps/chosen": -486.0, "logps/rejected": -552.0, "loss": 0.0349, "rewards/accuracies": 1.0, "rewards/chosen": -19.25, "rewards/margins": 11.0, "rewards/rejected": -30.25, "step": 18380 }, { "epoch": 1.327510286580524, "grad_norm": 6.65154388583465, "learning_rate": 5.899281529695992e-07, "logits/chosen": -0.9296875, "logits/rejected": -0.51171875, "logps/chosen": -470.0, "logps/rejected": -564.0, "loss": 0.0271, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -20.625, "rewards/margins": 11.25, "rewards/rejected": -31.875, "step": 18390 }, { "epoch": 1.3282321518804592, "grad_norm": 11.54871456144754, "learning_rate": 5.897678246195885e-07, "logits/chosen": -0.8671875, "logits/rejected": -0.212890625, "logps/chosen": -474.0, "logps/rejected": -548.0, "loss": 0.037, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -20.5, "rewards/margins": 11.5625, "rewards/rejected": -32.0, "step": 18400 }, { "epoch": 1.3289540171803942, "grad_norm": 6.913537156693515, "learning_rate": 5.896076269187873e-07, "logits/chosen": -0.8984375, "logits/rejected": -0.310546875, "logps/chosen": -498.0, "logps/rejected": -576.0, "loss": 0.0259, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -21.625, "rewards/margins": 11.125, "rewards/rejected": -32.75, "step": 18410 }, { "epoch": 1.3296758824803292, "grad_norm": 7.628157267812419, "learning_rate": 5.894475596898518e-07, "logits/chosen": -0.8359375, "logits/rejected": -0.267578125, "logps/chosen": -524.0, "logps/rejected": -600.0, "loss": 0.0454, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -23.625, "rewards/margins": 11.0, "rewards/rejected": -34.75, "step": 18420 }, { "epoch": 1.3303977477802642, "grad_norm": 8.650333755087043, "learning_rate": 5.892876227557749e-07, "logits/chosen": -0.78515625, "logits/rejected": -0.40234375, "logps/chosen": -508.0, "logps/rejected": -592.0, "loss": 0.0357, "rewards/accuracies": 1.0, "rewards/chosen": -21.625, "rewards/margins": 12.5625, "rewards/rejected": -34.25, "step": 18430 }, { "epoch": 1.3311196130801992, "grad_norm": 2.480478066039619, "learning_rate": 5.891278159398855e-07, "logits/chosen": -0.80078125, "logits/rejected": -0.275390625, "logps/chosen": -536.0, "logps/rejected": -612.0, "loss": 0.0387, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -24.375, "rewards/margins": 11.4375, "rewards/rejected": -35.75, "step": 18440 }, { "epoch": 1.3318414783801342, "grad_norm": 2.2778812517858573, "learning_rate": 5.889681390658482e-07, "logits/chosen": -0.79296875, "logits/rejected": -0.25, "logps/chosen": -486.0, "logps/rejected": -568.0, "loss": 0.0222, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -21.875, "rewards/margins": 11.1875, "rewards/rejected": -33.0, "step": 18450 }, { "epoch": 1.3325633436800692, "grad_norm": 5.75412068898989, "learning_rate": 5.888085919576612e-07, "logits/chosen": -0.8046875, "logits/rejected": -0.28515625, "logps/chosen": -520.0, "logps/rejected": -588.0, "loss": 0.0314, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -21.875, "rewards/margins": 11.5, "rewards/rejected": -33.5, "step": 18460 }, { "epoch": 1.3332852089800045, "grad_norm": 2.6762802279408877, "learning_rate": 5.886491744396568e-07, "logits/chosen": -0.74609375, "logits/rejected": -0.2041015625, "logps/chosen": -532.0, "logps/rejected": -600.0, "loss": 0.0307, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -23.25, "rewards/margins": 10.8125, "rewards/rejected": -34.25, "step": 18470 }, { "epoch": 1.3340070742799393, "grad_norm": 11.588176601317507, "learning_rate": 5.884898863364996e-07, "logits/chosen": -0.921875, "logits/rejected": -0.36328125, "logps/chosen": -506.0, "logps/rejected": -604.0, "loss": 0.0421, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -23.5, "rewards/margins": 11.5625, "rewards/rejected": -35.0, "step": 18480 }, { "epoch": 1.3347289395798745, "grad_norm": 3.371383110938856, "learning_rate": 5.883307274731868e-07, "logits/chosen": -0.91796875, "logits/rejected": -0.322265625, "logps/chosen": -500.0, "logps/rejected": -552.0, "loss": 0.0319, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -22.25, "rewards/margins": 11.625, "rewards/rejected": -34.0, "step": 18490 }, { "epoch": 1.3354508048798095, "grad_norm": 9.71747344161264, "learning_rate": 5.881716976750462e-07, "logits/chosen": -0.9140625, "logits/rejected": -0.275390625, "logps/chosen": -512.0, "logps/rejected": -564.0, "loss": 0.0296, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -22.0, "rewards/margins": 10.5, "rewards/rejected": -32.5, "step": 18500 }, { "epoch": 1.3361726701797445, "grad_norm": 6.975798804695383, "learning_rate": 5.88012796767736e-07, "logits/chosen": -0.83984375, "logits/rejected": -0.28515625, "logps/chosen": -488.0, "logps/rejected": -560.0, "loss": 0.0402, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -21.375, "rewards/margins": 10.625, "rewards/rejected": -32.0, "step": 18510 }, { "epoch": 1.3368945354796795, "grad_norm": 1.6702951804243298, "learning_rate": 5.878540245772441e-07, "logits/chosen": -0.78515625, "logits/rejected": -0.279296875, "logps/chosen": -524.0, "logps/rejected": -608.0, "loss": 0.0293, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -22.5, "rewards/margins": 11.25, "rewards/rejected": -33.75, "step": 18520 }, { "epoch": 1.3376164007796145, "grad_norm": 3.1004230240223696, "learning_rate": 5.876953809298869e-07, "logits/chosen": -0.9296875, "logits/rejected": -0.3203125, "logps/chosen": -468.0, "logps/rejected": -560.0, "loss": 0.036, "rewards/accuracies": 1.0, "rewards/chosen": -21.25, "rewards/margins": 11.75, "rewards/rejected": -33.0, "step": 18530 }, { "epoch": 1.3383382660795495, "grad_norm": 7.57271867192475, "learning_rate": 5.87536865652309e-07, "logits/chosen": -0.9921875, "logits/rejected": -0.34765625, "logps/chosen": -524.0, "logps/rejected": -592.0, "loss": 0.0365, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -21.875, "rewards/margins": 11.5625, "rewards/rejected": -33.5, "step": 18540 }, { "epoch": 1.3390601313794845, "grad_norm": 2.8632894989818554, "learning_rate": 5.873784785714818e-07, "logits/chosen": -0.97265625, "logits/rejected": -0.5078125, "logps/chosen": -508.0, "logps/rejected": -580.0, "loss": 0.0363, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -22.5, "rewards/margins": 11.4375, "rewards/rejected": -33.75, "step": 18550 }, { "epoch": 1.3397819966794196, "grad_norm": 6.570744666237716, "learning_rate": 5.872202195147034e-07, "logits/chosen": -1.03125, "logits/rejected": -0.2373046875, "logps/chosen": -510.0, "logps/rejected": -604.0, "loss": 0.0384, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -23.375, "rewards/margins": 12.3125, "rewards/rejected": -35.5, "step": 18560 }, { "epoch": 1.3405038619793546, "grad_norm": 10.680400807683553, "learning_rate": 5.870620883095973e-07, "logits/chosen": -1.125, "logits/rejected": -0.44140625, "logps/chosen": -510.0, "logps/rejected": -580.0, "loss": 0.0496, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -22.25, "rewards/margins": 11.4375, "rewards/rejected": -33.75, "step": 18570 }, { "epoch": 1.3412257272792898, "grad_norm": 1.6441760736804727, "learning_rate": 5.869040847841115e-07, "logits/chosen": -0.94921875, "logits/rejected": -0.26953125, "logps/chosen": -512.0, "logps/rejected": -592.0, "loss": 0.0473, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -23.75, "rewards/margins": 11.3125, "rewards/rejected": -35.0, "step": 18580 }, { "epoch": 1.3419475925792246, "grad_norm": 8.903244309657156, "learning_rate": 5.867462087665184e-07, "logits/chosen": -0.859375, "logits/rejected": -0.2890625, "logps/chosen": -524.0, "logps/rejected": -604.0, "loss": 0.0386, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -23.125, "rewards/margins": 12.25, "rewards/rejected": -35.5, "step": 18590 }, { "epoch": 1.3426694578791598, "grad_norm": 2.4873068111057415, "learning_rate": 5.865884600854132e-07, "logits/chosen": -0.91796875, "logits/rejected": -0.376953125, "logps/chosen": -486.0, "logps/rejected": -572.0, "loss": 0.0268, "rewards/accuracies": 1.0, "rewards/chosen": -21.5, "rewards/margins": 11.3125, "rewards/rejected": -32.75, "step": 18600 }, { "epoch": 1.3433913231790948, "grad_norm": 9.09312049589483, "learning_rate": 5.864308385697138e-07, "logits/chosen": -1.0234375, "logits/rejected": -0.416015625, "logps/chosen": -472.0, "logps/rejected": -572.0, "loss": 0.0541, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -20.125, "rewards/margins": 10.8125, "rewards/rejected": -31.0, "step": 18610 }, { "epoch": 1.3441131884790298, "grad_norm": 14.497255813548692, "learning_rate": 5.862733440486595e-07, "logits/chosen": -1.1640625, "logits/rejected": -0.29296875, "logps/chosen": -474.0, "logps/rejected": -544.0, "loss": 0.0399, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -19.75, "rewards/margins": 11.75, "rewards/rejected": -31.5, "step": 18620 }, { "epoch": 1.3448350537789648, "grad_norm": 5.771509493420207, "learning_rate": 5.861159763518106e-07, "logits/chosen": -1.0390625, "logits/rejected": -0.427734375, "logps/chosen": -466.0, "logps/rejected": -548.0, "loss": 0.0368, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -19.875, "rewards/margins": 11.0, "rewards/rejected": -30.875, "step": 18630 }, { "epoch": 1.3455569190788998, "grad_norm": 9.224502923183744, "learning_rate": 5.859587353090476e-07, "logits/chosen": -1.1328125, "logits/rejected": -0.4140625, "logps/chosen": -488.0, "logps/rejected": -564.0, "loss": 0.0572, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -21.5, "rewards/margins": 11.0625, "rewards/rejected": -32.5, "step": 18640 }, { "epoch": 1.3462787843788349, "grad_norm": 4.559256187215061, "learning_rate": 5.858016207505699e-07, "logits/chosen": -0.9765625, "logits/rejected": -0.359375, "logps/chosen": -466.0, "logps/rejected": -556.0, "loss": 0.0459, "rewards/accuracies": 1.0, "rewards/chosen": -19.625, "rewards/margins": 11.1875, "rewards/rejected": -30.875, "step": 18650 }, { "epoch": 1.3470006496787699, "grad_norm": 10.054534567796004, "learning_rate": 5.856446325068959e-07, "logits/chosen": -0.953125, "logits/rejected": -0.2451171875, "logps/chosen": -458.0, "logps/rejected": -536.0, "loss": 0.0483, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -19.0, "rewards/margins": 10.6875, "rewards/rejected": -29.625, "step": 18660 }, { "epoch": 1.347722514978705, "grad_norm": 7.199078126968941, "learning_rate": 5.854877704088614e-07, "logits/chosen": -0.9609375, "logits/rejected": -0.4140625, "logps/chosen": -464.0, "logps/rejected": -540.0, "loss": 0.0366, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -18.25, "rewards/margins": 12.25, "rewards/rejected": -30.5, "step": 18670 }, { "epoch": 1.3484443802786399, "grad_norm": 7.0483166239656, "learning_rate": 5.853310342876193e-07, "logits/chosen": -1.109375, "logits/rejected": -0.5234375, "logps/chosen": -470.0, "logps/rejected": -548.0, "loss": 0.0342, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -19.625, "rewards/margins": 11.625, "rewards/rejected": -31.25, "step": 18680 }, { "epoch": 1.3491662455785751, "grad_norm": 4.892167827440163, "learning_rate": 5.851744239746388e-07, "logits/chosen": -0.9765625, "logits/rejected": -0.5703125, "logps/chosen": -478.0, "logps/rejected": -528.0, "loss": 0.0355, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -19.375, "rewards/margins": 9.8125, "rewards/rejected": -29.25, "step": 18690 }, { "epoch": 1.3498881108785101, "grad_norm": 4.075585346487948, "learning_rate": 5.850179393017045e-07, "logits/chosen": -1.171875, "logits/rejected": -0.421875, "logps/chosen": -472.0, "logps/rejected": -532.0, "loss": 0.0293, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -18.625, "rewards/margins": 10.6875, "rewards/rejected": -29.375, "step": 18700 }, { "epoch": 1.3506099761784451, "grad_norm": 6.362928222077973, "learning_rate": 5.848615801009158e-07, "logits/chosen": -0.984375, "logits/rejected": -0.48828125, "logps/chosen": -476.0, "logps/rejected": -544.0, "loss": 0.0554, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -18.75, "rewards/margins": 11.375, "rewards/rejected": -30.125, "step": 18710 }, { "epoch": 1.3513318414783801, "grad_norm": 7.871197285309995, "learning_rate": 5.847053462046862e-07, "logits/chosen": -0.86328125, "logits/rejected": -0.349609375, "logps/chosen": -466.0, "logps/rejected": -544.0, "loss": 0.0355, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -19.5, "rewards/margins": 11.5625, "rewards/rejected": -31.0, "step": 18720 }, { "epoch": 1.3520537067783152, "grad_norm": 2.327155239439537, "learning_rate": 5.845492374457418e-07, "logits/chosen": -0.88671875, "logits/rejected": -0.369140625, "logps/chosen": -476.0, "logps/rejected": -556.0, "loss": 0.0387, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -19.25, "rewards/margins": 11.4375, "rewards/rejected": -30.625, "step": 18730 }, { "epoch": 1.3527755720782502, "grad_norm": 3.3928628183350305, "learning_rate": 5.843932536571219e-07, "logits/chosen": -0.9453125, "logits/rejected": -0.384765625, "logps/chosen": -454.0, "logps/rejected": -576.0, "loss": 0.0379, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -19.75, "rewards/margins": 11.9375, "rewards/rejected": -31.75, "step": 18740 }, { "epoch": 1.3534974373781852, "grad_norm": 5.085989351283831, "learning_rate": 5.842373946721771e-07, "logits/chosen": -1.140625, "logits/rejected": -0.50390625, "logps/chosen": -494.0, "logps/rejected": -592.0, "loss": 0.033, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -19.0, "rewards/margins": 12.1875, "rewards/rejected": -31.125, "step": 18750 }, { "epoch": 1.3542193026781202, "grad_norm": 1.5494192373181133, "learning_rate": 5.840816603245691e-07, "logits/chosen": -1.03125, "logits/rejected": -0.357421875, "logps/chosen": -488.0, "logps/rejected": -568.0, "loss": 0.0155, "rewards/accuracies": 1.0, "rewards/chosen": -21.625, "rewards/margins": 10.625, "rewards/rejected": -32.25, "step": 18760 }, { "epoch": 1.3549411679780552, "grad_norm": 15.57648877599964, "learning_rate": 5.839260504482696e-07, "logits/chosen": -1.0390625, "logits/rejected": -0.431640625, "logps/chosen": -482.0, "logps/rejected": -572.0, "loss": 0.0373, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -19.875, "rewards/margins": 12.4375, "rewards/rejected": -32.25, "step": 18770 }, { "epoch": 1.3556630332779904, "grad_norm": 8.613902185064118, "learning_rate": 5.837705648775598e-07, "logits/chosen": -1.03125, "logits/rejected": -0.3359375, "logps/chosen": -506.0, "logps/rejected": -572.0, "loss": 0.0306, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -20.75, "rewards/margins": 12.0, "rewards/rejected": -32.75, "step": 18780 }, { "epoch": 1.3563848985779254, "grad_norm": 8.653854539308893, "learning_rate": 5.836152034470301e-07, "logits/chosen": -1.078125, "logits/rejected": -0.4296875, "logps/chosen": -480.0, "logps/rejected": -556.0, "loss": 0.0308, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -22.125, "rewards/margins": 10.75, "rewards/rejected": -32.75, "step": 18790 }, { "epoch": 1.3571067638778604, "grad_norm": 8.54350388398553, "learning_rate": 5.834599659915782e-07, "logits/chosen": -1.0703125, "logits/rejected": -0.486328125, "logps/chosen": -490.0, "logps/rejected": -568.0, "loss": 0.0342, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -21.875, "rewards/margins": 12.0, "rewards/rejected": -33.75, "step": 18800 }, { "epoch": 1.3578286291777955, "grad_norm": 7.693519151332788, "learning_rate": 5.833048523464095e-07, "logits/chosen": -1.1015625, "logits/rejected": -0.41015625, "logps/chosen": -504.0, "logps/rejected": -576.0, "loss": 0.047, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -21.75, "rewards/margins": 12.1875, "rewards/rejected": -34.0, "step": 18810 }, { "epoch": 1.3585504944777305, "grad_norm": 6.241778416917968, "learning_rate": 5.831498623470357e-07, "logits/chosen": -1.234375, "logits/rejected": -0.44140625, "logps/chosen": -488.0, "logps/rejected": -556.0, "loss": 0.0383, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -21.25, "rewards/margins": 10.875, "rewards/rejected": -32.0, "step": 18820 }, { "epoch": 1.3592723597776655, "grad_norm": 7.354927157757841, "learning_rate": 5.829949958292743e-07, "logits/chosen": -1.0546875, "logits/rejected": -0.369140625, "logps/chosen": -480.0, "logps/rejected": -560.0, "loss": 0.028, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -20.125, "rewards/margins": 11.0625, "rewards/rejected": -31.25, "step": 18830 }, { "epoch": 1.3599942250776005, "grad_norm": 5.677167258473226, "learning_rate": 5.828402526292479e-07, "logits/chosen": -1.15625, "logits/rejected": -0.54296875, "logps/chosen": -476.0, "logps/rejected": -556.0, "loss": 0.0221, "rewards/accuracies": 1.0, "rewards/chosen": -19.5, "rewards/margins": 11.5625, "rewards/rejected": -31.125, "step": 18840 }, { "epoch": 1.3607160903775355, "grad_norm": 3.4308299512970355, "learning_rate": 5.826856325833838e-07, "logits/chosen": -1.0078125, "logits/rejected": -0.46484375, "logps/chosen": -472.0, "logps/rejected": -576.0, "loss": 0.0439, "rewards/accuracies": 1.0, "rewards/chosen": -19.125, "rewards/margins": 11.9375, "rewards/rejected": -31.125, "step": 18850 }, { "epoch": 1.3614379556774705, "grad_norm": 11.118840451290275, "learning_rate": 5.825311355284121e-07, "logits/chosen": -1.171875, "logits/rejected": -0.455078125, "logps/chosen": -516.0, "logps/rejected": -576.0, "loss": 0.0358, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -19.875, "rewards/margins": 11.9375, "rewards/rejected": -31.75, "step": 18860 }, { "epoch": 1.3621598209774057, "grad_norm": 3.22295422897076, "learning_rate": 5.823767613013663e-07, "logits/chosen": -1.3046875, "logits/rejected": -0.56640625, "logps/chosen": -458.0, "logps/rejected": -540.0, "loss": 0.0368, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -17.875, "rewards/margins": 11.0625, "rewards/rejected": -29.0, "step": 18870 }, { "epoch": 1.3628816862773405, "grad_norm": 8.352612433614787, "learning_rate": 5.82222509739582e-07, "logits/chosen": -1.15625, "logits/rejected": -0.54296875, "logps/chosen": -480.0, "logps/rejected": -548.0, "loss": 0.0268, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -18.125, "rewards/margins": 12.3125, "rewards/rejected": -30.5, "step": 18880 }, { "epoch": 1.3636035515772758, "grad_norm": 6.576113453990731, "learning_rate": 5.820683806806961e-07, "logits/chosen": -1.125, "logits/rejected": -0.56640625, "logps/chosen": -450.0, "logps/rejected": -540.0, "loss": 0.0358, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -18.375, "rewards/margins": 11.5625, "rewards/rejected": -30.0, "step": 18890 }, { "epoch": 1.3643254168772108, "grad_norm": 1.3076370131539288, "learning_rate": 5.819143739626463e-07, "logits/chosen": -1.0390625, "logits/rejected": -0.5234375, "logps/chosen": -456.0, "logps/rejected": -512.0, "loss": 0.0326, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -18.375, "rewards/margins": 10.4375, "rewards/rejected": -28.75, "step": 18900 }, { "epoch": 1.3650472821771458, "grad_norm": 7.832863672277581, "learning_rate": 5.817604894236704e-07, "logits/chosen": -1.171875, "logits/rejected": -0.52734375, "logps/chosen": -484.0, "logps/rejected": -548.0, "loss": 0.0484, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -18.875, "rewards/margins": 10.75, "rewards/rejected": -29.625, "step": 18910 }, { "epoch": 1.3657691474770808, "grad_norm": 11.681632000994707, "learning_rate": 5.816067269023052e-07, "logits/chosen": -1.03125, "logits/rejected": -0.62109375, "logps/chosen": -468.0, "logps/rejected": -544.0, "loss": 0.035, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -18.125, "rewards/margins": 11.0, "rewards/rejected": -29.25, "step": 18920 }, { "epoch": 1.3664910127770158, "grad_norm": 6.541163944906956, "learning_rate": 5.814530862373863e-07, "logits/chosen": -1.125, "logits/rejected": -0.55859375, "logps/chosen": -468.0, "logps/rejected": -556.0, "loss": 0.0535, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -19.625, "rewards/margins": 11.125, "rewards/rejected": -30.75, "step": 18930 }, { "epoch": 1.3672128780769508, "grad_norm": 6.902992550949027, "learning_rate": 5.812995672680471e-07, "logits/chosen": -0.984375, "logits/rejected": -0.458984375, "logps/chosen": -428.0, "logps/rejected": -524.0, "loss": 0.0316, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -17.5, "rewards/margins": 11.125, "rewards/rejected": -28.625, "step": 18940 }, { "epoch": 1.3679347433768858, "grad_norm": 6.094267674068026, "learning_rate": 5.811461698337183e-07, "logits/chosen": -1.1875, "logits/rejected": -0.5, "logps/chosen": -480.0, "logps/rejected": -528.0, "loss": 0.0415, "rewards/accuracies": 1.0, "rewards/chosen": -19.125, "rewards/margins": 10.6875, "rewards/rejected": -29.75, "step": 18950 }, { "epoch": 1.368656608676821, "grad_norm": 9.991898744603994, "learning_rate": 5.809928937741268e-07, "logits/chosen": -1.125, "logits/rejected": -0.6015625, "logps/chosen": -504.0, "logps/rejected": -580.0, "loss": 0.0344, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -19.5, "rewards/margins": 11.625, "rewards/rejected": -31.125, "step": 18960 }, { "epoch": 1.3693784739767558, "grad_norm": 6.143202701554489, "learning_rate": 5.808397389292952e-07, "logits/chosen": -0.87890625, "logits/rejected": -0.32421875, "logps/chosen": -462.0, "logps/rejected": -544.0, "loss": 0.0426, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -19.25, "rewards/margins": 10.5625, "rewards/rejected": -29.75, "step": 18970 }, { "epoch": 1.370100339276691, "grad_norm": 11.662297295961356, "learning_rate": 5.806867051395413e-07, "logits/chosen": -0.8984375, "logits/rejected": -0.4921875, "logps/chosen": -482.0, "logps/rejected": -544.0, "loss": 0.0396, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -19.125, "rewards/margins": 11.25, "rewards/rejected": -30.375, "step": 18980 }, { "epoch": 1.370822204576626, "grad_norm": 5.901840941074001, "learning_rate": 5.805337922454774e-07, "logits/chosen": -1.046875, "logits/rejected": -0.328125, "logps/chosen": -486.0, "logps/rejected": -576.0, "loss": 0.0367, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -20.625, "rewards/margins": 11.9375, "rewards/rejected": -32.5, "step": 18990 }, { "epoch": 1.371544069876561, "grad_norm": 7.711104629181514, "learning_rate": 5.803810000880093e-07, "logits/chosen": -0.95703125, "logits/rejected": -0.349609375, "logps/chosen": -472.0, "logps/rejected": -552.0, "loss": 0.0365, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -20.75, "rewards/margins": 11.4375, "rewards/rejected": -32.25, "step": 19000 }, { "epoch": 1.372265935176496, "grad_norm": 9.181780658011643, "learning_rate": 5.802283285083356e-07, "logits/chosen": -0.875, "logits/rejected": -0.333984375, "logps/chosen": -502.0, "logps/rejected": -596.0, "loss": 0.0552, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -21.875, "rewards/margins": 11.75, "rewards/rejected": -33.5, "step": 19010 }, { "epoch": 1.372987800476431, "grad_norm": 8.591164127425822, "learning_rate": 5.800757773479473e-07, "logits/chosen": -0.92578125, "logits/rejected": -0.380859375, "logps/chosen": -490.0, "logps/rejected": -576.0, "loss": 0.0404, "rewards/accuracies": 1.0, "rewards/chosen": -21.625, "rewards/margins": 11.3125, "rewards/rejected": -33.0, "step": 19020 }, { "epoch": 1.373709665776366, "grad_norm": 5.919029617297377, "learning_rate": 5.799233464486271e-07, "logits/chosen": -1.0078125, "logits/rejected": -0.35546875, "logps/chosen": -506.0, "logps/rejected": -580.0, "loss": 0.065, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -22.25, "rewards/margins": 11.5, "rewards/rejected": -33.75, "step": 19030 }, { "epoch": 1.3744315310763011, "grad_norm": 5.2181061293682065, "learning_rate": 5.797710356524484e-07, "logits/chosen": -0.8828125, "logits/rejected": -0.357421875, "logps/chosen": -536.0, "logps/rejected": -580.0, "loss": 0.0405, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -21.875, "rewards/margins": 11.3125, "rewards/rejected": -33.25, "step": 19040 }, { "epoch": 1.3751533963762361, "grad_norm": 8.240473514033022, "learning_rate": 5.796188448017747e-07, "logits/chosen": -0.9453125, "logits/rejected": -0.419921875, "logps/chosen": -496.0, "logps/rejected": -560.0, "loss": 0.0285, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -20.625, "rewards/margins": 11.125, "rewards/rejected": -31.875, "step": 19050 }, { "epoch": 1.3758752616761711, "grad_norm": 4.43361836665445, "learning_rate": 5.794667737392593e-07, "logits/chosen": -0.9140625, "logits/rejected": -0.3203125, "logps/chosen": -536.0, "logps/rejected": -608.0, "loss": 0.035, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -22.375, "rewards/margins": 12.375, "rewards/rejected": -34.75, "step": 19060 }, { "epoch": 1.3765971269761064, "grad_norm": 4.613302623874174, "learning_rate": 5.793148223078442e-07, "logits/chosen": -0.953125, "logits/rejected": -0.3984375, "logps/chosen": -478.0, "logps/rejected": -568.0, "loss": 0.0263, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -20.25, "rewards/margins": 12.25, "rewards/rejected": -32.5, "step": 19070 }, { "epoch": 1.3773189922760412, "grad_norm": 7.743014346334478, "learning_rate": 5.791629903507591e-07, "logits/chosen": -0.90625, "logits/rejected": -0.302734375, "logps/chosen": -480.0, "logps/rejected": -552.0, "loss": 0.0447, "rewards/accuracies": 0.96875, "rewards/chosen": -20.625, "rewards/margins": 11.0625, "rewards/rejected": -31.75, "step": 19080 }, { "epoch": 1.3780408575759764, "grad_norm": 4.811362180808535, "learning_rate": 5.79011277711522e-07, "logits/chosen": -0.98046875, "logits/rejected": -0.236328125, "logps/chosen": -508.0, "logps/rejected": -572.0, "loss": 0.0476, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -22.125, "rewards/margins": 11.625, "rewards/rejected": -33.75, "step": 19090 }, { "epoch": 1.3787627228759114, "grad_norm": 5.021861908542211, "learning_rate": 5.788596842339373e-07, "logits/chosen": -0.99609375, "logits/rejected": -0.34375, "logps/chosen": -472.0, "logps/rejected": -552.0, "loss": 0.032, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -19.125, "rewards/margins": 11.5625, "rewards/rejected": -30.75, "step": 19100 }, { "epoch": 1.3794845881758464, "grad_norm": 8.550351953266802, "learning_rate": 5.787082097620952e-07, "logits/chosen": -0.91796875, "logits/rejected": -0.35546875, "logps/chosen": -440.0, "logps/rejected": -520.0, "loss": 0.0586, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -17.625, "rewards/margins": 10.875, "rewards/rejected": -28.5, "step": 19110 }, { "epoch": 1.3802064534757814, "grad_norm": 11.577098722355004, "learning_rate": 5.785568541403717e-07, "logits/chosen": -0.95703125, "logits/rejected": -0.431640625, "logps/chosen": -448.0, "logps/rejected": -528.0, "loss": 0.0395, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -19.0, "rewards/margins": 10.625, "rewards/rejected": -29.625, "step": 19120 }, { "epoch": 1.3809283187757164, "grad_norm": 4.224612912877457, "learning_rate": 5.784056172134274e-07, "logits/chosen": -1.0078125, "logits/rejected": -0.57421875, "logps/chosen": -468.0, "logps/rejected": -544.0, "loss": 0.0396, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -18.75, "rewards/margins": 11.4375, "rewards/rejected": -30.25, "step": 19130 }, { "epoch": 1.3816501840756514, "grad_norm": 12.052601574496, "learning_rate": 5.782544988262072e-07, "logits/chosen": -1.015625, "logits/rejected": -0.359375, "logps/chosen": -464.0, "logps/rejected": -540.0, "loss": 0.0383, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -18.25, "rewards/margins": 11.625, "rewards/rejected": -29.875, "step": 19140 }, { "epoch": 1.3823720493755864, "grad_norm": 14.310790005473425, "learning_rate": 5.781034988239392e-07, "logits/chosen": -1.140625, "logits/rejected": -0.54296875, "logps/chosen": -478.0, "logps/rejected": -544.0, "loss": 0.0396, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -17.75, "rewards/margins": 11.375, "rewards/rejected": -29.125, "step": 19150 }, { "epoch": 1.3830939146755217, "grad_norm": 2.4869693498767935, "learning_rate": 5.779526170521345e-07, "logits/chosen": -1.03125, "logits/rejected": -0.5078125, "logps/chosen": -460.0, "logps/rejected": -528.0, "loss": 0.0384, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -18.5, "rewards/margins": 10.9375, "rewards/rejected": -29.375, "step": 19160 }, { "epoch": 1.3838157799754565, "grad_norm": 7.978163544004108, "learning_rate": 5.778018533565859e-07, "logits/chosen": -1.0625, "logits/rejected": -0.51953125, "logps/chosen": -476.0, "logps/rejected": -552.0, "loss": 0.0425, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -18.625, "rewards/margins": 11.375, "rewards/rejected": -30.0, "step": 19170 }, { "epoch": 1.3845376452753917, "grad_norm": 7.9872623603431085, "learning_rate": 5.77651207583368e-07, "logits/chosen": -0.87890625, "logits/rejected": -0.396484375, "logps/chosen": -520.0, "logps/rejected": -564.0, "loss": 0.0381, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -20.25, "rewards/margins": 10.625, "rewards/rejected": -30.875, "step": 19180 }, { "epoch": 1.3852595105753267, "grad_norm": 8.906572885808043, "learning_rate": 5.775006795788362e-07, "logits/chosen": -0.87109375, "logits/rejected": -0.18359375, "logps/chosen": -482.0, "logps/rejected": -540.0, "loss": 0.0275, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -20.125, "rewards/margins": 11.375, "rewards/rejected": -31.5, "step": 19190 }, { "epoch": 1.3859813758752617, "grad_norm": 3.0713993998655313, "learning_rate": 5.773502691896258e-07, "logits/chosen": -1.0625, "logits/rejected": -0.49609375, "logps/chosen": -502.0, "logps/rejected": -576.0, "loss": 0.0418, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -20.375, "rewards/margins": 11.0625, "rewards/rejected": -31.5, "step": 19200 }, { "epoch": 1.3867032411751967, "grad_norm": 3.2129343453434585, "learning_rate": 5.771999762626513e-07, "logits/chosen": -0.96484375, "logits/rejected": -0.625, "logps/chosen": -482.0, "logps/rejected": -552.0, "loss": 0.0291, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -20.125, "rewards/margins": 10.6875, "rewards/rejected": -30.875, "step": 19210 }, { "epoch": 1.3874251064751317, "grad_norm": 9.510800093890797, "learning_rate": 5.77049800645107e-07, "logits/chosen": -0.93359375, "logits/rejected": -0.416015625, "logps/chosen": -476.0, "logps/rejected": -568.0, "loss": 0.0545, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -20.375, "rewards/margins": 10.875, "rewards/rejected": -31.125, "step": 19220 }, { "epoch": 1.3881469717750667, "grad_norm": 10.535999438908634, "learning_rate": 5.768997421844641e-07, "logits/chosen": -1.0625, "logits/rejected": -0.48046875, "logps/chosen": -460.0, "logps/rejected": -520.0, "loss": 0.0395, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -18.25, "rewards/margins": 10.625, "rewards/rejected": -29.0, "step": 19230 }, { "epoch": 1.3888688370750017, "grad_norm": 5.03921269673966, "learning_rate": 5.767498007284723e-07, "logits/chosen": -1.0546875, "logits/rejected": -0.3203125, "logps/chosen": -450.0, "logps/rejected": -528.0, "loss": 0.0317, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -18.125, "rewards/margins": 10.625, "rewards/rejected": -28.75, "step": 19240 }, { "epoch": 1.3895907023749368, "grad_norm": 7.734725647103866, "learning_rate": 5.765999761251576e-07, "logits/chosen": -1.0, "logits/rejected": -0.47265625, "logps/chosen": -440.0, "logps/rejected": -520.0, "loss": 0.0407, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -17.625, "rewards/margins": 11.3125, "rewards/rejected": -28.875, "step": 19250 }, { "epoch": 1.3903125676748718, "grad_norm": 3.478337311249032, "learning_rate": 5.764502682228225e-07, "logits/chosen": -0.91796875, "logits/rejected": -0.490234375, "logps/chosen": -476.0, "logps/rejected": -560.0, "loss": 0.0364, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -19.25, "rewards/margins": 10.6875, "rewards/rejected": -30.0, "step": 19260 }, { "epoch": 1.391034432974807, "grad_norm": 8.14886114772802, "learning_rate": 5.763006768700448e-07, "logits/chosen": -1.09375, "logits/rejected": -0.5078125, "logps/chosen": -472.0, "logps/rejected": -564.0, "loss": 0.0467, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -19.125, "rewards/margins": 11.5625, "rewards/rejected": -30.75, "step": 19270 }, { "epoch": 1.391756298274742, "grad_norm": 9.411234934268835, "learning_rate": 5.761512019156773e-07, "logits/chosen": -0.85546875, "logits/rejected": -0.400390625, "logps/chosen": -470.0, "logps/rejected": -572.0, "loss": 0.043, "rewards/accuracies": 1.0, "rewards/chosen": -19.75, "rewards/margins": 11.4375, "rewards/rejected": -31.25, "step": 19280 }, { "epoch": 1.392478163574677, "grad_norm": 4.607169998603663, "learning_rate": 5.760018432088474e-07, "logits/chosen": -0.75390625, "logits/rejected": -0.28515625, "logps/chosen": -484.0, "logps/rejected": -572.0, "loss": 0.0401, "rewards/accuracies": 0.96875, "rewards/chosen": -21.5, "rewards/margins": 10.9375, "rewards/rejected": -32.5, "step": 19290 }, { "epoch": 1.393200028874612, "grad_norm": 5.0675684664677325, "learning_rate": 5.758526005989556e-07, "logits/chosen": -0.9609375, "logits/rejected": -0.337890625, "logps/chosen": -520.0, "logps/rejected": -564.0, "loss": 0.0248, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -22.125, "rewards/margins": 11.5625, "rewards/rejected": -33.75, "step": 19300 }, { "epoch": 1.393921894174547, "grad_norm": 6.402748080156138, "learning_rate": 5.757034739356758e-07, "logits/chosen": -1.0, "logits/rejected": -0.40234375, "logps/chosen": -524.0, "logps/rejected": -580.0, "loss": 0.0537, "rewards/accuracies": 1.0, "rewards/chosen": -21.875, "rewards/margins": 11.5, "rewards/rejected": -33.5, "step": 19310 }, { "epoch": 1.394643759474482, "grad_norm": 5.255767214246289, "learning_rate": 5.755544630689541e-07, "logits/chosen": -0.98828125, "logits/rejected": -0.447265625, "logps/chosen": -512.0, "logps/rejected": -592.0, "loss": 0.0403, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -21.875, "rewards/margins": 10.875, "rewards/rejected": -32.75, "step": 19320 }, { "epoch": 1.395365624774417, "grad_norm": 3.1914765748303684, "learning_rate": 5.754055678490085e-07, "logits/chosen": -0.96484375, "logits/rejected": -0.435546875, "logps/chosen": -500.0, "logps/rejected": -580.0, "loss": 0.0253, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -21.875, "rewards/margins": 11.0, "rewards/rejected": -33.0, "step": 19330 }, { "epoch": 1.396087490074352, "grad_norm": 3.441551814573671, "learning_rate": 5.752567881263278e-07, "logits/chosen": -0.88671875, "logits/rejected": -0.279296875, "logps/chosen": -486.0, "logps/rejected": -576.0, "loss": 0.0309, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -22.25, "rewards/margins": 11.0, "rewards/rejected": -33.25, "step": 19340 }, { "epoch": 1.396809355374287, "grad_norm": 12.90908219352459, "learning_rate": 5.751081237516715e-07, "logits/chosen": -1.0703125, "logits/rejected": -0.21875, "logps/chosen": -510.0, "logps/rejected": -588.0, "loss": 0.0385, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -22.75, "rewards/margins": 12.5625, "rewards/rejected": -35.5, "step": 19350 }, { "epoch": 1.3975312206742223, "grad_norm": 4.41048321879535, "learning_rate": 5.74959574576069e-07, "logits/chosen": -0.91015625, "logits/rejected": -0.455078125, "logps/chosen": -512.0, "logps/rejected": -592.0, "loss": 0.0541, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -22.0, "rewards/margins": 12.5, "rewards/rejected": -34.5, "step": 19360 }, { "epoch": 1.398253085974157, "grad_norm": 6.871068131756123, "learning_rate": 5.748111404508186e-07, "logits/chosen": -0.91796875, "logits/rejected": -0.345703125, "logps/chosen": -504.0, "logps/rejected": -572.0, "loss": 0.0431, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -21.75, "rewards/margins": 10.9375, "rewards/rejected": -32.75, "step": 19370 }, { "epoch": 1.3989749512740923, "grad_norm": 2.2516439525083674, "learning_rate": 5.746628212274873e-07, "logits/chosen": -0.9140625, "logits/rejected": -0.341796875, "logps/chosen": -484.0, "logps/rejected": -576.0, "loss": 0.0195, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -21.5, "rewards/margins": 10.875, "rewards/rejected": -32.5, "step": 19380 }, { "epoch": 1.3996968165740273, "grad_norm": 5.644379692808879, "learning_rate": 5.745146167579106e-07, "logits/chosen": -0.96875, "logits/rejected": -0.375, "logps/chosen": -476.0, "logps/rejected": -576.0, "loss": 0.0435, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -22.375, "rewards/margins": 11.25, "rewards/rejected": -33.75, "step": 19390 }, { "epoch": 1.4004186818739623, "grad_norm": 14.732060415130864, "learning_rate": 5.743665268941905e-07, "logits/chosen": -0.875, "logits/rejected": -0.369140625, "logps/chosen": -520.0, "logps/rejected": -596.0, "loss": 0.0521, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -22.625, "rewards/margins": 11.75, "rewards/rejected": -34.25, "step": 19400 }, { "epoch": 1.4011405471738974, "grad_norm": 10.000805265343347, "learning_rate": 5.742185514886961e-07, "logits/chosen": -0.9140625, "logits/rejected": -0.29296875, "logps/chosen": -516.0, "logps/rejected": -584.0, "loss": 0.034, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -22.375, "rewards/margins": 11.0625, "rewards/rejected": -33.25, "step": 19410 }, { "epoch": 1.4018624124738324, "grad_norm": 6.0441120076309645, "learning_rate": 5.740706903940628e-07, "logits/chosen": -0.9375, "logits/rejected": -0.291015625, "logps/chosen": -516.0, "logps/rejected": -556.0, "loss": 0.0355, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -22.0, "rewards/margins": 10.6875, "rewards/rejected": -32.75, "step": 19420 }, { "epoch": 1.4025842777737674, "grad_norm": 7.1802333934649685, "learning_rate": 5.739229434631911e-07, "logits/chosen": -1.078125, "logits/rejected": -0.44140625, "logps/chosen": -510.0, "logps/rejected": -588.0, "loss": 0.0365, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -23.0, "rewards/margins": 10.875, "rewards/rejected": -33.75, "step": 19430 }, { "epoch": 1.4033061430737024, "grad_norm": 4.784295438533627, "learning_rate": 5.737753105492469e-07, "logits/chosen": -0.86328125, "logits/rejected": -0.1591796875, "logps/chosen": -486.0, "logps/rejected": -568.0, "loss": 0.0292, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -20.5, "rewards/margins": 11.1875, "rewards/rejected": -31.625, "step": 19440 }, { "epoch": 1.4040280083736376, "grad_norm": 2.612950474543836, "learning_rate": 5.736277915056597e-07, "logits/chosen": -0.921875, "logits/rejected": -0.267578125, "logps/chosen": -496.0, "logps/rejected": -580.0, "loss": 0.0295, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -22.375, "rewards/margins": 12.0, "rewards/rejected": -34.5, "step": 19450 }, { "epoch": 1.4047498736735724, "grad_norm": 9.479101555816884, "learning_rate": 5.734803861861232e-07, "logits/chosen": -0.953125, "logits/rejected": -0.322265625, "logps/chosen": -536.0, "logps/rejected": -620.0, "loss": 0.0364, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -25.375, "rewards/margins": 12.0625, "rewards/rejected": -37.5, "step": 19460 }, { "epoch": 1.4054717389735076, "grad_norm": 9.544901907928612, "learning_rate": 5.733330944445938e-07, "logits/chosen": -1.1640625, "logits/rejected": -0.41796875, "logps/chosen": -524.0, "logps/rejected": -604.0, "loss": 0.0396, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -22.625, "rewards/margins": 12.75, "rewards/rejected": -35.5, "step": 19470 }, { "epoch": 1.4061936042734426, "grad_norm": 2.5420809158994824, "learning_rate": 5.731859161352903e-07, "logits/chosen": -0.92578125, "logits/rejected": -0.423828125, "logps/chosen": -512.0, "logps/rejected": -612.0, "loss": 0.0389, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -24.625, "rewards/margins": 12.1875, "rewards/rejected": -36.75, "step": 19480 }, { "epoch": 1.4069154695733777, "grad_norm": 7.854840033103349, "learning_rate": 5.730388511126939e-07, "logits/chosen": -0.94921875, "logits/rejected": -0.337890625, "logps/chosen": -496.0, "logps/rejected": -584.0, "loss": 0.0475, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -22.75, "rewards/margins": 11.5625, "rewards/rejected": -34.25, "step": 19490 }, { "epoch": 1.4076373348733127, "grad_norm": 10.506892464804357, "learning_rate": 5.728918992315463e-07, "logits/chosen": -0.90234375, "logits/rejected": -0.349609375, "logps/chosen": -536.0, "logps/rejected": -616.0, "loss": 0.0403, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -23.75, "rewards/margins": 11.125, "rewards/rejected": -34.75, "step": 19500 }, { "epoch": 1.4083592001732477, "grad_norm": 2.3756983675078875, "learning_rate": 5.727450603468501e-07, "logits/chosen": -1.0625, "logits/rejected": -0.40234375, "logps/chosen": -492.0, "logps/rejected": -572.0, "loss": 0.0276, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -21.75, "rewards/margins": 12.1875, "rewards/rejected": -34.0, "step": 19510 }, { "epoch": 1.4090810654731827, "grad_norm": 13.835091852425492, "learning_rate": 5.725983343138682e-07, "logits/chosen": -0.87109375, "logits/rejected": -0.248046875, "logps/chosen": -486.0, "logps/rejected": -572.0, "loss": 0.0506, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -22.125, "rewards/margins": 12.25, "rewards/rejected": -34.5, "step": 19520 }, { "epoch": 1.4098029307731177, "grad_norm": 12.561339281344464, "learning_rate": 5.724517209881224e-07, "logits/chosen": -0.9921875, "logits/rejected": -0.357421875, "logps/chosen": -510.0, "logps/rejected": -588.0, "loss": 0.046, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -21.25, "rewards/margins": 11.625, "rewards/rejected": -32.75, "step": 19530 }, { "epoch": 1.4105247960730527, "grad_norm": 3.992305791224224, "learning_rate": 5.723052202253938e-07, "logits/chosen": -0.9609375, "logits/rejected": -0.47265625, "logps/chosen": -502.0, "logps/rejected": -552.0, "loss": 0.0367, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -20.625, "rewards/margins": 10.1875, "rewards/rejected": -30.875, "step": 19540 }, { "epoch": 1.4112466613729877, "grad_norm": 8.572872676246893, "learning_rate": 5.721588318817212e-07, "logits/chosen": -0.98046875, "logits/rejected": -0.490234375, "logps/chosen": -480.0, "logps/rejected": -556.0, "loss": 0.0401, "rewards/accuracies": 0.96875, "rewards/chosen": -18.875, "rewards/margins": 11.0625, "rewards/rejected": -29.875, "step": 19550 }, { "epoch": 1.411968526672923, "grad_norm": 11.11011059553233, "learning_rate": 5.720125558134017e-07, "logits/chosen": -0.9375, "logits/rejected": -0.291015625, "logps/chosen": -474.0, "logps/rejected": -552.0, "loss": 0.032, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -19.5, "rewards/margins": 11.0, "rewards/rejected": -30.5, "step": 19560 }, { "epoch": 1.4126903919728577, "grad_norm": 8.933925893227146, "learning_rate": 5.718663918769888e-07, "logits/chosen": -1.0546875, "logits/rejected": -0.40625, "logps/chosen": -490.0, "logps/rejected": -556.0, "loss": 0.0207, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -19.5, "rewards/margins": 11.3125, "rewards/rejected": -30.75, "step": 19570 }, { "epoch": 1.413412257272793, "grad_norm": 5.640379172411014, "learning_rate": 5.717203399292928e-07, "logits/chosen": -0.921875, "logits/rejected": -0.26953125, "logps/chosen": -498.0, "logps/rejected": -568.0, "loss": 0.0474, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -21.875, "rewards/margins": 10.875, "rewards/rejected": -32.75, "step": 19580 }, { "epoch": 1.414134122572728, "grad_norm": 3.628429358764815, "learning_rate": 5.7157439982738e-07, "logits/chosen": -1.0390625, "logits/rejected": -0.453125, "logps/chosen": -508.0, "logps/rejected": -596.0, "loss": 0.0469, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -21.75, "rewards/margins": 11.5, "rewards/rejected": -33.25, "step": 19590 }, { "epoch": 1.414855987872663, "grad_norm": 6.7818717352158755, "learning_rate": 5.714285714285714e-07, "logits/chosen": -1.078125, "logits/rejected": -0.400390625, "logps/chosen": -474.0, "logps/rejected": -568.0, "loss": 0.0354, "rewards/accuracies": 0.96875, "rewards/chosen": -20.75, "rewards/margins": 11.4375, "rewards/rejected": -32.25, "step": 19600 }, { "epoch": 1.415577853172598, "grad_norm": 5.7683028107609005, "learning_rate": 5.712828545904434e-07, "logits/chosen": -1.0546875, "logits/rejected": -0.51953125, "logps/chosen": -498.0, "logps/rejected": -556.0, "loss": 0.0237, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -20.125, "rewards/margins": 11.125, "rewards/rejected": -31.25, "step": 19610 }, { "epoch": 1.416299718472533, "grad_norm": 3.0811199723594846, "learning_rate": 5.711372491708257e-07, "logits/chosen": -0.86328125, "logits/rejected": -0.24609375, "logps/chosen": -476.0, "logps/rejected": -548.0, "loss": 0.0282, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -19.125, "rewards/margins": 11.625, "rewards/rejected": -30.75, "step": 19620 }, { "epoch": 1.417021583772468, "grad_norm": 2.5311279214615117, "learning_rate": 5.709917550278023e-07, "logits/chosen": -0.99609375, "logits/rejected": -0.376953125, "logps/chosen": -492.0, "logps/rejected": -564.0, "loss": 0.0498, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -20.125, "rewards/margins": 12.0, "rewards/rejected": -32.25, "step": 19630 }, { "epoch": 1.417743449072403, "grad_norm": 10.606377268546982, "learning_rate": 5.7084637201971e-07, "logits/chosen": -0.98046875, "logits/rejected": -0.328125, "logps/chosen": -508.0, "logps/rejected": -612.0, "loss": 0.0306, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -22.5, "rewards/margins": 12.1875, "rewards/rejected": -34.75, "step": 19640 }, { "epoch": 1.4184653143723382, "grad_norm": 12.944657904484968, "learning_rate": 5.707011000051373e-07, "logits/chosen": -0.97265625, "logits/rejected": -0.5, "logps/chosen": -504.0, "logps/rejected": -580.0, "loss": 0.034, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -20.75, "rewards/margins": 11.6875, "rewards/rejected": -32.5, "step": 19650 }, { "epoch": 1.419187179672273, "grad_norm": 4.02588057730012, "learning_rate": 5.705559388429252e-07, "logits/chosen": -0.875, "logits/rejected": -0.40625, "logps/chosen": -508.0, "logps/rejected": -600.0, "loss": 0.0453, "rewards/accuracies": 1.0, "rewards/chosen": -22.0, "rewards/margins": 11.875, "rewards/rejected": -34.0, "step": 19660 }, { "epoch": 1.4199090449722083, "grad_norm": 2.8491760553408843, "learning_rate": 5.704108883921655e-07, "logits/chosen": -0.91796875, "logits/rejected": -0.1337890625, "logps/chosen": -498.0, "logps/rejected": -576.0, "loss": 0.0444, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -23.875, "rewards/margins": 10.9375, "rewards/rejected": -34.75, "step": 19670 }, { "epoch": 1.4206309102721433, "grad_norm": 9.726662446339125, "learning_rate": 5.70265948512201e-07, "logits/chosen": -1.03125, "logits/rejected": -0.361328125, "logps/chosen": -500.0, "logps/rejected": -580.0, "loss": 0.04, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -22.125, "rewards/margins": 12.125, "rewards/rejected": -34.25, "step": 19680 }, { "epoch": 1.4213527755720783, "grad_norm": 6.174720217811745, "learning_rate": 5.701211190626241e-07, "logits/chosen": -0.90625, "logits/rejected": -0.419921875, "logps/chosen": -524.0, "logps/rejected": -624.0, "loss": 0.0348, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -22.0, "rewards/margins": 12.4375, "rewards/rejected": -34.5, "step": 19690 }, { "epoch": 1.4220746408720133, "grad_norm": 2.3576406305594158, "learning_rate": 5.699763999032772e-07, "logits/chosen": -0.81640625, "logits/rejected": -0.279296875, "logps/chosen": -494.0, "logps/rejected": -576.0, "loss": 0.0597, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -21.75, "rewards/margins": 11.4375, "rewards/rejected": -33.25, "step": 19700 }, { "epoch": 1.4227965061719483, "grad_norm": 5.128743721191619, "learning_rate": 5.69831790894251e-07, "logits/chosen": -0.91015625, "logits/rejected": -0.33203125, "logps/chosen": -476.0, "logps/rejected": -564.0, "loss": 0.0361, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -20.0, "rewards/margins": 11.875, "rewards/rejected": -31.875, "step": 19710 }, { "epoch": 1.4235183714718833, "grad_norm": 5.514388071454609, "learning_rate": 5.69687291895885e-07, "logits/chosen": -0.97265625, "logits/rejected": -0.263671875, "logps/chosen": -496.0, "logps/rejected": -572.0, "loss": 0.0338, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -21.625, "rewards/margins": 11.25, "rewards/rejected": -33.0, "step": 19720 }, { "epoch": 1.4242402367718183, "grad_norm": 7.04644150615564, "learning_rate": 5.695429027687665e-07, "logits/chosen": -0.9453125, "logits/rejected": -0.353515625, "logps/chosen": -476.0, "logps/rejected": -552.0, "loss": 0.0268, "rewards/accuracies": 1.0, "rewards/chosen": -19.5, "rewards/margins": 12.0, "rewards/rejected": -31.5, "step": 19730 }, { "epoch": 1.4249621020717533, "grad_norm": 12.83458097859366, "learning_rate": 5.693986233737299e-07, "logits/chosen": -0.97265625, "logits/rejected": -0.39453125, "logps/chosen": -480.0, "logps/rejected": -560.0, "loss": 0.0344, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -20.625, "rewards/margins": 11.5625, "rewards/rejected": -32.25, "step": 19740 }, { "epoch": 1.4256839673716883, "grad_norm": 2.7631597218546022, "learning_rate": 5.692544535718559e-07, "logits/chosen": -1.1328125, "logits/rejected": -0.48046875, "logps/chosen": -516.0, "logps/rejected": -584.0, "loss": 0.0439, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -21.625, "rewards/margins": 12.4375, "rewards/rejected": -34.0, "step": 19750 }, { "epoch": 1.4264058326716236, "grad_norm": 5.827850811725189, "learning_rate": 5.691103932244722e-07, "logits/chosen": -1.109375, "logits/rejected": -0.40234375, "logps/chosen": -496.0, "logps/rejected": -568.0, "loss": 0.0412, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -20.875, "rewards/margins": 12.1875, "rewards/rejected": -33.0, "step": 19760 }, { "epoch": 1.4271276979715586, "grad_norm": 8.876365300724874, "learning_rate": 5.689664421931509e-07, "logits/chosen": -0.94921875, "logits/rejected": -0.46875, "logps/chosen": -470.0, "logps/rejected": -548.0, "loss": 0.052, "rewards/accuracies": 1.0, "rewards/chosen": -21.125, "rewards/margins": 11.375, "rewards/rejected": -32.5, "step": 19770 }, { "epoch": 1.4278495632714936, "grad_norm": 4.395323647583876, "learning_rate": 5.6882260033971e-07, "logits/chosen": -0.94921875, "logits/rejected": -0.376953125, "logps/chosen": -500.0, "logps/rejected": -568.0, "loss": 0.0287, "rewards/accuracies": 1.0, "rewards/chosen": -21.0, "rewards/margins": 11.125, "rewards/rejected": -32.0, "step": 19780 }, { "epoch": 1.4285714285714286, "grad_norm": 8.485441774121842, "learning_rate": 5.686788675262114e-07, "logits/chosen": -1.0390625, "logits/rejected": -0.349609375, "logps/chosen": -492.0, "logps/rejected": -556.0, "loss": 0.0263, "rewards/accuracies": 1.0, "rewards/chosen": -20.625, "rewards/margins": 11.1875, "rewards/rejected": -31.75, "step": 19790 }, { "epoch": 1.4292932938713636, "grad_norm": 3.535252169506904, "learning_rate": 5.685352436149611e-07, "logits/chosen": -0.9453125, "logits/rejected": -0.416015625, "logps/chosen": -500.0, "logps/rejected": -580.0, "loss": 0.0281, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -21.375, "rewards/margins": 12.0, "rewards/rejected": -33.5, "step": 19800 }, { "epoch": 1.4300151591712986, "grad_norm": 8.044689603532062, "learning_rate": 5.683917284685082e-07, "logits/chosen": -1.0078125, "logits/rejected": -0.4609375, "logps/chosen": -492.0, "logps/rejected": -580.0, "loss": 0.0353, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -21.5, "rewards/margins": 11.25, "rewards/rejected": -32.75, "step": 19810 }, { "epoch": 1.4307370244712336, "grad_norm": 8.613844344421054, "learning_rate": 5.682483219496449e-07, "logits/chosen": -1.046875, "logits/rejected": -0.458984375, "logps/chosen": -482.0, "logps/rejected": -552.0, "loss": 0.0311, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -21.25, "rewards/margins": 11.25, "rewards/rejected": -32.5, "step": 19820 }, { "epoch": 1.4314588897711686, "grad_norm": 5.2699529862521866, "learning_rate": 5.681050239214051e-07, "logits/chosen": -1.0078125, "logits/rejected": -0.4609375, "logps/chosen": -490.0, "logps/rejected": -576.0, "loss": 0.034, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -21.625, "rewards/margins": 11.875, "rewards/rejected": -33.5, "step": 19830 }, { "epoch": 1.4321807550711037, "grad_norm": 5.762996490177777, "learning_rate": 5.679618342470647e-07, "logits/chosen": -1.046875, "logits/rejected": -0.515625, "logps/chosen": -524.0, "logps/rejected": -588.0, "loss": 0.0412, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -21.875, "rewards/margins": 12.125, "rewards/rejected": -34.0, "step": 19840 }, { "epoch": 1.4329026203710389, "grad_norm": 10.276860398967159, "learning_rate": 5.678187527901407e-07, "logits/chosen": -0.9921875, "logits/rejected": -0.48046875, "logps/chosen": -512.0, "logps/rejected": -604.0, "loss": 0.0453, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -23.5, "rewards/margins": 11.625, "rewards/rejected": -35.0, "step": 19850 }, { "epoch": 1.4336244856709737, "grad_norm": 3.1079864286370573, "learning_rate": 5.676757794143909e-07, "logits/chosen": -1.0859375, "logits/rejected": -0.392578125, "logps/chosen": -520.0, "logps/rejected": -572.0, "loss": 0.057, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -21.5, "rewards/margins": 12.0, "rewards/rejected": -33.5, "step": 19860 }, { "epoch": 1.434346350970909, "grad_norm": 5.070245636390145, "learning_rate": 5.675329139838122e-07, "logits/chosen": -0.921875, "logits/rejected": -0.310546875, "logps/chosen": -492.0, "logps/rejected": -568.0, "loss": 0.0202, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -21.0, "rewards/margins": 11.6875, "rewards/rejected": -32.75, "step": 19870 }, { "epoch": 1.435068216270844, "grad_norm": 6.840429945234436, "learning_rate": 5.673901563626419e-07, "logits/chosen": -0.9609375, "logits/rejected": -0.2578125, "logps/chosen": -502.0, "logps/rejected": -588.0, "loss": 0.0391, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -20.625, "rewards/margins": 12.5, "rewards/rejected": -33.0, "step": 19880 }, { "epoch": 1.435790081570779, "grad_norm": 14.079112927444505, "learning_rate": 5.672475064153561e-07, "logits/chosen": -0.94140625, "logits/rejected": -0.484375, "logps/chosen": -492.0, "logps/rejected": -584.0, "loss": 0.0337, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -20.625, "rewards/margins": 11.75, "rewards/rejected": -32.5, "step": 19890 }, { "epoch": 1.436511946870714, "grad_norm": 8.265467916634492, "learning_rate": 5.671049640066686e-07, "logits/chosen": -0.92578125, "logits/rejected": -0.4453125, "logps/chosen": -462.0, "logps/rejected": -548.0, "loss": 0.0379, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -20.0, "rewards/margins": 12.1875, "rewards/rejected": -32.25, "step": 19900 }, { "epoch": 1.437233812170649, "grad_norm": 6.799098910231795, "learning_rate": 5.66962529001532e-07, "logits/chosen": -1.171875, "logits/rejected": -0.447265625, "logps/chosen": -510.0, "logps/rejected": -580.0, "loss": 0.0288, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -21.5, "rewards/margins": 11.9375, "rewards/rejected": -33.5, "step": 19910 }, { "epoch": 1.437955677470584, "grad_norm": 7.561518612006637, "learning_rate": 5.668202012651357e-07, "logits/chosen": -1.2421875, "logits/rejected": -0.5078125, "logps/chosen": -464.0, "logps/rejected": -532.0, "loss": 0.0335, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -19.625, "rewards/margins": 11.25, "rewards/rejected": -31.0, "step": 19920 }, { "epoch": 1.438677542770519, "grad_norm": 8.902819059727214, "learning_rate": 5.666779806629058e-07, "logits/chosen": -1.046875, "logits/rejected": -0.470703125, "logps/chosen": -524.0, "logps/rejected": -600.0, "loss": 0.0376, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -22.75, "rewards/margins": 11.875, "rewards/rejected": -34.75, "step": 19930 }, { "epoch": 1.4393994080704542, "grad_norm": 12.444529673099282, "learning_rate": 5.665358670605048e-07, "logits/chosen": -1.171875, "logits/rejected": -0.55078125, "logps/chosen": -500.0, "logps/rejected": -568.0, "loss": 0.0388, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -19.875, "rewards/margins": 11.3125, "rewards/rejected": -31.25, "step": 19940 }, { "epoch": 1.440121273370389, "grad_norm": 8.537312125485021, "learning_rate": 5.663938603238308e-07, "logits/chosen": -0.9609375, "logits/rejected": -0.380859375, "logps/chosen": -474.0, "logps/rejected": -564.0, "loss": 0.0569, "rewards/accuracies": 0.96875, "rewards/chosen": -19.375, "rewards/margins": 11.75, "rewards/rejected": -31.125, "step": 19950 }, { "epoch": 1.4408431386703242, "grad_norm": 7.102500153056601, "learning_rate": 5.662519603190176e-07, "logits/chosen": -0.90234375, "logits/rejected": -0.44140625, "logps/chosen": -462.0, "logps/rejected": -544.0, "loss": 0.0436, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -19.375, "rewards/margins": 11.0, "rewards/rejected": -30.375, "step": 19960 }, { "epoch": 1.4415650039702592, "grad_norm": 9.705889584830176, "learning_rate": 5.661101669124328e-07, "logits/chosen": -0.97265625, "logits/rejected": -0.48828125, "logps/chosen": -476.0, "logps/rejected": -560.0, "loss": 0.0422, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -19.125, "rewards/margins": 10.875, "rewards/rejected": -30.0, "step": 19970 }, { "epoch": 1.4422868692701942, "grad_norm": 7.418565085296477, "learning_rate": 5.659684799706784e-07, "logits/chosen": -1.0, "logits/rejected": -0.466796875, "logps/chosen": -472.0, "logps/rejected": -544.0, "loss": 0.0303, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -19.125, "rewards/margins": 10.625, "rewards/rejected": -29.75, "step": 19980 }, { "epoch": 1.4430087345701292, "grad_norm": 2.4201235071809246, "learning_rate": 5.658268993605907e-07, "logits/chosen": -0.9296875, "logits/rejected": -0.38671875, "logps/chosen": -484.0, "logps/rejected": -544.0, "loss": 0.0237, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -19.0, "rewards/margins": 11.5, "rewards/rejected": -30.5, "step": 19990 }, { "epoch": 1.4437305998700642, "grad_norm": 12.98048863550099, "learning_rate": 5.65685424949238e-07, "logits/chosen": -1.0859375, "logits/rejected": -0.498046875, "logps/chosen": -442.0, "logps/rejected": -528.0, "loss": 0.0417, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -17.375, "rewards/margins": 11.0, "rewards/rejected": -28.5, "step": 20000 }, { "epoch": 1.4437305998700642, "eval_logits/chosen": -1.015625, "eval_logits/rejected": -0.478515625, "eval_logps/chosen": -472.0, "eval_logps/rejected": -524.0, "eval_loss": 0.2521119713783264, "eval_rewards/accuracies": 0.9192370176315308, "eval_rewards/chosen": -19.0, "eval_rewards/margins": 8.9375, "eval_rewards/rejected": -27.875, "eval_runtime": 2854.1677, "eval_samples_per_second": 34.512, "eval_steps_per_second": 0.54, "step": 20000 }, { "epoch": 1.4444524651699993, "grad_norm": 9.731313258053063, "learning_rate": 5.655440566039219e-07, "logits/chosen": -1.0703125, "logits/rejected": -0.447265625, "logps/chosen": -470.0, "logps/rejected": -540.0, "loss": 0.0528, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -18.375, "rewards/margins": 11.1875, "rewards/rejected": -29.625, "step": 20010 }, { "epoch": 1.4451743304699343, "grad_norm": 14.19723793242637, "learning_rate": 5.654027941921755e-07, "logits/chosen": -1.03125, "logits/rejected": -0.396484375, "logps/chosen": -456.0, "logps/rejected": -544.0, "loss": 0.0503, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -17.75, "rewards/margins": 12.1875, "rewards/rejected": -29.875, "step": 20020 }, { "epoch": 1.4458961957698693, "grad_norm": 2.6130247093438523, "learning_rate": 5.65261637581764e-07, "logits/chosen": -0.96875, "logits/rejected": -0.3359375, "logps/chosen": -474.0, "logps/rejected": -552.0, "loss": 0.0408, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -19.625, "rewards/margins": 11.125, "rewards/rejected": -30.75, "step": 20030 }, { "epoch": 1.4466180610698043, "grad_norm": 6.876607714388175, "learning_rate": 5.65120586640683e-07, "logits/chosen": -0.9453125, "logits/rejected": -0.484375, "logps/chosen": -476.0, "logps/rejected": -564.0, "loss": 0.0257, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -20.125, "rewards/margins": 11.5625, "rewards/rejected": -31.625, "step": 20040 }, { "epoch": 1.4473399263697395, "grad_norm": 1.6504585526571298, "learning_rate": 5.649796412371589e-07, "logits/chosen": -0.94921875, "logits/rejected": -0.314453125, "logps/chosen": -480.0, "logps/rejected": -560.0, "loss": 0.0345, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -20.125, "rewards/margins": 12.1875, "rewards/rejected": -32.25, "step": 20050 }, { "epoch": 1.4480617916696743, "grad_norm": 9.018011542350216, "learning_rate": 5.648388012396479e-07, "logits/chosen": -0.8515625, "logits/rejected": -0.2373046875, "logps/chosen": -470.0, "logps/rejected": -560.0, "loss": 0.0466, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -19.625, "rewards/margins": 10.9375, "rewards/rejected": -30.5, "step": 20060 }, { "epoch": 1.4487836569696095, "grad_norm": 7.911647274449866, "learning_rate": 5.646980665168356e-07, "logits/chosen": -0.9453125, "logits/rejected": -0.412109375, "logps/chosen": -454.0, "logps/rejected": -544.0, "loss": 0.0376, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -19.375, "rewards/margins": 11.5625, "rewards/rejected": -31.0, "step": 20070 }, { "epoch": 1.4495055222695445, "grad_norm": 4.360549020858272, "learning_rate": 5.645574369376366e-07, "logits/chosen": -0.96484375, "logits/rejected": -0.431640625, "logps/chosen": -512.0, "logps/rejected": -572.0, "loss": 0.0385, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -20.875, "rewards/margins": 12.0, "rewards/rejected": -33.0, "step": 20080 }, { "epoch": 1.4502273875694796, "grad_norm": 7.9192656791168545, "learning_rate": 5.644169123711941e-07, "logits/chosen": -0.9140625, "logits/rejected": -0.390625, "logps/chosen": -470.0, "logps/rejected": -556.0, "loss": 0.0358, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -19.875, "rewards/margins": 11.3125, "rewards/rejected": -31.25, "step": 20090 }, { "epoch": 1.4509492528694146, "grad_norm": 2.1579497481502545, "learning_rate": 5.642764926868786e-07, "logits/chosen": -0.83984375, "logits/rejected": -0.416015625, "logps/chosen": -520.0, "logps/rejected": -576.0, "loss": 0.0439, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -21.0, "rewards/margins": 10.9375, "rewards/rejected": -32.0, "step": 20100 }, { "epoch": 1.4516711181693496, "grad_norm": 8.92337797028772, "learning_rate": 5.641361777542885e-07, "logits/chosen": -1.0, "logits/rejected": -0.451171875, "logps/chosen": -502.0, "logps/rejected": -556.0, "loss": 0.0418, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -20.375, "rewards/margins": 11.0625, "rewards/rejected": -31.5, "step": 20110 }, { "epoch": 1.4523929834692846, "grad_norm": 9.4862092484231, "learning_rate": 5.639959674432491e-07, "logits/chosen": -0.9453125, "logits/rejected": -0.427734375, "logps/chosen": -484.0, "logps/rejected": -556.0, "loss": 0.0592, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -19.25, "rewards/margins": 10.8125, "rewards/rejected": -30.125, "step": 20120 }, { "epoch": 1.4531148487692196, "grad_norm": 1.8132867688844017, "learning_rate": 5.638558616238116e-07, "logits/chosen": -0.99609375, "logits/rejected": -0.515625, "logps/chosen": -486.0, "logps/rejected": -544.0, "loss": 0.0393, "rewards/accuracies": 1.0, "rewards/chosen": -20.0, "rewards/margins": 10.375, "rewards/rejected": -30.375, "step": 20130 }, { "epoch": 1.4538367140691548, "grad_norm": 5.669031089758203, "learning_rate": 5.637158601662535e-07, "logits/chosen": -1.0078125, "logits/rejected": -0.365234375, "logps/chosen": -492.0, "logps/rejected": -552.0, "loss": 0.0323, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -20.0, "rewards/margins": 11.4375, "rewards/rejected": -31.5, "step": 20140 }, { "epoch": 1.4545585793690896, "grad_norm": 8.360386214168537, "learning_rate": 5.635759629410775e-07, "logits/chosen": -0.8984375, "logits/rejected": -0.47265625, "logps/chosen": -444.0, "logps/rejected": -532.0, "loss": 0.0381, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -18.5, "rewards/margins": 10.6875, "rewards/rejected": -29.125, "step": 20150 }, { "epoch": 1.4552804446690248, "grad_norm": 6.260558068843859, "learning_rate": 5.63436169819011e-07, "logits/chosen": -1.046875, "logits/rejected": -0.4140625, "logps/chosen": -490.0, "logps/rejected": -572.0, "loss": 0.0359, "rewards/accuracies": 1.0, "rewards/chosen": -20.0, "rewards/margins": 10.9375, "rewards/rejected": -30.875, "step": 20160 }, { "epoch": 1.4560023099689599, "grad_norm": 4.148591885332768, "learning_rate": 5.63296480671006e-07, "logits/chosen": -0.8671875, "logits/rejected": -0.375, "logps/chosen": -484.0, "logps/rejected": -556.0, "loss": 0.0455, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -19.875, "rewards/margins": 10.8125, "rewards/rejected": -30.625, "step": 20170 }, { "epoch": 1.4567241752688949, "grad_norm": 8.371111509013566, "learning_rate": 5.631568953682381e-07, "logits/chosen": -0.921875, "logits/rejected": -0.259765625, "logps/chosen": -470.0, "logps/rejected": -556.0, "loss": 0.0345, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -20.375, "rewards/margins": 10.6875, "rewards/rejected": -31.125, "step": 20180 }, { "epoch": 1.4574460405688299, "grad_norm": 7.779898256403466, "learning_rate": 5.630174137821066e-07, "logits/chosen": -0.95703125, "logits/rejected": -0.322265625, "logps/chosen": -438.0, "logps/rejected": -508.0, "loss": 0.0329, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -17.125, "rewards/margins": 10.6875, "rewards/rejected": -27.875, "step": 20190 }, { "epoch": 1.4581679058687649, "grad_norm": 9.998821463065156, "learning_rate": 5.628780357842334e-07, "logits/chosen": -0.953125, "logits/rejected": -0.296875, "logps/chosen": -482.0, "logps/rejected": -568.0, "loss": 0.0506, "rewards/accuracies": 0.96875, "rewards/chosen": -20.5, "rewards/margins": 12.125, "rewards/rejected": -32.5, "step": 20200 }, { "epoch": 1.4588897711687, "grad_norm": 6.023754002986316, "learning_rate": 5.627387612464627e-07, "logits/chosen": -0.99609375, "logits/rejected": -0.486328125, "logps/chosen": -480.0, "logps/rejected": -532.0, "loss": 0.0332, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -17.125, "rewards/margins": 11.4375, "rewards/rejected": -28.625, "step": 20210 }, { "epoch": 1.459611636468635, "grad_norm": 7.965266219097138, "learning_rate": 5.625995900408606e-07, "logits/chosen": -0.71875, "logits/rejected": -0.10107421875, "logps/chosen": -470.0, "logps/rejected": -524.0, "loss": 0.0438, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -19.0, "rewards/margins": 10.9375, "rewards/rejected": -30.0, "step": 20220 }, { "epoch": 1.46033350176857, "grad_norm": 7.111057092717457, "learning_rate": 5.624605220397146e-07, "logits/chosen": -0.93359375, "logits/rejected": -0.291015625, "logps/chosen": -466.0, "logps/rejected": -560.0, "loss": 0.0282, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -18.625, "rewards/margins": 12.4375, "rewards/rejected": -31.0, "step": 20230 }, { "epoch": 1.461055367068505, "grad_norm": 7.581626780852195, "learning_rate": 5.623215571155332e-07, "logits/chosen": -0.96484375, "logits/rejected": -0.466796875, "logps/chosen": -490.0, "logps/rejected": -560.0, "loss": 0.0283, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -19.75, "rewards/margins": 11.5, "rewards/rejected": -31.125, "step": 20240 }, { "epoch": 1.4617772323684401, "grad_norm": 6.490161778357701, "learning_rate": 5.621826951410452e-07, "logits/chosen": -0.75, "logits/rejected": -0.306640625, "logps/chosen": -478.0, "logps/rejected": -552.0, "loss": 0.0439, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -21.0, "rewards/margins": 11.1875, "rewards/rejected": -32.25, "step": 20250 }, { "epoch": 1.4624990976683752, "grad_norm": 5.454915487123184, "learning_rate": 5.620439359891992e-07, "logits/chosen": -0.8984375, "logits/rejected": -0.443359375, "logps/chosen": -494.0, "logps/rejected": -564.0, "loss": 0.0361, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -21.875, "rewards/margins": 10.3125, "rewards/rejected": -32.25, "step": 20260 }, { "epoch": 1.4632209629683102, "grad_norm": 11.63893693368547, "learning_rate": 5.619052795331631e-07, "logits/chosen": -0.8046875, "logits/rejected": -0.25390625, "logps/chosen": -520.0, "logps/rejected": -592.0, "loss": 0.0276, "rewards/accuracies": 1.0, "rewards/chosen": -22.5, "rewards/margins": 11.125, "rewards/rejected": -33.75, "step": 20270 }, { "epoch": 1.4639428282682452, "grad_norm": 2.7286740300004584, "learning_rate": 5.617667256463242e-07, "logits/chosen": -1.046875, "logits/rejected": -0.38671875, "logps/chosen": -494.0, "logps/rejected": -572.0, "loss": 0.0361, "rewards/accuracies": 1.0, "rewards/chosen": -19.625, "rewards/margins": 11.625, "rewards/rejected": -31.375, "step": 20280 }, { "epoch": 1.4646646935681802, "grad_norm": 3.2626678895628842, "learning_rate": 5.616282742022878e-07, "logits/chosen": -1.0546875, "logits/rejected": -0.419921875, "logps/chosen": -492.0, "logps/rejected": -568.0, "loss": 0.0236, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -20.125, "rewards/margins": 11.375, "rewards/rejected": -31.625, "step": 20290 }, { "epoch": 1.4653865588681152, "grad_norm": 3.9751187526907255, "learning_rate": 5.614899250748771e-07, "logits/chosen": -1.0859375, "logits/rejected": -0.5625, "logps/chosen": -496.0, "logps/rejected": -572.0, "loss": 0.0304, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -19.75, "rewards/margins": 11.875, "rewards/rejected": -31.75, "step": 20300 }, { "epoch": 1.4661084241680502, "grad_norm": 9.087456495866993, "learning_rate": 5.613516781381333e-07, "logits/chosen": -1.0234375, "logits/rejected": -0.5, "logps/chosen": -494.0, "logps/rejected": -552.0, "loss": 0.0339, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -19.125, "rewards/margins": 11.25, "rewards/rejected": -30.375, "step": 20310 }, { "epoch": 1.4668302894679852, "grad_norm": 2.7306122648238933, "learning_rate": 5.612135332663137e-07, "logits/chosen": -0.92578125, "logits/rejected": -0.361328125, "logps/chosen": -462.0, "logps/rejected": -548.0, "loss": 0.0509, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -19.625, "rewards/margins": 10.6875, "rewards/rejected": -30.375, "step": 20320 }, { "epoch": 1.4675521547679202, "grad_norm": 2.9260225819075107, "learning_rate": 5.61075490333893e-07, "logits/chosen": -1.109375, "logits/rejected": -0.515625, "logps/chosen": -472.0, "logps/rejected": -560.0, "loss": 0.0502, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -21.0, "rewards/margins": 10.625, "rewards/rejected": -31.5, "step": 20330 }, { "epoch": 1.4682740200678555, "grad_norm": 5.215619820744068, "learning_rate": 5.609375492155617e-07, "logits/chosen": -0.8125, "logits/rejected": -0.138671875, "logps/chosen": -484.0, "logps/rejected": -540.0, "loss": 0.0305, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -19.875, "rewards/margins": 10.75, "rewards/rejected": -30.75, "step": 20340 }, { "epoch": 1.4689958853677902, "grad_norm": 4.081946264441409, "learning_rate": 5.607997097862253e-07, "logits/chosen": -1.0703125, "logits/rejected": -0.44921875, "logps/chosen": -476.0, "logps/rejected": -560.0, "loss": 0.0202, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -20.625, "rewards/margins": 11.4375, "rewards/rejected": -32.0, "step": 20350 }, { "epoch": 1.4697177506677255, "grad_norm": 7.100311463284256, "learning_rate": 5.60661971921005e-07, "logits/chosen": -1.109375, "logits/rejected": -0.47265625, "logps/chosen": -516.0, "logps/rejected": -600.0, "loss": 0.0498, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -21.875, "rewards/margins": 12.4375, "rewards/rejected": -34.25, "step": 20360 }, { "epoch": 1.4704396159676605, "grad_norm": 5.711155632933553, "learning_rate": 5.605243354952362e-07, "logits/chosen": -0.953125, "logits/rejected": -0.431640625, "logps/chosen": -506.0, "logps/rejected": -564.0, "loss": 0.0493, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -21.875, "rewards/margins": 11.125, "rewards/rejected": -33.0, "step": 20370 }, { "epoch": 1.4711614812675955, "grad_norm": 4.5287091031227416, "learning_rate": 5.603868003844686e-07, "logits/chosen": -0.91796875, "logits/rejected": -0.53515625, "logps/chosen": -502.0, "logps/rejected": -592.0, "loss": 0.0392, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -21.375, "rewards/margins": 11.4375, "rewards/rejected": -32.75, "step": 20380 }, { "epoch": 1.4718833465675305, "grad_norm": 6.451910104545657, "learning_rate": 5.602493664644657e-07, "logits/chosen": -0.98828125, "logits/rejected": -0.474609375, "logps/chosen": -500.0, "logps/rejected": -560.0, "loss": 0.0411, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -20.125, "rewards/margins": 11.125, "rewards/rejected": -31.125, "step": 20390 }, { "epoch": 1.4726052118674655, "grad_norm": 6.938135374218518, "learning_rate": 5.601120336112038e-07, "logits/chosen": -0.9140625, "logits/rejected": -0.439453125, "logps/chosen": -498.0, "logps/rejected": -572.0, "loss": 0.0417, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -20.75, "rewards/margins": 10.625, "rewards/rejected": -31.375, "step": 20400 }, { "epoch": 1.4733270771674005, "grad_norm": 11.518152359849005, "learning_rate": 5.599748017008724e-07, "logits/chosen": -0.99609375, "logits/rejected": -0.4375, "logps/chosen": -512.0, "logps/rejected": -580.0, "loss": 0.0309, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -21.375, "rewards/margins": 11.4375, "rewards/rejected": -32.75, "step": 20410 }, { "epoch": 1.4740489424673355, "grad_norm": 7.549093289259376, "learning_rate": 5.598376706098727e-07, "logits/chosen": -1.0625, "logits/rejected": -0.51171875, "logps/chosen": -494.0, "logps/rejected": -564.0, "loss": 0.0257, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -20.375, "rewards/margins": 10.75, "rewards/rejected": -31.125, "step": 20420 }, { "epoch": 1.4747708077672708, "grad_norm": 8.62191075114058, "learning_rate": 5.59700640214818e-07, "logits/chosen": -0.82421875, "logits/rejected": -0.298828125, "logps/chosen": -480.0, "logps/rejected": -556.0, "loss": 0.042, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -20.875, "rewards/margins": 11.4375, "rewards/rejected": -32.25, "step": 20430 }, { "epoch": 1.4754926730672056, "grad_norm": 6.655389075834914, "learning_rate": 5.595637103925327e-07, "logits/chosen": -0.96875, "logits/rejected": -0.26953125, "logps/chosen": -458.0, "logps/rejected": -556.0, "loss": 0.0406, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -20.0, "rewards/margins": 12.625, "rewards/rejected": -32.5, "step": 20440 }, { "epoch": 1.4762145383671408, "grad_norm": 4.687734362788046, "learning_rate": 5.594268810200525e-07, "logits/chosen": -0.92578125, "logits/rejected": -0.421875, "logps/chosen": -472.0, "logps/rejected": -572.0, "loss": 0.0235, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -21.125, "rewards/margins": 11.0625, "rewards/rejected": -32.25, "step": 20450 }, { "epoch": 1.4769364036670758, "grad_norm": 4.206112659489805, "learning_rate": 5.592901519746228e-07, "logits/chosen": -0.85546875, "logits/rejected": -0.408203125, "logps/chosen": -466.0, "logps/rejected": -572.0, "loss": 0.0549, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -19.875, "rewards/margins": 11.5, "rewards/rejected": -31.5, "step": 20460 }, { "epoch": 1.4776582689670108, "grad_norm": 3.787678355250991, "learning_rate": 5.591535231336994e-07, "logits/chosen": -0.93359375, "logits/rejected": -0.47265625, "logps/chosen": -472.0, "logps/rejected": -560.0, "loss": 0.0361, "rewards/accuracies": 1.0, "rewards/chosen": -20.625, "rewards/margins": 11.75, "rewards/rejected": -32.25, "step": 20470 }, { "epoch": 1.4783801342669458, "grad_norm": 8.182959274226269, "learning_rate": 5.590169943749474e-07, "logits/chosen": -0.875, "logits/rejected": -0.439453125, "logps/chosen": -488.0, "logps/rejected": -568.0, "loss": 0.0324, "rewards/accuracies": 1.0, "rewards/chosen": -20.375, "rewards/margins": 10.5, "rewards/rejected": -31.0, "step": 20480 }, { "epoch": 1.4791019995668808, "grad_norm": 4.270270194535877, "learning_rate": 5.588805655762408e-07, "logits/chosen": -0.73828125, "logits/rejected": -0.2373046875, "logps/chosen": -462.0, "logps/rejected": -568.0, "loss": 0.0228, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -19.125, "rewards/margins": 11.6875, "rewards/rejected": -30.75, "step": 20490 }, { "epoch": 1.4798238648668158, "grad_norm": 1.8938672770225975, "learning_rate": 5.587442366156625e-07, "logits/chosen": -0.796875, "logits/rejected": -0.365234375, "logps/chosen": -444.0, "logps/rejected": -516.0, "loss": 0.0439, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -18.75, "rewards/margins": 10.9375, "rewards/rejected": -29.625, "step": 20500 }, { "epoch": 1.4805457301667508, "grad_norm": 2.2682721762394205, "learning_rate": 5.58608007371503e-07, "logits/chosen": -0.859375, "logits/rejected": -0.265625, "logps/chosen": -480.0, "logps/rejected": -552.0, "loss": 0.0275, "rewards/accuracies": 1.0, "rewards/chosen": -20.5, "rewards/margins": 10.6875, "rewards/rejected": -31.125, "step": 20510 }, { "epoch": 1.4812675954666858, "grad_norm": 15.314288040079333, "learning_rate": 5.584718777222606e-07, "logits/chosen": -0.9140625, "logits/rejected": -0.4609375, "logps/chosen": -482.0, "logps/rejected": -588.0, "loss": 0.0512, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -20.125, "rewards/margins": 11.625, "rewards/rejected": -31.75, "step": 20520 }, { "epoch": 1.4819894607666209, "grad_norm": 11.932389607152054, "learning_rate": 5.58335847546641e-07, "logits/chosen": -1.0234375, "logits/rejected": -0.47265625, "logps/chosen": -464.0, "logps/rejected": -540.0, "loss": 0.047, "rewards/accuracies": 1.0, "rewards/chosen": -17.75, "rewards/margins": 11.875, "rewards/rejected": -29.625, "step": 20530 }, { "epoch": 1.482711326066556, "grad_norm": 5.373142588140227, "learning_rate": 5.58199916723556e-07, "logits/chosen": -0.91015625, "logits/rejected": -0.32421875, "logps/chosen": -486.0, "logps/rejected": -564.0, "loss": 0.0247, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -19.0, "rewards/margins": 12.0625, "rewards/rejected": -31.0, "step": 20540 }, { "epoch": 1.4834331913664909, "grad_norm": 5.997135646577196, "learning_rate": 5.580640851321247e-07, "logits/chosen": -0.86328125, "logits/rejected": -0.23828125, "logps/chosen": -502.0, "logps/rejected": -568.0, "loss": 0.0379, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -20.5, "rewards/margins": 11.125, "rewards/rejected": -31.75, "step": 20550 }, { "epoch": 1.484155056666426, "grad_norm": 3.105165042794338, "learning_rate": 5.579283526516706e-07, "logits/chosen": -0.9609375, "logits/rejected": -0.34765625, "logps/chosen": -506.0, "logps/rejected": -568.0, "loss": 0.0264, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -21.5, "rewards/margins": 11.75, "rewards/rejected": -33.25, "step": 20560 }, { "epoch": 1.4848769219663611, "grad_norm": 6.621109077289986, "learning_rate": 5.577927191617239e-07, "logits/chosen": -0.92578125, "logits/rejected": -0.26953125, "logps/chosen": -478.0, "logps/rejected": -572.0, "loss": 0.0262, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -20.75, "rewards/margins": 11.375, "rewards/rejected": -32.0, "step": 20570 }, { "epoch": 1.4855987872662961, "grad_norm": 8.963504196643251, "learning_rate": 5.576571845420189e-07, "logits/chosen": -1.0390625, "logits/rejected": -0.390625, "logps/chosen": -494.0, "logps/rejected": -580.0, "loss": 0.0335, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -22.375, "rewards/margins": 11.0, "rewards/rejected": -33.5, "step": 20580 }, { "epoch": 1.4863206525662311, "grad_norm": 9.906124854232992, "learning_rate": 5.575217486724946e-07, "logits/chosen": -0.8984375, "logits/rejected": -0.439453125, "logps/chosen": -498.0, "logps/rejected": -564.0, "loss": 0.0467, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -20.75, "rewards/margins": 10.625, "rewards/rejected": -31.375, "step": 20590 }, { "epoch": 1.4870425178661661, "grad_norm": 15.328086761650429, "learning_rate": 5.573864114332941e-07, "logits/chosen": -1.1171875, "logits/rejected": -0.5703125, "logps/chosen": -496.0, "logps/rejected": -572.0, "loss": 0.0347, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -20.5, "rewards/margins": 12.0, "rewards/rejected": -32.5, "step": 20600 }, { "epoch": 1.4877643831661012, "grad_norm": 5.332217012088421, "learning_rate": 5.572511727047639e-07, "logits/chosen": -1.0546875, "logits/rejected": -0.353515625, "logps/chosen": -520.0, "logps/rejected": -584.0, "loss": 0.0296, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -22.75, "rewards/margins": 11.8125, "rewards/rejected": -34.5, "step": 20610 }, { "epoch": 1.4884862484660362, "grad_norm": 2.4037065078427804, "learning_rate": 5.571160323674535e-07, "logits/chosen": -1.03125, "logits/rejected": -0.2431640625, "logps/chosen": -480.0, "logps/rejected": -548.0, "loss": 0.0244, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -19.75, "rewards/margins": 12.5625, "rewards/rejected": -32.25, "step": 20620 }, { "epoch": 1.4892081137659714, "grad_norm": 4.888769364471836, "learning_rate": 5.569809903021156e-07, "logits/chosen": -1.0390625, "logits/rejected": -0.56640625, "logps/chosen": -476.0, "logps/rejected": -564.0, "loss": 0.0338, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -20.5, "rewards/margins": 11.5625, "rewards/rejected": -32.0, "step": 20630 }, { "epoch": 1.4899299790659062, "grad_norm": 6.519177738082624, "learning_rate": 5.568460463897046e-07, "logits/chosen": -0.72265625, "logits/rejected": -0.251953125, "logps/chosen": -480.0, "logps/rejected": -560.0, "loss": 0.0377, "rewards/accuracies": 0.96875, "rewards/chosen": -20.75, "rewards/margins": 10.6875, "rewards/rejected": -31.5, "step": 20640 }, { "epoch": 1.4906518443658414, "grad_norm": 12.446428020417429, "learning_rate": 5.567112005113767e-07, "logits/chosen": -1.0625, "logits/rejected": -0.431640625, "logps/chosen": -494.0, "logps/rejected": -544.0, "loss": 0.0435, "rewards/accuracies": 0.96875, "rewards/chosen": -19.875, "rewards/margins": 10.5, "rewards/rejected": -30.375, "step": 20650 }, { "epoch": 1.4913737096657764, "grad_norm": 5.759391617441124, "learning_rate": 5.565764525484902e-07, "logits/chosen": -1.0859375, "logits/rejected": -0.400390625, "logps/chosen": -482.0, "logps/rejected": -540.0, "loss": 0.0307, "rewards/accuracies": 1.0, "rewards/chosen": -19.875, "rewards/margins": 11.625, "rewards/rejected": -31.5, "step": 20660 }, { "epoch": 1.4920955749657114, "grad_norm": 0.9318920112154633, "learning_rate": 5.564418023826033e-07, "logits/chosen": -1.015625, "logits/rejected": -0.390625, "logps/chosen": -466.0, "logps/rejected": -544.0, "loss": 0.0483, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -19.75, "rewards/margins": 11.3125, "rewards/rejected": -31.0, "step": 20670 }, { "epoch": 1.4928174402656464, "grad_norm": 3.57096501003294, "learning_rate": 5.563072498954754e-07, "logits/chosen": -0.91796875, "logits/rejected": -0.3671875, "logps/chosen": -502.0, "logps/rejected": -572.0, "loss": 0.0396, "rewards/accuracies": 1.0, "rewards/chosen": -21.375, "rewards/margins": 11.75, "rewards/rejected": -33.0, "step": 20680 }, { "epoch": 1.4935393055655815, "grad_norm": 7.586171585753885, "learning_rate": 5.561727949690656e-07, "logits/chosen": -1.1015625, "logits/rejected": -0.431640625, "logps/chosen": -506.0, "logps/rejected": -576.0, "loss": 0.0432, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -20.375, "rewards/margins": 11.6875, "rewards/rejected": -32.25, "step": 20690 }, { "epoch": 1.4942611708655165, "grad_norm": 9.490262003763261, "learning_rate": 5.560384374855327e-07, "logits/chosen": -0.8828125, "logits/rejected": -0.345703125, "logps/chosen": -488.0, "logps/rejected": -564.0, "loss": 0.0338, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -20.625, "rewards/margins": 11.3125, "rewards/rejected": -32.0, "step": 20700 }, { "epoch": 1.4949830361654515, "grad_norm": 2.488937246185105, "learning_rate": 5.559041773272347e-07, "logits/chosen": -0.953125, "logits/rejected": -0.42578125, "logps/chosen": -486.0, "logps/rejected": -564.0, "loss": 0.0498, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -20.625, "rewards/margins": 10.5, "rewards/rejected": -31.125, "step": 20710 }, { "epoch": 1.4957049014653865, "grad_norm": 7.076825364960802, "learning_rate": 5.557700143767284e-07, "logits/chosen": -1.0, "logits/rejected": -0.423828125, "logps/chosen": -474.0, "logps/rejected": -540.0, "loss": 0.0253, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -19.0, "rewards/margins": 11.125, "rewards/rejected": -30.125, "step": 20720 }, { "epoch": 1.4964267667653215, "grad_norm": 2.150270783848349, "learning_rate": 5.556359485167687e-07, "logits/chosen": -1.0234375, "logits/rejected": -0.357421875, "logps/chosen": -456.0, "logps/rejected": -532.0, "loss": 0.0364, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -19.125, "rewards/margins": 11.25, "rewards/rejected": -30.375, "step": 20730 }, { "epoch": 1.4971486320652567, "grad_norm": 12.174686990833278, "learning_rate": 5.555019796303087e-07, "logits/chosen": -1.0078125, "logits/rejected": -0.482421875, "logps/chosen": -456.0, "logps/rejected": -524.0, "loss": 0.0283, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -18.5, "rewards/margins": 10.1875, "rewards/rejected": -28.75, "step": 20740 }, { "epoch": 1.4978704973651917, "grad_norm": 2.6111562674217814, "learning_rate": 5.553681076004985e-07, "logits/chosen": -0.9765625, "logits/rejected": -0.48046875, "logps/chosen": -476.0, "logps/rejected": -568.0, "loss": 0.0309, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -20.375, "rewards/margins": 11.5, "rewards/rejected": -31.875, "step": 20750 }, { "epoch": 1.4985923626651267, "grad_norm": 4.261160943152129, "learning_rate": 5.55234332310686e-07, "logits/chosen": -0.90625, "logits/rejected": -0.32421875, "logps/chosen": -516.0, "logps/rejected": -612.0, "loss": 0.0225, "rewards/accuracies": 1.0, "rewards/chosen": -22.75, "rewards/margins": 12.0625, "rewards/rejected": -34.75, "step": 20760 }, { "epoch": 1.4993142279650618, "grad_norm": 7.4566304118814015, "learning_rate": 5.551006536444146e-07, "logits/chosen": -0.984375, "logits/rejected": -0.333984375, "logps/chosen": -490.0, "logps/rejected": -556.0, "loss": 0.0288, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -21.625, "rewards/margins": 11.3125, "rewards/rejected": -33.0, "step": 20770 }, { "epoch": 1.5000360932649968, "grad_norm": 6.926365336193132, "learning_rate": 5.549670714854249e-07, "logits/chosen": -0.99609375, "logits/rejected": -0.46484375, "logps/chosen": -492.0, "logps/rejected": -564.0, "loss": 0.0264, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -21.25, "rewards/margins": 11.1875, "rewards/rejected": -32.5, "step": 20780 }, { "epoch": 1.5007579585649318, "grad_norm": 8.305601251888366, "learning_rate": 5.548335857176527e-07, "logits/chosen": -1.1171875, "logits/rejected": -0.384765625, "logps/chosen": -490.0, "logps/rejected": -568.0, "loss": 0.0247, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -20.75, "rewards/margins": 12.0625, "rewards/rejected": -32.75, "step": 20790 }, { "epoch": 1.5014798238648668, "grad_norm": 7.424462883667929, "learning_rate": 5.547001962252292e-07, "logits/chosen": -1.1171875, "logits/rejected": -0.39453125, "logps/chosen": -474.0, "logps/rejected": -556.0, "loss": 0.0324, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -21.75, "rewards/margins": 12.0, "rewards/rejected": -33.75, "step": 20800 }, { "epoch": 1.502201689164802, "grad_norm": 10.89036915397399, "learning_rate": 5.545669028924805e-07, "logits/chosen": -1.09375, "logits/rejected": -0.35546875, "logps/chosen": -484.0, "logps/rejected": -576.0, "loss": 0.0282, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -20.875, "rewards/margins": 11.875, "rewards/rejected": -32.75, "step": 20810 }, { "epoch": 1.5029235544647368, "grad_norm": 9.70045146289426, "learning_rate": 5.544337056039272e-07, "logits/chosen": -0.9453125, "logits/rejected": -0.359375, "logps/chosen": -508.0, "logps/rejected": -592.0, "loss": 0.0394, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -21.5, "rewards/margins": 12.4375, "rewards/rejected": -34.0, "step": 20820 }, { "epoch": 1.503645419764672, "grad_norm": 6.798546814791989, "learning_rate": 5.543006042442842e-07, "logits/chosen": -0.95703125, "logits/rejected": -0.306640625, "logps/chosen": -462.0, "logps/rejected": -568.0, "loss": 0.0338, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -20.5, "rewards/margins": 11.5, "rewards/rejected": -32.0, "step": 20830 }, { "epoch": 1.5043672850646068, "grad_norm": 10.664402395713267, "learning_rate": 5.541675986984596e-07, "logits/chosen": -0.8203125, "logits/rejected": -0.25390625, "logps/chosen": -492.0, "logps/rejected": -572.0, "loss": 0.0269, "rewards/accuracies": 1.0, "rewards/chosen": -20.75, "rewards/margins": 12.6875, "rewards/rejected": -33.5, "step": 20840 }, { "epoch": 1.505089150364542, "grad_norm": 3.6133979156755984, "learning_rate": 5.540346888515549e-07, "logits/chosen": -1.109375, "logits/rejected": -0.361328125, "logps/chosen": -512.0, "logps/rejected": -564.0, "loss": 0.0335, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -21.125, "rewards/margins": 11.6875, "rewards/rejected": -32.75, "step": 20850 }, { "epoch": 1.505811015664477, "grad_norm": 10.088588731889127, "learning_rate": 5.539018745888647e-07, "logits/chosen": -1.015625, "logits/rejected": -0.373046875, "logps/chosen": -490.0, "logps/rejected": -580.0, "loss": 0.0294, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -20.25, "rewards/margins": 11.875, "rewards/rejected": -32.0, "step": 20860 }, { "epoch": 1.506532880964412, "grad_norm": 8.504850831004449, "learning_rate": 5.537691557958757e-07, "logits/chosen": -0.796875, "logits/rejected": -0.333984375, "logps/chosen": -468.0, "logps/rejected": -548.0, "loss": 0.0466, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -19.0, "rewards/margins": 11.3125, "rewards/rejected": -30.25, "step": 20870 }, { "epoch": 1.507254746264347, "grad_norm": 2.519466209374201, "learning_rate": 5.536365323582665e-07, "logits/chosen": -0.859375, "logits/rejected": -0.33984375, "logps/chosen": -508.0, "logps/rejected": -564.0, "loss": 0.0414, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -20.125, "rewards/margins": 10.875, "rewards/rejected": -31.125, "step": 20880 }, { "epoch": 1.507976611564282, "grad_norm": 9.858638719371928, "learning_rate": 5.535040041619073e-07, "logits/chosen": -0.98046875, "logits/rejected": -0.369140625, "logps/chosen": -486.0, "logps/rejected": -580.0, "loss": 0.0441, "rewards/accuracies": 1.0, "rewards/chosen": -21.25, "rewards/margins": 11.625, "rewards/rejected": -32.75, "step": 20890 }, { "epoch": 1.508698476864217, "grad_norm": 8.15058350643724, "learning_rate": 5.533715710928597e-07, "logits/chosen": -0.859375, "logits/rejected": -0.310546875, "logps/chosen": -500.0, "logps/rejected": -576.0, "loss": 0.0329, "rewards/accuracies": 0.96875, "rewards/chosen": -21.25, "rewards/margins": 11.625, "rewards/rejected": -32.75, "step": 20900 }, { "epoch": 1.509420342164152, "grad_norm": 5.920318371525918, "learning_rate": 5.532392330373758e-07, "logits/chosen": -1.015625, "logits/rejected": -0.388671875, "logps/chosen": -502.0, "logps/rejected": -584.0, "loss": 0.0403, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -21.0, "rewards/margins": 11.375, "rewards/rejected": -32.25, "step": 20910 }, { "epoch": 1.5101422074640873, "grad_norm": 3.449475549609333, "learning_rate": 5.531069898818981e-07, "logits/chosen": -1.1953125, "logits/rejected": -0.4921875, "logps/chosen": -482.0, "logps/rejected": -540.0, "loss": 0.0273, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -19.25, "rewards/margins": 11.375, "rewards/rejected": -30.625, "step": 20920 }, { "epoch": 1.5108640727640221, "grad_norm": 5.151773140371726, "learning_rate": 5.529748415130589e-07, "logits/chosen": -0.81640625, "logits/rejected": -0.4140625, "logps/chosen": -512.0, "logps/rejected": -576.0, "loss": 0.0201, "rewards/accuracies": 1.0, "rewards/chosen": -20.375, "rewards/margins": 11.625, "rewards/rejected": -32.0, "step": 20930 }, { "epoch": 1.5115859380639574, "grad_norm": 2.472707981395112, "learning_rate": 5.528427878176804e-07, "logits/chosen": -1.0, "logits/rejected": -0.6015625, "logps/chosen": -488.0, "logps/rejected": -560.0, "loss": 0.0275, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -20.5, "rewards/margins": 10.8125, "rewards/rejected": -31.25, "step": 20940 }, { "epoch": 1.5123078033638921, "grad_norm": 12.319357340474305, "learning_rate": 5.527108286827734e-07, "logits/chosen": -0.9296875, "logits/rejected": -0.3203125, "logps/chosen": -478.0, "logps/rejected": -556.0, "loss": 0.0461, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -20.875, "rewards/margins": 11.625, "rewards/rejected": -32.5, "step": 20950 }, { "epoch": 1.5130296686638274, "grad_norm": 4.991744589018043, "learning_rate": 5.525789639955376e-07, "logits/chosen": -1.078125, "logits/rejected": -0.474609375, "logps/chosen": -504.0, "logps/rejected": -568.0, "loss": 0.0255, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -20.5, "rewards/margins": 11.8125, "rewards/rejected": -32.25, "step": 20960 }, { "epoch": 1.5137515339637624, "grad_norm": 4.321078918745112, "learning_rate": 5.524471936433611e-07, "logits/chosen": -0.9765625, "logits/rejected": -0.400390625, "logps/chosen": -496.0, "logps/rejected": -576.0, "loss": 0.022, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -21.0, "rewards/margins": 12.75, "rewards/rejected": -33.75, "step": 20970 }, { "epoch": 1.5144733992636974, "grad_norm": 3.939741323676411, "learning_rate": 5.523155175138198e-07, "logits/chosen": -1.125, "logits/rejected": -0.34375, "logps/chosen": -496.0, "logps/rejected": -572.0, "loss": 0.03, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -21.0, "rewards/margins": 12.0, "rewards/rejected": -33.0, "step": 20980 }, { "epoch": 1.5151952645636324, "grad_norm": 12.604547827686376, "learning_rate": 5.52183935494677e-07, "logits/chosen": -1.0234375, "logits/rejected": -0.416015625, "logps/chosen": -504.0, "logps/rejected": -596.0, "loss": 0.041, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -22.375, "rewards/margins": 12.25, "rewards/rejected": -34.75, "step": 20990 }, { "epoch": 1.5159171298635674, "grad_norm": 8.947692560899775, "learning_rate": 5.520524474738833e-07, "logits/chosen": -0.9765625, "logits/rejected": -0.318359375, "logps/chosen": -476.0, "logps/rejected": -576.0, "loss": 0.0213, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -21.125, "rewards/margins": 12.0, "rewards/rejected": -33.25, "step": 21000 }, { "epoch": 1.5166389951635026, "grad_norm": 6.310440203751818, "learning_rate": 5.519210533395758e-07, "logits/chosen": -0.95703125, "logits/rejected": -0.458984375, "logps/chosen": -482.0, "logps/rejected": -568.0, "loss": 0.0381, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -21.0, "rewards/margins": 11.4375, "rewards/rejected": -32.5, "step": 21010 }, { "epoch": 1.5173608604634374, "grad_norm": 6.79296130361368, "learning_rate": 5.517897529800779e-07, "logits/chosen": -1.1171875, "logits/rejected": -0.59765625, "logps/chosen": -490.0, "logps/rejected": -568.0, "loss": 0.0448, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -20.625, "rewards/margins": 12.0, "rewards/rejected": -32.75, "step": 21020 }, { "epoch": 1.5180827257633727, "grad_norm": 5.903211663554724, "learning_rate": 5.51658546283899e-07, "logits/chosen": -1.09375, "logits/rejected": -0.41796875, "logps/chosen": -494.0, "logps/rejected": -568.0, "loss": 0.0455, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -20.5, "rewards/margins": 11.5625, "rewards/rejected": -32.0, "step": 21030 }, { "epoch": 1.5188045910633075, "grad_norm": 6.213859158015678, "learning_rate": 5.515274331397337e-07, "logits/chosen": -1.015625, "logits/rejected": -0.48828125, "logps/chosen": -474.0, "logps/rejected": -532.0, "loss": 0.054, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -18.875, "rewards/margins": 10.75, "rewards/rejected": -29.5, "step": 21040 }, { "epoch": 1.5195264563632427, "grad_norm": 6.644605007441931, "learning_rate": 5.51396413436462e-07, "logits/chosen": -0.9453125, "logits/rejected": -0.34375, "logps/chosen": -458.0, "logps/rejected": -524.0, "loss": 0.0342, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -18.0, "rewards/margins": 10.9375, "rewards/rejected": -29.0, "step": 21050 }, { "epoch": 1.5202483216631777, "grad_norm": 7.294965909588323, "learning_rate": 5.512654870631487e-07, "logits/chosen": -0.9609375, "logits/rejected": -0.384765625, "logps/chosen": -480.0, "logps/rejected": -560.0, "loss": 0.038, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -19.75, "rewards/margins": 11.6875, "rewards/rejected": -31.5, "step": 21060 }, { "epoch": 1.5209701869631127, "grad_norm": 8.077098716415064, "learning_rate": 5.511346539090424e-07, "logits/chosen": -0.8828125, "logits/rejected": -0.3515625, "logps/chosen": -456.0, "logps/rejected": -568.0, "loss": 0.0389, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -19.375, "rewards/margins": 10.875, "rewards/rejected": -30.25, "step": 21070 }, { "epoch": 1.5216920522630477, "grad_norm": 10.918272985921915, "learning_rate": 5.51003913863576e-07, "logits/chosen": -0.9453125, "logits/rejected": -0.40234375, "logps/chosen": -450.0, "logps/rejected": -532.0, "loss": 0.0295, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -18.625, "rewards/margins": 11.0625, "rewards/rejected": -29.625, "step": 21080 }, { "epoch": 1.5224139175629827, "grad_norm": 8.228085309878226, "learning_rate": 5.508732668163659e-07, "logits/chosen": -1.0078125, "logits/rejected": -0.419921875, "logps/chosen": -468.0, "logps/rejected": -548.0, "loss": 0.0351, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -18.75, "rewards/margins": 11.4375, "rewards/rejected": -30.25, "step": 21090 }, { "epoch": 1.5231357828629177, "grad_norm": 9.2600291931187, "learning_rate": 5.507427126572114e-07, "logits/chosen": -0.98828125, "logits/rejected": -0.3828125, "logps/chosen": -480.0, "logps/rejected": -568.0, "loss": 0.0389, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -19.0, "rewards/margins": 11.875, "rewards/rejected": -30.875, "step": 21100 }, { "epoch": 1.5238576481628527, "grad_norm": 5.2247005001698446, "learning_rate": 5.506122512760947e-07, "logits/chosen": -0.84375, "logits/rejected": -0.2451171875, "logps/chosen": -500.0, "logps/rejected": -584.0, "loss": 0.0402, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -20.875, "rewards/margins": 12.0, "rewards/rejected": -32.75, "step": 21110 }, { "epoch": 1.524579513462788, "grad_norm": 10.059655779533378, "learning_rate": 5.504818825631803e-07, "logits/chosen": -0.97265625, "logits/rejected": -0.447265625, "logps/chosen": -480.0, "logps/rejected": -560.0, "loss": 0.0218, "rewards/accuracies": 1.0, "rewards/chosen": -21.125, "rewards/margins": 10.875, "rewards/rejected": -32.0, "step": 21120 }, { "epoch": 1.5253013787627228, "grad_norm": 11.672230428641047, "learning_rate": 5.503516064088146e-07, "logits/chosen": -0.90625, "logits/rejected": -0.2578125, "logps/chosen": -490.0, "logps/rejected": -580.0, "loss": 0.0401, "rewards/accuracies": 0.96875, "rewards/chosen": -22.0, "rewards/margins": 11.8125, "rewards/rejected": -33.75, "step": 21130 }, { "epoch": 1.526023244062658, "grad_norm": 9.745268465726328, "learning_rate": 5.502214227035259e-07, "logits/chosen": -0.88671875, "logits/rejected": -0.38671875, "logps/chosen": -472.0, "logps/rejected": -556.0, "loss": 0.0516, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -20.75, "rewards/margins": 12.0, "rewards/rejected": -32.75, "step": 21140 }, { "epoch": 1.5267451093625928, "grad_norm": 7.695911619817417, "learning_rate": 5.500913313380231e-07, "logits/chosen": -1.09375, "logits/rejected": -0.43359375, "logps/chosen": -506.0, "logps/rejected": -564.0, "loss": 0.0433, "rewards/accuracies": 0.96875, "rewards/chosen": -21.625, "rewards/margins": 11.375, "rewards/rejected": -33.0, "step": 21150 }, { "epoch": 1.527466974662528, "grad_norm": 9.878193914327314, "learning_rate": 5.499613322031964e-07, "logits/chosen": -1.0703125, "logits/rejected": -0.419921875, "logps/chosen": -498.0, "logps/rejected": -580.0, "loss": 0.0568, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -21.625, "rewards/margins": 11.75, "rewards/rejected": -33.5, "step": 21160 }, { "epoch": 1.528188839962463, "grad_norm": 7.425044374517048, "learning_rate": 5.498314251901161e-07, "logits/chosen": -1.0078125, "logits/rejected": -0.318359375, "logps/chosen": -496.0, "logps/rejected": -584.0, "loss": 0.03, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -22.625, "rewards/margins": 11.375, "rewards/rejected": -34.0, "step": 21170 }, { "epoch": 1.528910705262398, "grad_norm": 8.549609695220855, "learning_rate": 5.497016101900326e-07, "logits/chosen": -0.8984375, "logits/rejected": -0.412109375, "logps/chosen": -478.0, "logps/rejected": -568.0, "loss": 0.0385, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -21.125, "rewards/margins": 11.5, "rewards/rejected": -32.75, "step": 21180 }, { "epoch": 1.529632570562333, "grad_norm": 12.246610124505033, "learning_rate": 5.495718870943762e-07, "logits/chosen": -0.94921875, "logits/rejected": -0.4453125, "logps/chosen": -508.0, "logps/rejected": -592.0, "loss": 0.038, "rewards/accuracies": 1.0, "rewards/chosen": -22.75, "rewards/margins": 11.375, "rewards/rejected": -34.25, "step": 21190 }, { "epoch": 1.530354435862268, "grad_norm": 1.9947308425553154, "learning_rate": 5.494422557947561e-07, "logits/chosen": -0.9765625, "logits/rejected": -0.3203125, "logps/chosen": -472.0, "logps/rejected": -544.0, "loss": 0.0357, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -20.0, "rewards/margins": 11.75, "rewards/rejected": -31.75, "step": 21200 }, { "epoch": 1.5310763011622033, "grad_norm": 2.1474501839567064, "learning_rate": 5.493127161829605e-07, "logits/chosen": -0.8203125, "logits/rejected": -0.244140625, "logps/chosen": -512.0, "logps/rejected": -608.0, "loss": 0.0316, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -23.125, "rewards/margins": 11.625, "rewards/rejected": -34.75, "step": 21210 }, { "epoch": 1.531798166462138, "grad_norm": 8.437192449650572, "learning_rate": 5.491832681509561e-07, "logits/chosen": -0.92578125, "logits/rejected": -0.365234375, "logps/chosen": -512.0, "logps/rejected": -592.0, "loss": 0.0423, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -23.25, "rewards/margins": 10.9375, "rewards/rejected": -34.25, "step": 21220 }, { "epoch": 1.5325200317620733, "grad_norm": 11.756459782959887, "learning_rate": 5.490539115908881e-07, "logits/chosen": -0.80078125, "logits/rejected": -0.265625, "logps/chosen": -532.0, "logps/rejected": -608.0, "loss": 0.0362, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -24.125, "rewards/margins": 11.4375, "rewards/rejected": -35.5, "step": 21230 }, { "epoch": 1.533241897062008, "grad_norm": 1.9017281266767592, "learning_rate": 5.489246463950787e-07, "logits/chosen": -0.84375, "logits/rejected": -0.2451171875, "logps/chosen": -528.0, "logps/rejected": -620.0, "loss": 0.0509, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -24.875, "rewards/margins": 12.1875, "rewards/rejected": -37.0, "step": 21240 }, { "epoch": 1.5339637623619433, "grad_norm": 5.37440731282658, "learning_rate": 5.487954724560283e-07, "logits/chosen": -0.91796875, "logits/rejected": -0.2470703125, "logps/chosen": -520.0, "logps/rejected": -596.0, "loss": 0.0372, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -24.5, "rewards/margins": 11.5, "rewards/rejected": -36.0, "step": 21250 }, { "epoch": 1.5346856276618783, "grad_norm": 8.506327156613, "learning_rate": 5.486663896664134e-07, "logits/chosen": -0.92578125, "logits/rejected": -0.439453125, "logps/chosen": -494.0, "logps/rejected": -596.0, "loss": 0.0472, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -23.25, "rewards/margins": 11.8125, "rewards/rejected": -35.0, "step": 21260 }, { "epoch": 1.5354074929618133, "grad_norm": 6.01645209102356, "learning_rate": 5.485373979190881e-07, "logits/chosen": -1.0234375, "logits/rejected": -0.255859375, "logps/chosen": -506.0, "logps/rejected": -584.0, "loss": 0.0306, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -22.375, "rewards/margins": 11.6875, "rewards/rejected": -34.0, "step": 21270 }, { "epoch": 1.5361293582617483, "grad_norm": 2.7644082191537716, "learning_rate": 5.484084971070817e-07, "logits/chosen": -1.140625, "logits/rejected": -0.51953125, "logps/chosen": -512.0, "logps/rejected": -580.0, "loss": 0.0299, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -22.125, "rewards/margins": 11.3125, "rewards/rejected": -33.5, "step": 21280 }, { "epoch": 1.5368512235616834, "grad_norm": 7.416637311607162, "learning_rate": 5.482796871236004e-07, "logits/chosen": -1.0234375, "logits/rejected": -0.4375, "logps/chosen": -516.0, "logps/rejected": -580.0, "loss": 0.0374, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -21.0, "rewards/margins": 11.9375, "rewards/rejected": -33.0, "step": 21290 }, { "epoch": 1.5375730888616186, "grad_norm": 4.735003285971846, "learning_rate": 5.481509678620253e-07, "logits/chosen": -0.81640625, "logits/rejected": -0.384765625, "logps/chosen": -488.0, "logps/rejected": -560.0, "loss": 0.035, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -21.875, "rewards/margins": 10.5, "rewards/rejected": -32.25, "step": 21300 }, { "epoch": 1.5382949541615534, "grad_norm": 3.4551422793004924, "learning_rate": 5.480223392159126e-07, "logits/chosen": -1.1171875, "logits/rejected": -0.54296875, "logps/chosen": -502.0, "logps/rejected": -556.0, "loss": 0.0295, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -19.5, "rewards/margins": 10.5625, "rewards/rejected": -30.125, "step": 21310 }, { "epoch": 1.5390168194614886, "grad_norm": 13.66685230864379, "learning_rate": 5.478938010789937e-07, "logits/chosen": -0.99609375, "logits/rejected": -0.337890625, "logps/chosen": -488.0, "logps/rejected": -564.0, "loss": 0.0437, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -20.0, "rewards/margins": 11.0625, "rewards/rejected": -31.125, "step": 21320 }, { "epoch": 1.5397386847614234, "grad_norm": 5.73842302074398, "learning_rate": 5.477653533451739e-07, "logits/chosen": -0.9296875, "logits/rejected": -0.400390625, "logps/chosen": -520.0, "logps/rejected": -592.0, "loss": 0.0435, "rewards/accuracies": 1.0, "rewards/chosen": -21.625, "rewards/margins": 10.8125, "rewards/rejected": -32.5, "step": 21330 }, { "epoch": 1.5404605500613586, "grad_norm": 8.654918812950562, "learning_rate": 5.476369959085329e-07, "logits/chosen": -1.03125, "logits/rejected": -0.365234375, "logps/chosen": -492.0, "logps/rejected": -560.0, "loss": 0.0472, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -22.25, "rewards/margins": 10.4375, "rewards/rejected": -32.75, "step": 21340 }, { "epoch": 1.5411824153612936, "grad_norm": 3.212536930292904, "learning_rate": 5.475087286633238e-07, "logits/chosen": -0.953125, "logits/rejected": -0.427734375, "logps/chosen": -492.0, "logps/rejected": -584.0, "loss": 0.0303, "rewards/accuracies": 1.0, "rewards/chosen": -21.25, "rewards/margins": 11.5625, "rewards/rejected": -32.75, "step": 21350 }, { "epoch": 1.5419042806612286, "grad_norm": 10.690243278355206, "learning_rate": 5.473805515039733e-07, "logits/chosen": -0.953125, "logits/rejected": -0.3125, "logps/chosen": -484.0, "logps/rejected": -544.0, "loss": 0.0378, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -20.625, "rewards/margins": 10.0625, "rewards/rejected": -30.75, "step": 21360 }, { "epoch": 1.5426261459611637, "grad_norm": 5.709041457155234, "learning_rate": 5.472524643250806e-07, "logits/chosen": -0.87890625, "logits/rejected": -0.322265625, "logps/chosen": -524.0, "logps/rejected": -592.0, "loss": 0.0314, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -23.125, "rewards/margins": 10.875, "rewards/rejected": -34.0, "step": 21370 }, { "epoch": 1.5433480112610987, "grad_norm": 13.627676369459925, "learning_rate": 5.471244670214178e-07, "logits/chosen": -0.890625, "logits/rejected": -0.2275390625, "logps/chosen": -516.0, "logps/rejected": -576.0, "loss": 0.0325, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -21.875, "rewards/margins": 11.875, "rewards/rejected": -33.75, "step": 21380 }, { "epoch": 1.5440698765610337, "grad_norm": 6.217091123302644, "learning_rate": 5.469965594879291e-07, "logits/chosen": -1.0703125, "logits/rejected": -0.39453125, "logps/chosen": -496.0, "logps/rejected": -556.0, "loss": 0.0366, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -21.25, "rewards/margins": 11.8125, "rewards/rejected": -33.0, "step": 21390 }, { "epoch": 1.5447917418609687, "grad_norm": 10.640401709826536, "learning_rate": 5.468687416197306e-07, "logits/chosen": -0.921875, "logits/rejected": -0.28125, "logps/chosen": -506.0, "logps/rejected": -588.0, "loss": 0.0478, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -22.5, "rewards/margins": 11.4375, "rewards/rejected": -34.0, "step": 21400 }, { "epoch": 1.545513607160904, "grad_norm": 10.430732471007069, "learning_rate": 5.467410133121096e-07, "logits/chosen": -1.078125, "logits/rejected": -0.482421875, "logps/chosen": -490.0, "logps/rejected": -580.0, "loss": 0.0282, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -21.5, "rewards/margins": 11.375, "rewards/rejected": -33.0, "step": 21410 }, { "epoch": 1.5462354724608387, "grad_norm": 6.6752297111329835, "learning_rate": 5.466133744605251e-07, "logits/chosen": -0.8984375, "logits/rejected": -0.439453125, "logps/chosen": -516.0, "logps/rejected": -580.0, "loss": 0.0368, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -22.25, "rewards/margins": 11.375, "rewards/rejected": -33.5, "step": 21420 }, { "epoch": 1.546957337760774, "grad_norm": 10.901905044593612, "learning_rate": 5.464858249606063e-07, "logits/chosen": -0.9296875, "logits/rejected": -0.318359375, "logps/chosen": -502.0, "logps/rejected": -580.0, "loss": 0.0389, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -23.5, "rewards/margins": 11.375, "rewards/rejected": -34.75, "step": 21430 }, { "epoch": 1.5476792030607087, "grad_norm": 3.6077550162230825, "learning_rate": 5.46358364708153e-07, "logits/chosen": -0.9765625, "logits/rejected": -0.470703125, "logps/chosen": -504.0, "logps/rejected": -564.0, "loss": 0.0433, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -22.375, "rewards/margins": 11.375, "rewards/rejected": -33.75, "step": 21440 }, { "epoch": 1.548401068360644, "grad_norm": 8.551894596021492, "learning_rate": 5.46230993599135e-07, "logits/chosen": -0.83203125, "logits/rejected": -0.328125, "logps/chosen": -544.0, "logps/rejected": -624.0, "loss": 0.0459, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -24.0, "rewards/margins": 11.5625, "rewards/rejected": -35.5, "step": 21450 }, { "epoch": 1.549122933660579, "grad_norm": 6.383283399552494, "learning_rate": 5.46103711529692e-07, "logits/chosen": -0.84765625, "logits/rejected": -0.345703125, "logps/chosen": -524.0, "logps/rejected": -632.0, "loss": 0.0501, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -23.75, "rewards/margins": 11.5, "rewards/rejected": -35.25, "step": 21460 }, { "epoch": 1.549844798960514, "grad_norm": 4.8073247296736294, "learning_rate": 5.459765183961328e-07, "logits/chosen": -0.796875, "logits/rejected": -0.2275390625, "logps/chosen": -524.0, "logps/rejected": -604.0, "loss": 0.0309, "rewards/accuracies": 1.0, "rewards/chosen": -24.25, "rewards/margins": 11.5, "rewards/rejected": -35.75, "step": 21470 }, { "epoch": 1.550566664260449, "grad_norm": 9.64822326362457, "learning_rate": 5.458494140949352e-07, "logits/chosen": -1.015625, "logits/rejected": -0.296875, "logps/chosen": -512.0, "logps/rejected": -620.0, "loss": 0.0342, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -23.875, "rewards/margins": 12.0625, "rewards/rejected": -36.0, "step": 21480 }, { "epoch": 1.551288529560384, "grad_norm": 7.473938074905632, "learning_rate": 5.457223985227455e-07, "logits/chosen": -0.96875, "logits/rejected": -0.2890625, "logps/chosen": -516.0, "logps/rejected": -572.0, "loss": 0.0465, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -23.25, "rewards/margins": 11.8125, "rewards/rejected": -35.0, "step": 21490 }, { "epoch": 1.5520103948603192, "grad_norm": 4.809451688486798, "learning_rate": 5.455954715763787e-07, "logits/chosen": -1.0234375, "logits/rejected": -0.48046875, "logps/chosen": -476.0, "logps/rejected": -560.0, "loss": 0.0378, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -20.25, "rewards/margins": 11.5625, "rewards/rejected": -31.875, "step": 21500 }, { "epoch": 1.552732260160254, "grad_norm": 13.226358647573935, "learning_rate": 5.454686331528174e-07, "logits/chosen": -0.8828125, "logits/rejected": -0.375, "logps/chosen": -506.0, "logps/rejected": -556.0, "loss": 0.0449, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -20.125, "rewards/margins": 10.375, "rewards/rejected": -30.5, "step": 21510 }, { "epoch": 1.5534541254601892, "grad_norm": 9.650351831078604, "learning_rate": 5.453418831492118e-07, "logits/chosen": -0.89453125, "logits/rejected": -0.52734375, "logps/chosen": -496.0, "logps/rejected": -560.0, "loss": 0.0223, "rewards/accuracies": 1.0, "rewards/chosen": -19.25, "rewards/margins": 11.1875, "rewards/rejected": -30.375, "step": 21520 }, { "epoch": 1.554175990760124, "grad_norm": 3.5896210790546568, "learning_rate": 5.452152214628792e-07, "logits/chosen": -0.796875, "logits/rejected": -0.267578125, "logps/chosen": -476.0, "logps/rejected": -584.0, "loss": 0.0328, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -20.5, "rewards/margins": 11.375, "rewards/rejected": -31.875, "step": 21530 }, { "epoch": 1.5548978560600593, "grad_norm": 7.913838389532741, "learning_rate": 5.450886479913041e-07, "logits/chosen": -0.91796875, "logits/rejected": -0.408203125, "logps/chosen": -488.0, "logps/rejected": -556.0, "loss": 0.0263, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -21.25, "rewards/margins": 10.0625, "rewards/rejected": -31.25, "step": 21540 }, { "epoch": 1.5556197213599943, "grad_norm": 11.36684363334711, "learning_rate": 5.449621626321372e-07, "logits/chosen": -0.8671875, "logits/rejected": -0.29296875, "logps/chosen": -476.0, "logps/rejected": -548.0, "loss": 0.0452, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -20.375, "rewards/margins": 11.375, "rewards/rejected": -31.75, "step": 21550 }, { "epoch": 1.5563415866599293, "grad_norm": 4.621511030397181, "learning_rate": 5.448357652831955e-07, "logits/chosen": -0.94921875, "logits/rejected": -0.45703125, "logps/chosen": -506.0, "logps/rejected": -560.0, "loss": 0.0292, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -20.375, "rewards/margins": 11.25, "rewards/rejected": -31.625, "step": 21560 }, { "epoch": 1.5570634519598643, "grad_norm": 8.680156884125836, "learning_rate": 5.44709455842462e-07, "logits/chosen": -0.9921875, "logits/rejected": -0.32421875, "logps/chosen": -478.0, "logps/rejected": -544.0, "loss": 0.0201, "rewards/accuracies": 1.0, "rewards/chosen": -19.75, "rewards/margins": 11.375, "rewards/rejected": -31.0, "step": 21570 }, { "epoch": 1.5577853172597993, "grad_norm": 8.645845208092936, "learning_rate": 5.445832342080844e-07, "logits/chosen": -0.92578125, "logits/rejected": -0.3046875, "logps/chosen": -476.0, "logps/rejected": -556.0, "loss": 0.0288, "rewards/accuracies": 1.0, "rewards/chosen": -19.875, "rewards/margins": 11.625, "rewards/rejected": -31.625, "step": 21580 }, { "epoch": 1.5585071825597343, "grad_norm": 7.837519671246311, "learning_rate": 5.44457100278377e-07, "logits/chosen": -0.96484375, "logits/rejected": -0.4140625, "logps/chosen": -510.0, "logps/rejected": -592.0, "loss": 0.0437, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -21.875, "rewards/margins": 12.0, "rewards/rejected": -34.0, "step": 21590 }, { "epoch": 1.5592290478596693, "grad_norm": 6.447879697978433, "learning_rate": 5.443310539518173e-07, "logits/chosen": -0.92578125, "logits/rejected": -0.447265625, "logps/chosen": -474.0, "logps/rejected": -540.0, "loss": 0.0304, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -19.0, "rewards/margins": 11.5, "rewards/rejected": -30.5, "step": 21600 }, { "epoch": 1.5599509131596045, "grad_norm": 6.843334331928697, "learning_rate": 5.442050951270483e-07, "logits/chosen": -1.0078125, "logits/rejected": -0.423828125, "logps/chosen": -470.0, "logps/rejected": -536.0, "loss": 0.0372, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -19.625, "rewards/margins": 11.125, "rewards/rejected": -30.75, "step": 21610 }, { "epoch": 1.5606727784595393, "grad_norm": 11.389886297606859, "learning_rate": 5.440792237028766e-07, "logits/chosen": -1.0078125, "logits/rejected": -0.42578125, "logps/chosen": -470.0, "logps/rejected": -576.0, "loss": 0.0395, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -18.75, "rewards/margins": 11.625, "rewards/rejected": -30.375, "step": 21620 }, { "epoch": 1.5613946437594746, "grad_norm": 6.964057147480093, "learning_rate": 5.439534395782728e-07, "logits/chosen": -0.97265625, "logits/rejected": -0.44921875, "logps/chosen": -504.0, "logps/rejected": -580.0, "loss": 0.0379, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -19.75, "rewards/margins": 11.5625, "rewards/rejected": -31.25, "step": 21630 }, { "epoch": 1.5621165090594094, "grad_norm": 10.074737300706303, "learning_rate": 5.438277426523708e-07, "logits/chosen": -1.0546875, "logits/rejected": -0.484375, "logps/chosen": -440.0, "logps/rejected": -532.0, "loss": 0.0418, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -18.125, "rewards/margins": 10.8125, "rewards/rejected": -29.0, "step": 21640 }, { "epoch": 1.5628383743593446, "grad_norm": 7.859042652728329, "learning_rate": 5.437021328244679e-07, "logits/chosen": -0.89453125, "logits/rejected": -0.291015625, "logps/chosen": -480.0, "logps/rejected": -556.0, "loss": 0.0406, "rewards/accuracies": 1.0, "rewards/chosen": -20.5, "rewards/margins": 10.8125, "rewards/rejected": -31.375, "step": 21650 }, { "epoch": 1.5635602396592796, "grad_norm": 5.491392859275762, "learning_rate": 5.435766099940235e-07, "logits/chosen": -0.97265625, "logits/rejected": -0.34375, "logps/chosen": -498.0, "logps/rejected": -572.0, "loss": 0.021, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -21.25, "rewards/margins": 12.375, "rewards/rejected": -33.75, "step": 21660 }, { "epoch": 1.5642821049592146, "grad_norm": 7.686831754387359, "learning_rate": 5.434511740606597e-07, "logits/chosen": -0.8984375, "logits/rejected": -0.318359375, "logps/chosen": -506.0, "logps/rejected": -588.0, "loss": 0.0469, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -21.25, "rewards/margins": 12.6875, "rewards/rejected": -34.0, "step": 21670 }, { "epoch": 1.5650039702591496, "grad_norm": 6.417939531907064, "learning_rate": 5.433258249241613e-07, "logits/chosen": -0.921875, "logits/rejected": -0.345703125, "logps/chosen": -500.0, "logps/rejected": -588.0, "loss": 0.0403, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -21.625, "rewards/margins": 11.5625, "rewards/rejected": -33.0, "step": 21680 }, { "epoch": 1.5657258355590846, "grad_norm": 8.924962556032279, "learning_rate": 5.432005624844737e-07, "logits/chosen": -0.9609375, "logits/rejected": -0.318359375, "logps/chosen": -468.0, "logps/rejected": -556.0, "loss": 0.0478, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -19.25, "rewards/margins": 12.25, "rewards/rejected": -31.5, "step": 21690 }, { "epoch": 1.5664477008590199, "grad_norm": 8.892399349349919, "learning_rate": 5.430753866417044e-07, "logits/chosen": -0.79296875, "logits/rejected": -0.333984375, "logps/chosen": -468.0, "logps/rejected": -560.0, "loss": 0.0295, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -18.75, "rewards/margins": 11.9375, "rewards/rejected": -30.625, "step": 21700 }, { "epoch": 1.5671695661589546, "grad_norm": 6.592012479086353, "learning_rate": 5.429502972961222e-07, "logits/chosen": -0.9140625, "logits/rejected": -0.357421875, "logps/chosen": -452.0, "logps/rejected": -520.0, "loss": 0.035, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -18.75, "rewards/margins": 11.1875, "rewards/rejected": -30.0, "step": 21710 }, { "epoch": 1.5678914314588899, "grad_norm": 7.149878023088217, "learning_rate": 5.428252943481558e-07, "logits/chosen": -0.9765625, "logits/rejected": -0.287109375, "logps/chosen": -482.0, "logps/rejected": -548.0, "loss": 0.0471, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -20.0, "rewards/margins": 11.5, "rewards/rejected": -31.5, "step": 21720 }, { "epoch": 1.5686132967588247, "grad_norm": 4.139921018436647, "learning_rate": 5.427003776983951e-07, "logits/chosen": -0.9453125, "logits/rejected": -0.306640625, "logps/chosen": -442.0, "logps/rejected": -520.0, "loss": 0.0372, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -17.5, "rewards/margins": 10.75, "rewards/rejected": -28.25, "step": 21730 }, { "epoch": 1.56933516205876, "grad_norm": 2.8431249984617635, "learning_rate": 5.425755472475893e-07, "logits/chosen": -0.76953125, "logits/rejected": -0.31640625, "logps/chosen": -502.0, "logps/rejected": -576.0, "loss": 0.0266, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -21.25, "rewards/margins": 11.9375, "rewards/rejected": -33.25, "step": 21740 }, { "epoch": 1.570057027358695, "grad_norm": 9.663259098911228, "learning_rate": 5.424508028966484e-07, "logits/chosen": -0.89453125, "logits/rejected": -0.439453125, "logps/chosen": -476.0, "logps/rejected": -544.0, "loss": 0.0389, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -18.875, "rewards/margins": 11.3125, "rewards/rejected": -30.125, "step": 21750 }, { "epoch": 1.57077889265863, "grad_norm": 4.038054340366836, "learning_rate": 5.423261445466404e-07, "logits/chosen": -0.828125, "logits/rejected": -0.29296875, "logps/chosen": -470.0, "logps/rejected": -564.0, "loss": 0.048, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -19.375, "rewards/margins": 11.625, "rewards/rejected": -31.125, "step": 21760 }, { "epoch": 1.571500757958565, "grad_norm": 15.044592571138375, "learning_rate": 5.422015720987936e-07, "logits/chosen": -0.8359375, "logits/rejected": -0.326171875, "logps/chosen": -492.0, "logps/rejected": -580.0, "loss": 0.0636, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -19.125, "rewards/margins": 11.9375, "rewards/rejected": -31.0, "step": 21770 }, { "epoch": 1.5722226232585, "grad_norm": 4.422093049122864, "learning_rate": 5.420770854544944e-07, "logits/chosen": -0.96484375, "logits/rejected": -0.443359375, "logps/chosen": -458.0, "logps/rejected": -528.0, "loss": 0.037, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -17.5, "rewards/margins": 11.3125, "rewards/rejected": -28.875, "step": 21780 }, { "epoch": 1.5729444885584352, "grad_norm": 6.632638086847986, "learning_rate": 5.419526845152879e-07, "logits/chosen": -0.890625, "logits/rejected": -0.2158203125, "logps/chosen": -468.0, "logps/rejected": -544.0, "loss": 0.0241, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -18.5, "rewards/margins": 10.9375, "rewards/rejected": -29.5, "step": 21790 }, { "epoch": 1.57366635385837, "grad_norm": 8.300286342268462, "learning_rate": 5.418283691828771e-07, "logits/chosen": -0.81640625, "logits/rejected": -0.1494140625, "logps/chosen": -480.0, "logps/rejected": -552.0, "loss": 0.0468, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -20.125, "rewards/margins": 11.375, "rewards/rejected": -31.5, "step": 21800 }, { "epoch": 1.5743882191583052, "grad_norm": 9.60317753460245, "learning_rate": 5.417041393591227e-07, "logits/chosen": -0.921875, "logits/rejected": -0.46875, "logps/chosen": -486.0, "logps/rejected": -536.0, "loss": 0.0463, "rewards/accuracies": 1.0, "rewards/chosen": -18.125, "rewards/margins": 11.25, "rewards/rejected": -29.375, "step": 21810 }, { "epoch": 1.57511008445824, "grad_norm": 2.7043689608325536, "learning_rate": 5.415799949460433e-07, "logits/chosen": -1.046875, "logits/rejected": -0.427734375, "logps/chosen": -486.0, "logps/rejected": -540.0, "loss": 0.0358, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -19.125, "rewards/margins": 11.4375, "rewards/rejected": -30.5, "step": 21820 }, { "epoch": 1.5758319497581752, "grad_norm": 4.892822260304265, "learning_rate": 5.414559358458137e-07, "logits/chosen": -0.91796875, "logits/rejected": -0.404296875, "logps/chosen": -504.0, "logps/rejected": -584.0, "loss": 0.0251, "rewards/accuracies": 1.0, "rewards/chosen": -20.625, "rewards/margins": 11.4375, "rewards/rejected": -32.0, "step": 21830 }, { "epoch": 1.5765538150581102, "grad_norm": 4.555827603301777, "learning_rate": 5.413319619607667e-07, "logits/chosen": -1.0, "logits/rejected": -0.28515625, "logps/chosen": -524.0, "logps/rejected": -592.0, "loss": 0.039, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -20.875, "rewards/margins": 11.8125, "rewards/rejected": -32.75, "step": 21840 }, { "epoch": 1.5772756803580452, "grad_norm": 8.959980082174791, "learning_rate": 5.412080731933907e-07, "logits/chosen": -0.94921875, "logits/rejected": -0.40625, "logps/chosen": -508.0, "logps/rejected": -584.0, "loss": 0.0315, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -21.875, "rewards/margins": 10.4375, "rewards/rejected": -32.25, "step": 21850 }, { "epoch": 1.5779975456579802, "grad_norm": 5.986679947110591, "learning_rate": 5.410842694463302e-07, "logits/chosen": -0.87890625, "logits/rejected": -0.3046875, "logps/chosen": -492.0, "logps/rejected": -548.0, "loss": 0.0452, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -21.125, "rewards/margins": 11.0, "rewards/rejected": -32.0, "step": 21860 }, { "epoch": 1.5787194109579152, "grad_norm": 7.990995106640052, "learning_rate": 5.409605506223862e-07, "logits/chosen": -0.81640625, "logits/rejected": -0.388671875, "logps/chosen": -484.0, "logps/rejected": -580.0, "loss": 0.0517, "rewards/accuracies": 1.0, "rewards/chosen": -23.0, "rewards/margins": 11.0625, "rewards/rejected": -34.0, "step": 21870 }, { "epoch": 1.5794412762578502, "grad_norm": 0.6701206898763926, "learning_rate": 5.408369166245146e-07, "logits/chosen": -0.94140625, "logits/rejected": -0.1572265625, "logps/chosen": -496.0, "logps/rejected": -560.0, "loss": 0.0211, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -21.875, "rewards/margins": 11.625, "rewards/rejected": -33.5, "step": 21880 }, { "epoch": 1.5801631415577853, "grad_norm": 6.751034595639384, "learning_rate": 5.407133673558267e-07, "logits/chosen": -0.89453125, "logits/rejected": -0.349609375, "logps/chosen": -510.0, "logps/rejected": -600.0, "loss": 0.0293, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -22.0, "rewards/margins": 12.5, "rewards/rejected": -34.5, "step": 21890 }, { "epoch": 1.5808850068577205, "grad_norm": 7.886666303289136, "learning_rate": 5.405899027195888e-07, "logits/chosen": -0.9296875, "logits/rejected": -0.42578125, "logps/chosen": -486.0, "logps/rejected": -560.0, "loss": 0.0339, "rewards/accuracies": 1.0, "rewards/chosen": -20.625, "rewards/margins": 11.5625, "rewards/rejected": -32.25, "step": 21900 }, { "epoch": 1.5816068721576553, "grad_norm": 7.6890753825776335, "learning_rate": 5.404665226192212e-07, "logits/chosen": -1.015625, "logits/rejected": -0.376953125, "logps/chosen": -480.0, "logps/rejected": -564.0, "loss": 0.0409, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -21.5, "rewards/margins": 11.75, "rewards/rejected": -33.25, "step": 21910 }, { "epoch": 1.5823287374575905, "grad_norm": 8.038311851195248, "learning_rate": 5.403432269582991e-07, "logits/chosen": -0.84765625, "logits/rejected": -0.357421875, "logps/chosen": -498.0, "logps/rejected": -588.0, "loss": 0.0454, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -21.125, "rewards/margins": 11.125, "rewards/rejected": -32.25, "step": 21920 }, { "epoch": 1.5830506027575253, "grad_norm": 10.161126426973004, "learning_rate": 5.402200156405514e-07, "logits/chosen": -0.859375, "logits/rejected": -0.166015625, "logps/chosen": -488.0, "logps/rejected": -564.0, "loss": 0.0428, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -20.625, "rewards/margins": 11.6875, "rewards/rejected": -32.25, "step": 21930 }, { "epoch": 1.5837724680574605, "grad_norm": 9.194804003258563, "learning_rate": 5.400968885698603e-07, "logits/chosen": -0.875, "logits/rejected": -0.37109375, "logps/chosen": -498.0, "logps/rejected": -576.0, "loss": 0.0352, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -21.625, "rewards/margins": 11.75, "rewards/rejected": -33.25, "step": 21940 }, { "epoch": 1.5844943333573955, "grad_norm": 5.564076207500214, "learning_rate": 5.399738456502616e-07, "logits/chosen": -0.953125, "logits/rejected": -0.353515625, "logps/chosen": -476.0, "logps/rejected": -564.0, "loss": 0.0362, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -19.5, "rewards/margins": 12.3125, "rewards/rejected": -31.875, "step": 21950 }, { "epoch": 1.5852161986573305, "grad_norm": 14.467394903715823, "learning_rate": 5.398508867859439e-07, "logits/chosen": -0.8671875, "logits/rejected": -0.37109375, "logps/chosen": -486.0, "logps/rejected": -556.0, "loss": 0.0487, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -19.625, "rewards/margins": 10.9375, "rewards/rejected": -30.625, "step": 21960 }, { "epoch": 1.5859380639572656, "grad_norm": 8.696844615689228, "learning_rate": 5.397280118812487e-07, "logits/chosen": -1.109375, "logits/rejected": -0.51171875, "logps/chosen": -482.0, "logps/rejected": -552.0, "loss": 0.0371, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -20.5, "rewards/margins": 11.25, "rewards/rejected": -31.75, "step": 21970 }, { "epoch": 1.5866599292572006, "grad_norm": 7.121155041259728, "learning_rate": 5.396052208406695e-07, "logits/chosen": -0.96875, "logits/rejected": -0.44921875, "logps/chosen": -486.0, "logps/rejected": -552.0, "loss": 0.0397, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -19.375, "rewards/margins": 10.75, "rewards/rejected": -30.125, "step": 21980 }, { "epoch": 1.5873817945571358, "grad_norm": 2.4184994421866643, "learning_rate": 5.394825135688519e-07, "logits/chosen": -0.9921875, "logits/rejected": -0.41015625, "logps/chosen": -478.0, "logps/rejected": -564.0, "loss": 0.0386, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -19.5, "rewards/margins": 11.75, "rewards/rejected": -31.25, "step": 21990 }, { "epoch": 1.5881036598570706, "grad_norm": 2.93487550080862, "learning_rate": 5.393598899705937e-07, "logits/chosen": -0.859375, "logits/rejected": -0.369140625, "logps/chosen": -484.0, "logps/rejected": -548.0, "loss": 0.0325, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -19.5, "rewards/margins": 11.25, "rewards/rejected": -30.75, "step": 22000 }, { "epoch": 1.5888255251570058, "grad_norm": 7.717121138419722, "learning_rate": 5.392373499508432e-07, "logits/chosen": -0.953125, "logits/rejected": -0.33984375, "logps/chosen": -466.0, "logps/rejected": -568.0, "loss": 0.0505, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -20.375, "rewards/margins": 11.6875, "rewards/rejected": -32.0, "step": 22010 }, { "epoch": 1.5895473904569406, "grad_norm": 7.657911200956337, "learning_rate": 5.391148934147006e-07, "logits/chosen": -1.1328125, "logits/rejected": -0.671875, "logps/chosen": -450.0, "logps/rejected": -536.0, "loss": 0.0313, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -18.875, "rewards/margins": 11.125, "rewards/rejected": -30.0, "step": 22020 }, { "epoch": 1.5902692557568758, "grad_norm": 5.686379482986802, "learning_rate": 5.389925202674166e-07, "logits/chosen": -0.95703125, "logits/rejected": -0.1943359375, "logps/chosen": -490.0, "logps/rejected": -580.0, "loss": 0.0299, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -20.75, "rewards/margins": 11.6875, "rewards/rejected": -32.5, "step": 22030 }, { "epoch": 1.5909911210568108, "grad_norm": 9.203493194807033, "learning_rate": 5.388702304143923e-07, "logits/chosen": -0.921875, "logits/rejected": -0.4140625, "logps/chosen": -472.0, "logps/rejected": -560.0, "loss": 0.0407, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -21.0, "rewards/margins": 10.3125, "rewards/rejected": -31.375, "step": 22040 }, { "epoch": 1.5917129863567459, "grad_norm": 8.606261904763022, "learning_rate": 5.387480237611791e-07, "logits/chosen": -0.96875, "logits/rejected": -0.25, "logps/chosen": -488.0, "logps/rejected": -580.0, "loss": 0.0306, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -21.625, "rewards/margins": 12.0625, "rewards/rejected": -33.75, "step": 22050 }, { "epoch": 1.5924348516566809, "grad_norm": 10.769894662699206, "learning_rate": 5.386259002134781e-07, "logits/chosen": -0.91796875, "logits/rejected": -0.4140625, "logps/chosen": -494.0, "logps/rejected": -572.0, "loss": 0.0601, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -21.625, "rewards/margins": 10.8125, "rewards/rejected": -32.5, "step": 22060 }, { "epoch": 1.5931567169566159, "grad_norm": 3.1673452578750037, "learning_rate": 5.385038596771402e-07, "logits/chosen": -1.1171875, "logits/rejected": -0.54296875, "logps/chosen": -500.0, "logps/rejected": -592.0, "loss": 0.0289, "rewards/accuracies": 1.0, "rewards/chosen": -21.125, "rewards/margins": 12.1875, "rewards/rejected": -33.25, "step": 22070 }, { "epoch": 1.5938785822565509, "grad_norm": 7.929238919614598, "learning_rate": 5.383819020581654e-07, "logits/chosen": -1.0859375, "logits/rejected": -0.423828125, "logps/chosen": -506.0, "logps/rejected": -596.0, "loss": 0.0259, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -21.375, "rewards/margins": 12.625, "rewards/rejected": -34.0, "step": 22080 }, { "epoch": 1.594600447556486, "grad_norm": 5.63966378052398, "learning_rate": 5.382600272627028e-07, "logits/chosen": -0.89453125, "logits/rejected": -0.40234375, "logps/chosen": -496.0, "logps/rejected": -568.0, "loss": 0.0451, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -21.75, "rewards/margins": 12.3125, "rewards/rejected": -34.0, "step": 22090 }, { "epoch": 1.5953223128564211, "grad_norm": 4.653707101841594, "learning_rate": 5.381382351970499e-07, "logits/chosen": -1.03125, "logits/rejected": -0.435546875, "logps/chosen": -458.0, "logps/rejected": -564.0, "loss": 0.0277, "rewards/accuracies": 1.0, "rewards/chosen": -19.25, "rewards/margins": 12.0, "rewards/rejected": -31.25, "step": 22100 }, { "epoch": 1.596044178156356, "grad_norm": 8.420792863242069, "learning_rate": 5.380165257676528e-07, "logits/chosen": -1.0859375, "logits/rejected": -0.455078125, "logps/chosen": -500.0, "logps/rejected": -544.0, "loss": 0.0426, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -20.25, "rewards/margins": 11.0, "rewards/rejected": -31.25, "step": 22110 }, { "epoch": 1.5967660434562911, "grad_norm": 8.494608550505239, "learning_rate": 5.378948988811054e-07, "logits/chosen": -0.984375, "logits/rejected": -0.34765625, "logps/chosen": -480.0, "logps/rejected": -532.0, "loss": 0.0329, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -19.75, "rewards/margins": 11.4375, "rewards/rejected": -31.125, "step": 22120 }, { "epoch": 1.597487908756226, "grad_norm": 7.714034174446202, "learning_rate": 5.377733544441496e-07, "logits/chosen": -1.0703125, "logits/rejected": -0.53125, "logps/chosen": -484.0, "logps/rejected": -552.0, "loss": 0.0341, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -19.625, "rewards/margins": 12.0, "rewards/rejected": -31.625, "step": 22130 }, { "epoch": 1.5982097740561612, "grad_norm": 10.997290016079363, "learning_rate": 5.376518923636746e-07, "logits/chosen": -0.91796875, "logits/rejected": -0.421875, "logps/chosen": -486.0, "logps/rejected": -556.0, "loss": 0.0373, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -21.75, "rewards/margins": 11.1875, "rewards/rejected": -33.0, "step": 22140 }, { "epoch": 1.5989316393560962, "grad_norm": 4.9664413265403216, "learning_rate": 5.375305125467166e-07, "logits/chosen": -0.859375, "logits/rejected": -0.392578125, "logps/chosen": -480.0, "logps/rejected": -576.0, "loss": 0.0257, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -21.0, "rewards/margins": 11.625, "rewards/rejected": -32.5, "step": 22150 }, { "epoch": 1.5996535046560312, "grad_norm": 9.533376778193725, "learning_rate": 5.37409214900459e-07, "logits/chosen": -1.015625, "logits/rejected": -0.5234375, "logps/chosen": -458.0, "logps/rejected": -556.0, "loss": 0.0361, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -18.75, "rewards/margins": 11.1875, "rewards/rejected": -29.875, "step": 22160 }, { "epoch": 1.6003753699559662, "grad_norm": 5.663551719413997, "learning_rate": 5.372879993322315e-07, "logits/chosen": -1.03125, "logits/rejected": -0.5546875, "logps/chosen": -476.0, "logps/rejected": -560.0, "loss": 0.0444, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -19.125, "rewards/margins": 11.6875, "rewards/rejected": -30.75, "step": 22170 }, { "epoch": 1.6010972352559012, "grad_norm": 10.52937000129152, "learning_rate": 5.371668657495103e-07, "logits/chosen": -0.99609375, "logits/rejected": -0.546875, "logps/chosen": -452.0, "logps/rejected": -540.0, "loss": 0.0415, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -18.875, "rewards/margins": 11.1875, "rewards/rejected": -30.125, "step": 22180 }, { "epoch": 1.6018191005558364, "grad_norm": 7.389794847035026, "learning_rate": 5.37045814059917e-07, "logits/chosen": -0.94140625, "logits/rejected": -0.390625, "logps/chosen": -438.0, "logps/rejected": -532.0, "loss": 0.0327, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -17.75, "rewards/margins": 11.625, "rewards/rejected": -29.375, "step": 22190 }, { "epoch": 1.6025409658557712, "grad_norm": 9.157904459087323, "learning_rate": 5.369248441712195e-07, "logits/chosen": -1.0625, "logits/rejected": -0.55078125, "logps/chosen": -458.0, "logps/rejected": -528.0, "loss": 0.0357, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -17.625, "rewards/margins": 11.125, "rewards/rejected": -28.75, "step": 22200 }, { "epoch": 1.6032628311557064, "grad_norm": 4.685171582524784, "learning_rate": 5.368039559913306e-07, "logits/chosen": -1.1015625, "logits/rejected": -0.478515625, "logps/chosen": -494.0, "logps/rejected": -580.0, "loss": 0.035, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -20.375, "rewards/margins": 11.25, "rewards/rejected": -31.625, "step": 22210 }, { "epoch": 1.6039846964556412, "grad_norm": 7.081920954086708, "learning_rate": 5.366831494283084e-07, "logits/chosen": -1.1171875, "logits/rejected": -0.349609375, "logps/chosen": -462.0, "logps/rejected": -548.0, "loss": 0.0304, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -20.25, "rewards/margins": 11.875, "rewards/rejected": -32.0, "step": 22220 }, { "epoch": 1.6047065617555765, "grad_norm": 9.359680177606696, "learning_rate": 5.365624243903558e-07, "logits/chosen": -1.0546875, "logits/rejected": -0.458984375, "logps/chosen": -512.0, "logps/rejected": -568.0, "loss": 0.0411, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -21.125, "rewards/margins": 12.1875, "rewards/rejected": -33.25, "step": 22230 }, { "epoch": 1.6054284270555115, "grad_norm": 1.7067675296306462, "learning_rate": 5.3644178078582e-07, "logits/chosen": -1.046875, "logits/rejected": -0.41015625, "logps/chosen": -486.0, "logps/rejected": -564.0, "loss": 0.0253, "rewards/accuracies": 1.0, "rewards/chosen": -20.875, "rewards/margins": 12.0, "rewards/rejected": -33.0, "step": 22240 }, { "epoch": 1.6061502923554465, "grad_norm": 11.154977109765358, "learning_rate": 5.363212185231927e-07, "logits/chosen": -1.0703125, "logits/rejected": -0.41796875, "logps/chosen": -480.0, "logps/rejected": -564.0, "loss": 0.0407, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -20.75, "rewards/margins": 11.25, "rewards/rejected": -31.875, "step": 22250 }, { "epoch": 1.6068721576553815, "grad_norm": 5.099029317053824, "learning_rate": 5.362007375111091e-07, "logits/chosen": -0.875, "logits/rejected": -0.451171875, "logps/chosen": -472.0, "logps/rejected": -568.0, "loss": 0.0304, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -20.75, "rewards/margins": 11.6875, "rewards/rejected": -32.5, "step": 22260 }, { "epoch": 1.6075940229553165, "grad_norm": 2.78319339310038, "learning_rate": 5.360803376583483e-07, "logits/chosen": -1.03125, "logits/rejected": -0.439453125, "logps/chosen": -500.0, "logps/rejected": -560.0, "loss": 0.0502, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -20.875, "rewards/margins": 11.125, "rewards/rejected": -32.0, "step": 22270 }, { "epoch": 1.6083158882552517, "grad_norm": 5.098755234528997, "learning_rate": 5.359600188738324e-07, "logits/chosen": -0.890625, "logits/rejected": -0.32421875, "logps/chosen": -498.0, "logps/rejected": -588.0, "loss": 0.0392, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -20.5, "rewards/margins": 11.8125, "rewards/rejected": -32.25, "step": 22280 }, { "epoch": 1.6090377535551865, "grad_norm": 5.3104769995474665, "learning_rate": 5.35839781066627e-07, "logits/chosen": -0.94140625, "logits/rejected": -0.2275390625, "logps/chosen": -468.0, "logps/rejected": -552.0, "loss": 0.036, "rewards/accuracies": 0.96875, "rewards/chosen": -20.25, "rewards/margins": 11.0625, "rewards/rejected": -31.375, "step": 22290 }, { "epoch": 1.6097596188551218, "grad_norm": 8.787638041499589, "learning_rate": 5.357196241459401e-07, "logits/chosen": -0.921875, "logits/rejected": -0.33984375, "logps/chosen": -474.0, "logps/rejected": -564.0, "loss": 0.0407, "rewards/accuracies": 0.96875, "rewards/chosen": -21.0, "rewards/margins": 11.875, "rewards/rejected": -32.75, "step": 22300 }, { "epoch": 1.6104814841550565, "grad_norm": 4.902494837908547, "learning_rate": 5.355995480211221e-07, "logits/chosen": -1.1015625, "logits/rejected": -0.5234375, "logps/chosen": -470.0, "logps/rejected": -552.0, "loss": 0.0399, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -20.875, "rewards/margins": 11.4375, "rewards/rejected": -32.25, "step": 22310 }, { "epoch": 1.6112033494549918, "grad_norm": 10.096661575068012, "learning_rate": 5.354795526016659e-07, "logits/chosen": -0.94921875, "logits/rejected": -0.3984375, "logps/chosen": -502.0, "logps/rejected": -568.0, "loss": 0.036, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -21.25, "rewards/margins": 11.5, "rewards/rejected": -32.75, "step": 22320 }, { "epoch": 1.6119252147549268, "grad_norm": 8.336076495936679, "learning_rate": 5.353596377972059e-07, "logits/chosen": -1.078125, "logits/rejected": -0.30859375, "logps/chosen": -488.0, "logps/rejected": -572.0, "loss": 0.0407, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -19.25, "rewards/margins": 11.5, "rewards/rejected": -30.75, "step": 22330 }, { "epoch": 1.6126470800548618, "grad_norm": 3.490285361040781, "learning_rate": 5.352398035175184e-07, "logits/chosen": -0.9453125, "logits/rejected": -0.349609375, "logps/chosen": -484.0, "logps/rejected": -540.0, "loss": 0.0406, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -20.5, "rewards/margins": 11.3125, "rewards/rejected": -31.75, "step": 22340 }, { "epoch": 1.6133689453547968, "grad_norm": 7.679492684285736, "learning_rate": 5.351200496725209e-07, "logits/chosen": -0.98828125, "logits/rejected": -0.50390625, "logps/chosen": -488.0, "logps/rejected": -568.0, "loss": 0.0437, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -20.25, "rewards/margins": 12.1875, "rewards/rejected": -32.5, "step": 22350 }, { "epoch": 1.6140908106547318, "grad_norm": 6.475164033297049, "learning_rate": 5.350003761722717e-07, "logits/chosen": -0.94140625, "logits/rejected": -0.50390625, "logps/chosen": -484.0, "logps/rejected": -548.0, "loss": 0.0497, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -19.75, "rewards/margins": 11.9375, "rewards/rejected": -31.625, "step": 22360 }, { "epoch": 1.6148126759546668, "grad_norm": 9.64373520693428, "learning_rate": 5.348807829269702e-07, "logits/chosen": -0.87109375, "logits/rejected": -0.392578125, "logps/chosen": -516.0, "logps/rejected": -584.0, "loss": 0.0426, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -23.0, "rewards/margins": 10.625, "rewards/rejected": -33.5, "step": 22370 }, { "epoch": 1.6155345412546018, "grad_norm": 10.569973918818595, "learning_rate": 5.347612698469559e-07, "logits/chosen": -0.8515625, "logits/rejected": -0.369140625, "logps/chosen": -500.0, "logps/rejected": -576.0, "loss": 0.0332, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -21.0, "rewards/margins": 11.1875, "rewards/rejected": -32.25, "step": 22380 }, { "epoch": 1.616256406554537, "grad_norm": 4.837799024331139, "learning_rate": 5.346418368427089e-07, "logits/chosen": -0.98828125, "logits/rejected": -0.46484375, "logps/chosen": -470.0, "logps/rejected": -564.0, "loss": 0.0334, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -19.75, "rewards/margins": 11.6875, "rewards/rejected": -31.375, "step": 22390 }, { "epoch": 1.6169782718544718, "grad_norm": 2.2972973879523506, "learning_rate": 5.345224838248488e-07, "logits/chosen": -0.8046875, "logits/rejected": -0.42578125, "logps/chosen": -502.0, "logps/rejected": -580.0, "loss": 0.0535, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -20.625, "rewards/margins": 11.3125, "rewards/rejected": -32.0, "step": 22400 }, { "epoch": 1.617700137154407, "grad_norm": 6.3265217143385675, "learning_rate": 5.344032107041349e-07, "logits/chosen": -0.92578125, "logits/rejected": -0.4296875, "logps/chosen": -472.0, "logps/rejected": -548.0, "loss": 0.044, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -18.0, "rewards/margins": 10.625, "rewards/rejected": -28.625, "step": 22410 }, { "epoch": 1.6184220024543419, "grad_norm": 2.390354019732529, "learning_rate": 5.34284017391466e-07, "logits/chosen": -0.97265625, "logits/rejected": -0.34765625, "logps/chosen": -472.0, "logps/rejected": -520.0, "loss": 0.0421, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -18.875, "rewards/margins": 9.9375, "rewards/rejected": -28.875, "step": 22420 }, { "epoch": 1.619143867754277, "grad_norm": 6.64782156862032, "learning_rate": 5.341649037978797e-07, "logits/chosen": -0.8984375, "logits/rejected": -0.314453125, "logps/chosen": -470.0, "logps/rejected": -560.0, "loss": 0.0282, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -19.125, "rewards/margins": 10.9375, "rewards/rejected": -30.125, "step": 22430 }, { "epoch": 1.619865733054212, "grad_norm": 5.701581674946874, "learning_rate": 5.340458698345527e-07, "logits/chosen": -1.046875, "logits/rejected": -0.388671875, "logps/chosen": -484.0, "logps/rejected": -552.0, "loss": 0.0473, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -18.75, "rewards/margins": 11.1875, "rewards/rejected": -30.0, "step": 22440 }, { "epoch": 1.6205875983541471, "grad_norm": 7.247061062737589, "learning_rate": 5.339269154127999e-07, "logits/chosen": -1.03125, "logits/rejected": -0.50390625, "logps/chosen": -496.0, "logps/rejected": -572.0, "loss": 0.0468, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -20.0, "rewards/margins": 11.0625, "rewards/rejected": -31.125, "step": 22450 }, { "epoch": 1.6213094636540821, "grad_norm": 4.563083554521247, "learning_rate": 5.338080404440744e-07, "logits/chosen": -0.77734375, "logits/rejected": -0.318359375, "logps/chosen": -464.0, "logps/rejected": -548.0, "loss": 0.0423, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -19.25, "rewards/margins": 11.0625, "rewards/rejected": -30.375, "step": 22460 }, { "epoch": 1.6220313289540171, "grad_norm": 6.765153581803742, "learning_rate": 5.336892448399675e-07, "logits/chosen": -0.83203125, "logits/rejected": -0.2578125, "logps/chosen": -480.0, "logps/rejected": -548.0, "loss": 0.0372, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -20.625, "rewards/margins": 12.0, "rewards/rejected": -32.5, "step": 22470 }, { "epoch": 1.6227531942539524, "grad_norm": 2.592668675505673, "learning_rate": 5.335705285122082e-07, "logits/chosen": -0.859375, "logits/rejected": -0.2470703125, "logps/chosen": -508.0, "logps/rejected": -600.0, "loss": 0.0409, "rewards/accuracies": 1.0, "rewards/chosen": -21.375, "rewards/margins": 12.0625, "rewards/rejected": -33.5, "step": 22480 }, { "epoch": 1.6234750595538872, "grad_norm": 12.300231573579062, "learning_rate": 5.334518913726623e-07, "logits/chosen": -0.90625, "logits/rejected": -0.34375, "logps/chosen": -480.0, "logps/rejected": -572.0, "loss": 0.0501, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -21.5, "rewards/margins": 11.6875, "rewards/rejected": -33.25, "step": 22490 }, { "epoch": 1.6241969248538224, "grad_norm": 6.633745920641773, "learning_rate": 5.333333333333333e-07, "logits/chosen": -1.0078125, "logits/rejected": -0.451171875, "logps/chosen": -484.0, "logps/rejected": -544.0, "loss": 0.0485, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -20.0, "rewards/margins": 11.5625, "rewards/rejected": -31.5, "step": 22500 }, { "epoch": 1.6249187901537572, "grad_norm": 3.605711603945024, "learning_rate": 5.332148543063615e-07, "logits/chosen": -0.92578125, "logits/rejected": -0.36328125, "logps/chosen": -520.0, "logps/rejected": -580.0, "loss": 0.0332, "rewards/accuracies": 0.96875, "rewards/chosen": -21.875, "rewards/margins": 11.0625, "rewards/rejected": -33.0, "step": 22510 }, { "epoch": 1.6256406554536924, "grad_norm": 5.6568082418270285, "learning_rate": 5.330964542040233e-07, "logits/chosen": -0.78515625, "logits/rejected": -0.314453125, "logps/chosen": -472.0, "logps/rejected": -548.0, "loss": 0.0378, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -19.625, "rewards/margins": 10.5625, "rewards/rejected": -30.125, "step": 22520 }, { "epoch": 1.6263625207536274, "grad_norm": 6.486088380637691, "learning_rate": 5.32978132938732e-07, "logits/chosen": -1.1484375, "logits/rejected": -0.515625, "logps/chosen": -506.0, "logps/rejected": -556.0, "loss": 0.0617, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -19.25, "rewards/margins": 11.125, "rewards/rejected": -30.375, "step": 22530 }, { "epoch": 1.6270843860535624, "grad_norm": 10.266561242345176, "learning_rate": 5.328598904230365e-07, "logits/chosen": -0.91015625, "logits/rejected": -0.39453125, "logps/chosen": -486.0, "logps/rejected": -552.0, "loss": 0.0336, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -19.0, "rewards/margins": 11.0625, "rewards/rejected": -30.0, "step": 22540 }, { "epoch": 1.6278062513534974, "grad_norm": 7.901728048060941, "learning_rate": 5.327417265696215e-07, "logits/chosen": -0.88671875, "logits/rejected": -0.3125, "logps/chosen": -470.0, "logps/rejected": -556.0, "loss": 0.0362, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -19.875, "rewards/margins": 10.875, "rewards/rejected": -30.75, "step": 22550 }, { "epoch": 1.6285281166534324, "grad_norm": 3.396393261483008, "learning_rate": 5.326236412913074e-07, "logits/chosen": -0.91796875, "logits/rejected": -0.25390625, "logps/chosen": -476.0, "logps/rejected": -576.0, "loss": 0.0335, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -20.0, "rewards/margins": 11.625, "rewards/rejected": -31.625, "step": 22560 }, { "epoch": 1.6292499819533675, "grad_norm": 6.217332863462395, "learning_rate": 5.325056345010497e-07, "logits/chosen": -0.921875, "logits/rejected": -0.3984375, "logps/chosen": -436.0, "logps/rejected": -528.0, "loss": 0.0452, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -18.125, "rewards/margins": 10.375, "rewards/rejected": -28.5, "step": 22570 }, { "epoch": 1.6299718472533025, "grad_norm": 9.011903583667802, "learning_rate": 5.323877061119386e-07, "logits/chosen": -0.98828125, "logits/rejected": -0.455078125, "logps/chosen": -466.0, "logps/rejected": -552.0, "loss": 0.033, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -17.875, "rewards/margins": 12.125, "rewards/rejected": -30.0, "step": 22580 }, { "epoch": 1.6306937125532377, "grad_norm": 6.596634068992424, "learning_rate": 5.322698560371995e-07, "logits/chosen": -0.93359375, "logits/rejected": -0.427734375, "logps/chosen": -468.0, "logps/rejected": -556.0, "loss": 0.0472, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -19.25, "rewards/margins": 11.0625, "rewards/rejected": -30.25, "step": 22590 }, { "epoch": 1.6314155778531725, "grad_norm": 6.678077559535046, "learning_rate": 5.321520841901914e-07, "logits/chosen": -0.8046875, "logits/rejected": -0.296875, "logps/chosen": -484.0, "logps/rejected": -556.0, "loss": 0.0252, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -20.5, "rewards/margins": 10.625, "rewards/rejected": -31.125, "step": 22600 }, { "epoch": 1.6321374431531077, "grad_norm": 8.691814024088426, "learning_rate": 5.320343904844084e-07, "logits/chosen": -0.94921875, "logits/rejected": -0.455078125, "logps/chosen": -474.0, "logps/rejected": -548.0, "loss": 0.0383, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -19.875, "rewards/margins": 10.5, "rewards/rejected": -30.25, "step": 22610 }, { "epoch": 1.6328593084530425, "grad_norm": 2.4247222284061754, "learning_rate": 5.319167748334776e-07, "logits/chosen": -1.0703125, "logits/rejected": -0.439453125, "logps/chosen": -448.0, "logps/rejected": -532.0, "loss": 0.0391, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -17.625, "rewards/margins": 11.1875, "rewards/rejected": -28.75, "step": 22620 }, { "epoch": 1.6335811737529777, "grad_norm": 5.703891692152661, "learning_rate": 5.317992371511601e-07, "logits/chosen": -0.7890625, "logits/rejected": -0.267578125, "logps/chosen": -464.0, "logps/rejected": -548.0, "loss": 0.0492, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -19.375, "rewards/margins": 11.4375, "rewards/rejected": -30.75, "step": 22630 }, { "epoch": 1.6343030390529127, "grad_norm": 2.753651978222426, "learning_rate": 5.316817773513505e-07, "logits/chosen": -0.7890625, "logits/rejected": -0.228515625, "logps/chosen": -488.0, "logps/rejected": -576.0, "loss": 0.0418, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -21.625, "rewards/margins": 11.0625, "rewards/rejected": -32.75, "step": 22640 }, { "epoch": 1.6350249043528478, "grad_norm": 7.432869072503092, "learning_rate": 5.315643953480763e-07, "logits/chosen": -0.94921875, "logits/rejected": -0.306640625, "logps/chosen": -490.0, "logps/rejected": -564.0, "loss": 0.0377, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -20.625, "rewards/margins": 11.1875, "rewards/rejected": -31.875, "step": 22650 }, { "epoch": 1.6357467696527828, "grad_norm": 4.893347223807246, "learning_rate": 5.314470910554976e-07, "logits/chosen": -0.91796875, "logits/rejected": -0.41015625, "logps/chosen": -484.0, "logps/rejected": -556.0, "loss": 0.0281, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -21.375, "rewards/margins": 11.1875, "rewards/rejected": -32.5, "step": 22660 }, { "epoch": 1.6364686349527178, "grad_norm": 5.426268356814639, "learning_rate": 5.313298643879074e-07, "logits/chosen": -0.8984375, "logits/rejected": -0.357421875, "logps/chosen": -498.0, "logps/rejected": -584.0, "loss": 0.0304, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -20.875, "rewards/margins": 11.4375, "rewards/rejected": -32.25, "step": 22670 }, { "epoch": 1.637190500252653, "grad_norm": 20.104572772197372, "learning_rate": 5.312127152597304e-07, "logits/chosen": -0.92578125, "logits/rejected": -0.201171875, "logps/chosen": -474.0, "logps/rejected": -556.0, "loss": 0.0349, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -20.0, "rewards/margins": 12.25, "rewards/rejected": -32.25, "step": 22680 }, { "epoch": 1.6379123655525878, "grad_norm": 1.7937948385140312, "learning_rate": 5.310956435855243e-07, "logits/chosen": -1.1171875, "logits/rejected": -0.421875, "logps/chosen": -464.0, "logps/rejected": -572.0, "loss": 0.0325, "rewards/accuracies": 1.0, "rewards/chosen": -20.375, "rewards/margins": 11.5625, "rewards/rejected": -32.0, "step": 22690 }, { "epoch": 1.638634230852523, "grad_norm": 4.697804574298248, "learning_rate": 5.309786492799776e-07, "logits/chosen": -0.89453125, "logits/rejected": -0.373046875, "logps/chosen": -482.0, "logps/rejected": -572.0, "loss": 0.0199, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -20.875, "rewards/margins": 11.3125, "rewards/rejected": -32.25, "step": 22700 }, { "epoch": 1.6393560961524578, "grad_norm": 10.949821551682957, "learning_rate": 5.308617322579108e-07, "logits/chosen": -0.921875, "logits/rejected": -0.359375, "logps/chosen": -484.0, "logps/rejected": -556.0, "loss": 0.0403, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -20.125, "rewards/margins": 11.0, "rewards/rejected": -31.125, "step": 22710 }, { "epoch": 1.640077961452393, "grad_norm": 4.518942356255158, "learning_rate": 5.307448924342752e-07, "logits/chosen": -1.03125, "logits/rejected": -0.32421875, "logps/chosen": -458.0, "logps/rejected": -532.0, "loss": 0.0354, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -19.25, "rewards/margins": 11.5, "rewards/rejected": -30.75, "step": 22720 }, { "epoch": 1.640799826752328, "grad_norm": 9.181469800098126, "learning_rate": 5.306281297241538e-07, "logits/chosen": -0.8671875, "logits/rejected": -0.31640625, "logps/chosen": -490.0, "logps/rejected": -596.0, "loss": 0.0427, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -22.375, "rewards/margins": 11.25, "rewards/rejected": -33.5, "step": 22730 }, { "epoch": 1.641521692052263, "grad_norm": 6.806060886012383, "learning_rate": 5.305114440427598e-07, "logits/chosen": -0.83984375, "logits/rejected": -0.1552734375, "logps/chosen": -524.0, "logps/rejected": -612.0, "loss": 0.0272, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -22.875, "rewards/margins": 13.0, "rewards/rejected": -35.75, "step": 22740 }, { "epoch": 1.642243557352198, "grad_norm": 1.017517946171399, "learning_rate": 5.303948353054367e-07, "logits/chosen": -0.80859375, "logits/rejected": -0.298828125, "logps/chosen": -504.0, "logps/rejected": -588.0, "loss": 0.0469, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -23.0, "rewards/margins": 11.875, "rewards/rejected": -34.75, "step": 22750 }, { "epoch": 1.642965422652133, "grad_norm": 8.24484560860379, "learning_rate": 5.302783034276587e-07, "logits/chosen": -0.8671875, "logits/rejected": -0.2421875, "logps/chosen": -506.0, "logps/rejected": -592.0, "loss": 0.0316, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -22.75, "rewards/margins": 12.6875, "rewards/rejected": -35.5, "step": 22760 }, { "epoch": 1.6436872879520683, "grad_norm": 6.339903590539374, "learning_rate": 5.301618483250294e-07, "logits/chosen": -0.98828125, "logits/rejected": -0.58203125, "logps/chosen": -506.0, "logps/rejected": -552.0, "loss": 0.0489, "rewards/accuracies": 0.96875, "rewards/chosen": -20.75, "rewards/margins": 11.0, "rewards/rejected": -31.875, "step": 22770 }, { "epoch": 1.644409153252003, "grad_norm": 9.473195646407275, "learning_rate": 5.300454699132826e-07, "logits/chosen": -0.76171875, "logits/rejected": -0.2333984375, "logps/chosen": -500.0, "logps/rejected": -588.0, "loss": 0.0363, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -22.125, "rewards/margins": 12.25, "rewards/rejected": -34.25, "step": 22780 }, { "epoch": 1.6451310185519383, "grad_norm": 16.676837829880675, "learning_rate": 5.299291681082812e-07, "logits/chosen": -1.0390625, "logits/rejected": -0.2265625, "logps/chosen": -520.0, "logps/rejected": -584.0, "loss": 0.0561, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -21.75, "rewards/margins": 12.5625, "rewards/rejected": -34.25, "step": 22790 }, { "epoch": 1.6458528838518731, "grad_norm": 6.3163411730637105, "learning_rate": 5.298129428260176e-07, "logits/chosen": -0.84765625, "logits/rejected": -0.259765625, "logps/chosen": -500.0, "logps/rejected": -596.0, "loss": 0.0302, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -22.625, "rewards/margins": 11.75, "rewards/rejected": -34.25, "step": 22800 }, { "epoch": 1.6465747491518083, "grad_norm": 6.769439948128098, "learning_rate": 5.296967939826123e-07, "logits/chosen": -0.8984375, "logits/rejected": -0.314453125, "logps/chosen": -524.0, "logps/rejected": -580.0, "loss": 0.0377, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -22.75, "rewards/margins": 11.5625, "rewards/rejected": -34.25, "step": 22810 }, { "epoch": 1.6472966144517434, "grad_norm": 11.755159097119872, "learning_rate": 5.295807214943156e-07, "logits/chosen": -0.94140625, "logits/rejected": -0.3359375, "logps/chosen": -506.0, "logps/rejected": -568.0, "loss": 0.0311, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -22.25, "rewards/margins": 10.6875, "rewards/rejected": -33.0, "step": 22820 }, { "epoch": 1.6480184797516784, "grad_norm": 2.283405903013561, "learning_rate": 5.294647252775055e-07, "logits/chosen": -0.83203125, "logits/rejected": -0.291015625, "logps/chosen": -490.0, "logps/rejected": -572.0, "loss": 0.0427, "rewards/accuracies": 1.0, "rewards/chosen": -21.25, "rewards/margins": 11.5, "rewards/rejected": -32.75, "step": 22830 }, { "epoch": 1.6487403450516134, "grad_norm": 10.118715209409666, "learning_rate": 5.293488052486882e-07, "logits/chosen": -0.85546875, "logits/rejected": -0.353515625, "logps/chosen": -482.0, "logps/rejected": -584.0, "loss": 0.029, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -20.0, "rewards/margins": 11.5, "rewards/rejected": -31.5, "step": 22840 }, { "epoch": 1.6494622103515484, "grad_norm": 2.9451050564335257, "learning_rate": 5.292329613244979e-07, "logits/chosen": -0.93359375, "logits/rejected": -0.2421875, "logps/chosen": -494.0, "logps/rejected": -584.0, "loss": 0.0301, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -22.0, "rewards/margins": 11.5625, "rewards/rejected": -33.5, "step": 22850 }, { "epoch": 1.6501840756514834, "grad_norm": 3.565212549878473, "learning_rate": 5.291171934216967e-07, "logits/chosen": -1.0625, "logits/rejected": -0.498046875, "logps/chosen": -488.0, "logps/rejected": -572.0, "loss": 0.0304, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -20.625, "rewards/margins": 12.0, "rewards/rejected": -32.75, "step": 22860 }, { "epoch": 1.6509059409514184, "grad_norm": 7.501079761639751, "learning_rate": 5.290015014571736e-07, "logits/chosen": -1.1015625, "logits/rejected": -0.4921875, "logps/chosen": -494.0, "logps/rejected": -580.0, "loss": 0.0384, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -21.5, "rewards/margins": 11.875, "rewards/rejected": -33.25, "step": 22870 }, { "epoch": 1.6516278062513536, "grad_norm": 8.261591361577295, "learning_rate": 5.28885885347945e-07, "logits/chosen": -0.9765625, "logits/rejected": -0.3828125, "logps/chosen": -496.0, "logps/rejected": -584.0, "loss": 0.0436, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -21.5, "rewards/margins": 12.5625, "rewards/rejected": -34.0, "step": 22880 }, { "epoch": 1.6523496715512884, "grad_norm": 7.500609918572426, "learning_rate": 5.287703450111545e-07, "logits/chosen": -1.1171875, "logits/rejected": -0.494140625, "logps/chosen": -524.0, "logps/rejected": -576.0, "loss": 0.0247, "rewards/accuracies": 1.0, "rewards/chosen": -21.25, "rewards/margins": 11.9375, "rewards/rejected": -33.0, "step": 22890 }, { "epoch": 1.6530715368512237, "grad_norm": 2.532466922342025, "learning_rate": 5.286548803640718e-07, "logits/chosen": -0.9296875, "logits/rejected": -0.52734375, "logps/chosen": -512.0, "logps/rejected": -580.0, "loss": 0.0315, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -22.125, "rewards/margins": 11.125, "rewards/rejected": -33.25, "step": 22900 }, { "epoch": 1.6537934021511584, "grad_norm": 6.872138467398378, "learning_rate": 5.285394913240933e-07, "logits/chosen": -0.98828125, "logits/rejected": -0.59375, "logps/chosen": -516.0, "logps/rejected": -576.0, "loss": 0.0239, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -22.625, "rewards/margins": 9.9375, "rewards/rejected": -32.75, "step": 22910 }, { "epoch": 1.6545152674510937, "grad_norm": 7.9161667080481894, "learning_rate": 5.284241778087417e-07, "logits/chosen": -0.984375, "logits/rejected": -0.31640625, "logps/chosen": -512.0, "logps/rejected": -568.0, "loss": 0.0304, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -21.75, "rewards/margins": 11.625, "rewards/rejected": -33.25, "step": 22920 }, { "epoch": 1.6552371327510287, "grad_norm": 4.640272278722553, "learning_rate": 5.283089397356652e-07, "logits/chosen": -1.171875, "logits/rejected": -0.4375, "logps/chosen": -516.0, "logps/rejected": -576.0, "loss": 0.0364, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -21.0, "rewards/margins": 11.9375, "rewards/rejected": -33.0, "step": 22930 }, { "epoch": 1.6559589980509637, "grad_norm": 4.917280835344823, "learning_rate": 5.28193777022638e-07, "logits/chosen": -1.125, "logits/rejected": -0.4140625, "logps/chosen": -512.0, "logps/rejected": -612.0, "loss": 0.0366, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -22.625, "rewards/margins": 11.875, "rewards/rejected": -34.5, "step": 22940 }, { "epoch": 1.6566808633508987, "grad_norm": 6.1755807804105025, "learning_rate": 5.280786895875596e-07, "logits/chosen": -0.9609375, "logits/rejected": -0.396484375, "logps/chosen": -472.0, "logps/rejected": -572.0, "loss": 0.0373, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -19.625, "rewards/margins": 12.625, "rewards/rejected": -32.25, "step": 22950 }, { "epoch": 1.6574027286508337, "grad_norm": 9.102197814973877, "learning_rate": 5.279636773484546e-07, "logits/chosen": -1.046875, "logits/rejected": -0.4296875, "logps/chosen": -480.0, "logps/rejected": -576.0, "loss": 0.0328, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -21.125, "rewards/margins": 10.5625, "rewards/rejected": -31.75, "step": 22960 }, { "epoch": 1.658124593950769, "grad_norm": 8.058200919624591, "learning_rate": 5.278487402234727e-07, "logits/chosen": -0.90625, "logits/rejected": -0.298828125, "logps/chosen": -504.0, "logps/rejected": -580.0, "loss": 0.0378, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -22.25, "rewards/margins": 11.625, "rewards/rejected": -33.75, "step": 22970 }, { "epoch": 1.6588464592507037, "grad_norm": 8.971897915232763, "learning_rate": 5.27733878130888e-07, "logits/chosen": -0.9375, "logits/rejected": -0.26171875, "logps/chosen": -486.0, "logps/rejected": -580.0, "loss": 0.0286, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -22.875, "rewards/margins": 11.5625, "rewards/rejected": -34.5, "step": 22980 }, { "epoch": 1.659568324550639, "grad_norm": 11.229076658179341, "learning_rate": 5.276190909890993e-07, "logits/chosen": -1.125, "logits/rejected": -0.216796875, "logps/chosen": -462.0, "logps/rejected": -540.0, "loss": 0.0453, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -19.5, "rewards/margins": 11.8125, "rewards/rejected": -31.25, "step": 22990 }, { "epoch": 1.6602901898505737, "grad_norm": 5.113132690019474, "learning_rate": 5.275043787166296e-07, "logits/chosen": -1.015625, "logits/rejected": -0.328125, "logps/chosen": -510.0, "logps/rejected": -588.0, "loss": 0.0353, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -22.0, "rewards/margins": 11.0625, "rewards/rejected": -33.0, "step": 23000 }, { "epoch": 1.661012055150509, "grad_norm": 2.016838084116326, "learning_rate": 5.273897412321254e-07, "logits/chosen": -1.078125, "logits/rejected": -0.51953125, "logps/chosen": -478.0, "logps/rejected": -576.0, "loss": 0.0251, "rewards/accuracies": 1.0, "rewards/chosen": -19.5, "rewards/margins": 12.0, "rewards/rejected": -31.5, "step": 23010 }, { "epoch": 1.661733920450444, "grad_norm": 6.818855899891011, "learning_rate": 5.272751784543577e-07, "logits/chosen": -1.1171875, "logits/rejected": -0.470703125, "logps/chosen": -482.0, "logps/rejected": -572.0, "loss": 0.0308, "rewards/accuracies": 1.0, "rewards/chosen": -18.625, "rewards/margins": 12.625, "rewards/rejected": -31.125, "step": 23020 }, { "epoch": 1.662455785750379, "grad_norm": 5.878997630688907, "learning_rate": 5.2716069030222e-07, "logits/chosen": -1.171875, "logits/rejected": -0.45703125, "logps/chosen": -470.0, "logps/rejected": -536.0, "loss": 0.0507, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -18.75, "rewards/margins": 11.375, "rewards/rejected": -30.125, "step": 23030 }, { "epoch": 1.663177651050314, "grad_norm": 2.3379051147621195, "learning_rate": 5.270462766947299e-07, "logits/chosen": -0.87890625, "logits/rejected": -0.3359375, "logps/chosen": -484.0, "logps/rejected": -584.0, "loss": 0.0309, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -20.625, "rewards/margins": 11.5625, "rewards/rejected": -32.25, "step": 23040 }, { "epoch": 1.663899516350249, "grad_norm": 6.920731064185809, "learning_rate": 5.269319375510273e-07, "logits/chosen": -0.921875, "logits/rejected": -0.40625, "logps/chosen": -484.0, "logps/rejected": -532.0, "loss": 0.0352, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -18.875, "rewards/margins": 10.375, "rewards/rejected": -29.25, "step": 23050 }, { "epoch": 1.664621381650184, "grad_norm": 11.726024108356683, "learning_rate": 5.268176727903755e-07, "logits/chosen": -0.921875, "logits/rejected": -0.4375, "logps/chosen": -486.0, "logps/rejected": -544.0, "loss": 0.0377, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -19.875, "rewards/margins": 11.8125, "rewards/rejected": -31.75, "step": 23060 }, { "epoch": 1.665343246950119, "grad_norm": 3.6540474828232687, "learning_rate": 5.267034823321595e-07, "logits/chosen": -0.77734375, "logits/rejected": -0.349609375, "logps/chosen": -498.0, "logps/rejected": -564.0, "loss": 0.0339, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -19.75, "rewards/margins": 11.375, "rewards/rejected": -31.125, "step": 23070 }, { "epoch": 1.6660651122500543, "grad_norm": 5.690122066518041, "learning_rate": 5.265893660958874e-07, "logits/chosen": -0.99609375, "logits/rejected": -0.3671875, "logps/chosen": -480.0, "logps/rejected": -560.0, "loss": 0.0467, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -19.375, "rewards/margins": 11.125, "rewards/rejected": -30.625, "step": 23080 }, { "epoch": 1.666786977549989, "grad_norm": 6.870137148204074, "learning_rate": 5.264753240011888e-07, "logits/chosen": -0.9296875, "logits/rejected": -0.271484375, "logps/chosen": -448.0, "logps/rejected": -540.0, "loss": 0.0378, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -19.0, "rewards/margins": 10.75, "rewards/rejected": -29.75, "step": 23090 }, { "epoch": 1.6675088428499243, "grad_norm": 11.54566334557969, "learning_rate": 5.263613559678151e-07, "logits/chosen": -0.77734375, "logits/rejected": -0.134765625, "logps/chosen": -504.0, "logps/rejected": -584.0, "loss": 0.037, "rewards/accuracies": 0.96875, "rewards/chosen": -21.875, "rewards/margins": 11.9375, "rewards/rejected": -33.75, "step": 23100 }, { "epoch": 1.668230708149859, "grad_norm": 4.060593299661113, "learning_rate": 5.262474619156395e-07, "logits/chosen": -0.98828125, "logits/rejected": -0.310546875, "logps/chosen": -480.0, "logps/rejected": -556.0, "loss": 0.0367, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -19.5, "rewards/margins": 12.625, "rewards/rejected": -32.25, "step": 23110 }, { "epoch": 1.6689525734497943, "grad_norm": 5.439171827841054, "learning_rate": 5.261336417646563e-07, "logits/chosen": -0.83203125, "logits/rejected": -0.275390625, "logps/chosen": -504.0, "logps/rejected": -588.0, "loss": 0.0416, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -21.125, "rewards/margins": 11.125, "rewards/rejected": -32.25, "step": 23120 }, { "epoch": 1.6696744387497293, "grad_norm": 1.5943166446971713, "learning_rate": 5.26019895434981e-07, "logits/chosen": -0.8984375, "logits/rejected": -0.166015625, "logps/chosen": -496.0, "logps/rejected": -548.0, "loss": 0.0395, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -21.25, "rewards/margins": 11.25, "rewards/rejected": -32.5, "step": 23130 }, { "epoch": 1.6703963040496643, "grad_norm": 1.3145408009496178, "learning_rate": 5.259062228468499e-07, "logits/chosen": -0.8984375, "logits/rejected": -0.251953125, "logps/chosen": -502.0, "logps/rejected": -568.0, "loss": 0.0413, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -21.75, "rewards/margins": 11.3125, "rewards/rejected": -33.0, "step": 23140 }, { "epoch": 1.6711181693495993, "grad_norm": 8.3161840407021, "learning_rate": 5.257926239206199e-07, "logits/chosen": -0.921875, "logits/rejected": -0.443359375, "logps/chosen": -492.0, "logps/rejected": -568.0, "loss": 0.0542, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -21.25, "rewards/margins": 11.5625, "rewards/rejected": -32.75, "step": 23150 }, { "epoch": 1.6718400346495343, "grad_norm": 4.805140310720784, "learning_rate": 5.256790985767682e-07, "logits/chosen": -1.0390625, "logits/rejected": -0.27734375, "logps/chosen": -494.0, "logps/rejected": -560.0, "loss": 0.0325, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -20.375, "rewards/margins": 12.0, "rewards/rejected": -32.5, "step": 23160 }, { "epoch": 1.6725618999494696, "grad_norm": 6.568755067887631, "learning_rate": 5.255656467358922e-07, "logits/chosen": -1.125, "logits/rejected": -0.53125, "logps/chosen": -502.0, "logps/rejected": -588.0, "loss": 0.042, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -21.125, "rewards/margins": 11.875, "rewards/rejected": -33.0, "step": 23170 }, { "epoch": 1.6732837652494044, "grad_norm": 7.6506493131585955, "learning_rate": 5.254522683187093e-07, "logits/chosen": -0.77734375, "logits/rejected": -0.22265625, "logps/chosen": -478.0, "logps/rejected": -572.0, "loss": 0.0322, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -20.375, "rewards/margins": 11.4375, "rewards/rejected": -31.875, "step": 23180 }, { "epoch": 1.6740056305493396, "grad_norm": 6.513048821308124, "learning_rate": 5.253389632460566e-07, "logits/chosen": -0.9375, "logits/rejected": -0.423828125, "logps/chosen": -502.0, "logps/rejected": -592.0, "loss": 0.0379, "rewards/accuracies": 1.0, "rewards/chosen": -21.125, "rewards/margins": 11.6875, "rewards/rejected": -32.75, "step": 23190 }, { "epoch": 1.6747274958492744, "grad_norm": 8.03369045638993, "learning_rate": 5.252257314388901e-07, "logits/chosen": -0.97265625, "logits/rejected": -0.3125, "logps/chosen": -492.0, "logps/rejected": -572.0, "loss": 0.0408, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -21.0, "rewards/margins": 11.625, "rewards/rejected": -32.5, "step": 23200 }, { "epoch": 1.6754493611492096, "grad_norm": 12.640973058735739, "learning_rate": 5.251125728182861e-07, "logits/chosen": -0.8671875, "logits/rejected": -0.390625, "logps/chosen": -476.0, "logps/rejected": -548.0, "loss": 0.0457, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -19.5, "rewards/margins": 11.125, "rewards/rejected": -30.625, "step": 23210 }, { "epoch": 1.6761712264491446, "grad_norm": 4.7632397429027025, "learning_rate": 5.249994873054385e-07, "logits/chosen": -1.1171875, "logits/rejected": -0.396484375, "logps/chosen": -496.0, "logps/rejected": -572.0, "loss": 0.0277, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -19.0, "rewards/margins": 11.5625, "rewards/rejected": -30.5, "step": 23220 }, { "epoch": 1.6768930917490796, "grad_norm": 6.492702612802512, "learning_rate": 5.24886474821661e-07, "logits/chosen": -0.98828125, "logits/rejected": -0.357421875, "logps/chosen": -480.0, "logps/rejected": -564.0, "loss": 0.0383, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -19.25, "rewards/margins": 10.9375, "rewards/rejected": -30.25, "step": 23230 }, { "epoch": 1.6776149570490146, "grad_norm": 6.97745836716101, "learning_rate": 5.247735352883857e-07, "logits/chosen": -0.84375, "logits/rejected": -0.423828125, "logps/chosen": -484.0, "logps/rejected": -540.0, "loss": 0.0304, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -20.25, "rewards/margins": 10.6875, "rewards/rejected": -31.0, "step": 23240 }, { "epoch": 1.6783368223489497, "grad_norm": 4.943988053048604, "learning_rate": 5.246606686271623e-07, "logits/chosen": -0.84765625, "logits/rejected": -0.345703125, "logps/chosen": -460.0, "logps/rejected": -556.0, "loss": 0.0291, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -18.625, "rewards/margins": 11.375, "rewards/rejected": -30.0, "step": 23250 }, { "epoch": 1.6790586876488849, "grad_norm": 6.4610177550967824, "learning_rate": 5.245478747596593e-07, "logits/chosen": -0.8359375, "logits/rejected": -0.30859375, "logps/chosen": -468.0, "logps/rejected": -568.0, "loss": 0.0324, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -19.375, "rewards/margins": 12.5625, "rewards/rejected": -32.0, "step": 23260 }, { "epoch": 1.6797805529488197, "grad_norm": 4.202856677641443, "learning_rate": 5.244351536076629e-07, "logits/chosen": -0.828125, "logits/rejected": -0.2314453125, "logps/chosen": -490.0, "logps/rejected": -564.0, "loss": 0.0254, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -21.125, "rewards/margins": 10.8125, "rewards/rejected": -32.0, "step": 23270 }, { "epoch": 1.680502418248755, "grad_norm": 2.4661889654044966, "learning_rate": 5.243225050930763e-07, "logits/chosen": -1.0078125, "logits/rejected": -0.388671875, "logps/chosen": -504.0, "logps/rejected": -588.0, "loss": 0.03, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -21.25, "rewards/margins": 11.875, "rewards/rejected": -33.25, "step": 23280 }, { "epoch": 1.6812242835486897, "grad_norm": 11.42770486458539, "learning_rate": 5.242099291379208e-07, "logits/chosen": -1.015625, "logits/rejected": -0.341796875, "logps/chosen": -492.0, "logps/rejected": -564.0, "loss": 0.0406, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -22.625, "rewards/margins": 12.125, "rewards/rejected": -34.75, "step": 23290 }, { "epoch": 1.681946148848625, "grad_norm": 10.331985764605964, "learning_rate": 5.240974256643347e-07, "logits/chosen": -1.0234375, "logits/rejected": -0.388671875, "logps/chosen": -512.0, "logps/rejected": -588.0, "loss": 0.044, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -21.125, "rewards/margins": 10.9375, "rewards/rejected": -32.0, "step": 23300 }, { "epoch": 1.68266801414856, "grad_norm": 11.076534163736941, "learning_rate": 5.239849945945729e-07, "logits/chosen": -0.98046875, "logits/rejected": -0.361328125, "logps/chosen": -516.0, "logps/rejected": -588.0, "loss": 0.038, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -22.25, "rewards/margins": 11.375, "rewards/rejected": -33.5, "step": 23310 }, { "epoch": 1.683389879448495, "grad_norm": 7.240925764107687, "learning_rate": 5.238726358510071e-07, "logits/chosen": -0.92578125, "logits/rejected": -0.33984375, "logps/chosen": -458.0, "logps/rejected": -556.0, "loss": 0.0248, "rewards/accuracies": 1.0, "rewards/chosen": -19.25, "rewards/margins": 12.0625, "rewards/rejected": -31.25, "step": 23320 }, { "epoch": 1.68411174474843, "grad_norm": 14.436460319148557, "learning_rate": 5.237603493561259e-07, "logits/chosen": -1.0390625, "logits/rejected": -0.4140625, "logps/chosen": -480.0, "logps/rejected": -560.0, "loss": 0.0385, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -20.375, "rewards/margins": 11.875, "rewards/rejected": -32.25, "step": 23330 }, { "epoch": 1.684833610048365, "grad_norm": 5.976970122791802, "learning_rate": 5.236481350325335e-07, "logits/chosen": -0.80859375, "logits/rejected": -0.2333984375, "logps/chosen": -464.0, "logps/rejected": -564.0, "loss": 0.0371, "rewards/accuracies": 1.0, "rewards/chosen": -20.75, "rewards/margins": 11.5, "rewards/rejected": -32.25, "step": 23340 }, { "epoch": 1.6855554753483, "grad_norm": 8.993935135199793, "learning_rate": 5.235359928029507e-07, "logits/chosen": -1.015625, "logits/rejected": -0.29296875, "logps/chosen": -480.0, "logps/rejected": -556.0, "loss": 0.0279, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -21.125, "rewards/margins": 11.625, "rewards/rejected": -32.75, "step": 23350 }, { "epoch": 1.686277340648235, "grad_norm": 2.5618338734622172, "learning_rate": 5.234239225902136e-07, "logits/chosen": -1.15625, "logits/rejected": -0.41015625, "logps/chosen": -506.0, "logps/rejected": -572.0, "loss": 0.0278, "rewards/accuracies": 1.0, "rewards/chosen": -21.75, "rewards/margins": 11.5625, "rewards/rejected": -33.25, "step": 23360 }, { "epoch": 1.6869992059481702, "grad_norm": 7.393431327913823, "learning_rate": 5.233119243172744e-07, "logits/chosen": -1.046875, "logits/rejected": -0.2734375, "logps/chosen": -496.0, "logps/rejected": -576.0, "loss": 0.0421, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -20.5, "rewards/margins": 11.625, "rewards/rejected": -32.25, "step": 23370 }, { "epoch": 1.687721071248105, "grad_norm": 3.2737349818525305, "learning_rate": 5.231999979071999e-07, "logits/chosen": -1.0859375, "logits/rejected": -0.451171875, "logps/chosen": -490.0, "logps/rejected": -568.0, "loss": 0.0422, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -19.0, "rewards/margins": 12.0625, "rewards/rejected": -31.125, "step": 23380 }, { "epoch": 1.6884429365480402, "grad_norm": 4.802767499224768, "learning_rate": 5.23088143283173e-07, "logits/chosen": -1.2109375, "logits/rejected": -0.5078125, "logps/chosen": -460.0, "logps/rejected": -548.0, "loss": 0.0253, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -19.375, "rewards/margins": 11.625, "rewards/rejected": -31.0, "step": 23390 }, { "epoch": 1.689164801847975, "grad_norm": 9.515145539865415, "learning_rate": 5.229763603684907e-07, "logits/chosen": -0.984375, "logits/rejected": -0.3984375, "logps/chosen": -504.0, "logps/rejected": -584.0, "loss": 0.0347, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -20.125, "rewards/margins": 11.75, "rewards/rejected": -31.875, "step": 23400 }, { "epoch": 1.6898866671479102, "grad_norm": 1.158010602009028, "learning_rate": 5.228646490865652e-07, "logits/chosen": -0.7421875, "logits/rejected": -0.365234375, "logps/chosen": -482.0, "logps/rejected": -536.0, "loss": 0.0273, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -18.25, "rewards/margins": 11.125, "rewards/rejected": -29.375, "step": 23410 }, { "epoch": 1.6906085324478453, "grad_norm": 5.987524055183568, "learning_rate": 5.227530093609228e-07, "logits/chosen": -1.0703125, "logits/rejected": -0.396484375, "logps/chosen": -460.0, "logps/rejected": -536.0, "loss": 0.0389, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -19.25, "rewards/margins": 11.625, "rewards/rejected": -30.875, "step": 23420 }, { "epoch": 1.6913303977477803, "grad_norm": 4.991667170894923, "learning_rate": 5.226414411152042e-07, "logits/chosen": -1.0234375, "logits/rejected": -0.390625, "logps/chosen": -444.0, "logps/rejected": -552.0, "loss": 0.0287, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -17.75, "rewards/margins": 11.5, "rewards/rejected": -29.375, "step": 23430 }, { "epoch": 1.6920522630477153, "grad_norm": 12.574866940216891, "learning_rate": 5.225299442731641e-07, "logits/chosen": -1.03125, "logits/rejected": -0.45703125, "logps/chosen": -462.0, "logps/rejected": -524.0, "loss": 0.0406, "rewards/accuracies": 1.0, "rewards/chosen": -18.25, "rewards/margins": 10.8125, "rewards/rejected": -29.0, "step": 23440 }, { "epoch": 1.6927741283476503, "grad_norm": 5.05979523267898, "learning_rate": 5.224185187586712e-07, "logits/chosen": -1.0625, "logits/rejected": -0.5546875, "logps/chosen": -450.0, "logps/rejected": -528.0, "loss": 0.036, "rewards/accuracies": 1.0, "rewards/chosen": -17.75, "rewards/margins": 11.125, "rewards/rejected": -28.875, "step": 23450 }, { "epoch": 1.6934959936475855, "grad_norm": 4.209541650004566, "learning_rate": 5.223071644957079e-07, "logits/chosen": -0.98046875, "logits/rejected": -0.54296875, "logps/chosen": -478.0, "logps/rejected": -532.0, "loss": 0.0419, "rewards/accuracies": 1.0, "rewards/chosen": -18.5, "rewards/margins": 11.1875, "rewards/rejected": -29.75, "step": 23460 }, { "epoch": 1.6942178589475203, "grad_norm": 6.990209503818633, "learning_rate": 5.221958814083692e-07, "logits/chosen": -0.859375, "logits/rejected": -0.412109375, "logps/chosen": -500.0, "logps/rejected": -568.0, "loss": 0.0362, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -21.0, "rewards/margins": 11.25, "rewards/rejected": -32.25, "step": 23470 }, { "epoch": 1.6949397242474555, "grad_norm": 14.902701967959551, "learning_rate": 5.220846694208641e-07, "logits/chosen": -0.875, "logits/rejected": -0.4375, "logps/chosen": -490.0, "logps/rejected": -560.0, "loss": 0.0334, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -20.125, "rewards/margins": 11.25, "rewards/rejected": -31.375, "step": 23480 }, { "epoch": 1.6956615895473903, "grad_norm": 10.29458492994833, "learning_rate": 5.219735284575142e-07, "logits/chosen": -0.9765625, "logits/rejected": -0.3671875, "logps/chosen": -512.0, "logps/rejected": -544.0, "loss": 0.0458, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -19.75, "rewards/margins": 11.75, "rewards/rejected": -31.5, "step": 23490 }, { "epoch": 1.6963834548473256, "grad_norm": 4.456409256034051, "learning_rate": 5.218624584427538e-07, "logits/chosen": -1.0234375, "logits/rejected": -0.46875, "logps/chosen": -448.0, "logps/rejected": -524.0, "loss": 0.055, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -18.875, "rewards/margins": 11.125, "rewards/rejected": -30.0, "step": 23500 }, { "epoch": 1.6971053201472606, "grad_norm": 4.496302777949102, "learning_rate": 5.217514593011297e-07, "logits/chosen": -1.03125, "logits/rejected": -0.53125, "logps/chosen": -454.0, "logps/rejected": -532.0, "loss": 0.0404, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -19.0, "rewards/margins": 11.125, "rewards/rejected": -30.0, "step": 23510 }, { "epoch": 1.6978271854471956, "grad_norm": 6.929605625058228, "learning_rate": 5.216405309573011e-07, "logits/chosen": -1.0078125, "logits/rejected": -0.458984375, "logps/chosen": -478.0, "logps/rejected": -560.0, "loss": 0.0414, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -19.375, "rewards/margins": 11.5, "rewards/rejected": -30.875, "step": 23520 }, { "epoch": 1.6985490507471306, "grad_norm": 10.315684305207027, "learning_rate": 5.215296733360391e-07, "logits/chosen": -1.1875, "logits/rejected": -0.55078125, "logps/chosen": -452.0, "logps/rejected": -524.0, "loss": 0.0299, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -18.25, "rewards/margins": 10.9375, "rewards/rejected": -29.25, "step": 23530 }, { "epoch": 1.6992709160470656, "grad_norm": 5.434605460665202, "learning_rate": 5.214188863622271e-07, "logits/chosen": -0.99609375, "logits/rejected": -0.435546875, "logps/chosen": -472.0, "logps/rejected": -544.0, "loss": 0.0468, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -19.875, "rewards/margins": 11.0625, "rewards/rejected": -30.875, "step": 23540 }, { "epoch": 1.6999927813470006, "grad_norm": 2.935055772336424, "learning_rate": 5.213081699608596e-07, "logits/chosen": -1.078125, "logits/rejected": -0.51953125, "logps/chosen": -486.0, "logps/rejected": -580.0, "loss": 0.025, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -20.625, "rewards/margins": 11.5625, "rewards/rejected": -32.0, "step": 23550 }, { "epoch": 1.7007146466469356, "grad_norm": 2.778330807387947, "learning_rate": 5.21197524057043e-07, "logits/chosen": -0.8828125, "logits/rejected": -0.408203125, "logps/chosen": -494.0, "logps/rejected": -580.0, "loss": 0.034, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -21.625, "rewards/margins": 11.6875, "rewards/rejected": -33.25, "step": 23560 }, { "epoch": 1.7014365119468708, "grad_norm": 2.23940175412165, "learning_rate": 5.210869485759943e-07, "logits/chosen": -1.046875, "logits/rejected": -0.49609375, "logps/chosen": -528.0, "logps/rejected": -612.0, "loss": 0.0366, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -23.25, "rewards/margins": 12.0, "rewards/rejected": -35.25, "step": 23570 }, { "epoch": 1.7021583772468056, "grad_norm": 10.529943258951993, "learning_rate": 5.209764434430422e-07, "logits/chosen": -1.1171875, "logits/rejected": -0.40234375, "logps/chosen": -516.0, "logps/rejected": -580.0, "loss": 0.0527, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -22.625, "rewards/margins": 11.8125, "rewards/rejected": -34.5, "step": 23580 }, { "epoch": 1.7028802425467409, "grad_norm": 6.273999686681689, "learning_rate": 5.20866008583626e-07, "logits/chosen": -1.0703125, "logits/rejected": -0.28125, "logps/chosen": -510.0, "logps/rejected": -568.0, "loss": 0.0436, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -22.375, "rewards/margins": 12.125, "rewards/rejected": -34.5, "step": 23590 }, { "epoch": 1.7036021078466757, "grad_norm": 9.595444684328461, "learning_rate": 5.207556439232955e-07, "logits/chosen": -0.98828125, "logits/rejected": -0.298828125, "logps/chosen": -508.0, "logps/rejected": -608.0, "loss": 0.0469, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -21.875, "rewards/margins": 13.375, "rewards/rejected": -35.25, "step": 23600 }, { "epoch": 1.7043239731466109, "grad_norm": 11.973404329290268, "learning_rate": 5.206453493877105e-07, "logits/chosen": -0.859375, "logits/rejected": -0.447265625, "logps/chosen": -482.0, "logps/rejected": -560.0, "loss": 0.0478, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -20.25, "rewards/margins": 11.25, "rewards/rejected": -31.5, "step": 23610 }, { "epoch": 1.705045838446546, "grad_norm": 3.113252533774934, "learning_rate": 5.205351249026418e-07, "logits/chosen": -1.0234375, "logits/rejected": -0.39453125, "logps/chosen": -478.0, "logps/rejected": -588.0, "loss": 0.0177, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -20.25, "rewards/margins": 12.25, "rewards/rejected": -32.5, "step": 23620 }, { "epoch": 1.705767703746481, "grad_norm": 4.696971976034861, "learning_rate": 5.204249703939695e-07, "logits/chosen": -1.1484375, "logits/rejected": -0.51953125, "logps/chosen": -472.0, "logps/rejected": -560.0, "loss": 0.0408, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -20.375, "rewards/margins": 11.5, "rewards/rejected": -31.75, "step": 23630 }, { "epoch": 1.706489569046416, "grad_norm": 10.751566139814734, "learning_rate": 5.203148857876837e-07, "logits/chosen": -1.1796875, "logits/rejected": -0.5703125, "logps/chosen": -454.0, "logps/rejected": -544.0, "loss": 0.0293, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -19.125, "rewards/margins": 11.125, "rewards/rejected": -30.25, "step": 23640 }, { "epoch": 1.707211434346351, "grad_norm": 6.822484436640594, "learning_rate": 5.202048710098841e-07, "logits/chosen": -1.0703125, "logits/rejected": -0.3984375, "logps/chosen": -492.0, "logps/rejected": -564.0, "loss": 0.0396, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -21.0, "rewards/margins": 11.4375, "rewards/rejected": -32.5, "step": 23650 }, { "epoch": 1.7079332996462862, "grad_norm": 11.73440965582099, "learning_rate": 5.200949259867794e-07, "logits/chosen": -1.15625, "logits/rejected": -0.2412109375, "logps/chosen": -472.0, "logps/rejected": -540.0, "loss": 0.0491, "rewards/accuracies": 1.0, "rewards/chosen": -19.375, "rewards/margins": 11.75, "rewards/rejected": -31.0, "step": 23660 }, { "epoch": 1.708655164946221, "grad_norm": 10.213511607818601, "learning_rate": 5.199850506446878e-07, "logits/chosen": -1.1640625, "logits/rejected": -0.51953125, "logps/chosen": -460.0, "logps/rejected": -524.0, "loss": 0.037, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -18.125, "rewards/margins": 10.8125, "rewards/rejected": -28.875, "step": 23670 }, { "epoch": 1.7093770302461562, "grad_norm": 5.067047096017084, "learning_rate": 5.198752449100363e-07, "logits/chosen": -1.0234375, "logits/rejected": -0.53515625, "logps/chosen": -502.0, "logps/rejected": -564.0, "loss": 0.0345, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -20.875, "rewards/margins": 11.75, "rewards/rejected": -32.75, "step": 23680 }, { "epoch": 1.710098895546091, "grad_norm": 5.668141534511573, "learning_rate": 5.197655087093606e-07, "logits/chosen": -1.125, "logits/rejected": -0.6328125, "logps/chosen": -510.0, "logps/rejected": -576.0, "loss": 0.0345, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -21.125, "rewards/margins": 11.4375, "rewards/rejected": -32.5, "step": 23690 }, { "epoch": 1.7108207608460262, "grad_norm": 6.569762855208356, "learning_rate": 5.196558419693047e-07, "logits/chosen": -1.046875, "logits/rejected": -0.4140625, "logps/chosen": -470.0, "logps/rejected": -552.0, "loss": 0.0291, "rewards/accuracies": 1.0, "rewards/chosen": -20.5, "rewards/margins": 11.3125, "rewards/rejected": -31.875, "step": 23700 }, { "epoch": 1.7115426261459612, "grad_norm": 9.198871849411725, "learning_rate": 5.195462446166212e-07, "logits/chosen": -1.0546875, "logits/rejected": -0.44140625, "logps/chosen": -502.0, "logps/rejected": -588.0, "loss": 0.0451, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -20.875, "rewards/margins": 11.4375, "rewards/rejected": -32.25, "step": 23710 }, { "epoch": 1.7122644914458962, "grad_norm": 10.395736874755906, "learning_rate": 5.194367165781708e-07, "logits/chosen": -0.98828125, "logits/rejected": -0.46875, "logps/chosen": -490.0, "logps/rejected": -580.0, "loss": 0.0491, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -21.375, "rewards/margins": 10.8125, "rewards/rejected": -32.25, "step": 23720 }, { "epoch": 1.7129863567458312, "grad_norm": 5.004915138903061, "learning_rate": 5.193272577809217e-07, "logits/chosen": -1.03125, "logits/rejected": -0.431640625, "logps/chosen": -476.0, "logps/rejected": -560.0, "loss": 0.0247, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -19.5, "rewards/margins": 11.9375, "rewards/rejected": -31.5, "step": 23730 }, { "epoch": 1.7137082220457662, "grad_norm": 1.848610418595771, "learning_rate": 5.192178681519504e-07, "logits/chosen": -0.9296875, "logits/rejected": -0.4375, "logps/chosen": -482.0, "logps/rejected": -568.0, "loss": 0.0431, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -22.0, "rewards/margins": 11.3125, "rewards/rejected": -33.25, "step": 23740 }, { "epoch": 1.7144300873457012, "grad_norm": 5.973541746200792, "learning_rate": 5.191085476184402e-07, "logits/chosen": -1.1484375, "logits/rejected": -0.62890625, "logps/chosen": -480.0, "logps/rejected": -572.0, "loss": 0.0422, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -21.125, "rewards/margins": 11.0, "rewards/rejected": -32.25, "step": 23750 }, { "epoch": 1.7151519526456362, "grad_norm": 8.381108270482759, "learning_rate": 5.18999296107682e-07, "logits/chosen": -1.0390625, "logits/rejected": -0.5546875, "logps/chosen": -462.0, "logps/rejected": -556.0, "loss": 0.0357, "rewards/accuracies": 1.0, "rewards/chosen": -19.875, "rewards/margins": 11.6875, "rewards/rejected": -31.5, "step": 23760 }, { "epoch": 1.7158738179455715, "grad_norm": 4.624643434369577, "learning_rate": 5.188901135470739e-07, "logits/chosen": -0.9296875, "logits/rejected": -0.462890625, "logps/chosen": -508.0, "logps/rejected": -600.0, "loss": 0.0361, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -22.25, "rewards/margins": 11.1875, "rewards/rejected": -33.5, "step": 23770 }, { "epoch": 1.7165956832455063, "grad_norm": 3.4804993696840905, "learning_rate": 5.187809998641207e-07, "logits/chosen": -1.0390625, "logits/rejected": -0.5078125, "logps/chosen": -516.0, "logps/rejected": -584.0, "loss": 0.0267, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -22.5, "rewards/margins": 11.75, "rewards/rejected": -34.25, "step": 23780 }, { "epoch": 1.7173175485454415, "grad_norm": 5.122109123608406, "learning_rate": 5.186719549864339e-07, "logits/chosen": -0.83984375, "logits/rejected": -0.310546875, "logps/chosen": -494.0, "logps/rejected": -556.0, "loss": 0.0461, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -22.0, "rewards/margins": 10.5625, "rewards/rejected": -32.75, "step": 23790 }, { "epoch": 1.7180394138453765, "grad_norm": 7.8535290081060705, "learning_rate": 5.185629788417315e-07, "logits/chosen": -1.078125, "logits/rejected": -0.3515625, "logps/chosen": -458.0, "logps/rejected": -544.0, "loss": 0.0354, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -19.875, "rewards/margins": 11.75, "rewards/rejected": -31.625, "step": 23800 }, { "epoch": 1.7187612791453115, "grad_norm": 7.304883241564248, "learning_rate": 5.184540713578377e-07, "logits/chosen": -1.0703125, "logits/rejected": -0.400390625, "logps/chosen": -486.0, "logps/rejected": -568.0, "loss": 0.0309, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -21.25, "rewards/margins": 11.5625, "rewards/rejected": -32.75, "step": 23810 }, { "epoch": 1.7194831444452465, "grad_norm": 4.189962440061821, "learning_rate": 5.183452324626827e-07, "logits/chosen": -1.1484375, "logits/rejected": -0.55078125, "logps/chosen": -500.0, "logps/rejected": -556.0, "loss": 0.037, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -20.875, "rewards/margins": 11.3125, "rewards/rejected": -32.0, "step": 23820 }, { "epoch": 1.7202050097451815, "grad_norm": 8.373890862122137, "learning_rate": 5.182364620843029e-07, "logits/chosen": -0.96875, "logits/rejected": -0.427734375, "logps/chosen": -490.0, "logps/rejected": -568.0, "loss": 0.0331, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -20.125, "rewards/margins": 10.9375, "rewards/rejected": -31.0, "step": 23830 }, { "epoch": 1.7209268750451165, "grad_norm": 12.643633690831741, "learning_rate": 5.181277601508397e-07, "logits/chosen": -0.9453125, "logits/rejected": -0.408203125, "logps/chosen": -474.0, "logps/rejected": -556.0, "loss": 0.05, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -20.75, "rewards/margins": 12.25, "rewards/rejected": -33.0, "step": 23840 }, { "epoch": 1.7216487403450516, "grad_norm": 5.537583718670499, "learning_rate": 5.180191265905408e-07, "logits/chosen": -1.078125, "logits/rejected": -0.55859375, "logps/chosen": -464.0, "logps/rejected": -540.0, "loss": 0.0241, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -19.0, "rewards/margins": 12.0, "rewards/rejected": -31.0, "step": 23850 }, { "epoch": 1.7223706056449868, "grad_norm": 3.3704315072097857, "learning_rate": 5.179105613317587e-07, "logits/chosen": -1.0625, "logits/rejected": -0.54296875, "logps/chosen": -496.0, "logps/rejected": -576.0, "loss": 0.0305, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -20.875, "rewards/margins": 12.1875, "rewards/rejected": -33.0, "step": 23860 }, { "epoch": 1.7230924709449216, "grad_norm": 12.348831106769019, "learning_rate": 5.178020643029507e-07, "logits/chosen": -0.93359375, "logits/rejected": -0.5859375, "logps/chosen": -476.0, "logps/rejected": -564.0, "loss": 0.0267, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -20.625, "rewards/margins": 12.1875, "rewards/rejected": -32.75, "step": 23870 }, { "epoch": 1.7238143362448568, "grad_norm": 8.85396858858089, "learning_rate": 5.176936354326795e-07, "logits/chosen": -1.1953125, "logits/rejected": -0.44140625, "logps/chosen": -478.0, "logps/rejected": -576.0, "loss": 0.0393, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -20.75, "rewards/margins": 11.9375, "rewards/rejected": -32.75, "step": 23880 }, { "epoch": 1.7245362015447916, "grad_norm": 6.700497410860626, "learning_rate": 5.175852746496124e-07, "logits/chosen": -0.875, "logits/rejected": -0.43359375, "logps/chosen": -508.0, "logps/rejected": -608.0, "loss": 0.0396, "rewards/accuracies": 1.0, "rewards/chosen": -21.625, "rewards/margins": 12.4375, "rewards/rejected": -34.0, "step": 23890 }, { "epoch": 1.7252580668447268, "grad_norm": 0.785234187524223, "learning_rate": 5.174769818825206e-07, "logits/chosen": -1.078125, "logits/rejected": -0.5859375, "logps/chosen": -474.0, "logps/rejected": -568.0, "loss": 0.0296, "rewards/accuracies": 1.0, "rewards/chosen": -20.75, "rewards/margins": 11.3125, "rewards/rejected": -32.0, "step": 23900 }, { "epoch": 1.7259799321446618, "grad_norm": 6.9368470700647045, "learning_rate": 5.173687570602804e-07, "logits/chosen": -1.0234375, "logits/rejected": -0.44140625, "logps/chosen": -502.0, "logps/rejected": -580.0, "loss": 0.0371, "rewards/accuracies": 0.96875, "rewards/chosen": -21.75, "rewards/margins": 11.8125, "rewards/rejected": -33.5, "step": 23910 }, { "epoch": 1.7267017974445968, "grad_norm": 4.90421159158826, "learning_rate": 5.172606001118717e-07, "logits/chosen": -1.0703125, "logits/rejected": -0.484375, "logps/chosen": -500.0, "logps/rejected": -580.0, "loss": 0.0386, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -21.625, "rewards/margins": 11.9375, "rewards/rejected": -33.5, "step": 23920 }, { "epoch": 1.7274236627445319, "grad_norm": 6.687674913538871, "learning_rate": 5.171525109663781e-07, "logits/chosen": -1.09375, "logits/rejected": -0.4765625, "logps/chosen": -470.0, "logps/rejected": -568.0, "loss": 0.0437, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -20.875, "rewards/margins": 10.875, "rewards/rejected": -31.75, "step": 23930 }, { "epoch": 1.7281455280444669, "grad_norm": 4.034185081179775, "learning_rate": 5.170444895529875e-07, "logits/chosen": -0.98046875, "logits/rejected": -0.470703125, "logps/chosen": -508.0, "logps/rejected": -592.0, "loss": 0.0284, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -20.875, "rewards/margins": 12.125, "rewards/rejected": -33.0, "step": 23940 }, { "epoch": 1.728867393344402, "grad_norm": 2.690717760486488, "learning_rate": 5.169365358009907e-07, "logits/chosen": -1.0546875, "logits/rejected": -0.373046875, "logps/chosen": -460.0, "logps/rejected": -536.0, "loss": 0.0364, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -18.875, "rewards/margins": 11.125, "rewards/rejected": -30.0, "step": 23950 }, { "epoch": 1.7295892586443369, "grad_norm": 2.8337398947207917, "learning_rate": 5.168286496397822e-07, "logits/chosen": -1.1015625, "logits/rejected": -0.384765625, "logps/chosen": -510.0, "logps/rejected": -580.0, "loss": 0.0225, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -20.875, "rewards/margins": 10.875, "rewards/rejected": -31.75, "step": 23960 }, { "epoch": 1.7303111239442721, "grad_norm": 2.2606881840411557, "learning_rate": 5.167208309988594e-07, "logits/chosen": -1.0234375, "logits/rejected": -0.326171875, "logps/chosen": -498.0, "logps/rejected": -580.0, "loss": 0.0353, "rewards/accuracies": 1.0, "rewards/chosen": -20.875, "rewards/margins": 11.8125, "rewards/rejected": -32.75, "step": 23970 }, { "epoch": 1.731032989244207, "grad_norm": 6.515052701131066, "learning_rate": 5.166130798078227e-07, "logits/chosen": -0.9765625, "logits/rejected": -0.466796875, "logps/chosen": -468.0, "logps/rejected": -540.0, "loss": 0.0241, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -19.625, "rewards/margins": 10.75, "rewards/rejected": -30.375, "step": 23980 }, { "epoch": 1.7317548545441421, "grad_norm": 5.037990770125281, "learning_rate": 5.16505395996375e-07, "logits/chosen": -1.0078125, "logits/rejected": -0.64453125, "logps/chosen": -516.0, "logps/rejected": -616.0, "loss": 0.0244, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -21.375, "rewards/margins": 13.6875, "rewards/rejected": -35.0, "step": 23990 }, { "epoch": 1.7324767198440771, "grad_norm": 8.160205596334848, "learning_rate": 5.163977794943222e-07, "logits/chosen": -1.1328125, "logits/rejected": -0.41015625, "logps/chosen": -494.0, "logps/rejected": -576.0, "loss": 0.0392, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -20.625, "rewards/margins": 11.5625, "rewards/rejected": -32.25, "step": 24000 }, { "epoch": 1.7331985851440121, "grad_norm": 3.137214112833886, "learning_rate": 5.162902302315721e-07, "logits/chosen": -0.94921875, "logits/rejected": -0.416015625, "logps/chosen": -472.0, "logps/rejected": -564.0, "loss": 0.0192, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -19.75, "rewards/margins": 12.6875, "rewards/rejected": -32.5, "step": 24010 }, { "epoch": 1.7339204504439472, "grad_norm": 7.421168568756134, "learning_rate": 5.161827481381348e-07, "logits/chosen": -0.96875, "logits/rejected": -0.349609375, "logps/chosen": -520.0, "logps/rejected": -616.0, "loss": 0.0295, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -23.25, "rewards/margins": 12.375, "rewards/rejected": -35.5, "step": 24020 }, { "epoch": 1.7346423157438822, "grad_norm": 4.330069352954876, "learning_rate": 5.160753331441223e-07, "logits/chosen": -1.0390625, "logits/rejected": -0.57421875, "logps/chosen": -496.0, "logps/rejected": -572.0, "loss": 0.0402, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -21.125, "rewards/margins": 11.6875, "rewards/rejected": -32.75, "step": 24030 }, { "epoch": 1.7353641810438172, "grad_norm": 5.303117565802668, "learning_rate": 5.159679851797486e-07, "logits/chosen": -0.8984375, "logits/rejected": -0.353515625, "logps/chosen": -532.0, "logps/rejected": -580.0, "loss": 0.0279, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -22.375, "rewards/margins": 12.25, "rewards/rejected": -34.5, "step": 24040 }, { "epoch": 1.7360860463437522, "grad_norm": 3.0105140120194256, "learning_rate": 5.158607041753288e-07, "logits/chosen": -1.15625, "logits/rejected": -0.423828125, "logps/chosen": -480.0, "logps/rejected": -572.0, "loss": 0.041, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -20.375, "rewards/margins": 11.5, "rewards/rejected": -31.875, "step": 24050 }, { "epoch": 1.7368079116436874, "grad_norm": 2.5124494010089595, "learning_rate": 5.157534900612799e-07, "logits/chosen": -1.1796875, "logits/rejected": -0.49609375, "logps/chosen": -474.0, "logps/rejected": -568.0, "loss": 0.0333, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -22.375, "rewards/margins": 11.8125, "rewards/rejected": -34.25, "step": 24060 }, { "epoch": 1.7375297769436222, "grad_norm": 12.963408706129718, "learning_rate": 5.156463427681196e-07, "logits/chosen": -1.09375, "logits/rejected": -0.486328125, "logps/chosen": -498.0, "logps/rejected": -580.0, "loss": 0.0448, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -20.875, "rewards/margins": 12.4375, "rewards/rejected": -33.5, "step": 24070 }, { "epoch": 1.7382516422435574, "grad_norm": 3.803598647015619, "learning_rate": 5.155392622264671e-07, "logits/chosen": -1.125, "logits/rejected": -0.455078125, "logps/chosen": -504.0, "logps/rejected": -584.0, "loss": 0.0271, "rewards/accuracies": 1.0, "rewards/chosen": -20.875, "rewards/margins": 12.3125, "rewards/rejected": -33.25, "step": 24080 }, { "epoch": 1.7389735075434922, "grad_norm": 6.240280643930149, "learning_rate": 5.154322483670419e-07, "logits/chosen": -1.0234375, "logits/rejected": -0.369140625, "logps/chosen": -480.0, "logps/rejected": -540.0, "loss": 0.0351, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -21.0, "rewards/margins": 11.75, "rewards/rejected": -32.75, "step": 24090 }, { "epoch": 1.7396953728434275, "grad_norm": 9.709936548527788, "learning_rate": 5.153253011206647e-07, "logits/chosen": -1.0703125, "logits/rejected": -0.52734375, "logps/chosen": -448.0, "logps/rejected": -516.0, "loss": 0.0348, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -18.875, "rewards/margins": 10.875, "rewards/rejected": -29.75, "step": 24100 }, { "epoch": 1.7404172381433625, "grad_norm": 7.654059631073427, "learning_rate": 5.15218420418256e-07, "logits/chosen": -1.0, "logits/rejected": -0.3125, "logps/chosen": -472.0, "logps/rejected": -552.0, "loss": 0.0397, "rewards/accuracies": 0.96875, "rewards/chosen": -20.5, "rewards/margins": 11.1875, "rewards/rejected": -31.75, "step": 24110 }, { "epoch": 1.7411391034432975, "grad_norm": 13.544870158993056, "learning_rate": 5.151116061908373e-07, "logits/chosen": -1.0390625, "logits/rejected": -0.42578125, "logps/chosen": -506.0, "logps/rejected": -572.0, "loss": 0.0317, "rewards/accuracies": 1.0, "rewards/chosen": -21.625, "rewards/margins": 11.6875, "rewards/rejected": -33.25, "step": 24120 }, { "epoch": 1.7418609687432325, "grad_norm": 2.5261053881097464, "learning_rate": 5.150048583695291e-07, "logits/chosen": -0.953125, "logits/rejected": -0.30859375, "logps/chosen": -510.0, "logps/rejected": -584.0, "loss": 0.0154, "rewards/accuracies": 1.0, "rewards/chosen": -22.75, "rewards/margins": 11.5, "rewards/rejected": -34.25, "step": 24130 }, { "epoch": 1.7425828340431675, "grad_norm": 9.241346790274621, "learning_rate": 5.14898176885553e-07, "logits/chosen": -1.046875, "logits/rejected": -0.296875, "logps/chosen": -486.0, "logps/rejected": -560.0, "loss": 0.0459, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -21.375, "rewards/margins": 12.1875, "rewards/rejected": -33.5, "step": 24140 }, { "epoch": 1.7433046993431027, "grad_norm": 12.064317887604998, "learning_rate": 5.147915616702294e-07, "logits/chosen": -1.1328125, "logits/rejected": -0.5390625, "logps/chosen": -512.0, "logps/rejected": -600.0, "loss": 0.0361, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -22.375, "rewards/margins": 11.6875, "rewards/rejected": -34.0, "step": 24150 }, { "epoch": 1.7440265646430375, "grad_norm": 8.01059096572459, "learning_rate": 5.146850126549788e-07, "logits/chosen": -0.96484375, "logits/rejected": -0.431640625, "logps/chosen": -524.0, "logps/rejected": -616.0, "loss": 0.0349, "rewards/accuracies": 1.0, "rewards/chosen": -23.25, "rewards/margins": 11.9375, "rewards/rejected": -35.25, "step": 24160 }, { "epoch": 1.7447484299429727, "grad_norm": 12.120792127649393, "learning_rate": 5.145785297713203e-07, "logits/chosen": -1.0078125, "logits/rejected": -0.41015625, "logps/chosen": -500.0, "logps/rejected": -612.0, "loss": 0.0284, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -22.0, "rewards/margins": 11.875, "rewards/rejected": -34.0, "step": 24170 }, { "epoch": 1.7454702952429075, "grad_norm": 2.8161740949525664, "learning_rate": 5.144721129508728e-07, "logits/chosen": -1.1171875, "logits/rejected": -0.470703125, "logps/chosen": -464.0, "logps/rejected": -564.0, "loss": 0.0353, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -20.375, "rewards/margins": 11.875, "rewards/rejected": -32.25, "step": 24180 }, { "epoch": 1.7461921605428428, "grad_norm": 4.145411936346543, "learning_rate": 5.143657621253539e-07, "logits/chosen": -1.0390625, "logits/rejected": -0.39453125, "logps/chosen": -504.0, "logps/rejected": -588.0, "loss": 0.0169, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -22.875, "rewards/margins": 12.375, "rewards/rejected": -35.25, "step": 24190 }, { "epoch": 1.7469140258427778, "grad_norm": 5.24753705498723, "learning_rate": 5.1425947722658e-07, "logits/chosen": -1.1953125, "logits/rejected": -0.5703125, "logps/chosen": -496.0, "logps/rejected": -572.0, "loss": 0.0336, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -22.125, "rewards/margins": 12.3125, "rewards/rejected": -34.5, "step": 24200 }, { "epoch": 1.7476358911427128, "grad_norm": 2.3608893223997462, "learning_rate": 5.141532581864661e-07, "logits/chosen": -0.84765625, "logits/rejected": -0.318359375, "logps/chosen": -488.0, "logps/rejected": -572.0, "loss": 0.0392, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -21.625, "rewards/margins": 12.125, "rewards/rejected": -33.75, "step": 24210 }, { "epoch": 1.7483577564426478, "grad_norm": 6.691457698360428, "learning_rate": 5.140471049370253e-07, "logits/chosen": -1.015625, "logits/rejected": -0.349609375, "logps/chosen": -548.0, "logps/rejected": -620.0, "loss": 0.0319, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -24.125, "rewards/margins": 11.6875, "rewards/rejected": -35.75, "step": 24220 }, { "epoch": 1.7490796217425828, "grad_norm": 2.3606894320496656, "learning_rate": 5.139410174103693e-07, "logits/chosen": -1.1328125, "logits/rejected": -0.478515625, "logps/chosen": -510.0, "logps/rejected": -608.0, "loss": 0.0361, "rewards/accuracies": 1.0, "rewards/chosen": -23.25, "rewards/margins": 12.5, "rewards/rejected": -35.75, "step": 24230 }, { "epoch": 1.7498014870425178, "grad_norm": 2.499089383939129, "learning_rate": 5.138349955387079e-07, "logits/chosen": -1.0, "logits/rejected": -0.337890625, "logps/chosen": -502.0, "logps/rejected": -584.0, "loss": 0.027, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -21.625, "rewards/margins": 12.5, "rewards/rejected": -34.25, "step": 24240 }, { "epoch": 1.7505233523424528, "grad_norm": 3.6739700494643115, "learning_rate": 5.137290392543484e-07, "logits/chosen": -0.921875, "logits/rejected": -0.34375, "logps/chosen": -506.0, "logps/rejected": -592.0, "loss": 0.0336, "rewards/accuracies": 1.0, "rewards/chosen": -20.875, "rewards/margins": 12.25, "rewards/rejected": -33.0, "step": 24250 }, { "epoch": 1.751245217642388, "grad_norm": 12.20973068116013, "learning_rate": 5.13623148489696e-07, "logits/chosen": -0.9453125, "logits/rejected": -0.4765625, "logps/chosen": -492.0, "logps/rejected": -564.0, "loss": 0.0372, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -20.875, "rewards/margins": 11.3125, "rewards/rejected": -32.25, "step": 24260 }, { "epoch": 1.7519670829423228, "grad_norm": 8.1631712388452, "learning_rate": 5.135173231772532e-07, "logits/chosen": -0.84765625, "logits/rejected": -0.388671875, "logps/chosen": -506.0, "logps/rejected": -596.0, "loss": 0.0501, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -23.75, "rewards/margins": 12.0, "rewards/rejected": -35.75, "step": 24270 }, { "epoch": 1.752688948242258, "grad_norm": 7.445951387953033, "learning_rate": 5.134115632496199e-07, "logits/chosen": -0.9375, "logits/rejected": -0.5078125, "logps/chosen": -492.0, "logps/rejected": -592.0, "loss": 0.0305, "rewards/accuracies": 1.0, "rewards/chosen": -22.0, "rewards/margins": 11.875, "rewards/rejected": -33.75, "step": 24280 }, { "epoch": 1.753410813542193, "grad_norm": 13.143014871487303, "learning_rate": 5.133058686394933e-07, "logits/chosen": -0.89453125, "logits/rejected": -0.384765625, "logps/chosen": -464.0, "logps/rejected": -552.0, "loss": 0.0386, "rewards/accuracies": 1.0, "rewards/chosen": -19.75, "rewards/margins": 11.8125, "rewards/rejected": -31.5, "step": 24290 }, { "epoch": 1.754132678842128, "grad_norm": 2.0875855003442934, "learning_rate": 5.132002392796672e-07, "logits/chosen": -1.0, "logits/rejected": -0.458984375, "logps/chosen": -504.0, "logps/rejected": -556.0, "loss": 0.0354, "rewards/accuracies": 0.96875, "rewards/chosen": -21.75, "rewards/margins": 10.8125, "rewards/rejected": -32.5, "step": 24300 }, { "epoch": 1.754854544142063, "grad_norm": 2.4109507158553867, "learning_rate": 5.130946751030328e-07, "logits/chosen": -0.9296875, "logits/rejected": -0.333984375, "logps/chosen": -480.0, "logps/rejected": -556.0, "loss": 0.0284, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -21.875, "rewards/margins": 10.8125, "rewards/rejected": -32.75, "step": 24310 }, { "epoch": 1.755576409441998, "grad_norm": 3.5308516400104475, "learning_rate": 5.129891760425771e-07, "logits/chosen": -1.1484375, "logits/rejected": -0.53125, "logps/chosen": -476.0, "logps/rejected": -540.0, "loss": 0.0486, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -21.375, "rewards/margins": 10.75, "rewards/rejected": -32.25, "step": 24320 }, { "epoch": 1.7562982747419331, "grad_norm": 5.202673290069236, "learning_rate": 5.128837420313838e-07, "logits/chosen": -1.09375, "logits/rejected": -0.5234375, "logps/chosen": -484.0, "logps/rejected": -564.0, "loss": 0.0447, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -20.25, "rewards/margins": 11.9375, "rewards/rejected": -32.25, "step": 24330 }, { "epoch": 1.7570201400418681, "grad_norm": 2.9975685990308425, "learning_rate": 5.127783730026332e-07, "logits/chosen": -1.0859375, "logits/rejected": -0.6015625, "logps/chosen": -500.0, "logps/rejected": -556.0, "loss": 0.0347, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -20.125, "rewards/margins": 11.125, "rewards/rejected": -31.25, "step": 24340 }, { "epoch": 1.7577420053418034, "grad_norm": 6.635607584176257, "learning_rate": 5.126730688896011e-07, "logits/chosen": -0.9609375, "logits/rejected": -0.453125, "logps/chosen": -474.0, "logps/rejected": -564.0, "loss": 0.0249, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -19.875, "rewards/margins": 11.6875, "rewards/rejected": -31.625, "step": 24350 }, { "epoch": 1.7584638706417381, "grad_norm": 1.9348110375315928, "learning_rate": 5.125678296256597e-07, "logits/chosen": -1.0703125, "logits/rejected": -0.412109375, "logps/chosen": -496.0, "logps/rejected": -584.0, "loss": 0.0266, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -20.375, "rewards/margins": 12.5, "rewards/rejected": -32.75, "step": 24360 }, { "epoch": 1.7591857359416734, "grad_norm": 3.5879950358868244, "learning_rate": 5.124626551442763e-07, "logits/chosen": -1.234375, "logits/rejected": -0.482421875, "logps/chosen": -470.0, "logps/rejected": -564.0, "loss": 0.0327, "rewards/accuracies": 1.0, "rewards/chosen": -20.25, "rewards/margins": 12.5625, "rewards/rejected": -32.75, "step": 24370 }, { "epoch": 1.7599076012416082, "grad_norm": 11.351519658664973, "learning_rate": 5.123575453790144e-07, "logits/chosen": -0.9765625, "logits/rejected": -0.435546875, "logps/chosen": -468.0, "logps/rejected": -552.0, "loss": 0.0483, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -19.625, "rewards/margins": 12.0625, "rewards/rejected": -31.75, "step": 24380 }, { "epoch": 1.7606294665415434, "grad_norm": 6.392254332368823, "learning_rate": 5.122525002635324e-07, "logits/chosen": -0.94921875, "logits/rejected": -0.458984375, "logps/chosen": -484.0, "logps/rejected": -556.0, "loss": 0.0295, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -21.625, "rewards/margins": 10.0625, "rewards/rejected": -31.625, "step": 24390 }, { "epoch": 1.7613513318414784, "grad_norm": 8.795011883879122, "learning_rate": 5.121475197315839e-07, "logits/chosen": -0.90234375, "logits/rejected": -0.373046875, "logps/chosen": -494.0, "logps/rejected": -576.0, "loss": 0.039, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -19.5, "rewards/margins": 11.8125, "rewards/rejected": -31.375, "step": 24400 }, { "epoch": 1.7620731971414134, "grad_norm": 2.1767331015377787, "learning_rate": 5.120426037170176e-07, "logits/chosen": -0.94140625, "logits/rejected": -0.38671875, "logps/chosen": -468.0, "logps/rejected": -540.0, "loss": 0.022, "rewards/accuracies": 1.0, "rewards/chosen": -19.875, "rewards/margins": 10.875, "rewards/rejected": -30.75, "step": 24410 }, { "epoch": 1.7627950624413484, "grad_norm": 11.39561651022592, "learning_rate": 5.119377521537771e-07, "logits/chosen": -1.125, "logits/rejected": -0.427734375, "logps/chosen": -502.0, "logps/rejected": -572.0, "loss": 0.0362, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -21.625, "rewards/margins": 11.5625, "rewards/rejected": -33.25, "step": 24420 }, { "epoch": 1.7635169277412834, "grad_norm": 10.806072206619675, "learning_rate": 5.118329649759004e-07, "logits/chosen": -0.8984375, "logits/rejected": -0.390625, "logps/chosen": -504.0, "logps/rejected": -564.0, "loss": 0.0446, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -21.25, "rewards/margins": 11.4375, "rewards/rejected": -32.75, "step": 24430 }, { "epoch": 1.7642387930412187, "grad_norm": 11.274640120967064, "learning_rate": 5.117282421175202e-07, "logits/chosen": -0.91796875, "logits/rejected": -0.28125, "logps/chosen": -496.0, "logps/rejected": -576.0, "loss": 0.0371, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -20.75, "rewards/margins": 11.125, "rewards/rejected": -31.75, "step": 24440 }, { "epoch": 1.7649606583411535, "grad_norm": 15.090674966604878, "learning_rate": 5.116235835128635e-07, "logits/chosen": -0.98828125, "logits/rejected": -0.51953125, "logps/chosen": -506.0, "logps/rejected": -556.0, "loss": 0.0196, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -21.125, "rewards/margins": 10.9375, "rewards/rejected": -32.0, "step": 24450 }, { "epoch": 1.7656825236410887, "grad_norm": 13.853379216351279, "learning_rate": 5.115189890962512e-07, "logits/chosen": -0.84765625, "logits/rejected": -0.310546875, "logps/chosen": -500.0, "logps/rejected": -584.0, "loss": 0.0278, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -21.125, "rewards/margins": 11.875, "rewards/rejected": -33.0, "step": 24460 }, { "epoch": 1.7664043889410235, "grad_norm": 2.9633137550468662, "learning_rate": 5.114144588020982e-07, "logits/chosen": -1.0546875, "logits/rejected": -0.55859375, "logps/chosen": -476.0, "logps/rejected": -576.0, "loss": 0.0279, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -21.25, "rewards/margins": 11.5, "rewards/rejected": -32.75, "step": 24470 }, { "epoch": 1.7671262542409587, "grad_norm": 7.351324781513057, "learning_rate": 5.113099925649136e-07, "logits/chosen": -0.9453125, "logits/rejected": -0.376953125, "logps/chosen": -498.0, "logps/rejected": -572.0, "loss": 0.0237, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -22.125, "rewards/margins": 11.625, "rewards/rejected": -33.75, "step": 24480 }, { "epoch": 1.7678481195408937, "grad_norm": 3.0837252840640366, "learning_rate": 5.112055903192996e-07, "logits/chosen": -1.0703125, "logits/rejected": -0.515625, "logps/chosen": -484.0, "logps/rejected": -564.0, "loss": 0.0391, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -20.125, "rewards/margins": 11.875, "rewards/rejected": -32.0, "step": 24490 }, { "epoch": 1.7685699848408287, "grad_norm": 9.211213868410985, "learning_rate": 5.111012519999519e-07, "logits/chosen": -1.015625, "logits/rejected": -0.52734375, "logps/chosen": -492.0, "logps/rejected": -568.0, "loss": 0.0484, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -20.0, "rewards/margins": 12.375, "rewards/rejected": -32.25, "step": 24500 }, { "epoch": 1.7692918501407637, "grad_norm": 3.549503128532949, "learning_rate": 5.109969775416598e-07, "logits/chosen": -0.95703125, "logits/rejected": -0.396484375, "logps/chosen": -498.0, "logps/rejected": -592.0, "loss": 0.0317, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -21.125, "rewards/margins": 12.1875, "rewards/rejected": -33.25, "step": 24510 }, { "epoch": 1.7700137154406987, "grad_norm": 8.322971156912251, "learning_rate": 5.108927668793053e-07, "logits/chosen": -1.1015625, "logits/rejected": -0.2890625, "logps/chosen": -474.0, "logps/rejected": -544.0, "loss": 0.0377, "rewards/accuracies": 1.0, "rewards/chosen": -21.0, "rewards/margins": 12.1875, "rewards/rejected": -33.0, "step": 24520 }, { "epoch": 1.7707355807406338, "grad_norm": 9.894569901055187, "learning_rate": 5.107886199478635e-07, "logits/chosen": -0.8203125, "logits/rejected": -0.3125, "logps/chosen": -504.0, "logps/rejected": -568.0, "loss": 0.0228, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -21.125, "rewards/margins": 10.875, "rewards/rejected": -32.0, "step": 24530 }, { "epoch": 1.7714574460405688, "grad_norm": 6.130957549172793, "learning_rate": 5.106845366824023e-07, "logits/chosen": -0.9609375, "logits/rejected": -0.4453125, "logps/chosen": -498.0, "logps/rejected": -600.0, "loss": 0.0429, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -21.875, "rewards/margins": 11.6875, "rewards/rejected": -33.5, "step": 24540 }, { "epoch": 1.772179311340504, "grad_norm": 14.848684283032087, "learning_rate": 5.105805170180822e-07, "logits/chosen": -1.0390625, "logits/rejected": -0.396484375, "logps/chosen": -456.0, "logps/rejected": -556.0, "loss": 0.0403, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -20.375, "rewards/margins": 11.125, "rewards/rejected": -31.5, "step": 24550 }, { "epoch": 1.7729011766404388, "grad_norm": 6.05140876393501, "learning_rate": 5.104765608901559e-07, "logits/chosen": -1.1328125, "logits/rejected": -0.54296875, "logps/chosen": -484.0, "logps/rejected": -560.0, "loss": 0.0309, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -21.5, "rewards/margins": 10.0625, "rewards/rejected": -31.5, "step": 24560 }, { "epoch": 1.773623041940374, "grad_norm": 4.327279630731724, "learning_rate": 5.103726682339685e-07, "logits/chosen": -0.99609375, "logits/rejected": -0.4296875, "logps/chosen": -488.0, "logps/rejected": -552.0, "loss": 0.0379, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -20.25, "rewards/margins": 11.0625, "rewards/rejected": -31.375, "step": 24570 }, { "epoch": 1.7743449072403088, "grad_norm": 9.381107896049398, "learning_rate": 5.102688389849571e-07, "logits/chosen": -1.125, "logits/rejected": -0.455078125, "logps/chosen": -470.0, "logps/rejected": -548.0, "loss": 0.0307, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -19.875, "rewards/margins": 11.25, "rewards/rejected": -31.125, "step": 24580 }, { "epoch": 1.775066772540244, "grad_norm": 3.1740707319507964, "learning_rate": 5.101650730786509e-07, "logits/chosen": -0.98828125, "logits/rejected": -0.236328125, "logps/chosen": -486.0, "logps/rejected": -564.0, "loss": 0.0264, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -21.625, "rewards/margins": 11.9375, "rewards/rejected": -33.5, "step": 24590 }, { "epoch": 1.775788637840179, "grad_norm": 8.258272967719723, "learning_rate": 5.100613704506706e-07, "logits/chosen": -1.0703125, "logits/rejected": -0.5625, "logps/chosen": -484.0, "logps/rejected": -564.0, "loss": 0.0256, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -20.625, "rewards/margins": 11.375, "rewards/rejected": -32.0, "step": 24600 }, { "epoch": 1.776510503140114, "grad_norm": 7.671525793397803, "learning_rate": 5.099577310367286e-07, "logits/chosen": -1.09375, "logits/rejected": -0.3359375, "logps/chosen": -498.0, "logps/rejected": -584.0, "loss": 0.0277, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -21.375, "rewards/margins": 11.9375, "rewards/rejected": -33.25, "step": 24610 }, { "epoch": 1.777232368440049, "grad_norm": 3.8926418864735832, "learning_rate": 5.098541547726285e-07, "logits/chosen": -0.96484375, "logits/rejected": -0.34765625, "logps/chosen": -494.0, "logps/rejected": -592.0, "loss": 0.0324, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -20.125, "rewards/margins": 11.75, "rewards/rejected": -31.875, "step": 24620 }, { "epoch": 1.777954233739984, "grad_norm": 4.3893262524998224, "learning_rate": 5.097506415942655e-07, "logits/chosen": -1.0234375, "logits/rejected": -0.46484375, "logps/chosen": -480.0, "logps/rejected": -584.0, "loss": 0.026, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -20.125, "rewards/margins": 12.5, "rewards/rejected": -32.75, "step": 24630 }, { "epoch": 1.7786760990399193, "grad_norm": 5.545361084841591, "learning_rate": 5.096471914376254e-07, "logits/chosen": -1.0078125, "logits/rejected": -0.3828125, "logps/chosen": -494.0, "logps/rejected": -604.0, "loss": 0.0542, "rewards/accuracies": 1.0, "rewards/chosen": -22.625, "rewards/margins": 11.75, "rewards/rejected": -34.25, "step": 24640 }, { "epoch": 1.779397964339854, "grad_norm": 5.331055847420215, "learning_rate": 5.095438042387856e-07, "logits/chosen": -1.0234375, "logits/rejected": -0.54296875, "logps/chosen": -498.0, "logps/rejected": -564.0, "loss": 0.0518, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -21.375, "rewards/margins": 11.3125, "rewards/rejected": -32.5, "step": 24650 }, { "epoch": 1.7801198296397893, "grad_norm": 7.207254066069479, "learning_rate": 5.094404799339134e-07, "logits/chosen": -1.1953125, "logits/rejected": -0.5546875, "logps/chosen": -478.0, "logps/rejected": -576.0, "loss": 0.0297, "rewards/accuracies": 1.0, "rewards/chosen": -19.875, "rewards/margins": 11.625, "rewards/rejected": -31.5, "step": 24660 }, { "epoch": 1.780841694939724, "grad_norm": 9.613015835318992, "learning_rate": 5.093372184592671e-07, "logits/chosen": -0.953125, "logits/rejected": -0.34765625, "logps/chosen": -480.0, "logps/rejected": -568.0, "loss": 0.0441, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -21.75, "rewards/margins": 11.625, "rewards/rejected": -33.5, "step": 24670 }, { "epoch": 1.7815635602396593, "grad_norm": 9.539411240535305, "learning_rate": 5.092340197511956e-07, "logits/chosen": -0.96484375, "logits/rejected": -0.466796875, "logps/chosen": -470.0, "logps/rejected": -556.0, "loss": 0.0239, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -19.75, "rewards/margins": 11.6875, "rewards/rejected": -31.375, "step": 24680 }, { "epoch": 1.7822854255395943, "grad_norm": 5.772706741685414, "learning_rate": 5.091308837461377e-07, "logits/chosen": -1.0703125, "logits/rejected": -0.50390625, "logps/chosen": -444.0, "logps/rejected": -536.0, "loss": 0.0228, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -18.5, "rewards/margins": 11.1875, "rewards/rejected": -29.75, "step": 24690 }, { "epoch": 1.7830072908395294, "grad_norm": 8.44975084129519, "learning_rate": 5.090278103806222e-07, "logits/chosen": -1.09375, "logits/rejected": -0.498046875, "logps/chosen": -492.0, "logps/rejected": -572.0, "loss": 0.0399, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -20.375, "rewards/margins": 11.0625, "rewards/rejected": -31.5, "step": 24700 }, { "epoch": 1.7837291561394644, "grad_norm": 8.809394957487802, "learning_rate": 5.089247995912683e-07, "logits/chosen": -1.0078125, "logits/rejected": -0.46484375, "logps/chosen": -460.0, "logps/rejected": -560.0, "loss": 0.0246, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -19.75, "rewards/margins": 11.75, "rewards/rejected": -31.5, "step": 24710 }, { "epoch": 1.7844510214393994, "grad_norm": 8.809224236819778, "learning_rate": 5.088218513147844e-07, "logits/chosen": -1.1328125, "logits/rejected": -0.5078125, "logps/chosen": -498.0, "logps/rejected": -560.0, "loss": 0.049, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -20.375, "rewards/margins": 11.1875, "rewards/rejected": -31.625, "step": 24720 }, { "epoch": 1.7851728867393344, "grad_norm": 3.66516206224327, "learning_rate": 5.087189654879688e-07, "logits/chosen": -1.1953125, "logits/rejected": -0.392578125, "logps/chosen": -460.0, "logps/rejected": -568.0, "loss": 0.0214, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -19.125, "rewards/margins": 11.875, "rewards/rejected": -31.0, "step": 24730 }, { "epoch": 1.7858947520392694, "grad_norm": 8.263744709062568, "learning_rate": 5.086161420477092e-07, "logits/chosen": -1.1640625, "logits/rejected": -0.55859375, "logps/chosen": -476.0, "logps/rejected": -560.0, "loss": 0.0416, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -20.25, "rewards/margins": 11.25, "rewards/rejected": -31.5, "step": 24740 }, { "epoch": 1.7866166173392046, "grad_norm": 9.086069011860891, "learning_rate": 5.085133809309825e-07, "logits/chosen": -1.0625, "logits/rejected": -0.46875, "logps/chosen": -472.0, "logps/rejected": -568.0, "loss": 0.0386, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -19.625, "rewards/margins": 11.75, "rewards/rejected": -31.375, "step": 24750 }, { "epoch": 1.7873384826391394, "grad_norm": 5.97793559636165, "learning_rate": 5.084106820748547e-07, "logits/chosen": -1.0859375, "logits/rejected": -0.54296875, "logps/chosen": -486.0, "logps/rejected": -556.0, "loss": 0.0396, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -19.125, "rewards/margins": 11.9375, "rewards/rejected": -31.0, "step": 24760 }, { "epoch": 1.7880603479390746, "grad_norm": 4.151491232457048, "learning_rate": 5.083080454164808e-07, "logits/chosen": -0.8828125, "logits/rejected": -0.443359375, "logps/chosen": -438.0, "logps/rejected": -552.0, "loss": 0.0472, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -18.375, "rewards/margins": 11.3125, "rewards/rejected": -29.625, "step": 24770 }, { "epoch": 1.7887822132390097, "grad_norm": 3.425504893675308, "learning_rate": 5.082054708931043e-07, "logits/chosen": -1.0546875, "logits/rejected": -0.5234375, "logps/chosen": -492.0, "logps/rejected": -532.0, "loss": 0.0386, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -19.0, "rewards/margins": 11.1875, "rewards/rejected": -30.25, "step": 24780 }, { "epoch": 1.7895040785389447, "grad_norm": 7.214116244908318, "learning_rate": 5.081029584420579e-07, "logits/chosen": -0.91015625, "logits/rejected": -0.3984375, "logps/chosen": -476.0, "logps/rejected": -548.0, "loss": 0.0339, "rewards/accuracies": 1.0, "rewards/chosen": -20.25, "rewards/margins": 11.625, "rewards/rejected": -31.75, "step": 24790 }, { "epoch": 1.7902259438388797, "grad_norm": 5.259982567570186, "learning_rate": 5.08000508000762e-07, "logits/chosen": -1.0234375, "logits/rejected": -0.373046875, "logps/chosen": -472.0, "logps/rejected": -564.0, "loss": 0.03, "rewards/accuracies": 1.0, "rewards/chosen": -19.875, "rewards/margins": 12.4375, "rewards/rejected": -32.25, "step": 24800 }, { "epoch": 1.7909478091388147, "grad_norm": 8.728886524347773, "learning_rate": 5.078981195067258e-07, "logits/chosen": -1.1015625, "logits/rejected": -0.421875, "logps/chosen": -476.0, "logps/rejected": -556.0, "loss": 0.0371, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -19.625, "rewards/margins": 11.5625, "rewards/rejected": -31.125, "step": 24810 }, { "epoch": 1.7916696744387497, "grad_norm": 11.25366335712021, "learning_rate": 5.077957928975466e-07, "logits/chosen": -1.09375, "logits/rejected": -0.419921875, "logps/chosen": -488.0, "logps/rejected": -560.0, "loss": 0.0458, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -19.75, "rewards/margins": 11.875, "rewards/rejected": -31.625, "step": 24820 }, { "epoch": 1.7923915397386847, "grad_norm": 3.1339164188150592, "learning_rate": 5.076935281109094e-07, "logits/chosen": -1.0390625, "logits/rejected": -0.484375, "logps/chosen": -496.0, "logps/rejected": -572.0, "loss": 0.0242, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -21.125, "rewards/margins": 11.0625, "rewards/rejected": -32.25, "step": 24830 }, { "epoch": 1.79311340503862, "grad_norm": 3.079614321886142, "learning_rate": 5.075913250845874e-07, "logits/chosen": -0.9921875, "logits/rejected": -0.318359375, "logps/chosen": -480.0, "logps/rejected": -560.0, "loss": 0.0375, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -19.75, "rewards/margins": 11.8125, "rewards/rejected": -31.5, "step": 24840 }, { "epoch": 1.7938352703385547, "grad_norm": 4.58977877448314, "learning_rate": 5.074891837564409e-07, "logits/chosen": -1.125, "logits/rejected": -0.34375, "logps/chosen": -488.0, "logps/rejected": -580.0, "loss": 0.0321, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -20.25, "rewards/margins": 12.5, "rewards/rejected": -32.75, "step": 24850 }, { "epoch": 1.79455713563849, "grad_norm": 8.332475375052159, "learning_rate": 5.073871040644184e-07, "logits/chosen": -1.2265625, "logits/rejected": -0.5078125, "logps/chosen": -492.0, "logps/rejected": -536.0, "loss": 0.032, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -20.125, "rewards/margins": 11.125, "rewards/rejected": -31.25, "step": 24860 }, { "epoch": 1.7952790009384247, "grad_norm": 8.781473849374143, "learning_rate": 5.072850859465551e-07, "logits/chosen": -1.0078125, "logits/rejected": -0.4296875, "logps/chosen": -452.0, "logps/rejected": -532.0, "loss": 0.0415, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -19.375, "rewards/margins": 11.3125, "rewards/rejected": -30.75, "step": 24870 }, { "epoch": 1.79600086623836, "grad_norm": 9.6549525108276, "learning_rate": 5.071831293409737e-07, "logits/chosen": -1.0234375, "logits/rejected": -0.3515625, "logps/chosen": -482.0, "logps/rejected": -544.0, "loss": 0.0359, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -19.875, "rewards/margins": 11.1875, "rewards/rejected": -31.125, "step": 24880 }, { "epoch": 1.796722731538295, "grad_norm": 2.4373812781596786, "learning_rate": 5.07081234185884e-07, "logits/chosen": -1.0078125, "logits/rejected": -0.48828125, "logps/chosen": -464.0, "logps/rejected": -548.0, "loss": 0.0514, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -18.5, "rewards/margins": 11.125, "rewards/rejected": -29.625, "step": 24890 }, { "epoch": 1.79744459683823, "grad_norm": 3.6124820114884666, "learning_rate": 5.069794004195823e-07, "logits/chosen": -0.91015625, "logits/rejected": -0.255859375, "logps/chosen": -490.0, "logps/rejected": -548.0, "loss": 0.0511, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -20.625, "rewards/margins": 11.0, "rewards/rejected": -31.625, "step": 24900 }, { "epoch": 1.798166462138165, "grad_norm": 4.832435980553248, "learning_rate": 5.06877627980452e-07, "logits/chosen": -1.0625, "logits/rejected": -0.51953125, "logps/chosen": -502.0, "logps/rejected": -568.0, "loss": 0.0316, "rewards/accuracies": 1.0, "rewards/chosen": -20.25, "rewards/margins": 11.75, "rewards/rejected": -32.0, "step": 24910 }, { "epoch": 1.7988883274381, "grad_norm": 2.782236393768399, "learning_rate": 5.067759168069628e-07, "logits/chosen": -0.84765625, "logits/rejected": -0.388671875, "logps/chosen": -468.0, "logps/rejected": -552.0, "loss": 0.0235, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -19.875, "rewards/margins": 11.1875, "rewards/rejected": -31.0, "step": 24920 }, { "epoch": 1.7996101927380352, "grad_norm": 17.82623396574509, "learning_rate": 5.066742668376709e-07, "logits/chosen": -1.0234375, "logits/rejected": -0.48046875, "logps/chosen": -470.0, "logps/rejected": -552.0, "loss": 0.0411, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -19.375, "rewards/margins": 11.0625, "rewards/rejected": -30.5, "step": 24930 }, { "epoch": 1.80033205803797, "grad_norm": 8.38276046287854, "learning_rate": 5.065726780112187e-07, "logits/chosen": -1.1171875, "logits/rejected": -0.55078125, "logps/chosen": -498.0, "logps/rejected": -588.0, "loss": 0.045, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -20.5, "rewards/margins": 11.25, "rewards/rejected": -31.75, "step": 24940 }, { "epoch": 1.8010539233379053, "grad_norm": 3.0166509448852215, "learning_rate": 5.064711502663347e-07, "logits/chosen": -0.98046875, "logits/rejected": -0.51171875, "logps/chosen": -492.0, "logps/rejected": -556.0, "loss": 0.0429, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -20.375, "rewards/margins": 11.25, "rewards/rejected": -31.625, "step": 24950 }, { "epoch": 1.80177578863784, "grad_norm": 3.542614533014246, "learning_rate": 5.063696835418333e-07, "logits/chosen": -1.0625, "logits/rejected": -0.470703125, "logps/chosen": -488.0, "logps/rejected": -572.0, "loss": 0.0339, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -21.25, "rewards/margins": 11.3125, "rewards/rejected": -32.5, "step": 24960 }, { "epoch": 1.8024976539377753, "grad_norm": 13.270300621382177, "learning_rate": 5.062682777766146e-07, "logits/chosen": -0.9921875, "logits/rejected": -0.392578125, "logps/chosen": -520.0, "logps/rejected": -588.0, "loss": 0.0463, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -22.0, "rewards/margins": 11.875, "rewards/rejected": -33.75, "step": 24970 }, { "epoch": 1.8032195192377103, "grad_norm": 8.750891011584928, "learning_rate": 5.061669329096646e-07, "logits/chosen": -0.90234375, "logits/rejected": -0.318359375, "logps/chosen": -478.0, "logps/rejected": -544.0, "loss": 0.0356, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -20.375, "rewards/margins": 11.375, "rewards/rejected": -31.625, "step": 24980 }, { "epoch": 1.8039413845376453, "grad_norm": 11.165541397045487, "learning_rate": 5.060656488800544e-07, "logits/chosen": -0.96875, "logits/rejected": -0.3125, "logps/chosen": -520.0, "logps/rejected": -588.0, "loss": 0.0281, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -21.125, "rewards/margins": 11.0625, "rewards/rejected": -32.25, "step": 24990 }, { "epoch": 1.8046632498375803, "grad_norm": 4.440859091210994, "learning_rate": 5.059644256269407e-07, "logits/chosen": -0.90234375, "logits/rejected": -0.314453125, "logps/chosen": -478.0, "logps/rejected": -576.0, "loss": 0.0339, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -20.875, "rewards/margins": 12.625, "rewards/rejected": -33.5, "step": 25000 }, { "epoch": 1.8046632498375803, "eval_logits/chosen": -0.89453125, "eval_logits/rejected": -0.38671875, "eval_logps/chosen": -510.0, "eval_logps/rejected": -564.0, "eval_loss": 0.2600978910923004, "eval_rewards/accuracies": 0.9182629585266113, "eval_rewards/chosen": -22.75, "eval_rewards/margins": 9.375, "eval_rewards/rejected": -32.25, "eval_runtime": 2855.0098, "eval_samples_per_second": 34.501, "eval_steps_per_second": 0.539, "step": 25000 } ], "logging_steps": 10, "max_steps": 27706, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 5000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }