{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.997144739732045, "eval_steps": 500, "global_step": 4552, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.008785416209092905, "grad_norm": 151.09651533448113, "learning_rate": 9.980228471001756e-07, "logits/chosen": 1.0671875476837158, "logits/rejected": 1.0962402820587158, "logps/chosen": -356.20001220703125, "logps/rejected": -320.7875061035156, "loss": 0.6841, "rewards/accuracies": 0.3375000059604645, "rewards/chosen": 0.1537178009748459, "rewards/margins": 0.03141021728515625, "rewards/rejected": 0.12230034172534943, "step": 10 }, { "epoch": 0.01757083241818581, "grad_norm": 114.92940859686446, "learning_rate": 9.958260105448154e-07, "logits/chosen": 0.886889636516571, "logits/rejected": 0.830902099609375, "logps/chosen": -390.70001220703125, "logps/rejected": -320.95001220703125, "loss": 0.6616, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": 0.2272384613752365, "rewards/margins": 0.10868529975414276, "rewards/rejected": 0.11845703423023224, "step": 20 }, { "epoch": 0.026356248627278717, "grad_norm": 99.4281321979489, "learning_rate": 9.936291739894551e-07, "logits/chosen": 0.971435546875, "logits/rejected": 0.9229980707168579, "logps/chosen": -383.2875061035156, "logps/rejected": -338.0, "loss": 0.6233, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": 0.30714720487594604, "rewards/margins": 0.26402586698532104, "rewards/rejected": 0.04339599609375, "step": 30 }, { "epoch": 0.03514166483637162, "grad_norm": 137.47037563285127, "learning_rate": 9.91432337434095e-07, "logits/chosen": 1.034570336341858, "logits/rejected": 0.98516845703125, "logps/chosen": -363.5375061035156, "logps/rejected": -332.63751220703125, "loss": 0.6734, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": 0.5036255121231079, "rewards/margins": 0.23667296767234802, "rewards/rejected": 0.2673172056674957, "step": 40 }, { "epoch": 0.04392708104546453, "grad_norm": 153.12952279834127, "learning_rate": 9.892355008787344e-07, "logits/chosen": 1.106201171875, "logits/rejected": 1.016687035560608, "logps/chosen": -380.4624938964844, "logps/rejected": -332.875, "loss": 0.6386, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.7356628179550171, "rewards/margins": 0.3098083436489105, "rewards/rejected": 0.426025390625, "step": 50 }, { "epoch": 0.052712497254557435, "grad_norm": 128.4773480962962, "learning_rate": 9.870386643233744e-07, "logits/chosen": 0.734082043170929, "logits/rejected": 0.75872802734375, "logps/chosen": -366.76251220703125, "logps/rejected": -322.79998779296875, "loss": 0.6354, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.4343017637729645, "rewards/margins": 0.3361572325229645, "rewards/rejected": 0.09842529147863388, "step": 60 }, { "epoch": 0.06149791346365034, "grad_norm": 121.07915020845833, "learning_rate": 9.84841827768014e-07, "logits/chosen": 0.85595703125, "logits/rejected": 0.846264660358429, "logps/chosen": -369.45001220703125, "logps/rejected": -343.88751220703125, "loss": 0.6002, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.21712645888328552, "rewards/margins": 0.4567016661167145, "rewards/rejected": -0.23920592665672302, "step": 70 }, { "epoch": 0.07028332967274324, "grad_norm": 167.33011884780123, "learning_rate": 9.826449912126537e-07, "logits/chosen": 0.883953869342804, "logits/rejected": 0.928759753704071, "logps/chosen": -369.3125, "logps/rejected": -327.95001220703125, "loss": 0.7009, "rewards/accuracies": 0.543749988079071, "rewards/chosen": 0.37826308608055115, "rewards/margins": 0.19295044243335724, "rewards/rejected": 0.18565063178539276, "step": 80 }, { "epoch": 0.07906874588183616, "grad_norm": 150.80725911305663, "learning_rate": 9.804481546572935e-07, "logits/chosen": 0.928088366985321, "logits/rejected": 0.9686279296875, "logps/chosen": -352.3125, "logps/rejected": -329.54998779296875, "loss": 0.6061, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": 0.40784913301467896, "rewards/margins": 0.41189271211624146, "rewards/rejected": -0.0037445067428052425, "step": 90 }, { "epoch": 0.08785416209092906, "grad_norm": 126.47482435598091, "learning_rate": 9.782513181019332e-07, "logits/chosen": 0.685253918170929, "logits/rejected": 0.7221924066543579, "logps/chosen": -383.8500061035156, "logps/rejected": -353.11248779296875, "loss": 0.6127, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.20036621391773224, "rewards/margins": 0.3989120423793793, "rewards/rejected": -0.19847717881202698, "step": 100 }, { "epoch": 0.09663957830002197, "grad_norm": 186.75022745914072, "learning_rate": 9.760544815465728e-07, "logits/chosen": 0.937835693359375, "logits/rejected": 0.796038806438446, "logps/chosen": -371.9624938964844, "logps/rejected": -344.63751220703125, "loss": 0.5914, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 0.12029419094324112, "rewards/margins": 0.524304211139679, "rewards/rejected": -0.4040679931640625, "step": 110 }, { "epoch": 0.10542499450911487, "grad_norm": 126.28462795794948, "learning_rate": 9.738576449912126e-07, "logits/chosen": 0.7960571050643921, "logits/rejected": 0.7025146484375, "logps/chosen": -310.32501220703125, "logps/rejected": -349.54998779296875, "loss": 0.6606, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.01264038123190403, "rewards/margins": 0.2706542909145355, "rewards/rejected": -0.2831375002861023, "step": 120 }, { "epoch": 0.11421041071820777, "grad_norm": 143.9051601011162, "learning_rate": 9.716608084358523e-07, "logits/chosen": 0.6286102533340454, "logits/rejected": 0.5620483160018921, "logps/chosen": -367.57501220703125, "logps/rejected": -336.9624938964844, "loss": 0.5295, "rewards/accuracies": 0.71875, "rewards/chosen": -0.04833984375, "rewards/margins": 0.744854748249054, "rewards/rejected": -0.7928680181503296, "step": 130 }, { "epoch": 0.12299582692730068, "grad_norm": 127.86618676843386, "learning_rate": 9.69463971880492e-07, "logits/chosen": 0.638824462890625, "logits/rejected": 0.7274414300918579, "logps/chosen": -393.25, "logps/rejected": -368.20001220703125, "loss": 0.5919, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.06902465969324112, "rewards/margins": 0.5027099847793579, "rewards/rejected": -0.5716583132743835, "step": 140 }, { "epoch": 0.13178124313639358, "grad_norm": 114.43515251869901, "learning_rate": 9.672671353251316e-07, "logits/chosen": 0.6732284426689148, "logits/rejected": 0.5601867437362671, "logps/chosen": -435.32501220703125, "logps/rejected": -353.7749938964844, "loss": 0.5841, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.16024169325828552, "rewards/margins": 0.6184753179550171, "rewards/rejected": -0.458221435546875, "step": 150 }, { "epoch": 0.14056665934548648, "grad_norm": 87.24458775305732, "learning_rate": 9.650702987697716e-07, "logits/chosen": 0.8066161870956421, "logits/rejected": 0.7737792730331421, "logps/chosen": -336.54998779296875, "logps/rejected": -305.51251220703125, "loss": 0.6329, "rewards/accuracies": 0.59375, "rewards/chosen": 0.5506836175918579, "rewards/margins": 0.4502502381801605, "rewards/rejected": 0.10050658881664276, "step": 160 }, { "epoch": 0.1493520755545794, "grad_norm": 115.40282927817894, "learning_rate": 9.628734622144111e-07, "logits/chosen": 0.931384265422821, "logits/rejected": 0.9638305902481079, "logps/chosen": -354.8500061035156, "logps/rejected": -321.9375, "loss": 0.6026, "rewards/accuracies": 0.65625, "rewards/chosen": 0.42521363496780396, "rewards/margins": 0.48103028535842896, "rewards/rejected": -0.0561065673828125, "step": 170 }, { "epoch": 0.1581374917636723, "grad_norm": 140.2321782908542, "learning_rate": 9.60676625659051e-07, "logits/chosen": 0.9021056890487671, "logits/rejected": 0.834228515625, "logps/chosen": -360.7250061035156, "logps/rejected": -312.20001220703125, "loss": 0.6449, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.27436599135398865, "rewards/margins": 0.42230224609375, "rewards/rejected": -0.14810791611671448, "step": 180 }, { "epoch": 0.1669229079727652, "grad_norm": 163.67649617231362, "learning_rate": 9.584797891036907e-07, "logits/chosen": 0.9392334222793579, "logits/rejected": 0.9095458984375, "logps/chosen": -320.65625, "logps/rejected": -314.42498779296875, "loss": 0.6526, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": 0.12119140475988388, "rewards/margins": 0.43657225370407104, "rewards/rejected": -0.31494444608688354, "step": 190 }, { "epoch": 0.1757083241818581, "grad_norm": 177.61724015544777, "learning_rate": 9.562829525483304e-07, "logits/chosen": 0.6334503293037415, "logits/rejected": 0.5529083013534546, "logps/chosen": -371.82501220703125, "logps/rejected": -308.36248779296875, "loss": 0.5558, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.16226807236671448, "rewards/margins": 0.6910949945449829, "rewards/rejected": -0.5287994146347046, "step": 200 }, { "epoch": 0.184493740390951, "grad_norm": 134.87896103033913, "learning_rate": 9.5408611599297e-07, "logits/chosen": 0.7063964605331421, "logits/rejected": 0.7430664300918579, "logps/chosen": -404.875, "logps/rejected": -376.54998779296875, "loss": 0.6428, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.11356201022863388, "rewards/margins": 0.525054931640625, "rewards/rejected": -0.41135865449905396, "step": 210 }, { "epoch": 0.19327915660004394, "grad_norm": 105.82311336201163, "learning_rate": 9.518892794376097e-07, "logits/chosen": 0.7984985113143921, "logits/rejected": 0.86920166015625, "logps/chosen": -356.9624938964844, "logps/rejected": -347.6625061035156, "loss": 0.6127, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.558428943157196, "rewards/margins": 0.46592408418655396, "rewards/rejected": 0.091888427734375, "step": 220 }, { "epoch": 0.20206457280913684, "grad_norm": 144.91586500580095, "learning_rate": 9.496924428822495e-07, "logits/chosen": 0.983593761920929, "logits/rejected": 0.9734131097793579, "logps/chosen": -361.32501220703125, "logps/rejected": -356.5375061035156, "loss": 0.5888, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 1.07080078125, "rewards/margins": 0.6333984136581421, "rewards/rejected": 0.4374328553676605, "step": 230 }, { "epoch": 0.21084998901822974, "grad_norm": 141.03037870356604, "learning_rate": 9.474956063268892e-07, "logits/chosen": 0.7502807378768921, "logits/rejected": 0.746752917766571, "logps/chosen": -382.42498779296875, "logps/rejected": -353.07501220703125, "loss": 0.713, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.6906067132949829, "rewards/margins": 0.4616332948207855, "rewards/rejected": 0.22933349013328552, "step": 240 }, { "epoch": 0.21963540522732264, "grad_norm": 169.6441648862064, "learning_rate": 9.45298769771529e-07, "logits/chosen": 0.7519286870956421, "logits/rejected": 0.853442370891571, "logps/chosen": -368.75, "logps/rejected": -373.57501220703125, "loss": 0.6581, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 0.15360108017921448, "rewards/margins": 0.4932006895542145, "rewards/rejected": -0.33977049589157104, "step": 250 }, { "epoch": 0.22842082143641554, "grad_norm": 108.61478658142337, "learning_rate": 9.431019332161687e-07, "logits/chosen": 0.812304675579071, "logits/rejected": 0.679516613483429, "logps/chosen": -402.54998779296875, "logps/rejected": -380.20001220703125, "loss": 0.7215, "rewards/accuracies": 0.581250011920929, "rewards/chosen": 0.17167052626609802, "rewards/margins": 0.3968139588832855, "rewards/rejected": -0.2255859375, "step": 260 }, { "epoch": 0.23720623764550847, "grad_norm": 124.32670733741226, "learning_rate": 9.409050966608084e-07, "logits/chosen": 0.83447265625, "logits/rejected": 0.785534679889679, "logps/chosen": -352.54998779296875, "logps/rejected": -350.0, "loss": 0.5861, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 0.6496948003768921, "rewards/margins": 0.5508362054824829, "rewards/rejected": 0.0981292724609375, "step": 270 }, { "epoch": 0.24599165385460137, "grad_norm": 116.0704456044606, "learning_rate": 9.387082601054481e-07, "logits/chosen": 0.722949206829071, "logits/rejected": 0.753491222858429, "logps/chosen": -355.875, "logps/rejected": -330.4125061035156, "loss": 0.552, "rewards/accuracies": 0.6875, "rewards/chosen": 0.6617676019668579, "rewards/margins": 0.65631103515625, "rewards/rejected": 0.0050292969681322575, "step": 280 }, { "epoch": 0.25477707006369427, "grad_norm": 108.1916315449721, "learning_rate": 9.365114235500879e-07, "logits/chosen": 0.733715832233429, "logits/rejected": 0.7449401617050171, "logps/chosen": -413.03125, "logps/rejected": -353.32501220703125, "loss": 0.6047, "rewards/accuracies": 0.65625, "rewards/chosen": 0.5131622552871704, "rewards/margins": 0.5536743402481079, "rewards/rejected": -0.04054565355181694, "step": 290 }, { "epoch": 0.26356248627278717, "grad_norm": 133.364295663979, "learning_rate": 9.343145869947275e-07, "logits/chosen": 0.8286377191543579, "logits/rejected": 0.7414795160293579, "logps/chosen": -366.0375061035156, "logps/rejected": -320.07501220703125, "loss": 0.5622, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.7572571039199829, "rewards/margins": 0.624768078327179, "rewards/rejected": 0.13253478705883026, "step": 300 }, { "epoch": 0.27234790248188007, "grad_norm": 108.43332170579106, "learning_rate": 9.321177504393673e-07, "logits/chosen": 0.822924792766571, "logits/rejected": 0.83349609375, "logps/chosen": -377.6000061035156, "logps/rejected": -362.7749938964844, "loss": 0.6409, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.5801147222518921, "rewards/margins": 0.615307629108429, "rewards/rejected": -0.03511963039636612, "step": 310 }, { "epoch": 0.28113331869097297, "grad_norm": 112.2071577307009, "learning_rate": 9.299209138840069e-07, "logits/chosen": 0.627978503704071, "logits/rejected": 0.635516345500946, "logps/chosen": -363.8999938964844, "logps/rejected": -315.0375061035156, "loss": 0.505, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.19639739394187927, "rewards/margins": 0.847576916217804, "rewards/rejected": -0.651165783405304, "step": 320 }, { "epoch": 0.28991873490006587, "grad_norm": 72.35866660405829, "learning_rate": 9.277240773286467e-07, "logits/chosen": 0.638873279094696, "logits/rejected": 0.7006683349609375, "logps/chosen": -373.3374938964844, "logps/rejected": -364.9624938964844, "loss": 0.5764, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.3648742735385895, "rewards/margins": 0.74029541015625, "rewards/rejected": -0.37539976835250854, "step": 330 }, { "epoch": 0.2987041511091588, "grad_norm": 108.22073521666348, "learning_rate": 9.255272407732864e-07, "logits/chosen": 0.613391101360321, "logits/rejected": 0.642047107219696, "logps/chosen": -358.1875, "logps/rejected": -363.07501220703125, "loss": 0.5322, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 0.45933228731155396, "rewards/margins": 0.854412853717804, "rewards/rejected": -0.39440613985061646, "step": 340 }, { "epoch": 0.3074895673182517, "grad_norm": 84.95882946410181, "learning_rate": 9.233304042179262e-07, "logits/chosen": 0.7430366277694702, "logits/rejected": 0.645538330078125, "logps/chosen": -381.875, "logps/rejected": -332.32501220703125, "loss": 0.6114, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 0.50030517578125, "rewards/margins": 0.648486316204071, "rewards/rejected": -0.14830932021141052, "step": 350 }, { "epoch": 0.3162749835273446, "grad_norm": 128.50649971665374, "learning_rate": 9.211335676625659e-07, "logits/chosen": 0.8686157464981079, "logits/rejected": 0.747119128704071, "logps/chosen": -383.13751220703125, "logps/rejected": -335.76251220703125, "loss": 0.5864, "rewards/accuracies": 0.71875, "rewards/chosen": 0.93817138671875, "rewards/margins": 0.6886962652206421, "rewards/rejected": 0.24915161728858948, "step": 360 }, { "epoch": 0.3250603997364375, "grad_norm": 110.36630033806003, "learning_rate": 9.189367311072056e-07, "logits/chosen": 0.7000182867050171, "logits/rejected": 0.711151123046875, "logps/chosen": -323.14373779296875, "logps/rejected": -328.5375061035156, "loss": 0.5951, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 0.6826171875, "rewards/margins": 0.728564441204071, "rewards/rejected": -0.04671936109662056, "step": 370 }, { "epoch": 0.3338458159455304, "grad_norm": 162.37137814313525, "learning_rate": 9.167398945518453e-07, "logits/chosen": 0.6787353754043579, "logits/rejected": 0.576916515827179, "logps/chosen": -381.1000061035156, "logps/rejected": -330.2749938964844, "loss": 0.5628, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.634472668170929, "rewards/margins": 0.7288573980331421, "rewards/rejected": -0.09415283054113388, "step": 380 }, { "epoch": 0.3426312321546233, "grad_norm": 134.03592317731486, "learning_rate": 9.14543057996485e-07, "logits/chosen": 0.6447784304618835, "logits/rejected": 0.669390857219696, "logps/chosen": -363.4750061035156, "logps/rejected": -358.125, "loss": 0.5053, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.9730163812637329, "rewards/margins": 0.947802722454071, "rewards/rejected": 0.02423706091940403, "step": 390 }, { "epoch": 0.3514166483637162, "grad_norm": 167.7727964549623, "learning_rate": 9.123462214411247e-07, "logits/chosen": 0.667950451374054, "logits/rejected": 0.601123034954071, "logps/chosen": -360.9437561035156, "logps/rejected": -337.2250061035156, "loss": 0.6243, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 0.656298816204071, "rewards/margins": 0.7783203125, "rewards/rejected": -0.12236328423023224, "step": 400 }, { "epoch": 0.3602020645728091, "grad_norm": 121.53016191133405, "learning_rate": 9.101493848857645e-07, "logits/chosen": 0.749523937702179, "logits/rejected": 0.734057605266571, "logps/chosen": -378.2875061035156, "logps/rejected": -366.0249938964844, "loss": 0.5861, "rewards/accuracies": 0.731249988079071, "rewards/chosen": 0.74395751953125, "rewards/margins": 0.906903088092804, "rewards/rejected": -0.1632080078125, "step": 410 }, { "epoch": 0.368987480781902, "grad_norm": 135.08378970564655, "learning_rate": 9.079525483304041e-07, "logits/chosen": 0.5611938238143921, "logits/rejected": 0.54962158203125, "logps/chosen": -380.11248779296875, "logps/rejected": -341.96875, "loss": 0.5345, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.960009753704071, "rewards/margins": 0.884899914264679, "rewards/rejected": 0.07491455227136612, "step": 420 }, { "epoch": 0.3777728969909949, "grad_norm": 83.93872723499767, "learning_rate": 9.057557117750439e-07, "logits/chosen": 0.729357898235321, "logits/rejected": 0.570416271686554, "logps/chosen": -365.5375061035156, "logps/rejected": -357.5249938964844, "loss": 0.5018, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": 0.832226574420929, "rewards/margins": 0.9748474359512329, "rewards/rejected": -0.14298096299171448, "step": 430 }, { "epoch": 0.3865583132000879, "grad_norm": 134.20265672845252, "learning_rate": 9.035588752196836e-07, "logits/chosen": 0.536975085735321, "logits/rejected": 0.5665649175643921, "logps/chosen": -343.9375, "logps/rejected": -333.95001220703125, "loss": 0.5882, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.40465086698532104, "rewards/margins": 0.766345202922821, "rewards/rejected": -0.36149901151657104, "step": 440 }, { "epoch": 0.3953437294091808, "grad_norm": 224.63520173136556, "learning_rate": 9.013620386643234e-07, "logits/chosen": 0.4037490785121918, "logits/rejected": 0.303131103515625, "logps/chosen": -378.8999938964844, "logps/rejected": -344.13751220703125, "loss": 0.595, "rewards/accuracies": 0.71875, "rewards/chosen": 0.18750610947608948, "rewards/margins": 0.7963622808456421, "rewards/rejected": -0.6088226437568665, "step": 450 }, { "epoch": 0.4041291456182737, "grad_norm": 107.53892107868361, "learning_rate": 8.99165202108963e-07, "logits/chosen": 0.6433471441268921, "logits/rejected": 0.621997058391571, "logps/chosen": -421.15313720703125, "logps/rejected": -368.5874938964844, "loss": 0.619, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.3782592713832855, "rewards/margins": 0.6962035894393921, "rewards/rejected": -0.31683349609375, "step": 460 }, { "epoch": 0.4129145618273666, "grad_norm": 137.77832204950442, "learning_rate": 8.969683655536028e-07, "logits/chosen": 0.7189086675643921, "logits/rejected": 0.6367523074150085, "logps/chosen": -351.1499938964844, "logps/rejected": -347.25, "loss": 0.6513, "rewards/accuracies": 0.65625, "rewards/chosen": 0.777844250202179, "rewards/margins": 0.5938965082168579, "rewards/rejected": 0.18350830674171448, "step": 470 }, { "epoch": 0.4216999780364595, "grad_norm": 163.66174766854658, "learning_rate": 8.947715289982425e-07, "logits/chosen": 0.7084594964981079, "logits/rejected": 0.6649200320243835, "logps/chosen": -343.33123779296875, "logps/rejected": -310.6499938964844, "loss": 0.6166, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": 0.8969360589981079, "rewards/margins": 0.657684326171875, "rewards/rejected": 0.238616943359375, "step": 480 }, { "epoch": 0.4304853942455524, "grad_norm": 126.20927562170151, "learning_rate": 8.925746924428822e-07, "logits/chosen": 0.62030029296875, "logits/rejected": 0.5196533203125, "logps/chosen": -337.7875061035156, "logps/rejected": -313.1499938964844, "loss": 0.585, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.736572265625, "rewards/margins": 0.782519519329071, "rewards/rejected": -0.04567260667681694, "step": 490 }, { "epoch": 0.4392708104546453, "grad_norm": 129.24078166742171, "learning_rate": 8.903778558875219e-07, "logits/chosen": 0.6379638910293579, "logits/rejected": 0.5695968866348267, "logps/chosen": -370.625, "logps/rejected": -394.73748779296875, "loss": 0.6274, "rewards/accuracies": 0.6875, "rewards/chosen": 0.20437316596508026, "rewards/margins": 0.7197815179824829, "rewards/rejected": -0.5159393548965454, "step": 500 }, { "epoch": 0.4480562266637382, "grad_norm": 150.91820731985138, "learning_rate": 8.881810193321616e-07, "logits/chosen": 0.563305675983429, "logits/rejected": 0.48903197050094604, "logps/chosen": -428.53125, "logps/rejected": -355.0, "loss": 0.5737, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 0.24339599907398224, "rewards/margins": 0.847363293170929, "rewards/rejected": -0.6048919558525085, "step": 510 }, { "epoch": 0.4568416428728311, "grad_norm": 93.13743072543112, "learning_rate": 8.859841827768013e-07, "logits/chosen": 0.675518810749054, "logits/rejected": 0.5924316644668579, "logps/chosen": -382.76251220703125, "logps/rejected": -316.4125061035156, "loss": 0.5875, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 0.5012573003768921, "rewards/margins": 0.90728759765625, "rewards/rejected": -0.40598756074905396, "step": 520 }, { "epoch": 0.46562705908192403, "grad_norm": 109.35127941244305, "learning_rate": 8.837873462214412e-07, "logits/chosen": 0.6280151605606079, "logits/rejected": 0.6853576898574829, "logps/chosen": -357.63751220703125, "logps/rejected": -352.32501220703125, "loss": 0.6039, "rewards/accuracies": 0.65625, "rewards/chosen": 0.562817394733429, "rewards/margins": 0.786669909954071, "rewards/rejected": -0.22328491508960724, "step": 530 }, { "epoch": 0.47441247529101693, "grad_norm": 124.04452532045764, "learning_rate": 8.815905096660808e-07, "logits/chosen": 0.56134033203125, "logits/rejected": 0.626446545124054, "logps/chosen": -369.76873779296875, "logps/rejected": -335.8500061035156, "loss": 0.594, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.55401611328125, "rewards/margins": 0.8180297613143921, "rewards/rejected": -0.26448363065719604, "step": 540 }, { "epoch": 0.48319789150010983, "grad_norm": 129.5122149157207, "learning_rate": 8.793936731107206e-07, "logits/chosen": 0.7513183355331421, "logits/rejected": 0.72869873046875, "logps/chosen": -389.9750061035156, "logps/rejected": -364.98748779296875, "loss": 0.5636, "rewards/accuracies": 0.706250011920929, "rewards/chosen": 0.608502209186554, "rewards/margins": 0.9462829828262329, "rewards/rejected": -0.33787840604782104, "step": 550 }, { "epoch": 0.49198330770920273, "grad_norm": 107.3538703882858, "learning_rate": 8.771968365553602e-07, "logits/chosen": 0.6791626214981079, "logits/rejected": 0.695880115032196, "logps/chosen": -361.54998779296875, "logps/rejected": -374.9750061035156, "loss": 0.5527, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.699780285358429, "rewards/margins": 0.9020019769668579, "rewards/rejected": -0.20244140923023224, "step": 560 }, { "epoch": 0.5007687239182956, "grad_norm": 177.97444949948223, "learning_rate": 8.75e-07, "logits/chosen": 0.67779541015625, "logits/rejected": 0.744676947593689, "logps/chosen": -364.3500061035156, "logps/rejected": -341.5, "loss": 0.7085, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.4039062559604645, "rewards/margins": 0.5597168207168579, "rewards/rejected": -0.15574340522289276, "step": 570 }, { "epoch": 0.5095541401273885, "grad_norm": 122.48697250569873, "learning_rate": 8.728031634446396e-07, "logits/chosen": 0.720629870891571, "logits/rejected": 0.69036865234375, "logps/chosen": -328.51251220703125, "logps/rejected": -320.8374938964844, "loss": 0.5002, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.796459972858429, "rewards/margins": 1.0414307117462158, "rewards/rejected": -0.2451629638671875, "step": 580 }, { "epoch": 0.5183395563364814, "grad_norm": 118.89771083803319, "learning_rate": 8.706063268892794e-07, "logits/chosen": 0.791455090045929, "logits/rejected": 0.6425811648368835, "logps/chosen": -391.2250061035156, "logps/rejected": -368.875, "loss": 0.6853, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.48602294921875, "rewards/margins": 0.7132568359375, "rewards/rejected": -0.2267501801252365, "step": 590 }, { "epoch": 0.5271249725455743, "grad_norm": 155.99582645315638, "learning_rate": 8.684094903339191e-07, "logits/chosen": 0.7638183832168579, "logits/rejected": 0.709136962890625, "logps/chosen": -375.92498779296875, "logps/rejected": -349.125, "loss": 0.6293, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 0.735107421875, "rewards/margins": 0.604382336139679, "rewards/rejected": 0.13076476752758026, "step": 600 }, { "epoch": 0.5359103887546672, "grad_norm": 225.33936495151184, "learning_rate": 8.662126537785588e-07, "logits/chosen": 0.7878662347793579, "logits/rejected": 0.7696777582168579, "logps/chosen": -322.20001220703125, "logps/rejected": -388.63751220703125, "loss": 0.6649, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.6917785406112671, "rewards/margins": 0.52044677734375, "rewards/rejected": 0.17127075791358948, "step": 610 }, { "epoch": 0.5446958049637601, "grad_norm": 110.98456818822763, "learning_rate": 8.640158172231986e-07, "logits/chosen": 0.7795654535293579, "logits/rejected": 0.7029678225517273, "logps/chosen": -397.375, "logps/rejected": -378.63751220703125, "loss": 0.5739, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 0.64324951171875, "rewards/margins": 0.679333508014679, "rewards/rejected": -0.03640594333410263, "step": 620 }, { "epoch": 0.553481221172853, "grad_norm": 150.10411947989135, "learning_rate": 8.618189806678383e-07, "logits/chosen": 0.7293640375137329, "logits/rejected": 0.7070678472518921, "logps/chosen": -367.07501220703125, "logps/rejected": -346.6625061035156, "loss": 0.5397, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.5694640874862671, "rewards/margins": 0.890942394733429, "rewards/rejected": -0.321197509765625, "step": 630 }, { "epoch": 0.5622666373819459, "grad_norm": 141.11291175168716, "learning_rate": 8.59622144112478e-07, "logits/chosen": 0.7176605463027954, "logits/rejected": 0.7665680050849915, "logps/chosen": -349.48748779296875, "logps/rejected": -347.3500061035156, "loss": 0.5418, "rewards/accuracies": 0.75, "rewards/chosen": 0.5024261474609375, "rewards/margins": 0.958483874797821, "rewards/rejected": -0.45650023221969604, "step": 640 }, { "epoch": 0.5710520535910388, "grad_norm": 175.66153690704684, "learning_rate": 8.574253075571178e-07, "logits/chosen": 0.734509289264679, "logits/rejected": 0.673205554485321, "logps/chosen": -366.67498779296875, "logps/rejected": -372.2250061035156, "loss": 0.6194, "rewards/accuracies": 0.706250011920929, "rewards/chosen": 0.5160675048828125, "rewards/margins": 0.789233386516571, "rewards/rejected": -0.272848516702652, "step": 650 }, { "epoch": 0.5798374698001317, "grad_norm": 124.49392630208213, "learning_rate": 8.552284710017574e-07, "logits/chosen": 0.6680542230606079, "logits/rejected": 0.8218322992324829, "logps/chosen": -391.7875061035156, "logps/rejected": -369.875, "loss": 0.4963, "rewards/accuracies": 0.731249988079071, "rewards/chosen": 0.5751632452011108, "rewards/margins": 0.91632080078125, "rewards/rejected": -0.340676873922348, "step": 660 }, { "epoch": 0.5886228860092246, "grad_norm": 116.24043167382194, "learning_rate": 8.530316344463972e-07, "logits/chosen": 0.794238269329071, "logits/rejected": 0.7583373785018921, "logps/chosen": -400.8500061035156, "logps/rejected": -391.42498779296875, "loss": 0.5998, "rewards/accuracies": 0.6875, "rewards/chosen": 0.46043092012405396, "rewards/margins": 0.663299560546875, "rewards/rejected": -0.20281371474266052, "step": 670 }, { "epoch": 0.5974083022183176, "grad_norm": 157.83491160598462, "learning_rate": 8.508347978910368e-07, "logits/chosen": 0.6641845703125, "logits/rejected": 0.635418713092804, "logps/chosen": -354.82501220703125, "logps/rejected": -334.2875061035156, "loss": 0.5914, "rewards/accuracies": 0.65625, "rewards/chosen": 0.45613402128219604, "rewards/margins": 0.649658203125, "rewards/rejected": -0.19429321587085724, "step": 680 }, { "epoch": 0.6061937184274105, "grad_norm": 129.40243554284726, "learning_rate": 8.486379613356766e-07, "logits/chosen": 0.724499523639679, "logits/rejected": 0.693554699420929, "logps/chosen": -382.2749938964844, "logps/rejected": -342.4624938964844, "loss": 0.5215, "rewards/accuracies": 0.731249988079071, "rewards/chosen": 0.24189452826976776, "rewards/margins": 0.911914050579071, "rewards/rejected": -0.670062243938446, "step": 690 }, { "epoch": 0.6149791346365034, "grad_norm": 143.32724935265028, "learning_rate": 8.464411247803162e-07, "logits/chosen": 0.7372192144393921, "logits/rejected": 0.732818603515625, "logps/chosen": -377.98748779296875, "logps/rejected": -335.36248779296875, "loss": 0.5277, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.6014373898506165, "rewards/margins": 1.1469604969024658, "rewards/rejected": -0.545520007610321, "step": 700 }, { "epoch": 0.6237645508455963, "grad_norm": 126.70012257286149, "learning_rate": 8.442442882249561e-07, "logits/chosen": 0.770050048828125, "logits/rejected": 0.821685791015625, "logps/chosen": -387.1625061035156, "logps/rejected": -349.4375, "loss": 0.6125, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 0.5986908078193665, "rewards/margins": 0.813891589641571, "rewards/rejected": -0.21472778916358948, "step": 710 }, { "epoch": 0.6325499670546892, "grad_norm": 125.72849683010732, "learning_rate": 8.420474516695958e-07, "logits/chosen": 0.6972411870956421, "logits/rejected": 0.6502441167831421, "logps/chosen": -377.98748779296875, "logps/rejected": -356.88751220703125, "loss": 0.6007, "rewards/accuracies": 0.6875, "rewards/chosen": 0.684704601764679, "rewards/margins": 0.7746216058731079, "rewards/rejected": -0.08931579440832138, "step": 720 }, { "epoch": 0.6413353832637821, "grad_norm": 172.3470017024697, "learning_rate": 8.398506151142355e-07, "logits/chosen": 0.7913421392440796, "logits/rejected": 0.800183117389679, "logps/chosen": -328.45623779296875, "logps/rejected": -320.79376220703125, "loss": 0.6501, "rewards/accuracies": 0.643750011920929, "rewards/chosen": 0.638354480266571, "rewards/margins": 0.567730724811554, "rewards/rejected": 0.07038573920726776, "step": 730 }, { "epoch": 0.650120799472875, "grad_norm": 123.16953400043357, "learning_rate": 8.376537785588752e-07, "logits/chosen": 0.71356201171875, "logits/rejected": 0.731762707233429, "logps/chosen": -354.6000061035156, "logps/rejected": -338.82501220703125, "loss": 0.684, "rewards/accuracies": 0.606249988079071, "rewards/chosen": 0.4485412538051605, "rewards/margins": 0.532824695110321, "rewards/rejected": -0.08410034328699112, "step": 740 }, { "epoch": 0.6589062156819679, "grad_norm": 161.50510744078096, "learning_rate": 8.354569420035149e-07, "logits/chosen": 0.7501220703125, "logits/rejected": 0.6981170773506165, "logps/chosen": -319.9624938964844, "logps/rejected": -296.26873779296875, "loss": 0.6151, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.5109497308731079, "rewards/margins": 0.687792956829071, "rewards/rejected": -0.17631836235523224, "step": 750 }, { "epoch": 0.6676916318910608, "grad_norm": 174.36478567810855, "learning_rate": 8.332601054481546e-07, "logits/chosen": 0.7668091058731079, "logits/rejected": 0.6859161257743835, "logps/chosen": -363.0625, "logps/rejected": -333.9125061035156, "loss": 0.617, "rewards/accuracies": 0.65625, "rewards/chosen": 1.1929810047149658, "rewards/margins": 0.7418457269668579, "rewards/rejected": 0.4510864317417145, "step": 760 }, { "epoch": 0.6764770481001537, "grad_norm": 118.4359615573863, "learning_rate": 8.310632688927944e-07, "logits/chosen": 1.004541039466858, "logits/rejected": 0.9631713628768921, "logps/chosen": -345.63751220703125, "logps/rejected": -333.625, "loss": 0.5946, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 1.290185570716858, "rewards/margins": 0.7626587152481079, "rewards/rejected": 0.528491199016571, "step": 770 }, { "epoch": 0.6852624643092466, "grad_norm": 91.10133410852907, "learning_rate": 8.28866432337434e-07, "logits/chosen": 0.9068359136581421, "logits/rejected": 0.8267456293106079, "logps/chosen": -396.2749938964844, "logps/rejected": -354.6875, "loss": 0.6452, "rewards/accuracies": 0.643750011920929, "rewards/chosen": 0.8797607421875, "rewards/margins": 0.58795166015625, "rewards/rejected": 0.2915283143520355, "step": 780 }, { "epoch": 0.6940478805183395, "grad_norm": 157.15792747788979, "learning_rate": 8.266695957820738e-07, "logits/chosen": 0.814221203327179, "logits/rejected": 0.769946277141571, "logps/chosen": -379.0625, "logps/rejected": -346.0625, "loss": 0.6353, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": 0.982312023639679, "rewards/margins": 0.6677001714706421, "rewards/rejected": 0.3150390684604645, "step": 790 }, { "epoch": 0.7028332967274324, "grad_norm": 107.29366633252654, "learning_rate": 8.244727592267134e-07, "logits/chosen": 1.009851098060608, "logits/rejected": 0.8573974370956421, "logps/chosen": -335.88751220703125, "logps/rejected": -337.9375, "loss": 0.6391, "rewards/accuracies": 0.643750011920929, "rewards/chosen": 0.716137707233429, "rewards/margins": 0.7086426019668579, "rewards/rejected": 0.007434082217514515, "step": 800 }, { "epoch": 0.7116187129365253, "grad_norm": 83.25793606599939, "learning_rate": 8.222759226713533e-07, "logits/chosen": 0.667956531047821, "logits/rejected": 0.735888659954071, "logps/chosen": -376.8500061035156, "logps/rejected": -347.9375, "loss": 0.5386, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.4475463926792145, "rewards/margins": 0.818878173828125, "rewards/rejected": -0.37104493379592896, "step": 810 }, { "epoch": 0.7204041291456182, "grad_norm": 152.94967257227145, "learning_rate": 8.200790861159929e-07, "logits/chosen": 0.576385498046875, "logits/rejected": 0.4786926209926605, "logps/chosen": -388.4750061035156, "logps/rejected": -336.5, "loss": 0.5822, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 0.283346563577652, "rewards/margins": 0.870532214641571, "rewards/rejected": -0.5871612429618835, "step": 820 }, { "epoch": 0.7291895453547111, "grad_norm": 101.73940874098514, "learning_rate": 8.178822495606327e-07, "logits/chosen": 0.5696471929550171, "logits/rejected": 0.5772765874862671, "logps/chosen": -328.75, "logps/rejected": -309.86248779296875, "loss": 0.5556, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": 0.5639068484306335, "rewards/margins": 0.8946533203125, "rewards/rejected": -0.3313049376010895, "step": 830 }, { "epoch": 0.737974961563804, "grad_norm": 113.75262534577234, "learning_rate": 8.156854130052724e-07, "logits/chosen": 0.7153381109237671, "logits/rejected": 0.7607971429824829, "logps/chosen": -364.1625061035156, "logps/rejected": -356.6499938964844, "loss": 0.537, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.612945556640625, "rewards/margins": 0.9758545160293579, "rewards/rejected": -0.363433837890625, "step": 840 }, { "epoch": 0.7467603777728969, "grad_norm": 106.90229414350566, "learning_rate": 8.134885764499121e-07, "logits/chosen": 0.67059326171875, "logits/rejected": 0.654449462890625, "logps/chosen": -367.5, "logps/rejected": -329.0375061035156, "loss": 0.6237, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.4867187440395355, "rewards/margins": 0.833789050579071, "rewards/rejected": -0.34722900390625, "step": 850 }, { "epoch": 0.7555457939819898, "grad_norm": 140.84843055889834, "learning_rate": 8.112917398945518e-07, "logits/chosen": 0.661730945110321, "logits/rejected": 0.6620239019393921, "logps/chosen": -348.82501220703125, "logps/rejected": -325.7250061035156, "loss": 0.5834, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 0.38698118925094604, "rewards/margins": 0.7570434808731079, "rewards/rejected": -0.36968994140625, "step": 860 }, { "epoch": 0.7643312101910829, "grad_norm": 108.8480422275272, "learning_rate": 8.090949033391915e-07, "logits/chosen": 0.779815673828125, "logits/rejected": 0.753613293170929, "logps/chosen": -364.26873779296875, "logps/rejected": -319.98748779296875, "loss": 0.5807, "rewards/accuracies": 0.65625, "rewards/chosen": 0.5811492800712585, "rewards/margins": 0.8836425542831421, "rewards/rejected": -0.3028457760810852, "step": 870 }, { "epoch": 0.7731166264001758, "grad_norm": 103.28213597481961, "learning_rate": 8.068980667838312e-07, "logits/chosen": 0.6358245611190796, "logits/rejected": 0.613421618938446, "logps/chosen": -370.11248779296875, "logps/rejected": -331.8999938964844, "loss": 0.5945, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.39617919921875, "rewards/margins": 0.725994884967804, "rewards/rejected": -0.32902222871780396, "step": 880 }, { "epoch": 0.7819020426092687, "grad_norm": 91.25906751428586, "learning_rate": 8.04701230228471e-07, "logits/chosen": 0.593548595905304, "logits/rejected": 0.6077514886856079, "logps/chosen": -326.41876220703125, "logps/rejected": -329.2124938964844, "loss": 0.5103, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.6264282464981079, "rewards/margins": 1.06427001953125, "rewards/rejected": -0.43754273653030396, "step": 890 }, { "epoch": 0.7906874588183616, "grad_norm": 130.27162332227738, "learning_rate": 8.025043936731107e-07, "logits/chosen": 0.6284118890762329, "logits/rejected": 0.5671142339706421, "logps/chosen": -341.70001220703125, "logps/rejected": -376.57501220703125, "loss": 0.7825, "rewards/accuracies": 0.6875, "rewards/chosen": 0.255584716796875, "rewards/margins": 0.7244507074356079, "rewards/rejected": -0.4685302674770355, "step": 900 }, { "epoch": 0.7994728750274545, "grad_norm": 218.03370500164647, "learning_rate": 8.003075571177505e-07, "logits/chosen": 0.5689789056777954, "logits/rejected": 0.4414001405239105, "logps/chosen": -408.8374938964844, "logps/rejected": -333.5249938964844, "loss": 0.633, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.29487913846969604, "rewards/margins": 0.77197265625, "rewards/rejected": -0.47686767578125, "step": 910 }, { "epoch": 0.8082582912365474, "grad_norm": 126.70556647572619, "learning_rate": 7.981107205623901e-07, "logits/chosen": 0.83587646484375, "logits/rejected": 0.7498108148574829, "logps/chosen": -377.48748779296875, "logps/rejected": -312.40625, "loss": 0.6558, "rewards/accuracies": 0.643750011920929, "rewards/chosen": 0.523876965045929, "rewards/margins": 0.6868652105331421, "rewards/rejected": -0.16251984238624573, "step": 920 }, { "epoch": 0.8170437074456403, "grad_norm": 121.69939250228455, "learning_rate": 7.959138840070299e-07, "logits/chosen": 0.7581115961074829, "logits/rejected": 0.677813708782196, "logps/chosen": -395.51251220703125, "logps/rejected": -349.5249938964844, "loss": 0.6019, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 0.846508800983429, "rewards/margins": 0.8267822265625, "rewards/rejected": 0.02013549767434597, "step": 930 }, { "epoch": 0.8258291236547332, "grad_norm": 127.86187401796298, "learning_rate": 7.937170474516695e-07, "logits/chosen": 0.6433044672012329, "logits/rejected": 0.6284820437431335, "logps/chosen": -353.82501220703125, "logps/rejected": -317.29998779296875, "loss": 0.5001, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.758056640625, "rewards/margins": 0.955322265625, "rewards/rejected": -0.19773559272289276, "step": 940 }, { "epoch": 0.834614539863826, "grad_norm": 138.63379878791022, "learning_rate": 7.915202108963093e-07, "logits/chosen": 0.5711212158203125, "logits/rejected": 0.5924438238143921, "logps/chosen": -368.1000061035156, "logps/rejected": -323.1499938964844, "loss": 0.5605, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": 0.568225085735321, "rewards/margins": 1.0416991710662842, "rewards/rejected": -0.47343748807907104, "step": 950 }, { "epoch": 0.843399956072919, "grad_norm": 94.27426947174395, "learning_rate": 7.89323374340949e-07, "logits/chosen": 0.7823120355606079, "logits/rejected": 0.751513659954071, "logps/chosen": -410.42498779296875, "logps/rejected": -360.4624938964844, "loss": 0.6197, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.4736877381801605, "rewards/margins": 0.7760254144668579, "rewards/rejected": -0.30219727754592896, "step": 960 }, { "epoch": 0.8521853722820119, "grad_norm": 61.32412692603968, "learning_rate": 7.871265377855887e-07, "logits/chosen": 0.797534167766571, "logits/rejected": 0.6763855218887329, "logps/chosen": -321.2749938964844, "logps/rejected": -326.3374938964844, "loss": 0.553, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.5334686040878296, "rewards/margins": 0.8941894769668579, "rewards/rejected": -0.36101073026657104, "step": 970 }, { "epoch": 0.8609707884911048, "grad_norm": 102.31462405859801, "learning_rate": 7.849297012302284e-07, "logits/chosen": 0.61859130859375, "logits/rejected": 0.6014129519462585, "logps/chosen": -385.7562561035156, "logps/rejected": -368.1000061035156, "loss": 0.5764, "rewards/accuracies": 0.71875, "rewards/chosen": 0.62567138671875, "rewards/margins": 1.0081603527069092, "rewards/rejected": -0.3816070556640625, "step": 980 }, { "epoch": 0.8697562047001977, "grad_norm": 102.24158230381265, "learning_rate": 7.827328646748682e-07, "logits/chosen": 0.5759872198104858, "logits/rejected": 0.45131224393844604, "logps/chosen": -367.125, "logps/rejected": -333.2749938964844, "loss": 0.4789, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.34223634004592896, "rewards/margins": 1.128271460533142, "rewards/rejected": -0.787731945514679, "step": 990 }, { "epoch": 0.8785416209092906, "grad_norm": 133.1515966249434, "learning_rate": 7.805360281195079e-07, "logits/chosen": 0.5089355707168579, "logits/rejected": 0.47595977783203125, "logps/chosen": -368.0062561035156, "logps/rejected": -339.3500061035156, "loss": 0.4808, "rewards/accuracies": 0.706250011920929, "rewards/chosen": 0.2845214903354645, "rewards/margins": 1.2062866687774658, "rewards/rejected": -0.92047119140625, "step": 1000 }, { "epoch": 0.8873270371183835, "grad_norm": 59.03373350583219, "learning_rate": 7.783391915641477e-07, "logits/chosen": 0.5002090334892273, "logits/rejected": 0.5658935308456421, "logps/chosen": -392.36248779296875, "logps/rejected": -348.9624938964844, "loss": 0.5735, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.5140625238418579, "rewards/margins": 0.980053722858429, "rewards/rejected": -0.4658249020576477, "step": 1010 }, { "epoch": 0.8961124533274764, "grad_norm": 144.63745375991866, "learning_rate": 7.761423550087873e-07, "logits/chosen": 0.614459216594696, "logits/rejected": 0.6694061160087585, "logps/chosen": -333.54998779296875, "logps/rejected": -334.48748779296875, "loss": 0.654, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.2696166932582855, "rewards/margins": 0.7572997808456421, "rewards/rejected": -0.48844605684280396, "step": 1020 }, { "epoch": 0.9048978695365693, "grad_norm": 54.48592445712244, "learning_rate": 7.739455184534271e-07, "logits/chosen": 0.6297241449356079, "logits/rejected": 0.55712890625, "logps/chosen": -371.79998779296875, "logps/rejected": -366.20001220703125, "loss": 0.5175, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": 0.36857300996780396, "rewards/margins": 1.3197662830352783, "rewards/rejected": -0.9504944086074829, "step": 1030 }, { "epoch": 0.9136832857456622, "grad_norm": 161.37424857124523, "learning_rate": 7.717486818980667e-07, "logits/chosen": 0.776538074016571, "logits/rejected": 0.7039642333984375, "logps/chosen": -325.73748779296875, "logps/rejected": -318.2749938964844, "loss": 0.6158, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.10103759914636612, "rewards/margins": 0.841174304485321, "rewards/rejected": -0.740100085735321, "step": 1040 }, { "epoch": 0.922468701954755, "grad_norm": 106.86342239723682, "learning_rate": 7.695518453427065e-07, "logits/chosen": 0.6782867312431335, "logits/rejected": 0.6315857172012329, "logps/chosen": -444.76251220703125, "logps/rejected": -389.5249938964844, "loss": 0.6035, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 0.17373046278953552, "rewards/margins": 0.966479480266571, "rewards/rejected": -0.7929657101631165, "step": 1050 }, { "epoch": 0.9312541181638481, "grad_norm": 117.29426893364159, "learning_rate": 7.673550087873461e-07, "logits/chosen": 0.717907726764679, "logits/rejected": 0.746874988079071, "logps/chosen": -410.9375, "logps/rejected": -372.20001220703125, "loss": 0.6089, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.452484130859375, "rewards/margins": 0.940417468547821, "rewards/rejected": -0.4882568418979645, "step": 1060 }, { "epoch": 0.940039534372941, "grad_norm": 142.28713009411996, "learning_rate": 7.651581722319859e-07, "logits/chosen": 0.743884265422821, "logits/rejected": 0.6647239923477173, "logps/chosen": -368.23748779296875, "logps/rejected": -327.0874938964844, "loss": 0.6538, "rewards/accuracies": 0.643750011920929, "rewards/chosen": 0.3909668028354645, "rewards/margins": 0.748828113079071, "rewards/rejected": -0.35784608125686646, "step": 1070 }, { "epoch": 0.9488249505820339, "grad_norm": 154.20539680100782, "learning_rate": 7.629613356766256e-07, "logits/chosen": 0.8165649175643921, "logits/rejected": 0.838085949420929, "logps/chosen": -371.54998779296875, "logps/rejected": -336.54998779296875, "loss": 0.6345, "rewards/accuracies": 0.606249988079071, "rewards/chosen": 0.45190733671188354, "rewards/margins": 0.8287353515625, "rewards/rejected": -0.3762268126010895, "step": 1080 }, { "epoch": 0.9576103667911268, "grad_norm": 94.91330653905075, "learning_rate": 7.607644991212654e-07, "logits/chosen": 0.8318725824356079, "logits/rejected": 0.933349609375, "logps/chosen": -356.125, "logps/rejected": -321.78125, "loss": 0.6088, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 0.573596179485321, "rewards/margins": 0.810925304889679, "rewards/rejected": -0.23660889267921448, "step": 1090 }, { "epoch": 0.9663957830002197, "grad_norm": 121.64668794527282, "learning_rate": 7.585676625659051e-07, "logits/chosen": 0.8489990234375, "logits/rejected": 0.730908215045929, "logps/chosen": -388.04998779296875, "logps/rejected": -353.1000061035156, "loss": 0.5795, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 0.4540039002895355, "rewards/margins": 0.7512451410293579, "rewards/rejected": -0.29765015840530396, "step": 1100 }, { "epoch": 0.9751811992093126, "grad_norm": 200.01724380088467, "learning_rate": 7.563708260105448e-07, "logits/chosen": 0.8021240234375, "logits/rejected": 0.777496337890625, "logps/chosen": -421.375, "logps/rejected": -395.79998779296875, "loss": 0.5837, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.7443939447402954, "rewards/margins": 0.9650634527206421, "rewards/rejected": -0.22100830078125, "step": 1110 }, { "epoch": 0.9839666154184055, "grad_norm": 125.61936623164024, "learning_rate": 7.541739894551845e-07, "logits/chosen": 0.795947253704071, "logits/rejected": 0.8147674798965454, "logps/chosen": -368.4750061035156, "logps/rejected": -349.04998779296875, "loss": 0.5563, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 0.532635509967804, "rewards/margins": 0.9850403070449829, "rewards/rejected": -0.45094603300094604, "step": 1120 }, { "epoch": 0.9927520316274984, "grad_norm": 111.74600728803112, "learning_rate": 7.519771528998243e-07, "logits/chosen": 0.724029541015625, "logits/rejected": 0.6282714605331421, "logps/chosen": -348.5625, "logps/rejected": -342.2749938964844, "loss": 0.5326, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": 0.75531005859375, "rewards/margins": 1.0071532726287842, "rewards/rejected": -0.25104981660842896, "step": 1130 }, { "epoch": 1.0008785416209094, "grad_norm": 18.961311130094302, "learning_rate": 7.497803163444639e-07, "logits/chosen": 0.7015281915664673, "logits/rejected": 0.5504051446914673, "logps/chosen": -367.47296142578125, "logps/rejected": -352.02703857421875, "loss": 0.4072, "rewards/accuracies": 0.7364864945411682, "rewards/chosen": 0.6846758723258972, "rewards/margins": 1.4769188165664673, "rewards/rejected": -0.7920383810997009, "step": 1140 }, { "epoch": 1.0096639578300022, "grad_norm": 25.74009991856314, "learning_rate": 7.475834797891037e-07, "logits/chosen": 0.6251068115234375, "logits/rejected": 0.507830798625946, "logps/chosen": -386.67498779296875, "logps/rejected": -366.375, "loss": 0.132, "rewards/accuracies": 0.96875, "rewards/chosen": 1.3281738758087158, "rewards/margins": 3.579882860183716, "rewards/rejected": -2.252734422683716, "step": 1150 }, { "epoch": 1.0184493740390952, "grad_norm": 25.90305513520264, "learning_rate": 7.453866432337433e-07, "logits/chosen": 0.4692138731479645, "logits/rejected": 0.42976683378219604, "logps/chosen": -348.0375061035156, "logps/rejected": -388.2749938964844, "loss": 0.129, "rewards/accuracies": 0.96875, "rewards/chosen": 1.2872192859649658, "rewards/margins": 3.822070360183716, "rewards/rejected": -2.535937547683716, "step": 1160 }, { "epoch": 1.027234790248188, "grad_norm": 24.47208853043789, "learning_rate": 7.431898066783831e-07, "logits/chosen": 0.3258407711982727, "logits/rejected": 0.43860167264938354, "logps/chosen": -353.82501220703125, "logps/rejected": -360.61248779296875, "loss": 0.1081, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.792187511920929, "rewards/margins": 3.8642578125, "rewards/rejected": -3.0708985328674316, "step": 1170 }, { "epoch": 1.036020206457281, "grad_norm": 80.13200072674502, "learning_rate": 7.409929701230228e-07, "logits/chosen": 0.3098510801792145, "logits/rejected": 0.04650268703699112, "logps/chosen": -375.9125061035156, "logps/rejected": -392.3999938964844, "loss": 0.0887, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.5392669439315796, "rewards/margins": 4.153124809265137, "rewards/rejected": -3.6138672828674316, "step": 1180 }, { "epoch": 1.0448056226663738, "grad_norm": 43.569828431359475, "learning_rate": 7.387961335676626e-07, "logits/chosen": 0.14960937201976776, "logits/rejected": -0.0274658203125, "logps/chosen": -389.1000061035156, "logps/rejected": -401.8999938964844, "loss": 0.1188, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 0.20869140326976776, "rewards/margins": 4.052343845367432, "rewards/rejected": -3.8431639671325684, "step": 1190 }, { "epoch": 1.0535910388754668, "grad_norm": 27.169343285543146, "learning_rate": 7.365992970123023e-07, "logits/chosen": 0.28812867403030396, "logits/rejected": 0.06358031928539276, "logps/chosen": -354.4624938964844, "logps/rejected": -363.6875, "loss": 0.0904, "rewards/accuracies": 0.981249988079071, "rewards/chosen": 1.0153319835662842, "rewards/margins": 4.733984470367432, "rewards/rejected": -3.7164063453674316, "step": 1200 }, { "epoch": 1.0623764550845596, "grad_norm": 18.045747269850246, "learning_rate": 7.34402460456942e-07, "logits/chosen": 0.222137451171875, "logits/rejected": 0.19993285834789276, "logps/chosen": -382.8999938964844, "logps/rejected": -350.75, "loss": 0.1493, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 1.2532227039337158, "rewards/margins": 4.169629096984863, "rewards/rejected": -2.914746046066284, "step": 1210 }, { "epoch": 1.0711618712936526, "grad_norm": 15.782534570658127, "learning_rate": 7.322056239015817e-07, "logits/chosen": 0.39531248807907104, "logits/rejected": 0.22655335068702698, "logps/chosen": -392.70001220703125, "logps/rejected": -358.8500061035156, "loss": 0.0808, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": 1.498693823814392, "rewards/margins": 3.984570264816284, "rewards/rejected": -2.4874024391174316, "step": 1220 }, { "epoch": 1.0799472875027454, "grad_norm": 41.60913089522361, "learning_rate": 7.300087873462214e-07, "logits/chosen": 0.11590881645679474, "logits/rejected": 0.19947203993797302, "logps/chosen": -307.8999938964844, "logps/rejected": -358.63751220703125, "loss": 0.1367, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.0971527099609375, "rewards/margins": 3.8564453125, "rewards/rejected": -2.7579102516174316, "step": 1230 }, { "epoch": 1.0887327037118384, "grad_norm": 27.58443225464252, "learning_rate": 7.278119507908611e-07, "logits/chosen": 0.16886290907859802, "logits/rejected": 0.06146850436925888, "logps/chosen": -361.29998779296875, "logps/rejected": -410.1000061035156, "loss": 0.0827, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.7530761957168579, "rewards/margins": 4.321093559265137, "rewards/rejected": -3.568066358566284, "step": 1240 }, { "epoch": 1.0975181199209312, "grad_norm": 27.544523406436177, "learning_rate": 7.256151142355009e-07, "logits/chosen": 0.20024414360523224, "logits/rejected": 0.08804626762866974, "logps/chosen": -427.25, "logps/rejected": -389.20001220703125, "loss": 0.1036, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 1.187963843345642, "rewards/margins": 4.1123046875, "rewards/rejected": -2.920703172683716, "step": 1250 }, { "epoch": 1.1063035361300242, "grad_norm": 29.01811359667435, "learning_rate": 7.234182776801405e-07, "logits/chosen": 0.36299437284469604, "logits/rejected": 0.23122557997703552, "logps/chosen": -426.1499938964844, "logps/rejected": -398.3500061035156, "loss": 0.0785, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 1.6031494140625, "rewards/margins": 4.442968845367432, "rewards/rejected": -2.8393311500549316, "step": 1260 }, { "epoch": 1.115088952339117, "grad_norm": 53.036873110381045, "learning_rate": 7.212214411247804e-07, "logits/chosen": 0.418182373046875, "logits/rejected": 0.28881072998046875, "logps/chosen": -374.7250061035156, "logps/rejected": -422.57501220703125, "loss": 0.1325, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 1.2725098133087158, "rewards/margins": 4.546093940734863, "rewards/rejected": -3.2750000953674316, "step": 1270 }, { "epoch": 1.12387436854821, "grad_norm": 9.899801957737457, "learning_rate": 7.1902460456942e-07, "logits/chosen": 0.0919189453125, "logits/rejected": 0.056884765625, "logps/chosen": -346.45001220703125, "logps/rejected": -342.0874938964844, "loss": 0.123, "rewards/accuracies": 0.96875, "rewards/chosen": 0.862963855266571, "rewards/margins": 4.120312690734863, "rewards/rejected": -3.2601561546325684, "step": 1280 }, { "epoch": 1.132659784757303, "grad_norm": 29.677330328095056, "learning_rate": 7.168277680140598e-07, "logits/chosen": 0.21311035752296448, "logits/rejected": 0.15051880478858948, "logps/chosen": -328.04998779296875, "logps/rejected": -353.625, "loss": 0.0959, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": 0.970532238483429, "rewards/margins": 4.250390529632568, "rewards/rejected": -3.281445264816284, "step": 1290 }, { "epoch": 1.1414452009663958, "grad_norm": 25.074383165509722, "learning_rate": 7.146309314586994e-07, "logits/chosen": 0.14276733994483948, "logits/rejected": 0.05648193508386612, "logps/chosen": -354.2875061035156, "logps/rejected": -386.01251220703125, "loss": 0.1124, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 1.0518004894256592, "rewards/margins": 4.182812690734863, "rewards/rejected": -3.1285157203674316, "step": 1300 }, { "epoch": 1.1502306171754886, "grad_norm": 35.807000811413275, "learning_rate": 7.124340949033392e-07, "logits/chosen": 0.25034791231155396, "logits/rejected": 0.04127044603228569, "logps/chosen": -355.3999938964844, "logps/rejected": -400.2749938964844, "loss": 0.1062, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 1.216333031654358, "rewards/margins": 4.776953220367432, "rewards/rejected": -3.5625977516174316, "step": 1310 }, { "epoch": 1.1590160333845816, "grad_norm": 26.969059967341988, "learning_rate": 7.102372583479789e-07, "logits/chosen": 0.18806152045726776, "logits/rejected": 0.08161010593175888, "logps/chosen": -387.4125061035156, "logps/rejected": -410.2250061035156, "loss": 0.0886, "rewards/accuracies": 0.981249988079071, "rewards/chosen": 1.0563476085662842, "rewards/margins": 4.512499809265137, "rewards/rejected": -3.456249952316284, "step": 1320 }, { "epoch": 1.1678014495936746, "grad_norm": 20.761591384826335, "learning_rate": 7.080404217926186e-07, "logits/chosen": 0.14601440727710724, "logits/rejected": -0.1387939453125, "logps/chosen": -374.5218811035156, "logps/rejected": -359.7749938964844, "loss": 0.0891, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.509735107421875, "rewards/margins": 4.608788967132568, "rewards/rejected": -4.097851753234863, "step": 1330 }, { "epoch": 1.1765868658027674, "grad_norm": 43.971081047811936, "learning_rate": 7.058435852372583e-07, "logits/chosen": 0.092315673828125, "logits/rejected": -0.06405334174633026, "logps/chosen": -376.70001220703125, "logps/rejected": -367.54998779296875, "loss": 0.0744, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": 0.486328125, "rewards/margins": 4.68359375, "rewards/rejected": -4.197363376617432, "step": 1340 }, { "epoch": 1.1853722820118604, "grad_norm": 36.67212573663308, "learning_rate": 7.03646748681898e-07, "logits/chosen": -0.01691894605755806, "logits/rejected": -0.1436767578125, "logps/chosen": -374.3374938964844, "logps/rejected": -425.95001220703125, "loss": 0.0747, "rewards/accuracies": 0.981249988079071, "rewards/chosen": 0.24630126357078552, "rewards/margins": 5.192968845367432, "rewards/rejected": -4.947265625, "step": 1350 }, { "epoch": 1.1941576982209532, "grad_norm": 7.6299294768570585, "learning_rate": 7.014499121265377e-07, "logits/chosen": 0.02706298790872097, "logits/rejected": -0.07857666164636612, "logps/chosen": -394.42498779296875, "logps/rejected": -410.0249938964844, "loss": 0.1002, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.27030640840530396, "rewards/margins": 5.024023532867432, "rewards/rejected": -4.748827934265137, "step": 1360 }, { "epoch": 1.2029431144300462, "grad_norm": 12.784808502067452, "learning_rate": 6.992530755711776e-07, "logits/chosen": 0.06367187201976776, "logits/rejected": -0.0109100341796875, "logps/chosen": -368.76251220703125, "logps/rejected": -392.17498779296875, "loss": 0.111, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.9055694341659546, "rewards/margins": 4.5205078125, "rewards/rejected": -3.6175780296325684, "step": 1370 }, { "epoch": 1.211728530639139, "grad_norm": 21.34329301834719, "learning_rate": 6.970562390158172e-07, "logits/chosen": 0.24456176161766052, "logits/rejected": 0.13595886528491974, "logps/chosen": -351.1499938964844, "logps/rejected": -393.70001220703125, "loss": 0.0905, "rewards/accuracies": 0.96875, "rewards/chosen": 0.901721179485321, "rewards/margins": 4.271484375, "rewards/rejected": -3.368945360183716, "step": 1380 }, { "epoch": 1.220513946848232, "grad_norm": 36.66894688845897, "learning_rate": 6.94859402460457e-07, "logits/chosen": 0.10258178412914276, "logits/rejected": 0.19937439262866974, "logps/chosen": -329.79998779296875, "logps/rejected": -375.32501220703125, "loss": 0.1137, "rewards/accuracies": 0.96875, "rewards/chosen": 0.39006346464157104, "rewards/margins": 4.4765625, "rewards/rejected": -4.085546970367432, "step": 1390 }, { "epoch": 1.2292993630573248, "grad_norm": 29.446614552242355, "learning_rate": 6.926625659050966e-07, "logits/chosen": 0.03094482421875, "logits/rejected": -0.21303100883960724, "logps/chosen": -450.4750061035156, "logps/rejected": -416.8999938964844, "loss": 0.0997, "rewards/accuracies": 0.96875, "rewards/chosen": 0.32521361112594604, "rewards/margins": 5.093066215515137, "rewards/rejected": -4.769921779632568, "step": 1400 }, { "epoch": 1.2380847792664178, "grad_norm": 38.8991883092129, "learning_rate": 6.904657293497364e-07, "logits/chosen": 0.04876708984375, "logits/rejected": 0.07144775241613388, "logps/chosen": -350.48748779296875, "logps/rejected": -375.625, "loss": 0.1116, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.3395934998989105, "rewards/margins": 4.362890720367432, "rewards/rejected": -4.023828029632568, "step": 1410 }, { "epoch": 1.2468701954755106, "grad_norm": 29.84815304609672, "learning_rate": 6.88268892794376e-07, "logits/chosen": 0.13684844970703125, "logits/rejected": 0.035247802734375, "logps/chosen": -402.4125061035156, "logps/rejected": -374.6000061035156, "loss": 0.084, "rewards/accuracies": 0.981249988079071, "rewards/chosen": 0.8470458984375, "rewards/margins": 4.445703029632568, "rewards/rejected": -3.5943360328674316, "step": 1420 }, { "epoch": 1.2556556116846036, "grad_norm": 72.64894241875751, "learning_rate": 6.860720562390158e-07, "logits/chosen": 0.18435057997703552, "logits/rejected": 0.12365112453699112, "logps/chosen": -394.88751220703125, "logps/rejected": -409.3500061035156, "loss": 0.1096, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.8996826410293579, "rewards/margins": 4.514062404632568, "rewards/rejected": -3.611132860183716, "step": 1430 }, { "epoch": 1.2644410278936964, "grad_norm": 17.17517320426886, "learning_rate": 6.838752196836555e-07, "logits/chosen": 0.12235565483570099, "logits/rejected": 0.0897216796875, "logps/chosen": -351.53125, "logps/rejected": -356.1625061035156, "loss": 0.1142, "rewards/accuracies": 0.96875, "rewards/chosen": 1.1646239757537842, "rewards/margins": 4.300976753234863, "rewards/rejected": -3.1333985328674316, "step": 1440 }, { "epoch": 1.2732264441027894, "grad_norm": 47.50524430723618, "learning_rate": 6.816783831282952e-07, "logits/chosen": 0.4380355775356293, "logits/rejected": 0.24175414443016052, "logps/chosen": -352.98748779296875, "logps/rejected": -337.79998779296875, "loss": 0.1209, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.18609619140625, "rewards/margins": 4.235547065734863, "rewards/rejected": -3.0503907203674316, "step": 1450 }, { "epoch": 1.2820118603118824, "grad_norm": 25.52227321185207, "learning_rate": 6.79481546572935e-07, "logits/chosen": 0.26811522245407104, "logits/rejected": 0.188629150390625, "logps/chosen": -368.1625061035156, "logps/rejected": -360.625, "loss": 0.084, "rewards/accuracies": 0.981249988079071, "rewards/chosen": 1.3247191905975342, "rewards/margins": 4.214257717132568, "rewards/rejected": -2.888867139816284, "step": 1460 }, { "epoch": 1.2907972765209752, "grad_norm": 68.49357949495098, "learning_rate": 6.772847100175747e-07, "logits/chosen": 0.14310303330421448, "logits/rejected": 0.16095581650733948, "logps/chosen": -362.32501220703125, "logps/rejected": -378.42498779296875, "loss": 0.0953, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 1.025903344154358, "rewards/margins": 4.573046684265137, "rewards/rejected": -3.546875, "step": 1470 }, { "epoch": 1.299582692730068, "grad_norm": 17.62184814774221, "learning_rate": 6.750878734622144e-07, "logits/chosen": -0.027109527960419655, "logits/rejected": 0.009899902157485485, "logps/chosen": -317.0625, "logps/rejected": -364.82501220703125, "loss": 0.1083, "rewards/accuracies": 0.96875, "rewards/chosen": 0.611621081829071, "rewards/margins": 4.155371189117432, "rewards/rejected": -3.5404295921325684, "step": 1480 }, { "epoch": 1.308368108939161, "grad_norm": 54.48934831716566, "learning_rate": 6.728910369068542e-07, "logits/chosen": 0.08114013820886612, "logits/rejected": -0.08742065727710724, "logps/chosen": -391.29998779296875, "logps/rejected": -361.1625061035156, "loss": 0.0948, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.19893798232078552, "rewards/margins": 4.1806640625, "rewards/rejected": -3.981640577316284, "step": 1490 }, { "epoch": 1.317153525148254, "grad_norm": 40.437175738022766, "learning_rate": 6.706942003514938e-07, "logits/chosen": -0.11091308295726776, "logits/rejected": -0.18497315049171448, "logps/chosen": -355.73748779296875, "logps/rejected": -353.1875, "loss": 0.114, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.07499389350414276, "rewards/margins": 4.407275199890137, "rewards/rejected": -4.330468654632568, "step": 1500 }, { "epoch": 1.3259389413573468, "grad_norm": 43.65970666506957, "learning_rate": 6.684973637961336e-07, "logits/chosen": 0.10129089653491974, "logits/rejected": -0.23331299424171448, "logps/chosen": -399.17498779296875, "logps/rejected": -371.95001220703125, "loss": 0.0881, "rewards/accuracies": 0.96875, "rewards/chosen": 0.3244476318359375, "rewards/margins": 4.925390720367432, "rewards/rejected": -4.601758003234863, "step": 1510 }, { "epoch": 1.3347243575664396, "grad_norm": 54.30439811160622, "learning_rate": 6.663005272407732e-07, "logits/chosen": 0.20499877631664276, "logits/rejected": 0.05280761793255806, "logps/chosen": -369.33123779296875, "logps/rejected": -362.4750061035156, "loss": 0.0994, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.4191528260707855, "rewards/margins": 4.276562690734863, "rewards/rejected": -3.8568358421325684, "step": 1520 }, { "epoch": 1.3435097737755326, "grad_norm": 107.15664199206056, "learning_rate": 6.64103690685413e-07, "logits/chosen": 0.071990966796875, "logits/rejected": 0.17463989555835724, "logps/chosen": -350.0249938964844, "logps/rejected": -386.75, "loss": 0.12, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 0.2621521055698395, "rewards/margins": 4.564062595367432, "rewards/rejected": -4.302929878234863, "step": 1530 }, { "epoch": 1.3522951899846256, "grad_norm": 24.696590834073753, "learning_rate": 6.619068541300526e-07, "logits/chosen": 0.240264892578125, "logits/rejected": 0.14331665635108948, "logps/chosen": -369.375, "logps/rejected": -385.92498779296875, "loss": 0.0869, "rewards/accuracies": 0.96875, "rewards/chosen": 0.58758544921875, "rewards/margins": 4.363671779632568, "rewards/rejected": -3.773632764816284, "step": 1540 }, { "epoch": 1.3610806061937184, "grad_norm": 23.357878184588497, "learning_rate": 6.597100175746925e-07, "logits/chosen": 0.15198974311351776, "logits/rejected": 0.05958252027630806, "logps/chosen": -353.7250061035156, "logps/rejected": -353.23748779296875, "loss": 0.093, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.5988403558731079, "rewards/margins": 4.3017578125, "rewards/rejected": -3.7046875953674316, "step": 1550 }, { "epoch": 1.3698660224028114, "grad_norm": 43.376019030374245, "learning_rate": 6.575131810193322e-07, "logits/chosen": 0.09593506157398224, "logits/rejected": 0.10770263522863388, "logps/chosen": -340.7124938964844, "logps/rejected": -383.125, "loss": 0.0919, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.46781617403030396, "rewards/margins": 4.310937404632568, "rewards/rejected": -3.8433594703674316, "step": 1560 }, { "epoch": 1.3786514386119042, "grad_norm": 17.855298670818428, "learning_rate": 6.553163444639719e-07, "logits/chosen": 0.09766235202550888, "logits/rejected": -0.11490020900964737, "logps/chosen": -315.82501220703125, "logps/rejected": -315.8500061035156, "loss": 0.111, "rewards/accuracies": 0.981249988079071, "rewards/chosen": 0.22515869140625, "rewards/margins": 4.445116996765137, "rewards/rejected": -4.2216796875, "step": 1570 }, { "epoch": 1.3874368548209972, "grad_norm": 68.56153825890065, "learning_rate": 6.531195079086116e-07, "logits/chosen": -0.11847229301929474, "logits/rejected": -0.15147094428539276, "logps/chosen": -343.1499938964844, "logps/rejected": -388.07501220703125, "loss": 0.1086, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -0.12851257622241974, "rewards/margins": 4.626367092132568, "rewards/rejected": -4.754101753234863, "step": 1580 }, { "epoch": 1.39622227103009, "grad_norm": 28.735553469006355, "learning_rate": 6.509226713532513e-07, "logits/chosen": 0.05935058742761612, "logits/rejected": -0.04839172214269638, "logps/chosen": -423.3125, "logps/rejected": -425.07501220703125, "loss": 0.0768, "rewards/accuracies": 0.981249988079071, "rewards/chosen": 0.2706054747104645, "rewards/margins": 5.030859470367432, "rewards/rejected": -4.763281345367432, "step": 1590 }, { "epoch": 1.405007687239183, "grad_norm": 62.7272063217339, "learning_rate": 6.48725834797891e-07, "logits/chosen": 0.02900085411965847, "logits/rejected": -0.10573120415210724, "logps/chosen": -346.8125, "logps/rejected": -363.70001220703125, "loss": 0.1177, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 0.4808349609375, "rewards/margins": 4.555859565734863, "rewards/rejected": -4.077929496765137, "step": 1600 }, { "epoch": 1.4137931034482758, "grad_norm": 30.797170694331456, "learning_rate": 6.465289982425308e-07, "logits/chosen": 0.19962768256664276, "logits/rejected": 0.06115112453699112, "logps/chosen": -345.57501220703125, "logps/rejected": -368.04998779296875, "loss": 0.069, "rewards/accuracies": 1.0, "rewards/chosen": 0.804302990436554, "rewards/margins": 4.596093654632568, "rewards/rejected": -3.7953124046325684, "step": 1610 }, { "epoch": 1.4225785196573688, "grad_norm": 26.351909538511425, "learning_rate": 6.443321616871704e-07, "logits/chosen": 0.22113037109375, "logits/rejected": 0.06400756537914276, "logps/chosen": -347.3500061035156, "logps/rejected": -352.20001220703125, "loss": 0.1147, "rewards/accuracies": 0.96875, "rewards/chosen": 0.865002453327179, "rewards/margins": 4.421777248382568, "rewards/rejected": -3.5511717796325684, "step": 1620 }, { "epoch": 1.4313639358664616, "grad_norm": 25.560687844232596, "learning_rate": 6.421353251318102e-07, "logits/chosen": 0.30369263887405396, "logits/rejected": 0.20023497939109802, "logps/chosen": -359.23748779296875, "logps/rejected": -357.375, "loss": 0.0842, "rewards/accuracies": 0.981249988079071, "rewards/chosen": 1.039648413658142, "rewards/margins": 4.372265815734863, "rewards/rejected": -3.3310546875, "step": 1630 }, { "epoch": 1.4401493520755546, "grad_norm": 32.64109756378455, "learning_rate": 6.399384885764498e-07, "logits/chosen": 0.38753050565719604, "logits/rejected": 0.30475157499313354, "logps/chosen": -320.76873779296875, "logps/rejected": -335.04998779296875, "loss": 0.1162, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.7394164800643921, "rewards/margins": 4.137499809265137, "rewards/rejected": -3.3973631858825684, "step": 1640 }, { "epoch": 1.4489347682846474, "grad_norm": 65.97089287943633, "learning_rate": 6.377416520210897e-07, "logits/chosen": 0.22052612900733948, "logits/rejected": 0.09918823093175888, "logps/chosen": -351.04376220703125, "logps/rejected": -397.3999938964844, "loss": 0.094, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.614086925983429, "rewards/margins": 4.383984565734863, "rewards/rejected": -3.7694334983825684, "step": 1650 }, { "epoch": 1.4577201844937404, "grad_norm": 49.66928607958998, "learning_rate": 6.355448154657293e-07, "logits/chosen": 0.1690521240234375, "logits/rejected": 0.0045532225631177425, "logps/chosen": -401.95001220703125, "logps/rejected": -390.67498779296875, "loss": 0.1119, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.48921507596969604, "rewards/margins": 4.643945217132568, "rewards/rejected": -4.155859470367432, "step": 1660 }, { "epoch": 1.4665056007028334, "grad_norm": 25.09000045797173, "learning_rate": 6.333479789103691e-07, "logits/chosen": 0.14262695610523224, "logits/rejected": -0.0040527344681322575, "logps/chosen": -331.92498779296875, "logps/rejected": -353.0, "loss": 0.1052, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.6036926507949829, "rewards/margins": 4.4765625, "rewards/rejected": -3.8740234375, "step": 1670 }, { "epoch": 1.4752910169119262, "grad_norm": 39.460931137465884, "learning_rate": 6.311511423550088e-07, "logits/chosen": 0.14421996474266052, "logits/rejected": 0.12274780124425888, "logps/chosen": -365.1000061035156, "logps/rejected": -390.32501220703125, "loss": 0.0818, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.62945556640625, "rewards/margins": 4.797461032867432, "rewards/rejected": -4.171288967132568, "step": 1680 }, { "epoch": 1.484076433121019, "grad_norm": 36.55526307021372, "learning_rate": 6.289543057996485e-07, "logits/chosen": 0.08905067294836044, "logits/rejected": -0.00844421423971653, "logps/chosen": -410.38751220703125, "logps/rejected": -425.67498779296875, "loss": 0.0874, "rewards/accuracies": 0.981249988079071, "rewards/chosen": 0.2578125, "rewards/margins": 4.848046779632568, "rewards/rejected": -4.588281154632568, "step": 1690 }, { "epoch": 1.492861849330112, "grad_norm": 33.71108398895067, "learning_rate": 6.267574692442882e-07, "logits/chosen": 0.17714843153953552, "logits/rejected": -0.10486450046300888, "logps/chosen": -384.0874938964844, "logps/rejected": -405.57501220703125, "loss": 0.0909, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.27130126953125, "rewards/margins": 4.947070121765137, "rewards/rejected": -4.669531345367432, "step": 1700 }, { "epoch": 1.501647265539205, "grad_norm": 35.95244726564276, "learning_rate": 6.245606326889279e-07, "logits/chosen": 0.04472961276769638, "logits/rejected": -0.06884155422449112, "logps/chosen": -374.0249938964844, "logps/rejected": -357.6499938964844, "loss": 0.0975, "rewards/accuracies": 0.981249988079071, "rewards/chosen": 0.37933349609375, "rewards/margins": 4.652929782867432, "rewards/rejected": -4.273828029632568, "step": 1710 }, { "epoch": 1.5104326817482978, "grad_norm": 60.67837113961381, "learning_rate": 6.223637961335676e-07, "logits/chosen": 0.0863189697265625, "logits/rejected": 0.02843322791159153, "logps/chosen": -368.92498779296875, "logps/rejected": -398.9750061035156, "loss": 0.1045, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.24602660536766052, "rewards/margins": 4.794140815734863, "rewards/rejected": -4.547265529632568, "step": 1720 }, { "epoch": 1.5192180979573906, "grad_norm": 56.26951170068822, "learning_rate": 6.201669595782074e-07, "logits/chosen": 0.02961425855755806, "logits/rejected": -0.10726318508386612, "logps/chosen": -324.20623779296875, "logps/rejected": -347.1000061035156, "loss": 0.0875, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.20033875107765198, "rewards/margins": 4.233984470367432, "rewards/rejected": -4.0322265625, "step": 1730 }, { "epoch": 1.5280035141664836, "grad_norm": 40.06963273385231, "learning_rate": 6.179701230228471e-07, "logits/chosen": 0.13121947646141052, "logits/rejected": -0.08991088718175888, "logps/chosen": -343.1000061035156, "logps/rejected": -366.6000061035156, "loss": 0.1247, "rewards/accuracies": 0.9375, "rewards/chosen": 0.6551879644393921, "rewards/margins": 4.742968559265137, "rewards/rejected": -4.084765434265137, "step": 1740 }, { "epoch": 1.5367889303755766, "grad_norm": 57.76158403399992, "learning_rate": 6.157732864674869e-07, "logits/chosen": 0.4082275331020355, "logits/rejected": 0.25133055448532104, "logps/chosen": -396.1499938964844, "logps/rejected": -397.63751220703125, "loss": 0.1141, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.9578857421875, "rewards/margins": 4.543261528015137, "rewards/rejected": -3.588085889816284, "step": 1750 }, { "epoch": 1.5455743465846694, "grad_norm": 6.004055980219616, "learning_rate": 6.135764499121265e-07, "logits/chosen": 0.25307005643844604, "logits/rejected": 0.19484253227710724, "logps/chosen": -375.2124938964844, "logps/rejected": -387.2250061035156, "loss": 0.1037, "rewards/accuracies": 0.96875, "rewards/chosen": 0.95758056640625, "rewards/margins": 4.614648342132568, "rewards/rejected": -3.658398389816284, "step": 1760 }, { "epoch": 1.5543597627937622, "grad_norm": 7.6607077764583265, "learning_rate": 6.113796133567663e-07, "logits/chosen": 0.28349608182907104, "logits/rejected": 0.22066040337085724, "logps/chosen": -413.92498779296875, "logps/rejected": -403.0249938964844, "loss": 0.0882, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 1.051629662513733, "rewards/margins": 4.667187690734863, "rewards/rejected": -3.6162109375, "step": 1770 }, { "epoch": 1.5631451790028552, "grad_norm": 9.300474655363427, "learning_rate": 6.091827768014059e-07, "logits/chosen": 0.2749877870082855, "logits/rejected": 0.1775360107421875, "logps/chosen": -359.54998779296875, "logps/rejected": -363.95001220703125, "loss": 0.0847, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.8436279296875, "rewards/margins": 4.612500190734863, "rewards/rejected": -3.7710938453674316, "step": 1780 }, { "epoch": 1.5719305952119482, "grad_norm": 44.14896799764711, "learning_rate": 6.069859402460457e-07, "logits/chosen": 0.05922851711511612, "logits/rejected": -0.06189422681927681, "logps/chosen": -396.45001220703125, "logps/rejected": -397.07501220703125, "loss": 0.0727, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.907470703125, "rewards/margins": 5.034960746765137, "rewards/rejected": -4.124804496765137, "step": 1790 }, { "epoch": 1.5807160114210412, "grad_norm": 39.41812476221796, "learning_rate": 6.047891036906854e-07, "logits/chosen": 0.06993408501148224, "logits/rejected": -0.04909820482134819, "logps/chosen": -354.0, "logps/rejected": -378.3125, "loss": 0.0838, "rewards/accuracies": 0.96875, "rewards/chosen": 0.4854797422885895, "rewards/margins": 5.089062690734863, "rewards/rejected": -4.604882717132568, "step": 1800 }, { "epoch": 1.589501427630134, "grad_norm": 36.00158867042787, "learning_rate": 6.025922671353251e-07, "logits/chosen": 0.17885741591453552, "logits/rejected": 0.08350219577550888, "logps/chosen": -378.4750061035156, "logps/rejected": -359.07501220703125, "loss": 0.1337, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.5884917974472046, "rewards/margins": 4.553515434265137, "rewards/rejected": -3.9662108421325684, "step": 1810 }, { "epoch": 1.5982868438392268, "grad_norm": 26.85662539536716, "learning_rate": 6.003954305799648e-07, "logits/chosen": 0.26131439208984375, "logits/rejected": 0.11054687201976776, "logps/chosen": -389.86248779296875, "logps/rejected": -373.6499938964844, "loss": 0.1033, "rewards/accuracies": 0.96875, "rewards/chosen": 0.7273925542831421, "rewards/margins": 4.715234279632568, "rewards/rejected": -3.9873046875, "step": 1820 }, { "epoch": 1.6070722600483198, "grad_norm": 24.61863012973079, "learning_rate": 5.981985940246046e-07, "logits/chosen": 0.20513305068016052, "logits/rejected": -0.03585510328412056, "logps/chosen": -351.54998779296875, "logps/rejected": -374.29998779296875, "loss": 0.1, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.813610851764679, "rewards/margins": 4.83203125, "rewards/rejected": -4.018750190734863, "step": 1830 }, { "epoch": 1.6158576762574128, "grad_norm": 30.19886743012248, "learning_rate": 5.960017574692443e-07, "logits/chosen": 0.24801024794578552, "logits/rejected": 0.140380859375, "logps/chosen": -340.5249938964844, "logps/rejected": -359.5625, "loss": 0.0894, "rewards/accuracies": 0.981249988079071, "rewards/chosen": 0.626818835735321, "rewards/margins": 4.638281345367432, "rewards/rejected": -4.013085842132568, "step": 1840 }, { "epoch": 1.6246430924665056, "grad_norm": 20.669679124145542, "learning_rate": 5.938049209138841e-07, "logits/chosen": 0.17121581733226776, "logits/rejected": 0.10506973415613174, "logps/chosen": -337.36248779296875, "logps/rejected": -380.79998779296875, "loss": 0.109, "rewards/accuracies": 0.96875, "rewards/chosen": 0.8272949457168579, "rewards/margins": 4.519140720367432, "rewards/rejected": -3.6908202171325684, "step": 1850 }, { "epoch": 1.6334285086755984, "grad_norm": 17.864181323134876, "learning_rate": 5.916080843585237e-07, "logits/chosen": 0.29936522245407104, "logits/rejected": 0.005969238467514515, "logps/chosen": -403.1875, "logps/rejected": -367.75, "loss": 0.0907, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 0.6920257806777954, "rewards/margins": 4.465234279632568, "rewards/rejected": -3.770703077316284, "step": 1860 }, { "epoch": 1.6422139248846914, "grad_norm": 43.108597569867804, "learning_rate": 5.894112478031635e-07, "logits/chosen": 0.260833740234375, "logits/rejected": 0.11446838080883026, "logps/chosen": -345.57501220703125, "logps/rejected": -384.1499938964844, "loss": 0.098, "rewards/accuracies": 0.981249988079071, "rewards/chosen": 0.94287109375, "rewards/margins": 4.765234470367432, "rewards/rejected": -3.8212890625, "step": 1870 }, { "epoch": 1.6509993410937844, "grad_norm": 32.08705014201711, "learning_rate": 5.872144112478031e-07, "logits/chosen": -0.039703369140625, "logits/rejected": -0.16614380478858948, "logps/chosen": -340.2749938964844, "logps/rejected": -368.2250061035156, "loss": 0.1282, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 0.22166748344898224, "rewards/margins": 4.634570121765137, "rewards/rejected": -4.417187690734863, "step": 1880 }, { "epoch": 1.6597847573028772, "grad_norm": 25.230918446831954, "learning_rate": 5.850175746924429e-07, "logits/chosen": -0.076568603515625, "logits/rejected": -0.13760986924171448, "logps/chosen": -343.8374938964844, "logps/rejected": -367.88751220703125, "loss": 0.1089, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 0.03530273586511612, "rewards/margins": 4.866796970367432, "rewards/rejected": -4.828906059265137, "step": 1890 }, { "epoch": 1.66857017351197, "grad_norm": 28.782325999185357, "learning_rate": 5.828207381370825e-07, "logits/chosen": 0.04927978664636612, "logits/rejected": -0.11873779445886612, "logps/chosen": -364.82501220703125, "logps/rejected": -371.76251220703125, "loss": 0.0804, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": 0.29414063692092896, "rewards/margins": 4.594922065734863, "rewards/rejected": -4.300390720367432, "step": 1900 }, { "epoch": 1.677355589721063, "grad_norm": 39.56995948273492, "learning_rate": 5.806239015817222e-07, "logits/chosen": 0.19766846299171448, "logits/rejected": 0.0022186278365552425, "logps/chosen": -362.4125061035156, "logps/rejected": -366.1000061035156, "loss": 0.0875, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.5320281982421875, "rewards/margins": 4.528906345367432, "rewards/rejected": -3.998046875, "step": 1910 }, { "epoch": 1.686141005930156, "grad_norm": 61.89153279993434, "learning_rate": 5.78427065026362e-07, "logits/chosen": 0.16651611030101776, "logits/rejected": 0.11162109673023224, "logps/chosen": -393.51251220703125, "logps/rejected": -406.0249938964844, "loss": 0.083, "rewards/accuracies": 0.981249988079071, "rewards/chosen": 0.503222644329071, "rewards/margins": 5.094140529632568, "rewards/rejected": -4.594140529632568, "step": 1920 }, { "epoch": 1.6949264221392488, "grad_norm": 53.028359244866124, "learning_rate": 5.762302284710018e-07, "logits/chosen": 0.03557739406824112, "logits/rejected": -0.11150817573070526, "logps/chosen": -336.1875, "logps/rejected": -344.6499938964844, "loss": 0.1334, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 0.252267450094223, "rewards/margins": 4.489697456359863, "rewards/rejected": -4.2333984375, "step": 1930 }, { "epoch": 1.7037118383483416, "grad_norm": 21.394288524009077, "learning_rate": 5.740333919156415e-07, "logits/chosen": 0.13410034775733948, "logits/rejected": 0.06999512016773224, "logps/chosen": -390.45001220703125, "logps/rejected": -395.7749938964844, "loss": 0.125, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 0.21196898818016052, "rewards/margins": 5.00390625, "rewards/rejected": -4.7890625, "step": 1940 }, { "epoch": 1.7124972545574346, "grad_norm": 41.79983599557749, "learning_rate": 5.718365553602812e-07, "logits/chosen": 0.02685546875, "logits/rejected": -0.03560791164636612, "logps/chosen": -339.125, "logps/rejected": -406.82501220703125, "loss": 0.1008, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 0.08051757514476776, "rewards/margins": 5.119726657867432, "rewards/rejected": -5.038866996765137, "step": 1950 }, { "epoch": 1.7212826707665276, "grad_norm": 52.09928660901128, "learning_rate": 5.696397188049209e-07, "logits/chosen": 0.11361084133386612, "logits/rejected": 0.03064880333840847, "logps/chosen": -378.73748779296875, "logps/rejected": -399.0249938964844, "loss": 0.0952, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 0.06352539360523224, "rewards/margins": 4.658398628234863, "rewards/rejected": -4.595312595367432, "step": 1960 }, { "epoch": 1.7300680869756204, "grad_norm": 10.80714830955533, "learning_rate": 5.674428822495607e-07, "logits/chosen": 0.15378113090991974, "logits/rejected": -0.0009582519414834678, "logps/chosen": -382.42498779296875, "logps/rejected": -395.92498779296875, "loss": 0.1026, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 0.08788452297449112, "rewards/margins": 4.893359184265137, "rewards/rejected": -4.807909965515137, "step": 1970 }, { "epoch": 1.7388535031847132, "grad_norm": 100.37969764444117, "learning_rate": 5.652460456942003e-07, "logits/chosen": -0.004345702938735485, "logits/rejected": -0.1478271484375, "logps/chosen": -340.54998779296875, "logps/rejected": -399.5249938964844, "loss": 0.1003, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.632843017578125, "rewards/margins": 4.899023532867432, "rewards/rejected": -4.265038967132568, "step": 1980 }, { "epoch": 1.7476389193938062, "grad_norm": 25.09795188456828, "learning_rate": 5.6304920913884e-07, "logits/chosen": 0.15374450385570526, "logits/rejected": 0.13051147758960724, "logps/chosen": -358.3500061035156, "logps/rejected": -395.5249938964844, "loss": 0.0651, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": 0.642407238483429, "rewards/margins": 4.857031345367432, "rewards/rejected": -4.21484375, "step": 1990 }, { "epoch": 1.7564243356028992, "grad_norm": 37.874085940833744, "learning_rate": 5.608523725834797e-07, "logits/chosen": 0.05633544921875, "logits/rejected": 0.01805725134909153, "logps/chosen": -383.375, "logps/rejected": -411.6000061035156, "loss": 0.1098, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 0.070068359375, "rewards/margins": 4.712109565734863, "rewards/rejected": -4.641015529632568, "step": 2000 }, { "epoch": 1.7652097518119922, "grad_norm": 34.23867146885987, "learning_rate": 5.586555360281194e-07, "logits/chosen": 0.09916992485523224, "logits/rejected": -0.0019165038829669356, "logps/chosen": -422.20001220703125, "logps/rejected": -424.625, "loss": 0.0881, "rewards/accuracies": 0.981249988079071, "rewards/chosen": 0.3125244081020355, "rewards/margins": 4.854101657867432, "rewards/rejected": -4.539843559265137, "step": 2010 }, { "epoch": 1.773995168021085, "grad_norm": 41.10409729371438, "learning_rate": 5.564586994727593e-07, "logits/chosen": -0.103515625, "logits/rejected": -0.13312987983226776, "logps/chosen": -308.0, "logps/rejected": -359.1000061035156, "loss": 0.1023, "rewards/accuracies": 0.96875, "rewards/chosen": 0.10305175930261612, "rewards/margins": 4.732812404632568, "rewards/rejected": -4.6357421875, "step": 2020 }, { "epoch": 1.7827805842301778, "grad_norm": 10.720548303217885, "learning_rate": 5.54261862917399e-07, "logits/chosen": 0.0580596923828125, "logits/rejected": 0.08719940483570099, "logps/chosen": -404.0625, "logps/rejected": -449.32501220703125, "loss": 0.0842, "rewards/accuracies": 0.96875, "rewards/chosen": 0.13427123427391052, "rewards/margins": 5.551171779632568, "rewards/rejected": -5.416015625, "step": 2030 }, { "epoch": 1.7915660004392708, "grad_norm": 23.90538197060928, "learning_rate": 5.520650263620387e-07, "logits/chosen": 0.07843627780675888, "logits/rejected": 0.0580902099609375, "logps/chosen": -407.01251220703125, "logps/rejected": -438.4750061035156, "loss": 0.1013, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.15858153998851776, "rewards/margins": 5.047265529632568, "rewards/rejected": -4.888281345367432, "step": 2040 }, { "epoch": 1.8003514166483638, "grad_norm": 40.5341522479917, "learning_rate": 5.498681898066783e-07, "logits/chosen": 0.09753723442554474, "logits/rejected": -0.14697265625, "logps/chosen": -406.4125061035156, "logps/rejected": -433.70001220703125, "loss": 0.0896, "rewards/accuracies": 0.96875, "rewards/chosen": 0.4485229551792145, "rewards/margins": 5.349999904632568, "rewards/rejected": -4.903906345367432, "step": 2050 }, { "epoch": 1.8091368328574566, "grad_norm": 18.736028737222284, "learning_rate": 5.476713532513181e-07, "logits/chosen": 0.15931396186351776, "logits/rejected": 0.03362121433019638, "logps/chosen": -367.36248779296875, "logps/rejected": -365.7250061035156, "loss": 0.1321, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 0.2738891541957855, "rewards/margins": 4.547461032867432, "rewards/rejected": -4.271679878234863, "step": 2060 }, { "epoch": 1.8179222490665494, "grad_norm": 13.361253353392135, "learning_rate": 5.454745166959577e-07, "logits/chosen": 0.15653686225414276, "logits/rejected": 0.02617187425494194, "logps/chosen": -373.86248779296875, "logps/rejected": -407.75, "loss": 0.0842, "rewards/accuracies": 0.96875, "rewards/chosen": 0.4008727967739105, "rewards/margins": 4.906054496765137, "rewards/rejected": -4.502539157867432, "step": 2070 }, { "epoch": 1.8267076652756424, "grad_norm": 39.99490461285906, "learning_rate": 5.432776801405975e-07, "logits/chosen": 0.03799133375287056, "logits/rejected": -0.07724914699792862, "logps/chosen": -352.63751220703125, "logps/rejected": -354.57501220703125, "loss": 0.1184, "rewards/accuracies": 0.96875, "rewards/chosen": 0.6164184808731079, "rewards/margins": 4.643945217132568, "rewards/rejected": -4.028515815734863, "step": 2080 }, { "epoch": 1.8354930814847354, "grad_norm": 28.196090654956855, "learning_rate": 5.410808435852372e-07, "logits/chosen": 0.31315308809280396, "logits/rejected": 0.16741637885570526, "logps/chosen": -364.70001220703125, "logps/rejected": -368.17498779296875, "loss": 0.0894, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 1.085485816001892, "rewards/margins": 4.519140720367432, "rewards/rejected": -3.4365234375, "step": 2090 }, { "epoch": 1.8442784976938282, "grad_norm": 41.66603476819805, "learning_rate": 5.388840070298769e-07, "logits/chosen": 0.2149658203125, "logits/rejected": 0.138702392578125, "logps/chosen": -361.0375061035156, "logps/rejected": -377.1875, "loss": 0.1079, "rewards/accuracies": 0.96875, "rewards/chosen": 1.057397484779358, "rewards/margins": 4.5999755859375, "rewards/rejected": -3.5406250953674316, "step": 2100 }, { "epoch": 1.853063913902921, "grad_norm": 33.24533053716776, "learning_rate": 5.366871704745168e-07, "logits/chosen": -0.12694701552391052, "logits/rejected": -0.11174316704273224, "logps/chosen": -338.70001220703125, "logps/rejected": -359.9750061035156, "loss": 0.0826, "rewards/accuracies": 0.981249988079071, "rewards/chosen": 0.701336681842804, "rewards/margins": 4.533007621765137, "rewards/rejected": -3.830859422683716, "step": 2110 }, { "epoch": 1.861849330112014, "grad_norm": 27.26573492098949, "learning_rate": 5.344903339191564e-07, "logits/chosen": -0.07471923530101776, "logits/rejected": -0.22687073051929474, "logps/chosen": -325.32501220703125, "logps/rejected": -363.54998779296875, "loss": 0.0994, "rewards/accuracies": 0.96875, "rewards/chosen": 0.6041259765625, "rewards/margins": 4.5810546875, "rewards/rejected": -3.9769530296325684, "step": 2120 }, { "epoch": 1.870634746321107, "grad_norm": 42.1761830913932, "learning_rate": 5.322934973637961e-07, "logits/chosen": 0.0013183593982830644, "logits/rejected": -0.12770386040210724, "logps/chosen": -382.9750061035156, "logps/rejected": -374.63751220703125, "loss": 0.1061, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.52093505859375, "rewards/margins": 4.591992378234863, "rewards/rejected": -4.069921970367432, "step": 2130 }, { "epoch": 1.8794201625301998, "grad_norm": 26.42861435415268, "learning_rate": 5.300966608084358e-07, "logits/chosen": 0.166737362742424, "logits/rejected": -0.0047973631881177425, "logps/chosen": -398.89373779296875, "logps/rejected": -387.3500061035156, "loss": 0.0996, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 1.048242211341858, "rewards/margins": 4.808984279632568, "rewards/rejected": -3.7606444358825684, "step": 2140 }, { "epoch": 1.8882055787392926, "grad_norm": 65.94331928974138, "learning_rate": 5.278998242530755e-07, "logits/chosen": 0.10513915866613388, "logits/rejected": 0.03287048265337944, "logps/chosen": -370.64373779296875, "logps/rejected": -354.1000061035156, "loss": 0.126, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": 0.6910156011581421, "rewards/margins": 4.7060546875, "rewards/rejected": -4.01318359375, "step": 2150 }, { "epoch": 1.8969909949483856, "grad_norm": 23.980782631779498, "learning_rate": 5.257029876977153e-07, "logits/chosen": 0.19606932997703552, "logits/rejected": 0.23218384385108948, "logps/chosen": -368.17498779296875, "logps/rejected": -407.875, "loss": 0.092, "rewards/accuracies": 0.96875, "rewards/chosen": 1.027374267578125, "rewards/margins": 4.841406345367432, "rewards/rejected": -3.8154296875, "step": 2160 }, { "epoch": 1.9057764111574786, "grad_norm": 21.568324114745366, "learning_rate": 5.235061511423549e-07, "logits/chosen": 0.19191589951515198, "logits/rejected": 0.0799407958984375, "logps/chosen": -380.1499938964844, "logps/rejected": -398.3999938964844, "loss": 0.0844, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.43385010957717896, "rewards/margins": 4.466796875, "rewards/rejected": -4.031640529632568, "step": 2170 }, { "epoch": 1.9145618273665717, "grad_norm": 38.77290895132535, "learning_rate": 5.213093145869947e-07, "logits/chosen": 0.16731567680835724, "logits/rejected": -0.0037109374534338713, "logps/chosen": -387.3999938964844, "logps/rejected": -397.875, "loss": 0.1112, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.580090343952179, "rewards/margins": 4.948437690734863, "rewards/rejected": -4.367968559265137, "step": 2180 }, { "epoch": 1.9233472435756644, "grad_norm": 37.045503918575704, "learning_rate": 5.191124780316343e-07, "logits/chosen": 0.0038696289993822575, "logits/rejected": 0.02183837816119194, "logps/chosen": -374.0375061035156, "logps/rejected": -367.1625061035156, "loss": 0.0898, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.5845993161201477, "rewards/margins": 4.587890625, "rewards/rejected": -4.004101753234863, "step": 2190 }, { "epoch": 1.9321326597847572, "grad_norm": 31.478917146004225, "learning_rate": 5.169156414762741e-07, "logits/chosen": 0.24622192978858948, "logits/rejected": 0.2511230409145355, "logps/chosen": -379.875, "logps/rejected": -405.9624938964844, "loss": 0.0827, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.8529907464981079, "rewards/margins": 4.812890529632568, "rewards/rejected": -3.9593749046325684, "step": 2200 }, { "epoch": 1.9409180759938502, "grad_norm": 26.405776225214478, "learning_rate": 5.147188049209139e-07, "logits/chosen": 0.2529235780239105, "logits/rejected": 0.23553466796875, "logps/chosen": -373.79998779296875, "logps/rejected": -371.0375061035156, "loss": 0.11, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.6650390625, "rewards/margins": 4.365234375, "rewards/rejected": -3.698046922683716, "step": 2210 }, { "epoch": 1.9497034922029433, "grad_norm": 29.329938535869665, "learning_rate": 5.125219683655536e-07, "logits/chosen": 0.2546752989292145, "logits/rejected": 0.03877868503332138, "logps/chosen": -367.9624938964844, "logps/rejected": -380.3999938964844, "loss": 0.0652, "rewards/accuracies": 1.0, "rewards/chosen": 0.765881359577179, "rewards/margins": 5.325390815734863, "rewards/rejected": -4.560546875, "step": 2220 }, { "epoch": 1.958488908412036, "grad_norm": 59.989448975169495, "learning_rate": 5.103251318101933e-07, "logits/chosen": 0.0027893066871911287, "logits/rejected": -0.09723510593175888, "logps/chosen": -347.125, "logps/rejected": -362.32501220703125, "loss": 0.0896, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.20206299424171448, "rewards/margins": 4.660546779632568, "rewards/rejected": -4.461328029632568, "step": 2230 }, { "epoch": 1.9672743246211288, "grad_norm": 30.157112176215417, "learning_rate": 5.08128295254833e-07, "logits/chosen": -0.06862182915210724, "logits/rejected": -0.141448974609375, "logps/chosen": -351.8125, "logps/rejected": -389.95001220703125, "loss": 0.1074, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -0.09684143215417862, "rewards/margins": 4.792187690734863, "rewards/rejected": -4.887890815734863, "step": 2240 }, { "epoch": 1.9760597408302218, "grad_norm": 16.742567307257893, "learning_rate": 5.059314586994727e-07, "logits/chosen": 0.03885803371667862, "logits/rejected": -0.07906799018383026, "logps/chosen": -356.04998779296875, "logps/rejected": -383.8999938964844, "loss": 0.0536, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": 0.14378051459789276, "rewards/margins": 5.378125190734863, "rewards/rejected": -5.235156059265137, "step": 2250 }, { "epoch": 1.9848451570393149, "grad_norm": 34.98118524942001, "learning_rate": 5.037346221441124e-07, "logits/chosen": -0.04459533840417862, "logits/rejected": -0.20485839247703552, "logps/chosen": -370.0874938964844, "logps/rejected": -360.3999938964844, "loss": 0.0827, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.04562988132238388, "rewards/margins": 4.726171970367432, "rewards/rejected": -4.6796875, "step": 2260 }, { "epoch": 1.9936305732484076, "grad_norm": 53.645629249804756, "learning_rate": 5.015377855887521e-07, "logits/chosen": 0.10119018703699112, "logits/rejected": -0.02578124962747097, "logps/chosen": -416.4750061035156, "logps/rejected": -410.1499938964844, "loss": 0.0728, "rewards/accuracies": 0.981249988079071, "rewards/chosen": 0.657470703125, "rewards/margins": 5.428515434265137, "rewards/rejected": -4.766406059265137, "step": 2270 }, { "epoch": 2.001757083241819, "grad_norm": 2.9291367306680502, "learning_rate": 4.993409490333919e-07, "logits/chosen": 0.036185599863529205, "logits/rejected": -0.14994853734970093, "logps/chosen": -371.2567443847656, "logps/rejected": -398.1891784667969, "loss": 0.0592, "rewards/accuracies": 0.9797297120094299, "rewards/chosen": 0.4413534700870514, "rewards/margins": 5.098606586456299, "rewards/rejected": -4.654138565063477, "step": 2280 }, { "epoch": 2.0105424994509113, "grad_norm": 6.27533096638329, "learning_rate": 4.971441124780316e-07, "logits/chosen": -0.021514892578125, "logits/rejected": -0.18354186415672302, "logps/chosen": -347.61248779296875, "logps/rejected": -414.82501220703125, "loss": 0.0208, "rewards/accuracies": 1.0, "rewards/chosen": 0.4934448301792145, "rewards/margins": 6.227734565734863, "rewards/rejected": -5.736718654632568, "step": 2290 }, { "epoch": 2.0193279156600044, "grad_norm": 13.178816741459281, "learning_rate": 4.949472759226713e-07, "logits/chosen": 0.02014465257525444, "logits/rejected": -0.20020751655101776, "logps/chosen": -381.2562561035156, "logps/rejected": -404.1875, "loss": 0.0149, "rewards/accuracies": 1.0, "rewards/chosen": 0.5638672113418579, "rewards/margins": 6.46484375, "rewards/rejected": -5.900000095367432, "step": 2300 }, { "epoch": 2.0281133318690974, "grad_norm": 6.862525646332434, "learning_rate": 4.92750439367311e-07, "logits/chosen": -0.03378600999712944, "logits/rejected": -0.2543884217739105, "logps/chosen": -357.7749938964844, "logps/rejected": -356.45001220703125, "loss": 0.0139, "rewards/accuracies": 1.0, "rewards/chosen": 0.4351440370082855, "rewards/margins": 6.307031154632568, "rewards/rejected": -5.869921684265137, "step": 2310 }, { "epoch": 2.0368987480781904, "grad_norm": 3.17481583355211, "learning_rate": 4.905536028119508e-07, "logits/chosen": -0.06697998195886612, "logits/rejected": -0.23040771484375, "logps/chosen": -358.36248779296875, "logps/rejected": -389.01251220703125, "loss": 0.0218, "rewards/accuracies": 1.0, "rewards/chosen": 0.738574206829071, "rewards/margins": 6.33984375, "rewards/rejected": -5.6015625, "step": 2320 }, { "epoch": 2.045684164287283, "grad_norm": 9.038016350499541, "learning_rate": 4.883567662565905e-07, "logits/chosen": -0.18193969130516052, "logits/rejected": -0.288644403219223, "logps/chosen": -346.8125, "logps/rejected": -394.67498779296875, "loss": 0.013, "rewards/accuracies": 1.0, "rewards/chosen": 0.400299072265625, "rewards/margins": 6.757031440734863, "rewards/rejected": -6.353515625, "step": 2330 }, { "epoch": 2.054469580496376, "grad_norm": 3.027318217678933, "learning_rate": 4.861599297012302e-07, "logits/chosen": -0.22588500380516052, "logits/rejected": -0.4237060546875, "logps/chosen": -360.5375061035156, "logps/rejected": -390.95001220703125, "loss": 0.0122, "rewards/accuracies": 1.0, "rewards/chosen": -0.01669921912252903, "rewards/margins": 6.840624809265137, "rewards/rejected": -6.857031345367432, "step": 2340 }, { "epoch": 2.063254996705469, "grad_norm": 5.668816926448476, "learning_rate": 4.839630931458699e-07, "logits/chosen": -0.25148314237594604, "logits/rejected": -0.32157936692237854, "logps/chosen": -390.9750061035156, "logps/rejected": -417.57501220703125, "loss": 0.0133, "rewards/accuracies": 1.0, "rewards/chosen": -0.55560302734375, "rewards/margins": 6.658593654632568, "rewards/rejected": -7.214062690734863, "step": 2350 }, { "epoch": 2.072040412914562, "grad_norm": 2.1906473591198177, "learning_rate": 4.817662565905096e-07, "logits/chosen": -0.15169677138328552, "logits/rejected": -0.290170282125473, "logps/chosen": -353.1499938964844, "logps/rejected": -376.54998779296875, "loss": 0.011, "rewards/accuracies": 1.0, "rewards/chosen": 0.12647704780101776, "rewards/margins": 6.827343940734863, "rewards/rejected": -6.699999809265137, "step": 2360 }, { "epoch": 2.0808258291236545, "grad_norm": 13.629491334690275, "learning_rate": 4.795694200351494e-07, "logits/chosen": -0.07437743991613388, "logits/rejected": -0.30955809354782104, "logps/chosen": -340.7875061035156, "logps/rejected": -383.5, "loss": 0.0148, "rewards/accuracies": 1.0, "rewards/chosen": 0.4625290036201477, "rewards/margins": 6.671875, "rewards/rejected": -6.2109375, "step": 2370 }, { "epoch": 2.0896112453327476, "grad_norm": 18.618823086752364, "learning_rate": 4.77372583479789e-07, "logits/chosen": -0.14633789658546448, "logits/rejected": -0.29362183809280396, "logps/chosen": -390.0, "logps/rejected": -464.32501220703125, "loss": 0.0126, "rewards/accuracies": 1.0, "rewards/chosen": 0.013745117001235485, "rewards/margins": 7.696875095367432, "rewards/rejected": -7.685937404632568, "step": 2380 }, { "epoch": 2.0983966615418406, "grad_norm": 3.80609180221979, "learning_rate": 4.751757469244288e-07, "logits/chosen": -0.24139404296875, "logits/rejected": -0.43452149629592896, "logps/chosen": -400.04998779296875, "logps/rejected": -425.23748779296875, "loss": 0.0124, "rewards/accuracies": 1.0, "rewards/chosen": -0.8094482421875, "rewards/margins": 7.296093940734863, "rewards/rejected": -8.104296684265137, "step": 2390 }, { "epoch": 2.1071820777509336, "grad_norm": 3.2163049281165588, "learning_rate": 4.729789103690685e-07, "logits/chosen": -0.19597777724266052, "logits/rejected": -0.3367919921875, "logps/chosen": -389.6499938964844, "logps/rejected": -429.70001220703125, "loss": 0.0069, "rewards/accuracies": 1.0, "rewards/chosen": -0.35594481229782104, "rewards/margins": 7.242968559265137, "rewards/rejected": -7.601953029632568, "step": 2400 }, { "epoch": 2.115967493960026, "grad_norm": 6.503219625914523, "learning_rate": 4.707820738137082e-07, "logits/chosen": -0.16562195122241974, "logits/rejected": -0.38630980253219604, "logps/chosen": -380.8125, "logps/rejected": -402.75, "loss": 0.0148, "rewards/accuracies": 1.0, "rewards/chosen": -0.37993162870407104, "rewards/margins": 7.212500095367432, "rewards/rejected": -7.592187404632568, "step": 2410 }, { "epoch": 2.124752910169119, "grad_norm": 5.042045582969805, "learning_rate": 4.68585237258348e-07, "logits/chosen": -0.23321227729320526, "logits/rejected": -0.385488897562027, "logps/chosen": -345.9750061035156, "logps/rejected": -403.20001220703125, "loss": 0.0157, "rewards/accuracies": 1.0, "rewards/chosen": -0.13564452528953552, "rewards/margins": 6.798437595367432, "rewards/rejected": -6.932812690734863, "step": 2420 }, { "epoch": 2.133538326378212, "grad_norm": 29.608374448158905, "learning_rate": 4.663884007029877e-07, "logits/chosen": -0.23314514756202698, "logits/rejected": -0.37202757596969604, "logps/chosen": -378.3125, "logps/rejected": -428.7250061035156, "loss": 0.016, "rewards/accuracies": 1.0, "rewards/chosen": -0.42387086153030396, "rewards/margins": 6.88671875, "rewards/rejected": -7.311718940734863, "step": 2430 }, { "epoch": 2.142323742587305, "grad_norm": 5.151101545232443, "learning_rate": 4.641915641476274e-07, "logits/chosen": -0.26030272245407104, "logits/rejected": -0.3988891541957855, "logps/chosen": -347.875, "logps/rejected": -381.67498779296875, "loss": 0.0178, "rewards/accuracies": 1.0, "rewards/chosen": -0.20126953721046448, "rewards/margins": 6.65234375, "rewards/rejected": -6.853906154632568, "step": 2440 }, { "epoch": 2.151109158796398, "grad_norm": 4.871099632681394, "learning_rate": 4.619947275922671e-07, "logits/chosen": -0.32098084688186646, "logits/rejected": -0.4642700254917145, "logps/chosen": -395.70001220703125, "logps/rejected": -440.0249938964844, "loss": 0.007, "rewards/accuracies": 1.0, "rewards/chosen": -0.236083984375, "rewards/margins": 7.096875190734863, "rewards/rejected": -7.328515529632568, "step": 2450 }, { "epoch": 2.1598945750054908, "grad_norm": 4.3758373263578, "learning_rate": 4.5979789103690687e-07, "logits/chosen": -0.20975951850414276, "logits/rejected": -0.5375915765762329, "logps/chosen": -384.86248779296875, "logps/rejected": -424.54998779296875, "loss": 0.0066, "rewards/accuracies": 1.0, "rewards/chosen": 0.011401367373764515, "rewards/margins": 7.377343654632568, "rewards/rejected": -7.36328125, "step": 2460 }, { "epoch": 2.1686799912145838, "grad_norm": 1.9137420289658507, "learning_rate": 4.576010544815466e-07, "logits/chosen": -0.17230530083179474, "logits/rejected": -0.33401793241500854, "logps/chosen": -383.9750061035156, "logps/rejected": -407.25, "loss": 0.0292, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.10100097954273224, "rewards/margins": 7.116406440734863, "rewards/rejected": -7.217968940734863, "step": 2470 }, { "epoch": 2.177465407423677, "grad_norm": 4.1125516617035665, "learning_rate": 4.554042179261863e-07, "logits/chosen": -0.417755126953125, "logits/rejected": -0.590441882610321, "logps/chosen": -361.71875, "logps/rejected": -416.5, "loss": 0.0211, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -0.577832043170929, "rewards/margins": 7.376953125, "rewards/rejected": -7.958593845367432, "step": 2480 }, { "epoch": 2.18625082363277, "grad_norm": 3.550255321030772, "learning_rate": 4.53207381370826e-07, "logits/chosen": -0.15140990912914276, "logits/rejected": -0.526336669921875, "logps/chosen": -366.4375, "logps/rejected": -390.32501220703125, "loss": 0.0103, "rewards/accuracies": 1.0, "rewards/chosen": -0.36409300565719604, "rewards/margins": 7.587500095367432, "rewards/rejected": -7.950781345367432, "step": 2490 }, { "epoch": 2.1950362398418624, "grad_norm": 3.9272833595808305, "learning_rate": 4.510105448154657e-07, "logits/chosen": -0.21398010849952698, "logits/rejected": -0.551409900188446, "logps/chosen": -364.5375061035156, "logps/rejected": -373.8500061035156, "loss": 0.0182, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -0.39179688692092896, "rewards/margins": 7.410937309265137, "rewards/rejected": -7.803906440734863, "step": 2500 }, { "epoch": 2.2038216560509554, "grad_norm": 16.86128650070453, "learning_rate": 4.4881370826010546e-07, "logits/chosen": -0.2578887939453125, "logits/rejected": -0.48576658964157104, "logps/chosen": -425.98748779296875, "logps/rejected": -422.125, "loss": 0.0125, "rewards/accuracies": 1.0, "rewards/chosen": -0.7382568120956421, "rewards/margins": 7.348437309265137, "rewards/rejected": -8.08984375, "step": 2510 }, { "epoch": 2.2126070722600484, "grad_norm": 16.45928339332874, "learning_rate": 4.4661687170474517e-07, "logits/chosen": -0.22574463486671448, "logits/rejected": -0.37681883573532104, "logps/chosen": -375.86248779296875, "logps/rejected": -417.95001220703125, "loss": 0.0149, "rewards/accuracies": 1.0, "rewards/chosen": -0.49298095703125, "rewards/margins": 7.26171875, "rewards/rejected": -7.757031440734863, "step": 2520 }, { "epoch": 2.2213924884691414, "grad_norm": 2.510412138965512, "learning_rate": 4.444200351493849e-07, "logits/chosen": -0.30738526582717896, "logits/rejected": -0.4878478944301605, "logps/chosen": -393.1499938964844, "logps/rejected": -440.20001220703125, "loss": 0.0115, "rewards/accuracies": 1.0, "rewards/chosen": -0.2742675840854645, "rewards/margins": 7.192968845367432, "rewards/rejected": -7.470312595367432, "step": 2530 }, { "epoch": 2.230177904678234, "grad_norm": 3.812051398115719, "learning_rate": 4.422231985940246e-07, "logits/chosen": -0.16746215522289276, "logits/rejected": -0.278036504983902, "logps/chosen": -385.25, "logps/rejected": -440.67498779296875, "loss": 0.0179, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.76324462890625, "rewards/margins": 7.225781440734863, "rewards/rejected": -7.987500190734863, "step": 2540 }, { "epoch": 2.238963320887327, "grad_norm": 5.063407595674223, "learning_rate": 4.400263620386643e-07, "logits/chosen": -0.33931273221969604, "logits/rejected": -0.513317883014679, "logps/chosen": -342.8125, "logps/rejected": -363.95001220703125, "loss": 0.0144, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -0.5584961175918579, "rewards/margins": 7.168749809265137, "rewards/rejected": -7.724999904632568, "step": 2550 }, { "epoch": 2.24774873709642, "grad_norm": 6.535198535246274, "learning_rate": 4.3782952548330405e-07, "logits/chosen": -0.20695190131664276, "logits/rejected": -0.3479980528354645, "logps/chosen": -357.7124938964844, "logps/rejected": -404.75, "loss": 0.0143, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -0.638031005859375, "rewards/margins": 7.467968940734863, "rewards/rejected": -8.107812881469727, "step": 2560 }, { "epoch": 2.256534153305513, "grad_norm": 8.854781054719902, "learning_rate": 4.3563268892794376e-07, "logits/chosen": -0.29822999238967896, "logits/rejected": -0.524304211139679, "logps/chosen": -344.5249938964844, "logps/rejected": -383.07501220703125, "loss": 0.0101, "rewards/accuracies": 1.0, "rewards/chosen": -0.94140625, "rewards/margins": 7.305468559265137, "rewards/rejected": -8.25, "step": 2570 }, { "epoch": 2.265319569514606, "grad_norm": 2.1746290458233335, "learning_rate": 4.3343585237258347e-07, "logits/chosen": -0.27978515625, "logits/rejected": -0.4582153260707855, "logps/chosen": -344.2124938964844, "logps/rejected": -388.8999938964844, "loss": 0.0095, "rewards/accuracies": 1.0, "rewards/chosen": -0.7440215945243835, "rewards/margins": 7.5859375, "rewards/rejected": -8.335156440734863, "step": 2580 }, { "epoch": 2.2741049857236986, "grad_norm": 2.0258444494308443, "learning_rate": 4.312390158172232e-07, "logits/chosen": -0.18880614638328552, "logits/rejected": -0.303985595703125, "logps/chosen": -362.63751220703125, "logps/rejected": -407.3500061035156, "loss": 0.0155, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -0.6651519536972046, "rewards/margins": 7.212500095367432, "rewards/rejected": -7.879687309265137, "step": 2590 }, { "epoch": 2.2828904019327916, "grad_norm": 2.9728503682004757, "learning_rate": 4.2904217926186293e-07, "logits/chosen": -0.3635192811489105, "logits/rejected": -0.5358642339706421, "logps/chosen": -385.9125061035156, "logps/rejected": -415.17498779296875, "loss": 0.0104, "rewards/accuracies": 1.0, "rewards/chosen": -0.3575195372104645, "rewards/margins": 7.819531440734863, "rewards/rejected": -8.178906440734863, "step": 2600 }, { "epoch": 2.2916758181418846, "grad_norm": 4.015502826298815, "learning_rate": 4.2684534270650264e-07, "logits/chosen": -0.1348876953125, "logits/rejected": -0.39111328125, "logps/chosen": -398.32501220703125, "logps/rejected": -432.625, "loss": 0.0105, "rewards/accuracies": 1.0, "rewards/chosen": -0.3585449159145355, "rewards/margins": 7.3125, "rewards/rejected": -7.669531345367432, "step": 2610 }, { "epoch": 2.300461234350977, "grad_norm": 9.045647313658883, "learning_rate": 4.2464850615114235e-07, "logits/chosen": -0.19957885146141052, "logits/rejected": -0.646832287311554, "logps/chosen": -413.8125, "logps/rejected": -401.04998779296875, "loss": 0.0051, "rewards/accuracies": 1.0, "rewards/chosen": -0.2560058534145355, "rewards/margins": 7.372656345367432, "rewards/rejected": -7.629687309265137, "step": 2620 }, { "epoch": 2.30924665056007, "grad_norm": 5.489334562080459, "learning_rate": 4.2245166959578206e-07, "logits/chosen": -0.20491942763328552, "logits/rejected": -0.3609863221645355, "logps/chosen": -367.20001220703125, "logps/rejected": -413.29998779296875, "loss": 0.017, "rewards/accuracies": 1.0, "rewards/chosen": -0.79742431640625, "rewards/margins": 7.124218940734863, "rewards/rejected": -7.922656059265137, "step": 2630 }, { "epoch": 2.318032066769163, "grad_norm": 38.545437004019796, "learning_rate": 4.2025483304042177e-07, "logits/chosen": -0.28825074434280396, "logits/rejected": -0.4668335020542145, "logps/chosen": -389.3999938964844, "logps/rejected": -408.82501220703125, "loss": 0.0226, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -0.5816406011581421, "rewards/margins": 7.721875190734863, "rewards/rejected": -8.295312881469727, "step": 2640 }, { "epoch": 2.326817482978256, "grad_norm": 4.713166708910227, "learning_rate": 4.180579964850615e-07, "logits/chosen": -0.14959716796875, "logits/rejected": -0.443948358297348, "logps/chosen": -404.6000061035156, "logps/rejected": -425.4750061035156, "loss": 0.0073, "rewards/accuracies": 1.0, "rewards/chosen": -0.7139892578125, "rewards/margins": 7.622656345367432, "rewards/rejected": -8.340624809265137, "step": 2650 }, { "epoch": 2.335602899187349, "grad_norm": 1.8046036048254315, "learning_rate": 4.1586115992970123e-07, "logits/chosen": -0.20021972060203552, "logits/rejected": -0.32643431425094604, "logps/chosen": -383.875, "logps/rejected": -482.07501220703125, "loss": 0.0131, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -0.62994384765625, "rewards/margins": 7.51171875, "rewards/rejected": -8.141406059265137, "step": 2660 }, { "epoch": 2.3443883153964418, "grad_norm": 6.198321244493611, "learning_rate": 4.1366432337434094e-07, "logits/chosen": -0.2520813047885895, "logits/rejected": -0.4922851622104645, "logps/chosen": -400.76251220703125, "logps/rejected": -452.20001220703125, "loss": 0.0126, "rewards/accuracies": 1.0, "rewards/chosen": -0.493896484375, "rewards/margins": 7.728125095367432, "rewards/rejected": -8.21875, "step": 2670 }, { "epoch": 2.353173731605535, "grad_norm": 5.00136090335461, "learning_rate": 4.1146748681898065e-07, "logits/chosen": -0.18798217177391052, "logits/rejected": -0.27148741483688354, "logps/chosen": -395.1625061035156, "logps/rejected": -422.04998779296875, "loss": 0.0085, "rewards/accuracies": 1.0, "rewards/chosen": -0.960949718952179, "rewards/margins": 7.43359375, "rewards/rejected": -8.393359184265137, "step": 2680 }, { "epoch": 2.361959147814628, "grad_norm": 21.754447033602066, "learning_rate": 4.0927065026362036e-07, "logits/chosen": -0.25688475370407104, "logits/rejected": -0.400726318359375, "logps/chosen": -344.2250061035156, "logps/rejected": -397.1499938964844, "loss": 0.0114, "rewards/accuracies": 1.0, "rewards/chosen": -1.1561615467071533, "rewards/margins": 7.20703125, "rewards/rejected": -8.359375, "step": 2690 }, { "epoch": 2.370744564023721, "grad_norm": 24.973782157202145, "learning_rate": 4.070738137082601e-07, "logits/chosen": -0.17396239936351776, "logits/rejected": -0.35225218534469604, "logps/chosen": -434.2749938964844, "logps/rejected": -473.70001220703125, "loss": 0.0152, "rewards/accuracies": 1.0, "rewards/chosen": -0.704699695110321, "rewards/margins": 7.698437690734863, "rewards/rejected": -8.404687881469727, "step": 2700 }, { "epoch": 2.3795299802328134, "grad_norm": 2.487817733383802, "learning_rate": 4.048769771528998e-07, "logits/chosen": -0.2567138671875, "logits/rejected": -0.42595213651657104, "logps/chosen": -424.5, "logps/rejected": -492.04998779296875, "loss": 0.0053, "rewards/accuracies": 1.0, "rewards/chosen": -1.1319427490234375, "rewards/margins": 8.086718559265137, "rewards/rejected": -9.217968940734863, "step": 2710 }, { "epoch": 2.3883153964419064, "grad_norm": 3.4519986201942716, "learning_rate": 4.0268014059753953e-07, "logits/chosen": -0.12812194228172302, "logits/rejected": -0.4225097596645355, "logps/chosen": -376.79998779296875, "logps/rejected": -443.8999938964844, "loss": 0.011, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -0.722381591796875, "rewards/margins": 8.080469131469727, "rewards/rejected": -8.80859375, "step": 2720 }, { "epoch": 2.3971008126509994, "grad_norm": 2.2388787281885043, "learning_rate": 4.0048330404217924e-07, "logits/chosen": -0.30510252714157104, "logits/rejected": -0.5164947509765625, "logps/chosen": -418.2875061035156, "logps/rejected": -450.70001220703125, "loss": 0.0222, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.201409935951233, "rewards/margins": 7.653124809265137, "rewards/rejected": -8.856249809265137, "step": 2730 }, { "epoch": 2.4058862288600924, "grad_norm": 1.4512066664760737, "learning_rate": 3.98286467486819e-07, "logits/chosen": -0.295623779296875, "logits/rejected": -0.5380157232284546, "logps/chosen": -415.2749938964844, "logps/rejected": -441.7749938964844, "loss": 0.014, "rewards/accuracies": 1.0, "rewards/chosen": -0.9215942621231079, "rewards/margins": 7.524218559265137, "rewards/rejected": -8.442968368530273, "step": 2740 }, { "epoch": 2.414671645069185, "grad_norm": 2.774056562463395, "learning_rate": 3.960896309314587e-07, "logits/chosen": -0.3450073301792145, "logits/rejected": -0.5826416015625, "logps/chosen": -370.98748779296875, "logps/rejected": -424.0, "loss": 0.0104, "rewards/accuracies": 1.0, "rewards/chosen": -0.9279845952987671, "rewards/margins": 7.7734375, "rewards/rejected": -8.692968368530273, "step": 2750 }, { "epoch": 2.423457061278278, "grad_norm": 3.9137889326205313, "learning_rate": 3.938927943760984e-07, "logits/chosen": -0.4027954041957855, "logits/rejected": -0.5885864496231079, "logps/chosen": -382.4937438964844, "logps/rejected": -438.57501220703125, "loss": 0.0079, "rewards/accuracies": 1.0, "rewards/chosen": -0.6872802972793579, "rewards/margins": 8.035547256469727, "rewards/rejected": -8.725000381469727, "step": 2760 }, { "epoch": 2.432242477487371, "grad_norm": 16.363874591657655, "learning_rate": 3.916959578207381e-07, "logits/chosen": -0.06805419921875, "logits/rejected": -0.29248046875, "logps/chosen": -380.0, "logps/rejected": -409.9750061035156, "loss": 0.0176, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -0.615081787109375, "rewards/margins": 7.232031345367432, "rewards/rejected": -7.848437309265137, "step": 2770 }, { "epoch": 2.441027893696464, "grad_norm": 9.450298874586872, "learning_rate": 3.8949912126537783e-07, "logits/chosen": -0.2859863340854645, "logits/rejected": -0.3762145936489105, "logps/chosen": -366.8500061035156, "logps/rejected": -423.70001220703125, "loss": 0.024, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.4927612245082855, "rewards/margins": 7.172265529632568, "rewards/rejected": -7.666796684265137, "step": 2780 }, { "epoch": 2.449813309905557, "grad_norm": 10.243294780380417, "learning_rate": 3.873022847100176e-07, "logits/chosen": -0.1900787353515625, "logits/rejected": -0.3408264219760895, "logps/chosen": -321.07501220703125, "logps/rejected": -382.92498779296875, "loss": 0.0177, "rewards/accuracies": 1.0, "rewards/chosen": -0.4469360411167145, "rewards/margins": 6.989843845367432, "rewards/rejected": -7.444140434265137, "step": 2790 }, { "epoch": 2.4585987261146496, "grad_norm": 0.8212638058565808, "learning_rate": 3.851054481546573e-07, "logits/chosen": -0.21848145127296448, "logits/rejected": -0.378793329000473, "logps/chosen": -424.375, "logps/rejected": -444.82501220703125, "loss": 0.0102, "rewards/accuracies": 1.0, "rewards/chosen": -0.4463134706020355, "rewards/margins": 7.140625, "rewards/rejected": -7.586718559265137, "step": 2800 }, { "epoch": 2.4673841423237426, "grad_norm": 12.361716030769655, "learning_rate": 3.82908611599297e-07, "logits/chosen": -0.4783935546875, "logits/rejected": -0.569836437702179, "logps/chosen": -349.32501220703125, "logps/rejected": -415.20001220703125, "loss": 0.0187, "rewards/accuracies": 1.0, "rewards/chosen": -1.2066528797149658, "rewards/margins": 7.37109375, "rewards/rejected": -8.573437690734863, "step": 2810 }, { "epoch": 2.4761695585328356, "grad_norm": 10.57553925881371, "learning_rate": 3.807117750439367e-07, "logits/chosen": -0.469329833984375, "logits/rejected": -0.523590087890625, "logps/chosen": -354.8812561035156, "logps/rejected": -418.7250061035156, "loss": 0.0119, "rewards/accuracies": 1.0, "rewards/chosen": -1.8855712413787842, "rewards/margins": 7.653515815734863, "rewards/rejected": -9.5390625, "step": 2820 }, { "epoch": 2.484954974741928, "grad_norm": 3.3473492587499623, "learning_rate": 3.785149384885764e-07, "logits/chosen": -0.28306275606155396, "logits/rejected": -0.413726806640625, "logps/chosen": -409.29998779296875, "logps/rejected": -444.5249938964844, "loss": 0.0084, "rewards/accuracies": 1.0, "rewards/chosen": -1.02392578125, "rewards/margins": 7.507031440734863, "rewards/rejected": -8.532031059265137, "step": 2830 }, { "epoch": 2.493740390951021, "grad_norm": 3.36779500187562, "learning_rate": 3.763181019332162e-07, "logits/chosen": -0.20631103217601776, "logits/rejected": -0.4558868408203125, "logps/chosen": -342.79998779296875, "logps/rejected": -441.6000061035156, "loss": 0.0094, "rewards/accuracies": 1.0, "rewards/chosen": -1.7587401866912842, "rewards/margins": 7.977343559265137, "rewards/rejected": -9.737500190734863, "step": 2840 }, { "epoch": 2.502525807160114, "grad_norm": 7.673500951691884, "learning_rate": 3.741212653778559e-07, "logits/chosen": -0.40937042236328125, "logits/rejected": -0.45112913846969604, "logps/chosen": -371.75, "logps/rejected": -431.6499938964844, "loss": 0.0193, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.424951195716858, "rewards/margins": 7.478906154632568, "rewards/rejected": -8.90625, "step": 2850 }, { "epoch": 2.511311223369207, "grad_norm": 10.689675658397737, "learning_rate": 3.719244288224956e-07, "logits/chosen": -0.36890870332717896, "logits/rejected": -0.4322265684604645, "logps/chosen": -367.7250061035156, "logps/rejected": -448.95001220703125, "loss": 0.0056, "rewards/accuracies": 1.0, "rewards/chosen": -0.666943371295929, "rewards/margins": 8.053906440734863, "rewards/rejected": -8.720312118530273, "step": 2860 }, { "epoch": 2.5200966395783, "grad_norm": 9.60682712100841, "learning_rate": 3.697275922671353e-07, "logits/chosen": -0.322927862405777, "logits/rejected": -0.4036407470703125, "logps/chosen": -367.875, "logps/rejected": -444.07501220703125, "loss": 0.0151, "rewards/accuracies": 1.0, "rewards/chosen": -0.9753173589706421, "rewards/margins": 7.010156154632568, "rewards/rejected": -7.986718654632568, "step": 2870 }, { "epoch": 2.528882055787393, "grad_norm": 8.833435840267025, "learning_rate": 3.6753075571177507e-07, "logits/chosen": -0.4574951231479645, "logits/rejected": -0.5293518304824829, "logps/chosen": -333.6875, "logps/rejected": -381.7749938964844, "loss": 0.0164, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -0.9216064214706421, "rewards/margins": 7.266406059265137, "rewards/rejected": -8.185937881469727, "step": 2880 }, { "epoch": 2.537667471996486, "grad_norm": 8.698623441575752, "learning_rate": 3.653339191564148e-07, "logits/chosen": -0.20809325575828552, "logits/rejected": -0.5367462038993835, "logps/chosen": -389.26251220703125, "logps/rejected": -382.2749938964844, "loss": 0.0162, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -0.470062255859375, "rewards/margins": 7.400000095367432, "rewards/rejected": -7.871874809265137, "step": 2890 }, { "epoch": 2.546452888205579, "grad_norm": 7.055241526499518, "learning_rate": 3.631370826010545e-07, "logits/chosen": -0.12445678561925888, "logits/rejected": -0.528668224811554, "logps/chosen": -371.75, "logps/rejected": -400.375, "loss": 0.0136, "rewards/accuracies": 1.0, "rewards/chosen": -0.55108642578125, "rewards/margins": 7.552343845367432, "rewards/rejected": -8.103124618530273, "step": 2900 }, { "epoch": 2.5552383044146714, "grad_norm": 16.404871866352575, "learning_rate": 3.609402460456942e-07, "logits/chosen": -0.24597778916358948, "logits/rejected": -0.539825439453125, "logps/chosen": -411.6499938964844, "logps/rejected": -462.5249938964844, "loss": 0.0335, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -0.7365356683731079, "rewards/margins": 8.223437309265137, "rewards/rejected": -8.971875190734863, "step": 2910 }, { "epoch": 2.564023720623765, "grad_norm": 9.24611992692557, "learning_rate": 3.587434094903339e-07, "logits/chosen": -0.29466551542282104, "logits/rejected": -0.590380847454071, "logps/chosen": -413.2124938964844, "logps/rejected": -426.3500061035156, "loss": 0.0119, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -0.731555163860321, "rewards/margins": 8.076562881469727, "rewards/rejected": -8.803906440734863, "step": 2920 }, { "epoch": 2.5728091368328574, "grad_norm": 4.876130726573987, "learning_rate": 3.5654657293497366e-07, "logits/chosen": -0.315267950296402, "logits/rejected": -0.648303210735321, "logps/chosen": -401.8500061035156, "logps/rejected": -459.3999938964844, "loss": 0.0044, "rewards/accuracies": 1.0, "rewards/chosen": -1.2493164539337158, "rewards/margins": 8.137499809265137, "rewards/rejected": -9.383593559265137, "step": 2930 }, { "epoch": 2.5815945530419504, "grad_norm": 31.76492803015283, "learning_rate": 3.5434973637961337e-07, "logits/chosen": -0.37178343534469604, "logits/rejected": -0.46307373046875, "logps/chosen": -388.29998779296875, "logps/rejected": -477.70001220703125, "loss": 0.0098, "rewards/accuracies": 1.0, "rewards/chosen": -1.4794921875, "rewards/margins": 7.517968654632568, "rewards/rejected": -8.99609375, "step": 2940 }, { "epoch": 2.5903799692510434, "grad_norm": 124.74200119036365, "learning_rate": 3.521528998242531e-07, "logits/chosen": -0.3253540098667145, "logits/rejected": -0.51220703125, "logps/chosen": -379.20001220703125, "logps/rejected": -414.1000061035156, "loss": 0.0258, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.229528784751892, "rewards/margins": 7.419140815734863, "rewards/rejected": -8.649218559265137, "step": 2950 }, { "epoch": 2.599165385460136, "grad_norm": 3.9409379005932847, "learning_rate": 3.499560632688928e-07, "logits/chosen": -0.319244384765625, "logits/rejected": -0.5097411870956421, "logps/chosen": -428.3999938964844, "logps/rejected": -438.3999938964844, "loss": 0.0115, "rewards/accuracies": 1.0, "rewards/chosen": -0.9976806640625, "rewards/margins": 7.713281154632568, "rewards/rejected": -8.712499618530273, "step": 2960 }, { "epoch": 2.607950801669229, "grad_norm": 1.2038539490657265, "learning_rate": 3.477592267135325e-07, "logits/chosen": -0.21314696967601776, "logits/rejected": -0.542706310749054, "logps/chosen": -395.6499938964844, "logps/rejected": -418.1499938964844, "loss": 0.0122, "rewards/accuracies": 1.0, "rewards/chosen": -0.8330932855606079, "rewards/margins": 7.497656345367432, "rewards/rejected": -8.329687118530273, "step": 2970 }, { "epoch": 2.616736217878322, "grad_norm": 15.458039007182121, "learning_rate": 3.4556239015817225e-07, "logits/chosen": -0.3296142518520355, "logits/rejected": -0.4171142578125, "logps/chosen": -364.67498779296875, "logps/rejected": -383.95001220703125, "loss": 0.012, "rewards/accuracies": 1.0, "rewards/chosen": -0.629101574420929, "rewards/margins": 7.237500190734863, "rewards/rejected": -7.8671875, "step": 2980 }, { "epoch": 2.625521634087415, "grad_norm": 5.654393044832561, "learning_rate": 3.4336555360281196e-07, "logits/chosen": -0.16024628281593323, "logits/rejected": -0.23216553032398224, "logps/chosen": -424.5, "logps/rejected": -470.95001220703125, "loss": 0.0095, "rewards/accuracies": 1.0, "rewards/chosen": -0.3349243104457855, "rewards/margins": 7.585156440734863, "rewards/rejected": -7.91796875, "step": 2990 }, { "epoch": 2.634307050296508, "grad_norm": 5.025642851285795, "learning_rate": 3.4116871704745167e-07, "logits/chosen": -0.260498046875, "logits/rejected": -0.35023802518844604, "logps/chosen": -377.9375, "logps/rejected": -448.7250061035156, "loss": 0.0106, "rewards/accuracies": 1.0, "rewards/chosen": -0.768200695514679, "rewards/margins": 7.2890625, "rewards/rejected": -8.057812690734863, "step": 3000 }, { "epoch": 2.6430924665056006, "grad_norm": 28.60449401074229, "learning_rate": 3.389718804920914e-07, "logits/chosen": -0.213226318359375, "logits/rejected": -0.41349488496780396, "logps/chosen": -381.4375, "logps/rejected": -454.4750061035156, "loss": 0.022, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.2241699695587158, "rewards/margins": 7.706640720367432, "rewards/rejected": -8.932812690734863, "step": 3010 }, { "epoch": 2.6518778827146936, "grad_norm": 1.4419585470754546, "learning_rate": 3.3677504393673114e-07, "logits/chosen": -0.2596069276332855, "logits/rejected": -0.5130890011787415, "logps/chosen": -387.0249938964844, "logps/rejected": -416.5, "loss": 0.0148, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -0.9375976324081421, "rewards/margins": 7.809374809265137, "rewards/rejected": -8.740625381469727, "step": 3020 }, { "epoch": 2.6606632989237866, "grad_norm": 5.511560365028605, "learning_rate": 3.3457820738137084e-07, "logits/chosen": -0.3284545838832855, "logits/rejected": -0.44810789823532104, "logps/chosen": -375.4750061035156, "logps/rejected": -437.6000061035156, "loss": 0.0123, "rewards/accuracies": 1.0, "rewards/chosen": -0.8687988519668579, "rewards/margins": 7.159375190734863, "rewards/rejected": -8.028905868530273, "step": 3030 }, { "epoch": 2.669448715132879, "grad_norm": 13.95640549919119, "learning_rate": 3.3238137082601055e-07, "logits/chosen": -0.3338989317417145, "logits/rejected": -0.5943359136581421, "logps/chosen": -365.20001220703125, "logps/rejected": -425.8500061035156, "loss": 0.0071, "rewards/accuracies": 1.0, "rewards/chosen": -1.369531273841858, "rewards/margins": 8.192968368530273, "rewards/rejected": -9.564844131469727, "step": 3040 }, { "epoch": 2.678234131341972, "grad_norm": 22.811628014005453, "learning_rate": 3.3018453427065026e-07, "logits/chosen": -0.359375, "logits/rejected": -0.61712646484375, "logps/chosen": -376.01251220703125, "logps/rejected": -388.0249938964844, "loss": 0.0131, "rewards/accuracies": 1.0, "rewards/chosen": -1.0308105945587158, "rewards/margins": 7.91015625, "rewards/rejected": -8.940625190734863, "step": 3050 }, { "epoch": 2.687019547551065, "grad_norm": 15.218282480016757, "learning_rate": 3.2798769771528997e-07, "logits/chosen": -0.4687744081020355, "logits/rejected": -0.5656906366348267, "logps/chosen": -372.9750061035156, "logps/rejected": -437.2749938964844, "loss": 0.0114, "rewards/accuracies": 1.0, "rewards/chosen": -1.0572693347930908, "rewards/margins": 7.724218845367432, "rewards/rejected": -8.77734375, "step": 3060 }, { "epoch": 2.695804963760158, "grad_norm": 3.082632458044639, "learning_rate": 3.2579086115992973e-07, "logits/chosen": -0.40772247314453125, "logits/rejected": -0.4769348204135895, "logps/chosen": -358.9750061035156, "logps/rejected": -440.45001220703125, "loss": 0.0066, "rewards/accuracies": 1.0, "rewards/chosen": -1.491369605064392, "rewards/margins": 8.235156059265137, "rewards/rejected": -9.727343559265137, "step": 3070 }, { "epoch": 2.7045903799692512, "grad_norm": 47.982591752547016, "learning_rate": 3.2359402460456944e-07, "logits/chosen": -0.27104490995407104, "logits/rejected": -0.41773682832717896, "logps/chosen": -389.48748779296875, "logps/rejected": -406.125, "loss": 0.0193, "rewards/accuracies": 1.0, "rewards/chosen": -0.7391327023506165, "rewards/margins": 7.654687404632568, "rewards/rejected": -8.3984375, "step": 3080 }, { "epoch": 2.713375796178344, "grad_norm": 2.5526394929739995, "learning_rate": 3.2139718804920914e-07, "logits/chosen": -0.3531494140625, "logits/rejected": -0.58197021484375, "logps/chosen": -390.57501220703125, "logps/rejected": -444.07501220703125, "loss": 0.0102, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -0.74755859375, "rewards/margins": 8.154687881469727, "rewards/rejected": -8.899999618530273, "step": 3090 }, { "epoch": 2.722161212387437, "grad_norm": 2.132523000353378, "learning_rate": 3.1920035149384885e-07, "logits/chosen": -0.33130186796188354, "logits/rejected": -0.42029112577438354, "logps/chosen": -412.5, "logps/rejected": -437.2749938964844, "loss": 0.0061, "rewards/accuracies": 1.0, "rewards/chosen": -0.8748413324356079, "rewards/margins": 7.435937404632568, "rewards/rejected": -8.313281059265137, "step": 3100 }, { "epoch": 2.73094662859653, "grad_norm": 15.561049225138323, "learning_rate": 3.1700351493848856e-07, "logits/chosen": -0.4017883241176605, "logits/rejected": -0.42497557401657104, "logps/chosen": -360.0249938964844, "logps/rejected": -419.9750061035156, "loss": 0.0206, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -0.47798460721969604, "rewards/margins": 7.396093845367432, "rewards/rejected": -7.870312690734863, "step": 3110 }, { "epoch": 2.739732044805623, "grad_norm": 16.18783596396693, "learning_rate": 3.148066783831283e-07, "logits/chosen": -0.20206299424171448, "logits/rejected": -0.39802855253219604, "logps/chosen": -413.625, "logps/rejected": -455.79998779296875, "loss": 0.012, "rewards/accuracies": 1.0, "rewards/chosen": -0.659680187702179, "rewards/margins": 8.2734375, "rewards/rejected": -8.93359375, "step": 3120 }, { "epoch": 2.748517461014716, "grad_norm": 7.849079396241412, "learning_rate": 3.1260984182776803e-07, "logits/chosen": -0.3252319395542145, "logits/rejected": -0.62310791015625, "logps/chosen": -417.375, "logps/rejected": -407.2749938964844, "loss": 0.0055, "rewards/accuracies": 1.0, "rewards/chosen": -1.272485375404358, "rewards/margins": 7.742968559265137, "rewards/rejected": -9.013280868530273, "step": 3130 }, { "epoch": 2.7573028772238084, "grad_norm": 9.030420262069391, "learning_rate": 3.1041300527240773e-07, "logits/chosen": -0.20905761420726776, "logits/rejected": -0.3013320863246918, "logps/chosen": -456.2250061035156, "logps/rejected": -451.3999938964844, "loss": 0.0087, "rewards/accuracies": 1.0, "rewards/chosen": -0.8207153081893921, "rewards/margins": 7.942968845367432, "rewards/rejected": -8.763280868530273, "step": 3140 }, { "epoch": 2.7660882934329014, "grad_norm": 19.81033690420415, "learning_rate": 3.0821616871704744e-07, "logits/chosen": -0.35826414823532104, "logits/rejected": -0.6433471441268921, "logps/chosen": -358.0, "logps/rejected": -415.04998779296875, "loss": 0.0102, "rewards/accuracies": 1.0, "rewards/chosen": -0.943310558795929, "rewards/margins": 7.758593559265137, "rewards/rejected": -8.701562881469727, "step": 3150 }, { "epoch": 2.7748737096419944, "grad_norm": 1.515403913894267, "learning_rate": 3.060193321616872e-07, "logits/chosen": -0.3252624571323395, "logits/rejected": -0.5146728754043579, "logps/chosen": -393.375, "logps/rejected": -453.67498779296875, "loss": 0.0144, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -0.865063488483429, "rewards/margins": 7.251562595367432, "rewards/rejected": -8.117578506469727, "step": 3160 }, { "epoch": 2.783659125851087, "grad_norm": 1.9932499796436933, "learning_rate": 3.038224956063269e-07, "logits/chosen": -0.37471312284469604, "logits/rejected": -0.4589599668979645, "logps/chosen": -386.04998779296875, "logps/rejected": -403.3374938964844, "loss": 0.0122, "rewards/accuracies": 1.0, "rewards/chosen": -0.90618896484375, "rewards/margins": 7.732812404632568, "rewards/rejected": -8.638280868530273, "step": 3170 }, { "epoch": 2.79244454206018, "grad_norm": 2.0436377456349937, "learning_rate": 3.016256590509666e-07, "logits/chosen": -0.4875229001045227, "logits/rejected": -0.65435791015625, "logps/chosen": -366.07501220703125, "logps/rejected": -440.0249938964844, "loss": 0.0151, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -0.901806652545929, "rewards/margins": 8.01953125, "rewards/rejected": -8.920312881469727, "step": 3180 }, { "epoch": 2.801229958269273, "grad_norm": 0.5266406614611127, "learning_rate": 2.994288224956063e-07, "logits/chosen": -0.33830565214157104, "logits/rejected": -0.6584717035293579, "logps/chosen": -392.1875, "logps/rejected": -418.8999938964844, "loss": 0.0078, "rewards/accuracies": 1.0, "rewards/chosen": -1.091699242591858, "rewards/margins": 7.986718654632568, "rewards/rejected": -9.0703125, "step": 3190 }, { "epoch": 2.810015374478366, "grad_norm": 19.179382087990543, "learning_rate": 2.9723198594024603e-07, "logits/chosen": -0.29812318086624146, "logits/rejected": -0.529614269733429, "logps/chosen": -329.2250061035156, "logps/rejected": -377.82501220703125, "loss": 0.0156, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -0.671679675579071, "rewards/margins": 7.189062595367432, "rewards/rejected": -7.864843845367432, "step": 3200 }, { "epoch": 2.818800790687459, "grad_norm": 18.377591658138265, "learning_rate": 2.950351493848858e-07, "logits/chosen": -0.3313964903354645, "logits/rejected": -0.6072311401367188, "logps/chosen": -371.8374938964844, "logps/rejected": -427.73748779296875, "loss": 0.0079, "rewards/accuracies": 1.0, "rewards/chosen": -1.1328246593475342, "rewards/margins": 7.886328220367432, "rewards/rejected": -9.022656440734863, "step": 3210 }, { "epoch": 2.8275862068965516, "grad_norm": 15.261534991651782, "learning_rate": 2.928383128295255e-07, "logits/chosen": -0.49371033906936646, "logits/rejected": -0.5156310796737671, "logps/chosen": -325.67498779296875, "logps/rejected": -418.5375061035156, "loss": 0.0137, "rewards/accuracies": 1.0, "rewards/chosen": -0.861584484577179, "rewards/margins": 7.321484565734863, "rewards/rejected": -8.184374809265137, "step": 3220 }, { "epoch": 2.8363716231056446, "grad_norm": 4.200834651532248, "learning_rate": 2.906414762741652e-07, "logits/chosen": -0.30797117948532104, "logits/rejected": -0.5961853265762329, "logps/chosen": -399.32501220703125, "logps/rejected": -424.1499938964844, "loss": 0.0092, "rewards/accuracies": 1.0, "rewards/chosen": -0.990216076374054, "rewards/margins": 7.85546875, "rewards/rejected": -8.842968940734863, "step": 3230 }, { "epoch": 2.8451570393147376, "grad_norm": 46.91686943046311, "learning_rate": 2.884446397188049e-07, "logits/chosen": -0.3236450254917145, "logits/rejected": -0.5774596929550171, "logps/chosen": -389.875, "logps/rejected": -463.04998779296875, "loss": 0.0112, "rewards/accuracies": 1.0, "rewards/chosen": -0.96209716796875, "rewards/margins": 8.109375, "rewards/rejected": -9.073437690734863, "step": 3240 }, { "epoch": 2.85394245552383, "grad_norm": 12.322655451791768, "learning_rate": 2.862478031634446e-07, "logits/chosen": -0.25725096464157104, "logits/rejected": -0.511278510093689, "logps/chosen": -419.54998779296875, "logps/rejected": -397.0249938964844, "loss": 0.0155, "rewards/accuracies": 1.0, "rewards/chosen": -0.7013183832168579, "rewards/margins": 7.61328125, "rewards/rejected": -8.3125, "step": 3250 }, { "epoch": 2.862727871732923, "grad_norm": 5.008576141979654, "learning_rate": 2.840509666080844e-07, "logits/chosen": -0.4677490293979645, "logits/rejected": -0.501110851764679, "logps/chosen": -357.17498779296875, "logps/rejected": -431.95001220703125, "loss": 0.0069, "rewards/accuracies": 1.0, "rewards/chosen": -0.4968017637729645, "rewards/margins": 7.753125190734863, "rewards/rejected": -8.24609375, "step": 3260 }, { "epoch": 2.871513287942016, "grad_norm": 3.654392897183233, "learning_rate": 2.818541300527241e-07, "logits/chosen": -0.2781982421875, "logits/rejected": -0.6202392578125, "logps/chosen": -383.875, "logps/rejected": -412.3500061035156, "loss": 0.0114, "rewards/accuracies": 1.0, "rewards/chosen": -1.2693970203399658, "rewards/margins": 7.74609375, "rewards/rejected": -9.015625, "step": 3270 }, { "epoch": 2.8802987041511092, "grad_norm": 3.8751434560430886, "learning_rate": 2.796572934973638e-07, "logits/chosen": -0.30253905057907104, "logits/rejected": -0.534741222858429, "logps/chosen": -401.625, "logps/rejected": -456.9750061035156, "loss": 0.005, "rewards/accuracies": 1.0, "rewards/chosen": -0.9421752691268921, "rewards/margins": 8.469531059265137, "rewards/rejected": -9.41015625, "step": 3280 }, { "epoch": 2.8890841203602022, "grad_norm": 4.022210783451489, "learning_rate": 2.774604569420035e-07, "logits/chosen": -0.4882659912109375, "logits/rejected": -0.674560546875, "logps/chosen": -412.8500061035156, "logps/rejected": -456.1499938964844, "loss": 0.0051, "rewards/accuracies": 1.0, "rewards/chosen": -1.479699730873108, "rewards/margins": 8.539843559265137, "rewards/rejected": -10.020312309265137, "step": 3290 }, { "epoch": 2.897869536569295, "grad_norm": 5.726733990570096, "learning_rate": 2.7526362038664327e-07, "logits/chosen": -0.2865356504917145, "logits/rejected": -0.5616912841796875, "logps/chosen": -385.39373779296875, "logps/rejected": -433.4750061035156, "loss": 0.0119, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.4399292469024658, "rewards/margins": 8.296875, "rewards/rejected": -9.736719131469727, "step": 3300 }, { "epoch": 2.906654952778388, "grad_norm": 13.230595695101877, "learning_rate": 2.73066783831283e-07, "logits/chosen": -0.3056640625, "logits/rejected": -0.657366931438446, "logps/chosen": -432.375, "logps/rejected": -428.20001220703125, "loss": 0.0094, "rewards/accuracies": 1.0, "rewards/chosen": -1.2949707508087158, "rewards/margins": 8.254687309265137, "rewards/rejected": -9.5546875, "step": 3310 }, { "epoch": 2.915440368987481, "grad_norm": 7.606490261928186, "learning_rate": 2.708699472759227e-07, "logits/chosen": -0.2972412109375, "logits/rejected": -0.5560333132743835, "logps/chosen": -360.9125061035156, "logps/rejected": -377.20001220703125, "loss": 0.0134, "rewards/accuracies": 1.0, "rewards/chosen": -1.100860595703125, "rewards/margins": 7.416406154632568, "rewards/rejected": -8.510937690734863, "step": 3320 }, { "epoch": 2.924225785196574, "grad_norm": 5.37455641815673, "learning_rate": 2.686731107205624e-07, "logits/chosen": -0.35124510526657104, "logits/rejected": -0.4336090087890625, "logps/chosen": -361.73748779296875, "logps/rejected": -414.29998779296875, "loss": 0.0127, "rewards/accuracies": 1.0, "rewards/chosen": -0.78021240234375, "rewards/margins": 7.864062309265137, "rewards/rejected": -8.6484375, "step": 3330 }, { "epoch": 2.933011201405667, "grad_norm": 0.762855724870387, "learning_rate": 2.664762741652021e-07, "logits/chosen": -0.32752686738967896, "logits/rejected": -0.5947265625, "logps/chosen": -377.7250061035156, "logps/rejected": -415.6499938964844, "loss": 0.0042, "rewards/accuracies": 1.0, "rewards/chosen": -1.028710961341858, "rewards/margins": 8.041406631469727, "rewards/rejected": -9.067187309265137, "step": 3340 }, { "epoch": 2.9417966176147594, "grad_norm": 16.18775050450688, "learning_rate": 2.6427943760984186e-07, "logits/chosen": -0.3336181640625, "logits/rejected": -0.4357543885707855, "logps/chosen": -400.11248779296875, "logps/rejected": -436.67498779296875, "loss": 0.0201, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -0.92840576171875, "rewards/margins": 7.759765625, "rewards/rejected": -8.6953125, "step": 3350 }, { "epoch": 2.9505820338238524, "grad_norm": 14.91712818619008, "learning_rate": 2.6208260105448157e-07, "logits/chosen": -0.24808654189109802, "logits/rejected": -0.5434967279434204, "logps/chosen": -397.8500061035156, "logps/rejected": -451.82501220703125, "loss": 0.0075, "rewards/accuracies": 1.0, "rewards/chosen": -1.2307007312774658, "rewards/margins": 8.34765625, "rewards/rejected": -9.584375381469727, "step": 3360 }, { "epoch": 2.9593674500329454, "grad_norm": 4.484676563407132, "learning_rate": 2.598857644991213e-07, "logits/chosen": -0.365457147359848, "logits/rejected": -0.6591552495956421, "logps/chosen": -383.70001220703125, "logps/rejected": -450.95001220703125, "loss": 0.011, "rewards/accuracies": 1.0, "rewards/chosen": -1.1678345203399658, "rewards/margins": 8.0859375, "rewards/rejected": -9.254687309265137, "step": 3370 }, { "epoch": 2.968152866242038, "grad_norm": 8.155666020611571, "learning_rate": 2.57688927943761e-07, "logits/chosen": -0.18227538466453552, "logits/rejected": -0.4980102479457855, "logps/chosen": -423.54998779296875, "logps/rejected": -486.29998779296875, "loss": 0.0121, "rewards/accuracies": 1.0, "rewards/chosen": -1.3596985340118408, "rewards/margins": 8.5234375, "rewards/rejected": -9.87890625, "step": 3380 }, { "epoch": 2.976938282451131, "grad_norm": 8.838987667930958, "learning_rate": 2.5549209138840064e-07, "logits/chosen": -0.4617919921875, "logits/rejected": -0.5541747808456421, "logps/chosen": -376.7749938964844, "logps/rejected": -432.45001220703125, "loss": 0.0202, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.627783179283142, "rewards/margins": 7.901562690734863, "rewards/rejected": -9.53515625, "step": 3390 }, { "epoch": 2.985723698660224, "grad_norm": 2.139383751276774, "learning_rate": 2.5329525483304045e-07, "logits/chosen": -0.2857299745082855, "logits/rejected": -0.649920642375946, "logps/chosen": -448.2749938964844, "logps/rejected": -467.6000061035156, "loss": 0.0092, "rewards/accuracies": 1.0, "rewards/chosen": -1.5390136241912842, "rewards/margins": 7.939843654632568, "rewards/rejected": -9.48046875, "step": 3400 }, { "epoch": 2.994509114869317, "grad_norm": 2.42699239323414, "learning_rate": 2.5109841827768016e-07, "logits/chosen": -0.265716552734375, "logits/rejected": -0.454833984375, "logps/chosen": -371.92498779296875, "logps/rejected": -458.6499938964844, "loss": 0.0181, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -0.6144653558731079, "rewards/margins": 7.891797065734863, "rewards/rejected": -8.509374618530273, "step": 3410 }, { "epoch": 3.002635624862728, "grad_norm": 0.4947469630952311, "learning_rate": 2.489015817223198e-07, "logits/chosen": -0.2870301902294159, "logits/rejected": -0.6013777256011963, "logps/chosen": -352.9662170410156, "logps/rejected": -437.0, "loss": 0.0052, "rewards/accuracies": 1.0, "rewards/chosen": -0.8606748580932617, "rewards/margins": 8.769425392150879, "rewards/rejected": -9.629222869873047, "step": 3420 }, { "epoch": 3.011421041071821, "grad_norm": 1.3304741598029002, "learning_rate": 2.467047451669596e-07, "logits/chosen": -0.20772704482078552, "logits/rejected": -0.45839232206344604, "logps/chosen": -400.6499938964844, "logps/rejected": -415.1000061035156, "loss": 0.0057, "rewards/accuracies": 1.0, "rewards/chosen": -0.2857666015625, "rewards/margins": 8.451562881469727, "rewards/rejected": -8.745312690734863, "step": 3430 }, { "epoch": 3.0202064572809135, "grad_norm": 2.9464292753790065, "learning_rate": 2.445079086115993e-07, "logits/chosen": -0.45708006620407104, "logits/rejected": -0.6176086664199829, "logps/chosen": -369.0625, "logps/rejected": -455.3500061035156, "loss": 0.0016, "rewards/accuracies": 1.0, "rewards/chosen": -1.0432860851287842, "rewards/margins": 9.317968368530273, "rewards/rejected": -10.361719131469727, "step": 3440 }, { "epoch": 3.0289918734900065, "grad_norm": 1.1792833054259277, "learning_rate": 2.42311072056239e-07, "logits/chosen": -0.4381347596645355, "logits/rejected": -0.68011474609375, "logps/chosen": -338.20001220703125, "logps/rejected": -424.17498779296875, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": -1.2186768054962158, "rewards/margins": 8.736719131469727, "rewards/rejected": -9.959375381469727, "step": 3450 }, { "epoch": 3.0377772896990995, "grad_norm": 0.8239122702155047, "learning_rate": 2.401142355008787e-07, "logits/chosen": -0.47568970918655396, "logits/rejected": -0.6011413335800171, "logps/chosen": -376.86248779296875, "logps/rejected": -451.375, "loss": 0.0035, "rewards/accuracies": 1.0, "rewards/chosen": -1.6725342273712158, "rewards/margins": 9.478906631469727, "rewards/rejected": -11.158594131469727, "step": 3460 }, { "epoch": 3.0465627059081926, "grad_norm": 1.1633404107280914, "learning_rate": 2.3791739894551843e-07, "logits/chosen": -0.4798339903354645, "logits/rejected": -0.688586413860321, "logps/chosen": -413.0874938964844, "logps/rejected": -489.7250061035156, "loss": 0.0018, "rewards/accuracies": 1.0, "rewards/chosen": -1.0950638055801392, "rewards/margins": 9.354687690734863, "rewards/rejected": -10.44140625, "step": 3470 }, { "epoch": 3.055348122117285, "grad_norm": 1.726598799228875, "learning_rate": 2.3572056239015817e-07, "logits/chosen": -0.374267578125, "logits/rejected": -0.594683825969696, "logps/chosen": -366.25, "logps/rejected": -465.54998779296875, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": -1.5639159679412842, "rewards/margins": 9.232812881469727, "rewards/rejected": -10.794530868530273, "step": 3480 }, { "epoch": 3.064133538326378, "grad_norm": 0.9398204643206183, "learning_rate": 2.3352372583479788e-07, "logits/chosen": -0.6318725347518921, "logits/rejected": -0.7012389898300171, "logps/chosen": -402.57501220703125, "logps/rejected": -472.32501220703125, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": -1.400537133216858, "rewards/margins": 9.252344131469727, "rewards/rejected": -10.660937309265137, "step": 3490 }, { "epoch": 3.072918954535471, "grad_norm": 0.7594445394636874, "learning_rate": 2.313268892794376e-07, "logits/chosen": -0.3261657655239105, "logits/rejected": -0.5736023187637329, "logps/chosen": -355.07501220703125, "logps/rejected": -428.42498779296875, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": -0.838854968547821, "rewards/margins": 8.672656059265137, "rewards/rejected": -9.510937690734863, "step": 3500 }, { "epoch": 3.081704370744564, "grad_norm": 0.8927135674140892, "learning_rate": 2.2913005272407732e-07, "logits/chosen": -0.35134583711624146, "logits/rejected": -0.6823486089706421, "logps/chosen": -397.4125061035156, "logps/rejected": -440.8500061035156, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -1.071997046470642, "rewards/margins": 9.17578125, "rewards/rejected": -10.239843368530273, "step": 3510 }, { "epoch": 3.0904897869536567, "grad_norm": 0.5578449895128904, "learning_rate": 2.2693321616871705e-07, "logits/chosen": -0.3760009706020355, "logits/rejected": -0.62835693359375, "logps/chosen": -358.375, "logps/rejected": -405.3999938964844, "loss": 0.0017, "rewards/accuracies": 1.0, "rewards/chosen": -1.1746094226837158, "rewards/margins": 9.329687118530273, "rewards/rejected": -10.508593559265137, "step": 3520 }, { "epoch": 3.0992752031627497, "grad_norm": 0.3990353622353858, "learning_rate": 2.2473637961335676e-07, "logits/chosen": -0.38876646757125854, "logits/rejected": -0.6612548828125, "logps/chosen": -382.6625061035156, "logps/rejected": -440.4750061035156, "loss": 0.0033, "rewards/accuracies": 1.0, "rewards/chosen": -1.1665465831756592, "rewards/margins": 9.272656440734863, "rewards/rejected": -10.4375, "step": 3530 }, { "epoch": 3.1080606193718427, "grad_norm": 6.557858984363339, "learning_rate": 2.2253954305799647e-07, "logits/chosen": -0.376251220703125, "logits/rejected": -0.665087878704071, "logps/chosen": -380.20001220703125, "logps/rejected": -438.79998779296875, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": -1.4282958507537842, "rewards/margins": 8.997655868530273, "rewards/rejected": -10.428906440734863, "step": 3540 }, { "epoch": 3.1168460355809358, "grad_norm": 2.08803655711173, "learning_rate": 2.203427065026362e-07, "logits/chosen": -0.46757811307907104, "logits/rejected": -0.701428234577179, "logps/chosen": -419.92498779296875, "logps/rejected": -478.75, "loss": 0.0018, "rewards/accuracies": 1.0, "rewards/chosen": -2.0525879859924316, "rewards/margins": 9.921875, "rewards/rejected": -11.970312118530273, "step": 3550 }, { "epoch": 3.1256314517900288, "grad_norm": 0.47511287212627473, "learning_rate": 2.181458699472759e-07, "logits/chosen": -0.36270445585250854, "logits/rejected": -0.5930420160293579, "logps/chosen": -445.6499938964844, "logps/rejected": -469.57501220703125, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -1.3514893054962158, "rewards/margins": 9.09765625, "rewards/rejected": -10.453906059265137, "step": 3560 }, { "epoch": 3.1344168679991213, "grad_norm": 0.8334521919899409, "learning_rate": 2.1594903339191564e-07, "logits/chosen": -0.2652831971645355, "logits/rejected": -0.5739990472793579, "logps/chosen": -377.5874938964844, "logps/rejected": -443.2749938964844, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": -1.301611304283142, "rewards/margins": 9.410937309265137, "rewards/rejected": -10.712499618530273, "step": 3570 }, { "epoch": 3.1432022842082143, "grad_norm": 1.6860533220668106, "learning_rate": 2.1375219683655535e-07, "logits/chosen": -0.3873123228549957, "logits/rejected": -0.65838623046875, "logps/chosen": -382.45001220703125, "logps/rejected": -436.2749938964844, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": -1.6729247570037842, "rewards/margins": 9.532812118530273, "rewards/rejected": -11.20703125, "step": 3580 }, { "epoch": 3.1519877004173074, "grad_norm": 0.8647687057540114, "learning_rate": 2.1155536028119509e-07, "logits/chosen": -0.35127562284469604, "logits/rejected": -0.789868175983429, "logps/chosen": -374.3500061035156, "logps/rejected": -430.8999938964844, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": -1.664575219154358, "rewards/margins": 9.907031059265137, "rewards/rejected": -11.568750381469727, "step": 3590 }, { "epoch": 3.1607731166264004, "grad_norm": 1.7219813814831317, "learning_rate": 2.093585237258348e-07, "logits/chosen": -0.5527588129043579, "logits/rejected": -0.6464599370956421, "logps/chosen": -362.8500061035156, "logps/rejected": -463.92498779296875, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": -2.5403809547424316, "rewards/margins": 9.689062118530273, "rewards/rejected": -12.229687690734863, "step": 3600 }, { "epoch": 3.169558532835493, "grad_norm": 0.35364168707551513, "learning_rate": 2.071616871704745e-07, "logits/chosen": -0.5586181879043579, "logits/rejected": -0.726306140422821, "logps/chosen": -384.45001220703125, "logps/rejected": -436.25, "loss": 0.0017, "rewards/accuracies": 1.0, "rewards/chosen": -1.7664062976837158, "rewards/margins": 9.470312118530273, "rewards/rejected": -11.232030868530273, "step": 3610 }, { "epoch": 3.178343949044586, "grad_norm": 1.62394984928576, "learning_rate": 2.0496485061511424e-07, "logits/chosen": -0.577832043170929, "logits/rejected": -0.700610339641571, "logps/chosen": -334.7250061035156, "logps/rejected": -382.1499938964844, "loss": 0.002, "rewards/accuracies": 1.0, "rewards/chosen": -1.6718261241912842, "rewards/margins": 9.08984375, "rewards/rejected": -10.766406059265137, "step": 3620 }, { "epoch": 3.187129365253679, "grad_norm": 0.8177416711583946, "learning_rate": 2.0276801405975394e-07, "logits/chosen": -0.43592530488967896, "logits/rejected": -0.6963958740234375, "logps/chosen": -368.4125061035156, "logps/rejected": -457.57501220703125, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": -1.6544678211212158, "rewards/margins": 8.979687690734863, "rewards/rejected": -10.638280868530273, "step": 3630 }, { "epoch": 3.195914781462772, "grad_norm": 0.4813885442017244, "learning_rate": 2.0057117750439368e-07, "logits/chosen": -0.36067503690719604, "logits/rejected": -0.6654113531112671, "logps/chosen": -398.7250061035156, "logps/rejected": -442.79998779296875, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": -1.419189453125, "rewards/margins": 9.517969131469727, "rewards/rejected": -10.939062118530273, "step": 3640 }, { "epoch": 3.2047001976718645, "grad_norm": 0.4880717439749104, "learning_rate": 1.9837434094903339e-07, "logits/chosen": -0.29167479276657104, "logits/rejected": -0.5412842035293579, "logps/chosen": -401.9750061035156, "logps/rejected": -441.75, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -2.0406250953674316, "rewards/margins": 9.483593940734863, "rewards/rejected": -11.521875381469727, "step": 3650 }, { "epoch": 3.2134856138809575, "grad_norm": 0.5757448906140887, "learning_rate": 1.9617750439367312e-07, "logits/chosen": -0.334463506937027, "logits/rejected": -0.6652466058731079, "logps/chosen": -386.8374938964844, "logps/rejected": -405.625, "loss": 0.0025, "rewards/accuracies": 1.0, "rewards/chosen": -1.3944091796875, "rewards/margins": 9.173437118530273, "rewards/rejected": -10.5703125, "step": 3660 }, { "epoch": 3.2222710300900506, "grad_norm": 0.2691037071247476, "learning_rate": 1.9398066783831283e-07, "logits/chosen": -0.2171630859375, "logits/rejected": -0.652270495891571, "logps/chosen": -406.9125061035156, "logps/rejected": -472.67498779296875, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -1.6406981945037842, "rewards/margins": 9.78125, "rewards/rejected": -11.415624618530273, "step": 3670 }, { "epoch": 3.2310564462991436, "grad_norm": 5.1940979597293495, "learning_rate": 1.9178383128295253e-07, "logits/chosen": -0.5974487066268921, "logits/rejected": -0.8372131586074829, "logps/chosen": -403.70001220703125, "logps/rejected": -491.25, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": -2.164501905441284, "rewards/margins": 10.0078125, "rewards/rejected": -12.177343368530273, "step": 3680 }, { "epoch": 3.239841862508236, "grad_norm": 3.474538694377807, "learning_rate": 1.8958699472759227e-07, "logits/chosen": -0.3102661073207855, "logits/rejected": -0.681323230266571, "logps/chosen": -390.17498779296875, "logps/rejected": -434.6499938964844, "loss": 0.0029, "rewards/accuracies": 1.0, "rewards/chosen": -1.817480444908142, "rewards/margins": 9.426562309265137, "rewards/rejected": -11.245312690734863, "step": 3690 }, { "epoch": 3.248627278717329, "grad_norm": 0.8782079381916751, "learning_rate": 1.8739015817223198e-07, "logits/chosen": -0.35603028535842896, "logits/rejected": -0.526501476764679, "logps/chosen": -390.3125, "logps/rejected": -464.29998779296875, "loss": 0.0018, "rewards/accuracies": 1.0, "rewards/chosen": -1.8155395984649658, "rewards/margins": 10.159375190734863, "rewards/rejected": -11.978124618530273, "step": 3700 }, { "epoch": 3.257412694926422, "grad_norm": 0.35232540584125555, "learning_rate": 1.851933216168717e-07, "logits/chosen": -0.516693115234375, "logits/rejected": -0.677764892578125, "logps/chosen": -392.63751220703125, "logps/rejected": -454.9750061035156, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": -1.6456787586212158, "rewards/margins": 9.846094131469727, "rewards/rejected": -11.5, "step": 3710 }, { "epoch": 3.266198111135515, "grad_norm": 0.39107479037002285, "learning_rate": 1.8299648506151142e-07, "logits/chosen": -0.33820801973342896, "logits/rejected": -0.6116393804550171, "logps/chosen": -451.79998779296875, "logps/rejected": -480.04998779296875, "loss": 0.0021, "rewards/accuracies": 1.0, "rewards/chosen": -1.982519507408142, "rewards/margins": 9.417187690734863, "rewards/rejected": -11.396875381469727, "step": 3720 }, { "epoch": 3.2749835273446077, "grad_norm": 0.3616296466649269, "learning_rate": 1.8079964850615115e-07, "logits/chosen": -0.508471667766571, "logits/rejected": -0.6277710199356079, "logps/chosen": -353.79998779296875, "logps/rejected": -413.95001220703125, "loss": 0.0016, "rewards/accuracies": 1.0, "rewards/chosen": -1.577062964439392, "rewards/margins": 9.564844131469727, "rewards/rejected": -11.142187118530273, "step": 3730 }, { "epoch": 3.2837689435537007, "grad_norm": 0.8208552322886763, "learning_rate": 1.7860281195079086e-07, "logits/chosen": -0.28206175565719604, "logits/rejected": -0.70770263671875, "logps/chosen": -394.88751220703125, "logps/rejected": -414.25, "loss": 0.0016, "rewards/accuracies": 1.0, "rewards/chosen": -1.445184350013733, "rewards/margins": 9.489062309265137, "rewards/rejected": -10.936718940734863, "step": 3740 }, { "epoch": 3.2925543597627938, "grad_norm": 1.1461539410506165, "learning_rate": 1.7640597539543057e-07, "logits/chosen": -0.488067626953125, "logits/rejected": -0.650805652141571, "logps/chosen": -341.23748779296875, "logps/rejected": -410.0249938964844, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": -1.52178955078125, "rewards/margins": 9.262499809265137, "rewards/rejected": -10.792187690734863, "step": 3750 }, { "epoch": 3.3013397759718868, "grad_norm": 0.6825877148164754, "learning_rate": 1.742091388400703e-07, "logits/chosen": -0.510241687297821, "logits/rejected": -0.622222900390625, "logps/chosen": -369.98748779296875, "logps/rejected": -439.75, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -1.6619873046875, "rewards/margins": 9.591405868530273, "rewards/rejected": -11.251562118530273, "step": 3760 }, { "epoch": 3.31012519218098, "grad_norm": 0.16879293098110762, "learning_rate": 1.7201230228471e-07, "logits/chosen": -0.49965208768844604, "logits/rejected": -0.9062744379043579, "logps/chosen": -383.0375061035156, "logps/rejected": -458.17498779296875, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": -1.781945824623108, "rewards/margins": 9.935155868530273, "rewards/rejected": -11.721094131469727, "step": 3770 }, { "epoch": 3.3189106083900723, "grad_norm": 2.261722951981144, "learning_rate": 1.6981546572934974e-07, "logits/chosen": -0.5705932378768921, "logits/rejected": -0.6486450433731079, "logps/chosen": -416.0249938964844, "logps/rejected": -494.4750061035156, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -2.270703077316284, "rewards/margins": 9.532031059265137, "rewards/rejected": -11.801562309265137, "step": 3780 }, { "epoch": 3.3276960245991654, "grad_norm": 0.6180882544324542, "learning_rate": 1.6761862917398945e-07, "logits/chosen": -0.32520753145217896, "logits/rejected": -0.679186999797821, "logps/chosen": -372.7124938964844, "logps/rejected": -412.7250061035156, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": -1.635833740234375, "rewards/margins": 9.584375381469727, "rewards/rejected": -11.217187881469727, "step": 3790 }, { "epoch": 3.3364814408082584, "grad_norm": 1.3994591967074144, "learning_rate": 1.6542179261862919e-07, "logits/chosen": -0.4407409727573395, "logits/rejected": -0.732226550579071, "logps/chosen": -393.1499938964844, "logps/rejected": -481.1499938964844, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -1.502539038658142, "rewards/margins": 9.735156059265137, "rewards/rejected": -11.237500190734863, "step": 3800 }, { "epoch": 3.3452668570173514, "grad_norm": 2.023624705456325, "learning_rate": 1.632249560632689e-07, "logits/chosen": -0.44676512479782104, "logits/rejected": -0.7496703863143921, "logps/chosen": -464.6499938964844, "logps/rejected": -508.17498779296875, "loss": 0.0017, "rewards/accuracies": 1.0, "rewards/chosen": -2.051257371902466, "rewards/margins": 9.568750381469727, "rewards/rejected": -11.6171875, "step": 3810 }, { "epoch": 3.354052273226444, "grad_norm": 0.08575482687653603, "learning_rate": 1.610281195079086e-07, "logits/chosen": -0.5762695074081421, "logits/rejected": -0.8279510736465454, "logps/chosen": -404.0249938964844, "logps/rejected": -450.0, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -2.435253858566284, "rewards/margins": 9.931249618530273, "rewards/rejected": -12.373437881469727, "step": 3820 }, { "epoch": 3.362837689435537, "grad_norm": 0.2999139407798578, "learning_rate": 1.5883128295254834e-07, "logits/chosen": -0.39515382051467896, "logits/rejected": -0.627368152141571, "logps/chosen": -430.6875, "logps/rejected": -499.20001220703125, "loss": 0.0017, "rewards/accuracies": 1.0, "rewards/chosen": -2.030566453933716, "rewards/margins": 9.704687118530273, "rewards/rejected": -11.7265625, "step": 3830 }, { "epoch": 3.37162310564463, "grad_norm": 1.06571266775449, "learning_rate": 1.5663444639718804e-07, "logits/chosen": -0.4414306581020355, "logits/rejected": -0.729962170124054, "logps/chosen": -392.8125, "logps/rejected": -451.125, "loss": 0.0017, "rewards/accuracies": 1.0, "rewards/chosen": -1.932763695716858, "rewards/margins": 10.082812309265137, "rewards/rejected": -12.009374618530273, "step": 3840 }, { "epoch": 3.380408521853723, "grad_norm": 0.5354377653885222, "learning_rate": 1.5443760984182778e-07, "logits/chosen": -0.3774780333042145, "logits/rejected": -0.77630615234375, "logps/chosen": -373.70001220703125, "logps/rejected": -427.125, "loss": 0.0024, "rewards/accuracies": 1.0, "rewards/chosen": -1.8197753429412842, "rewards/margins": 9.643750190734863, "rewards/rejected": -11.467968940734863, "step": 3850 }, { "epoch": 3.3891939380628155, "grad_norm": 0.21665117151484717, "learning_rate": 1.5224077328646749e-07, "logits/chosen": -0.40594482421875, "logits/rejected": -0.775646984577179, "logps/chosen": -415.57501220703125, "logps/rejected": -460.2250061035156, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": -1.7807128429412842, "rewards/margins": 9.829687118530273, "rewards/rejected": -11.610156059265137, "step": 3860 }, { "epoch": 3.3979793542719086, "grad_norm": 0.7396155423143176, "learning_rate": 1.5004393673110722e-07, "logits/chosen": -0.4874725341796875, "logits/rejected": -0.6720825433731079, "logps/chosen": -378.36248779296875, "logps/rejected": -490.1000061035156, "loss": 0.0017, "rewards/accuracies": 1.0, "rewards/chosen": -1.946752905845642, "rewards/margins": 9.786718368530273, "rewards/rejected": -11.737500190734863, "step": 3870 }, { "epoch": 3.4067647704810016, "grad_norm": 0.5351471496894081, "learning_rate": 1.4784710017574693e-07, "logits/chosen": -0.3068481385707855, "logits/rejected": -0.7298339605331421, "logps/chosen": -414.7250061035156, "logps/rejected": -456.29998779296875, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -1.6942260265350342, "rewards/margins": 10.064062118530273, "rewards/rejected": -11.762499809265137, "step": 3880 }, { "epoch": 3.4155501866900946, "grad_norm": 5.2090719368594804, "learning_rate": 1.4565026362038664e-07, "logits/chosen": -0.33463746309280396, "logits/rejected": -0.652172863483429, "logps/chosen": -400.1499938964844, "logps/rejected": -440.95001220703125, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -1.5683501958847046, "rewards/margins": 9.62109375, "rewards/rejected": -11.185937881469727, "step": 3890 }, { "epoch": 3.4243356028991876, "grad_norm": 0.15544361251379685, "learning_rate": 1.4345342706502637e-07, "logits/chosen": -0.5911010503768921, "logits/rejected": -0.8438965082168579, "logps/chosen": -428.2124938964844, "logps/rejected": -478.04998779296875, "loss": 0.0055, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.727178931236267, "rewards/margins": 9.809374809265137, "rewards/rejected": -11.5390625, "step": 3900 }, { "epoch": 3.43312101910828, "grad_norm": 0.48268886336364514, "learning_rate": 1.4125659050966608e-07, "logits/chosen": -0.5591675043106079, "logits/rejected": -0.7767730951309204, "logps/chosen": -418.07501220703125, "logps/rejected": -487.75, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": -1.8233153820037842, "rewards/margins": 9.362500190734863, "rewards/rejected": -11.184374809265137, "step": 3910 }, { "epoch": 3.441906435317373, "grad_norm": 0.5385918303839256, "learning_rate": 1.390597539543058e-07, "logits/chosen": -0.44647216796875, "logits/rejected": -0.790515124797821, "logps/chosen": -364.7124938964844, "logps/rejected": -475.45001220703125, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -1.9438965320587158, "rewards/margins": 10.130468368530273, "rewards/rejected": -12.081250190734863, "step": 3920 }, { "epoch": 3.450691851526466, "grad_norm": 1.7093345940665992, "learning_rate": 1.3686291739894552e-07, "logits/chosen": -0.527984619140625, "logits/rejected": -0.712127685546875, "logps/chosen": -400.86248779296875, "logps/rejected": -462.32501220703125, "loss": 0.0042, "rewards/accuracies": 1.0, "rewards/chosen": -1.958581566810608, "rewards/margins": 9.407812118530273, "rewards/rejected": -11.374218940734863, "step": 3930 }, { "epoch": 3.4594772677355587, "grad_norm": 3.1704913476091554, "learning_rate": 1.3466608084358525e-07, "logits/chosen": -0.40324705839157104, "logits/rejected": -0.7781738042831421, "logps/chosen": -345.125, "logps/rejected": -440.9750061035156, "loss": 0.002, "rewards/accuracies": 1.0, "rewards/chosen": -2.4681639671325684, "rewards/margins": 9.34765625, "rewards/rejected": -11.815625190734863, "step": 3940 }, { "epoch": 3.4682626839446518, "grad_norm": 0.563928663717626, "learning_rate": 1.3246924428822496e-07, "logits/chosen": -0.51605224609375, "logits/rejected": -0.7085601687431335, "logps/chosen": -401.0375061035156, "logps/rejected": -472.0, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -2.4456849098205566, "rewards/margins": 9.814062118530273, "rewards/rejected": -12.2578125, "step": 3950 }, { "epoch": 3.4770481001537448, "grad_norm": 0.5358554386138321, "learning_rate": 1.3027240773286467e-07, "logits/chosen": -0.5149902105331421, "logits/rejected": -0.6937255859375, "logps/chosen": -416.6625061035156, "logps/rejected": -468.07501220703125, "loss": 0.002, "rewards/accuracies": 1.0, "rewards/chosen": -2.0930418968200684, "rewards/margins": 9.767187118530273, "rewards/rejected": -11.856249809265137, "step": 3960 }, { "epoch": 3.485833516362838, "grad_norm": 2.3903052829135585, "learning_rate": 1.280755711775044e-07, "logits/chosen": -0.48883056640625, "logits/rejected": -0.729937732219696, "logps/chosen": -422.45001220703125, "logps/rejected": -459.20001220703125, "loss": 0.0021, "rewards/accuracies": 1.0, "rewards/chosen": -2.1565794944763184, "rewards/margins": 9.399999618530273, "rewards/rejected": -11.560937881469727, "step": 3970 }, { "epoch": 3.494618932571931, "grad_norm": 0.2765786493166369, "learning_rate": 1.258787346221441e-07, "logits/chosen": -0.698718249797821, "logits/rejected": -0.997509777545929, "logps/chosen": -370.51251220703125, "logps/rejected": -450.125, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": -1.7210204601287842, "rewards/margins": 10.071874618530273, "rewards/rejected": -11.792187690734863, "step": 3980 }, { "epoch": 3.5034043487810234, "grad_norm": 0.5967830676139014, "learning_rate": 1.2368189806678382e-07, "logits/chosen": -0.4857544004917145, "logits/rejected": -0.83538818359375, "logps/chosen": -438.07501220703125, "logps/rejected": -473.17498779296875, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": -2.0298094749450684, "rewards/margins": 9.588281631469727, "rewards/rejected": -11.620312690734863, "step": 3990 }, { "epoch": 3.5121897649901164, "grad_norm": 1.1971559489128203, "learning_rate": 1.2148506151142355e-07, "logits/chosen": -0.512908935546875, "logits/rejected": -0.639361560344696, "logps/chosen": -363.48748779296875, "logps/rejected": -462.54998779296875, "loss": 0.0026, "rewards/accuracies": 1.0, "rewards/chosen": -1.71435546875, "rewards/margins": 9.603906631469727, "rewards/rejected": -11.321093559265137, "step": 4000 }, { "epoch": 3.5209751811992094, "grad_norm": 0.6704603694286425, "learning_rate": 1.1928822495606326e-07, "logits/chosen": -0.5065368413925171, "logits/rejected": -0.751416027545929, "logps/chosen": -397.54998779296875, "logps/rejected": -465.42498779296875, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": -1.6153686046600342, "rewards/margins": 9.25, "rewards/rejected": -10.866406440734863, "step": 4010 }, { "epoch": 3.529760597408302, "grad_norm": 0.4198687175492848, "learning_rate": 1.1709138840070298e-07, "logits/chosen": -0.528411865234375, "logits/rejected": -0.7674316167831421, "logps/chosen": -421.1000061035156, "logps/rejected": -458.07501220703125, "loss": 0.0019, "rewards/accuracies": 1.0, "rewards/chosen": -2.113903760910034, "rewards/margins": 9.496874809265137, "rewards/rejected": -11.608593940734863, "step": 4020 }, { "epoch": 3.538546013617395, "grad_norm": 1.7394583137755435, "learning_rate": 1.148945518453427e-07, "logits/chosen": -0.500927746295929, "logits/rejected": -0.629852294921875, "logps/chosen": -365.92498779296875, "logps/rejected": -433.1000061035156, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": -1.93585205078125, "rewards/margins": 9.44921875, "rewards/rejected": -11.382031440734863, "step": 4030 }, { "epoch": 3.547331429826488, "grad_norm": 0.11575267547187734, "learning_rate": 1.1269771528998242e-07, "logits/chosen": -0.52313232421875, "logits/rejected": -0.840405285358429, "logps/chosen": -339.51251220703125, "logps/rejected": -419.82501220703125, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": -1.5574219226837158, "rewards/margins": 9.489843368530273, "rewards/rejected": -11.041406631469727, "step": 4040 }, { "epoch": 3.556116846035581, "grad_norm": 1.5886647457049394, "learning_rate": 1.1050087873462213e-07, "logits/chosen": -0.5488525629043579, "logits/rejected": -0.703601062297821, "logps/chosen": -353.89373779296875, "logps/rejected": -489.1499938964844, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": -1.534875512123108, "rewards/margins": 9.764843940734863, "rewards/rejected": -11.293749809265137, "step": 4050 }, { "epoch": 3.564902262244674, "grad_norm": 0.3869110331009357, "learning_rate": 1.0830404217926185e-07, "logits/chosen": -0.5734313726425171, "logits/rejected": -0.737902820110321, "logps/chosen": -425.4125061035156, "logps/rejected": -477.6499938964844, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -1.925390601158142, "rewards/margins": 9.858593940734863, "rewards/rejected": -11.7890625, "step": 4060 }, { "epoch": 3.5736876784537666, "grad_norm": 0.8173195643919873, "learning_rate": 1.0610720562390157e-07, "logits/chosen": -0.4827209413051605, "logits/rejected": -0.6772400140762329, "logps/chosen": -362.23748779296875, "logps/rejected": -455.2250061035156, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -1.4961669445037842, "rewards/margins": 9.873437881469727, "rewards/rejected": -11.359375, "step": 4070 }, { "epoch": 3.5824730946628596, "grad_norm": 0.4827112693501916, "learning_rate": 1.039103690685413e-07, "logits/chosen": -0.39856261014938354, "logits/rejected": -0.678875744342804, "logps/chosen": -461.0249938964844, "logps/rejected": -518.4249877929688, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -1.9762694835662842, "rewards/margins": 9.96875, "rewards/rejected": -11.942968368530273, "step": 4080 }, { "epoch": 3.5912585108719526, "grad_norm": 0.5385261386651687, "learning_rate": 1.0171353251318102e-07, "logits/chosen": -0.602734386920929, "logits/rejected": -0.7258651852607727, "logps/chosen": -376.04998779296875, "logps/rejected": -474.8500061035156, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -1.9387603998184204, "rewards/margins": 9.32421875, "rewards/rejected": -11.265625, "step": 4090 }, { "epoch": 3.6000439270810456, "grad_norm": 0.46685903419503927, "learning_rate": 9.951669595782074e-08, "logits/chosen": -0.534252941608429, "logits/rejected": -0.7227112054824829, "logps/chosen": -374.1499938964844, "logps/rejected": -415.0, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -1.4334716796875, "rewards/margins": 9.466405868530273, "rewards/rejected": -10.900781631469727, "step": 4100 }, { "epoch": 3.6088293432901386, "grad_norm": 0.9441376220012491, "learning_rate": 9.731985940246046e-08, "logits/chosen": -0.5670531988143921, "logits/rejected": -0.720593273639679, "logps/chosen": -376.4624938964844, "logps/rejected": -477.8500061035156, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": -2.0525879859924316, "rewards/margins": 9.543749809265137, "rewards/rejected": -11.594531059265137, "step": 4110 }, { "epoch": 3.617614759499231, "grad_norm": 0.383436941128621, "learning_rate": 9.512302284710017e-08, "logits/chosen": -0.46836549043655396, "logits/rejected": -0.7594238519668579, "logps/chosen": -452.32501220703125, "logps/rejected": -460.42498779296875, "loss": 0.0017, "rewards/accuracies": 1.0, "rewards/chosen": -2.426464796066284, "rewards/margins": 9.385937690734863, "rewards/rejected": -11.813281059265137, "step": 4120 }, { "epoch": 3.626400175708324, "grad_norm": 0.4253321635842686, "learning_rate": 9.292618629173989e-08, "logits/chosen": -0.46489256620407104, "logits/rejected": -0.875073254108429, "logps/chosen": -391.5249938964844, "logps/rejected": -426.57501220703125, "loss": 0.0036, "rewards/accuracies": 1.0, "rewards/chosen": -2.005505323410034, "rewards/margins": 9.578125, "rewards/rejected": -11.584375381469727, "step": 4130 }, { "epoch": 3.635185591917417, "grad_norm": 2.668511714240837, "learning_rate": 9.072934973637961e-08, "logits/chosen": -0.54132080078125, "logits/rejected": -0.6517578363418579, "logps/chosen": -398.375, "logps/rejected": -458.125, "loss": 0.0016, "rewards/accuracies": 1.0, "rewards/chosen": -1.9595947265625, "rewards/margins": 9.314844131469727, "rewards/rejected": -11.274999618530273, "step": 4140 }, { "epoch": 3.6439710081265098, "grad_norm": 0.818435080322933, "learning_rate": 8.853251318101933e-08, "logits/chosen": -0.683209240436554, "logits/rejected": -0.7607421875, "logps/chosen": -370.625, "logps/rejected": -468.3500061035156, "loss": 0.0017, "rewards/accuracies": 1.0, "rewards/chosen": -1.7220947742462158, "rewards/margins": 9.79296875, "rewards/rejected": -11.518750190734863, "step": 4150 }, { "epoch": 3.6527564243356028, "grad_norm": 0.6167464115911807, "learning_rate": 8.633567662565905e-08, "logits/chosen": -0.573901355266571, "logits/rejected": -0.79345703125, "logps/chosen": -369.2124938964844, "logps/rejected": -457.125, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -1.923828125, "rewards/margins": 10.323437690734863, "rewards/rejected": -12.245312690734863, "step": 4160 }, { "epoch": 3.661541840544696, "grad_norm": 0.7192226468703802, "learning_rate": 8.413884007029877e-08, "logits/chosen": -0.5554443597793579, "logits/rejected": -0.728283703327179, "logps/chosen": -360.4750061035156, "logps/rejected": -420.8500061035156, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -1.9084594249725342, "rewards/margins": 9.564062118530273, "rewards/rejected": -11.471875190734863, "step": 4170 }, { "epoch": 3.670327256753789, "grad_norm": 0.5068067327136949, "learning_rate": 8.194200351493849e-08, "logits/chosen": -0.41857606172561646, "logits/rejected": -0.7850097417831421, "logps/chosen": -376.32501220703125, "logps/rejected": -432.20001220703125, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": -1.9494507312774658, "rewards/margins": 10.059374809265137, "rewards/rejected": -12.010156631469727, "step": 4180 }, { "epoch": 3.679112672962882, "grad_norm": 0.46701406787767585, "learning_rate": 7.97451669595782e-08, "logits/chosen": -0.533984363079071, "logits/rejected": -0.8119751214981079, "logps/chosen": -397.7250061035156, "logps/rejected": -441.1000061035156, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": -1.6925780773162842, "rewards/margins": 9.660937309265137, "rewards/rejected": -11.352343559265137, "step": 4190 }, { "epoch": 3.6878980891719744, "grad_norm": 1.566177299223276, "learning_rate": 7.754833040421792e-08, "logits/chosen": -0.5422302484512329, "logits/rejected": -0.696093738079071, "logps/chosen": -418.25, "logps/rejected": -451.95001220703125, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": -1.6950562000274658, "rewards/margins": 9.817968368530273, "rewards/rejected": -11.520312309265137, "step": 4200 }, { "epoch": 3.6966835053810674, "grad_norm": 4.847238611077766, "learning_rate": 7.535149384885764e-08, "logits/chosen": -0.4397949278354645, "logits/rejected": -0.6341003179550171, "logps/chosen": -400.7749938964844, "logps/rejected": -468.5249938964844, "loss": 0.0049, "rewards/accuracies": 1.0, "rewards/chosen": -1.628820776939392, "rewards/margins": 9.532812118530273, "rewards/rejected": -11.153124809265137, "step": 4210 }, { "epoch": 3.7054689215901604, "grad_norm": 0.5811511290233548, "learning_rate": 7.315465729349736e-08, "logits/chosen": -0.46912842988967896, "logits/rejected": -0.77001953125, "logps/chosen": -390.2250061035156, "logps/rejected": -460.6499938964844, "loss": 0.0018, "rewards/accuracies": 1.0, "rewards/chosen": -1.666528344154358, "rewards/margins": 9.456250190734863, "rewards/rejected": -11.122655868530273, "step": 4220 }, { "epoch": 3.7142543377992534, "grad_norm": 2.5322142943206285, "learning_rate": 7.095782073813708e-08, "logits/chosen": -0.5831054449081421, "logits/rejected": -0.665722668170929, "logps/chosen": -372.125, "logps/rejected": -447.6000061035156, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -1.8344848155975342, "rewards/margins": 9.364062309265137, "rewards/rejected": -11.19140625, "step": 4230 }, { "epoch": 3.7230397540083464, "grad_norm": 0.3782725509774744, "learning_rate": 6.87609841827768e-08, "logits/chosen": -0.47623902559280396, "logits/rejected": -0.64453125, "logps/chosen": -387.7250061035156, "logps/rejected": -443.20001220703125, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -1.2542235851287842, "rewards/margins": 9.625781059265137, "rewards/rejected": -10.879687309265137, "step": 4240 }, { "epoch": 3.731825170217439, "grad_norm": 1.0086915550694853, "learning_rate": 6.656414762741652e-08, "logits/chosen": -0.433135986328125, "logits/rejected": -0.610699474811554, "logps/chosen": -375.9375, "logps/rejected": -434.25, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -1.523779273033142, "rewards/margins": 9.271093368530273, "rewards/rejected": -10.790624618530273, "step": 4250 }, { "epoch": 3.740610586426532, "grad_norm": 0.67138111704898, "learning_rate": 6.436731107205623e-08, "logits/chosen": -0.38511353731155396, "logits/rejected": -0.8965514898300171, "logps/chosen": -439.6000061035156, "logps/rejected": -422.1000061035156, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": -1.602441430091858, "rewards/margins": 9.758593559265137, "rewards/rejected": -11.359375, "step": 4260 }, { "epoch": 3.749396002635625, "grad_norm": 1.048396790595168, "learning_rate": 6.217047451669595e-08, "logits/chosen": -0.6033935546875, "logits/rejected": -0.645825207233429, "logps/chosen": -377.8999938964844, "logps/rejected": -466.1000061035156, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -1.608300805091858, "rewards/margins": 9.233593940734863, "rewards/rejected": -10.83984375, "step": 4270 }, { "epoch": 3.7581814188447176, "grad_norm": 1.509693647988286, "learning_rate": 5.997363796133567e-08, "logits/chosen": -0.37947386503219604, "logits/rejected": -0.787353515625, "logps/chosen": -371.1000061035156, "logps/rejected": -433.0, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": -1.664953589439392, "rewards/margins": 9.435937881469727, "rewards/rejected": -11.095312118530273, "step": 4280 }, { "epoch": 3.7669668350538106, "grad_norm": 0.6932023382609098, "learning_rate": 5.7776801405975395e-08, "logits/chosen": -0.6350342035293579, "logits/rejected": -0.814624011516571, "logps/chosen": -350.88751220703125, "logps/rejected": -435.7250061035156, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": -1.7236328125, "rewards/margins": 9.560155868530273, "rewards/rejected": -11.296093940734863, "step": 4290 }, { "epoch": 3.7757522512629036, "grad_norm": 0.500700794661723, "learning_rate": 5.5579964850615116e-08, "logits/chosen": -0.46148681640625, "logits/rejected": -0.784130871295929, "logps/chosen": -429.7749938964844, "logps/rejected": -488.95001220703125, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": -1.6139037609100342, "rewards/margins": 9.853906631469727, "rewards/rejected": -11.46484375, "step": 4300 }, { "epoch": 3.7845376674719966, "grad_norm": 0.4772887973441252, "learning_rate": 5.338312829525484e-08, "logits/chosen": -0.36472779512405396, "logits/rejected": -0.6630462408065796, "logps/chosen": -413.3374938964844, "logps/rejected": -435.70001220703125, "loss": 0.0024, "rewards/accuracies": 1.0, "rewards/chosen": -2.0349974632263184, "rewards/margins": 9.146093368530273, "rewards/rejected": -11.175000190734863, "step": 4310 }, { "epoch": 3.7933230836810896, "grad_norm": 0.27449896209489894, "learning_rate": 5.1186291739894545e-08, "logits/chosen": -0.47600096464157104, "logits/rejected": -0.6543945074081421, "logps/chosen": -386.82501220703125, "logps/rejected": -472.8999938964844, "loss": 0.0018, "rewards/accuracies": 1.0, "rewards/chosen": -1.806396484375, "rewards/margins": 9.934374809265137, "rewards/rejected": -11.750781059265137, "step": 4320 }, { "epoch": 3.802108499890182, "grad_norm": 0.4183747274530641, "learning_rate": 4.8989455184534266e-08, "logits/chosen": -0.493408203125, "logits/rejected": -0.6705688238143921, "logps/chosen": -415.17498779296875, "logps/rejected": -463.7250061035156, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": -1.5869872570037842, "rewards/margins": 9.462499618530273, "rewards/rejected": -11.047656059265137, "step": 4330 }, { "epoch": 3.810893916099275, "grad_norm": 0.9654058148138566, "learning_rate": 4.679261862917399e-08, "logits/chosen": -0.59295654296875, "logits/rejected": -0.8846191167831421, "logps/chosen": -422.32501220703125, "logps/rejected": -475.375, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -1.7505066394805908, "rewards/margins": 9.372655868530273, "rewards/rejected": -11.123437881469727, "step": 4340 }, { "epoch": 3.819679332308368, "grad_norm": 2.630871178275919, "learning_rate": 4.45957820738137e-08, "logits/chosen": -0.467459112405777, "logits/rejected": -0.756744384765625, "logps/chosen": -415.92498779296875, "logps/rejected": -472.29998779296875, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -1.6919434070587158, "rewards/margins": 9.707812309265137, "rewards/rejected": -11.391406059265137, "step": 4350 }, { "epoch": 3.8284647485174608, "grad_norm": 1.604078026177294, "learning_rate": 4.239894551845342e-08, "logits/chosen": -0.5298385620117188, "logits/rejected": -0.9034423828125, "logps/chosen": -434.625, "logps/rejected": -474.625, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -1.7743408679962158, "rewards/margins": 9.821093559265137, "rewards/rejected": -11.594531059265137, "step": 4360 }, { "epoch": 3.837250164726554, "grad_norm": 4.905965543090113, "learning_rate": 4.020210896309314e-08, "logits/chosen": -0.3059326112270355, "logits/rejected": -0.7429748773574829, "logps/chosen": -396.17498779296875, "logps/rejected": -480.3999938964844, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": -2.07861328125, "rewards/margins": 9.915624618530273, "rewards/rejected": -11.989062309265137, "step": 4370 }, { "epoch": 3.846035580935647, "grad_norm": 7.1160418097985625, "learning_rate": 3.8005272407732864e-08, "logits/chosen": -0.626232922077179, "logits/rejected": -0.890899658203125, "logps/chosen": -391.45001220703125, "logps/rejected": -447.125, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": -1.7014892101287842, "rewards/margins": 9.244531631469727, "rewards/rejected": -10.936718940734863, "step": 4380 }, { "epoch": 3.85482099714474, "grad_norm": 0.437797831781398, "learning_rate": 3.580843585237258e-08, "logits/chosen": -0.47617799043655396, "logits/rejected": -0.6903076171875, "logps/chosen": -394.4624938964844, "logps/rejected": -454.75, "loss": 0.0053, "rewards/accuracies": 1.0, "rewards/chosen": -1.610498070716858, "rewards/margins": 9.271875381469727, "rewards/rejected": -10.885937690734863, "step": 4390 }, { "epoch": 3.863606413353833, "grad_norm": 1.0586346805687972, "learning_rate": 3.36115992970123e-08, "logits/chosen": -0.5580108761787415, "logits/rejected": -0.7952514886856079, "logps/chosen": -376.29998779296875, "logps/rejected": -462.07501220703125, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -1.920068383216858, "rewards/margins": 9.302343368530273, "rewards/rejected": -11.216405868530273, "step": 4400 }, { "epoch": 3.8723918295629254, "grad_norm": 0.7719931703125223, "learning_rate": 3.141476274165202e-08, "logits/chosen": -0.544689953327179, "logits/rejected": -0.8418334722518921, "logps/chosen": -411.5625, "logps/rejected": -463.20001220703125, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": -1.7377440929412842, "rewards/margins": 9.29296875, "rewards/rejected": -11.029687881469727, "step": 4410 }, { "epoch": 3.8811772457720184, "grad_norm": 1.1800826782412661, "learning_rate": 2.9217926186291738e-08, "logits/chosen": -0.4901367127895355, "logits/rejected": -0.8205627202987671, "logps/chosen": -420.51251220703125, "logps/rejected": -490.5, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -1.6544678211212158, "rewards/margins": 10.008593559265137, "rewards/rejected": -11.6640625, "step": 4420 }, { "epoch": 3.8899626619811114, "grad_norm": 0.18235364265211731, "learning_rate": 2.7021089630931456e-08, "logits/chosen": -0.5367339849472046, "logits/rejected": -0.7855224609375, "logps/chosen": -376.92498779296875, "logps/rejected": -424.3500061035156, "loss": 0.002, "rewards/accuracies": 1.0, "rewards/chosen": -1.7763793468475342, "rewards/margins": 9.834375381469727, "rewards/rejected": -11.608593940734863, "step": 4430 }, { "epoch": 3.8987480781902044, "grad_norm": 0.3144827525540439, "learning_rate": 2.4824253075571177e-08, "logits/chosen": -0.585375964641571, "logits/rejected": -0.857800304889679, "logps/chosen": -369.625, "logps/rejected": -428.79998779296875, "loss": 0.0017, "rewards/accuracies": 1.0, "rewards/chosen": -2.1150879859924316, "rewards/margins": 9.669530868530273, "rewards/rejected": -11.793749809265137, "step": 4440 }, { "epoch": 3.9075334943992974, "grad_norm": 1.6091155348521224, "learning_rate": 2.2627416520210894e-08, "logits/chosen": -0.623583972454071, "logits/rejected": -0.7492920160293579, "logps/chosen": -383.2749938964844, "logps/rejected": -465.9750061035156, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": -1.6556885242462158, "rewards/margins": 9.525781631469727, "rewards/rejected": -11.185937881469727, "step": 4450 }, { "epoch": 3.91631891060839, "grad_norm": 0.25155184030932726, "learning_rate": 2.0430579964850612e-08, "logits/chosen": -0.5146118402481079, "logits/rejected": -0.7874511480331421, "logps/chosen": -417.2749938964844, "logps/rejected": -468.6499938964844, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -1.5927245616912842, "rewards/margins": 9.71484375, "rewards/rejected": -11.307812690734863, "step": 4460 }, { "epoch": 3.925104326817483, "grad_norm": 4.154825395122275, "learning_rate": 1.8233743409490333e-08, "logits/chosen": -0.5347900390625, "logits/rejected": -0.690112292766571, "logps/chosen": -369.5, "logps/rejected": -433.07501220703125, "loss": 0.0037, "rewards/accuracies": 1.0, "rewards/chosen": -2.3373780250549316, "rewards/margins": 9.625781059265137, "rewards/rejected": -11.95703125, "step": 4470 }, { "epoch": 3.933889743026576, "grad_norm": 0.6071795612124373, "learning_rate": 1.603690685413005e-08, "logits/chosen": -0.4066101014614105, "logits/rejected": -0.692370593547821, "logps/chosen": -410.7875061035156, "logps/rejected": -458.6000061035156, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": -1.944787621498108, "rewards/margins": 9.526562690734863, "rewards/rejected": -11.460156440734863, "step": 4480 }, { "epoch": 3.9426751592356686, "grad_norm": 2.5427233056855965, "learning_rate": 1.3840070298769772e-08, "logits/chosen": -0.4770141541957855, "logits/rejected": -0.693041980266571, "logps/chosen": -371.0375061035156, "logps/rejected": -437.1000061035156, "loss": 0.0016, "rewards/accuracies": 1.0, "rewards/chosen": -1.4297363758087158, "rewards/margins": 9.5234375, "rewards/rejected": -10.952343940734863, "step": 4490 }, { "epoch": 3.9514605754447616, "grad_norm": 0.22970297797933395, "learning_rate": 1.164323374340949e-08, "logits/chosen": -0.6468353271484375, "logits/rejected": -0.806445300579071, "logps/chosen": -380.79998779296875, "logps/rejected": -443.8999938964844, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -2.0345826148986816, "rewards/margins": 9.59375, "rewards/rejected": -11.616406440734863, "step": 4500 }, { "epoch": 3.9602459916538546, "grad_norm": 0.31389821628651554, "learning_rate": 9.446397188049209e-09, "logits/chosen": -0.569781482219696, "logits/rejected": -0.7513183355331421, "logps/chosen": -417.42498779296875, "logps/rejected": -490.5, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -2.1475586891174316, "rewards/margins": 9.6796875, "rewards/rejected": -11.829687118530273, "step": 4510 }, { "epoch": 3.9690314078629476, "grad_norm": 0.9643568227505125, "learning_rate": 7.249560632688927e-09, "logits/chosen": -0.5411376953125, "logits/rejected": -0.7351928949356079, "logps/chosen": -384.95001220703125, "logps/rejected": -458.5, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -1.86883544921875, "rewards/margins": 9.639843940734863, "rewards/rejected": -11.514062881469727, "step": 4520 }, { "epoch": 3.9778168240720406, "grad_norm": 1.7143493669754075, "learning_rate": 5.0527240773286466e-09, "logits/chosen": -0.598297119140625, "logits/rejected": -0.7579345703125, "logps/chosen": -397.125, "logps/rejected": -509.45001220703125, "loss": 0.0031, "rewards/accuracies": 1.0, "rewards/chosen": -2.204150438308716, "rewards/margins": 9.600781440734863, "rewards/rejected": -11.800000190734863, "step": 4530 }, { "epoch": 3.986602240281133, "grad_norm": 1.0944287263288583, "learning_rate": 2.8558875219683655e-09, "logits/chosen": -0.41770631074905396, "logits/rejected": -0.7955688238143921, "logps/chosen": -379.2749938964844, "logps/rejected": -437.5, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -1.8447754383087158, "rewards/margins": 9.181249618530273, "rewards/rejected": -11.028124809265137, "step": 4540 }, { "epoch": 3.995387656490226, "grad_norm": 0.44244404956302746, "learning_rate": 6.590509666080844e-10, "logits/chosen": -0.661608874797821, "logits/rejected": -0.954638659954071, "logps/chosen": -393.83123779296875, "logps/rejected": -466.95001220703125, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -1.893408179283142, "rewards/margins": 9.960156440734863, "rewards/rejected": -11.850000381469727, "step": 4550 } ], "logging_steps": 10, "max_steps": 4552, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }