{ "best_metric": null, "best_model_checkpoint": null, "epoch": 11.14839797639123, "eval_steps": 100, "global_step": 3300, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio": 0.0, "completion_length": 262.6458435058594, "epoch": 0.003372681281618887, "grad_norm": 18.2549130889672, "kl": 0.0, "learning_rate": 9.997184684684683e-07, "loss": 0.0, "reward": 2.236002564430237, "reward_std": 0.798353910446167, "rewards/final_reward": 0.1961517347854389, "rewards/mask_iou_reward": 0.09807586739271945, "rewards/sam_format_reward": 0.78125, "rewards/sam_reward_func_ultra": 0.4755857586860657, "rewards/thk_ans_format_reward": 0.9791666865348816, "step": 1, "think_completion_length": 227.66666666666666 }, { "clip_ratio": 0.0, "completion_length": 268.46876525878906, "epoch": 0.006745362563237774, "grad_norm": 3.0259863371376707, "kl": 0.000598907470703125, "learning_rate": 9.994369369369369e-07, "loss": 0.0, "reward": 2.0085843801498413, "reward_std": 0.7900800108909607, "rewards/final_reward": 0.17079922505335737, "rewards/mask_iou_reward": 0.08539961252667869, "rewards/sam_format_reward": 0.7604166865348816, "rewards/sam_reward_func_ultra": 0.2898343503475189, "rewards/thk_ans_format_reward": 0.9583333730697632, "step": 2, "think_completion_length": 164.70833333333331 }, { "clip_ratio": 0.0, "completion_length": 221.7916717529297, "epoch": 0.01011804384485666, "grad_norm": 3.5258361720821183, "kl": 0.0006771087646484375, "learning_rate": 9.991554054054052e-07, "loss": 0.0, "reward": 2.462438941001892, "reward_std": 0.8611267507076263, "rewards/final_reward": 0.5922547578171589, "rewards/mask_iou_reward": 0.29612737890857943, "rewards/sam_format_reward": 0.8750000298023224, "rewards/sam_reward_func_ultra": 0.6395223438739777, "rewards/thk_ans_format_reward": 0.9479166865348816, "step": 3, "think_completion_length": 175.16666666666666 }, { "clip_ratio": 0.0, "completion_length": 224.59375, "epoch": 0.013490725126475547, "grad_norm": 4.726047943419514, "kl": 0.0013427734375, "learning_rate": 9.988738738738738e-07, "loss": 0.0, "reward": 2.2907389402389526, "reward_std": 0.7321855425834656, "rewards/final_reward": 0.4825787532407586, "rewards/mask_iou_reward": 0.2412893766203793, "rewards/sam_format_reward": 0.8854166865348816, "rewards/sam_reward_func_ultra": 0.4574054926633835, "rewards/thk_ans_format_reward": 0.9479166865348816, "step": 4, "think_completion_length": 136.33333333333334 }, { "clip_ratio": 0.0, "completion_length": 234.28125762939453, "epoch": 0.016863406408094434, "grad_norm": 5.4744398823185385, "kl": 0.001377105712890625, "learning_rate": 9.985923423423422e-07, "loss": 0.0, "reward": 2.405760407447815, "reward_std": 0.7585574686527252, "rewards/final_reward": 0.7464167821487235, "rewards/mask_iou_reward": 0.37320839107436177, "rewards/sam_format_reward": 0.8750000298023224, "rewards/sam_reward_func_ultra": 0.5724269300699234, "rewards/thk_ans_format_reward": 0.9583333432674408, "step": 5, "think_completion_length": 153.33333333333334 }, { "clip_ratio": 0.0, "completion_length": 232.53125762939453, "epoch": 0.02023608768971332, "grad_norm": 2.6261435014718977, "kl": 0.00251007080078125, "learning_rate": 9.983108108108107e-07, "loss": 0.0, "reward": 2.4854605197906494, "reward_std": 0.6170332133769989, "rewards/final_reward": 0.5536762328101035, "rewards/mask_iou_reward": 0.27683811640505174, "rewards/sam_format_reward": 0.9583333432674408, "rewards/sam_reward_func_ultra": 0.5479605048894882, "rewards/thk_ans_format_reward": 0.9791666865348816, "step": 6, "think_completion_length": 144.25 }, { "clip_ratio": 0.0, "completion_length": 210.90625, "epoch": 0.023608768971332208, "grad_norm": 3.411913189332803, "kl": 0.00347900390625, "learning_rate": 9.980292792792793e-07, "loss": 0.0, "reward": 2.411745548248291, "reward_std": 0.578468382358551, "rewards/final_reward": 0.24950741038787477, "rewards/mask_iou_reward": 0.12475370519393739, "rewards/sam_format_reward": 0.9687500298023224, "rewards/sam_reward_func_ultra": 0.46382857859134674, "rewards/thk_ans_format_reward": 0.9791666865348816, "step": 7, "think_completion_length": 147.25 }, { "clip_ratio": 0.0, "completion_length": 172.30209350585938, "epoch": 0.026981450252951095, "grad_norm": 4.706402296307697, "kl": 0.0042724609375, "learning_rate": 9.977477477477476e-07, "loss": 0.0, "reward": 2.6951953172683716, "reward_std": 0.6168608367443085, "rewards/final_reward": 0.7593355902470126, "rewards/mask_iou_reward": 0.3796677951235063, "rewards/sam_format_reward": 0.9791666865348816, "rewards/sam_reward_func_ultra": 0.7264453172683716, "rewards/thk_ans_format_reward": 0.9895833432674408, "step": 8, "think_completion_length": 108.375 }, { "clip_ratio": 0.0, "completion_length": 207.61458587646484, "epoch": 0.03035413153456998, "grad_norm": 5.850108394416396, "kl": 0.00506591796875, "learning_rate": 9.974662162162162e-07, "loss": 0.0, "reward": 2.4541696310043335, "reward_std": 0.5115222632884979, "rewards/final_reward": 0.6870269603439862, "rewards/mask_iou_reward": 0.3435134801719931, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 0.48541969060897827, "rewards/thk_ans_format_reward": 0.9791666865348816, "step": 9, "think_completion_length": 179.58333333333331 }, { "clip_ratio": 0.0, "completion_length": 201.65625762939453, "epoch": 0.03372681281618887, "grad_norm": 3.8688201576142967, "kl": 0.0067596435546875, "learning_rate": 9.971846846846846e-07, "loss": 0.0, "reward": 2.526506543159485, "reward_std": 0.5167315006256104, "rewards/final_reward": 1.0214127943524371, "rewards/mask_iou_reward": 0.5107063971762186, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 0.6619232296943665, "rewards/thk_ans_format_reward": 0.8750000298023224, "step": 10, "think_completion_length": 109.125 }, { "clip_ratio": 0.0, "completion_length": 185.0, "epoch": 0.03709949409780776, "grad_norm": 4.7134408075823435, "kl": 0.010284423828125, "learning_rate": 9.969031531531531e-07, "loss": 0.0, "reward": 2.434975743293762, "reward_std": 0.40706782042980194, "rewards/final_reward": 0.9454941172749969, "rewards/mask_iou_reward": 0.47274705863749844, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.4558090567588806, "rewards/thk_ans_format_reward": 0.9791666865348816, "step": 11, "think_completion_length": 133.16666666666666 }, { "clip_ratio": 0.0, "completion_length": 190.86458587646484, "epoch": 0.04047217537942664, "grad_norm": 9.134381347922247, "kl": 0.010589599609375, "learning_rate": 9.966216216216215e-07, "loss": 0.0, "reward": 2.6082929372787476, "reward_std": 0.4820929616689682, "rewards/final_reward": 0.5997067438178366, "rewards/mask_iou_reward": 0.2998533719089183, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.6187096536159515, "rewards/thk_ans_format_reward": 0.9895833432674408, "step": 12, "think_completion_length": 107.20833333333333 }, { "clip_ratio": 0.0, "completion_length": 184.5729217529297, "epoch": 0.04384485666104553, "grad_norm": 5.958574182814823, "kl": 0.014892578125, "learning_rate": 9.9634009009009e-07, "loss": 0.0, "reward": 2.874386191368103, "reward_std": 0.6012694835662842, "rewards/final_reward": 1.1462164005316828, "rewards/mask_iou_reward": 0.5731082002658414, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 0.8952195346355438, "rewards/thk_ans_format_reward": 0.9895833432674408, "step": 13, "think_completion_length": 95.5 }, { "clip_ratio": 0.0, "completion_length": 192.67709350585938, "epoch": 0.047217537942664416, "grad_norm": 9.395049743033171, "kl": 0.015472412109375, "learning_rate": 9.960585585585584e-07, "loss": 0.0, "reward": 2.7581610679626465, "reward_std": 0.39922401309013367, "rewards/final_reward": 0.322904850008464, "rewards/mask_iou_reward": 0.161452425004232, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.7685778141021729, "rewards/thk_ans_format_reward": 0.9895833432674408, "step": 14, "think_completion_length": 91.5 }, { "clip_ratio": 0.0, "completion_length": 179.375, "epoch": 0.050590219224283306, "grad_norm": 11.83054025776063, "kl": 0.0218505859375, "learning_rate": 9.95777027027027e-07, "loss": 0.0, "reward": 2.6608554124832153, "reward_std": 0.4703047573566437, "rewards/final_reward": 0.664067629310831, "rewards/mask_iou_reward": 0.3320338146554155, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.6712720990180969, "rewards/thk_ans_format_reward": 0.9895833432674408, "step": 15, "think_completion_length": 95.33333333333334 }, { "clip_ratio": 0.0, "completion_length": 172.77084350585938, "epoch": 0.05396290050590219, "grad_norm": 4.207564129436361, "kl": 0.023193359375, "learning_rate": 9.954954954954955e-07, "loss": 0.0, "reward": 2.6589574813842773, "reward_std": 0.2804350033402443, "rewards/final_reward": 0.302006787477356, "rewards/mask_iou_reward": 0.151003393738678, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.6589572131633759, "rewards/thk_ans_format_reward": 1.0, "step": 16, "think_completion_length": 122.79166666666666 }, { "clip_ratio": 0.0, "completion_length": 176.70834350585938, "epoch": 0.05733558178752108, "grad_norm": 10.925973964489978, "kl": 0.02789306640625, "learning_rate": 9.952139639639639e-07, "loss": 0.0, "reward": 2.7561737298965454, "reward_std": 0.5501474440097809, "rewards/final_reward": 0.37828136243168875, "rewards/mask_iou_reward": 0.18914068121584438, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.7561735510826111, "rewards/thk_ans_format_reward": 1.0, "step": 17, "think_completion_length": 101.75 }, { "clip_ratio": 0.0, "completion_length": 199.92708587646484, "epoch": 0.06070826306913996, "grad_norm": 4.10772567894951, "kl": 0.02850341796875, "learning_rate": 9.949324324324325e-07, "loss": 0.0, "reward": 2.593306541442871, "reward_std": 0.36468201875686646, "rewards/final_reward": 0.4049101159612564, "rewards/mask_iou_reward": 0.25851768383432, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.5933065414428711, "rewards/thk_ans_format_reward": 1.0, "step": 18, "think_completion_length": 110.0 }, { "clip_ratio": 0.0, "completion_length": 191.48958587646484, "epoch": 0.06408094435075885, "grad_norm": 5.711915284565923, "kl": 0.03118896484375, "learning_rate": 9.946509009009008e-07, "loss": 0.0, "reward": 2.6937522888183594, "reward_std": 0.5062113702297211, "rewards/final_reward": 1.0718381430291133, "rewards/mask_iou_reward": 0.5359190715145566, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 0.7145856469869614, "rewards/thk_ans_format_reward": 0.9895833432674408, "step": 19, "think_completion_length": 129.66666666666669 }, { "clip_ratio": 0.0, "completion_length": 185.34375762939453, "epoch": 0.06745362563237774, "grad_norm": 5.726509164029333, "kl": 0.039794921875, "learning_rate": 9.943693693693694e-07, "loss": 0.0, "reward": 2.4703195095062256, "reward_std": 0.3861998915672302, "rewards/final_reward": 0.34144271480993443, "rewards/mask_iou_reward": 0.17072135740496722, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 0.4807361662387848, "rewards/thk_ans_format_reward": 1.0, "step": 20, "think_completion_length": 109.95833333333334 }, { "clip_ratio": 0.0, "completion_length": 180.11459350585938, "epoch": 0.07082630691399663, "grad_norm": 10.317711135994427, "kl": 0.036376953125, "learning_rate": 9.940878378378377e-07, "loss": 0.0, "reward": 2.5708796977996826, "reward_std": 0.36896252632141113, "rewards/final_reward": 0.44824195500453407, "rewards/mask_iou_reward": 0.22412097750226703, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.5708796679973602, "rewards/thk_ans_format_reward": 1.0, "step": 21, "think_completion_length": 123.75 }, { "clip_ratio": 0.0, "completion_length": 189.96875762939453, "epoch": 0.07419898819561552, "grad_norm": 3.1813431001378536, "kl": 0.0360107421875, "learning_rate": 9.938063063063063e-07, "loss": 0.0001, "reward": 2.4606316089630127, "reward_std": 0.35946163535118103, "rewards/final_reward": 0.37041026102779656, "rewards/mask_iou_reward": 0.18520513051389828, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.46063150465488434, "rewards/thk_ans_format_reward": 1.0, "step": 22, "think_completion_length": 100.45833333333334 }, { "clip_ratio": 0.0, "completion_length": 198.6979217529297, "epoch": 0.0775716694772344, "grad_norm": 4.09602863721668, "kl": 0.047607421875, "learning_rate": 9.935247747747747e-07, "loss": 0.0, "reward": 2.44514799118042, "reward_std": 0.40501701831817627, "rewards/final_reward": 0.08334334422518366, "rewards/mask_iou_reward": 0.04167167211259183, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 0.4555644392967224, "rewards/thk_ans_format_reward": 1.0, "step": 23, "think_completion_length": 99.83333333333333 }, { "clip_ratio": 0.0, "completion_length": 166.5104217529297, "epoch": 0.08094435075885328, "grad_norm": 6.604801123537757, "kl": 0.0496826171875, "learning_rate": 9.932432432432432e-07, "loss": 0.0, "reward": 2.8794195652008057, "reward_std": 0.4920074939727783, "rewards/final_reward": 0.1977291756476331, "rewards/mask_iou_reward": 0.09886458782381655, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 0.9106696248054504, "rewards/thk_ans_format_reward": 0.9791666865348816, "step": 24, "think_completion_length": 97.375 }, { "clip_ratio": 0.0, "completion_length": 175.6041717529297, "epoch": 0.08431703204047218, "grad_norm": 6.320587869504459, "kl": 0.0426025390625, "learning_rate": 9.929617117117116e-07, "loss": 0.0, "reward": 2.7572332620620728, "reward_std": 0.41600461304187775, "rewards/final_reward": 0.6702880951115837, "rewards/mask_iou_reward": 0.33514404755579186, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 0.7676499336957932, "rewards/thk_ans_format_reward": 1.0, "step": 25, "think_completion_length": 101.83333333333333 }, { "clip_ratio": 0.0, "completion_length": 180.30209350585938, "epoch": 0.08768971332209106, "grad_norm": 7.749212860781184, "kl": 0.046630859375, "learning_rate": 9.926801801801801e-07, "loss": 0.0, "reward": 2.748945713043213, "reward_std": 0.4598637521266937, "rewards/final_reward": 0.6600739575919204, "rewards/mask_iou_reward": 0.3300369787959602, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.7489454746246338, "rewards/thk_ans_format_reward": 1.0, "step": 26, "think_completion_length": 107.375 }, { "clip_ratio": 0.0, "completion_length": 164.75000762939453, "epoch": 0.09106239460370995, "grad_norm": 4.032996224301655, "kl": 0.052001953125, "learning_rate": 9.923986486486487e-07, "loss": 0.0001, "reward": 2.9561160802841187, "reward_std": 0.4863891154527664, "rewards/final_reward": 0.6586801252841948, "rewards/mask_iou_reward": 0.3293400626420974, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9561160504817963, "rewards/thk_ans_format_reward": 1.0, "step": 27, "think_completion_length": 84.75 }, { "clip_ratio": 0.0, "completion_length": 177.68750762939453, "epoch": 0.09443507588532883, "grad_norm": 22.503580944554333, "kl": 0.05712890625, "learning_rate": 9.92117117117117e-07, "loss": 0.0001, "reward": 2.79542875289917, "reward_std": 0.3854813724756241, "rewards/final_reward": 0.9766034139788797, "rewards/mask_iou_reward": 0.48830170698943987, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.7954287976026535, "rewards/thk_ans_format_reward": 1.0, "step": 28, "think_completion_length": 107.29166666666666 }, { "clip_ratio": 0.0, "completion_length": 161.90625, "epoch": 0.09780775716694773, "grad_norm": 5.444098681896409, "kl": 0.053955078125, "learning_rate": 9.918355855855856e-07, "loss": 0.0001, "reward": 3.074863076210022, "reward_std": 0.3014441579580307, "rewards/final_reward": 1.681790123556664, "rewards/mask_iou_reward": 0.840895061778332, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0748630166053772, "rewards/thk_ans_format_reward": 1.0, "step": 29, "think_completion_length": 108.79166666666666 }, { "clip_ratio": 0.0, "completion_length": 174.90625762939453, "epoch": 0.10118043844856661, "grad_norm": 3.941049607151489, "kl": 0.0626220703125, "learning_rate": 9.91554054054054e-07, "loss": 0.0001, "reward": 2.320728898048401, "reward_std": 0.24627278745174408, "rewards/final_reward": 0.30145627819432563, "rewards/mask_iou_reward": 0.15072813909716282, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.3207288384437561, "rewards/thk_ans_format_reward": 1.0, "step": 30, "think_completion_length": 75.54166666666666 }, { "clip_ratio": 0.0, "completion_length": 166.37500762939453, "epoch": 0.1045531197301855, "grad_norm": 5.256367822273493, "kl": 0.072021484375, "learning_rate": 9.912725225225226e-07, "loss": 0.0001, "reward": 2.81955087184906, "reward_std": 0.4395739734172821, "rewards/final_reward": 0.6934297826888204, "rewards/mask_iou_reward": 0.3467148913444102, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.8195509016513824, "rewards/thk_ans_format_reward": 1.0, "step": 31, "think_completion_length": 75.04166666666666 }, { "clip_ratio": 0.0, "completion_length": 153.8541717529297, "epoch": 0.10792580101180438, "grad_norm": 4.941175549973696, "kl": 0.0615234375, "learning_rate": 9.90990990990991e-07, "loss": 0.0001, "reward": 3.0443195104599, "reward_std": 0.36992163956165314, "rewards/final_reward": 1.0586814928047903, "rewards/mask_iou_reward": 0.5293407464023951, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0443194210529327, "rewards/thk_ans_format_reward": 1.0, "step": 32, "think_completion_length": 106.16666666666666 }, { "clip_ratio": 0.0, "completion_length": 173.14584350585938, "epoch": 0.11129848229342328, "grad_norm": 6.373319719553773, "kl": 0.06689453125, "learning_rate": 9.907094594594595e-07, "loss": 0.0001, "reward": 2.7814066410064697, "reward_std": 0.34776973724365234, "rewards/final_reward": 0.39388861840120193, "rewards/mask_iou_reward": 0.19694430920060096, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.7814066708087921, "rewards/thk_ans_format_reward": 1.0, "step": 33, "think_completion_length": 93.83333333333334 }, { "clip_ratio": 0.0, "completion_length": 152.0, "epoch": 0.11467116357504216, "grad_norm": 6.760195356920477, "kl": 0.063232421875, "learning_rate": 9.904279279279278e-07, "loss": 0.0001, "reward": 3.0221351385116577, "reward_std": 0.3316876143217087, "rewards/final_reward": 1.0392349291181158, "rewards/mask_iou_reward": 0.5196174645590579, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0221351981163025, "rewards/thk_ans_format_reward": 1.0, "step": 34, "think_completion_length": 72.79166666666667 }, { "clip_ratio": 0.0, "completion_length": 150.12500762939453, "epoch": 0.11804384485666104, "grad_norm": 11.70900945875506, "kl": 0.071044921875, "learning_rate": 9.901463963963964e-07, "loss": 0.0001, "reward": 2.652697205543518, "reward_std": 0.44748905301094055, "rewards/final_reward": 1.167913835000586, "rewards/mask_iou_reward": 0.583956917500293, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.6526971757411957, "rewards/thk_ans_format_reward": 1.0, "step": 35, "think_completion_length": 93.29166666666666 }, { "clip_ratio": 0.0, "completion_length": 150.48959350585938, "epoch": 0.12141652613827993, "grad_norm": 10.978024050704882, "kl": 0.0614013671875, "learning_rate": 9.89864864864865e-07, "loss": 0.0001, "reward": 3.1266239881515503, "reward_std": 0.3739871680736542, "rewards/final_reward": 1.6664728440852943, "rewards/mask_iou_reward": 0.8332364220426471, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 1.1370405852794647, "rewards/thk_ans_format_reward": 1.0, "step": 36, "think_completion_length": 75.25 }, { "clip_ratio": 0.0, "completion_length": 166.1666717529297, "epoch": 0.12478920741989882, "grad_norm": 4.687339166938046, "kl": 0.07763671875, "learning_rate": 9.895833333333333e-07, "loss": 0.0001, "reward": 2.7773534059524536, "reward_std": 0.4481920897960663, "rewards/final_reward": 0.7356097783982714, "rewards/mask_iou_reward": 0.3678048891991357, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 0.7981867790222168, "rewards/thk_ans_format_reward": 0.9895833432674408, "step": 37, "think_completion_length": 78.08333333333334 }, { "clip_ratio": 0.0, "completion_length": 158.65625, "epoch": 0.1281618887015177, "grad_norm": 16.500247450151882, "kl": 0.07470703125, "learning_rate": 9.893018018018019e-07, "loss": 0.0001, "reward": 2.6673413515090942, "reward_std": 0.41427473723888397, "rewards/final_reward": 0.9702149166908527, "rewards/mask_iou_reward": 0.48510745834542635, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 0.6777579486370087, "rewards/thk_ans_format_reward": 1.0, "step": 38, "think_completion_length": 86.125 }, { "clip_ratio": 0.0, "completion_length": 171.98959350585938, "epoch": 0.1315345699831366, "grad_norm": 58.074807859002405, "kl": 0.08544921875, "learning_rate": 9.890202702702702e-07, "loss": 0.0001, "reward": 2.5168732404708862, "reward_std": 0.6337110698223114, "rewards/final_reward": 0.47920808376200985, "rewards/mask_iou_reward": 0.23960404188100493, "rewards/sam_format_reward": 0.9687500298023224, "rewards/sam_reward_func_ultra": 0.5585398375988007, "rewards/thk_ans_format_reward": 0.9895833432674408, "step": 39, "think_completion_length": 95.45833333333334 }, { "clip_ratio": 0.0, "completion_length": 156.45833587646484, "epoch": 0.13490725126475547, "grad_norm": 4.72506162337957, "kl": 0.073486328125, "learning_rate": 9.887387387387386e-07, "loss": 0.0001, "reward": 3.031570553779602, "reward_std": 0.46547742187976837, "rewards/final_reward": 0.9411258904506269, "rewards/mask_iou_reward": 0.47056294522531344, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.031570553779602, "rewards/thk_ans_format_reward": 1.0, "step": 40, "think_completion_length": 87.66666666666667 }, { "clip_ratio": 0.0, "completion_length": 155.1354217529297, "epoch": 0.13827993254637436, "grad_norm": 6.6206234978194125, "kl": 0.078369140625, "learning_rate": 9.884572072072072e-07, "loss": 0.0001, "reward": 3.18060564994812, "reward_std": 0.49932096898555756, "rewards/final_reward": 1.1321352132892444, "rewards/mask_iou_reward": 0.5660676066446222, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1806055903434753, "rewards/thk_ans_format_reward": 1.0, "step": 41, "think_completion_length": 82.25 }, { "clip_ratio": 0.0, "completion_length": 161.0416717529297, "epoch": 0.14165261382799327, "grad_norm": 3.8491770091768016, "kl": 0.08154296875, "learning_rate": 9.881756756756755e-07, "loss": 0.0001, "reward": 2.55053448677063, "reward_std": 0.31911052763462067, "rewards/final_reward": 0.5311662418642663, "rewards/mask_iou_reward": 0.26558312093213315, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.5505344271659851, "rewards/thk_ans_format_reward": 1.0, "step": 42, "think_completion_length": 90.25 }, { "clip_ratio": 0.0, "completion_length": 145.0416717529297, "epoch": 0.14502529510961215, "grad_norm": 6.789756683137194, "kl": 0.078369140625, "learning_rate": 9.87894144144144e-07, "loss": 0.0001, "reward": 2.9492753744125366, "reward_std": 0.2880386933684349, "rewards/final_reward": 0.5763597238360774, "rewards/mask_iou_reward": 0.2881798619180387, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9492754340171814, "rewards/thk_ans_format_reward": 1.0, "step": 43, "think_completion_length": 70.875 }, { "clip_ratio": 0.0, "completion_length": 167.71875762939453, "epoch": 0.14839797639123103, "grad_norm": 13.041295860696609, "kl": 0.078369140625, "learning_rate": 9.876126126126124e-07, "loss": 0.0001, "reward": 3.086544990539551, "reward_std": 0.3482399433851242, "rewards/final_reward": 1.6059155017247742, "rewards/mask_iou_reward": 0.8029577508623871, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0865449905395508, "rewards/thk_ans_format_reward": 1.0, "step": 44, "think_completion_length": 86.08333333333334 }, { "clip_ratio": 0.0, "completion_length": 151.78125762939453, "epoch": 0.15177065767284992, "grad_norm": 7.34001355188649, "kl": 0.07568359375, "learning_rate": 9.87331081081081e-07, "loss": 0.0001, "reward": 2.7933939695358276, "reward_std": 0.455650195479393, "rewards/final_reward": 1.0957763493410244, "rewards/mask_iou_reward": 0.5478881746705122, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.7933937758207321, "rewards/thk_ans_format_reward": 1.0, "step": 45, "think_completion_length": 72.83333333333333 }, { "clip_ratio": 0.0, "completion_length": 155.1354217529297, "epoch": 0.1551433389544688, "grad_norm": 6.723874556684061, "kl": 0.08349609375, "learning_rate": 9.870495495495496e-07, "loss": 0.0001, "reward": 3.034854769706726, "reward_std": 0.3657161295413971, "rewards/final_reward": 1.2670155002357293, "rewards/mask_iou_reward": 0.6335077501178646, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0348548293113708, "rewards/thk_ans_format_reward": 1.0, "step": 46, "think_completion_length": 85.375 }, { "clip_ratio": 0.0, "completion_length": 157.5729217529297, "epoch": 0.15851602023608768, "grad_norm": 5.1064273040513815, "kl": 0.094970703125, "learning_rate": 9.86768018018018e-07, "loss": 0.0001, "reward": 2.641897439956665, "reward_std": 0.4104642868041992, "rewards/final_reward": 0.6724832396800458, "rewards/mask_iou_reward": 0.3362416198400229, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 0.6523140668869019, "rewards/thk_ans_format_reward": 1.0, "step": 47, "think_completion_length": 83.625 }, { "clip_ratio": 0.0, "completion_length": 159.14584350585938, "epoch": 0.16188870151770657, "grad_norm": 15.24708912565845, "kl": 0.08447265625, "learning_rate": 9.864864864864865e-07, "loss": 0.0001, "reward": 2.9382331371307373, "reward_std": 0.3587312549352646, "rewards/final_reward": 1.0089048256920612, "rewards/mask_iou_reward": 0.5044524128460306, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9382330775260925, "rewards/thk_ans_format_reward": 1.0, "step": 48, "think_completion_length": 83.08333333333334 }, { "clip_ratio": 0.0, "completion_length": 153.59375, "epoch": 0.16526138279932545, "grad_norm": 7.158849673206477, "kl": 0.09033203125, "learning_rate": 9.862049549549548e-07, "loss": 0.0001, "reward": 2.7654584646224976, "reward_std": 0.3078659772872925, "rewards/final_reward": 0.931761202507062, "rewards/mask_iou_reward": 0.465880601253531, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.7654582858085632, "rewards/thk_ans_format_reward": 1.0, "step": 49, "think_completion_length": 77.33333333333333 }, { "clip_ratio": 0.0, "completion_length": 156.30208587646484, "epoch": 0.16863406408094436, "grad_norm": 8.151717013930563, "kl": 0.090087890625, "learning_rate": 9.859234234234234e-07, "loss": 0.0001, "reward": 2.9706579446792603, "reward_std": 0.39569392800331116, "rewards/final_reward": 0.44502848416067103, "rewards/mask_iou_reward": 0.22251424208033552, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9706579446792603, "rewards/thk_ans_format_reward": 1.0, "step": 50, "think_completion_length": 102.04166666666666 }, { "clip_ratio": 0.0, "completion_length": 143.7916717529297, "epoch": 0.17200674536256325, "grad_norm": 6.831911618016697, "kl": 0.085205078125, "learning_rate": 9.856418918918918e-07, "loss": 0.0001, "reward": 3.266450881958008, "reward_std": 0.2550960034132004, "rewards/final_reward": 1.463418817207728, "rewards/mask_iou_reward": 0.731709408603864, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2664507329463959, "rewards/thk_ans_format_reward": 1.0, "step": 51, "think_completion_length": 74.875 }, { "clip_ratio": 0.0, "completion_length": 149.15625, "epoch": 0.17537942664418213, "grad_norm": 84.87485355159552, "kl": 0.09228515625, "learning_rate": 9.853603603603603e-07, "loss": 0.0001, "reward": 2.6442378759384155, "reward_std": 0.28126492351293564, "rewards/final_reward": 1.2378643356828511, "rewards/mask_iou_reward": 0.6189321678414256, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.6442377269268036, "rewards/thk_ans_format_reward": 1.0, "step": 52, "think_completion_length": 65.54166666666667 }, { "clip_ratio": 0.0, "completion_length": 156.27083587646484, "epoch": 0.178752107925801, "grad_norm": 11.894833801964715, "kl": 0.089111328125, "learning_rate": 9.850788288288287e-07, "loss": 0.0001, "reward": 2.7687995433807373, "reward_std": 0.26568184792995453, "rewards/final_reward": 0.879780526858291, "rewards/mask_iou_reward": 0.4398902634291455, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.7687995284795761, "rewards/thk_ans_format_reward": 1.0, "step": 53, "think_completion_length": 72.79166666666667 }, { "clip_ratio": 0.0, "completion_length": 153.0416717529297, "epoch": 0.1821247892074199, "grad_norm": 5.807968972350355, "kl": 0.089111328125, "learning_rate": 9.847972972972973e-07, "loss": 0.0001, "reward": 2.8916733264923096, "reward_std": 0.20665724575519562, "rewards/final_reward": 0.7938957174186951, "rewards/mask_iou_reward": 0.39694785870934757, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.8916733860969543, "rewards/thk_ans_format_reward": 1.0, "step": 54, "think_completion_length": 82.16666666666666 }, { "clip_ratio": 0.0, "completion_length": 148.87500762939453, "epoch": 0.18549747048903878, "grad_norm": 7.6025544739888975, "kl": 0.100341796875, "learning_rate": 9.845157657657656e-07, "loss": 0.0001, "reward": 2.9938149452209473, "reward_std": 0.2959776520729065, "rewards/final_reward": 0.8594974404944786, "rewards/mask_iou_reward": 0.4297487202472393, "rewards/sam_format_reward": 0.9791666865348816, "rewards/sam_reward_func_ultra": 1.0146484375, "rewards/thk_ans_format_reward": 1.0, "step": 55, "think_completion_length": 76.20833333333334 }, { "clip_ratio": 0.0, "completion_length": 145.6666717529297, "epoch": 0.18887015177065766, "grad_norm": 4.240339295381267, "kl": 0.09375, "learning_rate": 9.842342342342342e-07, "loss": 0.0001, "reward": 3.2571107149124146, "reward_std": 0.3112206757068634, "rewards/final_reward": 1.0376420191306543, "rewards/mask_iou_reward": 0.5188210095653272, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2571107149124146, "rewards/thk_ans_format_reward": 1.0, "step": 56, "think_completion_length": 69.45833333333334 }, { "clip_ratio": 0.0, "completion_length": 148.9375, "epoch": 0.19224283305227655, "grad_norm": 9.498023007923031, "kl": 0.0947265625, "learning_rate": 9.839527027027027e-07, "loss": 0.0001, "reward": 2.7155935764312744, "reward_std": 0.33761440217494965, "rewards/final_reward": 1.145701922917429, "rewards/mask_iou_reward": 0.5728509614587145, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 0.7260101139545441, "rewards/thk_ans_format_reward": 1.0, "step": 57, "think_completion_length": 79.25 }, { "clip_ratio": 0.0, "completion_length": 138.95833587646484, "epoch": 0.19561551433389546, "grad_norm": 12.122477919232969, "kl": 0.1005859375, "learning_rate": 9.83671171171171e-07, "loss": 0.0001, "reward": 2.712533116340637, "reward_std": 0.20152553170919418, "rewards/final_reward": 1.0981164497885971, "rewards/mask_iou_reward": 0.5490582248942986, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.7125331163406372, "rewards/thk_ans_format_reward": 1.0, "step": 58, "think_completion_length": 64.45833333333334 }, { "clip_ratio": 0.0, "completion_length": 152.39583587646484, "epoch": 0.19898819561551434, "grad_norm": 4.960251178888369, "kl": 0.09619140625, "learning_rate": 9.833896396396397e-07, "loss": 0.0001, "reward": 2.893130302429199, "reward_std": 0.26960865780711174, "rewards/final_reward": 0.44181123180177767, "rewards/mask_iou_reward": 0.22090561590088884, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.8931301832199097, "rewards/thk_ans_format_reward": 1.0, "step": 59, "think_completion_length": 83.375 }, { "clip_ratio": 0.0, "completion_length": 146.34375762939453, "epoch": 0.20236087689713322, "grad_norm": 14.477576256950904, "kl": 0.112060546875, "learning_rate": 9.83108108108108e-07, "loss": 0.0001, "reward": 3.004229187965393, "reward_std": 0.26009829342365265, "rewards/final_reward": 1.154425956510769, "rewards/mask_iou_reward": 0.5772129782553845, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0042291581630707, "rewards/thk_ans_format_reward": 1.0, "step": 60, "think_completion_length": 65.83333333333333 }, { "clip_ratio": 0.0, "completion_length": 148.48958587646484, "epoch": 0.2057335581787521, "grad_norm": 5.4023722595664365, "kl": 0.10302734375, "learning_rate": 9.828265765765766e-07, "loss": 0.0001, "reward": 2.922378182411194, "reward_std": 0.2460155412554741, "rewards/final_reward": 1.1422041590321816, "rewards/mask_iou_reward": 0.5711020795160908, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9223781824111938, "rewards/thk_ans_format_reward": 1.0, "step": 61, "think_completion_length": 74.54166666666666 }, { "clip_ratio": 0.0, "completion_length": 150.84375762939453, "epoch": 0.209106239460371, "grad_norm": 17.679265660046486, "kl": 0.159912109375, "learning_rate": 9.82545045045045e-07, "loss": 0.0002, "reward": 2.8391278982162476, "reward_std": 0.38828714191913605, "rewards/final_reward": 1.057218125291731, "rewards/mask_iou_reward": 0.5286090626458655, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.8391278684139252, "rewards/thk_ans_format_reward": 1.0, "step": 62, "think_completion_length": 71.04166666666667 }, { "clip_ratio": 0.0, "completion_length": 141.30208587646484, "epoch": 0.21247892074198987, "grad_norm": 13.719301439552309, "kl": 0.11181640625, "learning_rate": 9.822635135135135e-07, "loss": 0.0001, "reward": 3.010664939880371, "reward_std": 0.240879625082016, "rewards/final_reward": 1.6053594054522557, "rewards/mask_iou_reward": 0.8026797027261279, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0106649100780487, "rewards/thk_ans_format_reward": 1.0, "step": 63, "think_completion_length": 69.83333333333334 }, { "clip_ratio": 0.0, "completion_length": 140.8854217529297, "epoch": 0.21585160202360876, "grad_norm": 40.58424129421211, "kl": 0.12646484375, "learning_rate": 9.819819819819819e-07, "loss": 0.0001, "reward": 2.95255708694458, "reward_std": 0.3045773357152939, "rewards/final_reward": 0.5511350359294102, "rewards/mask_iou_reward": 0.2755675179647051, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9525572657585144, "rewards/thk_ans_format_reward": 1.0, "step": 64, "think_completion_length": 76.625 }, { "clip_ratio": 0.0, "completion_length": 156.6666717529297, "epoch": 0.21922428330522767, "grad_norm": 6.376365790032157, "kl": 0.111328125, "learning_rate": 9.817004504504504e-07, "loss": 0.0001, "reward": 3.245211124420166, "reward_std": 0.3611130267381668, "rewards/final_reward": 1.1441606900744024, "rewards/mask_iou_reward": 0.5720803450372012, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 1.255627989768982, "rewards/thk_ans_format_reward": 1.0, "step": 65, "think_completion_length": 73.75 }, { "clip_ratio": 0.0, "completion_length": 149.8854217529297, "epoch": 0.22259696458684655, "grad_norm": 4.339143871006515, "kl": 0.11181640625, "learning_rate": 9.81418918918919e-07, "loss": 0.0001, "reward": 3.0247561931610107, "reward_std": 0.34554168581962585, "rewards/final_reward": 1.0067040342282128, "rewards/mask_iou_reward": 0.5033520171141064, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.024756371974945, "rewards/thk_ans_format_reward": 1.0, "step": 66, "think_completion_length": 76.04166666666666 }, { "clip_ratio": 0.0, "completion_length": 141.21875762939453, "epoch": 0.22596964586846544, "grad_norm": 5.8507038942252905, "kl": 0.112548828125, "learning_rate": 9.811373873873873e-07, "loss": 0.0001, "reward": 2.7070631980895996, "reward_std": 0.27164027094841003, "rewards/final_reward": 0.6720597954271437, "rewards/mask_iou_reward": 0.33602989771357183, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.7070631682872772, "rewards/thk_ans_format_reward": 1.0, "step": 67, "think_completion_length": 60.625 }, { "clip_ratio": 0.0, "completion_length": 160.55209350585938, "epoch": 0.22934232715008432, "grad_norm": 10.822994146609568, "kl": 0.12646484375, "learning_rate": 9.80855855855856e-07, "loss": 0.0001, "reward": 2.8537681102752686, "reward_std": 0.34967314451932907, "rewards/final_reward": 0.8741368306469235, "rewards/mask_iou_reward": 0.43706841532346175, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 0.874601423740387, "rewards/thk_ans_format_reward": 0.9895833432674408, "step": 68, "think_completion_length": 66.41666666666667 }, { "clip_ratio": 0.0, "completion_length": 136.86459350585938, "epoch": 0.2327150084317032, "grad_norm": 5.3148935449566235, "kl": 0.142578125, "learning_rate": 9.805743243243243e-07, "loss": 0.0001, "reward": 2.923313856124878, "reward_std": 0.38698340952396393, "rewards/final_reward": 0.8756449063816937, "rewards/mask_iou_reward": 0.43782245319084684, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9233138561248779, "rewards/thk_ans_format_reward": 1.0, "step": 69, "think_completion_length": 79.625 }, { "clip_ratio": 0.0, "completion_length": 137.6979217529297, "epoch": 0.23608768971332209, "grad_norm": 5.415557085648394, "kl": 0.107177734375, "learning_rate": 9.802927927927928e-07, "loss": 0.0001, "reward": 2.9035059213638306, "reward_std": 0.19867272675037384, "rewards/final_reward": 1.3126234085256436, "rewards/mask_iou_reward": 0.6563117042628218, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9035059213638306, "rewards/thk_ans_format_reward": 1.0, "step": 70, "think_completion_length": 77.875 }, { "clip_ratio": 0.0, "completion_length": 144.64584350585938, "epoch": 0.23946037099494097, "grad_norm": 10.86839202590463, "kl": 0.114990234375, "learning_rate": 9.800112612612612e-07, "loss": 0.0001, "reward": 2.8913815021514893, "reward_std": 0.5183416604995728, "rewards/final_reward": 0.48871107211835074, "rewards/mask_iou_reward": 0.24435553605917537, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 0.9017982482910156, "rewards/thk_ans_format_reward": 1.0, "step": 71, "think_completion_length": 84.5 }, { "clip_ratio": 0.0, "completion_length": 139.1041717529297, "epoch": 0.24283305227655985, "grad_norm": 4.026949460423362, "kl": 0.116943359375, "learning_rate": 9.797297297297298e-07, "loss": 0.0001, "reward": 3.240618944168091, "reward_std": 0.22773104906082153, "rewards/final_reward": 0.7806309576412722, "rewards/mask_iou_reward": 0.3903154788206361, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2406185865402222, "rewards/thk_ans_format_reward": 1.0, "step": 72, "think_completion_length": 79.41666666666667 }, { "clip_ratio": 0.0, "completion_length": 136.45833587646484, "epoch": 0.24620573355817876, "grad_norm": 4.335677102355079, "kl": 0.107177734375, "learning_rate": 9.794481981981981e-07, "loss": 0.0001, "reward": 2.805757761001587, "reward_std": 0.3943777531385422, "rewards/final_reward": 0.8992796951354338, "rewards/mask_iou_reward": 0.4496398475677169, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.8057577311992645, "rewards/thk_ans_format_reward": 1.0, "step": 73, "think_completion_length": 70.375 }, { "clip_ratio": 0.0, "completion_length": 143.36458587646484, "epoch": 0.24957841483979765, "grad_norm": 11.769442693892403, "kl": 0.12890625, "learning_rate": 9.791666666666667e-07, "loss": 0.0001, "reward": 3.5046173334121704, "reward_std": 0.3032621145248413, "rewards/final_reward": 1.4271000649967103, "rewards/mask_iou_reward": 0.7135500324983551, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.504617154598236, "rewards/thk_ans_format_reward": 1.0, "step": 74, "think_completion_length": 71.79166666666667 }, { "clip_ratio": 0.0, "completion_length": 155.17708587646484, "epoch": 0.25295109612141653, "grad_norm": 8.11512275742484, "kl": 0.1064453125, "learning_rate": 9.78885135135135e-07, "loss": 0.0001, "reward": 2.857938289642334, "reward_std": 0.2891754060983658, "rewards/final_reward": 1.2226563332272598, "rewards/mask_iou_reward": 0.6113281666136299, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.857938289642334, "rewards/thk_ans_format_reward": 1.0, "step": 75, "think_completion_length": 80.33333333333334 }, { "clip_ratio": 0.0, "completion_length": 135.23958587646484, "epoch": 0.2563237774030354, "grad_norm": 7.686578576868314, "kl": 0.127685546875, "learning_rate": 9.786036036036036e-07, "loss": 0.0001, "reward": 3.027070999145508, "reward_std": 0.3380560874938965, "rewards/final_reward": 0.1450894432775949, "rewards/mask_iou_reward": 0.07254472163879745, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0270708799362183, "rewards/thk_ans_format_reward": 1.0, "step": 76, "think_completion_length": 72.91666666666667 }, { "clip_ratio": 0.0, "completion_length": 137.7291717529297, "epoch": 0.2596964586846543, "grad_norm": 4.448677151834081, "kl": 0.12939453125, "learning_rate": 9.783220720720722e-07, "loss": 0.0001, "reward": 2.7701700925827026, "reward_std": 0.20241041854023933, "rewards/final_reward": 0.9208412423412211, "rewards/mask_iou_reward": 0.46042062117061056, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.7701701521873474, "rewards/thk_ans_format_reward": 1.0, "step": 77, "think_completion_length": 83.79166666666666 }, { "clip_ratio": 0.0, "completion_length": 143.1979217529297, "epoch": 0.2630691399662732, "grad_norm": 8.075882706845949, "kl": 0.123291015625, "learning_rate": 9.780405405405405e-07, "loss": 0.0001, "reward": 3.1208958625793457, "reward_std": 0.27007415145635605, "rewards/final_reward": 1.3364821226854584, "rewards/mask_iou_reward": 0.6682410613427292, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1208957433700562, "rewards/thk_ans_format_reward": 1.0, "step": 78, "think_completion_length": 69.41666666666666 }, { "clip_ratio": 0.0, "completion_length": 134.9791717529297, "epoch": 0.26644182124789206, "grad_norm": 11.33968365370947, "kl": 0.131103515625, "learning_rate": 9.77759009009009e-07, "loss": 0.0001, "reward": 3.168465733528137, "reward_std": 0.20537365972995758, "rewards/final_reward": 0.6006425484332252, "rewards/mask_iou_reward": 0.3003212742166126, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1684656739234924, "rewards/thk_ans_format_reward": 1.0, "step": 79, "think_completion_length": 74.875 }, { "clip_ratio": 0.0, "completion_length": 140.125, "epoch": 0.26981450252951095, "grad_norm": 29.683199255535733, "kl": 0.13330078125, "learning_rate": 9.774774774774774e-07, "loss": 0.0001, "reward": 2.8754775524139404, "reward_std": 0.3308670222759247, "rewards/final_reward": 1.0366864076232571, "rewards/mask_iou_reward": 0.5183432038116286, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.8754774928092957, "rewards/thk_ans_format_reward": 1.0, "step": 80, "think_completion_length": 66.70833333333334 }, { "clip_ratio": 0.0, "completion_length": 147.21875762939453, "epoch": 0.27318718381112983, "grad_norm": 12.216127743389162, "kl": 0.1181640625, "learning_rate": 9.771959459459458e-07, "loss": 0.0001, "reward": 2.8978902101516724, "reward_std": 0.40983031690120697, "rewards/final_reward": 0.950881268056939, "rewards/mask_iou_reward": 0.4754406340284695, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 0.9083065688610077, "rewards/thk_ans_format_reward": 1.0, "step": 81, "think_completion_length": 71.58333333333334 }, { "clip_ratio": 0.0, "completion_length": 151.36458587646484, "epoch": 0.2765598650927487, "grad_norm": 7.045076203455561, "kl": 0.3701171875, "learning_rate": 9.769144144144144e-07, "loss": 0.0004, "reward": 2.778691291809082, "reward_std": 0.3490441143512726, "rewards/final_reward": 0.33543247619297384, "rewards/mask_iou_reward": 0.16771623809648692, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.7786912024021149, "rewards/thk_ans_format_reward": 1.0, "step": 82, "think_completion_length": 76.91666666666666 }, { "clip_ratio": 0.0, "completion_length": 147.46875762939453, "epoch": 0.2799325463743676, "grad_norm": 5.069750542905911, "kl": 0.12451171875, "learning_rate": 9.766328828828827e-07, "loss": 0.0001, "reward": 2.6801772117614746, "reward_std": 0.4206371158361435, "rewards/final_reward": 1.0430866882172039, "rewards/mask_iou_reward": 0.5215433441086019, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.6801770925521851, "rewards/thk_ans_format_reward": 1.0, "step": 83, "think_completion_length": 79.25 }, { "clip_ratio": 0.0, "completion_length": 137.77084350585938, "epoch": 0.28330522765598654, "grad_norm": 12.00513443485755, "kl": 0.12255859375, "learning_rate": 9.763513513513513e-07, "loss": 0.0001, "reward": 2.851745367050171, "reward_std": 0.4787246733903885, "rewards/final_reward": 1.1709810151121771, "rewards/mask_iou_reward": 0.5854905075560886, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 0.8621618747711182, "rewards/thk_ans_format_reward": 1.0, "step": 84, "think_completion_length": 74.375 }, { "clip_ratio": 0.0, "completion_length": 142.40625762939453, "epoch": 0.2866779089376054, "grad_norm": 22.291846229168698, "kl": 0.133544921875, "learning_rate": 9.760698198198196e-07, "loss": 0.0001, "reward": 2.7811670303344727, "reward_std": 0.2809675335884094, "rewards/final_reward": 0.6003249698041416, "rewards/mask_iou_reward": 0.3001624849020708, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.7811669409275055, "rewards/thk_ans_format_reward": 1.0, "step": 85, "think_completion_length": 73.25 }, { "clip_ratio": 0.0, "completion_length": 143.46875762939453, "epoch": 0.2900505902192243, "grad_norm": 9.086774756116599, "kl": 0.1328125, "learning_rate": 9.757882882882882e-07, "loss": 0.0001, "reward": 3.1159489154815674, "reward_std": 0.1485934928059578, "rewards/final_reward": 1.1099682739039065, "rewards/mask_iou_reward": 0.5549841369519533, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.115948736667633, "rewards/thk_ans_format_reward": 1.0, "step": 86, "think_completion_length": 83.16666666666667 }, { "clip_ratio": 0.0, "completion_length": 150.4791717529297, "epoch": 0.2934232715008432, "grad_norm": 10.646510048049379, "kl": 0.112060546875, "learning_rate": 9.755067567567568e-07, "loss": 0.0001, "reward": 3.175132393836975, "reward_std": 0.4541157931089401, "rewards/final_reward": 0.9455331042822714, "rewards/mask_iou_reward": 0.4727665521411357, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 1.1855489611625671, "rewards/thk_ans_format_reward": 1.0, "step": 87, "think_completion_length": 77.375 }, { "clip_ratio": 0.0, "completion_length": 154.1979217529297, "epoch": 0.29679595278246207, "grad_norm": 19.66868318193964, "kl": 0.103271484375, "learning_rate": 9.752252252252251e-07, "loss": 0.0001, "reward": 3.08430278301239, "reward_std": 0.24113387614488602, "rewards/final_reward": 1.204691090668408, "rewards/mask_iou_reward": 0.602345545334204, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.084302693605423, "rewards/thk_ans_format_reward": 1.0, "step": 88, "think_completion_length": 87.5 }, { "clip_ratio": 0.0, "completion_length": 141.14584350585938, "epoch": 0.30016863406408095, "grad_norm": 4.329310940421637, "kl": 0.13623046875, "learning_rate": 9.749436936936937e-07, "loss": 0.0001, "reward": 3.087108612060547, "reward_std": 0.3445526212453842, "rewards/final_reward": 1.7332479771038667, "rewards/mask_iou_reward": 0.8666239885519333, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0871086716651917, "rewards/thk_ans_format_reward": 1.0, "step": 89, "think_completion_length": 70.54166666666666 }, { "clip_ratio": 0.0, "completion_length": 147.58333587646484, "epoch": 0.30354131534569984, "grad_norm": 5.52593254215471, "kl": 0.15185546875, "learning_rate": 9.74662162162162e-07, "loss": 0.0002, "reward": 3.0328943729400635, "reward_std": 0.3087661564350128, "rewards/final_reward": 1.1219688993011085, "rewards/mask_iou_reward": 0.5609844496505543, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0328941941261292, "rewards/thk_ans_format_reward": 1.0, "step": 90, "think_completion_length": 76.75 }, { "clip_ratio": 0.0, "completion_length": 145.09375762939453, "epoch": 0.3069139966273187, "grad_norm": 4.83035426553915, "kl": 0.10595703125, "learning_rate": 9.743806306306306e-07, "loss": 0.0001, "reward": 3.2756351232528687, "reward_std": 0.21860820055007935, "rewards/final_reward": 1.5841647447377873, "rewards/mask_iou_reward": 0.7920823723688937, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2756351232528687, "rewards/thk_ans_format_reward": 1.0, "step": 91, "think_completion_length": 73.95833333333333 }, { "clip_ratio": 0.0, "completion_length": 147.4166717529297, "epoch": 0.3102866779089376, "grad_norm": 5.485844590793826, "kl": 0.130126953125, "learning_rate": 9.74099099099099e-07, "loss": 0.0001, "reward": 3.332701802253723, "reward_std": 0.25605323910713196, "rewards/final_reward": 1.7413345387431938, "rewards/mask_iou_reward": 0.8706672693715969, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3327018022537231, "rewards/thk_ans_format_reward": 1.0, "step": 92, "think_completion_length": 82.70833333333334 }, { "clip_ratio": 0.0, "completion_length": 150.6979217529297, "epoch": 0.3136593591905565, "grad_norm": 12.53059709455058, "kl": 0.108642578125, "learning_rate": 9.738175675675675e-07, "loss": 0.0001, "reward": 2.661715030670166, "reward_std": 0.3630661815404892, "rewards/final_reward": 0.7241040143891004, "rewards/mask_iou_reward": 0.3620520071945502, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.6617150008678436, "rewards/thk_ans_format_reward": 1.0, "step": 93, "think_completion_length": 84.70833333333334 }, { "clip_ratio": 0.0, "completion_length": 160.59375762939453, "epoch": 0.31703204047217537, "grad_norm": 9.425838303959774, "kl": 0.13330078125, "learning_rate": 9.735360360360359e-07, "loss": 0.0001, "reward": 3.524588942527771, "reward_std": 0.25210119783878326, "rewards/final_reward": 1.5078914610928074, "rewards/mask_iou_reward": 0.7539457305464037, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.524588942527771, "rewards/thk_ans_format_reward": 1.0, "step": 94, "think_completion_length": 86.25 }, { "clip_ratio": 0.0, "completion_length": 152.78125762939453, "epoch": 0.32040472175379425, "grad_norm": 6.176918426451065, "kl": 0.1142578125, "learning_rate": 9.732545045045045e-07, "loss": 0.0001, "reward": 2.603161573410034, "reward_std": 0.23625994473695755, "rewards/final_reward": 0.7563615294481078, "rewards/mask_iou_reward": 0.3781807647240539, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.6031614542007446, "rewards/thk_ans_format_reward": 1.0, "step": 95, "think_completion_length": 89.16666666666666 }, { "clip_ratio": 0.0, "completion_length": 147.1666717529297, "epoch": 0.32377740303541314, "grad_norm": 4.239631710734004, "kl": 0.14306640625, "learning_rate": 9.72972972972973e-07, "loss": 0.0001, "reward": 3.153728723526001, "reward_std": 0.2302696853876114, "rewards/final_reward": 1.4158820962324687, "rewards/mask_iou_reward": 0.7079410481162344, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1537286639213562, "rewards/thk_ans_format_reward": 1.0, "step": 96, "think_completion_length": 83.70833333333333 }, { "clip_ratio": 0.0, "completion_length": 153.77083587646484, "epoch": 0.327150084317032, "grad_norm": 4.11745240192496, "kl": 0.140625, "learning_rate": 9.726914414414414e-07, "loss": 0.0001, "reward": 2.7436962127685547, "reward_std": 0.2693813741207123, "rewards/final_reward": 0.41176920242236215, "rewards/mask_iou_reward": 0.20588460121118107, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.7436961233615875, "rewards/thk_ans_format_reward": 1.0, "step": 97, "think_completion_length": 81.5 }, { "clip_ratio": 0.0, "completion_length": 156.75, "epoch": 0.3305227655986509, "grad_norm": 4.489432400487865, "kl": 0.11962890625, "learning_rate": 9.7240990990991e-07, "loss": 0.0001, "reward": 2.873469591140747, "reward_std": 0.35211898386478424, "rewards/final_reward": 0.2694470590593998, "rewards/mask_iou_reward": 0.1347235295296999, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.8734694123268127, "rewards/thk_ans_format_reward": 1.0, "step": 98, "think_completion_length": 96.33333333333333 }, { "clip_ratio": 0.0, "completion_length": 149.8854217529297, "epoch": 0.33389544688026984, "grad_norm": 5.779299275244791, "kl": 0.12890625, "learning_rate": 9.721283783783783e-07, "loss": 0.0001, "reward": 2.620466947555542, "reward_std": 0.3322184383869171, "rewards/final_reward": 0.32427589794335787, "rewards/mask_iou_reward": 0.16213794897167894, "rewards/sam_format_reward": 0.9791666865348816, "rewards/sam_reward_func_ultra": 0.6413003206253052, "rewards/thk_ans_format_reward": 1.0, "step": 99, "think_completion_length": 94.75 }, { "clip_ratio": 0.0, "completion_length": 162.0625, "epoch": 0.3372681281618887, "grad_norm": 7.847700810555537, "kl": 0.12109375, "learning_rate": 9.718468468468469e-07, "loss": 0.0001, "reward": 2.8851125240325928, "reward_std": 0.20413611084222794, "rewards/final_reward": 0.9221734908674944, "rewards/mask_iou_reward": 0.4610867454337472, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.8851124942302704, "rewards/thk_ans_format_reward": 1.0, "step": 100, "think_completion_length": 92.375 }, { "clip_ratio": 0.0, "completion_length": 142.61459350585938, "epoch": 0.3406408094435076, "grad_norm": 6.340272960727325, "kl": 0.127197265625, "learning_rate": 9.715653153153152e-07, "loss": 0.0001, "reward": 3.1949912309646606, "reward_std": 0.3106095865368843, "rewards/final_reward": 0.9199014567652839, "rewards/mask_iou_reward": 0.45995072838264195, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1949909627437592, "rewards/thk_ans_format_reward": 1.0, "step": 101, "think_completion_length": 76.20833333333334 }, { "clip_ratio": 0.0, "completion_length": 153.8854217529297, "epoch": 0.3440134907251265, "grad_norm": 10.886937887424214, "kl": 0.12109375, "learning_rate": 9.712837837837838e-07, "loss": 0.0001, "reward": 2.9201362133026123, "reward_std": 0.2227378636598587, "rewards/final_reward": 0.9971067769618158, "rewards/mask_iou_reward": 0.4985533884809079, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9201361835002899, "rewards/thk_ans_format_reward": 1.0, "step": 102, "think_completion_length": 89.08333333333333 }, { "clip_ratio": 0.0, "completion_length": 155.0729217529297, "epoch": 0.3473861720067454, "grad_norm": 10.041510866770627, "kl": 0.125, "learning_rate": 9.710022522522521e-07, "loss": 0.0001, "reward": 2.7521921396255493, "reward_std": 0.3331163227558136, "rewards/final_reward": 0.9283792153480164, "rewards/mask_iou_reward": 0.4641896076740082, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.7521921098232269, "rewards/thk_ans_format_reward": 1.0, "step": 103, "think_completion_length": 101.75 }, { "clip_ratio": 0.0, "completion_length": 159.39584350585938, "epoch": 0.35075885328836426, "grad_norm": 10.712565225905651, "kl": 0.109619140625, "learning_rate": 9.707207207207207e-07, "loss": 0.0001, "reward": 3.1159496307373047, "reward_std": 0.14314200729131699, "rewards/final_reward": 1.6823086959495313, "rewards/mask_iou_reward": 0.8411543479747656, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1159498393535614, "rewards/thk_ans_format_reward": 1.0, "step": 104, "think_completion_length": 85.95833333333334 }, { "clip_ratio": 0.0, "completion_length": 168.1041717529297, "epoch": 0.35413153456998314, "grad_norm": 9.741009916482888, "kl": 0.1142578125, "learning_rate": 9.70439189189189e-07, "loss": 0.0001, "reward": 2.7842196226119995, "reward_std": 0.3237437531352043, "rewards/final_reward": 0.5121928576839352, "rewards/mask_iou_reward": 0.2560964288419676, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.78421950340271, "rewards/thk_ans_format_reward": 1.0, "step": 105, "think_completion_length": 87.16666666666667 }, { "clip_ratio": 0.0, "completion_length": 151.4479217529297, "epoch": 0.357504215851602, "grad_norm": 50.68857598303174, "kl": 0.123779296875, "learning_rate": 9.701576576576576e-07, "loss": 0.0001, "reward": 3.562849760055542, "reward_std": 0.17162877321243286, "rewards/final_reward": 1.7938839579394017, "rewards/mask_iou_reward": 0.8969419789697008, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5628495812416077, "rewards/thk_ans_format_reward": 1.0, "step": 106, "think_completion_length": 92.95833333333334 }, { "clip_ratio": 0.0, "completion_length": 165.87500762939453, "epoch": 0.3608768971332209, "grad_norm": 7.883374108444154, "kl": 0.12109375, "learning_rate": 9.698761261261262e-07, "loss": 0.0001, "reward": 3.2476431131362915, "reward_std": 0.341851145029068, "rewards/final_reward": 1.393488279415816, "rewards/mask_iou_reward": 0.696744139707908, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2476428151130676, "rewards/thk_ans_format_reward": 1.0, "step": 107, "think_completion_length": 106.41666666666666 }, { "clip_ratio": 0.0, "completion_length": 161.45834350585938, "epoch": 0.3642495784148398, "grad_norm": 7.594040012287189, "kl": 0.1796875, "learning_rate": 9.695945945945946e-07, "loss": 0.0002, "reward": 3.0297967195510864, "reward_std": 0.20907431468367577, "rewards/final_reward": 0.9516663204931226, "rewards/mask_iou_reward": 0.4758331602465613, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0297967791557312, "rewards/thk_ans_format_reward": 1.0, "step": 108, "think_completion_length": 100.20833333333333 }, { "clip_ratio": 0.0, "completion_length": 164.98958587646484, "epoch": 0.3676222596964587, "grad_norm": 8.488356286505727, "kl": 0.12451171875, "learning_rate": 9.693130630630631e-07, "loss": 0.0001, "reward": 3.048767328262329, "reward_std": 0.383039191365242, "rewards/final_reward": 0.9600915533933114, "rewards/mask_iou_reward": 0.4800457766966557, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0487673580646515, "rewards/thk_ans_format_reward": 1.0, "step": 109, "think_completion_length": 103.08333333333334 }, { "clip_ratio": 0.0, "completion_length": 156.02084350585938, "epoch": 0.37099494097807756, "grad_norm": 6.210723775719561, "kl": 0.125732421875, "learning_rate": 9.690315315315315e-07, "loss": 0.0001, "reward": 2.9819425344467163, "reward_std": 0.27216267585754395, "rewards/final_reward": 0.46485509202170155, "rewards/mask_iou_reward": 0.23242754601085078, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9819426238536835, "rewards/thk_ans_format_reward": 1.0, "step": 110, "think_completion_length": 93.04166666666666 }, { "clip_ratio": 0.0, "completion_length": 159.15625, "epoch": 0.37436762225969644, "grad_norm": 8.678738528400313, "kl": 0.15576171875, "learning_rate": 9.6875e-07, "loss": 0.0002, "reward": 2.936001181602478, "reward_std": 0.22103118896484375, "rewards/final_reward": 0.6134164982851417, "rewards/mask_iou_reward": 0.30670824914257083, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9360010623931885, "rewards/thk_ans_format_reward": 1.0, "step": 111, "think_completion_length": 92.33333333333334 }, { "clip_ratio": 0.0, "completion_length": 180.8229217529297, "epoch": 0.3777403035413153, "grad_norm": 3.946076786450147, "kl": 0.15966796875, "learning_rate": 9.684684684684684e-07, "loss": 0.0002, "reward": 2.8658047914505005, "reward_std": 0.267331525683403, "rewards/final_reward": 0.8884399087779431, "rewards/mask_iou_reward": 0.44421995438897155, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.8658046722412109, "rewards/thk_ans_format_reward": 1.0, "step": 112, "think_completion_length": 121.0 }, { "clip_ratio": 0.0, "completion_length": 181.9479217529297, "epoch": 0.3811129848229342, "grad_norm": 13.643204279995123, "kl": 0.1142578125, "learning_rate": 9.68186936936937e-07, "loss": 0.0001, "reward": 3.2716516256332397, "reward_std": 0.26144395768642426, "rewards/final_reward": 0.5963319934097903, "rewards/mask_iou_reward": 0.29816599670489513, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2716516256332397, "rewards/thk_ans_format_reward": 1.0, "step": 113, "think_completion_length": 90.54166666666666 }, { "clip_ratio": 0.0, "completion_length": 171.89584350585938, "epoch": 0.3844856661045531, "grad_norm": 4.560320424584854, "kl": 0.121826171875, "learning_rate": 9.679054054054053e-07, "loss": 0.0001, "reward": 2.9422881603240967, "reward_std": 0.30686257779598236, "rewards/final_reward": 1.3590917123836523, "rewards/mask_iou_reward": 0.6795458561918262, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9422881901264191, "rewards/thk_ans_format_reward": 1.0, "step": 114, "think_completion_length": 101.70833333333333 }, { "clip_ratio": 0.0, "completion_length": 171.0416717529297, "epoch": 0.38785834738617203, "grad_norm": 9.483350438450554, "kl": 0.130615234375, "learning_rate": 9.676238738738739e-07, "loss": 0.0001, "reward": 3.0643558502197266, "reward_std": 0.35449835658073425, "rewards/final_reward": 0.6536492522459832, "rewards/mask_iou_reward": 0.3268246261229916, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0747724771499634, "rewards/thk_ans_format_reward": 0.9895833432674408, "step": 115, "think_completion_length": 96.125 }, { "clip_ratio": 0.0, "completion_length": 169.27084350585938, "epoch": 0.3912310286677909, "grad_norm": 8.231822162450193, "kl": 0.12744140625, "learning_rate": 9.673423423423422e-07, "loss": 0.0001, "reward": 3.1779122352600098, "reward_std": 0.24591050297021866, "rewards/final_reward": 1.5311987731543408, "rewards/mask_iou_reward": 0.7655993865771704, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1779123544692993, "rewards/thk_ans_format_reward": 1.0, "step": 116, "think_completion_length": 94.83333333333334 }, { "clip_ratio": 0.0, "completion_length": 178.83333587646484, "epoch": 0.3946037099494098, "grad_norm": 4.853211323414288, "kl": 0.133544921875, "learning_rate": 9.670608108108108e-07, "loss": 0.0001, "reward": 2.904745578765869, "reward_std": 0.34407839179039, "rewards/final_reward": 1.1153784326788996, "rewards/mask_iou_reward": 0.5576892163394498, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9047453701496124, "rewards/thk_ans_format_reward": 1.0, "step": 117, "think_completion_length": 95.16666666666667 }, { "clip_ratio": 0.0, "completion_length": 160.64583587646484, "epoch": 0.3979763912310287, "grad_norm": 5.020686586708645, "kl": 0.121337890625, "learning_rate": 9.667792792792794e-07, "loss": 0.0001, "reward": 3.063020706176758, "reward_std": 0.2569497376680374, "rewards/final_reward": 0.6591898883028006, "rewards/mask_iou_reward": 0.3295949441514003, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0630205869674683, "rewards/thk_ans_format_reward": 1.0, "step": 118, "think_completion_length": 95.33333333333333 }, { "clip_ratio": 0.0, "completion_length": 168.4791717529297, "epoch": 0.40134907251264756, "grad_norm": 12.412696310883033, "kl": 0.1435546875, "learning_rate": 9.664977477477477e-07, "loss": 0.0001, "reward": 2.9063340425491333, "reward_std": 0.26251453161239624, "rewards/final_reward": 0.7353001495272187, "rewards/mask_iou_reward": 0.36765007476360934, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9063339531421661, "rewards/thk_ans_format_reward": 1.0, "step": 119, "think_completion_length": 102.66666666666667 }, { "clip_ratio": 0.0, "completion_length": 173.1979217529297, "epoch": 0.40472175379426645, "grad_norm": 3.992774052478872, "kl": 0.145751953125, "learning_rate": 9.66216216216216e-07, "loss": 0.0003, "reward": 2.862497925758362, "reward_std": 0.31197597831487656, "rewards/final_reward": 1.068799297125957, "rewards/mask_iou_reward": 0.5343996485629785, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.8624976277351379, "rewards/thk_ans_format_reward": 1.0, "step": 120, "think_completion_length": 119.625 }, { "clip_ratio": 0.0, "completion_length": 170.14584350585938, "epoch": 0.40809443507588533, "grad_norm": 10.473006443962179, "kl": 0.15771484375, "learning_rate": 9.659346846846846e-07, "loss": 0.0002, "reward": 2.971013307571411, "reward_std": 0.33689363300800323, "rewards/final_reward": 1.1416579108015639, "rewards/mask_iou_reward": 0.5708289554007819, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9710133671760559, "rewards/thk_ans_format_reward": 1.0, "step": 121, "think_completion_length": 97.91666666666666 }, { "clip_ratio": 0.0, "completion_length": 183.5416717529297, "epoch": 0.4114671163575042, "grad_norm": 9.534381874086584, "kl": 0.1396484375, "learning_rate": 9.65653153153153e-07, "loss": 0.0002, "reward": 2.8356984853744507, "reward_std": 0.15941885858774185, "rewards/final_reward": 0.4459266845397626, "rewards/mask_iou_reward": 0.2229633422698813, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.8356985449790955, "rewards/thk_ans_format_reward": 1.0, "step": 122, "think_completion_length": 118.79166666666666 }, { "clip_ratio": 0.0, "completion_length": 172.70833587646484, "epoch": 0.4148397976391231, "grad_norm": 6.86959731192038, "kl": 0.136474609375, "learning_rate": 9.653716216216216e-07, "loss": 0.0001, "reward": 3.089380621910095, "reward_std": 0.19493500515818596, "rewards/final_reward": 0.9564584633661732, "rewards/mask_iou_reward": 0.4782292316830866, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0893806219100952, "rewards/thk_ans_format_reward": 1.0, "step": 123, "think_completion_length": 104.29166666666667 }, { "clip_ratio": 0.0, "completion_length": 185.21875762939453, "epoch": 0.418212478920742, "grad_norm": 6.799688461335475, "kl": 0.1376953125, "learning_rate": 9.6509009009009e-07, "loss": 0.0001, "reward": 3.0790122747421265, "reward_std": 0.2588284760713577, "rewards/final_reward": 0.9644275758627705, "rewards/mask_iou_reward": 0.48221378793138525, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0790123343467712, "rewards/thk_ans_format_reward": 1.0, "step": 124, "think_completion_length": 124.75 }, { "clip_ratio": 0.0, "completion_length": 171.4166717529297, "epoch": 0.42158516020236086, "grad_norm": 13.750045710115277, "kl": 0.16064453125, "learning_rate": 9.648085585585585e-07, "loss": 0.0002, "reward": 3.409953236579895, "reward_std": 0.3135468512773514, "rewards/final_reward": 1.509239971271122, "rewards/mask_iou_reward": 0.754619985635561, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4099529385566711, "rewards/thk_ans_format_reward": 1.0, "step": 125, "think_completion_length": 92.54166666666667 }, { "clip_ratio": 0.0, "completion_length": 166.81250762939453, "epoch": 0.42495784148397975, "grad_norm": 12.535850216866876, "kl": 0.1357421875, "learning_rate": 9.645270270270268e-07, "loss": 0.0001, "reward": 3.326646089553833, "reward_std": 0.23538677394390106, "rewards/final_reward": 0.8067363062978283, "rewards/mask_iou_reward": 0.40336815314891417, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3266459703445435, "rewards/thk_ans_format_reward": 1.0, "step": 126, "think_completion_length": 110.625 }, { "clip_ratio": 0.0, "completion_length": 197.4479217529297, "epoch": 0.42833052276559863, "grad_norm": 4.964117400514983, "kl": 0.12890625, "learning_rate": 9.642454954954954e-07, "loss": 0.0001, "reward": 3.025187849998474, "reward_std": 0.3882623016834259, "rewards/final_reward": 1.5354492991876239, "rewards/mask_iou_reward": 0.7677246495938119, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0251877903938293, "rewards/thk_ans_format_reward": 1.0, "step": 127, "think_completion_length": 97.25 }, { "clip_ratio": 0.0, "completion_length": 172.1979217529297, "epoch": 0.4317032040472175, "grad_norm": 8.253511110956323, "kl": 0.13916015625, "learning_rate": 9.63963963963964e-07, "loss": 0.0001, "reward": 3.17389976978302, "reward_std": 0.34934788942337036, "rewards/final_reward": 0.6571360829891589, "rewards/mask_iou_reward": 0.32856804149457947, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1738998293876648, "rewards/thk_ans_format_reward": 1.0, "step": 128, "think_completion_length": 114.29166666666667 }, { "clip_ratio": 0.0, "completion_length": 172.6979217529297, "epoch": 0.4350758853288364, "grad_norm": 9.764055720521588, "kl": 0.1328125, "learning_rate": 9.636824324324323e-07, "loss": 0.0001, "reward": 2.9551135301589966, "reward_std": 0.23873476684093475, "rewards/final_reward": 0.29115001782333383, "rewards/mask_iou_reward": 0.14557500891166691, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 0.9655300378799438, "rewards/thk_ans_format_reward": 1.0, "step": 129, "think_completion_length": 126.45833333333333 }, { "clip_ratio": 0.0, "completion_length": 171.0104217529297, "epoch": 0.43844856661045534, "grad_norm": 5.43923330871092, "kl": 0.1748046875, "learning_rate": 9.63400900900901e-07, "loss": 0.0002, "reward": 3.2071645259857178, "reward_std": 0.24996963143348694, "rewards/final_reward": 1.1977146186007315, "rewards/mask_iou_reward": 0.5988573093003657, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2071644067764282, "rewards/thk_ans_format_reward": 1.0, "step": 130, "think_completion_length": 102.875 }, { "clip_ratio": 0.0, "completion_length": 171.625, "epoch": 0.4418212478920742, "grad_norm": 6.752256975163003, "kl": 0.171875, "learning_rate": 9.631193693693693e-07, "loss": 0.0002, "reward": 3.077457904815674, "reward_std": 0.16670826077461243, "rewards/final_reward": 1.0109958851085978, "rewards/mask_iou_reward": 0.5054979425542989, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0774577260017395, "rewards/thk_ans_format_reward": 1.0, "step": 131, "think_completion_length": 109.41666666666666 }, { "clip_ratio": 0.0, "completion_length": 169.15625, "epoch": 0.4451939291736931, "grad_norm": 7.106410397724427, "kl": 0.1474609375, "learning_rate": 9.628378378378378e-07, "loss": 0.0001, "reward": 2.8066269159317017, "reward_std": 0.38548873364925385, "rewards/final_reward": 0.707636198443585, "rewards/mask_iou_reward": 0.3538180992217925, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.8066268563270569, "rewards/thk_ans_format_reward": 1.0, "step": 132, "think_completion_length": 93.91666666666667 }, { "clip_ratio": 0.0, "completion_length": 172.17709350585938, "epoch": 0.448566610455312, "grad_norm": 7.057980509975816, "kl": 0.1796875, "learning_rate": 9.625563063063062e-07, "loss": 0.0002, "reward": 3.3535468578338623, "reward_std": 0.21944965422153473, "rewards/final_reward": 1.243831253053847, "rewards/mask_iou_reward": 0.6219156265269234, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.353546917438507, "rewards/thk_ans_format_reward": 1.0, "step": 133, "think_completion_length": 103.25 }, { "clip_ratio": 0.0, "completion_length": 182.2291717529297, "epoch": 0.45193929173693087, "grad_norm": 4.563927394653558, "kl": 0.15625, "learning_rate": 9.622747747747747e-07, "loss": 0.0002, "reward": 2.870627284049988, "reward_std": 0.2948570251464844, "rewards/final_reward": 0.6376485645134273, "rewards/mask_iou_reward": 0.31882428225671366, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.870627224445343, "rewards/thk_ans_format_reward": 1.0, "step": 134, "think_completion_length": 99.95833333333334 }, { "clip_ratio": 0.0, "completion_length": 180.9479217529297, "epoch": 0.45531197301854975, "grad_norm": 7.050674237211012, "kl": 0.15869140625, "learning_rate": 9.61993243243243e-07, "loss": 0.0002, "reward": 2.6339285373687744, "reward_std": 0.42745040357112885, "rewards/final_reward": 0.12814824118852572, "rewards/mask_iou_reward": 0.06407412059426286, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 0.6547618210315704, "rewards/thk_ans_format_reward": 0.9895833432674408, "step": 135, "think_completion_length": 105.375 }, { "clip_ratio": 0.0, "completion_length": 178.2916717529297, "epoch": 0.45868465430016864, "grad_norm": 11.431729430530904, "kl": 0.16259765625, "learning_rate": 9.617117117117117e-07, "loss": 0.0002, "reward": 2.931082606315613, "reward_std": 0.40491442382335663, "rewards/final_reward": 1.4102242110294274, "rewards/mask_iou_reward": 0.7051121055147137, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9310824871063232, "rewards/thk_ans_format_reward": 1.0, "step": 136, "think_completion_length": 110.5 }, { "clip_ratio": 0.0, "completion_length": 166.73958587646484, "epoch": 0.4620573355817875, "grad_norm": 21.583035164198247, "kl": 0.17724609375, "learning_rate": 9.614301801801802e-07, "loss": 0.0002, "reward": 2.7264972925186157, "reward_std": 0.36133062839508057, "rewards/final_reward": 0.7153198900624561, "rewards/mask_iou_reward": 0.35765994503122805, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.7264970242977142, "rewards/thk_ans_format_reward": 1.0, "step": 137, "think_completion_length": 112.29166666666666 }, { "clip_ratio": 0.0, "completion_length": 164.28125762939453, "epoch": 0.4654300168634064, "grad_norm": 9.40238456718145, "kl": 0.17822265625, "learning_rate": 9.611486486486486e-07, "loss": 0.0002, "reward": 3.1096519231796265, "reward_std": 0.30247916281223297, "rewards/final_reward": 1.4404627860085197, "rewards/mask_iou_reward": 0.7202313930042599, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1096520125865936, "rewards/thk_ans_format_reward": 1.0, "step": 138, "think_completion_length": 106.29166666666667 }, { "clip_ratio": 0.0, "completion_length": 169.52084350585938, "epoch": 0.4688026981450253, "grad_norm": 4.9043637265708515, "kl": 0.16943359375, "learning_rate": 9.608671171171172e-07, "loss": 0.0002, "reward": 2.866019368171692, "reward_std": 0.21420340985059738, "rewards/final_reward": 0.525169727990159, "rewards/mask_iou_reward": 0.2625848639950795, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.8660192787647247, "rewards/thk_ans_format_reward": 1.0, "step": 139, "think_completion_length": 94.70833333333334 }, { "clip_ratio": 0.0, "completion_length": 155.96875762939453, "epoch": 0.47217537942664417, "grad_norm": 5.837762710429281, "kl": 0.1826171875, "learning_rate": 9.605855855855855e-07, "loss": 0.0002, "reward": 3.2641146183013916, "reward_std": 0.2204541265964508, "rewards/final_reward": 1.4925167842831235, "rewards/mask_iou_reward": 0.7462583921415618, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2641146779060364, "rewards/thk_ans_format_reward": 1.0, "step": 140, "think_completion_length": 82.58333333333334 }, { "clip_ratio": 0.0, "completion_length": 165.8125, "epoch": 0.47554806070826305, "grad_norm": 9.65070929894506, "kl": 0.20068359375, "learning_rate": 9.60304054054054e-07, "loss": 0.0002, "reward": 2.757500410079956, "reward_std": 0.26694121956825256, "rewards/final_reward": 0.7410724561390301, "rewards/mask_iou_reward": 0.37053622806951503, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.7575002908706665, "rewards/thk_ans_format_reward": 1.0, "step": 141, "think_completion_length": 84.58333333333333 }, { "clip_ratio": 0.0, "completion_length": 162.90625762939453, "epoch": 0.47892074198988194, "grad_norm": 7.747616062513729, "kl": 0.1953125, "learning_rate": 9.600225225225224e-07, "loss": 0.0002, "reward": 3.1408188343048096, "reward_std": 0.25297851860523224, "rewards/final_reward": 0.9605678349099196, "rewards/mask_iou_reward": 0.4802839174549598, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1408189535140991, "rewards/thk_ans_format_reward": 1.0, "step": 142, "think_completion_length": 92.5 }, { "clip_ratio": 0.0, "completion_length": 156.4166717529297, "epoch": 0.4822934232715008, "grad_norm": 6.639684074671962, "kl": 0.18212890625, "learning_rate": 9.59740990990991e-07, "loss": 0.0002, "reward": 3.2298460006713867, "reward_std": 0.24782373011112213, "rewards/final_reward": 1.4688922774294029, "rewards/mask_iou_reward": 0.7344461387147014, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.229845941066742, "rewards/thk_ans_format_reward": 1.0, "step": 143, "think_completion_length": 82.08333333333333 }, { "clip_ratio": 0.0, "completion_length": 159.52083587646484, "epoch": 0.4856661045531197, "grad_norm": 11.690583175650898, "kl": 0.19287109375, "learning_rate": 9.594594594594594e-07, "loss": 0.0002, "reward": 2.953932523727417, "reward_std": 0.24815939366817474, "rewards/final_reward": 0.25470155704168357, "rewards/mask_iou_reward": 0.12735077852084178, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9539322257041931, "rewards/thk_ans_format_reward": 1.0, "step": 144, "think_completion_length": 88.75 }, { "clip_ratio": 0.0, "completion_length": 166.0625, "epoch": 0.48903878583473864, "grad_norm": 8.171245738483481, "kl": 0.20947265625, "learning_rate": 9.59177927927928e-07, "loss": 0.0002, "reward": 3.105030059814453, "reward_std": 0.41488519310951233, "rewards/final_reward": 1.0450923837882327, "rewards/mask_iou_reward": 0.5225461918941163, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1050302386283875, "rewards/thk_ans_format_reward": 1.0, "step": 145, "think_completion_length": 84.75 }, { "clip_ratio": 0.0, "completion_length": 162.9791717529297, "epoch": 0.4924114671163575, "grad_norm": 6.8042002771504695, "kl": 0.1826171875, "learning_rate": 9.588963963963963e-07, "loss": 0.0002, "reward": 2.978797674179077, "reward_std": 0.23305433988571167, "rewards/final_reward": 0.5362301160155533, "rewards/mask_iou_reward": 0.26811505800777663, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.97879758477211, "rewards/thk_ans_format_reward": 1.0, "step": 146, "think_completion_length": 79.33333333333334 }, { "clip_ratio": 0.0, "completion_length": 151.7604217529297, "epoch": 0.4957841483979764, "grad_norm": 5.8864092171349585, "kl": 0.21484375, "learning_rate": 9.586148648648648e-07, "loss": 0.0002, "reward": 3.4847919940948486, "reward_std": 0.13796599209308624, "rewards/final_reward": 1.3652275476646998, "rewards/mask_iou_reward": 0.6826137738323499, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4847919344902039, "rewards/thk_ans_format_reward": 1.0, "step": 147, "think_completion_length": 82.625 }, { "clip_ratio": 0.0, "completion_length": 157.30209350585938, "epoch": 0.4991568296795953, "grad_norm": 5.0106064638814205, "kl": 0.2138671875, "learning_rate": 9.583333333333334e-07, "loss": 0.0002, "reward": 3.076038360595703, "reward_std": 0.27134670317173004, "rewards/final_reward": 0.9536914128816507, "rewards/mask_iou_reward": 0.47684570644082536, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0760382413864136, "rewards/thk_ans_format_reward": 1.0, "step": 148, "think_completion_length": 83.33333333333334 }, { "clip_ratio": 0.0, "completion_length": 146.6666717529297, "epoch": 0.5025295109612141, "grad_norm": 8.274985300107584, "kl": 0.2060546875, "learning_rate": 9.580518018018018e-07, "loss": 0.0002, "reward": 3.1267894506454468, "reward_std": 0.18382571265101433, "rewards/final_reward": 0.8946684420728204, "rewards/mask_iou_reward": 0.4473342210364102, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1267894208431244, "rewards/thk_ans_format_reward": 1.0, "step": 149, "think_completion_length": 84.58333333333334 }, { "clip_ratio": 0.0, "completion_length": 149.6979217529297, "epoch": 0.5059021922428331, "grad_norm": 16.142757363004044, "kl": 0.23291015625, "learning_rate": 9.577702702702703e-07, "loss": 0.0002, "reward": 3.0016664266586304, "reward_std": 0.2265520542860031, "rewards/final_reward": 1.3027442461369658, "rewards/mask_iou_reward": 0.6513721230684829, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0016663670539856, "rewards/thk_ans_format_reward": 1.0, "step": 150, "think_completion_length": 81.91666666666667 }, { "clip_ratio": 0.0, "completion_length": 147.0104217529297, "epoch": 0.5092748735244519, "grad_norm": 20.331315869387037, "kl": 0.2412109375, "learning_rate": 9.574887387387387e-07, "loss": 0.0002, "reward": 3.1735047101974487, "reward_std": 0.22183486074209213, "rewards/final_reward": 1.0375799778137802, "rewards/mask_iou_reward": 0.5187899889068901, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.173504650592804, "rewards/thk_ans_format_reward": 1.0, "step": 151, "think_completion_length": 84.83333333333334 }, { "clip_ratio": 0.0, "completion_length": 146.8541717529297, "epoch": 0.5126475548060708, "grad_norm": 7.285780487915525, "kl": 0.19775390625, "learning_rate": 9.572072072072072e-07, "loss": 0.0002, "reward": 2.487415313720703, "reward_std": 0.2901010140776634, "rewards/final_reward": 0.6410345660592923, "rewards/mask_iou_reward": 0.32051728302964616, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.4874153882265091, "rewards/thk_ans_format_reward": 1.0, "step": 152, "think_completion_length": 70.125 }, { "clip_ratio": 0.0, "completion_length": 171.58333587646484, "epoch": 0.5160202360876898, "grad_norm": 5.037142388718095, "kl": 0.37255859375, "learning_rate": 9.569256756756756e-07, "loss": 0.0004, "reward": 3.0975812673568726, "reward_std": 0.41272978484630585, "rewards/final_reward": 1.2630302577736632, "rewards/mask_iou_reward": 0.6315151288868316, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0975810885429382, "rewards/thk_ans_format_reward": 1.0, "step": 153, "think_completion_length": 87.66666666666667 }, { "clip_ratio": 0.0, "completion_length": 171.92709350585938, "epoch": 0.5193929173693086, "grad_norm": 9.476578185984213, "kl": 0.27099609375, "learning_rate": 9.566441441441442e-07, "loss": 0.0003, "reward": 2.7476073503494263, "reward_std": 0.12777689844369888, "rewards/final_reward": 1.0467066867601786, "rewards/mask_iou_reward": 0.5233533433800893, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.7476073503494263, "rewards/thk_ans_format_reward": 1.0, "step": 154, "think_completion_length": 81.875 }, { "clip_ratio": 0.0, "completion_length": 152.37500762939453, "epoch": 0.5227655986509275, "grad_norm": 10.872523571455144, "kl": 0.263671875, "learning_rate": 9.563626126126125e-07, "loss": 0.0003, "reward": 3.087652087211609, "reward_std": 0.2608217652887106, "rewards/final_reward": 1.1027023211599696, "rewards/mask_iou_reward": 0.5513511605799848, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0876522362232208, "rewards/thk_ans_format_reward": 1.0, "step": 155, "think_completion_length": 84.25 }, { "clip_ratio": 0.0, "completion_length": 148.80208587646484, "epoch": 0.5261382799325464, "grad_norm": 20.114727083957355, "kl": 0.23681640625, "learning_rate": 9.56081081081081e-07, "loss": 0.0002, "reward": 3.0146535634994507, "reward_std": 0.25460537523031235, "rewards/final_reward": 0.5053419912394292, "rewards/mask_iou_reward": 0.2526709956197146, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0146536529064178, "rewards/thk_ans_format_reward": 1.0, "step": 156, "think_completion_length": 92.79166666666666 }, { "clip_ratio": 0.0, "completion_length": 149.48958587646484, "epoch": 0.5295109612141653, "grad_norm": 7.9617120534329535, "kl": 0.228515625, "learning_rate": 9.557995495495497e-07, "loss": 0.0002, "reward": 3.141195058822632, "reward_std": 0.10193538293242455, "rewards/final_reward": 0.5441117242993203, "rewards/mask_iou_reward": 0.27205586214966015, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.14119490981102, "rewards/thk_ans_format_reward": 1.0, "step": 157, "think_completion_length": 89.79166666666666 }, { "clip_ratio": 0.0, "completion_length": 144.61459350585938, "epoch": 0.5328836424957841, "grad_norm": 7.479607652753245, "kl": 0.2685546875, "learning_rate": 9.55518018018018e-07, "loss": 0.0003, "reward": 3.1644891500473022, "reward_std": 0.2984514832496643, "rewards/final_reward": 0.957407182539989, "rewards/mask_iou_reward": 0.4787035912699945, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.164489209651947, "rewards/thk_ans_format_reward": 1.0, "step": 158, "think_completion_length": 81.75 }, { "clip_ratio": 0.0, "completion_length": 148.23958587646484, "epoch": 0.5362563237774031, "grad_norm": 5.402652813833984, "kl": 0.2373046875, "learning_rate": 9.552364864864864e-07, "loss": 0.0002, "reward": 2.986154317855835, "reward_std": 0.3532260060310364, "rewards/final_reward": 1.7312428830340285, "rewards/mask_iou_reward": 0.8656214415170143, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9861544370651245, "rewards/thk_ans_format_reward": 1.0, "step": 159, "think_completion_length": 77.04166666666667 }, { "clip_ratio": 0.0, "completion_length": 141.56250762939453, "epoch": 0.5396290050590219, "grad_norm": 31.67704383926532, "kl": 0.35546875, "learning_rate": 9.54954954954955e-07, "loss": 0.0004, "reward": 3.3903119564056396, "reward_std": 0.2488839253783226, "rewards/final_reward": 1.530599074157094, "rewards/mask_iou_reward": 0.765299537078547, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3903120160102844, "rewards/thk_ans_format_reward": 1.0, "step": 160, "think_completion_length": 74.54166666666667 }, { "clip_ratio": 0.0, "completion_length": 138.86458587646484, "epoch": 0.5430016863406408, "grad_norm": 10.93334970931945, "kl": 0.2666015625, "learning_rate": 9.546734234234233e-07, "loss": 0.0003, "reward": 3.126182436943054, "reward_std": 0.23756013810634613, "rewards/final_reward": 1.3464974720541376, "rewards/mask_iou_reward": 0.6732487360270688, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.126182347536087, "rewards/thk_ans_format_reward": 1.0, "step": 161, "think_completion_length": 68.875 }, { "clip_ratio": 0.0, "completion_length": 142.9166717529297, "epoch": 0.5463743676222597, "grad_norm": 7.656379946063235, "kl": 0.24755859375, "learning_rate": 9.543918918918919e-07, "loss": 0.0003, "reward": 2.90953528881073, "reward_std": 0.115766741335392, "rewards/final_reward": 0.8745932935241574, "rewards/mask_iou_reward": 0.4372966467620787, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9095353484153748, "rewards/thk_ans_format_reward": 1.0, "step": 162, "think_completion_length": 74.375 }, { "clip_ratio": 0.0, "completion_length": 144.73958587646484, "epoch": 0.5497470489038786, "grad_norm": 6.866761916729452, "kl": 0.2314453125, "learning_rate": 9.541103603603602e-07, "loss": 0.0002, "reward": 3.1094547510147095, "reward_std": 0.22638342529535294, "rewards/final_reward": 1.349065662874475, "rewards/mask_iou_reward": 0.6745328314372375, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 1.119871363043785, "rewards/thk_ans_format_reward": 1.0, "step": 163, "think_completion_length": 65.125 }, { "clip_ratio": 0.0, "completion_length": 150.83333587646484, "epoch": 0.5531197301854974, "grad_norm": 5.657018351301425, "kl": 0.24658203125, "learning_rate": 9.538288288288288e-07, "loss": 0.0002, "reward": 2.8507970571517944, "reward_std": 0.1843552067875862, "rewards/final_reward": 0.3972928312518996, "rewards/mask_iou_reward": 0.1986464156259498, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.8507969975471497, "rewards/thk_ans_format_reward": 1.0, "step": 164, "think_completion_length": 86.125 }, { "clip_ratio": 0.0, "completion_length": 136.78125762939453, "epoch": 0.5564924114671164, "grad_norm": 13.056201326210605, "kl": 0.28515625, "learning_rate": 9.535472972972972e-07, "loss": 0.0003, "reward": 3.584625482559204, "reward_std": 0.14753572642803192, "rewards/final_reward": 1.270440407309784, "rewards/mask_iou_reward": 0.635220203654892, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5846253633499146, "rewards/thk_ans_format_reward": 1.0, "step": 165, "think_completion_length": 72.79166666666667 }, { "clip_ratio": 0.0, "completion_length": 142.9166717529297, "epoch": 0.5598650927487352, "grad_norm": 5.643407452949351, "kl": 0.26171875, "learning_rate": 9.532657657657657e-07, "loss": 0.0003, "reward": 3.2158550024032593, "reward_std": 0.11347110942006111, "rewards/final_reward": 1.7543514291511069, "rewards/mask_iou_reward": 0.8771757145755534, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2158551216125488, "rewards/thk_ans_format_reward": 1.0, "step": 166, "think_completion_length": 75.66666666666666 }, { "clip_ratio": 0.0, "completion_length": 148.7916717529297, "epoch": 0.5632377740303541, "grad_norm": 12.62639672873956, "kl": 0.26220703125, "learning_rate": 9.529842342342343e-07, "loss": 0.0003, "reward": 2.8758022785186768, "reward_std": 0.2610451355576515, "rewards/final_reward": 0.9628604077281282, "rewards/mask_iou_reward": 0.4814302038640641, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.8758021593093872, "rewards/thk_ans_format_reward": 1.0, "step": 167, "think_completion_length": 84.04166666666666 }, { "clip_ratio": 0.0, "completion_length": 149.1041717529297, "epoch": 0.5666104553119731, "grad_norm": 15.581925152352548, "kl": 0.248046875, "learning_rate": 9.527027027027027e-07, "loss": 0.0003, "reward": 3.1900848150253296, "reward_std": 0.30189305543899536, "rewards/final_reward": 1.1237223638565845, "rewards/mask_iou_reward": 0.5618611819282923, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1900847554206848, "rewards/thk_ans_format_reward": 1.0, "step": 168, "think_completion_length": 78.0 }, { "clip_ratio": 0.0, "completion_length": 148.6041717529297, "epoch": 0.5699831365935919, "grad_norm": 9.718093636345376, "kl": 0.2734375, "learning_rate": 9.524211711711712e-07, "loss": 0.0003, "reward": 3.1855918169021606, "reward_std": 0.26739974319934845, "rewards/final_reward": 1.2005245534438336, "rewards/mask_iou_reward": 0.6002622767219168, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 1.196008563041687, "rewards/thk_ans_format_reward": 1.0, "step": 169, "think_completion_length": 73.45833333333333 }, { "clip_ratio": 0.0, "completion_length": 150.93750762939453, "epoch": 0.5733558178752108, "grad_norm": 9.760113218323678, "kl": 0.27294921875, "learning_rate": 9.521396396396396e-07, "loss": 0.0003, "reward": 3.3592395782470703, "reward_std": 0.20917140692472458, "rewards/final_reward": 1.4165027563247574, "rewards/mask_iou_reward": 0.7082513781623787, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3592395782470703, "rewards/thk_ans_format_reward": 1.0, "step": 170, "think_completion_length": 84.79166666666666 }, { "clip_ratio": 0.0, "completion_length": 163.90625762939453, "epoch": 0.5767284991568297, "grad_norm": 31.25506389342562, "kl": 0.29296875, "learning_rate": 9.518581081081081e-07, "loss": 0.0003, "reward": 3.5906260013580322, "reward_std": 0.2482130452990532, "rewards/final_reward": 1.7329653551792745, "rewards/mask_iou_reward": 0.8664826775896373, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 1.6010427474975586, "rewards/thk_ans_format_reward": 1.0, "step": 171, "think_completion_length": 83.66666666666666 }, { "clip_ratio": 0.0, "completion_length": 150.8229217529297, "epoch": 0.5801011804384486, "grad_norm": 5.221187581915739, "kl": 0.3369140625, "learning_rate": 9.515765765765766e-07, "loss": 0.0003, "reward": 3.1545242071151733, "reward_std": 0.23806846141815186, "rewards/final_reward": 1.0177511067470535, "rewards/mask_iou_reward": 0.5088755533735267, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1545242071151733, "rewards/thk_ans_format_reward": 1.0, "step": 172, "think_completion_length": 77.83333333333333 }, { "clip_ratio": 0.0, "completion_length": 180.21875, "epoch": 0.5834738617200674, "grad_norm": 6.89011581902954, "kl": 0.259765625, "learning_rate": 9.51295045045045e-07, "loss": 0.0003, "reward": 3.1506524085998535, "reward_std": 0.20986726135015488, "rewards/final_reward": 0.9594745083224383, "rewards/mask_iou_reward": 0.47973725416121915, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1506522297859192, "rewards/thk_ans_format_reward": 1.0, "step": 173, "think_completion_length": 137.54166666666666 }, { "clip_ratio": 0.0, "completion_length": 159.08333587646484, "epoch": 0.5868465430016864, "grad_norm": 13.689620989891141, "kl": 0.25439453125, "learning_rate": 9.510135135135135e-07, "loss": 0.0001, "reward": 2.8189436197280884, "reward_std": 0.12411446496844292, "rewards/final_reward": 0.6297055339218588, "rewards/mask_iou_reward": 0.3148527669609294, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.818943589925766, "rewards/thk_ans_format_reward": 1.0, "step": 174, "think_completion_length": 102.125 }, { "clip_ratio": 0.0, "completion_length": 177.42709350585938, "epoch": 0.5902192242833052, "grad_norm": 10.636713075668137, "kl": 0.240234375, "learning_rate": 9.50731981981982e-07, "loss": 0.0002, "reward": 3.1594501733779907, "reward_std": 0.33075501024723053, "rewards/final_reward": 0.22629558092618446, "rewards/mask_iou_reward": 0.11314779046309223, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1594501733779907, "rewards/thk_ans_format_reward": 1.0, "step": 175, "think_completion_length": 108.875 }, { "clip_ratio": 0.0, "completion_length": 184.1666717529297, "epoch": 0.5935919055649241, "grad_norm": 4.761664368364506, "kl": 0.2587890625, "learning_rate": 9.504504504504504e-07, "loss": 0.0003, "reward": 3.0785621404647827, "reward_std": 0.24554403126239777, "rewards/final_reward": 1.2377652068387972, "rewards/mask_iou_reward": 0.6188826034193986, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0785619616508484, "rewards/thk_ans_format_reward": 1.0, "step": 176, "think_completion_length": 105.79166666666666 }, { "clip_ratio": 0.0, "completion_length": 166.11459350585938, "epoch": 0.596964586846543, "grad_norm": 30.958999247456223, "kl": 0.2939453125, "learning_rate": 9.50168918918919e-07, "loss": 0.0003, "reward": 3.3248791694641113, "reward_std": 0.32074373215436935, "rewards/final_reward": 1.5646520924025296, "rewards/mask_iou_reward": 0.7823260462012648, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3248790502548218, "rewards/thk_ans_format_reward": 1.0, "step": 177, "think_completion_length": 86.625 }, { "clip_ratio": 0.0, "completion_length": 188.42709350585938, "epoch": 0.6003372681281619, "grad_norm": 8.363721329646177, "kl": 0.26708984375, "learning_rate": 9.498873873873874e-07, "loss": 0.0003, "reward": 2.8474905490875244, "reward_std": 0.17037975788116455, "rewards/final_reward": 1.4689836335171615, "rewards/mask_iou_reward": 0.7344918167585808, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.847490519285202, "rewards/thk_ans_format_reward": 1.0, "step": 178, "think_completion_length": 95.70833333333333 }, { "clip_ratio": 0.0, "completion_length": 184.23959350585938, "epoch": 0.6037099494097807, "grad_norm": 16.177949288946124, "kl": 0.25244140625, "learning_rate": 9.496058558558558e-07, "loss": 0.0003, "reward": 2.9460397958755493, "reward_std": 0.40348224341869354, "rewards/final_reward": 0.7546152974114886, "rewards/mask_iou_reward": 0.3773076487057443, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9460396766662598, "rewards/thk_ans_format_reward": 1.0, "step": 179, "think_completion_length": 127.25 }, { "clip_ratio": 0.0, "completion_length": 174.65625762939453, "epoch": 0.6070826306913997, "grad_norm": 5.815776866446163, "kl": 0.2744140625, "learning_rate": 9.493243243243243e-07, "loss": 0.0003, "reward": 2.947666049003601, "reward_std": 0.2804013565182686, "rewards/final_reward": 1.3667480820389621, "rewards/mask_iou_reward": 0.6833740410194811, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9476659297943115, "rewards/thk_ans_format_reward": 1.0, "step": 180, "think_completion_length": 91.79166666666667 }, { "clip_ratio": 0.0, "completion_length": 170.0104217529297, "epoch": 0.6104553119730185, "grad_norm": 6.897634968859645, "kl": 0.2666015625, "learning_rate": 9.490427927927927e-07, "loss": 0.0003, "reward": 2.8933818340301514, "reward_std": 0.3454447239637375, "rewards/final_reward": 1.2089149397368106, "rewards/mask_iou_reward": 0.6044574698684053, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.8933817148208618, "rewards/thk_ans_format_reward": 1.0, "step": 181, "think_completion_length": 101.04166666666666 }, { "clip_ratio": 0.0, "completion_length": 180.9479217529297, "epoch": 0.6138279932546374, "grad_norm": 23.56354755740188, "kl": 0.2900390625, "learning_rate": 9.487612612612612e-07, "loss": 0.0003, "reward": 3.2707679271698, "reward_std": 0.3094882294535637, "rewards/final_reward": 1.6510349024018907, "rewards/mask_iou_reward": 0.8255174512009453, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.270767867565155, "rewards/thk_ans_format_reward": 1.0, "step": 182, "think_completion_length": 91.25 }, { "clip_ratio": 0.0, "completion_length": 168.5416717529297, "epoch": 0.6172006745362564, "grad_norm": 7.566129474584754, "kl": 0.23828125, "learning_rate": 9.484797297297296e-07, "loss": 0.0003, "reward": 3.528642773628235, "reward_std": 0.19238104671239853, "rewards/final_reward": 1.3287466025722514, "rewards/mask_iou_reward": 0.6643733012861257, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5286428332328796, "rewards/thk_ans_format_reward": 1.0, "step": 183, "think_completion_length": 104.125 }, { "clip_ratio": 0.0, "completion_length": 161.14583587646484, "epoch": 0.6205733558178752, "grad_norm": 19.462449457855936, "kl": 0.314453125, "learning_rate": 9.481981981981981e-07, "loss": 0.0003, "reward": 3.2879135608673096, "reward_std": 0.1729014366865158, "rewards/final_reward": 0.6132975123786321, "rewards/mask_iou_reward": 0.30664875618931603, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2879136204719543, "rewards/thk_ans_format_reward": 1.0, "step": 184, "think_completion_length": 105.125 }, { "clip_ratio": 0.0, "completion_length": 161.71875762939453, "epoch": 0.6239460370994941, "grad_norm": 10.238505822769678, "kl": 0.291015625, "learning_rate": 9.479166666666666e-07, "loss": 0.0003, "reward": 3.4363847970962524, "reward_std": 0.08068331144750118, "rewards/final_reward": 1.9090332802960994, "rewards/mask_iou_reward": 0.9545166401480497, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.436384677886963, "rewards/thk_ans_format_reward": 1.0, "step": 185, "think_completion_length": 95.08333333333334 }, { "clip_ratio": 0.0, "completion_length": 154.87500762939453, "epoch": 0.627318718381113, "grad_norm": 55.15622668515774, "kl": 0.25390625, "learning_rate": 9.47635135135135e-07, "loss": 0.0003, "reward": 3.5349490642547607, "reward_std": 0.11874636262655258, "rewards/final_reward": 1.7553482866542853, "rewards/mask_iou_reward": 0.8776741433271427, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.534949004650116, "rewards/thk_ans_format_reward": 1.0, "step": 186, "think_completion_length": 94.125 }, { "clip_ratio": 0.0, "completion_length": 177.3541717529297, "epoch": 0.6306913996627319, "grad_norm": 18.497533459465107, "kl": 0.275390625, "learning_rate": 9.473536036036036e-07, "loss": 0.0003, "reward": 3.135318160057068, "reward_std": 0.2178964763879776, "rewards/final_reward": 1.3029347302478484, "rewards/mask_iou_reward": 0.6514673651239242, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.135318249464035, "rewards/thk_ans_format_reward": 1.0, "step": 187, "think_completion_length": 97.08333333333334 }, { "clip_ratio": 0.0, "completion_length": 168.17708587646484, "epoch": 0.6340640809443507, "grad_norm": 9.223637235595785, "kl": 0.2685546875, "learning_rate": 9.47072072072072e-07, "loss": 0.0003, "reward": 2.919049620628357, "reward_std": 0.11340761929750443, "rewards/final_reward": 0.1910890640154114, "rewards/mask_iou_reward": 0.0955445320077057, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9190497100353241, "rewards/thk_ans_format_reward": 1.0, "step": 188, "think_completion_length": 92.20833333333334 }, { "clip_ratio": 0.0, "completion_length": 192.80208587646484, "epoch": 0.6374367622259697, "grad_norm": 7.568967069603324, "kl": 0.2451171875, "learning_rate": 9.467905405405405e-07, "loss": 0.0002, "reward": 3.226958751678467, "reward_std": 0.22894418239593506, "rewards/final_reward": 1.2491899400329127, "rewards/mask_iou_reward": 0.6245949700164564, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2373753786087036, "rewards/thk_ans_format_reward": 0.9895833432674408, "step": 189, "think_completion_length": 104.16666666666666 }, { "clip_ratio": 0.0, "completion_length": 155.73958587646484, "epoch": 0.6408094435075885, "grad_norm": 5.306307755700064, "kl": 0.2431640625, "learning_rate": 9.46509009009009e-07, "loss": 0.0002, "reward": 3.098444104194641, "reward_std": 0.3319072127342224, "rewards/final_reward": 1.2748850668597314, "rewards/mask_iou_reward": 0.6374425334298657, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 1.108860731124878, "rewards/thk_ans_format_reward": 1.0, "step": 190, "think_completion_length": 75.16666666666666 }, { "clip_ratio": 0.0, "completion_length": 174.34375, "epoch": 0.6441821247892074, "grad_norm": 6.471694725642505, "kl": 0.25390625, "learning_rate": 9.462274774774774e-07, "loss": 0.0003, "reward": 2.9371249675750732, "reward_std": 0.2752944231033325, "rewards/final_reward": 1.2037132542187892, "rewards/mask_iou_reward": 0.6018566271093946, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9371249675750732, "rewards/thk_ans_format_reward": 1.0, "step": 191, "think_completion_length": 101.66666666666666 }, { "clip_ratio": 0.0, "completion_length": 154.39583587646484, "epoch": 0.6475548060708263, "grad_norm": 4.684049245891642, "kl": 0.2587890625, "learning_rate": 9.459459459459459e-07, "loss": 0.0003, "reward": 3.32126247882843, "reward_std": 0.2728252410888672, "rewards/final_reward": 1.6915106845598409, "rewards/mask_iou_reward": 0.8457553422799204, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3212623596191406, "rewards/thk_ans_format_reward": 1.0, "step": 192, "think_completion_length": 94.625 }, { "clip_ratio": 0.0, "completion_length": 184.84375762939453, "epoch": 0.6509274873524452, "grad_norm": 5.739390664097678, "kl": 0.24267578125, "learning_rate": 9.456644144144143e-07, "loss": 0.0002, "reward": 3.100097417831421, "reward_std": 0.3362935483455658, "rewards/final_reward": 1.1146668513107205, "rewards/mask_iou_reward": 0.5573334256553603, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1000972986221313, "rewards/thk_ans_format_reward": 1.0, "step": 193, "think_completion_length": 114.41666666666666 }, { "clip_ratio": 0.0, "completion_length": 162.58333587646484, "epoch": 0.654300168634064, "grad_norm": 8.735426480074405, "kl": 0.26953125, "learning_rate": 9.453828828828828e-07, "loss": 0.0003, "reward": 3.3539435863494873, "reward_std": 0.3737208843231201, "rewards/final_reward": 1.2457365957684952, "rewards/mask_iou_reward": 0.6228682978842476, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3539432883262634, "rewards/thk_ans_format_reward": 1.0, "step": 194, "think_completion_length": 96.75 }, { "clip_ratio": 0.0, "completion_length": 167.65625762939453, "epoch": 0.657672849915683, "grad_norm": 5.834979241570896, "kl": 0.2578125, "learning_rate": 9.451013513513513e-07, "loss": 0.0003, "reward": 3.2236061096191406, "reward_std": 0.2855689972639084, "rewards/final_reward": 1.3954733399497419, "rewards/mask_iou_reward": 0.6977366699748709, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2236062288284302, "rewards/thk_ans_format_reward": 1.0, "step": 195, "think_completion_length": 114.41666666666667 }, { "clip_ratio": 0.0, "completion_length": 176.12500762939453, "epoch": 0.6610455311973018, "grad_norm": 62.393868086469084, "kl": 0.26171875, "learning_rate": 9.448198198198197e-07, "loss": 0.0003, "reward": 3.0305802822113037, "reward_std": 0.2396130934357643, "rewards/final_reward": 0.3726910393779626, "rewards/mask_iou_reward": 0.1863455196889813, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0305803418159485, "rewards/thk_ans_format_reward": 1.0, "step": 196, "think_completion_length": 104.875 }, { "clip_ratio": 0.0, "completion_length": 177.65625762939453, "epoch": 0.6644182124789207, "grad_norm": 9.09552951283109, "kl": 0.2548828125, "learning_rate": 9.445382882882883e-07, "loss": 0.0003, "reward": 3.033574938774109, "reward_std": 0.30480627715587616, "rewards/final_reward": 0.6127038885463165, "rewards/mask_iou_reward": 0.30635194427315826, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0335749089717865, "rewards/thk_ans_format_reward": 1.0, "step": 197, "think_completion_length": 109.29166666666666 }, { "clip_ratio": 0.0, "completion_length": 170.6041717529297, "epoch": 0.6677908937605397, "grad_norm": 4.541735314198181, "kl": 0.2734375, "learning_rate": 9.442567567567568e-07, "loss": 0.0003, "reward": 3.2191314697265625, "reward_std": 0.12774834409356117, "rewards/final_reward": 1.363589056546357, "rewards/mask_iou_reward": 0.6817945282731785, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2191317081451416, "rewards/thk_ans_format_reward": 1.0, "step": 198, "think_completion_length": 109.20833333333333 }, { "clip_ratio": 0.0, "completion_length": 201.11459350585938, "epoch": 0.6711635750421585, "grad_norm": 7.252136816346154, "kl": 0.23779296875, "learning_rate": 9.439752252252252e-07, "loss": 0.0002, "reward": 3.0531833171844482, "reward_std": 0.40384694933891296, "rewards/final_reward": 1.0973872267338165, "rewards/mask_iou_reward": 0.5486936133669083, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0531832575798035, "rewards/thk_ans_format_reward": 1.0, "step": 199, "think_completion_length": 125.04166666666666 }, { "clip_ratio": 0.0, "completion_length": 181.6041717529297, "epoch": 0.6745362563237775, "grad_norm": 11.134304628990678, "kl": 0.2470703125, "learning_rate": 9.436936936936937e-07, "loss": 0.0002, "reward": 3.1323522329330444, "reward_std": 0.34756386280059814, "rewards/final_reward": 0.4455057568614778, "rewards/mask_iou_reward": 0.2227528784307389, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1323521137237549, "rewards/thk_ans_format_reward": 1.0, "step": 200, "think_completion_length": 121.54166666666666 }, { "clip_ratio": 0.0, "completion_length": 190.18750762939453, "epoch": 0.6779089376053963, "grad_norm": 12.334649936602695, "kl": 0.26123046875, "learning_rate": 9.434121621621621e-07, "loss": 0.0003, "reward": 3.055158853530884, "reward_std": 0.1682959347963333, "rewards/final_reward": 0.5981111851549227, "rewards/mask_iou_reward": 0.29905559257746134, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0551589131355286, "rewards/thk_ans_format_reward": 1.0, "step": 201, "think_completion_length": 151.45833333333331 }, { "clip_ratio": 0.0, "completion_length": 190.67709350585938, "epoch": 0.6812816188870152, "grad_norm": 7.556066147867235, "kl": 0.24072265625, "learning_rate": 9.431306306306306e-07, "loss": 0.0002, "reward": 3.263322353363037, "reward_std": 0.2816054970026016, "rewards/final_reward": 1.0462942667515105, "rewards/mask_iou_reward": 0.5231471333757552, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2633222341537476, "rewards/thk_ans_format_reward": 1.0, "step": 202, "think_completion_length": 115.33333333333333 }, { "clip_ratio": 0.0, "completion_length": 197.12500762939453, "epoch": 0.684654300168634, "grad_norm": 6.2700840389340575, "kl": 0.2587890625, "learning_rate": 9.428490990990991e-07, "loss": 0.0003, "reward": 2.998106360435486, "reward_std": 0.26637783646583557, "rewards/final_reward": 1.4625346666906975, "rewards/mask_iou_reward": 0.7312673333453488, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.998106449842453, "rewards/thk_ans_format_reward": 1.0, "step": 203, "think_completion_length": 117.20833333333333 }, { "clip_ratio": 0.0, "completion_length": 195.23958587646484, "epoch": 0.688026981450253, "grad_norm": 3.79495885528206, "kl": 0.240234375, "learning_rate": 9.425675675675675e-07, "loss": 0.0002, "reward": 3.110295057296753, "reward_std": 0.24425452947616577, "rewards/final_reward": 1.3623789795938142, "rewards/mask_iou_reward": 0.6811894897969071, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.110295057296753, "rewards/thk_ans_format_reward": 1.0, "step": 204, "think_completion_length": 127.125 }, { "clip_ratio": 0.0, "completion_length": 200.9791717529297, "epoch": 0.6913996627318718, "grad_norm": 37.346696638085206, "kl": 0.2841796875, "learning_rate": 9.42286036036036e-07, "loss": 0.0003, "reward": 3.355635404586792, "reward_std": 0.22298195213079453, "rewards/final_reward": 1.2873584034100065, "rewards/mask_iou_reward": 0.6436792017050033, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3556353449821472, "rewards/thk_ans_format_reward": 1.0, "step": 205, "think_completion_length": 143.5 }, { "clip_ratio": 0.0, "completion_length": 208.96875762939453, "epoch": 0.6947723440134908, "grad_norm": 5.433853004920652, "kl": 0.2333984375, "learning_rate": 9.420045045045044e-07, "loss": 0.0002, "reward": 3.380358934402466, "reward_std": 0.16319020092487335, "rewards/final_reward": 1.0986353687626877, "rewards/mask_iou_reward": 0.5493176843813439, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3803590536117554, "rewards/thk_ans_format_reward": 1.0, "step": 206, "think_completion_length": 143.83333333333334 }, { "clip_ratio": 0.0, "completion_length": 217.90625, "epoch": 0.6981450252951096, "grad_norm": 4.334262383011323, "kl": 0.228515625, "learning_rate": 9.41722972972973e-07, "loss": 0.0002, "reward": 3.2010854482650757, "reward_std": 0.19329330325126648, "rewards/final_reward": 1.3971072227643941, "rewards/mask_iou_reward": 0.6985536113821971, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2010854482650757, "rewards/thk_ans_format_reward": 1.0, "step": 207, "think_completion_length": 159.54166666666666 }, { "clip_ratio": 0.0, "completion_length": 207.6666717529297, "epoch": 0.7015177065767285, "grad_norm": 4.896160260038562, "kl": 0.251953125, "learning_rate": 9.414414414414415e-07, "loss": 0.0003, "reward": 3.0796992778778076, "reward_std": 0.13358672708272934, "rewards/final_reward": 1.442706489408488, "rewards/mask_iou_reward": 0.721353244704244, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.079699158668518, "rewards/thk_ans_format_reward": 1.0, "step": 208, "think_completion_length": 143.625 }, { "clip_ratio": 0.0, "completion_length": 222.14583587646484, "epoch": 0.7048903878583473, "grad_norm": 19.68180647849107, "kl": 0.25, "learning_rate": 9.411599099099099e-07, "loss": 0.0003, "reward": 2.9135031700134277, "reward_std": 0.18659613281488419, "rewards/final_reward": 0.8169442864924998, "rewards/mask_iou_reward": 0.4084721432462499, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.913503110408783, "rewards/thk_ans_format_reward": 1.0, "step": 209, "think_completion_length": 152.875 }, { "clip_ratio": 0.0, "completion_length": 217.89583587646484, "epoch": 0.7082630691399663, "grad_norm": 15.962050613392199, "kl": 0.2509765625, "learning_rate": 9.408783783783784e-07, "loss": 0.0002, "reward": 2.6730724573135376, "reward_std": 0.1774405688047409, "rewards/final_reward": 0.3143013680390508, "rewards/mask_iou_reward": 0.1571506840195254, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.6730725169181824, "rewards/thk_ans_format_reward": 1.0, "step": 210, "think_completion_length": 146.91666666666669 }, { "clip_ratio": 0.0, "completion_length": 212.1666717529297, "epoch": 0.7116357504215851, "grad_norm": 10.032991417039138, "kl": 0.2451171875, "learning_rate": 9.405968468468469e-07, "loss": 0.0002, "reward": 3.3573029041290283, "reward_std": 0.12738436460494995, "rewards/final_reward": 1.4354043883805865, "rewards/mask_iou_reward": 0.7177021941902932, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.357302963733673, "rewards/thk_ans_format_reward": 1.0, "step": 211, "think_completion_length": 139.79166666666666 }, { "clip_ratio": 0.0, "completion_length": 206.1875, "epoch": 0.715008431703204, "grad_norm": 8.828001892207169, "kl": 0.24609375, "learning_rate": 9.403153153153153e-07, "loss": 0.0002, "reward": 3.031591057777405, "reward_std": 0.1295524761080742, "rewards/final_reward": 0.9175850267396465, "rewards/mask_iou_reward": 0.45879251336982324, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0315908789634705, "rewards/thk_ans_format_reward": 1.0, "step": 212, "think_completion_length": 135.5 }, { "clip_ratio": 0.0, "completion_length": 198.9791717529297, "epoch": 0.718381112984823, "grad_norm": 3.6035040176651636, "kl": 0.2802734375, "learning_rate": 9.400337837837838e-07, "loss": 0.0003, "reward": 2.9412542581558228, "reward_std": 0.20104002207517624, "rewards/final_reward": 0.9532803738516593, "rewards/mask_iou_reward": 0.47664018692582966, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9412541389465332, "rewards/thk_ans_format_reward": 1.0, "step": 213, "think_completion_length": 136.58333333333331 }, { "clip_ratio": 0.0, "completion_length": 235.6979217529297, "epoch": 0.7217537942664418, "grad_norm": 6.910035871057598, "kl": 0.22509765625, "learning_rate": 9.397522522522522e-07, "loss": 0.0002, "reward": 2.9119738340377808, "reward_std": 0.2971492111682892, "rewards/final_reward": 1.0333231695646627, "rewards/mask_iou_reward": 0.5166615847823314, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9119738936424255, "rewards/thk_ans_format_reward": 1.0, "step": 214, "think_completion_length": 158.16666666666669 }, { "clip_ratio": 0.0, "completion_length": 210.80208587646484, "epoch": 0.7251264755480608, "grad_norm": 6.761279164799118, "kl": 0.2802734375, "learning_rate": 9.394707207207207e-07, "loss": 0.0003, "reward": 3.0778356790542603, "reward_std": 0.18553235195577145, "rewards/final_reward": 1.3225514499288327, "rewards/mask_iou_reward": 0.6612757249644163, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0778355598449707, "rewards/thk_ans_format_reward": 1.0, "step": 215, "think_completion_length": 124.91666666666666 }, { "clip_ratio": 0.0, "completion_length": 191.5416717529297, "epoch": 0.7284991568296796, "grad_norm": 7.233850506151749, "kl": 0.3037109375, "learning_rate": 9.391891891891892e-07, "loss": 0.0003, "reward": 2.8451234102249146, "reward_std": 0.23334325850009918, "rewards/final_reward": 1.244009831333866, "rewards/mask_iou_reward": 0.622004915666933, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.8451233506202698, "rewards/thk_ans_format_reward": 1.0, "step": 216, "think_completion_length": 111.45833333333334 }, { "clip_ratio": 0.0, "completion_length": 222.27083587646484, "epoch": 0.7318718381112985, "grad_norm": 5.531946000322627, "kl": 0.259765625, "learning_rate": 9.389076576576577e-07, "loss": 0.0003, "reward": 3.2371309995651245, "reward_std": 0.22913970798254013, "rewards/final_reward": 1.3814128170381639, "rewards/mask_iou_reward": 0.6907064085190819, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2371309995651245, "rewards/thk_ans_format_reward": 1.0, "step": 217, "think_completion_length": 144.125 }, { "clip_ratio": 0.0, "completion_length": 205.27083587646484, "epoch": 0.7352445193929174, "grad_norm": 4.716600374535074, "kl": 0.275390625, "learning_rate": 9.386261261261261e-07, "loss": 0.0003, "reward": 3.3373496532440186, "reward_std": 0.10465261340141296, "rewards/final_reward": 1.1443409000330345, "rewards/mask_iou_reward": 0.5721704500165172, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.337349534034729, "rewards/thk_ans_format_reward": 1.0, "step": 218, "think_completion_length": 151.25 }, { "clip_ratio": 0.0, "completion_length": 212.6354217529297, "epoch": 0.7386172006745363, "grad_norm": 4.826058419182305, "kl": 0.271484375, "learning_rate": 9.383445945945945e-07, "loss": 0.0003, "reward": 3.228532910346985, "reward_std": 0.2345435619354248, "rewards/final_reward": 0.9689652674189777, "rewards/mask_iou_reward": 0.48448263370948885, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2285327315330505, "rewards/thk_ans_format_reward": 1.0, "step": 219, "think_completion_length": 120.75 }, { "clip_ratio": 0.0, "completion_length": 204.18750762939453, "epoch": 0.7419898819561551, "grad_norm": 13.942520207451084, "kl": 0.294921875, "learning_rate": 9.38063063063063e-07, "loss": 0.0003, "reward": 3.0537610054016113, "reward_std": 0.22725334763526917, "rewards/final_reward": 1.3937016158440043, "rewards/mask_iou_reward": 0.6968508079220022, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.053760975599289, "rewards/thk_ans_format_reward": 1.0, "step": 220, "think_completion_length": 135.79166666666666 }, { "clip_ratio": 0.0, "completion_length": 179.3229217529297, "epoch": 0.7453625632377741, "grad_norm": 12.578546976305708, "kl": 0.265625, "learning_rate": 9.377815315315315e-07, "loss": 0.0003, "reward": 3.0679415464401245, "reward_std": 0.24034039676189423, "rewards/final_reward": 1.0855294095190113, "rewards/mask_iou_reward": 0.5427647047595057, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.067941665649414, "rewards/thk_ans_format_reward": 1.0, "step": 221, "think_completion_length": 120.08333333333334 }, { "clip_ratio": 0.0, "completion_length": 198.00000762939453, "epoch": 0.7487352445193929, "grad_norm": 11.785419292047843, "kl": 0.310546875, "learning_rate": 9.374999999999999e-07, "loss": 0.0003, "reward": 3.1781201362609863, "reward_std": 0.19415687024593353, "rewards/final_reward": 1.2690602405632174, "rewards/mask_iou_reward": 0.6345301202816087, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1781199276447296, "rewards/thk_ans_format_reward": 1.0, "step": 222, "think_completion_length": 114.0 }, { "clip_ratio": 0.0, "completion_length": 178.45834350585938, "epoch": 0.7521079258010118, "grad_norm": 10.962821825044882, "kl": 0.3408203125, "learning_rate": 9.372184684684684e-07, "loss": 0.0003, "reward": 3.1292738914489746, "reward_std": 0.28961120545864105, "rewards/final_reward": 1.1409641431470974, "rewards/mask_iou_reward": 0.5704820715735487, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1292736232280731, "rewards/thk_ans_format_reward": 1.0, "step": 223, "think_completion_length": 115.875 }, { "clip_ratio": 0.0, "completion_length": 190.14584350585938, "epoch": 0.7554806070826307, "grad_norm": 8.060905494849647, "kl": 0.2578125, "learning_rate": 9.369369369369368e-07, "loss": 0.0003, "reward": 3.0584421157836914, "reward_std": 0.278117410838604, "rewards/final_reward": 1.3437103210085293, "rewards/mask_iou_reward": 0.6718551605042646, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0584419965744019, "rewards/thk_ans_format_reward": 1.0, "step": 224, "think_completion_length": 138.20833333333331 }, { "clip_ratio": 0.0, "completion_length": 184.6354217529297, "epoch": 0.7588532883642496, "grad_norm": 5.240615156237188, "kl": 0.29296875, "learning_rate": 9.366554054054053e-07, "loss": 0.0003, "reward": 2.867990255355835, "reward_std": 0.21263901889324188, "rewards/final_reward": 0.36395343568342153, "rewards/mask_iou_reward": 0.18197671784171077, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.8679901361465454, "rewards/thk_ans_format_reward": 1.0, "step": 225, "think_completion_length": 108.33333333333333 }, { "clip_ratio": 0.0, "completion_length": 170.86458587646484, "epoch": 0.7622259696458684, "grad_norm": 6.065632045897228, "kl": 0.32421875, "learning_rate": 9.363738738738738e-07, "loss": 0.0003, "reward": 3.0205975770950317, "reward_std": 0.10638023167848587, "rewards/final_reward": 1.1502363860044684, "rewards/mask_iou_reward": 0.5751181930022342, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0205976217985153, "rewards/thk_ans_format_reward": 1.0, "step": 226, "think_completion_length": 92.20833333333333 }, { "clip_ratio": 0.0, "completion_length": 174.625, "epoch": 0.7655986509274874, "grad_norm": 4.96500687828192, "kl": 0.3212890625, "learning_rate": 9.360923423423422e-07, "loss": 0.0003, "reward": 3.1703583002090454, "reward_std": 0.14759309589862823, "rewards/final_reward": 1.3432951532984052, "rewards/mask_iou_reward": 0.6716475766492026, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1703582406044006, "rewards/thk_ans_format_reward": 1.0, "step": 227, "think_completion_length": 104.91666666666667 }, { "clip_ratio": 0.0, "completion_length": 160.7291717529297, "epoch": 0.7689713322091062, "grad_norm": 7.745198108166682, "kl": 0.462890625, "learning_rate": 9.358108108108108e-07, "loss": 0.0006, "reward": 2.9740800857543945, "reward_std": 0.3129550665616989, "rewards/final_reward": 1.496134752534723, "rewards/mask_iou_reward": 0.7480673762673615, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9740800261497498, "rewards/thk_ans_format_reward": 1.0, "step": 228, "think_completion_length": 113.20833333333333 }, { "clip_ratio": 0.0, "completion_length": 158.34375762939453, "epoch": 0.7723440134907251, "grad_norm": 12.566104219984988, "kl": 0.3935546875, "learning_rate": 9.355292792792792e-07, "loss": 0.0004, "reward": 3.2419862747192383, "reward_std": 0.22945839166641235, "rewards/final_reward": 1.9098584814414274, "rewards/mask_iou_reward": 0.9549292407207137, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2419861555099487, "rewards/thk_ans_format_reward": 1.0, "step": 229, "think_completion_length": 92.95833333333333 }, { "clip_ratio": 0.0, "completion_length": 164.95833587646484, "epoch": 0.7757166947723441, "grad_norm": 6.96585549840121, "kl": 0.318359375, "learning_rate": 9.352477477477477e-07, "loss": 0.0003, "reward": 3.1130073070526123, "reward_std": 0.3432839810848236, "rewards/final_reward": 1.2066239717794454, "rewards/mask_iou_reward": 0.6033119858897227, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1130073070526123, "rewards/thk_ans_format_reward": 1.0, "step": 230, "think_completion_length": 103.91666666666667 }, { "clip_ratio": 0.0, "completion_length": 154.53125762939453, "epoch": 0.7790893760539629, "grad_norm": 13.129698712685878, "kl": 0.283203125, "learning_rate": 9.349662162162162e-07, "loss": 0.0003, "reward": 3.0300129652023315, "reward_std": 0.17744334042072296, "rewards/final_reward": 1.4037192994587762, "rewards/mask_iou_reward": 0.7018596497293881, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0300128757953644, "rewards/thk_ans_format_reward": 1.0, "step": 231, "think_completion_length": 89.0 }, { "clip_ratio": 0.0, "completion_length": 146.55208587646484, "epoch": 0.7824620573355818, "grad_norm": 4.300902307031056, "kl": 0.30859375, "learning_rate": 9.346846846846846e-07, "loss": 0.0003, "reward": 3.1586432456970215, "reward_std": 0.2268233448266983, "rewards/final_reward": 0.7366462498745183, "rewards/mask_iou_reward": 0.36832312493725916, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1586434841156006, "rewards/thk_ans_format_reward": 1.0, "step": 232, "think_completion_length": 81.29166666666666 }, { "clip_ratio": 0.0, "completion_length": 148.64583587646484, "epoch": 0.7858347386172007, "grad_norm": 6.856175117227461, "kl": 0.33203125, "learning_rate": 9.344031531531531e-07, "loss": 0.0003, "reward": 3.409724473953247, "reward_std": 0.14944615215063095, "rewards/final_reward": 0.99799662277511, "rewards/mask_iou_reward": 0.498998311387555, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4097245931625366, "rewards/thk_ans_format_reward": 1.0, "step": 233, "think_completion_length": 86.08333333333334 }, { "clip_ratio": 0.0, "completion_length": 141.55208587646484, "epoch": 0.7892074198988196, "grad_norm": 7.912545060528145, "kl": 0.3115234375, "learning_rate": 9.341216216216216e-07, "loss": 0.0003, "reward": 2.913196563720703, "reward_std": 0.23317305743694305, "rewards/final_reward": 0.8338462476799143, "rewards/mask_iou_reward": 0.41692312383995717, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9131967127323151, "rewards/thk_ans_format_reward": 1.0, "step": 234, "think_completion_length": 77.875 }, { "clip_ratio": 0.0, "completion_length": 141.7604217529297, "epoch": 0.7925801011804384, "grad_norm": 8.777457829113022, "kl": 0.326171875, "learning_rate": 9.3384009009009e-07, "loss": 0.0003, "reward": 3.0413867235183716, "reward_std": 0.2278767228126526, "rewards/final_reward": 0.5746736596316071, "rewards/mask_iou_reward": 0.28733682981580355, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0413867831230164, "rewards/thk_ans_format_reward": 1.0, "step": 235, "think_completion_length": 81.04166666666666 }, { "clip_ratio": 0.0, "completion_length": 134.61459350585938, "epoch": 0.7959527824620574, "grad_norm": 7.805444170540823, "kl": 0.4345703125, "learning_rate": 9.335585585585585e-07, "loss": 0.0004, "reward": 3.0897037982940674, "reward_std": 0.3523574024438858, "rewards/final_reward": 1.2939491615198757, "rewards/mask_iou_reward": 0.6469745807599379, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0897037386894226, "rewards/thk_ans_format_reward": 1.0, "step": 236, "think_completion_length": 67.70833333333334 }, { "clip_ratio": 0.0, "completion_length": 133.67709350585938, "epoch": 0.7993254637436762, "grad_norm": 13.994501698735474, "kl": 0.509765625, "learning_rate": 9.332770270270269e-07, "loss": 0.0005, "reward": 2.8868261575698853, "reward_std": 0.31315620988607407, "rewards/final_reward": 1.5869208062798532, "rewards/mask_iou_reward": 0.7934604031399266, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.8868262767791748, "rewards/thk_ans_format_reward": 1.0, "step": 237, "think_completion_length": 71.45833333333334 }, { "clip_ratio": 0.0, "completion_length": 133.30209350585938, "epoch": 0.8026981450252951, "grad_norm": 16.495913905202936, "kl": 0.341796875, "learning_rate": 9.329954954954955e-07, "loss": 0.0003, "reward": 3.322413921356201, "reward_std": 0.13931374996900558, "rewards/final_reward": 0.9929226750951938, "rewards/mask_iou_reward": 0.4964613375475969, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3224138617515564, "rewards/thk_ans_format_reward": 1.0, "step": 238, "think_completion_length": 69.41666666666666 }, { "clip_ratio": 0.0, "completion_length": 136.52083587646484, "epoch": 0.806070826306914, "grad_norm": 5.448551014834402, "kl": 0.3017578125, "learning_rate": 9.32713963963964e-07, "loss": 0.0003, "reward": 3.1825672388076782, "reward_std": 0.15929418802261353, "rewards/final_reward": 0.7330971489037537, "rewards/mask_iou_reward": 0.36654857445187683, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1825674176216125, "rewards/thk_ans_format_reward": 1.0, "step": 239, "think_completion_length": 60.541666666666664 }, { "clip_ratio": 0.0, "completion_length": 132.48959350585938, "epoch": 0.8094435075885329, "grad_norm": 5.379072408289089, "kl": 0.330078125, "learning_rate": 9.324324324324324e-07, "loss": 0.0003, "reward": 2.834401249885559, "reward_std": 0.28128696233034134, "rewards/final_reward": 0.7617745861450789, "rewards/mask_iou_reward": 0.38088729307253943, "rewards/sam_format_reward": 0.9791666865348816, "rewards/sam_reward_func_ultra": 0.8552343845367432, "rewards/thk_ans_format_reward": 1.0, "step": 240, "think_completion_length": 64.125 }, { "clip_ratio": 0.0, "completion_length": 129.45833587646484, "epoch": 0.8128161888701517, "grad_norm": 20.354490739109075, "kl": 0.328125, "learning_rate": 9.321509009009009e-07, "loss": 0.0003, "reward": 3.0151419639587402, "reward_std": 0.15956728160381317, "rewards/final_reward": 1.046194412546264, "rewards/mask_iou_reward": 0.523097206273132, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0151418447494507, "rewards/thk_ans_format_reward": 1.0, "step": 241, "think_completion_length": 68.95833333333333 }, { "clip_ratio": 0.0, "completion_length": 133.67708587646484, "epoch": 0.8161888701517707, "grad_norm": 9.0604303740086, "kl": 0.3173828125, "learning_rate": 9.318693693693693e-07, "loss": 0.0003, "reward": 3.092646598815918, "reward_std": 0.23580051958560944, "rewards/final_reward": 0.3709977762409087, "rewards/mask_iou_reward": 0.18549888812045434, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0926466882228851, "rewards/thk_ans_format_reward": 1.0, "step": 242, "think_completion_length": 67.79166666666667 }, { "clip_ratio": 0.0, "completion_length": 140.7604217529297, "epoch": 0.8195615514333895, "grad_norm": 10.869798525247067, "kl": 0.3125, "learning_rate": 9.315878378378378e-07, "loss": 0.0003, "reward": 3.2662233114242554, "reward_std": 0.13431217521429062, "rewards/final_reward": 0.47974971969132774, "rewards/mask_iou_reward": 0.23987485984566387, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2662232518196106, "rewards/thk_ans_format_reward": 1.0, "step": 243, "think_completion_length": 59.0 }, { "clip_ratio": 0.0, "completion_length": 135.43750762939453, "epoch": 0.8229342327150084, "grad_norm": 15.448332221184192, "kl": 0.2939453125, "learning_rate": 9.313063063063063e-07, "loss": 0.0003, "reward": 2.899739623069763, "reward_std": 0.21755699813365936, "rewards/final_reward": 0.7527796999562915, "rewards/mask_iou_reward": 0.37638984997814573, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.8997395038604736, "rewards/thk_ans_format_reward": 1.0, "step": 244, "think_completion_length": 72.16666666666667 }, { "clip_ratio": 0.0, "completion_length": 139.42708587646484, "epoch": 0.8263069139966274, "grad_norm": 17.271046761465456, "kl": 0.2978515625, "learning_rate": 9.310247747747747e-07, "loss": 0.0003, "reward": 3.221222758293152, "reward_std": 0.19722003489732742, "rewards/final_reward": 1.0775364410245993, "rewards/mask_iou_reward": 0.5387682205122997, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2212228178977966, "rewards/thk_ans_format_reward": 1.0, "step": 245, "think_completion_length": 87.91666666666667 }, { "clip_ratio": 0.0, "completion_length": 135.61459350585938, "epoch": 0.8296795952782462, "grad_norm": 6.529827798108178, "kl": 0.32421875, "learning_rate": 9.307432432432432e-07, "loss": 0.0003, "reward": 3.007061004638672, "reward_std": 0.23605723679065704, "rewards/final_reward": 0.7326444464817671, "rewards/mask_iou_reward": 0.36632222324088354, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0070610642433167, "rewards/thk_ans_format_reward": 1.0, "step": 246, "think_completion_length": 60.083333333333336 }, { "clip_ratio": 0.0, "completion_length": 128.89583587646484, "epoch": 0.8330522765598651, "grad_norm": 6.977264576795783, "kl": 0.3427734375, "learning_rate": 9.304617117117116e-07, "loss": 0.0003, "reward": 3.198093056678772, "reward_std": 0.09082278236746788, "rewards/final_reward": 0.6644612765647351, "rewards/mask_iou_reward": 0.33223063828236754, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1980929970741272, "rewards/thk_ans_format_reward": 1.0, "step": 247, "think_completion_length": 63.08333333333333 }, { "clip_ratio": 0.0, "completion_length": 151.08333587646484, "epoch": 0.836424957841484, "grad_norm": 6.7442749121409555, "kl": 0.271484375, "learning_rate": 9.301801801801802e-07, "loss": 0.0003, "reward": 3.0286999940872192, "reward_std": 0.20378149673342705, "rewards/final_reward": 0.7986789049758385, "rewards/mask_iou_reward": 0.39933945248791924, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 1.0391165912151337, "rewards/thk_ans_format_reward": 1.0, "step": 248, "think_completion_length": 66.70833333333334 }, { "clip_ratio": 0.0, "completion_length": 140.59375762939453, "epoch": 0.8397976391231029, "grad_norm": 15.270005105074052, "kl": 0.29296875, "learning_rate": 9.298986486486487e-07, "loss": 0.0003, "reward": 2.760540723800659, "reward_std": 0.2210642360150814, "rewards/final_reward": 0.802359601161798, "rewards/mask_iou_reward": 0.401179800580899, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.7605406939983368, "rewards/thk_ans_format_reward": 1.0, "step": 249, "think_completion_length": 61.20833333333333 }, { "clip_ratio": 0.0, "completion_length": 136.87500762939453, "epoch": 0.8431703204047217, "grad_norm": 9.56026883907223, "kl": 0.2939453125, "learning_rate": 9.296171171171171e-07, "loss": 0.0003, "reward": 3.385342001914978, "reward_std": 0.20231656730175018, "rewards/final_reward": 1.6732546895302796, "rewards/mask_iou_reward": 0.8366273447651398, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.385342001914978, "rewards/thk_ans_format_reward": 1.0, "step": 250, "think_completion_length": 60.166666666666664 }, { "clip_ratio": 0.0, "completion_length": 153.67708587646484, "epoch": 0.8465430016863407, "grad_norm": 14.351451585261236, "kl": 0.326171875, "learning_rate": 9.293355855855856e-07, "loss": 0.0003, "reward": 3.166195034980774, "reward_std": 0.22971728444099426, "rewards/final_reward": 0.8075349486655949, "rewards/mask_iou_reward": 0.40376747433279747, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1661950051784515, "rewards/thk_ans_format_reward": 1.0, "step": 251, "think_completion_length": 64.79166666666667 }, { "clip_ratio": 0.0, "completion_length": 132.34375, "epoch": 0.8499156829679595, "grad_norm": 8.474053501046951, "kl": 0.28515625, "learning_rate": 9.290540540540541e-07, "loss": 0.0003, "reward": 2.7512192726135254, "reward_std": 0.24805811047554016, "rewards/final_reward": 1.4858556421360731, "rewards/mask_iou_reward": 0.7429278210680366, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.7512190937995911, "rewards/thk_ans_format_reward": 1.0, "step": 252, "think_completion_length": 66.83333333333334 }, { "clip_ratio": 0.0, "completion_length": 141.75000762939453, "epoch": 0.8532883642495784, "grad_norm": 9.573703576617035, "kl": 0.294921875, "learning_rate": 9.287725225225225e-07, "loss": 0.0003, "reward": 3.110999345779419, "reward_std": 0.18930789083242416, "rewards/final_reward": 1.3280421091057069, "rewards/mask_iou_reward": 0.6640210545528534, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1109992265701294, "rewards/thk_ans_format_reward": 1.0, "step": 253, "think_completion_length": 63.333333333333336 }, { "clip_ratio": 0.0, "completion_length": 130.1666717529297, "epoch": 0.8566610455311973, "grad_norm": 7.772974324691213, "kl": 0.2900390625, "learning_rate": 9.28490990990991e-07, "loss": 0.0003, "reward": 2.875392436981201, "reward_std": 0.18357276916503906, "rewards/final_reward": 0.6232644154840334, "rewards/mask_iou_reward": 0.3116322077420167, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.8753922432661057, "rewards/thk_ans_format_reward": 1.0, "step": 254, "think_completion_length": 65.16666666666666 }, { "clip_ratio": 0.0, "completion_length": 128.43750381469727, "epoch": 0.8600337268128162, "grad_norm": 10.93387391032777, "kl": 0.662109375, "learning_rate": 9.282094594594594e-07, "loss": 0.0007, "reward": 3.0346208810806274, "reward_std": 0.2844673991203308, "rewards/final_reward": 1.1061904723534532, "rewards/mask_iou_reward": 0.5530952361767266, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.034620761871338, "rewards/thk_ans_format_reward": 1.0, "step": 255, "think_completion_length": 64.70833333333333 }, { "clip_ratio": 0.0, "completion_length": 131.00000762939453, "epoch": 0.863406408094435, "grad_norm": 8.986343999032677, "kl": 0.2626953125, "learning_rate": 9.279279279279278e-07, "loss": 0.0003, "reward": 2.7675180435180664, "reward_std": 0.1956443041563034, "rewards/final_reward": 0.3999406699768425, "rewards/mask_iou_reward": 0.19997033498842126, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.767518013715744, "rewards/thk_ans_format_reward": 1.0, "step": 256, "think_completion_length": 63.625 }, { "clip_ratio": 0.0, "completion_length": 132.5104217529297, "epoch": 0.866779089376054, "grad_norm": 9.091060447695627, "kl": 0.2919921875, "learning_rate": 9.276463963963963e-07, "loss": 0.0003, "reward": 3.3000314235687256, "reward_std": 0.1437247097492218, "rewards/final_reward": 1.140711099988953, "rewards/mask_iou_reward": 0.5703555499944765, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3000314831733704, "rewards/thk_ans_format_reward": 1.0, "step": 257, "think_completion_length": 63.29166666666667 }, { "clip_ratio": 0.0, "completion_length": 160.3854217529297, "epoch": 0.8701517706576728, "grad_norm": 7.808627484090949, "kl": 0.2890625, "learning_rate": 9.273648648648648e-07, "loss": 0.0003, "reward": 2.876472234725952, "reward_std": 0.16375703364610672, "rewards/final_reward": 0.9649949267346407, "rewards/mask_iou_reward": 0.48249746336732036, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 0.8868888914585114, "rewards/thk_ans_format_reward": 1.0, "step": 258, "think_completion_length": 57.458333333333336 }, { "clip_ratio": 0.0, "completion_length": 134.6041717529297, "epoch": 0.8735244519392917, "grad_norm": 9.507535751938144, "kl": 0.2841796875, "learning_rate": 9.270833333333333e-07, "loss": 0.0003, "reward": 3.252575159072876, "reward_std": 0.17459773272275925, "rewards/final_reward": 0.8978655204807608, "rewards/mask_iou_reward": 0.4489327602403804, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2525752782821655, "rewards/thk_ans_format_reward": 1.0, "step": 259, "think_completion_length": 55.54166666666667 }, { "clip_ratio": 0.0, "completion_length": 130.67708587646484, "epoch": 0.8768971332209107, "grad_norm": 8.94111935304906, "kl": 0.28515625, "learning_rate": 9.268018018018017e-07, "loss": 0.0003, "reward": 3.020329713821411, "reward_std": 0.21678221970796585, "rewards/final_reward": 0.695925233246127, "rewards/mask_iou_reward": 0.3479626166230635, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0203297436237335, "rewards/thk_ans_format_reward": 1.0, "step": 260, "think_completion_length": 62.58333333333333 }, { "clip_ratio": 0.0, "completion_length": 130.86458587646484, "epoch": 0.8802698145025295, "grad_norm": 17.658472675005275, "kl": 0.2666015625, "learning_rate": 9.265202702702702e-07, "loss": 0.0003, "reward": 3.2855056524276733, "reward_std": 0.1681097000837326, "rewards/final_reward": 0.8268895833072832, "rewards/mask_iou_reward": 0.4134447916536416, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2855055332183838, "rewards/thk_ans_format_reward": 1.0, "step": 261, "think_completion_length": 63.625 }, { "clip_ratio": 0.0, "completion_length": 129.65625762939453, "epoch": 0.8836424957841484, "grad_norm": 7.071090498535165, "kl": 0.2822265625, "learning_rate": 9.262387387387387e-07, "loss": 0.0003, "reward": 3.1476712226867676, "reward_std": 0.22411763668060303, "rewards/final_reward": 1.200344153075685, "rewards/mask_iou_reward": 0.6001720765378425, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1476709842681885, "rewards/thk_ans_format_reward": 1.0, "step": 262, "think_completion_length": 60.5 }, { "clip_ratio": 0.0, "completion_length": 151.92708587646484, "epoch": 0.8870151770657673, "grad_norm": 6.309393231244978, "kl": 0.296875, "learning_rate": 9.259572072072071e-07, "loss": 0.0003, "reward": 3.180266857147217, "reward_std": 0.19312848150730133, "rewards/final_reward": 1.2631500104789617, "rewards/mask_iou_reward": 0.6315750052394808, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.180266797542572, "rewards/thk_ans_format_reward": 1.0, "step": 263, "think_completion_length": 64.54166666666667 }, { "clip_ratio": 0.0, "completion_length": 127.58333587646484, "epoch": 0.8903878583473862, "grad_norm": 10.50935003017543, "kl": 0.3125, "learning_rate": 9.256756756756756e-07, "loss": 0.0003, "reward": 2.9178611040115356, "reward_std": 0.18160251900553703, "rewards/final_reward": 0.6870835665857659, "rewards/mask_iou_reward": 0.34354178329288293, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9178611040115356, "rewards/thk_ans_format_reward": 1.0, "step": 264, "think_completion_length": 61.95833333333333 }, { "clip_ratio": 0.0, "completion_length": 137.7916717529297, "epoch": 0.893760539629005, "grad_norm": 7.908751555564282, "kl": 0.3466796875, "learning_rate": 9.25394144144144e-07, "loss": 0.0003, "reward": 3.27189302444458, "reward_std": 0.19905216246843338, "rewards/final_reward": 0.7301093085456902, "rewards/mask_iou_reward": 0.3650546542728451, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2718929648399353, "rewards/thk_ans_format_reward": 1.0, "step": 265, "think_completion_length": 62.08333333333333 }, { "clip_ratio": 0.0, "completion_length": 141.33334350585938, "epoch": 0.897133220910624, "grad_norm": 6.078543222665936, "kl": 0.2958984375, "learning_rate": 9.251126126126125e-07, "loss": 0.0003, "reward": 2.89335036277771, "reward_std": 0.23937778174877167, "rewards/final_reward": 0.9255969796433386, "rewards/mask_iou_reward": 0.4627984898216693, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.8933502733707428, "rewards/thk_ans_format_reward": 1.0, "step": 266, "think_completion_length": 69.625 }, { "clip_ratio": 0.0, "completion_length": 148.86458587646484, "epoch": 0.9005059021922428, "grad_norm": 6.210516801841927, "kl": 0.271484375, "learning_rate": 9.24831081081081e-07, "loss": 0.0004, "reward": 2.7252787351608276, "reward_std": 0.3134681284427643, "rewards/final_reward": 0.2785383929054998, "rewards/mask_iou_reward": 0.1392691964527499, "rewards/sam_format_reward": 0.9479166865348816, "rewards/sam_reward_func_ultra": 0.7773620188236237, "rewards/thk_ans_format_reward": 1.0, "step": 267, "think_completion_length": 72.875 }, { "clip_ratio": 0.0, "completion_length": 145.27083587646484, "epoch": 0.9038785834738617, "grad_norm": 4.866428396155994, "kl": 0.2734375, "learning_rate": 9.245495495495495e-07, "loss": 0.0003, "reward": 2.961507558822632, "reward_std": 0.13086314499378204, "rewards/final_reward": 0.4467381767630276, "rewards/mask_iou_reward": 0.2233690883815138, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9615074098110199, "rewards/thk_ans_format_reward": 1.0, "step": 268, "think_completion_length": 64.625 }, { "clip_ratio": 0.0, "completion_length": 135.1666717529297, "epoch": 0.9072512647554806, "grad_norm": 9.751330518139088, "kl": 0.31640625, "learning_rate": 9.24268018018018e-07, "loss": 0.0003, "reward": 3.138562798500061, "reward_std": 0.18835950642824173, "rewards/final_reward": 0.7717814922707004, "rewards/mask_iou_reward": 0.3858907461353502, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1385626792907715, "rewards/thk_ans_format_reward": 1.0, "step": 269, "think_completion_length": 70.29166666666667 }, { "clip_ratio": 0.0, "completion_length": 135.03125762939453, "epoch": 0.9106239460370995, "grad_norm": 5.261070572921644, "kl": 0.306640625, "learning_rate": 9.239864864864865e-07, "loss": 0.0003, "reward": 3.116981267929077, "reward_std": 0.24101653695106506, "rewards/final_reward": 1.3195163044452227, "rewards/mask_iou_reward": 0.6597581522226114, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1169812679290771, "rewards/thk_ans_format_reward": 1.0, "step": 270, "think_completion_length": 73.25 }, { "clip_ratio": 0.0, "completion_length": 136.6875, "epoch": 0.9139966273187183, "grad_norm": 6.026911173579967, "kl": 0.2666015625, "learning_rate": 9.237049549549549e-07, "loss": 0.0003, "reward": 3.1197715997695923, "reward_std": 0.3672148436307907, "rewards/final_reward": 0.4015020312015995, "rewards/mask_iou_reward": 0.20075101560079975, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1197713613510132, "rewards/thk_ans_format_reward": 1.0, "step": 271, "think_completion_length": 73.20833333333333 }, { "clip_ratio": 0.0, "completion_length": 143.28125762939453, "epoch": 0.9173693086003373, "grad_norm": 12.035554252990222, "kl": 0.271484375, "learning_rate": 9.234234234234234e-07, "loss": 0.0003, "reward": 3.3170535564422607, "reward_std": 0.0954308807849884, "rewards/final_reward": 1.2198915213394868, "rewards/mask_iou_reward": 0.6099457606697434, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3170537948608398, "rewards/thk_ans_format_reward": 1.0, "step": 272, "think_completion_length": 72.875 }, { "clip_ratio": 0.0, "completion_length": 136.71875, "epoch": 0.9207419898819561, "grad_norm": 9.822815415789027, "kl": 0.2841796875, "learning_rate": 9.231418918918918e-07, "loss": 0.0003, "reward": 3.4125086069107056, "reward_std": 0.1690869778394699, "rewards/final_reward": 1.3490748535842736, "rewards/mask_iou_reward": 0.6745374267921368, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4125087261199951, "rewards/thk_ans_format_reward": 1.0, "step": 273, "think_completion_length": 65.75 }, { "clip_ratio": 0.0, "completion_length": 137.3541717529297, "epoch": 0.924114671163575, "grad_norm": 4.455084478392973, "kl": 0.302734375, "learning_rate": 9.228603603603603e-07, "loss": 0.0003, "reward": 3.191675901412964, "reward_std": 0.28003841638565063, "rewards/final_reward": 1.7114721652174203, "rewards/mask_iou_reward": 0.8557360826087101, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1916756629943848, "rewards/thk_ans_format_reward": 1.0, "step": 274, "think_completion_length": 71.16666666666666 }, { "clip_ratio": 0.0, "completion_length": 135.14583587646484, "epoch": 0.927487352445194, "grad_norm": 5.316324049123673, "kl": 0.27734375, "learning_rate": 9.225788288288288e-07, "loss": 0.0003, "reward": 2.9825011491775513, "reward_std": 0.04889613017439842, "rewards/final_reward": 0.6555297220953474, "rewards/mask_iou_reward": 0.3277648610476737, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9825010895729065, "rewards/thk_ans_format_reward": 1.0, "step": 275, "think_completion_length": 69.625 }, { "clip_ratio": 0.0, "completion_length": 139.8854217529297, "epoch": 0.9308600337268128, "grad_norm": 7.561019231281056, "kl": 0.2998046875, "learning_rate": 9.222972972972972e-07, "loss": 0.0003, "reward": 3.01677143573761, "reward_std": 0.34936605393886566, "rewards/final_reward": 0.778316704859161, "rewards/mask_iou_reward": 0.3891583524295805, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 1.0271879434585571, "rewards/thk_ans_format_reward": 1.0, "step": 276, "think_completion_length": 69.25 }, { "clip_ratio": 0.0, "completion_length": 141.6666717529297, "epoch": 0.9342327150084317, "grad_norm": 11.899753302570709, "kl": 0.365234375, "learning_rate": 9.220157657657657e-07, "loss": 0.0004, "reward": 3.115198850631714, "reward_std": 0.35679496824741364, "rewards/final_reward": 1.2854900381729721, "rewards/mask_iou_reward": 0.6427450190864861, "rewards/sam_format_reward": 0.9583333432674408, "rewards/sam_reward_func_ultra": 1.1568655371665955, "rewards/thk_ans_format_reward": 1.0, "step": 277, "think_completion_length": 65.41666666666667 }, { "clip_ratio": 0.0, "completion_length": 146.33333587646484, "epoch": 0.9376053962900506, "grad_norm": 6.487732011910408, "kl": 0.30859375, "learning_rate": 9.217342342342342e-07, "loss": 0.0003, "reward": 3.1566383838653564, "reward_std": 0.31383951008319855, "rewards/final_reward": 0.7289020591503977, "rewards/mask_iou_reward": 0.36445102957519887, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 1.1670548915863037, "rewards/thk_ans_format_reward": 1.0, "step": 278, "think_completion_length": 67.5 }, { "clip_ratio": 0.0, "completion_length": 131.0, "epoch": 0.9409780775716695, "grad_norm": 14.827734251735015, "kl": 0.3056640625, "learning_rate": 9.214527027027027e-07, "loss": 0.0003, "reward": 3.365423798561096, "reward_std": 0.22016916424036026, "rewards/final_reward": 1.4150596939165623, "rewards/mask_iou_reward": 0.7075298469582811, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.365423560142517, "rewards/thk_ans_format_reward": 1.0, "step": 279, "think_completion_length": 63.5 }, { "clip_ratio": 0.0, "completion_length": 136.77083587646484, "epoch": 0.9443507588532883, "grad_norm": 7.785269993330108, "kl": 0.2861328125, "learning_rate": 9.211711711711712e-07, "loss": 0.0003, "reward": 2.8190064430236816, "reward_std": 0.20893994718790054, "rewards/final_reward": 0.2796995562385235, "rewards/mask_iou_reward": 0.13984977811926175, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.8190064132213593, "rewards/thk_ans_format_reward": 1.0, "step": 280, "think_completion_length": 63.0 }, { "clip_ratio": 0.0, "completion_length": 137.22916793823242, "epoch": 0.9477234401349073, "grad_norm": 16.756402992934778, "kl": 0.30078125, "learning_rate": 9.208896396396396e-07, "loss": 0.0003, "reward": 3.037890315055847, "reward_std": 0.23893652856349945, "rewards/final_reward": 0.7540887574307235, "rewards/mask_iou_reward": 0.3770443787153617, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0378903150558472, "rewards/thk_ans_format_reward": 1.0, "step": 281, "think_completion_length": 61.83333333333333 }, { "clip_ratio": 0.0, "completion_length": 130.97916793823242, "epoch": 0.9510961214165261, "grad_norm": 5.730385007491753, "kl": 0.328125, "learning_rate": 9.206081081081081e-07, "loss": 0.0003, "reward": 2.7796366214752197, "reward_std": 0.22210410237312317, "rewards/final_reward": 0.9956611302221129, "rewards/mask_iou_reward": 0.49783056511105644, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.7796366810798645, "rewards/thk_ans_format_reward": 1.0, "step": 282, "think_completion_length": 60.708333333333336 }, { "clip_ratio": 0.0, "completion_length": 146.36458587646484, "epoch": 0.954468802698145, "grad_norm": 10.183292819390926, "kl": 0.310546875, "learning_rate": 9.203265765765765e-07, "loss": 0.0003, "reward": 3.1968239545822144, "reward_std": 0.17233379930257797, "rewards/final_reward": 1.6030817393916479, "rewards/mask_iou_reward": 0.8015408696958239, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1968241333961487, "rewards/thk_ans_format_reward": 1.0, "step": 283, "think_completion_length": 70.0 }, { "clip_ratio": 0.0, "completion_length": 131.0104217529297, "epoch": 0.9578414839797639, "grad_norm": 5.6452981089641865, "kl": 0.337890625, "learning_rate": 9.20045045045045e-07, "loss": 0.0003, "reward": 3.0770174264907837, "reward_std": 0.40977267920970917, "rewards/final_reward": 0.8966653625690325, "rewards/mask_iou_reward": 0.44833268128451625, "rewards/sam_format_reward": 0.9166666865348816, "rewards/sam_reward_func_ultra": 1.160350501537323, "rewards/thk_ans_format_reward": 1.0, "step": 284, "think_completion_length": 63.875 }, { "clip_ratio": 0.0, "completion_length": 146.31250762939453, "epoch": 0.9612141652613828, "grad_norm": 26.57340182822755, "kl": 0.3095703125, "learning_rate": 9.197635135135135e-07, "loss": 0.0003, "reward": 3.146125912666321, "reward_std": 0.36684100329875946, "rewards/final_reward": 1.7262357467138223, "rewards/mask_iou_reward": 0.8631178733569111, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1461257934570312, "rewards/thk_ans_format_reward": 1.0, "step": 285, "think_completion_length": 54.125 }, { "clip_ratio": 0.0, "completion_length": 118.64583587646484, "epoch": 0.9645868465430016, "grad_norm": 4.967807923742779, "kl": 0.3525390625, "learning_rate": 9.194819819819819e-07, "loss": 0.0004, "reward": 2.8754937648773193, "reward_std": 0.11622428148984909, "rewards/final_reward": 0.728122555218105, "rewards/mask_iou_reward": 0.3640612776090525, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.8754937052726746, "rewards/thk_ans_format_reward": 1.0, "step": 286, "think_completion_length": 59.5 }, { "clip_ratio": 0.0, "completion_length": 133.9791717529297, "epoch": 0.9679595278246206, "grad_norm": 7.684197556109967, "kl": 0.3359375, "learning_rate": 9.192004504504504e-07, "loss": 0.0003, "reward": 3.0598126649856567, "reward_std": 0.2900129407644272, "rewards/final_reward": 1.679721792899501, "rewards/mask_iou_reward": 0.8398608964497505, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0598126649856567, "rewards/thk_ans_format_reward": 1.0, "step": 287, "think_completion_length": 56.708333333333336 }, { "clip_ratio": 0.0, "completion_length": 136.2395896911621, "epoch": 0.9713322091062394, "grad_norm": 5.760732467267119, "kl": 0.365234375, "learning_rate": 9.18918918918919e-07, "loss": 0.0004, "reward": 2.9172106981277466, "reward_std": 0.15981094166636467, "rewards/final_reward": 0.9524224872353481, "rewards/mask_iou_reward": 0.47621124361767403, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9172105193138123, "rewards/thk_ans_format_reward": 1.0, "step": 288, "think_completion_length": 56.04166666666667 }, { "clip_ratio": 0.0, "completion_length": 131.6354217529297, "epoch": 0.9747048903878583, "grad_norm": 5.7714829299216355, "kl": 0.3427734375, "learning_rate": 9.186373873873874e-07, "loss": 0.0003, "reward": 3.3801859617233276, "reward_std": 0.2743668332695961, "rewards/final_reward": 1.7714707382638077, "rewards/mask_iou_reward": 0.8857353691319039, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3801860809326172, "rewards/thk_ans_format_reward": 1.0, "step": 289, "think_completion_length": 57.583333333333336 }, { "clip_ratio": 0.0, "completion_length": 123.75000381469727, "epoch": 0.9780775716694773, "grad_norm": 16.609336956336424, "kl": 0.3720703125, "learning_rate": 9.183558558558559e-07, "loss": 0.0005, "reward": 3.2253894805908203, "reward_std": 0.17594532668590546, "rewards/final_reward": 1.675658520954385, "rewards/mask_iou_reward": 0.8378292604771925, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 1.2358060479164124, "rewards/thk_ans_format_reward": 1.0, "step": 290, "think_completion_length": 55.20833333333333 }, { "clip_ratio": 0.0, "completion_length": 130.18750762939453, "epoch": 0.9814502529510961, "grad_norm": 11.02554804355287, "kl": 0.353515625, "learning_rate": 9.180743243243243e-07, "loss": 0.0004, "reward": 3.0442588329315186, "reward_std": 0.31053662300109863, "rewards/final_reward": 1.8277712111056867, "rewards/mask_iou_reward": 0.9138856055528434, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0442588925361633, "rewards/thk_ans_format_reward": 1.0, "step": 291, "think_completion_length": 53.875 }, { "clip_ratio": 0.0, "completion_length": 122.67708587646484, "epoch": 0.984822934232715, "grad_norm": 6.016837428298982, "kl": 0.3681640625, "learning_rate": 9.177927927927928e-07, "loss": 0.0004, "reward": 3.389703154563904, "reward_std": 0.17725949734449387, "rewards/final_reward": 1.3419651151754086, "rewards/mask_iou_reward": 0.6709825575877043, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3897031545639038, "rewards/thk_ans_format_reward": 1.0, "step": 292, "think_completion_length": 55.5 }, { "clip_ratio": 0.0, "completion_length": 127.375, "epoch": 0.9881956155143339, "grad_norm": 10.216163938938413, "kl": 0.3359375, "learning_rate": 9.175112612612613e-07, "loss": 0.0003, "reward": 3.1626222133636475, "reward_std": 0.20485194586217403, "rewards/final_reward": 1.1873850468419163, "rewards/mask_iou_reward": 0.5936925234209581, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1626221537590027, "rewards/thk_ans_format_reward": 1.0, "step": 293, "think_completion_length": 53.20833333333333 }, { "clip_ratio": 0.0, "completion_length": 140.7604217529297, "epoch": 0.9915682967959528, "grad_norm": 8.66201289787897, "kl": 0.373046875, "learning_rate": 9.172297297297297e-07, "loss": 0.0004, "reward": 3.003246307373047, "reward_std": 0.18188440799713135, "rewards/final_reward": 1.3201920244706054, "rewards/mask_iou_reward": 0.6600960122353027, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0032461285591125, "rewards/thk_ans_format_reward": 1.0, "step": 294, "think_completion_length": 57.0 }, { "clip_ratio": 0.0, "completion_length": 126.31250762939453, "epoch": 0.9949409780775716, "grad_norm": 10.932256085842326, "kl": 0.3720703125, "learning_rate": 9.169481981981982e-07, "loss": 0.0004, "reward": 2.940837025642395, "reward_std": 0.1912689208984375, "rewards/final_reward": 0.8208294820244021, "rewards/mask_iou_reward": 0.41041474101220105, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9408369064331055, "rewards/thk_ans_format_reward": 1.0, "step": 295, "think_completion_length": 54.58333333333333 }, { "clip_ratio": 0.0, "completion_length": 141.92105102539062, "epoch": 0.9983136593591906, "grad_norm": 12.666395801396902, "kl": 0.4267578125, "learning_rate": 9.166666666666665e-07, "loss": 0.0004, "reward": 2.8901513814926147, "reward_std": 0.2604110687971115, "rewards/final_reward": 0.7894457895435861, "rewards/mask_iou_reward": 0.39472289477179306, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.8901514261960983, "rewards/thk_ans_format_reward": 1.0, "step": 296, "think_completion_length": 49.95833333333333 }, { "clip_ratio": 0.0, "completion_length": 136.47916793823242, "epoch": 1.0033726812816188, "grad_norm": 5.005331899064764, "kl": 0.4736328125, "learning_rate": 9.16385135135135e-07, "loss": 0.0005, "reward": 3.0064163208007812, "reward_std": 0.13095365837216377, "rewards/final_reward": 1.0119502184697924, "rewards/mask_iou_reward": 0.5059751092348962, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0064163208007812, "rewards/thk_ans_format_reward": 1.0, "step": 297, "think_completion_length": 52.29166666666667 }, { "clip_ratio": 0.0, "completion_length": 124.40625381469727, "epoch": 1.0067453625632379, "grad_norm": 13.649116642243577, "kl": 0.421875, "learning_rate": 9.161036036036036e-07, "loss": 0.0004, "reward": 3.4440163373947144, "reward_std": 0.20361152291297913, "rewards/final_reward": 1.5423490000752769, "rewards/mask_iou_reward": 0.7711745000376384, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4440162181854248, "rewards/thk_ans_format_reward": 1.0, "step": 298, "think_completion_length": 53.70833333333333 }, { "clip_ratio": 0.0, "completion_length": 122.96875381469727, "epoch": 1.0101180438448567, "grad_norm": 5.44147932218635, "kl": 0.458984375, "learning_rate": 9.15822072072072e-07, "loss": 0.0005, "reward": 3.1435067653656006, "reward_std": 0.33636271953582764, "rewards/final_reward": 1.2787666554971673, "rewards/mask_iou_reward": 0.6393833277485836, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1435065269470215, "rewards/thk_ans_format_reward": 1.0, "step": 299, "think_completion_length": 50.125 }, { "clip_ratio": 0.0, "completion_length": 152.52083587646484, "epoch": 1.0134907251264755, "grad_norm": 13.105966422900543, "kl": 0.42578125, "learning_rate": 9.155405405405405e-07, "loss": 0.0004, "reward": 2.835780143737793, "reward_std": 0.22343186289072037, "rewards/final_reward": 0.05838687385318925, "rewards/mask_iou_reward": 0.029193436926594624, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.8357802033424377, "rewards/thk_ans_format_reward": 1.0, "step": 300, "think_completion_length": 53.583333333333336 }, { "clip_ratio": 0.0, "completion_length": 120.22917175292969, "epoch": 1.0168634064080944, "grad_norm": 16.097568168056785, "kl": 0.3955078125, "learning_rate": 9.152590090090089e-07, "loss": 0.0004, "reward": 3.235043168067932, "reward_std": 0.3790423274040222, "rewards/final_reward": 1.5514956112814668, "rewards/mask_iou_reward": 0.7757478056407334, "rewards/sam_format_reward": 0.9375, "rewards/sam_reward_func_ultra": 1.2975430488586426, "rewards/thk_ans_format_reward": 1.0, "step": 301, "think_completion_length": 54.125 }, { "clip_ratio": 0.0, "completion_length": 128.55208587646484, "epoch": 1.0202360876897134, "grad_norm": 5.126815049659408, "kl": 0.3359375, "learning_rate": 9.149774774774774e-07, "loss": 0.0003, "reward": 3.221487045288086, "reward_std": 0.2783351540565491, "rewards/final_reward": 0.9219856744032554, "rewards/mask_iou_reward": 0.4609928372016277, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2214871644973755, "rewards/thk_ans_format_reward": 1.0, "step": 302, "think_completion_length": 54.041666666666664 }, { "clip_ratio": 0.0, "completion_length": 132.125, "epoch": 1.0236087689713322, "grad_norm": 7.340733693513745, "kl": 0.54296875, "learning_rate": 9.146959459459459e-07, "loss": 0.0005, "reward": 3.38637638092041, "reward_std": 0.14389759302139282, "rewards/final_reward": 1.8092322656026716, "rewards/mask_iou_reward": 0.9046161328013358, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3863762021064758, "rewards/thk_ans_format_reward": 1.0, "step": 303, "think_completion_length": 51.166666666666664 }, { "clip_ratio": 0.0, "completion_length": 119.40625, "epoch": 1.026981450252951, "grad_norm": 5.265802698091234, "kl": 0.3916015625, "learning_rate": 9.144144144144143e-07, "loss": 0.0004, "reward": 3.4280911684036255, "reward_std": 0.12274213880300522, "rewards/final_reward": 1.5388529343411281, "rewards/mask_iou_reward": 0.7694264671705641, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4280911087989807, "rewards/thk_ans_format_reward": 1.0, "step": 304, "think_completion_length": 59.29166666666667 }, { "clip_ratio": 0.0, "completion_length": 125.13542175292969, "epoch": 1.03035413153457, "grad_norm": 7.1611615022036, "kl": 0.373046875, "learning_rate": 9.141328828828828e-07, "loss": 0.0004, "reward": 3.184275269508362, "reward_std": 0.14417023956775665, "rewards/final_reward": 0.579938548943454, "rewards/mask_iou_reward": 0.289969274471727, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1842751204967499, "rewards/thk_ans_format_reward": 1.0, "step": 305, "think_completion_length": 57.791666666666664 }, { "clip_ratio": 0.0, "completion_length": 134.37500381469727, "epoch": 1.033726812816189, "grad_norm": 13.402703536491135, "kl": 0.3583984375, "learning_rate": 9.138513513513512e-07, "loss": 0.0004, "reward": 3.2078309059143066, "reward_std": 0.23842425644397736, "rewards/final_reward": 0.8247731713945541, "rewards/mask_iou_reward": 0.41238658569727704, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.207830786705017, "rewards/thk_ans_format_reward": 1.0, "step": 306, "think_completion_length": 52.208333333333336 }, { "clip_ratio": 0.0, "completion_length": 125.34375, "epoch": 1.0370994940978078, "grad_norm": 5.58524354551215, "kl": 0.3662109375, "learning_rate": 9.135698198198197e-07, "loss": 0.0004, "reward": 3.319042444229126, "reward_std": 0.10922817140817642, "rewards/final_reward": 1.763124109472381, "rewards/mask_iou_reward": 0.8815620547361905, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3190423250198364, "rewards/thk_ans_format_reward": 1.0, "step": 307, "think_completion_length": 56.54166666666667 }, { "clip_ratio": 0.0, "completion_length": 140.59375762939453, "epoch": 1.0404721753794266, "grad_norm": 5.602803151228455, "kl": 0.388671875, "learning_rate": 9.132882882882883e-07, "loss": 0.0004, "reward": 3.567653179168701, "reward_std": 0.15950121730566025, "rewards/final_reward": 1.6246588918868299, "rewards/mask_iou_reward": 0.8123294459434149, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5676530003547668, "rewards/thk_ans_format_reward": 1.0, "step": 308, "think_completion_length": 58.875 }, { "clip_ratio": 0.0, "completion_length": 128.93750381469727, "epoch": 1.0438448566610454, "grad_norm": 6.580711507853475, "kl": 0.4091796875, "learning_rate": 9.130067567567567e-07, "loss": 0.0004, "reward": 3.3005740642547607, "reward_std": 0.16971008479595184, "rewards/final_reward": 0.9619296225234968, "rewards/mask_iou_reward": 0.4809648112617484, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3005741834640503, "rewards/thk_ans_format_reward": 1.0, "step": 309, "think_completion_length": 57.41666666666667 }, { "clip_ratio": 0.0, "completion_length": 121.27083587646484, "epoch": 1.0472175379426645, "grad_norm": 4.5198260011088855, "kl": 0.4189453125, "learning_rate": 9.127252252252252e-07, "loss": 0.0004, "reward": 3.3053261041641235, "reward_std": 0.18478236347436905, "rewards/final_reward": 0.9405771749478797, "rewards/mask_iou_reward": 0.47028858747393987, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3053261637687683, "rewards/thk_ans_format_reward": 1.0, "step": 310, "think_completion_length": 61.625 }, { "clip_ratio": 0.0, "completion_length": 124.90625381469727, "epoch": 1.0505902192242833, "grad_norm": 5.3429957757313975, "kl": 0.3955078125, "learning_rate": 9.124436936936937e-07, "loss": 0.0004, "reward": 3.3730320930480957, "reward_std": 0.122093815356493, "rewards/final_reward": 1.3094642304518052, "rewards/mask_iou_reward": 0.6547321152259026, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3730321526527405, "rewards/thk_ans_format_reward": 1.0, "step": 311, "think_completion_length": 61.5 }, { "clip_ratio": 0.0, "completion_length": 145.14583587646484, "epoch": 1.0539629005059021, "grad_norm": 39.54666840814476, "kl": 0.640625, "learning_rate": 9.121621621621621e-07, "loss": 0.0006, "reward": 3.1350291967391968, "reward_std": 0.17641381546854973, "rewards/final_reward": 1.493971782697706, "rewards/mask_iou_reward": 0.746985891348853, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1350291967391968, "rewards/thk_ans_format_reward": 1.0, "step": 312, "think_completion_length": 60.333333333333336 }, { "clip_ratio": 0.0, "completion_length": 124.47916793823242, "epoch": 1.0573355817875212, "grad_norm": 15.95302602085042, "kl": 0.4345703125, "learning_rate": 9.118806306306306e-07, "loss": 0.0004, "reward": 3.29574453830719, "reward_std": 0.12975647673010826, "rewards/final_reward": 1.198893491412423, "rewards/mask_iou_reward": 0.5994467457062115, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.29574453830719, "rewards/thk_ans_format_reward": 1.0, "step": 313, "think_completion_length": 61.5 }, { "clip_ratio": 0.0, "completion_length": 131.64583587646484, "epoch": 1.06070826306914, "grad_norm": 14.401676576205926, "kl": 0.3408203125, "learning_rate": 9.11599099099099e-07, "loss": 0.0003, "reward": 3.13248074054718, "reward_std": 0.31112509220838547, "rewards/final_reward": 1.2162999474308984, "rewards/mask_iou_reward": 0.6081499737154492, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1324808597564697, "rewards/thk_ans_format_reward": 1.0, "step": 314, "think_completion_length": 58.54166666666667 }, { "clip_ratio": 0.0, "completion_length": 142.8541717529297, "epoch": 1.0640809443507588, "grad_norm": 8.28584318628263, "kl": 0.39453125, "learning_rate": 9.113175675675675e-07, "loss": 0.0004, "reward": 3.271073341369629, "reward_std": 0.2790771424770355, "rewards/final_reward": 1.3517874102249416, "rewards/mask_iou_reward": 0.6758937051124708, "rewards/sam_format_reward": 0.9791666865348816, "rewards/sam_reward_func_ultra": 1.2919068932533264, "rewards/thk_ans_format_reward": 1.0, "step": 315, "think_completion_length": 57.416666666666664 }, { "clip_ratio": 0.0, "completion_length": 136.33333587646484, "epoch": 1.0674536256323777, "grad_norm": 7.111793447392556, "kl": 0.91796875, "learning_rate": 9.11036036036036e-07, "loss": 0.0009, "reward": 3.2280776500701904, "reward_std": 0.2805949002504349, "rewards/final_reward": 0.7988367844819377, "rewards/mask_iou_reward": 0.39941839224096887, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2280775904655457, "rewards/thk_ans_format_reward": 1.0, "step": 316, "think_completion_length": 64.375 }, { "clip_ratio": 0.0, "completion_length": 128.0729217529297, "epoch": 1.0708263069139967, "grad_norm": 9.066928980109676, "kl": 0.3876953125, "learning_rate": 9.107545045045044e-07, "loss": 0.0004, "reward": 3.4015519618988037, "reward_std": 0.14515436440706253, "rewards/final_reward": 1.753466714461804, "rewards/mask_iou_reward": 0.876733357230902, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4015517830848694, "rewards/thk_ans_format_reward": 1.0, "step": 317, "think_completion_length": 57.58333333333333 }, { "clip_ratio": 0.0, "completion_length": 130.31250381469727, "epoch": 1.0741989881956155, "grad_norm": 5.169293870970053, "kl": 0.373046875, "learning_rate": 9.10472972972973e-07, "loss": 0.0004, "reward": 3.331157088279724, "reward_std": 0.24197855591773987, "rewards/final_reward": 1.239624879007727, "rewards/mask_iou_reward": 0.6198124395038636, "rewards/sam_format_reward": 0.9791666865348816, "rewards/sam_reward_func_ultra": 1.3519902229309082, "rewards/thk_ans_format_reward": 1.0, "step": 318, "think_completion_length": 64.16666666666667 }, { "clip_ratio": 0.0, "completion_length": 123.90625, "epoch": 1.0775716694772344, "grad_norm": 6.663895649543028, "kl": 0.40234375, "learning_rate": 9.101914414414415e-07, "loss": 0.0004, "reward": 3.3463969230651855, "reward_std": 0.27501678466796875, "rewards/final_reward": 0.9133032025802322, "rewards/mask_iou_reward": 0.4566516012901161, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.346396803855896, "rewards/thk_ans_format_reward": 1.0, "step": 319, "think_completion_length": 56.291666666666664 }, { "clip_ratio": 0.0, "completion_length": 129.25, "epoch": 1.0809443507588532, "grad_norm": 4.811356387200662, "kl": 0.3759765625, "learning_rate": 9.099099099099099e-07, "loss": 0.0004, "reward": 3.1629140377044678, "reward_std": 0.16561146825551987, "rewards/final_reward": 1.28658606261834, "rewards/mask_iou_reward": 0.64329303130917, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1629141569137573, "rewards/thk_ans_format_reward": 1.0, "step": 320, "think_completion_length": 56.16666666666667 }, { "clip_ratio": 0.0, "completion_length": 128.68750762939453, "epoch": 1.0843170320404723, "grad_norm": 5.455993712741003, "kl": 0.4814453125, "learning_rate": 9.096283783783784e-07, "loss": 0.0005, "reward": 3.163417100906372, "reward_std": 0.08710319921374321, "rewards/final_reward": 0.9828214154254767, "rewards/mask_iou_reward": 0.49141070771273837, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1634172201156616, "rewards/thk_ans_format_reward": 1.0, "step": 321, "think_completion_length": 64.91666666666667 }, { "clip_ratio": 0.0, "completion_length": 128.03125762939453, "epoch": 1.087689713322091, "grad_norm": 20.96952304765022, "kl": 0.5859375, "learning_rate": 9.093468468468468e-07, "loss": 0.0006, "reward": 3.3637170791625977, "reward_std": 0.17667383700609207, "rewards/final_reward": 1.1931116823244388, "rewards/mask_iou_reward": 0.5965558411622194, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3637170791625977, "rewards/thk_ans_format_reward": 1.0, "step": 322, "think_completion_length": 58.375 }, { "clip_ratio": 0.0, "completion_length": 125.91667175292969, "epoch": 1.09106239460371, "grad_norm": 27.194575381051997, "kl": 0.4462890625, "learning_rate": 9.090653153153153e-07, "loss": 0.0004, "reward": 3.3401143550872803, "reward_std": 0.09220879897475243, "rewards/final_reward": 0.9237625026772636, "rewards/mask_iou_reward": 0.4618812513386318, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3401142954826355, "rewards/thk_ans_format_reward": 1.0, "step": 323, "think_completion_length": 61.0 }, { "clip_ratio": 0.0, "completion_length": 134.6354217529297, "epoch": 1.0944350758853287, "grad_norm": 7.9243269526549325, "kl": 0.4013671875, "learning_rate": 9.087837837837838e-07, "loss": 0.0004, "reward": 3.462162137031555, "reward_std": 0.17394614964723587, "rewards/final_reward": 0.9065336502215842, "rewards/mask_iou_reward": 0.4532668251107921, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.462161898612976, "rewards/thk_ans_format_reward": 1.0, "step": 324, "think_completion_length": 59.0 }, { "clip_ratio": 0.0, "completion_length": 121.55208587646484, "epoch": 1.0978077571669478, "grad_norm": 7.6876068680615495, "kl": 0.396484375, "learning_rate": 9.085022522522522e-07, "loss": 0.0004, "reward": 3.240488648414612, "reward_std": 0.24380206316709518, "rewards/final_reward": 0.9493554668714257, "rewards/mask_iou_reward": 0.47467773343571285, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2404886484146118, "rewards/thk_ans_format_reward": 1.0, "step": 325, "think_completion_length": 61.20833333333333 }, { "clip_ratio": 0.0, "completion_length": 124.93750381469727, "epoch": 1.1011804384485666, "grad_norm": 17.497626082493245, "kl": 0.40625, "learning_rate": 9.082207207207207e-07, "loss": 0.0004, "reward": 2.984956741333008, "reward_std": 0.16040324792265892, "rewards/final_reward": 1.161768861024481, "rewards/mask_iou_reward": 0.5808844305122405, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 0.9953732788562775, "rewards/thk_ans_format_reward": 1.0, "step": 326, "think_completion_length": 60.25 }, { "clip_ratio": 0.0, "completion_length": 119.0625, "epoch": 1.1045531197301854, "grad_norm": 8.722872084277531, "kl": 0.4140625, "learning_rate": 9.079391891891891e-07, "loss": 0.0004, "reward": 3.3815075159072876, "reward_std": 0.2122548222541809, "rewards/final_reward": 1.1613737494992136, "rewards/mask_iou_reward": 0.5806868747496068, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3815075755119324, "rewards/thk_ans_format_reward": 1.0, "step": 327, "think_completion_length": 54.08333333333333 }, { "clip_ratio": 0.0, "completion_length": 131.93750762939453, "epoch": 1.1079258010118043, "grad_norm": 6.552934117656638, "kl": 0.3876953125, "learning_rate": 9.076576576576577e-07, "loss": 0.0004, "reward": 3.2308026552200317, "reward_std": 0.32459303736686707, "rewards/final_reward": 0.9769021255318333, "rewards/mask_iou_reward": 0.48845106276591665, "rewards/sam_format_reward": 0.9270833432674408, "rewards/sam_reward_func_ultra": 1.3037192821502686, "rewards/thk_ans_format_reward": 1.0, "step": 328, "think_completion_length": 58.25 }, { "clip_ratio": 0.0, "completion_length": 125.34375, "epoch": 1.1112984822934233, "grad_norm": 5.333888749074564, "kl": 0.40625, "learning_rate": 9.073761261261262e-07, "loss": 0.0004, "reward": 3.342952847480774, "reward_std": 0.16162853688001633, "rewards/final_reward": 1.0840490033547516, "rewards/mask_iou_reward": 0.5420245016773758, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3429526686668396, "rewards/thk_ans_format_reward": 1.0, "step": 329, "think_completion_length": 55.33333333333333 }, { "clip_ratio": 0.0, "completion_length": 128.8333396911621, "epoch": 1.1146711635750421, "grad_norm": 24.794201686691693, "kl": 0.435546875, "learning_rate": 9.070945945945946e-07, "loss": 0.0005, "reward": 3.0924460887908936, "reward_std": 0.13998809456825256, "rewards/final_reward": 0.839294410155661, "rewards/mask_iou_reward": 0.4196472050778305, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 1.1028627753257751, "rewards/thk_ans_format_reward": 1.0, "step": 330, "think_completion_length": 64.0 }, { "clip_ratio": 0.0, "completion_length": 141.41666793823242, "epoch": 1.118043844856661, "grad_norm": 14.98694410165655, "kl": 0.3916015625, "learning_rate": 9.068130630630631e-07, "loss": 0.0004, "reward": 3.1274584531784058, "reward_std": 0.2977043092250824, "rewards/final_reward": 0.9250505167662035, "rewards/mask_iou_reward": 0.46252525838310177, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 1.1378751993179321, "rewards/thk_ans_format_reward": 1.0, "step": 331, "think_completion_length": 60.91666666666667 }, { "clip_ratio": 0.0, "completion_length": 129.37500381469727, "epoch": 1.12141652613828, "grad_norm": 5.783084805803711, "kl": 0.44921875, "learning_rate": 9.065315315315315e-07, "loss": 0.0004, "reward": 2.9259408712387085, "reward_std": 0.2695099413394928, "rewards/final_reward": 0.5560096592874704, "rewards/mask_iou_reward": 0.2780048296437352, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9259407222270966, "rewards/thk_ans_format_reward": 1.0, "step": 332, "think_completion_length": 62.45833333333333 }, { "clip_ratio": 0.0, "completion_length": 153.8541717529297, "epoch": 1.1247892074198989, "grad_norm": 9.244155069198841, "kl": 0.470703125, "learning_rate": 9.0625e-07, "loss": 0.0005, "reward": 3.3988051414489746, "reward_std": 0.14826885610818863, "rewards/final_reward": 1.260855574702926, "rewards/mask_iou_reward": 0.630427787351463, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3988049626350403, "rewards/thk_ans_format_reward": 1.0, "step": 333, "think_completion_length": 61.45833333333333 }, { "clip_ratio": 0.0, "completion_length": 125.73958587646484, "epoch": 1.1281618887015177, "grad_norm": 19.766878070326953, "kl": 0.412109375, "learning_rate": 9.059684684684685e-07, "loss": 0.0004, "reward": 3.0867655277252197, "reward_std": 0.2322430983185768, "rewards/final_reward": 0.8911001395752802, "rewards/mask_iou_reward": 0.4455500697876401, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0867655277252197, "rewards/thk_ans_format_reward": 1.0, "step": 334, "think_completion_length": 64.83333333333333 }, { "clip_ratio": 0.0, "completion_length": 126.42708969116211, "epoch": 1.1315345699831365, "grad_norm": 5.298022005482819, "kl": 0.4306640625, "learning_rate": 9.056869369369369e-07, "loss": 0.0004, "reward": 3.566340446472168, "reward_std": 0.08108654618263245, "rewards/final_reward": 1.876243345798129, "rewards/mask_iou_reward": 0.9381216728990645, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5663405060768127, "rewards/thk_ans_format_reward": 1.0, "step": 335, "think_completion_length": 61.875 }, { "clip_ratio": 0.0, "completion_length": 126.68750381469727, "epoch": 1.1349072512647556, "grad_norm": 12.434004407283455, "kl": 0.4384765625, "learning_rate": 9.054054054054053e-07, "loss": 0.0005, "reward": 3.3265706300735474, "reward_std": 0.23486195504665375, "rewards/final_reward": 1.474851406697328, "rewards/mask_iou_reward": 0.737425703348664, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3265705704689026, "rewards/thk_ans_format_reward": 1.0, "step": 336, "think_completion_length": 61.95833333333333 }, { "clip_ratio": 0.0, "completion_length": 156.0729217529297, "epoch": 1.1382799325463744, "grad_norm": 7.178608067119752, "kl": 0.533203125, "learning_rate": 9.051238738738737e-07, "loss": 0.0005, "reward": 3.492894411087036, "reward_std": 0.2055719941854477, "rewards/final_reward": 0.990075326096736, "rewards/mask_iou_reward": 0.495037663048368, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 1.5033112168312073, "rewards/thk_ans_format_reward": 1.0, "step": 337, "think_completion_length": 54.041666666666664 }, { "clip_ratio": 0.0, "completion_length": 127.22917175292969, "epoch": 1.1416526138279932, "grad_norm": 5.280799948687187, "kl": 0.48046875, "learning_rate": 9.048423423423422e-07, "loss": 0.0005, "reward": 3.5345152616500854, "reward_std": 0.10706453770399094, "rewards/final_reward": 1.7453732959394208, "rewards/mask_iou_reward": 0.8726866479697104, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5449321866035461, "rewards/thk_ans_format_reward": 0.9895833432674408, "step": 338, "think_completion_length": 62.083333333333336 }, { "clip_ratio": 0.0, "completion_length": 130.30208587646484, "epoch": 1.1450252951096123, "grad_norm": 7.207464617440506, "kl": 0.3759765625, "learning_rate": 9.045608108108108e-07, "loss": 0.0004, "reward": 2.865417242050171, "reward_std": 0.37941035628318787, "rewards/final_reward": 0.8771180013313473, "rewards/mask_iou_reward": 0.43855900066567366, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.8654172122478485, "rewards/thk_ans_format_reward": 1.0, "step": 339, "think_completion_length": 71.95833333333333 }, { "clip_ratio": 0.0, "completion_length": 159.92708587646484, "epoch": 1.148397976391231, "grad_norm": 11.641408008070787, "kl": 0.4130859375, "learning_rate": 9.042792792792792e-07, "loss": 0.0004, "reward": 3.2123430967330933, "reward_std": 0.2588522955775261, "rewards/final_reward": 1.4110095741889364, "rewards/mask_iou_reward": 0.7055047870944682, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.21234330534935, "rewards/thk_ans_format_reward": 1.0, "step": 340, "think_completion_length": 65.58333333333333 }, { "clip_ratio": 0.0, "completion_length": 133.13541793823242, "epoch": 1.15177065767285, "grad_norm": 6.67371298694648, "kl": 0.4033203125, "learning_rate": 9.039977477477477e-07, "loss": 0.0004, "reward": 3.1938616037368774, "reward_std": 0.16771592944860458, "rewards/final_reward": 1.5646572712220155, "rewards/mask_iou_reward": 0.7823286356110077, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1938616037368774, "rewards/thk_ans_format_reward": 1.0, "step": 341, "think_completion_length": 62.70833333333333 }, { "clip_ratio": 0.0, "completion_length": 129.16666793823242, "epoch": 1.1551433389544687, "grad_norm": 5.263702820720401, "kl": 0.5166015625, "learning_rate": 9.037162162162162e-07, "loss": 0.0005, "reward": 3.3237626552581787, "reward_std": 0.17711199820041656, "rewards/final_reward": 1.1942757440934884, "rewards/mask_iou_reward": 0.5971378720467442, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.323762595653534, "rewards/thk_ans_format_reward": 1.0, "step": 342, "think_completion_length": 58.45833333333333 }, { "clip_ratio": 0.0, "completion_length": 132.80208587646484, "epoch": 1.1585160202360876, "grad_norm": 6.791804755889098, "kl": 0.4521484375, "learning_rate": 9.034346846846846e-07, "loss": 0.0005, "reward": 3.1926233768463135, "reward_std": 0.1412236988544464, "rewards/final_reward": 1.196448614697763, "rewards/mask_iou_reward": 0.5982243073488815, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1926233768463135, "rewards/thk_ans_format_reward": 1.0, "step": 343, "think_completion_length": 61.91666666666667 }, { "clip_ratio": 0.0, "completion_length": 141.14583587646484, "epoch": 1.1618887015177066, "grad_norm": 6.534097465373398, "kl": 0.4228515625, "learning_rate": 9.031531531531531e-07, "loss": 0.0004, "reward": 2.971271514892578, "reward_std": 0.3797031044960022, "rewards/final_reward": 1.2611627610058478, "rewards/mask_iou_reward": 0.6305813805029239, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 0.9816881716251373, "rewards/thk_ans_format_reward": 1.0, "step": 344, "think_completion_length": 65.29166666666666 }, { "clip_ratio": 0.0, "completion_length": 125.53125381469727, "epoch": 1.1652613827993255, "grad_norm": 7.74065202517212, "kl": 0.46875, "learning_rate": 9.028716216216215e-07, "loss": 0.0005, "reward": 3.186206817626953, "reward_std": 0.11258535459637642, "rewards/final_reward": 1.2044975491309748, "rewards/mask_iou_reward": 0.6022487745654874, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1862066984176636, "rewards/thk_ans_format_reward": 1.0, "step": 345, "think_completion_length": 64.125 }, { "clip_ratio": 0.0, "completion_length": 133.3229217529297, "epoch": 1.1686340640809443, "grad_norm": 6.48305370335897, "kl": 0.4521484375, "learning_rate": 9.0259009009009e-07, "loss": 0.0005, "reward": 3.4441089630126953, "reward_std": 0.23721785843372345, "rewards/final_reward": 1.1137867720324555, "rewards/mask_iou_reward": 0.5568933860162277, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4441088438034058, "rewards/thk_ans_format_reward": 1.0, "step": 346, "think_completion_length": 66.20833333333334 }, { "clip_ratio": 0.0, "completion_length": 121.98958587646484, "epoch": 1.1720067453625633, "grad_norm": 11.63205530379195, "kl": 0.4541015625, "learning_rate": 9.023085585585585e-07, "loss": 0.0005, "reward": 3.355055093765259, "reward_std": 0.1520690880715847, "rewards/final_reward": 0.31849268904859007, "rewards/mask_iou_reward": 0.15924634452429504, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3550548553466797, "rewards/thk_ans_format_reward": 1.0, "step": 347, "think_completion_length": 67.04166666666666 }, { "clip_ratio": 0.0, "completion_length": 124.09375381469727, "epoch": 1.1753794266441822, "grad_norm": 6.91360662733961, "kl": 0.4638671875, "learning_rate": 9.020270270270269e-07, "loss": 0.0005, "reward": 3.23860239982605, "reward_std": 0.09308822453022003, "rewards/final_reward": 0.7723889578444937, "rewards/mask_iou_reward": 0.38619447892224684, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.238602489233017, "rewards/thk_ans_format_reward": 1.0, "step": 348, "think_completion_length": 58.58333333333333 }, { "clip_ratio": 0.0, "completion_length": 133.98958587646484, "epoch": 1.178752107925801, "grad_norm": 6.787137639312774, "kl": 0.400390625, "learning_rate": 9.017454954954955e-07, "loss": 0.0005, "reward": 3.483386754989624, "reward_std": 0.15565379709005356, "rewards/final_reward": 1.2750090210076546, "rewards/mask_iou_reward": 0.6375045105038273, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4833868145942688, "rewards/thk_ans_format_reward": 1.0, "step": 349, "think_completion_length": 60.33333333333333 }, { "clip_ratio": 0.0, "completion_length": 127.92708587646484, "epoch": 1.1821247892074198, "grad_norm": 6.595761566501617, "kl": 0.4248046875, "learning_rate": 9.014639639639639e-07, "loss": 0.0004, "reward": 3.601720929145813, "reward_std": 0.10276154428720474, "rewards/final_reward": 1.8197904926380537, "rewards/mask_iou_reward": 0.9098952463190269, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6017211079597473, "rewards/thk_ans_format_reward": 1.0, "step": 350, "think_completion_length": 52.708333333333336 }, { "clip_ratio": 0.0, "completion_length": 124.72916793823242, "epoch": 1.1854974704890389, "grad_norm": 19.507311166473414, "kl": 0.416015625, "learning_rate": 9.011824324324324e-07, "loss": 0.0004, "reward": 3.3562408685684204, "reward_std": 0.13687966763973236, "rewards/final_reward": 1.2024907054821465, "rewards/mask_iou_reward": 0.6012453527410733, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3562407493591309, "rewards/thk_ans_format_reward": 1.0, "step": 351, "think_completion_length": 60.83333333333333 }, { "clip_ratio": 0.0, "completion_length": 132.875, "epoch": 1.1888701517706577, "grad_norm": 19.857489845731262, "kl": 0.47265625, "learning_rate": 9.009009009009009e-07, "loss": 0.0005, "reward": 3.7000582218170166, "reward_std": 0.07116364687681198, "rewards/final_reward": 1.3070988339251544, "rewards/mask_iou_reward": 0.6535494169625772, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.700058102607727, "rewards/thk_ans_format_reward": 1.0, "step": 352, "think_completion_length": 64.83333333333333 }, { "clip_ratio": 0.0, "completion_length": 124.48958587646484, "epoch": 1.1922428330522765, "grad_norm": 8.064821636428672, "kl": 0.45703125, "learning_rate": 9.006193693693693e-07, "loss": 0.0004, "reward": 3.244594097137451, "reward_std": 0.16334939748048782, "rewards/final_reward": 1.0210304897324536, "rewards/mask_iou_reward": 0.5105152448662268, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2445939779281616, "rewards/thk_ans_format_reward": 1.0, "step": 353, "think_completion_length": 60.95833333333333 }, { "clip_ratio": 0.0, "completion_length": 126.05208969116211, "epoch": 1.1956155143338956, "grad_norm": 37.25785884236522, "kl": 0.455078125, "learning_rate": 9.003378378378378e-07, "loss": 0.0005, "reward": 3.482452630996704, "reward_std": 0.17858774214982986, "rewards/final_reward": 0.9796232169947886, "rewards/mask_iou_reward": 0.4898116084973943, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4824528098106384, "rewards/thk_ans_format_reward": 1.0, "step": 354, "think_completion_length": 67.0 }, { "clip_ratio": 0.0, "completion_length": 123.69792175292969, "epoch": 1.1989881956155144, "grad_norm": 6.518708947918132, "kl": 0.4267578125, "learning_rate": 9.000563063063062e-07, "loss": 0.0004, "reward": 3.297169804573059, "reward_std": 0.27533242851495743, "rewards/final_reward": 1.3079855408483727, "rewards/mask_iou_reward": 0.6539927704241864, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.297169804573059, "rewards/thk_ans_format_reward": 1.0, "step": 355, "think_completion_length": 55.58333333333333 }, { "clip_ratio": 0.0, "completion_length": 123.62500381469727, "epoch": 1.2023608768971332, "grad_norm": 6.487089048133617, "kl": 0.4150390625, "learning_rate": 8.997747747747747e-07, "loss": 0.0004, "reward": 3.531155824661255, "reward_std": 0.08609841391444206, "rewards/final_reward": 1.8262309782537098, "rewards/mask_iou_reward": 0.9131154891268549, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5311556458473206, "rewards/thk_ans_format_reward": 1.0, "step": 356, "think_completion_length": 57.666666666666664 }, { "clip_ratio": 0.0, "completion_length": 133.5208396911621, "epoch": 1.205733558178752, "grad_norm": 12.991734029654161, "kl": 0.388671875, "learning_rate": 8.994932432432432e-07, "loss": 0.0004, "reward": 3.5364201068878174, "reward_std": 0.10052265971899033, "rewards/final_reward": 0.9239991930084288, "rewards/mask_iou_reward": 0.4619995965042144, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5364200472831726, "rewards/thk_ans_format_reward": 1.0, "step": 357, "think_completion_length": 59.16666666666667 }, { "clip_ratio": 0.0, "completion_length": 127.96875381469727, "epoch": 1.2091062394603709, "grad_norm": 13.287657453364371, "kl": 0.3974609375, "learning_rate": 8.992117117117116e-07, "loss": 0.0004, "reward": 3.169158458709717, "reward_std": 0.18534202873706818, "rewards/final_reward": 0.9565178424519037, "rewards/mask_iou_reward": 0.47825892122595187, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.16915825009346, "rewards/thk_ans_format_reward": 1.0, "step": 358, "think_completion_length": 59.16666666666667 }, { "clip_ratio": 0.0, "completion_length": 124.81250762939453, "epoch": 1.21247892074199, "grad_norm": 7.958957052398867, "kl": 0.4306640625, "learning_rate": 8.989301801801802e-07, "loss": 0.0004, "reward": 2.9273035526275635, "reward_std": 0.22559326887130737, "rewards/final_reward": 1.1544018769118864, "rewards/mask_iou_reward": 0.5772009384559432, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9273033142089844, "rewards/thk_ans_format_reward": 1.0, "step": 359, "think_completion_length": 60.91666666666667 }, { "clip_ratio": 0.0, "completion_length": 148.06250381469727, "epoch": 1.2158516020236088, "grad_norm": 26.49649303527819, "kl": 0.431640625, "learning_rate": 8.986486486486487e-07, "loss": 0.0004, "reward": 3.3369808197021484, "reward_std": 0.21792490035295486, "rewards/final_reward": 0.8552900644315113, "rewards/mask_iou_reward": 0.42764503221575567, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3369808793067932, "rewards/thk_ans_format_reward": 1.0, "step": 360, "think_completion_length": 64.29166666666666 }, { "clip_ratio": 0.0, "completion_length": 127.25, "epoch": 1.2192242833052276, "grad_norm": 8.376718706661617, "kl": 0.40625, "learning_rate": 8.983671171171171e-07, "loss": 0.0004, "reward": 3.4866225719451904, "reward_std": 0.11170128360390663, "rewards/final_reward": 1.5778857509655904, "rewards/mask_iou_reward": 0.7889428754827952, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4866225719451904, "rewards/thk_ans_format_reward": 1.0, "step": 361, "think_completion_length": 61.833333333333336 }, { "clip_ratio": 0.0, "completion_length": 152.11458587646484, "epoch": 1.2225969645868466, "grad_norm": 9.68698005550589, "kl": 0.375, "learning_rate": 8.980855855855856e-07, "loss": 0.0004, "reward": 3.416308283805847, "reward_std": 0.19030694663524628, "rewards/final_reward": 0.9690831250824701, "rewards/mask_iou_reward": 0.48454156254123504, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4163082838058472, "rewards/thk_ans_format_reward": 1.0, "step": 362, "think_completion_length": 60.95833333333333 }, { "clip_ratio": 0.0, "completion_length": 124.96875381469727, "epoch": 1.2259696458684655, "grad_norm": 5.466876208748925, "kl": 0.3701171875, "learning_rate": 8.97804054054054e-07, "loss": 0.0004, "reward": 3.083742141723633, "reward_std": 0.2588346600532532, "rewards/final_reward": 1.0219780085731855, "rewards/mask_iou_reward": 0.5109890042865928, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0837420225143433, "rewards/thk_ans_format_reward": 1.0, "step": 363, "think_completion_length": 57.833333333333336 }, { "clip_ratio": 0.0, "completion_length": 125.04167175292969, "epoch": 1.2293423271500843, "grad_norm": 6.13962734112624, "kl": 0.4306640625, "learning_rate": 8.975225225225225e-07, "loss": 0.0004, "reward": 3.355513334274292, "reward_std": 0.11050765588879585, "rewards/final_reward": 1.684858807403022, "rewards/mask_iou_reward": 0.842429403701511, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.355513334274292, "rewards/thk_ans_format_reward": 1.0, "step": 364, "think_completion_length": 62.625 }, { "clip_ratio": 0.0, "completion_length": 119.09375, "epoch": 1.2327150084317031, "grad_norm": 7.982760324456483, "kl": 0.42578125, "learning_rate": 8.97240990990991e-07, "loss": 0.0004, "reward": 3.561036229133606, "reward_std": 0.1595241203904152, "rewards/final_reward": 1.6920156046347539, "rewards/mask_iou_reward": 0.8460078023173769, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5610363483428955, "rewards/thk_ans_format_reward": 1.0, "step": 365, "think_completion_length": 58.0 }, { "clip_ratio": 0.0, "completion_length": 120.54166793823242, "epoch": 1.2360876897133222, "grad_norm": 6.512397630777975, "kl": 0.4765625, "learning_rate": 8.969594594594594e-07, "loss": 0.0005, "reward": 3.0696985721588135, "reward_std": 0.1708909571170807, "rewards/final_reward": 1.20217084539104, "rewards/mask_iou_reward": 0.60108542269552, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0696982145309448, "rewards/thk_ans_format_reward": 1.0, "step": 366, "think_completion_length": 60.375 }, { "clip_ratio": 0.0, "completion_length": 125.29166793823242, "epoch": 1.239460370994941, "grad_norm": 6.0760802820191415, "kl": 0.419921875, "learning_rate": 8.966779279279279e-07, "loss": 0.0004, "reward": 3.317044496536255, "reward_std": 0.18018481880426407, "rewards/final_reward": 0.9774291667228298, "rewards/mask_iou_reward": 0.4887145833614149, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3170446157455444, "rewards/thk_ans_format_reward": 1.0, "step": 367, "think_completion_length": 73.375 }, { "clip_ratio": 0.0, "completion_length": 138.18750762939453, "epoch": 1.2428330522765598, "grad_norm": 19.438760787888953, "kl": 0.4072265625, "learning_rate": 8.963963963963963e-07, "loss": 0.0004, "reward": 2.8064730167388916, "reward_std": 0.15254508703947067, "rewards/final_reward": 1.2937061810125203, "rewards/mask_iou_reward": 0.6468530905062602, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.8064730167388916, "rewards/thk_ans_format_reward": 1.0, "step": 368, "think_completion_length": 58.5 }, { "clip_ratio": 0.0, "completion_length": 178.65625762939453, "epoch": 1.2462057335581789, "grad_norm": 5.73307642782275, "kl": 0.4267578125, "learning_rate": 8.961148648648649e-07, "loss": 0.0004, "reward": 3.4007943868637085, "reward_std": 0.13911010324954987, "rewards/final_reward": 1.4890716395653536, "rewards/mask_iou_reward": 0.7445358197826768, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4007944464683533, "rewards/thk_ans_format_reward": 1.0, "step": 369, "think_completion_length": 58.41666666666667 }, { "clip_ratio": 0.0, "completion_length": 132.6770896911621, "epoch": 1.2495784148397977, "grad_norm": 9.536909751789882, "kl": 0.408203125, "learning_rate": 8.958333333333334e-07, "loss": 0.0004, "reward": 2.7433794736862183, "reward_std": 0.3473154753446579, "rewards/final_reward": 0.42417780778787956, "rewards/mask_iou_reward": 0.21208890389393978, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.7433794736862183, "rewards/thk_ans_format_reward": 1.0, "step": 370, "think_completion_length": 70.79166666666666 }, { "clip_ratio": 0.0, "completion_length": 123.17708587646484, "epoch": 1.2529510961214165, "grad_norm": 7.0441708924637565, "kl": 0.380859375, "learning_rate": 8.955518018018018e-07, "loss": 0.0004, "reward": 3.249468207359314, "reward_std": 0.19624735042452812, "rewards/final_reward": 0.7434260286837571, "rewards/mask_iou_reward": 0.37171301434187853, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2494682669639587, "rewards/thk_ans_format_reward": 1.0, "step": 371, "think_completion_length": 62.375 }, { "clip_ratio": 0.0, "completion_length": 147.3645896911621, "epoch": 1.2563237774030354, "grad_norm": 9.238593494156854, "kl": 0.38671875, "learning_rate": 8.952702702702703e-07, "loss": 0.0003, "reward": 3.3437212705612183, "reward_std": 0.3516358807682991, "rewards/final_reward": 1.4175673642321096, "rewards/mask_iou_reward": 0.7087836821160548, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3437212705612183, "rewards/thk_ans_format_reward": 1.0, "step": 372, "think_completion_length": 62.291666666666664 }, { "clip_ratio": 0.0, "completion_length": 149.90625, "epoch": 1.2596964586846542, "grad_norm": 11.358320745996085, "kl": 0.48828125, "learning_rate": 8.949887387387387e-07, "loss": 0.0005, "reward": 3.284479856491089, "reward_std": 0.18969615548849106, "rewards/final_reward": 1.4953732471384509, "rewards/mask_iou_reward": 0.7476866235692254, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2844797372817993, "rewards/thk_ans_format_reward": 1.0, "step": 373, "think_completion_length": 59.25 }, { "clip_ratio": 0.0, "completion_length": 122.84375, "epoch": 1.2630691399662732, "grad_norm": 7.151983483918055, "kl": 0.3818359375, "learning_rate": 8.947072072072072e-07, "loss": 0.0004, "reward": 3.1946144104003906, "reward_std": 0.11458705738186836, "rewards/final_reward": 1.103767047410554, "rewards/mask_iou_reward": 0.551883523705277, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1946144104003906, "rewards/thk_ans_format_reward": 1.0, "step": 374, "think_completion_length": 56.125 }, { "clip_ratio": 0.0, "completion_length": 127.03125, "epoch": 1.266441821247892, "grad_norm": 9.071646300744112, "kl": 0.4267578125, "learning_rate": 8.944256756756756e-07, "loss": 0.0004, "reward": 2.8307807445526123, "reward_std": 0.15931977331638336, "rewards/final_reward": 0.8759926112551624, "rewards/mask_iou_reward": 0.4379963056275812, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.8307807892560959, "rewards/thk_ans_format_reward": 1.0, "step": 375, "think_completion_length": 55.125 }, { "clip_ratio": 0.0, "completion_length": 124.02083587646484, "epoch": 1.269814502529511, "grad_norm": 16.160925277891703, "kl": 0.4169921875, "learning_rate": 8.94144144144144e-07, "loss": 0.0005, "reward": 3.142805814743042, "reward_std": 0.11035696789622307, "rewards/final_reward": 1.6712795075685372, "rewards/mask_iou_reward": 0.8356397537842686, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1428057253360748, "rewards/thk_ans_format_reward": 1.0, "step": 376, "think_completion_length": 53.33333333333333 }, { "clip_ratio": 0.0, "completion_length": 121.34375381469727, "epoch": 1.27318718381113, "grad_norm": 5.971996512155509, "kl": 0.427734375, "learning_rate": 8.938626126126125e-07, "loss": 0.0004, "reward": 2.9981162548065186, "reward_std": 0.093520887196064, "rewards/final_reward": 0.7963105389822663, "rewards/mask_iou_reward": 0.39815526949113317, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.998116135597229, "rewards/thk_ans_format_reward": 1.0, "step": 377, "think_completion_length": 55.54166666666667 }, { "clip_ratio": 0.0, "completion_length": 117.36458587646484, "epoch": 1.2765598650927488, "grad_norm": 10.43167191450742, "kl": 0.3994140625, "learning_rate": 8.93581081081081e-07, "loss": 0.0004, "reward": 3.275425672531128, "reward_std": 0.11471785977482796, "rewards/final_reward": 1.7041599415896092, "rewards/mask_iou_reward": 0.8520799707948046, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2754257917404175, "rewards/thk_ans_format_reward": 1.0, "step": 378, "think_completion_length": 58.208333333333336 }, { "clip_ratio": 0.0, "completion_length": 139.95833587646484, "epoch": 1.2799325463743676, "grad_norm": 11.154206440284236, "kl": 0.369140625, "learning_rate": 8.932995495495495e-07, "loss": 0.0004, "reward": 3.4361212253570557, "reward_std": 0.11556711047887802, "rewards/final_reward": 1.3823572230237666, "rewards/mask_iou_reward": 0.6911786115118833, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4361212849617004, "rewards/thk_ans_format_reward": 1.0, "step": 379, "think_completion_length": 65.16666666666666 }, { "clip_ratio": 0.0, "completion_length": 119.03125381469727, "epoch": 1.2833052276559864, "grad_norm": 5.38958933410073, "kl": 0.3984375, "learning_rate": 8.93018018018018e-07, "loss": 0.0004, "reward": 3.418978691101074, "reward_std": 0.22982808575034142, "rewards/final_reward": 1.6648917460641777, "rewards/mask_iou_reward": 0.8324458730320888, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.41897851228714, "rewards/thk_ans_format_reward": 1.0, "step": 380, "think_completion_length": 55.333333333333336 }, { "clip_ratio": 0.0, "completion_length": 144.0625, "epoch": 1.2866779089376055, "grad_norm": 14.204627852713159, "kl": 0.466796875, "learning_rate": 8.927364864864864e-07, "loss": 0.0005, "reward": 3.205819845199585, "reward_std": 0.17886455357074738, "rewards/final_reward": 1.0723750964434002, "rewards/mask_iou_reward": 0.5361875482217001, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2058197855949402, "rewards/thk_ans_format_reward": 1.0, "step": 381, "think_completion_length": 56.91666666666667 }, { "clip_ratio": 0.0, "completion_length": 123.11458969116211, "epoch": 1.2900505902192243, "grad_norm": 7.3675402122446085, "kl": 0.37890625, "learning_rate": 8.924549549549549e-07, "loss": 0.0004, "reward": 3.1716625690460205, "reward_std": 0.17073575779795647, "rewards/final_reward": 1.919256260742495, "rewards/mask_iou_reward": 0.9596281303712475, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1716625690460205, "rewards/thk_ans_format_reward": 1.0, "step": 382, "think_completion_length": 62.20833333333333 }, { "clip_ratio": 0.0, "completion_length": 140.0208396911621, "epoch": 1.2934232715008431, "grad_norm": 15.221231003902952, "kl": 0.341796875, "learning_rate": 8.921734234234234e-07, "loss": 0.0003, "reward": 3.087328314781189, "reward_std": 0.1399368941783905, "rewards/final_reward": 0.9321140483293636, "rewards/mask_iou_reward": 0.4660570241646818, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0873282551765442, "rewards/thk_ans_format_reward": 1.0, "step": 383, "think_completion_length": 59.625 }, { "clip_ratio": 0.0, "completion_length": 128.37500381469727, "epoch": 1.2967959527824622, "grad_norm": 5.2845428955702785, "kl": 0.3779296875, "learning_rate": 8.918918918918918e-07, "loss": 0.0004, "reward": 2.6019227504730225, "reward_std": 0.1827988475561142, "rewards/final_reward": 0.28128574034548853, "rewards/mask_iou_reward": 0.14064287017274427, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.6019226759672165, "rewards/thk_ans_format_reward": 1.0, "step": 384, "think_completion_length": 59.79166666666667 }, { "clip_ratio": 0.0, "completion_length": 116.87500381469727, "epoch": 1.300168634064081, "grad_norm": 8.713017994196841, "kl": 0.6875, "learning_rate": 8.916103603603603e-07, "loss": 0.0007, "reward": 2.8522579669952393, "reward_std": 0.2745845168828964, "rewards/final_reward": 0.7205070515589428, "rewards/mask_iou_reward": 0.3602535257794714, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.8522579073905945, "rewards/thk_ans_format_reward": 1.0, "step": 385, "think_completion_length": 58.79166666666667 }, { "clip_ratio": 0.0, "completion_length": 126.27083587646484, "epoch": 1.3035413153456998, "grad_norm": 4.3399686330782785, "kl": 0.416015625, "learning_rate": 8.913288288288287e-07, "loss": 0.0004, "reward": 2.8038978576660156, "reward_std": 0.20723021775484085, "rewards/final_reward": 1.0406138061752164, "rewards/mask_iou_reward": 0.5203069030876082, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.8038977980613708, "rewards/thk_ans_format_reward": 1.0, "step": 386, "think_completion_length": 56.70833333333333 }, { "clip_ratio": 0.0, "completion_length": 117.17708587646484, "epoch": 1.3069139966273187, "grad_norm": 15.658706712906872, "kl": 0.3671875, "learning_rate": 8.910472972972972e-07, "loss": 0.0004, "reward": 3.316901206970215, "reward_std": 0.1013756264001131, "rewards/final_reward": 1.9059538432217367, "rewards/mask_iou_reward": 0.9529769216108683, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3169008493423462, "rewards/thk_ans_format_reward": 1.0, "step": 387, "think_completion_length": 52.70833333333333 }, { "clip_ratio": 0.0, "completion_length": 114.92708587646484, "epoch": 1.3102866779089375, "grad_norm": 9.430723989126525, "kl": 0.4130859375, "learning_rate": 8.907657657657657e-07, "loss": 0.0004, "reward": 3.4749109745025635, "reward_std": 0.08457119390368462, "rewards/final_reward": 1.693266615218402, "rewards/mask_iou_reward": 0.846633307609201, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4749109148979187, "rewards/thk_ans_format_reward": 1.0, "step": 388, "think_completion_length": 52.916666666666664 }, { "clip_ratio": 0.0, "completion_length": 122.30208587646484, "epoch": 1.3136593591905565, "grad_norm": 39.06175249555972, "kl": 0.349609375, "learning_rate": 8.904842342342342e-07, "loss": 0.0004, "reward": 3.163516402244568, "reward_std": 0.09743357449769974, "rewards/final_reward": 1.123364980192178, "rewards/mask_iou_reward": 0.561682490096089, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1635163724422455, "rewards/thk_ans_format_reward": 1.0, "step": 389, "think_completion_length": 56.5 }, { "clip_ratio": 0.0, "completion_length": 141.1354217529297, "epoch": 1.3170320404721754, "grad_norm": 6.782655594229362, "kl": 0.400390625, "learning_rate": 8.902027027027027e-07, "loss": 0.0004, "reward": 3.012680411338806, "reward_std": 0.22783783078193665, "rewards/final_reward": 0.8515693866925474, "rewards/mask_iou_reward": 0.4257846933462737, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0126804113388062, "rewards/thk_ans_format_reward": 1.0, "step": 390, "think_completion_length": 56.75 }, { "clip_ratio": 0.0, "completion_length": 122.89583587646484, "epoch": 1.3204047217537942, "grad_norm": 14.255778869795074, "kl": 0.470703125, "learning_rate": 8.899211711711711e-07, "loss": 0.0005, "reward": 3.1088274717330933, "reward_std": 0.25957299768924713, "rewards/final_reward": 1.1813193456230515, "rewards/mask_iou_reward": 0.5906596728115258, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1088275015354156, "rewards/thk_ans_format_reward": 1.0, "step": 391, "think_completion_length": 50.875 }, { "clip_ratio": 0.0, "completion_length": 143.12500381469727, "epoch": 1.3237774030354132, "grad_norm": 21.896006193903432, "kl": 0.39453125, "learning_rate": 8.896396396396396e-07, "loss": 0.0004, "reward": 2.811727285385132, "reward_std": 0.24812977015972137, "rewards/final_reward": 0.6650224283784837, "rewards/mask_iou_reward": 0.33251121418924184, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.8117272257804871, "rewards/thk_ans_format_reward": 1.0, "step": 392, "think_completion_length": 45.833333333333336 }, { "clip_ratio": 0.0, "completion_length": 117.71875381469727, "epoch": 1.327150084317032, "grad_norm": 12.972249422550195, "kl": 0.4287109375, "learning_rate": 8.893581081081081e-07, "loss": 0.0004, "reward": 3.5639950037002563, "reward_std": 0.1949900984764099, "rewards/final_reward": 1.6489261938244277, "rewards/mask_iou_reward": 0.8244630969122139, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5639949440956116, "rewards/thk_ans_format_reward": 1.0, "step": 393, "think_completion_length": 54.29166666666667 }, { "clip_ratio": 0.0, "completion_length": 115.04166793823242, "epoch": 1.330522765598651, "grad_norm": 5.648364619261588, "kl": 0.390625, "learning_rate": 8.890765765765765e-07, "loss": 0.0004, "reward": 3.349947929382324, "reward_std": 0.17918875813484192, "rewards/final_reward": 0.6127089105917438, "rewards/mask_iou_reward": 0.3063544552958719, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.34994775056839, "rewards/thk_ans_format_reward": 1.0, "step": 394, "think_completion_length": 59.04166666666667 }, { "clip_ratio": 0.0, "completion_length": 125.28125, "epoch": 1.3338954468802697, "grad_norm": 6.458264633868568, "kl": 0.4365234375, "learning_rate": 8.88795045045045e-07, "loss": 0.0004, "reward": 3.2618488073349, "reward_std": 0.09879514575004578, "rewards/final_reward": 1.8915855231564893, "rewards/mask_iou_reward": 0.9457927615782447, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2618490755558014, "rewards/thk_ans_format_reward": 1.0, "step": 395, "think_completion_length": 51.5 }, { "clip_ratio": 0.0, "completion_length": 117.5625, "epoch": 1.3372681281618888, "grad_norm": 21.847456737133793, "kl": 0.388671875, "learning_rate": 8.885135135135135e-07, "loss": 0.0004, "reward": 3.172235608100891, "reward_std": 0.17507488653063774, "rewards/final_reward": 1.7546129488764217, "rewards/mask_iou_reward": 0.8773064744382109, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1722354888916016, "rewards/thk_ans_format_reward": 1.0, "step": 396, "think_completion_length": 61.16666666666667 }, { "clip_ratio": 0.0, "completion_length": 117.83333587646484, "epoch": 1.3406408094435076, "grad_norm": 10.361193095562697, "kl": 0.3955078125, "learning_rate": 8.882319819819819e-07, "loss": 0.0004, "reward": 3.3426592350006104, "reward_std": 0.05589485540986061, "rewards/final_reward": 1.4442277286890854, "rewards/mask_iou_reward": 0.7221138643445427, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3426590859889984, "rewards/thk_ans_format_reward": 1.0, "step": 397, "think_completion_length": 53.79166666666667 }, { "clip_ratio": 0.0, "completion_length": 130.94791793823242, "epoch": 1.3440134907251264, "grad_norm": 6.465458092344196, "kl": 0.435546875, "learning_rate": 8.879504504504504e-07, "loss": 0.0004, "reward": 3.2610703706741333, "reward_std": 0.18213983997702599, "rewards/final_reward": 1.2578534767851406, "rewards/mask_iou_reward": 0.6289267383925703, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2610703706741333, "rewards/thk_ans_format_reward": 1.0, "step": 398, "think_completion_length": 61.041666666666664 }, { "clip_ratio": 0.0, "completion_length": 123.26042175292969, "epoch": 1.3473861720067455, "grad_norm": 4.881478330864418, "kl": 0.365234375, "learning_rate": 8.876689189189189e-07, "loss": 0.0004, "reward": 2.9544575214385986, "reward_std": 0.2750149741768837, "rewards/final_reward": 1.321201826779002, "rewards/mask_iou_reward": 0.660600913389501, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9544573724269867, "rewards/thk_ans_format_reward": 1.0, "step": 399, "think_completion_length": 49.0 }, { "clip_ratio": 0.0, "completion_length": 134.36458587646484, "epoch": 1.3507588532883643, "grad_norm": 9.08727978460178, "kl": 0.3720703125, "learning_rate": 8.873873873873874e-07, "loss": 0.0004, "reward": 3.395462989807129, "reward_std": 0.1573108658194542, "rewards/final_reward": 1.591655385445944, "rewards/mask_iou_reward": 0.795827692722972, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3954631090164185, "rewards/thk_ans_format_reward": 1.0, "step": 400, "think_completion_length": 56.0 }, { "clip_ratio": 0.0, "completion_length": 127.89583587646484, "epoch": 1.3541315345699831, "grad_norm": 10.98203028725922, "kl": 0.490234375, "learning_rate": 8.871058558558559e-07, "loss": 0.0005, "reward": 3.2958803176879883, "reward_std": 0.29315295070409775, "rewards/final_reward": 1.8774677527947858, "rewards/mask_iou_reward": 0.9387338763973929, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.295880138874054, "rewards/thk_ans_format_reward": 1.0, "step": 401, "think_completion_length": 53.625 }, { "clip_ratio": 0.0, "completion_length": 129.36458587646484, "epoch": 1.357504215851602, "grad_norm": 10.763475285844352, "kl": 0.46875, "learning_rate": 8.868243243243243e-07, "loss": 0.0005, "reward": 3.1025447845458984, "reward_std": 0.1851225420832634, "rewards/final_reward": 1.1530047529999958, "rewards/mask_iou_reward": 0.5765023764999979, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1025445461273193, "rewards/thk_ans_format_reward": 1.0, "step": 402, "think_completion_length": 59.0 }, { "clip_ratio": 0.0, "completion_length": 119.30208587646484, "epoch": 1.3608768971332208, "grad_norm": 13.295790055078449, "kl": 0.4345703125, "learning_rate": 8.865427927927928e-07, "loss": 0.0004, "reward": 3.2150758504867554, "reward_std": 0.2421677317470312, "rewards/final_reward": 0.4944866514919177, "rewards/mask_iou_reward": 0.24724332574595884, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2150757312774658, "rewards/thk_ans_format_reward": 1.0, "step": 403, "think_completion_length": 54.875 }, { "clip_ratio": 0.0, "completion_length": 121.03125381469727, "epoch": 1.3642495784148398, "grad_norm": 15.86535809741585, "kl": 0.4970703125, "learning_rate": 8.862612612612612e-07, "loss": 0.0005, "reward": 2.974599838256836, "reward_std": 0.1448173001408577, "rewards/final_reward": 0.9453635419360841, "rewards/mask_iou_reward": 0.47268177096804204, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9745997190475464, "rewards/thk_ans_format_reward": 1.0, "step": 404, "think_completion_length": 55.33333333333333 }, { "clip_ratio": 0.0, "completion_length": 117.80208587646484, "epoch": 1.3676222596964587, "grad_norm": 23.773005695655176, "kl": 0.4541015625, "learning_rate": 8.859797297297297e-07, "loss": 0.0005, "reward": 3.1880345344543457, "reward_std": 0.18816696107387543, "rewards/final_reward": 1.3801016130752197, "rewards/mask_iou_reward": 0.6900508065376099, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.18803471326828, "rewards/thk_ans_format_reward": 1.0, "step": 405, "think_completion_length": 50.75 }, { "clip_ratio": 0.0, "completion_length": 120.12500381469727, "epoch": 1.3709949409780775, "grad_norm": 5.653160672220901, "kl": 0.4296875, "learning_rate": 8.856981981981982e-07, "loss": 0.0005, "reward": 3.32892107963562, "reward_std": 0.15366527438163757, "rewards/final_reward": 1.3260781944652618, "rewards/mask_iou_reward": 0.6630390972326309, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.328921139240265, "rewards/thk_ans_format_reward": 1.0, "step": 406, "think_completion_length": 51.125 }, { "clip_ratio": 0.0, "completion_length": 111.35416793823242, "epoch": 1.3743676222596966, "grad_norm": 29.17112442936424, "kl": 0.4013671875, "learning_rate": 8.854166666666666e-07, "loss": 0.0004, "reward": 3.4100879430770874, "reward_std": 0.23499078676104546, "rewards/final_reward": 0.8420019533822547, "rewards/mask_iou_reward": 0.42100097669112735, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4100881218910217, "rewards/thk_ans_format_reward": 1.0, "step": 407, "think_completion_length": 53.958333333333336 }, { "clip_ratio": 0.0, "completion_length": 125.83333969116211, "epoch": 1.3777403035413154, "grad_norm": 27.863976318871792, "kl": 0.427734375, "learning_rate": 8.851351351351351e-07, "loss": 0.0005, "reward": 3.4160239696502686, "reward_std": 0.13645297288894653, "rewards/final_reward": 0.8744746874887974, "rewards/mask_iou_reward": 0.4372373437443987, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.416023850440979, "rewards/thk_ans_format_reward": 1.0, "step": 408, "think_completion_length": 60.125 }, { "clip_ratio": 0.0, "completion_length": 112.44791793823242, "epoch": 1.3811129848229342, "grad_norm": 9.56224631051352, "kl": 0.400390625, "learning_rate": 8.848536036036037e-07, "loss": 0.0004, "reward": 2.9102158546447754, "reward_std": 0.17862200736999512, "rewards/final_reward": 0.8199508471048658, "rewards/mask_iou_reward": 0.4099754235524329, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9102159142494202, "rewards/thk_ans_format_reward": 1.0, "step": 409, "think_completion_length": 45.0 }, { "clip_ratio": 0.0, "completion_length": 115.06250381469727, "epoch": 1.384485666104553, "grad_norm": 6.5133688824628555, "kl": 0.580078125, "learning_rate": 8.845720720720721e-07, "loss": 0.0006, "reward": 3.3277347087860107, "reward_std": 0.1092943362891674, "rewards/final_reward": 1.4457946634446563, "rewards/mask_iou_reward": 0.7228973317223282, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.327734887599945, "rewards/thk_ans_format_reward": 1.0, "step": 410, "think_completion_length": 51.5 }, { "clip_ratio": 0.0, "completion_length": 112.61458587646484, "epoch": 1.387858347386172, "grad_norm": 8.819736571295985, "kl": 0.4853515625, "learning_rate": 8.842905405405406e-07, "loss": 0.0005, "reward": 3.330057144165039, "reward_std": 0.1943942978978157, "rewards/final_reward": 1.2174243849477313, "rewards/mask_iou_reward": 0.6087121924738657, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3300570249557495, "rewards/thk_ans_format_reward": 1.0, "step": 411, "think_completion_length": 49.58333333333333 }, { "clip_ratio": 0.0, "completion_length": 112.32291793823242, "epoch": 1.391231028667791, "grad_norm": 5.187668148870701, "kl": 0.466796875, "learning_rate": 8.84009009009009e-07, "loss": 0.0005, "reward": 3.494489073753357, "reward_std": 0.057365935295820236, "rewards/final_reward": 1.885988405475033, "rewards/mask_iou_reward": 0.9429942027375166, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.494489073753357, "rewards/thk_ans_format_reward": 1.0, "step": 412, "think_completion_length": 47.125 }, { "clip_ratio": 0.0, "completion_length": 109.41667175292969, "epoch": 1.3946037099494097, "grad_norm": 7.520342801689036, "kl": 0.5849609375, "learning_rate": 8.837274774774775e-07, "loss": 0.0006, "reward": 3.332287549972534, "reward_std": 0.2603805884718895, "rewards/final_reward": 1.2709260084128842, "rewards/mask_iou_reward": 0.6354630042064421, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3322875499725342, "rewards/thk_ans_format_reward": 1.0, "step": 413, "think_completion_length": 50.95833333333333 }, { "clip_ratio": 0.0, "completion_length": 112.57292175292969, "epoch": 1.3979763912310288, "grad_norm": 33.403308964345825, "kl": 0.4951171875, "learning_rate": 8.83445945945946e-07, "loss": 0.0005, "reward": 3.364820122718811, "reward_std": 0.1603723168373108, "rewards/final_reward": 1.4100766007692036, "rewards/mask_iou_reward": 0.7050383003846018, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.364820122718811, "rewards/thk_ans_format_reward": 1.0, "step": 414, "think_completion_length": 38.0 }, { "clip_ratio": 0.0, "completion_length": 112.10416793823242, "epoch": 1.4013490725126476, "grad_norm": 5.862994402404749, "kl": 0.47265625, "learning_rate": 8.831644144144143e-07, "loss": 0.0005, "reward": 3.317633271217346, "reward_std": 0.08006502967327833, "rewards/final_reward": 1.4176781291376375, "rewards/mask_iou_reward": 0.7088390645688187, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.317633032798767, "rewards/thk_ans_format_reward": 1.0, "step": 415, "think_completion_length": 47.08333333333333 }, { "clip_ratio": 0.0, "completion_length": 115.125, "epoch": 1.4047217537942664, "grad_norm": 18.78986909256566, "kl": 0.494140625, "learning_rate": 8.828828828828828e-07, "loss": 0.0005, "reward": 3.111713409423828, "reward_std": 0.16862037405371666, "rewards/final_reward": 1.5124446319151947, "rewards/mask_iou_reward": 0.7562223159575974, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1117135286331177, "rewards/thk_ans_format_reward": 1.0, "step": 416, "think_completion_length": 45.45833333333333 }, { "clip_ratio": 0.0, "completion_length": 103.50000381469727, "epoch": 1.4080944350758853, "grad_norm": 6.739482714526197, "kl": 0.482421875, "learning_rate": 8.826013513513512e-07, "loss": 0.0006, "reward": 3.124404549598694, "reward_std": 0.19819872826337814, "rewards/final_reward": 0.4775836903808357, "rewards/mask_iou_reward": 0.23879184519041785, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1244046092033386, "rewards/thk_ans_format_reward": 1.0, "step": 417, "think_completion_length": 42.20833333333333 }, { "clip_ratio": 0.0, "completion_length": 111.61458587646484, "epoch": 1.411467116357504, "grad_norm": 5.99580145734765, "kl": 0.4814453125, "learning_rate": 8.823198198198197e-07, "loss": 0.0005, "reward": 3.605440378189087, "reward_std": 0.1609882414340973, "rewards/final_reward": 1.6322748536347056, "rewards/mask_iou_reward": 0.8161374268173528, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.605440378189087, "rewards/thk_ans_format_reward": 1.0, "step": 418, "think_completion_length": 46.583333333333336 }, { "clip_ratio": 0.0, "completion_length": 119.47917175292969, "epoch": 1.4148397976391232, "grad_norm": 11.118318311069785, "kl": 0.498046875, "learning_rate": 8.820382882882883e-07, "loss": 0.0005, "reward": 3.18659508228302, "reward_std": 0.32843393087387085, "rewards/final_reward": 0.8392202733213036, "rewards/mask_iou_reward": 0.4196101366606518, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 1.2074283957481384, "rewards/thk_ans_format_reward": 0.9895833432674408, "step": 419, "think_completion_length": 52.291666666666664 }, { "clip_ratio": 0.0, "completion_length": 125.58333587646484, "epoch": 1.418212478920742, "grad_norm": 77.20231858601272, "kl": 0.42578125, "learning_rate": 8.817567567567567e-07, "loss": 0.0005, "reward": 3.1859129667282104, "reward_std": 0.13804687187075615, "rewards/final_reward": 1.168682475575744, "rewards/mask_iou_reward": 0.584341237787872, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1859130263328552, "rewards/thk_ans_format_reward": 1.0, "step": 420, "think_completion_length": 44.79166666666667 }, { "clip_ratio": 0.0, "completion_length": 133.3125, "epoch": 1.4215851602023608, "grad_norm": 16.91364728527806, "kl": 0.4970703125, "learning_rate": 8.814752252252252e-07, "loss": 0.0005, "reward": 3.5506646633148193, "reward_std": 0.0705304704606533, "rewards/final_reward": 1.1024359001502577, "rewards/mask_iou_reward": 0.5512179500751289, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5506644248962402, "rewards/thk_ans_format_reward": 1.0, "step": 421, "think_completion_length": 40.625 }, { "clip_ratio": 0.0, "completion_length": 102.02083587646484, "epoch": 1.4249578414839799, "grad_norm": 9.552478339006777, "kl": 0.498046875, "learning_rate": 8.811936936936936e-07, "loss": 0.0005, "reward": 3.524839758872986, "reward_std": 0.16047295182943344, "rewards/final_reward": 1.4494069846057993, "rewards/mask_iou_reward": 0.7247034923028997, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5248395204544067, "rewards/thk_ans_format_reward": 1.0, "step": 422, "think_completion_length": 41.20833333333333 }, { "clip_ratio": 0.0, "completion_length": 100.58333587646484, "epoch": 1.4283305227655987, "grad_norm": 6.705347286067029, "kl": 0.474609375, "learning_rate": 8.809121621621621e-07, "loss": 0.0005, "reward": 3.1085526943206787, "reward_std": 0.13762886077165604, "rewards/final_reward": 1.4445938675272614, "rewards/mask_iou_reward": 0.7222969337636307, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1085528135299683, "rewards/thk_ans_format_reward": 1.0, "step": 423, "think_completion_length": 34.5 }, { "clip_ratio": 0.0, "completion_length": 104.75000381469727, "epoch": 1.4317032040472175, "grad_norm": 5.960587019979815, "kl": 0.44921875, "learning_rate": 8.806306306306306e-07, "loss": 0.0004, "reward": 2.8898168802261353, "reward_std": 0.2549128457903862, "rewards/final_reward": 0.6743037408393452, "rewards/mask_iou_reward": 0.3371518704196726, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.8898166865110397, "rewards/thk_ans_format_reward": 1.0, "step": 424, "think_completion_length": 39.04166666666667 }, { "clip_ratio": 0.0, "completion_length": 113.00000381469727, "epoch": 1.4350758853288363, "grad_norm": 4.748435112911275, "kl": 0.54296875, "learning_rate": 8.80349099099099e-07, "loss": 0.0005, "reward": 3.3094550371170044, "reward_std": 0.11676504462957382, "rewards/final_reward": 1.1488917938058036, "rewards/mask_iou_reward": 0.5744458969029018, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3094549775123596, "rewards/thk_ans_format_reward": 1.0, "step": 425, "think_completion_length": 37.166666666666664 }, { "clip_ratio": 0.0, "completion_length": 125.91666793823242, "epoch": 1.4384485666104554, "grad_norm": 8.627079159689744, "kl": 0.4765625, "learning_rate": 8.800675675675675e-07, "loss": 0.0005, "reward": 2.928268551826477, "reward_std": 0.10477589443325996, "rewards/final_reward": 1.1542270032630197, "rewards/mask_iou_reward": 0.5771135016315099, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9282682836055756, "rewards/thk_ans_format_reward": 1.0, "step": 426, "think_completion_length": 37.70833333333333 }, { "clip_ratio": 0.0, "completion_length": 108.70833587646484, "epoch": 1.4418212478920742, "grad_norm": 10.280001178760335, "kl": 0.458984375, "learning_rate": 8.797860360360359e-07, "loss": 0.0005, "reward": 3.1266590356826782, "reward_std": 0.28933235257864, "rewards/final_reward": 0.5333257628063448, "rewards/mask_iou_reward": 0.2666628814031724, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1370754837989807, "rewards/thk_ans_format_reward": 0.9895833432674408, "step": 427, "think_completion_length": 41.58333333333333 }, { "clip_ratio": 0.0, "completion_length": 122.42708587646484, "epoch": 1.445193929173693, "grad_norm": 9.663883538288415, "kl": 0.4208984375, "learning_rate": 8.795045045045044e-07, "loss": 0.0004, "reward": 3.270835518836975, "reward_std": 0.3859306201338768, "rewards/final_reward": 1.4194974541074559, "rewards/mask_iou_reward": 0.7097487270537279, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2708353996276855, "rewards/thk_ans_format_reward": 1.0, "step": 428, "think_completion_length": 42.41666666666667 }, { "clip_ratio": 0.0, "completion_length": 104.62500381469727, "epoch": 1.448566610455312, "grad_norm": 19.624851392813568, "kl": 0.416015625, "learning_rate": 8.79222972972973e-07, "loss": 0.0004, "reward": 3.212664246559143, "reward_std": 0.1854577735066414, "rewards/final_reward": 0.692815486793328, "rewards/mask_iou_reward": 0.346407743396664, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2126640677452087, "rewards/thk_ans_format_reward": 1.0, "step": 429, "think_completion_length": 45.91666666666667 }, { "clip_ratio": 0.0, "completion_length": 113.18750762939453, "epoch": 1.451939291736931, "grad_norm": 8.15185088347606, "kl": 0.404296875, "learning_rate": 8.789414414414414e-07, "loss": 0.0004, "reward": 3.1903375387191772, "reward_std": 0.18493592739105225, "rewards/final_reward": 1.2455512865472378, "rewards/mask_iou_reward": 0.6227756432736189, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1903374195098877, "rewards/thk_ans_format_reward": 1.0, "step": 430, "think_completion_length": 49.66666666666667 }, { "clip_ratio": 0.0, "completion_length": 130.93750381469727, "epoch": 1.4553119730185498, "grad_norm": 9.638413199088216, "kl": 0.572265625, "learning_rate": 8.786599099099099e-07, "loss": 0.0006, "reward": 3.6001389026641846, "reward_std": 0.06927749514579773, "rewards/final_reward": 1.8230214998088752, "rewards/mask_iou_reward": 0.9115107499044376, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6001389026641846, "rewards/thk_ans_format_reward": 1.0, "step": 431, "think_completion_length": 44.5 }, { "clip_ratio": 0.0, "completion_length": 113.9375, "epoch": 1.4586846543001686, "grad_norm": 8.658861372580853, "kl": 0.4814453125, "learning_rate": 8.783783783783784e-07, "loss": 0.0005, "reward": 3.0466322898864746, "reward_std": 0.21577009186148643, "rewards/final_reward": 0.2688910628278502, "rewards/mask_iou_reward": 0.1344455314139251, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0466321110725403, "rewards/thk_ans_format_reward": 1.0, "step": 432, "think_completion_length": 44.291666666666664 }, { "clip_ratio": 0.0, "completion_length": 113.21875381469727, "epoch": 1.4620573355817874, "grad_norm": 8.80212834256664, "kl": 0.4404296875, "learning_rate": 8.780968468468468e-07, "loss": 0.0004, "reward": 3.499486207962036, "reward_std": 0.18082892894744873, "rewards/final_reward": 1.3780213683306257, "rewards/mask_iou_reward": 0.6890106841653129, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4994860887527466, "rewards/thk_ans_format_reward": 1.0, "step": 433, "think_completion_length": 36.5 }, { "clip_ratio": 0.0, "completion_length": 104.58333587646484, "epoch": 1.4654300168634065, "grad_norm": 22.96490471862446, "kl": 0.4736328125, "learning_rate": 8.778153153153153e-07, "loss": 0.0005, "reward": 3.292214035987854, "reward_std": 0.24014803767204285, "rewards/final_reward": 1.4014288031328084, "rewards/mask_iou_reward": 0.7007144015664042, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2922139763832092, "rewards/thk_ans_format_reward": 1.0, "step": 434, "think_completion_length": 38.29166666666667 }, { "clip_ratio": 0.0, "completion_length": 105.82291793823242, "epoch": 1.4688026981450253, "grad_norm": 14.29935664393217, "kl": 0.4375, "learning_rate": 8.775337837837837e-07, "loss": 0.0004, "reward": 2.967806816101074, "reward_std": 0.28818748891353607, "rewards/final_reward": 0.2677004428336839, "rewards/mask_iou_reward": 0.13385022141684194, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9678069055080414, "rewards/thk_ans_format_reward": 1.0, "step": 435, "think_completion_length": 41.25 }, { "clip_ratio": 0.0, "completion_length": 119.98958587646484, "epoch": 1.4721753794266441, "grad_norm": 6.3730077599899015, "kl": 0.390625, "learning_rate": 8.772522522522522e-07, "loss": 0.0004, "reward": 3.3646087646484375, "reward_std": 0.08628809824585915, "rewards/final_reward": 0.9938647796127891, "rewards/mask_iou_reward": 0.49693238980639454, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.364608883857727, "rewards/thk_ans_format_reward": 1.0, "step": 436, "think_completion_length": 38.041666666666664 }, { "clip_ratio": 0.0, "completion_length": 135.6354217529297, "epoch": 1.4755480607082632, "grad_norm": 10.925111726883417, "kl": 0.4765625, "learning_rate": 8.769707207207207e-07, "loss": 0.0005, "reward": 2.952384114265442, "reward_std": 0.16299670934677124, "rewards/final_reward": 1.2880874610009931, "rewards/mask_iou_reward": 0.6440437305004966, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9523837566375732, "rewards/thk_ans_format_reward": 1.0, "step": 437, "think_completion_length": 35.25 }, { "clip_ratio": 0.0, "completion_length": 113.42708587646484, "epoch": 1.478920741989882, "grad_norm": 10.015341910465036, "kl": 0.3837890625, "learning_rate": 8.766891891891891e-07, "loss": 0.0004, "reward": 3.039707899093628, "reward_std": 0.13153230771422386, "rewards/final_reward": 0.9393191034972563, "rewards/mask_iou_reward": 0.46965955174862817, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0397077798843384, "rewards/thk_ans_format_reward": 1.0, "step": 438, "think_completion_length": 39.16666666666667 }, { "clip_ratio": 0.0, "completion_length": 101.88541793823242, "epoch": 1.4822934232715008, "grad_norm": 39.92832341017036, "kl": 0.4462890625, "learning_rate": 8.764076576576577e-07, "loss": 0.0004, "reward": 3.5502430200576782, "reward_std": 0.1625949591398239, "rewards/final_reward": 1.6293686105111496, "rewards/mask_iou_reward": 0.8146843052555748, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.550242841243744, "rewards/thk_ans_format_reward": 1.0, "step": 439, "think_completion_length": 39.125 }, { "clip_ratio": 0.0, "completion_length": 112.8125, "epoch": 1.4856661045531196, "grad_norm": 11.879109435228093, "kl": 0.3935546875, "learning_rate": 8.761261261261261e-07, "loss": 0.0004, "reward": 3.2247962951660156, "reward_std": 0.27370116859674454, "rewards/final_reward": 0.9659916672752272, "rewards/mask_iou_reward": 0.4829958336376136, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 1.2352128624916077, "rewards/thk_ans_format_reward": 1.0, "step": 440, "think_completion_length": 39.333333333333336 }, { "clip_ratio": 0.0, "completion_length": 106.41667175292969, "epoch": 1.4890387858347387, "grad_norm": 6.712408642446364, "kl": 0.3935546875, "learning_rate": 8.758445945945946e-07, "loss": 0.0004, "reward": 3.5379750728607178, "reward_std": 0.17404749989509583, "rewards/final_reward": 1.6421923444437927, "rewards/mask_iou_reward": 0.8210961722218963, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5379751920700073, "rewards/thk_ans_format_reward": 1.0, "step": 441, "think_completion_length": 35.875 }, { "clip_ratio": 0.0, "completion_length": 100.19791793823242, "epoch": 1.4924114671163575, "grad_norm": 6.709798659834236, "kl": 0.525390625, "learning_rate": 8.755630630630631e-07, "loss": 0.0005, "reward": 3.350723624229431, "reward_std": 0.17634809762239456, "rewards/final_reward": 1.4496213303634948, "rewards/mask_iou_reward": 0.7248106651817474, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3507237434387207, "rewards/thk_ans_format_reward": 1.0, "step": 442, "think_completion_length": 33.583333333333336 }, { "clip_ratio": 0.0, "completion_length": 104.33333587646484, "epoch": 1.4957841483979764, "grad_norm": 7.958027162502342, "kl": 0.4453125, "learning_rate": 8.752815315315315e-07, "loss": 0.0004, "reward": 2.863914966583252, "reward_std": 0.15693428367376328, "rewards/final_reward": 0.44607979220935845, "rewards/mask_iou_reward": 0.22303989610467922, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.8639149367809296, "rewards/thk_ans_format_reward": 1.0, "step": 443, "think_completion_length": 44.291666666666664 }, { "clip_ratio": 0.0, "completion_length": 100.19791793823242, "epoch": 1.4991568296795954, "grad_norm": 9.352421356313323, "kl": 0.4365234375, "learning_rate": 8.75e-07, "loss": 0.0004, "reward": 3.37521755695343, "reward_std": 0.2912629693746567, "rewards/final_reward": 0.9321297674460074, "rewards/mask_iou_reward": 0.4660648837230037, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3752171993255615, "rewards/thk_ans_format_reward": 1.0, "step": 444, "think_completion_length": 32.70833333333333 }, { "clip_ratio": 0.0, "completion_length": 95.16666793823242, "epoch": 1.5025295109612142, "grad_norm": 13.719292122120075, "kl": 0.5673828125, "learning_rate": 8.747184684684684e-07, "loss": 0.0006, "reward": 3.136060118675232, "reward_std": 0.17749232798814774, "rewards/final_reward": 0.6155818352553188, "rewards/mask_iou_reward": 0.3077909176276594, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1360602378845215, "rewards/thk_ans_format_reward": 1.0, "step": 445, "think_completion_length": 23.708333333333336 }, { "clip_ratio": 0.0, "completion_length": 106.65625381469727, "epoch": 1.505902192242833, "grad_norm": 11.080309636704607, "kl": 0.4482421875, "learning_rate": 8.744369369369369e-07, "loss": 0.0005, "reward": 3.1183054447174072, "reward_std": 0.10751515999436378, "rewards/final_reward": 1.6852675556133896, "rewards/mask_iou_reward": 0.8426337778066948, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1183055639266968, "rewards/thk_ans_format_reward": 1.0, "step": 446, "think_completion_length": 31.041666666666664 }, { "clip_ratio": 0.0, "completion_length": 93.02083587646484, "epoch": 1.5092748735244519, "grad_norm": 13.161494532551057, "kl": 0.4482421875, "learning_rate": 8.741554054054054e-07, "loss": 0.0005, "reward": 3.2273337841033936, "reward_std": 0.1627396196126938, "rewards/final_reward": 0.9859541692869749, "rewards/mask_iou_reward": 0.49297708464348744, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2273337244987488, "rewards/thk_ans_format_reward": 1.0, "step": 447, "think_completion_length": 36.83333333333333 }, { "clip_ratio": 0.0, "completion_length": 89.33333587646484, "epoch": 1.5126475548060707, "grad_norm": 10.348315811850904, "kl": 0.50390625, "learning_rate": 8.738738738738738e-07, "loss": 0.0005, "reward": 2.807453989982605, "reward_std": 0.19256117939949036, "rewards/final_reward": 0.813686752756777, "rewards/mask_iou_reward": 0.4068433763783885, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.8074538707733154, "rewards/thk_ans_format_reward": 1.0, "step": 448, "think_completion_length": 23.791666666666664 }, { "clip_ratio": 0.0, "completion_length": 113.36458969116211, "epoch": 1.5160202360876898, "grad_norm": 21.72668434175921, "kl": 0.412109375, "learning_rate": 8.735923423423423e-07, "loss": 0.0004, "reward": 2.9704527854919434, "reward_std": 0.25510428100824356, "rewards/final_reward": 0.9751997237566301, "rewards/mask_iou_reward": 0.48759986187831506, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9704526960849762, "rewards/thk_ans_format_reward": 1.0, "step": 449, "think_completion_length": 19.916666666666668 }, { "clip_ratio": 0.0, "completion_length": 106.71875381469727, "epoch": 1.5193929173693086, "grad_norm": 7.1547058197514115, "kl": 0.4228515625, "learning_rate": 8.733108108108109e-07, "loss": 0.0004, "reward": 2.8380229473114014, "reward_std": 0.3202047646045685, "rewards/final_reward": 1.2212116006312888, "rewards/mask_iou_reward": 0.6106058003156444, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.8380228579044342, "rewards/thk_ans_format_reward": 1.0, "step": 450, "think_completion_length": 24.375 }, { "clip_ratio": 0.0, "completion_length": 98.96875381469727, "epoch": 1.5227655986509276, "grad_norm": 13.72594240653149, "kl": 0.4697265625, "learning_rate": 8.730292792792793e-07, "loss": 0.0005, "reward": 3.395395040512085, "reward_std": 0.1617676541209221, "rewards/final_reward": 1.5347950284425567, "rewards/mask_iou_reward": 0.7673975142212783, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3953949809074402, "rewards/thk_ans_format_reward": 1.0, "step": 451, "think_completion_length": 22.791666666666664 }, { "clip_ratio": 0.0, "completion_length": 82.9375, "epoch": 1.5261382799325465, "grad_norm": 132.82594405838844, "kl": 0.4873046875, "learning_rate": 8.727477477477478e-07, "loss": 0.0005, "reward": 3.166351079940796, "reward_std": 0.22434765845537186, "rewards/final_reward": 1.1407594605091715, "rewards/mask_iou_reward": 0.5703797302545858, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1663509607315063, "rewards/thk_ans_format_reward": 1.0, "step": 452, "think_completion_length": 23.0 }, { "clip_ratio": 0.0, "completion_length": 120.25000762939453, "epoch": 1.5295109612141653, "grad_norm": 8.261307910998266, "kl": 0.435546875, "learning_rate": 8.724662162162162e-07, "loss": 0.0004, "reward": 3.166746139526367, "reward_std": 0.1179632619023323, "rewards/final_reward": 1.3772007662946268, "rewards/mask_iou_reward": 0.6886003831473134, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1667458713054657, "rewards/thk_ans_format_reward": 1.0, "step": 453, "think_completion_length": 18.375 }, { "clip_ratio": 0.0, "completion_length": 82.47916793823242, "epoch": 1.5328836424957841, "grad_norm": 18.2005658169965, "kl": 0.490234375, "learning_rate": 8.721846846846846e-07, "loss": 0.0005, "reward": 3.2263890504837036, "reward_std": 0.1771574541926384, "rewards/final_reward": 1.3200745693435916, "rewards/mask_iou_reward": 0.6600372846717958, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2263891696929932, "rewards/thk_ans_format_reward": 1.0, "step": 454, "think_completion_length": 16.291666666666668 }, { "clip_ratio": 0.0, "completion_length": 105.53125381469727, "epoch": 1.536256323777403, "grad_norm": 6.950321217424066, "kl": 0.4453125, "learning_rate": 8.719031531531531e-07, "loss": 0.0005, "reward": 3.212833523750305, "reward_std": 0.16523578390479088, "rewards/final_reward": 1.696258540360979, "rewards/mask_iou_reward": 0.8481292701804894, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2232502102851868, "rewards/thk_ans_format_reward": 0.9895833432674408, "step": 455, "think_completion_length": 18.708333333333336 }, { "clip_ratio": 0.0, "completion_length": 78.37500381469727, "epoch": 1.5396290050590218, "grad_norm": 17.482705271182557, "kl": 0.587890625, "learning_rate": 8.716216216216215e-07, "loss": 0.0006, "reward": 3.290819525718689, "reward_std": 0.2551625818014145, "rewards/final_reward": 0.7625233320673338, "rewards/mask_iou_reward": 0.3812616660336669, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2908196151256561, "rewards/thk_ans_format_reward": 1.0, "step": 456, "think_completion_length": 23.458333333333336 }, { "clip_ratio": 0.0, "completion_length": 80.08333587646484, "epoch": 1.5430016863406408, "grad_norm": 9.824438040670186, "kl": 0.4658203125, "learning_rate": 8.7134009009009e-07, "loss": 0.0005, "reward": 2.9174301624298096, "reward_std": 0.298883818089962, "rewards/final_reward": 0.5540578812699941, "rewards/mask_iou_reward": 0.27702894063499706, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9174301624298096, "rewards/thk_ans_format_reward": 1.0, "step": 457, "think_completion_length": 21.416666666666668 }, { "clip_ratio": 0.0, "completion_length": 84.35416793823242, "epoch": 1.5463743676222597, "grad_norm": 9.440898123085097, "kl": 0.47265625, "learning_rate": 8.710585585585584e-07, "loss": 0.0005, "reward": 3.1261075735092163, "reward_std": 0.16530471108853817, "rewards/final_reward": 1.1013812481237961, "rewards/mask_iou_reward": 0.5506906240618981, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1261076629161835, "rewards/thk_ans_format_reward": 1.0, "step": 458, "think_completion_length": 18.375 }, { "clip_ratio": 0.0, "completion_length": 97.77083587646484, "epoch": 1.5497470489038787, "grad_norm": 16.567436679373156, "kl": 0.474609375, "learning_rate": 8.707770270270269e-07, "loss": 0.0005, "reward": 2.9942362308502197, "reward_std": 0.27375921979546547, "rewards/final_reward": 1.4684187417871153, "rewards/mask_iou_reward": 0.7342093708935576, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.994236171245575, "rewards/thk_ans_format_reward": 1.0, "step": 459, "think_completion_length": 23.041666666666664 }, { "clip_ratio": 0.0, "completion_length": 85.05208587646484, "epoch": 1.5531197301854975, "grad_norm": 54.120110367425866, "kl": 0.95703125, "learning_rate": 8.704954954954955e-07, "loss": 0.001, "reward": 2.818720817565918, "reward_std": 0.18573438376188278, "rewards/final_reward": 0.8216863238924244, "rewards/mask_iou_reward": 0.4108431619462122, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.8187207877635956, "rewards/thk_ans_format_reward": 1.0, "step": 460, "think_completion_length": 20.083333333333332 }, { "clip_ratio": 0.0, "completion_length": 81.76041793823242, "epoch": 1.5564924114671164, "grad_norm": 11.965967732569721, "kl": 0.44921875, "learning_rate": 8.702139639639639e-07, "loss": 0.0005, "reward": 3.033520817756653, "reward_std": 0.08643431216478348, "rewards/final_reward": 1.4058163678866773, "rewards/mask_iou_reward": 0.7029081839433386, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0335208177566528, "rewards/thk_ans_format_reward": 1.0, "step": 461, "think_completion_length": 17.458333333333336 }, { "clip_ratio": 0.0, "completion_length": 83.97916793823242, "epoch": 1.5598650927487352, "grad_norm": 15.679449940056134, "kl": 0.4609375, "learning_rate": 8.699324324324324e-07, "loss": 0.0005, "reward": 3.1351873874664307, "reward_std": 0.0932794027030468, "rewards/final_reward": 1.1743316658542557, "rewards/mask_iou_reward": 0.5871658329271279, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1351872682571411, "rewards/thk_ans_format_reward": 1.0, "step": 462, "think_completion_length": 17.5 }, { "clip_ratio": 0.0, "completion_length": 88.63541793823242, "epoch": 1.563237774030354, "grad_norm": 8.531893149998371, "kl": 0.46875, "learning_rate": 8.696509009009008e-07, "loss": 0.0005, "reward": 3.204105496406555, "reward_std": 0.22264982759952545, "rewards/final_reward": 1.4184024437330647, "rewards/mask_iou_reward": 0.7092012218665323, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2041053175926208, "rewards/thk_ans_format_reward": 1.0, "step": 463, "think_completion_length": 20.791666666666664 }, { "clip_ratio": 0.0, "completion_length": 104.35417175292969, "epoch": 1.566610455311973, "grad_norm": 35.768289242163206, "kl": 0.5595703125, "learning_rate": 8.693693693693693e-07, "loss": 0.0006, "reward": 3.28743577003479, "reward_std": 0.14121374301612377, "rewards/final_reward": 1.7685978748483455, "rewards/mask_iou_reward": 0.8842989374241728, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2874356508255005, "rewards/thk_ans_format_reward": 1.0, "step": 464, "think_completion_length": 22.083333333333332 }, { "clip_ratio": 0.0, "completion_length": 88.71875381469727, "epoch": 1.569983136593592, "grad_norm": 19.549502851993616, "kl": 0.4150390625, "learning_rate": 8.690878378378378e-07, "loss": 0.0004, "reward": 3.0029417276382446, "reward_std": 0.027136605232954025, "rewards/final_reward": 1.772966376810864, "rewards/mask_iou_reward": 0.886483188405432, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0029414594173431, "rewards/thk_ans_format_reward": 1.0, "step": 465, "think_completion_length": 17.458333333333336 }, { "clip_ratio": 0.0, "completion_length": 84.94791793823242, "epoch": 1.573355817875211, "grad_norm": 7.625249061403965, "kl": 0.44921875, "learning_rate": 8.688063063063062e-07, "loss": 0.0005, "reward": 3.1399009227752686, "reward_std": 0.11434066295623779, "rewards/final_reward": 1.3885686335373428, "rewards/mask_iou_reward": 0.6942843167686714, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.139900803565979, "rewards/thk_ans_format_reward": 1.0, "step": 466, "think_completion_length": 22.458333333333336 }, { "clip_ratio": 0.0, "completion_length": 85.31250381469727, "epoch": 1.5767284991568298, "grad_norm": 46.22982661858084, "kl": 0.5126953125, "learning_rate": 8.685247747747747e-07, "loss": 0.0005, "reward": 3.077829360961914, "reward_std": 0.15692508220672607, "rewards/final_reward": 1.2113539620508234, "rewards/mask_iou_reward": 0.6056769810254117, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.077829360961914, "rewards/thk_ans_format_reward": 1.0, "step": 467, "think_completion_length": 16.916666666666668 }, { "clip_ratio": 0.0, "completion_length": 93.375, "epoch": 1.5801011804384486, "grad_norm": 17.505457784534546, "kl": 0.595703125, "learning_rate": 8.682432432432431e-07, "loss": 0.0006, "reward": 2.748835325241089, "reward_std": 0.12427278235554695, "rewards/final_reward": 0.780510380319, "rewards/mask_iou_reward": 0.3902551901595, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.7488352358341217, "rewards/thk_ans_format_reward": 1.0, "step": 468, "think_completion_length": 18.416666666666664 }, { "clip_ratio": 0.0, "completion_length": 80.96875381469727, "epoch": 1.5834738617200674, "grad_norm": 8.498249651892177, "kl": 0.4677734375, "learning_rate": 8.679617117117116e-07, "loss": 0.0005, "reward": 3.204338550567627, "reward_std": 0.19518911838531494, "rewards/final_reward": 1.7290932128407333, "rewards/mask_iou_reward": 0.8645466064203666, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2043386101722717, "rewards/thk_ans_format_reward": 1.0, "step": 469, "think_completion_length": 18.666666666666664 }, { "clip_ratio": 0.0, "completion_length": 77.96875381469727, "epoch": 1.5868465430016863, "grad_norm": 15.412252606508801, "kl": 0.568359375, "learning_rate": 8.676801801801802e-07, "loss": 0.0006, "reward": 3.1517279148101807, "reward_std": 0.21602170914411545, "rewards/final_reward": 1.6399599658435098, "rewards/mask_iou_reward": 0.8199799829217549, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1517279148101807, "rewards/thk_ans_format_reward": 1.0, "step": 470, "think_completion_length": 24.541666666666664 }, { "clip_ratio": 0.0, "completion_length": 81.05208587646484, "epoch": 1.590219224283305, "grad_norm": 28.822529403476647, "kl": 0.515625, "learning_rate": 8.673986486486486e-07, "loss": 0.0005, "reward": 2.7272047996520996, "reward_std": 0.15829136967658997, "rewards/final_reward": 0.8382349980237175, "rewards/mask_iou_reward": 0.41911749901185874, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.7272045910358429, "rewards/thk_ans_format_reward": 1.0, "step": 471, "think_completion_length": 18.458333333333336 }, { "clip_ratio": 0.0, "completion_length": 96.35417175292969, "epoch": 1.5935919055649241, "grad_norm": 102.25877988832697, "kl": 0.3818359375, "learning_rate": 8.671171171171171e-07, "loss": 0.0004, "reward": 3.501787781715393, "reward_std": 0.09546659886837006, "rewards/final_reward": 1.8930582525314399, "rewards/mask_iou_reward": 0.9465291262657199, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5017877221107483, "rewards/thk_ans_format_reward": 1.0, "step": 472, "think_completion_length": 22.166666666666664 }, { "clip_ratio": 0.0, "completion_length": 85.50000381469727, "epoch": 1.596964586846543, "grad_norm": 4.43375616737534, "kl": 0.6943359375, "learning_rate": 8.668355855855856e-07, "loss": 0.0007, "reward": 3.063448905944824, "reward_std": 0.18014680407941341, "rewards/final_reward": 0.22403323992912672, "rewards/mask_iou_reward": 0.11201661996456336, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0634491443634033, "rewards/thk_ans_format_reward": 1.0, "step": 473, "think_completion_length": 21.666666666666668 }, { "clip_ratio": 0.0, "completion_length": 86.02083587646484, "epoch": 1.600337268128162, "grad_norm": 14.473369009625422, "kl": 0.494140625, "learning_rate": 8.66554054054054e-07, "loss": 0.0005, "reward": 2.9824938774108887, "reward_std": 0.16130833327770233, "rewards/final_reward": 1.0900408410882503, "rewards/mask_iou_reward": 0.5450204205441251, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9824941158294678, "rewards/thk_ans_format_reward": 1.0, "step": 474, "think_completion_length": 20.625 }, { "clip_ratio": 0.0, "completion_length": 93.46875381469727, "epoch": 1.6037099494097808, "grad_norm": 12.531912108914801, "kl": 0.4267578125, "learning_rate": 8.662725225225225e-07, "loss": 0.0004, "reward": 2.71895968914032, "reward_std": 0.16828986257314682, "rewards/final_reward": 0.9872382404177518, "rewards/mask_iou_reward": 0.4936191202088759, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.7189596146345139, "rewards/thk_ans_format_reward": 1.0, "step": 475, "think_completion_length": 14.333333333333332 }, { "clip_ratio": 0.0, "completion_length": 80.45833587646484, "epoch": 1.6070826306913997, "grad_norm": 23.104837063754868, "kl": 0.478515625, "learning_rate": 8.659909909909909e-07, "loss": 0.0005, "reward": 3.0753190517425537, "reward_std": 0.057626042515039444, "rewards/final_reward": 0.6522182180405172, "rewards/mask_iou_reward": 0.3261091090202586, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0753188133239746, "rewards/thk_ans_format_reward": 1.0, "step": 476, "think_completion_length": 14.458333333333332 }, { "clip_ratio": 0.0, "completion_length": 85.70833587646484, "epoch": 1.6104553119730185, "grad_norm": 51.14924206166379, "kl": 0.47265625, "learning_rate": 8.657094594594594e-07, "loss": 0.0005, "reward": 3.417826771736145, "reward_std": 0.2440406084060669, "rewards/final_reward": 1.1676209565768971, "rewards/mask_iou_reward": 0.5838104782884486, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4178267121315002, "rewards/thk_ans_format_reward": 1.0, "step": 477, "think_completion_length": 15.166666666666668 }, { "clip_ratio": 0.0, "completion_length": 84.11458587646484, "epoch": 1.6138279932546373, "grad_norm": 16.65912906390247, "kl": 0.541015625, "learning_rate": 8.654279279279279e-07, "loss": 0.0006, "reward": 3.257392644882202, "reward_std": 0.11912001296877861, "rewards/final_reward": 0.640605444527259, "rewards/mask_iou_reward": 0.3203027222636295, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2573924660682678, "rewards/thk_ans_format_reward": 1.0, "step": 478, "think_completion_length": 14.708333333333332 }, { "clip_ratio": 0.0, "completion_length": 105.68750381469727, "epoch": 1.6172006745362564, "grad_norm": 14.555733217592502, "kl": 0.4775390625, "learning_rate": 8.651463963963963e-07, "loss": 0.0005, "reward": 2.956945300102234, "reward_std": 0.2652505896985531, "rewards/final_reward": 1.0335901193700312, "rewards/mask_iou_reward": 0.5167950596850156, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9673618376255035, "rewards/thk_ans_format_reward": 0.9895833432674408, "step": 479, "think_completion_length": 14.0 }, { "clip_ratio": 0.0, "completion_length": 82.15625, "epoch": 1.6205733558178752, "grad_norm": 45.33214603312758, "kl": 0.4638671875, "learning_rate": 8.648648648648649e-07, "loss": 0.0005, "reward": 3.351254463195801, "reward_std": 0.1269562803208828, "rewards/final_reward": 1.082978360079899, "rewards/mask_iou_reward": 0.5414891800399495, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3512542843818665, "rewards/thk_ans_format_reward": 1.0, "step": 480, "think_completion_length": 13.666666666666668 }, { "clip_ratio": 0.0, "completion_length": 82.38541793823242, "epoch": 1.6239460370994943, "grad_norm": 12.613422364468201, "kl": 0.4736328125, "learning_rate": 8.645833333333333e-07, "loss": 0.0005, "reward": 3.2807374000549316, "reward_std": 0.06218157522380352, "rewards/final_reward": 1.29486728669709, "rewards/mask_iou_reward": 0.647433643348545, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2807374000549316, "rewards/thk_ans_format_reward": 1.0, "step": 481, "think_completion_length": 13.916666666666668 }, { "clip_ratio": 0.0, "completion_length": 86.15625, "epoch": 1.627318718381113, "grad_norm": 29.66491767918893, "kl": 0.54296875, "learning_rate": 8.643018018018018e-07, "loss": 0.0008, "reward": 3.1102946996688843, "reward_std": 0.16617947816848755, "rewards/final_reward": 0.9862282502708326, "rewards/mask_iou_reward": 0.4931141251354163, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1207115054130554, "rewards/thk_ans_format_reward": 0.9895833432674408, "step": 482, "think_completion_length": 12.958333333333332 }, { "clip_ratio": 0.0, "completion_length": 97.30208587646484, "epoch": 1.630691399662732, "grad_norm": 55.41155718440913, "kl": 0.4482421875, "learning_rate": 8.640202702702703e-07, "loss": 0.0005, "reward": 3.4419562816619873, "reward_std": 0.07434825040400028, "rewards/final_reward": 1.716160589569279, "rewards/mask_iou_reward": 0.8580802947846395, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4523729085922241, "rewards/thk_ans_format_reward": 0.9895833432674408, "step": 483, "think_completion_length": 13.958333333333334 }, { "clip_ratio": 0.0, "completion_length": 78.58333587646484, "epoch": 1.6340640809443507, "grad_norm": 16.071683379814385, "kl": 0.45703125, "learning_rate": 8.637387387387387e-07, "loss": 0.0005, "reward": 3.5900611877441406, "reward_std": 0.13412418961524963, "rewards/final_reward": 1.8966849195779, "rewards/mask_iou_reward": 0.94834245978895, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6108945608139038, "rewards/thk_ans_format_reward": 0.9791666865348816, "step": 484, "think_completion_length": 12.666666666666666 }, { "clip_ratio": 0.0, "completion_length": 75.98958587646484, "epoch": 1.6374367622259696, "grad_norm": 9.311258158438548, "kl": 0.490234375, "learning_rate": 8.634572072072072e-07, "loss": 0.0005, "reward": 3.518470883369446, "reward_std": 0.19531650096178055, "rewards/final_reward": 1.5953611940366779, "rewards/mask_iou_reward": 0.7976805970183389, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.518470823764801, "rewards/thk_ans_format_reward": 1.0, "step": 485, "think_completion_length": 12.125 }, { "clip_ratio": 0.0, "completion_length": 79.07291793823242, "epoch": 1.6408094435075884, "grad_norm": 12.15817766930668, "kl": 0.4482421875, "learning_rate": 8.631756756756757e-07, "loss": 0.0004, "reward": 2.868876814842224, "reward_std": 0.09707498550415039, "rewards/final_reward": 1.3961014519571442, "rewards/mask_iou_reward": 0.6980507259785721, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.8688768744468689, "rewards/thk_ans_format_reward": 1.0, "step": 486, "think_completion_length": 11.583333333333332 }, { "clip_ratio": 0.0, "completion_length": 81.60416793823242, "epoch": 1.6441821247892074, "grad_norm": 14.482530307624337, "kl": 0.515625, "learning_rate": 8.628941441441441e-07, "loss": 0.0005, "reward": 3.417795419692993, "reward_std": 0.04655772354453802, "rewards/final_reward": 1.5810112497156465, "rewards/mask_iou_reward": 0.7905056248578233, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.417795479297638, "rewards/thk_ans_format_reward": 1.0, "step": 487, "think_completion_length": 11.25 }, { "clip_ratio": 0.0, "completion_length": 80.21875381469727, "epoch": 1.6475548060708263, "grad_norm": 141.43443814301298, "kl": 0.4111328125, "learning_rate": 8.626126126126126e-07, "loss": 0.0004, "reward": 3.459301471710205, "reward_std": 0.07235825061798096, "rewards/final_reward": 0.9971469538340672, "rewards/mask_iou_reward": 0.4985734769170336, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4593015313148499, "rewards/thk_ans_format_reward": 1.0, "step": 488, "think_completion_length": 9.333333333333332 }, { "clip_ratio": 0.0, "completion_length": 70.87500381469727, "epoch": 1.6509274873524453, "grad_norm": 15.072714849056247, "kl": 0.53515625, "learning_rate": 8.62331081081081e-07, "loss": 0.0005, "reward": 3.366679072380066, "reward_std": 0.21495439112186432, "rewards/final_reward": 1.6226693889678039, "rewards/mask_iou_reward": 0.8113346944839019, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.366679072380066, "rewards/thk_ans_format_reward": 1.0, "step": 489, "think_completion_length": 12.0 }, { "clip_ratio": 0.0, "completion_length": 74.31250381469727, "epoch": 1.6543001686340641, "grad_norm": 12.923645356988114, "kl": 0.46484375, "learning_rate": 8.620495495495496e-07, "loss": 0.0005, "reward": 3.3765722513198853, "reward_std": 0.09477205201983452, "rewards/final_reward": 0.87301188904249, "rewards/mask_iou_reward": 0.436505944521245, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3765720129013062, "rewards/thk_ans_format_reward": 1.0, "step": 490, "think_completion_length": 10.833333333333332 }, { "clip_ratio": 0.0, "completion_length": 80.06250381469727, "epoch": 1.657672849915683, "grad_norm": 14.60867178239802, "kl": 0.51171875, "learning_rate": 8.617680180180181e-07, "loss": 0.0005, "reward": 3.226666212081909, "reward_std": 0.2977278307080269, "rewards/final_reward": 1.088146077694477, "rewards/mask_iou_reward": 0.5440730388472385, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.226666271686554, "rewards/thk_ans_format_reward": 1.0, "step": 491, "think_completion_length": 11.041666666666668 }, { "clip_ratio": 0.0, "completion_length": 85.10416793823242, "epoch": 1.6610455311973018, "grad_norm": 31.411912777573136, "kl": 0.5390625, "learning_rate": 8.614864864864865e-07, "loss": 0.0005, "reward": 3.4004725217819214, "reward_std": 0.12245327979326248, "rewards/final_reward": 1.3255855717465488, "rewards/mask_iou_reward": 0.6627927858732744, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4004724025726318, "rewards/thk_ans_format_reward": 1.0, "step": 492, "think_completion_length": 9.541666666666666 }, { "clip_ratio": 0.0, "completion_length": 84.72916793823242, "epoch": 1.6644182124789206, "grad_norm": 16.023223475825134, "kl": 0.458984375, "learning_rate": 8.61204954954955e-07, "loss": 0.0005, "reward": 3.2062329053878784, "reward_std": 0.18962369859218597, "rewards/final_reward": 1.2517562804400746, "rewards/mask_iou_reward": 0.6258781402200373, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.206232726573944, "rewards/thk_ans_format_reward": 1.0, "step": 493, "think_completion_length": 13.541666666666668 }, { "clip_ratio": 0.0, "completion_length": 80.58333587646484, "epoch": 1.6677908937605397, "grad_norm": 11.265672770232708, "kl": 0.552734375, "learning_rate": 8.609234234234233e-07, "loss": 0.0006, "reward": 2.867155909538269, "reward_std": 0.0892084464430809, "rewards/final_reward": 0.8286811223868835, "rewards/mask_iou_reward": 0.41434056119344176, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.8671558797359467, "rewards/thk_ans_format_reward": 1.0, "step": 494, "think_completion_length": 9.541666666666666 }, { "clip_ratio": 0.0, "completion_length": 80.00000381469727, "epoch": 1.6711635750421585, "grad_norm": 13.26919597028308, "kl": 0.46875, "learning_rate": 8.606418918918918e-07, "loss": 0.0005, "reward": 3.3766839504241943, "reward_std": 0.15332239121198654, "rewards/final_reward": 1.256387966408285, "rewards/mask_iou_reward": 0.6281939832041425, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3766838312149048, "rewards/thk_ans_format_reward": 1.0, "step": 495, "think_completion_length": 10.125 }, { "clip_ratio": 0.0, "completion_length": 72.58333587646484, "epoch": 1.6745362563237776, "grad_norm": 13.018456947061745, "kl": 0.4384765625, "learning_rate": 8.603603603603603e-07, "loss": 0.0005, "reward": 3.3175920248031616, "reward_std": 0.11998457461595535, "rewards/final_reward": 1.6143369815601565, "rewards/mask_iou_reward": 0.8071684907800782, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3175921440124512, "rewards/thk_ans_format_reward": 1.0, "step": 496, "think_completion_length": 9.916666666666668 }, { "clip_ratio": 0.0, "completion_length": 79.94791793823242, "epoch": 1.6779089376053964, "grad_norm": 11.159094503630527, "kl": 0.50390625, "learning_rate": 8.600788288288287e-07, "loss": 0.0005, "reward": 3.057952880859375, "reward_std": 0.11427609622478485, "rewards/final_reward": 1.5136194076524858, "rewards/mask_iou_reward": 0.7568097038262429, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0579529404640198, "rewards/thk_ans_format_reward": 1.0, "step": 497, "think_completion_length": 10.375 }, { "clip_ratio": 0.0, "completion_length": 85.65625381469727, "epoch": 1.6812816188870152, "grad_norm": 16.197329729682938, "kl": 0.6064453125, "learning_rate": 8.597972972972972e-07, "loss": 0.0006, "reward": 3.7405370473861694, "reward_std": 0.13544230163097382, "rewards/final_reward": 1.7687259535170339, "rewards/mask_iou_reward": 0.8843629767585169, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 1.7509537935256958, "rewards/thk_ans_format_reward": 1.0, "step": 498, "think_completion_length": 11.583333333333332 }, { "clip_ratio": 0.0, "completion_length": 85.45833587646484, "epoch": 1.684654300168634, "grad_norm": 13.052639073244066, "kl": 0.4970703125, "learning_rate": 8.595157657657656e-07, "loss": 0.0005, "reward": 2.982196807861328, "reward_std": 0.21756897866725922, "rewards/final_reward": 0.9414634619208263, "rewards/mask_iou_reward": 0.47073173096041315, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9821969568729401, "rewards/thk_ans_format_reward": 1.0, "step": 499, "think_completion_length": 13.25 }, { "clip_ratio": 0.0, "completion_length": 99.01041793823242, "epoch": 1.6880269814502529, "grad_norm": 8.795085757071028, "kl": 0.5146484375, "learning_rate": 8.592342342342342e-07, "loss": 0.0005, "reward": 3.319603443145752, "reward_std": 0.1779472529888153, "rewards/final_reward": 0.9019757379410115, "rewards/mask_iou_reward": 0.45098786897050575, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3196034133434296, "rewards/thk_ans_format_reward": 1.0, "step": 500, "think_completion_length": 11.791666666666666 }, { "clip_ratio": 0.0, "completion_length": 75.90625381469727, "epoch": 1.6913996627318717, "grad_norm": 7.774994787051015, "kl": 0.4931640625, "learning_rate": 8.589527027027027e-07, "loss": 0.0005, "reward": 3.361993193626404, "reward_std": 0.15704002976417542, "rewards/final_reward": 1.2416237096540461, "rewards/mask_iou_reward": 0.6208118548270231, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.361993432044983, "rewards/thk_ans_format_reward": 1.0, "step": 501, "think_completion_length": 11.208333333333332 }, { "clip_ratio": 0.0, "completion_length": 73.57292175292969, "epoch": 1.6947723440134908, "grad_norm": 15.137187828443501, "kl": 0.529296875, "learning_rate": 8.586711711711711e-07, "loss": 0.0005, "reward": 3.0099265575408936, "reward_std": 0.1874416284263134, "rewards/final_reward": 0.9739286970285753, "rewards/mask_iou_reward": 0.48696434851428766, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0203429758548737, "rewards/thk_ans_format_reward": 0.9895833432674408, "step": 502, "think_completion_length": 10.958333333333332 }, { "clip_ratio": 0.0, "completion_length": 76.6875, "epoch": 1.6981450252951096, "grad_norm": 36.580436506414216, "kl": 0.5, "learning_rate": 8.583896396396396e-07, "loss": 0.0005, "reward": 3.3445026874542236, "reward_std": 0.3046005591750145, "rewards/final_reward": 1.582493975030316, "rewards/mask_iou_reward": 0.791246987515158, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3445026874542236, "rewards/thk_ans_format_reward": 1.0, "step": 503, "think_completion_length": 11.916666666666668 }, { "clip_ratio": 0.0, "completion_length": 109.33333969116211, "epoch": 1.7015177065767286, "grad_norm": 28.402567095919988, "kl": 0.4189453125, "learning_rate": 8.58108108108108e-07, "loss": 0.0004, "reward": 3.3351190090179443, "reward_std": 0.3104002997279167, "rewards/final_reward": 1.1093516487241417, "rewards/mask_iou_reward": 0.5546758243620709, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3351190090179443, "rewards/thk_ans_format_reward": 1.0, "step": 504, "think_completion_length": 10.958333333333332 }, { "clip_ratio": 0.0, "completion_length": 87.37500381469727, "epoch": 1.7048903878583475, "grad_norm": 15.640567490493542, "kl": 0.623046875, "learning_rate": 8.578265765765765e-07, "loss": 0.0006, "reward": 3.162856936454773, "reward_std": 0.10240738838911057, "rewards/final_reward": 0.9918880465103225, "rewards/mask_iou_reward": 0.49594402325516124, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1628568768501282, "rewards/thk_ans_format_reward": 1.0, "step": 505, "think_completion_length": 12.958333333333332 }, { "clip_ratio": 0.0, "completion_length": 94.22916793823242, "epoch": 1.7082630691399663, "grad_norm": 16.62535695298484, "kl": 0.5078125, "learning_rate": 8.57545045045045e-07, "loss": 0.0006, "reward": 3.3045690059661865, "reward_std": 0.14377547800540924, "rewards/final_reward": 1.5430981242085318, "rewards/mask_iou_reward": 0.7715490621042659, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3045691847801208, "rewards/thk_ans_format_reward": 1.0, "step": 506, "think_completion_length": 11.25 }, { "clip_ratio": 0.0, "completion_length": 78.67708587646484, "epoch": 1.7116357504215851, "grad_norm": 11.151972314274246, "kl": 0.50390625, "learning_rate": 8.572635135135134e-07, "loss": 0.0006, "reward": 3.242234230041504, "reward_std": 0.08867185190320015, "rewards/final_reward": 0.7304459733074367, "rewards/mask_iou_reward": 0.36522298665371833, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2422342896461487, "rewards/thk_ans_format_reward": 1.0, "step": 507, "think_completion_length": 12.041666666666668 }, { "clip_ratio": 0.0, "completion_length": 91.82292175292969, "epoch": 1.715008431703204, "grad_norm": 9.160278725973138, "kl": 0.4521484375, "learning_rate": 8.569819819819819e-07, "loss": 0.0005, "reward": 3.0099756717681885, "reward_std": 0.3636237531900406, "rewards/final_reward": 0.9719676653414557, "rewards/mask_iou_reward": 0.48598383267072787, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.009975790977478, "rewards/thk_ans_format_reward": 1.0, "step": 508, "think_completion_length": 11.833333333333332 }, { "clip_ratio": 0.0, "completion_length": 132.50000381469727, "epoch": 1.718381112984823, "grad_norm": 25.299347390182906, "kl": 0.5615234375, "learning_rate": 8.567004504504504e-07, "loss": 0.0006, "reward": 2.975674629211426, "reward_std": 0.2414003312587738, "rewards/final_reward": 1.415178078522424, "rewards/mask_iou_reward": 0.707589039261212, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9756745994091034, "rewards/thk_ans_format_reward": 1.0, "step": 509, "think_completion_length": 12.25 }, { "clip_ratio": 0.0, "completion_length": 75.50000381469727, "epoch": 1.7217537942664418, "grad_norm": 5.745786155962972, "kl": 0.525390625, "learning_rate": 8.564189189189189e-07, "loss": 0.0007, "reward": 2.874884009361267, "reward_std": 0.1341996043920517, "rewards/final_reward": 0.1135072243268755, "rewards/mask_iou_reward": 0.05675361216343775, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.874883770942688, "rewards/thk_ans_format_reward": 1.0, "step": 510, "think_completion_length": 14.375 }, { "clip_ratio": 0.0, "completion_length": 84.76041793823242, "epoch": 1.7251264755480609, "grad_norm": 12.503675028002357, "kl": 0.4873046875, "learning_rate": 8.561373873873874e-07, "loss": 0.0005, "reward": 3.1886398792266846, "reward_std": 0.08919402211904526, "rewards/final_reward": 0.5874913855773956, "rewards/mask_iou_reward": 0.2937456927886978, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1886397004127502, "rewards/thk_ans_format_reward": 1.0, "step": 511, "think_completion_length": 15.541666666666668 }, { "clip_ratio": 0.0, "completion_length": 76.57291793823242, "epoch": 1.7284991568296797, "grad_norm": 19.913816628162035, "kl": 0.57421875, "learning_rate": 8.558558558558558e-07, "loss": 0.0006, "reward": 3.1143531799316406, "reward_std": 0.1809094250202179, "rewards/final_reward": 0.949235596383203, "rewards/mask_iou_reward": 0.4746177981916015, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.114353060722351, "rewards/thk_ans_format_reward": 1.0, "step": 512, "think_completion_length": 12.666666666666668 }, { "clip_ratio": 0.0, "completion_length": 77.93750381469727, "epoch": 1.7318718381112985, "grad_norm": 65.4953382208261, "kl": 0.5458984375, "learning_rate": 8.555743243243243e-07, "loss": 0.0005, "reward": 3.310346841812134, "reward_std": 0.18927180767059326, "rewards/final_reward": 1.249014511299547, "rewards/mask_iou_reward": 0.6245072556497735, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3103469014167786, "rewards/thk_ans_format_reward": 1.0, "step": 513, "think_completion_length": 11.458333333333334 }, { "clip_ratio": 0.0, "completion_length": 82.62500381469727, "epoch": 1.7352445193929174, "grad_norm": 13.336240430300492, "kl": 0.5078125, "learning_rate": 8.552927927927928e-07, "loss": 0.0005, "reward": 3.2578518390655518, "reward_std": 0.17345689982175827, "rewards/final_reward": 0.9461252377763101, "rewards/mask_iou_reward": 0.47306261888815504, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2578518390655518, "rewards/thk_ans_format_reward": 1.0, "step": 514, "think_completion_length": 13.75 }, { "clip_ratio": 0.0, "completion_length": 110.25000762939453, "epoch": 1.7386172006745362, "grad_norm": 16.689742144094275, "kl": 0.4970703125, "learning_rate": 8.550112612612612e-07, "loss": 0.0005, "reward": 3.4923455715179443, "reward_std": 0.17182448878884315, "rewards/final_reward": 1.7788363585498463, "rewards/mask_iou_reward": 0.8894181792749232, "rewards/sam_format_reward": 0.9791666865348816, "rewards/sam_reward_func_ultra": 1.5131787657737732, "rewards/thk_ans_format_reward": 1.0, "step": 515, "think_completion_length": 13.666666666666668 }, { "clip_ratio": 0.0, "completion_length": 76.95833587646484, "epoch": 1.741989881956155, "grad_norm": 22.269439493190728, "kl": 0.61328125, "learning_rate": 8.547297297297297e-07, "loss": 0.0006, "reward": 3.4098668098449707, "reward_std": 0.1380665097385645, "rewards/final_reward": 1.274625326712929, "rewards/mask_iou_reward": 0.6373126633564645, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.409866750240326, "rewards/thk_ans_format_reward": 1.0, "step": 516, "think_completion_length": 12.333333333333332 }, { "clip_ratio": 0.0, "completion_length": 100.95833587646484, "epoch": 1.745362563237774, "grad_norm": 13.715544708065021, "kl": 0.447265625, "learning_rate": 8.544481981981981e-07, "loss": 0.0004, "reward": 2.9615875482559204, "reward_std": 0.36108721792697906, "rewards/final_reward": 0.36051196483533215, "rewards/mask_iou_reward": 0.18025598241766608, "rewards/sam_format_reward": 0.9791666865348816, "rewards/sam_reward_func_ultra": 0.9824208915233612, "rewards/thk_ans_format_reward": 1.0, "step": 517, "think_completion_length": 15.333333333333332 }, { "clip_ratio": 0.0, "completion_length": 81.00000381469727, "epoch": 1.7487352445193929, "grad_norm": 66.02704543638346, "kl": 0.48828125, "learning_rate": 8.541666666666666e-07, "loss": 0.0005, "reward": 3.1223738193511963, "reward_std": 0.13363390415906906, "rewards/final_reward": 0.5471890291502142, "rewards/mask_iou_reward": 0.2735945145751071, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1223737597465515, "rewards/thk_ans_format_reward": 1.0, "step": 518, "think_completion_length": 13.041666666666666 }, { "clip_ratio": 0.0, "completion_length": 76.18750381469727, "epoch": 1.752107925801012, "grad_norm": 7.8067666006986, "kl": 0.5322265625, "learning_rate": 8.538851351351351e-07, "loss": 0.0005, "reward": 3.3601648807525635, "reward_std": 0.15161875635385513, "rewards/final_reward": 1.4802281160901876, "rewards/mask_iou_reward": 0.7401140580450938, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 1.3705815076828003, "rewards/thk_ans_format_reward": 1.0, "step": 519, "think_completion_length": 13.916666666666668 }, { "clip_ratio": 0.0, "completion_length": 99.61458587646484, "epoch": 1.7554806070826308, "grad_norm": 16.629566761999957, "kl": 0.759765625, "learning_rate": 8.536036036036036e-07, "loss": 0.0008, "reward": 2.8733246326446533, "reward_std": 0.21523992344737053, "rewards/final_reward": 1.0543335481722007, "rewards/mask_iou_reward": 0.5271667740861004, "rewards/sam_format_reward": 0.90625, "rewards/sam_reward_func_ultra": 0.9670746624469757, "rewards/thk_ans_format_reward": 1.0, "step": 520, "think_completion_length": 14.625 }, { "clip_ratio": 0.0, "completion_length": 76.09375381469727, "epoch": 1.7588532883642496, "grad_norm": 14.082221414377713, "kl": 0.537109375, "learning_rate": 8.533220720720721e-07, "loss": 0.0005, "reward": 3.441567301750183, "reward_std": 0.20213724300265312, "rewards/final_reward": 1.5816256478233968, "rewards/mask_iou_reward": 0.7908128239116984, "rewards/sam_format_reward": 0.9791666865348816, "rewards/sam_reward_func_ultra": 1.462400734424591, "rewards/thk_ans_format_reward": 1.0, "step": 521, "think_completion_length": 13.875 }, { "clip_ratio": 0.0, "completion_length": 78.57291793823242, "epoch": 1.7622259696458684, "grad_norm": 8.582734926536023, "kl": 0.544921875, "learning_rate": 8.530405405405406e-07, "loss": 0.0005, "reward": 3.404173731803894, "reward_std": 0.3278542831540108, "rewards/final_reward": 1.3103814615068121, "rewards/mask_iou_reward": 0.6551907307534061, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 1.414590299129486, "rewards/thk_ans_format_reward": 1.0, "step": 522, "think_completion_length": 19.166666666666664 }, { "clip_ratio": 0.0, "completion_length": 92.21875381469727, "epoch": 1.7655986509274872, "grad_norm": 6.750860216925896, "kl": 0.546875, "learning_rate": 8.52759009009009e-07, "loss": 0.0006, "reward": 3.1857587099075317, "reward_std": 0.2380918264389038, "rewards/final_reward": 0.5405636463610947, "rewards/mask_iou_reward": 0.27028182318054733, "rewards/sam_format_reward": 0.9791666865348816, "rewards/sam_reward_func_ultra": 1.2065917253494263, "rewards/thk_ans_format_reward": 1.0, "step": 523, "think_completion_length": 15.0 }, { "clip_ratio": 0.0, "completion_length": 81.8125, "epoch": 1.768971332209106, "grad_norm": 8.026721438225874, "kl": 0.5068359375, "learning_rate": 8.524774774774775e-07, "loss": 0.0005, "reward": 3.1504483222961426, "reward_std": 0.17338748276233673, "rewards/final_reward": 1.2888089465669585, "rewards/mask_iou_reward": 0.6444044732834793, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 1.160864770412445, "rewards/thk_ans_format_reward": 1.0, "step": 524, "think_completion_length": 15.833333333333332 }, { "clip_ratio": 0.0, "completion_length": 79.48958587646484, "epoch": 1.7723440134907251, "grad_norm": 13.596240992474886, "kl": 0.4853515625, "learning_rate": 8.521959459459459e-07, "loss": 0.0005, "reward": 3.5323015451431274, "reward_std": 0.13418062031269073, "rewards/final_reward": 1.7019234677183674, "rewards/mask_iou_reward": 0.8509617338591837, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5323014855384827, "rewards/thk_ans_format_reward": 1.0, "step": 525, "think_completion_length": 15.875 }, { "clip_ratio": 0.0, "completion_length": 132.375, "epoch": 1.7757166947723442, "grad_norm": 11.637227110484277, "kl": 0.4384765625, "learning_rate": 8.519144144144144e-07, "loss": 0.0004, "reward": 3.1013338565826416, "reward_std": 0.18526217341423035, "rewards/final_reward": 1.5633311583878755, "rewards/mask_iou_reward": 0.7816655791939378, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 1.1221671551465988, "rewards/thk_ans_format_reward": 0.9895833432674408, "step": 526, "think_completion_length": 15.0 }, { "clip_ratio": 0.0, "completion_length": 98.42708587646484, "epoch": 1.779089376053963, "grad_norm": 18.429457228413558, "kl": 0.4365234375, "learning_rate": 8.516328828828829e-07, "loss": 0.0004, "reward": 3.297514796257019, "reward_std": 0.15136384963989258, "rewards/final_reward": 1.268170770882234, "rewards/mask_iou_reward": 0.634085385441117, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2975147366523743, "rewards/thk_ans_format_reward": 1.0, "step": 527, "think_completion_length": 18.916666666666664 }, { "clip_ratio": 0.0, "completion_length": 89.92708587646484, "epoch": 1.7824620573355818, "grad_norm": 5.886426929840268, "kl": 0.521484375, "learning_rate": 8.513513513513513e-07, "loss": 0.0005, "reward": 3.117105484008789, "reward_std": 0.31912093609571457, "rewards/final_reward": 1.445791756163469, "rewards/mask_iou_reward": 0.7228958780817345, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1171055436134338, "rewards/thk_ans_format_reward": 1.0, "step": 528, "think_completion_length": 20.25 }, { "clip_ratio": 0.0, "completion_length": 133.76041793823242, "epoch": 1.7858347386172007, "grad_norm": 99.15974140991527, "kl": 0.484375, "learning_rate": 8.510698198198198e-07, "loss": 0.0005, "reward": 3.0164090394973755, "reward_std": 0.2540616989135742, "rewards/final_reward": 0.9807436103667273, "rewards/mask_iou_reward": 0.49037180518336365, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.016409009695053, "rewards/thk_ans_format_reward": 1.0, "step": 529, "think_completion_length": 14.875 }, { "clip_ratio": 0.0, "completion_length": 88.61458587646484, "epoch": 1.7892074198988195, "grad_norm": 12.846402119311383, "kl": 0.49609375, "learning_rate": 8.507882882882883e-07, "loss": 0.0005, "reward": 3.278444290161133, "reward_std": 0.19332800060510635, "rewards/final_reward": 1.4669976720066265, "rewards/mask_iou_reward": 0.7334988360033132, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2784444093704224, "rewards/thk_ans_format_reward": 1.0, "step": 530, "think_completion_length": 18.916666666666668 }, { "clip_ratio": 0.0, "completion_length": 84.89583587646484, "epoch": 1.7925801011804383, "grad_norm": 50.20147304166135, "kl": 0.748046875, "learning_rate": 8.505067567567568e-07, "loss": 0.0008, "reward": 3.725192904472351, "reward_std": 0.09113395772874355, "rewards/final_reward": 1.8154955316875, "rewards/mask_iou_reward": 0.90774776584375, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.725192904472351, "rewards/thk_ans_format_reward": 1.0, "step": 531, "think_completion_length": 21.916666666666664 }, { "clip_ratio": 0.0, "completion_length": 90.03125381469727, "epoch": 1.7959527824620574, "grad_norm": 10.161187398861516, "kl": 0.4833984375, "learning_rate": 8.502252252252253e-07, "loss": 0.0005, "reward": 3.1416884660720825, "reward_std": 0.08379896730184555, "rewards/final_reward": 0.6741074384144312, "rewards/mask_iou_reward": 0.3370537192072156, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1416882276535034, "rewards/thk_ans_format_reward": 1.0, "step": 532, "think_completion_length": 18.5 }, { "clip_ratio": 0.0, "completion_length": 102.19791793823242, "epoch": 1.7993254637436762, "grad_norm": 11.261171569594273, "kl": 0.4853515625, "learning_rate": 8.499436936936937e-07, "loss": 0.0005, "reward": 2.8111319541931152, "reward_std": 0.2083485722541809, "rewards/final_reward": 1.0420207367476402, "rewards/mask_iou_reward": 0.5210103683738201, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 0.8319653868675232, "rewards/thk_ans_format_reward": 0.9895833432674408, "step": 533, "think_completion_length": 22.416666666666664 }, { "clip_ratio": 0.0, "completion_length": 93.73958587646484, "epoch": 1.8026981450252952, "grad_norm": 6.990923161253162, "kl": 0.5390625, "learning_rate": 8.496621621621621e-07, "loss": 0.0006, "reward": 3.4891117811203003, "reward_std": 0.1246962659060955, "rewards/final_reward": 1.4770193974135732, "rewards/mask_iou_reward": 0.7385096987067866, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4891117215156555, "rewards/thk_ans_format_reward": 1.0, "step": 534, "think_completion_length": 21.375 }, { "clip_ratio": 0.0, "completion_length": 95.36458587646484, "epoch": 1.806070826306914, "grad_norm": 6.968802100285715, "kl": 0.5068359375, "learning_rate": 8.493806306306305e-07, "loss": 0.0005, "reward": 2.9380651712417603, "reward_std": 0.1562102735042572, "rewards/final_reward": 1.3316888883683826, "rewards/mask_iou_reward": 0.6658444441841913, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9380651414394379, "rewards/thk_ans_format_reward": 1.0, "step": 535, "think_completion_length": 19.958333333333332 }, { "clip_ratio": 0.0, "completion_length": 79.26041793823242, "epoch": 1.809443507588533, "grad_norm": 12.590632039643781, "kl": 0.4951171875, "learning_rate": 8.49099099099099e-07, "loss": 0.0005, "reward": 3.259642004966736, "reward_std": 0.1716170459985733, "rewards/final_reward": 0.9560325567170211, "rewards/mask_iou_reward": 0.47801627835851057, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2596420645713806, "rewards/thk_ans_format_reward": 1.0, "step": 536, "think_completion_length": 18.875 }, { "clip_ratio": 0.0, "completion_length": 90.65625, "epoch": 1.8128161888701517, "grad_norm": 10.64822504901881, "kl": 0.546875, "learning_rate": 8.488175675675675e-07, "loss": 0.0005, "reward": 3.2001864910125732, "reward_std": 0.17615430057048798, "rewards/final_reward": 1.3802497216416596, "rewards/mask_iou_reward": 0.6901248608208298, "rewards/sam_format_reward": 0.9791666865348816, "rewards/sam_reward_func_ultra": 1.2210198640823364, "rewards/thk_ans_format_reward": 1.0, "step": 537, "think_completion_length": 19.291666666666664 }, { "clip_ratio": 0.0, "completion_length": 90.92708587646484, "epoch": 1.8161888701517706, "grad_norm": 8.766553578424087, "kl": 0.4453125, "learning_rate": 8.485360360360359e-07, "loss": 0.0004, "reward": 3.3197951316833496, "reward_std": 0.2897758111357689, "rewards/final_reward": 1.2185051844394084, "rewards/mask_iou_reward": 0.6092525922197042, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3197951316833496, "rewards/thk_ans_format_reward": 1.0, "step": 538, "think_completion_length": 19.541666666666668 }, { "clip_ratio": 0.0, "completion_length": 89.87500381469727, "epoch": 1.8195615514333894, "grad_norm": 8.711449233550052, "kl": 0.4384765625, "learning_rate": 8.482545045045044e-07, "loss": 0.0004, "reward": 3.3355772495269775, "reward_std": 0.14928391575813293, "rewards/final_reward": 1.659850050810137, "rewards/mask_iou_reward": 0.8299250254050685, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.335577130317688, "rewards/thk_ans_format_reward": 1.0, "step": 539, "think_completion_length": 21.541666666666664 }, { "clip_ratio": 0.0, "completion_length": 125.375, "epoch": 1.8229342327150084, "grad_norm": 8.527152093675014, "kl": 0.5087890625, "learning_rate": 8.47972972972973e-07, "loss": 0.0005, "reward": 3.0749590396881104, "reward_std": 0.2024538516998291, "rewards/final_reward": 1.164322524556697, "rewards/mask_iou_reward": 0.5821612622783485, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0749590396881104, "rewards/thk_ans_format_reward": 1.0, "step": 540, "think_completion_length": 20.583333333333336 }, { "clip_ratio": 0.0, "completion_length": 84.73958587646484, "epoch": 1.8263069139966275, "grad_norm": 17.30409683070416, "kl": 0.513671875, "learning_rate": 8.476914414414414e-07, "loss": 0.0005, "reward": 3.2158886194229126, "reward_std": 0.15056533366441727, "rewards/final_reward": 1.4837446194475552, "rewards/mask_iou_reward": 0.7418723097237776, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2158884406089783, "rewards/thk_ans_format_reward": 1.0, "step": 541, "think_completion_length": 19.958333333333336 }, { "clip_ratio": 0.0, "completion_length": 85.14583587646484, "epoch": 1.8296795952782463, "grad_norm": 19.940723616089134, "kl": 0.4716796875, "learning_rate": 8.474099099099099e-07, "loss": 0.0005, "reward": 3.301823854446411, "reward_std": 0.15247973054647446, "rewards/final_reward": 1.0294040972133638, "rewards/mask_iou_reward": 0.5147020486066819, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.301823914051056, "rewards/thk_ans_format_reward": 1.0, "step": 542, "think_completion_length": 23.291666666666664 }, { "clip_ratio": 0.0, "completion_length": 103.6875, "epoch": 1.8330522765598651, "grad_norm": 7.025931078759855, "kl": 0.5224609375, "learning_rate": 8.471283783783783e-07, "loss": 0.0005, "reward": 3.1780710220336914, "reward_std": 0.0799750704318285, "rewards/final_reward": 0.8655492573961091, "rewards/mask_iou_reward": 0.43277462869805455, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1780711114406586, "rewards/thk_ans_format_reward": 1.0, "step": 543, "think_completion_length": 24.0 }, { "clip_ratio": 0.0, "completion_length": 89.25000381469727, "epoch": 1.836424957841484, "grad_norm": 8.697002105342502, "kl": 0.4951171875, "learning_rate": 8.468468468468468e-07, "loss": 0.0005, "reward": 3.0120667219161987, "reward_std": 0.18368937820196152, "rewards/final_reward": 0.4529770983270571, "rewards/mask_iou_reward": 0.22648854916352856, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.012066513299942, "rewards/thk_ans_format_reward": 1.0, "step": 544, "think_completion_length": 24.166666666666668 }, { "clip_ratio": 0.0, "completion_length": 97.69792175292969, "epoch": 1.8397976391231028, "grad_norm": 16.1047685273957, "kl": 0.53125, "learning_rate": 8.465653153153153e-07, "loss": 0.0005, "reward": 3.142224669456482, "reward_std": 0.1355418637394905, "rewards/final_reward": 1.393036506519874, "rewards/mask_iou_reward": 0.696518253259937, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1422248184680939, "rewards/thk_ans_format_reward": 1.0, "step": 545, "think_completion_length": 21.416666666666668 }, { "clip_ratio": 0.0, "completion_length": 115.76042175292969, "epoch": 1.8431703204047216, "grad_norm": 14.620952623904842, "kl": 0.5224609375, "learning_rate": 8.462837837837837e-07, "loss": 0.0005, "reward": 3.158683657646179, "reward_std": 0.30953148007392883, "rewards/final_reward": 0.8971900857425309, "rewards/mask_iou_reward": 0.44859504287126545, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1586836576461792, "rewards/thk_ans_format_reward": 1.0, "step": 546, "think_completion_length": 18.666666666666664 }, { "clip_ratio": 0.0, "completion_length": 87.78125, "epoch": 1.8465430016863407, "grad_norm": 10.621494015912932, "kl": 0.529296875, "learning_rate": 8.460022522522522e-07, "loss": 0.0005, "reward": 2.7198134660720825, "reward_std": 0.30272846668958664, "rewards/final_reward": 1.0138567852360123, "rewards/mask_iou_reward": 0.5069283926180062, "rewards/sam_format_reward": 0.90625, "rewards/sam_reward_func_ultra": 0.8135634064674377, "rewards/thk_ans_format_reward": 1.0, "step": 547, "think_completion_length": 21.125 }, { "clip_ratio": 0.0, "completion_length": 89.03125381469727, "epoch": 1.8499156829679595, "grad_norm": 8.631107282509562, "kl": 0.541015625, "learning_rate": 8.457207207207206e-07, "loss": 0.0006, "reward": 3.328859329223633, "reward_std": 0.07117979228496552, "rewards/final_reward": 1.3748636443975273, "rewards/mask_iou_reward": 0.6874318221987636, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3288592994213104, "rewards/thk_ans_format_reward": 1.0, "step": 548, "think_completion_length": 24.166666666666664 }, { "clip_ratio": 0.0, "completion_length": 116.85417175292969, "epoch": 1.8532883642495785, "grad_norm": 29.54763154400814, "kl": 0.455078125, "learning_rate": 8.454391891891891e-07, "loss": 0.0005, "reward": 3.215111494064331, "reward_std": 0.1470106765627861, "rewards/final_reward": 1.3981518771005164, "rewards/mask_iou_reward": 0.6990759385502582, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 1.2255281805992126, "rewards/thk_ans_format_reward": 1.0, "step": 549, "think_completion_length": 22.125 }, { "clip_ratio": 0.0, "completion_length": 101.35416793823242, "epoch": 1.8566610455311974, "grad_norm": 15.323512729927169, "kl": 0.474609375, "learning_rate": 8.451576576576577e-07, "loss": 0.0005, "reward": 3.0582566261291504, "reward_std": 0.16113968193531036, "rewards/final_reward": 0.47448022909594245, "rewards/mask_iou_reward": 0.23724011454797123, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0582563877105713, "rewards/thk_ans_format_reward": 1.0, "step": 550, "think_completion_length": 22.708333333333336 }, { "clip_ratio": 0.0, "completion_length": 109.36458969116211, "epoch": 1.8600337268128162, "grad_norm": 6.6363475425215706, "kl": 0.46484375, "learning_rate": 8.448761261261261e-07, "loss": 0.0005, "reward": 3.2980759143829346, "reward_std": 0.12696680054068565, "rewards/final_reward": 1.476385199323587, "rewards/mask_iou_reward": 0.7381925996617935, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2980758547782898, "rewards/thk_ans_format_reward": 1.0, "step": 551, "think_completion_length": 19.916666666666664 }, { "clip_ratio": 0.0, "completion_length": 93.5, "epoch": 1.863406408094435, "grad_norm": 10.793882018166471, "kl": 0.501953125, "learning_rate": 8.445945945945946e-07, "loss": 0.0005, "reward": 3.0894296169281006, "reward_std": 0.14166902750730515, "rewards/final_reward": 0.6742415648528425, "rewards/mask_iou_reward": 0.33712078242642124, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0894296169281006, "rewards/thk_ans_format_reward": 1.0, "step": 552, "think_completion_length": 22.208333333333332 }, { "clip_ratio": 0.0, "completion_length": 105.26041793823242, "epoch": 1.8667790893760539, "grad_norm": 8.964690189516379, "kl": 0.4306640625, "learning_rate": 8.44313063063063e-07, "loss": 0.0004, "reward": 3.16372811794281, "reward_std": 0.3229072540998459, "rewards/final_reward": 1.920765938205148, "rewards/mask_iou_reward": 0.960382969102574, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1637282073497772, "rewards/thk_ans_format_reward": 1.0, "step": 553, "think_completion_length": 19.708333333333332 }, { "clip_ratio": 0.0, "completion_length": 106.62500381469727, "epoch": 1.8701517706576727, "grad_norm": 10.639129220312041, "kl": 0.45703125, "learning_rate": 8.440315315315315e-07, "loss": 0.0005, "reward": 3.0981560945510864, "reward_std": 0.10398751497268677, "rewards/final_reward": 1.3976912835931667, "rewards/mask_iou_reward": 0.6988456417965834, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.098155915737152, "rewards/thk_ans_format_reward": 1.0, "step": 554, "think_completion_length": 18.833333333333332 }, { "clip_ratio": 0.0, "completion_length": 113.68750381469727, "epoch": 1.8735244519392917, "grad_norm": 11.375196091996045, "kl": 0.41015625, "learning_rate": 8.4375e-07, "loss": 0.0004, "reward": 3.0671032667160034, "reward_std": 0.08987793326377869, "rewards/final_reward": 0.9952254414524675, "rewards/mask_iou_reward": 0.49761272072623375, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0671032667160034, "rewards/thk_ans_format_reward": 1.0, "step": 555, "think_completion_length": 20.416666666666668 }, { "clip_ratio": 0.0, "completion_length": 129.7291717529297, "epoch": 1.8768971332209108, "grad_norm": 7.375784396473635, "kl": 0.4189453125, "learning_rate": 8.434684684684684e-07, "loss": 0.0004, "reward": 3.3185118436813354, "reward_std": 0.1634932905435562, "rewards/final_reward": 0.7958873993629867, "rewards/mask_iou_reward": 0.39794369968149335, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.318511962890625, "rewards/thk_ans_format_reward": 1.0, "step": 556, "think_completion_length": 25.458333333333336 }, { "clip_ratio": 0.0, "completion_length": 162.2916717529297, "epoch": 1.8802698145025296, "grad_norm": 13.869981408595814, "kl": 0.390625, "learning_rate": 8.431869369369369e-07, "loss": 0.0004, "reward": 3.16398823261261, "reward_std": 0.21535015106201172, "rewards/final_reward": 0.7045846438755945, "rewards/mask_iou_reward": 0.35229232193779725, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1639882326126099, "rewards/thk_ans_format_reward": 1.0, "step": 557, "think_completion_length": 25.833333333333332 }, { "clip_ratio": 0.0, "completion_length": 123.97916793823242, "epoch": 1.8836424957841484, "grad_norm": 16.455273669557467, "kl": 0.5283203125, "learning_rate": 8.429054054054054e-07, "loss": 0.0005, "reward": 3.16058886051178, "reward_std": 0.3432590663433075, "rewards/final_reward": 1.1915005029036203, "rewards/mask_iou_reward": 0.5957502514518102, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 1.1710055470466614, "rewards/thk_ans_format_reward": 1.0, "step": 558, "think_completion_length": 17.666666666666664 }, { "clip_ratio": 0.0, "completion_length": 128.5729217529297, "epoch": 1.8870151770657673, "grad_norm": 36.217949351883604, "kl": 0.3828125, "learning_rate": 8.426238738738738e-07, "loss": 0.0004, "reward": 3.172240138053894, "reward_std": 0.2821786254644394, "rewards/final_reward": 1.4714988759551624, "rewards/mask_iou_reward": 0.7357494379775812, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 1.1826567649841309, "rewards/thk_ans_format_reward": 1.0, "step": 559, "think_completion_length": 22.75 }, { "clip_ratio": 0.0, "completion_length": 130.45833587646484, "epoch": 1.890387858347386, "grad_norm": 8.45836576322927, "kl": 0.384765625, "learning_rate": 8.423423423423423e-07, "loss": 0.0004, "reward": 3.2458680868148804, "reward_std": 0.10261016711592674, "rewards/final_reward": 1.7873590346464214, "rewards/mask_iou_reward": 0.8936795173232107, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2458679676055908, "rewards/thk_ans_format_reward": 1.0, "step": 560, "think_completion_length": 19.5 }, { "clip_ratio": 0.0, "completion_length": 132.6041717529297, "epoch": 1.893760539629005, "grad_norm": 9.87017521422623, "kl": 0.3740234375, "learning_rate": 8.420608108108108e-07, "loss": 0.0004, "reward": 3.1932257413864136, "reward_std": 0.2400343120098114, "rewards/final_reward": 1.5858177346376892, "rewards/mask_iou_reward": 0.7929088673188446, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1932256817817688, "rewards/thk_ans_format_reward": 1.0, "step": 561, "think_completion_length": 18.375 }, { "clip_ratio": 0.0, "completion_length": 146.7916717529297, "epoch": 1.897133220910624, "grad_norm": 8.911821171091752, "kl": 0.41015625, "learning_rate": 8.417792792792793e-07, "loss": 0.0004, "reward": 3.3439966440200806, "reward_std": 0.11329784244298935, "rewards/final_reward": 1.786571429457327, "rewards/mask_iou_reward": 0.8932857147286635, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3439966440200806, "rewards/thk_ans_format_reward": 1.0, "step": 562, "think_completion_length": 16.833333333333332 }, { "clip_ratio": 0.0, "completion_length": 160.13541793823242, "epoch": 1.9005059021922428, "grad_norm": 22.033788659128657, "kl": 0.5859375, "learning_rate": 8.414977477477478e-07, "loss": 0.0006, "reward": 3.2525041103363037, "reward_std": 0.1615203619003296, "rewards/final_reward": 1.3467275082537487, "rewards/mask_iou_reward": 0.6733637541268743, "rewards/sam_format_reward": 0.9479166865348816, "rewards/sam_reward_func_ultra": 1.3254209160804749, "rewards/thk_ans_format_reward": 0.9791666865348816, "step": 563, "think_completion_length": 19.75 }, { "clip_ratio": 0.0, "completion_length": 133.3958396911621, "epoch": 1.9038785834738619, "grad_norm": 5.99702068619869, "kl": 0.4228515625, "learning_rate": 8.412162162162162e-07, "loss": 0.0004, "reward": 2.9900684356689453, "reward_std": 0.2933308109641075, "rewards/final_reward": 1.1604803190239532, "rewards/mask_iou_reward": 0.5802401595119766, "rewards/sam_format_reward": 0.9791666865348816, "rewards/sam_reward_func_ultra": 1.0109014511108398, "rewards/thk_ans_format_reward": 1.0, "step": 564, "think_completion_length": 17.25 }, { "clip_ratio": 0.0, "completion_length": 132.5104217529297, "epoch": 1.9072512647554807, "grad_norm": 36.300540211403195, "kl": 0.439453125, "learning_rate": 8.409346846846847e-07, "loss": 0.0004, "reward": 3.008383631706238, "reward_std": 0.3261030241847038, "rewards/final_reward": 1.6861075354920192, "rewards/mask_iou_reward": 0.8430537677460096, "rewards/sam_format_reward": 0.9479166865348816, "rewards/sam_reward_func_ultra": 1.0604670345783234, "rewards/thk_ans_format_reward": 1.0, "step": 565, "think_completion_length": 19.375 }, { "clip_ratio": 0.0, "completion_length": 126.00000381469727, "epoch": 1.9106239460370995, "grad_norm": 8.11030374288559, "kl": 0.4375, "learning_rate": 8.406531531531531e-07, "loss": 0.0004, "reward": 3.056068181991577, "reward_std": 0.3603939563035965, "rewards/final_reward": 1.7098159849058376, "rewards/mask_iou_reward": 0.8549079924529188, "rewards/sam_format_reward": 0.9687500298023224, "rewards/sam_reward_func_ultra": 1.0873180031776428, "rewards/thk_ans_format_reward": 1.0, "step": 566, "think_completion_length": 15.208333333333334 }, { "clip_ratio": 0.0, "completion_length": 129.27083587646484, "epoch": 1.9139966273187183, "grad_norm": 11.099614270228152, "kl": 0.380859375, "learning_rate": 8.403716216216216e-07, "loss": 0.0004, "reward": 3.1362944841384888, "reward_std": 0.16306444257497787, "rewards/final_reward": 1.3717985459900603, "rewards/mask_iou_reward": 0.6858992729950302, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 1.146710753440857, "rewards/thk_ans_format_reward": 1.0, "step": 567, "think_completion_length": 19.875 }, { "clip_ratio": 0.0, "completion_length": 136.4375, "epoch": 1.9173693086003372, "grad_norm": 7.039001675126875, "kl": 0.396484375, "learning_rate": 8.400900900900901e-07, "loss": 0.0004, "reward": 3.3202935457229614, "reward_std": 0.16550321877002716, "rewards/final_reward": 1.3515027226349199, "rewards/mask_iou_reward": 0.6757513613174599, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3202936053276062, "rewards/thk_ans_format_reward": 1.0, "step": 568, "think_completion_length": 20.125 }, { "clip_ratio": 0.0, "completion_length": 134.95833587646484, "epoch": 1.920741989881956, "grad_norm": 17.834134911337642, "kl": 0.4169921875, "learning_rate": 8.398085585585585e-07, "loss": 0.0004, "reward": 3.235255718231201, "reward_std": 0.15589579567313194, "rewards/final_reward": 1.129114600213489, "rewards/mask_iou_reward": 0.5645573001067445, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2352555394172668, "rewards/thk_ans_format_reward": 1.0, "step": 569, "think_completion_length": 16.791666666666668 }, { "clip_ratio": 0.0, "completion_length": 179.46875762939453, "epoch": 1.924114671163575, "grad_norm": 12.902306834957399, "kl": 0.3740234375, "learning_rate": 8.39527027027027e-07, "loss": 0.0004, "reward": 3.1055132150650024, "reward_std": 0.3345019519329071, "rewards/final_reward": 0.8178962587305949, "rewards/mask_iou_reward": 0.40894812936529745, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 1.115929663181305, "rewards/thk_ans_format_reward": 1.0, "step": 570, "think_completion_length": 16.791666666666664 }, { "clip_ratio": 0.0, "completion_length": 131.9479217529297, "epoch": 1.927487352445194, "grad_norm": 5.639968467120446, "kl": 0.408203125, "learning_rate": 8.392454954954956e-07, "loss": 0.0004, "reward": 3.051029324531555, "reward_std": 0.21297653764486313, "rewards/final_reward": 1.22482687311132, "rewards/mask_iou_reward": 0.61241343655566, "rewards/sam_format_reward": 0.9687500298023224, "rewards/sam_reward_func_ultra": 1.082279086112976, "rewards/thk_ans_format_reward": 1.0, "step": 571, "think_completion_length": 19.666666666666664 }, { "clip_ratio": 0.0, "completion_length": 134.46875381469727, "epoch": 1.930860033726813, "grad_norm": 13.607824062123045, "kl": 0.4013671875, "learning_rate": 8.38963963963964e-07, "loss": 0.0004, "reward": 3.496493697166443, "reward_std": 0.30988840758800507, "rewards/final_reward": 1.2803013559711691, "rewards/mask_iou_reward": 0.6401506779855846, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 1.5173268914222717, "rewards/thk_ans_format_reward": 0.9895833432674408, "step": 572, "think_completion_length": 19.333333333333332 }, { "clip_ratio": 0.0, "completion_length": 135.55208587646484, "epoch": 1.9342327150084317, "grad_norm": 8.18383905891403, "kl": 0.431640625, "learning_rate": 8.386824324324324e-07, "loss": 0.0005, "reward": 3.22155499458313, "reward_std": 0.1360001638531685, "rewards/final_reward": 1.4888304391906995, "rewards/mask_iou_reward": 0.7444152195953497, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2215549945831299, "rewards/thk_ans_format_reward": 1.0, "step": 573, "think_completion_length": 15.583333333333332 }, { "clip_ratio": 0.0, "completion_length": 134.17708587646484, "epoch": 1.9376053962900506, "grad_norm": 13.28590429258825, "kl": 0.4375, "learning_rate": 8.384009009009008e-07, "loss": 0.0004, "reward": 3.196623682975769, "reward_std": 0.27096298336982727, "rewards/final_reward": 0.6739398318220715, "rewards/mask_iou_reward": 0.33696991591103576, "rewards/sam_format_reward": 0.9479166865348816, "rewards/sam_reward_func_ultra": 1.2487069964408875, "rewards/thk_ans_format_reward": 1.0, "step": 574, "think_completion_length": 14.958333333333332 }, { "clip_ratio": 0.0, "completion_length": 127.22917175292969, "epoch": 1.9409780775716694, "grad_norm": 17.936727518963696, "kl": 0.4462890625, "learning_rate": 8.381193693693693e-07, "loss": 0.0004, "reward": 3.063702344894409, "reward_std": 0.3580833673477173, "rewards/final_reward": 1.254804599144204, "rewards/mask_iou_reward": 0.627402299572102, "rewards/sam_format_reward": 0.90625, "rewards/sam_reward_func_ultra": 1.1574523150920868, "rewards/thk_ans_format_reward": 1.0, "step": 575, "think_completion_length": 17.0 }, { "clip_ratio": 0.0, "completion_length": 127.125, "epoch": 1.9443507588532882, "grad_norm": 9.001644590818499, "kl": 0.369140625, "learning_rate": 8.378378378378377e-07, "loss": 0.0004, "reward": 3.3422189950942993, "reward_std": 0.1806359961628914, "rewards/final_reward": 1.3741094642864033, "rewards/mask_iou_reward": 0.6870547321432017, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3422189950942993, "rewards/thk_ans_format_reward": 1.0, "step": 576, "think_completion_length": 16.25 }, { "clip_ratio": 0.0, "completion_length": 127.11458587646484, "epoch": 1.9477234401349073, "grad_norm": 6.693206561854366, "kl": 0.3984375, "learning_rate": 8.375563063063062e-07, "loss": 0.0004, "reward": 3.150846838951111, "reward_std": 0.23167786747217178, "rewards/final_reward": 1.1163596449199726, "rewards/mask_iou_reward": 0.5581798224599863, "rewards/sam_format_reward": 0.96875, "rewards/sam_reward_func_ultra": 1.182096779346466, "rewards/thk_ans_format_reward": 1.0, "step": 577, "think_completion_length": 16.041666666666664 }, { "clip_ratio": 0.0, "completion_length": 127.27083587646484, "epoch": 1.951096121416526, "grad_norm": 8.591423372545021, "kl": 0.4287109375, "learning_rate": 8.372747747747747e-07, "loss": 0.0004, "reward": 2.970146656036377, "reward_std": 0.18085888028144836, "rewards/final_reward": 0.30750705059321537, "rewards/mask_iou_reward": 0.15375352529660768, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.970146656036377, "rewards/thk_ans_format_reward": 1.0, "step": 578, "think_completion_length": 14.958333333333332 }, { "clip_ratio": 0.0, "completion_length": 128.98958587646484, "epoch": 1.9544688026981452, "grad_norm": 68.40018713629142, "kl": 0.408203125, "learning_rate": 8.369932432432431e-07, "loss": 0.0004, "reward": 3.5376436710357666, "reward_std": 0.07944156974554062, "rewards/final_reward": 1.7256402832301423, "rewards/mask_iou_reward": 0.8628201416150711, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5376437306404114, "rewards/thk_ans_format_reward": 1.0, "step": 579, "think_completion_length": 19.333333333333332 }, { "clip_ratio": 0.0, "completion_length": 126.56250381469727, "epoch": 1.957841483979764, "grad_norm": 14.388456789886524, "kl": 0.43359375, "learning_rate": 8.367117117117116e-07, "loss": 0.0004, "reward": 3.4017832279205322, "reward_std": 0.14799121022224426, "rewards/final_reward": 0.722340396741691, "rewards/mask_iou_reward": 0.3611701983708455, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.401783049106598, "rewards/thk_ans_format_reward": 1.0, "step": 580, "think_completion_length": 14.708333333333334 }, { "clip_ratio": 0.0, "completion_length": 127.58333587646484, "epoch": 1.9612141652613828, "grad_norm": 24.806741406182244, "kl": 0.4228515625, "learning_rate": 8.364301801801802e-07, "loss": 0.0004, "reward": 3.232166051864624, "reward_std": 0.05064544826745987, "rewards/final_reward": 0.9439742251988625, "rewards/mask_iou_reward": 0.47198711259943127, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2321660816669464, "rewards/thk_ans_format_reward": 1.0, "step": 581, "think_completion_length": 15.25 }, { "clip_ratio": 0.0, "completion_length": 133.06250381469727, "epoch": 1.9645868465430016, "grad_norm": 4.889973371546044, "kl": 0.6435546875, "learning_rate": 8.361486486486486e-07, "loss": 0.0006, "reward": 3.579144597053528, "reward_std": 0.09146898984909058, "rewards/final_reward": 1.687550136151648, "rewards/mask_iou_reward": 0.843775068075824, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.579144537448883, "rewards/thk_ans_format_reward": 1.0, "step": 582, "think_completion_length": 16.25 }, { "clip_ratio": 0.0, "completion_length": 125.56250381469727, "epoch": 1.9679595278246205, "grad_norm": 5.160628487146692, "kl": 0.4306640625, "learning_rate": 8.358671171171171e-07, "loss": 0.0004, "reward": 3.189295530319214, "reward_std": 0.21881115436553955, "rewards/final_reward": 1.0328786095998268, "rewards/mask_iou_reward": 0.5164393047999134, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1892955303192139, "rewards/thk_ans_format_reward": 1.0, "step": 583, "think_completion_length": 14.166666666666666 }, { "clip_ratio": 0.0, "completion_length": 124.47917175292969, "epoch": 1.9713322091062393, "grad_norm": 11.902639843711885, "kl": 0.384765625, "learning_rate": 8.355855855855855e-07, "loss": 0.0004, "reward": 2.815028190612793, "reward_std": 0.3476174771785736, "rewards/final_reward": 1.0217865616110187, "rewards/mask_iou_reward": 0.5108932808055093, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 0.8254446983337402, "rewards/thk_ans_format_reward": 1.0, "step": 584, "think_completion_length": 13.083333333333332 }, { "clip_ratio": 0.0, "completion_length": 126.875, "epoch": 1.9747048903878583, "grad_norm": 41.89633117528405, "kl": 0.4580078125, "learning_rate": 8.35304054054054e-07, "loss": 0.0005, "reward": 3.0252166986465454, "reward_std": 0.12799861282110214, "rewards/final_reward": 0.8591126518569622, "rewards/mask_iou_reward": 0.4295563259284811, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0252164900302887, "rewards/thk_ans_format_reward": 1.0, "step": 585, "think_completion_length": 14.041666666666668 }, { "clip_ratio": 0.0, "completion_length": 126.14583587646484, "epoch": 1.9780775716694774, "grad_norm": 8.279356260712635, "kl": 0.4189453125, "learning_rate": 8.350225225225225e-07, "loss": 0.0004, "reward": 3.1389033794403076, "reward_std": 0.17402781546115875, "rewards/final_reward": 0.7781741211554355, "rewards/mask_iou_reward": 0.38908706057771775, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1389034986495972, "rewards/thk_ans_format_reward": 1.0, "step": 586, "think_completion_length": 14.25 }, { "clip_ratio": 0.0, "completion_length": 130.3229217529297, "epoch": 1.9814502529510962, "grad_norm": 7.239676615842828, "kl": 0.421875, "learning_rate": 8.347409909909909e-07, "loss": 0.0004, "reward": 3.356380581855774, "reward_std": 0.1445971019566059, "rewards/final_reward": 1.6324843258097554, "rewards/mask_iou_reward": 0.8162421629048777, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.356380581855774, "rewards/thk_ans_format_reward": 1.0, "step": 587, "think_completion_length": 13.041666666666668 }, { "clip_ratio": 0.0, "completion_length": 124.72916793823242, "epoch": 1.984822934232715, "grad_norm": 6.624462864165187, "kl": 0.626953125, "learning_rate": 8.344594594594594e-07, "loss": 0.0006, "reward": 3.1781363487243652, "reward_std": 0.18783046305179596, "rewards/final_reward": 1.0586905247523806, "rewards/mask_iou_reward": 0.5293452623761903, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1781366467475891, "rewards/thk_ans_format_reward": 1.0, "step": 588, "think_completion_length": 14.083333333333334 }, { "clip_ratio": 0.0, "completion_length": 126.95833587646484, "epoch": 1.9881956155143339, "grad_norm": 6.657899573691945, "kl": 0.66796875, "learning_rate": 8.341779279279278e-07, "loss": 0.0007, "reward": 3.361886739730835, "reward_std": 0.10654079541563988, "rewards/final_reward": 1.6515764617191033, "rewards/mask_iou_reward": 0.8257882308595517, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.361886739730835, "rewards/thk_ans_format_reward": 1.0, "step": 589, "think_completion_length": 16.375 }, { "clip_ratio": 0.0, "completion_length": 117.50000381469727, "epoch": 1.9915682967959527, "grad_norm": 7.385702122693169, "kl": 0.4765625, "learning_rate": 8.338963963963963e-07, "loss": 0.0005, "reward": 3.151884913444519, "reward_std": 0.2869477644562721, "rewards/final_reward": 1.5238424558101884, "rewards/mask_iou_reward": 0.7619212279050942, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1518847942352295, "rewards/thk_ans_format_reward": 1.0, "step": 590, "think_completion_length": 16.0 }, { "clip_ratio": 0.0, "completion_length": 124.81250381469727, "epoch": 1.9949409780775715, "grad_norm": 14.355643385177684, "kl": 0.443359375, "learning_rate": 8.336148648648649e-07, "loss": 0.0004, "reward": 3.2977166175842285, "reward_std": 0.2152240127325058, "rewards/final_reward": 0.6568418408176202, "rewards/mask_iou_reward": 0.3284209204088101, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2977163195610046, "rewards/thk_ans_format_reward": 1.0, "step": 591, "think_completion_length": 16.666666666666668 }, { "clip_ratio": 0.0, "completion_length": 123.15789413452148, "epoch": 1.9983136593591906, "grad_norm": 10.702373666078783, "kl": 0.390625, "learning_rate": 8.333333333333333e-07, "loss": 0.0004, "reward": 3.004240393638611, "reward_std": 0.21375977247953415, "rewards/final_reward": 0.6922338710796716, "rewards/mask_iou_reward": 0.3461169355398358, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0042401552200317, "rewards/thk_ans_format_reward": 1.0, "step": 592, "think_completion_length": 13.5 }, { "clip_ratio": 0.0, "completion_length": 124.15625, "epoch": 2.003372681281619, "grad_norm": 7.316613913101059, "kl": 0.494140625, "learning_rate": 8.330518018018018e-07, "loss": 0.0005, "reward": 3.213071584701538, "reward_std": 0.21354631334543228, "rewards/final_reward": 1.085932180658682, "rewards/mask_iou_reward": 0.542966090329341, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.223488211631775, "rewards/thk_ans_format_reward": 0.9895833432674408, "step": 593, "think_completion_length": 10.666666666666666 }, { "clip_ratio": 0.0, "completion_length": 126.83333587646484, "epoch": 2.0067453625632377, "grad_norm": 17.99254075390951, "kl": 0.4375, "learning_rate": 8.327702702702703e-07, "loss": 0.0004, "reward": 3.254488706588745, "reward_std": 0.16734839975833893, "rewards/final_reward": 0.6797857897801465, "rewards/mask_iou_reward": 0.33989289489007324, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2544885873794556, "rewards/thk_ans_format_reward": 1.0, "step": 594, "think_completion_length": 12.041666666666666 }, { "clip_ratio": 0.0, "completion_length": 122.27083587646484, "epoch": 2.0101180438448565, "grad_norm": 6.0033516691706, "kl": 0.4580078125, "learning_rate": 8.324887387387387e-07, "loss": 0.0005, "reward": 3.2571709156036377, "reward_std": 0.15888730436563492, "rewards/final_reward": 1.4449123377187612, "rewards/mask_iou_reward": 0.7224561688593806, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2571707367897034, "rewards/thk_ans_format_reward": 1.0, "step": 595, "think_completion_length": 11.541666666666668 }, { "clip_ratio": 0.0, "completion_length": 122.65625381469727, "epoch": 2.0134907251264758, "grad_norm": 11.736645609380771, "kl": 0.427734375, "learning_rate": 8.322072072072072e-07, "loss": 0.0004, "reward": 3.4008623361587524, "reward_std": 0.19675985723733902, "rewards/final_reward": 1.419864947487815, "rewards/mask_iou_reward": 0.7099324737439076, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4008623361587524, "rewards/thk_ans_format_reward": 1.0, "step": 596, "think_completion_length": 10.333333333333332 }, { "clip_ratio": 0.0, "completion_length": 121.26041793823242, "epoch": 2.0168634064080946, "grad_norm": 7.651004215995592, "kl": 0.45703125, "learning_rate": 8.319256756756756e-07, "loss": 0.0005, "reward": 3.1595300436019897, "reward_std": 0.16874928027391434, "rewards/final_reward": 1.1975441458176574, "rewards/mask_iou_reward": 0.5987720729088287, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1595301032066345, "rewards/thk_ans_format_reward": 1.0, "step": 597, "think_completion_length": 9.416666666666668 }, { "clip_ratio": 0.0, "completion_length": 120.68750381469727, "epoch": 2.0202360876897134, "grad_norm": 7.692105896132088, "kl": 0.38671875, "learning_rate": 8.316441441441441e-07, "loss": 0.0004, "reward": 3.4031869173049927, "reward_std": 0.1947040967643261, "rewards/final_reward": 1.5671920196850815, "rewards/mask_iou_reward": 0.7835960098425407, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4031866192817688, "rewards/thk_ans_format_reward": 1.0, "step": 598, "think_completion_length": 11.416666666666668 }, { "clip_ratio": 0.0, "completion_length": 122.23958587646484, "epoch": 2.0236087689713322, "grad_norm": 5.199739352470058, "kl": 0.6396484375, "learning_rate": 8.313626126126126e-07, "loss": 0.0006, "reward": 3.4093559980392456, "reward_std": 0.10994856804609299, "rewards/final_reward": 0.9188244675614923, "rewards/mask_iou_reward": 0.45941223378074614, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4093559384346008, "rewards/thk_ans_format_reward": 1.0, "step": 599, "think_completion_length": 10.958333333333332 }, { "clip_ratio": 0.0, "completion_length": 110.23958587646484, "epoch": 2.026981450252951, "grad_norm": 8.866087972734299, "kl": 0.4931640625, "learning_rate": 8.31081081081081e-07, "loss": 0.0005, "reward": 2.955493450164795, "reward_std": 0.1533719301223755, "rewards/final_reward": 1.0310845691694952, "rewards/mask_iou_reward": 0.5155422845847476, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.955493301153183, "rewards/thk_ans_format_reward": 1.0, "step": 600, "think_completion_length": 11.0 }, { "clip_ratio": 0.0, "completion_length": 119.33333587646484, "epoch": 2.03035413153457, "grad_norm": 10.027714875616592, "kl": 0.484375, "learning_rate": 8.307995495495496e-07, "loss": 0.0005, "reward": 3.1500658988952637, "reward_std": 0.17688149958848953, "rewards/final_reward": 1.3310248751677642, "rewards/mask_iou_reward": 0.6655124375838821, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1500657796859741, "rewards/thk_ans_format_reward": 1.0, "step": 601, "think_completion_length": 9.208333333333332 }, { "clip_ratio": 0.0, "completion_length": 121.875, "epoch": 2.0337268128161887, "grad_norm": 15.868187188602219, "kl": 0.4130859375, "learning_rate": 8.30518018018018e-07, "loss": 0.0004, "reward": 3.543818950653076, "reward_std": 0.22326519712805748, "rewards/final_reward": 1.3907805272302842, "rewards/mask_iou_reward": 0.6953902636151421, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5438188910484314, "rewards/thk_ans_format_reward": 1.0, "step": 602, "think_completion_length": 9.625 }, { "clip_ratio": 0.0, "completion_length": 119.53125, "epoch": 2.0370994940978076, "grad_norm": 22.880355331905218, "kl": 0.4365234375, "learning_rate": 8.302364864864865e-07, "loss": 0.0004, "reward": 2.813183307647705, "reward_std": 0.23120269179344177, "rewards/final_reward": 1.3932862144098808, "rewards/mask_iou_reward": 0.6966431072049404, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 0.82359978556633, "rewards/thk_ans_format_reward": 1.0, "step": 603, "think_completion_length": 11.625 }, { "clip_ratio": 0.0, "completion_length": 121.14583587646484, "epoch": 2.040472175379427, "grad_norm": 52.891657910077754, "kl": 0.439453125, "learning_rate": 8.29954954954955e-07, "loss": 0.0004, "reward": 3.7011115550994873, "reward_std": 0.1626211702823639, "rewards/final_reward": 1.7655026499713529, "rewards/mask_iou_reward": 0.8827513249856764, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.701111614704132, "rewards/thk_ans_format_reward": 1.0, "step": 604, "think_completion_length": 10.75 }, { "clip_ratio": 0.0, "completion_length": 121.50000381469727, "epoch": 2.0438448566610457, "grad_norm": 5.637244830932424, "kl": 0.44921875, "learning_rate": 8.296734234234234e-07, "loss": 0.0005, "reward": 3.6091989278793335, "reward_std": 0.11021934449672699, "rewards/final_reward": 1.5831235540128628, "rewards/mask_iou_reward": 0.7915617770064314, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6091991662979126, "rewards/thk_ans_format_reward": 1.0, "step": 605, "think_completion_length": 11.166666666666668 }, { "clip_ratio": 0.0, "completion_length": 120.36458587646484, "epoch": 2.0472175379426645, "grad_norm": 27.7979225374541, "kl": 0.4326171875, "learning_rate": 8.293918918918919e-07, "loss": 0.0004, "reward": 3.490352988243103, "reward_std": 0.23794641345739365, "rewards/final_reward": 1.4083256625127278, "rewards/mask_iou_reward": 0.7041628312563639, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4903529286384583, "rewards/thk_ans_format_reward": 1.0, "step": 606, "think_completion_length": 8.583333333333334 }, { "clip_ratio": 0.0, "completion_length": 134.2604217529297, "epoch": 2.0505902192242833, "grad_norm": 10.320866046158933, "kl": 0.4775390625, "learning_rate": 8.291103603603603e-07, "loss": 0.0005, "reward": 3.5833810567855835, "reward_std": 0.2443862035870552, "rewards/final_reward": 1.7815358176192633, "rewards/mask_iou_reward": 0.8907679088096316, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5833812355995178, "rewards/thk_ans_format_reward": 1.0, "step": 607, "think_completion_length": 12.625 }, { "clip_ratio": 0.0, "completion_length": 120.50000381469727, "epoch": 2.053962900505902, "grad_norm": 21.575973813768552, "kl": 0.51171875, "learning_rate": 8.288288288288288e-07, "loss": 0.0005, "reward": 3.36086106300354, "reward_std": 0.09921130910515785, "rewards/final_reward": 1.5238539110371945, "rewards/mask_iou_reward": 0.7619269555185972, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3608611226081848, "rewards/thk_ans_format_reward": 1.0, "step": 608, "think_completion_length": 9.25 }, { "clip_ratio": 0.0, "completion_length": 144.36458587646484, "epoch": 2.057335581787521, "grad_norm": 7.857035715646524, "kl": 0.4697265625, "learning_rate": 8.285472972972973e-07, "loss": 0.0005, "reward": 3.3863420486450195, "reward_std": 0.13179394975304604, "rewards/final_reward": 1.3365266390593695, "rewards/mask_iou_reward": 0.6682633195296848, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 1.3967588543891907, "rewards/thk_ans_format_reward": 1.0, "step": 609, "think_completion_length": 10.625 }, { "clip_ratio": 0.0, "completion_length": 122.20833587646484, "epoch": 2.06070826306914, "grad_norm": 9.309948150573302, "kl": 0.412109375, "learning_rate": 8.282657657657657e-07, "loss": 0.0004, "reward": 2.7876734733581543, "reward_std": 0.17468656226992607, "rewards/final_reward": 1.1084424634770147, "rewards/mask_iou_reward": 0.5542212317385073, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.7876733839511871, "rewards/thk_ans_format_reward": 1.0, "step": 610, "think_completion_length": 12.25 }, { "clip_ratio": 0.0, "completion_length": 121.92708587646484, "epoch": 2.064080944350759, "grad_norm": 8.297823282134887, "kl": 0.5166015625, "learning_rate": 8.279842342342343e-07, "loss": 0.0005, "reward": 3.274766206741333, "reward_std": 0.2503800541162491, "rewards/final_reward": 1.1995371075953356, "rewards/mask_iou_reward": 0.5997685537976678, "rewards/sam_format_reward": 0.9791666865348816, "rewards/sam_reward_func_ultra": 1.295599639415741, "rewards/thk_ans_format_reward": 1.0, "step": 611, "think_completion_length": 10.125 }, { "clip_ratio": 0.0, "completion_length": 121.46875381469727, "epoch": 2.067453625632378, "grad_norm": 5.647774807748787, "kl": 0.455078125, "learning_rate": 8.277027027027028e-07, "loss": 0.0005, "reward": 3.3382933139801025, "reward_std": 0.06181446276605129, "rewards/final_reward": 0.7236724084096133, "rewards/mask_iou_reward": 0.36183620420480667, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.338293194770813, "rewards/thk_ans_format_reward": 1.0, "step": 612, "think_completion_length": 9.708333333333332 }, { "clip_ratio": 0.0, "completion_length": 121.80208587646484, "epoch": 2.0708263069139967, "grad_norm": 14.09217085577382, "kl": 0.46484375, "learning_rate": 8.274211711711711e-07, "loss": 0.0005, "reward": 3.541401505470276, "reward_std": 0.1674039326608181, "rewards/final_reward": 1.3222702404190154, "rewards/mask_iou_reward": 0.6611351202095077, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5414013266563416, "rewards/thk_ans_format_reward": 1.0, "step": 613, "think_completion_length": 9.875 }, { "clip_ratio": 0.0, "completion_length": 128.2083396911621, "epoch": 2.0741989881956155, "grad_norm": 7.905145281311906, "kl": 0.4130859375, "learning_rate": 8.271396396396396e-07, "loss": 0.0004, "reward": 3.4339908361434937, "reward_std": 0.08459173701703548, "rewards/final_reward": 1.4237471525821739, "rewards/mask_iou_reward": 0.7118735762910869, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.433990716934204, "rewards/thk_ans_format_reward": 1.0, "step": 614, "think_completion_length": 10.0 }, { "clip_ratio": 0.0, "completion_length": 121.38541793823242, "epoch": 2.0775716694772344, "grad_norm": 11.317119418959988, "kl": 0.5244140625, "learning_rate": 8.26858108108108e-07, "loss": 0.0005, "reward": 2.9590145349502563, "reward_std": 0.08396272733807564, "rewards/final_reward": 1.5951641649651038, "rewards/mask_iou_reward": 0.7975820824825519, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9590144753456116, "rewards/thk_ans_format_reward": 1.0, "step": 615, "think_completion_length": 9.833333333333332 }, { "clip_ratio": 0.0, "completion_length": 119.65625381469727, "epoch": 2.080944350758853, "grad_norm": 10.528441939049495, "kl": 0.4189453125, "learning_rate": 8.265765765765765e-07, "loss": 0.0004, "reward": 3.174362897872925, "reward_std": 0.20493387430906296, "rewards/final_reward": 1.32150395635473, "rewards/mask_iou_reward": 0.660751978177365, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1743630170822144, "rewards/thk_ans_format_reward": 1.0, "step": 616, "think_completion_length": 9.0 }, { "clip_ratio": 0.0, "completion_length": 110.5625, "epoch": 2.084317032040472, "grad_norm": 11.632868904279976, "kl": 0.548828125, "learning_rate": 8.26295045045045e-07, "loss": 0.0005, "reward": 3.5149197578430176, "reward_std": 0.16870852559804916, "rewards/final_reward": 1.1256652894876513, "rewards/mask_iou_reward": 0.5628326447438257, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5149198174476624, "rewards/thk_ans_format_reward": 1.0, "step": 617, "think_completion_length": 10.125 }, { "clip_ratio": 0.0, "completion_length": 119.04166793823242, "epoch": 2.087689713322091, "grad_norm": 10.672156817999449, "kl": 0.4765625, "learning_rate": 8.260135135135134e-07, "loss": 0.0005, "reward": 3.462290406227112, "reward_std": 0.1524660885334015, "rewards/final_reward": 1.6213807924322559, "rewards/mask_iou_reward": 0.8106903962161279, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4622901678085327, "rewards/thk_ans_format_reward": 1.0, "step": 618, "think_completion_length": 10.708333333333334 }, { "clip_ratio": 0.0, "completion_length": 114.40625381469727, "epoch": 2.09106239460371, "grad_norm": 9.171979331095862, "kl": 0.490234375, "learning_rate": 8.257319819819819e-07, "loss": 0.0005, "reward": 3.575721502304077, "reward_std": 0.22333138436079025, "rewards/final_reward": 1.589309503896279, "rewards/mask_iou_reward": 0.7946547519481395, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5757215023040771, "rewards/thk_ans_format_reward": 1.0, "step": 619, "think_completion_length": 9.041666666666668 }, { "clip_ratio": 0.0, "completion_length": 119.43750381469727, "epoch": 2.094435075885329, "grad_norm": 4.728161898826185, "kl": 0.44921875, "learning_rate": 8.254504504504503e-07, "loss": 0.0005, "reward": 3.340423822402954, "reward_std": 0.12725035846233368, "rewards/final_reward": 0.8856937987091074, "rewards/mask_iou_reward": 0.4428468993545537, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.340423822402954, "rewards/thk_ans_format_reward": 1.0, "step": 620, "think_completion_length": 9.458333333333332 }, { "clip_ratio": 0.0, "completion_length": 127.38541793823242, "epoch": 2.097807757166948, "grad_norm": 31.809054413592087, "kl": 0.4521484375, "learning_rate": 8.251689189189189e-07, "loss": 0.0005, "reward": 3.261618733406067, "reward_std": 0.12912706285715103, "rewards/final_reward": 1.4930522317336727, "rewards/mask_iou_reward": 0.7465261158668364, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2616186141967773, "rewards/thk_ans_format_reward": 1.0, "step": 621, "think_completion_length": 9.458333333333332 }, { "clip_ratio": 0.0, "completion_length": 121.32291793823242, "epoch": 2.1011804384485666, "grad_norm": 6.562821215019139, "kl": 0.5, "learning_rate": 8.248873873873874e-07, "loss": 0.0005, "reward": 3.1994651556015015, "reward_std": 0.17931604385375977, "rewards/final_reward": 0.9012642826978075, "rewards/mask_iou_reward": 0.4506321413489037, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1994652152061462, "rewards/thk_ans_format_reward": 1.0, "step": 622, "think_completion_length": 10.083333333333332 }, { "clip_ratio": 0.0, "completion_length": 126.11458969116211, "epoch": 2.1045531197301854, "grad_norm": 12.171223316148184, "kl": 0.53125, "learning_rate": 8.246058558558558e-07, "loss": 0.0005, "reward": 3.2846641540527344, "reward_std": 0.14336452260613441, "rewards/final_reward": 1.3247871014828667, "rewards/mask_iou_reward": 0.6623935507414334, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2846640944480896, "rewards/thk_ans_format_reward": 1.0, "step": 623, "think_completion_length": 8.791666666666666 }, { "clip_ratio": 0.0, "completion_length": 122.75000381469727, "epoch": 2.1079258010118043, "grad_norm": 16.407159118171702, "kl": 0.578125, "learning_rate": 8.243243243243243e-07, "loss": 0.0006, "reward": 3.310416102409363, "reward_std": 0.2252977043390274, "rewards/final_reward": 1.1116370296886442, "rewards/mask_iou_reward": 0.5558185148443221, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3104161024093628, "rewards/thk_ans_format_reward": 1.0, "step": 624, "think_completion_length": 8.833333333333332 }, { "clip_ratio": 0.0, "completion_length": 153.8333396911621, "epoch": 2.111298482293423, "grad_norm": 50.87161067879284, "kl": 0.40625, "learning_rate": 8.240427927927927e-07, "loss": 0.0004, "reward": 3.2098931074142456, "reward_std": 0.14654186181724072, "rewards/final_reward": 0.2948156663781496, "rewards/mask_iou_reward": 0.1474078331890748, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2098931670188904, "rewards/thk_ans_format_reward": 1.0, "step": 625, "think_completion_length": 9.708333333333332 }, { "clip_ratio": 0.0, "completion_length": 110.34375381469727, "epoch": 2.1146711635750424, "grad_norm": 72.77830962492465, "kl": 0.505859375, "learning_rate": 8.237612612612612e-07, "loss": 0.0005, "reward": 3.3907355070114136, "reward_std": 0.2131858579814434, "rewards/final_reward": 1.1081243696967564, "rewards/mask_iou_reward": 0.5540621848483782, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3907356262207031, "rewards/thk_ans_format_reward": 1.0, "step": 626, "think_completion_length": 9.291666666666668 }, { "clip_ratio": 0.0, "completion_length": 121.83333587646484, "epoch": 2.118043844856661, "grad_norm": 8.563624464427336, "kl": 0.4130859375, "learning_rate": 8.234797297297297e-07, "loss": 0.0005, "reward": 3.1916167736053467, "reward_std": 0.0674378052353859, "rewards/final_reward": 0.8132947407315055, "rewards/mask_iou_reward": 0.40664737036575277, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.191616952419281, "rewards/thk_ans_format_reward": 1.0, "step": 627, "think_completion_length": 9.583333333333332 }, { "clip_ratio": 0.0, "completion_length": 120.28125, "epoch": 2.12141652613828, "grad_norm": 18.233602877049936, "kl": 0.5576171875, "learning_rate": 8.231981981981981e-07, "loss": 0.0007, "reward": 3.689383387565613, "reward_std": 0.058890651911497116, "rewards/final_reward": 1.573615220073414, "rewards/mask_iou_reward": 0.786807610036707, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6893833875656128, "rewards/thk_ans_format_reward": 1.0, "step": 628, "think_completion_length": 8.166666666666668 }, { "clip_ratio": 0.0, "completion_length": 119.55208587646484, "epoch": 2.124789207419899, "grad_norm": 26.123304191153395, "kl": 0.4013671875, "learning_rate": 8.229166666666666e-07, "loss": 0.0004, "reward": 3.4777636528015137, "reward_std": 0.07786043360829353, "rewards/final_reward": 1.213893867253093, "rewards/mask_iou_reward": 0.6069469336265465, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4777635335922241, "rewards/thk_ans_format_reward": 1.0, "step": 629, "think_completion_length": 8.625 }, { "clip_ratio": 0.0, "completion_length": 120.04166793823242, "epoch": 2.1281618887015177, "grad_norm": 6.574647453197092, "kl": 0.439453125, "learning_rate": 8.22635135135135e-07, "loss": 0.0004, "reward": 3.1995784044265747, "reward_std": 0.11072956770658493, "rewards/final_reward": 1.289389255471192, "rewards/mask_iou_reward": 0.644694627735596, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1995784044265747, "rewards/thk_ans_format_reward": 1.0, "step": 630, "think_completion_length": 8.791666666666666 }, { "clip_ratio": 0.0, "completion_length": 119.27083587646484, "epoch": 2.1315345699831365, "grad_norm": 8.857344628430472, "kl": 0.5341796875, "learning_rate": 8.223536036036036e-07, "loss": 0.0005, "reward": 3.105801820755005, "reward_std": 0.08226869069039822, "rewards/final_reward": 1.1846171339453866, "rewards/mask_iou_reward": 0.5923085669726933, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1058015823364258, "rewards/thk_ans_format_reward": 1.0, "step": 631, "think_completion_length": 8.75 }, { "clip_ratio": 0.0, "completion_length": 125.80208587646484, "epoch": 2.1349072512647553, "grad_norm": 6.608877064688749, "kl": 0.4501953125, "learning_rate": 8.220720720720721e-07, "loss": 0.0005, "reward": 3.033010244369507, "reward_std": 0.23232241719961166, "rewards/final_reward": 0.7721974526983842, "rewards/mask_iou_reward": 0.3860987263491921, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0330101251602173, "rewards/thk_ans_format_reward": 1.0, "step": 632, "think_completion_length": 9.125 }, { "clip_ratio": 0.0, "completion_length": 121.82292175292969, "epoch": 2.138279932546374, "grad_norm": 26.31636866542848, "kl": 0.48828125, "learning_rate": 8.217905405405405e-07, "loss": 0.0005, "reward": 3.6307315826416016, "reward_std": 0.11909966915845871, "rewards/final_reward": 1.1624019924863673, "rewards/mask_iou_reward": 0.5812009962431837, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6307316422462463, "rewards/thk_ans_format_reward": 1.0, "step": 633, "think_completion_length": 10.666666666666666 }, { "clip_ratio": 0.0, "completion_length": 135.64583587646484, "epoch": 2.1416526138279934, "grad_norm": 6.46074500536475, "kl": 0.49609375, "learning_rate": 8.21509009009009e-07, "loss": 0.0005, "reward": 3.474371314048767, "reward_std": 0.0844818763434887, "rewards/final_reward": 0.9150024138423042, "rewards/mask_iou_reward": 0.4575012069211521, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4743713736534119, "rewards/thk_ans_format_reward": 1.0, "step": 634, "think_completion_length": 10.291666666666668 }, { "clip_ratio": 0.0, "completion_length": 119.84375381469727, "epoch": 2.1450252951096123, "grad_norm": 9.554990225390688, "kl": 0.4521484375, "learning_rate": 8.212274774774775e-07, "loss": 0.0005, "reward": 3.6118842363357544, "reward_std": 0.07224077731370926, "rewards/final_reward": 1.7452657955108715, "rewards/mask_iou_reward": 0.8726328977554357, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6118841767311096, "rewards/thk_ans_format_reward": 1.0, "step": 635, "think_completion_length": 8.625 }, { "clip_ratio": 0.0, "completion_length": 120.91667175292969, "epoch": 2.148397976391231, "grad_norm": 57.03052652676022, "kl": 0.435546875, "learning_rate": 8.209459459459459e-07, "loss": 0.0004, "reward": 3.4605711698532104, "reward_std": 0.14525556564331055, "rewards/final_reward": 1.7284143941174483, "rewards/mask_iou_reward": 0.8642071970587242, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4605711102485657, "rewards/thk_ans_format_reward": 1.0, "step": 636, "think_completion_length": 9.833333333333332 }, { "clip_ratio": 0.0, "completion_length": 119.6875, "epoch": 2.15177065767285, "grad_norm": 11.390704504352188, "kl": 0.4794921875, "learning_rate": 8.206644144144144e-07, "loss": 0.0005, "reward": 3.2578898668289185, "reward_std": 0.07665400765836239, "rewards/final_reward": 1.8478282830408852, "rewards/mask_iou_reward": 0.9239141415204426, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2578898072242737, "rewards/thk_ans_format_reward": 1.0, "step": 637, "think_completion_length": 8.041666666666666 }, { "clip_ratio": 0.0, "completion_length": 121.13542175292969, "epoch": 2.1551433389544687, "grad_norm": 10.79482036058893, "kl": 0.4150390625, "learning_rate": 8.203828828828828e-07, "loss": 0.0004, "reward": 3.455119490623474, "reward_std": 0.12551749870181084, "rewards/final_reward": 1.6525657013001847, "rewards/mask_iou_reward": 0.8262828506500923, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4551194906234741, "rewards/thk_ans_format_reward": 1.0, "step": 638, "think_completion_length": 7.708333333333334 }, { "clip_ratio": 0.0, "completion_length": 136.01041793823242, "epoch": 2.1585160202360876, "grad_norm": 12.000131006840634, "kl": 0.5625, "learning_rate": 8.201013513513513e-07, "loss": 0.0006, "reward": 3.327172040939331, "reward_std": 0.1687596347182989, "rewards/final_reward": 1.7966222098178166, "rewards/mask_iou_reward": 0.8983111049089083, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3271721005439758, "rewards/thk_ans_format_reward": 1.0, "step": 639, "think_completion_length": 10.0 }, { "clip_ratio": 0.0, "completion_length": 120.59375381469727, "epoch": 2.1618887015177064, "grad_norm": 11.094244556452955, "kl": 0.46875, "learning_rate": 8.198198198198198e-07, "loss": 0.0005, "reward": 3.408894658088684, "reward_std": 0.08874082565307617, "rewards/final_reward": 1.6323091881331337, "rewards/mask_iou_reward": 0.8161545940665669, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4088948369026184, "rewards/thk_ans_format_reward": 1.0, "step": 640, "think_completion_length": 8.041666666666668 }, { "clip_ratio": 0.0, "completion_length": 121.38541793823242, "epoch": 2.1652613827993257, "grad_norm": 8.392822333817206, "kl": 0.4453125, "learning_rate": 8.195382882882883e-07, "loss": 0.0004, "reward": 3.328891634941101, "reward_std": 0.14553168416023254, "rewards/final_reward": 1.6706313970049362, "rewards/mask_iou_reward": 0.8353156985024681, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.328891634941101, "rewards/thk_ans_format_reward": 1.0, "step": 641, "think_completion_length": 9.5 }, { "clip_ratio": 0.0, "completion_length": 126.125, "epoch": 2.1686340640809445, "grad_norm": 8.583111088104932, "kl": 0.43359375, "learning_rate": 8.192567567567568e-07, "loss": 0.0004, "reward": 3.2445223331451416, "reward_std": 0.10854201018810272, "rewards/final_reward": 1.436967285600808, "rewards/mask_iou_reward": 0.718483642800404, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2445223331451416, "rewards/thk_ans_format_reward": 1.0, "step": 642, "think_completion_length": 7.75 }, { "clip_ratio": 0.0, "completion_length": 136.78125381469727, "epoch": 2.1720067453625633, "grad_norm": 10.30401824580383, "kl": 0.4208984375, "learning_rate": 8.189752252252252e-07, "loss": 0.0004, "reward": 3.26160991191864, "reward_std": 0.22055093199014664, "rewards/final_reward": 0.8553469935214908, "rewards/mask_iou_reward": 0.4276734967607454, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2616099119186401, "rewards/thk_ans_format_reward": 1.0, "step": 643, "think_completion_length": 9.708333333333332 }, { "clip_ratio": 0.0, "completion_length": 124.36458587646484, "epoch": 2.175379426644182, "grad_norm": 295.59265233358633, "kl": 0.6044921875, "learning_rate": 8.186936936936937e-07, "loss": 0.0006, "reward": 3.2367950677871704, "reward_std": 0.13342823646962643, "rewards/final_reward": 1.6191268212909824, "rewards/mask_iou_reward": 0.8095634106454912, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2367949485778809, "rewards/thk_ans_format_reward": 1.0, "step": 644, "think_completion_length": 8.625 }, { "clip_ratio": 0.0, "completion_length": 115.52083969116211, "epoch": 2.178752107925801, "grad_norm": 7.906168172774929, "kl": 0.4853515625, "learning_rate": 8.184121621621622e-07, "loss": 0.0005, "reward": 3.4528859853744507, "reward_std": 0.19831737503409386, "rewards/final_reward": 0.8998141944664336, "rewards/mask_iou_reward": 0.4499070972332168, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4528858661651611, "rewards/thk_ans_format_reward": 1.0, "step": 645, "think_completion_length": 9.541666666666668 }, { "clip_ratio": 0.0, "completion_length": 127.00000762939453, "epoch": 2.18212478920742, "grad_norm": 7.826136559641617, "kl": 0.46875, "learning_rate": 8.181306306306306e-07, "loss": 0.0005, "reward": 3.1739590167999268, "reward_std": 0.18386272341012955, "rewards/final_reward": 1.6549560272173047, "rewards/mask_iou_reward": 0.8274780136086524, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1739588975906372, "rewards/thk_ans_format_reward": 1.0, "step": 646, "think_completion_length": 9.333333333333332 }, { "clip_ratio": 0.0, "completion_length": 117.92708587646484, "epoch": 2.1854974704890386, "grad_norm": 16.502530928602464, "kl": 0.4208984375, "learning_rate": 8.178490990990991e-07, "loss": 0.0004, "reward": 3.230895757675171, "reward_std": 0.20900658890604973, "rewards/final_reward": 0.8469308443974172, "rewards/mask_iou_reward": 0.4234654221987086, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2308956980705261, "rewards/thk_ans_format_reward": 1.0, "step": 647, "think_completion_length": 9.583333333333332 }, { "clip_ratio": 0.0, "completion_length": 130.30208587646484, "epoch": 2.1888701517706575, "grad_norm": 19.93251577727481, "kl": 0.73046875, "learning_rate": 8.175675675675676e-07, "loss": 0.0007, "reward": 3.41804575920105, "reward_std": 0.21081428974866867, "rewards/final_reward": 1.8266925880479017, "rewards/mask_iou_reward": 0.9133462940239508, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.418045699596405, "rewards/thk_ans_format_reward": 1.0, "step": 648, "think_completion_length": 9.125 }, { "clip_ratio": 0.0, "completion_length": 121.80208587646484, "epoch": 2.1922428330522767, "grad_norm": 44.50576070941139, "kl": 0.421875, "learning_rate": 8.17286036036036e-07, "loss": 0.0005, "reward": 3.07036817073822, "reward_std": 0.10139280930161476, "rewards/final_reward": 0.8796635025441313, "rewards/mask_iou_reward": 0.43983175127206564, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.070368230342865, "rewards/thk_ans_format_reward": 1.0, "step": 649, "think_completion_length": 9.625 }, { "clip_ratio": 0.0, "completion_length": 132.70833587646484, "epoch": 2.1956155143338956, "grad_norm": 6.518147187815124, "kl": 0.470703125, "learning_rate": 8.170045045045045e-07, "loss": 0.0005, "reward": 3.2474400997161865, "reward_std": 0.13931379839777946, "rewards/final_reward": 1.875586365490201, "rewards/mask_iou_reward": 0.9377931827451005, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.247439980506897, "rewards/thk_ans_format_reward": 1.0, "step": 650, "think_completion_length": 9.875 }, { "clip_ratio": 0.0, "completion_length": 108.06250381469727, "epoch": 2.1989881956155144, "grad_norm": 13.896457174710385, "kl": 0.521484375, "learning_rate": 8.16722972972973e-07, "loss": 0.0006, "reward": 3.6261767148971558, "reward_std": 0.1183246560394764, "rewards/final_reward": 1.7615443987622346, "rewards/mask_iou_reward": 0.8807721993811173, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6261768341064453, "rewards/thk_ans_format_reward": 1.0, "step": 651, "think_completion_length": 9.833333333333332 }, { "clip_ratio": 0.0, "completion_length": 120.78125381469727, "epoch": 2.2023608768971332, "grad_norm": 7.521553036506293, "kl": 0.509765625, "learning_rate": 8.164414414414414e-07, "loss": 0.0005, "reward": 3.126633405685425, "reward_std": 0.1798637956380844, "rewards/final_reward": 1.4663160227039, "rewards/mask_iou_reward": 0.73315801135195, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1266332566738129, "rewards/thk_ans_format_reward": 1.0, "step": 652, "think_completion_length": 10.666666666666666 }, { "clip_ratio": 0.0, "completion_length": 110.69791793823242, "epoch": 2.205733558178752, "grad_norm": 9.400018448545131, "kl": 0.482421875, "learning_rate": 8.161599099099099e-07, "loss": 0.0005, "reward": 3.5450875759124756, "reward_std": 0.09950285963714123, "rewards/final_reward": 1.4591154590757514, "rewards/mask_iou_reward": 0.7295577295378757, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.545087456703186, "rewards/thk_ans_format_reward": 1.0, "step": 653, "think_completion_length": 9.083333333333332 }, { "clip_ratio": 0.0, "completion_length": 128.13541793823242, "epoch": 2.209106239460371, "grad_norm": 16.586317511471435, "kl": 0.443359375, "learning_rate": 8.158783783783783e-07, "loss": 0.0004, "reward": 3.3824548721313477, "reward_std": 0.11548849008977413, "rewards/final_reward": 0.9872206684867463, "rewards/mask_iou_reward": 0.49361033424337314, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.382454752922058, "rewards/thk_ans_format_reward": 1.0, "step": 654, "think_completion_length": 9.041666666666668 }, { "clip_ratio": 0.0, "completion_length": 134.43750381469727, "epoch": 2.2124789207419897, "grad_norm": 15.711839076861132, "kl": 1.1123046875, "learning_rate": 8.155968468468468e-07, "loss": 0.0011, "reward": 3.5824573040008545, "reward_std": 0.1437189131975174, "rewards/final_reward": 1.5714705673789258, "rewards/mask_iou_reward": 0.7857352836894629, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5824573040008545, "rewards/thk_ans_format_reward": 1.0, "step": 655, "think_completion_length": 9.208333333333334 }, { "clip_ratio": 0.0, "completion_length": 109.60417175292969, "epoch": 2.2158516020236085, "grad_norm": 7.4396341779376405, "kl": 0.583984375, "learning_rate": 8.153153153153152e-07, "loss": 0.0006, "reward": 3.394721031188965, "reward_std": 0.18189126066863537, "rewards/final_reward": 1.6694186379997311, "rewards/mask_iou_reward": 0.8347093189998656, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3947210907936096, "rewards/thk_ans_format_reward": 1.0, "step": 656, "think_completion_length": 11.291666666666668 }, { "clip_ratio": 0.0, "completion_length": 122.35417175292969, "epoch": 2.219224283305228, "grad_norm": 14.309370554809359, "kl": 0.4638671875, "learning_rate": 8.150337837837837e-07, "loss": 0.0005, "reward": 3.1339633464813232, "reward_std": 0.17729290574789047, "rewards/final_reward": 1.0959308709827407, "rewards/mask_iou_reward": 0.5479654354913703, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1339632868766785, "rewards/thk_ans_format_reward": 1.0, "step": 657, "think_completion_length": 8.791666666666666 }, { "clip_ratio": 0.0, "completion_length": 138.1770896911621, "epoch": 2.2225969645868466, "grad_norm": 4.732564548831271, "kl": 0.4404296875, "learning_rate": 8.147522522522522e-07, "loss": 0.0004, "reward": 3.3023016452789307, "reward_std": 0.12397704645991325, "rewards/final_reward": 1.0975228223193203, "rewards/mask_iou_reward": 0.5487614111596602, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.302301824092865, "rewards/thk_ans_format_reward": 1.0, "step": 658, "think_completion_length": 9.833333333333334 }, { "clip_ratio": 0.0, "completion_length": 132.00000381469727, "epoch": 2.2259696458684655, "grad_norm": 12.362785143592603, "kl": 0.447265625, "learning_rate": 8.144707207207206e-07, "loss": 0.0004, "reward": 3.2936304807662964, "reward_std": 0.12760978937149048, "rewards/final_reward": 0.6488444817317736, "rewards/mask_iou_reward": 0.3244222408658868, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2936302423477173, "rewards/thk_ans_format_reward": 1.0, "step": 659, "think_completion_length": 11.375 }, { "clip_ratio": 0.0, "completion_length": 118.45833587646484, "epoch": 2.2293423271500843, "grad_norm": 8.083525219026132, "kl": 0.4404296875, "learning_rate": 8.141891891891891e-07, "loss": 0.0005, "reward": 3.3997479677200317, "reward_std": 0.2194822132587433, "rewards/final_reward": 1.5548540385668563, "rewards/mask_iou_reward": 0.7774270192834282, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3997478485107422, "rewards/thk_ans_format_reward": 1.0, "step": 660, "think_completion_length": 10.291666666666668 }, { "clip_ratio": 0.0, "completion_length": 122.12500381469727, "epoch": 2.232715008431703, "grad_norm": 7.8635506669881465, "kl": 0.4794921875, "learning_rate": 8.139076576576576e-07, "loss": 0.0005, "reward": 3.634366989135742, "reward_std": 0.11458337679505348, "rewards/final_reward": 1.6881464055982023, "rewards/mask_iou_reward": 0.8440732027991011, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6343669295310974, "rewards/thk_ans_format_reward": 1.0, "step": 661, "think_completion_length": 11.0 }, { "clip_ratio": 0.0, "completion_length": 121.00000381469727, "epoch": 2.236087689713322, "grad_norm": 9.681507857307889, "kl": 0.412109375, "learning_rate": 8.136261261261261e-07, "loss": 0.0004, "reward": 3.29227614402771, "reward_std": 0.09453720226883888, "rewards/final_reward": 1.3683644280381078, "rewards/mask_iou_reward": 0.6841822140190539, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.29227614402771, "rewards/thk_ans_format_reward": 1.0, "step": 662, "think_completion_length": 9.416666666666666 }, { "clip_ratio": 0.0, "completion_length": 121.93750381469727, "epoch": 2.2394603709949408, "grad_norm": 7.5674477129734745, "kl": 0.392578125, "learning_rate": 8.133445945945946e-07, "loss": 0.0004, "reward": 3.743380904197693, "reward_std": 0.015893162926658988, "rewards/final_reward": 1.5229256954080168, "rewards/mask_iou_reward": 0.7614628477040084, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7433809041976929, "rewards/thk_ans_format_reward": 1.0, "step": 663, "think_completion_length": 10.291666666666666 }, { "clip_ratio": 0.0, "completion_length": 121.73958587646484, "epoch": 2.24283305227656, "grad_norm": 17.68664735167123, "kl": 0.4287109375, "learning_rate": 8.13063063063063e-07, "loss": 0.0004, "reward": 3.303257942199707, "reward_std": 0.18324602022767067, "rewards/final_reward": 1.7402201218143234, "rewards/mask_iou_reward": 0.8701100609071617, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3032580018043518, "rewards/thk_ans_format_reward": 1.0, "step": 664, "think_completion_length": 8.958333333333332 }, { "clip_ratio": 0.0, "completion_length": 120.42708587646484, "epoch": 2.246205733558179, "grad_norm": 6.916780449362966, "kl": 0.42578125, "learning_rate": 8.127815315315315e-07, "loss": 0.0004, "reward": 3.1752452850341797, "reward_std": 0.09016630239784718, "rewards/final_reward": 0.7180743257293005, "rewards/mask_iou_reward": 0.35903716286465026, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.175245225429535, "rewards/thk_ans_format_reward": 1.0, "step": 665, "think_completion_length": 10.0 }, { "clip_ratio": 0.0, "completion_length": 122.44792175292969, "epoch": 2.2495784148397977, "grad_norm": 16.657736889436094, "kl": 0.44921875, "learning_rate": 8.125e-07, "loss": 0.0005, "reward": 3.4963905811309814, "reward_std": 0.1357239931821823, "rewards/final_reward": 1.911610818305828, "rewards/mask_iou_reward": 0.955805409152914, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4963907599449158, "rewards/thk_ans_format_reward": 1.0, "step": 666, "think_completion_length": 13.5 }, { "clip_ratio": 0.0, "completion_length": 122.72917175292969, "epoch": 2.2529510961214165, "grad_norm": 15.76213501616429, "kl": 0.3994140625, "learning_rate": 8.122184684684684e-07, "loss": 0.0004, "reward": 3.174826979637146, "reward_std": 0.10770581662654877, "rewards/final_reward": 1.3201200373647521, "rewards/mask_iou_reward": 0.6600600186823761, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1748269498348236, "rewards/thk_ans_format_reward": 1.0, "step": 667, "think_completion_length": 11.708333333333334 }, { "clip_ratio": 0.0, "completion_length": 123.6875, "epoch": 2.2563237774030354, "grad_norm": 13.227401276492902, "kl": 0.421875, "learning_rate": 8.119369369369369e-07, "loss": 0.0004, "reward": 3.2839311361312866, "reward_std": 0.08108958974480629, "rewards/final_reward": 1.5689729594727198, "rewards/mask_iou_reward": 0.7844864797363599, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2839311361312866, "rewards/thk_ans_format_reward": 1.0, "step": 668, "think_completion_length": 10.333333333333334 }, { "clip_ratio": 0.0, "completion_length": 123.15625, "epoch": 2.259696458684654, "grad_norm": 10.326067198476693, "kl": 0.490234375, "learning_rate": 8.116554054054053e-07, "loss": 0.0005, "reward": 3.206782102584839, "reward_std": 0.1343858316540718, "rewards/final_reward": 1.233378810524416, "rewards/mask_iou_reward": 0.616689405262208, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2067819237709045, "rewards/thk_ans_format_reward": 1.0, "step": 669, "think_completion_length": 10.166666666666668 }, { "clip_ratio": 0.0, "completion_length": 119.67708587646484, "epoch": 2.263069139966273, "grad_norm": 14.823830916830676, "kl": 0.45703125, "learning_rate": 8.113738738738738e-07, "loss": 0.0005, "reward": 3.258223295211792, "reward_std": 0.10940613597631454, "rewards/final_reward": 1.0222389748364655, "rewards/mask_iou_reward": 0.5111194874182328, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.258223056793213, "rewards/thk_ans_format_reward": 1.0, "step": 670, "think_completion_length": 9.125 }, { "clip_ratio": 0.0, "completion_length": 121.87500381469727, "epoch": 2.2664418212478923, "grad_norm": 14.677750769066721, "kl": 0.388671875, "learning_rate": 8.110923423423423e-07, "loss": 0.0004, "reward": 3.5307780504226685, "reward_std": 0.11446313932538033, "rewards/final_reward": 1.7383022024633599, "rewards/mask_iou_reward": 0.8691511012316799, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5307780504226685, "rewards/thk_ans_format_reward": 1.0, "step": 671, "think_completion_length": 10.0 }, { "clip_ratio": 0.0, "completion_length": 125.89583969116211, "epoch": 2.269814502529511, "grad_norm": 10.298976393267948, "kl": 0.4404296875, "learning_rate": 8.108108108108108e-07, "loss": 0.0004, "reward": 3.189149498939514, "reward_std": 0.12401114031672478, "rewards/final_reward": 1.5704407655379913, "rewards/mask_iou_reward": 0.7852203827689956, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1891493201255798, "rewards/thk_ans_format_reward": 1.0, "step": 672, "think_completion_length": 10.916666666666668 }, { "clip_ratio": 0.0, "completion_length": 124.30208587646484, "epoch": 2.27318718381113, "grad_norm": 11.436957821981954, "kl": 0.4765625, "learning_rate": 8.105292792792793e-07, "loss": 0.0005, "reward": 3.208579659461975, "reward_std": 0.11934243142604828, "rewards/final_reward": 1.6359468909472215, "rewards/mask_iou_reward": 0.8179734454736107, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2085795402526855, "rewards/thk_ans_format_reward": 1.0, "step": 673, "think_completion_length": 10.791666666666666 }, { "clip_ratio": 0.0, "completion_length": 118.53125, "epoch": 2.2765598650927488, "grad_norm": 12.15331139839109, "kl": 0.5390625, "learning_rate": 8.102477477477477e-07, "loss": 0.0005, "reward": 3.4956218004226685, "reward_std": 0.12992879003286362, "rewards/final_reward": 1.7897713668932158, "rewards/mask_iou_reward": 0.8948856834466079, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4956215023994446, "rewards/thk_ans_format_reward": 1.0, "step": 674, "think_completion_length": 9.333333333333332 }, { "clip_ratio": 0.0, "completion_length": 121.45833587646484, "epoch": 2.2799325463743676, "grad_norm": 9.99176569694546, "kl": 0.412109375, "learning_rate": 8.099662162162162e-07, "loss": 0.0004, "reward": 3.268976926803589, "reward_std": 0.25114165246486664, "rewards/final_reward": 1.6725001930072043, "rewards/mask_iou_reward": 0.8362500965036022, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2689767479896545, "rewards/thk_ans_format_reward": 1.0, "step": 675, "think_completion_length": 9.541666666666668 }, { "clip_ratio": 0.0, "completion_length": 121.01042175292969, "epoch": 2.2833052276559864, "grad_norm": 10.650917935298724, "kl": 0.419921875, "learning_rate": 8.096846846846847e-07, "loss": 0.0004, "reward": 3.4945836067199707, "reward_std": 0.11074310541152954, "rewards/final_reward": 1.5783195531313818, "rewards/mask_iou_reward": 0.7891597765656909, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4945836067199707, "rewards/thk_ans_format_reward": 1.0, "step": 676, "think_completion_length": 10.583333333333332 }, { "clip_ratio": 0.0, "completion_length": 115.09375, "epoch": 2.2866779089376053, "grad_norm": 8.477878692957459, "kl": 0.494140625, "learning_rate": 8.094031531531531e-07, "loss": 0.0005, "reward": 3.306473970413208, "reward_std": 0.20133724063634872, "rewards/final_reward": 1.4288747373343276, "rewards/mask_iou_reward": 0.7144373686671638, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3064739108085632, "rewards/thk_ans_format_reward": 1.0, "step": 677, "think_completion_length": 11.333333333333332 }, { "clip_ratio": 0.0, "completion_length": 123.67708587646484, "epoch": 2.2900505902192245, "grad_norm": 7.188209827485836, "kl": 0.36328125, "learning_rate": 8.091216216216216e-07, "loss": 0.0004, "reward": 3.555638909339905, "reward_std": 0.12217172980308533, "rewards/final_reward": 1.5055865595552458, "rewards/mask_iou_reward": 0.7527932797776229, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5556389093399048, "rewards/thk_ans_format_reward": 1.0, "step": 678, "think_completion_length": 8.333333333333332 }, { "clip_ratio": 0.0, "completion_length": 125.82291793823242, "epoch": 2.2934232715008434, "grad_norm": 9.750461972160247, "kl": 0.4072265625, "learning_rate": 8.0884009009009e-07, "loss": 0.0004, "reward": 3.2604702711105347, "reward_std": 0.09934688359498978, "rewards/final_reward": 0.9451985545989782, "rewards/mask_iou_reward": 0.4725992772994891, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2604702711105347, "rewards/thk_ans_format_reward": 1.0, "step": 679, "think_completion_length": 9.833333333333332 }, { "clip_ratio": 0.0, "completion_length": 120.27083587646484, "epoch": 2.296795952782462, "grad_norm": 15.365217581053004, "kl": 0.59375, "learning_rate": 8.085585585585585e-07, "loss": 0.0006, "reward": 3.0533676147460938, "reward_std": 0.16271667182445526, "rewards/final_reward": 0.4927129483488749, "rewards/mask_iou_reward": 0.24635647417443746, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0533676147460938, "rewards/thk_ans_format_reward": 1.0, "step": 680, "think_completion_length": 10.666666666666668 }, { "clip_ratio": 0.0, "completion_length": 120.30208587646484, "epoch": 2.300168634064081, "grad_norm": 9.802254184837405, "kl": 0.447265625, "learning_rate": 8.08277027027027e-07, "loss": 0.0004, "reward": 3.028236746788025, "reward_std": 0.21147775277495384, "rewards/final_reward": 0.3851396040127628, "rewards/mask_iou_reward": 0.1925698020063814, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0282368659973145, "rewards/thk_ans_format_reward": 1.0, "step": 681, "think_completion_length": 9.541666666666668 }, { "clip_ratio": 0.0, "completion_length": 126.79167175292969, "epoch": 2.3035413153457, "grad_norm": 14.761547115745145, "kl": 0.53125, "learning_rate": 8.079954954954955e-07, "loss": 0.0005, "reward": 3.521707773208618, "reward_std": 0.12188014201819897, "rewards/final_reward": 1.7237706323114415, "rewards/mask_iou_reward": 0.8618853161557207, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5217076539993286, "rewards/thk_ans_format_reward": 1.0, "step": 682, "think_completion_length": 9.208333333333332 }, { "clip_ratio": 0.0, "completion_length": 135.53125, "epoch": 2.3069139966273187, "grad_norm": 9.288430390244054, "kl": 0.3671875, "learning_rate": 8.07713963963964e-07, "loss": 0.0004, "reward": 3.4879062175750732, "reward_std": 0.22596611082553864, "rewards/final_reward": 1.8825232373395848, "rewards/mask_iou_reward": 0.9412616186697924, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.487906038761139, "rewards/thk_ans_format_reward": 1.0, "step": 683, "think_completion_length": 9.333333333333332 }, { "clip_ratio": 0.0, "completion_length": 120.36458587646484, "epoch": 2.3102866779089375, "grad_norm": 14.599410626383364, "kl": 0.4267578125, "learning_rate": 8.074324324324325e-07, "loss": 0.0004, "reward": 3.4649771451950073, "reward_std": 0.11124672368168831, "rewards/final_reward": 1.5080582017135327, "rewards/mask_iou_reward": 0.7540291008567663, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4649773240089417, "rewards/thk_ans_format_reward": 1.0, "step": 684, "think_completion_length": 10.208333333333334 }, { "clip_ratio": 0.0, "completion_length": 121.20833587646484, "epoch": 2.3136593591905563, "grad_norm": 10.06748944006634, "kl": 0.4892578125, "learning_rate": 8.071509009009009e-07, "loss": 0.0005, "reward": 3.408280611038208, "reward_std": 0.09913060441613197, "rewards/final_reward": 1.7207976126492222, "rewards/mask_iou_reward": 0.8603988063246111, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4082804918289185, "rewards/thk_ans_format_reward": 1.0, "step": 685, "think_completion_length": 8.625 }, { "clip_ratio": 0.0, "completion_length": 125.02083969116211, "epoch": 2.317032040472175, "grad_norm": 7.514829742049848, "kl": 0.4521484375, "learning_rate": 8.068693693693694e-07, "loss": 0.0005, "reward": 3.2785524129867554, "reward_std": 0.02402611169964075, "rewards/final_reward": 1.5484094219072886, "rewards/mask_iou_reward": 0.7742047109536443, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.278552532196045, "rewards/thk_ans_format_reward": 1.0, "step": 686, "think_completion_length": 8.666666666666668 }, { "clip_ratio": 0.0, "completion_length": 131.56250762939453, "epoch": 2.3204047217537944, "grad_norm": 9.801115745438581, "kl": 0.484375, "learning_rate": 8.065878378378378e-07, "loss": 0.0005, "reward": 2.937888503074646, "reward_std": 0.13681496307253838, "rewards/final_reward": 0.39095593049748123, "rewards/mask_iou_reward": 0.19547796524874061, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9378882646560669, "rewards/thk_ans_format_reward": 1.0, "step": 687, "think_completion_length": 8.916666666666668 }, { "clip_ratio": 0.0, "completion_length": 120.85416793823242, "epoch": 2.3237774030354132, "grad_norm": 8.040804487614768, "kl": 0.451171875, "learning_rate": 8.063063063063063e-07, "loss": 0.0005, "reward": 3.6306684017181396, "reward_std": 0.046969225630164146, "rewards/final_reward": 1.8572513075997634, "rewards/mask_iou_reward": 0.9286256537998817, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.63066828250885, "rewards/thk_ans_format_reward": 1.0, "step": 688, "think_completion_length": 10.166666666666668 }, { "clip_ratio": 0.0, "completion_length": 122.14583587646484, "epoch": 2.327150084317032, "grad_norm": 8.439382052771062, "kl": 0.4775390625, "learning_rate": 8.060247747747748e-07, "loss": 0.0005, "reward": 3.4388445615768433, "reward_std": 0.14623238891363144, "rewards/final_reward": 1.2177924990928135, "rewards/mask_iou_reward": 0.6088962495464068, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4388444423675537, "rewards/thk_ans_format_reward": 1.0, "step": 689, "think_completion_length": 10.625 }, { "clip_ratio": 0.0, "completion_length": 120.82291793823242, "epoch": 2.330522765598651, "grad_norm": 16.475672726688114, "kl": 0.4375, "learning_rate": 8.057432432432431e-07, "loss": 0.0004, "reward": 3.5080931186676025, "reward_std": 0.2046094499528408, "rewards/final_reward": 1.4977205328748284, "rewards/mask_iou_reward": 0.7488602664374142, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.508092999458313, "rewards/thk_ans_format_reward": 1.0, "step": 690, "think_completion_length": 10.5 }, { "clip_ratio": 0.0, "completion_length": 121.69791793823242, "epoch": 2.3338954468802697, "grad_norm": 16.864680723786535, "kl": 0.421875, "learning_rate": 8.054617117117116e-07, "loss": 0.0004, "reward": 2.887763261795044, "reward_std": 0.0428765881806612, "rewards/final_reward": 1.6773612262571533, "rewards/mask_iou_reward": 0.8386806131285767, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.887763112783432, "rewards/thk_ans_format_reward": 1.0, "step": 691, "think_completion_length": 10.333333333333334 }, { "clip_ratio": 0.0, "completion_length": 178.21875381469727, "epoch": 2.3372681281618886, "grad_norm": 14.022754013063981, "kl": 0.400390625, "learning_rate": 8.051801801801801e-07, "loss": 0.0004, "reward": 3.0394562482833862, "reward_std": 0.18824466317892075, "rewards/final_reward": 0.954554072359945, "rewards/mask_iou_reward": 0.4772770361799725, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 1.0602895319461823, "rewards/thk_ans_format_reward": 0.9895833432674408, "step": 692, "think_completion_length": 7.541666666666666 }, { "clip_ratio": 0.0, "completion_length": 121.67708587646484, "epoch": 2.3406408094435074, "grad_norm": 12.700617794911235, "kl": 0.443359375, "learning_rate": 8.048986486486486e-07, "loss": 0.0005, "reward": 3.3846123218536377, "reward_std": 0.13955982774496078, "rewards/final_reward": 0.8707478787601919, "rewards/mask_iou_reward": 0.43537393938009594, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3846123218536377, "rewards/thk_ans_format_reward": 1.0, "step": 693, "think_completion_length": 7.958333333333333 }, { "clip_ratio": 0.0, "completion_length": 110.25000381469727, "epoch": 2.3440134907251267, "grad_norm": 5.292351081418642, "kl": 0.455078125, "learning_rate": 8.046171171171171e-07, "loss": 0.0005, "reward": 3.3495113849639893, "reward_std": 0.06185881420969963, "rewards/final_reward": 1.8658587353803533, "rewards/mask_iou_reward": 0.9329293676901766, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3495115637779236, "rewards/thk_ans_format_reward": 1.0, "step": 694, "think_completion_length": 10.333333333333334 }, { "clip_ratio": 0.0, "completion_length": 129.38541793823242, "epoch": 2.3473861720067455, "grad_norm": 11.049291780315604, "kl": 0.451171875, "learning_rate": 8.043355855855855e-07, "loss": 0.0005, "reward": 3.3538196086883545, "reward_std": 0.05195300653576851, "rewards/final_reward": 0.056906077261131706, "rewards/mask_iou_reward": 0.028453038630565853, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.353819489479065, "rewards/thk_ans_format_reward": 1.0, "step": 695, "think_completion_length": 9.0 }, { "clip_ratio": 0.0, "completion_length": 102.95833587646484, "epoch": 2.3507588532883643, "grad_norm": 19.11427795079353, "kl": 0.58203125, "learning_rate": 8.04054054054054e-07, "loss": 0.0006, "reward": 3.5086394548416138, "reward_std": 0.14855768531560898, "rewards/final_reward": 1.686626725092181, "rewards/mask_iou_reward": 0.8433133625460905, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5086395740509033, "rewards/thk_ans_format_reward": 1.0, "step": 696, "think_completion_length": 9.208333333333334 }, { "clip_ratio": 0.0, "completion_length": 149.2708396911621, "epoch": 2.354131534569983, "grad_norm": 9.67194746839267, "kl": 0.4052734375, "learning_rate": 8.037725225225224e-07, "loss": 0.0004, "reward": 3.3273189067840576, "reward_std": 0.19882089644670486, "rewards/final_reward": 1.3801675103822553, "rewards/mask_iou_reward": 0.6900837551911276, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.327318787574768, "rewards/thk_ans_format_reward": 1.0, "step": 697, "think_completion_length": 8.875 }, { "clip_ratio": 0.0, "completion_length": 126.67708587646484, "epoch": 2.357504215851602, "grad_norm": 40.37500069302396, "kl": 0.654296875, "learning_rate": 8.034909909909909e-07, "loss": 0.0007, "reward": 3.2038848400115967, "reward_std": 0.2914121150970459, "rewards/final_reward": 0.5983370712448542, "rewards/mask_iou_reward": 0.2991685356224271, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2038847208023071, "rewards/thk_ans_format_reward": 1.0, "step": 698, "think_completion_length": 11.75 }, { "clip_ratio": 0.0, "completion_length": 123.35416793823242, "epoch": 2.360876897133221, "grad_norm": 10.49432652015857, "kl": 0.404296875, "learning_rate": 8.032094594594594e-07, "loss": 0.0004, "reward": 3.1447372436523438, "reward_std": 0.17952913511544466, "rewards/final_reward": 1.5661687614291515, "rewards/mask_iou_reward": 0.7830843807145758, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1447371244430542, "rewards/thk_ans_format_reward": 1.0, "step": 699, "think_completion_length": 11.166666666666668 }, { "clip_ratio": 0.0, "completion_length": 108.88541793823242, "epoch": 2.3642495784148396, "grad_norm": 19.711801674853763, "kl": 0.53515625, "learning_rate": 8.029279279279278e-07, "loss": 0.0005, "reward": 3.3763688802719116, "reward_std": 0.14744899049401283, "rewards/final_reward": 1.0176800515031297, "rewards/mask_iou_reward": 0.5088400257515648, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3763686418533325, "rewards/thk_ans_format_reward": 1.0, "step": 700, "think_completion_length": 10.208333333333334 }, { "clip_ratio": 0.0, "completion_length": 120.79166793823242, "epoch": 2.367622259696459, "grad_norm": 21.065652131398185, "kl": 0.4423828125, "learning_rate": 8.026463963963963e-07, "loss": 0.0005, "reward": 3.5854740142822266, "reward_std": 0.08657106757164001, "rewards/final_reward": 1.1906331353971875, "rewards/mask_iou_reward": 0.5953165676985938, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5854740738868713, "rewards/thk_ans_format_reward": 1.0, "step": 701, "think_completion_length": 10.083333333333332 }, { "clip_ratio": 0.0, "completion_length": 123.20833587646484, "epoch": 2.3709949409780777, "grad_norm": 7.539771960538076, "kl": 0.390625, "learning_rate": 8.023648648648649e-07, "loss": 0.0004, "reward": 3.166221857070923, "reward_std": 0.06890098564326763, "rewards/final_reward": 1.1171324478432192, "rewards/mask_iou_reward": 0.5585662239216096, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.166221797466278, "rewards/thk_ans_format_reward": 1.0, "step": 702, "think_completion_length": 11.208333333333332 }, { "clip_ratio": 0.0, "completion_length": 121.10416793823242, "epoch": 2.3743676222596966, "grad_norm": 25.268475645824264, "kl": 0.484375, "learning_rate": 8.020833333333333e-07, "loss": 0.0005, "reward": 3.6058748960494995, "reward_std": 0.1462496928870678, "rewards/final_reward": 1.7142995604894324, "rewards/mask_iou_reward": 0.8571497802447162, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.60587477684021, "rewards/thk_ans_format_reward": 1.0, "step": 703, "think_completion_length": 10.666666666666666 }, { "clip_ratio": 0.0, "completion_length": 110.07291793823242, "epoch": 2.3777403035413154, "grad_norm": 5.86936429351156, "kl": 0.513671875, "learning_rate": 8.018018018018018e-07, "loss": 0.0005, "reward": 3.587558388710022, "reward_std": 0.10432857647538185, "rewards/final_reward": 1.5256586245058101, "rewards/mask_iou_reward": 0.7628293122529051, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5875583291053772, "rewards/thk_ans_format_reward": 1.0, "step": 704, "think_completion_length": 9.833333333333332 }, { "clip_ratio": 0.0, "completion_length": 127.98958587646484, "epoch": 2.381112984822934, "grad_norm": 24.572719789475233, "kl": 0.458984375, "learning_rate": 8.015202702702702e-07, "loss": 0.0005, "reward": 3.235170602798462, "reward_std": 0.1947356564924121, "rewards/final_reward": 1.6970421912813713, "rewards/mask_iou_reward": 0.8485210956406857, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2351704239845276, "rewards/thk_ans_format_reward": 1.0, "step": 705, "think_completion_length": 9.083333333333334 }, { "clip_ratio": 0.0, "completion_length": 133.29166793823242, "epoch": 2.384485666104553, "grad_norm": 12.228544427874436, "kl": 0.390625, "learning_rate": 8.012387387387387e-07, "loss": 0.0004, "reward": 3.09304141998291, "reward_std": 0.1303301863372326, "rewards/final_reward": 1.5203176608072408, "rewards/mask_iou_reward": 0.7601588304036204, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0930412411689758, "rewards/thk_ans_format_reward": 1.0, "step": 706, "think_completion_length": 9.666666666666668 }, { "clip_ratio": 0.0, "completion_length": 123.30208587646484, "epoch": 2.387858347386172, "grad_norm": 11.83368014597079, "kl": 4.857421875, "learning_rate": 8.009572072072072e-07, "loss": 0.0048, "reward": 3.2624597549438477, "reward_std": 0.18296461552381516, "rewards/final_reward": 1.3711881609397212, "rewards/mask_iou_reward": 0.6855940804698606, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.262459933757782, "rewards/thk_ans_format_reward": 1.0, "step": 707, "think_completion_length": 10.875 }, { "clip_ratio": 0.0, "completion_length": 123.33333587646484, "epoch": 2.391231028667791, "grad_norm": 16.164645790308068, "kl": 0.4619140625, "learning_rate": 8.006756756756756e-07, "loss": 0.0005, "reward": 2.9492790699005127, "reward_std": 0.065645731985569, "rewards/final_reward": 0.12719705036946563, "rewards/mask_iou_reward": 0.06359852518473282, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9492788910865784, "rewards/thk_ans_format_reward": 1.0, "step": 708, "think_completion_length": 11.958333333333332 }, { "clip_ratio": 0.0, "completion_length": 122.19792175292969, "epoch": 2.39460370994941, "grad_norm": 11.315576698691505, "kl": 0.486328125, "learning_rate": 8.003941441441441e-07, "loss": 0.0005, "reward": 3.2986977100372314, "reward_std": 0.16338078677654266, "rewards/final_reward": 1.7742719041990807, "rewards/mask_iou_reward": 0.8871359520995403, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.298697590827942, "rewards/thk_ans_format_reward": 1.0, "step": 709, "think_completion_length": 10.333333333333332 }, { "clip_ratio": 0.0, "completion_length": 123.64583587646484, "epoch": 2.397976391231029, "grad_norm": 10.16630512602375, "kl": 0.4345703125, "learning_rate": 8.001126126126125e-07, "loss": 0.0004, "reward": 3.5177905559539795, "reward_std": 0.13988900184631348, "rewards/final_reward": 1.7767121947188076, "rewards/mask_iou_reward": 0.8883560973594038, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5177903771400452, "rewards/thk_ans_format_reward": 1.0, "step": 710, "think_completion_length": 10.291666666666668 }, { "clip_ratio": 0.0, "completion_length": 122.73958587646484, "epoch": 2.4013490725126476, "grad_norm": 11.27077040193017, "kl": 0.4462890625, "learning_rate": 7.99831081081081e-07, "loss": 0.0004, "reward": 3.3645020723342896, "reward_std": 0.13164759427309036, "rewards/final_reward": 0.8320936686355329, "rewards/mask_iou_reward": 0.41604683431776646, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3645018935203552, "rewards/thk_ans_format_reward": 1.0, "step": 711, "think_completion_length": 10.125 }, { "clip_ratio": 0.0, "completion_length": 128.75000381469727, "epoch": 2.4047217537942664, "grad_norm": 10.775746892595787, "kl": 0.66015625, "learning_rate": 7.995495495495496e-07, "loss": 0.0007, "reward": 3.5911271572113037, "reward_std": 0.06256835721433163, "rewards/final_reward": 1.9670477449263895, "rewards/mask_iou_reward": 0.9835238724631947, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.591127097606659, "rewards/thk_ans_format_reward": 1.0, "step": 712, "think_completion_length": 9.041666666666668 }, { "clip_ratio": 0.0, "completion_length": 121.04166793823242, "epoch": 2.4080944350758853, "grad_norm": 17.154979763981668, "kl": 0.48828125, "learning_rate": 7.99268018018018e-07, "loss": 0.0005, "reward": 3.3226637840270996, "reward_std": 0.06822742521762848, "rewards/final_reward": 1.8144922612172647, "rewards/mask_iou_reward": 0.9072461306086324, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3226639032363892, "rewards/thk_ans_format_reward": 1.0, "step": 713, "think_completion_length": 10.458333333333332 }, { "clip_ratio": 0.0, "completion_length": 124.36458587646484, "epoch": 2.411467116357504, "grad_norm": 7.41758232371269, "kl": 0.5302734375, "learning_rate": 7.989864864864865e-07, "loss": 0.0005, "reward": 3.438087582588196, "reward_std": 0.1307989489287138, "rewards/final_reward": 1.4276302815895208, "rewards/mask_iou_reward": 0.7138151407947604, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4380874633789062, "rewards/thk_ans_format_reward": 1.0, "step": 714, "think_completion_length": 11.541666666666668 }, { "clip_ratio": 0.0, "completion_length": 125.89583969116211, "epoch": 2.414839797639123, "grad_norm": 8.601012209208584, "kl": 0.54296875, "learning_rate": 7.987049549549549e-07, "loss": 0.0005, "reward": 3.1718395948410034, "reward_std": 0.15443510934710503, "rewards/final_reward": 1.1746513664066902, "rewards/mask_iou_reward": 0.5873256832033451, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1718396544456482, "rewards/thk_ans_format_reward": 1.0, "step": 715, "think_completion_length": 10.166666666666668 }, { "clip_ratio": 0.0, "completion_length": 123.21875, "epoch": 2.4182124789207418, "grad_norm": 13.661422976929645, "kl": 0.6015625, "learning_rate": 7.984234234234234e-07, "loss": 0.0006, "reward": 3.3755754232406616, "reward_std": 0.17705781757831573, "rewards/final_reward": 1.4551628170109665, "rewards/mask_iou_reward": 0.7275814085054833, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3755754232406616, "rewards/thk_ans_format_reward": 1.0, "step": 716, "think_completion_length": 10.625 }, { "clip_ratio": 0.0, "completion_length": 136.30208587646484, "epoch": 2.421585160202361, "grad_norm": 15.528903391546768, "kl": 0.53125, "learning_rate": 7.981418918918919e-07, "loss": 0.0005, "reward": 3.4267293214797974, "reward_std": 0.21596352756023407, "rewards/final_reward": 1.5162321323850754, "rewards/mask_iou_reward": 0.7581160661925377, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4267293214797974, "rewards/thk_ans_format_reward": 1.0, "step": 717, "think_completion_length": 11.083333333333334 }, { "clip_ratio": 0.0, "completion_length": 120.13541793823242, "epoch": 2.42495784148398, "grad_norm": 7.660762203198963, "kl": 0.763671875, "learning_rate": 7.978603603603603e-07, "loss": 0.0008, "reward": 3.088021755218506, "reward_std": 0.17404986545443535, "rewards/final_reward": 1.4431847347902549, "rewards/mask_iou_reward": 0.7215923673951274, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.088021606206894, "rewards/thk_ans_format_reward": 1.0, "step": 718, "think_completion_length": 10.458333333333332 }, { "clip_ratio": 0.0, "completion_length": 115.41666793823242, "epoch": 2.4283305227655987, "grad_norm": 10.275559775968965, "kl": 0.771484375, "learning_rate": 7.975788288288288e-07, "loss": 0.0008, "reward": 3.5344756841659546, "reward_std": 0.09361070767045021, "rewards/final_reward": 1.5879363495312444, "rewards/mask_iou_reward": 0.7939681747656222, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5344756841659546, "rewards/thk_ans_format_reward": 1.0, "step": 719, "think_completion_length": 10.125 }, { "clip_ratio": 0.0, "completion_length": 123.1875, "epoch": 2.4317032040472175, "grad_norm": 6.11686753079081, "kl": 0.6015625, "learning_rate": 7.972972972972972e-07, "loss": 0.0006, "reward": 3.258207678794861, "reward_std": 0.21193058043718338, "rewards/final_reward": 0.5687722459062485, "rewards/mask_iou_reward": 0.28438612295312427, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2582077383995056, "rewards/thk_ans_format_reward": 1.0, "step": 720, "think_completion_length": 11.291666666666666 }, { "clip_ratio": 0.0, "completion_length": 121.82291793823242, "epoch": 2.4350758853288363, "grad_norm": 188.70353819108283, "kl": 0.548828125, "learning_rate": 7.970157657657657e-07, "loss": 0.0005, "reward": 3.328008770942688, "reward_std": 0.07661649584770203, "rewards/final_reward": 1.3145267406893124, "rewards/mask_iou_reward": 0.6572633703446562, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3280084133148193, "rewards/thk_ans_format_reward": 1.0, "step": 721, "think_completion_length": 9.583333333333334 }, { "clip_ratio": 0.0, "completion_length": 121.79167175292969, "epoch": 2.438448566610455, "grad_norm": 8.175555799208823, "kl": 0.595703125, "learning_rate": 7.967342342342343e-07, "loss": 0.0006, "reward": 3.286064028739929, "reward_std": 0.1305839866399765, "rewards/final_reward": 1.9222867314872671, "rewards/mask_iou_reward": 0.9611433657436336, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2860642075538635, "rewards/thk_ans_format_reward": 1.0, "step": 722, "think_completion_length": 10.083333333333334 }, { "clip_ratio": 0.0, "completion_length": 123.91667175292969, "epoch": 2.441821247892074, "grad_norm": 19.677120564321587, "kl": 0.62109375, "learning_rate": 7.964527027027027e-07, "loss": 0.0006, "reward": 3.4861291646957397, "reward_std": 0.111208725720644, "rewards/final_reward": 1.6771212868223513, "rewards/mask_iou_reward": 0.8385606434111756, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4861292839050293, "rewards/thk_ans_format_reward": 1.0, "step": 723, "think_completion_length": 11.583333333333332 }, { "clip_ratio": 0.0, "completion_length": 122.41667175292969, "epoch": 2.4451939291736933, "grad_norm": 12.386210317467004, "kl": 0.548828125, "learning_rate": 7.961711711711712e-07, "loss": 0.0005, "reward": 3.435595750808716, "reward_std": 0.21505354344844818, "rewards/final_reward": 1.6424393573599578, "rewards/mask_iou_reward": 0.8212196786799789, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 1.4460124373435974, "rewards/thk_ans_format_reward": 1.0, "step": 724, "think_completion_length": 11.833333333333332 }, { "clip_ratio": 0.0, "completion_length": 123.35416793823242, "epoch": 2.448566610455312, "grad_norm": 90.88539800576194, "kl": 0.56640625, "learning_rate": 7.958896396396397e-07, "loss": 0.0006, "reward": 3.139678478240967, "reward_std": 0.1556791141629219, "rewards/final_reward": 1.0097397701785693, "rewards/mask_iou_reward": 0.5048698850892847, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.13967826962471, "rewards/thk_ans_format_reward": 1.0, "step": 725, "think_completion_length": 11.166666666666668 }, { "clip_ratio": 0.0, "completion_length": 123.94791793823242, "epoch": 2.451939291736931, "grad_norm": 11.034827414989431, "kl": 0.580078125, "learning_rate": 7.956081081081081e-07, "loss": 0.0006, "reward": 3.3471599817276, "reward_std": 0.052762774750590324, "rewards/final_reward": 1.4570581685778468, "rewards/mask_iou_reward": 0.7285290842889234, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3471597135066986, "rewards/thk_ans_format_reward": 1.0, "step": 726, "think_completion_length": 11.0 }, { "clip_ratio": 0.0, "completion_length": 121.71875381469727, "epoch": 2.4553119730185498, "grad_norm": 10.107819781591768, "kl": 0.611328125, "learning_rate": 7.953265765765766e-07, "loss": 0.0006, "reward": 3.216899871826172, "reward_std": 0.11818000301718712, "rewards/final_reward": 1.7161205932460326, "rewards/mask_iou_reward": 0.8580602966230163, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2168999910354614, "rewards/thk_ans_format_reward": 1.0, "step": 727, "think_completion_length": 10.583333333333332 }, { "clip_ratio": 0.0, "completion_length": 127.04166793823242, "epoch": 2.4586846543001686, "grad_norm": 6.674039882817239, "kl": 0.560546875, "learning_rate": 7.95045045045045e-07, "loss": 0.0006, "reward": 3.42681086063385, "reward_std": 0.13024066016077995, "rewards/final_reward": 1.6990264231312215, "rewards/mask_iou_reward": 0.8495132115656108, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4268107414245605, "rewards/thk_ans_format_reward": 1.0, "step": 728, "think_completion_length": 9.75 }, { "clip_ratio": 0.0, "completion_length": 123.71875381469727, "epoch": 2.4620573355817874, "grad_norm": 13.049637366531218, "kl": 0.65234375, "learning_rate": 7.947635135135135e-07, "loss": 0.0006, "reward": 3.1891380548477173, "reward_std": 0.19797684997320175, "rewards/final_reward": 1.1756615795267868, "rewards/mask_iou_reward": 0.5878307897633934, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.189138114452362, "rewards/thk_ans_format_reward": 1.0, "step": 729, "think_completion_length": 11.0 }, { "clip_ratio": 0.0, "completion_length": 123.70833587646484, "epoch": 2.4654300168634062, "grad_norm": 4.379687752783074, "kl": 0.5703125, "learning_rate": 7.944819819819819e-07, "loss": 0.0006, "reward": 3.259893774986267, "reward_std": 0.07151791453361511, "rewards/final_reward": 1.5924158283390697, "rewards/mask_iou_reward": 0.7962079141695348, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.259893774986267, "rewards/thk_ans_format_reward": 1.0, "step": 730, "think_completion_length": 9.083333333333334 }, { "clip_ratio": 0.0, "completion_length": 122.13541793823242, "epoch": 2.4688026981450255, "grad_norm": 9.507324536247651, "kl": 0.5546875, "learning_rate": 7.942004504504503e-07, "loss": 0.0006, "reward": 3.215202569961548, "reward_std": 0.0699087530374527, "rewards/final_reward": 0.9720705627363007, "rewards/mask_iou_reward": 0.48603528136815033, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2152024507522583, "rewards/thk_ans_format_reward": 1.0, "step": 731, "think_completion_length": 9.958333333333334 }, { "clip_ratio": 0.0, "completion_length": 121.61458587646484, "epoch": 2.4721753794266443, "grad_norm": 12.873305700274182, "kl": 0.7734375, "learning_rate": 7.939189189189189e-07, "loss": 0.0008, "reward": 3.025223970413208, "reward_std": 0.12610819563269615, "rewards/final_reward": 1.7189425763675459, "rewards/mask_iou_reward": 0.8594712881837729, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0252237915992737, "rewards/thk_ans_format_reward": 1.0, "step": 732, "think_completion_length": 8.541666666666666 }, { "clip_ratio": 0.0, "completion_length": 121.91666793823242, "epoch": 2.475548060708263, "grad_norm": 9.722875671257425, "kl": 0.615234375, "learning_rate": 7.936373873873873e-07, "loss": 0.0006, "reward": 3.4183343648910522, "reward_std": 0.08810793235898018, "rewards/final_reward": 1.855176038165883, "rewards/mask_iou_reward": 0.9275880190829415, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.418334186077118, "rewards/thk_ans_format_reward": 1.0, "step": 733, "think_completion_length": 8.958333333333332 }, { "clip_ratio": 0.0, "completion_length": 122.67708587646484, "epoch": 2.478920741989882, "grad_norm": 15.856710122466179, "kl": 0.587890625, "learning_rate": 7.933558558558558e-07, "loss": 0.0006, "reward": 3.4329049587249756, "reward_std": 0.07124324329197407, "rewards/final_reward": 1.4161840189363588, "rewards/mask_iou_reward": 0.7080920094681794, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4329049587249756, "rewards/thk_ans_format_reward": 1.0, "step": 734, "think_completion_length": 9.791666666666666 }, { "clip_ratio": 0.0, "completion_length": 121.44791793823242, "epoch": 2.482293423271501, "grad_norm": 152.2683999431526, "kl": 0.56640625, "learning_rate": 7.930743243243243e-07, "loss": 0.0006, "reward": 3.3764851093292236, "reward_std": 0.15127253159880638, "rewards/final_reward": 1.7125658778008792, "rewards/mask_iou_reward": 0.8562829389004396, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3764851689338684, "rewards/thk_ans_format_reward": 1.0, "step": 735, "think_completion_length": 9.583333333333334 }, { "clip_ratio": 0.0, "completion_length": 123.04166793823242, "epoch": 2.4856661045531196, "grad_norm": 8.316898560977593, "kl": 0.572265625, "learning_rate": 7.927927927927927e-07, "loss": 0.0006, "reward": 3.1848560571670532, "reward_std": 0.11521704494953156, "rewards/final_reward": 0.7065922922768331, "rewards/mask_iou_reward": 0.35329614613841653, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1848559379577637, "rewards/thk_ans_format_reward": 1.0, "step": 736, "think_completion_length": 9.541666666666668 }, { "clip_ratio": 0.0, "completion_length": 123.37500381469727, "epoch": 2.4890387858347385, "grad_norm": 11.203963199161379, "kl": 0.595703125, "learning_rate": 7.925112612612612e-07, "loss": 0.0006, "reward": 3.3153754472732544, "reward_std": 0.23398957401514053, "rewards/final_reward": 1.038106492915174, "rewards/mask_iou_reward": 0.519053246457587, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3153753578662872, "rewards/thk_ans_format_reward": 1.0, "step": 737, "think_completion_length": 11.458333333333332 }, { "clip_ratio": 0.0, "completion_length": 122.13542175292969, "epoch": 2.4924114671163577, "grad_norm": 7.715947393781205, "kl": 0.6328125, "learning_rate": 7.922297297297296e-07, "loss": 0.0006, "reward": 3.4682679176330566, "reward_std": 0.08761341124773026, "rewards/final_reward": 1.088569312463899, "rewards/mask_iou_reward": 0.5442846562319495, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4682677388191223, "rewards/thk_ans_format_reward": 1.0, "step": 738, "think_completion_length": 10.583333333333334 }, { "clip_ratio": 0.0, "completion_length": 126.80208587646484, "epoch": 2.4957841483979766, "grad_norm": 41.751188322079955, "kl": 0.55078125, "learning_rate": 7.919481981981981e-07, "loss": 0.0006, "reward": 3.5137277841567993, "reward_std": 0.13675726018846035, "rewards/final_reward": 0.9585005643467535, "rewards/mask_iou_reward": 0.47925028217337673, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5137277245521545, "rewards/thk_ans_format_reward": 1.0, "step": 739, "think_completion_length": 12.208333333333332 }, { "clip_ratio": 0.0, "completion_length": 111.72917175292969, "epoch": 2.4991568296795954, "grad_norm": 8.421219003802829, "kl": 0.6328125, "learning_rate": 7.916666666666666e-07, "loss": 0.0006, "reward": 3.611706256866455, "reward_std": 0.21243294700980186, "rewards/final_reward": 1.7471342800039473, "rewards/mask_iou_reward": 0.8735671400019737, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6117061376571655, "rewards/thk_ans_format_reward": 1.0, "step": 740, "think_completion_length": 9.75 }, { "clip_ratio": 0.0, "completion_length": 121.42708587646484, "epoch": 2.5025295109612142, "grad_norm": 9.829424324427816, "kl": 0.568359375, "learning_rate": 7.91385135135135e-07, "loss": 0.0006, "reward": 3.395363688468933, "reward_std": 0.12305304408073425, "rewards/final_reward": 1.5213473515200766, "rewards/mask_iou_reward": 0.7606736757600383, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.395363688468933, "rewards/thk_ans_format_reward": 1.0, "step": 741, "think_completion_length": 10.833333333333332 }, { "clip_ratio": 0.0, "completion_length": 125.51041793823242, "epoch": 2.505902192242833, "grad_norm": 11.854834571460097, "kl": 0.625, "learning_rate": 7.911036036036036e-07, "loss": 0.0006, "reward": 3.4499409198760986, "reward_std": 0.10465443879365921, "rewards/final_reward": 1.543411262235664, "rewards/mask_iou_reward": 0.771705631117832, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.449940800666809, "rewards/thk_ans_format_reward": 1.0, "step": 742, "think_completion_length": 10.291666666666668 }, { "clip_ratio": 0.0, "completion_length": 120.41666793823242, "epoch": 2.509274873524452, "grad_norm": 11.431790307732706, "kl": 0.6015625, "learning_rate": 7.908220720720721e-07, "loss": 0.0006, "reward": 3.214622139930725, "reward_std": 0.03172614425420761, "rewards/final_reward": 1.272425770492403, "rewards/mask_iou_reward": 0.6362128852462015, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2146221995353699, "rewards/thk_ans_format_reward": 1.0, "step": 743, "think_completion_length": 9.0 }, { "clip_ratio": 0.0, "completion_length": 121.41667175292969, "epoch": 2.5126475548060707, "grad_norm": 8.830193308051955, "kl": 0.572265625, "learning_rate": 7.905405405405405e-07, "loss": 0.0006, "reward": 3.1759774684906006, "reward_std": 0.10025330260396004, "rewards/final_reward": 1.260989037491286, "rewards/mask_iou_reward": 0.630494518745643, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1759773790836334, "rewards/thk_ans_format_reward": 1.0, "step": 744, "think_completion_length": 9.125 }, { "clip_ratio": 0.0, "completion_length": 129.50000762939453, "epoch": 2.51602023608769, "grad_norm": 20.759261570638913, "kl": 0.80078125, "learning_rate": 7.90259009009009e-07, "loss": 0.0008, "reward": 3.3195990324020386, "reward_std": 0.09332029893994331, "rewards/final_reward": 0.9221775935859856, "rewards/mask_iou_reward": 0.4610887967929928, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.319598913192749, "rewards/thk_ans_format_reward": 1.0, "step": 745, "think_completion_length": 10.166666666666666 }, { "clip_ratio": 0.0, "completion_length": 121.58333587646484, "epoch": 2.5193929173693084, "grad_norm": 6.896446483981856, "kl": 0.5546875, "learning_rate": 7.899774774774774e-07, "loss": 0.0006, "reward": 3.1739712953567505, "reward_std": 0.16202839836478233, "rewards/final_reward": 1.7786173434773, "rewards/mask_iou_reward": 0.88930867173865, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.173971176147461, "rewards/thk_ans_format_reward": 1.0, "step": 746, "think_completion_length": 9.708333333333332 }, { "clip_ratio": 0.0, "completion_length": 121.38541793823242, "epoch": 2.5227655986509276, "grad_norm": 11.71991482226643, "kl": 0.625, "learning_rate": 7.896959459459459e-07, "loss": 0.0006, "reward": 3.44577157497406, "reward_std": 0.09941475465893745, "rewards/final_reward": 1.323128358011256, "rewards/mask_iou_reward": 0.661564179005628, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4457715153694153, "rewards/thk_ans_format_reward": 1.0, "step": 747, "think_completion_length": 10.708333333333332 }, { "clip_ratio": 0.0, "completion_length": 120.65625381469727, "epoch": 2.5261382799325465, "grad_norm": 7.493391681330979, "kl": 0.5625, "learning_rate": 7.894144144144144e-07, "loss": 0.0006, "reward": 3.0702245235443115, "reward_std": 0.15229638293385506, "rewards/final_reward": 1.1008842091447275, "rewards/mask_iou_reward": 0.5504421045723638, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0702243149280548, "rewards/thk_ans_format_reward": 1.0, "step": 748, "think_completion_length": 9.041666666666668 }, { "clip_ratio": 0.0, "completion_length": 120.35417175292969, "epoch": 2.5295109612141653, "grad_norm": 14.720404781621857, "kl": 0.87109375, "learning_rate": 7.891328828828828e-07, "loss": 0.0009, "reward": 3.369284749031067, "reward_std": 0.09888229332864285, "rewards/final_reward": 1.722342529541773, "rewards/mask_iou_reward": 0.8611712647708865, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3692848086357117, "rewards/thk_ans_format_reward": 1.0, "step": 749, "think_completion_length": 8.958333333333334 }, { "clip_ratio": 0.0, "completion_length": 121.29166793823242, "epoch": 2.532883642495784, "grad_norm": 33.226300521794954, "kl": 0.64453125, "learning_rate": 7.888513513513513e-07, "loss": 0.0006, "reward": 3.3416668176651, "reward_std": 0.1597162000834942, "rewards/final_reward": 1.3613542161465868, "rewards/mask_iou_reward": 0.6806771080732934, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3416666984558105, "rewards/thk_ans_format_reward": 1.0, "step": 750, "think_completion_length": 8.791666666666666 }, { "clip_ratio": 0.0, "completion_length": 120.77083587646484, "epoch": 2.536256323777403, "grad_norm": 17.600051361273188, "kl": 0.591796875, "learning_rate": 7.885698198198197e-07, "loss": 0.0006, "reward": 3.325364351272583, "reward_std": 0.21401405334472656, "rewards/final_reward": 1.6552300188658267, "rewards/mask_iou_reward": 0.8276150094329133, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3253644108772278, "rewards/thk_ans_format_reward": 1.0, "step": 751, "think_completion_length": 8.083333333333332 }, { "clip_ratio": 0.0, "completion_length": 129.31250762939453, "epoch": 2.539629005059022, "grad_norm": 22.345390640061805, "kl": 0.775390625, "learning_rate": 7.882882882882883e-07, "loss": 0.0008, "reward": 3.559385657310486, "reward_std": 0.09891241788864136, "rewards/final_reward": 1.1205373816626092, "rewards/mask_iou_reward": 0.5602686908313046, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5593854784965515, "rewards/thk_ans_format_reward": 1.0, "step": 752, "think_completion_length": 7.583333333333334 }, { "clip_ratio": 0.0, "completion_length": 122.83333587646484, "epoch": 2.5430016863406406, "grad_norm": 42.517389071612314, "kl": 0.560546875, "learning_rate": 7.880067567567568e-07, "loss": 0.0006, "reward": 3.2056604623794556, "reward_std": 0.1193031445145607, "rewards/final_reward": 1.1134455035306088, "rewards/mask_iou_reward": 0.5567227517653044, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2056604027748108, "rewards/thk_ans_format_reward": 1.0, "step": 753, "think_completion_length": 6.5 }, { "clip_ratio": 0.0, "completion_length": 127.86458587646484, "epoch": 2.54637436762226, "grad_norm": 31.312787511542698, "kl": 0.583984375, "learning_rate": 7.877252252252252e-07, "loss": 0.0006, "reward": 3.3875555992126465, "reward_std": 0.14285332709550858, "rewards/final_reward": 1.5229593984733443, "rewards/mask_iou_reward": 0.7614796992366721, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3875554203987122, "rewards/thk_ans_format_reward": 1.0, "step": 754, "think_completion_length": 7.666666666666666 }, { "clip_ratio": 0.0, "completion_length": 126.88542175292969, "epoch": 2.5497470489038787, "grad_norm": 8.990249785443053, "kl": 0.58984375, "learning_rate": 7.874436936936937e-07, "loss": 0.0006, "reward": 2.9223185777664185, "reward_std": 0.08924070559442043, "rewards/final_reward": 1.0728038663698005, "rewards/mask_iou_reward": 0.5364019331849003, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9223185181617737, "rewards/thk_ans_format_reward": 1.0, "step": 755, "think_completion_length": 9.875 }, { "clip_ratio": 0.0, "completion_length": 118.76042175292969, "epoch": 2.5531197301854975, "grad_norm": 13.843095526147629, "kl": 0.83203125, "learning_rate": 7.871621621621622e-07, "loss": 0.0008, "reward": 3.4950000047683716, "reward_std": 0.05128934606909752, "rewards/final_reward": 1.7058042723780633, "rewards/mask_iou_reward": 0.8529021361890317, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4949997067451477, "rewards/thk_ans_format_reward": 1.0, "step": 756, "think_completion_length": 7.625 }, { "clip_ratio": 0.0, "completion_length": 123.50000381469727, "epoch": 2.5564924114671164, "grad_norm": 31.95743816328789, "kl": 0.5703125, "learning_rate": 7.868806306306306e-07, "loss": 0.0006, "reward": 3.0966413021087646, "reward_std": 0.09938472509384155, "rewards/final_reward": 1.5479266235541327, "rewards/mask_iou_reward": 0.7739633117770663, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0966413915157318, "rewards/thk_ans_format_reward": 1.0, "step": 757, "think_completion_length": 8.583333333333332 }, { "clip_ratio": 0.0, "completion_length": 119.64583587646484, "epoch": 2.559865092748735, "grad_norm": 12.172451648435866, "kl": 0.591796875, "learning_rate": 7.865990990990991e-07, "loss": 0.0006, "reward": 3.5292413234710693, "reward_std": 0.08699771389365196, "rewards/final_reward": 0.8571612771535517, "rewards/mask_iou_reward": 0.42858063857677586, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5292413234710693, "rewards/thk_ans_format_reward": 1.0, "step": 758, "think_completion_length": 8.583333333333332 }, { "clip_ratio": 0.0, "completion_length": 121.45833587646484, "epoch": 2.563237774030354, "grad_norm": 11.122775520400872, "kl": 0.62109375, "learning_rate": 7.863175675675675e-07, "loss": 0.0006, "reward": 3.1527295112609863, "reward_std": 0.2126385048031807, "rewards/final_reward": 0.9309354482470767, "rewards/mask_iou_reward": 0.4654677241235384, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1527293026447296, "rewards/thk_ans_format_reward": 1.0, "step": 759, "think_completion_length": 9.083333333333332 }, { "clip_ratio": 0.0, "completion_length": 121.54166793823242, "epoch": 2.566610455311973, "grad_norm": 10.546932510994008, "kl": 1.5859375, "learning_rate": 7.86036036036036e-07, "loss": 0.0016, "reward": 3.086781620979309, "reward_std": 0.07337499689310789, "rewards/final_reward": 0.7351426490944075, "rewards/mask_iou_reward": 0.36757132454720376, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.08678138256073, "rewards/thk_ans_format_reward": 1.0, "step": 760, "think_completion_length": 9.916666666666668 }, { "clip_ratio": 0.0, "completion_length": 122.02083587646484, "epoch": 2.569983136593592, "grad_norm": 13.133133046507506, "kl": 0.62109375, "learning_rate": 7.857545045045045e-07, "loss": 0.0006, "reward": 3.4137195348739624, "reward_std": 0.044736314564943314, "rewards/final_reward": 1.2257461358321629, "rewards/mask_iou_reward": 0.6128730679160814, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4137197136878967, "rewards/thk_ans_format_reward": 1.0, "step": 761, "think_completion_length": 10.75 }, { "clip_ratio": 0.0, "completion_length": 121.65625381469727, "epoch": 2.573355817875211, "grad_norm": 32.595514732619456, "kl": 0.615234375, "learning_rate": 7.85472972972973e-07, "loss": 0.0006, "reward": 3.445251703262329, "reward_std": 0.20248185843229294, "rewards/final_reward": 1.233189164325462, "rewards/mask_iou_reward": 0.616594582162731, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4452515840530396, "rewards/thk_ans_format_reward": 1.0, "step": 762, "think_completion_length": 10.166666666666666 }, { "clip_ratio": 0.0, "completion_length": 126.32292175292969, "epoch": 2.5767284991568298, "grad_norm": 77.43355853852889, "kl": 0.541015625, "learning_rate": 7.851914414414415e-07, "loss": 0.0005, "reward": 3.3963409662246704, "reward_std": 0.1772306263446808, "rewards/final_reward": 1.3941096598606393, "rewards/mask_iou_reward": 0.6970548299303196, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.396340787410736, "rewards/thk_ans_format_reward": 1.0, "step": 763, "think_completion_length": 9.208333333333332 }, { "clip_ratio": 0.0, "completion_length": 119.41667175292969, "epoch": 2.5801011804384486, "grad_norm": 8.543003932244329, "kl": 0.611328125, "learning_rate": 7.849099099099099e-07, "loss": 0.0006, "reward": 3.338282823562622, "reward_std": 0.1316997967660427, "rewards/final_reward": 1.7267376913174535, "rewards/mask_iou_reward": 0.8633688456587267, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3382827639579773, "rewards/thk_ans_format_reward": 1.0, "step": 764, "think_completion_length": 7.791666666666667 }, { "clip_ratio": 0.0, "completion_length": 110.79166793823242, "epoch": 2.5834738617200674, "grad_norm": 42.44242367659991, "kl": 0.626953125, "learning_rate": 7.846283783783784e-07, "loss": 0.0007, "reward": 3.2067281007766724, "reward_std": 0.10132832825183868, "rewards/final_reward": 0.6540462389631674, "rewards/mask_iou_reward": 0.3270231194815837, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2067280411720276, "rewards/thk_ans_format_reward": 1.0, "step": 765, "think_completion_length": 10.375 }, { "clip_ratio": 0.0, "completion_length": 120.52083587646484, "epoch": 2.5868465430016863, "grad_norm": 32.92501596552101, "kl": 0.572265625, "learning_rate": 7.843468468468469e-07, "loss": 0.0006, "reward": 3.105804204940796, "reward_std": 0.20033784210681915, "rewards/final_reward": 1.447985529265118, "rewards/mask_iou_reward": 0.723992764632559, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 1.1162208020687103, "rewards/thk_ans_format_reward": 1.0, "step": 766, "think_completion_length": 10.083333333333332 }, { "clip_ratio": 0.0, "completion_length": 127.35417175292969, "epoch": 2.590219224283305, "grad_norm": 5.836494401716033, "kl": 0.646484375, "learning_rate": 7.840653153153153e-07, "loss": 0.0007, "reward": 3.3003053665161133, "reward_std": 0.06181888282299042, "rewards/final_reward": 1.5327181532154235, "rewards/mask_iou_reward": 0.7663590766077117, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3003052473068237, "rewards/thk_ans_format_reward": 1.0, "step": 767, "think_completion_length": 7.916666666666666 }, { "clip_ratio": 0.0, "completion_length": 121.82292175292969, "epoch": 2.5935919055649244, "grad_norm": 31.644724773768857, "kl": 0.599609375, "learning_rate": 7.837837837837838e-07, "loss": 0.0006, "reward": 3.611915349960327, "reward_std": 0.133062107488513, "rewards/final_reward": 1.6232534975612043, "rewards/mask_iou_reward": 0.8116267487806021, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6119154691696167, "rewards/thk_ans_format_reward": 1.0, "step": 768, "think_completion_length": 8.875 }, { "clip_ratio": 0.0, "completion_length": 122.23958587646484, "epoch": 2.5969645868465427, "grad_norm": 6.714877945185845, "kl": 0.595703125, "learning_rate": 7.835022522522522e-07, "loss": 0.0006, "reward": 3.281790614128113, "reward_std": 0.27748487889766693, "rewards/final_reward": 1.1574744637593963, "rewards/mask_iou_reward": 0.5787372318796982, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2817904949188232, "rewards/thk_ans_format_reward": 1.0, "step": 769, "think_completion_length": 11.0 }, { "clip_ratio": 0.0, "completion_length": 122.28125381469727, "epoch": 2.600337268128162, "grad_norm": 16.498780792457595, "kl": 0.677734375, "learning_rate": 7.832207207207206e-07, "loss": 0.0007, "reward": 3.0205026865005493, "reward_std": 0.08207221701741219, "rewards/final_reward": 1.0467192538355548, "rewards/mask_iou_reward": 0.5233596269177774, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0205026268959045, "rewards/thk_ans_format_reward": 1.0, "step": 770, "think_completion_length": 9.25 }, { "clip_ratio": 0.0, "completion_length": 129.52083587646484, "epoch": 2.603709949409781, "grad_norm": 11.788568156409315, "kl": 0.57421875, "learning_rate": 7.829391891891891e-07, "loss": 0.0006, "reward": 3.317116856575012, "reward_std": 0.11956719309091568, "rewards/final_reward": 1.6758310418194653, "rewards/mask_iou_reward": 0.8379155209097326, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3171168565750122, "rewards/thk_ans_format_reward": 1.0, "step": 771, "think_completion_length": 11.25 }, { "clip_ratio": 0.0, "completion_length": 120.4375, "epoch": 2.6070826306913997, "grad_norm": 33.04893941697202, "kl": 0.5859375, "learning_rate": 7.826576576576576e-07, "loss": 0.0006, "reward": 3.2055855989456177, "reward_std": 0.08506038412451744, "rewards/final_reward": 1.090031976130355, "rewards/mask_iou_reward": 0.5450159880651775, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2055857181549072, "rewards/thk_ans_format_reward": 1.0, "step": 772, "think_completion_length": 9.708333333333334 }, { "clip_ratio": 0.0, "completion_length": 121.94791793823242, "epoch": 2.6104553119730185, "grad_norm": 16.00591255201323, "kl": 0.62109375, "learning_rate": 7.823761261261261e-07, "loss": 0.0006, "reward": 3.259799003601074, "reward_std": 0.06505817919969559, "rewards/final_reward": 1.2474749216725154, "rewards/mask_iou_reward": 0.6237374608362577, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2597991228103638, "rewards/thk_ans_format_reward": 1.0, "step": 773, "think_completion_length": 8.166666666666666 }, { "clip_ratio": 0.0, "completion_length": 121.30208587646484, "epoch": 2.6138279932546373, "grad_norm": 16.467078793275938, "kl": 0.509765625, "learning_rate": 7.820945945945945e-07, "loss": 0.0005, "reward": 3.4382166862487793, "reward_std": 0.09156141243875027, "rewards/final_reward": 1.5640969101735402, "rewards/mask_iou_reward": 0.7820484550867701, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4382166266441345, "rewards/thk_ans_format_reward": 1.0, "step": 774, "think_completion_length": 10.166666666666668 }, { "clip_ratio": 0.0, "completion_length": 121.69791793823242, "epoch": 2.6172006745362566, "grad_norm": 8.68110014124045, "kl": 0.560546875, "learning_rate": 7.81813063063063e-07, "loss": 0.0005, "reward": 3.143561840057373, "reward_std": 0.05993725173175335, "rewards/final_reward": 1.4912784029807846, "rewards/mask_iou_reward": 0.7456392014903923, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1435619592666626, "rewards/thk_ans_format_reward": 1.0, "step": 775, "think_completion_length": 9.875 }, { "clip_ratio": 0.0, "completion_length": 122.79166793823242, "epoch": 2.620573355817875, "grad_norm": 8.67400352232433, "kl": 0.60546875, "learning_rate": 7.815315315315315e-07, "loss": 0.0006, "reward": 3.6337943077087402, "reward_std": 0.2572034075856209, "rewards/final_reward": 1.5875537717192088, "rewards/mask_iou_reward": 0.7937768858596044, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6337940096855164, "rewards/thk_ans_format_reward": 1.0, "step": 776, "think_completion_length": 9.25 }, { "clip_ratio": 0.0, "completion_length": 123.41667175292969, "epoch": 2.6239460370994943, "grad_norm": 8.556882945235555, "kl": 0.71875, "learning_rate": 7.812499999999999e-07, "loss": 0.0007, "reward": 3.184899926185608, "reward_std": 0.141241867095232, "rewards/final_reward": 1.1142707135886631, "rewards/mask_iou_reward": 0.5571353567943316, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1848998963832855, "rewards/thk_ans_format_reward": 1.0, "step": 777, "think_completion_length": 10.625 }, { "clip_ratio": 0.0, "completion_length": 121.36458587646484, "epoch": 2.627318718381113, "grad_norm": 26.69004358153012, "kl": 0.5859375, "learning_rate": 7.809684684684684e-07, "loss": 0.0006, "reward": 3.3422510623931885, "reward_std": 0.10112036764621735, "rewards/final_reward": 1.6472102036234255, "rewards/mask_iou_reward": 0.8236051018117128, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3422508835792542, "rewards/thk_ans_format_reward": 1.0, "step": 778, "think_completion_length": 10.583333333333334 }, { "clip_ratio": 0.0, "completion_length": 120.23958587646484, "epoch": 2.630691399662732, "grad_norm": 12.340711701333001, "kl": 0.607421875, "learning_rate": 7.806869369369369e-07, "loss": 0.0006, "reward": 3.3806689977645874, "reward_std": 0.3221001923084259, "rewards/final_reward": 1.4816500684072826, "rewards/mask_iou_reward": 0.7408250342036413, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3806690573692322, "rewards/thk_ans_format_reward": 1.0, "step": 779, "think_completion_length": 9.916666666666668 }, { "clip_ratio": 0.0, "completion_length": 126.14583587646484, "epoch": 2.6340640809443507, "grad_norm": 28.7933229375175, "kl": 0.599609375, "learning_rate": 7.804054054054053e-07, "loss": 0.0006, "reward": 3.145534038543701, "reward_std": 0.1955154836177826, "rewards/final_reward": 1.109550532358149, "rewards/mask_iou_reward": 0.5547752661790745, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1455340385437012, "rewards/thk_ans_format_reward": 1.0, "step": 780, "think_completion_length": 10.291666666666668 }, { "clip_ratio": 0.0, "completion_length": 120.23958587646484, "epoch": 2.6374367622259696, "grad_norm": 13.664880476457705, "kl": 0.638671875, "learning_rate": 7.801238738738738e-07, "loss": 0.0007, "reward": 3.010381579399109, "reward_std": 0.16878048330545425, "rewards/final_reward": 1.2768509072340888, "rewards/mask_iou_reward": 0.6384254536170444, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 1.0207980871200562, "rewards/thk_ans_format_reward": 1.0, "step": 781, "think_completion_length": 11.541666666666666 }, { "clip_ratio": 0.0, "completion_length": 119.81250381469727, "epoch": 2.6408094435075884, "grad_norm": 6.9108413654848455, "kl": 0.755859375, "learning_rate": 7.798423423423422e-07, "loss": 0.0008, "reward": 3.547639846801758, "reward_std": 0.07762636616826057, "rewards/final_reward": 1.7800088484193495, "rewards/mask_iou_reward": 0.8900044242096747, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5476398468017578, "rewards/thk_ans_format_reward": 1.0, "step": 782, "think_completion_length": 9.833333333333332 }, { "clip_ratio": 0.0, "completion_length": 121.85416793823242, "epoch": 2.6441821247892072, "grad_norm": 9.5857677536209, "kl": 0.640625, "learning_rate": 7.795608108108108e-07, "loss": 0.0006, "reward": 3.4680780172348022, "reward_std": 0.11806654557585716, "rewards/final_reward": 1.7424451239109253, "rewards/mask_iou_reward": 0.8712225619554627, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4680778980255127, "rewards/thk_ans_format_reward": 1.0, "step": 783, "think_completion_length": 8.875 }, { "clip_ratio": 0.0, "completion_length": 136.3125, "epoch": 2.6475548060708265, "grad_norm": 57.780748368086115, "kl": 0.5546875, "learning_rate": 7.792792792792793e-07, "loss": 0.0007, "reward": 2.8306479454040527, "reward_std": 0.1650489717721939, "rewards/final_reward": 0.5575330646046901, "rewards/mask_iou_reward": 0.27876653230234505, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.8306480944156647, "rewards/thk_ans_format_reward": 1.0, "step": 784, "think_completion_length": 9.333333333333332 }, { "clip_ratio": 0.0, "completion_length": 120.29167175292969, "epoch": 2.6509274873524453, "grad_norm": 22.501702902369207, "kl": 0.6015625, "learning_rate": 7.789977477477477e-07, "loss": 0.0008, "reward": 3.1414239406585693, "reward_std": 0.08215552754700184, "rewards/final_reward": 0.9676854785923981, "rewards/mask_iou_reward": 0.48384273929619903, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1414238214492798, "rewards/thk_ans_format_reward": 1.0, "step": 785, "think_completion_length": 9.083333333333332 }, { "clip_ratio": 0.0, "completion_length": 120.88541793823242, "epoch": 2.654300168634064, "grad_norm": 21.46706283599788, "kl": 0.5703125, "learning_rate": 7.787162162162162e-07, "loss": 0.0006, "reward": 3.4730279445648193, "reward_std": 0.08347597345709801, "rewards/final_reward": 1.4931069761398357, "rewards/mask_iou_reward": 0.7465534880699178, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4730280041694641, "rewards/thk_ans_format_reward": 1.0, "step": 786, "think_completion_length": 8.5 }, { "clip_ratio": 0.0, "completion_length": 118.5, "epoch": 2.657672849915683, "grad_norm": 9.05205086856852, "kl": 0.595703125, "learning_rate": 7.784346846846846e-07, "loss": 0.0006, "reward": 3.118174910545349, "reward_std": 0.0817178450524807, "rewards/final_reward": 1.3790720365145952, "rewards/mask_iou_reward": 0.6895360182572976, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1181747615337372, "rewards/thk_ans_format_reward": 1.0, "step": 787, "think_completion_length": 9.333333333333332 }, { "clip_ratio": 0.0, "completion_length": 121.87500381469727, "epoch": 2.661045531197302, "grad_norm": 21.72331596169387, "kl": 0.787109375, "learning_rate": 7.781531531531531e-07, "loss": 0.0008, "reward": 3.3093440532684326, "reward_std": 0.10094352997839451, "rewards/final_reward": 1.704846715631396, "rewards/mask_iou_reward": 0.852423357815698, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3093441128730774, "rewards/thk_ans_format_reward": 1.0, "step": 788, "think_completion_length": 9.333333333333332 }, { "clip_ratio": 0.0, "completion_length": 110.23958587646484, "epoch": 2.6644182124789206, "grad_norm": 10.87397622578721, "kl": 0.7890625, "learning_rate": 7.778716216216216e-07, "loss": 0.0008, "reward": 3.4442501068115234, "reward_std": 0.07595014199614525, "rewards/final_reward": 1.6731010669686723, "rewards/mask_iou_reward": 0.8365505334843362, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4442499279975891, "rewards/thk_ans_format_reward": 1.0, "step": 789, "think_completion_length": 10.25 }, { "clip_ratio": 0.0, "completion_length": 125.16666793823242, "epoch": 2.6677908937605395, "grad_norm": 34.326508886063536, "kl": 0.55078125, "learning_rate": 7.7759009009009e-07, "loss": 0.0006, "reward": 3.0237648487091064, "reward_std": 0.14004899561405182, "rewards/final_reward": 0.8710051356850619, "rewards/mask_iou_reward": 0.43550256784253094, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0237650275230408, "rewards/thk_ans_format_reward": 1.0, "step": 790, "think_completion_length": 8.083333333333334 }, { "clip_ratio": 0.0, "completion_length": 121.41667175292969, "epoch": 2.6711635750421587, "grad_norm": 10.07884750719032, "kl": 0.541015625, "learning_rate": 7.773085585585585e-07, "loss": 0.0005, "reward": 3.0886744260787964, "reward_std": 0.14586707949638367, "rewards/final_reward": 0.8601907362518856, "rewards/mask_iou_reward": 0.4300953681259428, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.088674247264862, "rewards/thk_ans_format_reward": 1.0, "step": 791, "think_completion_length": 7.666666666666666 }, { "clip_ratio": 0.0, "completion_length": 123.63542175292969, "epoch": 2.6745362563237776, "grad_norm": 41.51262508899244, "kl": 0.599609375, "learning_rate": 7.77027027027027e-07, "loss": 0.0006, "reward": 3.345741033554077, "reward_std": 0.09703287482261658, "rewards/final_reward": 1.1597580532708955, "rewards/mask_iou_reward": 0.5798790266354478, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3457409143447876, "rewards/thk_ans_format_reward": 1.0, "step": 792, "think_completion_length": 11.375 }, { "clip_ratio": 0.0, "completion_length": 121.09375, "epoch": 2.6779089376053964, "grad_norm": 7.769022810438781, "kl": 0.654296875, "learning_rate": 7.767454954954955e-07, "loss": 0.0007, "reward": 3.578448176383972, "reward_std": 0.07281693629920483, "rewards/final_reward": 1.2307804240001008, "rewards/mask_iou_reward": 0.6153902120000504, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5784481167793274, "rewards/thk_ans_format_reward": 1.0, "step": 793, "think_completion_length": 9.0 }, { "clip_ratio": 0.0, "completion_length": 119.22916793823242, "epoch": 2.681281618887015, "grad_norm": 15.460994244582118, "kl": 0.578125, "learning_rate": 7.76463963963964e-07, "loss": 0.0006, "reward": 3.132830500602722, "reward_std": 0.18471045047044754, "rewards/final_reward": 1.4256940575311807, "rewards/mask_iou_reward": 0.7128470287655904, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1328304409980774, "rewards/thk_ans_format_reward": 1.0, "step": 794, "think_completion_length": 9.625 }, { "clip_ratio": 0.0, "completion_length": 121.05208587646484, "epoch": 2.684654300168634, "grad_norm": 10.330542806577336, "kl": 0.58203125, "learning_rate": 7.761824324324324e-07, "loss": 0.0006, "reward": 2.9948168992996216, "reward_std": 0.11794888228178024, "rewards/final_reward": 0.36272560535996373, "rewards/mask_iou_reward": 0.18136280267998187, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9948166310787201, "rewards/thk_ans_format_reward": 1.0, "step": 795, "think_completion_length": 9.833333333333332 }, { "clip_ratio": 0.0, "completion_length": 120.75000381469727, "epoch": 2.688026981450253, "grad_norm": 8.850614484587357, "kl": 0.578125, "learning_rate": 7.759009009009009e-07, "loss": 0.0006, "reward": 3.410898804664612, "reward_std": 0.031522348057478666, "rewards/final_reward": 1.9422227818541504, "rewards/mask_iou_reward": 0.9711113909270752, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4108988046646118, "rewards/thk_ans_format_reward": 1.0, "step": 796, "think_completion_length": 6.875 }, { "clip_ratio": 0.0, "completion_length": 124.875, "epoch": 2.6913996627318717, "grad_norm": 40.152092093398544, "kl": 0.6640625, "learning_rate": 7.756193693693694e-07, "loss": 0.0007, "reward": 3.6436657905578613, "reward_std": 0.10985162109136581, "rewards/final_reward": 1.8214199167549583, "rewards/mask_iou_reward": 0.9107099583774791, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6436654925346375, "rewards/thk_ans_format_reward": 1.0, "step": 797, "think_completion_length": 9.291666666666668 }, { "clip_ratio": 0.0, "completion_length": 133.71875762939453, "epoch": 2.694772344013491, "grad_norm": 10.087858747837071, "kl": 0.537109375, "learning_rate": 7.753378378378378e-07, "loss": 0.0006, "reward": 3.4809383153915405, "reward_std": 0.08559620007872581, "rewards/final_reward": 0.6748064299090106, "rewards/mask_iou_reward": 0.3374032149545053, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.48093843460083, "rewards/thk_ans_format_reward": 1.0, "step": 798, "think_completion_length": 9.25 }, { "clip_ratio": 0.0, "completion_length": 139.15625381469727, "epoch": 2.6981450252951094, "grad_norm": 18.52801654982961, "kl": 0.548828125, "learning_rate": 7.750563063063063e-07, "loss": 0.0006, "reward": 3.5452533960342407, "reward_std": 0.1478636972606182, "rewards/final_reward": 1.5319860973186294, "rewards/mask_iou_reward": 0.7659930486593147, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.545253336429596, "rewards/thk_ans_format_reward": 1.0, "step": 799, "think_completion_length": 9.166666666666668 }, { "clip_ratio": 0.0, "completion_length": 121.13542175292969, "epoch": 2.7015177065767286, "grad_norm": 10.657004525322808, "kl": 0.58203125, "learning_rate": 7.747747747747747e-07, "loss": 0.0006, "reward": 3.6775232553482056, "reward_std": 0.09374432638287544, "rewards/final_reward": 1.8238359214270736, "rewards/mask_iou_reward": 0.9119179607135368, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.677523136138916, "rewards/thk_ans_format_reward": 1.0, "step": 800, "think_completion_length": 10.041666666666668 }, { "clip_ratio": 0.0, "completion_length": 154.4166717529297, "epoch": 2.7048903878583475, "grad_norm": 11.342698990143404, "kl": 0.552734375, "learning_rate": 7.744932432432432e-07, "loss": 0.0006, "reward": 2.8509023189544678, "reward_std": 0.1925317421555519, "rewards/final_reward": 1.3280640382980335, "rewards/mask_iou_reward": 0.6640320191490168, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.850902259349823, "rewards/thk_ans_format_reward": 1.0, "step": 801, "think_completion_length": 9.708333333333334 }, { "clip_ratio": 0.0, "completion_length": 119.60417175292969, "epoch": 2.7082630691399663, "grad_norm": 9.082344751225259, "kl": 0.69140625, "learning_rate": 7.742117117117117e-07, "loss": 0.0007, "reward": 3.264283299446106, "reward_std": 0.0901465336792171, "rewards/final_reward": 1.0489787386958307, "rewards/mask_iou_reward": 0.5244893693479153, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2642829716205597, "rewards/thk_ans_format_reward": 1.0, "step": 802, "think_completion_length": 9.25 }, { "clip_ratio": 0.0, "completion_length": 153.3229217529297, "epoch": 2.711635750421585, "grad_norm": 20.722170032828355, "kl": 0.486328125, "learning_rate": 7.739301801801802e-07, "loss": 0.0005, "reward": 3.3872246742248535, "reward_std": 0.10432956367731094, "rewards/final_reward": 1.1002763490882639, "rewards/mask_iou_reward": 0.5501381745441319, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3872247338294983, "rewards/thk_ans_format_reward": 1.0, "step": 803, "think_completion_length": 7.791666666666667 }, { "clip_ratio": 0.0, "completion_length": 121.33333587646484, "epoch": 2.715008431703204, "grad_norm": 7.361199516507246, "kl": 0.58203125, "learning_rate": 7.736486486486487e-07, "loss": 0.0006, "reward": 3.1800429821014404, "reward_std": 0.07173176482319832, "rewards/final_reward": 1.3100901139880954, "rewards/mask_iou_reward": 0.6550450569940477, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1800428628921509, "rewards/thk_ans_format_reward": 1.0, "step": 804, "think_completion_length": 7.666666666666666 }, { "clip_ratio": 0.0, "completion_length": 118.66666793823242, "epoch": 2.718381112984823, "grad_norm": 9.78121969722757, "kl": 0.578125, "learning_rate": 7.733671171171171e-07, "loss": 0.0006, "reward": 3.287275791168213, "reward_std": 0.06260454282164574, "rewards/final_reward": 1.7459222437466897, "rewards/mask_iou_reward": 0.8729611218733448, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.287275791168213, "rewards/thk_ans_format_reward": 1.0, "step": 805, "think_completion_length": 7.583333333333334 }, { "clip_ratio": 0.0, "completion_length": 120.63541793823242, "epoch": 2.7217537942664416, "grad_norm": 10.8247555149658, "kl": 0.658203125, "learning_rate": 7.730855855855856e-07, "loss": 0.0007, "reward": 3.0468918085098267, "reward_std": 0.18924224376678467, "rewards/final_reward": 1.2246028478679378, "rewards/mask_iou_reward": 0.6123014239339689, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.046891689300537, "rewards/thk_ans_format_reward": 1.0, "step": 806, "think_completion_length": 8.291666666666668 }, { "clip_ratio": 0.0, "completion_length": 120.56250381469727, "epoch": 2.725126475548061, "grad_norm": 17.086063759084684, "kl": 1.6005859375, "learning_rate": 7.728040540540541e-07, "loss": 0.0016, "reward": 3.525339722633362, "reward_std": 0.03663340024650097, "rewards/final_reward": 1.7980175535130674, "rewards/mask_iou_reward": 0.8990087767565337, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.525339961051941, "rewards/thk_ans_format_reward": 1.0, "step": 807, "think_completion_length": 10.916666666666668 }, { "clip_ratio": 0.0, "completion_length": 120.34375, "epoch": 2.7284991568296797, "grad_norm": 7.833542813069643, "kl": 0.62890625, "learning_rate": 7.725225225225225e-07, "loss": 0.0006, "reward": 3.1154762506484985, "reward_std": 0.04759081266820431, "rewards/final_reward": 0.8310892527079872, "rewards/mask_iou_reward": 0.4155446263539936, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.115476131439209, "rewards/thk_ans_format_reward": 1.0, "step": 808, "think_completion_length": 9.666666666666668 }, { "clip_ratio": 0.0, "completion_length": 124.48958969116211, "epoch": 2.7318718381112985, "grad_norm": 9.54137374323096, "kl": 0.544921875, "learning_rate": 7.722409909909909e-07, "loss": 0.0006, "reward": 3.054728627204895, "reward_std": 0.10850285552442074, "rewards/final_reward": 1.290442879401124, "rewards/mask_iou_reward": 0.645221439700562, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0547286868095398, "rewards/thk_ans_format_reward": 1.0, "step": 809, "think_completion_length": 9.041666666666668 }, { "clip_ratio": 0.0, "completion_length": 124.55208587646484, "epoch": 2.7352445193929174, "grad_norm": 11.466989004969372, "kl": 0.564453125, "learning_rate": 7.719594594594593e-07, "loss": 0.0006, "reward": 3.473750948905945, "reward_std": 0.1244993582367897, "rewards/final_reward": 0.9779945483246706, "rewards/mask_iou_reward": 0.4889972741623353, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4737508893013, "rewards/thk_ans_format_reward": 1.0, "step": 810, "think_completion_length": 8.833333333333332 }, { "clip_ratio": 0.0, "completion_length": 118.77083587646484, "epoch": 2.738617200674536, "grad_norm": 6.899361597270846, "kl": 0.58203125, "learning_rate": 7.716779279279278e-07, "loss": 0.0006, "reward": 3.7204372882843018, "reward_std": 0.06413896754384041, "rewards/final_reward": 1.8257265424655205, "rewards/mask_iou_reward": 0.9128632712327602, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7204372882843018, "rewards/thk_ans_format_reward": 1.0, "step": 811, "think_completion_length": 6.625 }, { "clip_ratio": 0.0, "completion_length": 118.70833587646484, "epoch": 2.741989881956155, "grad_norm": 26.308406144808927, "kl": 0.6171875, "learning_rate": 7.713963963963963e-07, "loss": 0.0006, "reward": 3.508059859275818, "reward_std": 0.12611358240246773, "rewards/final_reward": 1.4690036592278546, "rewards/mask_iou_reward": 0.7345018296139273, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.508059799671173, "rewards/thk_ans_format_reward": 1.0, "step": 812, "think_completion_length": 8.625 }, { "clip_ratio": 0.0, "completion_length": 135.87500381469727, "epoch": 2.745362563237774, "grad_norm": 12.883486258773985, "kl": 0.634765625, "learning_rate": 7.711148648648648e-07, "loss": 0.0006, "reward": 3.470430374145508, "reward_std": 0.1318624820560217, "rewards/final_reward": 1.7007833258640779, "rewards/mask_iou_reward": 0.8503916629320389, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4704301953315735, "rewards/thk_ans_format_reward": 1.0, "step": 813, "think_completion_length": 7.125 }, { "clip_ratio": 0.0, "completion_length": 118.57292175292969, "epoch": 2.748735244519393, "grad_norm": 33.01004357932649, "kl": 0.607421875, "learning_rate": 7.708333333333333e-07, "loss": 0.0006, "reward": 3.14322030544281, "reward_std": 0.16609105467796326, "rewards/final_reward": 1.6312836242368816, "rewards/mask_iou_reward": 0.8156418121184408, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.14322030544281, "rewards/thk_ans_format_reward": 1.0, "step": 814, "think_completion_length": 9.833333333333332 }, { "clip_ratio": 0.0, "completion_length": 116.04167175292969, "epoch": 2.752107925801012, "grad_norm": 11.569718364209079, "kl": 0.54296875, "learning_rate": 7.705518018018018e-07, "loss": 0.0006, "reward": 3.368572950363159, "reward_std": 0.09339471906423569, "rewards/final_reward": 1.6949728648788742, "rewards/mask_iou_reward": 0.8474864324394371, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3685729503631592, "rewards/thk_ans_format_reward": 1.0, "step": 815, "think_completion_length": 8.875 }, { "clip_ratio": 0.0, "completion_length": 117.89583587646484, "epoch": 2.7554806070826308, "grad_norm": 10.47686924222413, "kl": 0.609375, "learning_rate": 7.702702702702702e-07, "loss": 0.0006, "reward": 3.353748917579651, "reward_std": 0.06471065618097782, "rewards/final_reward": 0.9686143763127055, "rewards/mask_iou_reward": 0.48430718815635276, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3537487983703613, "rewards/thk_ans_format_reward": 1.0, "step": 816, "think_completion_length": 7.666666666666666 }, { "clip_ratio": 0.0, "completion_length": 117.36458587646484, "epoch": 2.7588532883642496, "grad_norm": 14.196036160401253, "kl": 0.56640625, "learning_rate": 7.699887387387387e-07, "loss": 0.0006, "reward": 3.155430555343628, "reward_std": 0.17521775886416435, "rewards/final_reward": 1.5953339400653568, "rewards/mask_iou_reward": 0.7976669700326784, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1554304361343384, "rewards/thk_ans_format_reward": 1.0, "step": 817, "think_completion_length": 8.291666666666668 }, { "clip_ratio": 0.0, "completion_length": 121.17708587646484, "epoch": 2.7622259696458684, "grad_norm": 8.868325037864638, "kl": 0.630859375, "learning_rate": 7.697072072072071e-07, "loss": 0.0006, "reward": 3.2271395921707153, "reward_std": 0.1283954232931137, "rewards/final_reward": 1.859811171144015, "rewards/mask_iou_reward": 0.9299055855720075, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2271394431591034, "rewards/thk_ans_format_reward": 1.0, "step": 818, "think_completion_length": 7.666666666666667 }, { "clip_ratio": 0.0, "completion_length": 119.84375381469727, "epoch": 2.7655986509274872, "grad_norm": 10.61611861966689, "kl": 0.6015625, "learning_rate": 7.694256756756756e-07, "loss": 0.0006, "reward": 3.309920072555542, "reward_std": 0.09556181728839874, "rewards/final_reward": 1.5073089264477035, "rewards/mask_iou_reward": 0.7536544632238518, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.309919834136963, "rewards/thk_ans_format_reward": 1.0, "step": 819, "think_completion_length": 6.916666666666666 }, { "clip_ratio": 0.0, "completion_length": 118.68750381469727, "epoch": 2.768971332209106, "grad_norm": 23.21356291600513, "kl": 0.58984375, "learning_rate": 7.691441441441441e-07, "loss": 0.0006, "reward": 3.2571535110473633, "reward_std": 0.09767593070864677, "rewards/final_reward": 1.4686262349494574, "rewards/mask_iou_reward": 0.7343131174747287, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2571535110473633, "rewards/thk_ans_format_reward": 1.0, "step": 820, "think_completion_length": 7.416666666666667 }, { "clip_ratio": 0.0, "completion_length": 118.82291793823242, "epoch": 2.7723440134907253, "grad_norm": 18.56850560046818, "kl": 0.611328125, "learning_rate": 7.688626126126125e-07, "loss": 0.0006, "reward": 3.1783725023269653, "reward_std": 0.12528940849006176, "rewards/final_reward": 0.9751798189706983, "rewards/mask_iou_reward": 0.48758990948534914, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1783722639083862, "rewards/thk_ans_format_reward": 1.0, "step": 821, "think_completion_length": 7.791666666666667 }, { "clip_ratio": 0.0, "completion_length": 120.22916793823242, "epoch": 2.775716694772344, "grad_norm": 11.552415569645232, "kl": 0.578125, "learning_rate": 7.68581081081081e-07, "loss": 0.0006, "reward": 3.4517630338668823, "reward_std": 0.09905361756682396, "rewards/final_reward": 1.0850289802981778, "rewards/mask_iou_reward": 0.5425144901490889, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4517630338668823, "rewards/thk_ans_format_reward": 1.0, "step": 822, "think_completion_length": 8.666666666666666 }, { "clip_ratio": 0.0, "completion_length": 107.69791793823242, "epoch": 2.779089376053963, "grad_norm": 14.11041251237749, "kl": 0.67578125, "learning_rate": 7.682995495495495e-07, "loss": 0.0007, "reward": 3.6109098196029663, "reward_std": 0.15111944265663624, "rewards/final_reward": 1.4433314059687583, "rewards/mask_iou_reward": 0.7216657029843792, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.610909879207611, "rewards/thk_ans_format_reward": 1.0, "step": 823, "think_completion_length": 6.916666666666666 }, { "clip_ratio": 0.0, "completion_length": 119.13541793823242, "epoch": 2.782462057335582, "grad_norm": 19.407652730449964, "kl": 0.59375, "learning_rate": 7.68018018018018e-07, "loss": 0.0006, "reward": 3.5962259769439697, "reward_std": 0.09219707362353802, "rewards/final_reward": 1.0636648632991816, "rewards/mask_iou_reward": 0.5318324316495908, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5962256789207458, "rewards/thk_ans_format_reward": 1.0, "step": 824, "think_completion_length": 8.916666666666668 }, { "clip_ratio": 0.0, "completion_length": 119.45833587646484, "epoch": 2.7858347386172007, "grad_norm": 48.76370678893969, "kl": 0.61328125, "learning_rate": 7.677364864864865e-07, "loss": 0.0006, "reward": 3.2339768409729004, "reward_std": 0.021290178410708904, "rewards/final_reward": 0.9685718969102269, "rewards/mask_iou_reward": 0.48428594845511347, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2339767217636108, "rewards/thk_ans_format_reward": 1.0, "step": 825, "think_completion_length": 7.75 }, { "clip_ratio": 0.0, "completion_length": 120.19791793823242, "epoch": 2.7892074198988195, "grad_norm": 23.00837890523965, "kl": 0.615234375, "learning_rate": 7.674549549549549e-07, "loss": 0.0006, "reward": 3.3137834072113037, "reward_std": 0.12772230803966522, "rewards/final_reward": 0.8328103990514315, "rewards/mask_iou_reward": 0.41640519952571575, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3137832880020142, "rewards/thk_ans_format_reward": 1.0, "step": 826, "think_completion_length": 7.875 }, { "clip_ratio": 0.0, "completion_length": 125.57291793823242, "epoch": 2.7925801011804383, "grad_norm": 10.93210808105748, "kl": 0.599609375, "learning_rate": 7.671734234234234e-07, "loss": 0.0006, "reward": 3.1316012144088745, "reward_std": 0.13967030495405197, "rewards/final_reward": 0.7290396522833856, "rewards/mask_iou_reward": 0.3645198261416928, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1316012740135193, "rewards/thk_ans_format_reward": 1.0, "step": 827, "think_completion_length": 8.333333333333334 }, { "clip_ratio": 0.0, "completion_length": 125.54166793823242, "epoch": 2.7959527824620576, "grad_norm": 7.509396937254856, "kl": 0.576171875, "learning_rate": 7.668918918918918e-07, "loss": 0.0006, "reward": 3.354486584663391, "reward_std": 0.09209609404206276, "rewards/final_reward": 0.9352793145560208, "rewards/mask_iou_reward": 0.4676396572780104, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3544864654541016, "rewards/thk_ans_format_reward": 1.0, "step": 828, "think_completion_length": 7.416666666666666 }, { "clip_ratio": 0.0, "completion_length": 119.16666793823242, "epoch": 2.799325463743676, "grad_norm": 19.71623898667479, "kl": 0.59765625, "learning_rate": 7.666103603603603e-07, "loss": 0.0006, "reward": 3.440225601196289, "reward_std": 0.15994113497436047, "rewards/final_reward": 1.4639581496698817, "rewards/mask_iou_reward": 0.7319790748349408, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.440225601196289, "rewards/thk_ans_format_reward": 1.0, "step": 829, "think_completion_length": 7.208333333333333 }, { "clip_ratio": 0.0, "completion_length": 119.73958587646484, "epoch": 2.8026981450252952, "grad_norm": 7.641932508738693, "kl": 0.974609375, "learning_rate": 7.663288288288288e-07, "loss": 0.001, "reward": 3.4703879356384277, "reward_std": 0.06228171847760677, "rewards/final_reward": 0.9386934150923482, "rewards/mask_iou_reward": 0.4693467075461741, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4703876972198486, "rewards/thk_ans_format_reward": 1.0, "step": 830, "think_completion_length": 7.625 }, { "clip_ratio": 0.0, "completion_length": 118.61458587646484, "epoch": 2.806070826306914, "grad_norm": 8.790813148317039, "kl": 0.5625, "learning_rate": 7.660472972972972e-07, "loss": 0.0006, "reward": 3.3965048789978027, "reward_std": 0.12809063494205475, "rewards/final_reward": 1.6194442213451081, "rewards/mask_iou_reward": 0.8097221106725541, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3965047597885132, "rewards/thk_ans_format_reward": 1.0, "step": 831, "think_completion_length": 7.5 }, { "clip_ratio": 0.0, "completion_length": 120.71875381469727, "epoch": 2.809443507588533, "grad_norm": 11.087230061558003, "kl": 0.6015625, "learning_rate": 7.657657657657657e-07, "loss": 0.0006, "reward": 2.8211324214935303, "reward_std": 0.06655344553291798, "rewards/final_reward": 0.9339966788083726, "rewards/mask_iou_reward": 0.4669983394041863, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.821132242679596, "rewards/thk_ans_format_reward": 1.0, "step": 832, "think_completion_length": 5.75 }, { "clip_ratio": 0.0, "completion_length": 118.75000381469727, "epoch": 2.8128161888701517, "grad_norm": 11.599365716013358, "kl": 0.6171875, "learning_rate": 7.654842342342343e-07, "loss": 0.0006, "reward": 3.1649749279022217, "reward_std": 0.1306835636496544, "rewards/final_reward": 1.1254830358712231, "rewards/mask_iou_reward": 0.5627415179356116, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1649749279022217, "rewards/thk_ans_format_reward": 1.0, "step": 833, "think_completion_length": 8.0 }, { "clip_ratio": 0.0, "completion_length": 110.16666793823242, "epoch": 2.8161888701517706, "grad_norm": 34.09302576043981, "kl": 0.658203125, "learning_rate": 7.652027027027027e-07, "loss": 0.0007, "reward": 3.0825788974761963, "reward_std": 0.19801976531744003, "rewards/final_reward": 1.8678686666522455, "rewards/mask_iou_reward": 0.9339343333261227, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0825787782669067, "rewards/thk_ans_format_reward": 1.0, "step": 834, "think_completion_length": 7.541666666666666 }, { "clip_ratio": 0.0, "completion_length": 114.60416793823242, "epoch": 2.8195615514333894, "grad_norm": 6.514989494359038, "kl": 0.638671875, "learning_rate": 7.649211711711712e-07, "loss": 0.0006, "reward": 3.1945159435272217, "reward_std": 0.21621747314929962, "rewards/final_reward": 1.2842385429337646, "rewards/mask_iou_reward": 0.6421192714668823, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1945159435272217, "rewards/thk_ans_format_reward": 1.0, "step": 835, "think_completion_length": 7.791666666666666 }, { "clip_ratio": 0.0, "completion_length": 143.25, "epoch": 2.822934232715008, "grad_norm": 12.012866329193594, "kl": 0.71875, "learning_rate": 7.646396396396396e-07, "loss": 0.0007, "reward": 3.374995231628418, "reward_std": 0.10654059797525406, "rewards/final_reward": 0.9926595462633434, "rewards/mask_iou_reward": 0.4963297731316717, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3749953508377075, "rewards/thk_ans_format_reward": 1.0, "step": 836, "think_completion_length": 8.083333333333334 }, { "clip_ratio": 0.0, "completion_length": 124.05208587646484, "epoch": 2.8263069139966275, "grad_norm": 12.41159393978972, "kl": 0.533203125, "learning_rate": 7.643581081081081e-07, "loss": 0.0005, "reward": 3.1620161533355713, "reward_std": 0.11073334142565727, "rewards/final_reward": 1.1209113472967716, "rewards/mask_iou_reward": 0.5604556736483858, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1620161533355713, "rewards/thk_ans_format_reward": 1.0, "step": 837, "think_completion_length": 7.625 }, { "clip_ratio": 0.0, "completion_length": 133.93750762939453, "epoch": 2.8296795952782463, "grad_norm": 31.69858166724224, "kl": 0.55859375, "learning_rate": 7.640765765765766e-07, "loss": 0.0006, "reward": 3.7063937187194824, "reward_std": 0.14060832560062408, "rewards/final_reward": 1.7010022958199755, "rewards/mask_iou_reward": 0.8505011479099878, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7063936591148376, "rewards/thk_ans_format_reward": 1.0, "step": 838, "think_completion_length": 5.791666666666666 }, { "clip_ratio": 0.0, "completion_length": 135.8958396911621, "epoch": 2.833052276559865, "grad_norm": 16.940299380888987, "kl": 0.580078125, "learning_rate": 7.63795045045045e-07, "loss": 0.0006, "reward": 3.263391137123108, "reward_std": 0.27852288633584976, "rewards/final_reward": 1.2191613669587613, "rewards/mask_iou_reward": 0.6095806834793807, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 1.2738077640533447, "rewards/thk_ans_format_reward": 1.0, "step": 839, "think_completion_length": 9.458333333333334 }, { "clip_ratio": 0.0, "completion_length": 119.56250381469727, "epoch": 2.836424957841484, "grad_norm": 9.3217946169954, "kl": 0.5703125, "learning_rate": 7.635135135135135e-07, "loss": 0.0006, "reward": 3.2963112592697144, "reward_std": 0.2867426946759224, "rewards/final_reward": 1.3462905702697516, "rewards/mask_iou_reward": 0.6731452851348758, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.29631108045578, "rewards/thk_ans_format_reward": 1.0, "step": 840, "think_completion_length": 8.375 }, { "clip_ratio": 0.0, "completion_length": 122.23958587646484, "epoch": 2.839797639123103, "grad_norm": 6.720125561600698, "kl": 0.64453125, "learning_rate": 7.632319819819819e-07, "loss": 0.0006, "reward": 3.3049758672714233, "reward_std": 0.1726619452238083, "rewards/final_reward": 1.7811929357135363, "rewards/mask_iou_reward": 0.8905964678567682, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3049756586551666, "rewards/thk_ans_format_reward": 1.0, "step": 841, "think_completion_length": 6.0 }, { "clip_ratio": 0.0, "completion_length": 117.30208587646484, "epoch": 2.8431703204047216, "grad_norm": 12.783520229439375, "kl": 0.58984375, "learning_rate": 7.629504504504504e-07, "loss": 0.0006, "reward": 3.605306386947632, "reward_std": 0.09047617763280869, "rewards/final_reward": 1.752034756424516, "rewards/mask_iou_reward": 0.876017378212258, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6053063869476318, "rewards/thk_ans_format_reward": 1.0, "step": 842, "think_completion_length": 5.916666666666666 }, { "clip_ratio": 0.0, "completion_length": 117.61458587646484, "epoch": 2.8465430016863404, "grad_norm": 13.67923872207617, "kl": 0.666015625, "learning_rate": 7.62668918918919e-07, "loss": 0.0007, "reward": 3.123603582382202, "reward_std": 0.21856746077537537, "rewards/final_reward": 0.8532247935583459, "rewards/mask_iou_reward": 0.42661239677917295, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1236035227775574, "rewards/thk_ans_format_reward": 1.0, "step": 843, "think_completion_length": 6.375 }, { "clip_ratio": 0.0, "completion_length": 117.44792175292969, "epoch": 2.8499156829679597, "grad_norm": 12.297845593512243, "kl": 0.59765625, "learning_rate": 7.623873873873874e-07, "loss": 0.0006, "reward": 3.408300280570984, "reward_std": 0.11788310110569, "rewards/final_reward": 1.5378804551548173, "rewards/mask_iou_reward": 0.7689402275774087, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4083001613616943, "rewards/thk_ans_format_reward": 1.0, "step": 844, "think_completion_length": 5.25 }, { "clip_ratio": 0.0, "completion_length": 118.13542175292969, "epoch": 2.8532883642495785, "grad_norm": 7.898260141271431, "kl": 0.650390625, "learning_rate": 7.621058558558559e-07, "loss": 0.0007, "reward": 3.4168694019317627, "reward_std": 0.0871502235531807, "rewards/final_reward": 1.674952679337213, "rewards/mask_iou_reward": 0.8374763396686065, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.416869342327118, "rewards/thk_ans_format_reward": 1.0, "step": 845, "think_completion_length": 5.791666666666666 }, { "clip_ratio": 0.0, "completion_length": 118.26041793823242, "epoch": 2.8566610455311974, "grad_norm": 18.807496744343155, "kl": 0.634765625, "learning_rate": 7.618243243243244e-07, "loss": 0.0007, "reward": 3.249427914619446, "reward_std": 0.11862549185752869, "rewards/final_reward": 0.9638047950481252, "rewards/mask_iou_reward": 0.4819023975240626, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2494277954101562, "rewards/thk_ans_format_reward": 1.0, "step": 846, "think_completion_length": 6.583333333333333 }, { "clip_ratio": 0.0, "completion_length": 116.76041793823242, "epoch": 2.860033726812816, "grad_norm": 13.577778955368622, "kl": 0.609375, "learning_rate": 7.615427927927928e-07, "loss": 0.0006, "reward": 3.3712258338928223, "reward_std": 0.08537932112812996, "rewards/final_reward": 1.7251943154869958, "rewards/mask_iou_reward": 0.8625971577434979, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3712258338928223, "rewards/thk_ans_format_reward": 1.0, "step": 847, "think_completion_length": 6.333333333333333 }, { "clip_ratio": 0.0, "completion_length": 110.09375, "epoch": 2.863406408094435, "grad_norm": 15.904610578246603, "kl": 0.595703125, "learning_rate": 7.612612612612613e-07, "loss": 0.0006, "reward": 3.2905749082565308, "reward_std": 0.13749309442937374, "rewards/final_reward": 1.7126632806370585, "rewards/mask_iou_reward": 0.8563316403185293, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2905749678611755, "rewards/thk_ans_format_reward": 1.0, "step": 848, "think_completion_length": 5.25 }, { "clip_ratio": 0.0, "completion_length": 106.93750381469727, "epoch": 2.866779089376054, "grad_norm": 44.65435750438078, "kl": 0.658203125, "learning_rate": 7.609797297297296e-07, "loss": 0.0007, "reward": 3.3973218202590942, "reward_std": 0.16537553817033768, "rewards/final_reward": 1.4147304409035975, "rewards/mask_iou_reward": 0.7073652204517987, "rewards/sam_format_reward": 0.9479166865348816, "rewards/sam_reward_func_ultra": 1.4494050741195679, "rewards/thk_ans_format_reward": 1.0, "step": 849, "think_completion_length": 5.833333333333334 }, { "clip_ratio": 0.0, "completion_length": 119.84375, "epoch": 2.8701517706576727, "grad_norm": 114.68917591790206, "kl": 0.619140625, "learning_rate": 7.606981981981981e-07, "loss": 0.0006, "reward": 3.299852728843689, "reward_std": 0.10934684053063393, "rewards/final_reward": 0.6728932611020485, "rewards/mask_iou_reward": 0.33644663055102425, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.299852728843689, "rewards/thk_ans_format_reward": 1.0, "step": 850, "think_completion_length": 7.166666666666666 }, { "clip_ratio": 0.0, "completion_length": 161.3854217529297, "epoch": 2.873524451939292, "grad_norm": 14.365891116065718, "kl": 0.5625, "learning_rate": 7.604166666666666e-07, "loss": 0.0006, "reward": 3.340060830116272, "reward_std": 0.19183791242539883, "rewards/final_reward": 0.27853339711707975, "rewards/mask_iou_reward": 0.13926669855853988, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 1.36089426279068, "rewards/thk_ans_format_reward": 0.9895833432674408, "step": 851, "think_completion_length": 5.333333333333333 }, { "clip_ratio": 0.0, "completion_length": 115.73958587646484, "epoch": 2.876897133220911, "grad_norm": 84.95926295981053, "kl": 0.75, "learning_rate": 7.60135135135135e-07, "loss": 0.0007, "reward": 3.185040235519409, "reward_std": 0.18357310444116592, "rewards/final_reward": 0.8179576324073183, "rewards/mask_iou_reward": 0.40897881620365917, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.18503999710083, "rewards/thk_ans_format_reward": 1.0, "step": 852, "think_completion_length": 7.041666666666667 }, { "clip_ratio": 0.0, "completion_length": 116.94792175292969, "epoch": 2.8802698145025296, "grad_norm": 14.3857169488127, "kl": 0.607421875, "learning_rate": 7.598536036036036e-07, "loss": 0.0006, "reward": 3.0738601684570312, "reward_std": 0.1482534147799015, "rewards/final_reward": 1.5809381835741352, "rewards/mask_iou_reward": 0.7904690917870676, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0738602876663208, "rewards/thk_ans_format_reward": 1.0, "step": 853, "think_completion_length": 5.0 }, { "clip_ratio": 0.0, "completion_length": 122.34375, "epoch": 2.8836424957841484, "grad_norm": 15.143727058052493, "kl": 0.572265625, "learning_rate": 7.59572072072072e-07, "loss": 0.0006, "reward": 3.4103574752807617, "reward_std": 0.18912208080291748, "rewards/final_reward": 1.4438347686562816, "rewards/mask_iou_reward": 0.7219173843281408, "rewards/sam_format_reward": 0.9791666865348816, "rewards/sam_reward_func_ultra": 1.4311907291412354, "rewards/thk_ans_format_reward": 1.0, "step": 854, "think_completion_length": 5.416666666666667 }, { "clip_ratio": 0.0, "completion_length": 120.71875381469727, "epoch": 2.8870151770657673, "grad_norm": 9.044759438493474, "kl": 0.64453125, "learning_rate": 7.592905405405405e-07, "loss": 0.0006, "reward": 3.445785164833069, "reward_std": 0.11425403878092766, "rewards/final_reward": 1.7823204157680697, "rewards/mask_iou_reward": 0.8911602078840348, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4457851648330688, "rewards/thk_ans_format_reward": 1.0, "step": 855, "think_completion_length": 6.25 }, { "clip_ratio": 0.0, "completion_length": 117.75, "epoch": 2.890387858347386, "grad_norm": 6.976236686403784, "kl": 0.576171875, "learning_rate": 7.59009009009009e-07, "loss": 0.0006, "reward": 3.5354377031326294, "reward_std": 0.0927988737821579, "rewards/final_reward": 1.2084556142828462, "rewards/mask_iou_reward": 0.6042278071414231, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5354375839233398, "rewards/thk_ans_format_reward": 1.0, "step": 856, "think_completion_length": 6.916666666666667 }, { "clip_ratio": 0.0, "completion_length": 124.40625762939453, "epoch": 2.893760539629005, "grad_norm": 13.588204117509628, "kl": 0.564453125, "learning_rate": 7.587274774774774e-07, "loss": 0.0006, "reward": 3.5360511541366577, "reward_std": 0.13357886672019958, "rewards/final_reward": 1.4003553047999098, "rewards/mask_iou_reward": 0.7001776523999549, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.536051332950592, "rewards/thk_ans_format_reward": 1.0, "step": 857, "think_completion_length": 7.625 }, { "clip_ratio": 0.0, "completion_length": 118.46875, "epoch": 2.897133220910624, "grad_norm": 7.591543471473635, "kl": 0.544921875, "learning_rate": 7.584459459459459e-07, "loss": 0.0005, "reward": 3.4365715980529785, "reward_std": 0.02841498889029026, "rewards/final_reward": 0.8789773549323615, "rewards/mask_iou_reward": 0.43948867746618075, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.436571717262268, "rewards/thk_ans_format_reward": 1.0, "step": 858, "think_completion_length": 5.583333333333334 }, { "clip_ratio": 0.0, "completion_length": 104.77083587646484, "epoch": 2.9005059021922426, "grad_norm": 69.82960550237168, "kl": 0.638671875, "learning_rate": 7.581644144144143e-07, "loss": 0.0006, "reward": 3.4785648584365845, "reward_std": 0.03144015744328499, "rewards/final_reward": 1.0896791036053899, "rewards/mask_iou_reward": 0.5448395518026949, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4785647988319397, "rewards/thk_ans_format_reward": 1.0, "step": 859, "think_completion_length": 6.083333333333334 }, { "clip_ratio": 0.0, "completion_length": 118.38541793823242, "epoch": 2.903878583473862, "grad_norm": 41.63556782679187, "kl": 0.546875, "learning_rate": 7.578828828828828e-07, "loss": 0.0005, "reward": 3.352591037750244, "reward_std": 0.19262491166591644, "rewards/final_reward": 1.6292080190650648, "rewards/mask_iou_reward": 0.8146040095325324, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3525908589363098, "rewards/thk_ans_format_reward": 1.0, "step": 860, "think_completion_length": 6.208333333333334 }, { "clip_ratio": 0.0, "completion_length": 116.44792175292969, "epoch": 2.9072512647554807, "grad_norm": 17.09162298189723, "kl": 0.58203125, "learning_rate": 7.576013513513513e-07, "loss": 0.0006, "reward": 3.504484534263611, "reward_std": 0.04877541400492191, "rewards/final_reward": 1.386411616863553, "rewards/mask_iou_reward": 0.6932058084317765, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.50448477268219, "rewards/thk_ans_format_reward": 1.0, "step": 861, "think_completion_length": 5.458333333333333 }, { "clip_ratio": 0.0, "completion_length": 118.34375381469727, "epoch": 2.9106239460370995, "grad_norm": 17.334156061583297, "kl": 0.732421875, "learning_rate": 7.573198198198197e-07, "loss": 0.0007, "reward": 3.111849308013916, "reward_std": 0.15942617878317833, "rewards/final_reward": 0.8906268643760444, "rewards/mask_iou_reward": 0.4453134321880222, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1222659349441528, "rewards/thk_ans_format_reward": 0.9895833432674408, "step": 862, "think_completion_length": 7.125 }, { "clip_ratio": 0.0, "completion_length": 119.60416793823242, "epoch": 2.9139966273187183, "grad_norm": 9.828375359131886, "kl": 0.611328125, "learning_rate": 7.570382882882883e-07, "loss": 0.0006, "reward": 3.6048978567123413, "reward_std": 0.08647706173360348, "rewards/final_reward": 1.4054294071757991, "rewards/mask_iou_reward": 0.7027147035878996, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6048980355262756, "rewards/thk_ans_format_reward": 1.0, "step": 863, "think_completion_length": 6.333333333333334 }, { "clip_ratio": 0.0, "completion_length": 117.61458587646484, "epoch": 2.917369308600337, "grad_norm": 14.152492330600408, "kl": 0.8125, "learning_rate": 7.567567567567568e-07, "loss": 0.0008, "reward": 3.552889108657837, "reward_std": 0.06724633555859327, "rewards/final_reward": 1.411230216936757, "rewards/mask_iou_reward": 0.7056151084683785, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5528889298439026, "rewards/thk_ans_format_reward": 1.0, "step": 864, "think_completion_length": 7.166666666666666 }, { "clip_ratio": 0.0, "completion_length": 117.81250381469727, "epoch": 2.920741989881956, "grad_norm": 12.085736518884486, "kl": 0.576171875, "learning_rate": 7.564752252252252e-07, "loss": 0.0006, "reward": 3.794761896133423, "reward_std": 0.0494751688092947, "rewards/final_reward": 1.8719720376871902, "rewards/mask_iou_reward": 0.9359860188435951, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.794761836528778, "rewards/thk_ans_format_reward": 1.0, "step": 865, "think_completion_length": 5.458333333333334 }, { "clip_ratio": 0.0, "completion_length": 136.80208587646484, "epoch": 2.924114671163575, "grad_norm": 10.394094315559089, "kl": 0.591796875, "learning_rate": 7.561936936936937e-07, "loss": 0.0006, "reward": 3.1822298765182495, "reward_std": 0.09092389792203903, "rewards/final_reward": 0.8456114424522805, "rewards/mask_iou_reward": 0.42280572122614024, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1822299361228943, "rewards/thk_ans_format_reward": 1.0, "step": 866, "think_completion_length": 6.458333333333334 }, { "clip_ratio": 0.0, "completion_length": 117.58333587646484, "epoch": 2.927487352445194, "grad_norm": 10.304628611502135, "kl": 0.728515625, "learning_rate": 7.559121621621621e-07, "loss": 0.0007, "reward": 3.419139266014099, "reward_std": 0.14908288419246674, "rewards/final_reward": 0.6543206464945732, "rewards/mask_iou_reward": 0.3271603232472866, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4191393852233887, "rewards/thk_ans_format_reward": 1.0, "step": 867, "think_completion_length": 5.583333333333334 }, { "clip_ratio": 0.0, "completion_length": 129.71875762939453, "epoch": 2.930860033726813, "grad_norm": 13.912370115835516, "kl": 0.58203125, "learning_rate": 7.556306306306306e-07, "loss": 0.0006, "reward": 3.4562323093414307, "reward_std": 0.11177996918559074, "rewards/final_reward": 1.6936401492810278, "rewards/mask_iou_reward": 0.8468200746405139, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4562321901321411, "rewards/thk_ans_format_reward": 1.0, "step": 868, "think_completion_length": 5.708333333333333 }, { "clip_ratio": 0.0, "completion_length": 122.64583969116211, "epoch": 2.9342327150084317, "grad_norm": 5.671121715929217, "kl": 0.58203125, "learning_rate": 7.553490990990991e-07, "loss": 0.0006, "reward": 3.084256410598755, "reward_std": 0.11680587381124496, "rewards/final_reward": 1.2962688239052136, "rewards/mask_iou_reward": 0.6481344119526068, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0842564702033997, "rewards/thk_ans_format_reward": 1.0, "step": 869, "think_completion_length": 5.833333333333333 }, { "clip_ratio": 0.0, "completion_length": 107.05208587646484, "epoch": 2.9376053962900506, "grad_norm": 13.12728442117305, "kl": 0.60546875, "learning_rate": 7.550675675675675e-07, "loss": 0.0006, "reward": 3.352104663848877, "reward_std": 0.062287621200084686, "rewards/final_reward": 1.205827396942925, "rewards/mask_iou_reward": 0.6029136984714625, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.352104663848877, "rewards/thk_ans_format_reward": 1.0, "step": 870, "think_completion_length": 5.666666666666666 }, { "clip_ratio": 0.0, "completion_length": 117.44791793823242, "epoch": 2.9409780775716694, "grad_norm": 22.34161037724869, "kl": 0.591796875, "learning_rate": 7.54786036036036e-07, "loss": 0.0006, "reward": 3.1144481897354126, "reward_std": 0.09380730241537094, "rewards/final_reward": 0.7915381221938045, "rewards/mask_iou_reward": 0.39576906109690224, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1144481599330902, "rewards/thk_ans_format_reward": 1.0, "step": 871, "think_completion_length": 5.333333333333333 }, { "clip_ratio": 0.0, "completion_length": 117.97916793823242, "epoch": 2.9443507588532882, "grad_norm": 13.595642606692284, "kl": 0.607421875, "learning_rate": 7.545045045045044e-07, "loss": 0.0006, "reward": 3.6105018854141235, "reward_std": 0.15074967592954636, "rewards/final_reward": 1.491989261637721, "rewards/mask_iou_reward": 0.7459946308188605, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6105021238327026, "rewards/thk_ans_format_reward": 1.0, "step": 872, "think_completion_length": 6.375 }, { "clip_ratio": 0.0, "completion_length": 117.01041793823242, "epoch": 2.947723440134907, "grad_norm": 15.298145363684752, "kl": 0.62109375, "learning_rate": 7.54222972972973e-07, "loss": 0.0006, "reward": 3.4203200340270996, "reward_std": 0.11649902537465096, "rewards/final_reward": 1.252674476480225, "rewards/mask_iou_reward": 0.6263372382401124, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.42031991481781, "rewards/thk_ans_format_reward": 1.0, "step": 873, "think_completion_length": 6.125 }, { "clip_ratio": 0.0, "completion_length": 186.15625762939453, "epoch": 2.9510961214165263, "grad_norm": 6.164036490191176, "kl": 0.76171875, "learning_rate": 7.539414414414415e-07, "loss": 0.0008, "reward": 3.377174139022827, "reward_std": 0.25197170674800873, "rewards/final_reward": 0.8577654543055984, "rewards/mask_iou_reward": 0.4288827271527992, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 1.387590765953064, "rewards/thk_ans_format_reward": 1.0, "step": 874, "think_completion_length": 6.041666666666666 }, { "clip_ratio": 0.0, "completion_length": 142.56250762939453, "epoch": 2.954468802698145, "grad_norm": 7.4405944010792116, "kl": 0.56640625, "learning_rate": 7.536599099099099e-07, "loss": 0.0006, "reward": 3.17443311214447, "reward_std": 0.17868845723569393, "rewards/final_reward": 0.5799214539574395, "rewards/mask_iou_reward": 0.2899607269787198, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1744331121444702, "rewards/thk_ans_format_reward": 1.0, "step": 875, "think_completion_length": 5.75 }, { "clip_ratio": 0.0, "completion_length": 118.69792175292969, "epoch": 2.957841483979764, "grad_norm": 18.856636432233937, "kl": 0.55859375, "learning_rate": 7.533783783783784e-07, "loss": 0.0006, "reward": 3.4503493309020996, "reward_std": 0.14452160894870758, "rewards/final_reward": 1.6115709043227955, "rewards/mask_iou_reward": 0.8057854521613977, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4503492712974548, "rewards/thk_ans_format_reward": 1.0, "step": 876, "think_completion_length": 8.291666666666666 }, { "clip_ratio": 0.0, "completion_length": 170.64583587646484, "epoch": 2.961214165261383, "grad_norm": 6.072809246494209, "kl": 0.5546875, "learning_rate": 7.530968468468468e-07, "loss": 0.0006, "reward": 3.4942249059677124, "reward_std": 0.3860067129135132, "rewards/final_reward": 1.5278986439451647, "rewards/mask_iou_reward": 0.7639493219725824, "rewards/sam_format_reward": 0.9687500298023224, "rewards/sam_reward_func_ultra": 1.5254749059677124, "rewards/thk_ans_format_reward": 1.0, "step": 877, "think_completion_length": 5.791666666666667 }, { "clip_ratio": 0.0, "completion_length": 129.39583587646484, "epoch": 2.9645868465430016, "grad_norm": 33.280175809232574, "kl": 0.541015625, "learning_rate": 7.528153153153153e-07, "loss": 0.0006, "reward": 3.324143171310425, "reward_std": 0.04724482260644436, "rewards/final_reward": 0.8673007401404467, "rewards/mask_iou_reward": 0.43365037007022333, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.324143260717392, "rewards/thk_ans_format_reward": 1.0, "step": 878, "think_completion_length": 6.583333333333333 }, { "clip_ratio": 0.0, "completion_length": 133.7291717529297, "epoch": 2.9679595278246205, "grad_norm": 13.072328707508886, "kl": 0.5078125, "learning_rate": 7.525337837837838e-07, "loss": 0.0005, "reward": 3.472671866416931, "reward_std": 0.11400551535189152, "rewards/final_reward": 1.63759505136584, "rewards/mask_iou_reward": 0.81879752568292, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4726718068122864, "rewards/thk_ans_format_reward": 1.0, "step": 879, "think_completion_length": 6.708333333333334 }, { "clip_ratio": 0.0, "completion_length": 116.64583587646484, "epoch": 2.9713322091062393, "grad_norm": 17.45267800201795, "kl": 0.57421875, "learning_rate": 7.522522522522522e-07, "loss": 0.0006, "reward": 3.4755067825317383, "reward_std": 0.14281757548451424, "rewards/final_reward": 1.3163042838611094, "rewards/mask_iou_reward": 0.6581521419305547, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.475506603717804, "rewards/thk_ans_format_reward": 1.0, "step": 880, "think_completion_length": 6.666666666666666 }, { "clip_ratio": 0.0, "completion_length": 118.42708587646484, "epoch": 2.9747048903878586, "grad_norm": 21.418213924625412, "kl": 0.708984375, "learning_rate": 7.519707207207207e-07, "loss": 0.0007, "reward": 3.1879799365997314, "reward_std": 0.05884386505931616, "rewards/final_reward": 1.182249554893902, "rewards/mask_iou_reward": 0.591124777446951, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1879799962043762, "rewards/thk_ans_format_reward": 1.0, "step": 881, "think_completion_length": 6.083333333333334 }, { "clip_ratio": 0.0, "completion_length": 122.31250381469727, "epoch": 2.9780775716694774, "grad_norm": 20.640043092169407, "kl": 0.578125, "learning_rate": 7.516891891891891e-07, "loss": 0.0006, "reward": 3.0795818567276, "reward_std": 0.10945974290370941, "rewards/final_reward": 0.5469563823219387, "rewards/mask_iou_reward": 0.27347819116096933, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0795818269252777, "rewards/thk_ans_format_reward": 1.0, "step": 882, "think_completion_length": 6.458333333333334 }, { "clip_ratio": 0.0, "completion_length": 118.18750381469727, "epoch": 2.9814502529510962, "grad_norm": 11.876012262077612, "kl": 0.591796875, "learning_rate": 7.514076576576577e-07, "loss": 0.0006, "reward": 3.385973572731018, "reward_std": 0.05023301625624299, "rewards/final_reward": 1.2205781767959176, "rewards/mask_iou_reward": 0.6102890883979588, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3859735131263733, "rewards/thk_ans_format_reward": 1.0, "step": 883, "think_completion_length": 5.625 }, { "clip_ratio": 0.0, "completion_length": 119.19791793823242, "epoch": 2.984822934232715, "grad_norm": 14.764258051685124, "kl": 0.544921875, "learning_rate": 7.511261261261262e-07, "loss": 0.0006, "reward": 3.135034918785095, "reward_std": 0.18100818619132042, "rewards/final_reward": 1.6384016507086034, "rewards/mask_iou_reward": 0.8192008253543017, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1350347995758057, "rewards/thk_ans_format_reward": 1.0, "step": 884, "think_completion_length": 6.416666666666666 }, { "clip_ratio": 0.0, "completion_length": 137.32291793823242, "epoch": 2.988195615514334, "grad_norm": 18.135963045966133, "kl": 0.59765625, "learning_rate": 7.508445945945946e-07, "loss": 0.0006, "reward": 3.519152283668518, "reward_std": 0.13311653956770897, "rewards/final_reward": 1.8226950428889377, "rewards/mask_iou_reward": 0.9113475214444688, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5191519260406494, "rewards/thk_ans_format_reward": 1.0, "step": 885, "think_completion_length": 6.083333333333334 }, { "clip_ratio": 0.0, "completion_length": 140.25, "epoch": 2.9915682967959527, "grad_norm": 12.289949298946091, "kl": 0.5625, "learning_rate": 7.505630630630631e-07, "loss": 0.0006, "reward": 3.38117778301239, "reward_std": 0.05419469904154539, "rewards/final_reward": 1.1812705416065237, "rewards/mask_iou_reward": 0.5906352708032618, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3811779618263245, "rewards/thk_ans_format_reward": 1.0, "step": 886, "think_completion_length": 5.833333333333334 }, { "clip_ratio": 0.0, "completion_length": 119.62500381469727, "epoch": 2.9949409780775715, "grad_norm": 14.89633635692612, "kl": 0.57421875, "learning_rate": 7.502815315315316e-07, "loss": 0.0006, "reward": 3.042057514190674, "reward_std": 0.16031931340694427, "rewards/final_reward": 1.0657722241196894, "rewards/mask_iou_reward": 0.5328861120598447, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.042057454586029, "rewards/thk_ans_format_reward": 1.0, "step": 887, "think_completion_length": 6.375 }, { "clip_ratio": 0.0, "completion_length": 119.10526657104492, "epoch": 2.998313659359191, "grad_norm": 16.25986106299666, "kl": 2.525390625, "learning_rate": 7.5e-07, "loss": 0.0025, "reward": 3.4078052043914795, "reward_std": 0.08364403434097767, "rewards/final_reward": 1.5097456033230692, "rewards/mask_iou_reward": 0.7548728016615346, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4078055024147034, "rewards/thk_ans_format_reward": 1.0, "step": 888, "think_completion_length": 9.666666666666668 }, { "clip_ratio": 0.0, "completion_length": 117.72917175292969, "epoch": 3.003372681281619, "grad_norm": 12.515595643382929, "kl": 0.56640625, "learning_rate": 7.497184684684684e-07, "loss": 0.0006, "reward": 3.089647054672241, "reward_std": 0.08703098073601723, "rewards/final_reward": 1.5181027098126436, "rewards/mask_iou_reward": 0.7590513549063218, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0896472930908203, "rewards/thk_ans_format_reward": 1.0, "step": 889, "think_completion_length": 7.25 }, { "clip_ratio": 0.0, "completion_length": 119.18750381469727, "epoch": 3.0067453625632377, "grad_norm": 8.349129573087078, "kl": 0.63671875, "learning_rate": 7.494369369369368e-07, "loss": 0.0006, "reward": 3.489098906517029, "reward_std": 0.047829316928982735, "rewards/final_reward": 1.3113581184064553, "rewards/mask_iou_reward": 0.6556790592032277, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4890989661216736, "rewards/thk_ans_format_reward": 1.0, "step": 890, "think_completion_length": 7.125 }, { "clip_ratio": 0.0, "completion_length": 117.59375, "epoch": 3.0101180438448565, "grad_norm": 13.899706919226007, "kl": 0.58984375, "learning_rate": 7.491554054054053e-07, "loss": 0.0006, "reward": 3.176019310951233, "reward_std": 0.07747252658009529, "rewards/final_reward": 0.5975464560543442, "rewards/mask_iou_reward": 0.2987732280271721, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1760192811489105, "rewards/thk_ans_format_reward": 1.0, "step": 891, "think_completion_length": 6.875 }, { "clip_ratio": 0.0, "completion_length": 120.66667175292969, "epoch": 3.0134907251264758, "grad_norm": 11.626946310302078, "kl": 0.595703125, "learning_rate": 7.488738738738738e-07, "loss": 0.0006, "reward": 3.317869186401367, "reward_std": 0.08332300186157227, "rewards/final_reward": 1.464982306688062, "rewards/mask_iou_reward": 0.732491153344031, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3178690671920776, "rewards/thk_ans_format_reward": 1.0, "step": 892, "think_completion_length": 8.708333333333334 }, { "clip_ratio": 0.0, "completion_length": 119.82292175292969, "epoch": 3.0168634064080946, "grad_norm": 15.906736400086396, "kl": 0.70703125, "learning_rate": 7.485923423423422e-07, "loss": 0.0007, "reward": 3.5675476789474487, "reward_std": 0.07752594165503979, "rewards/final_reward": 1.9184772041573201, "rewards/mask_iou_reward": 0.9592386020786601, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5675475597381592, "rewards/thk_ans_format_reward": 1.0, "step": 893, "think_completion_length": 9.041666666666668 }, { "clip_ratio": 0.0, "completion_length": 118.07292175292969, "epoch": 3.0202360876897134, "grad_norm": 7.317486836759224, "kl": 0.6796875, "learning_rate": 7.483108108108108e-07, "loss": 0.0007, "reward": 2.8391889333724976, "reward_std": 0.12115252763032913, "rewards/final_reward": 1.2024703756275028, "rewards/mask_iou_reward": 0.6012351878137514, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.8391889035701752, "rewards/thk_ans_format_reward": 1.0, "step": 894, "think_completion_length": 6.916666666666666 }, { "clip_ratio": 0.0, "completion_length": 120.17708587646484, "epoch": 3.0236087689713322, "grad_norm": 7.041223797807325, "kl": 0.6171875, "learning_rate": 7.480292792792792e-07, "loss": 0.0006, "reward": 3.25685453414917, "reward_std": 0.05167396366596222, "rewards/final_reward": 1.5520342513306697, "rewards/mask_iou_reward": 0.7760171256653349, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2568546533584595, "rewards/thk_ans_format_reward": 1.0, "step": 895, "think_completion_length": 7.875 }, { "clip_ratio": 0.0, "completion_length": 118.96875381469727, "epoch": 3.026981450252951, "grad_norm": 21.643386059658464, "kl": 0.64453125, "learning_rate": 7.477477477477477e-07, "loss": 0.0007, "reward": 3.4809720516204834, "reward_std": 0.16744033992290497, "rewards/final_reward": 1.7740240475182896, "rewards/mask_iou_reward": 0.8870120237591448, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4809719920158386, "rewards/thk_ans_format_reward": 1.0, "step": 896, "think_completion_length": 8.166666666666666 }, { "clip_ratio": 0.0, "completion_length": 117.375, "epoch": 3.03035413153457, "grad_norm": 12.520660533843897, "kl": 0.5859375, "learning_rate": 7.474662162162162e-07, "loss": 0.0006, "reward": 3.8004626035690308, "reward_std": 0.04519801028072834, "rewards/final_reward": 1.9572329694899626, "rewards/mask_iou_reward": 0.9786164847449813, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.8004624843597412, "rewards/thk_ans_format_reward": 1.0, "step": 897, "think_completion_length": 6.208333333333334 }, { "clip_ratio": 0.0, "completion_length": 125.80208969116211, "epoch": 3.0337268128161887, "grad_norm": 8.19496444222374, "kl": 0.642578125, "learning_rate": 7.471846846846846e-07, "loss": 0.0006, "reward": 3.576119303703308, "reward_std": 0.09716996923089027, "rewards/final_reward": 1.809313079459582, "rewards/mask_iou_reward": 0.904656539729791, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5761194229125977, "rewards/thk_ans_format_reward": 1.0, "step": 898, "think_completion_length": 9.291666666666668 }, { "clip_ratio": 0.0, "completion_length": 117.42708587646484, "epoch": 3.0370994940978076, "grad_norm": 15.121604716993511, "kl": 0.646484375, "learning_rate": 7.469031531531531e-07, "loss": 0.0007, "reward": 3.381696939468384, "reward_std": 0.03125000186264515, "rewards/final_reward": 1.098972567366859, "rewards/mask_iou_reward": 0.5494862836834296, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.381696879863739, "rewards/thk_ans_format_reward": 1.0, "step": 899, "think_completion_length": 6.083333333333334 }, { "clip_ratio": 0.0, "completion_length": 120.39583587646484, "epoch": 3.040472175379427, "grad_norm": 16.861506466251836, "kl": 0.6953125, "learning_rate": 7.466216216216215e-07, "loss": 0.0007, "reward": 3.390763521194458, "reward_std": 0.14983459934592247, "rewards/final_reward": 0.7988987902225801, "rewards/mask_iou_reward": 0.3994493951112901, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3907636404037476, "rewards/thk_ans_format_reward": 1.0, "step": 900, "think_completion_length": 6.833333333333334 }, { "clip_ratio": 0.0, "completion_length": 106.01041793823242, "epoch": 3.0438448566610457, "grad_norm": 7.578870543907528, "kl": 2.359375, "learning_rate": 7.4634009009009e-07, "loss": 0.0025, "reward": 3.7292349338531494, "reward_std": 0.05419635772705078, "rewards/final_reward": 1.5353630167973926, "rewards/mask_iou_reward": 0.7676815083986963, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7292349934577942, "rewards/thk_ans_format_reward": 1.0, "step": 901, "think_completion_length": 6.625 }, { "clip_ratio": 0.0, "completion_length": 118.26041793823242, "epoch": 3.0472175379426645, "grad_norm": 7.03828666042413, "kl": 0.6171875, "learning_rate": 7.460585585585585e-07, "loss": 0.0006, "reward": 3.1864256858825684, "reward_std": 0.11836714297533035, "rewards/final_reward": 1.870425437434307, "rewards/mask_iou_reward": 0.9352127187171535, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1864253878593445, "rewards/thk_ans_format_reward": 1.0, "step": 902, "think_completion_length": 6.625 }, { "clip_ratio": 0.0, "completion_length": 122.31250381469727, "epoch": 3.0505902192242833, "grad_norm": 78.39294728322437, "kl": 0.595703125, "learning_rate": 7.457770270270269e-07, "loss": 0.0006, "reward": 3.6524301767349243, "reward_std": 0.05959741398692131, "rewards/final_reward": 1.6786844417748537, "rewards/mask_iou_reward": 0.8393422208874268, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6524302959442139, "rewards/thk_ans_format_reward": 1.0, "step": 903, "think_completion_length": 6.458333333333334 }, { "clip_ratio": 0.0, "completion_length": 118.60417175292969, "epoch": 3.053962900505902, "grad_norm": 20.847676302517282, "kl": 0.6171875, "learning_rate": 7.454954954954955e-07, "loss": 0.0006, "reward": 3.5041786432266235, "reward_std": 0.02324836002662778, "rewards/final_reward": 1.0827496827285863, "rewards/mask_iou_reward": 0.5413748413642931, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5041785836219788, "rewards/thk_ans_format_reward": 1.0, "step": 904, "think_completion_length": 6.791666666666667 }, { "clip_ratio": 0.0, "completion_length": 139.53125762939453, "epoch": 3.057335581787521, "grad_norm": 20.33434168015281, "kl": 1.6171875, "learning_rate": 7.45213963963964e-07, "loss": 0.0016, "reward": 3.3115906715393066, "reward_std": 0.1272381842136383, "rewards/final_reward": 0.7400105838265411, "rewards/mask_iou_reward": 0.37000529191327053, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3115903735160828, "rewards/thk_ans_format_reward": 1.0, "step": 905, "think_completion_length": 6.875 }, { "clip_ratio": 0.0, "completion_length": 120.5, "epoch": 3.06070826306914, "grad_norm": 11.339915806504091, "kl": 0.60546875, "learning_rate": 7.449324324324324e-07, "loss": 0.0006, "reward": 3.45768141746521, "reward_std": 0.03249655629042536, "rewards/final_reward": 1.942120311669283, "rewards/mask_iou_reward": 0.9710601558346414, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4576812982559204, "rewards/thk_ans_format_reward": 1.0, "step": 906, "think_completion_length": 9.041666666666666 }, { "clip_ratio": 0.0, "completion_length": 119.10416793823242, "epoch": 3.064080944350759, "grad_norm": 126.51719502876821, "kl": 0.64453125, "learning_rate": 7.446509009009009e-07, "loss": 0.0006, "reward": 3.493595004081726, "reward_std": 0.14895956590771675, "rewards/final_reward": 1.4613195587593473, "rewards/mask_iou_reward": 0.7306597793796736, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4935947060585022, "rewards/thk_ans_format_reward": 1.0, "step": 907, "think_completion_length": 7.75 }, { "clip_ratio": 0.0, "completion_length": 119.86458587646484, "epoch": 3.067453625632378, "grad_norm": 10.176054229968825, "kl": 0.671875, "learning_rate": 7.443693693693693e-07, "loss": 0.0007, "reward": 3.5275977849960327, "reward_std": 0.06466570496559143, "rewards/final_reward": 1.871962030287786, "rewards/mask_iou_reward": 0.935981015143893, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.527597725391388, "rewards/thk_ans_format_reward": 1.0, "step": 908, "think_completion_length": 6.916666666666666 }, { "clip_ratio": 0.0, "completion_length": 144.18750762939453, "epoch": 3.0708263069139967, "grad_norm": 139.48223009045432, "kl": 0.55859375, "learning_rate": 7.440878378378378e-07, "loss": 0.0006, "reward": 3.551697611808777, "reward_std": 0.1385558396577835, "rewards/final_reward": 1.7636108345880088, "rewards/mask_iou_reward": 0.8818054172940044, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.551697313785553, "rewards/thk_ans_format_reward": 1.0, "step": 909, "think_completion_length": 7.0 }, { "clip_ratio": 0.0, "completion_length": 119.07291793823242, "epoch": 3.0741989881956155, "grad_norm": 59.71015922108442, "kl": 0.634765625, "learning_rate": 7.438063063063063e-07, "loss": 0.0007, "reward": 3.5257649421691895, "reward_std": 0.04206752963364124, "rewards/final_reward": 1.8485441979398265, "rewards/mask_iou_reward": 0.9242720989699132, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5257649421691895, "rewards/thk_ans_format_reward": 1.0, "step": 910, "think_completion_length": 7.5 }, { "clip_ratio": 0.0, "completion_length": 134.09375, "epoch": 3.0775716694772344, "grad_norm": 9.506682856346176, "kl": 0.57421875, "learning_rate": 7.435247747747747e-07, "loss": 0.0006, "reward": 3.1859865188598633, "reward_std": 0.20397471636533737, "rewards/final_reward": 1.104930686579807, "rewards/mask_iou_reward": 0.5524653432899035, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.185986340045929, "rewards/thk_ans_format_reward": 1.0, "step": 911, "think_completion_length": 6.208333333333333 }, { "clip_ratio": 0.0, "completion_length": 163.58333587646484, "epoch": 3.080944350758853, "grad_norm": 92.45653407457144, "kl": 0.52734375, "learning_rate": 7.432432432432432e-07, "loss": 0.0005, "reward": 3.264729380607605, "reward_std": 0.11243878304958344, "rewards/final_reward": 1.5774552687450192, "rewards/mask_iou_reward": 0.7887276343725096, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2647295594215393, "rewards/thk_ans_format_reward": 1.0, "step": 912, "think_completion_length": 6.625 }, { "clip_ratio": 0.0, "completion_length": 120.96875381469727, "epoch": 3.084317032040472, "grad_norm": 166.42384825356794, "kl": 0.62890625, "learning_rate": 7.429617117117116e-07, "loss": 0.0006, "reward": 3.5839874744415283, "reward_std": 0.08542875573039055, "rewards/final_reward": 1.7537484169410895, "rewards/mask_iou_reward": 0.8768742084705448, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5839874744415283, "rewards/thk_ans_format_reward": 1.0, "step": 913, "think_completion_length": 5.875 }, { "clip_ratio": 0.0, "completion_length": 132.5416717529297, "epoch": 3.087689713322091, "grad_norm": 13.859197836267999, "kl": 0.689453125, "learning_rate": 7.426801801801802e-07, "loss": 0.0007, "reward": 3.61310076713562, "reward_std": 0.09716634452342987, "rewards/final_reward": 1.817545375249206, "rewards/mask_iou_reward": 0.908772687624603, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6131007075309753, "rewards/thk_ans_format_reward": 1.0, "step": 914, "think_completion_length": 6.666666666666667 }, { "clip_ratio": 0.0, "completion_length": 105.9375, "epoch": 3.09106239460371, "grad_norm": 24.634255031261517, "kl": 0.677734375, "learning_rate": 7.423986486486487e-07, "loss": 0.0007, "reward": 3.1513859033584595, "reward_std": 0.07639642804861069, "rewards/final_reward": 1.7290901915677561, "rewards/mask_iou_reward": 0.8645450957838781, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1513857245445251, "rewards/thk_ans_format_reward": 1.0, "step": 915, "think_completion_length": 6.291666666666666 }, { "clip_ratio": 0.0, "completion_length": 119.01041793823242, "epoch": 3.094435075885329, "grad_norm": 36.117144199009914, "kl": 0.626953125, "learning_rate": 7.421171171171171e-07, "loss": 0.0006, "reward": 3.2969553470611572, "reward_std": 0.027974323369562626, "rewards/final_reward": 1.5465697682341046, "rewards/mask_iou_reward": 0.7732848841170523, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.296955168247223, "rewards/thk_ans_format_reward": 1.0, "step": 916, "think_completion_length": 8.0 }, { "clip_ratio": 0.0, "completion_length": 118.01041793823242, "epoch": 3.097807757166948, "grad_norm": 8.421180282732553, "kl": 0.630859375, "learning_rate": 7.418355855855856e-07, "loss": 0.0006, "reward": 3.229664444923401, "reward_std": 0.10771491751074791, "rewards/final_reward": 1.196235040485815, "rewards/mask_iou_reward": 0.5981175202429075, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2296642065048218, "rewards/thk_ans_format_reward": 1.0, "step": 917, "think_completion_length": 7.0 }, { "clip_ratio": 0.0, "completion_length": 128.95833587646484, "epoch": 3.1011804384485666, "grad_norm": 7.4188141354688275, "kl": 0.546875, "learning_rate": 7.41554054054054e-07, "loss": 0.0006, "reward": 3.500945806503296, "reward_std": 0.0665590912103653, "rewards/final_reward": 1.8405408670810295, "rewards/mask_iou_reward": 0.9202704335405147, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5009459853172302, "rewards/thk_ans_format_reward": 1.0, "step": 918, "think_completion_length": 6.625 }, { "clip_ratio": 0.0, "completion_length": 131.75000381469727, "epoch": 3.1045531197301854, "grad_norm": 55.03653096258315, "kl": 0.5859375, "learning_rate": 7.412725225225225e-07, "loss": 0.0006, "reward": 3.723360061645508, "reward_std": 0.0918300710618496, "rewards/final_reward": 1.5982669399787868, "rewards/mask_iou_reward": 0.7991334699893934, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7233601808547974, "rewards/thk_ans_format_reward": 1.0, "step": 919, "think_completion_length": 5.916666666666666 }, { "clip_ratio": 0.0, "completion_length": 144.71875, "epoch": 3.1079258010118043, "grad_norm": 12.853547166556982, "kl": 0.5380859375, "learning_rate": 7.40990990990991e-07, "loss": 0.0005, "reward": 3.3267405033111572, "reward_std": 0.07052680477499962, "rewards/final_reward": 1.8134163801056866, "rewards/mask_iou_reward": 0.9067081900528433, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3267402052879333, "rewards/thk_ans_format_reward": 1.0, "step": 920, "think_completion_length": 6.458333333333334 }, { "clip_ratio": 0.0, "completion_length": 118.58333587646484, "epoch": 3.111298482293423, "grad_norm": 8.187308407871758, "kl": 0.65234375, "learning_rate": 7.407094594594594e-07, "loss": 0.0007, "reward": 3.596097469329834, "reward_std": 0.06756597012281418, "rewards/final_reward": 1.6684026304190667, "rewards/mask_iou_reward": 0.8342013152095333, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.596097707748413, "rewards/thk_ans_format_reward": 1.0, "step": 921, "think_completion_length": 7.041666666666666 }, { "clip_ratio": 0.0, "completion_length": 121.12500381469727, "epoch": 3.1146711635750424, "grad_norm": 52.74813869912448, "kl": 0.607421875, "learning_rate": 7.404279279279279e-07, "loss": 0.0006, "reward": 3.3573185205459595, "reward_std": 0.09697642922401428, "rewards/final_reward": 1.0614244569828557, "rewards/mask_iou_reward": 0.5307122284914279, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3573182821273804, "rewards/thk_ans_format_reward": 1.0, "step": 922, "think_completion_length": 6.041666666666666 }, { "clip_ratio": 0.0, "completion_length": 108.90625381469727, "epoch": 3.118043844856661, "grad_norm": 8.819321791565677, "kl": 0.71875, "learning_rate": 7.401463963963964e-07, "loss": 0.0007, "reward": 3.456843376159668, "reward_std": 0.17637907341122627, "rewards/final_reward": 1.6175144350984227, "rewards/mask_iou_reward": 0.8087572175492114, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4568431377410889, "rewards/thk_ans_format_reward": 1.0, "step": 923, "think_completion_length": 5.875 }, { "clip_ratio": 0.0, "completion_length": 116.38541793823242, "epoch": 3.12141652613828, "grad_norm": 91.36087855306776, "kl": 0.76171875, "learning_rate": 7.398648648648649e-07, "loss": 0.0008, "reward": 3.151221990585327, "reward_std": 0.07979770191013813, "rewards/final_reward": 1.3401678123640988, "rewards/mask_iou_reward": 0.6700839061820494, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1512219905853271, "rewards/thk_ans_format_reward": 1.0, "step": 924, "think_completion_length": 7.25 }, { "clip_ratio": 0.0, "completion_length": 117.90625381469727, "epoch": 3.124789207419899, "grad_norm": 19.614225208295004, "kl": 0.595703125, "learning_rate": 7.395833333333334e-07, "loss": 0.0006, "reward": 3.4868357181549072, "reward_std": 0.13209467381238937, "rewards/final_reward": 1.517233627020157, "rewards/mask_iou_reward": 0.7586168135100785, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4868356585502625, "rewards/thk_ans_format_reward": 1.0, "step": 925, "think_completion_length": 6.0 }, { "clip_ratio": 0.0, "completion_length": 123.69791793823242, "epoch": 3.1281618887015177, "grad_norm": 122.10948460669458, "kl": 138.37109375, "learning_rate": 7.393018018018018e-07, "loss": 0.1394, "reward": 3.5085551738739014, "reward_std": 0.0744461640715599, "rewards/final_reward": 1.2090095717241383, "rewards/mask_iou_reward": 0.6045047858620691, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5085551142692566, "rewards/thk_ans_format_reward": 1.0, "step": 926, "think_completion_length": 6.458333333333334 }, { "clip_ratio": 0.0, "completion_length": 124.85417175292969, "epoch": 3.1315345699831365, "grad_norm": 12.186450381014977, "kl": 0.599609375, "learning_rate": 7.390202702702703e-07, "loss": 0.0006, "reward": 3.471701979637146, "reward_std": 0.12539499625563622, "rewards/final_reward": 1.2408520506021146, "rewards/mask_iou_reward": 0.6204260253010573, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4717020392417908, "rewards/thk_ans_format_reward": 1.0, "step": 927, "think_completion_length": 6.583333333333334 }, { "clip_ratio": 0.0, "completion_length": 168.02083587646484, "epoch": 3.1349072512647553, "grad_norm": 8.400877279407904, "kl": 0.5634765625, "learning_rate": 7.387387387387387e-07, "loss": 0.0006, "reward": 3.4186493158340454, "reward_std": 0.12011561915278435, "rewards/final_reward": 1.0084027343217206, "rewards/mask_iou_reward": 0.5042013671608603, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 1.4290658235549927, "rewards/thk_ans_format_reward": 1.0, "step": 928, "think_completion_length": 5.416666666666666 }, { "clip_ratio": 0.0, "completion_length": 105.00000381469727, "epoch": 3.138279932546374, "grad_norm": 4.785202894611176, "kl": 0.66796875, "learning_rate": 7.384572072072071e-07, "loss": 0.0007, "reward": 3.502060651779175, "reward_std": 0.03595947311259806, "rewards/final_reward": 1.7552573719457105, "rewards/mask_iou_reward": 0.8776286859728553, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.50206059217453, "rewards/thk_ans_format_reward": 1.0, "step": 929, "think_completion_length": 6.041666666666666 }, { "clip_ratio": 0.0, "completion_length": 123.46875, "epoch": 3.1416526138279934, "grad_norm": 48.073277858921884, "kl": 0.75390625, "learning_rate": 7.381756756756756e-07, "loss": 0.0008, "reward": 3.3581387996673584, "reward_std": 0.08590967021882534, "rewards/final_reward": 1.7207684166968091, "rewards/mask_iou_reward": 0.8603842083484046, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3581388592720032, "rewards/thk_ans_format_reward": 1.0, "step": 930, "think_completion_length": 5.958333333333333 }, { "clip_ratio": 0.0, "completion_length": 135.79166793823242, "epoch": 3.1450252951096123, "grad_norm": 42.756840812915485, "kl": 2.42578125, "learning_rate": 7.37894144144144e-07, "loss": 0.0024, "reward": 3.279011845588684, "reward_std": 0.08761481195688248, "rewards/final_reward": 1.0889703858570359, "rewards/mask_iou_reward": 0.5444851929285179, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2790116667747498, "rewards/thk_ans_format_reward": 1.0, "step": 931, "think_completion_length": 5.375 }, { "clip_ratio": 0.0, "completion_length": 118.40625, "epoch": 3.148397976391231, "grad_norm": 13.101321531658755, "kl": 0.568359375, "learning_rate": 7.376126126126125e-07, "loss": 0.0006, "reward": 3.520051956176758, "reward_std": 0.10048893839120865, "rewards/final_reward": 1.7624207074205627, "rewards/mask_iou_reward": 0.8812103537102813, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.520051896572113, "rewards/thk_ans_format_reward": 1.0, "step": 932, "think_completion_length": 6.708333333333334 }, { "clip_ratio": 0.0, "completion_length": 115.48958587646484, "epoch": 3.15177065767285, "grad_norm": 17.5324248573886, "kl": 0.572265625, "learning_rate": 7.37331081081081e-07, "loss": 0.0006, "reward": 3.2548365592956543, "reward_std": 0.049117712303996086, "rewards/final_reward": 0.8347675929608372, "rewards/mask_iou_reward": 0.4173837964804186, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2548364996910095, "rewards/thk_ans_format_reward": 1.0, "step": 933, "think_completion_length": 5.583333333333333 }, { "clip_ratio": 0.0, "completion_length": 125.09375381469727, "epoch": 3.1551433389544687, "grad_norm": 8.456217280680285, "kl": 0.533203125, "learning_rate": 7.370495495495495e-07, "loss": 0.0005, "reward": 3.2601990699768066, "reward_std": 0.11271973326802254, "rewards/final_reward": 1.0942927702331247, "rewards/mask_iou_reward": 0.5471463851165623, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2601988911628723, "rewards/thk_ans_format_reward": 1.0, "step": 934, "think_completion_length": 5.458333333333333 }, { "clip_ratio": 0.0, "completion_length": 116.32291793823242, "epoch": 3.1585160202360876, "grad_norm": 28.327635527597177, "kl": 0.615234375, "learning_rate": 7.36768018018018e-07, "loss": 0.0006, "reward": 3.3950055837631226, "reward_std": 0.12466976046562195, "rewards/final_reward": 1.5702693201072733, "rewards/mask_iou_reward": 0.7851346600536366, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3950055241584778, "rewards/thk_ans_format_reward": 1.0, "step": 935, "think_completion_length": 5.958333333333334 }, { "clip_ratio": 0.0, "completion_length": 119.17708587646484, "epoch": 3.1618887015177064, "grad_norm": 24.013934474599623, "kl": 0.615234375, "learning_rate": 7.364864864864864e-07, "loss": 0.0006, "reward": 2.972390651702881, "reward_std": 0.20534201711416245, "rewards/final_reward": 1.6523960960794286, "rewards/mask_iou_reward": 0.8261980480397143, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9723906219005585, "rewards/thk_ans_format_reward": 1.0, "step": 936, "think_completion_length": 5.958333333333334 }, { "clip_ratio": 0.0, "completion_length": 117.40625381469727, "epoch": 3.1652613827993257, "grad_norm": 16.595506940383846, "kl": 0.591796875, "learning_rate": 7.362049549549549e-07, "loss": 0.0006, "reward": 3.3171249628067017, "reward_std": 0.18518901616334915, "rewards/final_reward": 1.7000440197250972, "rewards/mask_iou_reward": 0.8500220098625486, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3171249628067017, "rewards/thk_ans_format_reward": 1.0, "step": 937, "think_completion_length": 6.458333333333334 }, { "clip_ratio": 0.0, "completion_length": 129.3645896911621, "epoch": 3.1686340640809445, "grad_norm": 8.610812224568315, "kl": 0.65625, "learning_rate": 7.359234234234234e-07, "loss": 0.0007, "reward": 2.8874831199645996, "reward_std": 0.19021157920360565, "rewards/final_reward": 0.7664530144380096, "rewards/mask_iou_reward": 0.3832265072190048, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.8874831795692444, "rewards/thk_ans_format_reward": 1.0, "step": 938, "think_completion_length": 5.666666666666667 }, { "clip_ratio": 0.0, "completion_length": 198.20833587646484, "epoch": 3.1720067453625633, "grad_norm": 10.624550680340164, "kl": 0.578125, "learning_rate": 7.356418918918918e-07, "loss": 0.0006, "reward": 3.331384778022766, "reward_std": 0.33483661711215973, "rewards/final_reward": 1.2137129383296097, "rewards/mask_iou_reward": 0.6068564691648048, "rewards/sam_format_reward": 0.9583333432674408, "rewards/sam_reward_func_ultra": 1.4147179126739502, "rewards/thk_ans_format_reward": 0.9583333432674408, "step": 939, "think_completion_length": 5.541666666666667 }, { "clip_ratio": 0.0, "completion_length": 116.54166793823242, "epoch": 3.175379426644182, "grad_norm": 13.798134393053907, "kl": 0.7109375, "learning_rate": 7.353603603603603e-07, "loss": 0.0007, "reward": 3.5317646265029907, "reward_std": 0.12036162614822388, "rewards/final_reward": 1.7559949168114533, "rewards/mask_iou_reward": 0.8779974584057266, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5317646265029907, "rewards/thk_ans_format_reward": 1.0, "step": 940, "think_completion_length": 6.083333333333333 }, { "clip_ratio": 0.0, "completion_length": 116.60417175292969, "epoch": 3.178752107925801, "grad_norm": 8.133255831518898, "kl": 0.56640625, "learning_rate": 7.350788288288288e-07, "loss": 0.0006, "reward": 3.4287021160125732, "reward_std": 0.07130642794072628, "rewards/final_reward": 1.857115805332103, "rewards/mask_iou_reward": 0.9285579026660515, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.428701937198639, "rewards/thk_ans_format_reward": 1.0, "step": 941, "think_completion_length": 6.291666666666667 }, { "clip_ratio": 0.0, "completion_length": 132.46875381469727, "epoch": 3.18212478920742, "grad_norm": 227.71057599835822, "kl": 0.5048828125, "learning_rate": 7.347972972972972e-07, "loss": 0.0005, "reward": 3.421720027923584, "reward_std": 0.10221107676625252, "rewards/final_reward": 0.7738278771749434, "rewards/mask_iou_reward": 0.3869139385874717, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4217200875282288, "rewards/thk_ans_format_reward": 1.0, "step": 942, "think_completion_length": 5.75 }, { "clip_ratio": 0.0, "completion_length": 123.59375, "epoch": 3.1854974704890386, "grad_norm": 15.557246933598993, "kl": 0.56640625, "learning_rate": 7.345157657657657e-07, "loss": 0.0006, "reward": 3.083019495010376, "reward_std": 0.10839825868606567, "rewards/final_reward": 0.886000136338485, "rewards/mask_iou_reward": 0.4430000681692425, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0830194354057312, "rewards/thk_ans_format_reward": 1.0, "step": 943, "think_completion_length": 5.416666666666667 }, { "clip_ratio": 0.0, "completion_length": 125.94792175292969, "epoch": 3.1888701517706575, "grad_norm": 8.349077581952226, "kl": 0.587890625, "learning_rate": 7.342342342342342e-07, "loss": 0.0006, "reward": 3.22943913936615, "reward_std": 0.20535418391227722, "rewards/final_reward": 1.4362136554059581, "rewards/mask_iou_reward": 0.7181068277029791, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2294391095638275, "rewards/thk_ans_format_reward": 1.0, "step": 944, "think_completion_length": 5.75 }, { "clip_ratio": 0.0, "completion_length": 119.23958587646484, "epoch": 3.1922428330522767, "grad_norm": 21.27652996901679, "kl": 0.564453125, "learning_rate": 7.339527027027027e-07, "loss": 0.0006, "reward": 3.469966769218445, "reward_std": 0.08743277750909328, "rewards/final_reward": 1.046480487366158, "rewards/mask_iou_reward": 0.523240243683079, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4699668884277344, "rewards/thk_ans_format_reward": 1.0, "step": 945, "think_completion_length": 6.25 }, { "clip_ratio": 0.0, "completion_length": 132.2916717529297, "epoch": 3.1956155143338956, "grad_norm": 15.628271383029498, "kl": 0.568359375, "learning_rate": 7.336711711711712e-07, "loss": 0.0006, "reward": 3.507090926170349, "reward_std": 0.090285774320364, "rewards/final_reward": 1.6270001024893102, "rewards/mask_iou_reward": 0.8135000512446551, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5070908665657043, "rewards/thk_ans_format_reward": 1.0, "step": 946, "think_completion_length": 5.708333333333334 }, { "clip_ratio": 0.0, "completion_length": 116.66667175292969, "epoch": 3.1989881956155144, "grad_norm": 11.767457004148945, "kl": 0.607421875, "learning_rate": 7.333896396396396e-07, "loss": 0.0006, "reward": 3.5310587882995605, "reward_std": 0.12183480244129896, "rewards/final_reward": 1.6213213793067296, "rewards/mask_iou_reward": 0.8106606896533648, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.531058669090271, "rewards/thk_ans_format_reward": 1.0, "step": 947, "think_completion_length": 6.375 }, { "clip_ratio": 0.0, "completion_length": 138.82291793823242, "epoch": 3.2023608768971332, "grad_norm": 31.23548501325163, "kl": 0.904296875, "learning_rate": 7.331081081081081e-07, "loss": 0.0009, "reward": 3.2420685291290283, "reward_std": 0.14567308127880096, "rewards/final_reward": 1.282041882933168, "rewards/mask_iou_reward": 0.641020941466584, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2420682311058044, "rewards/thk_ans_format_reward": 1.0, "step": 948, "think_completion_length": 5.416666666666666 }, { "clip_ratio": 0.0, "completion_length": 122.13541793823242, "epoch": 3.205733558178752, "grad_norm": 23.86354633860368, "kl": 0.5703125, "learning_rate": 7.328265765765765e-07, "loss": 0.0006, "reward": 3.7221736907958984, "reward_std": 0.06484057754278183, "rewards/final_reward": 1.404249844720991, "rewards/mask_iou_reward": 0.7021249223604955, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.722173810005188, "rewards/thk_ans_format_reward": 1.0, "step": 949, "think_completion_length": 5.875 }, { "clip_ratio": 0.0, "completion_length": 148.1145896911621, "epoch": 3.209106239460371, "grad_norm": 12.380395095394663, "kl": 0.57421875, "learning_rate": 7.32545045045045e-07, "loss": 0.0006, "reward": 3.3760533332824707, "reward_std": 0.1979294866323471, "rewards/final_reward": 1.7005252617653546, "rewards/mask_iou_reward": 0.8502626308826773, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 1.3968866467475891, "rewards/thk_ans_format_reward": 0.9895833432674408, "step": 950, "think_completion_length": 5.875 }, { "clip_ratio": 0.0, "completion_length": 127.21875, "epoch": 3.2124789207419897, "grad_norm": 8.26952523545033, "kl": 0.576171875, "learning_rate": 7.322635135135135e-07, "loss": 0.0006, "reward": 3.1699079275131226, "reward_std": 0.07622763887047768, "rewards/final_reward": 1.3101456169278447, "rewards/mask_iou_reward": 0.6550728084639224, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.169907808303833, "rewards/thk_ans_format_reward": 1.0, "step": 951, "think_completion_length": 5.541666666666666 }, { "clip_ratio": 0.0, "completion_length": 169.95833587646484, "epoch": 3.2158516020236085, "grad_norm": 9.638521530190893, "kl": 0.4833984375, "learning_rate": 7.319819819819819e-07, "loss": 0.0005, "reward": 3.4880365133285522, "reward_std": 0.1652812361717224, "rewards/final_reward": 1.8128116128729428, "rewards/mask_iou_reward": 0.9064058064364714, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 1.5088698863983154, "rewards/thk_ans_format_reward": 0.9895833432674408, "step": 952, "think_completion_length": 6.625 }, { "clip_ratio": 0.0, "completion_length": 124.25000762939453, "epoch": 3.219224283305228, "grad_norm": 8.410396514523777, "kl": 0.5703125, "learning_rate": 7.317004504504504e-07, "loss": 0.0006, "reward": 3.0509506464004517, "reward_std": 0.08591302763670683, "rewards/final_reward": 1.466382150338418, "rewards/mask_iou_reward": 0.733191075169209, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0509507060050964, "rewards/thk_ans_format_reward": 1.0, "step": 953, "think_completion_length": 5.291666666666667 }, { "clip_ratio": 0.0, "completion_length": 134.68750381469727, "epoch": 3.2225969645868466, "grad_norm": 21.489909003804172, "kl": 0.658203125, "learning_rate": 7.31418918918919e-07, "loss": 0.0007, "reward": 3.298169493675232, "reward_std": 0.1636265590786934, "rewards/final_reward": 0.8282513182543959, "rewards/mask_iou_reward": 0.41412565912719795, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 1.3190029561519623, "rewards/thk_ans_format_reward": 0.9895833432674408, "step": 954, "think_completion_length": 5.916666666666666 }, { "clip_ratio": 0.0, "completion_length": 126.50000381469727, "epoch": 3.2259696458684655, "grad_norm": 5.470747692734715, "kl": 0.517578125, "learning_rate": 7.311373873873874e-07, "loss": 0.0005, "reward": 3.7121076583862305, "reward_std": 0.09200547635555267, "rewards/final_reward": 1.6347616027676106, "rewards/mask_iou_reward": 0.8173808013838053, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7121077179908752, "rewards/thk_ans_format_reward": 1.0, "step": 955, "think_completion_length": 5.333333333333334 }, { "clip_ratio": 0.0, "completion_length": 139.03125, "epoch": 3.2293423271500843, "grad_norm": 10.76266896794945, "kl": 0.53515625, "learning_rate": 7.308558558558559e-07, "loss": 0.0005, "reward": 3.611359715461731, "reward_std": 0.2015197928994894, "rewards/final_reward": 1.4773672950030936, "rewards/mask_iou_reward": 0.7386836475015468, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 1.6321927905082703, "rewards/thk_ans_format_reward": 0.9895833432674408, "step": 956, "think_completion_length": 5.833333333333333 }, { "clip_ratio": 0.0, "completion_length": 140.3854217529297, "epoch": 3.232715008431703, "grad_norm": 25.410441482857728, "kl": 0.4921875, "learning_rate": 7.305743243243243e-07, "loss": 0.0005, "reward": 3.1916489601135254, "reward_std": 0.4065837115049362, "rewards/final_reward": 1.5027361413481117, "rewards/mask_iou_reward": 0.7513680706740559, "rewards/sam_format_reward": 0.9687500298023224, "rewards/sam_reward_func_ultra": 1.2333155870437622, "rewards/thk_ans_format_reward": 0.9895833432674408, "step": 957, "think_completion_length": 5.666666666666666 }, { "clip_ratio": 0.0, "completion_length": 151.47916793823242, "epoch": 3.236087689713322, "grad_norm": 6.036564560288615, "kl": 0.498046875, "learning_rate": 7.302927927927928e-07, "loss": 0.0005, "reward": 3.3439793586730957, "reward_std": 0.2639719545841217, "rewards/final_reward": 1.7893254228602058, "rewards/mask_iou_reward": 0.8946627114301029, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 1.3648127317428589, "rewards/thk_ans_format_reward": 0.9895833432674408, "step": 958, "think_completion_length": 6.541666666666666 }, { "clip_ratio": 0.0, "completion_length": 119.30208587646484, "epoch": 3.2394603709949408, "grad_norm": 9.480511697981585, "kl": 0.56640625, "learning_rate": 7.300112612612613e-07, "loss": 0.0006, "reward": 3.5378222465515137, "reward_std": 0.06817605718970299, "rewards/final_reward": 1.8195322300541235, "rewards/mask_iou_reward": 0.9097661150270617, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.537822186946869, "rewards/thk_ans_format_reward": 1.0, "step": 959, "think_completion_length": 5.583333333333334 }, { "clip_ratio": 0.0, "completion_length": 148.59375762939453, "epoch": 3.24283305227656, "grad_norm": 21.98608140049359, "kl": 0.626953125, "learning_rate": 7.297297297297297e-07, "loss": 0.0006, "reward": 3.4865334033966064, "reward_std": 0.1169372908771038, "rewards/final_reward": 0.8317292459713557, "rewards/mask_iou_reward": 0.41586462298567783, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.486533284187317, "rewards/thk_ans_format_reward": 1.0, "step": 960, "think_completion_length": 6.625 }, { "clip_ratio": 0.0, "completion_length": 118.96875, "epoch": 3.246205733558179, "grad_norm": 22.098295435474142, "kl": 0.626953125, "learning_rate": 7.294481981981982e-07, "loss": 0.0006, "reward": 3.630619168281555, "reward_std": 0.10067232511937618, "rewards/final_reward": 1.8147314003863455, "rewards/mask_iou_reward": 0.9073657001931728, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6306187510490417, "rewards/thk_ans_format_reward": 1.0, "step": 961, "think_completion_length": 6.458333333333333 }, { "clip_ratio": 0.0, "completion_length": 193.14583587646484, "epoch": 3.2495784148397977, "grad_norm": 56.289192010391524, "kl": 0.5048828125, "learning_rate": 7.291666666666666e-07, "loss": 0.0005, "reward": 3.6313655376434326, "reward_std": 0.312417708337307, "rewards/final_reward": 1.5549074475717943, "rewards/mask_iou_reward": 0.7774537237858972, "rewards/sam_format_reward": 0.96875, "rewards/sam_reward_func_ultra": 1.6938652992248535, "rewards/thk_ans_format_reward": 0.96875, "step": 962, "think_completion_length": 5.875 }, { "clip_ratio": 0.0, "completion_length": 142.40625381469727, "epoch": 3.2529510961214165, "grad_norm": 10.269311605816217, "kl": 0.529296875, "learning_rate": 7.288851351351351e-07, "loss": 0.0005, "reward": 3.4660444259643555, "reward_std": 0.17312489449977875, "rewards/final_reward": 1.5843367482599913, "rewards/mask_iou_reward": 0.7921683741299956, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 1.4764612317085266, "rewards/thk_ans_format_reward": 1.0, "step": 963, "think_completion_length": 5.75 }, { "clip_ratio": 0.0, "completion_length": 117.29166793823242, "epoch": 3.2563237774030354, "grad_norm": 9.743631276599844, "kl": 0.619140625, "learning_rate": 7.286036036036037e-07, "loss": 0.0006, "reward": 3.665140748023987, "reward_std": 0.07369671761989594, "rewards/final_reward": 1.884653197648869, "rewards/mask_iou_reward": 0.9423265988244345, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6651407480239868, "rewards/thk_ans_format_reward": 1.0, "step": 964, "think_completion_length": 6.0 }, { "clip_ratio": 0.0, "completion_length": 118.54166793823242, "epoch": 3.259696458684654, "grad_norm": 6.260349384279472, "kl": 0.62109375, "learning_rate": 7.283220720720721e-07, "loss": 0.0006, "reward": 3.545573592185974, "reward_std": 0.0774066224694252, "rewards/final_reward": 1.257693189516263, "rewards/mask_iou_reward": 0.6288465947581315, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5455738306045532, "rewards/thk_ans_format_reward": 1.0, "step": 965, "think_completion_length": 6.5 }, { "clip_ratio": 0.0, "completion_length": 157.67708587646484, "epoch": 3.263069139966273, "grad_norm": 10.940295050921081, "kl": 0.560546875, "learning_rate": 7.280405405405406e-07, "loss": 0.0006, "reward": 3.335355520248413, "reward_std": 0.3451412171125412, "rewards/final_reward": 1.4840171773306559, "rewards/mask_iou_reward": 0.7420085886653279, "rewards/sam_format_reward": 0.9791666865348816, "rewards/sam_reward_func_ultra": 1.3770219087600708, "rewards/thk_ans_format_reward": 0.9791666865348816, "step": 966, "think_completion_length": 6.208333333333333 }, { "clip_ratio": 0.0, "completion_length": 171.90625, "epoch": 3.2664418212478923, "grad_norm": 18.922494525837497, "kl": 0.60546875, "learning_rate": 7.27759009009009e-07, "loss": 0.0006, "reward": 3.4567893743515015, "reward_std": 0.19448533281683922, "rewards/final_reward": 1.2659538781637694, "rewards/mask_iou_reward": 0.6329769390818847, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 1.4672061204910278, "rewards/thk_ans_format_reward": 1.0, "step": 967, "think_completion_length": 6.166666666666666 }, { "clip_ratio": 0.0, "completion_length": 171.4791717529297, "epoch": 3.269814502529511, "grad_norm": 14.231900921928126, "kl": 0.5859375, "learning_rate": 7.274774774774774e-07, "loss": 0.0006, "reward": 3.3344634771347046, "reward_std": 0.286946564912796, "rewards/final_reward": 1.7930482971354982, "rewards/mask_iou_reward": 0.8965241485677491, "rewards/sam_format_reward": 0.9791666865348816, "rewards/sam_reward_func_ultra": 1.3761300444602966, "rewards/thk_ans_format_reward": 0.9791666865348816, "step": 968, "think_completion_length": 6.208333333333334 }, { "clip_ratio": 0.0, "completion_length": 170.1770896911621, "epoch": 3.27318718381113, "grad_norm": 6.237807951070738, "kl": 0.56640625, "learning_rate": 7.271959459459459e-07, "loss": 0.0006, "reward": 3.3685081005096436, "reward_std": 0.1493084542453289, "rewards/final_reward": 0.5514383027772634, "rewards/mask_iou_reward": 0.2757191513886317, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 1.3789244294166565, "rewards/thk_ans_format_reward": 1.0, "step": 969, "think_completion_length": 6.875 }, { "clip_ratio": 0.0, "completion_length": 124.6875, "epoch": 3.2765598650927488, "grad_norm": 37.178831343629014, "kl": 0.607421875, "learning_rate": 7.269144144144143e-07, "loss": 0.0006, "reward": 3.1892114877700806, "reward_std": 0.07886414229869843, "rewards/final_reward": 1.2344537926142465, "rewards/mask_iou_reward": 0.6172268963071232, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1892114281654358, "rewards/thk_ans_format_reward": 1.0, "step": 970, "think_completion_length": 6.0 }, { "clip_ratio": 0.0, "completion_length": 106.20833587646484, "epoch": 3.2799325463743676, "grad_norm": 30.71626399986299, "kl": 0.7421875, "learning_rate": 7.266328828828828e-07, "loss": 0.0007, "reward": 3.5415124893188477, "reward_std": 0.16267375275492668, "rewards/final_reward": 1.4044140215816594, "rewards/mask_iou_reward": 0.7022070107908297, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5415124297142029, "rewards/thk_ans_format_reward": 1.0, "step": 971, "think_completion_length": 5.791666666666666 }, { "clip_ratio": 0.0, "completion_length": 133.0104217529297, "epoch": 3.2833052276559864, "grad_norm": 42.712795178679805, "kl": 0.572265625, "learning_rate": 7.263513513513512e-07, "loss": 0.0006, "reward": 3.5569279193878174, "reward_std": 0.16379550099372864, "rewards/final_reward": 1.6601145498407242, "rewards/mask_iou_reward": 0.8300572749203621, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5569279789924622, "rewards/thk_ans_format_reward": 1.0, "step": 972, "think_completion_length": 6.708333333333334 }, { "clip_ratio": 0.0, "completion_length": 169.0104217529297, "epoch": 3.2866779089376053, "grad_norm": 19.441170666993003, "kl": 0.5546875, "learning_rate": 7.260698198198197e-07, "loss": 0.0006, "reward": 3.3968396186828613, "reward_std": 0.23672273010015488, "rewards/final_reward": 1.2417896327478177, "rewards/mask_iou_reward": 0.6208948163739089, "rewards/sam_format_reward": 0.96875, "rewards/sam_reward_func_ultra": 1.438506305217743, "rewards/thk_ans_format_reward": 0.9895833432674408, "step": 973, "think_completion_length": 5.375 }, { "clip_ratio": 0.0, "completion_length": 137.6979217529297, "epoch": 3.2900505902192245, "grad_norm": 13.760331090171656, "kl": 0.5078125, "learning_rate": 7.257882882882883e-07, "loss": 0.0005, "reward": 3.2817405462265015, "reward_std": 0.2441714182496071, "rewards/final_reward": 1.0549440408050355, "rewards/mask_iou_reward": 0.5274720204025177, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2817405462265015, "rewards/thk_ans_format_reward": 1.0, "step": 974, "think_completion_length": 6.541666666666666 }, { "clip_ratio": 0.0, "completion_length": 123.55208587646484, "epoch": 3.2934232715008434, "grad_norm": 18.820609699132905, "kl": 0.58203125, "learning_rate": 7.255067567567567e-07, "loss": 0.0006, "reward": 3.1479690074920654, "reward_std": 0.18045621365308762, "rewards/final_reward": 1.0056742193732744, "rewards/mask_iou_reward": 0.5028371096866372, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.147968977689743, "rewards/thk_ans_format_reward": 1.0, "step": 975, "think_completion_length": 6.958333333333334 }, { "clip_ratio": 0.0, "completion_length": 119.09375381469727, "epoch": 3.296795952782462, "grad_norm": 9.561561905297891, "kl": 0.71875, "learning_rate": 7.252252252252252e-07, "loss": 0.0007, "reward": 3.4808413982391357, "reward_std": 0.045143453404307365, "rewards/final_reward": 1.7793298953695453, "rewards/mask_iou_reward": 0.8896649476847727, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4808412790298462, "rewards/thk_ans_format_reward": 1.0, "step": 976, "think_completion_length": 6.083333333333334 }, { "clip_ratio": 0.0, "completion_length": 155.4583396911621, "epoch": 3.300168634064081, "grad_norm": 25.168918736528095, "kl": 0.5029296875, "learning_rate": 7.249436936936937e-07, "loss": 0.0005, "reward": 3.6008447408676147, "reward_std": 0.15011634677648544, "rewards/final_reward": 1.8015549346745128, "rewards/mask_iou_reward": 0.9007774673372564, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.60084468126297, "rewards/thk_ans_format_reward": 1.0, "step": 977, "think_completion_length": 5.666666666666666 }, { "clip_ratio": 0.0, "completion_length": 117.65625, "epoch": 3.3035413153457, "grad_norm": 11.306943843758415, "kl": 0.615234375, "learning_rate": 7.246621621621621e-07, "loss": 0.0006, "reward": 3.3849350214004517, "reward_std": 0.07456074655056, "rewards/final_reward": 1.1514664823950427, "rewards/mask_iou_reward": 0.5757332411975213, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3849350214004517, "rewards/thk_ans_format_reward": 1.0, "step": 978, "think_completion_length": 5.083333333333334 }, { "clip_ratio": 0.0, "completion_length": 130.93750762939453, "epoch": 3.3069139966273187, "grad_norm": 10.21767200683652, "kl": 0.654296875, "learning_rate": 7.243806306306306e-07, "loss": 0.0007, "reward": 3.547337055206299, "reward_std": 0.17310263961553574, "rewards/final_reward": 1.8143662268646201, "rewards/mask_iou_reward": 0.9071831134323101, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5473372340202332, "rewards/thk_ans_format_reward": 1.0, "step": 979, "think_completion_length": 5.666666666666667 }, { "clip_ratio": 0.0, "completion_length": 126.69791793823242, "epoch": 3.3102866779089375, "grad_norm": 11.932914959574404, "kl": 1.396484375, "learning_rate": 7.24099099099099e-07, "loss": 0.0014, "reward": 3.4563785791397095, "reward_std": 0.20236939936876297, "rewards/final_reward": 1.5273377877499197, "rewards/mask_iou_reward": 0.7636688938749598, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.456378698348999, "rewards/thk_ans_format_reward": 1.0, "step": 980, "think_completion_length": 6.958333333333334 }, { "clip_ratio": 0.0, "completion_length": 124.21875, "epoch": 3.3136593591905563, "grad_norm": 18.065931443302414, "kl": 0.56640625, "learning_rate": 7.238175675675675e-07, "loss": 0.0006, "reward": 3.3971447944641113, "reward_std": 0.1286163404583931, "rewards/final_reward": 1.5317366674830568, "rewards/mask_iou_reward": 0.7658683337415284, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3971449732780457, "rewards/thk_ans_format_reward": 1.0, "step": 981, "think_completion_length": 4.958333333333334 }, { "clip_ratio": 0.0, "completion_length": 128.17708587646484, "epoch": 3.317032040472175, "grad_norm": 11.931088848754653, "kl": 0.541015625, "learning_rate": 7.23536036036036e-07, "loss": 0.0006, "reward": 3.5437206029891968, "reward_std": 0.09862393140792847, "rewards/final_reward": 1.673236912252018, "rewards/mask_iou_reward": 0.836618456126009, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5437206029891968, "rewards/thk_ans_format_reward": 1.0, "step": 982, "think_completion_length": 5.458333333333334 }, { "clip_ratio": 0.0, "completion_length": 139.1979217529297, "epoch": 3.3204047217537944, "grad_norm": 14.687787310829892, "kl": 0.55859375, "learning_rate": 7.232545045045044e-07, "loss": 0.0006, "reward": 3.3935028314590454, "reward_std": 0.09046576172113419, "rewards/final_reward": 1.3983763624905539, "rewards/mask_iou_reward": 0.6991881812452769, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3935027718544006, "rewards/thk_ans_format_reward": 1.0, "step": 983, "think_completion_length": 5.5 }, { "clip_ratio": 0.0, "completion_length": 167.0729217529297, "epoch": 3.3237774030354132, "grad_norm": 15.914576651462694, "kl": 0.578125, "learning_rate": 7.22972972972973e-07, "loss": 0.0006, "reward": 3.3592745065689087, "reward_std": 0.2667535990476608, "rewards/final_reward": 1.5896124561677536, "rewards/mask_iou_reward": 0.7948062280838768, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 1.3801075220108032, "rewards/thk_ans_format_reward": 0.9895833432674408, "step": 984, "think_completion_length": 6.208333333333333 }, { "clip_ratio": 0.0, "completion_length": 125.88541793823242, "epoch": 3.327150084317032, "grad_norm": 16.2544577755076, "kl": 0.615234375, "learning_rate": 7.226914414414414e-07, "loss": 0.0006, "reward": 3.4338074922561646, "reward_std": 0.18879379332065582, "rewards/final_reward": 1.4561126090209395, "rewards/mask_iou_reward": 0.7280563045104698, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4338074922561646, "rewards/thk_ans_format_reward": 1.0, "step": 985, "think_completion_length": 5.458333333333334 }, { "clip_ratio": 0.0, "completion_length": 161.7291717529297, "epoch": 3.330522765598651, "grad_norm": 24.925539285671643, "kl": 0.51953125, "learning_rate": 7.224099099099099e-07, "loss": 0.0005, "reward": 3.032238721847534, "reward_std": 0.1719028726220131, "rewards/final_reward": 0.9244178777787244, "rewards/mask_iou_reward": 0.4622089388893622, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 1.0426553189754486, "rewards/thk_ans_format_reward": 1.0, "step": 986, "think_completion_length": 6.0 }, { "clip_ratio": 0.0, "completion_length": 122.30208587646484, "epoch": 3.3338954468802697, "grad_norm": 12.043070314100586, "kl": 0.583984375, "learning_rate": 7.221283783783784e-07, "loss": 0.0006, "reward": 3.320284605026245, "reward_std": 0.17223292589187622, "rewards/final_reward": 1.2791357239045653, "rewards/mask_iou_reward": 0.6395678619522827, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3202844858169556, "rewards/thk_ans_format_reward": 1.0, "step": 987, "think_completion_length": 5.416666666666666 }, { "clip_ratio": 0.0, "completion_length": 139.00000762939453, "epoch": 3.3372681281618886, "grad_norm": 58.09821409600536, "kl": 0.591796875, "learning_rate": 7.218468468468468e-07, "loss": 0.0007, "reward": 2.9446089267730713, "reward_std": 0.19425636157393456, "rewards/final_reward": 0.6028217155810873, "rewards/mask_iou_reward": 0.30141085779054366, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9446086883544922, "rewards/thk_ans_format_reward": 1.0, "step": 988, "think_completion_length": 6.458333333333333 }, { "clip_ratio": 0.0, "completion_length": 134.46875, "epoch": 3.3406408094435074, "grad_norm": 25.87132248920914, "kl": 2.58984375, "learning_rate": 7.215653153153153e-07, "loss": 0.0026, "reward": 3.2142951488494873, "reward_std": 0.08715818449854851, "rewards/final_reward": 0.05117792402181443, "rewards/mask_iou_reward": 0.025588962010907216, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2142953276634216, "rewards/thk_ans_format_reward": 1.0, "step": 989, "think_completion_length": 6.166666666666666 }, { "clip_ratio": 0.0, "completion_length": 118.56250381469727, "epoch": 3.3440134907251267, "grad_norm": 29.103910507604148, "kl": 0.67578125, "learning_rate": 7.212837837837837e-07, "loss": 0.0007, "reward": 3.138665556907654, "reward_std": 0.13922191970050335, "rewards/final_reward": 0.9125553380184775, "rewards/mask_iou_reward": 0.4562776690092388, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1386656165122986, "rewards/thk_ans_format_reward": 1.0, "step": 990, "think_completion_length": 6.875 }, { "clip_ratio": 0.0, "completion_length": 118.48958587646484, "epoch": 3.3473861720067455, "grad_norm": 76.55762479280577, "kl": 0.5703125, "learning_rate": 7.210022522522522e-07, "loss": 0.0006, "reward": 3.4942747354507446, "reward_std": 0.050971828401088715, "rewards/final_reward": 1.619205718547097, "rewards/mask_iou_reward": 0.8096028592735485, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4942745566368103, "rewards/thk_ans_format_reward": 1.0, "step": 991, "think_completion_length": 6.125 }, { "clip_ratio": 0.0, "completion_length": 161.48958587646484, "epoch": 3.3507588532883643, "grad_norm": 8.513639306430193, "kl": 0.5546875, "learning_rate": 7.207207207207207e-07, "loss": 0.0006, "reward": 3.1507691144943237, "reward_std": 0.13545623049139977, "rewards/final_reward": 1.7922970168952874, "rewards/mask_iou_reward": 0.8961485084476437, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 1.1716025471687317, "rewards/thk_ans_format_reward": 0.9895833432674408, "step": 992, "think_completion_length": 5.208333333333334 }, { "clip_ratio": 0.0, "completion_length": 183.14583587646484, "epoch": 3.354131534569983, "grad_norm": 14.357147390481792, "kl": 0.552734375, "learning_rate": 7.204391891891891e-07, "loss": 0.0006, "reward": 3.3162766695022583, "reward_std": 0.19133785367012024, "rewards/final_reward": 1.6321412486895803, "rewards/mask_iou_reward": 0.8160706243447902, "rewards/sam_format_reward": 0.9791666865348816, "rewards/sam_reward_func_ultra": 1.3579435348510742, "rewards/thk_ans_format_reward": 0.9791666865348816, "step": 993, "think_completion_length": 6.125 }, { "clip_ratio": 0.0, "completion_length": 137.5729217529297, "epoch": 3.357504215851602, "grad_norm": 16.092738065915093, "kl": 0.6015625, "learning_rate": 7.201576576576577e-07, "loss": 0.0006, "reward": 3.2174041271209717, "reward_std": 0.1023724116384983, "rewards/final_reward": 1.379500604239666, "rewards/mask_iou_reward": 0.689750302119833, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2174039483070374, "rewards/thk_ans_format_reward": 1.0, "step": 994, "think_completion_length": 6.333333333333334 }, { "clip_ratio": 0.0, "completion_length": 118.45833587646484, "epoch": 3.360876897133221, "grad_norm": 6.582151617464147, "kl": 0.580078125, "learning_rate": 7.198761261261262e-07, "loss": 0.0006, "reward": 3.3776434659957886, "reward_std": 0.18915196508169174, "rewards/final_reward": 1.344115784489828, "rewards/mask_iou_reward": 0.672057892244914, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3776432871818542, "rewards/thk_ans_format_reward": 1.0, "step": 995, "think_completion_length": 4.958333333333334 }, { "clip_ratio": 0.0, "completion_length": 117.53125381469727, "epoch": 3.3642495784148396, "grad_norm": 745.984834898941, "kl": 0.603515625, "learning_rate": 7.195945945945946e-07, "loss": 0.0006, "reward": 3.356271982192993, "reward_std": 0.10604298114776611, "rewards/final_reward": 0.7671502588916607, "rewards/mask_iou_reward": 0.38357512944583033, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3562718629837036, "rewards/thk_ans_format_reward": 1.0, "step": 996, "think_completion_length": 7.458333333333333 }, { "clip_ratio": 0.0, "completion_length": 116.72916793823242, "epoch": 3.367622259696459, "grad_norm": 9.526246336319616, "kl": 1.103515625, "learning_rate": 7.193130630630631e-07, "loss": 0.0011, "reward": 3.502687931060791, "reward_std": 0.11498154327273369, "rewards/final_reward": 1.8288208638873686, "rewards/mask_iou_reward": 0.9144104319436843, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5026878118515015, "rewards/thk_ans_format_reward": 1.0, "step": 997, "think_completion_length": 6.375 }, { "clip_ratio": 0.0, "completion_length": 127.39583587646484, "epoch": 3.3709949409780777, "grad_norm": 14.226556765416788, "kl": 0.70703125, "learning_rate": 7.190315315315315e-07, "loss": 0.0007, "reward": 3.314034104347229, "reward_std": 0.1424049399793148, "rewards/final_reward": 1.7067345169496768, "rewards/mask_iou_reward": 0.8533672584748384, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3140341639518738, "rewards/thk_ans_format_reward": 1.0, "step": 998, "think_completion_length": 5.708333333333333 }, { "clip_ratio": 0.0, "completion_length": 137.9583396911621, "epoch": 3.3743676222596966, "grad_norm": 23.291971729599407, "kl": 0.560546875, "learning_rate": 7.1875e-07, "loss": 0.0006, "reward": 3.2234108448028564, "reward_std": 0.12070946767926216, "rewards/final_reward": 1.400489735686926, "rewards/mask_iou_reward": 0.700244867843463, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2234107553958893, "rewards/thk_ans_format_reward": 1.0, "step": 999, "think_completion_length": 5.75 }, { "clip_ratio": 0.0, "completion_length": 117.73958587646484, "epoch": 3.3777403035413154, "grad_norm": 14.459397846292022, "kl": 0.689453125, "learning_rate": 7.184684684684685e-07, "loss": 0.0007, "reward": 3.5166208744049072, "reward_std": 0.19298768788576126, "rewards/final_reward": 1.4085889481186704, "rewards/mask_iou_reward": 0.7042944740593352, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5166207551956177, "rewards/thk_ans_format_reward": 1.0, "step": 1000, "think_completion_length": 5.958333333333334 }, { "clip_ratio": 0.0, "completion_length": 121.41667175292969, "epoch": 3.381112984822934, "grad_norm": 39.82344513333924, "kl": 0.552734375, "learning_rate": 7.181869369369369e-07, "loss": 0.0006, "reward": 3.3766207695007324, "reward_std": 0.1761082075536251, "rewards/final_reward": 1.035476774600391, "rewards/mask_iou_reward": 0.5177383873001955, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3766207098960876, "rewards/thk_ans_format_reward": 1.0, "step": 1001, "think_completion_length": 5.875 }, { "clip_ratio": 0.0, "completion_length": 118.48958587646484, "epoch": 3.384485666104553, "grad_norm": 26.368496651256653, "kl": 0.654296875, "learning_rate": 7.179054054054054e-07, "loss": 0.0007, "reward": 3.17868971824646, "reward_std": 0.1292325034737587, "rewards/final_reward": 1.2441194261216724, "rewards/mask_iou_reward": 0.6220597130608362, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1786896586418152, "rewards/thk_ans_format_reward": 1.0, "step": 1002, "think_completion_length": 5.875 }, { "clip_ratio": 0.0, "completion_length": 117.62500381469727, "epoch": 3.387858347386172, "grad_norm": 16.701541199494095, "kl": 0.583984375, "learning_rate": 7.176238738738738e-07, "loss": 0.0006, "reward": 3.057352304458618, "reward_std": 0.32341183722019196, "rewards/final_reward": 1.515199413691938, "rewards/mask_iou_reward": 0.757599706845969, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.057352364063263, "rewards/thk_ans_format_reward": 1.0, "step": 1003, "think_completion_length": 6.375 }, { "clip_ratio": 0.0, "completion_length": 118.09375, "epoch": 3.391231028667791, "grad_norm": 10.720874146170441, "kl": 0.78515625, "learning_rate": 7.173423423423423e-07, "loss": 0.0008, "reward": 2.9834378957748413, "reward_std": 0.18407592922449112, "rewards/final_reward": 1.285521686629659, "rewards/mask_iou_reward": 0.6427608433148295, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9834379553794861, "rewards/thk_ans_format_reward": 1.0, "step": 1004, "think_completion_length": 6.625 }, { "clip_ratio": 0.0, "completion_length": 118.78125, "epoch": 3.39460370994941, "grad_norm": 12.028385144223307, "kl": 0.53125, "learning_rate": 7.170608108108109e-07, "loss": 0.0005, "reward": 3.208145260810852, "reward_std": 0.22115997970104218, "rewards/final_reward": 1.1253257130810643, "rewards/mask_iou_reward": 0.5626628565405322, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2081450521945953, "rewards/thk_ans_format_reward": 1.0, "step": 1005, "think_completion_length": 6.625 }, { "clip_ratio": 0.0, "completion_length": 118.27083587646484, "epoch": 3.397976391231029, "grad_norm": 294.64644819140955, "kl": 0.916015625, "learning_rate": 7.167792792792793e-07, "loss": 0.0009, "reward": 3.362850785255432, "reward_std": 0.12509119138121605, "rewards/final_reward": 1.4450654677791082, "rewards/mask_iou_reward": 0.7225327338895541, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3628507852554321, "rewards/thk_ans_format_reward": 1.0, "step": 1006, "think_completion_length": 6.833333333333334 }, { "clip_ratio": 0.0, "completion_length": 135.90625762939453, "epoch": 3.4013490725126476, "grad_norm": 11.539631893789023, "kl": 0.58203125, "learning_rate": 7.164977477477477e-07, "loss": 0.0006, "reward": 3.2372653484344482, "reward_std": 0.2422248274087906, "rewards/final_reward": 1.3411681388721064, "rewards/mask_iou_reward": 0.6705840694360532, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2372653186321259, "rewards/thk_ans_format_reward": 1.0, "step": 1007, "think_completion_length": 4.791666666666667 }, { "clip_ratio": 0.0, "completion_length": 118.00000381469727, "epoch": 3.4047217537942664, "grad_norm": 14.000104300475737, "kl": 0.6171875, "learning_rate": 7.162162162162161e-07, "loss": 0.0006, "reward": 3.285541296005249, "reward_std": 0.16645950078964233, "rewards/final_reward": 0.8122575552678174, "rewards/mask_iou_reward": 0.4061287776339087, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2855412364006042, "rewards/thk_ans_format_reward": 1.0, "step": 1008, "think_completion_length": 5.916666666666667 }, { "clip_ratio": 0.0, "completion_length": 117.14583587646484, "epoch": 3.4080944350758853, "grad_norm": 6.440987836858945, "kl": 0.583984375, "learning_rate": 7.159346846846846e-07, "loss": 0.0006, "reward": 3.5113813877105713, "reward_std": 0.17040079832077026, "rewards/final_reward": 1.3076065447677623, "rewards/mask_iou_reward": 0.6538032723838811, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5113813281059265, "rewards/thk_ans_format_reward": 1.0, "step": 1009, "think_completion_length": 5.708333333333334 }, { "clip_ratio": 0.0, "completion_length": 121.46875381469727, "epoch": 3.411467116357504, "grad_norm": 77.78402545991266, "kl": 0.591796875, "learning_rate": 7.156531531531531e-07, "loss": 0.0006, "reward": 3.265267848968506, "reward_std": 0.0866379663348198, "rewards/final_reward": 1.559751116566344, "rewards/mask_iou_reward": 0.779875558283172, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2652679085731506, "rewards/thk_ans_format_reward": 1.0, "step": 1010, "think_completion_length": 7.458333333333334 }, { "clip_ratio": 0.0, "completion_length": 119.38541793823242, "epoch": 3.414839797639123, "grad_norm": 9.63407648262111, "kl": 0.671875, "learning_rate": 7.153716216216215e-07, "loss": 0.0007, "reward": 3.1705085039138794, "reward_std": 0.1311897709965706, "rewards/final_reward": 0.8278068040052939, "rewards/mask_iou_reward": 0.41390340200264697, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1705085337162018, "rewards/thk_ans_format_reward": 1.0, "step": 1011, "think_completion_length": 5.916666666666666 }, { "clip_ratio": 0.0, "completion_length": 117.19791793823242, "epoch": 3.4182124789207418, "grad_norm": 10.822874300002365, "kl": 0.5703125, "learning_rate": 7.1509009009009e-07, "loss": 0.0006, "reward": 2.9343758821487427, "reward_std": 0.1505995076149702, "rewards/final_reward": 0.016859356044498678, "rewards/mask_iou_reward": 0.008429678022249339, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9343758225440979, "rewards/thk_ans_format_reward": 1.0, "step": 1012, "think_completion_length": 5.708333333333333 }, { "clip_ratio": 0.0, "completion_length": 107.15625, "epoch": 3.421585160202361, "grad_norm": 30.516206363772966, "kl": 0.65234375, "learning_rate": 7.148085585585584e-07, "loss": 0.0007, "reward": 3.3128777742385864, "reward_std": 0.09216344356536865, "rewards/final_reward": 1.3309258266346848, "rewards/mask_iou_reward": 0.6654629133173424, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3128776550292969, "rewards/thk_ans_format_reward": 1.0, "step": 1013, "think_completion_length": 7.958333333333334 }, { "clip_ratio": 0.0, "completion_length": 107.69791793823242, "epoch": 3.42495784148398, "grad_norm": 9.172659732253166, "kl": 0.619140625, "learning_rate": 7.145270270270269e-07, "loss": 0.0006, "reward": 3.099686861038208, "reward_std": 0.16987330839037895, "rewards/final_reward": 0.5487296181785621, "rewards/mask_iou_reward": 0.27436480908928107, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.099686861038208, "rewards/thk_ans_format_reward": 1.0, "step": 1014, "think_completion_length": 5.958333333333333 }, { "clip_ratio": 0.0, "completion_length": 118.57291793823242, "epoch": 3.4283305227655987, "grad_norm": 14.378581603595407, "kl": 0.583984375, "learning_rate": 7.142454954954955e-07, "loss": 0.0006, "reward": 3.4818849563598633, "reward_std": 0.09820106998085976, "rewards/final_reward": 0.8706115831652791, "rewards/mask_iou_reward": 0.43530579158263955, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.481884777545929, "rewards/thk_ans_format_reward": 1.0, "step": 1015, "think_completion_length": 7.041666666666666 }, { "clip_ratio": 0.0, "completion_length": 119.18750381469727, "epoch": 3.4317032040472175, "grad_norm": 81.49210537692733, "kl": 0.578125, "learning_rate": 7.139639639639639e-07, "loss": 0.0006, "reward": 3.278176784515381, "reward_std": 0.17319020628929138, "rewards/final_reward": 1.311001176552672, "rewards/mask_iou_reward": 0.655500588276336, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.278176724910736, "rewards/thk_ans_format_reward": 1.0, "step": 1016, "think_completion_length": 6.666666666666666 }, { "clip_ratio": 0.0, "completion_length": 117.10416793823242, "epoch": 3.4350758853288363, "grad_norm": 9.026460969823404, "kl": 0.72265625, "learning_rate": 7.136824324324324e-07, "loss": 0.0007, "reward": 3.395389676094055, "reward_std": 0.24909770116209984, "rewards/final_reward": 1.7078976175332112, "rewards/mask_iou_reward": 0.8539488087666056, "rewards/sam_format_reward": 0.9479166865348816, "rewards/sam_reward_func_ultra": 1.447472870349884, "rewards/thk_ans_format_reward": 1.0, "step": 1017, "think_completion_length": 5.541666666666667 }, { "clip_ratio": 0.0, "completion_length": 137.09375762939453, "epoch": 3.438448566610455, "grad_norm": 15.858132519987185, "kl": 0.595703125, "learning_rate": 7.134009009009009e-07, "loss": 0.0006, "reward": 3.220812678337097, "reward_std": 0.048954208847135305, "rewards/final_reward": 1.0691695640976275, "rewards/mask_iou_reward": 0.5345847820488138, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2208125591278076, "rewards/thk_ans_format_reward": 1.0, "step": 1018, "think_completion_length": 5.791666666666666 }, { "clip_ratio": 0.0, "completion_length": 118.25, "epoch": 3.441821247892074, "grad_norm": 17.508912895515763, "kl": 0.54296875, "learning_rate": 7.131193693693693e-07, "loss": 0.0005, "reward": 3.421711802482605, "reward_std": 0.08426211401820183, "rewards/final_reward": 1.4825259952470544, "rewards/mask_iou_reward": 0.7412629976235272, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4217117428779602, "rewards/thk_ans_format_reward": 1.0, "step": 1019, "think_completion_length": 6.291666666666667 }, { "clip_ratio": 0.0, "completion_length": 129.70833587646484, "epoch": 3.4451939291736933, "grad_norm": 9.819818344541755, "kl": 0.5859375, "learning_rate": 7.128378378378378e-07, "loss": 0.0006, "reward": 3.5098577737808228, "reward_std": 0.0756399855017662, "rewards/final_reward": 1.8016748094536283, "rewards/mask_iou_reward": 0.9008374047268142, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.509857714176178, "rewards/thk_ans_format_reward": 1.0, "step": 1020, "think_completion_length": 6.0 }, { "clip_ratio": 0.0, "completion_length": 118.62500381469727, "epoch": 3.448566610455312, "grad_norm": 9.870525631678277, "kl": 0.59765625, "learning_rate": 7.125563063063062e-07, "loss": 0.0006, "reward": 3.2804505825042725, "reward_std": 0.044149222783744335, "rewards/final_reward": 1.4287736457226252, "rewards/mask_iou_reward": 0.7143868228613126, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.280450701713562, "rewards/thk_ans_format_reward": 1.0, "step": 1021, "think_completion_length": 5.916666666666666 }, { "clip_ratio": 0.0, "completion_length": 118.01041793823242, "epoch": 3.451939291736931, "grad_norm": 14.709563001580337, "kl": 0.58984375, "learning_rate": 7.122747747747747e-07, "loss": 0.0006, "reward": 3.495166778564453, "reward_std": 0.08700309973210096, "rewards/final_reward": 1.4906810415064982, "rewards/mask_iou_reward": 0.7453405207532491, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4951667785644531, "rewards/thk_ans_format_reward": 1.0, "step": 1022, "think_completion_length": 5.875 }, { "clip_ratio": 0.0, "completion_length": 119.00000381469727, "epoch": 3.4553119730185498, "grad_norm": 11.39047362652609, "kl": 0.58203125, "learning_rate": 7.119932432432432e-07, "loss": 0.0006, "reward": 3.120210886001587, "reward_std": 0.11821487359702587, "rewards/final_reward": 0.7625099604747464, "rewards/mask_iou_reward": 0.3812549802373732, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1202109456062317, "rewards/thk_ans_format_reward": 1.0, "step": 1023, "think_completion_length": 7.375 }, { "clip_ratio": 0.0, "completion_length": 119.14583587646484, "epoch": 3.4586846543001686, "grad_norm": 12.194540716632517, "kl": 0.640625, "learning_rate": 7.117117117117116e-07, "loss": 0.0006, "reward": 3.422505021095276, "reward_std": 0.06610206328332424, "rewards/final_reward": 1.3190821987653705, "rewards/mask_iou_reward": 0.6595410993826852, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4225048422813416, "rewards/thk_ans_format_reward": 1.0, "step": 1024, "think_completion_length": 6.125 }, { "clip_ratio": 0.0, "completion_length": 120.20833587646484, "epoch": 3.4620573355817874, "grad_norm": 27.511552738652743, "kl": 0.59765625, "learning_rate": 7.114301801801802e-07, "loss": 0.0006, "reward": 3.3963574171066284, "reward_std": 0.05695942044258118, "rewards/final_reward": 1.834786710391498, "rewards/mask_iou_reward": 0.917393355195749, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3963574171066284, "rewards/thk_ans_format_reward": 1.0, "step": 1025, "think_completion_length": 6.291666666666666 }, { "clip_ratio": 0.0, "completion_length": 117.625, "epoch": 3.4654300168634062, "grad_norm": 32.633397899339734, "kl": 0.625, "learning_rate": 7.111486486486487e-07, "loss": 0.0006, "reward": 3.3901573419570923, "reward_std": 0.3244527727365494, "rewards/final_reward": 1.6744121917113473, "rewards/mask_iou_reward": 0.8372060958556736, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3901574611663818, "rewards/thk_ans_format_reward": 1.0, "step": 1026, "think_completion_length": 5.75 }, { "clip_ratio": 0.0, "completion_length": 119.47916793823242, "epoch": 3.4688026981450255, "grad_norm": 11.68248831433917, "kl": 0.578125, "learning_rate": 7.108671171171171e-07, "loss": 0.0006, "reward": 3.42316472530365, "reward_std": 0.1372026950120926, "rewards/final_reward": 1.2456748072284278, "rewards/mask_iou_reward": 0.6228374036142139, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4231645464897156, "rewards/thk_ans_format_reward": 1.0, "step": 1027, "think_completion_length": 6.666666666666666 }, { "clip_ratio": 0.0, "completion_length": 106.57291793823242, "epoch": 3.4721753794266443, "grad_norm": 15.197451678878082, "kl": 1.07421875, "learning_rate": 7.105855855855856e-07, "loss": 0.0011, "reward": 3.4628374576568604, "reward_std": 0.14769380167126656, "rewards/final_reward": 1.2430186680061066, "rewards/mask_iou_reward": 0.6215093340030533, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4628373980522156, "rewards/thk_ans_format_reward": 1.0, "step": 1028, "think_completion_length": 7.25 }, { "clip_ratio": 0.0, "completion_length": 132.56250381469727, "epoch": 3.475548060708263, "grad_norm": 8.819992248258746, "kl": 0.623046875, "learning_rate": 7.10304054054054e-07, "loss": 0.0006, "reward": 3.103990077972412, "reward_std": 0.17439210042357445, "rewards/final_reward": 0.11795010824929059, "rewards/mask_iou_reward": 0.05897505412464529, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1039898991584778, "rewards/thk_ans_format_reward": 1.0, "step": 1029, "think_completion_length": 6.666666666666666 }, { "clip_ratio": 0.0, "completion_length": 123.91666793823242, "epoch": 3.478920741989882, "grad_norm": 34.69363820910185, "kl": 0.83203125, "learning_rate": 7.100225225225225e-07, "loss": 0.0008, "reward": 3.3977935314178467, "reward_std": 0.08490690216422081, "rewards/final_reward": 1.6032253984319262, "rewards/mask_iou_reward": 0.8016126992159631, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3977934122085571, "rewards/thk_ans_format_reward": 1.0, "step": 1030, "think_completion_length": 5.791666666666666 }, { "clip_ratio": 0.0, "completion_length": 118.34375381469727, "epoch": 3.482293423271501, "grad_norm": 27.013938649093514, "kl": 0.5703125, "learning_rate": 7.09740990990991e-07, "loss": 0.0006, "reward": 3.4430354833602905, "reward_std": 0.10244200751185417, "rewards/final_reward": 1.7051466739701793, "rewards/mask_iou_reward": 0.8525733369850896, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4430354833602905, "rewards/thk_ans_format_reward": 1.0, "step": 1031, "think_completion_length": 5.625 }, { "clip_ratio": 0.0, "completion_length": 119.48958587646484, "epoch": 3.4856661045531196, "grad_norm": 12.142628291268021, "kl": 0.65625, "learning_rate": 7.094594594594594e-07, "loss": 0.0007, "reward": 3.4855599403381348, "reward_std": 0.14212478697299957, "rewards/final_reward": 1.9155205822997325, "rewards/mask_iou_reward": 0.9577602911498663, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4855600595474243, "rewards/thk_ans_format_reward": 1.0, "step": 1032, "think_completion_length": 6.875 }, { "clip_ratio": 0.0, "completion_length": 118.60416793823242, "epoch": 3.4890387858347385, "grad_norm": 36.40473198415941, "kl": 0.6171875, "learning_rate": 7.091779279279279e-07, "loss": 0.0006, "reward": 3.26971971988678, "reward_std": 0.11622913181781769, "rewards/final_reward": 1.2890900177746465, "rewards/mask_iou_reward": 0.6445450088873232, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2697195410728455, "rewards/thk_ans_format_reward": 1.0, "step": 1033, "think_completion_length": 7.458333333333334 }, { "clip_ratio": 0.0, "completion_length": 118.01041793823242, "epoch": 3.4924114671163577, "grad_norm": 8.404072392453646, "kl": 0.61328125, "learning_rate": 7.088963963963963e-07, "loss": 0.0007, "reward": 3.4270265102386475, "reward_std": 0.10212980210781097, "rewards/final_reward": 1.8368501315873327, "rewards/mask_iou_reward": 0.9184250657936663, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4270267486572266, "rewards/thk_ans_format_reward": 1.0, "step": 1034, "think_completion_length": 6.75 }, { "clip_ratio": 0.0, "completion_length": 118.04166793823242, "epoch": 3.4957841483979766, "grad_norm": 7.301922297356568, "kl": 0.587890625, "learning_rate": 7.086148648648649e-07, "loss": 0.0006, "reward": 3.2225120067596436, "reward_std": 0.18624672293663025, "rewards/final_reward": 1.7009970313208873, "rewards/mask_iou_reward": 0.8504985156604437, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.222511887550354, "rewards/thk_ans_format_reward": 1.0, "step": 1035, "think_completion_length": 6.25 }, { "clip_ratio": 0.0, "completion_length": 119.02083587646484, "epoch": 3.4991568296795954, "grad_norm": 11.427769416824685, "kl": 0.560546875, "learning_rate": 7.083333333333334e-07, "loss": 0.0006, "reward": 3.1986045837402344, "reward_std": 0.16096675768494606, "rewards/final_reward": 0.6870702507692115, "rewards/mask_iou_reward": 0.34353512538460573, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1986045241355896, "rewards/thk_ans_format_reward": 1.0, "step": 1036, "think_completion_length": 6.125 }, { "clip_ratio": 0.0, "completion_length": 124.78125381469727, "epoch": 3.5025295109612142, "grad_norm": 30.010243571681038, "kl": 0.564453125, "learning_rate": 7.080518018018018e-07, "loss": 0.0006, "reward": 3.338430643081665, "reward_std": 0.1133829839527607, "rewards/final_reward": 1.564505636833713, "rewards/mask_iou_reward": 0.7822528184168565, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.338430643081665, "rewards/thk_ans_format_reward": 1.0, "step": 1037, "think_completion_length": 5.5 }, { "clip_ratio": 0.0, "completion_length": 122.76042175292969, "epoch": 3.505902192242833, "grad_norm": 16.436476631009363, "kl": 0.55859375, "learning_rate": 7.077702702702703e-07, "loss": 0.0006, "reward": 3.476973533630371, "reward_std": 0.14361018687486649, "rewards/final_reward": 1.3896836175215372, "rewards/mask_iou_reward": 0.6948418087607686, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.476973533630371, "rewards/thk_ans_format_reward": 1.0, "step": 1038, "think_completion_length": 6.708333333333334 }, { "clip_ratio": 0.0, "completion_length": 116.56250381469727, "epoch": 3.509274873524452, "grad_norm": 24.14126315202132, "kl": 0.587890625, "learning_rate": 7.074887387387387e-07, "loss": 0.0006, "reward": 3.375579357147217, "reward_std": 0.057378935627639294, "rewards/final_reward": 1.8330912942769158, "rewards/mask_iou_reward": 0.9165456471384579, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3755793571472168, "rewards/thk_ans_format_reward": 1.0, "step": 1039, "think_completion_length": 7.833333333333334 }, { "clip_ratio": 0.0, "completion_length": 117.67708587646484, "epoch": 3.5126475548060707, "grad_norm": 11.93388249586833, "kl": 0.6484375, "learning_rate": 7.072072072072072e-07, "loss": 0.0007, "reward": 3.459592580795288, "reward_std": 0.10880524665117264, "rewards/final_reward": 1.161511180571614, "rewards/mask_iou_reward": 0.580755590285807, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4595925211906433, "rewards/thk_ans_format_reward": 1.0, "step": 1040, "think_completion_length": 6.458333333333334 }, { "clip_ratio": 0.0, "completion_length": 117.89583587646484, "epoch": 3.51602023608769, "grad_norm": 11.0119929277027, "kl": 0.669921875, "learning_rate": 7.069256756756757e-07, "loss": 0.0007, "reward": 2.977562189102173, "reward_std": 0.2377164401113987, "rewards/final_reward": 0.13724020798525324, "rewards/mask_iou_reward": 0.06862010399262662, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9775621891021729, "rewards/thk_ans_format_reward": 1.0, "step": 1041, "think_completion_length": 7.208333333333334 }, { "clip_ratio": 0.0, "completion_length": 118.52083587646484, "epoch": 3.5193929173693084, "grad_norm": 10.690119770965087, "kl": 0.6015625, "learning_rate": 7.066441441441441e-07, "loss": 0.0006, "reward": 3.548872470855713, "reward_std": 0.05538544990122318, "rewards/final_reward": 1.0239648673086363, "rewards/mask_iou_reward": 0.5119824336543182, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.548872470855713, "rewards/thk_ans_format_reward": 1.0, "step": 1042, "think_completion_length": 6.333333333333334 }, { "clip_ratio": 0.0, "completion_length": 134.15625, "epoch": 3.5227655986509276, "grad_norm": 13.157116461404055, "kl": 0.544921875, "learning_rate": 7.063626126126126e-07, "loss": 0.0005, "reward": 3.264909267425537, "reward_std": 0.08749586343765259, "rewards/final_reward": 1.3016585640148213, "rewards/mask_iou_reward": 0.6508292820074106, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2649091482162476, "rewards/thk_ans_format_reward": 1.0, "step": 1043, "think_completion_length": 5.958333333333333 }, { "clip_ratio": 0.0, "completion_length": 107.58333587646484, "epoch": 3.5261382799325465, "grad_norm": 32.2281674012685, "kl": 2.115234375, "learning_rate": 7.06081081081081e-07, "loss": 0.0021, "reward": 3.306381583213806, "reward_std": 0.1866590976715088, "rewards/final_reward": 1.010364329315494, "rewards/mask_iou_reward": 0.505182164657747, "rewards/sam_format_reward": 0.96875, "rewards/sam_reward_func_ultra": 1.3376315236091614, "rewards/thk_ans_format_reward": 1.0, "step": 1044, "think_completion_length": 6.666666666666667 }, { "clip_ratio": 0.0, "completion_length": 105.04166793823242, "epoch": 3.5295109612141653, "grad_norm": 25.27852220793088, "kl": 0.671875, "learning_rate": 7.057995495495496e-07, "loss": 0.0007, "reward": 3.2020857334136963, "reward_std": 0.07453176006674767, "rewards/final_reward": 1.422622440794243, "rewards/mask_iou_reward": 0.7113112203971215, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2020857334136963, "rewards/thk_ans_format_reward": 1.0, "step": 1045, "think_completion_length": 6.375 }, { "clip_ratio": 0.0, "completion_length": 117.01042175292969, "epoch": 3.532883642495784, "grad_norm": 21.604898654705316, "kl": 0.703125, "learning_rate": 7.055180180180181e-07, "loss": 0.0007, "reward": 3.4886926412582397, "reward_std": 0.18602124601602554, "rewards/final_reward": 0.9582341538163569, "rewards/mask_iou_reward": 0.47911707690817845, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4886927604675293, "rewards/thk_ans_format_reward": 1.0, "step": 1046, "think_completion_length": 5.25 }, { "clip_ratio": 0.0, "completion_length": 128.2916717529297, "epoch": 3.536256323777403, "grad_norm": 15.95223692471976, "kl": 0.69140625, "learning_rate": 7.052364864864864e-07, "loss": 0.0007, "reward": 3.167095899581909, "reward_std": 0.20952048152685165, "rewards/final_reward": 1.5740200985301716, "rewards/mask_iou_reward": 0.7870100492650858, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1670955419540405, "rewards/thk_ans_format_reward": 1.0, "step": 1047, "think_completion_length": 6.625 }, { "clip_ratio": 0.0, "completion_length": 119.97916793823242, "epoch": 3.539629005059022, "grad_norm": 17.810160005926072, "kl": 0.556640625, "learning_rate": 7.049549549549549e-07, "loss": 0.0006, "reward": 3.306037187576294, "reward_std": 0.2319406047463417, "rewards/final_reward": 1.0558346546299109, "rewards/mask_iou_reward": 0.5279173273149554, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.306037187576294, "rewards/thk_ans_format_reward": 1.0, "step": 1048, "think_completion_length": 5.75 }, { "clip_ratio": 0.0, "completion_length": 117.87500381469727, "epoch": 3.5430016863406406, "grad_norm": 163.3696580898773, "kl": 0.609375, "learning_rate": 7.046734234234234e-07, "loss": 0.0006, "reward": 3.6224377155303955, "reward_std": 0.013898211065679789, "rewards/final_reward": 1.6734318576156038, "rewards/mask_iou_reward": 0.8367159288078019, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6224374771118164, "rewards/thk_ans_format_reward": 1.0, "step": 1049, "think_completion_length": 5.5 }, { "clip_ratio": 0.0, "completion_length": 118.61458587646484, "epoch": 3.54637436762226, "grad_norm": 18.906848115177475, "kl": 0.666015625, "learning_rate": 7.043918918918918e-07, "loss": 0.0007, "reward": 3.389710783958435, "reward_std": 0.23390305787324905, "rewards/final_reward": 1.5296034337359203, "rewards/mask_iou_reward": 0.7648017168679602, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3897106647491455, "rewards/thk_ans_format_reward": 1.0, "step": 1050, "think_completion_length": 6.791666666666667 }, { "clip_ratio": 0.0, "completion_length": 147.37500381469727, "epoch": 3.5497470489038787, "grad_norm": 14.33713455543594, "kl": 0.580078125, "learning_rate": 7.041103603603603e-07, "loss": 0.0006, "reward": 2.87336528301239, "reward_std": 0.24747732281684875, "rewards/final_reward": 1.3500116230362549, "rewards/mask_iou_reward": 0.6750058115181274, "rewards/sam_format_reward": 0.9583333432674408, "rewards/sam_reward_func_ultra": 0.946281909942627, "rewards/thk_ans_format_reward": 0.96875, "step": 1051, "think_completion_length": 8.5 }, { "clip_ratio": 0.0, "completion_length": 117.94791793823242, "epoch": 3.5531197301854975, "grad_norm": 9.87205102185769, "kl": 0.658203125, "learning_rate": 7.038288288288287e-07, "loss": 0.0007, "reward": 3.4340622425079346, "reward_std": 0.11470721662044525, "rewards/final_reward": 1.6583001580940349, "rewards/mask_iou_reward": 0.8291500790470174, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4340623021125793, "rewards/thk_ans_format_reward": 1.0, "step": 1052, "think_completion_length": 8.541666666666668 }, { "clip_ratio": 0.0, "completion_length": 121.08333587646484, "epoch": 3.5564924114671164, "grad_norm": 9.62732499087711, "kl": 0.587890625, "learning_rate": 7.035472972972972e-07, "loss": 0.0006, "reward": 3.484450578689575, "reward_std": 0.11886984389275312, "rewards/final_reward": 1.9086974388312166, "rewards/mask_iou_reward": 0.9543487194156083, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4844504594802856, "rewards/thk_ans_format_reward": 1.0, "step": 1053, "think_completion_length": 6.125 }, { "clip_ratio": 0.0, "completion_length": 105.83333587646484, "epoch": 3.559865092748735, "grad_norm": 14.8438354944717, "kl": 0.775390625, "learning_rate": 7.032657657657657e-07, "loss": 0.0008, "reward": 3.4443854093551636, "reward_std": 0.2593641094863415, "rewards/final_reward": 1.3469927356049844, "rewards/mask_iou_reward": 0.6734963678024922, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4443853497505188, "rewards/thk_ans_format_reward": 1.0, "step": 1054, "think_completion_length": 5.958333333333334 }, { "clip_ratio": 0.0, "completion_length": 118.56250381469727, "epoch": 3.563237774030354, "grad_norm": 11.380540635575645, "kl": 0.6015625, "learning_rate": 7.029842342342342e-07, "loss": 0.0006, "reward": 3.0353925228118896, "reward_std": 0.15303052216768265, "rewards/final_reward": 1.38494825478689, "rewards/mask_iou_reward": 0.692474127393445, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0353924930095673, "rewards/thk_ans_format_reward": 1.0, "step": 1055, "think_completion_length": 7.583333333333334 }, { "clip_ratio": 0.0, "completion_length": 123.71875762939453, "epoch": 3.566610455311973, "grad_norm": 8.725713225667999, "kl": 0.765625, "learning_rate": 7.027027027027027e-07, "loss": 0.0008, "reward": 3.3467824459075928, "reward_std": 0.3739045560359955, "rewards/final_reward": 1.7465660805194694, "rewards/mask_iou_reward": 0.8732830402597347, "rewards/sam_format_reward": 0.9791666865348816, "rewards/sam_reward_func_ultra": 1.3780324459075928, "rewards/thk_ans_format_reward": 0.9895833432674408, "step": 1056, "think_completion_length": 6.541666666666666 }, { "clip_ratio": 0.0, "completion_length": 118.22916793823242, "epoch": 3.569983136593592, "grad_norm": 15.459697290041083, "kl": 0.62109375, "learning_rate": 7.024211711711711e-07, "loss": 0.0006, "reward": 3.3566296100616455, "reward_std": 0.14935403689742088, "rewards/final_reward": 1.384715234534162, "rewards/mask_iou_reward": 0.692357617267081, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3566296696662903, "rewards/thk_ans_format_reward": 1.0, "step": 1057, "think_completion_length": 6.708333333333333 }, { "clip_ratio": 0.0, "completion_length": 121.39583587646484, "epoch": 3.573355817875211, "grad_norm": 12.131914905442958, "kl": 0.83203125, "learning_rate": 7.021396396396396e-07, "loss": 0.0008, "reward": 3.395006537437439, "reward_std": 0.12985088303685188, "rewards/final_reward": 1.3578735883213409, "rewards/mask_iou_reward": 0.6789367941606704, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4054231643676758, "rewards/thk_ans_format_reward": 0.9895833432674408, "step": 1058, "think_completion_length": 7.0 }, { "clip_ratio": 0.0, "completion_length": 118.85417175292969, "epoch": 3.5767284991568298, "grad_norm": 6.470768374066208, "kl": 0.599609375, "learning_rate": 7.018581081081081e-07, "loss": 0.0006, "reward": 3.2740269899368286, "reward_std": 0.16072769463062286, "rewards/final_reward": 1.8859906667640165, "rewards/mask_iou_reward": 0.9429953333820082, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2740269899368286, "rewards/thk_ans_format_reward": 1.0, "step": 1059, "think_completion_length": 6.083333333333334 }, { "clip_ratio": 0.0, "completion_length": 118.96875381469727, "epoch": 3.5801011804384486, "grad_norm": 10.983155260058247, "kl": 0.580078125, "learning_rate": 7.015765765765765e-07, "loss": 0.0006, "reward": 3.5100547075271606, "reward_std": 0.11096128076314926, "rewards/final_reward": 1.2615163082914997, "rewards/mask_iou_reward": 0.6307581541457499, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5100546479225159, "rewards/thk_ans_format_reward": 1.0, "step": 1060, "think_completion_length": 5.75 }, { "clip_ratio": 0.0, "completion_length": 134.09375762939453, "epoch": 3.5834738617200674, "grad_norm": 8.844336705792847, "kl": 0.826171875, "learning_rate": 7.01295045045045e-07, "loss": 0.0008, "reward": 3.2920873165130615, "reward_std": 0.09549107030034065, "rewards/final_reward": 1.6379837260078178, "rewards/mask_iou_reward": 0.8189918630039089, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2920872569084167, "rewards/thk_ans_format_reward": 1.0, "step": 1061, "think_completion_length": 5.541666666666666 }, { "clip_ratio": 0.0, "completion_length": 100.60416793823242, "epoch": 3.5868465430016863, "grad_norm": 16.903653696816175, "kl": 0.708984375, "learning_rate": 7.010135135135134e-07, "loss": 0.0007, "reward": 3.3538187742233276, "reward_std": 0.24481885135173798, "rewards/final_reward": 1.5848143437602702, "rewards/mask_iou_reward": 0.7924071718801351, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3538187742233276, "rewards/thk_ans_format_reward": 1.0, "step": 1062, "think_completion_length": 5.458333333333333 }, { "clip_ratio": 0.0, "completion_length": 116.91666793823242, "epoch": 3.590219224283305, "grad_norm": 27.439916704869667, "kl": 0.572265625, "learning_rate": 7.007319819819819e-07, "loss": 0.0006, "reward": 3.5271008014678955, "reward_std": 0.05490931309759617, "rewards/final_reward": 1.4191140366165103, "rewards/mask_iou_reward": 0.7095570183082551, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.527100920677185, "rewards/thk_ans_format_reward": 1.0, "step": 1063, "think_completion_length": 7.458333333333334 }, { "clip_ratio": 0.0, "completion_length": 134.62500381469727, "epoch": 3.5935919055649244, "grad_norm": 13.85268564742067, "kl": 0.55078125, "learning_rate": 7.004504504504504e-07, "loss": 0.0006, "reward": 3.6005152463912964, "reward_std": 0.12097604386508465, "rewards/final_reward": 1.4686288065436273, "rewards/mask_iou_reward": 0.7343144032718136, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6005152463912964, "rewards/thk_ans_format_reward": 1.0, "step": 1064, "think_completion_length": 6.041666666666666 }, { "clip_ratio": 0.0, "completion_length": 116.55208587646484, "epoch": 3.5969645868465427, "grad_norm": 19.073402183690888, "kl": 0.677734375, "learning_rate": 7.001689189189189e-07, "loss": 0.0007, "reward": 2.8053646087646484, "reward_std": 0.062431491911411285, "rewards/final_reward": 0.9842462252401399, "rewards/mask_iou_reward": 0.49212311262006997, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 0.8157810866832733, "rewards/thk_ans_format_reward": 1.0, "step": 1065, "think_completion_length": 5.5 }, { "clip_ratio": 0.0, "completion_length": 122.34375381469727, "epoch": 3.600337268128162, "grad_norm": 12.129261872140868, "kl": 0.609375, "learning_rate": 6.998873873873874e-07, "loss": 0.0006, "reward": 3.1443766355514526, "reward_std": 0.04327939311042428, "rewards/final_reward": 0.2489097145314306, "rewards/mask_iou_reward": 0.1244548572657153, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.144376516342163, "rewards/thk_ans_format_reward": 1.0, "step": 1066, "think_completion_length": 5.916666666666666 }, { "clip_ratio": 0.0, "completion_length": 118.65625381469727, "epoch": 3.603709949409781, "grad_norm": 90.03169091556349, "kl": 0.62890625, "learning_rate": 6.996058558558559e-07, "loss": 0.0006, "reward": 3.5203282833099365, "reward_std": 0.09074808657169342, "rewards/final_reward": 1.6439105325893757, "rewards/mask_iou_reward": 0.8219552662946878, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5203283429145813, "rewards/thk_ans_format_reward": 1.0, "step": 1067, "think_completion_length": 6.875 }, { "clip_ratio": 0.0, "completion_length": 108.76041793823242, "epoch": 3.6070826306913997, "grad_norm": 9.523252458140334, "kl": 0.67578125, "learning_rate": 6.993243243243243e-07, "loss": 0.0007, "reward": 3.134290933609009, "reward_std": 0.1437622308731079, "rewards/final_reward": 1.7129957587646047, "rewards/mask_iou_reward": 0.8564978793823024, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 1.144707441329956, "rewards/thk_ans_format_reward": 1.0, "step": 1068, "think_completion_length": 6.416666666666666 }, { "clip_ratio": 0.0, "completion_length": 131.59375381469727, "epoch": 3.6104553119730185, "grad_norm": 34.119749728566035, "kl": 0.59765625, "learning_rate": 6.990427927927928e-07, "loss": 0.0006, "reward": 3.7921831607818604, "reward_std": 0.05658973567187786, "rewards/final_reward": 1.8885518493785414, "rewards/mask_iou_reward": 0.9442759246892707, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7921829223632812, "rewards/thk_ans_format_reward": 1.0, "step": 1069, "think_completion_length": 5.75 }, { "clip_ratio": 0.0, "completion_length": 116.58333587646484, "epoch": 3.6138279932546373, "grad_norm": 16.675311982963155, "kl": 0.587890625, "learning_rate": 6.987612612612612e-07, "loss": 0.0006, "reward": 3.5710495710372925, "reward_std": 0.14011128805577755, "rewards/final_reward": 1.092474775438216, "rewards/mask_iou_reward": 0.546237387719108, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.571049690246582, "rewards/thk_ans_format_reward": 1.0, "step": 1070, "think_completion_length": 7.125 }, { "clip_ratio": 0.0, "completion_length": 116.54167175292969, "epoch": 3.6172006745362566, "grad_norm": 28.795790674378512, "kl": 0.677734375, "learning_rate": 6.984797297297297e-07, "loss": 0.0007, "reward": 3.082141876220703, "reward_std": 0.22064311429858208, "rewards/final_reward": 0.9153156300068633, "rewards/mask_iou_reward": 0.45765781500343167, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0821418762207031, "rewards/thk_ans_format_reward": 1.0, "step": 1071, "think_completion_length": 6.583333333333334 }, { "clip_ratio": 0.0, "completion_length": 165.59375, "epoch": 3.620573355817875, "grad_norm": 11.444374570788154, "kl": 0.810546875, "learning_rate": 6.981981981981982e-07, "loss": 0.0008, "reward": 3.4599640369415283, "reward_std": 0.1495438888669014, "rewards/final_reward": 1.1625894442026803, "rewards/mask_iou_reward": 0.5812947221013401, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4599639773368835, "rewards/thk_ans_format_reward": 1.0, "step": 1072, "think_completion_length": 6.75 }, { "clip_ratio": 0.0, "completion_length": 118.16666793823242, "epoch": 3.6239460370994943, "grad_norm": 12.101685454634838, "kl": 0.658203125, "learning_rate": 6.979166666666666e-07, "loss": 0.0006, "reward": 3.401353597640991, "reward_std": 0.10515595600008965, "rewards/final_reward": 0.9484960177048483, "rewards/mask_iou_reward": 0.47424800885242413, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4013535976409912, "rewards/thk_ans_format_reward": 1.0, "step": 1073, "think_completion_length": 5.791666666666666 }, { "clip_ratio": 0.0, "completion_length": 118.63542175292969, "epoch": 3.627318718381113, "grad_norm": 14.46073592297596, "kl": 0.82421875, "learning_rate": 6.976351351351351e-07, "loss": 0.0008, "reward": 3.64565372467041, "reward_std": 0.12325317412614822, "rewards/final_reward": 1.910804574900559, "rewards/mask_iou_reward": 0.9554022874502796, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6456536650657654, "rewards/thk_ans_format_reward": 1.0, "step": 1074, "think_completion_length": 6.333333333333334 }, { "clip_ratio": 0.0, "completion_length": 149.06250381469727, "epoch": 3.630691399662732, "grad_norm": 80.55259372670554, "kl": 0.7421875, "learning_rate": 6.973536036036036e-07, "loss": 0.0008, "reward": 3.5288643836975098, "reward_std": 0.2130519635975361, "rewards/final_reward": 1.7231965755366625, "rewards/mask_iou_reward": 0.8615982877683313, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 1.549697756767273, "rewards/thk_ans_format_reward": 0.9895833432674408, "step": 1075, "think_completion_length": 5.333333333333334 }, { "clip_ratio": 0.0, "completion_length": 105.72916793823242, "epoch": 3.6340640809443507, "grad_norm": 6.664651113291088, "kl": 0.76953125, "learning_rate": 6.970720720720721e-07, "loss": 0.0008, "reward": 3.5804613828659058, "reward_std": 0.07663201168179512, "rewards/final_reward": 1.1199998042376362, "rewards/mask_iou_reward": 0.5599999021188181, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5804613828659058, "rewards/thk_ans_format_reward": 1.0, "step": 1076, "think_completion_length": 6.625 }, { "clip_ratio": 0.0, "completion_length": 117.85417175292969, "epoch": 3.6374367622259696, "grad_norm": 7.411611992399397, "kl": 0.611328125, "learning_rate": 6.967905405405406e-07, "loss": 0.0006, "reward": 3.0753601789474487, "reward_std": 0.22175676375627518, "rewards/final_reward": 1.190157277543239, "rewards/mask_iou_reward": 0.5950786387716195, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0753599405288696, "rewards/thk_ans_format_reward": 1.0, "step": 1077, "think_completion_length": 5.375 }, { "clip_ratio": 0.0, "completion_length": 171.4479217529297, "epoch": 3.6408094435075884, "grad_norm": 16.851340992880754, "kl": 0.826171875, "learning_rate": 6.96509009009009e-07, "loss": 0.0009, "reward": 3.230084538459778, "reward_std": 0.26128628849983215, "rewards/final_reward": 0.0064829542036839855, "rewards/mask_iou_reward": 0.0032414771018419927, "rewards/sam_format_reward": 0.9791666865348816, "rewards/sam_reward_func_ultra": 1.2613343596458435, "rewards/thk_ans_format_reward": 0.9895833432674408, "step": 1078, "think_completion_length": 6.333333333333333 }, { "clip_ratio": 0.0, "completion_length": 127.06250762939453, "epoch": 3.6441821247892072, "grad_norm": 24.82890569936366, "kl": 0.671875, "learning_rate": 6.962274774774775e-07, "loss": 0.0007, "reward": 3.4247443675994873, "reward_std": 0.21899319719523191, "rewards/final_reward": 1.5726510366981412, "rewards/mask_iou_reward": 0.7863255183490706, "rewards/sam_format_reward": 0.9791666865348816, "rewards/sam_reward_func_ultra": 1.455994427204132, "rewards/thk_ans_format_reward": 0.9895833432674408, "step": 1079, "think_completion_length": 6.125 }, { "clip_ratio": 0.0, "completion_length": 117.11458587646484, "epoch": 3.6475548060708265, "grad_norm": 11.50113447748046, "kl": 0.607421875, "learning_rate": 6.95945945945946e-07, "loss": 0.0006, "reward": 3.3249796628952026, "reward_std": 0.15425102412700653, "rewards/final_reward": 1.3915881672936048, "rewards/mask_iou_reward": 0.6957940836468024, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.324979841709137, "rewards/thk_ans_format_reward": 1.0, "step": 1080, "think_completion_length": 5.458333333333334 }, { "clip_ratio": 0.0, "completion_length": 117.95833587646484, "epoch": 3.6509274873524453, "grad_norm": 9.449392867624718, "kl": 0.625, "learning_rate": 6.956644144144144e-07, "loss": 0.0006, "reward": 3.43624210357666, "reward_std": 0.01355983130633831, "rewards/final_reward": 0.9758531922051863, "rewards/mask_iou_reward": 0.48792659610259315, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4362419247627258, "rewards/thk_ans_format_reward": 1.0, "step": 1081, "think_completion_length": 7.125 }, { "clip_ratio": 0.0, "completion_length": 125.88541793823242, "epoch": 3.654300168634064, "grad_norm": 9.488940964063072, "kl": 0.666015625, "learning_rate": 6.953828828828829e-07, "loss": 0.0007, "reward": 3.3464990854263306, "reward_std": 0.30547909438610077, "rewards/final_reward": 1.5841524038260197, "rewards/mask_iou_reward": 0.7920762019130099, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3464989066123962, "rewards/thk_ans_format_reward": 1.0, "step": 1082, "think_completion_length": 6.916666666666666 }, { "clip_ratio": 0.0, "completion_length": 116.42708587646484, "epoch": 3.657672849915683, "grad_norm": 12.788530897249785, "kl": 0.59375, "learning_rate": 6.951013513513513e-07, "loss": 0.0006, "reward": 3.588488817214966, "reward_std": 0.07584836706519127, "rewards/final_reward": 1.5972509743272374, "rewards/mask_iou_reward": 0.7986254871636187, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5884888768196106, "rewards/thk_ans_format_reward": 1.0, "step": 1083, "think_completion_length": 6.541666666666666 }, { "clip_ratio": 0.0, "completion_length": 117.10416793823242, "epoch": 3.661045531197302, "grad_norm": 14.276790072775468, "kl": 0.591796875, "learning_rate": 6.948198198198198e-07, "loss": 0.0006, "reward": 3.343292236328125, "reward_std": 0.1573808193206787, "rewards/final_reward": 0.9708995355436782, "rewards/mask_iou_reward": 0.4854497677718391, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3432920575141907, "rewards/thk_ans_format_reward": 1.0, "step": 1084, "think_completion_length": 6.375 }, { "clip_ratio": 0.0, "completion_length": 120.77083587646484, "epoch": 3.6644182124789206, "grad_norm": 13.768661675669374, "kl": 0.70703125, "learning_rate": 6.945382882882884e-07, "loss": 0.0007, "reward": 3.697332262992859, "reward_std": 0.03931210841983557, "rewards/final_reward": 1.948831729093463, "rewards/mask_iou_reward": 0.9744158645467315, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6973322629928589, "rewards/thk_ans_format_reward": 1.0, "step": 1085, "think_completion_length": 5.541666666666666 }, { "clip_ratio": 0.0, "completion_length": 116.03125381469727, "epoch": 3.6677908937605395, "grad_norm": 19.0330526800938, "kl": 0.6484375, "learning_rate": 6.942567567567568e-07, "loss": 0.0006, "reward": 3.194105863571167, "reward_std": 0.17677438259124756, "rewards/final_reward": 1.039079084824662, "rewards/mask_iou_reward": 0.519539542412331, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1941059827804565, "rewards/thk_ans_format_reward": 1.0, "step": 1086, "think_completion_length": 5.875 }, { "clip_ratio": 0.0, "completion_length": 118.07291793823242, "epoch": 3.6711635750421587, "grad_norm": 15.123282614981838, "kl": 0.625, "learning_rate": 6.939752252252252e-07, "loss": 0.0006, "reward": 3.3549336194992065, "reward_std": 0.0925322026014328, "rewards/final_reward": 1.3952954506377155, "rewards/mask_iou_reward": 0.6976477253188578, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.354933738708496, "rewards/thk_ans_format_reward": 1.0, "step": 1087, "think_completion_length": 5.916666666666666 }, { "clip_ratio": 0.0, "completion_length": 167.98959350585938, "epoch": 3.6745362563237776, "grad_norm": 24.47213890889503, "kl": 0.55859375, "learning_rate": 6.936936936936936e-07, "loss": 0.0006, "reward": 3.6780649423599243, "reward_std": 0.15190696716308594, "rewards/final_reward": 1.8513842679316488, "rewards/mask_iou_reward": 0.9256921339658244, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6884815692901611, "rewards/thk_ans_format_reward": 0.9895833432674408, "step": 1088, "think_completion_length": 6.666666666666667 }, { "clip_ratio": 0.0, "completion_length": 136.59375762939453, "epoch": 3.6779089376053964, "grad_norm": 7.552053296826617, "kl": 0.595703125, "learning_rate": 6.934121621621621e-07, "loss": 0.0006, "reward": 3.3988085985183716, "reward_std": 0.21870959550142288, "rewards/final_reward": 0.8550849830494813, "rewards/mask_iou_reward": 0.42754249152474066, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3988087177276611, "rewards/thk_ans_format_reward": 1.0, "step": 1089, "think_completion_length": 6.25 }, { "clip_ratio": 0.0, "completion_length": 122.62500381469727, "epoch": 3.681281618887015, "grad_norm": 36.284952682642654, "kl": 0.59765625, "learning_rate": 6.931306306306306e-07, "loss": 0.0006, "reward": 3.5274451971054077, "reward_std": 0.1046636514365673, "rewards/final_reward": 1.7440145405320384, "rewards/mask_iou_reward": 0.8720072702660192, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.527445137500763, "rewards/thk_ans_format_reward": 1.0, "step": 1090, "think_completion_length": 5.916666666666666 }, { "clip_ratio": 0.0, "completion_length": 118.83333587646484, "epoch": 3.684654300168634, "grad_norm": 10.87900442779485, "kl": 0.6484375, "learning_rate": 6.92849099099099e-07, "loss": 0.0006, "reward": 3.2674392461776733, "reward_std": 0.2030637189745903, "rewards/final_reward": 1.2155159261571356, "rewards/mask_iou_reward": 0.6077579630785678, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.267439216375351, "rewards/thk_ans_format_reward": 1.0, "step": 1091, "think_completion_length": 6.666666666666666 }, { "clip_ratio": 0.0, "completion_length": 104.72916793823242, "epoch": 3.688026981450253, "grad_norm": 11.530580162530784, "kl": 0.712890625, "learning_rate": 6.925675675675675e-07, "loss": 0.0007, "reward": 3.406046509742737, "reward_std": 0.15012041572481394, "rewards/final_reward": 1.4677173188719124, "rewards/mask_iou_reward": 0.7338586594359562, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4060462713241577, "rewards/thk_ans_format_reward": 1.0, "step": 1092, "think_completion_length": 7.083333333333333 }, { "clip_ratio": 0.0, "completion_length": 118.23958587646484, "epoch": 3.6913996627318717, "grad_norm": 8.836161855161931, "kl": 0.62890625, "learning_rate": 6.922860360360359e-07, "loss": 0.0006, "reward": 3.214850425720215, "reward_std": 0.09776772558689117, "rewards/final_reward": 1.3247986783728343, "rewards/mask_iou_reward": 0.6623993391864171, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2148504257202148, "rewards/thk_ans_format_reward": 1.0, "step": 1093, "think_completion_length": 6.666666666666666 }, { "clip_ratio": 0.0, "completion_length": 160.40625, "epoch": 3.694772344013491, "grad_norm": 5.76477120156763, "kl": 0.748046875, "learning_rate": 6.920045045045044e-07, "loss": 0.0008, "reward": 3.4666571617126465, "reward_std": 0.2039150409400463, "rewards/final_reward": 1.340760683680751, "rewards/mask_iou_reward": 0.6703803418403755, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 1.4874904155731201, "rewards/thk_ans_format_reward": 0.9895833432674408, "step": 1094, "think_completion_length": 8.916666666666666 }, { "clip_ratio": 0.0, "completion_length": 143.8229217529297, "epoch": 3.6981450252951094, "grad_norm": 9.585543183381706, "kl": 0.587890625, "learning_rate": 6.91722972972973e-07, "loss": 0.0006, "reward": 3.43148934841156, "reward_std": 0.09372329898178577, "rewards/final_reward": 0.9576623108718747, "rewards/mask_iou_reward": 0.47883115543593735, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4314891397953033, "rewards/thk_ans_format_reward": 1.0, "step": 1095, "think_completion_length": 6.208333333333334 }, { "clip_ratio": 0.0, "completion_length": 118.21875, "epoch": 3.7015177065767286, "grad_norm": 10.038236194020055, "kl": 0.791015625, "learning_rate": 6.914414414414414e-07, "loss": 0.0008, "reward": 3.4386956691741943, "reward_std": 0.08693969808518887, "rewards/final_reward": 1.8579690886791975, "rewards/mask_iou_reward": 0.9289845443395988, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4386956095695496, "rewards/thk_ans_format_reward": 1.0, "step": 1096, "think_completion_length": 7.125 }, { "clip_ratio": 0.0, "completion_length": 142.84375762939453, "epoch": 3.7048903878583475, "grad_norm": 109.9158361563393, "kl": 0.537109375, "learning_rate": 6.911599099099099e-07, "loss": 0.0005, "reward": 3.5106626749038696, "reward_std": 0.051349299028515816, "rewards/final_reward": 1.7499927058187152, "rewards/mask_iou_reward": 0.8749963529093576, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5106624960899353, "rewards/thk_ans_format_reward": 1.0, "step": 1097, "think_completion_length": 6.791666666666667 }, { "clip_ratio": 0.0, "completion_length": 146.02083587646484, "epoch": 3.7082630691399663, "grad_norm": 10.121253017002285, "kl": 0.66796875, "learning_rate": 6.908783783783783e-07, "loss": 0.0007, "reward": 3.3785040378570557, "reward_std": 0.2980985939502716, "rewards/final_reward": 1.6061714274948828, "rewards/mask_iou_reward": 0.8030857137474414, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 1.399337112903595, "rewards/thk_ans_format_reward": 0.9895833432674408, "step": 1098, "think_completion_length": 7.666666666666666 }, { "clip_ratio": 0.0, "completion_length": 127.52083587646484, "epoch": 3.711635750421585, "grad_norm": 9.155132777155144, "kl": 0.587890625, "learning_rate": 6.905968468468468e-07, "loss": 0.0006, "reward": 3.364309310913086, "reward_std": 0.1669931337237358, "rewards/final_reward": 1.3568525625508892, "rewards/mask_iou_reward": 0.6784262812754446, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3643090724945068, "rewards/thk_ans_format_reward": 1.0, "step": 1099, "think_completion_length": 8.0 }, { "clip_ratio": 0.0, "completion_length": 122.75000381469727, "epoch": 3.715008431703204, "grad_norm": 9.480165526801581, "kl": 0.59375, "learning_rate": 6.903153153153153e-07, "loss": 0.0006, "reward": 3.4969812631607056, "reward_std": 0.16354385018348694, "rewards/final_reward": 1.4303749695856478, "rewards/mask_iou_reward": 0.7151874847928239, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.496981143951416, "rewards/thk_ans_format_reward": 1.0, "step": 1100, "think_completion_length": 6.625 }, { "clip_ratio": 0.0, "completion_length": 100.62500381469727, "epoch": 3.718381112984823, "grad_norm": 19.621153860163872, "kl": 0.849609375, "learning_rate": 6.900337837837837e-07, "loss": 0.0009, "reward": 3.537114977836609, "reward_std": 0.07870265282690525, "rewards/final_reward": 1.685322757336444, "rewards/mask_iou_reward": 0.842661378668222, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5371148586273193, "rewards/thk_ans_format_reward": 1.0, "step": 1101, "think_completion_length": 7.458333333333334 }, { "clip_ratio": 0.0, "completion_length": 104.55208587646484, "epoch": 3.7217537942664416, "grad_norm": 10.553586034745063, "kl": 0.7578125, "learning_rate": 6.897522522522522e-07, "loss": 0.0008, "reward": 3.653933048248291, "reward_std": 0.10681610554456711, "rewards/final_reward": 1.5648624304704355, "rewards/mask_iou_reward": 0.7824312152352177, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6539331674575806, "rewards/thk_ans_format_reward": 1.0, "step": 1102, "think_completion_length": 6.583333333333333 }, { "clip_ratio": 0.0, "completion_length": 142.91666793823242, "epoch": 3.725126475548061, "grad_norm": 7.397855296650245, "kl": 0.5859375, "learning_rate": 6.894707207207207e-07, "loss": 0.0006, "reward": 3.233435034751892, "reward_std": 0.25034917145967484, "rewards/final_reward": 0.9141035635760388, "rewards/mask_iou_reward": 0.4570517817880194, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2334350645542145, "rewards/thk_ans_format_reward": 1.0, "step": 1103, "think_completion_length": 6.583333333333334 }, { "clip_ratio": 0.0, "completion_length": 177.81250762939453, "epoch": 3.7284991568296797, "grad_norm": 37.53188467605128, "kl": 0.5380859375, "learning_rate": 6.891891891891891e-07, "loss": 0.0005, "reward": 3.3896223306655884, "reward_std": 0.2384193167090416, "rewards/final_reward": 1.4327673652494468, "rewards/mask_iou_reward": 0.7163836826247234, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 1.4104554653167725, "rewards/thk_ans_format_reward": 0.9895833432674408, "step": 1104, "think_completion_length": 7.25 }, { "clip_ratio": 0.0, "completion_length": 131.95833587646484, "epoch": 3.7318718381112985, "grad_norm": 9.254473164975776, "kl": 0.556640625, "learning_rate": 6.889076576576577e-07, "loss": 0.0006, "reward": 3.3896210193634033, "reward_std": 0.13112148270010948, "rewards/final_reward": 1.5128234298816556, "rewards/mask_iou_reward": 0.7564117149408278, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3896209001541138, "rewards/thk_ans_format_reward": 1.0, "step": 1105, "think_completion_length": 7.916666666666666 }, { "clip_ratio": 0.0, "completion_length": 119.89583587646484, "epoch": 3.7352445193929174, "grad_norm": 23.84978331881755, "kl": 0.56640625, "learning_rate": 6.886261261261261e-07, "loss": 0.0006, "reward": 3.1603639125823975, "reward_std": 0.07676676660776138, "rewards/final_reward": 0.890607477137064, "rewards/mask_iou_reward": 0.445303738568532, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.160364031791687, "rewards/thk_ans_format_reward": 1.0, "step": 1106, "think_completion_length": 6.541666666666667 }, { "clip_ratio": 0.0, "completion_length": 129.4791717529297, "epoch": 3.738617200674536, "grad_norm": 46.65639431978475, "kl": 0.5625, "learning_rate": 6.883445945945946e-07, "loss": 0.0005, "reward": 3.095438241958618, "reward_std": 0.2738206684589386, "rewards/final_reward": 0.7180541243761372, "rewards/mask_iou_reward": 0.3590270621880686, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.095438003540039, "rewards/thk_ans_format_reward": 1.0, "step": 1107, "think_completion_length": 8.833333333333332 }, { "clip_ratio": 0.0, "completion_length": 124.32292175292969, "epoch": 3.741989881956155, "grad_norm": 31.345126429693313, "kl": 1.119140625, "learning_rate": 6.880630630630631e-07, "loss": 0.0011, "reward": 3.4316246509552, "reward_std": 0.08794242702424526, "rewards/final_reward": 1.6411358926009716, "rewards/mask_iou_reward": 0.8205679463004858, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4316245317459106, "rewards/thk_ans_format_reward": 1.0, "step": 1108, "think_completion_length": 7.416666666666666 }, { "clip_ratio": 0.0, "completion_length": 128.71875, "epoch": 3.745362563237774, "grad_norm": 12.541312240734433, "kl": 0.62109375, "learning_rate": 6.877815315315315e-07, "loss": 0.0006, "reward": 3.412677526473999, "reward_std": 0.2318500354886055, "rewards/final_reward": 1.5392361667886947, "rewards/mask_iou_reward": 0.7696180833943473, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.412677526473999, "rewards/thk_ans_format_reward": 1.0, "step": 1109, "think_completion_length": 8.916666666666666 }, { "clip_ratio": 0.0, "completion_length": 149.8541717529297, "epoch": 3.748735244519393, "grad_norm": 15.281198773897037, "kl": 0.8828125, "learning_rate": 6.875e-07, "loss": 0.0009, "reward": 3.305709719657898, "reward_std": 0.31281551718711853, "rewards/final_reward": 1.7111558294875617, "rewards/mask_iou_reward": 0.8555779147437809, "rewards/sam_format_reward": 0.96875, "rewards/sam_reward_func_ultra": 1.3682093620300293, "rewards/thk_ans_format_reward": 0.96875, "step": 1110, "think_completion_length": 7.708333333333334 }, { "clip_ratio": 0.0, "completion_length": 171.1354217529297, "epoch": 3.752107925801012, "grad_norm": 18.422191479114865, "kl": 0.423828125, "learning_rate": 6.872184684684684e-07, "loss": 0.0004, "reward": 3.6392204761505127, "reward_std": 0.04167993552982807, "rewards/final_reward": 1.9108645855743962, "rewards/mask_iou_reward": 0.9554322927871981, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.639220416545868, "rewards/thk_ans_format_reward": 1.0, "step": 1111, "think_completion_length": 6.875 }, { "clip_ratio": 0.0, "completion_length": 144.2291717529297, "epoch": 3.7554806070826308, "grad_norm": 12.32647367095304, "kl": 0.6015625, "learning_rate": 6.869369369369369e-07, "loss": 0.0006, "reward": 3.3169971704483032, "reward_std": 0.07655757665634155, "rewards/final_reward": 1.1560575467479777, "rewards/mask_iou_reward": 0.5780287733739888, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.316997230052948, "rewards/thk_ans_format_reward": 1.0, "step": 1112, "think_completion_length": 7.791666666666666 }, { "clip_ratio": 0.0, "completion_length": 121.50000762939453, "epoch": 3.7588532883642496, "grad_norm": 16.190413962214407, "kl": 0.5869140625, "learning_rate": 6.866554054054054e-07, "loss": 0.0006, "reward": 3.449712872505188, "reward_std": 0.16217003017663956, "rewards/final_reward": 1.5921062061509579, "rewards/mask_iou_reward": 0.7960531030754789, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4497130513191223, "rewards/thk_ans_format_reward": 1.0, "step": 1113, "think_completion_length": 8.208333333333332 }, { "clip_ratio": 0.0, "completion_length": 127.25000762939453, "epoch": 3.7622259696458684, "grad_norm": 22.800503624529753, "kl": 0.560546875, "learning_rate": 6.863738738738738e-07, "loss": 0.0005, "reward": 3.677310585975647, "reward_std": 0.05152285099029541, "rewards/final_reward": 1.4880345993776913, "rewards/mask_iou_reward": 0.7440172996888457, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6773105263710022, "rewards/thk_ans_format_reward": 1.0, "step": 1114, "think_completion_length": 8.0 }, { "clip_ratio": 0.0, "completion_length": 126.625, "epoch": 3.7655986509274872, "grad_norm": 19.32782592827037, "kl": 0.658203125, "learning_rate": 6.860923423423423e-07, "loss": 0.0007, "reward": 3.0054088830947876, "reward_std": 0.04576574079692364, "rewards/final_reward": 0.54155883498149, "rewards/mask_iou_reward": 0.270779417490745, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0054087042808533, "rewards/thk_ans_format_reward": 1.0, "step": 1115, "think_completion_length": 7.458333333333334 }, { "clip_ratio": 0.0, "completion_length": 141.9479217529297, "epoch": 3.768971332209106, "grad_norm": 8.843701911133191, "kl": 0.5390625, "learning_rate": 6.858108108108109e-07, "loss": 0.0005, "reward": 3.573105573654175, "reward_std": 0.1294175200164318, "rewards/final_reward": 1.6244050962650063, "rewards/mask_iou_reward": 0.8122025481325031, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5731057524681091, "rewards/thk_ans_format_reward": 1.0, "step": 1116, "think_completion_length": 7.041666666666666 }, { "clip_ratio": 0.0, "completion_length": 148.25000762939453, "epoch": 3.7723440134907253, "grad_norm": 83.32854232345196, "kl": 0.533203125, "learning_rate": 6.855292792792793e-07, "loss": 0.0005, "reward": 3.2631919384002686, "reward_std": 0.10688711702823639, "rewards/final_reward": 1.4339104671690461, "rewards/mask_iou_reward": 0.7169552335845231, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2631917893886566, "rewards/thk_ans_format_reward": 1.0, "step": 1117, "think_completion_length": 7.0 }, { "clip_ratio": 0.0, "completion_length": 122.54167175292969, "epoch": 3.775716694772344, "grad_norm": 9.126275736652579, "kl": 0.59375, "learning_rate": 6.852477477477478e-07, "loss": 0.0006, "reward": 3.5523250102996826, "reward_std": 0.1462704762816429, "rewards/final_reward": 1.4037499565724185, "rewards/mask_iou_reward": 0.7018749782862093, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5523249506950378, "rewards/thk_ans_format_reward": 1.0, "step": 1118, "think_completion_length": 7.5 }, { "clip_ratio": 0.0, "completion_length": 128.27083587646484, "epoch": 3.779089376053963, "grad_norm": 9.693226489065822, "kl": 0.5087890625, "learning_rate": 6.849662162162162e-07, "loss": 0.0005, "reward": 3.363922119140625, "reward_std": 0.0753621906042099, "rewards/final_reward": 1.296634945318167, "rewards/mask_iou_reward": 0.6483174726590835, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3639219999313354, "rewards/thk_ans_format_reward": 1.0, "step": 1119, "think_completion_length": 8.666666666666666 }, { "clip_ratio": 0.0, "completion_length": 154.0104217529297, "epoch": 3.782462057335582, "grad_norm": 15.448804342717906, "kl": 0.56640625, "learning_rate": 6.846846846846847e-07, "loss": 0.0006, "reward": 3.552070379257202, "reward_std": 0.3433392718434334, "rewards/final_reward": 1.423108607564154, "rewards/mask_iou_reward": 0.711554303782077, "rewards/sam_format_reward": 0.9791666865348816, "rewards/sam_reward_func_ultra": 1.593737006187439, "rewards/thk_ans_format_reward": 0.9791666865348816, "step": 1120, "think_completion_length": 8.666666666666666 }, { "clip_ratio": 0.0, "completion_length": 205.65625762939453, "epoch": 3.7858347386172007, "grad_norm": 13.346373731247276, "kl": 0.453125, "learning_rate": 6.844031531531532e-07, "loss": 0.0004, "reward": 3.4322853088378906, "reward_std": 0.5301657021045685, "rewards/final_reward": 1.3762657362921482, "rewards/mask_iou_reward": 0.6881328681460741, "rewards/sam_format_reward": 0.9375000298023224, "rewards/sam_reward_func_ultra": 1.5572853684425354, "rewards/thk_ans_format_reward": 0.9375000298023224, "step": 1121, "think_completion_length": 6.958333333333334 }, { "clip_ratio": 0.0, "completion_length": 136.53125381469727, "epoch": 3.7892074198988195, "grad_norm": 31.74734560119016, "kl": 0.4892578125, "learning_rate": 6.841216216216216e-07, "loss": 0.0005, "reward": 3.7686702013015747, "reward_std": 0.025162406265735626, "rewards/final_reward": 1.9371686420295529, "rewards/mask_iou_reward": 0.9685843210147764, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7686700820922852, "rewards/thk_ans_format_reward": 1.0, "step": 1122, "think_completion_length": 7.166666666666667 }, { "clip_ratio": 0.0, "completion_length": 229.70833587646484, "epoch": 3.7925801011804383, "grad_norm": 22.51580592703938, "kl": 0.548828125, "learning_rate": 6.838400900900901e-07, "loss": 0.0006, "reward": 2.8940176963806152, "reward_std": 0.4165959060192108, "rewards/final_reward": 1.1415240073547133, "rewards/mask_iou_reward": 0.5707620036773566, "rewards/sam_format_reward": 0.9479166865348816, "rewards/sam_reward_func_ultra": 0.9981842637062073, "rewards/thk_ans_format_reward": 0.9479166865348816, "step": 1123, "think_completion_length": 7.416666666666666 }, { "clip_ratio": 0.0, "completion_length": 153.9479217529297, "epoch": 3.7959527824620576, "grad_norm": 14.567123366233686, "kl": 0.5703125, "learning_rate": 6.835585585585585e-07, "loss": 0.0006, "reward": 3.60901141166687, "reward_std": 0.22202441096305847, "rewards/final_reward": 1.7287610565966587, "rewards/mask_iou_reward": 0.8643805282983293, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 1.6298444271087646, "rewards/thk_ans_format_reward": 0.9895833432674408, "step": 1124, "think_completion_length": 7.458333333333333 }, { "clip_ratio": 0.0, "completion_length": 121.93750381469727, "epoch": 3.799325463743676, "grad_norm": 13.533824935426894, "kl": 0.75390625, "learning_rate": 6.832770270270269e-07, "loss": 0.0008, "reward": 3.3092023134231567, "reward_std": 0.11465185135602951, "rewards/final_reward": 1.5839463106212694, "rewards/mask_iou_reward": 0.7919731553106347, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3092021346092224, "rewards/thk_ans_format_reward": 1.0, "step": 1125, "think_completion_length": 10.083333333333332 }, { "clip_ratio": 0.0, "completion_length": 147.09375381469727, "epoch": 3.8026981450252952, "grad_norm": 6.412786524388538, "kl": 0.59765625, "learning_rate": 6.829954954954955e-07, "loss": 0.0006, "reward": 3.368514060974121, "reward_std": 0.20392443984746933, "rewards/final_reward": 1.5844456702050551, "rewards/mask_iou_reward": 0.7922228351025276, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 1.3893473744392395, "rewards/thk_ans_format_reward": 0.9895833432674408, "step": 1126, "think_completion_length": 7.541666666666666 }, { "clip_ratio": 0.0, "completion_length": 120.47916793823242, "epoch": 3.806070826306914, "grad_norm": 13.208236486503731, "kl": 0.71484375, "learning_rate": 6.827139639639639e-07, "loss": 0.0007, "reward": 3.238366961479187, "reward_std": 0.18330181390047073, "rewards/final_reward": 0.8698805224904762, "rewards/mask_iou_reward": 0.4349402612452381, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.238366961479187, "rewards/thk_ans_format_reward": 1.0, "step": 1127, "think_completion_length": 9.375 }, { "clip_ratio": 0.0, "completion_length": 128.86458587646484, "epoch": 3.809443507588533, "grad_norm": 137.55010232326637, "kl": 0.642578125, "learning_rate": 6.824324324324324e-07, "loss": 0.0006, "reward": 3.424420475959778, "reward_std": 0.15352077782154083, "rewards/final_reward": 1.3288265654138773, "rewards/mask_iou_reward": 0.6644132827069387, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4244202971458435, "rewards/thk_ans_format_reward": 1.0, "step": 1128, "think_completion_length": 9.541666666666666 }, { "clip_ratio": 0.0, "completion_length": 139.46875381469727, "epoch": 3.8128161888701517, "grad_norm": 16.05562555438284, "kl": 0.544921875, "learning_rate": 6.821509009009008e-07, "loss": 0.0005, "reward": 3.5405032634735107, "reward_std": 0.09723260626196861, "rewards/final_reward": 1.4643371725248082, "rewards/mask_iou_reward": 0.7321685862624041, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.540503203868866, "rewards/thk_ans_format_reward": 1.0, "step": 1129, "think_completion_length": 8.458333333333334 }, { "clip_ratio": 0.0, "completion_length": 121.59375381469727, "epoch": 3.8161888701517706, "grad_norm": 17.866431008873636, "kl": 0.59375, "learning_rate": 6.818693693693693e-07, "loss": 0.0006, "reward": 3.3343998193740845, "reward_std": 0.20225829631090164, "rewards/final_reward": 1.8135839218908014, "rewards/mask_iou_reward": 0.9067919609454007, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3343996405601501, "rewards/thk_ans_format_reward": 1.0, "step": 1130, "think_completion_length": 8.833333333333332 }, { "clip_ratio": 0.0, "completion_length": 118.55208587646484, "epoch": 3.8195615514333894, "grad_norm": 12.75502597179819, "kl": 0.599609375, "learning_rate": 6.815878378378378e-07, "loss": 0.0006, "reward": 3.1991249322891235, "reward_std": 0.11675117909908295, "rewards/final_reward": 1.029759909468699, "rewards/mask_iou_reward": 0.5148799547343496, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.199124813079834, "rewards/thk_ans_format_reward": 1.0, "step": 1131, "think_completion_length": 8.958333333333334 }, { "clip_ratio": 0.0, "completion_length": 119.95833587646484, "epoch": 3.822934232715008, "grad_norm": 25.249214949208973, "kl": 0.60546875, "learning_rate": 6.813063063063062e-07, "loss": 0.0006, "reward": 3.5785523653030396, "reward_std": 0.03312146570533514, "rewards/final_reward": 1.8504287888354827, "rewards/mask_iou_reward": 0.9252143944177413, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5785521268844604, "rewards/thk_ans_format_reward": 1.0, "step": 1132, "think_completion_length": 8.25 }, { "clip_ratio": 0.0, "completion_length": 127.6875, "epoch": 3.8263069139966275, "grad_norm": 10.199327918569953, "kl": 0.80078125, "learning_rate": 6.810247747747747e-07, "loss": 0.0008, "reward": 3.3085036277770996, "reward_std": 0.22457706183195114, "rewards/final_reward": 1.6140251674854684, "rewards/mask_iou_reward": 0.8070125837427342, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 1.3293370604515076, "rewards/thk_ans_format_reward": 0.9895833432674408, "step": 1133, "think_completion_length": 9.041666666666668 }, { "clip_ratio": 0.0, "completion_length": 127.25, "epoch": 3.8296795952782463, "grad_norm": 10.183729612731424, "kl": 0.8359375, "learning_rate": 6.807432432432431e-07, "loss": 0.0008, "reward": 3.356192469596863, "reward_std": 0.20595254004001617, "rewards/final_reward": 1.705807672602365, "rewards/mask_iou_reward": 0.8529038363011825, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.356192409992218, "rewards/thk_ans_format_reward": 1.0, "step": 1134, "think_completion_length": 6.625 }, { "clip_ratio": 0.0, "completion_length": 134.73958587646484, "epoch": 3.833052276559865, "grad_norm": 25.931712730614123, "kl": 0.607421875, "learning_rate": 6.804617117117116e-07, "loss": 0.0006, "reward": 3.551330327987671, "reward_std": 0.11808786168694496, "rewards/final_reward": 1.6091930003779598, "rewards/mask_iou_reward": 0.8045965001889799, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.551330327987671, "rewards/thk_ans_format_reward": 1.0, "step": 1135, "think_completion_length": 9.833333333333332 }, { "clip_ratio": 0.0, "completion_length": 190.8645896911621, "epoch": 3.836424957841484, "grad_norm": 12.010928338808759, "kl": 0.564453125, "learning_rate": 6.801801801801802e-07, "loss": 0.0006, "reward": 3.350279211997986, "reward_std": 0.2695094048976898, "rewards/final_reward": 1.469435863031313, "rewards/mask_iou_reward": 0.7347179315156565, "rewards/sam_format_reward": 0.9583333432674408, "rewards/sam_reward_func_ultra": 1.4336124062538147, "rewards/thk_ans_format_reward": 0.9583333432674408, "step": 1136, "think_completion_length": 8.291666666666666 }, { "clip_ratio": 0.0, "completion_length": 138.36458587646484, "epoch": 3.839797639123103, "grad_norm": 10.129838501473719, "kl": 0.580078125, "learning_rate": 6.798986486486486e-07, "loss": 0.0006, "reward": 3.593467593193054, "reward_std": 0.1148192435503006, "rewards/final_reward": 1.7003924569798057, "rewards/mask_iou_reward": 0.8501962284899028, "rewards/sam_format_reward": 0.9791666865348816, "rewards/sam_reward_func_ultra": 1.614300787448883, "rewards/thk_ans_format_reward": 1.0, "step": 1137, "think_completion_length": 8.333333333333332 }, { "clip_ratio": 0.0, "completion_length": 119.48958587646484, "epoch": 3.8431703204047216, "grad_norm": 12.924062041974949, "kl": 0.58984375, "learning_rate": 6.796171171171171e-07, "loss": 0.0006, "reward": 3.5154197216033936, "reward_std": 0.09570467099547386, "rewards/final_reward": 1.6768668541562382, "rewards/mask_iou_reward": 0.8384334270781191, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5154194235801697, "rewards/thk_ans_format_reward": 1.0, "step": 1138, "think_completion_length": 7.958333333333333 }, { "clip_ratio": 0.0, "completion_length": 175.0104217529297, "epoch": 3.8465430016863404, "grad_norm": 11.279230400648318, "kl": 0.525390625, "learning_rate": 6.793355855855856e-07, "loss": 0.0005, "reward": 3.1914998292922974, "reward_std": 0.2051123920828104, "rewards/final_reward": 1.6003443347049353, "rewards/mask_iou_reward": 0.8001721673524677, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 1.212332844734192, "rewards/thk_ans_format_reward": 0.9895833432674408, "step": 1139, "think_completion_length": 8.75 }, { "clip_ratio": 0.0, "completion_length": 120.59375381469727, "epoch": 3.8499156829679597, "grad_norm": 33.25053767645448, "kl": 0.619140625, "learning_rate": 6.79054054054054e-07, "loss": 0.0006, "reward": 3.5866551399230957, "reward_std": 0.10260298103094101, "rewards/final_reward": 1.4827233010826073, "rewards/mask_iou_reward": 0.7413616505413037, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.586655080318451, "rewards/thk_ans_format_reward": 1.0, "step": 1140, "think_completion_length": 8.958333333333334 }, { "clip_ratio": 0.0, "completion_length": 228.68750762939453, "epoch": 3.8532883642495785, "grad_norm": 14.008249722826953, "kl": 0.5419921875, "learning_rate": 6.787725225225225e-07, "loss": 0.0005, "reward": 3.2890868186950684, "reward_std": 0.273948322981596, "rewards/final_reward": 1.393363380868561, "rewards/mask_iou_reward": 0.6966816904342805, "rewards/sam_format_reward": 0.90625, "rewards/sam_reward_func_ultra": 1.4765866994857788, "rewards/thk_ans_format_reward": 0.90625, "step": 1141, "think_completion_length": 8.75 }, { "clip_ratio": 0.0, "completion_length": 123.95833587646484, "epoch": 3.8566610455311974, "grad_norm": 10.579622025176663, "kl": 0.59765625, "learning_rate": 6.784909909909909e-07, "loss": 0.0006, "reward": 3.4296700954437256, "reward_std": 0.18933508545160294, "rewards/final_reward": 1.7752847816498374, "rewards/mask_iou_reward": 0.8876423908249187, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4296702146530151, "rewards/thk_ans_format_reward": 1.0, "step": 1142, "think_completion_length": 7.833333333333333 }, { "clip_ratio": 0.0, "completion_length": 212.83333587646484, "epoch": 3.860033726812816, "grad_norm": 17.479380467691797, "kl": 0.654296875, "learning_rate": 6.782094594594594e-07, "loss": 0.0007, "reward": 3.2466046810150146, "reward_std": 0.2565118744969368, "rewards/final_reward": 1.645865671176606, "rewards/mask_iou_reward": 0.822932835588303, "rewards/sam_format_reward": 0.90625, "rewards/sam_reward_func_ultra": 1.4341047406196594, "rewards/thk_ans_format_reward": 0.90625, "step": 1143, "think_completion_length": 10.916666666666668 }, { "clip_ratio": 0.0, "completion_length": 209.7604217529297, "epoch": 3.863406408094435, "grad_norm": 20.262099881784287, "kl": 0.564453125, "learning_rate": 6.779279279279279e-07, "loss": 0.0006, "reward": 3.3922996520996094, "reward_std": 0.3344555199146271, "rewards/final_reward": 1.0211181931399709, "rewards/mask_iou_reward": 0.5105590965699854, "rewards/sam_format_reward": 0.9583333432674408, "rewards/sam_reward_func_ultra": 1.4652163982391357, "rewards/thk_ans_format_reward": 0.9687500298023224, "step": 1144, "think_completion_length": 9.208333333333334 }, { "clip_ratio": 0.0, "completion_length": 113.77083587646484, "epoch": 3.866779089376054, "grad_norm": 12.989026006240904, "kl": 0.73046875, "learning_rate": 6.776463963963963e-07, "loss": 0.0007, "reward": 3.320478320121765, "reward_std": 0.25647711753845215, "rewards/final_reward": 0.9190338004926744, "rewards/mask_iou_reward": 0.4595169002463372, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.32047837972641, "rewards/thk_ans_format_reward": 1.0, "step": 1145, "think_completion_length": 9.125 }, { "clip_ratio": 0.0, "completion_length": 141.95833587646484, "epoch": 3.8701517706576727, "grad_norm": 10.956469240134854, "kl": 0.5625, "learning_rate": 6.773648648648649e-07, "loss": 0.0006, "reward": 3.03474223613739, "reward_std": 0.12496957927942276, "rewards/final_reward": 0.44723209186124996, "rewards/mask_iou_reward": 0.22361604593062498, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.034742295742035, "rewards/thk_ans_format_reward": 1.0, "step": 1146, "think_completion_length": 7.958333333333334 }, { "clip_ratio": 0.0, "completion_length": 123.0625, "epoch": 3.873524451939292, "grad_norm": 7.76556935584102, "kl": 0.61328125, "learning_rate": 6.770833333333333e-07, "loss": 0.0006, "reward": 3.484253764152527, "reward_std": 0.09405850991606712, "rewards/final_reward": 1.3491039962728166, "rewards/mask_iou_reward": 0.6745519981364083, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4842538833618164, "rewards/thk_ans_format_reward": 1.0, "step": 1147, "think_completion_length": 9.041666666666668 }, { "clip_ratio": 0.0, "completion_length": 176.17708587646484, "epoch": 3.876897133220911, "grad_norm": 16.52118909760354, "kl": 0.5859375, "learning_rate": 6.768018018018018e-07, "loss": 0.0006, "reward": 3.336282253265381, "reward_std": 0.23352781683206558, "rewards/final_reward": 1.4455606382828008, "rewards/mask_iou_reward": 0.7227803191414004, "rewards/sam_format_reward": 0.9791666865348816, "rewards/sam_reward_func_ultra": 1.377948820590973, "rewards/thk_ans_format_reward": 0.9791666865348816, "step": 1148, "think_completion_length": 10.875 }, { "clip_ratio": 0.0, "completion_length": 140.6666717529297, "epoch": 3.8802698145025296, "grad_norm": 26.974443259321365, "kl": 0.623046875, "learning_rate": 6.765202702702703e-07, "loss": 0.0006, "reward": 3.422439694404602, "reward_std": 0.20345600694417953, "rewards/final_reward": 1.8651120869182578, "rewards/mask_iou_reward": 0.9325560434591289, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4224395751953125, "rewards/thk_ans_format_reward": 1.0, "step": 1149, "think_completion_length": 7.708333333333334 }, { "clip_ratio": 0.0, "completion_length": 126.21875381469727, "epoch": 3.8836424957841484, "grad_norm": 8.892420371329939, "kl": 0.587890625, "learning_rate": 6.762387387387387e-07, "loss": 0.0006, "reward": 3.4350632429122925, "reward_std": 0.11019621044397354, "rewards/final_reward": 1.9606659313791421, "rewards/mask_iou_reward": 0.9803329656895711, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.435063362121582, "rewards/thk_ans_format_reward": 1.0, "step": 1150, "think_completion_length": 10.041666666666668 }, { "clip_ratio": 0.0, "completion_length": 146.4791717529297, "epoch": 3.8870151770657673, "grad_norm": 10.598498090981675, "kl": 0.669921875, "learning_rate": 6.759572072072072e-07, "loss": 0.0007, "reward": 3.4017633199691772, "reward_std": 0.05746646970510483, "rewards/final_reward": 1.1165454395140448, "rewards/mask_iou_reward": 0.5582727197570224, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4017632603645325, "rewards/thk_ans_format_reward": 1.0, "step": 1151, "think_completion_length": 8.125 }, { "clip_ratio": 0.0, "completion_length": 173.9166717529297, "epoch": 3.890387858347386, "grad_norm": 8.414786648638504, "kl": 0.556640625, "learning_rate": 6.756756756756756e-07, "loss": 0.0006, "reward": 3.472022771835327, "reward_std": 0.09950246475636959, "rewards/final_reward": 1.4637774236557763, "rewards/mask_iou_reward": 0.7318887118278882, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4720227718353271, "rewards/thk_ans_format_reward": 1.0, "step": 1152, "think_completion_length": 8.541666666666668 }, { "clip_ratio": 0.0, "completion_length": 119.32292175292969, "epoch": 3.893760539629005, "grad_norm": 8.27522608916324, "kl": 0.630859375, "learning_rate": 6.753941441441441e-07, "loss": 0.0006, "reward": 3.805325150489807, "reward_std": 0.0928366631269455, "rewards/final_reward": 1.8977951290927875, "rewards/mask_iou_reward": 0.9488975645463937, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.8053249716758728, "rewards/thk_ans_format_reward": 1.0, "step": 1153, "think_completion_length": 9.833333333333334 }, { "clip_ratio": 0.0, "completion_length": 143.625, "epoch": 3.897133220910624, "grad_norm": 22.119312379954955, "kl": 0.642578125, "learning_rate": 6.751126126126126e-07, "loss": 0.0007, "reward": 3.390862822532654, "reward_std": 0.2109134942293167, "rewards/final_reward": 1.815931880147895, "rewards/mask_iou_reward": 0.9079659400739475, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 1.4116960167884827, "rewards/thk_ans_format_reward": 0.9895833432674408, "step": 1154, "think_completion_length": 7.5 }, { "clip_ratio": 0.0, "completion_length": 162.8854217529297, "epoch": 3.9005059021922426, "grad_norm": 37.14682144067229, "kl": 0.572265625, "learning_rate": 6.74831081081081e-07, "loss": 0.0006, "reward": 3.4662243127822876, "reward_std": 0.09864114969968796, "rewards/final_reward": 1.7343714224455264, "rewards/mask_iou_reward": 0.8671857112227632, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4662241339683533, "rewards/thk_ans_format_reward": 1.0, "step": 1155, "think_completion_length": 9.208333333333332 }, { "clip_ratio": 0.0, "completion_length": 167.58333587646484, "epoch": 3.903878583473862, "grad_norm": 10.005914797429432, "kl": 0.52734375, "learning_rate": 6.745495495495496e-07, "loss": 0.0005, "reward": 3.347672939300537, "reward_std": 0.11771451123058796, "rewards/final_reward": 1.4089836477985758, "rewards/mask_iou_reward": 0.7044918238992879, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 1.3580895364284515, "rewards/thk_ans_format_reward": 1.0, "step": 1156, "think_completion_length": 8.375 }, { "clip_ratio": 0.0, "completion_length": 119.55208587646484, "epoch": 3.9072512647554807, "grad_norm": 9.853558177575996, "kl": 0.599609375, "learning_rate": 6.742680180180181e-07, "loss": 0.0006, "reward": 3.4117908477783203, "reward_std": 0.0272050928324461, "rewards/final_reward": 1.1120983698610167, "rewards/mask_iou_reward": 0.5560491849305084, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4117907881736755, "rewards/thk_ans_format_reward": 1.0, "step": 1157, "think_completion_length": 7.5 }, { "clip_ratio": 0.0, "completion_length": 119.14583587646484, "epoch": 3.9106239460370995, "grad_norm": 11.277941537448397, "kl": 0.548828125, "learning_rate": 6.739864864864865e-07, "loss": 0.0006, "reward": 3.5751309394836426, "reward_std": 0.08195750042796135, "rewards/final_reward": 1.4355375639291883, "rewards/mask_iou_reward": 0.7177687819645941, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5751309394836426, "rewards/thk_ans_format_reward": 1.0, "step": 1158, "think_completion_length": 7.375 }, { "clip_ratio": 0.0, "completion_length": 128.5729217529297, "epoch": 3.9139966273187183, "grad_norm": 9.054297786946076, "kl": 0.640625, "learning_rate": 6.73704954954955e-07, "loss": 0.0006, "reward": 3.4315420389175415, "reward_std": 0.17010553926229477, "rewards/final_reward": 1.4478197334246705, "rewards/mask_iou_reward": 0.7239098667123353, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 1.4523754119873047, "rewards/thk_ans_format_reward": 0.9895833432674408, "step": 1159, "think_completion_length": 9.5 }, { "clip_ratio": 0.0, "completion_length": 130.53125762939453, "epoch": 3.917369308600337, "grad_norm": 31.29774946350612, "kl": 0.57421875, "learning_rate": 6.734234234234234e-07, "loss": 0.0006, "reward": 3.2284148931503296, "reward_std": 0.22033357620239258, "rewards/final_reward": 1.4817521564526, "rewards/mask_iou_reward": 0.7408760782263, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 1.249248206615448, "rewards/thk_ans_format_reward": 0.9895833432674408, "step": 1160, "think_completion_length": 9.291666666666668 }, { "clip_ratio": 0.0, "completion_length": 204.40625, "epoch": 3.920741989881956, "grad_norm": 10.40583238132195, "kl": 0.580078125, "learning_rate": 6.731418918918919e-07, "loss": 0.0006, "reward": 3.0834414958953857, "reward_std": 0.29399073868989944, "rewards/final_reward": 1.1915881620589188, "rewards/mask_iou_reward": 0.5957940810294594, "rewards/sam_format_reward": 0.9375, "rewards/sam_reward_func_ultra": 1.2084414660930634, "rewards/thk_ans_format_reward": 0.9375, "step": 1161, "think_completion_length": 8.583333333333332 }, { "clip_ratio": 0.0, "completion_length": 123.65625381469727, "epoch": 3.924114671163575, "grad_norm": 160.03568478542434, "kl": 0.537109375, "learning_rate": 6.728603603603604e-07, "loss": 0.0005, "reward": 3.5287028551101685, "reward_std": 0.06423872895538807, "rewards/final_reward": 0.5818589507508326, "rewards/mask_iou_reward": 0.2909294753754163, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5287025570869446, "rewards/thk_ans_format_reward": 1.0, "step": 1162, "think_completion_length": 9.666666666666666 }, { "clip_ratio": 0.0, "completion_length": 126.28125381469727, "epoch": 3.927487352445194, "grad_norm": 9.822659849234423, "kl": 1.068359375, "learning_rate": 6.725788288288288e-07, "loss": 0.0011, "reward": 3.2519426345825195, "reward_std": 0.10252987593412399, "rewards/final_reward": 0.06949677758991413, "rewards/mask_iou_reward": 0.034748388794957064, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2519423365592957, "rewards/thk_ans_format_reward": 1.0, "step": 1163, "think_completion_length": 8.375 }, { "clip_ratio": 0.0, "completion_length": 119.70833587646484, "epoch": 3.930860033726813, "grad_norm": 10.798828216838599, "kl": 0.744140625, "learning_rate": 6.722972972972972e-07, "loss": 0.0008, "reward": 3.3033626079559326, "reward_std": 0.07960717380046844, "rewards/final_reward": 1.6421366016422263, "rewards/mask_iou_reward": 0.8210683008211132, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3033625483512878, "rewards/thk_ans_format_reward": 1.0, "step": 1164, "think_completion_length": 9.208333333333334 }, { "clip_ratio": 0.0, "completion_length": 162.65625381469727, "epoch": 3.9342327150084317, "grad_norm": 16.320809642833645, "kl": 0.669921875, "learning_rate": 6.720157657657656e-07, "loss": 0.0007, "reward": 3.606690526008606, "reward_std": 0.24704181402921677, "rewards/final_reward": 1.3225213951972898, "rewards/mask_iou_reward": 0.6612606975986449, "rewards/sam_format_reward": 0.9583333432674408, "rewards/sam_reward_func_ultra": 1.6900236010551453, "rewards/thk_ans_format_reward": 0.9583333432674408, "step": 1165, "think_completion_length": 8.083333333333334 }, { "clip_ratio": 0.0, "completion_length": 119.29166793823242, "epoch": 3.9376053962900506, "grad_norm": 35.63580913868573, "kl": 0.615234375, "learning_rate": 6.717342342342342e-07, "loss": 0.0006, "reward": 3.183878540992737, "reward_std": 0.1320716105401516, "rewards/final_reward": 1.3518959103213104, "rewards/mask_iou_reward": 0.6759479551606552, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1838783025741577, "rewards/thk_ans_format_reward": 1.0, "step": 1166, "think_completion_length": 8.125 }, { "clip_ratio": 0.0, "completion_length": 121.32291793823242, "epoch": 3.9409780775716694, "grad_norm": 16.859281176433807, "kl": 0.638671875, "learning_rate": 6.714527027027027e-07, "loss": 0.0006, "reward": 3.1818827390670776, "reward_std": 0.09869653731584549, "rewards/final_reward": 1.4249743612102743, "rewards/mask_iou_reward": 0.7124871806051372, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1818827390670776, "rewards/thk_ans_format_reward": 1.0, "step": 1167, "think_completion_length": 7.833333333333333 }, { "clip_ratio": 0.0, "completion_length": 116.15625, "epoch": 3.9443507588532882, "grad_norm": 11.903269690933106, "kl": 0.63671875, "learning_rate": 6.711711711711711e-07, "loss": 0.0006, "reward": 3.1614683866500854, "reward_std": 0.14515875279903412, "rewards/final_reward": 1.5640988689443378, "rewards/mask_iou_reward": 0.7820494344721689, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1614683866500854, "rewards/thk_ans_format_reward": 1.0, "step": 1168, "think_completion_length": 7.541666666666666 }, { "clip_ratio": 0.0, "completion_length": 126.52083587646484, "epoch": 3.947723440134907, "grad_norm": 18.45401892617275, "kl": 0.546875, "learning_rate": 6.708896396396396e-07, "loss": 0.0005, "reward": 3.2433966398239136, "reward_std": 0.06726586446166039, "rewards/final_reward": 1.529449017547845, "rewards/mask_iou_reward": 0.7647245087739225, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2433964610099792, "rewards/thk_ans_format_reward": 1.0, "step": 1169, "think_completion_length": 9.083333333333334 }, { "clip_ratio": 0.0, "completion_length": 106.69792175292969, "epoch": 3.9510961214165263, "grad_norm": 28.4186280214568, "kl": 0.63671875, "learning_rate": 6.70608108108108e-07, "loss": 0.0006, "reward": 3.447712779045105, "reward_std": 0.09034018777310848, "rewards/final_reward": 1.7980894564169296, "rewards/mask_iou_reward": 0.8990447282084648, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4477129578590393, "rewards/thk_ans_format_reward": 1.0, "step": 1170, "think_completion_length": 6.875 }, { "clip_ratio": 0.0, "completion_length": 125.95833969116211, "epoch": 3.954468802698145, "grad_norm": 18.817439024173634, "kl": 0.537109375, "learning_rate": 6.703265765765765e-07, "loss": 0.0005, "reward": 3.0330607891082764, "reward_std": 0.08898946642875671, "rewards/final_reward": 0.7484329408679943, "rewards/mask_iou_reward": 0.37421647043399714, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0330607891082764, "rewards/thk_ans_format_reward": 1.0, "step": 1171, "think_completion_length": 5.958333333333334 }, { "clip_ratio": 0.0, "completion_length": 213.2187614440918, "epoch": 3.957841483979764, "grad_norm": 10.101315100221358, "kl": 0.56640625, "learning_rate": 6.70045045045045e-07, "loss": 0.0006, "reward": 3.3744516372680664, "reward_std": 0.2819393612444401, "rewards/final_reward": 1.3392306228334026, "rewards/mask_iou_reward": 0.6696153114167013, "rewards/sam_format_reward": 0.9375, "rewards/sam_reward_func_ultra": 1.499451756477356, "rewards/thk_ans_format_reward": 0.9375, "step": 1172, "think_completion_length": 6.708333333333334 }, { "clip_ratio": 0.0, "completion_length": 120.08333587646484, "epoch": 3.961214165261383, "grad_norm": 8.184313790203422, "kl": 0.583984375, "learning_rate": 6.697635135135134e-07, "loss": 0.0006, "reward": 3.3349242210388184, "reward_std": 0.1057177446782589, "rewards/final_reward": 1.6384289076635299, "rewards/mask_iou_reward": 0.8192144538317649, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3349245190620422, "rewards/thk_ans_format_reward": 1.0, "step": 1173, "think_completion_length": 7.875 }, { "clip_ratio": 0.0, "completion_length": 122.87500381469727, "epoch": 3.9645868465430016, "grad_norm": 16.448266125331322, "kl": 0.57421875, "learning_rate": 6.694819819819819e-07, "loss": 0.0006, "reward": 3.4961599111557007, "reward_std": 0.09959017485380173, "rewards/final_reward": 1.3022567383415606, "rewards/mask_iou_reward": 0.6511283691707803, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.496160089969635, "rewards/thk_ans_format_reward": 1.0, "step": 1174, "think_completion_length": 6.75 }, { "clip_ratio": 0.0, "completion_length": 120.13541793823242, "epoch": 3.9679595278246205, "grad_norm": 10.897067411417096, "kl": 0.63671875, "learning_rate": 6.692004504504503e-07, "loss": 0.0006, "reward": 3.00723659992218, "reward_std": 0.07285760110244155, "rewards/final_reward": 0.502330371989074, "rewards/mask_iou_reward": 0.251165185994537, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0072364211082458, "rewards/thk_ans_format_reward": 1.0, "step": 1175, "think_completion_length": 7.375 }, { "clip_ratio": 0.0, "completion_length": 118.47916793823242, "epoch": 3.9713322091062393, "grad_norm": 20.446752251210203, "kl": 0.619140625, "learning_rate": 6.689189189189189e-07, "loss": 0.0006, "reward": 3.4216020107269287, "reward_std": 0.1204568762332201, "rewards/final_reward": 1.6239261922127697, "rewards/mask_iou_reward": 0.8119630961063848, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4216020107269287, "rewards/thk_ans_format_reward": 1.0, "step": 1176, "think_completion_length": 7.791666666666666 }, { "clip_ratio": 0.0, "completion_length": 119.83333587646484, "epoch": 3.9747048903878586, "grad_norm": 10.013549314476817, "kl": 0.75, "learning_rate": 6.686373873873874e-07, "loss": 0.0008, "reward": 3.0989030599594116, "reward_std": 0.10697172209620476, "rewards/final_reward": 0.9797569255011491, "rewards/mask_iou_reward": 0.48987846275057456, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0989029705524445, "rewards/thk_ans_format_reward": 1.0, "step": 1177, "think_completion_length": 7.875 }, { "clip_ratio": 0.0, "completion_length": 118.34375381469727, "epoch": 3.9780775716694774, "grad_norm": 11.631293297558877, "kl": 0.90234375, "learning_rate": 6.683558558558558e-07, "loss": 0.0009, "reward": 3.0180094242095947, "reward_std": 0.12886736541986465, "rewards/final_reward": 1.0945630209917057, "rewards/mask_iou_reward": 0.5472815104958528, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0180092453956604, "rewards/thk_ans_format_reward": 1.0, "step": 1178, "think_completion_length": 8.333333333333334 }, { "clip_ratio": 0.0, "completion_length": 129.9375, "epoch": 3.9814502529510962, "grad_norm": 10.991482902331736, "kl": 0.55859375, "learning_rate": 6.680743243243243e-07, "loss": 0.0006, "reward": 3.6170132160186768, "reward_std": 0.1302567794919014, "rewards/final_reward": 1.4590375447159973, "rewards/mask_iou_reward": 0.7295187723579987, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 1.6274299025535583, "rewards/thk_ans_format_reward": 1.0, "step": 1179, "think_completion_length": 6.75 }, { "clip_ratio": 0.0, "completion_length": 138.0208396911621, "epoch": 3.984822934232715, "grad_norm": 6.800394388342199, "kl": 0.65625, "learning_rate": 6.677927927927928e-07, "loss": 0.0007, "reward": 3.363734722137451, "reward_std": 0.015621137339621782, "rewards/final_reward": 1.6003017140478155, "rewards/mask_iou_reward": 0.8001508570239078, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3637345433235168, "rewards/thk_ans_format_reward": 1.0, "step": 1180, "think_completion_length": 8.208333333333334 }, { "clip_ratio": 0.0, "completion_length": 116.60416793823242, "epoch": 3.988195615514334, "grad_norm": 19.201489375835788, "kl": 0.6796875, "learning_rate": 6.675112612612612e-07, "loss": 0.0007, "reward": 3.5164124965667725, "reward_std": 0.12926794216036797, "rewards/final_reward": 1.8086767008232651, "rewards/mask_iou_reward": 0.9043383504116326, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.516412377357483, "rewards/thk_ans_format_reward": 1.0, "step": 1181, "think_completion_length": 6.083333333333334 }, { "clip_ratio": 0.0, "completion_length": 119.03125381469727, "epoch": 3.9915682967959527, "grad_norm": 15.91782373980905, "kl": 0.646484375, "learning_rate": 6.672297297297297e-07, "loss": 0.0006, "reward": 3.5790776014328003, "reward_std": 0.07250684313476086, "rewards/final_reward": 1.217705241658552, "rewards/mask_iou_reward": 0.608852620829276, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5790774822235107, "rewards/thk_ans_format_reward": 1.0, "step": 1182, "think_completion_length": 8.416666666666668 }, { "clip_ratio": 0.0, "completion_length": 146.0729217529297, "epoch": 3.9949409780775715, "grad_norm": 11.389974166745496, "kl": 0.486328125, "learning_rate": 6.669481981981981e-07, "loss": 0.0006, "reward": 3.757065773010254, "reward_std": 0.0599273145198822, "rewards/final_reward": 1.9625319193708353, "rewards/mask_iou_reward": 0.9812659596854176, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.75706547498703, "rewards/thk_ans_format_reward": 1.0, "step": 1183, "think_completion_length": 7.375 }, { "clip_ratio": 0.0, "completion_length": 118.6315803527832, "epoch": 3.998313659359191, "grad_norm": 15.93861202741278, "kl": 0.5546875, "learning_rate": 6.666666666666666e-07, "loss": 0.0006, "reward": 3.3559396266937256, "reward_std": 0.02718531433492899, "rewards/final_reward": 1.5773362977098602, "rewards/mask_iou_reward": 0.7886681488549301, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3559398651123047, "rewards/thk_ans_format_reward": 1.0, "step": 1184, "think_completion_length": 7.0 }, { "clip_ratio": 0.0, "completion_length": 118.12500381469727, "epoch": 4.003372681281619, "grad_norm": 10.554016252070584, "kl": 1.111328125, "learning_rate": 6.663851351351351e-07, "loss": 0.0011, "reward": 3.439687967300415, "reward_std": 0.08648747950792313, "rewards/final_reward": 1.4580134691598818, "rewards/mask_iou_reward": 0.7290067345799409, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.439687967300415, "rewards/thk_ans_format_reward": 1.0, "step": 1185, "think_completion_length": 6.5 }, { "clip_ratio": 0.0, "completion_length": 134.08333587646484, "epoch": 4.006745362563238, "grad_norm": 10.95857025554743, "kl": 0.4609375, "learning_rate": 6.661036036036036e-07, "loss": 0.0005, "reward": 3.328967809677124, "reward_std": 0.03493300452828407, "rewards/final_reward": 1.8086746194133259, "rewards/mask_iou_reward": 0.9043373097066629, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3289676308631897, "rewards/thk_ans_format_reward": 1.0, "step": 1186, "think_completion_length": 6.708333333333333 }, { "clip_ratio": 0.0, "completion_length": 119.22916793823242, "epoch": 4.010118043844857, "grad_norm": 6.343836801717278, "kl": 0.67578125, "learning_rate": 6.658220720720721e-07, "loss": 0.0008, "reward": 3.5031098127365112, "reward_std": 0.033121745102107525, "rewards/final_reward": 1.483336888613724, "rewards/mask_iou_reward": 0.741668444306862, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.503109633922577, "rewards/thk_ans_format_reward": 1.0, "step": 1187, "think_completion_length": 5.833333333333334 }, { "clip_ratio": 0.0, "completion_length": 135.8645896911621, "epoch": 4.013490725126475, "grad_norm": 11.33780507114698, "kl": 0.6640625, "learning_rate": 6.655405405405405e-07, "loss": 0.0007, "reward": 3.250043511390686, "reward_std": 0.08231607265770435, "rewards/final_reward": 1.5900309584094408, "rewards/mask_iou_reward": 0.7950154792047204, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2500435709953308, "rewards/thk_ans_format_reward": 1.0, "step": 1188, "think_completion_length": 5.916666666666666 }, { "clip_ratio": 0.0, "completion_length": 163.2916717529297, "epoch": 4.016863406408095, "grad_norm": 15.958992045750374, "kl": 0.591796875, "learning_rate": 6.65259009009009e-07, "loss": 0.0006, "reward": 3.6597955226898193, "reward_std": 0.16362156346440315, "rewards/final_reward": 1.5883938304816358, "rewards/mask_iou_reward": 0.7941969152408179, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 1.6806288361549377, "rewards/thk_ans_format_reward": 0.9895833432674408, "step": 1189, "think_completion_length": 5.916666666666666 }, { "clip_ratio": 0.0, "completion_length": 116.45833587646484, "epoch": 4.020236087689713, "grad_norm": 29.774903611078646, "kl": 0.6328125, "learning_rate": 6.649774774774775e-07, "loss": 0.0006, "reward": 3.4126813411712646, "reward_std": 0.035650059347972274, "rewards/final_reward": 1.5868418605872399, "rewards/mask_iou_reward": 0.7934209302936199, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4126812815666199, "rewards/thk_ans_format_reward": 1.0, "step": 1190, "think_completion_length": 6.791666666666667 }, { "clip_ratio": 0.0, "completion_length": 113.77083587646484, "epoch": 4.023608768971332, "grad_norm": 13.734640247347274, "kl": 0.53515625, "learning_rate": 6.646959459459459e-07, "loss": 0.0005, "reward": 3.3605130910873413, "reward_std": 0.05231809243559837, "rewards/final_reward": 0.5977116399046374, "rewards/mask_iou_reward": 0.2988558199523187, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3605130910873413, "rewards/thk_ans_format_reward": 1.0, "step": 1191, "think_completion_length": 7.708333333333333 }, { "clip_ratio": 0.0, "completion_length": 167.01041793823242, "epoch": 4.0269814502529515, "grad_norm": 37.86017681085706, "kl": 0.5703125, "learning_rate": 6.644144144144144e-07, "loss": 0.0006, "reward": 3.414387583732605, "reward_std": 0.18529635295271873, "rewards/final_reward": 1.1384239630747104, "rewards/mask_iou_reward": 0.5692119815373552, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 1.435220718383789, "rewards/thk_ans_format_reward": 0.9895833432674408, "step": 1192, "think_completion_length": 7.666666666666666 }, { "clip_ratio": 0.0, "completion_length": 118.82291793823242, "epoch": 4.03035413153457, "grad_norm": 20.219296538614426, "kl": 0.4677734375, "learning_rate": 6.641328828828829e-07, "loss": 0.0005, "reward": 3.0895376205444336, "reward_std": 0.11390053480863571, "rewards/final_reward": 0.8502073905759294, "rewards/mask_iou_reward": 0.4251036952879647, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0895376205444336, "rewards/thk_ans_format_reward": 1.0, "step": 1193, "think_completion_length": 6.166666666666666 }, { "clip_ratio": 0.0, "completion_length": 121.00000381469727, "epoch": 4.033726812816189, "grad_norm": 31.58834779008439, "kl": 0.41796875, "learning_rate": 6.638513513513513e-07, "loss": 0.0004, "reward": 3.637164354324341, "reward_std": 0.08336159586906433, "rewards/final_reward": 1.7560355742195841, "rewards/mask_iou_reward": 0.8780177871097921, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.63716459274292, "rewards/thk_ans_format_reward": 1.0, "step": 1194, "think_completion_length": 6.583333333333333 }, { "clip_ratio": 0.0, "completion_length": 118.875, "epoch": 4.0370994940978076, "grad_norm": 8.293597312434683, "kl": 0.4716796875, "learning_rate": 6.635698198198198e-07, "loss": 0.0005, "reward": 3.3773438930511475, "reward_std": 0.07199937477707863, "rewards/final_reward": 1.4250182773570836, "rewards/mask_iou_reward": 0.7125091386785418, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.377343773841858, "rewards/thk_ans_format_reward": 1.0, "step": 1195, "think_completion_length": 7.916666666666667 }, { "clip_ratio": 0.0, "completion_length": 119.27083587646484, "epoch": 4.040472175379427, "grad_norm": 22.502011229703147, "kl": 0.5283203125, "learning_rate": 6.632882882882883e-07, "loss": 0.0005, "reward": 3.2654829025268555, "reward_std": 0.06824944447726011, "rewards/final_reward": 1.5511016928185446, "rewards/mask_iou_reward": 0.7755508464092723, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2654826641082764, "rewards/thk_ans_format_reward": 1.0, "step": 1196, "think_completion_length": 7.541666666666667 }, { "clip_ratio": 0.0, "completion_length": 135.2604217529297, "epoch": 4.043844856661045, "grad_norm": 6.811462987308554, "kl": 0.724609375, "learning_rate": 6.630067567567568e-07, "loss": 0.0008, "reward": 3.615772247314453, "reward_std": 0.10267575038596988, "rewards/final_reward": 1.6120674931977677, "rewards/mask_iou_reward": 0.8060337465988838, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6261889934539795, "rewards/thk_ans_format_reward": 0.9895833432674408, "step": 1197, "think_completion_length": 5.291666666666666 }, { "clip_ratio": 0.0, "completion_length": 118.11458587646484, "epoch": 4.0472175379426645, "grad_norm": 17.715372589544845, "kl": 0.6015625, "learning_rate": 6.627252252252253e-07, "loss": 0.0006, "reward": 3.3741610050201416, "reward_std": 0.052045850083231926, "rewards/final_reward": 0.9774066456483272, "rewards/mask_iou_reward": 0.4887033228241636, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.374160885810852, "rewards/thk_ans_format_reward": 1.0, "step": 1198, "think_completion_length": 6.291666666666666 }, { "clip_ratio": 0.0, "completion_length": 120.14583587646484, "epoch": 4.050590219224283, "grad_norm": 22.80800540898344, "kl": 0.91015625, "learning_rate": 6.624436936936937e-07, "loss": 0.0009, "reward": 3.4368356466293335, "reward_std": 0.09212891571223736, "rewards/final_reward": 1.471651074709624, "rewards/mask_iou_reward": 0.735825537354812, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4368354678153992, "rewards/thk_ans_format_reward": 1.0, "step": 1199, "think_completion_length": 7.041666666666667 }, { "clip_ratio": 0.0, "completion_length": 169.7395896911621, "epoch": 4.053962900505902, "grad_norm": 41.39021177493442, "kl": 0.4228515625, "learning_rate": 6.621621621621622e-07, "loss": 0.0004, "reward": 3.3039404153823853, "reward_std": 0.11558372527360916, "rewards/final_reward": 1.532649382911797, "rewards/mask_iou_reward": 0.7663246914558985, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.303940236568451, "rewards/thk_ans_format_reward": 1.0, "step": 1200, "think_completion_length": 5.916666666666667 }, { "clip_ratio": 0.0, "completion_length": 109.375, "epoch": 4.057335581787521, "grad_norm": 43.35919146778135, "kl": 0.494140625, "learning_rate": 6.618806306306306e-07, "loss": 0.0005, "reward": 3.4193031787872314, "reward_std": 0.09287399984896183, "rewards/final_reward": 1.1264470755571945, "rewards/mask_iou_reward": 0.5632235377785972, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.419303297996521, "rewards/thk_ans_format_reward": 1.0, "step": 1201, "think_completion_length": 6.333333333333334 }, { "clip_ratio": 0.0, "completion_length": 130.3645896911621, "epoch": 4.06070826306914, "grad_norm": 13.071128890946966, "kl": 0.478515625, "learning_rate": 6.615990990990991e-07, "loss": 0.0005, "reward": 3.0676461458206177, "reward_std": 0.15790517255663872, "rewards/final_reward": 1.685211982017151, "rewards/mask_iou_reward": 0.8426059910085755, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 1.0884793996810913, "rewards/thk_ans_format_reward": 0.9895833432674408, "step": 1202, "think_completion_length": 5.875 }, { "clip_ratio": 0.0, "completion_length": 140.1979217529297, "epoch": 4.064080944350759, "grad_norm": 8.496155118140667, "kl": 0.404296875, "learning_rate": 6.613175675675676e-07, "loss": 0.0004, "reward": 3.4975868463516235, "reward_std": 0.08468229323625565, "rewards/final_reward": 1.3996250295911348, "rewards/mask_iou_reward": 0.6998125147955674, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4975868463516235, "rewards/thk_ans_format_reward": 1.0, "step": 1203, "think_completion_length": 6.166666666666666 }, { "clip_ratio": 0.0, "completion_length": 140.28125381469727, "epoch": 4.0674536256323774, "grad_norm": 15.478406315143841, "kl": 0.580078125, "learning_rate": 6.610360360360359e-07, "loss": 0.0006, "reward": 3.4111239910125732, "reward_std": 0.15900106355547905, "rewards/final_reward": 1.6024432224024738, "rewards/mask_iou_reward": 0.8012216112012369, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.411124050617218, "rewards/thk_ans_format_reward": 1.0, "step": 1204, "think_completion_length": 6.958333333333333 }, { "clip_ratio": 0.0, "completion_length": 131.20833587646484, "epoch": 4.070826306913997, "grad_norm": 12.152397641597725, "kl": 0.5, "learning_rate": 6.607545045045044e-07, "loss": 0.0005, "reward": 3.2357778549194336, "reward_std": 0.07916467823088169, "rewards/final_reward": 1.2065086567678471, "rewards/mask_iou_reward": 0.6032543283839236, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.235777735710144, "rewards/thk_ans_format_reward": 1.0, "step": 1205, "think_completion_length": 8.416666666666668 }, { "clip_ratio": 0.0, "completion_length": 119.63541793823242, "epoch": 4.074198988195615, "grad_norm": 14.537589525952134, "kl": 0.4755859375, "learning_rate": 6.60472972972973e-07, "loss": 0.0005, "reward": 3.667311668395996, "reward_std": 0.06388841196894646, "rewards/final_reward": 1.8121912599107288, "rewards/mask_iou_reward": 0.9060956299553644, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6673114895820618, "rewards/thk_ans_format_reward": 1.0, "step": 1206, "think_completion_length": 5.541666666666667 }, { "clip_ratio": 0.0, "completion_length": 121.14583587646484, "epoch": 4.077571669477234, "grad_norm": 15.088706799391279, "kl": 0.4306640625, "learning_rate": 6.601914414414414e-07, "loss": 0.0006, "reward": 3.3942922353744507, "reward_std": 0.1983342319726944, "rewards/final_reward": 1.5898807602430456, "rewards/mask_iou_reward": 0.7949403801215228, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3942922353744507, "rewards/thk_ans_format_reward": 1.0, "step": 1207, "think_completion_length": 7.166666666666667 }, { "clip_ratio": 0.0, "completion_length": 125.28125, "epoch": 4.080944350758854, "grad_norm": 27.33584550590293, "kl": 0.4384765625, "learning_rate": 6.599099099099099e-07, "loss": 0.0005, "reward": 3.223801851272583, "reward_std": 0.02942474838346243, "rewards/final_reward": 1.6642409626734604, "rewards/mask_iou_reward": 0.8321204813367302, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2238017916679382, "rewards/thk_ans_format_reward": 1.0, "step": 1208, "think_completion_length": 8.125 }, { "clip_ratio": 0.0, "completion_length": 140.15625, "epoch": 4.084317032040472, "grad_norm": 5.586636533993314, "kl": 0.4365234375, "learning_rate": 6.596283783783783e-07, "loss": 0.0004, "reward": 3.6259137392044067, "reward_std": 0.14700846886262298, "rewards/final_reward": 1.4255063237605718, "rewards/mask_iou_reward": 0.7127531618802859, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 1.6467470526695251, "rewards/thk_ans_format_reward": 0.9895833432674408, "step": 1209, "think_completion_length": 8.083333333333334 }, { "clip_ratio": 0.0, "completion_length": 118.51041793823242, "epoch": 4.087689713322091, "grad_norm": 10.176513900275186, "kl": 0.43359375, "learning_rate": 6.593468468468468e-07, "loss": 0.0004, "reward": 3.4102810621261597, "reward_std": 0.09019586816430092, "rewards/final_reward": 0.6894702596620408, "rewards/mask_iou_reward": 0.3447351298310204, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.410281240940094, "rewards/thk_ans_format_reward": 1.0, "step": 1210, "think_completion_length": 7.208333333333333 }, { "clip_ratio": 0.0, "completion_length": 139.04166793823242, "epoch": 4.09106239460371, "grad_norm": 17.637085645134885, "kl": 0.419921875, "learning_rate": 6.590653153153153e-07, "loss": 0.0004, "reward": 3.303520679473877, "reward_std": 0.19280150532722473, "rewards/final_reward": 1.1647785308102354, "rewards/mask_iou_reward": 0.5823892654051177, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.303520679473877, "rewards/thk_ans_format_reward": 1.0, "step": 1211, "think_completion_length": 7.041666666666667 }, { "clip_ratio": 0.0, "completion_length": 137.83333587646484, "epoch": 4.094435075885329, "grad_norm": 15.695428734834808, "kl": 1.30078125, "learning_rate": 6.587837837837837e-07, "loss": 0.0013, "reward": 3.429495096206665, "reward_std": 0.12937488220632076, "rewards/final_reward": 1.789398878602503, "rewards/mask_iou_reward": 0.8946994393012515, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4294949769973755, "rewards/thk_ans_format_reward": 1.0, "step": 1212, "think_completion_length": 8.75 }, { "clip_ratio": 0.0, "completion_length": 136.10416793823242, "epoch": 4.097807757166947, "grad_norm": 11.875679375139654, "kl": 0.4580078125, "learning_rate": 6.585022522522522e-07, "loss": 0.0005, "reward": 3.1290459632873535, "reward_std": 0.13048794120550156, "rewards/final_reward": 0.8777940433255967, "rewards/mask_iou_reward": 0.43889702166279837, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.139462798833847, "rewards/thk_ans_format_reward": 0.9895833432674408, "step": 1213, "think_completion_length": 5.833333333333334 }, { "clip_ratio": 0.0, "completion_length": 116.30208587646484, "epoch": 4.101180438448567, "grad_norm": 14.024431690361592, "kl": 0.5234375, "learning_rate": 6.582207207207206e-07, "loss": 0.0005, "reward": 3.3400204181671143, "reward_std": 0.02741351444274187, "rewards/final_reward": 1.4409633928774201, "rewards/mask_iou_reward": 0.7204816964387101, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3400205373764038, "rewards/thk_ans_format_reward": 1.0, "step": 1214, "think_completion_length": 5.666666666666666 }, { "clip_ratio": 0.0, "completion_length": 105.98958587646484, "epoch": 4.104553119730186, "grad_norm": 23.58731569658534, "kl": 0.638671875, "learning_rate": 6.579391891891891e-07, "loss": 0.0006, "reward": 3.4039725065231323, "reward_std": 0.04642016626894474, "rewards/final_reward": 1.8063808337832992, "rewards/mask_iou_reward": 0.9031904168916496, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.403972327709198, "rewards/thk_ans_format_reward": 1.0, "step": 1215, "think_completion_length": 7.25 }, { "clip_ratio": 0.0, "completion_length": 117.92708587646484, "epoch": 4.107925801011804, "grad_norm": 16.70364403368003, "kl": 0.44921875, "learning_rate": 6.576576576576577e-07, "loss": 0.0004, "reward": 3.491950511932373, "reward_std": 0.03475194610655308, "rewards/final_reward": 1.8578538951172, "rewards/mask_iou_reward": 0.9289269475586, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4919501543045044, "rewards/thk_ans_format_reward": 1.0, "step": 1216, "think_completion_length": 6.458333333333334 }, { "clip_ratio": 0.0, "completion_length": 127.27083969116211, "epoch": 4.1112984822934235, "grad_norm": 20.916392811665588, "kl": 0.458984375, "learning_rate": 6.573761261261261e-07, "loss": 0.0005, "reward": 3.676409959793091, "reward_std": 0.05732338689267635, "rewards/final_reward": 1.8801769016381307, "rewards/mask_iou_reward": 0.9400884508190653, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6764100193977356, "rewards/thk_ans_format_reward": 1.0, "step": 1217, "think_completion_length": 6.375 }, { "clip_ratio": 0.0, "completion_length": 119.68750381469727, "epoch": 4.114671163575042, "grad_norm": 22.28216496691829, "kl": 0.5947265625, "learning_rate": 6.570945945945946e-07, "loss": 0.0006, "reward": 3.378030300140381, "reward_std": 0.10985900834202766, "rewards/final_reward": 1.514454998998649, "rewards/mask_iou_reward": 0.7572274994993246, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3780303597450256, "rewards/thk_ans_format_reward": 1.0, "step": 1218, "think_completion_length": 5.833333333333334 }, { "clip_ratio": 0.0, "completion_length": 123.22916793823242, "epoch": 4.118043844856661, "grad_norm": 21.727131304379803, "kl": 0.4677734375, "learning_rate": 6.56813063063063e-07, "loss": 0.0005, "reward": 3.600069284439087, "reward_std": 0.13599379733204842, "rewards/final_reward": 1.6915890647441116, "rewards/mask_iou_reward": 0.8457945323720558, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.600069284439087, "rewards/thk_ans_format_reward": 1.0, "step": 1219, "think_completion_length": 6.0 }, { "clip_ratio": 0.0, "completion_length": 116.95833587646484, "epoch": 4.12141652613828, "grad_norm": 15.270943087396427, "kl": 1.1640625, "learning_rate": 6.565315315315315e-07, "loss": 0.0012, "reward": 3.3276935815811157, "reward_std": 0.061925821006298065, "rewards/final_reward": 1.5652405764330064, "rewards/mask_iou_reward": 0.7826202882165032, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.327693521976471, "rewards/thk_ans_format_reward": 1.0, "step": 1220, "think_completion_length": 6.583333333333333 }, { "clip_ratio": 0.0, "completion_length": 132.23958587646484, "epoch": 4.124789207419899, "grad_norm": 9.675363166519526, "kl": 0.5361328125, "learning_rate": 6.5625e-07, "loss": 0.0006, "reward": 3.3326576948165894, "reward_std": 0.09473956376314163, "rewards/final_reward": 1.5967528080432538, "rewards/mask_iou_reward": 0.7983764040216269, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3326576948165894, "rewards/thk_ans_format_reward": 1.0, "step": 1221, "think_completion_length": 6.333333333333333 }, { "clip_ratio": 0.0, "completion_length": 130.6354217529297, "epoch": 4.128161888701518, "grad_norm": 16.7886905879466, "kl": 0.474609375, "learning_rate": 6.559684684684684e-07, "loss": 0.0005, "reward": 3.4593290090560913, "reward_std": 0.05802651774138212, "rewards/final_reward": 1.692901515639981, "rewards/mask_iou_reward": 0.8464507578199905, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4593292474746704, "rewards/thk_ans_format_reward": 1.0, "step": 1222, "think_completion_length": 7.208333333333333 }, { "clip_ratio": 0.0, "completion_length": 149.6979217529297, "epoch": 4.1315345699831365, "grad_norm": 8.029491788908325, "kl": 0.568359375, "learning_rate": 6.556869369369369e-07, "loss": 0.0006, "reward": 3.5112451314926147, "reward_std": 0.120102159678936, "rewards/final_reward": 1.1520923960479097, "rewards/mask_iou_reward": 0.5760461980239548, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5112449526786804, "rewards/thk_ans_format_reward": 1.0, "step": 1223, "think_completion_length": 6.541666666666667 }, { "clip_ratio": 0.0, "completion_length": 127.63542175292969, "epoch": 4.134907251264756, "grad_norm": 9.309653332724258, "kl": 0.41015625, "learning_rate": 6.554054054054053e-07, "loss": 0.0004, "reward": 3.714353322982788, "reward_std": 0.018177752383053303, "rewards/final_reward": 1.531489051594929, "rewards/mask_iou_reward": 0.7657445257974645, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7143531441688538, "rewards/thk_ans_format_reward": 1.0, "step": 1224, "think_completion_length": 6.166666666666666 }, { "clip_ratio": 0.0, "completion_length": 113.01042175292969, "epoch": 4.138279932546374, "grad_norm": 18.742822358444663, "kl": 0.5947265625, "learning_rate": 6.551238738738738e-07, "loss": 0.0007, "reward": 3.322489023208618, "reward_std": 0.08387609012424946, "rewards/final_reward": 1.3946590371260241, "rewards/mask_iou_reward": 0.6973295185630121, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3224887251853943, "rewards/thk_ans_format_reward": 1.0, "step": 1225, "think_completion_length": 8.708333333333332 }, { "clip_ratio": 0.0, "completion_length": 152.92708587646484, "epoch": 4.141652613827993, "grad_norm": 35.009143337678275, "kl": 0.4609375, "learning_rate": 6.548423423423423e-07, "loss": 0.0005, "reward": 3.5057541131973267, "reward_std": 0.09459779784083366, "rewards/final_reward": 1.7178362719872866, "rewards/mask_iou_reward": 0.8589181359936433, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5057538747787476, "rewards/thk_ans_format_reward": 1.0, "step": 1226, "think_completion_length": 6.25 }, { "clip_ratio": 0.0, "completion_length": 134.96875381469727, "epoch": 4.145025295109612, "grad_norm": 56.043396569771154, "kl": 0.4541015625, "learning_rate": 6.545608108108108e-07, "loss": 0.0005, "reward": 3.2239954471588135, "reward_std": 0.10730738565325737, "rewards/final_reward": 0.4325340813971122, "rewards/mask_iou_reward": 0.2162670406985561, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2239955067634583, "rewards/thk_ans_format_reward": 1.0, "step": 1227, "think_completion_length": 8.208333333333334 }, { "clip_ratio": 0.0, "completion_length": 132.1041717529297, "epoch": 4.148397976391231, "grad_norm": 12.000564031917897, "kl": 0.474609375, "learning_rate": 6.542792792792793e-07, "loss": 0.0005, "reward": 3.229053258895874, "reward_std": 0.10904721170663834, "rewards/final_reward": 1.35390044326304, "rewards/mask_iou_reward": 0.67695022163152, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2290531396865845, "rewards/thk_ans_format_reward": 1.0, "step": 1228, "think_completion_length": 8.291666666666668 }, { "clip_ratio": 0.0, "completion_length": 122.61458587646484, "epoch": 4.15177065767285, "grad_norm": 10.836795281979196, "kl": 1.314453125, "learning_rate": 6.539977477477478e-07, "loss": 0.0013, "reward": 3.5818368196487427, "reward_std": 0.16170379519462585, "rewards/final_reward": 1.723583934220796, "rewards/mask_iou_reward": 0.861791967110398, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5818366408348083, "rewards/thk_ans_format_reward": 1.0, "step": 1229, "think_completion_length": 8.208333333333332 }, { "clip_ratio": 0.0, "completion_length": 120.38541793823242, "epoch": 4.155143338954469, "grad_norm": 84.69549949187494, "kl": 0.560546875, "learning_rate": 6.537162162162162e-07, "loss": 0.0006, "reward": 3.1797205209732056, "reward_std": 0.027921637054532766, "rewards/final_reward": 0.01804651799242533, "rewards/mask_iou_reward": 0.009023258996212665, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.179720401763916, "rewards/thk_ans_format_reward": 1.0, "step": 1230, "think_completion_length": 6.75 }, { "clip_ratio": 0.0, "completion_length": 115.82292175292969, "epoch": 4.158516020236088, "grad_norm": 11.292030777174597, "kl": 0.55078125, "learning_rate": 6.534346846846847e-07, "loss": 0.0006, "reward": 3.534628748893738, "reward_std": 0.165926992893219, "rewards/final_reward": 1.533198233303391, "rewards/mask_iou_reward": 0.7665991166516954, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5450453758239746, "rewards/thk_ans_format_reward": 0.9895833432674408, "step": 1231, "think_completion_length": 6.375 }, { "clip_ratio": 0.0, "completion_length": 171.34375, "epoch": 4.161888701517706, "grad_norm": 7.4161788941658235, "kl": 0.5703125, "learning_rate": 6.531531531531531e-07, "loss": 0.0006, "reward": 2.960986852645874, "reward_std": 0.21262395568192005, "rewards/final_reward": 1.2512099620712724, "rewards/mask_iou_reward": 0.6256049810356362, "rewards/sam_format_reward": 0.96875, "rewards/sam_reward_func_ultra": 1.0234868824481964, "rewards/thk_ans_format_reward": 0.96875, "step": 1232, "think_completion_length": 8.458333333333334 }, { "clip_ratio": 0.0, "completion_length": 162.28125762939453, "epoch": 4.165261382799326, "grad_norm": 11.150973269362533, "kl": 0.4033203125, "learning_rate": 6.528716216216216e-07, "loss": 0.0004, "reward": 3.5004022121429443, "reward_std": 0.04145765211433172, "rewards/final_reward": 1.497370642903789, "rewards/mask_iou_reward": 0.7486853214518945, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5004018545150757, "rewards/thk_ans_format_reward": 1.0, "step": 1233, "think_completion_length": 8.458333333333334 }, { "clip_ratio": 0.0, "completion_length": 131.62500762939453, "epoch": 4.168634064080944, "grad_norm": 9.505429834089737, "kl": 1.185546875, "learning_rate": 6.525900900900901e-07, "loss": 0.0012, "reward": 3.405234456062317, "reward_std": 0.11318844556808472, "rewards/final_reward": 1.910689967757608, "rewards/mask_iou_reward": 0.955344983878804, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4052343964576721, "rewards/thk_ans_format_reward": 1.0, "step": 1234, "think_completion_length": 7.583333333333334 }, { "clip_ratio": 0.0, "completion_length": 130.65625, "epoch": 4.172006745362563, "grad_norm": 9.054992454458855, "kl": 0.3935546875, "learning_rate": 6.523085585585585e-07, "loss": 0.0004, "reward": 3.6547305583953857, "reward_std": 0.04657788202166557, "rewards/final_reward": 1.819914800489872, "rewards/mask_iou_reward": 0.909957400244936, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6547306180000305, "rewards/thk_ans_format_reward": 1.0, "step": 1235, "think_completion_length": 8.583333333333332 }, { "clip_ratio": 0.0, "completion_length": 131.2395896911621, "epoch": 4.175379426644182, "grad_norm": 19.602709244183345, "kl": 0.4765625, "learning_rate": 6.52027027027027e-07, "loss": 0.0005, "reward": 3.5941241979599, "reward_std": 0.07574337627738714, "rewards/final_reward": 1.5954112549798443, "rewards/mask_iou_reward": 0.7977056274899221, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5941241383552551, "rewards/thk_ans_format_reward": 1.0, "step": 1236, "think_completion_length": 5.291666666666667 }, { "clip_ratio": 0.0, "completion_length": 129.73958587646484, "epoch": 4.178752107925801, "grad_norm": 13.188752737208313, "kl": 0.541015625, "learning_rate": 6.517454954954955e-07, "loss": 0.0005, "reward": 3.6106929779052734, "reward_std": 0.05192565359175205, "rewards/final_reward": 1.5023569538959587, "rewards/mask_iou_reward": 0.7511784769479793, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6106928586959839, "rewards/thk_ans_format_reward": 1.0, "step": 1237, "think_completion_length": 7.125 }, { "clip_ratio": 0.0, "completion_length": 122.75000381469727, "epoch": 4.18212478920742, "grad_norm": 20.146829558184656, "kl": 0.4345703125, "learning_rate": 6.51463963963964e-07, "loss": 0.0004, "reward": 3.3862054347991943, "reward_std": 0.1351282075047493, "rewards/final_reward": 1.103903692786539, "rewards/mask_iou_reward": 0.5519518463932696, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3862053751945496, "rewards/thk_ans_format_reward": 1.0, "step": 1238, "think_completion_length": 6.458333333333334 }, { "clip_ratio": 0.0, "completion_length": 122.69791793823242, "epoch": 4.185497470489039, "grad_norm": 10.390083694801291, "kl": 0.46875, "learning_rate": 6.511824324324325e-07, "loss": 0.0006, "reward": 3.443265676498413, "reward_std": 0.07178937830030918, "rewards/final_reward": 1.8056116902199864, "rewards/mask_iou_reward": 0.9028058451099932, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4432653784751892, "rewards/thk_ans_format_reward": 1.0, "step": 1239, "think_completion_length": 7.416666666666667 }, { "clip_ratio": 0.0, "completion_length": 130.36458587646484, "epoch": 4.188870151770658, "grad_norm": 6.478982019410555, "kl": 0.533203125, "learning_rate": 6.509009009009009e-07, "loss": 0.0005, "reward": 3.445013165473938, "reward_std": 0.17272941768169403, "rewards/final_reward": 1.452560436189965, "rewards/mask_iou_reward": 0.7262802180949826, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4450132846832275, "rewards/thk_ans_format_reward": 1.0, "step": 1240, "think_completion_length": 6.041666666666666 }, { "clip_ratio": 0.0, "completion_length": 119.125, "epoch": 4.192242833052276, "grad_norm": 8.789730060732012, "kl": 0.484375, "learning_rate": 6.506193693693694e-07, "loss": 0.0005, "reward": 3.440682053565979, "reward_std": 0.09131734818220139, "rewards/final_reward": 1.649042618797143, "rewards/mask_iou_reward": 0.8245213093985715, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4406821131706238, "rewards/thk_ans_format_reward": 1.0, "step": 1241, "think_completion_length": 5.583333333333334 }, { "clip_ratio": 0.0, "completion_length": 169.77083587646484, "epoch": 4.195615514333896, "grad_norm": 15.601941559299878, "kl": 0.435546875, "learning_rate": 6.503378378378378e-07, "loss": 0.0005, "reward": 3.457811713218689, "reward_std": 0.23156233131885529, "rewards/final_reward": 1.834207483629815, "rewards/mask_iou_reward": 0.9171037418149075, "rewards/sam_format_reward": 0.9791666865348816, "rewards/sam_reward_func_ultra": 1.4994783997535706, "rewards/thk_ans_format_reward": 0.9791666865348816, "step": 1242, "think_completion_length": 7.333333333333334 }, { "clip_ratio": 0.0, "completion_length": 144.50000762939453, "epoch": 4.198988195615514, "grad_norm": 88.22556284587016, "kl": 0.46875, "learning_rate": 6.500563063063062e-07, "loss": 0.0005, "reward": 3.180579423904419, "reward_std": 0.12762167118489742, "rewards/final_reward": 0.7456849647915553, "rewards/mask_iou_reward": 0.37284248239577766, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1805793642997742, "rewards/thk_ans_format_reward": 1.0, "step": 1243, "think_completion_length": 6.875 }, { "clip_ratio": 0.0, "completion_length": 134.52083587646484, "epoch": 4.202360876897133, "grad_norm": 23.92275189355112, "kl": 0.470703125, "learning_rate": 6.497747747747747e-07, "loss": 0.0005, "reward": 3.466734290122986, "reward_std": 0.0954241082072258, "rewards/final_reward": 1.2845573218016204, "rewards/mask_iou_reward": 0.6422786609008102, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4667341113090515, "rewards/thk_ans_format_reward": 1.0, "step": 1244, "think_completion_length": 7.041666666666666 }, { "clip_ratio": 0.0, "completion_length": 147.1979217529297, "epoch": 4.2057335581787525, "grad_norm": 12.833793769275767, "kl": 0.4365234375, "learning_rate": 6.494932432432431e-07, "loss": 0.0004, "reward": 3.397306203842163, "reward_std": 0.06800971738994122, "rewards/final_reward": 1.8772874987421284, "rewards/mask_iou_reward": 0.9386437493710642, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3973060846328735, "rewards/thk_ans_format_reward": 1.0, "step": 1245, "think_completion_length": 6.166666666666666 }, { "clip_ratio": 0.0, "completion_length": 151.11459350585938, "epoch": 4.209106239460371, "grad_norm": 13.385538785948139, "kl": 0.4296875, "learning_rate": 6.492117117117116e-07, "loss": 0.0004, "reward": 3.5194268226623535, "reward_std": 0.25978654250502586, "rewards/final_reward": 1.5930146855113068, "rewards/mask_iou_reward": 0.7965073427556534, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 1.5402601957321167, "rewards/thk_ans_format_reward": 0.9895833432674408, "step": 1246, "think_completion_length": 5.583333333333334 }, { "clip_ratio": 0.0, "completion_length": 130.375, "epoch": 4.21247892074199, "grad_norm": 8.409320177212509, "kl": 0.623046875, "learning_rate": 6.489301801801802e-07, "loss": 0.0006, "reward": 3.4033043384552, "reward_std": 0.1465322431176901, "rewards/final_reward": 0.9809823485167375, "rewards/mask_iou_reward": 0.49049117425836875, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4033044576644897, "rewards/thk_ans_format_reward": 1.0, "step": 1247, "think_completion_length": 5.5 }, { "clip_ratio": 0.0, "completion_length": 192.5104217529297, "epoch": 4.2158516020236085, "grad_norm": 60.95788374408251, "kl": 0.4169921875, "learning_rate": 6.486486486486486e-07, "loss": 0.0003, "reward": 3.5154601335525513, "reward_std": 0.18371056020259857, "rewards/final_reward": 1.4486695819317195, "rewards/mask_iou_reward": 0.7243347909658597, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 1.5362934470176697, "rewards/thk_ans_format_reward": 0.9895833432674408, "step": 1248, "think_completion_length": 5.791666666666666 }, { "clip_ratio": 0.0, "completion_length": 184.20833587646484, "epoch": 4.219224283305228, "grad_norm": 18.31368359785162, "kl": 0.439453125, "learning_rate": 6.483671171171171e-07, "loss": 0.0004, "reward": 3.470528244972229, "reward_std": 0.21738072484731674, "rewards/final_reward": 1.4415366605300044, "rewards/mask_iou_reward": 0.7207683302650022, "rewards/sam_format_reward": 0.96875, "rewards/sam_reward_func_ultra": 1.5330281853675842, "rewards/thk_ans_format_reward": 0.96875, "step": 1249, "think_completion_length": 9.75 }, { "clip_ratio": 0.0, "completion_length": 122.54167175292969, "epoch": 4.222596964586846, "grad_norm": 13.461789249031524, "kl": 0.41015625, "learning_rate": 6.480855855855855e-07, "loss": 0.0004, "reward": 3.3450616598129272, "reward_std": 0.07049424014985561, "rewards/final_reward": 1.0090393338430488, "rewards/mask_iou_reward": 0.5045196669215244, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3450616598129272, "rewards/thk_ans_format_reward": 1.0, "step": 1250, "think_completion_length": 6.166666666666666 }, { "clip_ratio": 0.0, "completion_length": 122.86458587646484, "epoch": 4.2259696458684655, "grad_norm": 12.45960904221618, "kl": 0.39453125, "learning_rate": 6.47804054054054e-07, "loss": 0.0004, "reward": 3.5850772857666016, "reward_std": 0.08815484121441841, "rewards/final_reward": 1.1730620960273108, "rewards/mask_iou_reward": 0.5865310480136554, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5850771069526672, "rewards/thk_ans_format_reward": 1.0, "step": 1251, "think_completion_length": 6.75 }, { "clip_ratio": 0.0, "completion_length": 131.52083587646484, "epoch": 4.229342327150085, "grad_norm": 27.854007977705187, "kl": 0.44140625, "learning_rate": 6.475225225225225e-07, "loss": 0.0005, "reward": 3.6427247524261475, "reward_std": 0.06672817096114159, "rewards/final_reward": 1.2199035694148141, "rewards/mask_iou_reward": 0.6099517847074071, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6427247524261475, "rewards/thk_ans_format_reward": 1.0, "step": 1252, "think_completion_length": 6.166666666666666 }, { "clip_ratio": 0.0, "completion_length": 135.5520896911621, "epoch": 4.232715008431703, "grad_norm": 29.167713475717548, "kl": 0.396484375, "learning_rate": 6.472409909909909e-07, "loss": 0.0004, "reward": 3.5975239276885986, "reward_std": 0.03818482160568237, "rewards/final_reward": 1.666841807357827, "rewards/mask_iou_reward": 0.8334209036789135, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5975240468978882, "rewards/thk_ans_format_reward": 1.0, "step": 1253, "think_completion_length": 6.875 }, { "clip_ratio": 0.0, "completion_length": 192.20834350585938, "epoch": 4.236087689713322, "grad_norm": 27.32010950165432, "kl": 0.5859375, "learning_rate": 6.469594594594594e-07, "loss": 0.0006, "reward": 3.4661457538604736, "reward_std": 0.09239022061228752, "rewards/final_reward": 1.206821584190531, "rewards/mask_iou_reward": 0.6034107920952655, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4661457538604736, "rewards/thk_ans_format_reward": 1.0, "step": 1254, "think_completion_length": 6.625 }, { "clip_ratio": 0.0, "completion_length": 110.08333587646484, "epoch": 4.239460370994941, "grad_norm": 21.433106992655098, "kl": 0.5380859375, "learning_rate": 6.466779279279278e-07, "loss": 0.0005, "reward": 3.370681881904602, "reward_std": 0.05036402679979801, "rewards/final_reward": 1.8078184431682338, "rewards/mask_iou_reward": 0.9039092215841169, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3706818222999573, "rewards/thk_ans_format_reward": 1.0, "step": 1255, "think_completion_length": 6.458333333333333 }, { "clip_ratio": 0.0, "completion_length": 135.67708587646484, "epoch": 4.24283305227656, "grad_norm": 11.401224674392935, "kl": 0.400390625, "learning_rate": 6.463963963963963e-07, "loss": 0.0004, "reward": 3.387624502182007, "reward_std": 0.05631537130102515, "rewards/final_reward": 1.8458147221685657, "rewards/mask_iou_reward": 0.9229073610842828, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3876244127750397, "rewards/thk_ans_format_reward": 1.0, "step": 1256, "think_completion_length": 6.25 }, { "clip_ratio": 0.0, "completion_length": 191.2604217529297, "epoch": 4.246205733558178, "grad_norm": 8.2827018516746, "kl": 0.400390625, "learning_rate": 6.461148648648649e-07, "loss": 0.0005, "reward": 3.403711438179016, "reward_std": 0.06762434728443623, "rewards/final_reward": 1.920771464048671, "rewards/mask_iou_reward": 0.9603857320243355, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4037113785743713, "rewards/thk_ans_format_reward": 1.0, "step": 1257, "think_completion_length": 6.125 }, { "clip_ratio": 0.0, "completion_length": 144.56250762939453, "epoch": 4.249578414839798, "grad_norm": 9.468059780611297, "kl": 0.3837890625, "learning_rate": 6.458333333333333e-07, "loss": 0.0004, "reward": 3.150208592414856, "reward_std": 0.10077772289514542, "rewards/final_reward": 1.0828550566399318, "rewards/mask_iou_reward": 0.5414275283199659, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1502084136009216, "rewards/thk_ans_format_reward": 1.0, "step": 1258, "think_completion_length": 6.583333333333333 }, { "clip_ratio": 0.0, "completion_length": 126.85417175292969, "epoch": 4.252951096121416, "grad_norm": 28.135678363113584, "kl": 0.404296875, "learning_rate": 6.455518018018018e-07, "loss": 0.0004, "reward": 3.407992720603943, "reward_std": 0.014361603185534477, "rewards/final_reward": 1.6231428704540085, "rewards/mask_iou_reward": 0.8115714352270043, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4079925417900085, "rewards/thk_ans_format_reward": 1.0, "step": 1259, "think_completion_length": 7.75 }, { "clip_ratio": 0.0, "completion_length": 186.2604217529297, "epoch": 4.256323777403035, "grad_norm": 8.01168528846507, "kl": 0.4619140625, "learning_rate": 6.452702702702702e-07, "loss": 0.0004, "reward": 3.174490809440613, "reward_std": 0.14233126863837242, "rewards/final_reward": 1.0650623705048627, "rewards/mask_iou_reward": 0.5325311852524314, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 1.1953240036964417, "rewards/thk_ans_format_reward": 0.9895833432674408, "step": 1260, "think_completion_length": 5.666666666666666 }, { "clip_ratio": 0.0, "completion_length": 114.35417175292969, "epoch": 4.259696458684655, "grad_norm": 10.994190019550278, "kl": 0.55859375, "learning_rate": 6.449887387387387e-07, "loss": 0.0006, "reward": 3.624230146408081, "reward_std": 0.08744936436414719, "rewards/final_reward": 1.5999830015346452, "rewards/mask_iou_reward": 0.7999915007673226, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6242303252220154, "rewards/thk_ans_format_reward": 1.0, "step": 1261, "think_completion_length": 6.208333333333334 }, { "clip_ratio": 0.0, "completion_length": 125.73958587646484, "epoch": 4.263069139966273, "grad_norm": 10.331129254992351, "kl": 0.45703125, "learning_rate": 6.447072072072072e-07, "loss": 0.0005, "reward": 3.3891860246658325, "reward_std": 0.09818197600543499, "rewards/final_reward": 1.2423954959032273, "rewards/mask_iou_reward": 0.6211977479516136, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3891857862472534, "rewards/thk_ans_format_reward": 1.0, "step": 1262, "think_completion_length": 6.125 }, { "clip_ratio": 0.0, "completion_length": 122.23958587646484, "epoch": 4.266441821247892, "grad_norm": 21.318030836117394, "kl": 0.52734375, "learning_rate": 6.444256756756756e-07, "loss": 0.0006, "reward": 3.590371608734131, "reward_std": 0.03496438264846802, "rewards/final_reward": 1.0998050889015756, "rewards/mask_iou_reward": 0.5499025444507878, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5903714895248413, "rewards/thk_ans_format_reward": 1.0, "step": 1263, "think_completion_length": 6.666666666666666 }, { "clip_ratio": 0.0, "completion_length": 142.68750762939453, "epoch": 4.269814502529511, "grad_norm": 11.324690008730549, "kl": 0.4638671875, "learning_rate": 6.441441441441441e-07, "loss": 0.0005, "reward": 3.3355226516723633, "reward_std": 0.07569969445466995, "rewards/final_reward": 1.583805177521902, "rewards/mask_iou_reward": 0.791902588760951, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3355226516723633, "rewards/thk_ans_format_reward": 1.0, "step": 1264, "think_completion_length": 6.041666666666666 }, { "clip_ratio": 0.0, "completion_length": 121.29166793823242, "epoch": 4.27318718381113, "grad_norm": 56.87368885988965, "kl": 0.419921875, "learning_rate": 6.438626126126126e-07, "loss": 0.0004, "reward": 3.379696846008301, "reward_std": 0.09507296234369278, "rewards/final_reward": 1.3212376749816899, "rewards/mask_iou_reward": 0.6606188374908449, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3796967267990112, "rewards/thk_ans_format_reward": 1.0, "step": 1265, "think_completion_length": 7.125 }, { "clip_ratio": 0.0, "completion_length": 128.46875, "epoch": 4.276559865092748, "grad_norm": 15.725288435528828, "kl": 0.44921875, "learning_rate": 6.43581081081081e-07, "loss": 0.0005, "reward": 3.3752676248550415, "reward_std": 0.09868980012834072, "rewards/final_reward": 1.2282691318397194, "rewards/mask_iou_reward": 0.6141345659198597, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3856841921806335, "rewards/thk_ans_format_reward": 0.9895833432674408, "step": 1266, "think_completion_length": 5.791666666666667 }, { "clip_ratio": 0.0, "completion_length": 140.9791717529297, "epoch": 4.279932546374368, "grad_norm": 15.177451072912048, "kl": 0.4609375, "learning_rate": 6.432995495495496e-07, "loss": 0.0005, "reward": 3.4971253871917725, "reward_std": 0.04681009333580732, "rewards/final_reward": 0.8785843499122169, "rewards/mask_iou_reward": 0.43929217495610845, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4971250295639038, "rewards/thk_ans_format_reward": 1.0, "step": 1267, "think_completion_length": 6.041666666666666 }, { "clip_ratio": 0.0, "completion_length": 152.40625762939453, "epoch": 4.283305227655987, "grad_norm": 11.990275841882601, "kl": 0.453125, "learning_rate": 6.43018018018018e-07, "loss": 0.0005, "reward": 3.2263606786727905, "reward_std": 0.04418545961380005, "rewards/final_reward": 0.4898923797029757, "rewards/mask_iou_reward": 0.24494618985148786, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2263606786727905, "rewards/thk_ans_format_reward": 1.0, "step": 1268, "think_completion_length": 6.375 }, { "clip_ratio": 0.0, "completion_length": 140.42708587646484, "epoch": 4.286677908937605, "grad_norm": 90.20677125257647, "kl": 1.072265625, "learning_rate": 6.427364864864865e-07, "loss": 0.0011, "reward": 3.1920053958892822, "reward_std": 0.2112291418015957, "rewards/final_reward": 0.7658425978021275, "rewards/mask_iou_reward": 0.38292129890106374, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1920052468776703, "rewards/thk_ans_format_reward": 1.0, "step": 1269, "think_completion_length": 7.666666666666667 }, { "clip_ratio": 0.0, "completion_length": 184.9583396911621, "epoch": 4.2900505902192245, "grad_norm": 12.851434274016533, "kl": 0.408203125, "learning_rate": 6.42454954954955e-07, "loss": 0.0004, "reward": 3.476992130279541, "reward_std": 0.24782110750675201, "rewards/final_reward": 1.6170369546662968, "rewards/mask_iou_reward": 0.8085184773331484, "rewards/sam_format_reward": 0.9791666865348816, "rewards/sam_reward_func_ultra": 1.5186585187911987, "rewards/thk_ans_format_reward": 0.9791666865348816, "step": 1270, "think_completion_length": 7.083333333333333 }, { "clip_ratio": 0.0, "completion_length": 142.2916717529297, "epoch": 4.293423271500843, "grad_norm": 11.311288767429877, "kl": 0.3720703125, "learning_rate": 6.421734234234234e-07, "loss": 0.0004, "reward": 3.519341826438904, "reward_std": 0.07771720364689827, "rewards/final_reward": 1.6035272777919118, "rewards/mask_iou_reward": 0.8017636388959559, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5193416476249695, "rewards/thk_ans_format_reward": 1.0, "step": 1271, "think_completion_length": 5.583333333333334 }, { "clip_ratio": 0.0, "completion_length": 148.86458587646484, "epoch": 4.296795952782462, "grad_norm": 14.915353679554494, "kl": 0.486328125, "learning_rate": 6.418918918918919e-07, "loss": 0.0006, "reward": 3.6986976861953735, "reward_std": 0.024407205171883106, "rewards/final_reward": 1.8355800602343713, "rewards/mask_iou_reward": 0.9177900301171856, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6986975073814392, "rewards/thk_ans_format_reward": 1.0, "step": 1272, "think_completion_length": 5.916666666666666 }, { "clip_ratio": 0.0, "completion_length": 149.8229217529297, "epoch": 4.300168634064081, "grad_norm": 22.386881153592117, "kl": 0.501953125, "learning_rate": 6.416103603603603e-07, "loss": 0.0005, "reward": 3.482214570045471, "reward_std": 0.07844844087958336, "rewards/final_reward": 1.8699623549921272, "rewards/mask_iou_reward": 0.9349811774960636, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4822145104408264, "rewards/thk_ans_format_reward": 1.0, "step": 1273, "think_completion_length": 5.5 }, { "clip_ratio": 0.0, "completion_length": 132.09375762939453, "epoch": 4.3035413153457, "grad_norm": 11.281865408345398, "kl": 0.4189453125, "learning_rate": 6.413288288288288e-07, "loss": 0.0004, "reward": 3.472105860710144, "reward_std": 0.19757422804832458, "rewards/final_reward": 1.609137134690231, "rewards/mask_iou_reward": 0.8045685673451155, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.472105860710144, "rewards/thk_ans_format_reward": 1.0, "step": 1274, "think_completion_length": 5.958333333333334 }, { "clip_ratio": 0.0, "completion_length": 121.88542175292969, "epoch": 4.306913996627319, "grad_norm": 10.133217004993806, "kl": 0.5869140625, "learning_rate": 6.410472972972973e-07, "loss": 0.0006, "reward": 3.6658592224121094, "reward_std": 0.17215874418616295, "rewards/final_reward": 1.944337073758779, "rewards/mask_iou_reward": 0.9721685368793895, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.665859341621399, "rewards/thk_ans_format_reward": 1.0, "step": 1275, "think_completion_length": 5.916666666666667 }, { "clip_ratio": 0.0, "completion_length": 137.8854217529297, "epoch": 4.3102866779089375, "grad_norm": 39.645176419264196, "kl": 0.4033203125, "learning_rate": 6.407657657657657e-07, "loss": 0.0004, "reward": 3.4578869342803955, "reward_std": 0.05655907094478607, "rewards/final_reward": 1.2396400380269288, "rewards/mask_iou_reward": 0.6198200190134644, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.457886815071106, "rewards/thk_ans_format_reward": 1.0, "step": 1276, "think_completion_length": 6.416666666666666 }, { "clip_ratio": 0.0, "completion_length": 141.4583396911621, "epoch": 4.313659359190557, "grad_norm": 12.543109846229324, "kl": 0.390625, "learning_rate": 6.404842342342343e-07, "loss": 0.0004, "reward": 3.536691188812256, "reward_std": 0.12136990204453468, "rewards/final_reward": 1.310790119763332, "rewards/mask_iou_reward": 0.655395059881666, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5366910696029663, "rewards/thk_ans_format_reward": 1.0, "step": 1277, "think_completion_length": 5.791666666666666 }, { "clip_ratio": 0.0, "completion_length": 139.09375762939453, "epoch": 4.317032040472175, "grad_norm": 15.233310117516359, "kl": 0.53515625, "learning_rate": 6.402027027027028e-07, "loss": 0.0005, "reward": 3.1646311283111572, "reward_std": 0.10930739156901836, "rewards/final_reward": 0.5577201157215373, "rewards/mask_iou_reward": 0.2788600578607687, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1646308898925781, "rewards/thk_ans_format_reward": 1.0, "step": 1278, "think_completion_length": 6.541666666666666 }, { "clip_ratio": 0.0, "completion_length": 122.44792175292969, "epoch": 4.320404721753794, "grad_norm": 7.540695635548057, "kl": 0.4638671875, "learning_rate": 6.399211711711712e-07, "loss": 0.0005, "reward": 3.6288951635360718, "reward_std": 0.07137066684663296, "rewards/final_reward": 1.7092706493317622, "rewards/mask_iou_reward": 0.8546353246658811, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6288952827453613, "rewards/thk_ans_format_reward": 1.0, "step": 1279, "think_completion_length": 6.208333333333334 }, { "clip_ratio": 0.0, "completion_length": 130.3229217529297, "epoch": 4.323777403035413, "grad_norm": 11.45759006501233, "kl": 0.4765625, "learning_rate": 6.396396396396397e-07, "loss": 0.0005, "reward": 3.403845429420471, "reward_std": 0.15239424258470535, "rewards/final_reward": 1.4765252724296953, "rewards/mask_iou_reward": 0.7382626362148477, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4038452506065369, "rewards/thk_ans_format_reward": 1.0, "step": 1280, "think_completion_length": 5.875 }, { "clip_ratio": 0.0, "completion_length": 129.75, "epoch": 4.327150084317032, "grad_norm": 12.659566610037718, "kl": 0.4296875, "learning_rate": 6.393581081081081e-07, "loss": 0.0004, "reward": 3.546845316886902, "reward_std": 0.1074911393225193, "rewards/final_reward": 1.731462015738102, "rewards/mask_iou_reward": 0.865731007869051, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5468451976776123, "rewards/thk_ans_format_reward": 1.0, "step": 1281, "think_completion_length": 5.833333333333333 }, { "clip_ratio": 0.0, "completion_length": 157.6458396911621, "epoch": 4.330522765598651, "grad_norm": 8.705391227331303, "kl": 0.37890625, "learning_rate": 6.390765765765766e-07, "loss": 0.0004, "reward": 3.6814361810684204, "reward_std": 0.027851653285324574, "rewards/final_reward": 1.8943658471287275, "rewards/mask_iou_reward": 0.9471829235643637, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6814362406730652, "rewards/thk_ans_format_reward": 1.0, "step": 1282, "think_completion_length": 6.375 }, { "clip_ratio": 0.0, "completion_length": 146.08333587646484, "epoch": 4.33389544688027, "grad_norm": 7.4392061951139645, "kl": 0.427734375, "learning_rate": 6.38795045045045e-07, "loss": 0.0004, "reward": 3.3992159366607666, "reward_std": 0.07810256537050009, "rewards/final_reward": 1.6026946067459202, "rewards/mask_iou_reward": 0.8013473033729601, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3992160558700562, "rewards/thk_ans_format_reward": 1.0, "step": 1283, "think_completion_length": 5.25 }, { "clip_ratio": 0.0, "completion_length": 126.47917175292969, "epoch": 4.337268128161889, "grad_norm": 9.69670778371472, "kl": 0.4384765625, "learning_rate": 6.385135135135134e-07, "loss": 0.0004, "reward": 3.6353808641433716, "reward_std": 0.08136463444679976, "rewards/final_reward": 1.7156594083437389, "rewards/mask_iou_reward": 0.8578297041718694, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6353808045387268, "rewards/thk_ans_format_reward": 1.0, "step": 1284, "think_completion_length": 4.916666666666666 }, { "clip_ratio": 0.0, "completion_length": 144.0729217529297, "epoch": 4.340640809443507, "grad_norm": 12.762926580219476, "kl": 0.462890625, "learning_rate": 6.382319819819819e-07, "loss": 0.0005, "reward": 3.420608162879944, "reward_std": 0.18191301077604294, "rewards/final_reward": 1.348219115977413, "rewards/mask_iou_reward": 0.6741095579887065, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 1.4414416551589966, "rewards/thk_ans_format_reward": 0.9895833432674408, "step": 1285, "think_completion_length": 5.916666666666667 }, { "clip_ratio": 0.0, "completion_length": 146.98958587646484, "epoch": 4.344013490725127, "grad_norm": 17.299410660768007, "kl": 0.4404296875, "learning_rate": 6.379504504504503e-07, "loss": 0.0004, "reward": 3.370743989944458, "reward_std": 0.1613960862159729, "rewards/final_reward": 1.7371300769846425, "rewards/mask_iou_reward": 0.8685650384923213, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 1.3915773034095764, "rewards/thk_ans_format_reward": 0.9895833432674408, "step": 1286, "think_completion_length": 5.208333333333334 }, { "clip_ratio": 0.0, "completion_length": 123.57292175292969, "epoch": 4.347386172006745, "grad_norm": 25.087388286333983, "kl": 0.59765625, "learning_rate": 6.376689189189189e-07, "loss": 0.0006, "reward": 3.4849319458007812, "reward_std": 0.09557798132300377, "rewards/final_reward": 1.4741651557191244, "rewards/mask_iou_reward": 0.7370825778595622, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4849318265914917, "rewards/thk_ans_format_reward": 1.0, "step": 1287, "think_completion_length": 7.208333333333334 }, { "clip_ratio": 0.0, "completion_length": 143.95833587646484, "epoch": 4.350758853288364, "grad_norm": 19.652298591259633, "kl": 0.48828125, "learning_rate": 6.373873873873874e-07, "loss": 0.0005, "reward": 3.369731068611145, "reward_std": 0.24395397305488586, "rewards/final_reward": 1.849694773799825, "rewards/mask_iou_reward": 0.9248473868999125, "rewards/sam_format_reward": 0.9791666865348816, "rewards/sam_reward_func_ultra": 1.4113975167274475, "rewards/thk_ans_format_reward": 0.9791666865348816, "step": 1288, "think_completion_length": 6.333333333333333 }, { "clip_ratio": 0.0, "completion_length": 130.2916717529297, "epoch": 4.354131534569984, "grad_norm": 9.013563522241386, "kl": 0.41796875, "learning_rate": 6.371058558558558e-07, "loss": 0.0004, "reward": 3.424813151359558, "reward_std": 0.0731990858912468, "rewards/final_reward": 1.9547891026851874, "rewards/mask_iou_reward": 0.9773945513425937, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4248132109642029, "rewards/thk_ans_format_reward": 1.0, "step": 1289, "think_completion_length": 6.125 }, { "clip_ratio": 0.0, "completion_length": 168.87500762939453, "epoch": 4.357504215851602, "grad_norm": 6.721369378846812, "kl": 0.4375, "learning_rate": 6.368243243243243e-07, "loss": 0.0004, "reward": 3.436237096786499, "reward_std": 0.08576931431889534, "rewards/final_reward": 1.467071944155215, "rewards/mask_iou_reward": 0.7335359720776075, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.436237096786499, "rewards/thk_ans_format_reward": 1.0, "step": 1290, "think_completion_length": 5.666666666666667 }, { "clip_ratio": 0.0, "completion_length": 121.29167175292969, "epoch": 4.360876897133221, "grad_norm": 7.362840756798237, "kl": 0.5146484375, "learning_rate": 6.365427927927927e-07, "loss": 0.0006, "reward": 3.4356051683425903, "reward_std": 0.16319414228200912, "rewards/final_reward": 1.925304111768032, "rewards/mask_iou_reward": 0.962652055884016, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4356051683425903, "rewards/thk_ans_format_reward": 1.0, "step": 1291, "think_completion_length": 5.458333333333334 }, { "clip_ratio": 0.0, "completion_length": 180.7916717529297, "epoch": 4.36424957841484, "grad_norm": 185.28901934065232, "kl": 0.37890625, "learning_rate": 6.362612612612612e-07, "loss": 0.0004, "reward": 3.370970845222473, "reward_std": 0.33027198910713196, "rewards/final_reward": 1.114027995257803, "rewards/mask_iou_reward": 0.5570139976289015, "rewards/sam_format_reward": 0.9791666865348816, "rewards/sam_reward_func_ultra": 1.4126374125480652, "rewards/thk_ans_format_reward": 0.9791666865348816, "step": 1292, "think_completion_length": 6.041666666666667 }, { "clip_ratio": 0.0, "completion_length": 206.7916717529297, "epoch": 4.367622259696459, "grad_norm": 6.847292096928454, "kl": 0.3837890625, "learning_rate": 6.359797297297297e-07, "loss": 0.0004, "reward": 3.2700021266937256, "reward_std": 0.3615192845463753, "rewards/final_reward": 1.1498082631709108, "rewards/mask_iou_reward": 0.5749041315854554, "rewards/sam_format_reward": 0.96875, "rewards/sam_reward_func_ultra": 1.3429186344146729, "rewards/thk_ans_format_reward": 0.9583333432674408, "step": 1293, "think_completion_length": 5.791666666666666 }, { "clip_ratio": 0.0, "completion_length": 141.83333587646484, "epoch": 4.370994940978077, "grad_norm": 8.913277865445579, "kl": 0.552734375, "learning_rate": 6.356981981981981e-07, "loss": 0.0006, "reward": 3.4847034215927124, "reward_std": 0.273854024708271, "rewards/final_reward": 1.7407927712937312, "rewards/mask_iou_reward": 0.8703963856468656, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 1.505536675453186, "rewards/thk_ans_format_reward": 0.9895833432674408, "step": 1294, "think_completion_length": 6.208333333333333 }, { "clip_ratio": 0.0, "completion_length": 181.20833587646484, "epoch": 4.3743676222596966, "grad_norm": 6.8652638016315475, "kl": 0.58203125, "learning_rate": 6.354166666666666e-07, "loss": 0.0006, "reward": 3.5416958332061768, "reward_std": 0.2061656340956688, "rewards/final_reward": 1.8498229882408612, "rewards/mask_iou_reward": 0.9249114941204306, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 1.5625290870666504, "rewards/thk_ans_format_reward": 0.9895833432674408, "step": 1295, "think_completion_length": 6.541666666666666 }, { "clip_ratio": 0.0, "completion_length": 176.46875762939453, "epoch": 4.377740303541315, "grad_norm": 17.815113300408775, "kl": 0.58203125, "learning_rate": 6.35135135135135e-07, "loss": 0.0006, "reward": 3.021910071372986, "reward_std": 0.131107859313488, "rewards/final_reward": 0.8340113627958121, "rewards/mask_iou_reward": 0.41700568139790606, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0219102799892426, "rewards/thk_ans_format_reward": 1.0, "step": 1296, "think_completion_length": 6.458333333333334 }, { "clip_ratio": 0.0, "completion_length": 152.82291793823242, "epoch": 4.381112984822934, "grad_norm": 42.24876306015295, "kl": 0.4775390625, "learning_rate": 6.348536036036036e-07, "loss": 0.0005, "reward": 3.639923334121704, "reward_std": 0.07676161080598831, "rewards/final_reward": 1.888736375245885, "rewards/mask_iou_reward": 0.9443681876229425, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6399233937263489, "rewards/thk_ans_format_reward": 1.0, "step": 1297, "think_completion_length": 6.416666666666667 }, { "clip_ratio": 0.0, "completion_length": 142.33333587646484, "epoch": 4.3844856661045535, "grad_norm": 17.35495373942962, "kl": 0.40234375, "learning_rate": 6.345720720720721e-07, "loss": 0.0004, "reward": 3.204251527786255, "reward_std": 0.03969000466167927, "rewards/final_reward": 1.0037151846973167, "rewards/mask_iou_reward": 0.5018575923486583, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.20425146818161, "rewards/thk_ans_format_reward": 1.0, "step": 1298, "think_completion_length": 6.166666666666666 }, { "clip_ratio": 0.0, "completion_length": 133.6979217529297, "epoch": 4.387858347386172, "grad_norm": 20.977587134561993, "kl": 0.4111328125, "learning_rate": 6.342905405405405e-07, "loss": 0.0004, "reward": 3.381840944290161, "reward_std": 0.07051502913236618, "rewards/final_reward": 0.9913485842903418, "rewards/mask_iou_reward": 0.4956742921451709, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3818408250808716, "rewards/thk_ans_format_reward": 1.0, "step": 1299, "think_completion_length": 5.541666666666666 }, { "clip_ratio": 0.0, "completion_length": 141.46875762939453, "epoch": 4.391231028667791, "grad_norm": 28.793575846253628, "kl": 0.51953125, "learning_rate": 6.34009009009009e-07, "loss": 0.0005, "reward": 3.0043944120407104, "reward_std": 0.12208867445588112, "rewards/final_reward": 1.4536093123239269, "rewards/mask_iou_reward": 0.7268046561619634, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0043944120407104, "rewards/thk_ans_format_reward": 1.0, "step": 1300, "think_completion_length": 6.166666666666666 }, { "clip_ratio": 0.0, "completion_length": 129.3541717529297, "epoch": 4.3946037099494095, "grad_norm": 10.88683265592209, "kl": 0.494140625, "learning_rate": 6.337274774774775e-07, "loss": 0.0005, "reward": 3.437264561653137, "reward_std": 0.12238720059394836, "rewards/final_reward": 1.0413577423258804, "rewards/mask_iou_reward": 0.5206788711629402, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4372643828392029, "rewards/thk_ans_format_reward": 1.0, "step": 1301, "think_completion_length": 5.708333333333333 }, { "clip_ratio": 0.0, "completion_length": 128.54166793823242, "epoch": 4.397976391231029, "grad_norm": 13.458149194489804, "kl": 0.76171875, "learning_rate": 6.334459459459459e-07, "loss": 0.0008, "reward": 3.532782554626465, "reward_std": 0.1034855768084526, "rewards/final_reward": 1.2482119400414557, "rewards/mask_iou_reward": 0.6241059700207279, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5327824354171753, "rewards/thk_ans_format_reward": 1.0, "step": 1302, "think_completion_length": 5.208333333333334 }, { "clip_ratio": 0.0, "completion_length": 128.95833587646484, "epoch": 4.401349072512647, "grad_norm": 29.113274377786073, "kl": 0.8466796875, "learning_rate": 6.331644144144144e-07, "loss": 0.0009, "reward": 3.2336429357528687, "reward_std": 0.10814763605594635, "rewards/final_reward": 1.2469824715224955, "rewards/mask_iou_reward": 0.6234912357612478, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2336428463459015, "rewards/thk_ans_format_reward": 1.0, "step": 1303, "think_completion_length": 7.375 }, { "clip_ratio": 0.0, "completion_length": 177.5104217529297, "epoch": 4.4047217537942664, "grad_norm": 10.438587288068794, "kl": 0.4140625, "learning_rate": 6.328828828828828e-07, "loss": 0.0004, "reward": 3.369965076446533, "reward_std": 0.2397690787911415, "rewards/final_reward": 1.3658386005190497, "rewards/mask_iou_reward": 0.6829193002595249, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 1.3907981514930725, "rewards/thk_ans_format_reward": 0.9895833432674408, "step": 1304, "think_completion_length": 6.666666666666667 }, { "clip_ratio": 0.0, "completion_length": 135.875, "epoch": 4.408094435075886, "grad_norm": 9.686637334920153, "kl": 0.4208984375, "learning_rate": 6.326013513513513e-07, "loss": 0.0005, "reward": 3.1423239707946777, "reward_std": 0.10288365464657545, "rewards/final_reward": 1.4988730307858993, "rewards/mask_iou_reward": 0.7494365153929496, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.142323911190033, "rewards/thk_ans_format_reward": 1.0, "step": 1305, "think_completion_length": 6.25 }, { "clip_ratio": 0.0, "completion_length": 200.48959350585938, "epoch": 4.411467116357504, "grad_norm": 9.02812056455549, "kl": 0.5029296875, "learning_rate": 6.323198198198198e-07, "loss": 0.0005, "reward": 3.1927590370178223, "reward_std": 0.2850857675075531, "rewards/final_reward": 1.5902582021310268, "rewards/mask_iou_reward": 0.7951291010655134, "rewards/sam_format_reward": 0.96875, "rewards/sam_reward_func_ultra": 1.2552589774131775, "rewards/thk_ans_format_reward": 0.96875, "step": 1306, "think_completion_length": 6.0 }, { "clip_ratio": 0.0, "completion_length": 179.62500762939453, "epoch": 4.414839797639123, "grad_norm": 6.8909826541799974, "kl": 0.38671875, "learning_rate": 6.320382882882883e-07, "loss": 0.0004, "reward": 3.4335436820983887, "reward_std": 0.18952222168445587, "rewards/final_reward": 1.7601613200196897, "rewards/mask_iou_reward": 0.8800806600098449, "rewards/sam_format_reward": 0.9791666865348816, "rewards/sam_reward_func_ultra": 1.4543770551681519, "rewards/thk_ans_format_reward": 1.0, "step": 1307, "think_completion_length": 5.458333333333334 }, { "clip_ratio": 0.0, "completion_length": 125.19792175292969, "epoch": 4.418212478920742, "grad_norm": 12.016535270077375, "kl": 0.447265625, "learning_rate": 6.317567567567568e-07, "loss": 0.0005, "reward": 3.2483325004577637, "reward_std": 0.08590874914079905, "rewards/final_reward": 1.1596071175494251, "rewards/mask_iou_reward": 0.5798035587747126, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2483325004577637, "rewards/thk_ans_format_reward": 1.0, "step": 1308, "think_completion_length": 6.083333333333333 }, { "clip_ratio": 0.0, "completion_length": 137.9270896911621, "epoch": 4.421585160202361, "grad_norm": 8.452737652006032, "kl": 0.51171875, "learning_rate": 6.314752252252252e-07, "loss": 0.0005, "reward": 3.5426175594329834, "reward_std": 0.11622235551476479, "rewards/final_reward": 1.5961111244796113, "rewards/mask_iou_reward": 0.7980555622398057, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5426174998283386, "rewards/thk_ans_format_reward": 1.0, "step": 1309, "think_completion_length": 6.0 }, { "clip_ratio": 0.0, "completion_length": 133.39583587646484, "epoch": 4.424957841483979, "grad_norm": 13.0609759023184, "kl": 0.4267578125, "learning_rate": 6.311936936936937e-07, "loss": 0.0004, "reward": 3.4041415452957153, "reward_std": 0.19920283555984497, "rewards/final_reward": 1.66883441199862, "rewards/mask_iou_reward": 0.83441720599931, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4041413068771362, "rewards/thk_ans_format_reward": 1.0, "step": 1310, "think_completion_length": 6.25 }, { "clip_ratio": 0.0, "completion_length": 122.75000381469727, "epoch": 4.428330522765599, "grad_norm": 90.23292436090381, "kl": 0.416015625, "learning_rate": 6.309121621621622e-07, "loss": 0.0004, "reward": 2.907870292663574, "reward_std": 0.10285294055938721, "rewards/final_reward": 0.7910406349387594, "rewards/mask_iou_reward": 0.3955203174693797, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9078702032566071, "rewards/thk_ans_format_reward": 1.0, "step": 1311, "think_completion_length": 6.625 }, { "clip_ratio": 0.0, "completion_length": 171.42708587646484, "epoch": 4.431703204047217, "grad_norm": 17.978114646607587, "kl": 0.412109375, "learning_rate": 6.306306306306306e-07, "loss": 0.0005, "reward": 3.2035940885543823, "reward_std": 0.17428059503436089, "rewards/final_reward": 1.4809223040645283, "rewards/mask_iou_reward": 0.7404611520322641, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2035939693450928, "rewards/thk_ans_format_reward": 1.0, "step": 1312, "think_completion_length": 5.458333333333333 }, { "clip_ratio": 0.0, "completion_length": 150.03125762939453, "epoch": 4.435075885328836, "grad_norm": 9.933008430647831, "kl": 0.486328125, "learning_rate": 6.303490990990991e-07, "loss": 0.0005, "reward": 3.2731047868728638, "reward_std": 0.14033827558159828, "rewards/final_reward": 1.1736736908276793, "rewards/mask_iou_reward": 0.5868368454138396, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2731046676635742, "rewards/thk_ans_format_reward": 1.0, "step": 1313, "think_completion_length": 6.708333333333334 }, { "clip_ratio": 0.0, "completion_length": 124.55208587646484, "epoch": 4.438448566610456, "grad_norm": 31.427049740289576, "kl": 0.4716796875, "learning_rate": 6.300675675675675e-07, "loss": 0.0005, "reward": 3.422204375267029, "reward_std": 0.1572865154594183, "rewards/final_reward": 1.091553492553749, "rewards/mask_iou_reward": 0.5457767462768744, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4222044348716736, "rewards/thk_ans_format_reward": 1.0, "step": 1314, "think_completion_length": 5.333333333333334 }, { "clip_ratio": 0.0, "completion_length": 154.5729217529297, "epoch": 4.441821247892074, "grad_norm": 20.019555623602535, "kl": 0.431640625, "learning_rate": 6.29786036036036e-07, "loss": 0.0004, "reward": 3.2552809715270996, "reward_std": 0.26685456931591034, "rewards/final_reward": 1.2028960215663902, "rewards/mask_iou_reward": 0.6014480107831951, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 1.2865309417247772, "rewards/thk_ans_format_reward": 0.9791666865348816, "step": 1315, "think_completion_length": 6.833333333333334 }, { "clip_ratio": 0.0, "completion_length": 129.40625, "epoch": 4.445193929173693, "grad_norm": 8.662922043418455, "kl": 0.46875, "learning_rate": 6.295045045045045e-07, "loss": 0.0005, "reward": 3.4265466928482056, "reward_std": 0.05025552585721016, "rewards/final_reward": 0.9454525025156142, "rewards/mask_iou_reward": 0.4727262512578071, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.42654687166214, "rewards/thk_ans_format_reward": 1.0, "step": 1316, "think_completion_length": 6.583333333333334 }, { "clip_ratio": 0.0, "completion_length": 157.8333396911621, "epoch": 4.448566610455312, "grad_norm": 36.21317582943972, "kl": 0.501953125, "learning_rate": 6.29222972972973e-07, "loss": 0.0005, "reward": 3.3867627382278442, "reward_std": 0.11366377770900726, "rewards/final_reward": 1.4993498673055825, "rewards/mask_iou_reward": 0.7496749336527913, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.386762797832489, "rewards/thk_ans_format_reward": 1.0, "step": 1317, "think_completion_length": 8.208333333333334 }, { "clip_ratio": 0.0, "completion_length": 125.88541793823242, "epoch": 4.451939291736931, "grad_norm": 16.897114106900812, "kl": 0.498046875, "learning_rate": 6.289414414414415e-07, "loss": 0.0005, "reward": 3.1020525693893433, "reward_std": 0.14741721376776695, "rewards/final_reward": 0.520114738677751, "rewards/mask_iou_reward": 0.2600573693388755, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1020523607730865, "rewards/thk_ans_format_reward": 1.0, "step": 1318, "think_completion_length": 7.208333333333334 }, { "clip_ratio": 0.0, "completion_length": 143.64583587646484, "epoch": 4.455311973018549, "grad_norm": 26.997113281837546, "kl": 0.591796875, "learning_rate": 6.2865990990991e-07, "loss": 0.0006, "reward": 3.3456215858459473, "reward_std": 0.07093912735581398, "rewards/final_reward": 1.579614805626064, "rewards/mask_iou_reward": 0.789807402813032, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3456215262413025, "rewards/thk_ans_format_reward": 1.0, "step": 1319, "think_completion_length": 6.958333333333333 }, { "clip_ratio": 0.0, "completion_length": 193.6666717529297, "epoch": 4.458684654300169, "grad_norm": 8.579021486082592, "kl": 0.4130859375, "learning_rate": 6.283783783783784e-07, "loss": 0.0004, "reward": 3.255575180053711, "reward_std": 0.4096851944923401, "rewards/final_reward": 1.7250021070975252, "rewards/mask_iou_reward": 0.8625010535487626, "rewards/sam_format_reward": 0.9479166865348816, "rewards/sam_reward_func_ultra": 1.3597416281700134, "rewards/thk_ans_format_reward": 0.9479166865348816, "step": 1320, "think_completion_length": 7.083333333333334 }, { "clip_ratio": 0.0, "completion_length": 137.67708587646484, "epoch": 4.462057335581788, "grad_norm": 11.564712852567604, "kl": 0.4521484375, "learning_rate": 6.280968468468469e-07, "loss": 0.0005, "reward": 3.681838870048523, "reward_std": 0.05845123156905174, "rewards/final_reward": 1.5727482671470696, "rewards/mask_iou_reward": 0.7863741335735348, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.681838870048523, "rewards/thk_ans_format_reward": 1.0, "step": 1321, "think_completion_length": 5.916666666666667 }, { "clip_ratio": 0.0, "completion_length": 133.5833396911621, "epoch": 4.465430016863406, "grad_norm": 10.570742234389874, "kl": 0.4306640625, "learning_rate": 6.278153153153153e-07, "loss": 0.0004, "reward": 3.0406211614608765, "reward_std": 0.1231082808226347, "rewards/final_reward": 1.0502326924803995, "rewards/mask_iou_reward": 0.5251163462401998, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0406211614608765, "rewards/thk_ans_format_reward": 1.0, "step": 1322, "think_completion_length": 5.541666666666666 }, { "clip_ratio": 0.0, "completion_length": 132.65625, "epoch": 4.4688026981450255, "grad_norm": 48.40569580023704, "kl": 0.51953125, "learning_rate": 6.275337837837837e-07, "loss": 0.0005, "reward": 3.138838768005371, "reward_std": 0.06746555864810944, "rewards/final_reward": 0.777066802693318, "rewards/mask_iou_reward": 0.388533401346659, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1388386487960815, "rewards/thk_ans_format_reward": 1.0, "step": 1323, "think_completion_length": 7.333333333333333 }, { "clip_ratio": 0.0, "completion_length": 122.23958587646484, "epoch": 4.472175379426644, "grad_norm": 10.429145624903171, "kl": 0.44140625, "learning_rate": 6.272522522522522e-07, "loss": 0.0005, "reward": 3.3782697916030884, "reward_std": 0.07179485633969307, "rewards/final_reward": 1.6552169970137809, "rewards/mask_iou_reward": 0.8276084985068904, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3782697319984436, "rewards/thk_ans_format_reward": 1.0, "step": 1324, "think_completion_length": 7.541666666666667 }, { "clip_ratio": 0.0, "completion_length": 133.53125, "epoch": 4.475548060708263, "grad_norm": 17.4447687740572, "kl": 0.400390625, "learning_rate": 6.269707207207206e-07, "loss": 0.0004, "reward": 3.270228624343872, "reward_std": 0.04852992668747902, "rewards/final_reward": 1.771550318762778, "rewards/mask_iou_reward": 0.885775159381389, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2702285051345825, "rewards/thk_ans_format_reward": 1.0, "step": 1325, "think_completion_length": 5.791666666666666 }, { "clip_ratio": 0.0, "completion_length": 116.36458587646484, "epoch": 4.4789207419898815, "grad_norm": 9.387307475222283, "kl": 0.439453125, "learning_rate": 6.266891891891891e-07, "loss": 0.0004, "reward": 3.5763256549835205, "reward_std": 0.11707734689116478, "rewards/final_reward": 1.6630674648210952, "rewards/mask_iou_reward": 0.8315337324105476, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.576325535774231, "rewards/thk_ans_format_reward": 1.0, "step": 1326, "think_completion_length": 6.375 }, { "clip_ratio": 0.0, "completion_length": 143.89584350585938, "epoch": 4.482293423271501, "grad_norm": 7.513275324902006, "kl": 0.3955078125, "learning_rate": 6.264076576576576e-07, "loss": 0.0004, "reward": 3.507278084754944, "reward_std": 0.08537270268425345, "rewards/final_reward": 1.874773920447942, "rewards/mask_iou_reward": 0.937386960223971, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.507278025150299, "rewards/thk_ans_format_reward": 1.0, "step": 1327, "think_completion_length": 5.541666666666666 }, { "clip_ratio": 0.0, "completion_length": 145.0416717529297, "epoch": 4.48566610455312, "grad_norm": 11.916571704103813, "kl": 0.4365234375, "learning_rate": 6.261261261261261e-07, "loss": 0.0004, "reward": 3.7991596460342407, "reward_std": 0.07272768579423428, "rewards/final_reward": 1.9099894526726762, "rewards/mask_iou_reward": 0.9549947263363381, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7991596460342407, "rewards/thk_ans_format_reward": 1.0, "step": 1328, "think_completion_length": 7.5 }, { "clip_ratio": 0.0, "completion_length": 124.71875, "epoch": 4.4890387858347385, "grad_norm": 24.874831175357656, "kl": 0.419921875, "learning_rate": 6.258445945945946e-07, "loss": 0.0004, "reward": 3.59783673286438, "reward_std": 0.06748372502624989, "rewards/final_reward": 1.6861565863299195, "rewards/mask_iou_reward": 0.8430782931649597, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5978366136550903, "rewards/thk_ans_format_reward": 1.0, "step": 1329, "think_completion_length": 7.25 }, { "clip_ratio": 0.0, "completion_length": 158.40625, "epoch": 4.492411467116358, "grad_norm": 28.278413944207397, "kl": 0.4697265625, "learning_rate": 6.25563063063063e-07, "loss": 0.0005, "reward": 3.4091928005218506, "reward_std": 0.18366630002856255, "rewards/final_reward": 1.167090611647462, "rewards/mask_iou_reward": 0.583545305823731, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 1.4300260543823242, "rewards/thk_ans_format_reward": 0.9895833432674408, "step": 1330, "think_completion_length": 7.125 }, { "clip_ratio": 0.0, "completion_length": 336.67708587646484, "epoch": 4.495784148397976, "grad_norm": 9.278625107287716, "kl": 0.3818359375, "learning_rate": 6.252815315315315e-07, "loss": 0.0004, "reward": 2.7817625999450684, "reward_std": 0.47148652374744415, "rewards/final_reward": 0.9396521678417691, "rewards/mask_iou_reward": 0.46982608392088454, "rewards/sam_format_reward": 0.8750000298023224, "rewards/sam_reward_func_ultra": 1.031762421131134, "rewards/thk_ans_format_reward": 0.8750000298023224, "step": 1331, "think_completion_length": 5.75 }, { "clip_ratio": 0.0, "completion_length": 144.0729217529297, "epoch": 4.499156829679595, "grad_norm": 13.909337670026057, "kl": 0.46484375, "learning_rate": 6.249999999999999e-07, "loss": 0.0005, "reward": 3.4207370281219482, "reward_std": 0.1648988574743271, "rewards/final_reward": 1.8079818096558335, "rewards/mask_iou_reward": 0.9039909048279168, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4207369089126587, "rewards/thk_ans_format_reward": 1.0, "step": 1332, "think_completion_length": 7.0 }, { "clip_ratio": 0.0, "completion_length": 133.56250762939453, "epoch": 4.502529510961214, "grad_norm": 12.323677907036371, "kl": 0.4287109375, "learning_rate": 6.247184684684684e-07, "loss": 0.0004, "reward": 3.5210859775543213, "reward_std": 0.14288469403982162, "rewards/final_reward": 1.5541338166677165, "rewards/mask_iou_reward": 0.7770669083338583, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5210858583450317, "rewards/thk_ans_format_reward": 1.0, "step": 1333, "think_completion_length": 7.208333333333334 }, { "clip_ratio": 0.0, "completion_length": 146.07291793823242, "epoch": 4.505902192242833, "grad_norm": 13.204691731923932, "kl": 0.4228515625, "learning_rate": 6.244369369369369e-07, "loss": 0.0004, "reward": 3.0131133794784546, "reward_std": 0.10102058947086334, "rewards/final_reward": 1.1227487359589867, "rewards/mask_iou_reward": 0.5613743679794934, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 1.0235299468040466, "rewards/thk_ans_format_reward": 1.0, "step": 1334, "think_completion_length": 7.541666666666666 }, { "clip_ratio": 0.0, "completion_length": 138.375, "epoch": 4.509274873524452, "grad_norm": 10.704382763607255, "kl": 0.4423828125, "learning_rate": 6.241554054054053e-07, "loss": 0.0004, "reward": 3.5165557861328125, "reward_std": 0.1345103308558464, "rewards/final_reward": 1.8046991433654505, "rewards/mask_iou_reward": 0.9023495716827252, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5165559649467468, "rewards/thk_ans_format_reward": 1.0, "step": 1335, "think_completion_length": 6.083333333333334 }, { "clip_ratio": 0.0, "completion_length": 176.70833587646484, "epoch": 4.512647554806071, "grad_norm": 31.840986007738767, "kl": 0.4755859375, "learning_rate": 6.238738738738738e-07, "loss": 0.0005, "reward": 3.4962968826293945, "reward_std": 0.15561959147453308, "rewards/final_reward": 1.4510761611293062, "rewards/mask_iou_reward": 0.7255380805646531, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4962967038154602, "rewards/thk_ans_format_reward": 1.0, "step": 1336, "think_completion_length": 6.833333333333333 }, { "clip_ratio": 0.0, "completion_length": 137.47916793823242, "epoch": 4.51602023608769, "grad_norm": 10.79087014899047, "kl": 0.4814453125, "learning_rate": 6.235923423423422e-07, "loss": 0.0005, "reward": 3.343773126602173, "reward_std": 0.031131713651120663, "rewards/final_reward": 0.8331768862643368, "rewards/mask_iou_reward": 0.4165884431321684, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3437729477882385, "rewards/thk_ans_format_reward": 1.0, "step": 1337, "think_completion_length": 7.541666666666667 }, { "clip_ratio": 0.0, "completion_length": 124.6875, "epoch": 4.519392917369308, "grad_norm": 7.266833367822084, "kl": 0.44921875, "learning_rate": 6.233108108108108e-07, "loss": 0.0004, "reward": 3.615713596343994, "reward_std": 0.08736434578895569, "rewards/final_reward": 1.7009432890917302, "rewards/mask_iou_reward": 0.8504716445458651, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.615713357925415, "rewards/thk_ans_format_reward": 1.0, "step": 1338, "think_completion_length": 7.416666666666666 }, { "clip_ratio": 0.0, "completion_length": 178.63541793823242, "epoch": 4.522765598650928, "grad_norm": 36.82887927583875, "kl": 0.451171875, "learning_rate": 6.230292792792793e-07, "loss": 0.0005, "reward": 3.28295361995697, "reward_std": 0.2342146709561348, "rewards/final_reward": 1.0344168897945436, "rewards/mask_iou_reward": 0.5172084448972718, "rewards/sam_format_reward": 0.96875, "rewards/sam_reward_func_ultra": 1.345453679561615, "rewards/thk_ans_format_reward": 0.96875, "step": 1339, "think_completion_length": 6.5 }, { "clip_ratio": 0.0, "completion_length": 157.67708587646484, "epoch": 4.526138279932546, "grad_norm": 10.016939819024225, "kl": 0.451171875, "learning_rate": 6.227477477477477e-07, "loss": 0.0005, "reward": 3.27169930934906, "reward_std": 0.14648236706852913, "rewards/final_reward": 1.2877338441051873, "rewards/mask_iou_reward": 0.6438669220525937, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2716993689537048, "rewards/thk_ans_format_reward": 1.0, "step": 1340, "think_completion_length": 8.916666666666668 }, { "clip_ratio": 0.0, "completion_length": 133.73958587646484, "epoch": 4.529510961214165, "grad_norm": 10.741459866609754, "kl": 0.5107421875, "learning_rate": 6.224662162162162e-07, "loss": 0.0005, "reward": 3.593325614929199, "reward_std": 0.1311676874756813, "rewards/final_reward": 1.751861304958525, "rewards/mask_iou_reward": 0.8759306524792625, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5933258533477783, "rewards/thk_ans_format_reward": 1.0, "step": 1341, "think_completion_length": 6.125 }, { "clip_ratio": 0.0, "completion_length": 182.81250762939453, "epoch": 4.532883642495785, "grad_norm": 18.01725136665382, "kl": 1.0830078125, "learning_rate": 6.221846846846847e-07, "loss": 0.0011, "reward": 3.5322015285491943, "reward_std": 0.21869247313588858, "rewards/final_reward": 1.128467963366134, "rewards/mask_iou_reward": 0.564233981683067, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 1.5530345439910889, "rewards/thk_ans_format_reward": 0.9895833432674408, "step": 1342, "think_completion_length": 6.291666666666666 }, { "clip_ratio": 0.0, "completion_length": 155.86458587646484, "epoch": 4.536256323777403, "grad_norm": 31.487359076929582, "kl": 0.4267578125, "learning_rate": 6.219031531531531e-07, "loss": 0.0004, "reward": 3.56356418132782, "reward_std": 0.06110507994890213, "rewards/final_reward": 1.6827129801909355, "rewards/mask_iou_reward": 0.8413564900954678, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5635643601417542, "rewards/thk_ans_format_reward": 1.0, "step": 1343, "think_completion_length": 6.458333333333334 }, { "clip_ratio": 0.0, "completion_length": 140.45833587646484, "epoch": 4.539629005059022, "grad_norm": 11.837423389388698, "kl": 0.4384765625, "learning_rate": 6.216216216216216e-07, "loss": 0.0004, "reward": 3.5896217823028564, "reward_std": 0.1191295669414103, "rewards/final_reward": 1.7449539403166883, "rewards/mask_iou_reward": 0.8724769701583441, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.589621901512146, "rewards/thk_ans_format_reward": 1.0, "step": 1344, "think_completion_length": 8.208333333333332 }, { "clip_ratio": 0.0, "completion_length": 124.58333587646484, "epoch": 4.543001686340641, "grad_norm": 9.16782009417388, "kl": 0.4365234375, "learning_rate": 6.2134009009009e-07, "loss": 0.0004, "reward": 3.5739206075668335, "reward_std": 0.03104757610708475, "rewards/final_reward": 1.2965923239817678, "rewards/mask_iou_reward": 0.6482961619908839, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5739206671714783, "rewards/thk_ans_format_reward": 1.0, "step": 1345, "think_completion_length": 6.791666666666666 }, { "clip_ratio": 0.0, "completion_length": 117.95833587646484, "epoch": 4.54637436762226, "grad_norm": 12.876924496797981, "kl": 0.41796875, "learning_rate": 6.210585585585585e-07, "loss": 0.0004, "reward": 3.4988181591033936, "reward_std": 0.1466284692287445, "rewards/final_reward": 1.6481023773213026, "rewards/mask_iou_reward": 0.8240511886606513, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4988181591033936, "rewards/thk_ans_format_reward": 1.0, "step": 1346, "think_completion_length": 7.25 }, { "clip_ratio": 0.0, "completion_length": 172.80208587646484, "epoch": 4.549747048903878, "grad_norm": 8.330779387661776, "kl": 0.41796875, "learning_rate": 6.20777027027027e-07, "loss": 0.0004, "reward": 3.434284806251526, "reward_std": 0.1272013932466507, "rewards/final_reward": 1.7430539413468273, "rewards/mask_iou_reward": 0.8715269706734137, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4342848062515259, "rewards/thk_ans_format_reward": 1.0, "step": 1347, "think_completion_length": 6.208333333333333 }, { "clip_ratio": 0.0, "completion_length": 155.30208587646484, "epoch": 4.5531197301854975, "grad_norm": 11.60438500151872, "kl": 0.392578125, "learning_rate": 6.204954954954955e-07, "loss": 0.0004, "reward": 3.43938946723938, "reward_std": 0.10462233331054449, "rewards/final_reward": 1.2083157667735134, "rewards/mask_iou_reward": 0.6041578833867567, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4393895864486694, "rewards/thk_ans_format_reward": 1.0, "step": 1348, "think_completion_length": 6.708333333333334 }, { "clip_ratio": 0.0, "completion_length": 159.23958587646484, "epoch": 4.556492411467117, "grad_norm": 8.767250500131237, "kl": 0.5, "learning_rate": 6.20213963963964e-07, "loss": 0.0005, "reward": 3.2020862102508545, "reward_std": 0.15238425135612488, "rewards/final_reward": 1.4537183116413868, "rewards/mask_iou_reward": 0.7268591558206934, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2020861506462097, "rewards/thk_ans_format_reward": 1.0, "step": 1349, "think_completion_length": 7.416666666666667 }, { "clip_ratio": 0.0, "completion_length": 128.52083587646484, "epoch": 4.559865092748735, "grad_norm": 9.766717118278587, "kl": 0.5546875, "learning_rate": 6.199324324324324e-07, "loss": 0.0006, "reward": 3.612934708595276, "reward_std": 0.06734197214245796, "rewards/final_reward": 1.7688350117457257, "rewards/mask_iou_reward": 0.8844175058728628, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.612934648990631, "rewards/thk_ans_format_reward": 1.0, "step": 1350, "think_completion_length": 6.375 }, { "clip_ratio": 0.0, "completion_length": 134.85416793823242, "epoch": 4.5632377740303545, "grad_norm": 8.70242430872768, "kl": 0.51171875, "learning_rate": 6.196509009009009e-07, "loss": 0.0005, "reward": 3.0413984060287476, "reward_std": 0.26274920254945755, "rewards/final_reward": 1.7419171839168266, "rewards/mask_iou_reward": 0.8709585919584133, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 1.0622316002845764, "rewards/thk_ans_format_reward": 0.9895833432674408, "step": 1351, "think_completion_length": 6.333333333333334 }, { "clip_ratio": 0.0, "completion_length": 141.37500762939453, "epoch": 4.566610455311973, "grad_norm": 11.112925150548403, "kl": 0.4404296875, "learning_rate": 6.193693693693694e-07, "loss": 0.0004, "reward": 3.1636022329330444, "reward_std": 0.1556643471121788, "rewards/final_reward": 1.5595026733749058, "rewards/mask_iou_reward": 0.7797513366874529, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.163602352142334, "rewards/thk_ans_format_reward": 1.0, "step": 1352, "think_completion_length": 8.291666666666666 }, { "clip_ratio": 0.0, "completion_length": 139.5104217529297, "epoch": 4.569983136593592, "grad_norm": 20.78715812576491, "kl": 0.41796875, "learning_rate": 6.190878378378378e-07, "loss": 0.0004, "reward": 3.53654682636261, "reward_std": 0.1526901237666607, "rewards/final_reward": 1.8200043833348483, "rewards/mask_iou_reward": 0.9100021916674241, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5365468263626099, "rewards/thk_ans_format_reward": 1.0, "step": 1353, "think_completion_length": 7.5 }, { "clip_ratio": 0.0, "completion_length": 167.4479217529297, "epoch": 4.5733558178752105, "grad_norm": 17.193569910426874, "kl": 0.4052734375, "learning_rate": 6.188063063063063e-07, "loss": 0.0004, "reward": 3.4392212629318237, "reward_std": 0.15608344972133636, "rewards/final_reward": 1.1345150048448946, "rewards/mask_iou_reward": 0.5672575024224473, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4392213821411133, "rewards/thk_ans_format_reward": 1.0, "step": 1354, "think_completion_length": 8.416666666666666 }, { "clip_ratio": 0.0, "completion_length": 145.53125762939453, "epoch": 4.57672849915683, "grad_norm": 10.423335221856972, "kl": 0.453125, "learning_rate": 6.185247747747748e-07, "loss": 0.0005, "reward": 3.3039698600769043, "reward_std": 0.11607521027326584, "rewards/final_reward": 1.2098033741665402, "rewards/mask_iou_reward": 0.6049016870832701, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3039699792861938, "rewards/thk_ans_format_reward": 1.0, "step": 1355, "think_completion_length": 7.166666666666667 }, { "clip_ratio": 0.0, "completion_length": 185.59375762939453, "epoch": 4.580101180438449, "grad_norm": 9.238720863822698, "kl": 0.4248046875, "learning_rate": 6.182432432432432e-07, "loss": 0.0004, "reward": 3.4512531757354736, "reward_std": 0.10461808368563652, "rewards/final_reward": 1.6735856737578172, "rewards/mask_iou_reward": 0.8367928368789086, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4512531161308289, "rewards/thk_ans_format_reward": 1.0, "step": 1356, "think_completion_length": 6.875 }, { "clip_ratio": 0.0, "completion_length": 123.40625381469727, "epoch": 4.583473861720067, "grad_norm": 17.215769281190024, "kl": 0.4697265625, "learning_rate": 6.179617117117117e-07, "loss": 0.0005, "reward": 3.7004971504211426, "reward_std": 0.05688786879181862, "rewards/final_reward": 1.8174990980498769, "rewards/mask_iou_reward": 0.9087495490249384, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7004972100257874, "rewards/thk_ans_format_reward": 1.0, "step": 1357, "think_completion_length": 6.166666666666666 }, { "clip_ratio": 0.0, "completion_length": 172.33333587646484, "epoch": 4.586846543001687, "grad_norm": 23.536469707220405, "kl": 0.517578125, "learning_rate": 6.176801801801802e-07, "loss": 0.0005, "reward": 3.5313466787338257, "reward_std": 0.1645505577325821, "rewards/final_reward": 1.4728273831129242, "rewards/mask_iou_reward": 0.7364136915564621, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5313466787338257, "rewards/thk_ans_format_reward": 1.0, "step": 1358, "think_completion_length": 7.583333333333333 }, { "clip_ratio": 0.0, "completion_length": 161.83333587646484, "epoch": 4.590219224283305, "grad_norm": 26.026138897910474, "kl": 0.462890625, "learning_rate": 6.173986486486487e-07, "loss": 0.0005, "reward": 3.2835291624069214, "reward_std": 0.4738253206014633, "rewards/final_reward": 1.3618498747858259, "rewards/mask_iou_reward": 0.6809249373929129, "rewards/sam_format_reward": 0.9791666865348816, "rewards/sam_reward_func_ultra": 1.3251959085464478, "rewards/thk_ans_format_reward": 0.9791666865348816, "step": 1359, "think_completion_length": 7.625 }, { "clip_ratio": 0.0, "completion_length": 235.8229217529297, "epoch": 4.593591905564924, "grad_norm": 9.427231475053144, "kl": 0.4375, "learning_rate": 6.171171171171172e-07, "loss": 0.0005, "reward": 3.4778225421905518, "reward_std": 0.34040002152323723, "rewards/final_reward": 1.142595404825373, "rewards/mask_iou_reward": 0.5712977024126865, "rewards/sam_format_reward": 0.9479166865348816, "rewards/sam_reward_func_ultra": 1.581989347934723, "rewards/thk_ans_format_reward": 0.9479166865348816, "step": 1360, "think_completion_length": 6.208333333333333 }, { "clip_ratio": 0.0, "completion_length": 125.91666793823242, "epoch": 4.596964586846543, "grad_norm": 20.106595600779748, "kl": 0.4150390625, "learning_rate": 6.168355855855856e-07, "loss": 0.0004, "reward": 3.325606107711792, "reward_std": 0.12346979975700378, "rewards/final_reward": 1.5796197811707622, "rewards/mask_iou_reward": 0.7898098905853811, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.325605869293213, "rewards/thk_ans_format_reward": 1.0, "step": 1361, "think_completion_length": 5.875 }, { "clip_ratio": 0.0, "completion_length": 152.89583587646484, "epoch": 4.600337268128162, "grad_norm": 8.00573726219941, "kl": 0.515625, "learning_rate": 6.16554054054054e-07, "loss": 0.0005, "reward": 3.4754496812820435, "reward_std": 0.07985536009073257, "rewards/final_reward": 0.9789360400835958, "rewards/mask_iou_reward": 0.4894680200417979, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4754494428634644, "rewards/thk_ans_format_reward": 1.0, "step": 1362, "think_completion_length": 9.291666666666666 }, { "clip_ratio": 0.0, "completion_length": 143.64583587646484, "epoch": 4.60370994940978, "grad_norm": 22.410320780410302, "kl": 0.416015625, "learning_rate": 6.162725225225224e-07, "loss": 0.0004, "reward": 3.6258490085601807, "reward_std": 0.09473934583365917, "rewards/final_reward": 1.558775344475041, "rewards/mask_iou_reward": 0.7793876722375205, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6258489489555359, "rewards/thk_ans_format_reward": 1.0, "step": 1363, "think_completion_length": 7.0 }, { "clip_ratio": 0.0, "completion_length": 164.50000762939453, "epoch": 4.6070826306914, "grad_norm": 124.86281572745158, "kl": 0.3779296875, "learning_rate": 6.159909909909909e-07, "loss": 0.0004, "reward": 3.177658796310425, "reward_std": 0.26666849851608276, "rewards/final_reward": 1.3526290204311469, "rewards/mask_iou_reward": 0.6763145102155734, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 1.198492169380188, "rewards/thk_ans_format_reward": 0.9895833432674408, "step": 1364, "think_completion_length": 7.583333333333334 }, { "clip_ratio": 0.0, "completion_length": 209.5104217529297, "epoch": 4.610455311973018, "grad_norm": 26.843313276077073, "kl": 0.4287109375, "learning_rate": 6.157094594594594e-07, "loss": 0.0004, "reward": 3.3961726427078247, "reward_std": 0.2304963506758213, "rewards/final_reward": 1.4802080106953877, "rewards/mask_iou_reward": 0.7401040053476938, "rewards/sam_format_reward": 0.96875, "rewards/sam_reward_func_ultra": 1.45867258310318, "rewards/thk_ans_format_reward": 0.96875, "step": 1365, "think_completion_length": 8.541666666666668 }, { "clip_ratio": 0.0, "completion_length": 163.55209350585938, "epoch": 4.613827993254637, "grad_norm": 9.086084177990507, "kl": 0.390625, "learning_rate": 6.154279279279278e-07, "loss": 0.0004, "reward": 3.3631885051727295, "reward_std": 0.16303710266947746, "rewards/final_reward": 1.048821753167892, "rewards/mask_iou_reward": 0.524410876583946, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.36318838596344, "rewards/thk_ans_format_reward": 1.0, "step": 1366, "think_completion_length": 8.958333333333332 }, { "clip_ratio": 0.0, "completion_length": 184.7916717529297, "epoch": 4.617200674536257, "grad_norm": 5.668347516194788, "kl": 0.4736328125, "learning_rate": 6.151463963963963e-07, "loss": 0.0005, "reward": 3.458433151245117, "reward_std": 0.13166548311710358, "rewards/final_reward": 1.3981328072953145, "rewards/mask_iou_reward": 0.6990664036476573, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4584329724311829, "rewards/thk_ans_format_reward": 1.0, "step": 1367, "think_completion_length": 8.083333333333334 }, { "clip_ratio": 0.0, "completion_length": 165.23959350585938, "epoch": 4.620573355817875, "grad_norm": 9.115177765492065, "kl": 0.435546875, "learning_rate": 6.148648648648648e-07, "loss": 0.0004, "reward": 3.657396078109741, "reward_std": 0.12225788831710815, "rewards/final_reward": 1.6637135937798875, "rewards/mask_iou_reward": 0.8318567968899437, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6573959589004517, "rewards/thk_ans_format_reward": 1.0, "step": 1368, "think_completion_length": 8.625 }, { "clip_ratio": 0.0, "completion_length": 185.5729217529297, "epoch": 4.623946037099494, "grad_norm": 13.399152863153292, "kl": 0.373046875, "learning_rate": 6.145833333333333e-07, "loss": 0.0004, "reward": 3.6571751832962036, "reward_std": 0.15783963352441788, "rewards/final_reward": 1.6372354588546512, "rewards/mask_iou_reward": 0.8186177294273256, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 1.6780081987380981, "rewards/thk_ans_format_reward": 0.9895833432674408, "step": 1369, "think_completion_length": 9.458333333333332 }, { "clip_ratio": 0.0, "completion_length": 226.56250762939453, "epoch": 4.627318718381113, "grad_norm": 9.441000443753211, "kl": 0.39453125, "learning_rate": 6.143018018018018e-07, "loss": 0.0004, "reward": 3.1151891946792603, "reward_std": 0.34237323701381683, "rewards/final_reward": 1.4576075154407924, "rewards/mask_iou_reward": 0.7288037577203962, "rewards/sam_format_reward": 0.9791666865348816, "rewards/sam_reward_func_ultra": 1.156855821609497, "rewards/thk_ans_format_reward": 0.9791666865348816, "step": 1370, "think_completion_length": 7.375 }, { "clip_ratio": 0.0, "completion_length": 248.13542938232422, "epoch": 4.630691399662732, "grad_norm": 6.958805381363635, "kl": 0.3974609375, "learning_rate": 6.140202702702702e-07, "loss": 0.0004, "reward": 3.1723281145095825, "reward_std": 0.41160689294338226, "rewards/final_reward": 1.6349061756987167, "rewards/mask_iou_reward": 0.8174530878493583, "rewards/sam_format_reward": 0.9687500298023224, "rewards/sam_reward_func_ultra": 1.2348282039165497, "rewards/thk_ans_format_reward": 0.9687500298023224, "step": 1371, "think_completion_length": 6.583333333333334 }, { "clip_ratio": 0.0, "completion_length": 175.09375, "epoch": 4.63406408094435, "grad_norm": 5.81861981981799, "kl": 0.3837890625, "learning_rate": 6.137387387387387e-07, "loss": 0.0004, "reward": 3.5201971530914307, "reward_std": 0.17057428415864706, "rewards/final_reward": 1.792878767361338, "rewards/mask_iou_reward": 0.896439383680669, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 1.5410303473472595, "rewards/thk_ans_format_reward": 0.9895833432674408, "step": 1372, "think_completion_length": 6.791666666666667 }, { "clip_ratio": 0.0, "completion_length": 241.44792938232422, "epoch": 4.63743676222597, "grad_norm": 22.718158938368305, "kl": 0.4345703125, "learning_rate": 6.134572072072072e-07, "loss": 0.0004, "reward": 3.308037281036377, "reward_std": 0.3673863261938095, "rewards/final_reward": 1.1550472984914382, "rewards/mask_iou_reward": 0.5775236492457191, "rewards/sam_format_reward": 0.96875, "rewards/sam_reward_func_ultra": 1.3705371618270874, "rewards/thk_ans_format_reward": 0.96875, "step": 1373, "think_completion_length": 6.083333333333334 }, { "clip_ratio": 0.0, "completion_length": 210.1979217529297, "epoch": 4.640809443507589, "grad_norm": 9.528830517463504, "kl": 0.412109375, "learning_rate": 6.131756756756756e-07, "loss": 0.0004, "reward": 3.2841968536376953, "reward_std": 0.3351758047938347, "rewards/final_reward": 0.7085192624043266, "rewards/mask_iou_reward": 0.3542596312021633, "rewards/sam_format_reward": 0.9583333432674408, "rewards/sam_reward_func_ultra": 1.3675304651260376, "rewards/thk_ans_format_reward": 0.9583333432674408, "step": 1374, "think_completion_length": 9.166666666666666 }, { "clip_ratio": 0.0, "completion_length": 144.4895896911621, "epoch": 4.644182124789207, "grad_norm": 13.657284467532623, "kl": 0.490234375, "learning_rate": 6.128941441441441e-07, "loss": 0.0005, "reward": 3.5196319818496704, "reward_std": 0.383151039481163, "rewards/final_reward": 1.7381896307966391, "rewards/mask_iou_reward": 0.8690948153983196, "rewards/sam_format_reward": 0.9791666865348816, "rewards/sam_reward_func_ultra": 1.5612985491752625, "rewards/thk_ans_format_reward": 0.9791666865348816, "step": 1375, "think_completion_length": 8.166666666666666 }, { "clip_ratio": 0.0, "completion_length": 177.7291717529297, "epoch": 4.6475548060708265, "grad_norm": 7.919045895049307, "kl": 0.4375, "learning_rate": 6.126126126126125e-07, "loss": 0.0004, "reward": 3.3381824493408203, "reward_std": 0.4271131902933121, "rewards/final_reward": 1.0904384314160225, "rewards/mask_iou_reward": 0.5452192157080112, "rewards/sam_format_reward": 0.9791666865348816, "rewards/sam_reward_func_ultra": 1.379849135875702, "rewards/thk_ans_format_reward": 0.9791666865348816, "step": 1376, "think_completion_length": 6.833333333333333 }, { "clip_ratio": 0.0, "completion_length": 146.21875762939453, "epoch": 4.650927487352445, "grad_norm": 8.13795156602028, "kl": 0.41015625, "learning_rate": 6.12331081081081e-07, "loss": 0.0004, "reward": 3.506591320037842, "reward_std": 0.096873689442873, "rewards/final_reward": 1.4166667266640771, "rewards/mask_iou_reward": 0.7083333633320386, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5170079469680786, "rewards/thk_ans_format_reward": 0.9895833432674408, "step": 1377, "think_completion_length": 7.75 }, { "clip_ratio": 0.0, "completion_length": 271.05208587646484, "epoch": 4.654300168634064, "grad_norm": 11.229457774273127, "kl": 0.3623046875, "learning_rate": 6.120495495495496e-07, "loss": 0.0004, "reward": 3.238800048828125, "reward_std": 0.4140184000134468, "rewards/final_reward": 1.082652828058263, "rewards/mask_iou_reward": 0.5413264140291315, "rewards/sam_format_reward": 0.9479166865348816, "rewards/sam_reward_func_ultra": 1.3429667353630066, "rewards/thk_ans_format_reward": 0.9479166865348816, "step": 1378, "think_completion_length": 7.333333333333333 }, { "clip_ratio": 0.0, "completion_length": 196.4791717529297, "epoch": 4.6576728499156825, "grad_norm": 10.954054113328358, "kl": 0.44140625, "learning_rate": 6.11768018018018e-07, "loss": 0.0005, "reward": 3.3927810192108154, "reward_std": 0.27320830151438713, "rewards/final_reward": 1.2286505609184124, "rewards/mask_iou_reward": 0.6143252804592062, "rewards/sam_format_reward": 0.9791666865348816, "rewards/sam_reward_func_ultra": 1.4240307211875916, "rewards/thk_ans_format_reward": 0.9895833432674408, "step": 1379, "think_completion_length": 7.5 }, { "clip_ratio": 0.0, "completion_length": 181.61459350585938, "epoch": 4.661045531197302, "grad_norm": 23.253338505109735, "kl": 0.4228515625, "learning_rate": 6.114864864864865e-07, "loss": 0.0004, "reward": 3.422883987426758, "reward_std": 0.08235886693000793, "rewards/final_reward": 1.3982151428079277, "rewards/mask_iou_reward": 0.6991075714039638, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4228841662406921, "rewards/thk_ans_format_reward": 1.0, "step": 1380, "think_completion_length": 9.708333333333332 }, { "clip_ratio": 0.0, "completion_length": 199.30208587646484, "epoch": 4.664418212478921, "grad_norm": 9.892617377063814, "kl": 0.4013671875, "learning_rate": 6.112049549549549e-07, "loss": 0.0004, "reward": 3.4144232273101807, "reward_std": 0.18493741005659103, "rewards/final_reward": 1.6812816005396218, "rewards/mask_iou_reward": 0.8406408002698109, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 1.4352566003799438, "rewards/thk_ans_format_reward": 0.9895833432674408, "step": 1381, "think_completion_length": 9.458333333333332 }, { "clip_ratio": 0.0, "completion_length": 164.68750762939453, "epoch": 4.6677908937605395, "grad_norm": 7.174720082473362, "kl": 0.416015625, "learning_rate": 6.109234234234234e-07, "loss": 0.0004, "reward": 3.52528715133667, "reward_std": 0.04167925659567118, "rewards/final_reward": 1.6017496577825279, "rewards/mask_iou_reward": 0.8008748288912639, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5252870917320251, "rewards/thk_ans_format_reward": 1.0, "step": 1382, "think_completion_length": 9.041666666666666 }, { "clip_ratio": 0.0, "completion_length": 168.9791717529297, "epoch": 4.671163575042159, "grad_norm": 6.330696699691036, "kl": 0.396484375, "learning_rate": 6.106418918918919e-07, "loss": 0.0004, "reward": 3.408154010772705, "reward_std": 0.10704836994409561, "rewards/final_reward": 1.7884134848846656, "rewards/mask_iou_reward": 0.8942067424423328, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4081538915634155, "rewards/thk_ans_format_reward": 1.0, "step": 1383, "think_completion_length": 10.791666666666668 }, { "clip_ratio": 0.0, "completion_length": 245.0416717529297, "epoch": 4.674536256323777, "grad_norm": 10.35051839748518, "kl": 0.3828125, "learning_rate": 6.103603603603603e-07, "loss": 0.0004, "reward": 3.3026130199432373, "reward_std": 0.36125922203063965, "rewards/final_reward": 1.2660540193293603, "rewards/mask_iou_reward": 0.6330270096646802, "rewards/sam_format_reward": 0.9687500298023224, "rewards/sam_reward_func_ultra": 1.3651129007339478, "rewards/thk_ans_format_reward": 0.9687500298023224, "step": 1384, "think_completion_length": 8.625 }, { "clip_ratio": 0.0, "completion_length": 176.2604217529297, "epoch": 4.677908937605396, "grad_norm": 8.441998141784056, "kl": 0.400390625, "learning_rate": 6.100788288288288e-07, "loss": 0.0004, "reward": 3.605415463447571, "reward_std": 0.15792207419872284, "rewards/final_reward": 1.3898948118203842, "rewards/mask_iou_reward": 0.6949474059101921, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 1.626248836517334, "rewards/thk_ans_format_reward": 0.9895833432674408, "step": 1385, "think_completion_length": 7.75 }, { "clip_ratio": 0.0, "completion_length": 213.1041717529297, "epoch": 4.681281618887015, "grad_norm": 7.202620273596892, "kl": 0.5478515625, "learning_rate": 6.097972972972972e-07, "loss": 0.0006, "reward": 3.121170401573181, "reward_std": 0.3351920619606972, "rewards/final_reward": 1.4420571140304157, "rewards/mask_iou_reward": 0.7210285570152078, "rewards/sam_format_reward": 0.96875, "rewards/sam_reward_func_ultra": 1.1836704313755035, "rewards/thk_ans_format_reward": 0.96875, "step": 1386, "think_completion_length": 9.208333333333332 }, { "clip_ratio": 0.0, "completion_length": 139.1979217529297, "epoch": 4.684654300168634, "grad_norm": 10.040654884372035, "kl": 0.4248046875, "learning_rate": 6.095157657657657e-07, "loss": 0.0004, "reward": 3.5814239978790283, "reward_std": 0.05522888898849487, "rewards/final_reward": 1.634387054315411, "rewards/mask_iou_reward": 0.8171935271577055, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5814239978790283, "rewards/thk_ans_format_reward": 1.0, "step": 1387, "think_completion_length": 9.25 }, { "clip_ratio": 0.0, "completion_length": 159.61458587646484, "epoch": 4.688026981450253, "grad_norm": 12.756624503560145, "kl": 0.4873046875, "learning_rate": 6.092342342342343e-07, "loss": 0.0005, "reward": 3.7236673831939697, "reward_std": 0.1432779412716627, "rewards/final_reward": 1.6886858155096605, "rewards/mask_iou_reward": 0.8443429077548302, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7236673831939697, "rewards/thk_ans_format_reward": 1.0, "step": 1388, "think_completion_length": 8.208333333333332 }, { "clip_ratio": 0.0, "completion_length": 193.1666717529297, "epoch": 4.691399662731872, "grad_norm": 54.60026001446657, "kl": 0.4697265625, "learning_rate": 6.089527027027027e-07, "loss": 0.0005, "reward": 3.6017699241638184, "reward_std": 0.12201762199401855, "rewards/final_reward": 1.8574840466847982, "rewards/mask_iou_reward": 0.9287420233423991, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.601769745349884, "rewards/thk_ans_format_reward": 1.0, "step": 1389, "think_completion_length": 7.5 }, { "clip_ratio": 0.0, "completion_length": 176.42708587646484, "epoch": 4.694772344013491, "grad_norm": 5.655602631361176, "kl": 0.40234375, "learning_rate": 6.086711711711712e-07, "loss": 0.0004, "reward": 3.3374961614608765, "reward_std": 0.231519166380167, "rewards/final_reward": 0.9501356463935893, "rewards/mask_iou_reward": 0.47506782319679464, "rewards/sam_format_reward": 0.9791666865348816, "rewards/sam_reward_func_ultra": 1.379162847995758, "rewards/thk_ans_format_reward": 0.9791666865348816, "step": 1390, "think_completion_length": 9.083333333333334 }, { "clip_ratio": 0.0, "completion_length": 168.75000762939453, "epoch": 4.698145025295109, "grad_norm": 13.136877331272428, "kl": 0.587890625, "learning_rate": 6.083896396396397e-07, "loss": 0.0006, "reward": 3.544087290763855, "reward_std": 0.07956103049218655, "rewards/final_reward": 1.8201977647482337, "rewards/mask_iou_reward": 0.9100988823741168, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5440873503684998, "rewards/thk_ans_format_reward": 1.0, "step": 1391, "think_completion_length": 9.375 }, { "clip_ratio": 0.0, "completion_length": 193.21875, "epoch": 4.701517706576729, "grad_norm": 9.365444962009972, "kl": 0.3564453125, "learning_rate": 6.081081081081081e-07, "loss": 0.0004, "reward": 3.349295735359192, "reward_std": 0.09699325263500214, "rewards/final_reward": 1.2572575228387097, "rewards/mask_iou_reward": 0.6286287614193549, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3492956757545471, "rewards/thk_ans_format_reward": 1.0, "step": 1392, "think_completion_length": 7.583333333333334 }, { "clip_ratio": 0.0, "completion_length": 173.0729217529297, "epoch": 4.704890387858347, "grad_norm": 18.814682442066502, "kl": 0.62109375, "learning_rate": 6.078265765765766e-07, "loss": 0.0006, "reward": 3.586809515953064, "reward_std": 0.20244847238063812, "rewards/final_reward": 1.6999612887206061, "rewards/mask_iou_reward": 0.8499806443603031, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 1.6076428890228271, "rewards/thk_ans_format_reward": 0.9895833432674408, "step": 1393, "think_completion_length": 7.208333333333334 }, { "clip_ratio": 0.0, "completion_length": 177.5, "epoch": 4.708263069139966, "grad_norm": 12.611804363395612, "kl": 0.513671875, "learning_rate": 6.07545045045045e-07, "loss": 0.0005, "reward": 3.6310946941375732, "reward_std": 0.1718064285814762, "rewards/final_reward": 1.7734070202279941, "rewards/mask_iou_reward": 0.8867035101139971, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 1.6519279479980469, "rewards/thk_ans_format_reward": 0.9895833432674408, "step": 1394, "think_completion_length": 8.625 }, { "clip_ratio": 0.0, "completion_length": 280.55208587646484, "epoch": 4.7116357504215856, "grad_norm": 12.701339132671743, "kl": 0.400390625, "learning_rate": 6.072635135135135e-07, "loss": 0.0004, "reward": 2.95338237285614, "reward_std": 0.30559471249580383, "rewards/final_reward": 1.1044576161481927, "rewards/mask_iou_reward": 0.5522288080740964, "rewards/sam_format_reward": 0.9270833432674408, "rewards/sam_reward_func_ultra": 1.1096324920654297, "rewards/thk_ans_format_reward": 0.9166666865348816, "step": 1395, "think_completion_length": 6.916666666666667 }, { "clip_ratio": 0.0, "completion_length": 165.11458587646484, "epoch": 4.715008431703204, "grad_norm": 6.719133591335739, "kl": 0.470703125, "learning_rate": 6.06981981981982e-07, "loss": 0.0005, "reward": 3.0169700384140015, "reward_std": 0.20718349143862724, "rewards/final_reward": 0.8760415241116719, "rewards/mask_iou_reward": 0.43802076205583595, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0169699788093567, "rewards/thk_ans_format_reward": 1.0, "step": 1396, "think_completion_length": 8.666666666666668 }, { "clip_ratio": 0.0, "completion_length": 145.93750762939453, "epoch": 4.718381112984823, "grad_norm": 13.94165033038778, "kl": 0.484375, "learning_rate": 6.067004504504504e-07, "loss": 0.0005, "reward": 3.592189908027649, "reward_std": 0.10593907162547112, "rewards/final_reward": 1.8382220583082103, "rewards/mask_iou_reward": 0.9191110291541051, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.592189610004425, "rewards/thk_ans_format_reward": 1.0, "step": 1397, "think_completion_length": 10.875 }, { "clip_ratio": 0.0, "completion_length": 142.59375, "epoch": 4.721753794266442, "grad_norm": 11.529880285191956, "kl": 0.40234375, "learning_rate": 6.06418918918919e-07, "loss": 0.0004, "reward": 3.289812684059143, "reward_std": 0.07469776272773743, "rewards/final_reward": 1.5201540594804328, "rewards/mask_iou_reward": 0.7600770297402164, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2898123264312744, "rewards/thk_ans_format_reward": 1.0, "step": 1398, "think_completion_length": 8.125 }, { "clip_ratio": 0.0, "completion_length": 151.3854217529297, "epoch": 4.725126475548061, "grad_norm": 14.133682200224834, "kl": 0.6181640625, "learning_rate": 6.061373873873874e-07, "loss": 0.0006, "reward": 3.644963264465332, "reward_std": 0.04531935974955559, "rewards/final_reward": 1.778519984668355, "rewards/mask_iou_reward": 0.8892599923341775, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6449633240699768, "rewards/thk_ans_format_reward": 1.0, "step": 1399, "think_completion_length": 6.416666666666666 }, { "clip_ratio": 0.0, "completion_length": 156.23958587646484, "epoch": 4.728499156829679, "grad_norm": 9.726824492131778, "kl": 0.47265625, "learning_rate": 6.058558558558559e-07, "loss": 0.0005, "reward": 3.299925208091736, "reward_std": 0.08626305125653744, "rewards/final_reward": 0.9017449776055927, "rewards/mask_iou_reward": 0.45087248880279635, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2999252676963806, "rewards/thk_ans_format_reward": 1.0, "step": 1400, "think_completion_length": 7.333333333333334 }, { "clip_ratio": 0.0, "completion_length": 170.2291717529297, "epoch": 4.7318718381112985, "grad_norm": 9.027558807675197, "kl": 0.4716796875, "learning_rate": 6.055743243243244e-07, "loss": 0.0005, "reward": 3.4581342935562134, "reward_std": 0.25569941103458405, "rewards/final_reward": 1.6504690959386323, "rewards/mask_iou_reward": 0.8252345479693162, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 1.4789676666259766, "rewards/thk_ans_format_reward": 0.9895833432674408, "step": 1401, "think_completion_length": 5.875 }, { "clip_ratio": 0.0, "completion_length": 195.1354217529297, "epoch": 4.735244519392918, "grad_norm": 19.171285365895933, "kl": 0.55859375, "learning_rate": 6.052927927927927e-07, "loss": 0.0006, "reward": 3.3989903926849365, "reward_std": 0.04316495731472969, "rewards/final_reward": 1.8268622792643263, "rewards/mask_iou_reward": 0.9134311396321632, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.398990273475647, "rewards/thk_ans_format_reward": 1.0, "step": 1402, "think_completion_length": 8.875 }, { "clip_ratio": 0.0, "completion_length": 252.40625, "epoch": 4.738617200674536, "grad_norm": 10.350478375974465, "kl": 0.431640625, "learning_rate": 6.050112612612612e-07, "loss": 0.0004, "reward": 2.90904438495636, "reward_std": 0.5360357463359833, "rewards/final_reward": 0.6511499265661819, "rewards/mask_iou_reward": 0.32557496328309093, "rewards/sam_format_reward": 0.9479166865348816, "rewards/sam_reward_func_ultra": 1.0027942955493927, "rewards/thk_ans_format_reward": 0.9583333432674408, "step": 1403, "think_completion_length": 8.541666666666666 }, { "clip_ratio": 0.0, "completion_length": 169.46875, "epoch": 4.7419898819561555, "grad_norm": 12.387022468737703, "kl": 0.4169921875, "learning_rate": 6.047297297297296e-07, "loss": 0.0004, "reward": 3.4975666999816895, "reward_std": 0.19361478835344315, "rewards/final_reward": 1.6649348209872064, "rewards/mask_iou_reward": 0.8324674104936032, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.507983148097992, "rewards/thk_ans_format_reward": 0.9895833432674408, "step": 1404, "think_completion_length": 9.916666666666666 }, { "clip_ratio": 0.0, "completion_length": 186.2291717529297, "epoch": 4.745362563237774, "grad_norm": 11.760509932057095, "kl": 0.41796875, "learning_rate": 6.044481981981981e-07, "loss": 0.0005, "reward": 3.5798988342285156, "reward_std": 0.01169863436371088, "rewards/final_reward": 1.5996546826614297, "rewards/mask_iou_reward": 0.7998273413307149, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5798988342285156, "rewards/thk_ans_format_reward": 1.0, "step": 1405, "think_completion_length": 11.708333333333334 }, { "clip_ratio": 0.0, "completion_length": 175.21875762939453, "epoch": 4.748735244519393, "grad_norm": 11.042883825916984, "kl": 0.4013671875, "learning_rate": 6.041666666666666e-07, "loss": 0.0004, "reward": 3.4733498096466064, "reward_std": 0.08175930939614773, "rewards/final_reward": 1.1120204333436654, "rewards/mask_iou_reward": 0.5560102166718327, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4733495712280273, "rewards/thk_ans_format_reward": 1.0, "step": 1406, "think_completion_length": 8.708333333333332 }, { "clip_ratio": 0.0, "completion_length": 221.17709350585938, "epoch": 4.7521079258010115, "grad_norm": 16.189002753636895, "kl": 0.3818359375, "learning_rate": 6.03885135135135e-07, "loss": 0.0004, "reward": 3.2562655210494995, "reward_std": 0.13706044666469097, "rewards/final_reward": 0.955771945037772, "rewards/mask_iou_reward": 0.477885972518886, "rewards/sam_format_reward": 0.9791666865348816, "rewards/sam_reward_func_ultra": 1.3083489537239075, "rewards/thk_ans_format_reward": 0.96875, "step": 1407, "think_completion_length": 7.125 }, { "clip_ratio": 0.0, "completion_length": 207.17708587646484, "epoch": 4.755480607082631, "grad_norm": 10.681538589372371, "kl": 0.5, "learning_rate": 6.036036036036036e-07, "loss": 0.0005, "reward": 2.9784927368164062, "reward_std": 0.16897240281105042, "rewards/final_reward": 0.5949273793559162, "rewards/mask_iou_reward": 0.2974636896779581, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 0.9889095425605774, "rewards/thk_ans_format_reward": 1.0, "step": 1408, "think_completion_length": 8.583333333333334 }, { "clip_ratio": 0.0, "completion_length": 179.20834350585938, "epoch": 4.75885328836425, "grad_norm": 7.624113833794622, "kl": 0.380859375, "learning_rate": 6.03322072072072e-07, "loss": 0.0004, "reward": 3.5433114767074585, "reward_std": 0.17104174941778183, "rewards/final_reward": 1.7751413765493096, "rewards/mask_iou_reward": 0.8875706882746548, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5433114767074585, "rewards/thk_ans_format_reward": 1.0, "step": 1409, "think_completion_length": 6.291666666666666 }, { "clip_ratio": 0.0, "completion_length": 189.84375762939453, "epoch": 4.762225969645868, "grad_norm": 5.271956909636253, "kl": 0.470703125, "learning_rate": 6.030405405405405e-07, "loss": 0.0005, "reward": 3.537356972694397, "reward_std": 0.15852384641766548, "rewards/final_reward": 1.7470706569995778, "rewards/mask_iou_reward": 0.8735353284997889, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.537356972694397, "rewards/thk_ans_format_reward": 1.0, "step": 1410, "think_completion_length": 8.916666666666666 }, { "clip_ratio": 0.0, "completion_length": 214.52084350585938, "epoch": 4.765598650927488, "grad_norm": 60.182323202213084, "kl": 0.451171875, "learning_rate": 6.02759009009009e-07, "loss": 0.0005, "reward": 3.4663702249526978, "reward_std": 0.39432157576084137, "rewards/final_reward": 1.5362254960852642, "rewards/mask_iou_reward": 0.7681127480426321, "rewards/sam_format_reward": 0.9791666865348816, "rewards/sam_reward_func_ultra": 1.5080366730690002, "rewards/thk_ans_format_reward": 0.9791666865348816, "step": 1411, "think_completion_length": 8.041666666666666 }, { "clip_ratio": 0.0, "completion_length": 176.1041717529297, "epoch": 4.768971332209106, "grad_norm": 6.119217135940791, "kl": 0.4111328125, "learning_rate": 6.024774774774774e-07, "loss": 0.0004, "reward": 3.2781678438186646, "reward_std": 0.1051796767860651, "rewards/final_reward": 1.2576380901643076, "rewards/mask_iou_reward": 0.6288190450821538, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2781679034233093, "rewards/thk_ans_format_reward": 1.0, "step": 1412, "think_completion_length": 10.291666666666666 }, { "clip_ratio": 0.0, "completion_length": 166.48959350585938, "epoch": 4.772344013490725, "grad_norm": 26.195928703726985, "kl": 0.4921875, "learning_rate": 6.021959459459459e-07, "loss": 0.0005, "reward": 3.382575273513794, "reward_std": 0.12796253710985184, "rewards/final_reward": 1.0492009290654098, "rewards/mask_iou_reward": 0.5246004645327049, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3825750946998596, "rewards/thk_ans_format_reward": 1.0, "step": 1413, "think_completion_length": 9.875 }, { "clip_ratio": 0.0, "completion_length": 193.90625, "epoch": 4.775716694772344, "grad_norm": 8.068286894497055, "kl": 0.361328125, "learning_rate": 6.019144144144144e-07, "loss": 0.0004, "reward": 3.5492480993270874, "reward_std": 0.13272245228290558, "rewards/final_reward": 1.8132955229145629, "rewards/mask_iou_reward": 0.9066477614572814, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5492480993270874, "rewards/thk_ans_format_reward": 1.0, "step": 1414, "think_completion_length": 6.291666666666667 }, { "clip_ratio": 0.0, "completion_length": 227.8854217529297, "epoch": 4.779089376053963, "grad_norm": 15.414406295867698, "kl": 0.3779296875, "learning_rate": 6.016328828828828e-07, "loss": 0.0004, "reward": 3.150847315788269, "reward_std": 0.22631582617759705, "rewards/final_reward": 1.752640437267114, "rewards/mask_iou_reward": 0.876320218633557, "rewards/sam_format_reward": 0.9791666865348816, "rewards/sam_reward_func_ultra": 1.182097315788269, "rewards/thk_ans_format_reward": 0.9895833432674408, "step": 1415, "think_completion_length": 8.625 }, { "clip_ratio": 0.0, "completion_length": 183.68750762939453, "epoch": 4.782462057335582, "grad_norm": 11.22454762434086, "kl": 0.4189453125, "learning_rate": 6.013513513513513e-07, "loss": 0.0004, "reward": 3.4703203439712524, "reward_std": 0.23415010422468185, "rewards/final_reward": 1.4415937930558345, "rewards/mask_iou_reward": 0.7207968965279172, "rewards/sam_format_reward": 0.96875, "rewards/sam_reward_func_ultra": 1.5015702843666077, "rewards/thk_ans_format_reward": 1.0, "step": 1416, "think_completion_length": 9.041666666666668 }, { "clip_ratio": 0.0, "completion_length": 148.89583587646484, "epoch": 4.785834738617201, "grad_norm": 7.733062562592615, "kl": 0.4580078125, "learning_rate": 6.010698198198197e-07, "loss": 0.0005, "reward": 3.6019084453582764, "reward_std": 0.07121942192316055, "rewards/final_reward": 0.7838488290370744, "rewards/mask_iou_reward": 0.3919244145185372, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6019084453582764, "rewards/thk_ans_format_reward": 1.0, "step": 1417, "think_completion_length": 9.166666666666668 }, { "clip_ratio": 0.0, "completion_length": 125.63542175292969, "epoch": 4.78920741989882, "grad_norm": 8.28171036478787, "kl": 0.4697265625, "learning_rate": 6.007882882882883e-07, "loss": 0.0005, "reward": 3.7588918209075928, "reward_std": 0.015361388213932514, "rewards/final_reward": 1.5930138037725348, "rewards/mask_iou_reward": 0.7965069018862674, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7588915824890137, "rewards/thk_ans_format_reward": 1.0, "step": 1418, "think_completion_length": 8.958333333333332 }, { "clip_ratio": 0.0, "completion_length": 163.0416717529297, "epoch": 4.792580101180438, "grad_norm": 7.398839807072086, "kl": 0.431640625, "learning_rate": 6.005067567567568e-07, "loss": 0.0004, "reward": 3.1072298288345337, "reward_std": 0.22314369678497314, "rewards/final_reward": 0.765689319467859, "rewards/mask_iou_reward": 0.3828446597339295, "rewards/sam_format_reward": 0.9791666865348816, "rewards/sam_reward_func_ultra": 1.1280630826950073, "rewards/thk_ans_format_reward": 1.0, "step": 1419, "think_completion_length": 7.75 }, { "clip_ratio": 0.0, "completion_length": 153.4166717529297, "epoch": 4.795952782462058, "grad_norm": 15.551402360333316, "kl": 0.45703125, "learning_rate": 6.002252252252252e-07, "loss": 0.0005, "reward": 3.691345453262329, "reward_std": 0.0775423776358366, "rewards/final_reward": 1.7338988018583557, "rewards/mask_iou_reward": 0.8669494009291778, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.691345453262329, "rewards/thk_ans_format_reward": 1.0, "step": 1420, "think_completion_length": 9.083333333333332 }, { "clip_ratio": 0.0, "completion_length": 146.27083587646484, "epoch": 4.799325463743676, "grad_norm": 175.63575080043208, "kl": 0.419921875, "learning_rate": 5.999436936936937e-07, "loss": 0.0004, "reward": 3.6315362453460693, "reward_std": 0.0602062102407217, "rewards/final_reward": 1.7756131187543025, "rewards/mask_iou_reward": 0.8878065593771512, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6315361857414246, "rewards/thk_ans_format_reward": 1.0, "step": 1421, "think_completion_length": 8.75 }, { "clip_ratio": 0.0, "completion_length": 128.6666717529297, "epoch": 4.802698145025295, "grad_norm": 22.810287856480006, "kl": 0.548828125, "learning_rate": 5.996621621621621e-07, "loss": 0.0005, "reward": 3.733266234397888, "reward_std": 0.06191633269190788, "rewards/final_reward": 1.4032215032311224, "rewards/mask_iou_reward": 0.7016107516155612, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7332662343978882, "rewards/thk_ans_format_reward": 1.0, "step": 1422, "think_completion_length": 9.708333333333332 }, { "clip_ratio": 0.0, "completion_length": 191.23958587646484, "epoch": 4.806070826306914, "grad_norm": 6.424184368298101, "kl": 0.400390625, "learning_rate": 5.993806306306306e-07, "loss": 0.0004, "reward": 3.5381908416748047, "reward_std": 0.1660333201289177, "rewards/final_reward": 1.5462451945725153, "rewards/mask_iou_reward": 0.7731225972862577, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 1.548607885837555, "rewards/thk_ans_format_reward": 1.0, "step": 1423, "think_completion_length": 9.583333333333334 }, { "clip_ratio": 0.0, "completion_length": 159.58333587646484, "epoch": 4.809443507588533, "grad_norm": 10.449873295033866, "kl": 0.560546875, "learning_rate": 5.990990990990991e-07, "loss": 0.0006, "reward": 3.2715214490890503, "reward_std": 0.05700042471289635, "rewards/final_reward": 1.455750249986035, "rewards/mask_iou_reward": 0.7278751249930175, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2715213894844055, "rewards/thk_ans_format_reward": 1.0, "step": 1424, "think_completion_length": 8.833333333333334 }, { "clip_ratio": 0.0, "completion_length": 149.05208587646484, "epoch": 4.812816188870151, "grad_norm": 13.190764748599955, "kl": 0.513671875, "learning_rate": 5.988175675675675e-07, "loss": 0.0005, "reward": 3.528395652770996, "reward_std": 0.07729190587997437, "rewards/final_reward": 1.8860675174581736, "rewards/mask_iou_reward": 0.9430337587290868, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5283954739570618, "rewards/thk_ans_format_reward": 1.0, "step": 1425, "think_completion_length": 10.458333333333332 }, { "clip_ratio": 0.0, "completion_length": 132.34375, "epoch": 4.8161888701517706, "grad_norm": 5.8108007501621755, "kl": 0.4501953125, "learning_rate": 5.98536036036036e-07, "loss": 0.0004, "reward": 3.5327943563461304, "reward_std": 0.14703011512756348, "rewards/final_reward": 1.2986196006218722, "rewards/mask_iou_reward": 0.6493098003109361, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5432112216949463, "rewards/thk_ans_format_reward": 0.9895833432674408, "step": 1426, "think_completion_length": 8.875 }, { "clip_ratio": 0.0, "completion_length": 168.80208587646484, "epoch": 4.81956155143339, "grad_norm": 19.684142950430235, "kl": 0.48046875, "learning_rate": 5.982545045045045e-07, "loss": 0.0005, "reward": 3.745737075805664, "reward_std": 0.04851808398962021, "rewards/final_reward": 1.7769890462739044, "rewards/mask_iou_reward": 0.8884945231369522, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7457371950149536, "rewards/thk_ans_format_reward": 1.0, "step": 1427, "think_completion_length": 8.791666666666666 }, { "clip_ratio": 0.0, "completion_length": 153.23959350585938, "epoch": 4.822934232715008, "grad_norm": 15.89792351747273, "kl": 0.494140625, "learning_rate": 5.97972972972973e-07, "loss": 0.0005, "reward": 3.3736536502838135, "reward_std": 0.03965951129794121, "rewards/final_reward": 0.9939795966808459, "rewards/mask_iou_reward": 0.49698979834042295, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3736535906791687, "rewards/thk_ans_format_reward": 1.0, "step": 1428, "think_completion_length": 10.5 }, { "clip_ratio": 0.0, "completion_length": 185.3541717529297, "epoch": 4.8263069139966275, "grad_norm": 13.482933812720312, "kl": 0.369140625, "learning_rate": 5.976914414414415e-07, "loss": 0.0004, "reward": 3.585512638092041, "reward_std": 0.06997106038033962, "rewards/final_reward": 1.7870791411846856, "rewards/mask_iou_reward": 0.8935395705923428, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5855128169059753, "rewards/thk_ans_format_reward": 1.0, "step": 1429, "think_completion_length": 8.0 }, { "clip_ratio": 0.0, "completion_length": 157.08333587646484, "epoch": 4.829679595278246, "grad_norm": 19.937100574525502, "kl": 0.4462890625, "learning_rate": 5.974099099099099e-07, "loss": 0.0005, "reward": 3.270692229270935, "reward_std": 0.1783260926604271, "rewards/final_reward": 1.8939412539997733, "rewards/mask_iou_reward": 0.9469706269998867, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.270692229270935, "rewards/thk_ans_format_reward": 1.0, "step": 1430, "think_completion_length": 9.333333333333334 }, { "clip_ratio": 0.0, "completion_length": 176.27084350585938, "epoch": 4.833052276559865, "grad_norm": 13.139160373627659, "kl": 0.5029296875, "learning_rate": 5.971283783783784e-07, "loss": 0.0005, "reward": 3.3824121952056885, "reward_std": 0.21076303720474243, "rewards/final_reward": 1.3698366206645685, "rewards/mask_iou_reward": 0.6849183103322842, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3824121952056885, "rewards/thk_ans_format_reward": 1.0, "step": 1431, "think_completion_length": 7.958333333333334 }, { "clip_ratio": 0.0, "completion_length": 179.17708587646484, "epoch": 4.8364249578414835, "grad_norm": 8.548335447624954, "kl": 0.47265625, "learning_rate": 5.968468468468469e-07, "loss": 0.0005, "reward": 3.6033719778060913, "reward_std": 0.20962823927402496, "rewards/final_reward": 1.5506067036857636, "rewards/mask_iou_reward": 0.7753033518428818, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6033719778060913, "rewards/thk_ans_format_reward": 1.0, "step": 1432, "think_completion_length": 9.666666666666666 }, { "clip_ratio": 0.0, "completion_length": 160.5416717529297, "epoch": 4.839797639123103, "grad_norm": 7.23649023928491, "kl": 0.380859375, "learning_rate": 5.965653153153153e-07, "loss": 0.0004, "reward": 3.7130861282348633, "reward_std": 0.1642559003084898, "rewards/final_reward": 1.9265573826772417, "rewards/mask_iou_reward": 0.9632786913386209, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 1.7339193224906921, "rewards/thk_ans_format_reward": 0.9895833432674408, "step": 1433, "think_completion_length": 8.208333333333334 }, { "clip_ratio": 0.0, "completion_length": 200.80208587646484, "epoch": 4.843170320404722, "grad_norm": 12.446966451587427, "kl": 0.38671875, "learning_rate": 5.962837837837838e-07, "loss": 0.0004, "reward": 3.2934367656707764, "reward_std": 0.1459517478942871, "rewards/final_reward": 0.9782698315033347, "rewards/mask_iou_reward": 0.48913491575166734, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2934367060661316, "rewards/thk_ans_format_reward": 1.0, "step": 1434, "think_completion_length": 8.208333333333334 }, { "clip_ratio": 0.0, "completion_length": 166.17708587646484, "epoch": 4.8465430016863404, "grad_norm": 8.99444927904533, "kl": 0.4326171875, "learning_rate": 5.960022522522522e-07, "loss": 0.0004, "reward": 3.3557018041610718, "reward_std": 0.13160298392176628, "rewards/final_reward": 1.91063537233244, "rewards/mask_iou_reward": 0.95531768616622, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3557018637657166, "rewards/thk_ans_format_reward": 1.0, "step": 1435, "think_completion_length": 9.875 }, { "clip_ratio": 0.0, "completion_length": 156.34375762939453, "epoch": 4.84991568296796, "grad_norm": 8.347854214205052, "kl": 0.3916015625, "learning_rate": 5.957207207207207e-07, "loss": 0.0004, "reward": 3.199750542640686, "reward_std": 0.10312426835298538, "rewards/final_reward": 1.1158206020736225, "rewards/mask_iou_reward": 0.5579103010368113, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.199750542640686, "rewards/thk_ans_format_reward": 1.0, "step": 1436, "think_completion_length": 8.958333333333332 }, { "clip_ratio": 0.0, "completion_length": 157.3854217529297, "epoch": 4.853288364249578, "grad_norm": 10.778873977904322, "kl": 0.5888671875, "learning_rate": 5.954391891891892e-07, "loss": 0.0006, "reward": 3.339724063873291, "reward_std": 0.09653700515627861, "rewards/final_reward": 1.8371860152172692, "rewards/mask_iou_reward": 0.9185930076086346, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3397240042686462, "rewards/thk_ans_format_reward": 1.0, "step": 1437, "think_completion_length": 9.625 }, { "clip_ratio": 0.0, "completion_length": 162.73958587646484, "epoch": 4.856661045531197, "grad_norm": 12.160043758137181, "kl": 0.44921875, "learning_rate": 5.951576576576577e-07, "loss": 0.0005, "reward": 3.5388941764831543, "reward_std": 0.15586276352405548, "rewards/final_reward": 1.532367128831696, "rewards/mask_iou_reward": 0.766183564415848, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5388941764831543, "rewards/thk_ans_format_reward": 1.0, "step": 1438, "think_completion_length": 9.916666666666666 }, { "clip_ratio": 0.0, "completion_length": 164.48958587646484, "epoch": 4.860033726812816, "grad_norm": 10.08293778091881, "kl": 0.4091796875, "learning_rate": 5.948761261261262e-07, "loss": 0.0004, "reward": 3.389430522918701, "reward_std": 0.09414727240800858, "rewards/final_reward": 1.8392757522290495, "rewards/mask_iou_reward": 0.9196378761145247, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3894306421279907, "rewards/thk_ans_format_reward": 1.0, "step": 1439, "think_completion_length": 8.208333333333334 }, { "clip_ratio": 0.0, "completion_length": 149.80208587646484, "epoch": 4.863406408094435, "grad_norm": 7.986576988318299, "kl": 0.51171875, "learning_rate": 5.945945945945947e-07, "loss": 0.0005, "reward": 3.439391613006592, "reward_std": 0.21128800511360168, "rewards/final_reward": 1.0754574248509696, "rewards/mask_iou_reward": 0.5377287124254848, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4393916130065918, "rewards/thk_ans_format_reward": 1.0, "step": 1440, "think_completion_length": 7.458333333333334 }, { "clip_ratio": 0.0, "completion_length": 170.2604217529297, "epoch": 4.866779089376054, "grad_norm": 17.852470988682573, "kl": 0.4482421875, "learning_rate": 5.94313063063063e-07, "loss": 0.0005, "reward": 3.288352608680725, "reward_std": 0.12657052278518677, "rewards/final_reward": 1.0126949471567626, "rewards/mask_iou_reward": 0.5063474735783813, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2883524894714355, "rewards/thk_ans_format_reward": 1.0, "step": 1441, "think_completion_length": 9.666666666666668 }, { "clip_ratio": 0.0, "completion_length": 132.36458587646484, "epoch": 4.870151770657673, "grad_norm": 5.582480495011308, "kl": 0.48828125, "learning_rate": 5.940315315315315e-07, "loss": 0.0005, "reward": 3.4604064226150513, "reward_std": 0.053551677614450455, "rewards/final_reward": 0.9912932618618167, "rewards/mask_iou_reward": 0.4956466309309083, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.460406243801117, "rewards/thk_ans_format_reward": 1.0, "step": 1442, "think_completion_length": 9.375 }, { "clip_ratio": 0.0, "completion_length": 175.11458587646484, "epoch": 4.873524451939292, "grad_norm": 10.726826988763884, "kl": 0.4052734375, "learning_rate": 5.937499999999999e-07, "loss": 0.0004, "reward": 3.3676618337631226, "reward_std": 0.10566180571913719, "rewards/final_reward": 1.4990256896295167, "rewards/mask_iou_reward": 0.7495128448147583, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3676618337631226, "rewards/thk_ans_format_reward": 1.0, "step": 1443, "think_completion_length": 7.416666666666667 }, { "clip_ratio": 0.0, "completion_length": 114.97916793823242, "epoch": 4.87689713322091, "grad_norm": 48.36969289130845, "kl": 0.482421875, "learning_rate": 5.934684684684684e-07, "loss": 0.0005, "reward": 3.831332206726074, "reward_std": 0.05491393432021141, "rewards/final_reward": 1.74433423874409, "rewards/mask_iou_reward": 0.872167119372045, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.83133202791214, "rewards/thk_ans_format_reward": 1.0, "step": 1444, "think_completion_length": 7.666666666666667 }, { "clip_ratio": 0.0, "completion_length": 142.48958587646484, "epoch": 4.88026981450253, "grad_norm": 8.382209616122617, "kl": 0.44140625, "learning_rate": 5.931869369369368e-07, "loss": 0.0005, "reward": 3.520848274230957, "reward_std": 0.060753241181373596, "rewards/final_reward": 1.5039464577605335, "rewards/mask_iou_reward": 0.7519732288802667, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5208481550216675, "rewards/thk_ans_format_reward": 1.0, "step": 1445, "think_completion_length": 8.875 }, { "clip_ratio": 0.0, "completion_length": 159.73958587646484, "epoch": 4.883642495784148, "grad_norm": 14.90449864621606, "kl": 0.435546875, "learning_rate": 5.929054054054053e-07, "loss": 0.0004, "reward": 3.542003035545349, "reward_std": 0.07900802604854107, "rewards/final_reward": 1.5796984591925156, "rewards/mask_iou_reward": 0.7898492295962578, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5420030355453491, "rewards/thk_ans_format_reward": 1.0, "step": 1446, "think_completion_length": 9.041666666666666 }, { "clip_ratio": 0.0, "completion_length": 171.125, "epoch": 4.887015177065767, "grad_norm": 17.873395622327013, "kl": 0.439453125, "learning_rate": 5.926238738738738e-07, "loss": 0.0004, "reward": 3.635874629020691, "reward_std": 0.08757461607456207, "rewards/final_reward": 1.907717467305979, "rewards/mask_iou_reward": 0.9538587336529895, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6358747482299805, "rewards/thk_ans_format_reward": 1.0, "step": 1447, "think_completion_length": 7.0 }, { "clip_ratio": 0.0, "completion_length": 126.54166793823242, "epoch": 4.8903878583473865, "grad_norm": 9.895515957426772, "kl": 0.4208984375, "learning_rate": 5.923423423423422e-07, "loss": 0.0004, "reward": 3.433847188949585, "reward_std": 0.06330831721425056, "rewards/final_reward": 1.2834112597618437, "rewards/mask_iou_reward": 0.6417056298809218, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4338471293449402, "rewards/thk_ans_format_reward": 1.0, "step": 1448, "think_completion_length": 8.083333333333332 }, { "clip_ratio": 0.0, "completion_length": 130.625, "epoch": 4.893760539629005, "grad_norm": 13.782892970677487, "kl": 0.5009765625, "learning_rate": 5.920608108108108e-07, "loss": 0.0005, "reward": 3.507854461669922, "reward_std": 0.06417267397046089, "rewards/final_reward": 1.227488244552132, "rewards/mask_iou_reward": 0.613744122276066, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5078545808792114, "rewards/thk_ans_format_reward": 1.0, "step": 1449, "think_completion_length": 8.75 }, { "clip_ratio": 0.0, "completion_length": 244.1979217529297, "epoch": 4.897133220910624, "grad_norm": 11.388583621848147, "kl": 0.486328125, "learning_rate": 5.917792792792793e-07, "loss": 0.0005, "reward": 3.4151848554611206, "reward_std": 0.3494800329208374, "rewards/final_reward": 1.294598148507474, "rewards/mask_iou_reward": 0.647299074253737, "rewards/sam_format_reward": 0.9583333432674408, "rewards/sam_reward_func_ultra": 1.4881016612052917, "rewards/thk_ans_format_reward": 0.96875, "step": 1450, "think_completion_length": 6.416666666666666 }, { "clip_ratio": 0.0, "completion_length": 167.90625, "epoch": 4.900505902192243, "grad_norm": 13.374894758145272, "kl": 0.400390625, "learning_rate": 5.914977477477477e-07, "loss": 0.0004, "reward": 3.179789900779724, "reward_std": 0.19759593158960342, "rewards/final_reward": 1.0805071135765294, "rewards/mask_iou_reward": 0.5402535567882647, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 1.2006232142448425, "rewards/thk_ans_format_reward": 0.9895833432674408, "step": 1451, "think_completion_length": 8.083333333333332 }, { "clip_ratio": 0.0, "completion_length": 172.84375, "epoch": 4.903878583473862, "grad_norm": 35.74475640711918, "kl": 0.5029296875, "learning_rate": 5.912162162162162e-07, "loss": 0.0005, "reward": 3.46237576007843, "reward_std": 0.08721278607845306, "rewards/final_reward": 1.8576137448561374, "rewards/mask_iou_reward": 0.9288068724280687, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.462375819683075, "rewards/thk_ans_format_reward": 1.0, "step": 1452, "think_completion_length": 9.083333333333332 }, { "clip_ratio": 0.0, "completion_length": 198.11459350585938, "epoch": 4.90725126475548, "grad_norm": 10.470587615041929, "kl": 0.431640625, "learning_rate": 5.909346846846846e-07, "loss": 0.0004, "reward": 2.773538589477539, "reward_std": 0.09802227839827538, "rewards/final_reward": 0.6910804862719137, "rewards/mask_iou_reward": 0.3455402431359568, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.7735386490821838, "rewards/thk_ans_format_reward": 1.0, "step": 1453, "think_completion_length": 9.125 }, { "clip_ratio": 0.0, "completion_length": 189.6354217529297, "epoch": 4.9106239460370995, "grad_norm": 9.188969334508629, "kl": 0.4609375, "learning_rate": 5.906531531531531e-07, "loss": 0.0005, "reward": 3.2502578496932983, "reward_std": 0.19383827969431877, "rewards/final_reward": 1.5927118283877038, "rewards/mask_iou_reward": 0.7963559141938519, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2502577900886536, "rewards/thk_ans_format_reward": 1.0, "step": 1454, "think_completion_length": 8.958333333333334 }, { "clip_ratio": 0.0, "completion_length": 197.55209350585938, "epoch": 4.913996627318719, "grad_norm": 7.67389208349097, "kl": 0.376953125, "learning_rate": 5.903716216216216e-07, "loss": 0.0004, "reward": 3.3607468605041504, "reward_std": 0.19103029370307922, "rewards/final_reward": 1.5983448402015643, "rewards/mask_iou_reward": 0.7991724201007822, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3607465624809265, "rewards/thk_ans_format_reward": 1.0, "step": 1455, "think_completion_length": 9.875 }, { "clip_ratio": 0.0, "completion_length": 163.1354217529297, "epoch": 4.917369308600337, "grad_norm": 9.04516507270446, "kl": 0.4609375, "learning_rate": 5.9009009009009e-07, "loss": 0.0005, "reward": 3.50595223903656, "reward_std": 0.0903414785861969, "rewards/final_reward": 1.373421515151684, "rewards/mask_iou_reward": 0.686710757575842, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5059520602226257, "rewards/thk_ans_format_reward": 1.0, "step": 1456, "think_completion_length": 9.375 }, { "clip_ratio": 0.0, "completion_length": 168.80208587646484, "epoch": 4.920741989881956, "grad_norm": 15.530085309166886, "kl": 0.4208984375, "learning_rate": 5.898085585585585e-07, "loss": 0.0004, "reward": 3.2726866006851196, "reward_std": 0.21423480100929737, "rewards/final_reward": 0.6115067894808941, "rewards/mask_iou_reward": 0.30575339474044705, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 1.293519675731659, "rewards/thk_ans_format_reward": 0.9895833432674408, "step": 1457, "think_completion_length": 10.333333333333332 }, { "clip_ratio": 0.0, "completion_length": 184.89584350585938, "epoch": 4.924114671163575, "grad_norm": 14.647227093297408, "kl": 0.3671875, "learning_rate": 5.895270270270269e-07, "loss": 0.0004, "reward": 3.3387356996536255, "reward_std": 0.21982141956686974, "rewards/final_reward": 1.7117802352980065, "rewards/mask_iou_reward": 0.8558901176490032, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 1.3595690727233887, "rewards/thk_ans_format_reward": 0.9895833432674408, "step": 1458, "think_completion_length": 10.083333333333334 }, { "clip_ratio": 0.0, "completion_length": 187.1979217529297, "epoch": 4.927487352445194, "grad_norm": 26.526587616046104, "kl": 0.82421875, "learning_rate": 5.892454954954955e-07, "loss": 0.0008, "reward": 3.442506194114685, "reward_std": 0.19030030816793442, "rewards/final_reward": 1.678046948788117, "rewards/mask_iou_reward": 0.8390234743940586, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4425063133239746, "rewards/thk_ans_format_reward": 1.0, "step": 1459, "think_completion_length": 7.916666666666667 }, { "clip_ratio": 0.0, "completion_length": 188.3125, "epoch": 4.9308600337268125, "grad_norm": 12.92104118288375, "kl": 0.3779296875, "learning_rate": 5.88963963963964e-07, "loss": 0.0004, "reward": 3.5892350673675537, "reward_std": 0.15997378155589104, "rewards/final_reward": 1.3809674531930873, "rewards/mask_iou_reward": 0.6904837265965437, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.589235007762909, "rewards/thk_ans_format_reward": 1.0, "step": 1460, "think_completion_length": 8.916666666666666 }, { "clip_ratio": 0.0, "completion_length": 145.1979217529297, "epoch": 4.934232715008432, "grad_norm": 10.699530552849511, "kl": 0.46484375, "learning_rate": 5.886824324324324e-07, "loss": 0.0005, "reward": 3.7261245250701904, "reward_std": 0.0693025141954422, "rewards/final_reward": 1.8081499676675472, "rewards/mask_iou_reward": 0.9040749838337736, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7261244654655457, "rewards/thk_ans_format_reward": 1.0, "step": 1461, "think_completion_length": 8.0 }, { "clip_ratio": 0.0, "completion_length": 236.3541717529297, "epoch": 4.937605396290051, "grad_norm": 15.085039386224084, "kl": 0.357421875, "learning_rate": 5.884009009009009e-07, "loss": 0.0004, "reward": 3.5327210426330566, "reward_std": 0.16988344490528107, "rewards/final_reward": 1.0301080061032815, "rewards/mask_iou_reward": 0.5150540030516407, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5327209830284119, "rewards/thk_ans_format_reward": 1.0, "step": 1462, "think_completion_length": 7.25 }, { "clip_ratio": 0.0, "completion_length": 133.12500762939453, "epoch": 4.940978077571669, "grad_norm": 33.75329709138452, "kl": 0.572265625, "learning_rate": 5.881193693693694e-07, "loss": 0.0006, "reward": 3.8426930904388428, "reward_std": 0.07315381523221731, "rewards/final_reward": 1.9806365523159024, "rewards/mask_iou_reward": 0.9903182761579512, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.842693030834198, "rewards/thk_ans_format_reward": 1.0, "step": 1463, "think_completion_length": 8.458333333333332 }, { "clip_ratio": 0.0, "completion_length": 178.6354217529297, "epoch": 4.944350758853289, "grad_norm": 8.529144022685584, "kl": 0.3935546875, "learning_rate": 5.878378378378378e-07, "loss": 0.0004, "reward": 3.2548428773880005, "reward_std": 0.257319413125515, "rewards/final_reward": 1.4339055670117853, "rewards/mask_iou_reward": 0.7169527835058926, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 1.2756760120391846, "rewards/thk_ans_format_reward": 0.9895833432674408, "step": 1464, "think_completion_length": 9.0 }, { "clip_ratio": 0.0, "completion_length": 234.77084350585938, "epoch": 4.947723440134907, "grad_norm": 23.35727370278157, "kl": 0.435546875, "learning_rate": 5.875563063063063e-07, "loss": 0.0004, "reward": 3.2339917421340942, "reward_std": 0.3385896082036197, "rewards/final_reward": 1.5851923339722218, "rewards/mask_iou_reward": 0.7925961669861109, "rewards/sam_format_reward": 0.9583333432674408, "rewards/sam_reward_func_ultra": 1.3173249959945679, "rewards/thk_ans_format_reward": 0.9583333432674408, "step": 1465, "think_completion_length": 9.583333333333332 }, { "clip_ratio": 0.0, "completion_length": 208.1666717529297, "epoch": 4.951096121416526, "grad_norm": 10.193660842015277, "kl": 0.37890625, "learning_rate": 5.872747747747747e-07, "loss": 0.0004, "reward": 3.423218607902527, "reward_std": 0.21466679126024246, "rewards/final_reward": 1.043962518203048, "rewards/mask_iou_reward": 0.521981259101524, "rewards/sam_format_reward": 0.9791666865348816, "rewards/sam_reward_func_ultra": 1.464885175228119, "rewards/thk_ans_format_reward": 0.9791666865348816, "step": 1466, "think_completion_length": 8.666666666666666 }, { "clip_ratio": 0.0, "completion_length": 288.92708587646484, "epoch": 4.954468802698145, "grad_norm": 10.053228747894176, "kl": 0.4296875, "learning_rate": 5.869932432432432e-07, "loss": 0.0004, "reward": 3.3755905628204346, "reward_std": 0.48930785059928894, "rewards/final_reward": 1.3053080176855865, "rewards/mask_iou_reward": 0.6526540088427932, "rewards/sam_format_reward": 0.9583333730697632, "rewards/sam_reward_func_ultra": 1.458924114704132, "rewards/thk_ans_format_reward": 0.9583333730697632, "step": 1467, "think_completion_length": 8.583333333333332 }, { "clip_ratio": 0.0, "completion_length": 232.45834350585938, "epoch": 4.957841483979764, "grad_norm": 9.13263919322423, "kl": 0.3623046875, "learning_rate": 5.867117117117117e-07, "loss": 0.0004, "reward": 3.380911111831665, "reward_std": 0.14448396861553192, "rewards/final_reward": 0.9460272971516208, "rewards/mask_iou_reward": 0.4730136485758104, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3809112310409546, "rewards/thk_ans_format_reward": 1.0, "step": 1468, "think_completion_length": 9.916666666666666 }, { "clip_ratio": 0.0, "completion_length": 181.64583587646484, "epoch": 4.961214165261383, "grad_norm": 12.851598480616993, "kl": 0.548828125, "learning_rate": 5.864301801801802e-07, "loss": 0.0006, "reward": 3.380996346473694, "reward_std": 0.0686973761767149, "rewards/final_reward": 1.735709440736965, "rewards/mask_iou_reward": 0.8678547203684825, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.380996286869049, "rewards/thk_ans_format_reward": 1.0, "step": 1469, "think_completion_length": 9.041666666666666 }, { "clip_ratio": 0.0, "completion_length": 239.52084350585938, "epoch": 4.964586846543002, "grad_norm": 6.908240979975726, "kl": 0.40234375, "learning_rate": 5.861486486486487e-07, "loss": 0.0004, "reward": 3.3168845176696777, "reward_std": 0.33022212237119675, "rewards/final_reward": 1.392443548241194, "rewards/mask_iou_reward": 0.696221774120597, "rewards/sam_format_reward": 0.9583333432674408, "rewards/sam_reward_func_ultra": 1.4002178311347961, "rewards/thk_ans_format_reward": 0.9583333432674408, "step": 1470, "think_completion_length": 9.625 }, { "clip_ratio": 0.0, "completion_length": 219.40625, "epoch": 4.967959527824621, "grad_norm": 5.682180474450704, "kl": 0.44921875, "learning_rate": 5.858671171171171e-07, "loss": 0.0004, "reward": 3.4397724866867065, "reward_std": 0.2822958081960678, "rewards/final_reward": 1.8190369445658745, "rewards/mask_iou_reward": 0.9095184722829373, "rewards/sam_format_reward": 0.9791666865348816, "rewards/sam_reward_func_ultra": 1.4814391732215881, "rewards/thk_ans_format_reward": 0.9791666865348816, "step": 1471, "think_completion_length": 8.166666666666666 }, { "clip_ratio": 0.0, "completion_length": 223.62500762939453, "epoch": 4.971332209106239, "grad_norm": 4.219765484758162, "kl": 0.427734375, "learning_rate": 5.855855855855856e-07, "loss": 0.0004, "reward": 2.8198740482330322, "reward_std": 0.29118700325489044, "rewards/final_reward": 1.2281644554676672, "rewards/mask_iou_reward": 0.6140822277338336, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 0.8407072424888611, "rewards/thk_ans_format_reward": 0.9895833432674408, "step": 1472, "think_completion_length": 8.291666666666666 }, { "clip_ratio": 0.0, "completion_length": 203.50000762939453, "epoch": 4.974704890387859, "grad_norm": 17.90019864721094, "kl": 0.3623046875, "learning_rate": 5.853040540540541e-07, "loss": 0.0004, "reward": 3.3107402324676514, "reward_std": 0.3046105355024338, "rewards/final_reward": 1.2522745466299563, "rewards/mask_iou_reward": 0.6261372733149781, "rewards/sam_format_reward": 0.9687500298023224, "rewards/sam_reward_func_ultra": 1.3628233671188354, "rewards/thk_ans_format_reward": 0.9791666865348816, "step": 1473, "think_completion_length": 9.208333333333332 }, { "clip_ratio": 0.0, "completion_length": 290.53125762939453, "epoch": 4.978077571669477, "grad_norm": 24.712536897200188, "kl": 0.373046875, "learning_rate": 5.850225225225225e-07, "loss": 0.0004, "reward": 3.2573235034942627, "reward_std": 0.2703036963939667, "rewards/final_reward": 1.5502094876731096, "rewards/mask_iou_reward": 0.7751047438365548, "rewards/sam_format_reward": 0.8750000298023224, "rewards/sam_reward_func_ultra": 1.4864901304244995, "rewards/thk_ans_format_reward": 0.8958333432674408, "step": 1474, "think_completion_length": 8.25 }, { "clip_ratio": 0.0, "completion_length": 234.6979217529297, "epoch": 4.981450252951096, "grad_norm": 14.704284009815126, "kl": 0.455078125, "learning_rate": 5.84740990990991e-07, "loss": 0.0005, "reward": 3.2285962104797363, "reward_std": 0.24824640899896622, "rewards/final_reward": 1.611938276483454, "rewards/mask_iou_reward": 0.805969138241727, "rewards/sam_format_reward": 0.9791666865348816, "rewards/sam_reward_func_ultra": 1.2702626585960388, "rewards/thk_ans_format_reward": 0.9791666865348816, "step": 1475, "think_completion_length": 9.0 }, { "clip_ratio": 0.0, "completion_length": 134.8854217529297, "epoch": 4.9848229342327155, "grad_norm": 6.630361314100409, "kl": 0.4365234375, "learning_rate": 5.844594594594594e-07, "loss": 0.0004, "reward": 3.642152428627014, "reward_std": 0.11073607206344604, "rewards/final_reward": 1.795592902342593, "rewards/mask_iou_reward": 0.8977964511712965, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6421523690223694, "rewards/thk_ans_format_reward": 1.0, "step": 1476, "think_completion_length": 8.291666666666666 }, { "clip_ratio": 0.0, "completion_length": 143.1979217529297, "epoch": 4.988195615514334, "grad_norm": 13.41912847632758, "kl": 0.544921875, "learning_rate": 5.841779279279279e-07, "loss": 0.0006, "reward": 3.583902359008789, "reward_std": 0.09717679023742676, "rewards/final_reward": 1.813288793658495, "rewards/mask_iou_reward": 0.9066443968292475, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.58390212059021, "rewards/thk_ans_format_reward": 1.0, "step": 1477, "think_completion_length": 8.875 }, { "clip_ratio": 0.0, "completion_length": 161.20833587646484, "epoch": 4.991568296795953, "grad_norm": 11.096277828348555, "kl": 0.400390625, "learning_rate": 5.838963963963964e-07, "loss": 0.0004, "reward": 3.486131191253662, "reward_std": 0.14123235642910004, "rewards/final_reward": 1.5401189557808541, "rewards/mask_iou_reward": 0.7700594778904271, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4861310720443726, "rewards/thk_ans_format_reward": 1.0, "step": 1478, "think_completion_length": 9.25 }, { "clip_ratio": 0.0, "completion_length": 188.53125762939453, "epoch": 4.9949409780775715, "grad_norm": 31.457315141376977, "kl": 0.41796875, "learning_rate": 5.836148648648649e-07, "loss": 0.0004, "reward": 3.2034597396850586, "reward_std": 0.222567617893219, "rewards/final_reward": 1.3724964809920568, "rewards/mask_iou_reward": 0.6862482404960284, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 1.2138763666152954, "rewards/thk_ans_format_reward": 1.0, "step": 1479, "think_completion_length": 8.458333333333332 }, { "clip_ratio": 0.0, "completion_length": 208.15789794921875, "epoch": 4.998313659359191, "grad_norm": 16.960939494270033, "kl": 0.431640625, "learning_rate": 5.833333333333334e-07, "loss": 0.0004, "reward": 3.2109347581863403, "reward_std": 0.05437912791967392, "rewards/final_reward": 0.9108779879936213, "rewards/mask_iou_reward": 0.45543899399681065, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2109346687793732, "rewards/thk_ans_format_reward": 1.0, "step": 1480, "think_completion_length": 8.541666666666668 }, { "clip_ratio": 0.0, "completion_length": 144.83333587646484, "epoch": 5.003372681281619, "grad_norm": 6.790820122613479, "kl": 0.4619140625, "learning_rate": 5.830518018018017e-07, "loss": 0.0005, "reward": 3.7827104330062866, "reward_std": 0.08347970061004162, "rewards/final_reward": 1.7737243143901735, "rewards/mask_iou_reward": 0.8868621571950868, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.782710313796997, "rewards/thk_ans_format_reward": 1.0, "step": 1481, "think_completion_length": 9.916666666666668 }, { "clip_ratio": 0.0, "completion_length": 141.06250762939453, "epoch": 5.006745362563238, "grad_norm": 7.175459226342949, "kl": 0.4384765625, "learning_rate": 5.827702702702702e-07, "loss": 0.0005, "reward": 3.3402684926986694, "reward_std": 0.05066767521202564, "rewards/final_reward": 1.6801718841758047, "rewards/mask_iou_reward": 0.8400859420879023, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.340268611907959, "rewards/thk_ans_format_reward": 1.0, "step": 1482, "think_completion_length": 8.208333333333334 }, { "clip_ratio": 0.0, "completion_length": 145.23958587646484, "epoch": 5.010118043844857, "grad_norm": 9.831226190656961, "kl": 0.458984375, "learning_rate": 5.824887387387387e-07, "loss": 0.0005, "reward": 3.557790994644165, "reward_std": 0.07639824971556664, "rewards/final_reward": 1.5340993469521953, "rewards/mask_iou_reward": 0.7670496734760976, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5577911138534546, "rewards/thk_ans_format_reward": 1.0, "step": 1483, "think_completion_length": 9.666666666666668 }, { "clip_ratio": 0.0, "completion_length": 163.61458587646484, "epoch": 5.013490725126475, "grad_norm": 10.464371874039816, "kl": 0.447265625, "learning_rate": 5.822072072072071e-07, "loss": 0.0004, "reward": 3.669014811515808, "reward_std": 0.1482251062989235, "rewards/final_reward": 1.6119222329711107, "rewards/mask_iou_reward": 0.8059611164855554, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6690149903297424, "rewards/thk_ans_format_reward": 1.0, "step": 1484, "think_completion_length": 7.5 }, { "clip_ratio": 0.0, "completion_length": 135.87500762939453, "epoch": 5.016863406408095, "grad_norm": 26.869203809282517, "kl": 0.482421875, "learning_rate": 5.819256756756756e-07, "loss": 0.0005, "reward": 3.3169851303100586, "reward_std": 0.13613472506403923, "rewards/final_reward": 1.4533780060265058, "rewards/mask_iou_reward": 0.7266890030132529, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3274016380310059, "rewards/thk_ans_format_reward": 0.9895833432674408, "step": 1485, "think_completion_length": 7.75 }, { "clip_ratio": 0.0, "completion_length": 184.21875381469727, "epoch": 5.020236087689713, "grad_norm": 8.257518991856715, "kl": 0.4091796875, "learning_rate": 5.816441441441441e-07, "loss": 0.0004, "reward": 3.219054937362671, "reward_std": 0.12979388982057571, "rewards/final_reward": 0.9981974144377622, "rewards/mask_iou_reward": 0.4990987072188811, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.219054639339447, "rewards/thk_ans_format_reward": 1.0, "step": 1486, "think_completion_length": 8.583333333333332 }, { "clip_ratio": 0.0, "completion_length": 118.23958587646484, "epoch": 5.023608768971332, "grad_norm": 9.23841667282252, "kl": 0.53515625, "learning_rate": 5.813626126126125e-07, "loss": 0.0005, "reward": 3.609062910079956, "reward_std": 0.024191563948988914, "rewards/final_reward": 1.0073205239101837, "rewards/mask_iou_reward": 0.5036602619550918, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6090629696846008, "rewards/thk_ans_format_reward": 1.0, "step": 1487, "think_completion_length": 7.666666666666667 }, { "clip_ratio": 0.0, "completion_length": 192.46875, "epoch": 5.0269814502529515, "grad_norm": 13.825371287240923, "kl": 0.4609375, "learning_rate": 5.81081081081081e-07, "loss": 0.0005, "reward": 3.4536292552948, "reward_std": 0.28554829210042953, "rewards/final_reward": 1.3948383521925767, "rewards/mask_iou_reward": 0.6974191760962883, "rewards/sam_format_reward": 0.9791666865348816, "rewards/sam_reward_func_ultra": 1.4848793745040894, "rewards/thk_ans_format_reward": 0.9895833432674408, "step": 1488, "think_completion_length": 9.041666666666668 }, { "clip_ratio": 0.0, "completion_length": 144.0416717529297, "epoch": 5.03035413153457, "grad_norm": 12.392904541863194, "kl": 0.4384765625, "learning_rate": 5.807995495495495e-07, "loss": 0.0005, "reward": 3.346392273902893, "reward_std": 0.13830474764108658, "rewards/final_reward": 1.0681874939877285, "rewards/mask_iou_reward": 0.5340937469938642, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.346392273902893, "rewards/thk_ans_format_reward": 1.0, "step": 1489, "think_completion_length": 9.041666666666666 }, { "clip_ratio": 0.0, "completion_length": 136.12500762939453, "epoch": 5.033726812816189, "grad_norm": 17.686435335007896, "kl": 0.4306640625, "learning_rate": 5.80518018018018e-07, "loss": 0.0004, "reward": 3.561290979385376, "reward_std": 0.1316972728818655, "rewards/final_reward": 1.6434079249000093, "rewards/mask_iou_reward": 0.8217039624500047, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 1.571707546710968, "rewards/thk_ans_format_reward": 1.0, "step": 1490, "think_completion_length": 7.5 }, { "clip_ratio": 0.0, "completion_length": 134.4791717529297, "epoch": 5.0370994940978076, "grad_norm": 7.365495976354105, "kl": 0.447265625, "learning_rate": 5.802364864864865e-07, "loss": 0.0005, "reward": 3.3517701625823975, "reward_std": 0.08861459605395794, "rewards/final_reward": 1.3835822174002097, "rewards/mask_iou_reward": 0.6917911087001049, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.351770043373108, "rewards/thk_ans_format_reward": 1.0, "step": 1491, "think_completion_length": 7.625 }, { "clip_ratio": 0.0, "completion_length": 134.54166793823242, "epoch": 5.040472175379427, "grad_norm": 6.787027590541496, "kl": 0.3955078125, "learning_rate": 5.799549549549549e-07, "loss": 0.0004, "reward": 3.3317559957504272, "reward_std": 0.12025662325322628, "rewards/final_reward": 1.7357438439015276, "rewards/mask_iou_reward": 0.8678719219507638, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.331756055355072, "rewards/thk_ans_format_reward": 1.0, "step": 1492, "think_completion_length": 6.708333333333333 }, { "clip_ratio": 0.0, "completion_length": 132.53125762939453, "epoch": 5.043844856661045, "grad_norm": 12.941794978698631, "kl": 0.82421875, "learning_rate": 5.796734234234234e-07, "loss": 0.0008, "reward": 3.5921987295150757, "reward_std": 0.1429782472550869, "rewards/final_reward": 1.4563671336692763, "rewards/mask_iou_reward": 0.7281835668346381, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5921986103057861, "rewards/thk_ans_format_reward": 1.0, "step": 1493, "think_completion_length": 9.0 }, { "clip_ratio": 0.0, "completion_length": 142.6875, "epoch": 5.0472175379426645, "grad_norm": 5.387916165116724, "kl": 0.4296875, "learning_rate": 5.793918918918918e-07, "loss": 0.0004, "reward": 3.305783987045288, "reward_std": 0.1328853741288185, "rewards/final_reward": 1.6526029425179214, "rewards/mask_iou_reward": 0.8263014712589607, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3162005543708801, "rewards/thk_ans_format_reward": 0.9895833432674408, "step": 1494, "think_completion_length": 8.541666666666666 }, { "clip_ratio": 0.0, "completion_length": 138.8541717529297, "epoch": 5.050590219224283, "grad_norm": 13.444814680608228, "kl": 0.71875, "learning_rate": 5.791103603603603e-07, "loss": 0.0007, "reward": 3.694078803062439, "reward_std": 0.07504570484161377, "rewards/final_reward": 1.7440251935785165, "rewards/mask_iou_reward": 0.8720125967892582, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6940789222717285, "rewards/thk_ans_format_reward": 1.0, "step": 1495, "think_completion_length": 8.791666666666668 }, { "clip_ratio": 0.0, "completion_length": 126.79167175292969, "epoch": 5.053962900505902, "grad_norm": 9.212872385754208, "kl": 0.4521484375, "learning_rate": 5.788288288288288e-07, "loss": 0.0005, "reward": 3.4492886066436768, "reward_std": 0.034415675327181816, "rewards/final_reward": 1.0015886302556327, "rewards/mask_iou_reward": 0.5007943151278164, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.449288547039032, "rewards/thk_ans_format_reward": 1.0, "step": 1496, "think_completion_length": 8.708333333333332 }, { "clip_ratio": 0.0, "completion_length": 148.31250762939453, "epoch": 5.057335581787521, "grad_norm": 10.653258799760168, "kl": 0.4365234375, "learning_rate": 5.785472972972972e-07, "loss": 0.0004, "reward": 3.4376277923583984, "reward_std": 0.20355669409036636, "rewards/final_reward": 1.239369106554923, "rewards/mask_iou_reward": 0.6196845532774615, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.437627911567688, "rewards/thk_ans_format_reward": 1.0, "step": 1497, "think_completion_length": 8.333333333333332 }, { "clip_ratio": 0.0, "completion_length": 125.14583587646484, "epoch": 5.06070826306914, "grad_norm": 12.34666190314828, "kl": 0.3916015625, "learning_rate": 5.782657657657657e-07, "loss": 0.0004, "reward": 3.16068696975708, "reward_std": 0.1016513011418283, "rewards/final_reward": 1.4004916296674885, "rewards/mask_iou_reward": 0.7002458148337443, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1606866717338562, "rewards/thk_ans_format_reward": 1.0, "step": 1498, "think_completion_length": 8.708333333333334 }, { "clip_ratio": 0.0, "completion_length": 123.88541793823242, "epoch": 5.064080944350759, "grad_norm": 7.722374138852124, "kl": 0.427734375, "learning_rate": 5.779842342342343e-07, "loss": 0.0005, "reward": 3.562274694442749, "reward_std": 0.030085250735282898, "rewards/final_reward": 1.356150771831553, "rewards/mask_iou_reward": 0.6780753859157765, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5622745752334595, "rewards/thk_ans_format_reward": 1.0, "step": 1499, "think_completion_length": 9.541666666666668 }, { "clip_ratio": 0.0, "completion_length": 118.27083587646484, "epoch": 5.0674536256323774, "grad_norm": 10.427274697363368, "kl": 0.5087890625, "learning_rate": 5.777027027027027e-07, "loss": 0.0005, "reward": 3.7217488288879395, "reward_std": 0.09539500810205936, "rewards/final_reward": 1.7929246014720737, "rewards/mask_iou_reward": 0.8964623007360368, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7217488288879395, "rewards/thk_ans_format_reward": 1.0, "step": 1500, "think_completion_length": 8.833333333333332 }, { "clip_ratio": 0.0, "completion_length": 130.62500762939453, "epoch": 5.070826306913997, "grad_norm": 14.243323096545671, "kl": 0.4658203125, "learning_rate": 5.774211711711712e-07, "loss": 0.0005, "reward": 3.5437698364257812, "reward_std": 0.044545894488692284, "rewards/final_reward": 1.3813278328414316, "rewards/mask_iou_reward": 0.6906639164207158, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5437697768211365, "rewards/thk_ans_format_reward": 1.0, "step": 1501, "think_completion_length": 7.75 }, { "clip_ratio": 0.0, "completion_length": 151.71875762939453, "epoch": 5.074198988195615, "grad_norm": 45.828657304525194, "kl": 0.4345703125, "learning_rate": 5.771396396396396e-07, "loss": 0.0004, "reward": 3.3739442825317383, "reward_std": 0.25980713963508606, "rewards/final_reward": 1.3380072058453065, "rewards/mask_iou_reward": 0.6690036029226533, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 1.384360909461975, "rewards/thk_ans_format_reward": 1.0, "step": 1502, "think_completion_length": 10.541666666666668 }, { "clip_ratio": 0.0, "completion_length": 126.04166793823242, "epoch": 5.077571669477234, "grad_norm": 11.483672966459704, "kl": 0.4501953125, "learning_rate": 5.768581081081081e-07, "loss": 0.0005, "reward": 3.358354330062866, "reward_std": 0.06946107372641563, "rewards/final_reward": 1.7414760491673258, "rewards/mask_iou_reward": 0.8707380245836629, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3583542108535767, "rewards/thk_ans_format_reward": 1.0, "step": 1503, "think_completion_length": 9.0 }, { "clip_ratio": 0.0, "completion_length": 158.2291717529297, "epoch": 5.080944350758854, "grad_norm": 12.504538881398963, "kl": 0.3515625, "learning_rate": 5.765765765765766e-07, "loss": 0.0004, "reward": 3.381642699241638, "reward_std": 0.10796273127198219, "rewards/final_reward": 1.477920895873604, "rewards/mask_iou_reward": 0.738960447936802, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3816425800323486, "rewards/thk_ans_format_reward": 1.0, "step": 1504, "think_completion_length": 8.583333333333332 }, { "clip_ratio": 0.0, "completion_length": 127.73958587646484, "epoch": 5.084317032040472, "grad_norm": 37.92833349633823, "kl": 0.435546875, "learning_rate": 5.76295045045045e-07, "loss": 0.0004, "reward": 3.5506197214126587, "reward_std": 0.11413155496120453, "rewards/final_reward": 1.8354678340080102, "rewards/mask_iou_reward": 0.9177339170040051, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5506197214126587, "rewards/thk_ans_format_reward": 1.0, "step": 1505, "think_completion_length": 8.208333333333334 }, { "clip_ratio": 0.0, "completion_length": 113.76042175292969, "epoch": 5.087689713322091, "grad_norm": 23.119291373909714, "kl": 0.5078125, "learning_rate": 5.760135135135135e-07, "loss": 0.0005, "reward": 3.324196457862854, "reward_std": 0.02308501861989498, "rewards/final_reward": 0.940729508191458, "rewards/mask_iou_reward": 0.470364754095729, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3241963982582092, "rewards/thk_ans_format_reward": 1.0, "step": 1506, "think_completion_length": 10.25 }, { "clip_ratio": 0.0, "completion_length": 153.42708587646484, "epoch": 5.09106239460371, "grad_norm": 8.347456566306933, "kl": 0.4541015625, "learning_rate": 5.757319819819819e-07, "loss": 0.0005, "reward": 3.4063055515289307, "reward_std": 0.05413071811199188, "rewards/final_reward": 1.8625934872891068, "rewards/mask_iou_reward": 0.9312967436445534, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4063054919242859, "rewards/thk_ans_format_reward": 1.0, "step": 1507, "think_completion_length": 8.416666666666666 }, { "clip_ratio": 0.0, "completion_length": 127.31250381469727, "epoch": 5.094435075885329, "grad_norm": 12.280488648254183, "kl": 0.3994140625, "learning_rate": 5.754504504504504e-07, "loss": 0.0004, "reward": 3.319761037826538, "reward_std": 0.15890631824731827, "rewards/final_reward": 1.2424586590711777, "rewards/mask_iou_reward": 0.6212293295355888, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3197609186172485, "rewards/thk_ans_format_reward": 1.0, "step": 1508, "think_completion_length": 8.041666666666666 }, { "clip_ratio": 0.0, "completion_length": 136.46875381469727, "epoch": 5.097807757166947, "grad_norm": 8.065481225765806, "kl": 0.4130859375, "learning_rate": 5.75168918918919e-07, "loss": 0.0004, "reward": 3.604817509651184, "reward_std": 0.0404562558978796, "rewards/final_reward": 1.7894732818901535, "rewards/mask_iou_reward": 0.8947366409450768, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6048176884651184, "rewards/thk_ans_format_reward": 1.0, "step": 1509, "think_completion_length": 7.0 }, { "clip_ratio": 0.0, "completion_length": 126.58333587646484, "epoch": 5.101180438448567, "grad_norm": 14.184383237985477, "kl": 0.412109375, "learning_rate": 5.748873873873874e-07, "loss": 0.0005, "reward": 3.7557284832000732, "reward_std": 0.0582825830206275, "rewards/final_reward": 1.81418842535148, "rewards/mask_iou_reward": 0.90709421267574, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.755728542804718, "rewards/thk_ans_format_reward": 1.0, "step": 1510, "think_completion_length": 7.458333333333334 }, { "clip_ratio": 0.0, "completion_length": 150.09375762939453, "epoch": 5.104553119730186, "grad_norm": 12.679298662198153, "kl": 0.4423828125, "learning_rate": 5.746058558558559e-07, "loss": 0.0004, "reward": 3.424263119697571, "reward_std": 0.09918565303087234, "rewards/final_reward": 1.429559166461599, "rewards/mask_iou_reward": 0.7147795832307995, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4242631793022156, "rewards/thk_ans_format_reward": 1.0, "step": 1511, "think_completion_length": 9.166666666666668 }, { "clip_ratio": 0.0, "completion_length": 130.375, "epoch": 5.107925801011804, "grad_norm": 11.672708605208836, "kl": 0.400390625, "learning_rate": 5.743243243243243e-07, "loss": 0.0004, "reward": 3.4523242712020874, "reward_std": 0.096237538382411, "rewards/final_reward": 1.4747727593325761, "rewards/mask_iou_reward": 0.7373863796662881, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4523241519927979, "rewards/thk_ans_format_reward": 1.0, "step": 1512, "think_completion_length": 7.75 }, { "clip_ratio": 0.0, "completion_length": 121.94791793823242, "epoch": 5.1112984822934235, "grad_norm": 8.775201882775074, "kl": 0.4853515625, "learning_rate": 5.740427927927928e-07, "loss": 0.0005, "reward": 3.6957037448883057, "reward_std": 0.06799108721315861, "rewards/final_reward": 1.7922714004424427, "rewards/mask_iou_reward": 0.8961357002212214, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6957036256790161, "rewards/thk_ans_format_reward": 1.0, "step": 1513, "think_completion_length": 8.958333333333334 }, { "clip_ratio": 0.0, "completion_length": 147.67708587646484, "epoch": 5.114671163575042, "grad_norm": 9.70304061676113, "kl": 0.4052734375, "learning_rate": 5.737612612612613e-07, "loss": 0.0005, "reward": 3.1605879068374634, "reward_std": 0.1523672752082348, "rewards/final_reward": 1.2100002109274715, "rewards/mask_iou_reward": 0.6050001054637357, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 1.171004593372345, "rewards/thk_ans_format_reward": 1.0, "step": 1514, "think_completion_length": 9.0 }, { "clip_ratio": 0.0, "completion_length": 127.83333969116211, "epoch": 5.118043844856661, "grad_norm": 19.815276506284814, "kl": 0.46875, "learning_rate": 5.734797297297297e-07, "loss": 0.0005, "reward": 3.6572550535202026, "reward_std": 0.12338948994874954, "rewards/final_reward": 1.8912148086599259, "rewards/mask_iou_reward": 0.9456074043299629, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6572552919387817, "rewards/thk_ans_format_reward": 1.0, "step": 1515, "think_completion_length": 8.333333333333332 }, { "clip_ratio": 0.0, "completion_length": 126.84375762939453, "epoch": 5.12141652613828, "grad_norm": 7.3037408566604975, "kl": 0.4365234375, "learning_rate": 5.731981981981982e-07, "loss": 0.0004, "reward": 3.1100199222564697, "reward_std": 0.13631337881088257, "rewards/final_reward": 0.6195152690810087, "rewards/mask_iou_reward": 0.30975763454050437, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.110019862651825, "rewards/thk_ans_format_reward": 1.0, "step": 1516, "think_completion_length": 8.916666666666666 }, { "clip_ratio": 0.0, "completion_length": 142.09375, "epoch": 5.124789207419899, "grad_norm": 8.928813460588515, "kl": 0.3798828125, "learning_rate": 5.729166666666667e-07, "loss": 0.0005, "reward": 3.3842055797576904, "reward_std": 0.03953359508886933, "rewards/final_reward": 1.3692225022258984, "rewards/mask_iou_reward": 0.6846112511129492, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3842054903507233, "rewards/thk_ans_format_reward": 1.0, "step": 1517, "think_completion_length": 8.541666666666666 }, { "clip_ratio": 0.0, "completion_length": 161.90625762939453, "epoch": 5.128161888701518, "grad_norm": 18.589885478274518, "kl": 0.4638671875, "learning_rate": 5.726351351351351e-07, "loss": 0.0005, "reward": 3.3922992944717407, "reward_std": 0.24772672355175018, "rewards/final_reward": 1.7797736823794406, "rewards/mask_iou_reward": 0.8898868411897203, "rewards/sam_format_reward": 0.9791666865348816, "rewards/sam_reward_func_ultra": 1.4131325483322144, "rewards/thk_ans_format_reward": 1.0, "step": 1518, "think_completion_length": 8.083333333333334 }, { "clip_ratio": 0.0, "completion_length": 125.75, "epoch": 5.1315345699831365, "grad_norm": 12.183344084823798, "kl": 0.435546875, "learning_rate": 5.723536036036037e-07, "loss": 0.0004, "reward": 3.2745349407196045, "reward_std": 0.051657652482390404, "rewards/final_reward": 1.6897714946454898, "rewards/mask_iou_reward": 0.8448857473227449, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2745347321033478, "rewards/thk_ans_format_reward": 1.0, "step": 1519, "think_completion_length": 8.333333333333332 }, { "clip_ratio": 0.0, "completion_length": 146.71875381469727, "epoch": 5.134907251264756, "grad_norm": 11.902841576797009, "kl": 0.482421875, "learning_rate": 5.720720720720721e-07, "loss": 0.0005, "reward": 3.6553049087524414, "reward_std": 0.11148537695407867, "rewards/final_reward": 1.5538138012328986, "rewards/mask_iou_reward": 0.7769069006164493, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6553047895431519, "rewards/thk_ans_format_reward": 1.0, "step": 1520, "think_completion_length": 8.541666666666666 }, { "clip_ratio": 0.0, "completion_length": 130.8229217529297, "epoch": 5.138279932546374, "grad_norm": 16.633666084642627, "kl": 0.42578125, "learning_rate": 5.717905405405405e-07, "loss": 0.0004, "reward": 3.3416396379470825, "reward_std": 0.10212976112961769, "rewards/final_reward": 1.1319948253465024, "rewards/mask_iou_reward": 0.5659974126732512, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.341639518737793, "rewards/thk_ans_format_reward": 1.0, "step": 1521, "think_completion_length": 8.5 }, { "clip_ratio": 0.0, "completion_length": 123.57291793823242, "epoch": 5.141652613827993, "grad_norm": 10.582573874462401, "kl": 0.4140625, "learning_rate": 5.71509009009009e-07, "loss": 0.0004, "reward": 3.6125001907348633, "reward_std": 0.09932881966233253, "rewards/final_reward": 1.7526732780315073, "rewards/mask_iou_reward": 0.8763366390157536, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6125000715255737, "rewards/thk_ans_format_reward": 1.0, "step": 1522, "think_completion_length": 8.5 }, { "clip_ratio": 0.0, "completion_length": 123.54167175292969, "epoch": 5.145025295109612, "grad_norm": 7.333966642905111, "kl": 0.423828125, "learning_rate": 5.712274774774774e-07, "loss": 0.0004, "reward": 3.6571160554885864, "reward_std": 0.09418375790119171, "rewards/final_reward": 1.449667431419821, "rewards/mask_iou_reward": 0.7248337157099105, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6571159362792969, "rewards/thk_ans_format_reward": 1.0, "step": 1523, "think_completion_length": 8.625 }, { "clip_ratio": 0.0, "completion_length": 194.15625762939453, "epoch": 5.148397976391231, "grad_norm": 12.455752046502456, "kl": 0.400390625, "learning_rate": 5.709459459459459e-07, "loss": 0.0004, "reward": 3.4153331518173218, "reward_std": 0.08316674456000328, "rewards/final_reward": 1.1740201997786626, "rewards/mask_iou_reward": 0.5870100998893313, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.415333330631256, "rewards/thk_ans_format_reward": 1.0, "step": 1524, "think_completion_length": 7.75 }, { "clip_ratio": 0.0, "completion_length": 135.34375381469727, "epoch": 5.15177065767285, "grad_norm": 13.553438063454204, "kl": 0.421875, "learning_rate": 5.706644144144143e-07, "loss": 0.0004, "reward": 3.599404454231262, "reward_std": 0.052878640592098236, "rewards/final_reward": 1.3837828129699297, "rewards/mask_iou_reward": 0.6918914064849648, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5994043350219727, "rewards/thk_ans_format_reward": 1.0, "step": 1525, "think_completion_length": 8.5 }, { "clip_ratio": 0.0, "completion_length": 149.89583587646484, "epoch": 5.155143338954469, "grad_norm": 15.193464949430728, "kl": 0.4140625, "learning_rate": 5.703828828828828e-07, "loss": 0.0004, "reward": 3.3660353422164917, "reward_std": 0.04505654610693455, "rewards/final_reward": 1.9256488497998228, "rewards/mask_iou_reward": 0.9628244248999114, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3660351634025574, "rewards/thk_ans_format_reward": 1.0, "step": 1526, "think_completion_length": 8.041666666666666 }, { "clip_ratio": 0.0, "completion_length": 121.96875381469727, "epoch": 5.158516020236088, "grad_norm": 13.55265426027934, "kl": 0.494140625, "learning_rate": 5.701013513513513e-07, "loss": 0.0005, "reward": 3.4424548149108887, "reward_std": 0.07838378101587296, "rewards/final_reward": 1.836514503370909, "rewards/mask_iou_reward": 0.9182572516854545, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.442454755306244, "rewards/thk_ans_format_reward": 1.0, "step": 1527, "think_completion_length": 9.166666666666668 }, { "clip_ratio": 0.0, "completion_length": 128.75000381469727, "epoch": 5.161888701517706, "grad_norm": 18.989102473316567, "kl": 0.5283203125, "learning_rate": 5.698198198198197e-07, "loss": 0.0005, "reward": 3.1408311128616333, "reward_std": 0.09010594710707664, "rewards/final_reward": 1.893888014220626, "rewards/mask_iou_reward": 0.946944007110313, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1408310234546661, "rewards/thk_ans_format_reward": 1.0, "step": 1528, "think_completion_length": 7.625 }, { "clip_ratio": 0.0, "completion_length": 122.80208587646484, "epoch": 5.165261382799326, "grad_norm": 65.82344999454938, "kl": 0.50390625, "learning_rate": 5.695382882882883e-07, "loss": 0.0005, "reward": 3.3837637901306152, "reward_std": 0.05148115009069443, "rewards/final_reward": 1.5014640082567738, "rewards/mask_iou_reward": 0.7507320041283869, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.383763611316681, "rewards/thk_ans_format_reward": 1.0, "step": 1529, "think_completion_length": 7.916666666666666 }, { "clip_ratio": 0.0, "completion_length": 136.22916793823242, "epoch": 5.168634064080944, "grad_norm": 14.530501175090082, "kl": 0.6435546875, "learning_rate": 5.692567567567567e-07, "loss": 0.0006, "reward": 3.2932307720184326, "reward_std": 0.2251843735575676, "rewards/final_reward": 1.7689563760134575, "rewards/mask_iou_reward": 0.8844781880067287, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 1.3036476373672485, "rewards/thk_ans_format_reward": 1.0, "step": 1530, "think_completion_length": 10.125 }, { "clip_ratio": 0.0, "completion_length": 131.71875762939453, "epoch": 5.172006745362563, "grad_norm": 13.65500874924387, "kl": 0.521484375, "learning_rate": 5.689752252252252e-07, "loss": 0.0006, "reward": 3.4125406742095947, "reward_std": 0.05845703464001417, "rewards/final_reward": 0.88716912268446, "rewards/mask_iou_reward": 0.44358456134223, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4125406742095947, "rewards/thk_ans_format_reward": 1.0, "step": 1531, "think_completion_length": 8.0 }, { "clip_ratio": 0.0, "completion_length": 124.27083587646484, "epoch": 5.175379426644182, "grad_norm": 12.40091004140753, "kl": 0.9296875, "learning_rate": 5.686936936936937e-07, "loss": 0.0009, "reward": 3.339430570602417, "reward_std": 0.12155294232070446, "rewards/final_reward": 1.3781666494968288, "rewards/mask_iou_reward": 0.6890833247484144, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3394304513931274, "rewards/thk_ans_format_reward": 1.0, "step": 1532, "think_completion_length": 9.666666666666666 }, { "clip_ratio": 0.0, "completion_length": 113.96875, "epoch": 5.178752107925801, "grad_norm": 11.079528878346789, "kl": 0.5380859375, "learning_rate": 5.684121621621621e-07, "loss": 0.0006, "reward": 3.565992593765259, "reward_std": 0.17389854416251183, "rewards/final_reward": 1.696084288793232, "rewards/mask_iou_reward": 0.848042144396616, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5659927725791931, "rewards/thk_ans_format_reward": 1.0, "step": 1533, "think_completion_length": 8.541666666666666 }, { "clip_ratio": 0.0, "completion_length": 123.63542175292969, "epoch": 5.18212478920742, "grad_norm": 14.322200453536906, "kl": 0.525390625, "learning_rate": 5.681306306306306e-07, "loss": 0.0005, "reward": 3.643845319747925, "reward_std": 0.06971078272908926, "rewards/final_reward": 1.7220737777035455, "rewards/mask_iou_reward": 0.8610368888517728, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6438450813293457, "rewards/thk_ans_format_reward": 1.0, "step": 1534, "think_completion_length": 7.083333333333334 }, { "clip_ratio": 0.0, "completion_length": 112.16667175292969, "epoch": 5.185497470489039, "grad_norm": 18.093270113097123, "kl": 0.5693359375, "learning_rate": 5.67849099099099e-07, "loss": 0.0006, "reward": 3.6941890716552734, "reward_std": 0.3119666241109371, "rewards/final_reward": 1.6503372209016867, "rewards/mask_iou_reward": 0.8251686104508433, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6941890716552734, "rewards/thk_ans_format_reward": 1.0, "step": 1535, "think_completion_length": 7.875 }, { "clip_ratio": 0.0, "completion_length": 133.28125381469727, "epoch": 5.188870151770658, "grad_norm": 11.133834519601262, "kl": 0.5888671875, "learning_rate": 5.675675675675675e-07, "loss": 0.0006, "reward": 3.6762256622314453, "reward_std": 0.10237638652324677, "rewards/final_reward": 1.773332255036778, "rewards/mask_iou_reward": 0.886666127518389, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6762259006500244, "rewards/thk_ans_format_reward": 1.0, "step": 1536, "think_completion_length": 8.416666666666666 }, { "clip_ratio": 0.0, "completion_length": 122.66666793823242, "epoch": 5.192242833052276, "grad_norm": 10.01209463545946, "kl": 0.55859375, "learning_rate": 5.67286036036036e-07, "loss": 0.0006, "reward": 3.216148257255554, "reward_std": 0.14021393656730652, "rewards/final_reward": 1.220889013408144, "rewards/mask_iou_reward": 0.610444506704072, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2161482572555542, "rewards/thk_ans_format_reward": 1.0, "step": 1537, "think_completion_length": 8.5 }, { "clip_ratio": 0.0, "completion_length": 151.36458587646484, "epoch": 5.195615514333896, "grad_norm": 22.298105118905134, "kl": 0.453125, "learning_rate": 5.670045045045044e-07, "loss": 0.0004, "reward": 3.0634961128234863, "reward_std": 0.07166947051882744, "rewards/final_reward": 0.3871498869639922, "rewards/mask_iou_reward": 0.1935749434819961, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0634959936141968, "rewards/thk_ans_format_reward": 1.0, "step": 1538, "think_completion_length": 8.0 }, { "clip_ratio": 0.0, "completion_length": 111.05208587646484, "epoch": 5.198988195615514, "grad_norm": 16.616373424300722, "kl": 0.49609375, "learning_rate": 5.66722972972973e-07, "loss": 0.0005, "reward": 3.328110694885254, "reward_std": 0.037119604647159576, "rewards/final_reward": 0.7062539012306082, "rewards/mask_iou_reward": 0.3531269506153041, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3281108736991882, "rewards/thk_ans_format_reward": 1.0, "step": 1539, "think_completion_length": 8.666666666666668 }, { "clip_ratio": 0.0, "completion_length": 113.4375, "epoch": 5.202360876897133, "grad_norm": 11.18647739665039, "kl": 0.4912109375, "learning_rate": 5.664414414414415e-07, "loss": 0.0005, "reward": 3.4070080518722534, "reward_std": 0.03997157700359821, "rewards/final_reward": 1.84481949763112, "rewards/mask_iou_reward": 0.92240974881556, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4070080518722534, "rewards/thk_ans_format_reward": 1.0, "step": 1540, "think_completion_length": 9.375 }, { "clip_ratio": 0.0, "completion_length": 163.5104217529297, "epoch": 5.2057335581787525, "grad_norm": 29.37877858635454, "kl": 0.412109375, "learning_rate": 5.661599099099099e-07, "loss": 0.0004, "reward": 3.5974971055984497, "reward_std": 0.14647838473320007, "rewards/final_reward": 1.7596759500227992, "rewards/mask_iou_reward": 0.8798379750113996, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5974968671798706, "rewards/thk_ans_format_reward": 1.0, "step": 1541, "think_completion_length": 7.875 }, { "clip_ratio": 0.0, "completion_length": 127.54167175292969, "epoch": 5.209106239460371, "grad_norm": 6.73445212638002, "kl": 0.4443359375, "learning_rate": 5.658783783783784e-07, "loss": 0.0005, "reward": 3.4951417446136475, "reward_std": 0.052374981343746185, "rewards/final_reward": 1.64993442480199, "rewards/mask_iou_reward": 0.824967212400995, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.495141625404358, "rewards/thk_ans_format_reward": 1.0, "step": 1542, "think_completion_length": 8.708333333333332 }, { "clip_ratio": 0.0, "completion_length": 228.11459350585938, "epoch": 5.21247892074199, "grad_norm": 19.1238333669945, "kl": 0.4033203125, "learning_rate": 5.655968468468468e-07, "loss": 0.0004, "reward": 2.794515371322632, "reward_std": 0.2504820667090826, "rewards/final_reward": 0.959297504557755, "rewards/mask_iou_reward": 0.4796487522788775, "rewards/sam_format_reward": 0.9270833432674408, "rewards/sam_reward_func_ultra": 0.9299318194389343, "rewards/thk_ans_format_reward": 0.9375, "step": 1543, "think_completion_length": 9.416666666666668 }, { "clip_ratio": 0.0, "completion_length": 119.21875, "epoch": 5.2158516020236085, "grad_norm": 73.96611340269645, "kl": 0.5205078125, "learning_rate": 5.653153153153153e-07, "loss": 0.0005, "reward": 3.3519543409347534, "reward_std": 0.1191567312926054, "rewards/final_reward": 1.5246337668641923, "rewards/mask_iou_reward": 0.7623168834320961, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3519543409347534, "rewards/thk_ans_format_reward": 1.0, "step": 1544, "think_completion_length": 8.125 }, { "clip_ratio": 0.0, "completion_length": 121.98958587646484, "epoch": 5.219224283305228, "grad_norm": 8.202103381403266, "kl": 0.45703125, "learning_rate": 5.650337837837838e-07, "loss": 0.0005, "reward": 3.384859323501587, "reward_std": 0.0341465137898922, "rewards/final_reward": 1.4621985641984132, "rewards/mask_iou_reward": 0.7310992820992066, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3848593831062317, "rewards/thk_ans_format_reward": 1.0, "step": 1545, "think_completion_length": 8.25 }, { "clip_ratio": 0.0, "completion_length": 124.35416793823242, "epoch": 5.222596964586846, "grad_norm": 14.785637935429422, "kl": 0.591796875, "learning_rate": 5.647522522522522e-07, "loss": 0.0006, "reward": 3.454978108406067, "reward_std": 0.11388104408979416, "rewards/final_reward": 1.659704569059731, "rewards/mask_iou_reward": 0.8298522845298655, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4549779295921326, "rewards/thk_ans_format_reward": 1.0, "step": 1546, "think_completion_length": 9.125 }, { "clip_ratio": 0.0, "completion_length": 123.27083587646484, "epoch": 5.2259696458684655, "grad_norm": 10.723802312148857, "kl": 0.4326171875, "learning_rate": 5.644707207207207e-07, "loss": 0.0005, "reward": 3.6628739833831787, "reward_std": 0.09409919008612633, "rewards/final_reward": 1.8867463281451706, "rewards/mask_iou_reward": 0.9433731640725853, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6628742218017578, "rewards/thk_ans_format_reward": 1.0, "step": 1547, "think_completion_length": 7.583333333333333 }, { "clip_ratio": 0.0, "completion_length": 121.32291793823242, "epoch": 5.229342327150085, "grad_norm": 31.792109466430873, "kl": 0.3984375, "learning_rate": 5.641891891891891e-07, "loss": 0.0004, "reward": 3.559706926345825, "reward_std": 0.1302800141274929, "rewards/final_reward": 1.3238463009011596, "rewards/mask_iou_reward": 0.6619231504505798, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.55970698595047, "rewards/thk_ans_format_reward": 1.0, "step": 1548, "think_completion_length": 7.583333333333334 }, { "clip_ratio": 0.0, "completion_length": 140.1979217529297, "epoch": 5.232715008431703, "grad_norm": 23.73896689821121, "kl": 0.5048828125, "learning_rate": 5.639076576576577e-07, "loss": 0.0005, "reward": 3.1021581888198853, "reward_std": 0.14375893399119377, "rewards/final_reward": 0.6169975867768962, "rewards/mask_iou_reward": 0.3084987933884481, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1021581292152405, "rewards/thk_ans_format_reward": 1.0, "step": 1549, "think_completion_length": 8.375 }, { "clip_ratio": 0.0, "completion_length": 123.72916793823242, "epoch": 5.236087689713322, "grad_norm": 21.491525250003136, "kl": 0.4072265625, "learning_rate": 5.636261261261262e-07, "loss": 0.0004, "reward": 3.6628674268722534, "reward_std": 0.059761207550764084, "rewards/final_reward": 1.7522080608301884, "rewards/mask_iou_reward": 0.8761040304150942, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6628673076629639, "rewards/thk_ans_format_reward": 1.0, "step": 1550, "think_completion_length": 8.625 }, { "clip_ratio": 0.0, "completion_length": 125.02083587646484, "epoch": 5.239460370994941, "grad_norm": 7.610443172810163, "kl": 0.4619140625, "learning_rate": 5.633445945945946e-07, "loss": 0.0005, "reward": 3.4855546951293945, "reward_std": 0.12474964559078217, "rewards/final_reward": 1.6593411417752995, "rewards/mask_iou_reward": 0.8296705708876497, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4855546355247498, "rewards/thk_ans_format_reward": 1.0, "step": 1551, "think_completion_length": 7.666666666666667 }, { "clip_ratio": 0.0, "completion_length": 134.46875762939453, "epoch": 5.24283305227656, "grad_norm": 14.118379922011169, "kl": 0.4345703125, "learning_rate": 5.630630630630631e-07, "loss": 0.0004, "reward": 3.29573655128479, "reward_std": 0.13444262370467186, "rewards/final_reward": 1.6338925846834953, "rewards/mask_iou_reward": 0.8169462923417476, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.295736312866211, "rewards/thk_ans_format_reward": 1.0, "step": 1552, "think_completion_length": 8.291666666666666 }, { "clip_ratio": 0.0, "completion_length": 173.6666717529297, "epoch": 5.246205733558178, "grad_norm": 47.09683397724249, "kl": 0.4892578125, "learning_rate": 5.627815315315316e-07, "loss": 0.0005, "reward": 3.719792604446411, "reward_std": 0.03036335203796625, "rewards/final_reward": 1.713939308841118, "rewards/mask_iou_reward": 0.856969654420559, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7197925448417664, "rewards/thk_ans_format_reward": 1.0, "step": 1553, "think_completion_length": 7.5 }, { "clip_ratio": 0.0, "completion_length": 122.92708587646484, "epoch": 5.249578414839798, "grad_norm": 10.575688870182061, "kl": 0.744140625, "learning_rate": 5.625e-07, "loss": 0.0008, "reward": 3.4308345317840576, "reward_std": 0.10887641087174416, "rewards/final_reward": 1.6010893689639407, "rewards/mask_iou_reward": 0.8005446844819704, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4308344721794128, "rewards/thk_ans_format_reward": 1.0, "step": 1554, "think_completion_length": 9.041666666666668 }, { "clip_ratio": 0.0, "completion_length": 133.5416717529297, "epoch": 5.252951096121416, "grad_norm": 21.612969798866093, "kl": 0.55078125, "learning_rate": 5.622184684684685e-07, "loss": 0.0006, "reward": 3.6511470079421997, "reward_std": 0.05324476957321167, "rewards/final_reward": 1.768552601739545, "rewards/mask_iou_reward": 0.8842763008697725, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6511470079421997, "rewards/thk_ans_format_reward": 1.0, "step": 1555, "think_completion_length": 8.791666666666666 }, { "clip_ratio": 0.0, "completion_length": 130.34375762939453, "epoch": 5.256323777403035, "grad_norm": 25.791714296136618, "kl": 0.5859375, "learning_rate": 5.619369369369369e-07, "loss": 0.0006, "reward": 3.3546031713485718, "reward_std": 0.04129011929035187, "rewards/final_reward": 1.5378495910974854, "rewards/mask_iou_reward": 0.7689247955487427, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3546032309532166, "rewards/thk_ans_format_reward": 1.0, "step": 1556, "think_completion_length": 9.458333333333334 }, { "clip_ratio": 0.0, "completion_length": 136.2291717529297, "epoch": 5.259696458684655, "grad_norm": 13.404011319795831, "kl": 0.435546875, "learning_rate": 5.616554054054054e-07, "loss": 0.0004, "reward": 3.324510097503662, "reward_std": 0.13226917386054993, "rewards/final_reward": 0.8674480571987822, "rewards/mask_iou_reward": 0.4337240285993911, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3245099782943726, "rewards/thk_ans_format_reward": 1.0, "step": 1557, "think_completion_length": 8.25 }, { "clip_ratio": 0.0, "completion_length": 122.88542175292969, "epoch": 5.263069139966273, "grad_norm": 15.12979386432097, "kl": 1.296875, "learning_rate": 5.613738738738739e-07, "loss": 0.0013, "reward": 3.581545352935791, "reward_std": 0.04847773676738143, "rewards/final_reward": 1.8063348684698242, "rewards/mask_iou_reward": 0.9031674342349121, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5815452933311462, "rewards/thk_ans_format_reward": 1.0, "step": 1558, "think_completion_length": 8.416666666666666 }, { "clip_ratio": 0.0, "completion_length": 121.42708587646484, "epoch": 5.266441821247892, "grad_norm": 42.19614693932147, "kl": 0.4794921875, "learning_rate": 5.610923423423422e-07, "loss": 0.0005, "reward": 3.4823594093322754, "reward_std": 0.13475025445222855, "rewards/final_reward": 1.7472569945766088, "rewards/mask_iou_reward": 0.8736284972883044, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4823591113090515, "rewards/thk_ans_format_reward": 1.0, "step": 1559, "think_completion_length": 9.75 }, { "clip_ratio": 0.0, "completion_length": 123.28125381469727, "epoch": 5.269814502529511, "grad_norm": 83.2498651086656, "kl": 0.5546875, "learning_rate": 5.608108108108108e-07, "loss": 0.0006, "reward": 3.675955653190613, "reward_std": 0.09048607014119625, "rewards/final_reward": 1.8582812308316892, "rewards/mask_iou_reward": 0.9291406154158446, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6759557723999023, "rewards/thk_ans_format_reward": 1.0, "step": 1560, "think_completion_length": 7.0 }, { "clip_ratio": 0.0, "completion_length": 145.5416717529297, "epoch": 5.27318718381113, "grad_norm": 22.738778486995905, "kl": 0.52734375, "learning_rate": 5.605292792792792e-07, "loss": 0.0005, "reward": 3.5443469285964966, "reward_std": 0.13854551687836647, "rewards/final_reward": 1.565121317219153, "rewards/mask_iou_reward": 0.7825606586095765, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5443468689918518, "rewards/thk_ans_format_reward": 1.0, "step": 1561, "think_completion_length": 8.333333333333332 }, { "clip_ratio": 0.0, "completion_length": 139.46875381469727, "epoch": 5.276559865092748, "grad_norm": 29.120362052579313, "kl": 0.4072265625, "learning_rate": 5.602477477477477e-07, "loss": 0.0004, "reward": 3.2357590198516846, "reward_std": 0.16141557320952415, "rewards/final_reward": 1.0046967644421276, "rewards/mask_iou_reward": 0.5023483822210638, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.235758900642395, "rewards/thk_ans_format_reward": 1.0, "step": 1562, "think_completion_length": 8.125 }, { "clip_ratio": 0.0, "completion_length": 129.34375762939453, "epoch": 5.279932546374368, "grad_norm": 16.034146725438514, "kl": 0.43359375, "learning_rate": 5.599662162162162e-07, "loss": 0.0004, "reward": 3.4278730154037476, "reward_std": 0.0456274077296257, "rewards/final_reward": 1.5122349537878514, "rewards/mask_iou_reward": 0.7561174768939257, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4278731942176819, "rewards/thk_ans_format_reward": 1.0, "step": 1563, "think_completion_length": 7.375 }, { "clip_ratio": 0.0, "completion_length": 129.1145896911621, "epoch": 5.283305227655987, "grad_norm": 34.61046567649571, "kl": 0.5732421875, "learning_rate": 5.596846846846846e-07, "loss": 0.0006, "reward": 3.431596279144287, "reward_std": 0.07788949087262154, "rewards/final_reward": 1.3865765962860417, "rewards/mask_iou_reward": 0.6932882981430208, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4315963983535767, "rewards/thk_ans_format_reward": 1.0, "step": 1564, "think_completion_length": 8.166666666666666 }, { "clip_ratio": 0.0, "completion_length": 138.0729217529297, "epoch": 5.286677908937605, "grad_norm": 11.795224139319926, "kl": 0.4521484375, "learning_rate": 5.594031531531531e-07, "loss": 0.0005, "reward": 3.494480848312378, "reward_std": 0.16206956654787064, "rewards/final_reward": 1.2220150396439693, "rewards/mask_iou_reward": 0.6110075198219846, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4944809079170227, "rewards/thk_ans_format_reward": 1.0, "step": 1565, "think_completion_length": 8.166666666666668 }, { "clip_ratio": 0.0, "completion_length": 147.52083587646484, "epoch": 5.2900505902192245, "grad_norm": 10.969810119969456, "kl": 0.3984375, "learning_rate": 5.591216216216215e-07, "loss": 0.0004, "reward": 3.6718339920043945, "reward_std": 0.045063115656375885, "rewards/final_reward": 1.452622763087667, "rewards/mask_iou_reward": 0.7263113815438335, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6718339323997498, "rewards/thk_ans_format_reward": 1.0, "step": 1566, "think_completion_length": 7.625 }, { "clip_ratio": 0.0, "completion_length": 143.0729217529297, "epoch": 5.293423271500843, "grad_norm": 33.29662349353952, "kl": 0.4619140625, "learning_rate": 5.5884009009009e-07, "loss": 0.0005, "reward": 3.5407893657684326, "reward_std": 0.07485915347933769, "rewards/final_reward": 1.6468581700055525, "rewards/mask_iou_reward": 0.8234290850027762, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5407893657684326, "rewards/thk_ans_format_reward": 1.0, "step": 1567, "think_completion_length": 8.125 }, { "clip_ratio": 0.0, "completion_length": 119.72916793823242, "epoch": 5.296795952782462, "grad_norm": 14.017071819423824, "kl": 0.423828125, "learning_rate": 5.585585585585585e-07, "loss": 0.0004, "reward": 3.55793297290802, "reward_std": 0.02568998374044895, "rewards/final_reward": 1.838847874711516, "rewards/mask_iou_reward": 0.919423937355758, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.55793297290802, "rewards/thk_ans_format_reward": 1.0, "step": 1568, "think_completion_length": 8.0 }, { "clip_ratio": 0.0, "completion_length": 108.20833587646484, "epoch": 5.300168634064081, "grad_norm": 8.392709690302171, "kl": 0.5263671875, "learning_rate": 5.582770270270269e-07, "loss": 0.0005, "reward": 3.6737680435180664, "reward_std": 0.06741153821349144, "rewards/final_reward": 1.8004864144588608, "rewards/mask_iou_reward": 0.9002432072294304, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.673767864704132, "rewards/thk_ans_format_reward": 1.0, "step": 1569, "think_completion_length": 9.458333333333332 }, { "clip_ratio": 0.0, "completion_length": 130.8541717529297, "epoch": 5.3035413153457, "grad_norm": 10.569948867581923, "kl": 0.41796875, "learning_rate": 5.579954954954955e-07, "loss": 0.0004, "reward": 3.4037704467773438, "reward_std": 0.09090332314372063, "rewards/final_reward": 0.8551043103711985, "rewards/mask_iou_reward": 0.42755215518559925, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4037703275680542, "rewards/thk_ans_format_reward": 1.0, "step": 1570, "think_completion_length": 8.083333333333334 }, { "clip_ratio": 0.0, "completion_length": 119.10416793823242, "epoch": 5.306913996627319, "grad_norm": 13.326835721795648, "kl": 1.677734375, "learning_rate": 5.57713963963964e-07, "loss": 0.0017, "reward": 3.740381360054016, "reward_std": 0.056756491772830486, "rewards/final_reward": 1.4973907536173645, "rewards/mask_iou_reward": 0.7486953768086823, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7403812408447266, "rewards/thk_ans_format_reward": 1.0, "step": 1571, "think_completion_length": 7.75 }, { "clip_ratio": 0.0, "completion_length": 121.96875381469727, "epoch": 5.3102866779089375, "grad_norm": 10.72925580096195, "kl": 0.4931640625, "learning_rate": 5.574324324324324e-07, "loss": 0.0005, "reward": 3.5163180828094482, "reward_std": 0.0905131883919239, "rewards/final_reward": 1.856243706221524, "rewards/mask_iou_reward": 0.928121853110762, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5163179636001587, "rewards/thk_ans_format_reward": 1.0, "step": 1572, "think_completion_length": 9.625 }, { "clip_ratio": 0.0, "completion_length": 123.14583587646484, "epoch": 5.313659359190557, "grad_norm": 11.226461152562312, "kl": 0.416015625, "learning_rate": 5.571509009009009e-07, "loss": 0.0004, "reward": 3.228415608406067, "reward_std": 0.07192541658878326, "rewards/final_reward": 1.3220572332200686, "rewards/mask_iou_reward": 0.6610286166100343, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2284154891967773, "rewards/thk_ans_format_reward": 1.0, "step": 1573, "think_completion_length": 8.083333333333332 }, { "clip_ratio": 0.0, "completion_length": 180.04166793823242, "epoch": 5.317032040472175, "grad_norm": 8.230439038432156, "kl": 0.3759765625, "learning_rate": 5.568693693693693e-07, "loss": 0.0004, "reward": 3.8010900020599365, "reward_std": 0.018786365166306496, "rewards/final_reward": 1.7107739670918538, "rewards/mask_iou_reward": 0.8553869835459269, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.8010901808738708, "rewards/thk_ans_format_reward": 1.0, "step": 1574, "think_completion_length": 8.958333333333334 }, { "clip_ratio": 0.0, "completion_length": 119.33333587646484, "epoch": 5.320404721753794, "grad_norm": 5.757461841734188, "kl": 0.42578125, "learning_rate": 5.565878378378378e-07, "loss": 0.0004, "reward": 3.414375066757202, "reward_std": 0.05427007144317031, "rewards/final_reward": 1.4775362898487359, "rewards/mask_iou_reward": 0.7387681449243679, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.414375126361847, "rewards/thk_ans_format_reward": 1.0, "step": 1575, "think_completion_length": 8.25 }, { "clip_ratio": 0.0, "completion_length": 125.19791793823242, "epoch": 5.323777403035413, "grad_norm": 36.58365647963653, "kl": 0.419921875, "learning_rate": 5.563063063063063e-07, "loss": 0.0004, "reward": 3.325410485267639, "reward_std": 0.10779011994600296, "rewards/final_reward": 1.0423662379730816, "rewards/mask_iou_reward": 0.5211831189865408, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3254103660583496, "rewards/thk_ans_format_reward": 1.0, "step": 1576, "think_completion_length": 7.958333333333334 }, { "clip_ratio": 0.0, "completion_length": 131.32291793823242, "epoch": 5.327150084317032, "grad_norm": 10.107335069790686, "kl": 0.7216796875, "learning_rate": 5.560247747747747e-07, "loss": 0.0008, "reward": 3.2031502723693848, "reward_std": 0.0661102794110775, "rewards/final_reward": 1.0763216198767998, "rewards/mask_iou_reward": 0.5381608099383999, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2031500935554504, "rewards/thk_ans_format_reward": 1.0, "step": 1577, "think_completion_length": 8.083333333333332 }, { "clip_ratio": 0.0, "completion_length": 125.36458969116211, "epoch": 5.330522765598651, "grad_norm": 10.322027034328684, "kl": 0.47265625, "learning_rate": 5.557432432432432e-07, "loss": 0.0005, "reward": 3.5067999362945557, "reward_std": 0.1160063948482275, "rewards/final_reward": 0.977022600833603, "rewards/mask_iou_reward": 0.4885113004168015, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5067998170852661, "rewards/thk_ans_format_reward": 1.0, "step": 1578, "think_completion_length": 9.333333333333332 }, { "clip_ratio": 0.0, "completion_length": 120.51041793823242, "epoch": 5.33389544688027, "grad_norm": 15.348517850938887, "kl": 0.5, "learning_rate": 5.554617117117116e-07, "loss": 0.0005, "reward": 3.456921339035034, "reward_std": 0.06033678911626339, "rewards/final_reward": 1.3159359344971464, "rewards/mask_iou_reward": 0.6579679672485732, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4569213390350342, "rewards/thk_ans_format_reward": 1.0, "step": 1579, "think_completion_length": 7.583333333333333 }, { "clip_ratio": 0.0, "completion_length": 134.55208587646484, "epoch": 5.337268128161889, "grad_norm": 15.585442503496923, "kl": 0.4501953125, "learning_rate": 5.551801801801802e-07, "loss": 0.0005, "reward": 3.488860607147217, "reward_std": 0.09073191322386265, "rewards/final_reward": 1.2704860859565354, "rewards/mask_iou_reward": 0.6352430429782677, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.488860547542572, "rewards/thk_ans_format_reward": 1.0, "step": 1580, "think_completion_length": 8.75 }, { "clip_ratio": 0.0, "completion_length": 125.97917175292969, "epoch": 5.340640809443507, "grad_norm": 27.820073001383008, "kl": 0.751953125, "learning_rate": 5.548986486486487e-07, "loss": 0.0008, "reward": 3.639863133430481, "reward_std": 0.06889799144119024, "rewards/final_reward": 1.796841014307425, "rewards/mask_iou_reward": 0.8984205071537125, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6398632526397705, "rewards/thk_ans_format_reward": 1.0, "step": 1581, "think_completion_length": 7.958333333333334 }, { "clip_ratio": 0.0, "completion_length": 120.83333587646484, "epoch": 5.344013490725127, "grad_norm": 11.68937389204564, "kl": 0.44140625, "learning_rate": 5.546171171171171e-07, "loss": 0.0004, "reward": 3.3747018575668335, "reward_std": 0.07148859463632107, "rewards/final_reward": 1.0186688008845346, "rewards/mask_iou_reward": 0.5093344004422673, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3747016787528992, "rewards/thk_ans_format_reward": 1.0, "step": 1582, "think_completion_length": 8.875 }, { "clip_ratio": 0.0, "completion_length": 119.19792175292969, "epoch": 5.347386172006745, "grad_norm": 9.387634937829326, "kl": 0.48046875, "learning_rate": 5.543355855855856e-07, "loss": 0.0005, "reward": 3.5327770709991455, "reward_std": 0.10092796385288239, "rewards/final_reward": 1.823864692806687, "rewards/mask_iou_reward": 0.9119323464033435, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5327771306037903, "rewards/thk_ans_format_reward": 1.0, "step": 1583, "think_completion_length": 9.541666666666668 }, { "clip_ratio": 0.0, "completion_length": 121.14583587646484, "epoch": 5.350758853288364, "grad_norm": 12.118552830379011, "kl": 0.4482421875, "learning_rate": 5.54054054054054e-07, "loss": 0.0005, "reward": 3.5782933235168457, "reward_std": 0.09411728754639626, "rewards/final_reward": 1.2275416819809903, "rewards/mask_iou_reward": 0.6137708409904952, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5782932043075562, "rewards/thk_ans_format_reward": 1.0, "step": 1584, "think_completion_length": 8.25 }, { "clip_ratio": 0.0, "completion_length": 148.22916793823242, "epoch": 5.354131534569984, "grad_norm": 276.16617115162836, "kl": 0.54296875, "learning_rate": 5.537725225225225e-07, "loss": 0.0005, "reward": 3.5092782974243164, "reward_std": 0.12828397750854492, "rewards/final_reward": 1.4079524650829063, "rewards/mask_iou_reward": 0.7039762325414531, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5092784762382507, "rewards/thk_ans_format_reward": 1.0, "step": 1585, "think_completion_length": 9.583333333333334 }, { "clip_ratio": 0.0, "completion_length": 148.09375762939453, "epoch": 5.357504215851602, "grad_norm": 15.448136244261567, "kl": 1.51171875, "learning_rate": 5.53490990990991e-07, "loss": 0.0015, "reward": 3.139350414276123, "reward_std": 0.08229007199406624, "rewards/final_reward": 1.1157618654892065, "rewards/mask_iou_reward": 0.5578809327446033, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1393502354621887, "rewards/thk_ans_format_reward": 1.0, "step": 1586, "think_completion_length": 9.875 }, { "clip_ratio": 0.0, "completion_length": 131.0729217529297, "epoch": 5.360876897133221, "grad_norm": 7.589524801705542, "kl": 0.439453125, "learning_rate": 5.532094594594594e-07, "loss": 0.0004, "reward": 3.525477886199951, "reward_std": 0.12513011507689953, "rewards/final_reward": 1.7533777641824975, "rewards/mask_iou_reward": 0.8766888820912487, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5254778265953064, "rewards/thk_ans_format_reward": 1.0, "step": 1587, "think_completion_length": 9.416666666666668 }, { "clip_ratio": 0.0, "completion_length": 173.37500762939453, "epoch": 5.36424957841484, "grad_norm": 6.570301057011809, "kl": 0.4521484375, "learning_rate": 5.529279279279279e-07, "loss": 0.0005, "reward": 3.304585814476013, "reward_std": 0.07918488210998476, "rewards/final_reward": 1.792820508585685, "rewards/mask_iou_reward": 0.8964102542928425, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.304585576057434, "rewards/thk_ans_format_reward": 1.0, "step": 1588, "think_completion_length": 8.458333333333332 }, { "clip_ratio": 0.0, "completion_length": 121.875, "epoch": 5.367622259696459, "grad_norm": 9.062400524300918, "kl": 0.537109375, "learning_rate": 5.526463963963963e-07, "loss": 0.0005, "reward": 3.5815329551696777, "reward_std": 0.2140875719487667, "rewards/final_reward": 1.1094297889036784, "rewards/mask_iou_reward": 0.5547148944518392, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5815329551696777, "rewards/thk_ans_format_reward": 1.0, "step": 1589, "think_completion_length": 10.5 }, { "clip_ratio": 0.0, "completion_length": 183.03125762939453, "epoch": 5.370994940978077, "grad_norm": 9.15405908451973, "kl": 0.4638671875, "learning_rate": 5.523648648648649e-07, "loss": 0.0005, "reward": 3.3573790788650513, "reward_std": 0.12218708544969559, "rewards/final_reward": 1.2496978881252572, "rewards/mask_iou_reward": 0.6248489440626286, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3573788404464722, "rewards/thk_ans_format_reward": 1.0, "step": 1590, "think_completion_length": 8.958333333333332 }, { "clip_ratio": 0.0, "completion_length": 121.91666793823242, "epoch": 5.3743676222596966, "grad_norm": 16.06898917110685, "kl": 0.5869140625, "learning_rate": 5.520833333333334e-07, "loss": 0.0006, "reward": 3.674129366874695, "reward_std": 0.045963745564222336, "rewards/final_reward": 1.7854430448313332, "rewards/mask_iou_reward": 0.8927215224156666, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6741293668746948, "rewards/thk_ans_format_reward": 1.0, "step": 1591, "think_completion_length": 8.125 }, { "clip_ratio": 0.0, "completion_length": 119.54166793823242, "epoch": 5.377740303541315, "grad_norm": 13.666515737584314, "kl": 0.435546875, "learning_rate": 5.518018018018018e-07, "loss": 0.0004, "reward": 3.001492738723755, "reward_std": 0.18107537552714348, "rewards/final_reward": 0.45554146425085257, "rewards/mask_iou_reward": 0.22777073212542628, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0014926195144653, "rewards/thk_ans_format_reward": 1.0, "step": 1592, "think_completion_length": 9.375 }, { "clip_ratio": 0.0, "completion_length": 138.8333396911621, "epoch": 5.381112984822934, "grad_norm": 16.562808165815945, "kl": 0.654296875, "learning_rate": 5.515202702702703e-07, "loss": 0.0007, "reward": 3.5894731283187866, "reward_std": 0.060048991814255714, "rewards/final_reward": 1.2045376616534607, "rewards/mask_iou_reward": 0.6022688308267303, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5894731283187866, "rewards/thk_ans_format_reward": 1.0, "step": 1593, "think_completion_length": 7.0 }, { "clip_ratio": 0.0, "completion_length": 122.23958587646484, "epoch": 5.3844856661045535, "grad_norm": 7.900805230696725, "kl": 0.4208984375, "learning_rate": 5.512387387387388e-07, "loss": 0.0004, "reward": 3.297197103500366, "reward_std": 0.050192068330943584, "rewards/final_reward": 1.5728006310996099, "rewards/mask_iou_reward": 0.7864003155498049, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2971969246864319, "rewards/thk_ans_format_reward": 1.0, "step": 1594, "think_completion_length": 8.208333333333334 }, { "clip_ratio": 0.0, "completion_length": 127.60416793823242, "epoch": 5.387858347386172, "grad_norm": 16.85455125388022, "kl": 0.4453125, "learning_rate": 5.509572072072072e-07, "loss": 0.0004, "reward": 3.0963913202285767, "reward_std": 0.2514451891183853, "rewards/final_reward": 1.3276669146343856, "rewards/mask_iou_reward": 0.6638334573171928, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0963911414146423, "rewards/thk_ans_format_reward": 1.0, "step": 1595, "think_completion_length": 7.625 }, { "clip_ratio": 0.0, "completion_length": 122.14583587646484, "epoch": 5.391231028667791, "grad_norm": 8.657383821568352, "kl": 0.4677734375, "learning_rate": 5.506756756756757e-07, "loss": 0.0005, "reward": 3.521684169769287, "reward_std": 0.2204669639468193, "rewards/final_reward": 1.7213135550727179, "rewards/mask_iou_reward": 0.8606567775363589, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5216842293739319, "rewards/thk_ans_format_reward": 1.0, "step": 1596, "think_completion_length": 8.875 }, { "clip_ratio": 0.0, "completion_length": 156.7604217529297, "epoch": 5.3946037099494095, "grad_norm": 16.606444003821277, "kl": 0.4267578125, "learning_rate": 5.503941441441441e-07, "loss": 0.0004, "reward": 3.491199016571045, "reward_std": 0.0791405662894249, "rewards/final_reward": 1.380018995825186, "rewards/mask_iou_reward": 0.690009497912593, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4911989569664001, "rewards/thk_ans_format_reward": 1.0, "step": 1597, "think_completion_length": 7.333333333333333 }, { "clip_ratio": 0.0, "completion_length": 125.08333969116211, "epoch": 5.397976391231029, "grad_norm": 18.573848879245762, "kl": 0.5126953125, "learning_rate": 5.501126126126125e-07, "loss": 0.0005, "reward": 3.5595691204071045, "reward_std": 0.05301516316831112, "rewards/final_reward": 1.7548742102506711, "rewards/mask_iou_reward": 0.8774371051253356, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5595690608024597, "rewards/thk_ans_format_reward": 1.0, "step": 1598, "think_completion_length": 8.833333333333332 }, { "clip_ratio": 0.0, "completion_length": 121.77083587646484, "epoch": 5.401349072512647, "grad_norm": 13.677851860701725, "kl": 0.4833984375, "learning_rate": 5.49831081081081e-07, "loss": 0.0005, "reward": 3.4730799198150635, "reward_std": 0.09135781228542328, "rewards/final_reward": 1.5527164072778548, "rewards/mask_iou_reward": 0.7763582036389274, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4730800986289978, "rewards/thk_ans_format_reward": 1.0, "step": 1599, "think_completion_length": 8.166666666666666 }, { "clip_ratio": 0.0, "completion_length": 122.88542175292969, "epoch": 5.4047217537942664, "grad_norm": 59.41510375735123, "kl": 0.4638671875, "learning_rate": 5.495495495495495e-07, "loss": 0.0005, "reward": 3.6761807203292847, "reward_std": 0.10935474932193756, "rewards/final_reward": 1.7209638783991454, "rewards/mask_iou_reward": 0.8604819391995727, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.676180899143219, "rewards/thk_ans_format_reward": 1.0, "step": 1600, "think_completion_length": 9.0 }, { "clip_ratio": 0.0, "completion_length": 150.42708587646484, "epoch": 5.408094435075886, "grad_norm": 11.48604334360361, "kl": 0.400390625, "learning_rate": 5.49268018018018e-07, "loss": 0.0004, "reward": 3.5194766521453857, "reward_std": 0.06819850951433182, "rewards/final_reward": 1.7190072709272624, "rewards/mask_iou_reward": 0.8595036354636312, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.519476592540741, "rewards/thk_ans_format_reward": 1.0, "step": 1601, "think_completion_length": 8.083333333333334 }, { "clip_ratio": 0.0, "completion_length": 136.0, "epoch": 5.411467116357504, "grad_norm": 11.328987027276094, "kl": 0.44921875, "learning_rate": 5.489864864864864e-07, "loss": 0.0005, "reward": 3.3820735216140747, "reward_std": 0.05752043426036835, "rewards/final_reward": 1.6397381276838643, "rewards/mask_iou_reward": 0.8198690638419321, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3820736408233643, "rewards/thk_ans_format_reward": 1.0, "step": 1602, "think_completion_length": 8.0 }, { "clip_ratio": 0.0, "completion_length": 119.98958587646484, "epoch": 5.414839797639123, "grad_norm": 15.086284072459184, "kl": 0.443359375, "learning_rate": 5.487049549549549e-07, "loss": 0.0005, "reward": 3.315527081489563, "reward_std": 0.1428496576845646, "rewards/final_reward": 1.7716332694488184, "rewards/mask_iou_reward": 0.8858166347244092, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3155272006988525, "rewards/thk_ans_format_reward": 1.0, "step": 1603, "think_completion_length": 7.833333333333334 }, { "clip_ratio": 0.0, "completion_length": 138.0416717529297, "epoch": 5.418212478920742, "grad_norm": 14.191826769982049, "kl": 0.4296875, "learning_rate": 5.484234234234234e-07, "loss": 0.0004, "reward": 3.36587655544281, "reward_std": 0.08461336139589548, "rewards/final_reward": 1.6119604684506845, "rewards/mask_iou_reward": 0.8059802342253423, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.365876317024231, "rewards/thk_ans_format_reward": 1.0, "step": 1604, "think_completion_length": 7.583333333333334 }, { "clip_ratio": 0.0, "completion_length": 119.51041793823242, "epoch": 5.421585160202361, "grad_norm": 7.723156096040805, "kl": 0.4619140625, "learning_rate": 5.481418918918918e-07, "loss": 0.0005, "reward": 3.455763339996338, "reward_std": 0.13234110176563263, "rewards/final_reward": 1.3475519159373448, "rewards/mask_iou_reward": 0.6737759579686724, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4557632803916931, "rewards/thk_ans_format_reward": 1.0, "step": 1605, "think_completion_length": 8.541666666666666 }, { "clip_ratio": 0.0, "completion_length": 146.03125381469727, "epoch": 5.424957841483979, "grad_norm": 17.843660716465195, "kl": 0.4775390625, "learning_rate": 5.478603603603603e-07, "loss": 0.0005, "reward": 3.515939950942993, "reward_std": 0.04447547905147076, "rewards/final_reward": 0.7696596498514374, "rewards/mask_iou_reward": 0.3848298249257187, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5159401893615723, "rewards/thk_ans_format_reward": 1.0, "step": 1606, "think_completion_length": 7.958333333333334 }, { "clip_ratio": 0.0, "completion_length": 151.2291717529297, "epoch": 5.428330522765599, "grad_norm": 8.32019114382189, "kl": 0.4150390625, "learning_rate": 5.475788288288287e-07, "loss": 0.0004, "reward": 3.0867764949798584, "reward_std": 0.1152360737323761, "rewards/final_reward": 1.6006940530922766, "rewards/mask_iou_reward": 0.8003470265461383, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0867764353752136, "rewards/thk_ans_format_reward": 1.0, "step": 1607, "think_completion_length": 7.875 }, { "clip_ratio": 0.0, "completion_length": 123.6875, "epoch": 5.431703204047217, "grad_norm": 18.9909129285575, "kl": 0.4521484375, "learning_rate": 5.472972972972972e-07, "loss": 0.0005, "reward": 3.5866810083389282, "reward_std": 0.10227518156170845, "rewards/final_reward": 1.8427148180003425, "rewards/mask_iou_reward": 0.9213574090001713, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5866807699203491, "rewards/thk_ans_format_reward": 1.0, "step": 1608, "think_completion_length": 8.291666666666666 }, { "clip_ratio": 0.0, "completion_length": 158.1041717529297, "epoch": 5.435075885328836, "grad_norm": 14.657295509107488, "kl": 0.62109375, "learning_rate": 5.470157657657657e-07, "loss": 0.0006, "reward": 3.316844344139099, "reward_std": 0.07849056646227837, "rewards/final_reward": 1.6149113626967542, "rewards/mask_iou_reward": 0.8074556813483771, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3168442845344543, "rewards/thk_ans_format_reward": 1.0, "step": 1609, "think_completion_length": 7.75 }, { "clip_ratio": 0.0, "completion_length": 120.46875, "epoch": 5.438448566610456, "grad_norm": 40.969177808345904, "kl": 0.607421875, "learning_rate": 5.467342342342342e-07, "loss": 0.0006, "reward": 3.4543533325195312, "reward_std": 0.10421181283891201, "rewards/final_reward": 1.8084816161777553, "rewards/mask_iou_reward": 0.9042408080888776, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4543529748916626, "rewards/thk_ans_format_reward": 1.0, "step": 1610, "think_completion_length": 7.5 }, { "clip_ratio": 0.0, "completion_length": 138.43750762939453, "epoch": 5.441821247892074, "grad_norm": 10.93391540750094, "kl": 0.412109375, "learning_rate": 5.464527027027027e-07, "loss": 0.0004, "reward": 3.3363726139068604, "reward_std": 0.08054106682538986, "rewards/final_reward": 0.9038268503309029, "rewards/mask_iou_reward": 0.45191342516545147, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3363724946975708, "rewards/thk_ans_format_reward": 1.0, "step": 1611, "think_completion_length": 8.25 }, { "clip_ratio": 0.0, "completion_length": 134.40625762939453, "epoch": 5.445193929173693, "grad_norm": 9.121135289305188, "kl": 0.474609375, "learning_rate": 5.461711711711712e-07, "loss": 0.0005, "reward": 3.646440267562866, "reward_std": 0.06063675507903099, "rewards/final_reward": 1.8498281476593796, "rewards/mask_iou_reward": 0.9249140738296898, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.646440327167511, "rewards/thk_ans_format_reward": 1.0, "step": 1612, "think_completion_length": 7.291666666666666 }, { "clip_ratio": 0.0, "completion_length": 121.17708587646484, "epoch": 5.448566610455312, "grad_norm": 9.347691834498171, "kl": 0.4990234375, "learning_rate": 5.458896396396396e-07, "loss": 0.0005, "reward": 3.603909730911255, "reward_std": 0.059759557247161865, "rewards/final_reward": 1.7791889589015653, "rewards/mask_iou_reward": 0.8895944794507826, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6039097905158997, "rewards/thk_ans_format_reward": 1.0, "step": 1613, "think_completion_length": 7.708333333333334 }, { "clip_ratio": 0.0, "completion_length": 146.95833587646484, "epoch": 5.451939291736931, "grad_norm": 64.78327486793128, "kl": 0.3916015625, "learning_rate": 5.456081081081081e-07, "loss": 0.0004, "reward": 3.368232846260071, "reward_std": 0.04645315185189247, "rewards/final_reward": 1.1973084912773273, "rewards/mask_iou_reward": 0.5986542456386637, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.368232548236847, "rewards/thk_ans_format_reward": 1.0, "step": 1614, "think_completion_length": 7.791666666666666 }, { "clip_ratio": 0.0, "completion_length": 123.1875, "epoch": 5.455311973018549, "grad_norm": 6.750624391879805, "kl": 0.40234375, "learning_rate": 5.453265765765765e-07, "loss": 0.0004, "reward": 3.5145528316497803, "reward_std": 0.03645121678709984, "rewards/final_reward": 1.694978467397088, "rewards/mask_iou_reward": 0.847489233698544, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5145527124404907, "rewards/thk_ans_format_reward": 1.0, "step": 1615, "think_completion_length": 7.041666666666667 }, { "clip_ratio": 0.0, "completion_length": 131.7604217529297, "epoch": 5.458684654300169, "grad_norm": 16.16205090571039, "kl": 1.986328125, "learning_rate": 5.45045045045045e-07, "loss": 0.002, "reward": 3.627463221549988, "reward_std": 0.04613169934600592, "rewards/final_reward": 1.9425661984513574, "rewards/mask_iou_reward": 0.9712830992256787, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6274632215499878, "rewards/thk_ans_format_reward": 1.0, "step": 1616, "think_completion_length": 7.458333333333334 }, { "clip_ratio": 0.0, "completion_length": 125.00000762939453, "epoch": 5.462057335581788, "grad_norm": 10.333683537542921, "kl": 0.412109375, "learning_rate": 5.447635135135135e-07, "loss": 0.0005, "reward": 3.6512891054153442, "reward_std": 0.10130597651004791, "rewards/final_reward": 1.7270932594670367, "rewards/mask_iou_reward": 0.8635466297335184, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6617057919502258, "rewards/thk_ans_format_reward": 0.9895833432674408, "step": 1617, "think_completion_length": 8.291666666666668 }, { "clip_ratio": 0.0, "completion_length": 108.59375381469727, "epoch": 5.465430016863406, "grad_norm": 11.288309283682565, "kl": 0.576171875, "learning_rate": 5.444819819819819e-07, "loss": 0.0006, "reward": 3.6216951608657837, "reward_std": 0.036016141064465046, "rewards/final_reward": 1.9431474565908906, "rewards/mask_iou_reward": 0.9715737282954453, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6216949224472046, "rewards/thk_ans_format_reward": 1.0, "step": 1618, "think_completion_length": 8.125 }, { "clip_ratio": 0.0, "completion_length": 146.7916717529297, "epoch": 5.4688026981450255, "grad_norm": 11.645954279749585, "kl": 0.3896484375, "learning_rate": 5.442004504504504e-07, "loss": 0.0004, "reward": 3.32417094707489, "reward_std": 0.11718141287565231, "rewards/final_reward": 1.2097389401016168, "rewards/mask_iou_reward": 0.6048694700508084, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3241708874702454, "rewards/thk_ans_format_reward": 1.0, "step": 1619, "think_completion_length": 7.625 }, { "clip_ratio": 0.0, "completion_length": 119.44791793823242, "epoch": 5.472175379426644, "grad_norm": 10.510371694596317, "kl": 0.4541015625, "learning_rate": 5.43918918918919e-07, "loss": 0.0005, "reward": 3.4671072959899902, "reward_std": 0.09320265799760818, "rewards/final_reward": 1.7891415823759047, "rewards/mask_iou_reward": 0.8945707911879524, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4671071767807007, "rewards/thk_ans_format_reward": 1.0, "step": 1620, "think_completion_length": 8.916666666666666 }, { "clip_ratio": 0.0, "completion_length": 108.70833587646484, "epoch": 5.475548060708263, "grad_norm": 10.57340206050586, "kl": 1.16796875, "learning_rate": 5.436373873873874e-07, "loss": 0.0012, "reward": 3.784354567527771, "reward_std": 0.05950320092961192, "rewards/final_reward": 1.6432683415503035, "rewards/mask_iou_reward": 0.8216341707751518, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.784354567527771, "rewards/thk_ans_format_reward": 1.0, "step": 1621, "think_completion_length": 7.625 }, { "clip_ratio": 0.0, "completion_length": 128.34375381469727, "epoch": 5.4789207419898815, "grad_norm": 14.473689193550852, "kl": 0.423828125, "learning_rate": 5.433558558558559e-07, "loss": 0.0004, "reward": 3.405122399330139, "reward_std": 0.11872344464063644, "rewards/final_reward": 1.1827626419877142, "rewards/mask_iou_reward": 0.5913813209938571, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4051221013069153, "rewards/thk_ans_format_reward": 1.0, "step": 1622, "think_completion_length": 7.666666666666667 }, { "clip_ratio": 0.0, "completion_length": 176.8645896911621, "epoch": 5.482293423271501, "grad_norm": 43.06217139028088, "kl": 0.3828125, "learning_rate": 5.430743243243243e-07, "loss": 0.0004, "reward": 3.340269446372986, "reward_std": 0.20309398137032986, "rewards/final_reward": 1.2307757543990752, "rewards/mask_iou_reward": 0.6153878771995376, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 1.3506861925125122, "rewards/thk_ans_format_reward": 1.0, "step": 1623, "think_completion_length": 6.833333333333333 }, { "clip_ratio": 0.0, "completion_length": 150.36459350585938, "epoch": 5.48566610455312, "grad_norm": 7.01983725004176, "kl": 0.451171875, "learning_rate": 5.427927927927928e-07, "loss": 0.0005, "reward": 3.6813403367996216, "reward_std": 0.04475306533277035, "rewards/final_reward": 1.8279500199920133, "rewards/mask_iou_reward": 0.9139750099960067, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6813403367996216, "rewards/thk_ans_format_reward": 1.0, "step": 1624, "think_completion_length": 7.875 }, { "clip_ratio": 0.0, "completion_length": 131.78125, "epoch": 5.4890387858347385, "grad_norm": 10.716033971911102, "kl": 0.85546875, "learning_rate": 5.425112612612613e-07, "loss": 0.0008, "reward": 3.4271167516708374, "reward_std": 0.12328441441059113, "rewards/final_reward": 1.671780505812558, "rewards/mask_iou_reward": 0.835890252906279, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4271168112754822, "rewards/thk_ans_format_reward": 1.0, "step": 1625, "think_completion_length": 8.208333333333332 }, { "clip_ratio": 0.0, "completion_length": 120.84375, "epoch": 5.492411467116358, "grad_norm": 18.256151332877142, "kl": 0.490234375, "learning_rate": 5.422297297297297e-07, "loss": 0.0005, "reward": 3.758134603500366, "reward_std": 0.026996027678251266, "rewards/final_reward": 1.5473002885407978, "rewards/mask_iou_reward": 0.7736501442703989, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7581345438957214, "rewards/thk_ans_format_reward": 1.0, "step": 1626, "think_completion_length": 8.166666666666668 }, { "clip_ratio": 0.0, "completion_length": 119.66666793823242, "epoch": 5.495784148397976, "grad_norm": 14.495247198790699, "kl": 0.3984375, "learning_rate": 5.419481981981982e-07, "loss": 0.0004, "reward": 3.59829044342041, "reward_std": 0.05195681378245354, "rewards/final_reward": 1.8786183719312315, "rewards/mask_iou_reward": 0.9393091859656157, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5982903838157654, "rewards/thk_ans_format_reward": 1.0, "step": 1627, "think_completion_length": 8.25 }, { "clip_ratio": 0.0, "completion_length": 121.36458587646484, "epoch": 5.499156829679595, "grad_norm": 9.939043586294236, "kl": 0.443359375, "learning_rate": 5.416666666666666e-07, "loss": 0.0005, "reward": 3.37367844581604, "reward_std": 0.032843963243067265, "rewards/final_reward": 1.2515881682879417, "rewards/mask_iou_reward": 0.6257940841439709, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3736785650253296, "rewards/thk_ans_format_reward": 1.0, "step": 1628, "think_completion_length": 8.583333333333334 }, { "clip_ratio": 0.0, "completion_length": 132.95833587646484, "epoch": 5.502529510961214, "grad_norm": 9.954367021666135, "kl": 0.3984375, "learning_rate": 5.413851351351351e-07, "loss": 0.0004, "reward": 3.2100558280944824, "reward_std": 0.02408889401704073, "rewards/final_reward": 1.5521448941685216, "rewards/mask_iou_reward": 0.7760724470842608, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2100557088851929, "rewards/thk_ans_format_reward": 1.0, "step": 1629, "think_completion_length": 8.125 }, { "clip_ratio": 0.0, "completion_length": 131.7916717529297, "epoch": 5.505902192242833, "grad_norm": 25.744694096144258, "kl": 0.435546875, "learning_rate": 5.411036036036037e-07, "loss": 0.0004, "reward": 3.306903600692749, "reward_std": 0.1038884948939085, "rewards/final_reward": 0.9442666685268137, "rewards/mask_iou_reward": 0.47213333426340687, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3069035410881042, "rewards/thk_ans_format_reward": 1.0, "step": 1630, "think_completion_length": 7.041666666666666 }, { "clip_ratio": 0.0, "completion_length": 122.87500381469727, "epoch": 5.509274873524452, "grad_norm": 114.37127200407646, "kl": 0.52734375, "learning_rate": 5.408220720720721e-07, "loss": 0.0005, "reward": 3.312256336212158, "reward_std": 0.0768951065838337, "rewards/final_reward": 0.6544148646849897, "rewards/mask_iou_reward": 0.32720743234249483, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3122565150260925, "rewards/thk_ans_format_reward": 1.0, "step": 1631, "think_completion_length": 7.375 }, { "clip_ratio": 0.0, "completion_length": 147.23958587646484, "epoch": 5.512647554806071, "grad_norm": 17.052226307928535, "kl": 0.4052734375, "learning_rate": 5.405405405405406e-07, "loss": 0.0004, "reward": 3.6516486406326294, "reward_std": 0.06326550245285034, "rewards/final_reward": 1.8457776060042868, "rewards/mask_iou_reward": 0.9228888030021434, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6516485810279846, "rewards/thk_ans_format_reward": 1.0, "step": 1632, "think_completion_length": 7.166666666666667 }, { "clip_ratio": 0.0, "completion_length": 121.93750381469727, "epoch": 5.51602023608769, "grad_norm": 14.431876285263879, "kl": 0.4482421875, "learning_rate": 5.40259009009009e-07, "loss": 0.0005, "reward": 3.2408528327941895, "reward_std": 0.043673787266016006, "rewards/final_reward": 0.9847618306461534, "rewards/mask_iou_reward": 0.4923809153230767, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2408528923988342, "rewards/thk_ans_format_reward": 1.0, "step": 1633, "think_completion_length": 8.75 }, { "clip_ratio": 0.0, "completion_length": 255.55209350585938, "epoch": 5.519392917369308, "grad_norm": 17.26001494408382, "kl": 0.4228515625, "learning_rate": 5.399774774774775e-07, "loss": 0.0004, "reward": 3.347735643386841, "reward_std": 0.3271710202097893, "rewards/final_reward": 1.4704092045759678, "rewards/mask_iou_reward": 0.7352046022879839, "rewards/sam_format_reward": 0.9270833432674408, "rewards/sam_reward_func_ultra": 1.493568778038025, "rewards/thk_ans_format_reward": 0.9270833432674408, "step": 1634, "think_completion_length": 9.25 }, { "clip_ratio": 0.0, "completion_length": 121.32291793823242, "epoch": 5.522765598650928, "grad_norm": 8.97053405491925, "kl": 0.45703125, "learning_rate": 5.39695945945946e-07, "loss": 0.0005, "reward": 3.441559910774231, "reward_std": 0.18121246993541718, "rewards/final_reward": 1.500605036354436, "rewards/mask_iou_reward": 0.750302518177218, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4415597915649414, "rewards/thk_ans_format_reward": 1.0, "step": 1635, "think_completion_length": 8.166666666666666 }, { "clip_ratio": 0.0, "completion_length": 177.84375, "epoch": 5.526138279932546, "grad_norm": 7.933652110506884, "kl": 0.4609375, "learning_rate": 5.394144144144144e-07, "loss": 0.0005, "reward": 3.5340306758880615, "reward_std": 0.07737600617110729, "rewards/final_reward": 1.5301011994802551, "rewards/mask_iou_reward": 0.7650505997401276, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5340306758880615, "rewards/thk_ans_format_reward": 1.0, "step": 1636, "think_completion_length": 9.458333333333334 }, { "clip_ratio": 0.0, "completion_length": 117.82291793823242, "epoch": 5.529510961214165, "grad_norm": 21.625197898356575, "kl": 0.53125, "learning_rate": 5.391328828828829e-07, "loss": 0.0005, "reward": 3.2599620819091797, "reward_std": 0.1290854811668396, "rewards/final_reward": 0.9122858420071565, "rewards/mask_iou_reward": 0.45614292100357823, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2599619626998901, "rewards/thk_ans_format_reward": 1.0, "step": 1637, "think_completion_length": 8.208333333333332 }, { "clip_ratio": 0.0, "completion_length": 120.50000381469727, "epoch": 5.532883642495785, "grad_norm": 14.189035764609386, "kl": 0.439453125, "learning_rate": 5.388513513513512e-07, "loss": 0.0004, "reward": 3.785208821296692, "reward_std": 0.10167535580694675, "rewards/final_reward": 1.5567690849466458, "rewards/mask_iou_reward": 0.7783845424733229, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7852087020874023, "rewards/thk_ans_format_reward": 1.0, "step": 1638, "think_completion_length": 8.625 }, { "clip_ratio": 0.0, "completion_length": 128.25000762939453, "epoch": 5.536256323777403, "grad_norm": 17.561204439467147, "kl": 0.4267578125, "learning_rate": 5.385698198198197e-07, "loss": 0.0004, "reward": 3.3584929704666138, "reward_std": 0.15081000700592995, "rewards/final_reward": 1.350205189405206, "rewards/mask_iou_reward": 0.675102594702603, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.35849267244339, "rewards/thk_ans_format_reward": 1.0, "step": 1639, "think_completion_length": 7.25 }, { "clip_ratio": 0.0, "completion_length": 117.45833587646484, "epoch": 5.539629005059022, "grad_norm": 13.441524313971499, "kl": 0.5224609375, "learning_rate": 5.382882882882883e-07, "loss": 0.0005, "reward": 3.2099350690841675, "reward_std": 0.12646333128213882, "rewards/final_reward": 1.0815409952337722, "rewards/mask_iou_reward": 0.5407704976168861, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2099350094795227, "rewards/thk_ans_format_reward": 1.0, "step": 1640, "think_completion_length": 8.375 }, { "clip_ratio": 0.0, "completion_length": 131.7395896911621, "epoch": 5.543001686340641, "grad_norm": 11.448017304941155, "kl": 0.4208984375, "learning_rate": 5.380067567567567e-07, "loss": 0.0004, "reward": 3.6662003993988037, "reward_std": 0.025301916524767876, "rewards/final_reward": 1.728385868170287, "rewards/mask_iou_reward": 0.8641929340851435, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6662004590034485, "rewards/thk_ans_format_reward": 1.0, "step": 1641, "think_completion_length": 8.666666666666666 }, { "clip_ratio": 0.0, "completion_length": 135.83333587646484, "epoch": 5.54637436762226, "grad_norm": 11.684922782454798, "kl": 0.5966796875, "learning_rate": 5.377252252252252e-07, "loss": 0.0006, "reward": 3.4820899963378906, "reward_std": 0.05957669019699097, "rewards/final_reward": 1.1023617770398222, "rewards/mask_iou_reward": 0.5511808885199111, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4820902347564697, "rewards/thk_ans_format_reward": 1.0, "step": 1642, "think_completion_length": 7.583333333333334 }, { "clip_ratio": 0.0, "completion_length": 107.87500381469727, "epoch": 5.549747048903878, "grad_norm": 10.7985345793467, "kl": 0.541015625, "learning_rate": 5.374436936936936e-07, "loss": 0.0006, "reward": 3.5354676246643066, "reward_std": 0.09456159453839064, "rewards/final_reward": 1.2728535625444786, "rewards/mask_iou_reward": 0.6364267812722393, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.535467505455017, "rewards/thk_ans_format_reward": 1.0, "step": 1643, "think_completion_length": 8.75 }, { "clip_ratio": 0.0, "completion_length": 156.7291717529297, "epoch": 5.5531197301854975, "grad_norm": 16.472570929374434, "kl": 0.400390625, "learning_rate": 5.371621621621621e-07, "loss": 0.0004, "reward": 3.267836332321167, "reward_std": 0.10154848545789719, "rewards/final_reward": 0.9260682476262471, "rewards/mask_iou_reward": 0.46303412381312353, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.267836332321167, "rewards/thk_ans_format_reward": 1.0, "step": 1644, "think_completion_length": 8.833333333333334 }, { "clip_ratio": 0.0, "completion_length": 118.62500381469727, "epoch": 5.556492411467117, "grad_norm": 11.550943423319882, "kl": 0.662109375, "learning_rate": 5.368806306306306e-07, "loss": 0.0007, "reward": 3.6972213983535767, "reward_std": 0.05988108739256859, "rewards/final_reward": 1.817311461596225, "rewards/mask_iou_reward": 0.9086557307981125, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6972213387489319, "rewards/thk_ans_format_reward": 1.0, "step": 1645, "think_completion_length": 8.541666666666668 }, { "clip_ratio": 0.0, "completion_length": 135.55208587646484, "epoch": 5.559865092748735, "grad_norm": 12.56189111590616, "kl": 0.416015625, "learning_rate": 5.36599099099099e-07, "loss": 0.0005, "reward": 3.3453147411346436, "reward_std": 0.09329931810498238, "rewards/final_reward": 1.5583423431680927, "rewards/mask_iou_reward": 0.7791711715840464, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3453145623207092, "rewards/thk_ans_format_reward": 1.0, "step": 1646, "think_completion_length": 9.541666666666668 }, { "clip_ratio": 0.0, "completion_length": 138.4166717529297, "epoch": 5.5632377740303545, "grad_norm": 22.44312055578235, "kl": 0.4697265625, "learning_rate": 5.363175675675675e-07, "loss": 0.0005, "reward": 3.079059362411499, "reward_std": 0.08573806285858154, "rewards/final_reward": 0.784900331719663, "rewards/mask_iou_reward": 0.3924501658598315, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0790594518184662, "rewards/thk_ans_format_reward": 1.0, "step": 1647, "think_completion_length": 9.0 }, { "clip_ratio": 0.0, "completion_length": 128.1041717529297, "epoch": 5.566610455311973, "grad_norm": 18.880687882286963, "kl": 0.5234375, "learning_rate": 5.36036036036036e-07, "loss": 0.0005, "reward": 3.4205507040023804, "reward_std": 0.13985726051032543, "rewards/final_reward": 1.363954579065279, "rewards/mask_iou_reward": 0.6819772895326395, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4205505847930908, "rewards/thk_ans_format_reward": 1.0, "step": 1648, "think_completion_length": 9.916666666666666 }, { "clip_ratio": 0.0, "completion_length": 132.6354217529297, "epoch": 5.569983136593592, "grad_norm": 32.78135655581178, "kl": 0.4423828125, "learning_rate": 5.357545045045044e-07, "loss": 0.0004, "reward": 3.25208842754364, "reward_std": 0.05463777109980583, "rewards/final_reward": 0.8885717828254704, "rewards/mask_iou_reward": 0.4442858914127352, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2520886063575745, "rewards/thk_ans_format_reward": 1.0, "step": 1649, "think_completion_length": 8.208333333333332 }, { "clip_ratio": 0.0, "completion_length": 116.9375, "epoch": 5.5733558178752105, "grad_norm": 12.839953502051491, "kl": 0.80078125, "learning_rate": 5.35472972972973e-07, "loss": 0.0008, "reward": 3.2596405744552612, "reward_std": 0.0540030263364315, "rewards/final_reward": 1.8060929050230472, "rewards/mask_iou_reward": 0.9030464525115236, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2596407532691956, "rewards/thk_ans_format_reward": 1.0, "step": 1650, "think_completion_length": 8.458333333333334 }, { "clip_ratio": 0.0, "completion_length": 112.53125381469727, "epoch": 5.57672849915683, "grad_norm": 11.968290135687798, "kl": 0.5263671875, "learning_rate": 5.351914414414414e-07, "loss": 0.0005, "reward": 3.6679221391677856, "reward_std": 0.041471182368695736, "rewards/final_reward": 1.8406348878109202, "rewards/mask_iou_reward": 0.9203174439054601, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.667921781539917, "rewards/thk_ans_format_reward": 1.0, "step": 1651, "think_completion_length": 9.041666666666668 }, { "clip_ratio": 0.0, "completion_length": 106.96875381469727, "epoch": 5.580101180438449, "grad_norm": 42.447179584452854, "kl": 0.4755859375, "learning_rate": 5.349099099099099e-07, "loss": 0.0005, "reward": 3.6466050148010254, "reward_std": 0.059364247135818005, "rewards/final_reward": 1.5773073296975162, "rewards/mask_iou_reward": 0.7886536648487581, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6466050148010254, "rewards/thk_ans_format_reward": 1.0, "step": 1652, "think_completion_length": 9.375 }, { "clip_ratio": 0.0, "completion_length": 125.63542175292969, "epoch": 5.583473861720067, "grad_norm": 40.10948190147107, "kl": 0.44921875, "learning_rate": 5.346283783783784e-07, "loss": 0.0004, "reward": 3.4712870121002197, "reward_std": 0.08134759962558746, "rewards/final_reward": 1.6431637920767832, "rewards/mask_iou_reward": 0.8215818960383916, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4712870717048645, "rewards/thk_ans_format_reward": 1.0, "step": 1653, "think_completion_length": 8.625 }, { "clip_ratio": 0.0, "completion_length": 115.86458587646484, "epoch": 5.586846543001687, "grad_norm": 13.079819261870021, "kl": 0.44921875, "learning_rate": 5.343468468468468e-07, "loss": 0.0004, "reward": 3.5946396589279175, "reward_std": 0.13511600345373154, "rewards/final_reward": 1.3123857157778915, "rewards/mask_iou_reward": 0.6561928578889458, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 1.605056345462799, "rewards/thk_ans_format_reward": 1.0, "step": 1654, "think_completion_length": 8.083333333333332 }, { "clip_ratio": 0.0, "completion_length": 128.9270896911621, "epoch": 5.590219224283305, "grad_norm": 12.470984977069678, "kl": 2.4609375, "learning_rate": 5.340653153153153e-07, "loss": 0.0025, "reward": 3.556732416152954, "reward_std": 0.1202475274913013, "rewards/final_reward": 1.564926728070922, "rewards/mask_iou_reward": 0.782463364035461, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.556732177734375, "rewards/thk_ans_format_reward": 1.0, "step": 1655, "think_completion_length": 8.916666666666668 }, { "clip_ratio": 0.0, "completion_length": 125.60416793823242, "epoch": 5.593591905564924, "grad_norm": 241.62352953992772, "kl": 0.533203125, "learning_rate": 5.337837837837837e-07, "loss": 0.0005, "reward": 3.4673640727996826, "reward_std": 0.05588648747652769, "rewards/final_reward": 1.3350874708452558, "rewards/mask_iou_reward": 0.6675437354226279, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4673641324043274, "rewards/thk_ans_format_reward": 1.0, "step": 1656, "think_completion_length": 8.958333333333332 }, { "clip_ratio": 0.0, "completion_length": 118.71875381469727, "epoch": 5.596964586846543, "grad_norm": 8.440196304786275, "kl": 0.435546875, "learning_rate": 5.335022522522522e-07, "loss": 0.0004, "reward": 3.678599715232849, "reward_std": 0.09616643656045198, "rewards/final_reward": 1.2519775876713632, "rewards/mask_iou_reward": 0.6259887938356816, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6785999536514282, "rewards/thk_ans_format_reward": 1.0, "step": 1657, "think_completion_length": 8.625 }, { "clip_ratio": 0.0, "completion_length": 124.79167175292969, "epoch": 5.600337268128162, "grad_norm": 20.773866582057902, "kl": 0.41015625, "learning_rate": 5.332207207207207e-07, "loss": 0.0004, "reward": 3.4142476320266724, "reward_std": 0.12623858451843262, "rewards/final_reward": 1.590514677557561, "rewards/mask_iou_reward": 0.7952573387787805, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 1.4246641993522644, "rewards/thk_ans_format_reward": 1.0, "step": 1658, "think_completion_length": 8.625 }, { "clip_ratio": 0.0, "completion_length": 121.35416793823242, "epoch": 5.60370994940978, "grad_norm": 13.944861340918969, "kl": 0.482421875, "learning_rate": 5.329391891891891e-07, "loss": 0.0005, "reward": 3.3524316549301147, "reward_std": 0.0691116601228714, "rewards/final_reward": 1.4367779985159759, "rewards/mask_iou_reward": 0.7183889992579879, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.35243159532547, "rewards/thk_ans_format_reward": 1.0, "step": 1659, "think_completion_length": 6.958333333333334 }, { "clip_ratio": 0.0, "completion_length": 122.57292175292969, "epoch": 5.6070826306914, "grad_norm": 21.92530932871542, "kl": 0.50390625, "learning_rate": 5.326576576576577e-07, "loss": 0.0005, "reward": 3.5524903535842896, "reward_std": 0.1243749912828207, "rewards/final_reward": 1.56479281877437, "rewards/mask_iou_reward": 0.782396409387185, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.552490234375, "rewards/thk_ans_format_reward": 1.0, "step": 1660, "think_completion_length": 8.291666666666666 }, { "clip_ratio": 0.0, "completion_length": 137.64583587646484, "epoch": 5.610455311973018, "grad_norm": 32.03033808136419, "kl": 0.474609375, "learning_rate": 5.323761261261262e-07, "loss": 0.0005, "reward": 3.106863260269165, "reward_std": 0.08488507196307182, "rewards/final_reward": 1.1713338661055392, "rewards/mask_iou_reward": 0.5856669330527696, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1068630814552307, "rewards/thk_ans_format_reward": 1.0, "step": 1661, "think_completion_length": 8.375 }, { "clip_ratio": 0.0, "completion_length": 196.26042938232422, "epoch": 5.613827993254637, "grad_norm": 11.539811281420858, "kl": 0.494140625, "learning_rate": 5.320945945945946e-07, "loss": 0.0005, "reward": 3.4144234657287598, "reward_std": 0.1457614228129387, "rewards/final_reward": 1.3577875336470997, "rewards/mask_iou_reward": 0.6788937668235498, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 1.4248398542404175, "rewards/thk_ans_format_reward": 1.0, "step": 1662, "think_completion_length": 8.333333333333332 }, { "clip_ratio": 0.0, "completion_length": 125.53125, "epoch": 5.617200674536257, "grad_norm": 8.60480920359381, "kl": 0.4521484375, "learning_rate": 5.318130630630631e-07, "loss": 0.0005, "reward": 3.43615460395813, "reward_std": 0.09291991218924522, "rewards/final_reward": 1.0773759939667544, "rewards/mask_iou_reward": 0.5386879969833772, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4361546635627747, "rewards/thk_ans_format_reward": 1.0, "step": 1663, "think_completion_length": 8.791666666666666 }, { "clip_ratio": 0.0, "completion_length": 134.45833587646484, "epoch": 5.620573355817875, "grad_norm": 12.209485569856556, "kl": 0.6083984375, "learning_rate": 5.315315315315315e-07, "loss": 0.0006, "reward": 3.3641871213912964, "reward_std": 0.062420524656772614, "rewards/final_reward": 0.591343764088626, "rewards/mask_iou_reward": 0.295671882044313, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.364187240600586, "rewards/thk_ans_format_reward": 1.0, "step": 1664, "think_completion_length": 8.541666666666668 }, { "clip_ratio": 0.0, "completion_length": 124.42708969116211, "epoch": 5.623946037099494, "grad_norm": 21.161549557227392, "kl": 0.41796875, "learning_rate": 5.3125e-07, "loss": 0.0004, "reward": 3.2564741373062134, "reward_std": 0.10630857944488525, "rewards/final_reward": 1.307717137972623, "rewards/mask_iou_reward": 0.6538585689863115, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2564740180969238, "rewards/thk_ans_format_reward": 1.0, "step": 1665, "think_completion_length": 8.375 }, { "clip_ratio": 0.0, "completion_length": 167.64583587646484, "epoch": 5.627318718381113, "grad_norm": 7.549720713456712, "kl": 0.375, "learning_rate": 5.309684684684685e-07, "loss": 0.0004, "reward": 3.6098986864089966, "reward_std": 0.08896861225366592, "rewards/final_reward": 1.8692268517240347, "rewards/mask_iou_reward": 0.9346134258620173, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.609898328781128, "rewards/thk_ans_format_reward": 1.0, "step": 1666, "think_completion_length": 8.5 }, { "clip_ratio": 0.0, "completion_length": 118.59375381469727, "epoch": 5.630691399662732, "grad_norm": 6.874837411521086, "kl": 0.572265625, "learning_rate": 5.306869369369369e-07, "loss": 0.0006, "reward": 3.6495296955108643, "reward_std": 0.020325029268860817, "rewards/final_reward": 1.8610696951488621, "rewards/mask_iou_reward": 0.9305348475744311, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6495293974876404, "rewards/thk_ans_format_reward": 1.0, "step": 1667, "think_completion_length": 8.333333333333332 }, { "clip_ratio": 0.0, "completion_length": 116.95833587646484, "epoch": 5.63406408094435, "grad_norm": 15.487206766913037, "kl": 0.4453125, "learning_rate": 5.304054054054054e-07, "loss": 0.0004, "reward": 3.2291752099990845, "reward_std": 0.0427558608353138, "rewards/final_reward": 1.6678412198822345, "rewards/mask_iou_reward": 0.8339206099411173, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.229175090789795, "rewards/thk_ans_format_reward": 1.0, "step": 1668, "think_completion_length": 8.583333333333332 }, { "clip_ratio": 0.0, "completion_length": 117.07292175292969, "epoch": 5.63743676222597, "grad_norm": 11.987338469465488, "kl": 0.490234375, "learning_rate": 5.301238738738738e-07, "loss": 0.0005, "reward": 3.36657178401947, "reward_std": 0.06892849691212177, "rewards/final_reward": 1.483850377035424, "rewards/mask_iou_reward": 0.741925188517712, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3665716052055359, "rewards/thk_ans_format_reward": 1.0, "step": 1669, "think_completion_length": 9.166666666666668 }, { "clip_ratio": 0.0, "completion_length": 131.81250762939453, "epoch": 5.640809443507589, "grad_norm": 8.52808794702646, "kl": 0.4130859375, "learning_rate": 5.298423423423423e-07, "loss": 0.0004, "reward": 3.2042828798294067, "reward_std": 0.04842074401676655, "rewards/final_reward": 1.0179814834202083, "rewards/mask_iou_reward": 0.5089907417101042, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2042827904224396, "rewards/thk_ans_format_reward": 1.0, "step": 1670, "think_completion_length": 8.458333333333334 }, { "clip_ratio": 0.0, "completion_length": 125.20833587646484, "epoch": 5.644182124789207, "grad_norm": 11.609123162744043, "kl": 0.49609375, "learning_rate": 5.295608108108109e-07, "loss": 0.0005, "reward": 3.619337797164917, "reward_std": 0.10815814649686217, "rewards/final_reward": 1.6707233426072208, "rewards/mask_iou_reward": 0.8353616713036104, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.619337797164917, "rewards/thk_ans_format_reward": 1.0, "step": 1671, "think_completion_length": 8.875 }, { "clip_ratio": 0.0, "completion_length": 145.75000381469727, "epoch": 5.6475548060708265, "grad_norm": 9.05456878348143, "kl": 0.466796875, "learning_rate": 5.292792792792793e-07, "loss": 0.0005, "reward": 3.596415400505066, "reward_std": 0.04145562183111906, "rewards/final_reward": 1.4119979903838544, "rewards/mask_iou_reward": 0.7059989951919272, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5964152216911316, "rewards/thk_ans_format_reward": 1.0, "step": 1672, "think_completion_length": 9.125 }, { "clip_ratio": 0.0, "completion_length": 119.36458587646484, "epoch": 5.650927487352445, "grad_norm": 7.830828364844946, "kl": 0.4765625, "learning_rate": 5.289977477477478e-07, "loss": 0.0005, "reward": 3.470069169998169, "reward_std": 0.07297424972057343, "rewards/final_reward": 1.0570247474625651, "rewards/mask_iou_reward": 0.5285123737312826, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4700689315795898, "rewards/thk_ans_format_reward": 1.0, "step": 1673, "think_completion_length": 8.791666666666668 }, { "clip_ratio": 0.0, "completion_length": 135.60416793823242, "epoch": 5.654300168634064, "grad_norm": 8.478112364771983, "kl": 0.638671875, "learning_rate": 5.287162162162162e-07, "loss": 0.0006, "reward": 3.4935306310653687, "reward_std": 0.07750003226101398, "rewards/final_reward": 1.1903071669456566, "rewards/mask_iou_reward": 0.5951535834728283, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.493530511856079, "rewards/thk_ans_format_reward": 1.0, "step": 1674, "think_completion_length": 8.083333333333334 }, { "clip_ratio": 0.0, "completion_length": 118.86458587646484, "epoch": 5.6576728499156825, "grad_norm": 13.125329872620233, "kl": 0.884765625, "learning_rate": 5.284346846846847e-07, "loss": 0.0009, "reward": 3.6382278203964233, "reward_std": 0.1006748378276825, "rewards/final_reward": 1.789235596962822, "rewards/mask_iou_reward": 0.894617798481411, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6382275819778442, "rewards/thk_ans_format_reward": 1.0, "step": 1675, "think_completion_length": 10.75 }, { "clip_ratio": 0.0, "completion_length": 127.77083969116211, "epoch": 5.661045531197302, "grad_norm": 107.66558914496177, "kl": 0.45703125, "learning_rate": 5.281531531531532e-07, "loss": 0.0005, "reward": 3.3801087141036987, "reward_std": 0.10969089716672897, "rewards/final_reward": 1.0466964469001674, "rewards/mask_iou_reward": 0.5233482234500837, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3801087141036987, "rewards/thk_ans_format_reward": 1.0, "step": 1676, "think_completion_length": 8.833333333333334 }, { "clip_ratio": 0.0, "completion_length": 125.37500381469727, "epoch": 5.664418212478921, "grad_norm": 10.73940996989274, "kl": 0.4755859375, "learning_rate": 5.278716216216215e-07, "loss": 0.0005, "reward": 3.7271050214767456, "reward_std": 0.08030565828084946, "rewards/final_reward": 1.5374772492864222, "rewards/mask_iou_reward": 0.7687386246432111, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7271050214767456, "rewards/thk_ans_format_reward": 1.0, "step": 1677, "think_completion_length": 9.166666666666668 }, { "clip_ratio": 0.0, "completion_length": 120.1875, "epoch": 5.6677908937605395, "grad_norm": 6.539633603292251, "kl": 0.4658203125, "learning_rate": 5.2759009009009e-07, "loss": 0.0005, "reward": 3.651551127433777, "reward_std": 0.06802648678421974, "rewards/final_reward": 1.3033550017231739, "rewards/mask_iou_reward": 0.6516775008615869, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6515511274337769, "rewards/thk_ans_format_reward": 1.0, "step": 1678, "think_completion_length": 8.0 }, { "clip_ratio": 0.0, "completion_length": 129.50000381469727, "epoch": 5.671163575042159, "grad_norm": 8.418325679063077, "kl": 0.4365234375, "learning_rate": 5.273085585585584e-07, "loss": 0.0004, "reward": 3.2904953956604004, "reward_std": 0.11499625258147717, "rewards/final_reward": 1.4031736868913716, "rewards/mask_iou_reward": 0.7015868434456858, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2904953360557556, "rewards/thk_ans_format_reward": 1.0, "step": 1679, "think_completion_length": 9.916666666666668 }, { "clip_ratio": 0.0, "completion_length": 145.84375, "epoch": 5.674536256323777, "grad_norm": 23.618377937796875, "kl": 0.576171875, "learning_rate": 5.270270270270269e-07, "loss": 0.0006, "reward": 3.468711018562317, "reward_std": 0.18310219049453735, "rewards/final_reward": 1.1853710656067125, "rewards/mask_iou_reward": 0.5926855328033562, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4687109589576721, "rewards/thk_ans_format_reward": 1.0, "step": 1680, "think_completion_length": 8.083333333333334 }, { "clip_ratio": 0.0, "completion_length": 163.11458587646484, "epoch": 5.677908937605396, "grad_norm": 7.811614773663714, "kl": 0.474609375, "learning_rate": 5.267454954954955e-07, "loss": 0.0005, "reward": 3.676987886428833, "reward_std": 0.07566726952791214, "rewards/final_reward": 1.582874192705009, "rewards/mask_iou_reward": 0.7914370963525045, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.676987886428833, "rewards/thk_ans_format_reward": 1.0, "step": 1681, "think_completion_length": 8.166666666666666 }, { "clip_ratio": 0.0, "completion_length": 160.19791793823242, "epoch": 5.681281618887015, "grad_norm": 9.22818287374541, "kl": 0.4150390625, "learning_rate": 5.264639639639639e-07, "loss": 0.0004, "reward": 3.2872849702835083, "reward_std": 0.1653279960155487, "rewards/final_reward": 1.6721876660408017, "rewards/mask_iou_reward": 0.8360938330204009, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2872849702835083, "rewards/thk_ans_format_reward": 1.0, "step": 1682, "think_completion_length": 8.666666666666668 }, { "clip_ratio": 0.0, "completion_length": 160.2291717529297, "epoch": 5.684654300168634, "grad_norm": 13.911973370238616, "kl": 0.4345703125, "learning_rate": 5.261824324324324e-07, "loss": 0.0004, "reward": 3.442698359489441, "reward_std": 0.07027308642864227, "rewards/final_reward": 1.656888151695507, "rewards/mask_iou_reward": 0.8284440758477535, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.442698359489441, "rewards/thk_ans_format_reward": 1.0, "step": 1683, "think_completion_length": 9.125 }, { "clip_ratio": 0.0, "completion_length": 151.83333587646484, "epoch": 5.688026981450253, "grad_norm": 9.683213301253808, "kl": 0.435546875, "learning_rate": 5.259009009009009e-07, "loss": 0.0004, "reward": 3.4510587453842163, "reward_std": 0.12935582548379898, "rewards/final_reward": 1.524409884196702, "rewards/mask_iou_reward": 0.762204942098351, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4510589241981506, "rewards/thk_ans_format_reward": 1.0, "step": 1684, "think_completion_length": 7.583333333333334 }, { "clip_ratio": 0.0, "completion_length": 126.58333587646484, "epoch": 5.691399662731872, "grad_norm": 9.08676964608566, "kl": 0.5107421875, "learning_rate": 5.256193693693693e-07, "loss": 0.0005, "reward": 2.989185094833374, "reward_std": 0.08790277317166328, "rewards/final_reward": 1.412420054225461, "rewards/mask_iou_reward": 0.7062100271127305, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9891849160194397, "rewards/thk_ans_format_reward": 1.0, "step": 1685, "think_completion_length": 7.458333333333334 }, { "clip_ratio": 0.0, "completion_length": 138.0416717529297, "epoch": 5.694772344013491, "grad_norm": 15.651366383255017, "kl": 0.421875, "learning_rate": 5.253378378378378e-07, "loss": 0.0004, "reward": 3.205936312675476, "reward_std": 0.10460496693849564, "rewards/final_reward": 1.316913765441078, "rewards/mask_iou_reward": 0.658456882720539, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.205936223268509, "rewards/thk_ans_format_reward": 1.0, "step": 1686, "think_completion_length": 9.916666666666668 }, { "clip_ratio": 0.0, "completion_length": 139.11458587646484, "epoch": 5.698145025295109, "grad_norm": 9.987256106162631, "kl": 0.4052734375, "learning_rate": 5.250563063063062e-07, "loss": 0.0004, "reward": 3.3900359869003296, "reward_std": 0.09033327549695969, "rewards/final_reward": 0.7899245332137992, "rewards/mask_iou_reward": 0.3949622666068996, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3900358080863953, "rewards/thk_ans_format_reward": 1.0, "step": 1687, "think_completion_length": 8.666666666666668 }, { "clip_ratio": 0.0, "completion_length": 122.42708587646484, "epoch": 5.701517706576729, "grad_norm": 32.114201494541796, "kl": 0.4638671875, "learning_rate": 5.247747747747747e-07, "loss": 0.0005, "reward": 3.728898286819458, "reward_std": 0.03537928406149149, "rewards/final_reward": 1.4416144211001236, "rewards/mask_iou_reward": 0.7208072105500618, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7288981676101685, "rewards/thk_ans_format_reward": 1.0, "step": 1688, "think_completion_length": 8.833333333333332 }, { "clip_ratio": 0.0, "completion_length": 121.82291793823242, "epoch": 5.704890387858347, "grad_norm": 15.12992837384189, "kl": 0.5078125, "learning_rate": 5.244932432432432e-07, "loss": 0.0005, "reward": 3.466995120048523, "reward_std": 0.11427108105272055, "rewards/final_reward": 1.6574982774473515, "rewards/mask_iou_reward": 0.8287491387236757, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4669951796531677, "rewards/thk_ans_format_reward": 1.0, "step": 1689, "think_completion_length": 10.125 }, { "clip_ratio": 0.0, "completion_length": 130.23958587646484, "epoch": 5.708263069139966, "grad_norm": 15.175398411998135, "kl": 0.4794921875, "learning_rate": 5.242117117117116e-07, "loss": 0.0005, "reward": 3.348542332649231, "reward_std": 0.12049789726734161, "rewards/final_reward": 1.367889439033234, "rewards/mask_iou_reward": 0.683944719516617, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3485422134399414, "rewards/thk_ans_format_reward": 1.0, "step": 1690, "think_completion_length": 7.666666666666666 }, { "clip_ratio": 0.0, "completion_length": 108.98958587646484, "epoch": 5.7116357504215856, "grad_norm": 10.96788286776064, "kl": 0.515625, "learning_rate": 5.239301801801802e-07, "loss": 0.0005, "reward": 3.5892138481140137, "reward_std": 0.10701700299978256, "rewards/final_reward": 1.3613337813454496, "rewards/mask_iou_reward": 0.6806668906727248, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5892136096954346, "rewards/thk_ans_format_reward": 1.0, "step": 1691, "think_completion_length": 8.666666666666668 }, { "clip_ratio": 0.0, "completion_length": 135.25000381469727, "epoch": 5.715008431703204, "grad_norm": 19.324395640798016, "kl": 0.408203125, "learning_rate": 5.236486486486486e-07, "loss": 0.0004, "reward": 3.7077428102493286, "reward_std": 0.036984759382903576, "rewards/final_reward": 1.5949915653025974, "rewards/mask_iou_reward": 0.7974957826512987, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7077427506446838, "rewards/thk_ans_format_reward": 1.0, "step": 1692, "think_completion_length": 8.791666666666668 }, { "clip_ratio": 0.0, "completion_length": 116.94791793823242, "epoch": 5.718381112984823, "grad_norm": 31.703385874009, "kl": 0.544921875, "learning_rate": 5.233671171171171e-07, "loss": 0.0006, "reward": 3.311197876930237, "reward_std": 0.07203967496752739, "rewards/final_reward": 0.9592490981961042, "rewards/mask_iou_reward": 0.4796245490980521, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3111979365348816, "rewards/thk_ans_format_reward": 1.0, "step": 1693, "think_completion_length": 8.541666666666668 }, { "clip_ratio": 0.0, "completion_length": 148.61459350585938, "epoch": 5.721753794266442, "grad_norm": 18.58018766643279, "kl": 0.505859375, "learning_rate": 5.230855855855856e-07, "loss": 0.0005, "reward": 3.0218993425369263, "reward_std": 0.052796896547079086, "rewards/final_reward": 1.4534910951146789, "rewards/mask_iou_reward": 0.7267455475573394, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0218994319438934, "rewards/thk_ans_format_reward": 1.0, "step": 1694, "think_completion_length": 9.208333333333332 }, { "clip_ratio": 0.0, "completion_length": 120.90625381469727, "epoch": 5.725126475548061, "grad_norm": 68.33497978649426, "kl": 0.484375, "learning_rate": 5.22804054054054e-07, "loss": 0.0005, "reward": 3.3459736108779907, "reward_std": 0.13298944756388664, "rewards/final_reward": 1.8743874890546413, "rewards/mask_iou_reward": 0.9371937445273206, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3459736108779907, "rewards/thk_ans_format_reward": 1.0, "step": 1695, "think_completion_length": 9.125 }, { "clip_ratio": 0.0, "completion_length": 130.1979217529297, "epoch": 5.728499156829679, "grad_norm": 18.308639001399047, "kl": 0.4443359375, "learning_rate": 5.225225225225225e-07, "loss": 0.0005, "reward": 3.351117730140686, "reward_std": 0.09872744139283895, "rewards/final_reward": 1.233155712943284, "rewards/mask_iou_reward": 0.616577856471642, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3511175513267517, "rewards/thk_ans_format_reward": 1.0, "step": 1696, "think_completion_length": 7.916666666666667 }, { "clip_ratio": 0.0, "completion_length": 131.0, "epoch": 5.7318718381112985, "grad_norm": 11.213363187180152, "kl": 0.443359375, "learning_rate": 5.22240990990991e-07, "loss": 0.0004, "reward": 3.452345371246338, "reward_std": 0.1489114686846733, "rewards/final_reward": 1.5670607741123705, "rewards/mask_iou_reward": 0.7835303870561853, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4523451924324036, "rewards/thk_ans_format_reward": 1.0, "step": 1697, "think_completion_length": 8.833333333333332 }, { "clip_ratio": 0.0, "completion_length": 149.53125, "epoch": 5.735244519392918, "grad_norm": 14.89224352148172, "kl": 0.3994140625, "learning_rate": 5.219594594594594e-07, "loss": 0.0004, "reward": 3.5091474056243896, "reward_std": 0.1592223308980465, "rewards/final_reward": 1.7831663927632908, "rewards/mask_iou_reward": 0.8915831963816454, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5091473460197449, "rewards/thk_ans_format_reward": 1.0, "step": 1698, "think_completion_length": 7.458333333333334 }, { "clip_ratio": 0.0, "completion_length": 122.80208587646484, "epoch": 5.738617200674536, "grad_norm": 23.0003418459873, "kl": 0.4375, "learning_rate": 5.216779279279279e-07, "loss": 0.0004, "reward": 3.3121293783187866, "reward_std": 0.13283708691596985, "rewards/final_reward": 1.2491971488263538, "rewards/mask_iou_reward": 0.6245985744131769, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3121294975280762, "rewards/thk_ans_format_reward": 1.0, "step": 1699, "think_completion_length": 8.791666666666668 }, { "clip_ratio": 0.0, "completion_length": 121.50000381469727, "epoch": 5.7419898819561555, "grad_norm": 45.87529807068098, "kl": 0.4248046875, "learning_rate": 5.213963963963963e-07, "loss": 0.0004, "reward": 3.6261146068573, "reward_std": 0.08257642015814781, "rewards/final_reward": 1.6618618352805021, "rewards/mask_iou_reward": 0.8309309176402511, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6261144876480103, "rewards/thk_ans_format_reward": 1.0, "step": 1700, "think_completion_length": 8.5 }, { "clip_ratio": 0.0, "completion_length": 127.11458587646484, "epoch": 5.745362563237774, "grad_norm": 18.868106450012085, "kl": 0.6328125, "learning_rate": 5.211148648648649e-07, "loss": 0.0006, "reward": 3.6011475324630737, "reward_std": 0.1154952123761177, "rewards/final_reward": 1.844404480008979, "rewards/mask_iou_reward": 0.9222022400044895, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6011476516723633, "rewards/thk_ans_format_reward": 1.0, "step": 1701, "think_completion_length": 8.583333333333332 }, { "clip_ratio": 0.0, "completion_length": 126.39583587646484, "epoch": 5.748735244519393, "grad_norm": 9.694625454651934, "kl": 0.53515625, "learning_rate": 5.208333333333334e-07, "loss": 0.0005, "reward": 3.4521749019622803, "reward_std": 0.10139688663184643, "rewards/final_reward": 0.945008864001477, "rewards/mask_iou_reward": 0.4725044320007385, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4521750807762146, "rewards/thk_ans_format_reward": 1.0, "step": 1702, "think_completion_length": 7.875 }, { "clip_ratio": 0.0, "completion_length": 129.66666793823242, "epoch": 5.7521079258010115, "grad_norm": 17.253212109876063, "kl": 0.53125, "learning_rate": 5.205518018018018e-07, "loss": 0.0005, "reward": 3.2960422039031982, "reward_std": 0.1437181867659092, "rewards/final_reward": 0.952370236450588, "rewards/mask_iou_reward": 0.476185118225294, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3064589500427246, "rewards/thk_ans_format_reward": 0.9895833432674408, "step": 1703, "think_completion_length": 7.583333333333333 }, { "clip_ratio": 0.0, "completion_length": 137.50000762939453, "epoch": 5.755480607082631, "grad_norm": 17.31363700736517, "kl": 0.423828125, "learning_rate": 5.202702702702703e-07, "loss": 0.0004, "reward": 3.463135838508606, "reward_std": 0.04686561040580273, "rewards/final_reward": 1.8182827151669658, "rewards/mask_iou_reward": 0.9091413575834829, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4631354808807373, "rewards/thk_ans_format_reward": 1.0, "step": 1704, "think_completion_length": 9.291666666666666 }, { "clip_ratio": 0.0, "completion_length": 122.5, "epoch": 5.75885328836425, "grad_norm": 10.915200571014212, "kl": 0.5595703125, "learning_rate": 5.199887387387387e-07, "loss": 0.0006, "reward": 3.3400657176971436, "reward_std": 0.12699758261442184, "rewards/final_reward": 0.9774009059955321, "rewards/mask_iou_reward": 0.48870045299776604, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.340065598487854, "rewards/thk_ans_format_reward": 1.0, "step": 1705, "think_completion_length": 7.875 }, { "clip_ratio": 0.0, "completion_length": 122.79166793823242, "epoch": 5.762225969645868, "grad_norm": 10.499187119948228, "kl": 0.4296875, "learning_rate": 5.197072072072072e-07, "loss": 0.0004, "reward": 3.5650423765182495, "reward_std": 0.18133790232241154, "rewards/final_reward": 1.4786204764762632, "rewards/mask_iou_reward": 0.7393102382381316, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5650423765182495, "rewards/thk_ans_format_reward": 1.0, "step": 1706, "think_completion_length": 8.75 }, { "clip_ratio": 0.0, "completion_length": 136.42708587646484, "epoch": 5.765598650927488, "grad_norm": 19.482838286261735, "kl": 0.5107421875, "learning_rate": 5.194256756756757e-07, "loss": 0.0005, "reward": 3.2994097471237183, "reward_std": 0.10507571697235107, "rewards/final_reward": 0.855652438364876, "rewards/mask_iou_reward": 0.427826219182438, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.299409806728363, "rewards/thk_ans_format_reward": 1.0, "step": 1707, "think_completion_length": 8.333333333333334 }, { "clip_ratio": 0.0, "completion_length": 128.85416793823242, "epoch": 5.768971332209106, "grad_norm": 10.232377939016123, "kl": 0.484375, "learning_rate": 5.191441441441441e-07, "loss": 0.0005, "reward": 3.475585460662842, "reward_std": 0.10696535930037498, "rewards/final_reward": 1.4330362248927848, "rewards/mask_iou_reward": 0.7165181124463924, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.475585401058197, "rewards/thk_ans_format_reward": 1.0, "step": 1708, "think_completion_length": 9.333333333333332 }, { "clip_ratio": 0.0, "completion_length": 134.16666793823242, "epoch": 5.772344013490725, "grad_norm": 8.795494606383011, "kl": 0.4638671875, "learning_rate": 5.188626126126126e-07, "loss": 0.0005, "reward": 3.252189874649048, "reward_std": 0.116634551435709, "rewards/final_reward": 1.8850582728807999, "rewards/mask_iou_reward": 0.9425291364403999, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2521897554397583, "rewards/thk_ans_format_reward": 1.0, "step": 1709, "think_completion_length": 9.208333333333332 }, { "clip_ratio": 0.0, "completion_length": 126.46875, "epoch": 5.775716694772344, "grad_norm": 11.388991934526885, "kl": 0.603515625, "learning_rate": 5.18581081081081e-07, "loss": 0.0006, "reward": 3.1848024129867554, "reward_std": 0.07361265271902084, "rewards/final_reward": 1.122927262485939, "rewards/mask_iou_reward": 0.5614636312429695, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1848024129867554, "rewards/thk_ans_format_reward": 1.0, "step": 1710, "think_completion_length": 8.291666666666668 }, { "clip_ratio": 0.0, "completion_length": 118.0625, "epoch": 5.779089376053963, "grad_norm": 21.639140996155447, "kl": 0.455078125, "learning_rate": 5.182995495495496e-07, "loss": 0.0005, "reward": 3.5467482805252075, "reward_std": 0.08291709423065186, "rewards/final_reward": 1.4941731254870554, "rewards/mask_iou_reward": 0.7470865627435277, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5467482209205627, "rewards/thk_ans_format_reward": 1.0, "step": 1711, "think_completion_length": 7.833333333333334 }, { "clip_ratio": 0.0, "completion_length": 121.73958587646484, "epoch": 5.782462057335582, "grad_norm": 22.735884919407788, "kl": 0.61328125, "learning_rate": 5.180180180180181e-07, "loss": 0.0006, "reward": 2.988006830215454, "reward_std": 0.13545718044042587, "rewards/final_reward": 0.8691919661445637, "rewards/mask_iou_reward": 0.43459598307228187, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9880067706108093, "rewards/thk_ans_format_reward": 1.0, "step": 1712, "think_completion_length": 7.958333333333334 }, { "clip_ratio": 0.0, "completion_length": 188.68750762939453, "epoch": 5.785834738617201, "grad_norm": 7.582875420681268, "kl": 0.4755859375, "learning_rate": 5.177364864864865e-07, "loss": 0.0005, "reward": 3.2828404903411865, "reward_std": 0.07186983339488506, "rewards/final_reward": 1.1207400699644596, "rewards/mask_iou_reward": 0.5603700349822298, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2828404903411865, "rewards/thk_ans_format_reward": 1.0, "step": 1713, "think_completion_length": 7.708333333333334 }, { "clip_ratio": 0.0, "completion_length": 121.79167175292969, "epoch": 5.78920741989882, "grad_norm": 11.276241793600565, "kl": 0.486328125, "learning_rate": 5.17454954954955e-07, "loss": 0.0005, "reward": 3.64591646194458, "reward_std": 0.10193854942917824, "rewards/final_reward": 1.731438612762751, "rewards/mask_iou_reward": 0.8657193063813755, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6459165215492249, "rewards/thk_ans_format_reward": 1.0, "step": 1714, "think_completion_length": 9.291666666666668 }, { "clip_ratio": 0.0, "completion_length": 128.6666717529297, "epoch": 5.792580101180438, "grad_norm": 8.613140831131751, "kl": 0.701171875, "learning_rate": 5.171734234234235e-07, "loss": 0.0007, "reward": 3.4341585636138916, "reward_std": 0.10433509945869446, "rewards/final_reward": 1.3357545861748017, "rewards/mask_iou_reward": 0.6678772930874008, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4341585636138916, "rewards/thk_ans_format_reward": 1.0, "step": 1715, "think_completion_length": 8.666666666666668 }, { "clip_ratio": 0.0, "completion_length": 135.16666793823242, "epoch": 5.795952782462058, "grad_norm": 11.510790702330484, "kl": 0.5400390625, "learning_rate": 5.168918918918919e-07, "loss": 0.0005, "reward": 3.6309818029403687, "reward_std": 0.07807623594999313, "rewards/final_reward": 1.3849225081200518, "rewards/mask_iou_reward": 0.6924612540600259, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6309821009635925, "rewards/thk_ans_format_reward": 1.0, "step": 1716, "think_completion_length": 8.75 }, { "clip_ratio": 0.0, "completion_length": 125.16666793823242, "epoch": 5.799325463743676, "grad_norm": 54.070522534119505, "kl": 0.439453125, "learning_rate": 5.166103603603603e-07, "loss": 0.0004, "reward": 3.3230055570602417, "reward_std": 0.11343681067228317, "rewards/final_reward": 1.5856585419995561, "rewards/mask_iou_reward": 0.7928292709997781, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3230054378509521, "rewards/thk_ans_format_reward": 1.0, "step": 1717, "think_completion_length": 8.666666666666668 }, { "clip_ratio": 0.0, "completion_length": 120.53125, "epoch": 5.802698145025295, "grad_norm": 23.76981236983568, "kl": 0.525390625, "learning_rate": 5.163288288288287e-07, "loss": 0.0005, "reward": 3.3168094158172607, "reward_std": 0.1799912080168724, "rewards/final_reward": 1.5219490770111228, "rewards/mask_iou_reward": 0.7609745385055614, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3168094754219055, "rewards/thk_ans_format_reward": 1.0, "step": 1718, "think_completion_length": 9.041666666666666 }, { "clip_ratio": 0.0, "completion_length": 124.60416793823242, "epoch": 5.806070826306914, "grad_norm": 10.974777498998844, "kl": 0.4794921875, "learning_rate": 5.160472972972972e-07, "loss": 0.0005, "reward": 3.6152318716049194, "reward_std": 0.10453075915575027, "rewards/final_reward": 1.632990182382099, "rewards/mask_iou_reward": 0.8164950911910495, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6152318120002747, "rewards/thk_ans_format_reward": 1.0, "step": 1719, "think_completion_length": 10.75 }, { "clip_ratio": 0.0, "completion_length": 136.50000381469727, "epoch": 5.809443507588533, "grad_norm": 8.54597729635777, "kl": 0.478515625, "learning_rate": 5.157657657657657e-07, "loss": 0.0005, "reward": 3.0575523376464844, "reward_std": 0.040778761729598045, "rewards/final_reward": 0.7077429081727465, "rewards/mask_iou_reward": 0.35387145408637327, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.05755215883255, "rewards/thk_ans_format_reward": 1.0, "step": 1720, "think_completion_length": 8.833333333333334 }, { "clip_ratio": 0.0, "completion_length": 132.5104217529297, "epoch": 5.812816188870151, "grad_norm": 8.132198049021852, "kl": 0.48046875, "learning_rate": 5.154842342342342e-07, "loss": 0.0005, "reward": 3.6106003522872925, "reward_std": 0.08156681805849075, "rewards/final_reward": 1.3410413718043763, "rewards/mask_iou_reward": 0.6705206859021882, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6106004118919373, "rewards/thk_ans_format_reward": 1.0, "step": 1721, "think_completion_length": 10.458333333333334 }, { "clip_ratio": 0.0, "completion_length": 157.6041717529297, "epoch": 5.8161888701517706, "grad_norm": 15.388878973584676, "kl": 0.470703125, "learning_rate": 5.152027027027027e-07, "loss": 0.0005, "reward": 3.5082924365997314, "reward_std": 0.2979699335992336, "rewards/final_reward": 1.8881755293728573, "rewards/mask_iou_reward": 0.9440877646864286, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 1.5291257500648499, "rewards/thk_ans_format_reward": 0.9895833432674408, "step": 1722, "think_completion_length": 9.0 }, { "clip_ratio": 0.0, "completion_length": 118.13542175292969, "epoch": 5.81956155143339, "grad_norm": 98.13463787790987, "kl": 0.572265625, "learning_rate": 5.149211711711711e-07, "loss": 0.0006, "reward": 3.3044140338897705, "reward_std": 0.05266350507736206, "rewards/final_reward": 0.9294198038171804, "rewards/mask_iou_reward": 0.4647099019085902, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3044140934944153, "rewards/thk_ans_format_reward": 1.0, "step": 1723, "think_completion_length": 9.833333333333332 }, { "clip_ratio": 0.0, "completion_length": 153.34375, "epoch": 5.822934232715008, "grad_norm": 17.488677944153867, "kl": 0.49609375, "learning_rate": 5.146396396396396e-07, "loss": 0.0005, "reward": 3.245222806930542, "reward_std": 0.14908801019191742, "rewards/final_reward": 1.455564582508996, "rewards/mask_iou_reward": 0.727782291254498, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2452226877212524, "rewards/thk_ans_format_reward": 1.0, "step": 1724, "think_completion_length": 9.083333333333332 }, { "clip_ratio": 0.0, "completion_length": 154.62500762939453, "epoch": 5.8263069139966275, "grad_norm": 7.789892829358596, "kl": 0.443359375, "learning_rate": 5.143581081081081e-07, "loss": 0.0004, "reward": 3.2473161220550537, "reward_std": 0.12342843785881996, "rewards/final_reward": 1.5933870322320047, "rewards/mask_iou_reward": 0.7966935161160024, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2473162412643433, "rewards/thk_ans_format_reward": 1.0, "step": 1725, "think_completion_length": 8.875 }, { "clip_ratio": 0.0, "completion_length": 148.61458587646484, "epoch": 5.829679595278246, "grad_norm": 13.595597903885679, "kl": 0.462890625, "learning_rate": 5.140765765765765e-07, "loss": 0.0005, "reward": 3.493627429008484, "reward_std": 0.10147467255592346, "rewards/final_reward": 1.446620973518145, "rewards/mask_iou_reward": 0.7233104867590725, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4936274290084839, "rewards/thk_ans_format_reward": 1.0, "step": 1726, "think_completion_length": 9.083333333333332 }, { "clip_ratio": 0.0, "completion_length": 123.94791793823242, "epoch": 5.833052276559865, "grad_norm": 11.792450197444229, "kl": 1.236328125, "learning_rate": 5.13795045045045e-07, "loss": 0.0012, "reward": 3.316351890563965, "reward_std": 0.07703639194369316, "rewards/final_reward": 1.0956512096704052, "rewards/mask_iou_reward": 0.5478256048352026, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3163517713546753, "rewards/thk_ans_format_reward": 1.0, "step": 1727, "think_completion_length": 9.75 }, { "clip_ratio": 0.0, "completion_length": 167.7604217529297, "epoch": 5.8364249578414835, "grad_norm": 8.292767744572648, "kl": 0.3994140625, "learning_rate": 5.135135135135134e-07, "loss": 0.0004, "reward": 3.33501935005188, "reward_std": 0.2470620460808277, "rewards/final_reward": 1.5175090174757648, "rewards/mask_iou_reward": 0.7587545087378824, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 1.3558525443077087, "rewards/thk_ans_format_reward": 0.9895833432674408, "step": 1728, "think_completion_length": 9.708333333333332 }, { "clip_ratio": 0.0, "completion_length": 119.21875, "epoch": 5.839797639123103, "grad_norm": 8.741281688283356, "kl": 0.568359375, "learning_rate": 5.132319819819819e-07, "loss": 0.0006, "reward": 3.7741293907165527, "reward_std": 0.06825266778469086, "rewards/final_reward": 1.4154810710394847, "rewards/mask_iou_reward": 0.7077405355197424, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.774129331111908, "rewards/thk_ans_format_reward": 1.0, "step": 1729, "think_completion_length": 9.125 }, { "clip_ratio": 0.0, "completion_length": 167.61458587646484, "epoch": 5.843170320404722, "grad_norm": 19.962365291750043, "kl": 0.361328125, "learning_rate": 5.129504504504504e-07, "loss": 0.0004, "reward": 3.6556034088134766, "reward_std": 0.0850059799849987, "rewards/final_reward": 1.8285352974407516, "rewards/mask_iou_reward": 0.9142676487203758, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.655603289604187, "rewards/thk_ans_format_reward": 1.0, "step": 1730, "think_completion_length": 8.25 }, { "clip_ratio": 0.0, "completion_length": 161.71875, "epoch": 5.8465430016863404, "grad_norm": 18.535866700500076, "kl": 0.4091796875, "learning_rate": 5.126689189189189e-07, "loss": 0.0004, "reward": 3.645634651184082, "reward_std": 0.08320539817214012, "rewards/final_reward": 1.8012216379238128, "rewards/mask_iou_reward": 0.9006108189619064, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.645634651184082, "rewards/thk_ans_format_reward": 1.0, "step": 1731, "think_completion_length": 9.125 }, { "clip_ratio": 0.0, "completion_length": 183.08333587646484, "epoch": 5.84991568296796, "grad_norm": 10.25741967867522, "kl": 0.3798828125, "learning_rate": 5.123873873873874e-07, "loss": 0.0004, "reward": 3.1230812072753906, "reward_std": 0.12471498548984528, "rewards/final_reward": 1.1030794339834928, "rewards/mask_iou_reward": 0.5515397169917464, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1230809688568115, "rewards/thk_ans_format_reward": 1.0, "step": 1732, "think_completion_length": 8.666666666666666 }, { "clip_ratio": 0.0, "completion_length": 170.0729217529297, "epoch": 5.853288364249578, "grad_norm": 11.29392121179074, "kl": 0.6533203125, "learning_rate": 5.121058558558559e-07, "loss": 0.0007, "reward": 3.413060188293457, "reward_std": 0.09217966627329588, "rewards/final_reward": 1.7253575871750448, "rewards/mask_iou_reward": 0.8626787935875224, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4130603075027466, "rewards/thk_ans_format_reward": 1.0, "step": 1733, "think_completion_length": 8.458333333333334 }, { "clip_ratio": 0.0, "completion_length": 167.27083587646484, "epoch": 5.856661045531197, "grad_norm": 11.747910346406071, "kl": 0.466796875, "learning_rate": 5.118243243243243e-07, "loss": 0.0005, "reward": 3.640872836112976, "reward_std": 0.03201808128505945, "rewards/final_reward": 1.9409818975883342, "rewards/mask_iou_reward": 0.9704909487941671, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6408729553222656, "rewards/thk_ans_format_reward": 1.0, "step": 1734, "think_completion_length": 9.208333333333334 }, { "clip_ratio": 0.0, "completion_length": 217.23958587646484, "epoch": 5.860033726812816, "grad_norm": 6.658730706855383, "kl": 0.42578125, "learning_rate": 5.115427927927928e-07, "loss": 0.0004, "reward": 3.450482726097107, "reward_std": 0.38272392749786377, "rewards/final_reward": 1.4120153309699566, "rewards/mask_iou_reward": 0.7060076654849783, "rewards/sam_format_reward": 0.9687500298023224, "rewards/sam_reward_func_ultra": 1.512982964515686, "rewards/thk_ans_format_reward": 0.9687500298023224, "step": 1735, "think_completion_length": 9.333333333333334 }, { "clip_ratio": 0.0, "completion_length": 180.7916717529297, "epoch": 5.863406408094435, "grad_norm": 22.93808254518162, "kl": 0.4140625, "learning_rate": 5.112612612612612e-07, "loss": 0.0004, "reward": 3.6187745332717896, "reward_std": 0.09785094857215881, "rewards/final_reward": 1.4834057744913518, "rewards/mask_iou_reward": 0.7417028872456759, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6187745928764343, "rewards/thk_ans_format_reward": 1.0, "step": 1736, "think_completion_length": 7.541666666666666 }, { "clip_ratio": 0.0, "completion_length": 201.2916717529297, "epoch": 5.866779089376054, "grad_norm": 10.396671513600031, "kl": 0.4033203125, "learning_rate": 5.109797297297297e-07, "loss": 0.0004, "reward": 3.3890039920806885, "reward_std": 0.11911951750516891, "rewards/final_reward": 0.9812503136409951, "rewards/mask_iou_reward": 0.49062515682049757, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3890039920806885, "rewards/thk_ans_format_reward": 1.0, "step": 1737, "think_completion_length": 9.916666666666668 }, { "clip_ratio": 0.0, "completion_length": 145.65625, "epoch": 5.870151770657673, "grad_norm": 29.160739711927683, "kl": 0.5849609375, "learning_rate": 5.106981981981982e-07, "loss": 0.0006, "reward": 3.5928313732147217, "reward_std": 0.05474974773824215, "rewards/final_reward": 1.9592838294684447, "rewards/mask_iou_reward": 0.9796419147342224, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5928313732147217, "rewards/thk_ans_format_reward": 1.0, "step": 1738, "think_completion_length": 9.166666666666668 }, { "clip_ratio": 0.0, "completion_length": 202.73959350585938, "epoch": 5.873524451939292, "grad_norm": 17.75064820845106, "kl": 0.4208984375, "learning_rate": 5.104166666666666e-07, "loss": 0.0004, "reward": 3.7334030866622925, "reward_std": 0.05087855085730553, "rewards/final_reward": 1.7924745983336359, "rewards/mask_iou_reward": 0.8962372991668179, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7334030866622925, "rewards/thk_ans_format_reward": 1.0, "step": 1739, "think_completion_length": 9.708333333333332 }, { "clip_ratio": 0.0, "completion_length": 145.11458587646484, "epoch": 5.87689713322091, "grad_norm": 18.717009916786903, "kl": 0.546875, "learning_rate": 5.101351351351351e-07, "loss": 0.0006, "reward": 3.5742392539978027, "reward_std": 0.09535662084817886, "rewards/final_reward": 1.240059957912595, "rewards/mask_iou_reward": 0.6200299789562975, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5742393732070923, "rewards/thk_ans_format_reward": 1.0, "step": 1740, "think_completion_length": 9.541666666666666 }, { "clip_ratio": 0.0, "completion_length": 155.1875, "epoch": 5.88026981450253, "grad_norm": 16.3173306666386, "kl": 0.4306640625, "learning_rate": 5.098536036036036e-07, "loss": 0.0004, "reward": 3.5463132858276367, "reward_std": 0.06176626309752464, "rewards/final_reward": 1.612073242086272, "rewards/mask_iou_reward": 0.806036621043136, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5463131666183472, "rewards/thk_ans_format_reward": 1.0, "step": 1741, "think_completion_length": 9.333333333333332 }, { "clip_ratio": 0.0, "completion_length": 178.86458587646484, "epoch": 5.883642495784148, "grad_norm": 15.342405431064343, "kl": 0.46875, "learning_rate": 5.095720720720721e-07, "loss": 0.0005, "reward": 3.6642171144485474, "reward_std": 0.10495152324438095, "rewards/final_reward": 1.5927597251292855, "rewards/mask_iou_reward": 0.7963798625646428, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6642171740531921, "rewards/thk_ans_format_reward": 1.0, "step": 1742, "think_completion_length": 10.75 }, { "clip_ratio": 0.0, "completion_length": 177.39583587646484, "epoch": 5.887015177065767, "grad_norm": 6.6510036765395535, "kl": 0.5302734375, "learning_rate": 5.092905405405406e-07, "loss": 0.0005, "reward": 3.4673802852630615, "reward_std": 0.0501430481672287, "rewards/final_reward": 1.4812380552366138, "rewards/mask_iou_reward": 0.7406190276183069, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4673802852630615, "rewards/thk_ans_format_reward": 1.0, "step": 1743, "think_completion_length": 8.25 }, { "clip_ratio": 0.0, "completion_length": 167.89583587646484, "epoch": 5.8903878583473865, "grad_norm": 24.165763442619365, "kl": 0.4189453125, "learning_rate": 5.09009009009009e-07, "loss": 0.0004, "reward": 3.6040745973587036, "reward_std": 0.06931154802441597, "rewards/final_reward": 1.5248657320782466, "rewards/mask_iou_reward": 0.7624328660391233, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.604074478149414, "rewards/thk_ans_format_reward": 1.0, "step": 1744, "think_completion_length": 7.833333333333334 }, { "clip_ratio": 0.0, "completion_length": 171.5729217529297, "epoch": 5.893760539629005, "grad_norm": 40.57122292570664, "kl": 0.4365234375, "learning_rate": 5.087274774774775e-07, "loss": 0.0004, "reward": 3.054714322090149, "reward_std": 0.08430700935423374, "rewards/final_reward": 1.288889173740179, "rewards/mask_iou_reward": 0.6444445868700895, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0547142326831818, "rewards/thk_ans_format_reward": 1.0, "step": 1745, "think_completion_length": 9.916666666666668 }, { "clip_ratio": 0.0, "completion_length": 133.20833587646484, "epoch": 5.897133220910624, "grad_norm": 11.861157554661235, "kl": 0.423828125, "learning_rate": 5.084459459459459e-07, "loss": 0.0004, "reward": 3.5562628507614136, "reward_std": 0.10923858545720577, "rewards/final_reward": 1.3463834622729496, "rewards/mask_iou_reward": 0.6731917311364748, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5562628507614136, "rewards/thk_ans_format_reward": 1.0, "step": 1746, "think_completion_length": 8.541666666666668 }, { "clip_ratio": 0.0, "completion_length": 248.36459350585938, "epoch": 5.900505902192243, "grad_norm": 7.507762898919182, "kl": 0.4873046875, "learning_rate": 5.081644144144144e-07, "loss": 0.0005, "reward": 3.6194422245025635, "reward_std": 0.27009348571300507, "rewards/final_reward": 1.5039307968904405, "rewards/mask_iou_reward": 0.7519653984452203, "rewards/sam_format_reward": 0.9791666865348816, "rewards/sam_reward_func_ultra": 1.6611087322235107, "rewards/thk_ans_format_reward": 0.9791666865348816, "step": 1747, "think_completion_length": 9.458333333333332 }, { "clip_ratio": 0.0, "completion_length": 166.7916717529297, "epoch": 5.903878583473862, "grad_norm": 17.826037032061887, "kl": 0.40234375, "learning_rate": 5.078828828828829e-07, "loss": 0.0004, "reward": 3.407109498977661, "reward_std": 0.08853336982429028, "rewards/final_reward": 1.2309654718579899, "rewards/mask_iou_reward": 0.6154827359289949, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4071094989776611, "rewards/thk_ans_format_reward": 1.0, "step": 1748, "think_completion_length": 9.0 }, { "clip_ratio": 0.0, "completion_length": 163.1354217529297, "epoch": 5.90725126475548, "grad_norm": 39.26605245968703, "kl": 0.4814453125, "learning_rate": 5.076013513513513e-07, "loss": 0.0005, "reward": 3.35912024974823, "reward_std": 0.03402594896033406, "rewards/final_reward": 1.236561535022871, "rewards/mask_iou_reward": 0.6182807675114355, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3591201305389404, "rewards/thk_ans_format_reward": 1.0, "step": 1749, "think_completion_length": 8.333333333333334 }, { "clip_ratio": 0.0, "completion_length": 161.83333587646484, "epoch": 5.9106239460370995, "grad_norm": 18.73426457604382, "kl": 0.400390625, "learning_rate": 5.073198198198198e-07, "loss": 0.0004, "reward": 3.7030093669891357, "reward_std": 0.06464430969208479, "rewards/final_reward": 1.4769883342304442, "rewards/mask_iou_reward": 0.7384941671152221, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7030091881752014, "rewards/thk_ans_format_reward": 1.0, "step": 1750, "think_completion_length": 9.5 }, { "clip_ratio": 0.0, "completion_length": 171.7916717529297, "epoch": 5.913996627318719, "grad_norm": 13.299711018916073, "kl": 0.41015625, "learning_rate": 5.070382882882884e-07, "loss": 0.0004, "reward": 3.3841612339019775, "reward_std": 0.08488386124372482, "rewards/final_reward": 1.4041756477925396, "rewards/mask_iou_reward": 0.7020878238962698, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.384161114692688, "rewards/thk_ans_format_reward": 1.0, "step": 1751, "think_completion_length": 8.583333333333334 }, { "clip_ratio": 0.0, "completion_length": 172.23958587646484, "epoch": 5.917369308600337, "grad_norm": 9.5260628825992, "kl": 0.412109375, "learning_rate": 5.067567567567568e-07, "loss": 0.0004, "reward": 3.6380069255828857, "reward_std": 0.04551572538912296, "rewards/final_reward": 1.8168987853889136, "rewards/mask_iou_reward": 0.9084493926944568, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.638006865978241, "rewards/thk_ans_format_reward": 1.0, "step": 1752, "think_completion_length": 8.416666666666666 }, { "clip_ratio": 0.0, "completion_length": 202.05208587646484, "epoch": 5.920741989881956, "grad_norm": 6.56819437673071, "kl": 0.375, "learning_rate": 5.064752252252253e-07, "loss": 0.0004, "reward": 3.733555316925049, "reward_std": 0.10838979762047529, "rewards/final_reward": 1.9458549362643849, "rewards/mask_iou_reward": 0.9729274681321924, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7335551381111145, "rewards/thk_ans_format_reward": 1.0, "step": 1753, "think_completion_length": 9.208333333333332 }, { "clip_ratio": 0.0, "completion_length": 134.85416793823242, "epoch": 5.924114671163575, "grad_norm": 17.981830604106577, "kl": 0.3955078125, "learning_rate": 5.061936936936937e-07, "loss": 0.0004, "reward": 3.5746283531188965, "reward_std": 0.0435329545289278, "rewards/final_reward": 1.8869617940125476, "rewards/mask_iou_reward": 0.9434808970062738, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5746282935142517, "rewards/thk_ans_format_reward": 1.0, "step": 1754, "think_completion_length": 8.916666666666668 }, { "clip_ratio": 0.0, "completion_length": 163.5729217529297, "epoch": 5.927487352445194, "grad_norm": 17.18872625472572, "kl": 0.802734375, "learning_rate": 5.059121621621622e-07, "loss": 0.0008, "reward": 3.631569027900696, "reward_std": 0.0754023939371109, "rewards/final_reward": 1.5373684318831864, "rewards/mask_iou_reward": 0.7686842159415932, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6315687894821167, "rewards/thk_ans_format_reward": 1.0, "step": 1755, "think_completion_length": 9.416666666666668 }, { "clip_ratio": 0.0, "completion_length": 143.68750762939453, "epoch": 5.9308600337268125, "grad_norm": 10.750206089182589, "kl": 0.505859375, "learning_rate": 5.056306306306307e-07, "loss": 0.0005, "reward": 3.2980340719223022, "reward_std": 0.09991785138845444, "rewards/final_reward": 1.2133367748293613, "rewards/mask_iou_reward": 0.6066683874146807, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.298033893108368, "rewards/thk_ans_format_reward": 1.0, "step": 1756, "think_completion_length": 8.958333333333332 }, { "clip_ratio": 0.0, "completion_length": 155.75, "epoch": 5.934232715008432, "grad_norm": 8.232534124957862, "kl": 0.408203125, "learning_rate": 5.05349099099099e-07, "loss": 0.0004, "reward": 3.4839136600494385, "reward_std": 0.1400267817080021, "rewards/final_reward": 1.68557964538632, "rewards/mask_iou_reward": 0.84278982269316, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4839134216308594, "rewards/thk_ans_format_reward": 1.0, "step": 1757, "think_completion_length": 8.75 }, { "clip_ratio": 0.0, "completion_length": 181.86458587646484, "epoch": 5.937605396290051, "grad_norm": 24.402661559828726, "kl": 0.4072265625, "learning_rate": 5.050675675675675e-07, "loss": 0.0004, "reward": 3.389898419380188, "reward_std": 0.10899049788713455, "rewards/final_reward": 1.6848548729834658, "rewards/mask_iou_reward": 0.8424274364917329, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3898981809616089, "rewards/thk_ans_format_reward": 1.0, "step": 1758, "think_completion_length": 8.333333333333332 }, { "clip_ratio": 0.0, "completion_length": 144.33333587646484, "epoch": 5.940978077571669, "grad_norm": 5.222034550827015, "kl": 0.392578125, "learning_rate": 5.047860360360359e-07, "loss": 0.0004, "reward": 3.622321844100952, "reward_std": 0.03675311338156462, "rewards/final_reward": 1.9151095000401712, "rewards/mask_iou_reward": 0.9575547500200856, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6223217844963074, "rewards/thk_ans_format_reward": 1.0, "step": 1759, "think_completion_length": 8.625 }, { "clip_ratio": 0.0, "completion_length": 171.59375762939453, "epoch": 5.944350758853289, "grad_norm": 27.666519901031105, "kl": 0.3837890625, "learning_rate": 5.045045045045044e-07, "loss": 0.0004, "reward": 3.654398560523987, "reward_std": 0.06969969533383846, "rewards/final_reward": 1.6697516701864739, "rewards/mask_iou_reward": 0.8348758350932369, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.654398500919342, "rewards/thk_ans_format_reward": 1.0, "step": 1760, "think_completion_length": 8.916666666666668 }, { "clip_ratio": 0.0, "completion_length": 144.5625, "epoch": 5.947723440134907, "grad_norm": 7.459759500944451, "kl": 0.4111328125, "learning_rate": 5.04222972972973e-07, "loss": 0.0004, "reward": 3.2948083877563477, "reward_std": 0.16588781774044037, "rewards/final_reward": 1.6697245311694944, "rewards/mask_iou_reward": 0.8348622655847472, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2948083281517029, "rewards/thk_ans_format_reward": 1.0, "step": 1761, "think_completion_length": 8.458333333333334 }, { "clip_ratio": 0.0, "completion_length": 130.52083587646484, "epoch": 5.951096121416526, "grad_norm": 10.70021845117529, "kl": 0.71484375, "learning_rate": 5.039414414414414e-07, "loss": 0.0007, "reward": 3.3829610347747803, "reward_std": 0.12868967279791832, "rewards/final_reward": 1.9008267771256642, "rewards/mask_iou_reward": 0.9504133885628321, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 1.3933777213096619, "rewards/thk_ans_format_reward": 1.0, "step": 1762, "think_completion_length": 8.541666666666666 }, { "clip_ratio": 0.0, "completion_length": 139.05208587646484, "epoch": 5.954468802698145, "grad_norm": 21.624002001456407, "kl": 0.4658203125, "learning_rate": 5.036599099099099e-07, "loss": 0.0005, "reward": 3.522372007369995, "reward_std": 0.060267508029937744, "rewards/final_reward": 1.941477651115174, "rewards/mask_iou_reward": 0.970738825557587, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.522372305393219, "rewards/thk_ans_format_reward": 1.0, "step": 1763, "think_completion_length": 8.75 }, { "clip_ratio": 0.0, "completion_length": 143.75000762939453, "epoch": 5.957841483979764, "grad_norm": 12.43829962090804, "kl": 0.4326171875, "learning_rate": 5.033783783783783e-07, "loss": 0.0004, "reward": 3.623276948928833, "reward_std": 0.0512046292424202, "rewards/final_reward": 1.2826587931602818, "rewards/mask_iou_reward": 0.6413293965801409, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6232768297195435, "rewards/thk_ans_format_reward": 1.0, "step": 1764, "think_completion_length": 7.958333333333333 }, { "clip_ratio": 0.0, "completion_length": 139.19791793823242, "epoch": 5.961214165261383, "grad_norm": 17.285948867539556, "kl": 0.60546875, "learning_rate": 5.030968468468468e-07, "loss": 0.0006, "reward": 3.4323031902313232, "reward_std": 0.17292555421590805, "rewards/final_reward": 1.7967404303750492, "rewards/mask_iou_reward": 0.8983702151875246, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.432303249835968, "rewards/thk_ans_format_reward": 1.0, "step": 1765, "think_completion_length": 9.458333333333332 }, { "clip_ratio": 0.0, "completion_length": 161.59375762939453, "epoch": 5.964586846543002, "grad_norm": 7.939744549301504, "kl": 0.58984375, "learning_rate": 5.028153153153153e-07, "loss": 0.0005, "reward": 3.7313038110733032, "reward_std": 0.06714446656405926, "rewards/final_reward": 1.5971374551194422, "rewards/mask_iou_reward": 0.7985687275597211, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.731303632259369, "rewards/thk_ans_format_reward": 1.0, "step": 1766, "think_completion_length": 7.833333333333333 }, { "clip_ratio": 0.0, "completion_length": 153.8229217529297, "epoch": 5.967959527824621, "grad_norm": 86.11679710344998, "kl": 0.4453125, "learning_rate": 5.025337837837837e-07, "loss": 0.0004, "reward": 3.4167131185531616, "reward_std": 0.07535018771886826, "rewards/final_reward": 1.813674784198955, "rewards/mask_iou_reward": 0.9068373920994774, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4167132377624512, "rewards/thk_ans_format_reward": 1.0, "step": 1767, "think_completion_length": 8.583333333333332 }, { "clip_ratio": 0.0, "completion_length": 177.4791717529297, "epoch": 5.971332209106239, "grad_norm": 12.385200771779376, "kl": 0.4775390625, "learning_rate": 5.022522522522522e-07, "loss": 0.0005, "reward": 3.4023174047470093, "reward_std": 0.12072728388011456, "rewards/final_reward": 0.8421889742507591, "rewards/mask_iou_reward": 0.42109448712537956, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4023174047470093, "rewards/thk_ans_format_reward": 1.0, "step": 1768, "think_completion_length": 9.083333333333332 }, { "clip_ratio": 0.0, "completion_length": 174.03125, "epoch": 5.974704890387859, "grad_norm": 53.45386993073585, "kl": 0.47265625, "learning_rate": 5.019707207207206e-07, "loss": 0.0005, "reward": 3.238593816757202, "reward_std": 0.07144520059227943, "rewards/final_reward": 1.6239609793989143, "rewards/mask_iou_reward": 0.8119804896994571, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2385937571525574, "rewards/thk_ans_format_reward": 1.0, "step": 1769, "think_completion_length": 8.583333333333332 }, { "clip_ratio": 0.0, "completion_length": 137.96875762939453, "epoch": 5.978077571669477, "grad_norm": 14.787089708404682, "kl": 0.443359375, "learning_rate": 5.016891891891891e-07, "loss": 0.0005, "reward": 3.7221224308013916, "reward_std": 0.053310368210077286, "rewards/final_reward": 1.821725056904525, "rewards/mask_iou_reward": 0.9108625284522625, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7221226692199707, "rewards/thk_ans_format_reward": 1.0, "step": 1770, "think_completion_length": 8.875 }, { "clip_ratio": 0.0, "completion_length": 198.0104217529297, "epoch": 5.981450252951096, "grad_norm": 9.746981068060926, "kl": 0.453125, "learning_rate": 5.014076576576577e-07, "loss": 0.0005, "reward": 3.554728627204895, "reward_std": 0.024302124045789242, "rewards/final_reward": 1.364493583967756, "rewards/mask_iou_reward": 0.682246791983878, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5547285079956055, "rewards/thk_ans_format_reward": 1.0, "step": 1771, "think_completion_length": 8.333333333333334 }, { "clip_ratio": 0.0, "completion_length": 155.89583587646484, "epoch": 5.9848229342327155, "grad_norm": 15.870396692550974, "kl": 0.732421875, "learning_rate": 5.011261261261261e-07, "loss": 0.0007, "reward": 3.3367944955825806, "reward_std": 0.05224468186497688, "rewards/final_reward": 1.6297709031076986, "rewards/mask_iou_reward": 0.8148854515538493, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3367944955825806, "rewards/thk_ans_format_reward": 1.0, "step": 1772, "think_completion_length": 10.083333333333332 }, { "clip_ratio": 0.0, "completion_length": 124.56250381469727, "epoch": 5.988195615514334, "grad_norm": 14.485932527886074, "kl": 0.484375, "learning_rate": 5.008445945945946e-07, "loss": 0.0005, "reward": 3.652013063430786, "reward_std": 0.033195996191352606, "rewards/final_reward": 1.3546507063087527, "rewards/mask_iou_reward": 0.6773253531543764, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6520132422447205, "rewards/thk_ans_format_reward": 1.0, "step": 1773, "think_completion_length": 9.541666666666668 }, { "clip_ratio": 0.0, "completion_length": 135.30208587646484, "epoch": 5.991568296795953, "grad_norm": 19.43956413548473, "kl": 0.431640625, "learning_rate": 5.005630630630631e-07, "loss": 0.0004, "reward": 3.2097524404525757, "reward_std": 0.06278990767896175, "rewards/final_reward": 1.227444801034187, "rewards/mask_iou_reward": 0.6137224005170935, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.209752380847931, "rewards/thk_ans_format_reward": 1.0, "step": 1774, "think_completion_length": 9.75 }, { "clip_ratio": 0.0, "completion_length": 142.4583396911621, "epoch": 5.9949409780775715, "grad_norm": 14.53640635351357, "kl": 0.41015625, "learning_rate": 5.002815315315315e-07, "loss": 0.0004, "reward": 3.418621063232422, "reward_std": 0.17865736782550812, "rewards/final_reward": 1.7044326001778476, "rewards/mask_iou_reward": 0.8522163000889238, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 1.439454197883606, "rewards/thk_ans_format_reward": 0.9895833432674408, "step": 1775, "think_completion_length": 9.041666666666668 }, { "clip_ratio": 0.0, "completion_length": 135.8684196472168, "epoch": 5.998313659359191, "grad_norm": 15.114676747335935, "kl": 0.423828125, "learning_rate": 5e-07, "loss": 0.0004, "reward": 3.6168004274368286, "reward_std": 0.01466382760554552, "rewards/final_reward": 1.718274332119896, "rewards/mask_iou_reward": 0.859137166059948, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.616800308227539, "rewards/thk_ans_format_reward": 1.0, "step": 1776, "think_completion_length": 8.708333333333332 }, { "clip_ratio": 0.0, "completion_length": 171.81250762939453, "epoch": 6.003372681281619, "grad_norm": 11.445548876199032, "kl": 0.4482421875, "learning_rate": 4.997184684684684e-07, "loss": 0.0005, "reward": 3.3490960597991943, "reward_std": 0.22792461514472961, "rewards/final_reward": 1.3445919555141737, "rewards/mask_iou_reward": 0.6722959777570868, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 1.369929313659668, "rewards/thk_ans_format_reward": 0.9895833432674408, "step": 1777, "think_completion_length": 9.708333333333332 }, { "clip_ratio": 0.0, "completion_length": 192.78125, "epoch": 6.006745362563238, "grad_norm": 15.350160021168787, "kl": 1.03125, "learning_rate": 4.994369369369369e-07, "loss": 0.001, "reward": 3.6605095863342285, "reward_std": 0.1871098130941391, "rewards/final_reward": 1.486225992428576, "rewards/mask_iou_reward": 0.743112996214288, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 1.6813429594039917, "rewards/thk_ans_format_reward": 0.9895833432674408, "step": 1778, "think_completion_length": 7.291666666666667 }, { "clip_ratio": 0.0, "completion_length": 126.95833587646484, "epoch": 6.010118043844857, "grad_norm": 19.652957475920775, "kl": 0.4345703125, "learning_rate": 4.991554054054054e-07, "loss": 0.0004, "reward": 3.4463671445846558, "reward_std": 0.11494097299873829, "rewards/final_reward": 1.5364096322411285, "rewards/mask_iou_reward": 0.7682048161205642, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4463672041893005, "rewards/thk_ans_format_reward": 1.0, "step": 1779, "think_completion_length": 8.666666666666666 }, { "clip_ratio": 0.0, "completion_length": 158.87500381469727, "epoch": 6.013490725126475, "grad_norm": 21.56330468855678, "kl": 0.6611328125, "learning_rate": 4.988738738738738e-07, "loss": 0.0006, "reward": 3.5298094749450684, "reward_std": 0.03876837342977524, "rewards/final_reward": 1.9804720364517627, "rewards/mask_iou_reward": 0.9902360182258814, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5298094153404236, "rewards/thk_ans_format_reward": 1.0, "step": 1780, "think_completion_length": 9.0 }, { "clip_ratio": 0.0, "completion_length": 134.70833587646484, "epoch": 6.016863406408095, "grad_norm": 14.309020021037526, "kl": 0.458984375, "learning_rate": 4.985923423423423e-07, "loss": 0.0005, "reward": 3.177926540374756, "reward_std": 0.0898869875818491, "rewards/final_reward": 1.3632918010967072, "rewards/mask_iou_reward": 0.6816459005483536, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1779264211654663, "rewards/thk_ans_format_reward": 1.0, "step": 1781, "think_completion_length": 8.25 }, { "clip_ratio": 0.0, "completion_length": 164.00000762939453, "epoch": 6.020236087689713, "grad_norm": 72.04324946469048, "kl": 0.423828125, "learning_rate": 4.983108108108107e-07, "loss": 0.0004, "reward": 3.5047526359558105, "reward_std": 0.05716628208756447, "rewards/final_reward": 1.6035955601823277, "rewards/mask_iou_reward": 0.8017977800911639, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5047527551651, "rewards/thk_ans_format_reward": 1.0, "step": 1782, "think_completion_length": 7.666666666666667 }, { "clip_ratio": 0.0, "completion_length": 144.44791793823242, "epoch": 6.023608768971332, "grad_norm": 10.273570408753594, "kl": 0.435546875, "learning_rate": 4.980292792792792e-07, "loss": 0.0004, "reward": 3.215430498123169, "reward_std": 0.05078008770942688, "rewards/final_reward": 1.5356546696469242, "rewards/mask_iou_reward": 0.7678273348234621, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2154302597045898, "rewards/thk_ans_format_reward": 1.0, "step": 1783, "think_completion_length": 8.5 }, { "clip_ratio": 0.0, "completion_length": 145.55208587646484, "epoch": 6.0269814502529515, "grad_norm": 13.941282005436875, "kl": 0.4521484375, "learning_rate": 4.977477477477478e-07, "loss": 0.0005, "reward": 3.353714108467102, "reward_std": 0.13728094846010208, "rewards/final_reward": 1.1914722189040807, "rewards/mask_iou_reward": 0.5957361094520404, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3537142276763916, "rewards/thk_ans_format_reward": 1.0, "step": 1784, "think_completion_length": 8.75 }, { "clip_ratio": 0.0, "completion_length": 131.3541717529297, "epoch": 6.03035413153457, "grad_norm": 13.967535257238211, "kl": 0.4609375, "learning_rate": 4.974662162162162e-07, "loss": 0.0005, "reward": 3.2387622594833374, "reward_std": 0.1281859129667282, "rewards/final_reward": 1.4602011399614638, "rewards/mask_iou_reward": 0.7301005699807319, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2387624382972717, "rewards/thk_ans_format_reward": 1.0, "step": 1785, "think_completion_length": 9.083333333333332 }, { "clip_ratio": 0.0, "completion_length": 127.31250381469727, "epoch": 6.033726812816189, "grad_norm": 16.134563017526045, "kl": 0.4423828125, "learning_rate": 4.971846846846847e-07, "loss": 0.0005, "reward": 3.3565257787704468, "reward_std": 0.11713682115077972, "rewards/final_reward": 1.6746956553420935, "rewards/mask_iou_reward": 0.8373478276710468, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3565258383750916, "rewards/thk_ans_format_reward": 1.0, "step": 1786, "think_completion_length": 8.25 }, { "clip_ratio": 0.0, "completion_length": 135.43750762939453, "epoch": 6.0370994940978076, "grad_norm": 13.881967660391403, "kl": 0.4853515625, "learning_rate": 4.969031531531532e-07, "loss": 0.0005, "reward": 3.572825312614441, "reward_std": 0.13529992662370205, "rewards/final_reward": 1.8447336471667763, "rewards/mask_iou_reward": 0.9223668235833882, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.572825312614441, "rewards/thk_ans_format_reward": 1.0, "step": 1787, "think_completion_length": 7.583333333333334 }, { "clip_ratio": 0.0, "completion_length": 223.33333587646484, "epoch": 6.040472175379427, "grad_norm": 11.643033696015314, "kl": 0.404296875, "learning_rate": 4.966216216216216e-07, "loss": 0.0004, "reward": 3.598806142807007, "reward_std": 0.2787330001592636, "rewards/final_reward": 1.6966626586449314, "rewards/mask_iou_reward": 0.8483313293224657, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 1.6196394562721252, "rewards/thk_ans_format_reward": 0.9895833432674408, "step": 1788, "think_completion_length": 9.5 }, { "clip_ratio": 0.0, "completion_length": 134.3854217529297, "epoch": 6.043844856661045, "grad_norm": 12.237340492732953, "kl": 0.494140625, "learning_rate": 4.963400900900901e-07, "loss": 0.0005, "reward": 3.351323366165161, "reward_std": 0.04036957677453756, "rewards/final_reward": 1.2045851596437673, "rewards/mask_iou_reward": 0.6022925798218837, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3513233065605164, "rewards/thk_ans_format_reward": 1.0, "step": 1789, "think_completion_length": 8.0 }, { "clip_ratio": 0.0, "completion_length": 155.64583587646484, "epoch": 6.0472175379426645, "grad_norm": 5.86025306631662, "kl": 0.5556640625, "learning_rate": 4.960585585585585e-07, "loss": 0.0006, "reward": 2.9825971126556396, "reward_std": 0.12656350433826447, "rewards/final_reward": 0.9906582387276571, "rewards/mask_iou_reward": 0.49532911936382856, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9825969934463501, "rewards/thk_ans_format_reward": 1.0, "step": 1790, "think_completion_length": 7.916666666666667 }, { "clip_ratio": 0.0, "completion_length": 127.69791793823242, "epoch": 6.050590219224283, "grad_norm": 8.881479831274108, "kl": 0.4951171875, "learning_rate": 4.95777027027027e-07, "loss": 0.0005, "reward": 3.5861589908599854, "reward_std": 0.036995792761445045, "rewards/final_reward": 1.681130699757016, "rewards/mask_iou_reward": 0.840565349878508, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5861589312553406, "rewards/thk_ans_format_reward": 1.0, "step": 1791, "think_completion_length": 9.25 }, { "clip_ratio": 0.0, "completion_length": 167.77083587646484, "epoch": 6.053962900505902, "grad_norm": 15.07407534685162, "kl": 0.4453125, "learning_rate": 4.954954954954955e-07, "loss": 0.0004, "reward": 3.5831758975982666, "reward_std": 0.1213915403932333, "rewards/final_reward": 1.7150435711100571, "rewards/mask_iou_reward": 0.8575217855550286, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5831758975982666, "rewards/thk_ans_format_reward": 1.0, "step": 1792, "think_completion_length": 7.916666666666666 }, { "clip_ratio": 0.0, "completion_length": 146.3854217529297, "epoch": 6.057335581787521, "grad_norm": 7.6668102116418355, "kl": 0.4091796875, "learning_rate": 4.952139639639639e-07, "loss": 0.0004, "reward": 3.604040026664734, "reward_std": 0.054723722860217094, "rewards/final_reward": 1.5824915907787216, "rewards/mask_iou_reward": 0.7912457953893608, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6040397882461548, "rewards/thk_ans_format_reward": 1.0, "step": 1793, "think_completion_length": 8.875 }, { "clip_ratio": 0.0, "completion_length": 131.9375, "epoch": 6.06070826306914, "grad_norm": 17.07492983564493, "kl": 0.4384765625, "learning_rate": 4.949324324324325e-07, "loss": 0.0004, "reward": 3.750004529953003, "reward_std": 0.07825981266796589, "rewards/final_reward": 1.9201976174507362, "rewards/mask_iou_reward": 0.9600988087253681, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7500044107437134, "rewards/thk_ans_format_reward": 1.0, "step": 1794, "think_completion_length": 8.5 }, { "clip_ratio": 0.0, "completion_length": 126.10417175292969, "epoch": 6.064080944350759, "grad_norm": 15.468223744682641, "kl": 0.419921875, "learning_rate": 4.946509009009009e-07, "loss": 0.0004, "reward": 3.6026484966278076, "reward_std": 0.10279983654618263, "rewards/final_reward": 1.36152362689057, "rewards/mask_iou_reward": 0.680761813445285, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6026484370231628, "rewards/thk_ans_format_reward": 1.0, "step": 1795, "think_completion_length": 9.541666666666668 }, { "clip_ratio": 0.0, "completion_length": 157.27083587646484, "epoch": 6.0674536256323774, "grad_norm": 18.180630844455536, "kl": 0.5810546875, "learning_rate": 4.943693693693693e-07, "loss": 0.0005, "reward": 3.5730003118515015, "reward_std": 0.01847125869244337, "rewards/final_reward": 1.909989832297998, "rewards/mask_iou_reward": 0.954994916148999, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5730002522468567, "rewards/thk_ans_format_reward": 1.0, "step": 1796, "think_completion_length": 10.791666666666668 }, { "clip_ratio": 0.0, "completion_length": 165.1354217529297, "epoch": 6.070826306913997, "grad_norm": 8.679508837091305, "kl": 0.4267578125, "learning_rate": 4.940878378378378e-07, "loss": 0.0004, "reward": 3.4946954250335693, "reward_std": 0.12109193205833435, "rewards/final_reward": 1.3505949481761967, "rewards/mask_iou_reward": 0.6752974740880984, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4946955442428589, "rewards/thk_ans_format_reward": 1.0, "step": 1797, "think_completion_length": 9.041666666666666 }, { "clip_ratio": 0.0, "completion_length": 133.9166717529297, "epoch": 6.074198988195615, "grad_norm": 8.79628738011936, "kl": 0.447265625, "learning_rate": 4.938063063063062e-07, "loss": 0.0005, "reward": 3.360843539237976, "reward_std": 0.038767154794186354, "rewards/final_reward": 1.0506880923424922, "rewards/mask_iou_reward": 0.5253440461712461, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3608436584472656, "rewards/thk_ans_format_reward": 1.0, "step": 1798, "think_completion_length": 7.708333333333334 }, { "clip_ratio": 0.0, "completion_length": 131.9791717529297, "epoch": 6.077571669477234, "grad_norm": 13.827782140220531, "kl": 0.44921875, "learning_rate": 4.935247747747748e-07, "loss": 0.0005, "reward": 3.5027761459350586, "reward_std": 0.07696177158504725, "rewards/final_reward": 1.4222419690109462, "rewards/mask_iou_reward": 0.7111209845054731, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5027759671211243, "rewards/thk_ans_format_reward": 1.0, "step": 1799, "think_completion_length": 7.458333333333333 }, { "clip_ratio": 0.0, "completion_length": 137.20833587646484, "epoch": 6.080944350758854, "grad_norm": 9.89407607249062, "kl": 0.474609375, "learning_rate": 4.932432432432432e-07, "loss": 0.0005, "reward": 3.4364618062973022, "reward_std": 0.032252633944153786, "rewards/final_reward": 1.814366455937597, "rewards/mask_iou_reward": 0.9071832279687985, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4364619255065918, "rewards/thk_ans_format_reward": 1.0, "step": 1800, "think_completion_length": 7.25 }, { "clip_ratio": 0.0, "completion_length": 138.18750762939453, "epoch": 6.084317032040472, "grad_norm": 12.036125325350469, "kl": 0.46484375, "learning_rate": 4.929617117117117e-07, "loss": 0.0005, "reward": 3.497703790664673, "reward_std": 0.07510556373745203, "rewards/final_reward": 1.375586700424734, "rewards/mask_iou_reward": 0.687793350212367, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.497703731060028, "rewards/thk_ans_format_reward": 1.0, "step": 1801, "think_completion_length": 8.666666666666668 }, { "clip_ratio": 0.0, "completion_length": 123.18750381469727, "epoch": 6.087689713322091, "grad_norm": 9.272346546953209, "kl": 2.04296875, "learning_rate": 4.926801801801802e-07, "loss": 0.002, "reward": 3.246990919113159, "reward_std": 0.07481374405324459, "rewards/final_reward": 1.2698713054500153, "rewards/mask_iou_reward": 0.6349356527250076, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2469908595085144, "rewards/thk_ans_format_reward": 1.0, "step": 1802, "think_completion_length": 9.041666666666666 }, { "clip_ratio": 0.0, "completion_length": 134.25000762939453, "epoch": 6.09106239460371, "grad_norm": 7.763042794101035, "kl": 0.455078125, "learning_rate": 4.923986486486486e-07, "loss": 0.0005, "reward": 3.592580199241638, "reward_std": 0.1305740661919117, "rewards/final_reward": 1.6972252048345693, "rewards/mask_iou_reward": 0.8486126024172846, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 1.6029969453811646, "rewards/thk_ans_format_reward": 1.0, "step": 1803, "think_completion_length": 7.083333333333333 }, { "clip_ratio": 0.0, "completion_length": 111.64583587646484, "epoch": 6.094435075885329, "grad_norm": 7.8167428206231016, "kl": 0.4716796875, "learning_rate": 4.921171171171171e-07, "loss": 0.0005, "reward": 3.5702860355377197, "reward_std": 0.1579833161085844, "rewards/final_reward": 1.2851122394080043, "rewards/mask_iou_reward": 0.6425561197040022, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5702861547470093, "rewards/thk_ans_format_reward": 1.0, "step": 1804, "think_completion_length": 8.041666666666668 }, { "clip_ratio": 0.0, "completion_length": 115.64583587646484, "epoch": 6.097807757166947, "grad_norm": 8.273854917771985, "kl": 0.650390625, "learning_rate": 4.918355855855855e-07, "loss": 0.0007, "reward": 3.735241651535034, "reward_std": 0.07285539992153645, "rewards/final_reward": 1.7158941135165968, "rewards/mask_iou_reward": 0.8579470567582984, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7352415323257446, "rewards/thk_ans_format_reward": 1.0, "step": 1805, "think_completion_length": 8.958333333333332 }, { "clip_ratio": 0.0, "completion_length": 146.5833396911621, "epoch": 6.101180438448567, "grad_norm": 8.050910452950912, "kl": 0.42578125, "learning_rate": 4.91554054054054e-07, "loss": 0.0005, "reward": 3.4020386934280396, "reward_std": 0.02658071694895625, "rewards/final_reward": 1.9605694783898593, "rewards/mask_iou_reward": 0.9802847391949296, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4020384550094604, "rewards/thk_ans_format_reward": 1.0, "step": 1806, "think_completion_length": 7.083333333333333 }, { "clip_ratio": 0.0, "completion_length": 160.08333587646484, "epoch": 6.104553119730186, "grad_norm": 12.275695310256692, "kl": 0.5146484375, "learning_rate": 4.912725225225225e-07, "loss": 0.0005, "reward": 3.641546368598938, "reward_std": 0.1854504942893982, "rewards/final_reward": 1.712221254437532, "rewards/mask_iou_reward": 0.856110627218766, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 1.6623798608779907, "rewards/thk_ans_format_reward": 0.9895833432674408, "step": 1807, "think_completion_length": 7.625 }, { "clip_ratio": 0.0, "completion_length": 134.8854217529297, "epoch": 6.107925801011804, "grad_norm": 9.725615667917536, "kl": 0.419921875, "learning_rate": 4.909909909909909e-07, "loss": 0.0004, "reward": 3.6741241216659546, "reward_std": 0.04602981638163328, "rewards/final_reward": 1.4570953259034565, "rewards/mask_iou_reward": 0.7285476629517282, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6741241216659546, "rewards/thk_ans_format_reward": 1.0, "step": 1808, "think_completion_length": 8.708333333333332 }, { "clip_ratio": 0.0, "completion_length": 141.6979217529297, "epoch": 6.1112984822934235, "grad_norm": 15.884101211242106, "kl": 0.484375, "learning_rate": 4.907094594594595e-07, "loss": 0.0005, "reward": 3.8114129304885864, "reward_std": 0.045267632231116295, "rewards/final_reward": 1.8431219354271338, "rewards/mask_iou_reward": 0.9215609677135669, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.8114128708839417, "rewards/thk_ans_format_reward": 1.0, "step": 1809, "think_completion_length": 9.125 }, { "clip_ratio": 0.0, "completion_length": 214.65625, "epoch": 6.114671163575042, "grad_norm": 25.81386622474354, "kl": 0.46875, "learning_rate": 4.90427927927928e-07, "loss": 0.0005, "reward": 3.6401002407073975, "reward_std": 0.18938226997852325, "rewards/final_reward": 1.9438146284433242, "rewards/mask_iou_reward": 0.9719073142216621, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 1.650516927242279, "rewards/thk_ans_format_reward": 1.0, "step": 1810, "think_completion_length": 6.958333333333334 }, { "clip_ratio": 0.0, "completion_length": 162.35416793823242, "epoch": 6.118043844856661, "grad_norm": 8.497806317951227, "kl": 0.427734375, "learning_rate": 4.901463963963964e-07, "loss": 0.0005, "reward": 3.3357125520706177, "reward_std": 0.10076085850596428, "rewards/final_reward": 1.4866574546484423, "rewards/mask_iou_reward": 0.7433287273242212, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3357125520706177, "rewards/thk_ans_format_reward": 1.0, "step": 1811, "think_completion_length": 8.625 }, { "clip_ratio": 0.0, "completion_length": 125.42708587646484, "epoch": 6.12141652613828, "grad_norm": 11.40831085565751, "kl": 0.572265625, "learning_rate": 4.898648648648649e-07, "loss": 0.0006, "reward": 3.5201451778411865, "reward_std": 0.07543889572843909, "rewards/final_reward": 1.2276516786559781, "rewards/mask_iou_reward": 0.6138258393279891, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.520145058631897, "rewards/thk_ans_format_reward": 1.0, "step": 1812, "think_completion_length": 9.541666666666668 }, { "clip_ratio": 0.0, "completion_length": 150.5625, "epoch": 6.124789207419899, "grad_norm": 16.476859110128636, "kl": 0.521484375, "learning_rate": 4.895833333333333e-07, "loss": 0.0005, "reward": 3.0899579524993896, "reward_std": 0.07404950819909573, "rewards/final_reward": 1.4622259397212147, "rewards/mask_iou_reward": 0.7311129698606074, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0899578332901, "rewards/thk_ans_format_reward": 1.0, "step": 1813, "think_completion_length": 8.791666666666668 }, { "clip_ratio": 0.0, "completion_length": 139.4791717529297, "epoch": 6.128161888701518, "grad_norm": 7.32903798149775, "kl": 0.486328125, "learning_rate": 4.893018018018018e-07, "loss": 0.0005, "reward": 3.2776451110839844, "reward_std": 0.04616658762097359, "rewards/final_reward": 1.4818470056431212, "rewards/mask_iou_reward": 0.7409235028215606, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.277645230293274, "rewards/thk_ans_format_reward": 1.0, "step": 1814, "think_completion_length": 8.375 }, { "clip_ratio": 0.0, "completion_length": 184.8229217529297, "epoch": 6.1315345699831365, "grad_norm": 10.56676495216597, "kl": 0.4208984375, "learning_rate": 4.890202702702703e-07, "loss": 0.0005, "reward": 3.4688678979873657, "reward_std": 0.07710606977343559, "rewards/final_reward": 1.9705928437719198, "rewards/mask_iou_reward": 0.9852964218859599, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4688677787780762, "rewards/thk_ans_format_reward": 1.0, "step": 1815, "think_completion_length": 8.083333333333334 }, { "clip_ratio": 0.0, "completion_length": 124.61458587646484, "epoch": 6.134907251264756, "grad_norm": 6.168549350134606, "kl": 0.4765625, "learning_rate": 4.887387387387387e-07, "loss": 0.0005, "reward": 3.066379189491272, "reward_std": 0.07491825148463249, "rewards/final_reward": 1.4189378907902062, "rewards/mask_iou_reward": 0.7094689453951031, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.066379189491272, "rewards/thk_ans_format_reward": 1.0, "step": 1816, "think_completion_length": 8.583333333333332 }, { "clip_ratio": 0.0, "completion_length": 157.13541793823242, "epoch": 6.138279932546374, "grad_norm": 46.181273262335935, "kl": 0.419921875, "learning_rate": 4.884572072072072e-07, "loss": 0.0004, "reward": 3.6587594747543335, "reward_std": 0.1114624422043562, "rewards/final_reward": 1.6354522765770105, "rewards/mask_iou_reward": 0.8177261382885053, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6587591767311096, "rewards/thk_ans_format_reward": 1.0, "step": 1817, "think_completion_length": 8.458333333333334 }, { "clip_ratio": 0.0, "completion_length": 126.40625381469727, "epoch": 6.141652613827993, "grad_norm": 9.338732537826253, "kl": 0.54296875, "learning_rate": 4.881756756756756e-07, "loss": 0.0006, "reward": 3.492391347885132, "reward_std": 0.058474089950323105, "rewards/final_reward": 1.5647377878528934, "rewards/mask_iou_reward": 0.7823688939264467, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4923916459083557, "rewards/thk_ans_format_reward": 1.0, "step": 1818, "think_completion_length": 8.875 }, { "clip_ratio": 0.0, "completion_length": 159.8541717529297, "epoch": 6.145025295109612, "grad_norm": 14.939731342568743, "kl": 0.43359375, "learning_rate": 4.878941441441441e-07, "loss": 0.0004, "reward": 3.352776527404785, "reward_std": 0.0804421491920948, "rewards/final_reward": 1.2421818179755024, "rewards/mask_iou_reward": 0.6210909089877512, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.35277658700943, "rewards/thk_ans_format_reward": 1.0, "step": 1819, "think_completion_length": 8.625 }, { "clip_ratio": 0.0, "completion_length": 164.4166717529297, "epoch": 6.148397976391231, "grad_norm": 7.270457639937909, "kl": 0.5, "learning_rate": 4.876126126126126e-07, "loss": 0.0005, "reward": 3.645534873008728, "reward_std": 0.05726535618305206, "rewards/final_reward": 1.2087856690062433, "rewards/mask_iou_reward": 0.6043928345031216, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6455351114273071, "rewards/thk_ans_format_reward": 1.0, "step": 1820, "think_completion_length": 10.041666666666666 }, { "clip_ratio": 0.0, "completion_length": 141.6041717529297, "epoch": 6.15177065767285, "grad_norm": 21.78725379694707, "kl": 0.6484375, "learning_rate": 4.87331081081081e-07, "loss": 0.0006, "reward": 3.240267753601074, "reward_std": 0.09761350601911545, "rewards/final_reward": 0.3889431804937442, "rewards/mask_iou_reward": 0.1944715902468721, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2402676343917847, "rewards/thk_ans_format_reward": 1.0, "step": 1821, "think_completion_length": 8.291666666666666 }, { "clip_ratio": 0.0, "completion_length": 157.70834350585938, "epoch": 6.155143338954469, "grad_norm": 9.80529265640279, "kl": 0.474609375, "learning_rate": 4.870495495495495e-07, "loss": 0.0005, "reward": 3.245134949684143, "reward_std": 0.11889784410595894, "rewards/final_reward": 1.0686943146111323, "rewards/mask_iou_reward": 0.5343471573055661, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2451348900794983, "rewards/thk_ans_format_reward": 1.0, "step": 1822, "think_completion_length": 9.25 }, { "clip_ratio": 0.0, "completion_length": 127.01042175292969, "epoch": 6.158516020236088, "grad_norm": 20.528644385672013, "kl": 0.759765625, "learning_rate": 4.867680180180179e-07, "loss": 0.0008, "reward": 3.2267754077911377, "reward_std": 0.03484675846993923, "rewards/final_reward": 1.4529543439692785, "rewards/mask_iou_reward": 0.7264771719846392, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2267754077911377, "rewards/thk_ans_format_reward": 1.0, "step": 1823, "think_completion_length": 8.75 }, { "clip_ratio": 0.0, "completion_length": 148.7083396911621, "epoch": 6.161888701517706, "grad_norm": 6.527277155267125, "kl": 0.5185546875, "learning_rate": 4.864864864864865e-07, "loss": 0.0005, "reward": 3.5386557579040527, "reward_std": 0.15575581789016724, "rewards/final_reward": 1.7940223206754031, "rewards/mask_iou_reward": 0.8970111603377016, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5386556386947632, "rewards/thk_ans_format_reward": 1.0, "step": 1824, "think_completion_length": 10.875 }, { "clip_ratio": 0.0, "completion_length": 121.30208587646484, "epoch": 6.165261382799326, "grad_norm": 7.014600074912451, "kl": 0.4580078125, "learning_rate": 4.86204954954955e-07, "loss": 0.0005, "reward": 3.5652071237564087, "reward_std": 0.03770854417234659, "rewards/final_reward": 1.817075835921095, "rewards/mask_iou_reward": 0.9085379179605475, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5652071237564087, "rewards/thk_ans_format_reward": 1.0, "step": 1825, "think_completion_length": 8.708333333333334 }, { "clip_ratio": 0.0, "completion_length": 176.20834350585938, "epoch": 6.168634064080944, "grad_norm": 16.387909639859263, "kl": 3.076171875, "learning_rate": 4.859234234234234e-07, "loss": 0.0031, "reward": 3.3169403076171875, "reward_std": 0.06754343025386333, "rewards/final_reward": 1.1096263378606444, "rewards/mask_iou_reward": 0.5548131689303222, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3169403672218323, "rewards/thk_ans_format_reward": 1.0, "step": 1826, "think_completion_length": 8.375 }, { "clip_ratio": 0.0, "completion_length": 146.46875762939453, "epoch": 6.172006745362563, "grad_norm": 8.58167030233964, "kl": 0.4384765625, "learning_rate": 4.856418918918919e-07, "loss": 0.0004, "reward": 3.648140788078308, "reward_std": 0.08551261574029922, "rewards/final_reward": 1.7414726240826537, "rewards/mask_iou_reward": 0.8707363120413268, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6481409072875977, "rewards/thk_ans_format_reward": 1.0, "step": 1827, "think_completion_length": 7.541666666666666 }, { "clip_ratio": 0.0, "completion_length": 165.98959350585938, "epoch": 6.175379426644182, "grad_norm": 13.283475018178068, "kl": 0.4248046875, "learning_rate": 4.853603603603604e-07, "loss": 0.0004, "reward": 3.440650701522827, "reward_std": 0.2208097279071808, "rewards/final_reward": 1.6704628419619407, "rewards/mask_iou_reward": 0.8352314209809704, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4406505823135376, "rewards/thk_ans_format_reward": 1.0, "step": 1828, "think_completion_length": 7.5 }, { "clip_ratio": 0.0, "completion_length": 138.17708587646484, "epoch": 6.178752107925801, "grad_norm": 24.21518870673125, "kl": 0.7392578125, "learning_rate": 4.850788288288288e-07, "loss": 0.0007, "reward": 3.4418188333511353, "reward_std": 0.035075574181973934, "rewards/final_reward": 1.6447764419543716, "rewards/mask_iou_reward": 0.8223882209771858, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4418187141418457, "rewards/thk_ans_format_reward": 1.0, "step": 1829, "think_completion_length": 9.0 }, { "clip_ratio": 0.0, "completion_length": 137.52084350585938, "epoch": 6.18212478920742, "grad_norm": 12.568516663657245, "kl": 0.4658203125, "learning_rate": 4.847972972972973e-07, "loss": 0.0005, "reward": 3.697560429573059, "reward_std": 0.05124947056174278, "rewards/final_reward": 1.4155713679870419, "rewards/mask_iou_reward": 0.7077856839935209, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.697560429573059, "rewards/thk_ans_format_reward": 1.0, "step": 1830, "think_completion_length": 6.541666666666666 }, { "clip_ratio": 0.0, "completion_length": 126.73958587646484, "epoch": 6.185497470489039, "grad_norm": 6.373846714068581, "kl": 0.455078125, "learning_rate": 4.845157657657657e-07, "loss": 0.0005, "reward": 3.2864041328430176, "reward_std": 0.056804947555065155, "rewards/final_reward": 1.3386153594496895, "rewards/mask_iou_reward": 0.6693076797248447, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.286404013633728, "rewards/thk_ans_format_reward": 1.0, "step": 1831, "think_completion_length": 7.291666666666667 }, { "clip_ratio": 0.0, "completion_length": 128.71875381469727, "epoch": 6.188870151770658, "grad_norm": 11.950140564771095, "kl": 0.48828125, "learning_rate": 4.842342342342342e-07, "loss": 0.0005, "reward": 3.6336305141448975, "reward_std": 0.054893579334020615, "rewards/final_reward": 1.8206187602156358, "rewards/mask_iou_reward": 0.9103093801078179, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6336302757263184, "rewards/thk_ans_format_reward": 1.0, "step": 1832, "think_completion_length": 7.125 }, { "clip_ratio": 0.0, "completion_length": 186.30209350585938, "epoch": 6.192242833052276, "grad_norm": 8.144789812330119, "kl": 0.408203125, "learning_rate": 4.839527027027027e-07, "loss": 0.0004, "reward": 3.1093939542770386, "reward_std": 0.2279842160642147, "rewards/final_reward": 0.9082491008078164, "rewards/mask_iou_reward": 0.4541245504039082, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 1.130227416753769, "rewards/thk_ans_format_reward": 0.9895833432674408, "step": 1833, "think_completion_length": 7.416666666666666 }, { "clip_ratio": 0.0, "completion_length": 147.5416717529297, "epoch": 6.195615514333896, "grad_norm": 13.15715666098289, "kl": 0.421875, "learning_rate": 4.836711711711711e-07, "loss": 0.0005, "reward": 3.1555248498916626, "reward_std": 0.1126946210861206, "rewards/final_reward": 1.6677855768724603, "rewards/mask_iou_reward": 0.8338927884362302, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1555247902870178, "rewards/thk_ans_format_reward": 1.0, "step": 1834, "think_completion_length": 6.375 }, { "clip_ratio": 0.0, "completion_length": 141.73958587646484, "epoch": 6.198988195615514, "grad_norm": 10.09920999100286, "kl": 0.4697265625, "learning_rate": 4.833896396396397e-07, "loss": 0.0005, "reward": 3.560391664505005, "reward_std": 0.036461517214775085, "rewards/final_reward": 1.5740383488315124, "rewards/mask_iou_reward": 0.7870191744157562, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.56039160490036, "rewards/thk_ans_format_reward": 1.0, "step": 1835, "think_completion_length": 8.625 }, { "clip_ratio": 0.0, "completion_length": 167.70833587646484, "epoch": 6.202360876897133, "grad_norm": 21.17541065383814, "kl": 0.4384765625, "learning_rate": 4.83108108108108e-07, "loss": 0.0004, "reward": 3.2930556535720825, "reward_std": 0.1490391530096531, "rewards/final_reward": 0.8221827094035479, "rewards/mask_iou_reward": 0.41109135470177394, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2930553555488586, "rewards/thk_ans_format_reward": 1.0, "step": 1836, "think_completion_length": 8.208333333333334 }, { "clip_ratio": 0.0, "completion_length": 147.50000762939453, "epoch": 6.2057335581787525, "grad_norm": 39.0265227970619, "kl": 0.5498046875, "learning_rate": 4.828265765765765e-07, "loss": 0.0006, "reward": 3.4005260467529297, "reward_std": 0.04964868910610676, "rewards/final_reward": 1.012190933699076, "rewards/mask_iou_reward": 0.506095466849538, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4005258679389954, "rewards/thk_ans_format_reward": 1.0, "step": 1837, "think_completion_length": 7.291666666666666 }, { "clip_ratio": 0.0, "completion_length": 141.2916717529297, "epoch": 6.209106239460371, "grad_norm": 10.174527346336742, "kl": 0.431640625, "learning_rate": 4.82545045045045e-07, "loss": 0.0004, "reward": 3.533667206764221, "reward_std": 0.07525857351720333, "rewards/final_reward": 1.5471510406717721, "rewards/mask_iou_reward": 0.7735755203358861, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5336670279502869, "rewards/thk_ans_format_reward": 1.0, "step": 1838, "think_completion_length": 9.083333333333334 }, { "clip_ratio": 0.0, "completion_length": 123.64583587646484, "epoch": 6.21247892074199, "grad_norm": 16.030419351543703, "kl": 0.4560546875, "learning_rate": 4.822635135135134e-07, "loss": 0.0004, "reward": 3.734760046005249, "reward_std": 0.07690603472292423, "rewards/final_reward": 1.6794847684644445, "rewards/mask_iou_reward": 0.8397423842322222, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7347601056098938, "rewards/thk_ans_format_reward": 1.0, "step": 1839, "think_completion_length": 7.25 }, { "clip_ratio": 0.0, "completion_length": 136.33333587646484, "epoch": 6.2158516020236085, "grad_norm": 129.07331392254122, "kl": 0.3955078125, "learning_rate": 4.81981981981982e-07, "loss": 0.0004, "reward": 3.5038132667541504, "reward_std": 0.10739928111433983, "rewards/final_reward": 1.4776798330855296, "rewards/mask_iou_reward": 0.7388399165427648, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5038131475448608, "rewards/thk_ans_format_reward": 1.0, "step": 1840, "think_completion_length": 7.208333333333334 }, { "clip_ratio": 0.0, "completion_length": 108.47917175292969, "epoch": 6.219224283305228, "grad_norm": 14.35637587106052, "kl": 0.98828125, "learning_rate": 4.817004504504505e-07, "loss": 0.001, "reward": 3.8405778408050537, "reward_std": 0.015699880197644234, "rewards/final_reward": 1.9513990856142902, "rewards/mask_iou_reward": 0.9756995428071451, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.840577781200409, "rewards/thk_ans_format_reward": 1.0, "step": 1841, "think_completion_length": 8.333333333333334 }, { "clip_ratio": 0.0, "completion_length": 137.67709350585938, "epoch": 6.222596964586846, "grad_norm": 13.455911390668044, "kl": 0.4951171875, "learning_rate": 4.814189189189189e-07, "loss": 0.0005, "reward": 3.352190852165222, "reward_std": 0.049657109659165144, "rewards/final_reward": 0.9730023664458212, "rewards/mask_iou_reward": 0.4865011832229106, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.352190911769867, "rewards/thk_ans_format_reward": 1.0, "step": 1842, "think_completion_length": 7.333333333333333 }, { "clip_ratio": 0.0, "completion_length": 154.52084350585938, "epoch": 6.2259696458684655, "grad_norm": 15.863423153445565, "kl": 0.5087890625, "learning_rate": 4.811373873873874e-07, "loss": 0.0005, "reward": 3.609791874885559, "reward_std": 0.03911227732896805, "rewards/final_reward": 1.574771011400429, "rewards/mask_iou_reward": 0.7873855057002145, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6097919940948486, "rewards/thk_ans_format_reward": 1.0, "step": 1843, "think_completion_length": 7.333333333333333 }, { "clip_ratio": 0.0, "completion_length": 143.18750762939453, "epoch": 6.229342327150085, "grad_norm": 10.155964433901605, "kl": 0.37890625, "learning_rate": 4.808558558558558e-07, "loss": 0.0004, "reward": 3.828355550765991, "reward_std": 0.015464604832231998, "rewards/final_reward": 1.914407570926103, "rewards/mask_iou_reward": 0.9572037854630515, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.8283554911613464, "rewards/thk_ans_format_reward": 1.0, "step": 1844, "think_completion_length": 8.875 }, { "clip_ratio": 0.0, "completion_length": 174.11458587646484, "epoch": 6.232715008431703, "grad_norm": 19.461364205169517, "kl": 0.52734375, "learning_rate": 4.805743243243243e-07, "loss": 0.0005, "reward": 3.383014440536499, "reward_std": 0.16253596171736717, "rewards/final_reward": 1.2477217644360832, "rewards/mask_iou_reward": 0.6238608822180416, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3830143809318542, "rewards/thk_ans_format_reward": 1.0, "step": 1845, "think_completion_length": 7.791666666666667 }, { "clip_ratio": 0.0, "completion_length": 142.17708587646484, "epoch": 6.236087689713322, "grad_norm": 11.087827878884447, "kl": 0.41796875, "learning_rate": 4.802927927927928e-07, "loss": 0.0004, "reward": 3.2912451028823853, "reward_std": 0.1132066361606121, "rewards/final_reward": 1.7735398501636395, "rewards/mask_iou_reward": 0.8867699250818197, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2912451028823853, "rewards/thk_ans_format_reward": 1.0, "step": 1846, "think_completion_length": 8.166666666666666 }, { "clip_ratio": 0.0, "completion_length": 143.15625, "epoch": 6.239460370994941, "grad_norm": 17.893362894514222, "kl": 0.7080078125, "learning_rate": 4.800112612612612e-07, "loss": 0.0008, "reward": 3.4652419090270996, "reward_std": 0.046267539262771606, "rewards/final_reward": 1.3597863364412965, "rewards/mask_iou_reward": 0.6798931682206483, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4652420282363892, "rewards/thk_ans_format_reward": 1.0, "step": 1847, "think_completion_length": 8.333333333333334 }, { "clip_ratio": 0.0, "completion_length": 133.625, "epoch": 6.24283305227656, "grad_norm": 9.73142051628865, "kl": 0.40234375, "learning_rate": 4.797297297297297e-07, "loss": 0.0005, "reward": 3.4128910303115845, "reward_std": 0.07327094860374928, "rewards/final_reward": 1.433672808060011, "rewards/mask_iou_reward": 0.7168364040300055, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4128910303115845, "rewards/thk_ans_format_reward": 1.0, "step": 1848, "think_completion_length": 6.708333333333333 }, { "clip_ratio": 0.0, "completion_length": 149.6875, "epoch": 6.246205733558178, "grad_norm": 13.676519283489288, "kl": 0.427734375, "learning_rate": 4.794481981981981e-07, "loss": 0.0005, "reward": 3.3033007383346558, "reward_std": 0.09086661785840988, "rewards/final_reward": 1.643985369919946, "rewards/mask_iou_reward": 0.821992684959973, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3033005595207214, "rewards/thk_ans_format_reward": 1.0, "step": 1849, "think_completion_length": 7.666666666666667 }, { "clip_ratio": 0.0, "completion_length": 171.39583587646484, "epoch": 6.249578414839798, "grad_norm": 8.076127289231117, "kl": 0.4140625, "learning_rate": 4.791666666666667e-07, "loss": 0.0004, "reward": 3.0996118783950806, "reward_std": 0.16868788562715054, "rewards/final_reward": 1.5549581997171695, "rewards/mask_iou_reward": 0.7774790998585848, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 1.1204451322555542, "rewards/thk_ans_format_reward": 0.9895833432674408, "step": 1850, "think_completion_length": 6.458333333333333 }, { "clip_ratio": 0.0, "completion_length": 137.72916793823242, "epoch": 6.252951096121416, "grad_norm": 31.480636549002742, "kl": 0.4677734375, "learning_rate": 4.788851351351352e-07, "loss": 0.0005, "reward": 3.263098955154419, "reward_std": 0.21037384122610092, "rewards/final_reward": 1.3115423029288782, "rewards/mask_iou_reward": 0.6557711514644391, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2630988359451294, "rewards/thk_ans_format_reward": 1.0, "step": 1851, "think_completion_length": 8.25 }, { "clip_ratio": 0.0, "completion_length": 136.20833587646484, "epoch": 6.256323777403035, "grad_norm": 6.937391694748509, "kl": 0.501953125, "learning_rate": 4.786036036036036e-07, "loss": 0.0005, "reward": 3.13068687915802, "reward_std": 0.20532716810703278, "rewards/final_reward": 0.978316601376308, "rewards/mask_iou_reward": 0.489158300688154, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 1.1515201926231384, "rewards/thk_ans_format_reward": 0.9895833432674408, "step": 1852, "think_completion_length": 7.708333333333333 }, { "clip_ratio": 0.0, "completion_length": 137.12500762939453, "epoch": 6.259696458684655, "grad_norm": 9.857491201068333, "kl": 0.3994140625, "learning_rate": 4.783220720720721e-07, "loss": 0.0004, "reward": 3.5886178016662598, "reward_std": 0.07222697883844376, "rewards/final_reward": 1.4276639751242879, "rewards/mask_iou_reward": 0.7138319875621439, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5886176824569702, "rewards/thk_ans_format_reward": 1.0, "step": 1853, "think_completion_length": 7.416666666666666 }, { "clip_ratio": 0.0, "completion_length": 148.30208587646484, "epoch": 6.263069139966273, "grad_norm": 19.50421363346234, "kl": 0.400390625, "learning_rate": 4.780405405405405e-07, "loss": 0.0004, "reward": 3.641382932662964, "reward_std": 0.0448097325861454, "rewards/final_reward": 1.6495557187238357, "rewards/mask_iou_reward": 0.8247778593619178, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6413828134536743, "rewards/thk_ans_format_reward": 1.0, "step": 1854, "think_completion_length": 7.208333333333333 }, { "clip_ratio": 0.0, "completion_length": 182.2291717529297, "epoch": 6.266441821247892, "grad_norm": 7.351446937843033, "kl": 0.580078125, "learning_rate": 4.77759009009009e-07, "loss": 0.0006, "reward": 3.594027280807495, "reward_std": 0.26565699838101864, "rewards/final_reward": 1.9030951882988434, "rewards/mask_iou_reward": 0.9515475941494217, "rewards/sam_format_reward": 0.9791666865348816, "rewards/sam_reward_func_ultra": 1.6356940865516663, "rewards/thk_ans_format_reward": 0.9791666865348816, "step": 1855, "think_completion_length": 8.458333333333332 }, { "clip_ratio": 0.0, "completion_length": 141.2916717529297, "epoch": 6.269814502529511, "grad_norm": 22.137513934687362, "kl": 0.478515625, "learning_rate": 4.774774774774775e-07, "loss": 0.0005, "reward": 3.6100287437438965, "reward_std": 0.025507054291665554, "rewards/final_reward": 1.6572004694700335, "rewards/mask_iou_reward": 0.8286002347350168, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6100287437438965, "rewards/thk_ans_format_reward": 1.0, "step": 1856, "think_completion_length": 8.25 }, { "clip_ratio": 0.0, "completion_length": 159.375, "epoch": 6.27318718381113, "grad_norm": 10.17366204243886, "kl": 0.4677734375, "learning_rate": 4.771959459459459e-07, "loss": 0.0005, "reward": 3.527943730354309, "reward_std": 0.25231462717056274, "rewards/final_reward": 1.6537278172949341, "rewards/mask_iou_reward": 0.8268639086474671, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5279436111450195, "rewards/thk_ans_format_reward": 1.0, "step": 1857, "think_completion_length": 9.208333333333334 }, { "clip_ratio": 0.0, "completion_length": 168.6666717529297, "epoch": 6.276559865092748, "grad_norm": 5.4924519172347885, "kl": 0.4638671875, "learning_rate": 4.769144144144144e-07, "loss": 0.0005, "reward": 3.417281150817871, "reward_std": 0.08728579431772232, "rewards/final_reward": 1.5202317602031523, "rewards/mask_iou_reward": 0.7601158801015762, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4172809720039368, "rewards/thk_ans_format_reward": 1.0, "step": 1858, "think_completion_length": 6.958333333333334 }, { "clip_ratio": 0.0, "completion_length": 139.9479217529297, "epoch": 6.279932546374368, "grad_norm": 6.81588308994651, "kl": 0.412109375, "learning_rate": 4.7663288288288285e-07, "loss": 0.0004, "reward": 3.592257022857666, "reward_std": 0.049894423224031925, "rewards/final_reward": 1.3763441086826136, "rewards/mask_iou_reward": 0.6881720543413068, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5922568440437317, "rewards/thk_ans_format_reward": 1.0, "step": 1859, "think_completion_length": 7.958333333333334 }, { "clip_ratio": 0.0, "completion_length": 138.55209350585938, "epoch": 6.283305227655987, "grad_norm": 6.815516689412823, "kl": 0.484375, "learning_rate": 4.7635135135135136e-07, "loss": 0.0005, "reward": 3.6291333436965942, "reward_std": 0.11306917294859886, "rewards/final_reward": 1.5164182235209045, "rewards/mask_iou_reward": 0.7582091117604522, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6291332244873047, "rewards/thk_ans_format_reward": 1.0, "step": 1860, "think_completion_length": 7.583333333333334 }, { "clip_ratio": 0.0, "completion_length": 202.21875762939453, "epoch": 6.286677908937605, "grad_norm": 11.211526253372272, "kl": 0.357421875, "learning_rate": 4.760698198198198e-07, "loss": 0.0004, "reward": 3.6165082454681396, "reward_std": 0.12074577808380127, "rewards/final_reward": 1.597264458972782, "rewards/mask_iou_reward": 0.798632229486391, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.61650812625885, "rewards/thk_ans_format_reward": 1.0, "step": 1861, "think_completion_length": 6.208333333333334 }, { "clip_ratio": 0.0, "completion_length": 155.18750762939453, "epoch": 6.2900505902192245, "grad_norm": 17.466570040418652, "kl": 0.4365234375, "learning_rate": 4.757882882882883e-07, "loss": 0.0004, "reward": 3.407384157180786, "reward_std": 0.06713058799505234, "rewards/final_reward": 1.872860084905434, "rewards/mask_iou_reward": 0.936430042452717, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.407383918762207, "rewards/thk_ans_format_reward": 1.0, "step": 1862, "think_completion_length": 8.291666666666666 }, { "clip_ratio": 0.0, "completion_length": 170.6979217529297, "epoch": 6.293423271500843, "grad_norm": 10.368101080790261, "kl": 0.4150390625, "learning_rate": 4.7550675675675674e-07, "loss": 0.0004, "reward": 3.6531848907470703, "reward_std": 0.1504236189648509, "rewards/final_reward": 1.3807319749969587, "rewards/mask_iou_reward": 0.6903659874984793, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.653184711933136, "rewards/thk_ans_format_reward": 1.0, "step": 1863, "think_completion_length": 8.5 }, { "clip_ratio": 0.0, "completion_length": 139.5416717529297, "epoch": 6.296795952782462, "grad_norm": 12.365053252610293, "kl": 0.4267578125, "learning_rate": 4.752252252252252e-07, "loss": 0.0006, "reward": 3.647891640663147, "reward_std": 0.07979295030236244, "rewards/final_reward": 1.739375300114676, "rewards/mask_iou_reward": 0.869687650057338, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.647891640663147, "rewards/thk_ans_format_reward": 1.0, "step": 1864, "think_completion_length": 7.208333333333333 }, { "clip_ratio": 0.0, "completion_length": 151.11458587646484, "epoch": 6.300168634064081, "grad_norm": 12.789096356831873, "kl": 0.681640625, "learning_rate": 4.749436936936937e-07, "loss": 0.0007, "reward": 3.50308358669281, "reward_std": 0.039963416289538145, "rewards/final_reward": 1.1487614871984597, "rewards/mask_iou_reward": 0.5743807435992299, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5030834078788757, "rewards/thk_ans_format_reward": 1.0, "step": 1865, "think_completion_length": 6.291666666666666 }, { "clip_ratio": 0.0, "completion_length": 136.52083587646484, "epoch": 6.3035413153457, "grad_norm": 6.530863942545822, "kl": 0.4521484375, "learning_rate": 4.746621621621621e-07, "loss": 0.0005, "reward": 3.501235008239746, "reward_std": 0.15554272197186947, "rewards/final_reward": 1.7192413322312214, "rewards/mask_iou_reward": 0.8596206661156107, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5012351870536804, "rewards/thk_ans_format_reward": 1.0, "step": 1866, "think_completion_length": 7.083333333333333 }, { "clip_ratio": 0.0, "completion_length": 149.1354217529297, "epoch": 6.306913996627319, "grad_norm": 13.245851374127852, "kl": 0.3955078125, "learning_rate": 4.743806306306306e-07, "loss": 0.0004, "reward": 3.6863157749176025, "reward_std": 0.07255137898027897, "rewards/final_reward": 1.7820136042567227, "rewards/mask_iou_reward": 0.8910068021283614, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6863157153129578, "rewards/thk_ans_format_reward": 1.0, "step": 1867, "think_completion_length": 9.208333333333332 }, { "clip_ratio": 0.0, "completion_length": 168.3854217529297, "epoch": 6.3102866779089375, "grad_norm": 17.41876990858359, "kl": 0.47265625, "learning_rate": 4.7409909909909905e-07, "loss": 0.0005, "reward": 3.3488051891326904, "reward_std": 0.04395863972604275, "rewards/final_reward": 1.1569589159476008, "rewards/mask_iou_reward": 0.5784794579738004, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3488048911094666, "rewards/thk_ans_format_reward": 1.0, "step": 1868, "think_completion_length": 8.125 }, { "clip_ratio": 0.0, "completion_length": 186.2916717529297, "epoch": 6.313659359190557, "grad_norm": 7.877274635062242, "kl": 0.408203125, "learning_rate": 4.738175675675675e-07, "loss": 0.0005, "reward": 3.414574384689331, "reward_std": 0.03488452360033989, "rewards/final_reward": 1.345949744631015, "rewards/mask_iou_reward": 0.6729748723155075, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4145742654800415, "rewards/thk_ans_format_reward": 1.0, "step": 1869, "think_completion_length": 7.5 }, { "clip_ratio": 0.0, "completion_length": 156.0520896911621, "epoch": 6.317032040472175, "grad_norm": 20.015154532440828, "kl": 0.640625, "learning_rate": 4.73536036036036e-07, "loss": 0.0006, "reward": 3.530336856842041, "reward_std": 0.08547847159206867, "rewards/final_reward": 1.0988999298593032, "rewards/mask_iou_reward": 0.5494499649296516, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5303367376327515, "rewards/thk_ans_format_reward": 1.0, "step": 1870, "think_completion_length": 9.25 }, { "clip_ratio": 0.0, "completion_length": 129.70833587646484, "epoch": 6.320404721753794, "grad_norm": 44.35928207835712, "kl": 0.4873046875, "learning_rate": 4.732545045045045e-07, "loss": 0.0005, "reward": 3.468393325805664, "reward_std": 0.05352478846907616, "rewards/final_reward": 1.684244809139015, "rewards/mask_iou_reward": 0.8421224045695075, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4683933854103088, "rewards/thk_ans_format_reward": 1.0, "step": 1871, "think_completion_length": 9.208333333333332 }, { "clip_ratio": 0.0, "completion_length": 142.8229217529297, "epoch": 6.323777403035413, "grad_norm": 25.40471040471904, "kl": 0.4287109375, "learning_rate": 4.7297297297297294e-07, "loss": 0.0005, "reward": 3.4976236820220947, "reward_std": 0.08486808463931084, "rewards/final_reward": 1.3047796045588043, "rewards/mask_iou_reward": 0.6523898022794021, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.49762362241745, "rewards/thk_ans_format_reward": 1.0, "step": 1872, "think_completion_length": 8.791666666666666 }, { "clip_ratio": 0.0, "completion_length": 166.34375, "epoch": 6.327150084317032, "grad_norm": 6.886490971181804, "kl": 0.712890625, "learning_rate": 4.726914414414414e-07, "loss": 0.0007, "reward": 3.532763361930847, "reward_std": 0.06538549810647964, "rewards/final_reward": 1.7415548988047154, "rewards/mask_iou_reward": 0.8707774494023577, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.532763421535492, "rewards/thk_ans_format_reward": 1.0, "step": 1873, "think_completion_length": 9.583333333333332 }, { "clip_ratio": 0.0, "completion_length": 162.2291717529297, "epoch": 6.330522765598651, "grad_norm": 9.355042908067066, "kl": 0.423828125, "learning_rate": 4.7240990990990986e-07, "loss": 0.0004, "reward": 3.6574703454971313, "reward_std": 0.04328635986894369, "rewards/final_reward": 1.9130759923228222, "rewards/mask_iou_reward": 0.9565379961614111, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6574699878692627, "rewards/thk_ans_format_reward": 1.0, "step": 1874, "think_completion_length": 7.666666666666666 }, { "clip_ratio": 0.0, "completion_length": 167.4479217529297, "epoch": 6.33389544688027, "grad_norm": 9.49022115875892, "kl": 0.5078125, "learning_rate": 4.721283783783784e-07, "loss": 0.0005, "reward": 3.5508744716644287, "reward_std": 0.07801926881074905, "rewards/final_reward": 1.5653615315107303, "rewards/mask_iou_reward": 0.7826807657553652, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.550874412059784, "rewards/thk_ans_format_reward": 1.0, "step": 1875, "think_completion_length": 8.208333333333332 }, { "clip_ratio": 0.0, "completion_length": 146.25000762939453, "epoch": 6.337268128161889, "grad_norm": 6.088105787863321, "kl": 0.400390625, "learning_rate": 4.7184684684684684e-07, "loss": 0.0004, "reward": 3.5852304697036743, "reward_std": 0.12043560296297073, "rewards/final_reward": 1.7741760352887033, "rewards/mask_iou_reward": 0.8870880176443516, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5852304100990295, "rewards/thk_ans_format_reward": 1.0, "step": 1876, "think_completion_length": 7.875 }, { "clip_ratio": 0.0, "completion_length": 159.4166717529297, "epoch": 6.340640809443507, "grad_norm": 5.801410379375544, "kl": 0.3720703125, "learning_rate": 4.715653153153153e-07, "loss": 0.0004, "reward": 3.80169677734375, "reward_std": 0.035175224766135216, "rewards/final_reward": 1.8897999246439883, "rewards/mask_iou_reward": 0.9448999623219941, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.8016967177391052, "rewards/thk_ans_format_reward": 1.0, "step": 1877, "think_completion_length": 7.041666666666666 }, { "clip_ratio": 0.0, "completion_length": 150.53125762939453, "epoch": 6.344013490725127, "grad_norm": 10.847386218011383, "kl": 0.451171875, "learning_rate": 4.7128378378378376e-07, "loss": 0.0005, "reward": 3.548642635345459, "reward_std": 0.07939281314611435, "rewards/final_reward": 1.7258872880340754, "rewards/mask_iou_reward": 0.8629436440170377, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.548642635345459, "rewards/thk_ans_format_reward": 1.0, "step": 1878, "think_completion_length": 9.25 }, { "clip_ratio": 0.0, "completion_length": 236.63542938232422, "epoch": 6.347386172006745, "grad_norm": 5.9551648312298715, "kl": 0.3994140625, "learning_rate": 4.710022522522522e-07, "loss": 0.0004, "reward": 3.187057852745056, "reward_std": 0.19463208317756653, "rewards/final_reward": 1.2669394950542276, "rewards/mask_iou_reward": 0.6334697475271138, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 1.2078912556171417, "rewards/thk_ans_format_reward": 0.9895833432674408, "step": 1879, "think_completion_length": 8.416666666666668 }, { "clip_ratio": 0.0, "completion_length": 179.1354217529297, "epoch": 6.350758853288364, "grad_norm": 12.090427610462893, "kl": 0.4150390625, "learning_rate": 4.7072072072072073e-07, "loss": 0.0004, "reward": 3.521798610687256, "reward_std": 0.05686133913695812, "rewards/final_reward": 1.0955721458005767, "rewards/mask_iou_reward": 0.5477860729002884, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5217987298965454, "rewards/thk_ans_format_reward": 1.0, "step": 1880, "think_completion_length": 8.375 }, { "clip_ratio": 0.0, "completion_length": 171.89583587646484, "epoch": 6.354131534569984, "grad_norm": 6.817187114704969, "kl": 0.4169921875, "learning_rate": 4.704391891891892e-07, "loss": 0.0004, "reward": 3.7434085607528687, "reward_std": 0.17366989701986313, "rewards/final_reward": 1.7364503374356408, "rewards/mask_iou_reward": 0.8682251687178204, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7434085607528687, "rewards/thk_ans_format_reward": 1.0, "step": 1881, "think_completion_length": 8.083333333333334 }, { "clip_ratio": 0.0, "completion_length": 183.02083587646484, "epoch": 6.357504215851602, "grad_norm": 11.502887174757745, "kl": 0.4736328125, "learning_rate": 4.7015765765765766e-07, "loss": 0.0005, "reward": 3.6568949222564697, "reward_std": 0.060632091015577316, "rewards/final_reward": 1.8699796659595362, "rewards/mask_iou_reward": 0.9349898329797681, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6568948030471802, "rewards/thk_ans_format_reward": 1.0, "step": 1882, "think_completion_length": 9.041666666666666 }, { "clip_ratio": 0.0, "completion_length": 173.14583587646484, "epoch": 6.360876897133221, "grad_norm": 9.859580321040402, "kl": 0.4111328125, "learning_rate": 4.698761261261261e-07, "loss": 0.0004, "reward": 3.5974520444869995, "reward_std": 0.10273704305291176, "rewards/final_reward": 1.7107119595052134, "rewards/mask_iou_reward": 0.8553559797526067, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5974522829055786, "rewards/thk_ans_format_reward": 1.0, "step": 1883, "think_completion_length": 7.666666666666666 }, { "clip_ratio": 0.0, "completion_length": 179.67708587646484, "epoch": 6.36424957841484, "grad_norm": 9.131603356854935, "kl": 0.404296875, "learning_rate": 4.695945945945946e-07, "loss": 0.0004, "reward": 3.458780288696289, "reward_std": 0.08173859491944313, "rewards/final_reward": 1.5234535005155414, "rewards/mask_iou_reward": 0.7617267502577707, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4587804675102234, "rewards/thk_ans_format_reward": 1.0, "step": 1884, "think_completion_length": 8.166666666666668 }, { "clip_ratio": 0.0, "completion_length": 208.98959350585938, "epoch": 6.367622259696459, "grad_norm": 7.846051525792356, "kl": 0.43359375, "learning_rate": 4.6931306306306304e-07, "loss": 0.0004, "reward": 3.477925419807434, "reward_std": 0.056161317974328995, "rewards/final_reward": 1.7337655025580978, "rewards/mask_iou_reward": 0.8668827512790489, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4779254794120789, "rewards/thk_ans_format_reward": 1.0, "step": 1885, "think_completion_length": 8.541666666666668 }, { "clip_ratio": 0.0, "completion_length": 211.50000762939453, "epoch": 6.370994940978077, "grad_norm": 9.447334787633611, "kl": 0.4921875, "learning_rate": 4.690315315315315e-07, "loss": 0.0005, "reward": 3.6862441301345825, "reward_std": 0.07632257603108883, "rewards/final_reward": 1.539230532497822, "rewards/mask_iou_reward": 0.769615266248911, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6862438917160034, "rewards/thk_ans_format_reward": 1.0, "step": 1886, "think_completion_length": 10.916666666666668 }, { "clip_ratio": 0.0, "completion_length": 213.45834350585938, "epoch": 6.3743676222596966, "grad_norm": 7.634914287544993, "kl": 0.388671875, "learning_rate": 4.6874999999999996e-07, "loss": 0.0004, "reward": 3.2322871685028076, "reward_std": 0.3254256844520569, "rewards/final_reward": 0.8302558515962044, "rewards/mask_iou_reward": 0.4151279257981022, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 1.2531203627586365, "rewards/thk_ans_format_reward": 0.9895833432674408, "step": 1887, "think_completion_length": 9.583333333333332 }, { "clip_ratio": 0.0, "completion_length": 158.39584350585938, "epoch": 6.377740303541315, "grad_norm": 9.849923513152714, "kl": 0.5419921875, "learning_rate": 4.684684684684684e-07, "loss": 0.0005, "reward": 3.6471699476242065, "reward_std": 0.11751040071249008, "rewards/final_reward": 1.6158547055186272, "rewards/mask_iou_reward": 0.8079273527593136, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.647170066833496, "rewards/thk_ans_format_reward": 1.0, "step": 1888, "think_completion_length": 7.125 }, { "clip_ratio": 0.0, "completion_length": 148.3541717529297, "epoch": 6.381112984822934, "grad_norm": 14.533013973880257, "kl": 0.419921875, "learning_rate": 4.681869369369369e-07, "loss": 0.0004, "reward": 3.430299997329712, "reward_std": 0.10118568316102028, "rewards/final_reward": 1.7194286437301751, "rewards/mask_iou_reward": 0.8597143218650876, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4302998781204224, "rewards/thk_ans_format_reward": 1.0, "step": 1889, "think_completion_length": 8.583333333333332 }, { "clip_ratio": 0.0, "completion_length": 161.53125762939453, "epoch": 6.3844856661045535, "grad_norm": 8.55568300191631, "kl": 0.55078125, "learning_rate": 4.679054054054054e-07, "loss": 0.0006, "reward": 3.460633873939514, "reward_std": 0.25002913177013397, "rewards/final_reward": 1.8770856025200167, "rewards/mask_iou_reward": 0.9385428012600083, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 1.4814671277999878, "rewards/thk_ans_format_reward": 0.9895833432674408, "step": 1890, "think_completion_length": 8.708333333333332 }, { "clip_ratio": 0.0, "completion_length": 148.62500762939453, "epoch": 6.387858347386172, "grad_norm": 11.682304101407496, "kl": 0.4931640625, "learning_rate": 4.6762387387387385e-07, "loss": 0.0005, "reward": 3.004297971725464, "reward_std": 0.11261074617505074, "rewards/final_reward": 0.4484286897518397, "rewards/mask_iou_reward": 0.22421434487591985, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.004297912120819, "rewards/thk_ans_format_reward": 1.0, "step": 1891, "think_completion_length": 7.333333333333334 }, { "clip_ratio": 0.0, "completion_length": 162.71875762939453, "epoch": 6.391231028667791, "grad_norm": 7.919820661858854, "kl": 0.470703125, "learning_rate": 4.673423423423423e-07, "loss": 0.0005, "reward": 3.617723226547241, "reward_std": 0.06559170037508011, "rewards/final_reward": 1.4914626275770577, "rewards/mask_iou_reward": 0.7457313137885289, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.617722988128662, "rewards/thk_ans_format_reward": 1.0, "step": 1892, "think_completion_length": 7.291666666666667 }, { "clip_ratio": 0.0, "completion_length": 175.1979217529297, "epoch": 6.3946037099494095, "grad_norm": 16.955529628182386, "kl": 0.41015625, "learning_rate": 4.670608108108108e-07, "loss": 0.0004, "reward": 3.0826051235198975, "reward_std": 0.14307872019708157, "rewards/final_reward": 0.46960478747577494, "rewards/mask_iou_reward": 0.23480239373788747, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 1.0930215120315552, "rewards/thk_ans_format_reward": 1.0, "step": 1893, "think_completion_length": 7.875 }, { "clip_ratio": 0.0, "completion_length": 239.46875762939453, "epoch": 6.397976391231029, "grad_norm": 9.479392818196867, "kl": 0.478515625, "learning_rate": 4.6677927927927924e-07, "loss": 0.0005, "reward": 3.517951011657715, "reward_std": 0.16193577647209167, "rewards/final_reward": 1.0184101228734423, "rewards/mask_iou_reward": 0.5092050614367212, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5179506540298462, "rewards/thk_ans_format_reward": 1.0, "step": 1894, "think_completion_length": 8.791666666666668 }, { "clip_ratio": 0.0, "completion_length": 195.92708587646484, "epoch": 6.401349072512647, "grad_norm": 20.372484305072078, "kl": 0.416015625, "learning_rate": 4.6649774774774775e-07, "loss": 0.0004, "reward": 3.436389207839966, "reward_std": 0.11014799401164055, "rewards/final_reward": 1.2207106790456062, "rewards/mask_iou_reward": 0.6103553395228031, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4363892078399658, "rewards/thk_ans_format_reward": 1.0, "step": 1895, "think_completion_length": 7.666666666666666 }, { "clip_ratio": 0.0, "completion_length": 123.08333587646484, "epoch": 6.4047217537942664, "grad_norm": 15.893878645545087, "kl": 0.494140625, "learning_rate": 4.662162162162162e-07, "loss": 0.0005, "reward": 3.601964235305786, "reward_std": 0.16152212023735046, "rewards/final_reward": 1.766780638428799, "rewards/mask_iou_reward": 0.8833903192143995, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.601963996887207, "rewards/thk_ans_format_reward": 1.0, "step": 1896, "think_completion_length": 7.541666666666666 }, { "clip_ratio": 0.0, "completion_length": 154.2604217529297, "epoch": 6.408094435075886, "grad_norm": 8.736031495192677, "kl": 0.3828125, "learning_rate": 4.6593468468468467e-07, "loss": 0.0004, "reward": 3.5205795764923096, "reward_std": 0.20784608274698257, "rewards/final_reward": 1.4668057131209964, "rewards/mask_iou_reward": 0.7334028565604982, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.52057945728302, "rewards/thk_ans_format_reward": 1.0, "step": 1897, "think_completion_length": 8.0 }, { "clip_ratio": 0.0, "completion_length": 168.18750762939453, "epoch": 6.411467116357504, "grad_norm": 7.594879038415985, "kl": 0.669921875, "learning_rate": 4.6565315315315313e-07, "loss": 0.0007, "reward": 3.405119299888611, "reward_std": 0.12414194270968437, "rewards/final_reward": 1.162574556350045, "rewards/mask_iou_reward": 0.5812872781750225, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.40511953830719, "rewards/thk_ans_format_reward": 1.0, "step": 1898, "think_completion_length": 8.541666666666668 }, { "clip_ratio": 0.0, "completion_length": 190.2291717529297, "epoch": 6.414839797639123, "grad_norm": 29.348037382618, "kl": 0.37890625, "learning_rate": 4.653716216216216e-07, "loss": 0.0003, "reward": 3.5290740728378296, "reward_std": 0.07987385988235474, "rewards/final_reward": 1.2127857770770258, "rewards/mask_iou_reward": 0.6063928885385129, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.529074251651764, "rewards/thk_ans_format_reward": 1.0, "step": 1899, "think_completion_length": 9.625 }, { "clip_ratio": 0.0, "completion_length": 162.46875762939453, "epoch": 6.418212478920742, "grad_norm": 31.070338674714865, "kl": 0.54296875, "learning_rate": 4.650900900900901e-07, "loss": 0.0006, "reward": 3.3826311826705933, "reward_std": 0.049534888938069344, "rewards/final_reward": 1.8532369135906288, "rewards/mask_iou_reward": 0.9266184567953144, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3826313018798828, "rewards/thk_ans_format_reward": 1.0, "step": 1900, "think_completion_length": 8.25 }, { "clip_ratio": 0.0, "completion_length": 167.3229217529297, "epoch": 6.421585160202361, "grad_norm": 9.508395337814294, "kl": 0.40234375, "learning_rate": 4.6480855855855857e-07, "loss": 0.0004, "reward": 3.3927754163742065, "reward_std": 0.07938742637634277, "rewards/final_reward": 1.1924416076254536, "rewards/mask_iou_reward": 0.5962208038127268, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3927754163742065, "rewards/thk_ans_format_reward": 1.0, "step": 1901, "think_completion_length": 8.125 }, { "clip_ratio": 0.0, "completion_length": 158.8854217529297, "epoch": 6.424957841483979, "grad_norm": 11.1582006576339, "kl": 0.4482421875, "learning_rate": 4.6452702702702703e-07, "loss": 0.0005, "reward": 3.4296629428863525, "reward_std": 0.11983692087233067, "rewards/final_reward": 1.087414950859162, "rewards/mask_iou_reward": 0.543707475429581, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4296630024909973, "rewards/thk_ans_format_reward": 1.0, "step": 1902, "think_completion_length": 9.041666666666668 }, { "clip_ratio": 0.0, "completion_length": 160.30209350585938, "epoch": 6.428330522765599, "grad_norm": 12.465830782713155, "kl": 0.501953125, "learning_rate": 4.642454954954955e-07, "loss": 0.0005, "reward": 3.1689430475234985, "reward_std": 0.07689309120178223, "rewards/final_reward": 1.5112853224431064, "rewards/mask_iou_reward": 0.7556426612215532, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.168942928314209, "rewards/thk_ans_format_reward": 1.0, "step": 1903, "think_completion_length": 8.291666666666668 }, { "clip_ratio": 0.0, "completion_length": 176.65625762939453, "epoch": 6.431703204047217, "grad_norm": 9.971794799370729, "kl": 0.4296875, "learning_rate": 4.639639639639639e-07, "loss": 0.0004, "reward": 3.2337831258773804, "reward_std": 0.10983862727880478, "rewards/final_reward": 1.3039498349252785, "rewards/mask_iou_reward": 0.6519749174626392, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.233783334493637, "rewards/thk_ans_format_reward": 1.0, "step": 1904, "think_completion_length": 9.416666666666668 }, { "clip_ratio": 0.0, "completion_length": 115.26041793823242, "epoch": 6.435075885328836, "grad_norm": 18.680989287171347, "kl": 0.607421875, "learning_rate": 4.636824324324324e-07, "loss": 0.0006, "reward": 3.4857401847839355, "reward_std": 0.122207872569561, "rewards/final_reward": 1.6436075325939328, "rewards/mask_iou_reward": 0.8218037662969664, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.485740065574646, "rewards/thk_ans_format_reward": 1.0, "step": 1905, "think_completion_length": 8.458333333333334 }, { "clip_ratio": 0.0, "completion_length": 218.90625, "epoch": 6.438448566610456, "grad_norm": 24.15840867971098, "kl": 0.4111328125, "learning_rate": 4.6340090090090087e-07, "loss": 0.0004, "reward": 3.565677046775818, "reward_std": 0.05232588015496731, "rewards/final_reward": 1.7861149132567309, "rewards/mask_iou_reward": 0.8930574566283654, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5656769275665283, "rewards/thk_ans_format_reward": 1.0, "step": 1906, "think_completion_length": 8.583333333333334 }, { "clip_ratio": 0.0, "completion_length": 201.0520896911621, "epoch": 6.441821247892074, "grad_norm": 9.748880341775454, "kl": 0.392578125, "learning_rate": 4.6311936936936933e-07, "loss": 0.0004, "reward": 3.5896493196487427, "reward_std": 0.06489380449056625, "rewards/final_reward": 1.803237968390866, "rewards/mask_iou_reward": 0.901618984195433, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5896490812301636, "rewards/thk_ans_format_reward": 1.0, "step": 1907, "think_completion_length": 8.375 }, { "clip_ratio": 0.0, "completion_length": 186.21875, "epoch": 6.445193929173693, "grad_norm": 8.003405234992922, "kl": 0.453125, "learning_rate": 4.628378378378378e-07, "loss": 0.0005, "reward": 3.381407618522644, "reward_std": 0.106148362159729, "rewards/final_reward": 1.3149248627705654, "rewards/mask_iou_reward": 0.6574624313852827, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.381407618522644, "rewards/thk_ans_format_reward": 1.0, "step": 1908, "think_completion_length": 9.041666666666668 }, { "clip_ratio": 0.0, "completion_length": 188.46875, "epoch": 6.448566610455312, "grad_norm": 7.960417542259144, "kl": 0.4375, "learning_rate": 4.6255630630630625e-07, "loss": 0.0004, "reward": 3.6770023107528687, "reward_std": 0.11613703519105911, "rewards/final_reward": 1.7991855951978626, "rewards/mask_iou_reward": 0.8995927975989313, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6770024299621582, "rewards/thk_ans_format_reward": 1.0, "step": 1909, "think_completion_length": 9.666666666666666 }, { "clip_ratio": 0.0, "completion_length": 173.05208587646484, "epoch": 6.451939291736931, "grad_norm": 13.076750082591246, "kl": 0.38671875, "learning_rate": 4.6227477477477477e-07, "loss": 0.0004, "reward": 3.450620651245117, "reward_std": 0.12947594933211803, "rewards/final_reward": 1.7108386702374498, "rewards/mask_iou_reward": 0.8554193351187249, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.450620412826538, "rewards/thk_ans_format_reward": 1.0, "step": 1910, "think_completion_length": 7.541666666666667 }, { "clip_ratio": 0.0, "completion_length": 159.59375762939453, "epoch": 6.455311973018549, "grad_norm": 12.562946407758929, "kl": 0.462890625, "learning_rate": 4.6199324324324323e-07, "loss": 0.0005, "reward": 3.538373827934265, "reward_std": 0.052979251369833946, "rewards/final_reward": 1.9293922026045054, "rewards/mask_iou_reward": 0.9646961013022527, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5383738279342651, "rewards/thk_ans_format_reward": 1.0, "step": 1911, "think_completion_length": 8.916666666666668 }, { "clip_ratio": 0.0, "completion_length": 139.25000762939453, "epoch": 6.458684654300169, "grad_norm": 27.17672682917496, "kl": 0.443359375, "learning_rate": 4.617117117117117e-07, "loss": 0.0004, "reward": 3.4632757902145386, "reward_std": 0.18749287351965904, "rewards/final_reward": 1.8631885229559808, "rewards/mask_iou_reward": 0.9315942614779904, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4632756114006042, "rewards/thk_ans_format_reward": 1.0, "step": 1912, "think_completion_length": 9.875 }, { "clip_ratio": 0.0, "completion_length": 225.50000762939453, "epoch": 6.462057335581788, "grad_norm": 68.01765666204261, "kl": 0.8388671875, "learning_rate": 4.6143018018018015e-07, "loss": 0.0008, "reward": 3.3502947092056274, "reward_std": 0.15701918303966522, "rewards/final_reward": 1.287749956469415, "rewards/mask_iou_reward": 0.6438749782347075, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3502946496009827, "rewards/thk_ans_format_reward": 1.0, "step": 1913, "think_completion_length": 7.791666666666667 }, { "clip_ratio": 0.0, "completion_length": 135.93750762939453, "epoch": 6.465430016863406, "grad_norm": 12.465496225956764, "kl": 0.4345703125, "learning_rate": 4.611486486486486e-07, "loss": 0.0004, "reward": 3.6096237897872925, "reward_std": 0.11629275232553482, "rewards/final_reward": 1.6158974661798904, "rewards/mask_iou_reward": 0.8079487330899452, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6096238493919373, "rewards/thk_ans_format_reward": 1.0, "step": 1914, "think_completion_length": 8.75 }, { "clip_ratio": 0.0, "completion_length": 128.1875, "epoch": 6.4688026981450255, "grad_norm": 8.953988368314507, "kl": 0.41015625, "learning_rate": 4.608671171171171e-07, "loss": 0.0004, "reward": 3.6579357385635376, "reward_std": 0.08615681529045105, "rewards/final_reward": 1.8738261382499415, "rewards/mask_iou_reward": 0.9369130691249707, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6579357981681824, "rewards/thk_ans_format_reward": 1.0, "step": 1915, "think_completion_length": 8.25 }, { "clip_ratio": 0.0, "completion_length": 168.29166793823242, "epoch": 6.472175379426644, "grad_norm": 8.530074182730772, "kl": 0.419921875, "learning_rate": 4.605855855855856e-07, "loss": 0.0004, "reward": 3.500860333442688, "reward_std": 0.06479554157704115, "rewards/final_reward": 1.7721428409723283, "rewards/mask_iou_reward": 0.8860714204861642, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5008601546287537, "rewards/thk_ans_format_reward": 1.0, "step": 1916, "think_completion_length": 7.875 }, { "clip_ratio": 0.0, "completion_length": 125.09375762939453, "epoch": 6.475548060708263, "grad_norm": 9.642455716005925, "kl": 0.439453125, "learning_rate": 4.6030405405405404e-07, "loss": 0.0005, "reward": 3.2050682306289673, "reward_std": 0.2021598145365715, "rewards/final_reward": 0.5997128136106451, "rewards/mask_iou_reward": 0.29985640680532255, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2050682306289673, "rewards/thk_ans_format_reward": 1.0, "step": 1917, "think_completion_length": 8.958333333333334 }, { "clip_ratio": 0.0, "completion_length": 129.9791717529297, "epoch": 6.4789207419898815, "grad_norm": 7.830276530642089, "kl": 0.626953125, "learning_rate": 4.600225225225225e-07, "loss": 0.0006, "reward": 3.629697561264038, "reward_std": 0.036790769547224045, "rewards/final_reward": 1.4997423734748434, "rewards/mask_iou_reward": 0.7498711867374217, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6296973824501038, "rewards/thk_ans_format_reward": 1.0, "step": 1918, "think_completion_length": 7.708333333333334 }, { "clip_ratio": 0.0, "completion_length": 165.8854217529297, "epoch": 6.482293423271501, "grad_norm": 8.236699940973251, "kl": 0.5322265625, "learning_rate": 4.5974099099099097e-07, "loss": 0.0005, "reward": 3.3659796714782715, "reward_std": 0.08384433016180992, "rewards/final_reward": 1.8747773736141915, "rewards/mask_iou_reward": 0.9373886868070958, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.365979552268982, "rewards/thk_ans_format_reward": 1.0, "step": 1919, "think_completion_length": 7.083333333333333 }, { "clip_ratio": 0.0, "completion_length": 145.45833587646484, "epoch": 6.48566610455312, "grad_norm": 7.399897831772437, "kl": 0.576171875, "learning_rate": 4.594594594594595e-07, "loss": 0.0006, "reward": 3.5207263231277466, "reward_std": 0.0641837865114212, "rewards/final_reward": 1.9002956424366944, "rewards/mask_iou_reward": 0.9501478212183472, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5207264423370361, "rewards/thk_ans_format_reward": 1.0, "step": 1920, "think_completion_length": 8.458333333333332 }, { "clip_ratio": 0.0, "completion_length": 172.42708587646484, "epoch": 6.4890387858347385, "grad_norm": 9.934190133486409, "kl": 0.5322265625, "learning_rate": 4.5917792792792794e-07, "loss": 0.0005, "reward": 3.6050784587860107, "reward_std": 0.05036386847496033, "rewards/final_reward": 1.4844076510515394, "rewards/mask_iou_reward": 0.7422038255257697, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6050782203674316, "rewards/thk_ans_format_reward": 1.0, "step": 1921, "think_completion_length": 8.5 }, { "clip_ratio": 0.0, "completion_length": 143.61458587646484, "epoch": 6.492411467116358, "grad_norm": 17.45409532746466, "kl": 0.501953125, "learning_rate": 4.588963963963964e-07, "loss": 0.0006, "reward": 3.6852781772613525, "reward_std": 0.14414148032665253, "rewards/final_reward": 1.8175316750528594, "rewards/mask_iou_reward": 0.9087658375264297, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6852782368659973, "rewards/thk_ans_format_reward": 1.0, "step": 1922, "think_completion_length": 9.291666666666668 }, { "clip_ratio": 0.0, "completion_length": 145.67708587646484, "epoch": 6.495784148397976, "grad_norm": 8.365698590732766, "kl": 0.494140625, "learning_rate": 4.5861486486486486e-07, "loss": 0.0005, "reward": 3.620155453681946, "reward_std": 0.10641103237867355, "rewards/final_reward": 1.5902591801469743, "rewards/mask_iou_reward": 0.7951295900734872, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6201554536819458, "rewards/thk_ans_format_reward": 1.0, "step": 1923, "think_completion_length": 8.333333333333332 }, { "clip_ratio": 0.0, "completion_length": 167.64583587646484, "epoch": 6.499156829679595, "grad_norm": 8.407844695773075, "kl": 0.4052734375, "learning_rate": 4.5833333333333327e-07, "loss": 0.0004, "reward": 3.578555703163147, "reward_std": 0.040246653370559216, "rewards/final_reward": 1.0063336036209627, "rewards/mask_iou_reward": 0.5031668018104813, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5785555839538574, "rewards/thk_ans_format_reward": 1.0, "step": 1924, "think_completion_length": 8.75 }, { "clip_ratio": 0.0, "completion_length": 143.34375762939453, "epoch": 6.502529510961214, "grad_norm": 26.172801271228387, "kl": 0.421875, "learning_rate": 4.580518018018018e-07, "loss": 0.0004, "reward": 3.4072368144989014, "reward_std": 0.07461421936750412, "rewards/final_reward": 1.2929097656984843, "rewards/mask_iou_reward": 0.6464548828492421, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4072368144989014, "rewards/thk_ans_format_reward": 1.0, "step": 1925, "think_completion_length": 6.958333333333333 }, { "clip_ratio": 0.0, "completion_length": 159.73958587646484, "epoch": 6.505902192242833, "grad_norm": 12.119896301081399, "kl": 0.4287109375, "learning_rate": 4.5777027027027024e-07, "loss": 0.0004, "reward": 3.206121563911438, "reward_std": 0.09432797785848379, "rewards/final_reward": 0.8982187139512381, "rewards/mask_iou_reward": 0.44910935697561905, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2061215043067932, "rewards/thk_ans_format_reward": 1.0, "step": 1926, "think_completion_length": 10.75 }, { "clip_ratio": 0.0, "completion_length": 145.21875, "epoch": 6.509274873524452, "grad_norm": 415.2726487286936, "kl": 0.423828125, "learning_rate": 4.574887387387387e-07, "loss": 0.0004, "reward": 3.6168497800827026, "reward_std": 0.10688769817352295, "rewards/final_reward": 1.6747035288263004, "rewards/mask_iou_reward": 0.8373517644131502, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6272663474082947, "rewards/thk_ans_format_reward": 0.9895833432674408, "step": 1927, "think_completion_length": 9.958333333333334 }, { "clip_ratio": 0.0, "completion_length": 145.5729217529297, "epoch": 6.512647554806071, "grad_norm": 19.920648554833924, "kl": 1.8330078125, "learning_rate": 4.5720720720720716e-07, "loss": 0.0018, "reward": 3.4919700622558594, "reward_std": 0.17838171124458313, "rewards/final_reward": 1.3544193334299761, "rewards/mask_iou_reward": 0.6772096667149881, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.491970181465149, "rewards/thk_ans_format_reward": 1.0, "step": 1928, "think_completion_length": 8.958333333333334 }, { "clip_ratio": 0.0, "completion_length": 167.73959350585938, "epoch": 6.51602023608769, "grad_norm": 5.874339600504554, "kl": 0.41796875, "learning_rate": 4.569256756756756e-07, "loss": 0.0004, "reward": 3.5194711685180664, "reward_std": 0.1812281534075737, "rewards/final_reward": 1.7720225438373989, "rewards/mask_iou_reward": 0.8860112719186994, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5194711089134216, "rewards/thk_ans_format_reward": 1.0, "step": 1929, "think_completion_length": 9.291666666666666 }, { "clip_ratio": 0.0, "completion_length": 138.14583587646484, "epoch": 6.519392917369308, "grad_norm": 71.2249777632913, "kl": 0.5859375, "learning_rate": 4.5664414414414414e-07, "loss": 0.0006, "reward": 3.513147711753845, "reward_std": 0.0648888386785984, "rewards/final_reward": 1.1092570722526767, "rewards/mask_iou_reward": 0.5546285361263383, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5131479501724243, "rewards/thk_ans_format_reward": 1.0, "step": 1930, "think_completion_length": 7.708333333333334 }, { "clip_ratio": 0.0, "completion_length": 147.46875, "epoch": 6.522765598650928, "grad_norm": 10.954163481582695, "kl": 0.3837890625, "learning_rate": 4.563626126126126e-07, "loss": 0.0004, "reward": 3.397698402404785, "reward_std": 0.0887177549302578, "rewards/final_reward": 1.8347744826708012, "rewards/mask_iou_reward": 0.9173872413354006, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3976984024047852, "rewards/thk_ans_format_reward": 1.0, "step": 1931, "think_completion_length": 8.875 }, { "clip_ratio": 0.0, "completion_length": 172.53125762939453, "epoch": 6.526138279932546, "grad_norm": 7.545325626890654, "kl": 0.5244140625, "learning_rate": 4.5608108108108106e-07, "loss": 0.0005, "reward": 3.047685384750366, "reward_std": 0.022525336127728224, "rewards/final_reward": 1.05814256808342, "rewards/mask_iou_reward": 0.52907128404171, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0476853251457214, "rewards/thk_ans_format_reward": 1.0, "step": 1932, "think_completion_length": 8.666666666666666 }, { "clip_ratio": 0.0, "completion_length": 135.20833587646484, "epoch": 6.529510961214165, "grad_norm": 33.660125151640464, "kl": 0.44921875, "learning_rate": 4.557995495495495e-07, "loss": 0.0005, "reward": 3.55889630317688, "reward_std": 0.051030886359512806, "rewards/final_reward": 1.5346261785456772, "rewards/mask_iou_reward": 0.7673130892728386, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5588963627815247, "rewards/thk_ans_format_reward": 1.0, "step": 1933, "think_completion_length": 8.166666666666666 }, { "clip_ratio": 0.0, "completion_length": 161.05208587646484, "epoch": 6.532883642495785, "grad_norm": 8.120695624403554, "kl": 0.431640625, "learning_rate": 4.55518018018018e-07, "loss": 0.0004, "reward": 3.450679898262024, "reward_std": 0.10136326961219311, "rewards/final_reward": 0.9703960820165738, "rewards/mask_iou_reward": 0.4851980410082869, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4506798386573792, "rewards/thk_ans_format_reward": 1.0, "step": 1934, "think_completion_length": 8.791666666666666 }, { "clip_ratio": 0.0, "completion_length": 145.8229217529297, "epoch": 6.536256323777403, "grad_norm": 12.669245787632661, "kl": 4.64453125, "learning_rate": 4.552364864864865e-07, "loss": 0.0047, "reward": 3.7870538234710693, "reward_std": 0.07818649988621473, "rewards/final_reward": 1.5274776588059071, "rewards/mask_iou_reward": 0.7637388294029536, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7870538234710693, "rewards/thk_ans_format_reward": 1.0, "step": 1935, "think_completion_length": 9.416666666666668 }, { "clip_ratio": 0.0, "completion_length": 148.90625762939453, "epoch": 6.539629005059022, "grad_norm": 11.033899027393797, "kl": 0.416015625, "learning_rate": 4.5495495495495496e-07, "loss": 0.0004, "reward": 3.4292943477630615, "reward_std": 0.12554167211055756, "rewards/final_reward": 1.6428870075377782, "rewards/mask_iou_reward": 0.8214435037688891, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.429294228553772, "rewards/thk_ans_format_reward": 1.0, "step": 1936, "think_completion_length": 6.958333333333334 }, { "clip_ratio": 0.0, "completion_length": 177.2916717529297, "epoch": 6.543001686340641, "grad_norm": 8.971625066042417, "kl": 0.5234375, "learning_rate": 4.546734234234234e-07, "loss": 0.0006, "reward": 3.288731336593628, "reward_std": 0.13752873055636883, "rewards/final_reward": 1.865159786427796, "rewards/mask_iou_reward": 0.932579893213898, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2887312769889832, "rewards/thk_ans_format_reward": 1.0, "step": 1937, "think_completion_length": 8.708333333333332 }, { "clip_ratio": 0.0, "completion_length": 136.8854217529297, "epoch": 6.54637436762226, "grad_norm": 13.524320563229523, "kl": 0.423828125, "learning_rate": 4.543918918918919e-07, "loss": 0.0005, "reward": 3.1908024549484253, "reward_std": 0.04454457201063633, "rewards/final_reward": 1.530183020227454, "rewards/mask_iou_reward": 0.765091510113727, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1908025741577148, "rewards/thk_ans_format_reward": 1.0, "step": 1938, "think_completion_length": 9.0 }, { "clip_ratio": 0.0, "completion_length": 129.48958587646484, "epoch": 6.549747048903878, "grad_norm": 7.095076055591257, "kl": 0.54296875, "learning_rate": 4.5411036036036034e-07, "loss": 0.0006, "reward": 3.755676031112671, "reward_std": 0.05841661896556616, "rewards/final_reward": 1.6603783972345676, "rewards/mask_iou_reward": 0.8301891986172838, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7556763291358948, "rewards/thk_ans_format_reward": 1.0, "step": 1939, "think_completion_length": 8.791666666666668 }, { "clip_ratio": 0.0, "completion_length": 150.03125762939453, "epoch": 6.5531197301854975, "grad_norm": 19.24255297558277, "kl": 0.443359375, "learning_rate": 4.5382882882882885e-07, "loss": 0.0004, "reward": 3.671309232711792, "reward_std": 0.07891538739204407, "rewards/final_reward": 1.6045817813514578, "rewards/mask_iou_reward": 0.8022908906757289, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6713091135025024, "rewards/thk_ans_format_reward": 1.0, "step": 1940, "think_completion_length": 7.375 }, { "clip_ratio": 0.0, "completion_length": 137.12500762939453, "epoch": 6.556492411467117, "grad_norm": 21.61772380542658, "kl": 0.44140625, "learning_rate": 4.535472972972973e-07, "loss": 0.0005, "reward": 3.2939724922180176, "reward_std": 0.07393957488238811, "rewards/final_reward": 1.8902274489083017, "rewards/mask_iou_reward": 0.9451137244541509, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2939725518226624, "rewards/thk_ans_format_reward": 1.0, "step": 1941, "think_completion_length": 10.125 }, { "clip_ratio": 0.0, "completion_length": 143.70833587646484, "epoch": 6.559865092748735, "grad_norm": 8.304061072522082, "kl": 0.3974609375, "learning_rate": 4.5326576576576577e-07, "loss": 0.0004, "reward": 3.7287017107009888, "reward_std": 0.0779542843811214, "rewards/final_reward": 1.565529514841001, "rewards/mask_iou_reward": 0.7827647574205004, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7287017703056335, "rewards/thk_ans_format_reward": 1.0, "step": 1942, "think_completion_length": 7.791666666666667 }, { "clip_ratio": 0.0, "completion_length": 139.5729217529297, "epoch": 6.5632377740303545, "grad_norm": 29.07655724939814, "kl": 0.4384765625, "learning_rate": 4.5298423423423423e-07, "loss": 0.0004, "reward": 3.7415881156921387, "reward_std": 0.11560340598225594, "rewards/final_reward": 1.9170531111047713, "rewards/mask_iou_reward": 0.9585265555523856, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7415879368782043, "rewards/thk_ans_format_reward": 1.0, "step": 1943, "think_completion_length": 10.541666666666668 }, { "clip_ratio": 0.0, "completion_length": 137.61458587646484, "epoch": 6.566610455311973, "grad_norm": 6.927386222217618, "kl": 0.47265625, "learning_rate": 4.5270270270270264e-07, "loss": 0.0005, "reward": 3.2021960020065308, "reward_std": 0.12428174912929535, "rewards/final_reward": 0.5200134386180482, "rewards/mask_iou_reward": 0.2600067193090241, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2021958231925964, "rewards/thk_ans_format_reward": 1.0, "step": 1944, "think_completion_length": 7.75 }, { "clip_ratio": 0.0, "completion_length": 188.09375762939453, "epoch": 6.569983136593592, "grad_norm": 9.047643638152596, "kl": 0.4072265625, "learning_rate": 4.524211711711711e-07, "loss": 0.0005, "reward": 3.5492023229599, "reward_std": 0.052342869341373444, "rewards/final_reward": 1.9278371578029603, "rewards/mask_iou_reward": 0.9639185789014801, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5492025017738342, "rewards/thk_ans_format_reward": 1.0, "step": 1945, "think_completion_length": 6.958333333333334 }, { "clip_ratio": 0.0, "completion_length": 116.67708587646484, "epoch": 6.5733558178752105, "grad_norm": 8.075763010287291, "kl": 0.4912109375, "learning_rate": 4.521396396396396e-07, "loss": 0.0006, "reward": 3.7009243965148926, "reward_std": 0.04675254225730896, "rewards/final_reward": 1.7781464676900498, "rewards/mask_iou_reward": 0.8890732338450249, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7009243369102478, "rewards/thk_ans_format_reward": 1.0, "step": 1946, "think_completion_length": 7.583333333333334 }, { "clip_ratio": 0.0, "completion_length": 158.5625, "epoch": 6.57672849915683, "grad_norm": 15.369144475444557, "kl": 0.5537109375, "learning_rate": 4.518581081081081e-07, "loss": 0.0005, "reward": 3.627153992652893, "reward_std": 0.11572860553860664, "rewards/final_reward": 1.614166593708518, "rewards/mask_iou_reward": 0.807083296854259, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6271539330482483, "rewards/thk_ans_format_reward": 1.0, "step": 1947, "think_completion_length": 8.125 }, { "clip_ratio": 0.0, "completion_length": 138.39583587646484, "epoch": 6.580101180438449, "grad_norm": 84.51339562057268, "kl": 0.669921875, "learning_rate": 4.5157657657657654e-07, "loss": 0.0007, "reward": 3.5069133043289185, "reward_std": 0.15810798108577728, "rewards/final_reward": 1.6061496162495597, "rewards/mask_iou_reward": 0.8030748081247798, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5069132447242737, "rewards/thk_ans_format_reward": 1.0, "step": 1948, "think_completion_length": 7.708333333333334 }, { "clip_ratio": 0.0, "completion_length": 138.42709350585938, "epoch": 6.583473861720067, "grad_norm": 12.173912335159075, "kl": 0.4990234375, "learning_rate": 4.51295045045045e-07, "loss": 0.0005, "reward": 3.458880066871643, "reward_std": 0.079188940115273, "rewards/final_reward": 1.3770498445233732, "rewards/mask_iou_reward": 0.6885249222616866, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4588798880577087, "rewards/thk_ans_format_reward": 1.0, "step": 1949, "think_completion_length": 9.166666666666668 }, { "clip_ratio": 0.0, "completion_length": 123.47916793823242, "epoch": 6.586846543001687, "grad_norm": 17.453147272178725, "kl": 0.541015625, "learning_rate": 4.5101351351351346e-07, "loss": 0.0005, "reward": 3.465920329093933, "reward_std": 0.07390345633029938, "rewards/final_reward": 1.7473652437753469, "rewards/mask_iou_reward": 0.8736826218876734, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.465920329093933, "rewards/thk_ans_format_reward": 1.0, "step": 1950, "think_completion_length": 7.833333333333334 }, { "clip_ratio": 0.0, "completion_length": 136.67708587646484, "epoch": 6.590219224283305, "grad_norm": 7.769345536825688, "kl": 0.4599609375, "learning_rate": 4.5073198198198197e-07, "loss": 0.0005, "reward": 3.3470152616500854, "reward_std": 0.08661656081676483, "rewards/final_reward": 1.3208495764728003, "rewards/mask_iou_reward": 0.6604247882364002, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3470152616500854, "rewards/thk_ans_format_reward": 1.0, "step": 1951, "think_completion_length": 8.0 }, { "clip_ratio": 0.0, "completion_length": 130.34375762939453, "epoch": 6.593591905564924, "grad_norm": 15.386128922804302, "kl": 0.53125, "learning_rate": 4.5045045045045043e-07, "loss": 0.0006, "reward": 3.5289965867996216, "reward_std": 0.14280862733721733, "rewards/final_reward": 1.461553967923843, "rewards/mask_iou_reward": 0.7307769839619215, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5289965271949768, "rewards/thk_ans_format_reward": 1.0, "step": 1952, "think_completion_length": 7.166666666666666 }, { "clip_ratio": 0.0, "completion_length": 125.67708587646484, "epoch": 6.596964586846543, "grad_norm": 26.095973849490772, "kl": 0.474609375, "learning_rate": 4.501689189189189e-07, "loss": 0.0005, "reward": 3.497464895248413, "reward_std": 0.1548975557088852, "rewards/final_reward": 1.8610477691556193, "rewards/mask_iou_reward": 0.9305238845778097, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4974648356437683, "rewards/thk_ans_format_reward": 1.0, "step": 1953, "think_completion_length": 9.291666666666668 }, { "clip_ratio": 0.0, "completion_length": 209.6041717529297, "epoch": 6.600337268128162, "grad_norm": 18.14612727739541, "kl": 3.826171875, "learning_rate": 4.4988738738738735e-07, "loss": 0.0038, "reward": 3.464960217475891, "reward_std": 0.168898306787014, "rewards/final_reward": 1.8793782250237863, "rewards/mask_iou_reward": 0.9396891125118931, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 1.4857934713363647, "rewards/thk_ans_format_reward": 0.9895833432674408, "step": 1954, "think_completion_length": 7.541666666666666 }, { "clip_ratio": 0.0, "completion_length": 145.53125762939453, "epoch": 6.60370994940978, "grad_norm": 46.67923195576741, "kl": 0.703125, "learning_rate": 4.496058558558558e-07, "loss": 0.0007, "reward": 3.451904535293579, "reward_std": 0.11900551989674568, "rewards/final_reward": 1.68860443712021, "rewards/mask_iou_reward": 0.844302218560105, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4519044756889343, "rewards/thk_ans_format_reward": 1.0, "step": 1955, "think_completion_length": 7.125 }, { "clip_ratio": 0.0, "completion_length": 127.10417175292969, "epoch": 6.6070826306914, "grad_norm": 9.579470561626394, "kl": 0.4453125, "learning_rate": 4.4932432432432433e-07, "loss": 0.0005, "reward": 3.6845006942749023, "reward_std": 0.04711965471506119, "rewards/final_reward": 1.8025177070464735, "rewards/mask_iou_reward": 0.9012588535232368, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.684500515460968, "rewards/thk_ans_format_reward": 1.0, "step": 1956, "think_completion_length": 8.875 }, { "clip_ratio": 0.0, "completion_length": 160.0104217529297, "epoch": 6.610455311973018, "grad_norm": 11.62271298127058, "kl": 0.4140625, "learning_rate": 4.490427927927928e-07, "loss": 0.0004, "reward": 3.4423630237579346, "reward_std": 0.06707348302006721, "rewards/final_reward": 1.4323155140969193, "rewards/mask_iou_reward": 0.7161577570484596, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4423629641532898, "rewards/thk_ans_format_reward": 1.0, "step": 1957, "think_completion_length": 8.083333333333334 }, { "clip_ratio": 0.0, "completion_length": 118.375, "epoch": 6.613827993254637, "grad_norm": 5.29781925562583, "kl": 0.56640625, "learning_rate": 4.4876126126126125e-07, "loss": 0.0005, "reward": 3.433348774909973, "reward_std": 0.03969069384038448, "rewards/final_reward": 1.8051425197216238, "rewards/mask_iou_reward": 0.9025712598608119, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4333484768867493, "rewards/thk_ans_format_reward": 1.0, "step": 1958, "think_completion_length": 7.208333333333333 }, { "clip_ratio": 0.0, "completion_length": 185.92708587646484, "epoch": 6.617200674536257, "grad_norm": 8.482364662005988, "kl": 0.5, "learning_rate": 4.484797297297297e-07, "loss": 0.0005, "reward": 3.334537982940674, "reward_std": 0.2071598581969738, "rewards/final_reward": 1.401921329152926, "rewards/mask_iou_reward": 0.700960664576463, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 1.3553712964057922, "rewards/thk_ans_format_reward": 0.9895833432674408, "step": 1959, "think_completion_length": 7.166666666666666 }, { "clip_ratio": 0.0, "completion_length": 122.65625762939453, "epoch": 6.620573355817875, "grad_norm": 9.109768103898793, "kl": 0.548828125, "learning_rate": 4.4819819819819817e-07, "loss": 0.0005, "reward": 3.436391592025757, "reward_std": 0.09884997457265854, "rewards/final_reward": 1.448345198392761, "rewards/mask_iou_reward": 0.7241725991963806, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4363916516304016, "rewards/thk_ans_format_reward": 1.0, "step": 1960, "think_completion_length": 8.916666666666668 }, { "clip_ratio": 0.0, "completion_length": 186.09375, "epoch": 6.623946037099494, "grad_norm": 7.618736225368571, "kl": 0.521484375, "learning_rate": 4.479166666666667e-07, "loss": 0.0005, "reward": 3.4175766706466675, "reward_std": 0.19502199813723564, "rewards/final_reward": 1.3908411124500102, "rewards/mask_iou_reward": 0.6954205562250051, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4175763726234436, "rewards/thk_ans_format_reward": 1.0, "step": 1961, "think_completion_length": 9.916666666666666 }, { "clip_ratio": 0.0, "completion_length": 136.34375762939453, "epoch": 6.627318718381113, "grad_norm": 9.619492441340693, "kl": 0.4921875, "learning_rate": 4.4763513513513514e-07, "loss": 0.0005, "reward": 3.8224265575408936, "reward_std": 0.07703239098191261, "rewards/final_reward": 1.9362953050973177, "rewards/mask_iou_reward": 0.9681476525486589, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.8224263787269592, "rewards/thk_ans_format_reward": 1.0, "step": 1962, "think_completion_length": 8.541666666666668 }, { "clip_ratio": 0.0, "completion_length": 137.71875381469727, "epoch": 6.630691399662732, "grad_norm": 7.710027079651332, "kl": 0.5009765625, "learning_rate": 4.473536036036036e-07, "loss": 0.0005, "reward": 3.3381128311157227, "reward_std": 0.10878269374370575, "rewards/final_reward": 1.7846295429377987, "rewards/mask_iou_reward": 0.8923147714688994, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.338112473487854, "rewards/thk_ans_format_reward": 1.0, "step": 1963, "think_completion_length": 7.625 }, { "clip_ratio": 0.0, "completion_length": 120.21875381469727, "epoch": 6.63406408094435, "grad_norm": 11.144801235946936, "kl": 0.498046875, "learning_rate": 4.47072072072072e-07, "loss": 0.0005, "reward": 3.5592269897460938, "reward_std": 0.06369103118777275, "rewards/final_reward": 1.1932721632080656, "rewards/mask_iou_reward": 0.5966360816040328, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.559226930141449, "rewards/thk_ans_format_reward": 1.0, "step": 1964, "think_completion_length": 8.75 }, { "clip_ratio": 0.0, "completion_length": 124.16667175292969, "epoch": 6.63743676222597, "grad_norm": 7.580881113025555, "kl": 0.419921875, "learning_rate": 4.467905405405405e-07, "loss": 0.0004, "reward": 3.5747615098953247, "reward_std": 0.037079617381095886, "rewards/final_reward": 1.4820962960826627, "rewards/mask_iou_reward": 0.7410481480413313, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5747616291046143, "rewards/thk_ans_format_reward": 1.0, "step": 1965, "think_completion_length": 8.291666666666668 }, { "clip_ratio": 0.0, "completion_length": 170.98958587646484, "epoch": 6.640809443507589, "grad_norm": 25.18062527412311, "kl": 0.384765625, "learning_rate": 4.46509009009009e-07, "loss": 0.0004, "reward": 3.6321656703948975, "reward_std": 0.09968181699514389, "rewards/final_reward": 1.7989874917960231, "rewards/mask_iou_reward": 0.8994937458980116, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6321657299995422, "rewards/thk_ans_format_reward": 1.0, "step": 1966, "think_completion_length": 9.125 }, { "clip_ratio": 0.0, "completion_length": 127.03125381469727, "epoch": 6.644182124789207, "grad_norm": 23.268402345738316, "kl": 0.4365234375, "learning_rate": 4.4622747747747745e-07, "loss": 0.0004, "reward": 3.722867250442505, "reward_std": 0.07552542351186275, "rewards/final_reward": 1.9138519713931477, "rewards/mask_iou_reward": 0.9569259856965738, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7228673696517944, "rewards/thk_ans_format_reward": 1.0, "step": 1967, "think_completion_length": 7.666666666666666 }, { "clip_ratio": 0.0, "completion_length": 147.31250381469727, "epoch": 6.6475548060708265, "grad_norm": 9.668951180153915, "kl": 0.46875, "learning_rate": 4.459459459459459e-07, "loss": 0.0005, "reward": 3.4191720485687256, "reward_std": 0.09570085257291794, "rewards/final_reward": 1.1168326682335907, "rewards/mask_iou_reward": 0.5584163341167954, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4191720485687256, "rewards/thk_ans_format_reward": 1.0, "step": 1968, "think_completion_length": 8.333333333333334 }, { "clip_ratio": 0.0, "completion_length": 150.84375, "epoch": 6.650927487352445, "grad_norm": 12.650533772674278, "kl": 0.4365234375, "learning_rate": 4.4566441441441437e-07, "loss": 0.0004, "reward": 3.7565109729766846, "reward_std": 0.034082308411598206, "rewards/final_reward": 1.794588959040616, "rewards/mask_iou_reward": 0.897294479520308, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7565112113952637, "rewards/thk_ans_format_reward": 1.0, "step": 1969, "think_completion_length": 8.166666666666666 }, { "clip_ratio": 0.0, "completion_length": 152.20833587646484, "epoch": 6.654300168634064, "grad_norm": 9.648538208038131, "kl": 0.42578125, "learning_rate": 4.4538288288288283e-07, "loss": 0.0004, "reward": 3.744592070579529, "reward_std": 0.051872748881578445, "rewards/final_reward": 1.8309302844557134, "rewards/mask_iou_reward": 0.9154651422278567, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7445920705795288, "rewards/thk_ans_format_reward": 1.0, "step": 1970, "think_completion_length": 8.125 }, { "clip_ratio": 0.0, "completion_length": 112.89583587646484, "epoch": 6.6576728499156825, "grad_norm": 7.015618502596955, "kl": 0.615234375, "learning_rate": 4.4510135135135134e-07, "loss": 0.0006, "reward": 2.9393208026885986, "reward_std": 0.16016435716301203, "rewards/final_reward": 0.18500248755062323, "rewards/mask_iou_reward": 0.09250124377531162, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9393208026885986, "rewards/thk_ans_format_reward": 1.0, "step": 1971, "think_completion_length": 8.791666666666668 }, { "clip_ratio": 0.0, "completion_length": 139.32291793823242, "epoch": 6.661045531197302, "grad_norm": 20.898709748720457, "kl": 0.4794921875, "learning_rate": 4.448198198198198e-07, "loss": 0.0005, "reward": 3.446476101875305, "reward_std": 0.05248234234750271, "rewards/final_reward": 1.1091014816709772, "rewards/mask_iou_reward": 0.5545507408354886, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4464759826660156, "rewards/thk_ans_format_reward": 1.0, "step": 1972, "think_completion_length": 8.291666666666668 }, { "clip_ratio": 0.0, "completion_length": 141.15625381469727, "epoch": 6.664418212478921, "grad_norm": 14.035078577908726, "kl": 0.478515625, "learning_rate": 4.4453828828828826e-07, "loss": 0.0005, "reward": 3.5463058948516846, "reward_std": 0.12094194442033768, "rewards/final_reward": 1.6668437508848952, "rewards/mask_iou_reward": 0.8334218754424476, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5463060140609741, "rewards/thk_ans_format_reward": 1.0, "step": 1973, "think_completion_length": 6.75 }, { "clip_ratio": 0.0, "completion_length": 142.62500762939453, "epoch": 6.6677908937605395, "grad_norm": 12.934268271191499, "kl": 0.4150390625, "learning_rate": 4.442567567567567e-07, "loss": 0.0004, "reward": 3.3734281063079834, "reward_std": 0.06955000199377537, "rewards/final_reward": 1.6885538225511096, "rewards/mask_iou_reward": 0.8442769112755548, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.373428225517273, "rewards/thk_ans_format_reward": 1.0, "step": 1974, "think_completion_length": 8.333333333333334 }, { "clip_ratio": 0.0, "completion_length": 138.6770896911621, "epoch": 6.671163575042159, "grad_norm": 8.430511842274917, "kl": 0.537109375, "learning_rate": 4.439752252252252e-07, "loss": 0.0005, "reward": 3.0608272552490234, "reward_std": 0.09121683984994888, "rewards/final_reward": 1.2410802690536467, "rewards/mask_iou_reward": 0.6205401345268233, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0608273148536682, "rewards/thk_ans_format_reward": 1.0, "step": 1975, "think_completion_length": 9.0 }, { "clip_ratio": 0.0, "completion_length": 143.1145896911621, "epoch": 6.674536256323777, "grad_norm": 8.93189961549658, "kl": 0.4951171875, "learning_rate": 4.436936936936937e-07, "loss": 0.0005, "reward": 3.746822953224182, "reward_std": 0.060973282903432846, "rewards/final_reward": 1.8544490342566362, "rewards/mask_iou_reward": 0.9272245171283181, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7468228340148926, "rewards/thk_ans_format_reward": 1.0, "step": 1976, "think_completion_length": 8.166666666666666 }, { "clip_ratio": 0.0, "completion_length": 147.68750762939453, "epoch": 6.677908937605396, "grad_norm": 10.003023701820984, "kl": 0.4462890625, "learning_rate": 4.4341216216216216e-07, "loss": 0.0004, "reward": 3.589956760406494, "reward_std": 0.11278185062110424, "rewards/final_reward": 1.6687520003064154, "rewards/mask_iou_reward": 0.8343760001532077, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5899565815925598, "rewards/thk_ans_format_reward": 1.0, "step": 1977, "think_completion_length": 8.791666666666666 }, { "clip_ratio": 0.0, "completion_length": 128.44791793823242, "epoch": 6.681281618887015, "grad_norm": 16.828960754253966, "kl": 0.4638671875, "learning_rate": 4.431306306306306e-07, "loss": 0.0005, "reward": 3.5644259452819824, "reward_std": 0.060715802013874054, "rewards/final_reward": 1.7956880990376614, "rewards/mask_iou_reward": 0.8978440495188307, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5644258856773376, "rewards/thk_ans_format_reward": 1.0, "step": 1978, "think_completion_length": 8.416666666666666 }, { "clip_ratio": 0.0, "completion_length": 160.6041717529297, "epoch": 6.684654300168634, "grad_norm": 10.676107641246627, "kl": 0.41015625, "learning_rate": 4.428490990990991e-07, "loss": 0.0004, "reward": 3.4321783781051636, "reward_std": 0.23373743519186974, "rewards/final_reward": 1.3625278164093109, "rewards/mask_iou_reward": 0.6812639082046554, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 1.4530117511749268, "rewards/thk_ans_format_reward": 0.9895833432674408, "step": 1979, "think_completion_length": 7.083333333333334 }, { "clip_ratio": 0.0, "completion_length": 214.87500762939453, "epoch": 6.688026981450253, "grad_norm": 13.405735230042936, "kl": 0.4658203125, "learning_rate": 4.4256756756756754e-07, "loss": 0.0005, "reward": 3.292783737182617, "reward_std": 0.06245427392423153, "rewards/final_reward": 1.5012419177457694, "rewards/mask_iou_reward": 0.7506209588728847, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2927836179733276, "rewards/thk_ans_format_reward": 1.0, "step": 1980, "think_completion_length": 6.291666666666667 }, { "clip_ratio": 0.0, "completion_length": 142.93750762939453, "epoch": 6.691399662731872, "grad_norm": 13.934015511756387, "kl": 0.4453125, "learning_rate": 4.4228603603603606e-07, "loss": 0.0005, "reward": 3.5601495504379272, "reward_std": 0.11545858904719353, "rewards/final_reward": 1.9275234496275209, "rewards/mask_iou_reward": 0.9637617248137604, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.560149371623993, "rewards/thk_ans_format_reward": 1.0, "step": 1981, "think_completion_length": 7.166666666666666 }, { "clip_ratio": 0.0, "completion_length": 143.28125, "epoch": 6.694772344013491, "grad_norm": 22.647171475849067, "kl": 0.404296875, "learning_rate": 4.420045045045045e-07, "loss": 0.0004, "reward": 3.3105775117874146, "reward_std": 0.06604907289147377, "rewards/final_reward": 0.9340656949871835, "rewards/mask_iou_reward": 0.46703284749359175, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.310577392578125, "rewards/thk_ans_format_reward": 1.0, "step": 1982, "think_completion_length": 8.208333333333334 }, { "clip_ratio": 0.0, "completion_length": 138.61458587646484, "epoch": 6.698145025295109, "grad_norm": 9.748655133158199, "kl": 0.4072265625, "learning_rate": 4.41722972972973e-07, "loss": 0.0005, "reward": 3.4918267726898193, "reward_std": 0.06488988548517227, "rewards/final_reward": 1.7892597610591587, "rewards/mask_iou_reward": 0.8946298805295794, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4918268322944641, "rewards/thk_ans_format_reward": 1.0, "step": 1983, "think_completion_length": 7.875 }, { "clip_ratio": 0.0, "completion_length": 137.1770896911621, "epoch": 6.701517706576729, "grad_norm": 7.5516474807955705, "kl": 0.4306640625, "learning_rate": 4.414414414414414e-07, "loss": 0.0005, "reward": 3.610625743865967, "reward_std": 0.03432144969701767, "rewards/final_reward": 1.8832431743295208, "rewards/mask_iou_reward": 0.9416215871647604, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.610625684261322, "rewards/thk_ans_format_reward": 1.0, "step": 1984, "think_completion_length": 7.958333333333333 }, { "clip_ratio": 0.0, "completion_length": 133.27084350585938, "epoch": 6.704890387858347, "grad_norm": 9.755324488231713, "kl": 0.4091796875, "learning_rate": 4.4115990990990985e-07, "loss": 0.0004, "reward": 3.649802088737488, "reward_std": 0.09982002340257168, "rewards/final_reward": 1.787384414540992, "rewards/mask_iou_reward": 0.893692207270496, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6498020887374878, "rewards/thk_ans_format_reward": 1.0, "step": 1985, "think_completion_length": 8.416666666666666 }, { "clip_ratio": 0.0, "completion_length": 121.95833587646484, "epoch": 6.708263069139966, "grad_norm": 6.872406296871829, "kl": 0.4326171875, "learning_rate": 4.4087837837837836e-07, "loss": 0.0004, "reward": 3.651780366897583, "reward_std": 0.03141362592577934, "rewards/final_reward": 1.8310194196700014, "rewards/mask_iou_reward": 0.9155097098350007, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.651780366897583, "rewards/thk_ans_format_reward": 1.0, "step": 1986, "think_completion_length": 8.083333333333334 }, { "clip_ratio": 0.0, "completion_length": 135.50000762939453, "epoch": 6.7116357504215856, "grad_norm": 6.035689301729314, "kl": 0.767578125, "learning_rate": 4.405968468468468e-07, "loss": 0.0008, "reward": 3.4294917583465576, "reward_std": 0.05013443436473608, "rewards/final_reward": 0.8621415224398494, "rewards/mask_iou_reward": 0.4310707612199247, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4294917583465576, "rewards/thk_ans_format_reward": 1.0, "step": 1987, "think_completion_length": 8.916666666666666 }, { "clip_ratio": 0.0, "completion_length": 140.62500762939453, "epoch": 6.715008431703204, "grad_norm": 10.28762324119743, "kl": 0.41796875, "learning_rate": 4.403153153153153e-07, "loss": 0.0004, "reward": 3.6641100645065308, "reward_std": 0.06396713852882385, "rewards/final_reward": 1.8392503553519635, "rewards/mask_iou_reward": 0.9196251776759817, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6641101241111755, "rewards/thk_ans_format_reward": 1.0, "step": 1988, "think_completion_length": 7.791666666666666 }, { "clip_ratio": 0.0, "completion_length": 169.02083587646484, "epoch": 6.718381112984823, "grad_norm": 10.948623290376469, "kl": 0.453125, "learning_rate": 4.4003378378378374e-07, "loss": 0.0004, "reward": 3.5362966060638428, "reward_std": 0.07892968133091927, "rewards/final_reward": 1.5511359942392953, "rewards/mask_iou_reward": 0.7755679971196476, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5362964868545532, "rewards/thk_ans_format_reward": 1.0, "step": 1989, "think_completion_length": 7.25 }, { "clip_ratio": 0.0, "completion_length": 219.1875, "epoch": 6.721753794266442, "grad_norm": 54.9128726538673, "kl": 0.439453125, "learning_rate": 4.397522522522522e-07, "loss": 0.0005, "reward": 3.337985396385193, "reward_std": 0.10868017747998238, "rewards/final_reward": 1.6815334552677537, "rewards/mask_iou_reward": 0.8407667276338768, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.337985336780548, "rewards/thk_ans_format_reward": 1.0, "step": 1990, "think_completion_length": 10.166666666666668 }, { "clip_ratio": 0.0, "completion_length": 133.95833587646484, "epoch": 6.725126475548061, "grad_norm": 18.053724543999564, "kl": 0.416015625, "learning_rate": 4.394707207207207e-07, "loss": 0.0004, "reward": 3.48260235786438, "reward_std": 0.044008538126945496, "rewards/final_reward": 1.215805165491985, "rewards/mask_iou_reward": 0.6079025827459925, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4826021194458008, "rewards/thk_ans_format_reward": 1.0, "step": 1991, "think_completion_length": 7.708333333333334 }, { "clip_ratio": 0.0, "completion_length": 174.9166717529297, "epoch": 6.728499156829679, "grad_norm": 16.723041509589873, "kl": 0.4111328125, "learning_rate": 4.391891891891892e-07, "loss": 0.0004, "reward": 3.580541491508484, "reward_std": 0.029503321275115013, "rewards/final_reward": 0.8807408224852558, "rewards/mask_iou_reward": 0.4403704112426279, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5805413722991943, "rewards/thk_ans_format_reward": 1.0, "step": 1992, "think_completion_length": 9.5 }, { "clip_ratio": 0.0, "completion_length": 161.03125, "epoch": 6.7318718381112985, "grad_norm": 9.532645094987426, "kl": 0.564453125, "learning_rate": 4.3890765765765764e-07, "loss": 0.0006, "reward": 3.3507272005081177, "reward_std": 0.16045157611370087, "rewards/final_reward": 1.4624383649971802, "rewards/mask_iou_reward": 0.7312191824985901, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.350727140903473, "rewards/thk_ans_format_reward": 1.0, "step": 1993, "think_completion_length": 7.708333333333334 }, { "clip_ratio": 0.0, "completion_length": 151.71875762939453, "epoch": 6.735244519392918, "grad_norm": 103.84941218533088, "kl": 0.4140625, "learning_rate": 4.386261261261261e-07, "loss": 0.0004, "reward": 3.5490139722824097, "reward_std": 0.11374928709119558, "rewards/final_reward": 1.794110779942215, "rewards/mask_iou_reward": 0.8970553899711075, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.549013912677765, "rewards/thk_ans_format_reward": 1.0, "step": 1994, "think_completion_length": 8.125 }, { "clip_ratio": 0.0, "completion_length": 148.59375762939453, "epoch": 6.738617200674536, "grad_norm": 7.3091119459144895, "kl": 0.447265625, "learning_rate": 4.3834459459459456e-07, "loss": 0.0005, "reward": 3.6078637838363647, "reward_std": 0.04512942023575306, "rewards/final_reward": 1.2506563243584963, "rewards/mask_iou_reward": 0.6253281621792481, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6078636050224304, "rewards/thk_ans_format_reward": 1.0, "step": 1995, "think_completion_length": 7.458333333333334 }, { "clip_ratio": 0.0, "completion_length": 151.11458587646484, "epoch": 6.7419898819561555, "grad_norm": 9.110154205303866, "kl": 0.4443359375, "learning_rate": 4.3806306306306307e-07, "loss": 0.0004, "reward": 3.3878947496414185, "reward_std": 0.09348580799996853, "rewards/final_reward": 0.8257914149734332, "rewards/mask_iou_reward": 0.4128957074867166, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3878947496414185, "rewards/thk_ans_format_reward": 1.0, "step": 1996, "think_completion_length": 8.75 }, { "clip_ratio": 0.0, "completion_length": 171.48958587646484, "epoch": 6.745362563237774, "grad_norm": 25.63321763421153, "kl": 0.404296875, "learning_rate": 4.3778153153153153e-07, "loss": 0.0004, "reward": 3.3954946994781494, "reward_std": 0.11219822522252798, "rewards/final_reward": 1.9292324835472883, "rewards/mask_iou_reward": 0.9646162417736441, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3954947590827942, "rewards/thk_ans_format_reward": 1.0, "step": 1997, "think_completion_length": 6.5 }, { "clip_ratio": 0.0, "completion_length": 142.6979217529297, "epoch": 6.748735244519393, "grad_norm": 6.710029564547177, "kl": 0.541015625, "learning_rate": 4.375e-07, "loss": 0.0005, "reward": 3.6499743461608887, "reward_std": 0.11773128435015678, "rewards/final_reward": 1.974263432550682, "rewards/mask_iou_reward": 0.987131716275341, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6499744653701782, "rewards/thk_ans_format_reward": 1.0, "step": 1998, "think_completion_length": 8.708333333333334 }, { "clip_ratio": 0.0, "completion_length": 130.35416793823242, "epoch": 6.7521079258010115, "grad_norm": 10.378393283198248, "kl": 0.607421875, "learning_rate": 4.3721846846846845e-07, "loss": 0.0006, "reward": 3.4805933237075806, "reward_std": 0.02605645265430212, "rewards/final_reward": 1.6452655520916593, "rewards/mask_iou_reward": 0.8226327760458296, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4805933833122253, "rewards/thk_ans_format_reward": 1.0, "step": 1999, "think_completion_length": 9.5 }, { "clip_ratio": 0.0, "completion_length": 164.36458587646484, "epoch": 6.755480607082631, "grad_norm": 10.707522224413472, "kl": 0.4482421875, "learning_rate": 4.369369369369369e-07, "loss": 0.0005, "reward": 3.482789993286133, "reward_std": 0.077627994120121, "rewards/final_reward": 1.5862446101483059, "rewards/mask_iou_reward": 0.7931223050741529, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4827898144721985, "rewards/thk_ans_format_reward": 1.0, "step": 2000, "think_completion_length": 7.833333333333334 }, { "clip_ratio": 0.0, "completion_length": 134.59375381469727, "epoch": 6.75885328836425, "grad_norm": 10.464256908306714, "kl": 0.490234375, "learning_rate": 4.3665540540540543e-07, "loss": 0.0005, "reward": 3.395174980163574, "reward_std": 0.18283828347921371, "rewards/final_reward": 1.5661229109850008, "rewards/mask_iou_reward": 0.7830614554925004, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3951751589775085, "rewards/thk_ans_format_reward": 1.0, "step": 2001, "think_completion_length": 7.708333333333334 }, { "clip_ratio": 0.0, "completion_length": 138.6666717529297, "epoch": 6.762225969645868, "grad_norm": 8.871161573522318, "kl": 0.478515625, "learning_rate": 4.363738738738739e-07, "loss": 0.0005, "reward": 3.6884692907333374, "reward_std": 0.10596386343240738, "rewards/final_reward": 1.7219217855060478, "rewards/mask_iou_reward": 0.8609608927530239, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6884692907333374, "rewards/thk_ans_format_reward": 1.0, "step": 2002, "think_completion_length": 10.166666666666668 }, { "clip_ratio": 0.0, "completion_length": 128.78125381469727, "epoch": 6.765598650927488, "grad_norm": 11.590469988382258, "kl": 0.4375, "learning_rate": 4.360923423423423e-07, "loss": 0.0005, "reward": 3.7286834716796875, "reward_std": 0.017819946398958564, "rewards/final_reward": 1.8638369676242008, "rewards/mask_iou_reward": 0.9319184838121004, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7286832928657532, "rewards/thk_ans_format_reward": 1.0, "step": 2003, "think_completion_length": 8.833333333333332 }, { "clip_ratio": 0.0, "completion_length": 149.7916717529297, "epoch": 6.768971332209106, "grad_norm": 8.86489481149611, "kl": 0.4755859375, "learning_rate": 4.3581081081081076e-07, "loss": 0.0005, "reward": 3.4955180883407593, "reward_std": 0.05199388600885868, "rewards/final_reward": 1.7048009412676646, "rewards/mask_iou_reward": 0.8524004706338323, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4955180883407593, "rewards/thk_ans_format_reward": 1.0, "step": 2004, "think_completion_length": 9.291666666666668 }, { "clip_ratio": 0.0, "completion_length": 167.6770896911621, "epoch": 6.772344013490725, "grad_norm": 9.166938127616588, "kl": 0.4013671875, "learning_rate": 4.355292792792792e-07, "loss": 0.0004, "reward": 3.5893832445144653, "reward_std": 0.12393485009670258, "rewards/final_reward": 1.8329345116802633, "rewards/mask_iou_reward": 0.9164672558401317, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5893832445144653, "rewards/thk_ans_format_reward": 1.0, "step": 2005, "think_completion_length": 7.0 }, { "clip_ratio": 0.0, "completion_length": 171.92708587646484, "epoch": 6.775716694772344, "grad_norm": 14.396280349467032, "kl": 0.4501953125, "learning_rate": 4.3524774774774773e-07, "loss": 0.0005, "reward": 3.6871821880340576, "reward_std": 0.04286697134375572, "rewards/final_reward": 1.803451205269603, "rewards/mask_iou_reward": 0.9017256026348015, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6871821880340576, "rewards/thk_ans_format_reward": 1.0, "step": 2006, "think_completion_length": 8.0 }, { "clip_ratio": 0.0, "completion_length": 183.6666717529297, "epoch": 6.779089376053963, "grad_norm": 13.29191324189456, "kl": 0.4033203125, "learning_rate": 4.349662162162162e-07, "loss": 0.0005, "reward": 3.649420142173767, "reward_std": 0.06181117706000805, "rewards/final_reward": 1.3635650259031187, "rewards/mask_iou_reward": 0.6817825129515593, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6494203209877014, "rewards/thk_ans_format_reward": 1.0, "step": 2007, "think_completion_length": 9.25 }, { "clip_ratio": 0.0, "completion_length": 181.7291717529297, "epoch": 6.782462057335582, "grad_norm": 10.68491866592069, "kl": 0.435546875, "learning_rate": 4.3468468468468465e-07, "loss": 0.0004, "reward": 3.775758147239685, "reward_std": 0.17000571638345718, "rewards/final_reward": 1.8791206054269782, "rewards/mask_iou_reward": 0.9395603027134891, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 1.796591579914093, "rewards/thk_ans_format_reward": 0.9895833432674408, "step": 2008, "think_completion_length": 7.375 }, { "clip_ratio": 0.0, "completion_length": 176.25000762939453, "epoch": 6.785834738617201, "grad_norm": 28.604722545822455, "kl": 0.439453125, "learning_rate": 4.344031531531531e-07, "loss": 0.0004, "reward": 3.454217791557312, "reward_std": 0.046439859084784985, "rewards/final_reward": 1.6288204235372983, "rewards/mask_iou_reward": 0.8144102117686491, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.454217791557312, "rewards/thk_ans_format_reward": 1.0, "step": 2009, "think_completion_length": 7.291666666666666 }, { "clip_ratio": 0.0, "completion_length": 145.89583587646484, "epoch": 6.78920741989882, "grad_norm": 5.868265505091315, "kl": 0.4248046875, "learning_rate": 4.341216216216216e-07, "loss": 0.0004, "reward": 3.2674766778945923, "reward_std": 0.029944440349936485, "rewards/final_reward": 1.4306260788189844, "rewards/mask_iou_reward": 0.7153130394094922, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.267476499080658, "rewards/thk_ans_format_reward": 1.0, "step": 2010, "think_completion_length": 7.75 }, { "clip_ratio": 0.0, "completion_length": 234.65625, "epoch": 6.792580101180438, "grad_norm": 19.454490820847745, "kl": 0.33984375, "learning_rate": 4.338400900900901e-07, "loss": 0.0003, "reward": 3.4577724933624268, "reward_std": 0.18022079020738602, "rewards/final_reward": 1.252627623997265, "rewards/mask_iou_reward": 0.6263138119986325, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 1.4786058068275452, "rewards/thk_ans_format_reward": 0.9895833432674408, "step": 2011, "think_completion_length": 6.625 }, { "clip_ratio": 0.0, "completion_length": 162.73958587646484, "epoch": 6.795952782462058, "grad_norm": 9.842950941312992, "kl": 0.4677734375, "learning_rate": 4.3355855855855855e-07, "loss": 0.0005, "reward": 3.304463267326355, "reward_std": 0.1052134744822979, "rewards/final_reward": 1.713373917512853, "rewards/mask_iou_reward": 0.8566869587564265, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3044632077217102, "rewards/thk_ans_format_reward": 1.0, "step": 2012, "think_completion_length": 7.0 }, { "clip_ratio": 0.0, "completion_length": 165.4479217529297, "epoch": 6.799325463743676, "grad_norm": 11.935152460455082, "kl": 0.3857421875, "learning_rate": 4.33277027027027e-07, "loss": 0.0004, "reward": 3.6137847900390625, "reward_std": 0.12032023817300797, "rewards/final_reward": 1.956927414618658, "rewards/mask_iou_reward": 0.978463707309329, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.613784670829773, "rewards/thk_ans_format_reward": 1.0, "step": 2013, "think_completion_length": 6.25 }, { "clip_ratio": 0.0, "completion_length": 165.5729217529297, "epoch": 6.802698145025295, "grad_norm": 11.123120156843346, "kl": 0.529296875, "learning_rate": 4.3299549549549547e-07, "loss": 0.0005, "reward": 3.1668895483016968, "reward_std": 0.16719982773065567, "rewards/final_reward": 0.6149814804918514, "rewards/mask_iou_reward": 0.3074907402459257, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1668896079063416, "rewards/thk_ans_format_reward": 1.0, "step": 2014, "think_completion_length": 7.791666666666666 }, { "clip_ratio": 0.0, "completion_length": 127.28125381469727, "epoch": 6.806070826306914, "grad_norm": 7.836398907878883, "kl": 0.5341796875, "learning_rate": 4.3271396396396393e-07, "loss": 0.0006, "reward": 3.5650718212127686, "reward_std": 0.10769028216600418, "rewards/final_reward": 1.439123274000237, "rewards/mask_iou_reward": 0.7195616370001185, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5650717616081238, "rewards/thk_ans_format_reward": 1.0, "step": 2015, "think_completion_length": 9.25 }, { "clip_ratio": 0.0, "completion_length": 148.8229217529297, "epoch": 6.809443507588533, "grad_norm": 6.754528250710224, "kl": 0.390625, "learning_rate": 4.3243243243243244e-07, "loss": 0.0004, "reward": 3.625916361808777, "reward_std": 0.051243921276181936, "rewards/final_reward": 1.0752049657908953, "rewards/mask_iou_reward": 0.5376024828954477, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6259164214134216, "rewards/thk_ans_format_reward": 1.0, "step": 2016, "think_completion_length": 6.375 }, { "clip_ratio": 0.0, "completion_length": 157.40625762939453, "epoch": 6.812816188870151, "grad_norm": 8.338203379557932, "kl": 0.587890625, "learning_rate": 4.321509009009009e-07, "loss": 0.0006, "reward": 3.6156070232391357, "reward_std": 0.045968128368258476, "rewards/final_reward": 1.7577103244250059, "rewards/mask_iou_reward": 0.8788551622125029, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6156070828437805, "rewards/thk_ans_format_reward": 1.0, "step": 2017, "think_completion_length": 7.958333333333334 }, { "clip_ratio": 0.0, "completion_length": 171.87500762939453, "epoch": 6.8161888701517706, "grad_norm": 14.692569916989621, "kl": 0.404296875, "learning_rate": 4.3186936936936937e-07, "loss": 0.0004, "reward": 3.511265277862549, "reward_std": 0.11939521878957748, "rewards/final_reward": 1.8363637454485904, "rewards/mask_iou_reward": 0.9181818727242952, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.511265218257904, "rewards/thk_ans_format_reward": 1.0, "step": 2018, "think_completion_length": 7.5 }, { "clip_ratio": 0.0, "completion_length": 162.06250762939453, "epoch": 6.81956155143339, "grad_norm": 21.775539312569634, "kl": 0.4091796875, "learning_rate": 4.315878378378378e-07, "loss": 0.0004, "reward": 3.3648746013641357, "reward_std": 0.04116539843380451, "rewards/final_reward": 1.5359148831507619, "rewards/mask_iou_reward": 0.7679574415753809, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.364874541759491, "rewards/thk_ans_format_reward": 1.0, "step": 2019, "think_completion_length": 7.166666666666667 }, { "clip_ratio": 0.0, "completion_length": 196.78125, "epoch": 6.822934232715008, "grad_norm": 19.70188938671734, "kl": 0.419921875, "learning_rate": 4.313063063063063e-07, "loss": 0.0004, "reward": 3.721093535423279, "reward_std": 0.07094984129071236, "rewards/final_reward": 1.6374509550249228, "rewards/mask_iou_reward": 0.8187254775124614, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7210937142372131, "rewards/thk_ans_format_reward": 1.0, "step": 2020, "think_completion_length": 7.708333333333333 }, { "clip_ratio": 0.0, "completion_length": 189.28125762939453, "epoch": 6.8263069139966275, "grad_norm": 27.009543335856872, "kl": 0.361328125, "learning_rate": 4.310247747747748e-07, "loss": 0.0004, "reward": 3.712403893470764, "reward_std": 0.25884455256164074, "rewards/final_reward": 1.7145527885472487, "rewards/mask_iou_reward": 0.8572763942736243, "rewards/sam_format_reward": 0.9791666865348816, "rewards/sam_reward_func_ultra": 1.7436540722846985, "rewards/thk_ans_format_reward": 0.9895833432674408, "step": 2021, "think_completion_length": 6.833333333333333 }, { "clip_ratio": 0.0, "completion_length": 136.03125762939453, "epoch": 6.829679595278246, "grad_norm": 9.488178524727436, "kl": 0.462890625, "learning_rate": 4.3074324324324326e-07, "loss": 0.0005, "reward": 3.418555974960327, "reward_std": 0.11346096568740904, "rewards/final_reward": 1.8226451478927959, "rewards/mask_iou_reward": 0.9113225739463979, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4185560941696167, "rewards/thk_ans_format_reward": 1.0, "step": 2022, "think_completion_length": 7.708333333333333 }, { "clip_ratio": 0.0, "completion_length": 180.23958587646484, "epoch": 6.833052276559865, "grad_norm": 8.097256330169499, "kl": 0.486328125, "learning_rate": 4.3046171171171167e-07, "loss": 0.0005, "reward": 3.4910701513290405, "reward_std": 0.11701249331235886, "rewards/final_reward": 1.4391594572018165, "rewards/mask_iou_reward": 0.7195797286009082, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4910700917243958, "rewards/thk_ans_format_reward": 1.0, "step": 2023, "think_completion_length": 8.291666666666666 }, { "clip_ratio": 0.0, "completion_length": 159.1666717529297, "epoch": 6.8364249578414835, "grad_norm": 8.493282353877067, "kl": 0.453125, "learning_rate": 4.3018018018018013e-07, "loss": 0.0005, "reward": 3.4912160634994507, "reward_std": 0.09391393139958382, "rewards/final_reward": 1.438580935765031, "rewards/mask_iou_reward": 0.7192904678825155, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4912160634994507, "rewards/thk_ans_format_reward": 1.0, "step": 2024, "think_completion_length": 7.333333333333334 }, { "clip_ratio": 0.0, "completion_length": 166.02084350585938, "epoch": 6.839797639123103, "grad_norm": 13.795074515453015, "kl": 0.380859375, "learning_rate": 4.298986486486486e-07, "loss": 0.0004, "reward": 3.5047991275787354, "reward_std": 0.06105640344321728, "rewards/final_reward": 1.382774638880558, "rewards/mask_iou_reward": 0.691387319440279, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5047988891601562, "rewards/thk_ans_format_reward": 1.0, "step": 2025, "think_completion_length": 7.958333333333334 }, { "clip_ratio": 0.0, "completion_length": 170.3541717529297, "epoch": 6.843170320404722, "grad_norm": 26.800221203820207, "kl": 0.453125, "learning_rate": 4.296171171171171e-07, "loss": 0.0005, "reward": 3.1508926153182983, "reward_std": 0.1210801713168621, "rewards/final_reward": 1.5852870872997817, "rewards/mask_iou_reward": 0.7926435436498909, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1508926749229431, "rewards/thk_ans_format_reward": 1.0, "step": 2026, "think_completion_length": 7.125 }, { "clip_ratio": 0.0, "completion_length": 209.1979217529297, "epoch": 6.8465430016863404, "grad_norm": 11.371052034781563, "kl": 0.4189453125, "learning_rate": 4.2933558558558556e-07, "loss": 0.0004, "reward": 3.4964561462402344, "reward_std": 0.12025372684001923, "rewards/final_reward": 1.6872832059981004, "rewards/mask_iou_reward": 0.8436416029990502, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4964563250541687, "rewards/thk_ans_format_reward": 1.0, "step": 2027, "think_completion_length": 7.791666666666666 }, { "clip_ratio": 0.0, "completion_length": 167.125, "epoch": 6.84991568296796, "grad_norm": 16.273760978509856, "kl": 0.3974609375, "learning_rate": 4.29054054054054e-07, "loss": 0.0004, "reward": 3.4314554929733276, "reward_std": 0.05612679943442345, "rewards/final_reward": 1.9202637571991077, "rewards/mask_iou_reward": 0.9601318785995538, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4314554333686829, "rewards/thk_ans_format_reward": 1.0, "step": 2028, "think_completion_length": 6.375 }, { "clip_ratio": 0.0, "completion_length": 181.78125, "epoch": 6.853288364249578, "grad_norm": 9.154995082158772, "kl": 0.3916015625, "learning_rate": 4.287725225225225e-07, "loss": 0.0004, "reward": 3.800302505493164, "reward_std": 0.01082283305004239, "rewards/final_reward": 1.8819881384430022, "rewards/mask_iou_reward": 0.9409940692215011, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.8003026843070984, "rewards/thk_ans_format_reward": 1.0, "step": 2029, "think_completion_length": 6.166666666666666 }, { "clip_ratio": 0.0, "completion_length": 159.68750762939453, "epoch": 6.856661045531197, "grad_norm": 10.432191975171135, "kl": 0.4560546875, "learning_rate": 4.2849099099099095e-07, "loss": 0.0004, "reward": 3.822153329849243, "reward_std": 0.02899275626987219, "rewards/final_reward": 1.9186536417083446, "rewards/mask_iou_reward": 0.9593268208541723, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.8221532106399536, "rewards/thk_ans_format_reward": 1.0, "step": 2030, "think_completion_length": 7.125 }, { "clip_ratio": 0.0, "completion_length": 218.71875762939453, "epoch": 6.860033726812816, "grad_norm": 9.532886988046716, "kl": 0.45703125, "learning_rate": 4.2820945945945946e-07, "loss": 0.0005, "reward": 3.441614031791687, "reward_std": 0.18240241333842278, "rewards/final_reward": 1.3654729554678962, "rewards/mask_iou_reward": 0.6827364777339481, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 1.4520307183265686, "rewards/thk_ans_format_reward": 1.0, "step": 2031, "think_completion_length": 7.291666666666667 }, { "clip_ratio": 0.0, "completion_length": 153.77083587646484, "epoch": 6.863406408094435, "grad_norm": 9.693404018286879, "kl": 0.462890625, "learning_rate": 4.279279279279279e-07, "loss": 0.0005, "reward": 3.097666382789612, "reward_std": 0.1792236566543579, "rewards/final_reward": 0.7095118455592196, "rewards/mask_iou_reward": 0.3547559227796098, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0976662635803223, "rewards/thk_ans_format_reward": 1.0, "step": 2032, "think_completion_length": 7.458333333333334 }, { "clip_ratio": 0.0, "completion_length": 185.9791717529297, "epoch": 6.866779089376054, "grad_norm": 6.292404625002513, "kl": 0.5263671875, "learning_rate": 4.276463963963964e-07, "loss": 0.0005, "reward": 3.4150606393814087, "reward_std": 0.08291511330753565, "rewards/final_reward": 1.8709507809260542, "rewards/mask_iou_reward": 0.9354753904630271, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4150602221488953, "rewards/thk_ans_format_reward": 1.0, "step": 2033, "think_completion_length": 9.625 }, { "clip_ratio": 0.0, "completion_length": 202.87500762939453, "epoch": 6.870151770657673, "grad_norm": 25.59875786307666, "kl": 0.427734375, "learning_rate": 4.2736486486486484e-07, "loss": 0.0004, "reward": 3.5064250230789185, "reward_std": 0.32452040165662766, "rewards/final_reward": 1.3838483755187423, "rewards/mask_iou_reward": 0.6919241877593711, "rewards/sam_format_reward": 0.9791666865348816, "rewards/sam_reward_func_ultra": 1.5480916500091553, "rewards/thk_ans_format_reward": 0.9791666865348816, "step": 2034, "think_completion_length": 7.333333333333334 }, { "clip_ratio": 0.0, "completion_length": 138.4791717529297, "epoch": 6.873524451939292, "grad_norm": 16.83175076438212, "kl": 0.607421875, "learning_rate": 4.270833333333333e-07, "loss": 0.0006, "reward": 3.4737848043441772, "reward_std": 0.10801901668310165, "rewards/final_reward": 1.1984207695179063, "rewards/mask_iou_reward": 0.5992103847589532, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.473784863948822, "rewards/thk_ans_format_reward": 1.0, "step": 2035, "think_completion_length": 8.25 }, { "clip_ratio": 0.0, "completion_length": 182.58333587646484, "epoch": 6.87689713322091, "grad_norm": 23.664794856462887, "kl": 0.4716796875, "learning_rate": 4.268018018018018e-07, "loss": 0.0005, "reward": 3.7692642211914062, "reward_std": 0.06182833015918732, "rewards/final_reward": 1.8681330269338021, "rewards/mask_iou_reward": 0.9340665134669011, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7692642211914062, "rewards/thk_ans_format_reward": 1.0, "step": 2036, "think_completion_length": 6.625 }, { "clip_ratio": 0.0, "completion_length": 193.5416717529297, "epoch": 6.88026981450253, "grad_norm": 4.613817355642268, "kl": 0.4453125, "learning_rate": 4.265202702702703e-07, "loss": 0.0005, "reward": 3.53538978099823, "reward_std": 0.09306821972131729, "rewards/final_reward": 1.388622126765915, "rewards/mask_iou_reward": 0.6943110633829574, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5353897213935852, "rewards/thk_ans_format_reward": 1.0, "step": 2037, "think_completion_length": 8.458333333333332 }, { "clip_ratio": 0.0, "completion_length": 159.0833396911621, "epoch": 6.883642495784148, "grad_norm": 7.311229770984157, "kl": 0.580078125, "learning_rate": 4.2623873873873874e-07, "loss": 0.0006, "reward": 3.4610273838043213, "reward_std": 0.14541162177920341, "rewards/final_reward": 1.5367615667478312, "rewards/mask_iou_reward": 0.7683807833739156, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.461027443408966, "rewards/thk_ans_format_reward": 1.0, "step": 2038, "think_completion_length": 7.666666666666667 }, { "clip_ratio": 0.0, "completion_length": 205.50000762939453, "epoch": 6.887015177065767, "grad_norm": 28.29948794364883, "kl": 1.51171875, "learning_rate": 4.259572072072072e-07, "loss": 0.0015, "reward": 3.4711467027664185, "reward_std": 0.14650648832321167, "rewards/final_reward": 1.0551088381709772, "rewards/mask_iou_reward": 0.5275544190854886, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4711466431617737, "rewards/thk_ans_format_reward": 1.0, "step": 2039, "think_completion_length": 7.625 }, { "clip_ratio": 0.0, "completion_length": 209.61459350585938, "epoch": 6.8903878583473865, "grad_norm": 6.459950948608125, "kl": 0.4189453125, "learning_rate": 4.2567567567567566e-07, "loss": 0.0004, "reward": 3.627356767654419, "reward_std": 0.2032754383981228, "rewards/final_reward": 1.5017048117281278, "rewards/mask_iou_reward": 0.7508524058640639, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 1.6481899619102478, "rewards/thk_ans_format_reward": 0.9895833432674408, "step": 2040, "think_completion_length": 6.875 }, { "clip_ratio": 0.0, "completion_length": 209.23958587646484, "epoch": 6.893760539629005, "grad_norm": 11.212679981209691, "kl": 0.4052734375, "learning_rate": 4.2539414414414417e-07, "loss": 0.0004, "reward": 3.6127889156341553, "reward_std": 0.05332676135003567, "rewards/final_reward": 1.7987581374357826, "rewards/mask_iou_reward": 0.8993790687178913, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6127886772155762, "rewards/thk_ans_format_reward": 1.0, "step": 2041, "think_completion_length": 6.625 }, { "clip_ratio": 0.0, "completion_length": 148.9791717529297, "epoch": 6.897133220910624, "grad_norm": 15.159521023445356, "kl": 0.4052734375, "learning_rate": 4.2511261261261263e-07, "loss": 0.0004, "reward": 3.6380761861801147, "reward_std": 0.048528952058404684, "rewards/final_reward": 1.9370697683883562, "rewards/mask_iou_reward": 0.9685348841941781, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6380762457847595, "rewards/thk_ans_format_reward": 1.0, "step": 2042, "think_completion_length": 8.333333333333332 }, { "clip_ratio": 0.0, "completion_length": 185.96875762939453, "epoch": 6.900505902192243, "grad_norm": 8.981046099187886, "kl": 0.369140625, "learning_rate": 4.2483108108108104e-07, "loss": 0.0004, "reward": 3.67224383354187, "reward_std": 0.11439605057239532, "rewards/final_reward": 1.8130440471669833, "rewards/mask_iou_reward": 0.9065220235834917, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6722437143325806, "rewards/thk_ans_format_reward": 1.0, "step": 2043, "think_completion_length": 7.333333333333333 }, { "clip_ratio": 0.0, "completion_length": 195.9791717529297, "epoch": 6.903878583473862, "grad_norm": 8.722632525029411, "kl": 0.3701171875, "learning_rate": 4.245495495495495e-07, "loss": 0.0004, "reward": 3.5095438957214355, "reward_std": 0.25895553827285767, "rewards/final_reward": 1.8349615960203192, "rewards/mask_iou_reward": 0.9174807980101596, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 1.5303771495819092, "rewards/thk_ans_format_reward": 0.9895833432674408, "step": 2044, "think_completion_length": 6.791666666666666 }, { "clip_ratio": 0.0, "completion_length": 185.18750762939453, "epoch": 6.90725126475548, "grad_norm": 5.767256824016995, "kl": 0.400390625, "learning_rate": 4.2426801801801796e-07, "loss": 0.0004, "reward": 3.5061161518096924, "reward_std": 0.023271950893104076, "rewards/final_reward": 1.279184815264447, "rewards/mask_iou_reward": 0.6395924076322235, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5061162114143372, "rewards/thk_ans_format_reward": 1.0, "step": 2045, "think_completion_length": 6.333333333333334 }, { "clip_ratio": 0.0, "completion_length": 206.59376525878906, "epoch": 6.9106239460370995, "grad_norm": 12.672614787199961, "kl": 0.5947265625, "learning_rate": 4.239864864864865e-07, "loss": 0.0006, "reward": 3.4643458127975464, "reward_std": 0.06347063556313515, "rewards/final_reward": 1.1290433724283133, "rewards/mask_iou_reward": 0.5645216862141567, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4643457531929016, "rewards/thk_ans_format_reward": 1.0, "step": 2046, "think_completion_length": 7.541666666666666 }, { "clip_ratio": 0.0, "completion_length": 248.17709350585938, "epoch": 6.913996627318719, "grad_norm": 50.46641016769619, "kl": 0.3564453125, "learning_rate": 4.2370495495495494e-07, "loss": 0.0004, "reward": 3.238726854324341, "reward_std": 0.11985756456851959, "rewards/final_reward": 1.1162085397945747, "rewards/mask_iou_reward": 0.5581042698972873, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2387269139289856, "rewards/thk_ans_format_reward": 1.0, "step": 2047, "think_completion_length": 7.583333333333333 }, { "clip_ratio": 0.0, "completion_length": 203.50000762939453, "epoch": 6.917369308600337, "grad_norm": 22.18032940628456, "kl": 0.4208984375, "learning_rate": 4.234234234234234e-07, "loss": 0.0004, "reward": 3.5871880054473877, "reward_std": 0.12817983329296112, "rewards/final_reward": 1.545037883939106, "rewards/mask_iou_reward": 0.772518941969553, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.587187945842743, "rewards/thk_ans_format_reward": 1.0, "step": 2048, "think_completion_length": 8.291666666666666 }, { "clip_ratio": 0.0, "completion_length": 214.1666717529297, "epoch": 6.920741989881956, "grad_norm": 28.427981286178998, "kl": 0.4033203125, "learning_rate": 4.2314189189189186e-07, "loss": 0.0004, "reward": 3.621571660041809, "reward_std": 0.020552618894726038, "rewards/final_reward": 1.3964029672048128, "rewards/mask_iou_reward": 0.6982014836024064, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6215715408325195, "rewards/thk_ans_format_reward": 1.0, "step": 2049, "think_completion_length": 6.583333333333333 }, { "clip_ratio": 0.0, "completion_length": 140.83333587646484, "epoch": 6.924114671163575, "grad_norm": 9.362320711061372, "kl": 0.4072265625, "learning_rate": 4.228603603603603e-07, "loss": 0.0004, "reward": 3.4834201335906982, "reward_std": 0.14708335511386395, "rewards/final_reward": 1.827577067771653, "rewards/mask_iou_reward": 0.9137885338858265, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4834200739860535, "rewards/thk_ans_format_reward": 1.0, "step": 2050, "think_completion_length": 6.958333333333334 }, { "clip_ratio": 0.0, "completion_length": 156.80208587646484, "epoch": 6.927487352445194, "grad_norm": 13.204004703332789, "kl": 0.419921875, "learning_rate": 4.2257882882882883e-07, "loss": 0.0004, "reward": 3.435231924057007, "reward_std": 0.07278600335121155, "rewards/final_reward": 1.6471412575629387, "rewards/mask_iou_reward": 0.8235706287814694, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4352316856384277, "rewards/thk_ans_format_reward": 1.0, "step": 2051, "think_completion_length": 7.541666666666667 }, { "clip_ratio": 0.0, "completion_length": 144.90625381469727, "epoch": 6.9308600337268125, "grad_norm": 9.33496190379643, "kl": 0.447265625, "learning_rate": 4.222972972972973e-07, "loss": 0.0005, "reward": 3.709159255027771, "reward_std": 0.03617256507277489, "rewards/final_reward": 1.7899409602624758, "rewards/mask_iou_reward": 0.8949704801312379, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.70915949344635, "rewards/thk_ans_format_reward": 1.0, "step": 2052, "think_completion_length": 8.791666666666666 }, { "clip_ratio": 0.0, "completion_length": 168.8854217529297, "epoch": 6.934232715008432, "grad_norm": 23.282238466862477, "kl": 0.5009765625, "learning_rate": 4.2201576576576575e-07, "loss": 0.0005, "reward": 3.610919237136841, "reward_std": 0.044389775954186916, "rewards/final_reward": 1.2384339013913657, "rewards/mask_iou_reward": 0.6192169506956828, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6109193563461304, "rewards/thk_ans_format_reward": 1.0, "step": 2053, "think_completion_length": 7.625 }, { "clip_ratio": 0.0, "completion_length": 243.09375762939453, "epoch": 6.937605396290051, "grad_norm": 12.724463055365122, "kl": 0.4013671875, "learning_rate": 4.217342342342342e-07, "loss": 0.0004, "reward": 3.6631650924682617, "reward_std": 0.13815235905349255, "rewards/final_reward": 1.7990066628365704, "rewards/mask_iou_reward": 0.8995033314182852, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 1.683998465538025, "rewards/thk_ans_format_reward": 0.9895833432674408, "step": 2054, "think_completion_length": 8.0 }, { "clip_ratio": 0.0, "completion_length": 170.70833587646484, "epoch": 6.940978077571669, "grad_norm": 8.099159270808352, "kl": 0.48046875, "learning_rate": 4.214527027027027e-07, "loss": 0.0005, "reward": 3.4705790281295776, "reward_std": 0.06602546386420727, "rewards/final_reward": 1.3726603388502736, "rewards/mask_iou_reward": 0.6863301694251368, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4705789685249329, "rewards/thk_ans_format_reward": 1.0, "step": 2055, "think_completion_length": 7.125 }, { "clip_ratio": 0.0, "completion_length": 180.52084350585938, "epoch": 6.944350758853289, "grad_norm": 11.396772634783654, "kl": 0.376953125, "learning_rate": 4.2117117117117114e-07, "loss": 0.0004, "reward": 3.3403568267822266, "reward_std": 0.12214644998311996, "rewards/final_reward": 1.7037964388214435, "rewards/mask_iou_reward": 0.8518982194107217, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3403565883636475, "rewards/thk_ans_format_reward": 1.0, "step": 2056, "think_completion_length": 7.958333333333334 }, { "clip_ratio": 0.0, "completion_length": 240.75, "epoch": 6.947723440134907, "grad_norm": 11.978637065897662, "kl": 0.3828125, "learning_rate": 4.2088963963963965e-07, "loss": 0.0004, "reward": 3.2751539945602417, "reward_std": 0.173908993601799, "rewards/final_reward": 1.8127977999330058, "rewards/mask_iou_reward": 0.9063988999665029, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 1.2959871888160706, "rewards/thk_ans_format_reward": 0.9895833432674408, "step": 2057, "think_completion_length": 7.458333333333334 }, { "clip_ratio": 0.0, "completion_length": 172.58333587646484, "epoch": 6.951096121416526, "grad_norm": 11.433820469930387, "kl": 0.6953125, "learning_rate": 4.206081081081081e-07, "loss": 0.0007, "reward": 3.478344202041626, "reward_std": 0.10730455070734024, "rewards/final_reward": 1.515322556277861, "rewards/mask_iou_reward": 0.7576612781389305, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4783442616462708, "rewards/thk_ans_format_reward": 1.0, "step": 2058, "think_completion_length": 8.916666666666666 }, { "clip_ratio": 0.0, "completion_length": 154.6979217529297, "epoch": 6.954468802698145, "grad_norm": 7.33352107841849, "kl": 0.3896484375, "learning_rate": 4.2032657657657657e-07, "loss": 0.0004, "reward": 3.4388267993927, "reward_std": 0.004863133071921766, "rewards/final_reward": 0.9464367668835442, "rewards/mask_iou_reward": 0.4732183834417721, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4388265013694763, "rewards/thk_ans_format_reward": 1.0, "step": 2059, "think_completion_length": 6.458333333333333 }, { "clip_ratio": 0.0, "completion_length": 166.34375762939453, "epoch": 6.957841483979764, "grad_norm": 36.48687000543652, "kl": 0.4052734375, "learning_rate": 4.2004504504504503e-07, "loss": 0.0004, "reward": 3.280514121055603, "reward_std": 0.16359587758779526, "rewards/final_reward": 1.5899992914228696, "rewards/mask_iou_reward": 0.7949996457114348, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.280514121055603, "rewards/thk_ans_format_reward": 1.0, "step": 2060, "think_completion_length": 7.041666666666666 }, { "clip_ratio": 0.0, "completion_length": 197.2604217529297, "epoch": 6.961214165261383, "grad_norm": 6.0776006598667545, "kl": 0.36328125, "learning_rate": 4.197635135135135e-07, "loss": 0.0004, "reward": 3.6308919191360474, "reward_std": 0.06040792353451252, "rewards/final_reward": 1.7377528025504996, "rewards/mask_iou_reward": 0.8688764012752498, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6308916807174683, "rewards/thk_ans_format_reward": 1.0, "step": 2061, "think_completion_length": 7.0 }, { "clip_ratio": 0.0, "completion_length": 170.6354217529297, "epoch": 6.964586846543002, "grad_norm": 14.250731643162881, "kl": 0.509765625, "learning_rate": 4.19481981981982e-07, "loss": 0.0005, "reward": 3.692965269088745, "reward_std": 0.039796837605535984, "rewards/final_reward": 1.9779923740620449, "rewards/mask_iou_reward": 0.9889961870310224, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6929653882980347, "rewards/thk_ans_format_reward": 1.0, "step": 2062, "think_completion_length": 8.666666666666668 }, { "clip_ratio": 0.0, "completion_length": 134.64584350585938, "epoch": 6.967959527824621, "grad_norm": 25.0275801936448, "kl": 0.4345703125, "learning_rate": 4.192004504504504e-07, "loss": 0.0004, "reward": 3.548501491546631, "reward_std": 0.12630417943000793, "rewards/final_reward": 1.5421603575578033, "rewards/mask_iou_reward": 0.7710801787789017, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5485016107559204, "rewards/thk_ans_format_reward": 1.0, "step": 2063, "think_completion_length": 7.208333333333334 }, { "clip_ratio": 0.0, "completion_length": 211.07292938232422, "epoch": 6.971332209106239, "grad_norm": 9.025736068743793, "kl": 0.470703125, "learning_rate": 4.189189189189189e-07, "loss": 0.0005, "reward": 3.666631579399109, "reward_std": 0.04170432314276695, "rewards/final_reward": 1.6901646951117226, "rewards/mask_iou_reward": 0.8450823475558613, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6666315197944641, "rewards/thk_ans_format_reward": 1.0, "step": 2064, "think_completion_length": 7.0 }, { "clip_ratio": 0.0, "completion_length": 130.67708587646484, "epoch": 6.974704890387859, "grad_norm": 12.870468548233324, "kl": 0.4599609375, "learning_rate": 4.1863738738738733e-07, "loss": 0.0005, "reward": 3.4917114973068237, "reward_std": 0.12080695852637291, "rewards/final_reward": 1.628104300970692, "rewards/mask_iou_reward": 0.814052150485346, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4917116165161133, "rewards/thk_ans_format_reward": 1.0, "step": 2065, "think_completion_length": 7.375 }, { "clip_ratio": 0.0, "completion_length": 154.05208587646484, "epoch": 6.978077571669477, "grad_norm": 6.25613270559228, "kl": 0.41015625, "learning_rate": 4.183558558558558e-07, "loss": 0.0005, "reward": 3.553234815597534, "reward_std": 0.027653097175061703, "rewards/final_reward": 1.6949132902041693, "rewards/mask_iou_reward": 0.8474566451020846, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.553234577178955, "rewards/thk_ans_format_reward": 1.0, "step": 2066, "think_completion_length": 7.125 }, { "clip_ratio": 0.0, "completion_length": 174.625, "epoch": 6.981450252951096, "grad_norm": 10.920688669993615, "kl": 0.4814453125, "learning_rate": 4.180743243243243e-07, "loss": 0.0005, "reward": 3.600623846054077, "reward_std": 0.11215956509113312, "rewards/final_reward": 1.2300481204617575, "rewards/mask_iou_reward": 0.6150240602308787, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6006236672401428, "rewards/thk_ans_format_reward": 1.0, "step": 2067, "think_completion_length": 7.25 }, { "clip_ratio": 0.0, "completion_length": 128.64583587646484, "epoch": 6.9848229342327155, "grad_norm": 14.28349519092112, "kl": 0.630859375, "learning_rate": 4.1779279279279277e-07, "loss": 0.0006, "reward": 3.2274869680404663, "reward_std": 0.1608312577009201, "rewards/final_reward": 1.013269042755018, "rewards/mask_iou_reward": 0.506634521377509, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.227486789226532, "rewards/thk_ans_format_reward": 1.0, "step": 2068, "think_completion_length": 7.958333333333334 }, { "clip_ratio": 0.0, "completion_length": 132.82291793823242, "epoch": 6.988195615514334, "grad_norm": 8.99894758581553, "kl": 0.45703125, "learning_rate": 4.1751126126126123e-07, "loss": 0.0005, "reward": 3.7834309339523315, "reward_std": 0.025212008506059647, "rewards/final_reward": 1.6631238092734235, "rewards/mask_iou_reward": 0.8315619046367118, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7834311127662659, "rewards/thk_ans_format_reward": 1.0, "step": 2069, "think_completion_length": 7.625 }, { "clip_ratio": 0.0, "completion_length": 146.20833587646484, "epoch": 6.991568296795953, "grad_norm": 47.632513811120354, "kl": 0.521484375, "learning_rate": 4.172297297297297e-07, "loss": 0.0006, "reward": 3.3873326778411865, "reward_std": 0.28176888823509216, "rewards/final_reward": 1.8443692369468496, "rewards/mask_iou_reward": 0.9221846184734248, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3873324990272522, "rewards/thk_ans_format_reward": 1.0, "step": 2070, "think_completion_length": 6.875 }, { "clip_ratio": 0.0, "completion_length": 178.625, "epoch": 6.9949409780775715, "grad_norm": 7.669381095104932, "kl": 0.458984375, "learning_rate": 4.1694819819819815e-07, "loss": 0.0005, "reward": 3.062989354133606, "reward_std": 0.08837875723838806, "rewards/final_reward": 1.4319442864302787, "rewards/mask_iou_reward": 0.7159721432151394, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0629891753196716, "rewards/thk_ans_format_reward": 1.0, "step": 2071, "think_completion_length": 7.583333333333334 }, { "clip_ratio": 0.0, "completion_length": 140.92105102539062, "epoch": 6.998313659359191, "grad_norm": 47.753976767913095, "kl": 0.490234375, "learning_rate": 4.1666666666666667e-07, "loss": 0.0005, "reward": 3.8917770385742188, "reward_std": 0.017438477370887995, "rewards/final_reward": 1.8893658419395065, "rewards/mask_iou_reward": 0.9446829209697533, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.8917770385742188, "rewards/thk_ans_format_reward": 1.0, "step": 2072, "think_completion_length": 7.0 }, { "clip_ratio": 0.0, "completion_length": 141.36458587646484, "epoch": 7.003372681281619, "grad_norm": 39.113360226616884, "kl": 0.548828125, "learning_rate": 4.163851351351351e-07, "loss": 0.0006, "reward": 3.5466045141220093, "reward_std": 0.10832555405795574, "rewards/final_reward": 1.4662659790531536, "rewards/mask_iou_reward": 0.7331329895265768, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5466048121452332, "rewards/thk_ans_format_reward": 1.0, "step": 2073, "think_completion_length": 8.916666666666666 }, { "clip_ratio": 0.0, "completion_length": 145.4166717529297, "epoch": 7.006745362563238, "grad_norm": 8.053807818313302, "kl": 0.4931640625, "learning_rate": 4.161036036036036e-07, "loss": 0.0005, "reward": 3.5464266538619995, "reward_std": 0.05879717133939266, "rewards/final_reward": 1.8675432805313172, "rewards/mask_iou_reward": 0.9337716402656586, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.54642653465271, "rewards/thk_ans_format_reward": 1.0, "step": 2074, "think_completion_length": 7.041666666666666 }, { "clip_ratio": 0.0, "completion_length": 197.55208587646484, "epoch": 7.010118043844857, "grad_norm": 8.633699358022847, "kl": 0.431640625, "learning_rate": 4.1582207207207205e-07, "loss": 0.0004, "reward": 3.399770975112915, "reward_std": 0.1902711447328329, "rewards/final_reward": 1.408978737907396, "rewards/mask_iou_reward": 0.704489368953698, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 1.420604407787323, "rewards/thk_ans_format_reward": 0.9895833432674408, "step": 2075, "think_completion_length": 7.875 }, { "clip_ratio": 0.0, "completion_length": 135.0729217529297, "epoch": 7.013490725126475, "grad_norm": 12.270572132816845, "kl": 0.439453125, "learning_rate": 4.155405405405405e-07, "loss": 0.0004, "reward": 3.4145290851593018, "reward_std": 0.03906204830855131, "rewards/final_reward": 1.0129971435762222, "rewards/mask_iou_reward": 0.5064985717881111, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4145292043685913, "rewards/thk_ans_format_reward": 1.0, "step": 2076, "think_completion_length": 6.625 }, { "clip_ratio": 0.0, "completion_length": 129.40625762939453, "epoch": 7.016863406408095, "grad_norm": 45.624948252596006, "kl": 0.58203125, "learning_rate": 4.15259009009009e-07, "loss": 0.0006, "reward": 3.3474459648132324, "reward_std": 0.25892218202352524, "rewards/final_reward": 1.5210914066116574, "rewards/mask_iou_reward": 0.7605457033058287, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3474458456039429, "rewards/thk_ans_format_reward": 1.0, "step": 2077, "think_completion_length": 8.25 }, { "clip_ratio": 0.0, "completion_length": 195.1041717529297, "epoch": 7.020236087689713, "grad_norm": 8.878850724523403, "kl": 0.4873046875, "learning_rate": 4.149774774774775e-07, "loss": 0.0005, "reward": 3.6284542083740234, "reward_std": 0.056491006165742874, "rewards/final_reward": 1.72353412290081, "rewards/mask_iou_reward": 0.861767061450405, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6284542083740234, "rewards/thk_ans_format_reward": 1.0, "step": 2078, "think_completion_length": 7.916666666666667 }, { "clip_ratio": 0.0, "completion_length": 146.50000762939453, "epoch": 7.023608768971332, "grad_norm": 7.75147011450966, "kl": 0.4677734375, "learning_rate": 4.1469594594594594e-07, "loss": 0.0005, "reward": 3.2908732891082764, "reward_std": 0.09442063421010971, "rewards/final_reward": 1.541150831055468, "rewards/mask_iou_reward": 0.770575415527734, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2908731698989868, "rewards/thk_ans_format_reward": 1.0, "step": 2079, "think_completion_length": 7.833333333333334 }, { "clip_ratio": 0.0, "completion_length": 150.5625, "epoch": 7.0269814502529515, "grad_norm": 10.132125586188312, "kl": 0.509765625, "learning_rate": 4.144144144144144e-07, "loss": 0.0005, "reward": 3.430995225906372, "reward_std": 0.07199489884078503, "rewards/final_reward": 0.8899003687201831, "rewards/mask_iou_reward": 0.44495018436009154, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.430995225906372, "rewards/thk_ans_format_reward": 1.0, "step": 2080, "think_completion_length": 7.333333333333334 }, { "clip_ratio": 0.0, "completion_length": 174.3854217529297, "epoch": 7.03035413153457, "grad_norm": 10.19011933825818, "kl": 0.41796875, "learning_rate": 4.1413288288288286e-07, "loss": 0.0004, "reward": 3.6006062030792236, "reward_std": 0.14285754412412643, "rewards/final_reward": 1.189594890267856, "rewards/mask_iou_reward": 0.594797445133928, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.600606083869934, "rewards/thk_ans_format_reward": 1.0, "step": 2081, "think_completion_length": 6.416666666666667 }, { "clip_ratio": 0.0, "completion_length": 143.7395896911621, "epoch": 7.033726812816189, "grad_norm": 7.733210854177587, "kl": 0.4814453125, "learning_rate": 4.138513513513514e-07, "loss": 0.0005, "reward": 3.3528921604156494, "reward_std": 0.09503498487174511, "rewards/final_reward": 1.043915528277389, "rewards/mask_iou_reward": 0.5219577641386945, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3528921604156494, "rewards/thk_ans_format_reward": 1.0, "step": 2082, "think_completion_length": 7.791666666666667 }, { "clip_ratio": 0.0, "completion_length": 131.09375762939453, "epoch": 7.0370994940978076, "grad_norm": 19.94042549315522, "kl": 0.8447265625, "learning_rate": 4.135698198198198e-07, "loss": 0.0008, "reward": 3.5029793977737427, "reward_std": 0.08090989291667938, "rewards/final_reward": 1.3858858070205202, "rewards/mask_iou_reward": 0.6929429035102601, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5029793977737427, "rewards/thk_ans_format_reward": 1.0, "step": 2083, "think_completion_length": 7.208333333333333 }, { "clip_ratio": 0.0, "completion_length": 121.41666793823242, "epoch": 7.040472175379427, "grad_norm": 10.718417213008133, "kl": 0.4501953125, "learning_rate": 4.1328828828828825e-07, "loss": 0.0005, "reward": 3.7191574573516846, "reward_std": 0.05856491345912218, "rewards/final_reward": 1.728180431981604, "rewards/mask_iou_reward": 0.864090215990802, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7191572785377502, "rewards/thk_ans_format_reward": 1.0, "step": 2084, "think_completion_length": 8.791666666666668 }, { "clip_ratio": 0.0, "completion_length": 149.53125381469727, "epoch": 7.043844856661045, "grad_norm": 8.713561379718907, "kl": 0.4443359375, "learning_rate": 4.130067567567567e-07, "loss": 0.0004, "reward": 3.4879956245422363, "reward_std": 0.1195925809442997, "rewards/final_reward": 1.8511911442186961, "rewards/mask_iou_reward": 0.9255955721093481, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.487995684146881, "rewards/thk_ans_format_reward": 1.0, "step": 2085, "think_completion_length": 7.333333333333334 }, { "clip_ratio": 0.0, "completion_length": 152.80208587646484, "epoch": 7.0472175379426645, "grad_norm": 10.716632561067437, "kl": 0.4453125, "learning_rate": 4.1272522522522517e-07, "loss": 0.0004, "reward": 3.579306960105896, "reward_std": 0.1012413278222084, "rewards/final_reward": 1.7639435770020846, "rewards/mask_iou_reward": 0.8819717885010423, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5793068408966064, "rewards/thk_ans_format_reward": 1.0, "step": 2086, "think_completion_length": 7.083333333333334 }, { "clip_ratio": 0.0, "completion_length": 124.96875381469727, "epoch": 7.050590219224283, "grad_norm": 8.659125001587432, "kl": 0.421875, "learning_rate": 4.124436936936937e-07, "loss": 0.0004, "reward": 3.30793559551239, "reward_std": 0.16385822743177414, "rewards/final_reward": 1.435798409073462, "rewards/mask_iou_reward": 0.717899204536731, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3079355955123901, "rewards/thk_ans_format_reward": 1.0, "step": 2087, "think_completion_length": 8.083333333333332 }, { "clip_ratio": 0.0, "completion_length": 194.25000762939453, "epoch": 7.053962900505902, "grad_norm": 10.376524327165193, "kl": 0.4296875, "learning_rate": 4.1216216216216214e-07, "loss": 0.0004, "reward": 3.631484866142273, "reward_std": 0.037298865616321564, "rewards/final_reward": 1.8119502353295949, "rewards/mask_iou_reward": 0.9059751176647974, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6314847469329834, "rewards/thk_ans_format_reward": 1.0, "step": 2088, "think_completion_length": 6.375 }, { "clip_ratio": 0.0, "completion_length": 140.44791793823242, "epoch": 7.057335581787521, "grad_norm": 9.465415172309807, "kl": 0.51953125, "learning_rate": 4.118806306306306e-07, "loss": 0.0005, "reward": 3.8078432083129883, "reward_std": 0.010188735090196133, "rewards/final_reward": 1.8143336565431716, "rewards/mask_iou_reward": 0.9071668282715858, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.8078433871269226, "rewards/thk_ans_format_reward": 1.0, "step": 2089, "think_completion_length": 7.541666666666666 }, { "clip_ratio": 0.0, "completion_length": 148.95834350585938, "epoch": 7.06070826306914, "grad_norm": 22.620576396536165, "kl": 0.4453125, "learning_rate": 4.1159909909909906e-07, "loss": 0.0005, "reward": 3.7463499307632446, "reward_std": 0.047850754112005234, "rewards/final_reward": 1.8501468568459538, "rewards/mask_iou_reward": 0.9250734284229769, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7463498711585999, "rewards/thk_ans_format_reward": 1.0, "step": 2090, "think_completion_length": 8.166666666666668 }, { "clip_ratio": 0.0, "completion_length": 157.77083587646484, "epoch": 7.064080944350759, "grad_norm": 8.978667930484319, "kl": 0.5849609375, "learning_rate": 4.113175675675675e-07, "loss": 0.0006, "reward": 3.4429363012313843, "reward_std": 0.021207381039857864, "rewards/final_reward": 1.0751444982965108, "rewards/mask_iou_reward": 0.5375722491482554, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4429363012313843, "rewards/thk_ans_format_reward": 1.0, "step": 2091, "think_completion_length": 7.75 }, { "clip_ratio": 0.0, "completion_length": 121.1875, "epoch": 7.0674536256323774, "grad_norm": 72.53993116517377, "kl": 0.4384765625, "learning_rate": 4.1103603603603604e-07, "loss": 0.0005, "reward": 3.5110002756118774, "reward_std": 0.07376820594072342, "rewards/final_reward": 0.7198397571572596, "rewards/mask_iou_reward": 0.3599198785786298, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5110000371932983, "rewards/thk_ans_format_reward": 1.0, "step": 2092, "think_completion_length": 7.458333333333334 }, { "clip_ratio": 0.0, "completion_length": 123.04167175292969, "epoch": 7.070826306913997, "grad_norm": 6.7448324868525145, "kl": 0.4599609375, "learning_rate": 4.107545045045045e-07, "loss": 0.0005, "reward": 3.767333507537842, "reward_std": 0.031111277639865875, "rewards/final_reward": 1.6456698216256496, "rewards/mask_iou_reward": 0.8228349108128248, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7673333883285522, "rewards/thk_ans_format_reward": 1.0, "step": 2093, "think_completion_length": 6.958333333333334 }, { "clip_ratio": 0.0, "completion_length": 133.67708587646484, "epoch": 7.074198988195615, "grad_norm": 31.687531572728126, "kl": 0.4296875, "learning_rate": 4.1047297297297296e-07, "loss": 0.0004, "reward": 3.6214983463287354, "reward_std": 0.07041146233677864, "rewards/final_reward": 1.50514327525262, "rewards/mask_iou_reward": 0.75257163762631, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.621498167514801, "rewards/thk_ans_format_reward": 1.0, "step": 2094, "think_completion_length": 7.416666666666666 }, { "clip_ratio": 0.0, "completion_length": 160.3229217529297, "epoch": 7.077571669477234, "grad_norm": 71.86638065587701, "kl": 0.53125, "learning_rate": 4.101914414414414e-07, "loss": 0.0005, "reward": 2.968865752220154, "reward_std": 0.08706454932689667, "rewards/final_reward": 0.9521295059368908, "rewards/mask_iou_reward": 0.4760647529684454, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9688657224178314, "rewards/thk_ans_format_reward": 1.0, "step": 2095, "think_completion_length": 6.583333333333334 }, { "clip_ratio": 0.0, "completion_length": 134.65625, "epoch": 7.080944350758854, "grad_norm": 31.190633686564574, "kl": 0.4873046875, "learning_rate": 4.099099099099099e-07, "loss": 0.0005, "reward": 3.5734691619873047, "reward_std": 0.06589183211326599, "rewards/final_reward": 1.9099139483753502, "rewards/mask_iou_reward": 0.9549569741876751, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5734692811965942, "rewards/thk_ans_format_reward": 1.0, "step": 2096, "think_completion_length": 7.458333333333333 }, { "clip_ratio": 0.0, "completion_length": 146.93750381469727, "epoch": 7.084317032040472, "grad_norm": 9.307520904937336, "kl": 0.537109375, "learning_rate": 4.096283783783784e-07, "loss": 0.0006, "reward": 3.633847951889038, "reward_std": 0.1258330326527357, "rewards/final_reward": 1.5853979707257067, "rewards/mask_iou_reward": 0.7926989853628533, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.633847951889038, "rewards/thk_ans_format_reward": 1.0, "step": 2097, "think_completion_length": 7.541666666666666 }, { "clip_ratio": 0.0, "completion_length": 163.5729217529297, "epoch": 7.087689713322091, "grad_norm": 10.44724731169259, "kl": 0.501953125, "learning_rate": 4.0934684684684685e-07, "loss": 0.0005, "reward": 3.5259032249450684, "reward_std": 0.13000381737947464, "rewards/final_reward": 1.8315799850789156, "rewards/mask_iou_reward": 0.9157899925394578, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5259031057357788, "rewards/thk_ans_format_reward": 1.0, "step": 2098, "think_completion_length": 6.666666666666666 }, { "clip_ratio": 0.0, "completion_length": 178.48958587646484, "epoch": 7.09106239460371, "grad_norm": 13.639258532398406, "kl": 0.46484375, "learning_rate": 4.090653153153153e-07, "loss": 0.0005, "reward": 3.4919179677963257, "reward_std": 0.05438784509897232, "rewards/final_reward": 1.593466845640541, "rewards/mask_iou_reward": 0.7967334228202705, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4919179677963257, "rewards/thk_ans_format_reward": 1.0, "step": 2099, "think_completion_length": 7.5 }, { "clip_ratio": 0.0, "completion_length": 184.83334350585938, "epoch": 7.094435075885329, "grad_norm": 8.851197616606413, "kl": 0.62109375, "learning_rate": 4.087837837837838e-07, "loss": 0.0006, "reward": 3.5763626098632812, "reward_std": 0.11013033799827099, "rewards/final_reward": 1.5660547621116407, "rewards/mask_iou_reward": 0.7830273810558204, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5763624906539917, "rewards/thk_ans_format_reward": 1.0, "step": 2100, "think_completion_length": 7.458333333333333 }, { "clip_ratio": 0.0, "completion_length": 216.79167938232422, "epoch": 7.097807757166947, "grad_norm": 8.102484022277629, "kl": 0.5068359375, "learning_rate": 4.0850225225225224e-07, "loss": 0.0005, "reward": 3.2421382665634155, "reward_std": 0.032540466636419296, "rewards/final_reward": 1.1041041897317303, "rewards/mask_iou_reward": 0.5520520948658652, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2421382665634155, "rewards/thk_ans_format_reward": 1.0, "step": 2101, "think_completion_length": 7.5 }, { "clip_ratio": 0.0, "completion_length": 181.43750762939453, "epoch": 7.101180438448567, "grad_norm": 43.703709847710556, "kl": 0.4111328125, "learning_rate": 4.082207207207207e-07, "loss": 0.0004, "reward": 3.2417622804641724, "reward_std": 0.04179760627448559, "rewards/final_reward": 1.284407073054897, "rewards/mask_iou_reward": 0.6422035365274485, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2417622804641724, "rewards/thk_ans_format_reward": 1.0, "step": 2102, "think_completion_length": 7.0 }, { "clip_ratio": 0.0, "completion_length": 180.9375, "epoch": 7.104553119730186, "grad_norm": 15.530204865371694, "kl": 0.45703125, "learning_rate": 4.0793918918918916e-07, "loss": 0.0005, "reward": 3.7399545907974243, "reward_std": 0.04067044984549284, "rewards/final_reward": 1.7588847560236927, "rewards/mask_iou_reward": 0.8794423780118463, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7399544715881348, "rewards/thk_ans_format_reward": 1.0, "step": 2103, "think_completion_length": 6.416666666666667 }, { "clip_ratio": 0.0, "completion_length": 176.9791717529297, "epoch": 7.107925801011804, "grad_norm": 17.465844081341633, "kl": 0.60546875, "learning_rate": 4.076576576576576e-07, "loss": 0.0006, "reward": 3.3154332637786865, "reward_std": 0.046184979379177094, "rewards/final_reward": 1.0295719726963035, "rewards/mask_iou_reward": 0.5147859863481518, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3154330849647522, "rewards/thk_ans_format_reward": 1.0, "step": 2104, "think_completion_length": 7.333333333333334 }, { "clip_ratio": 0.0, "completion_length": 216.67708587646484, "epoch": 7.1112984822934235, "grad_norm": 11.463666020587564, "kl": 0.376953125, "learning_rate": 4.073761261261261e-07, "loss": 0.0004, "reward": 3.6639139652252197, "reward_std": 0.06345823779702187, "rewards/final_reward": 1.5036986046138827, "rewards/mask_iou_reward": 0.7518493023069414, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6639137864112854, "rewards/thk_ans_format_reward": 1.0, "step": 2105, "think_completion_length": 6.583333333333334 }, { "clip_ratio": 0.0, "completion_length": 151.09375381469727, "epoch": 7.114671163575042, "grad_norm": 5.965543170545409, "kl": 0.4853515625, "learning_rate": 4.0709459459459454e-07, "loss": 0.0005, "reward": 3.514773368835449, "reward_std": 0.02285183686763048, "rewards/final_reward": 1.8121782155019046, "rewards/mask_iou_reward": 0.9060891077509523, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5147733688354492, "rewards/thk_ans_format_reward": 1.0, "step": 2106, "think_completion_length": 7.208333333333333 }, { "clip_ratio": 0.0, "completion_length": 146.9479217529297, "epoch": 7.118043844856661, "grad_norm": 26.377210709034202, "kl": 0.4423828125, "learning_rate": 4.0681306306306305e-07, "loss": 0.0004, "reward": 3.6755313873291016, "reward_std": 0.1012740321457386, "rewards/final_reward": 1.9159749806838375, "rewards/mask_iou_reward": 0.9579874903419188, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6755311489105225, "rewards/thk_ans_format_reward": 1.0, "step": 2107, "think_completion_length": 6.333333333333333 }, { "clip_ratio": 0.0, "completion_length": 147.95834350585938, "epoch": 7.12141652613828, "grad_norm": 9.271016248200919, "kl": 0.478515625, "learning_rate": 4.065315315315315e-07, "loss": 0.0005, "reward": 3.7043673992156982, "reward_std": 0.03634229302406311, "rewards/final_reward": 1.9564682308637495, "rewards/mask_iou_reward": 0.9782341154318748, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7043673396110535, "rewards/thk_ans_format_reward": 1.0, "step": 2108, "think_completion_length": 8.041666666666668 }, { "clip_ratio": 0.0, "completion_length": 197.3229217529297, "epoch": 7.124789207419899, "grad_norm": 25.351969828606347, "kl": 0.416015625, "learning_rate": 4.0625e-07, "loss": 0.0004, "reward": 3.440119981765747, "reward_std": 0.15209950879216194, "rewards/final_reward": 1.4271350135374492, "rewards/mask_iou_reward": 0.7135675067687246, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.440119981765747, "rewards/thk_ans_format_reward": 1.0, "step": 2109, "think_completion_length": 7.875 }, { "clip_ratio": 0.0, "completion_length": 151.92708587646484, "epoch": 7.128161888701518, "grad_norm": 16.288230773620306, "kl": 0.4150390625, "learning_rate": 4.0596846846846844e-07, "loss": 0.0004, "reward": 3.5552964210510254, "reward_std": 0.04798351600766182, "rewards/final_reward": 0.9282412127329586, "rewards/mask_iou_reward": 0.4641206063664793, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.555296540260315, "rewards/thk_ans_format_reward": 1.0, "step": 2110, "think_completion_length": 7.625 }, { "clip_ratio": 0.0, "completion_length": 165.5104217529297, "epoch": 7.1315345699831365, "grad_norm": 8.161303969531241, "kl": 0.46484375, "learning_rate": 4.056869369369369e-07, "loss": 0.0005, "reward": 3.3685712814331055, "reward_std": 0.08867337927222252, "rewards/final_reward": 0.8364246545390581, "rewards/mask_iou_reward": 0.41821232726952906, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.368571400642395, "rewards/thk_ans_format_reward": 1.0, "step": 2111, "think_completion_length": 7.666666666666666 }, { "clip_ratio": 0.0, "completion_length": 203.4791717529297, "epoch": 7.134907251264756, "grad_norm": 9.576325538332526, "kl": 0.3876953125, "learning_rate": 4.054054054054054e-07, "loss": 0.0004, "reward": 3.5125324726104736, "reward_std": 0.08074938133358955, "rewards/final_reward": 1.516974694693673, "rewards/mask_iou_reward": 0.7584873473468365, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5125325322151184, "rewards/thk_ans_format_reward": 1.0, "step": 2112, "think_completion_length": 7.0 }, { "clip_ratio": 0.0, "completion_length": 120.65625381469727, "epoch": 7.138279932546374, "grad_norm": 26.230253385846307, "kl": 0.51171875, "learning_rate": 4.0512387387387387e-07, "loss": 0.0005, "reward": 3.467235565185547, "reward_std": 0.09212902188301086, "rewards/final_reward": 1.2272662786796569, "rewards/mask_iou_reward": 0.6136331393398284, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4672354459762573, "rewards/thk_ans_format_reward": 1.0, "step": 2113, "think_completion_length": 6.791666666666666 }, { "clip_ratio": 0.0, "completion_length": 151.03125381469727, "epoch": 7.141652613827993, "grad_norm": 20.715019889586777, "kl": 2.5693359375, "learning_rate": 4.0484234234234233e-07, "loss": 0.0026, "reward": 3.457432508468628, "reward_std": 0.10950843244791031, "rewards/final_reward": 1.429733310803956, "rewards/mask_iou_reward": 0.714866655401978, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4678492546081543, "rewards/thk_ans_format_reward": 0.9895833432674408, "step": 2114, "think_completion_length": 7.916666666666666 }, { "clip_ratio": 0.0, "completion_length": 142.9166717529297, "epoch": 7.145025295109612, "grad_norm": 9.845124376157337, "kl": 0.4306640625, "learning_rate": 4.045608108108108e-07, "loss": 0.0004, "reward": 3.435527205467224, "reward_std": 0.03152256831526756, "rewards/final_reward": 1.55579422645883, "rewards/mask_iou_reward": 0.777897113229415, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.435526967048645, "rewards/thk_ans_format_reward": 1.0, "step": 2115, "think_completion_length": 6.916666666666666 }, { "clip_ratio": 0.0, "completion_length": 174.58333587646484, "epoch": 7.148397976391231, "grad_norm": 94.28612906783492, "kl": 0.4306640625, "learning_rate": 4.0427927927927925e-07, "loss": 0.0004, "reward": 3.3601832389831543, "reward_std": 0.08771881833672523, "rewards/final_reward": 0.87061459389086, "rewards/mask_iou_reward": 0.43530729694543, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3601831197738647, "rewards/thk_ans_format_reward": 1.0, "step": 2116, "think_completion_length": 6.291666666666666 }, { "clip_ratio": 0.0, "completion_length": 172.59375, "epoch": 7.15177065767285, "grad_norm": 10.221977457090283, "kl": 0.4208984375, "learning_rate": 4.0399774774774777e-07, "loss": 0.0004, "reward": 2.9706469774246216, "reward_std": 0.047149766236543655, "rewards/final_reward": 0.6018353282228965, "rewards/mask_iou_reward": 0.30091766411144827, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9706469774246216, "rewards/thk_ans_format_reward": 1.0, "step": 2117, "think_completion_length": 7.166666666666667 }, { "clip_ratio": 0.0, "completion_length": 157.55208587646484, "epoch": 7.155143338954469, "grad_norm": 8.51580663824599, "kl": 0.478515625, "learning_rate": 4.0371621621621623e-07, "loss": 0.0005, "reward": 3.6264944076538086, "reward_std": 0.1628369241952896, "rewards/final_reward": 1.9376107862778913, "rewards/mask_iou_reward": 0.9688053931389456, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 1.6369110941886902, "rewards/thk_ans_format_reward": 1.0, "step": 2118, "think_completion_length": 6.916666666666666 }, { "clip_ratio": 0.0, "completion_length": 134.23958587646484, "epoch": 7.158516020236088, "grad_norm": 58.8540263103047, "kl": 0.46875, "learning_rate": 4.034346846846847e-07, "loss": 0.0005, "reward": 3.8285425901412964, "reward_std": 0.021101244492456317, "rewards/final_reward": 1.912153629289806, "rewards/mask_iou_reward": 0.956076814644903, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.828542411327362, "rewards/thk_ans_format_reward": 1.0, "step": 2119, "think_completion_length": 6.291666666666667 }, { "clip_ratio": 0.0, "completion_length": 164.08334350585938, "epoch": 7.161888701517706, "grad_norm": 14.23377320752225, "kl": 0.4228515625, "learning_rate": 4.0315315315315315e-07, "loss": 0.0004, "reward": 3.592753767967224, "reward_std": 0.06616166792809963, "rewards/final_reward": 1.2861638111709919, "rewards/mask_iou_reward": 0.6430819055854959, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5927537679672241, "rewards/thk_ans_format_reward": 1.0, "step": 2120, "think_completion_length": 7.708333333333333 }, { "clip_ratio": 0.0, "completion_length": 166.3229217529297, "epoch": 7.165261382799326, "grad_norm": 7.3252669737331075, "kl": 0.5078125, "learning_rate": 4.0287162162162156e-07, "loss": 0.0005, "reward": 3.5950053930282593, "reward_std": 0.30001043528318405, "rewards/final_reward": 1.5873910615638014, "rewards/mask_iou_reward": 0.7936955307819007, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 1.6158387660980225, "rewards/thk_ans_format_reward": 0.9895833432674408, "step": 2121, "think_completion_length": 6.875 }, { "clip_ratio": 0.0, "completion_length": 165.5729217529297, "epoch": 7.168634064080944, "grad_norm": 7.638980910789786, "kl": 0.4794921875, "learning_rate": 4.0259009009009007e-07, "loss": 0.0006, "reward": 3.5733102560043335, "reward_std": 0.0462705185636878, "rewards/final_reward": 1.4853571579104514, "rewards/mask_iou_reward": 0.7426785789552257, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5733102560043335, "rewards/thk_ans_format_reward": 1.0, "step": 2122, "think_completion_length": 7.291666666666667 }, { "clip_ratio": 0.0, "completion_length": 153.45833587646484, "epoch": 7.172006745362563, "grad_norm": 9.882482773527292, "kl": 0.439453125, "learning_rate": 4.0230855855855853e-07, "loss": 0.0005, "reward": 3.4895578622817993, "reward_std": 0.0789448469877243, "rewards/final_reward": 1.319527123891827, "rewards/mask_iou_reward": 0.6597635619459135, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4895577430725098, "rewards/thk_ans_format_reward": 1.0, "step": 2123, "think_completion_length": 6.791666666666666 }, { "clip_ratio": 0.0, "completion_length": 154.3854217529297, "epoch": 7.175379426644182, "grad_norm": 39.426525505427065, "kl": 0.427734375, "learning_rate": 4.02027027027027e-07, "loss": 0.0004, "reward": 3.4777748584747314, "reward_std": 0.1806269846856594, "rewards/final_reward": 1.8703323689058926, "rewards/mask_iou_reward": 0.9351661844529463, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 1.4986081719398499, "rewards/thk_ans_format_reward": 0.9895833432674408, "step": 2124, "think_completion_length": 7.083333333333333 }, { "clip_ratio": 0.0, "completion_length": 164.96875762939453, "epoch": 7.178752107925801, "grad_norm": 11.025901523498554, "kl": 0.416015625, "learning_rate": 4.0174549549549545e-07, "loss": 0.0003, "reward": 3.55265212059021, "reward_std": 0.013125112280249596, "rewards/final_reward": 0.9920675836555493, "rewards/mask_iou_reward": 0.49603379182777463, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.55265212059021, "rewards/thk_ans_format_reward": 1.0, "step": 2125, "think_completion_length": 6.625 }, { "clip_ratio": 0.0, "completion_length": 159.68750762939453, "epoch": 7.18212478920742, "grad_norm": 12.019458429651026, "kl": 0.4140625, "learning_rate": 4.014639639639639e-07, "loss": 0.0004, "reward": 3.4704500436782837, "reward_std": 0.07267389260232449, "rewards/final_reward": 1.8845524357169587, "rewards/mask_iou_reward": 0.9422762178584794, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.470449984073639, "rewards/thk_ans_format_reward": 1.0, "step": 2126, "think_completion_length": 7.666666666666667 }, { "clip_ratio": 0.0, "completion_length": 155.3854217529297, "epoch": 7.185497470489039, "grad_norm": 23.32054644884736, "kl": 0.4140625, "learning_rate": 4.011824324324324e-07, "loss": 0.0004, "reward": 3.5207202434539795, "reward_std": 0.07270674407482147, "rewards/final_reward": 1.5134364200643244, "rewards/mask_iou_reward": 0.7567182100321622, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5207201838493347, "rewards/thk_ans_format_reward": 1.0, "step": 2127, "think_completion_length": 7.708333333333334 }, { "clip_ratio": 0.0, "completion_length": 194.96875762939453, "epoch": 7.188870151770658, "grad_norm": 233.4763662776818, "kl": 0.447265625, "learning_rate": 4.009009009009009e-07, "loss": 0.0004, "reward": 3.513588070869446, "reward_std": 0.057372111827135086, "rewards/final_reward": 1.375929576670419, "rewards/mask_iou_reward": 0.6879647883352095, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5135878920555115, "rewards/thk_ans_format_reward": 1.0, "step": 2128, "think_completion_length": 7.458333333333334 }, { "clip_ratio": 0.0, "completion_length": 120.98958587646484, "epoch": 7.192242833052276, "grad_norm": 16.137783540866636, "kl": 0.515625, "learning_rate": 4.0061936936936935e-07, "loss": 0.0005, "reward": 3.594455122947693, "reward_std": 0.05518577480688691, "rewards/final_reward": 1.87773857431839, "rewards/mask_iou_reward": 0.938869287159195, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.594455063343048, "rewards/thk_ans_format_reward": 1.0, "step": 2129, "think_completion_length": 6.583333333333334 }, { "clip_ratio": 0.0, "completion_length": 151.87500762939453, "epoch": 7.195615514333896, "grad_norm": 10.617842015866719, "kl": 0.435546875, "learning_rate": 4.003378378378378e-07, "loss": 0.0004, "reward": 3.1099244356155396, "reward_std": 0.055913373827934265, "rewards/final_reward": 0.5185217282650944, "rewards/mask_iou_reward": 0.2592608641325472, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1099244356155396, "rewards/thk_ans_format_reward": 1.0, "step": 2130, "think_completion_length": 6.333333333333333 }, { "clip_ratio": 0.0, "completion_length": 168.84375762939453, "epoch": 7.198988195615514, "grad_norm": 13.662643697706343, "kl": 0.484375, "learning_rate": 4.0005630630630627e-07, "loss": 0.0005, "reward": 3.8299983739852905, "reward_std": 0.01682877354323864, "rewards/final_reward": 1.7913390549733599, "rewards/mask_iou_reward": 0.8956695274866799, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.8299983739852905, "rewards/thk_ans_format_reward": 1.0, "step": 2131, "think_completion_length": 5.625 }, { "clip_ratio": 0.0, "completion_length": 168.55208587646484, "epoch": 7.202360876897133, "grad_norm": 17.046292903262447, "kl": 0.421875, "learning_rate": 3.997747747747748e-07, "loss": 0.0004, "reward": 3.196842908859253, "reward_std": 0.16826032102108002, "rewards/final_reward": 1.912653454277757, "rewards/mask_iou_reward": 0.9563267271388785, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1968427896499634, "rewards/thk_ans_format_reward": 1.0, "step": 2132, "think_completion_length": 6.166666666666666 }, { "clip_ratio": 0.0, "completion_length": 188.75000762939453, "epoch": 7.2057335581787525, "grad_norm": 15.186937128391326, "kl": 0.38671875, "learning_rate": 3.9949324324324324e-07, "loss": 0.0004, "reward": 3.2341290712356567, "reward_std": 0.026309030130505562, "rewards/final_reward": 1.563177724085566, "rewards/mask_iou_reward": 0.781588862042783, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2341291904449463, "rewards/thk_ans_format_reward": 1.0, "step": 2133, "think_completion_length": 7.75 }, { "clip_ratio": 0.0, "completion_length": 156.0520896911621, "epoch": 7.209106239460371, "grad_norm": 29.4255008250624, "kl": 0.7470703125, "learning_rate": 3.992117117117117e-07, "loss": 0.0007, "reward": 3.400033712387085, "reward_std": 0.17767321318387985, "rewards/final_reward": 1.5393208560935134, "rewards/mask_iou_reward": 0.7696604280467567, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 1.4208670258522034, "rewards/thk_ans_format_reward": 0.9895833432674408, "step": 2134, "think_completion_length": 6.833333333333333 }, { "clip_ratio": 0.0, "completion_length": 146.96875762939453, "epoch": 7.21247892074199, "grad_norm": 8.485399571137938, "kl": 0.44921875, "learning_rate": 3.9893018018018016e-07, "loss": 0.0005, "reward": 3.3921091556549072, "reward_std": 0.08318794146180153, "rewards/final_reward": 1.3257067431520768, "rewards/mask_iou_reward": 0.6628533715760384, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3921091556549072, "rewards/thk_ans_format_reward": 1.0, "step": 2135, "think_completion_length": 7.333333333333334 }, { "clip_ratio": 0.0, "completion_length": 181.11458587646484, "epoch": 7.2158516020236085, "grad_norm": 12.122435985688213, "kl": 0.513671875, "learning_rate": 3.986486486486486e-07, "loss": 0.0005, "reward": 3.521836519241333, "reward_std": 0.17866870388388634, "rewards/final_reward": 1.0408465030152805, "rewards/mask_iou_reward": 0.5204232515076402, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.521836519241333, "rewards/thk_ans_format_reward": 1.0, "step": 2136, "think_completion_length": 6.916666666666667 }, { "clip_ratio": 0.0, "completion_length": 229.9479217529297, "epoch": 7.219224283305228, "grad_norm": 18.80734240520374, "kl": 0.36328125, "learning_rate": 3.9836711711711714e-07, "loss": 0.0004, "reward": 3.5676910877227783, "reward_std": 0.03717435151338577, "rewards/final_reward": 1.8321522375205679, "rewards/mask_iou_reward": 0.9160761187602839, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5676909685134888, "rewards/thk_ans_format_reward": 1.0, "step": 2137, "think_completion_length": 7.583333333333334 }, { "clip_ratio": 0.0, "completion_length": 248.56250762939453, "epoch": 7.222596964586846, "grad_norm": 14.275363933619314, "kl": 0.4375, "learning_rate": 3.980855855855856e-07, "loss": 0.0004, "reward": 3.482014775276184, "reward_std": 0.2985433340072632, "rewards/final_reward": 1.349934065591685, "rewards/mask_iou_reward": 0.6749670327958425, "rewards/sam_format_reward": 0.9791666865348816, "rewards/sam_reward_func_ultra": 1.5236812829971313, "rewards/thk_ans_format_reward": 0.9791666865348816, "step": 2138, "think_completion_length": 6.625 }, { "clip_ratio": 0.0, "completion_length": 226.1041717529297, "epoch": 7.2259696458684655, "grad_norm": 9.048563062948114, "kl": 0.4150390625, "learning_rate": 3.9780405405405406e-07, "loss": 0.0004, "reward": 3.4160574674606323, "reward_std": 0.10905380174517632, "rewards/final_reward": 1.6702328220901432, "rewards/mask_iou_reward": 0.8351164110450716, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4160576462745667, "rewards/thk_ans_format_reward": 1.0, "step": 2139, "think_completion_length": 8.333333333333334 }, { "clip_ratio": 0.0, "completion_length": 204.87500762939453, "epoch": 7.229342327150085, "grad_norm": 8.460530923785011, "kl": 0.451171875, "learning_rate": 3.975225225225225e-07, "loss": 0.0005, "reward": 3.340652108192444, "reward_std": 0.09711728245019913, "rewards/final_reward": 1.0464337627592921, "rewards/mask_iou_reward": 0.5232168813796461, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3406521081924438, "rewards/thk_ans_format_reward": 1.0, "step": 2140, "think_completion_length": 7.291666666666666 }, { "clip_ratio": 0.0, "completion_length": 179.61458587646484, "epoch": 7.232715008431703, "grad_norm": 14.278147083487097, "kl": 0.408203125, "learning_rate": 3.9724099099099093e-07, "loss": 0.0004, "reward": 3.703104615211487, "reward_std": 0.02983518410474062, "rewards/final_reward": 1.3170696865282334, "rewards/mask_iou_reward": 0.6585348432641167, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7031044960021973, "rewards/thk_ans_format_reward": 1.0, "step": 2141, "think_completion_length": 8.625 }, { "clip_ratio": 0.0, "completion_length": 144.98958587646484, "epoch": 7.236087689713322, "grad_norm": 9.425512098183246, "kl": 0.4912109375, "learning_rate": 3.9695945945945944e-07, "loss": 0.0005, "reward": 3.412560224533081, "reward_std": 0.15079614520072937, "rewards/final_reward": 1.7363874893385831, "rewards/mask_iou_reward": 0.8681937446692916, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4125602841377258, "rewards/thk_ans_format_reward": 1.0, "step": 2142, "think_completion_length": 6.958333333333334 }, { "clip_ratio": 0.0, "completion_length": 208.6666717529297, "epoch": 7.239460370994941, "grad_norm": 6.200306150717977, "kl": 0.4580078125, "learning_rate": 3.966779279279279e-07, "loss": 0.0005, "reward": 3.399711489677429, "reward_std": 0.15125693986192346, "rewards/final_reward": 1.5797161181786068, "rewards/mask_iou_reward": 0.7898580590893034, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 1.4205447435379028, "rewards/thk_ans_format_reward": 0.9895833432674408, "step": 2143, "think_completion_length": 6.583333333333334 }, { "clip_ratio": 0.0, "completion_length": 172.77083587646484, "epoch": 7.24283305227656, "grad_norm": 8.147060870604529, "kl": 0.5302734375, "learning_rate": 3.9639639639639636e-07, "loss": 0.0005, "reward": 3.6022841930389404, "reward_std": 0.15134335309267044, "rewards/final_reward": 1.8884136915778607, "rewards/mask_iou_reward": 0.9442068457889303, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6022841930389404, "rewards/thk_ans_format_reward": 1.0, "step": 2144, "think_completion_length": 6.833333333333334 }, { "clip_ratio": 0.0, "completion_length": 117.85417175292969, "epoch": 7.246205733558178, "grad_norm": 149.03203489496107, "kl": 0.650390625, "learning_rate": 3.961148648648648e-07, "loss": 0.0007, "reward": 3.508164882659912, "reward_std": 0.10794974863529205, "rewards/final_reward": 1.9698042980004327, "rewards/mask_iou_reward": 0.9849021490002163, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.508164882659912, "rewards/thk_ans_format_reward": 1.0, "step": 2145, "think_completion_length": 8.75 }, { "clip_ratio": 0.0, "completion_length": 181.00000762939453, "epoch": 7.249578414839798, "grad_norm": 13.057972042386261, "kl": 0.443359375, "learning_rate": 3.958333333333333e-07, "loss": 0.0004, "reward": 3.5224772691726685, "reward_std": 0.19207923859357834, "rewards/final_reward": 1.4173489969961852, "rewards/mask_iou_reward": 0.7086744984980926, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5224772691726685, "rewards/thk_ans_format_reward": 1.0, "step": 2146, "think_completion_length": 6.208333333333334 }, { "clip_ratio": 0.0, "completion_length": 179.39583587646484, "epoch": 7.252951096121416, "grad_norm": 4.5979033935736435, "kl": 0.451171875, "learning_rate": 3.955518018018018e-07, "loss": 0.0005, "reward": 3.186232328414917, "reward_std": 0.17321348935365677, "rewards/final_reward": 1.3516898725056445, "rewards/mask_iou_reward": 0.6758449362528223, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1862324178218842, "rewards/thk_ans_format_reward": 1.0, "step": 2147, "think_completion_length": 8.583333333333332 }, { "clip_ratio": 0.0, "completion_length": 174.77083587646484, "epoch": 7.256323777403035, "grad_norm": 9.207451207124421, "kl": 0.42578125, "learning_rate": 3.9527027027027026e-07, "loss": 0.0004, "reward": 3.324433922767639, "reward_std": 0.06282211095094681, "rewards/final_reward": 1.2522952711881437, "rewards/mask_iou_reward": 0.6261476355940718, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3244337439537048, "rewards/thk_ans_format_reward": 1.0, "step": 2148, "think_completion_length": 6.333333333333334 }, { "clip_ratio": 0.0, "completion_length": 182.78125, "epoch": 7.259696458684655, "grad_norm": 10.283281006635079, "kl": 0.408203125, "learning_rate": 3.949887387387387e-07, "loss": 0.0004, "reward": 3.7005112171173096, "reward_std": 0.039139024913311005, "rewards/final_reward": 1.5932102741295942, "rewards/mask_iou_reward": 0.7966051370647971, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7005112767219543, "rewards/thk_ans_format_reward": 1.0, "step": 2149, "think_completion_length": 7.583333333333333 }, { "clip_ratio": 0.0, "completion_length": 184.7916717529297, "epoch": 7.263069139966273, "grad_norm": 38.4408050725588, "kl": 0.392578125, "learning_rate": 3.947072072072072e-07, "loss": 0.0004, "reward": 3.5593591928482056, "reward_std": 0.08478840440511703, "rewards/final_reward": 1.4710174033824241, "rewards/mask_iou_reward": 0.7355087016912121, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5593591928482056, "rewards/thk_ans_format_reward": 1.0, "step": 2150, "think_completion_length": 8.166666666666668 }, { "clip_ratio": 0.0, "completion_length": 131.67708587646484, "epoch": 7.266441821247892, "grad_norm": 5.787815387830136, "kl": 0.541015625, "learning_rate": 3.9442567567567564e-07, "loss": 0.0005, "reward": 3.6677483320236206, "reward_std": 0.12288782093673944, "rewards/final_reward": 1.5742830388891997, "rewards/mask_iou_reward": 0.7871415194445999, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6677480936050415, "rewards/thk_ans_format_reward": 1.0, "step": 2151, "think_completion_length": 8.541666666666666 }, { "clip_ratio": 0.0, "completion_length": 196.5, "epoch": 7.269814502529511, "grad_norm": 10.878390354861022, "kl": 0.4296875, "learning_rate": 3.9414414414414415e-07, "loss": 0.0004, "reward": 3.606956720352173, "reward_std": 0.040956467390060425, "rewards/final_reward": 1.4903519318159777, "rewards/mask_iou_reward": 0.7451759659079888, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6069567799568176, "rewards/thk_ans_format_reward": 1.0, "step": 2152, "think_completion_length": 6.125 }, { "clip_ratio": 0.0, "completion_length": 243.0104217529297, "epoch": 7.27318718381113, "grad_norm": 13.98855206951791, "kl": 0.3876953125, "learning_rate": 3.938626126126126e-07, "loss": 0.0004, "reward": 3.5343345403671265, "reward_std": 0.31789813563227654, "rewards/final_reward": 1.2603681149803911, "rewards/mask_iou_reward": 0.6301840574901956, "rewards/sam_format_reward": 0.9791666865348816, "rewards/sam_reward_func_ultra": 1.5760010480880737, "rewards/thk_ans_format_reward": 0.9791666865348816, "step": 2153, "think_completion_length": 6.833333333333334 }, { "clip_ratio": 0.0, "completion_length": 207.1666717529297, "epoch": 7.276559865092748, "grad_norm": 15.301432109475098, "kl": 0.4130859375, "learning_rate": 3.935810810810811e-07, "loss": 0.0004, "reward": 3.613258957862854, "reward_std": 0.21496530901640654, "rewards/final_reward": 1.6013227848837182, "rewards/mask_iou_reward": 0.8006613924418591, "rewards/sam_format_reward": 0.9791666865348816, "rewards/sam_reward_func_ultra": 1.654925525188446, "rewards/thk_ans_format_reward": 0.9791666865348816, "step": 2154, "think_completion_length": 6.75 }, { "clip_ratio": 0.0, "completion_length": 170.77084350585938, "epoch": 7.279932546374368, "grad_norm": 14.334921119171533, "kl": 0.490234375, "learning_rate": 3.9329954954954954e-07, "loss": 0.0005, "reward": 3.3268117904663086, "reward_std": 0.11101927608251572, "rewards/final_reward": 1.8440713944047213, "rewards/mask_iou_reward": 0.9220356972023607, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.326811671257019, "rewards/thk_ans_format_reward": 1.0, "step": 2155, "think_completion_length": 6.416666666666666 }, { "clip_ratio": 0.0, "completion_length": 184.5729217529297, "epoch": 7.283305227655987, "grad_norm": 18.06605732574647, "kl": 0.43359375, "learning_rate": 3.93018018018018e-07, "loss": 0.0004, "reward": 3.5632468461990356, "reward_std": 0.07951584458351135, "rewards/final_reward": 1.1299945703341507, "rewards/mask_iou_reward": 0.5649972851670754, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.563246726989746, "rewards/thk_ans_format_reward": 1.0, "step": 2156, "think_completion_length": 7.0 }, { "clip_ratio": 0.0, "completion_length": 204.4166717529297, "epoch": 7.286677908937605, "grad_norm": 7.086837200220286, "kl": 0.556640625, "learning_rate": 3.927364864864865e-07, "loss": 0.0006, "reward": 3.759609341621399, "reward_std": 0.2072555348277092, "rewards/final_reward": 1.8777304738504346, "rewards/mask_iou_reward": 0.9388652369252173, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 1.7804425954818726, "rewards/thk_ans_format_reward": 0.9895833432674408, "step": 2157, "think_completion_length": 6.708333333333334 }, { "clip_ratio": 0.0, "completion_length": 212.20834350585938, "epoch": 7.2900505902192245, "grad_norm": 19.968401086600352, "kl": 0.4443359375, "learning_rate": 3.9245495495495497e-07, "loss": 0.0005, "reward": 3.369117498397827, "reward_std": 0.19751837849617004, "rewards/final_reward": 1.1488828752023381, "rewards/mask_iou_reward": 0.5744414376011691, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 1.389950692653656, "rewards/thk_ans_format_reward": 0.9895833432674408, "step": 2158, "think_completion_length": 7.25 }, { "clip_ratio": 0.0, "completion_length": 257.65626525878906, "epoch": 7.293423271500843, "grad_norm": 12.387109597703848, "kl": 0.4111328125, "learning_rate": 3.9217342342342343e-07, "loss": 0.0004, "reward": 3.170549750328064, "reward_std": 0.28758961241692305, "rewards/final_reward": 1.3935270921524614, "rewards/mask_iou_reward": 0.6967635460762307, "rewards/sam_format_reward": 0.9270833432674408, "rewards/sam_reward_func_ultra": 1.316383183002472, "rewards/thk_ans_format_reward": 0.9270833432674408, "step": 2159, "think_completion_length": 6.166666666666666 }, { "clip_ratio": 0.0, "completion_length": 196.04167938232422, "epoch": 7.296795952782462, "grad_norm": 19.28123203840753, "kl": 0.423828125, "learning_rate": 3.918918918918919e-07, "loss": 0.0004, "reward": 3.62705397605896, "reward_std": 0.22706139460206032, "rewards/final_reward": 1.6559225404737472, "rewards/mask_iou_reward": 0.8279612702368736, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 1.647887110710144, "rewards/thk_ans_format_reward": 0.9895833432674408, "step": 2160, "think_completion_length": 6.583333333333333 }, { "clip_ratio": 0.0, "completion_length": 238.09375, "epoch": 7.300168634064081, "grad_norm": 8.978004082516899, "kl": 0.5009765625, "learning_rate": 3.916103603603603e-07, "loss": 0.0005, "reward": 3.428421139717102, "reward_std": 0.18285532295703888, "rewards/final_reward": 1.617844638569244, "rewards/mask_iou_reward": 0.808922319284622, "rewards/sam_format_reward": 0.9791666865348816, "rewards/sam_reward_func_ultra": 1.4700875282287598, "rewards/thk_ans_format_reward": 0.9791666865348816, "step": 2161, "think_completion_length": 6.333333333333334 }, { "clip_ratio": 0.0, "completion_length": 264.40626525878906, "epoch": 7.3035413153457, "grad_norm": 16.80433491867078, "kl": 0.36328125, "learning_rate": 3.913288288288288e-07, "loss": 0.0004, "reward": 3.5047558546066284, "reward_std": 0.30528346402570605, "rewards/final_reward": 1.7800765282781192, "rewards/mask_iou_reward": 0.8900382641390596, "rewards/sam_format_reward": 0.9479166865348816, "rewards/sam_reward_func_ultra": 1.6089226603507996, "rewards/thk_ans_format_reward": 0.9479166865348816, "step": 2162, "think_completion_length": 7.708333333333334 }, { "clip_ratio": 0.0, "completion_length": 192.02084350585938, "epoch": 7.306913996627319, "grad_norm": 5.595963899834234, "kl": 0.6728515625, "learning_rate": 3.910472972972973e-07, "loss": 0.0007, "reward": 3.6005738973617554, "reward_std": 0.09857448190450668, "rewards/final_reward": 1.6690496045432348, "rewards/mask_iou_reward": 0.8345248022716174, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6005739569664001, "rewards/thk_ans_format_reward": 1.0, "step": 2163, "think_completion_length": 7.041666666666667 }, { "clip_ratio": 0.0, "completion_length": 261.1041717529297, "epoch": 7.3102866779089375, "grad_norm": 16.19212872325748, "kl": 0.365234375, "learning_rate": 3.9076576576576574e-07, "loss": 0.0004, "reward": 3.5587165355682373, "reward_std": 0.1438434375450015, "rewards/final_reward": 1.6682048349112146, "rewards/mask_iou_reward": 0.8341024174556073, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 1.5795499086380005, "rewards/thk_ans_format_reward": 0.9895833432674408, "step": 2164, "think_completion_length": 7.458333333333333 }, { "clip_ratio": 0.0, "completion_length": 168.71875762939453, "epoch": 7.313659359190557, "grad_norm": 7.803745253729495, "kl": 0.443359375, "learning_rate": 3.904842342342342e-07, "loss": 0.0005, "reward": 3.412447929382324, "reward_std": 0.02094810316339135, "rewards/final_reward": 1.3435670359208849, "rewards/mask_iou_reward": 0.6717835179604424, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4124478697776794, "rewards/thk_ans_format_reward": 1.0, "step": 2165, "think_completion_length": 6.875 }, { "clip_ratio": 0.0, "completion_length": 210.78125762939453, "epoch": 7.317032040472175, "grad_norm": 4.843633546989581, "kl": 0.4716796875, "learning_rate": 3.9020270270270266e-07, "loss": 0.0005, "reward": 3.5543458461761475, "reward_std": 0.052722327411174774, "rewards/final_reward": 1.8292358145000578, "rewards/mask_iou_reward": 0.9146179072500289, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.554345726966858, "rewards/thk_ans_format_reward": 1.0, "step": 2166, "think_completion_length": 6.416666666666666 }, { "clip_ratio": 0.0, "completion_length": 222.12500762939453, "epoch": 7.320404721753794, "grad_norm": 15.399198454003308, "kl": 0.537109375, "learning_rate": 3.899211711711711e-07, "loss": 0.0005, "reward": 3.5818214416503906, "reward_std": 0.08301165979355574, "rewards/final_reward": 1.163532358848403, "rewards/mask_iou_reward": 0.5817661794242015, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5818213820457458, "rewards/thk_ans_format_reward": 1.0, "step": 2167, "think_completion_length": 6.041666666666666 }, { "clip_ratio": 0.0, "completion_length": 350.31251525878906, "epoch": 7.323777403035413, "grad_norm": 21.04994479043818, "kl": 0.404296875, "learning_rate": 3.8963963963963963e-07, "loss": 0.0004, "reward": 3.177867293357849, "reward_std": 0.42192305624485016, "rewards/final_reward": 1.5446818310936512, "rewards/mask_iou_reward": 0.7723409155468256, "rewards/sam_format_reward": 0.9479166865348816, "rewards/sam_reward_func_ultra": 1.2820341289043427, "rewards/thk_ans_format_reward": 0.9479166865348816, "step": 2168, "think_completion_length": 5.958333333333334 }, { "clip_ratio": 0.0, "completion_length": 310.1979217529297, "epoch": 7.327150084317032, "grad_norm": 7.782304979128652, "kl": 0.345703125, "learning_rate": 3.893581081081081e-07, "loss": 0.0003, "reward": 3.2318207025527954, "reward_std": 0.41622330248355865, "rewards/final_reward": 1.9512054078677687, "rewards/mask_iou_reward": 0.9756027039338844, "rewards/sam_format_reward": 0.9270833432674408, "rewards/sam_reward_func_ultra": 1.3776538372039795, "rewards/thk_ans_format_reward": 0.9270833432674408, "step": 2169, "think_completion_length": 6.583333333333334 }, { "clip_ratio": 0.0, "completion_length": 334.5625, "epoch": 7.330522765598651, "grad_norm": 8.374995571382644, "kl": 0.5, "learning_rate": 3.8907657657657655e-07, "loss": 0.0005, "reward": 3.082065463066101, "reward_std": 0.2468433976173401, "rewards/final_reward": 1.6989625807648732, "rewards/mask_iou_reward": 0.8494812903824366, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 1.102898895740509, "rewards/thk_ans_format_reward": 0.9895833432674408, "step": 2170, "think_completion_length": 7.416666666666666 }, { "clip_ratio": 0.0, "completion_length": 285.06251525878906, "epoch": 7.33389544688027, "grad_norm": 9.203101037904963, "kl": 0.3583984375, "learning_rate": 3.88795045045045e-07, "loss": 0.0004, "reward": 3.2510664463043213, "reward_std": 0.4821528196334839, "rewards/final_reward": 1.641097839508744, "rewards/mask_iou_reward": 0.820548919754372, "rewards/sam_format_reward": 0.9583333432674408, "rewards/sam_reward_func_ultra": 1.3343997597694397, "rewards/thk_ans_format_reward": 0.9583333432674408, "step": 2171, "think_completion_length": 7.041666666666666 }, { "clip_ratio": 0.0, "completion_length": 210.25000762939453, "epoch": 7.337268128161889, "grad_norm": 12.01320772350456, "kl": 0.380859375, "learning_rate": 3.885135135135135e-07, "loss": 0.0004, "reward": 3.471616744995117, "reward_std": 0.1700489092618227, "rewards/final_reward": 1.6669278379681216, "rewards/mask_iou_reward": 0.8334639189840608, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 1.4924501180648804, "rewards/thk_ans_format_reward": 0.9895833432674408, "step": 2172, "think_completion_length": 6.041666666666666 }, { "clip_ratio": 0.0, "completion_length": 301.625, "epoch": 7.340640809443507, "grad_norm": 8.85257565325863, "kl": 0.3359375, "learning_rate": 3.88231981981982e-07, "loss": 0.0003, "reward": 3.362300753593445, "reward_std": 0.31437610648572445, "rewards/final_reward": 1.7814382838099094, "rewards/mask_iou_reward": 0.8907191419049547, "rewards/sam_format_reward": 0.96875, "rewards/sam_reward_func_ultra": 1.4248005151748657, "rewards/thk_ans_format_reward": 0.96875, "step": 2173, "think_completion_length": 7.208333333333334 }, { "clip_ratio": 0.0, "completion_length": 297.96876525878906, "epoch": 7.344013490725127, "grad_norm": 6.91327388672393, "kl": 0.3330078125, "learning_rate": 3.8795045045045045e-07, "loss": 0.0003, "reward": 3.5213279724121094, "reward_std": 0.14972331002354622, "rewards/final_reward": 1.603927316493496, "rewards/mask_iou_reward": 0.801963658246748, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 1.5421611070632935, "rewards/thk_ans_format_reward": 0.9895833432674408, "step": 2174, "think_completion_length": 7.666666666666666 }, { "clip_ratio": 0.0, "completion_length": 335.50001525878906, "epoch": 7.347386172006745, "grad_norm": 5.4068115963741175, "kl": 0.3193359375, "learning_rate": 3.876689189189189e-07, "loss": 0.0003, "reward": 3.0117900371551514, "reward_std": 0.5588356852531433, "rewards/final_reward": 1.7250185714794255, "rewards/mask_iou_reward": 0.8625092857397127, "rewards/sam_format_reward": 0.9479166865348816, "rewards/sam_reward_func_ultra": 1.1159567832946777, "rewards/thk_ans_format_reward": 0.9479166865348816, "step": 2175, "think_completion_length": 7.083333333333334 }, { "clip_ratio": 0.0, "completion_length": 281.56251525878906, "epoch": 7.350758853288364, "grad_norm": 8.37104926533639, "kl": 0.318359375, "learning_rate": 3.8738738738738737e-07, "loss": 0.0003, "reward": 3.1996771097183228, "reward_std": 0.37156446278095245, "rewards/final_reward": 1.4420773983333568, "rewards/mask_iou_reward": 0.7210386991666784, "rewards/sam_format_reward": 0.9791666865348816, "rewards/sam_reward_func_ultra": 1.2413436770439148, "rewards/thk_ans_format_reward": 0.9791666865348816, "step": 2176, "think_completion_length": 6.708333333333333 }, { "clip_ratio": 0.0, "completion_length": 261.78125762939453, "epoch": 7.354131534569984, "grad_norm": 6.7895801457677045, "kl": 0.3515625, "learning_rate": 3.8710585585585583e-07, "loss": 0.0004, "reward": 3.296520471572876, "reward_std": 0.18614527583122253, "rewards/final_reward": 1.4495435536760706, "rewards/mask_iou_reward": 0.7247717768380353, "rewards/sam_format_reward": 0.96875, "rewards/sam_reward_func_ultra": 1.359020471572876, "rewards/thk_ans_format_reward": 0.96875, "step": 2177, "think_completion_length": 7.333333333333333 }, { "clip_ratio": 0.0, "completion_length": 288.8854217529297, "epoch": 7.357504215851602, "grad_norm": 6.552101407920925, "kl": 0.365234375, "learning_rate": 3.8682432432432434e-07, "loss": 0.0004, "reward": 3.418135404586792, "reward_std": 0.39255285263061523, "rewards/final_reward": 1.4095475051114925, "rewards/mask_iou_reward": 0.7047737525557463, "rewards/sam_format_reward": 0.9687500298023224, "rewards/sam_reward_func_ultra": 1.480635404586792, "rewards/thk_ans_format_reward": 0.9687500298023224, "step": 2178, "think_completion_length": 6.541666666666666 }, { "clip_ratio": 0.0, "completion_length": 168.86458587646484, "epoch": 7.360876897133221, "grad_norm": 5.4216062546604835, "kl": 0.509765625, "learning_rate": 3.865427927927928e-07, "loss": 0.0005, "reward": 3.7206671237945557, "reward_std": 0.03425286652054638, "rewards/final_reward": 1.8160294503549825, "rewards/mask_iou_reward": 0.9080147251774913, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7206671237945557, "rewards/thk_ans_format_reward": 1.0, "step": 2179, "think_completion_length": 7.125 }, { "clip_ratio": 0.0, "completion_length": 236.30209350585938, "epoch": 7.36424957841484, "grad_norm": 18.869959269884077, "kl": 0.39453125, "learning_rate": 3.8626126126126127e-07, "loss": 0.0004, "reward": 3.5714718103408813, "reward_std": 0.22793716937303543, "rewards/final_reward": 1.6498136826991323, "rewards/mask_iou_reward": 0.8249068413495662, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 1.5923051834106445, "rewards/thk_ans_format_reward": 0.9895833432674408, "step": 2180, "think_completion_length": 6.791666666666666 }, { "clip_ratio": 0.0, "completion_length": 219.27084350585938, "epoch": 7.367622259696459, "grad_norm": 35.29660849574145, "kl": 1.1455078125, "learning_rate": 3.8597972972972967e-07, "loss": 0.0011, "reward": 3.8055763244628906, "reward_std": 0.11059517413377762, "rewards/final_reward": 1.932277467940826, "rewards/mask_iou_reward": 0.966138733970413, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.8055765628814697, "rewards/thk_ans_format_reward": 1.0, "step": 2181, "think_completion_length": 6.041666666666666 }, { "clip_ratio": 0.0, "completion_length": 248.39584350585938, "epoch": 7.370994940978077, "grad_norm": 5.726177243990948, "kl": 0.3515625, "learning_rate": 3.8569819819819813e-07, "loss": 0.0004, "reward": 3.4815937280654907, "reward_std": 0.19145195186138153, "rewards/final_reward": 1.6666713105364588, "rewards/mask_iou_reward": 0.8333356552682294, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 1.502427101135254, "rewards/thk_ans_format_reward": 0.9895833432674408, "step": 2182, "think_completion_length": 7.125 }, { "clip_ratio": 0.0, "completion_length": 242.48958587646484, "epoch": 7.3743676222596966, "grad_norm": 5.940061168135283, "kl": 0.4697265625, "learning_rate": 3.8541666666666665e-07, "loss": 0.0005, "reward": 3.6492291688919067, "reward_std": 0.0633248221129179, "rewards/final_reward": 1.7177834520208082, "rewards/mask_iou_reward": 0.8588917260104041, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6492289304733276, "rewards/thk_ans_format_reward": 1.0, "step": 2183, "think_completion_length": 7.333333333333334 }, { "clip_ratio": 0.0, "completion_length": 215.6666717529297, "epoch": 7.377740303541315, "grad_norm": 5.7907618943106325, "kl": 0.4248046875, "learning_rate": 3.851351351351351e-07, "loss": 0.0004, "reward": 3.47650945186615, "reward_std": 0.23418362438678741, "rewards/final_reward": 1.8728345238525135, "rewards/mask_iou_reward": 0.9364172619262567, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 1.4973427653312683, "rewards/thk_ans_format_reward": 0.9895833432674408, "step": 2184, "think_completion_length": 6.208333333333334 }, { "clip_ratio": 0.0, "completion_length": 318.78125, "epoch": 7.381112984822934, "grad_norm": 9.008928063551123, "kl": 0.4111328125, "learning_rate": 3.8485360360360357e-07, "loss": 0.0004, "reward": 3.35174298286438, "reward_std": 0.32421646267175674, "rewards/final_reward": 0.9672972781583278, "rewards/mask_iou_reward": 0.4836486390791639, "rewards/sam_format_reward": 0.96875, "rewards/sam_reward_func_ultra": 1.4142429828643799, "rewards/thk_ans_format_reward": 0.96875, "step": 2185, "think_completion_length": 7.375 }, { "clip_ratio": 0.0, "completion_length": 175.65625, "epoch": 7.3844856661045535, "grad_norm": 7.066288003515296, "kl": 0.416015625, "learning_rate": 3.8457207207207203e-07, "loss": 0.0005, "reward": 3.231791615486145, "reward_std": 0.10090844705700874, "rewards/final_reward": 0.919918053266102, "rewards/mask_iou_reward": 0.459959026633051, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2317915558815002, "rewards/thk_ans_format_reward": 1.0, "step": 2186, "think_completion_length": 8.041666666666668 }, { "clip_ratio": 0.0, "completion_length": 172.30208587646484, "epoch": 7.387858347386172, "grad_norm": 5.58767972116072, "kl": 0.396484375, "learning_rate": 3.842905405405405e-07, "loss": 0.0004, "reward": 3.4587671756744385, "reward_std": 0.0885092574171722, "rewards/final_reward": 1.8481281099755282, "rewards/mask_iou_reward": 0.9240640549877641, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.458767294883728, "rewards/thk_ans_format_reward": 1.0, "step": 2187, "think_completion_length": 7.458333333333334 }, { "clip_ratio": 0.0, "completion_length": 235.06251525878906, "epoch": 7.391231028667791, "grad_norm": 11.352386821224222, "kl": 0.3623046875, "learning_rate": 3.84009009009009e-07, "loss": 0.0004, "reward": 3.7279186248779297, "reward_std": 0.21404867619276047, "rewards/final_reward": 1.7843203736840167, "rewards/mask_iou_reward": 0.8921601868420084, "rewards/sam_format_reward": 0.9791666865348816, "rewards/sam_reward_func_ultra": 1.7695854306221008, "rewards/thk_ans_format_reward": 0.9791666865348816, "step": 2188, "think_completion_length": 9.083333333333332 }, { "clip_ratio": 0.0, "completion_length": 189.71875762939453, "epoch": 7.3946037099494095, "grad_norm": 13.41413439864965, "kl": 0.390625, "learning_rate": 3.8372747747747746e-07, "loss": 0.0004, "reward": 3.3822845220565796, "reward_std": 0.11853938177227974, "rewards/final_reward": 1.0435111055189241, "rewards/mask_iou_reward": 0.5217555527594621, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3822844624519348, "rewards/thk_ans_format_reward": 1.0, "step": 2189, "think_completion_length": 6.666666666666666 }, { "clip_ratio": 0.0, "completion_length": 212.2916717529297, "epoch": 7.397976391231029, "grad_norm": 15.092434058069491, "kl": 0.3916015625, "learning_rate": 3.834459459459459e-07, "loss": 0.0004, "reward": 3.6769341230392456, "reward_std": 0.19549360498785973, "rewards/final_reward": 1.8140674795930374, "rewards/mask_iou_reward": 0.9070337397965187, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 1.6977673768997192, "rewards/thk_ans_format_reward": 0.9895833432674408, "step": 2190, "think_completion_length": 6.375 }, { "clip_ratio": 0.0, "completion_length": 139.70833587646484, "epoch": 7.401349072512647, "grad_norm": 17.033820282190767, "kl": 0.421875, "learning_rate": 3.831644144144144e-07, "loss": 0.0004, "reward": 3.5693044662475586, "reward_std": 0.035708085633814335, "rewards/final_reward": 1.1148767645815578, "rewards/mask_iou_reward": 0.5574383822907789, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.56930410861969, "rewards/thk_ans_format_reward": 1.0, "step": 2191, "think_completion_length": 6.791666666666666 }, { "clip_ratio": 0.0, "completion_length": 210.5625, "epoch": 7.4047217537942664, "grad_norm": 25.511794760633197, "kl": 0.478515625, "learning_rate": 3.8288288288288285e-07, "loss": 0.0005, "reward": 3.360031247138977, "reward_std": 0.0737453605979681, "rewards/final_reward": 1.5050937203409687, "rewards/mask_iou_reward": 0.7525468601704843, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.360031247138977, "rewards/thk_ans_format_reward": 1.0, "step": 2192, "think_completion_length": 6.958333333333334 }, { "clip_ratio": 0.0, "completion_length": 187.11458587646484, "epoch": 7.408094435075886, "grad_norm": 9.392724621007625, "kl": 0.4892578125, "learning_rate": 3.8260135135135136e-07, "loss": 0.0005, "reward": 3.3804363012313843, "reward_std": 0.09317411482334137, "rewards/final_reward": 1.1672647924376789, "rewards/mask_iou_reward": 0.5836323962188394, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3804362416267395, "rewards/thk_ans_format_reward": 1.0, "step": 2193, "think_completion_length": 7.916666666666666 }, { "clip_ratio": 0.0, "completion_length": 133.32291793823242, "epoch": 7.411467116357504, "grad_norm": 40.91590308816363, "kl": 0.6357421875, "learning_rate": 3.823198198198198e-07, "loss": 0.0007, "reward": 3.6738728284835815, "reward_std": 0.0640547089278698, "rewards/final_reward": 1.4350410539701315, "rewards/mask_iou_reward": 0.7175205269850657, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.673872709274292, "rewards/thk_ans_format_reward": 1.0, "step": 2194, "think_completion_length": 7.791666666666666 }, { "clip_ratio": 0.0, "completion_length": 156.70833587646484, "epoch": 7.414839797639123, "grad_norm": 17.155419273352084, "kl": 0.568359375, "learning_rate": 3.820382882882883e-07, "loss": 0.0006, "reward": 3.4904896020889282, "reward_std": 0.046076007187366486, "rewards/final_reward": 1.8197614421774921, "rewards/mask_iou_reward": 0.9098807210887461, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4904893636703491, "rewards/thk_ans_format_reward": 1.0, "step": 2195, "think_completion_length": 6.833333333333334 }, { "clip_ratio": 0.0, "completion_length": 141.58333587646484, "epoch": 7.418212478920742, "grad_norm": 27.34766369146001, "kl": 0.666015625, "learning_rate": 3.8175675675675674e-07, "loss": 0.0007, "reward": 3.746211886405945, "reward_std": 0.0641557164490223, "rewards/final_reward": 1.628753586553516, "rewards/mask_iou_reward": 0.814376793276758, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7462120056152344, "rewards/thk_ans_format_reward": 1.0, "step": 2196, "think_completion_length": 7.458333333333334 }, { "clip_ratio": 0.0, "completion_length": 164.0104217529297, "epoch": 7.421585160202361, "grad_norm": 15.070360604305545, "kl": 0.4365234375, "learning_rate": 3.814752252252252e-07, "loss": 0.0004, "reward": 3.628060817718506, "reward_std": 0.14443704020231962, "rewards/final_reward": 1.7625150773573142, "rewards/mask_iou_reward": 0.8812575386786571, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 1.6384775042533875, "rewards/thk_ans_format_reward": 1.0, "step": 2197, "think_completion_length": 7.666666666666667 }, { "clip_ratio": 0.0, "completion_length": 164.13541793823242, "epoch": 7.424957841483979, "grad_norm": 15.525017230063492, "kl": 0.51953125, "learning_rate": 3.811936936936937e-07, "loss": 0.0005, "reward": 3.5307661294937134, "reward_std": 0.054470050148665905, "rewards/final_reward": 1.3505440488513778, "rewards/mask_iou_reward": 0.6752720244256889, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.530765950679779, "rewards/thk_ans_format_reward": 1.0, "step": 2198, "think_completion_length": 6.75 }, { "clip_ratio": 0.0, "completion_length": 199.4479217529297, "epoch": 7.428330522765599, "grad_norm": 14.559243133401148, "kl": 0.4306640625, "learning_rate": 3.809121621621622e-07, "loss": 0.0004, "reward": 3.709012985229492, "reward_std": 0.04592567728832364, "rewards/final_reward": 1.6783327267927772, "rewards/mask_iou_reward": 0.8391663633963886, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7090131640434265, "rewards/thk_ans_format_reward": 1.0, "step": 2199, "think_completion_length": 7.5 }, { "clip_ratio": 0.0, "completion_length": 145.75, "epoch": 7.431703204047217, "grad_norm": 12.114197844295079, "kl": 0.5166015625, "learning_rate": 3.8063063063063064e-07, "loss": 0.0005, "reward": 3.7441108226776123, "reward_std": 0.022463752888143063, "rewards/final_reward": 1.6117690480853413, "rewards/mask_iou_reward": 0.8058845240426706, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7441107034683228, "rewards/thk_ans_format_reward": 1.0, "step": 2200, "think_completion_length": 7.416666666666666 }, { "clip_ratio": 0.0, "completion_length": 141.22916793823242, "epoch": 7.435075885328836, "grad_norm": 6.981330545722498, "kl": 0.451171875, "learning_rate": 3.8034909909909904e-07, "loss": 0.0005, "reward": 3.562459111213684, "reward_std": 0.07043910771608353, "rewards/final_reward": 0.9374220019005385, "rewards/mask_iou_reward": 0.46871100095026924, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5624586939811707, "rewards/thk_ans_format_reward": 1.0, "step": 2201, "think_completion_length": 7.083333333333333 }, { "clip_ratio": 0.0, "completion_length": 145.73958587646484, "epoch": 7.438448566610456, "grad_norm": 12.743543224102556, "kl": 0.5830078125, "learning_rate": 3.800675675675675e-07, "loss": 0.0006, "reward": 3.7944655418395996, "reward_std": 0.044870490208268166, "rewards/final_reward": 1.4736335733550814, "rewards/mask_iou_reward": 0.7368167866775407, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.794465720653534, "rewards/thk_ans_format_reward": 1.0, "step": 2202, "think_completion_length": 7.958333333333334 }, { "clip_ratio": 0.0, "completion_length": 134.9479217529297, "epoch": 7.441821247892074, "grad_norm": 7.429789500024564, "kl": 0.6259765625, "learning_rate": 3.79786036036036e-07, "loss": 0.0007, "reward": 3.661327838897705, "reward_std": 0.0312417505774647, "rewards/final_reward": 1.4290566956752653, "rewards/mask_iou_reward": 0.7145283478376326, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6613278985023499, "rewards/thk_ans_format_reward": 1.0, "step": 2203, "think_completion_length": 7.958333333333334 }, { "clip_ratio": 0.0, "completion_length": 166.92708587646484, "epoch": 7.445193929173693, "grad_norm": 24.441487691143756, "kl": 0.85546875, "learning_rate": 3.795045045045045e-07, "loss": 0.0009, "reward": 3.331498861312866, "reward_std": 0.040046393405646086, "rewards/final_reward": 1.232488613551466, "rewards/mask_iou_reward": 0.616244306775733, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3314987421035767, "rewards/thk_ans_format_reward": 1.0, "step": 2204, "think_completion_length": 7.083333333333333 }, { "clip_ratio": 0.0, "completion_length": 189.09375, "epoch": 7.448566610455312, "grad_norm": 14.548716922710366, "kl": 0.4306640625, "learning_rate": 3.7922297297297294e-07, "loss": 0.0004, "reward": 3.6373748779296875, "reward_std": 0.09864117112010717, "rewards/final_reward": 1.7490384938743897, "rewards/mask_iou_reward": 0.8745192469371948, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6373746991157532, "rewards/thk_ans_format_reward": 1.0, "step": 2205, "think_completion_length": 7.75 }, { "clip_ratio": 0.0, "completion_length": 125.52083587646484, "epoch": 7.451939291736931, "grad_norm": 8.439231876587783, "kl": 0.580078125, "learning_rate": 3.789414414414414e-07, "loss": 0.0006, "reward": 3.783831477165222, "reward_std": 0.06918147206306458, "rewards/final_reward": 1.6706428733265701, "rewards/mask_iou_reward": 0.8353214366632851, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7838313579559326, "rewards/thk_ans_format_reward": 1.0, "step": 2206, "think_completion_length": 7.083333333333333 }, { "clip_ratio": 0.0, "completion_length": 148.0416717529297, "epoch": 7.455311973018549, "grad_norm": 13.586478486878896, "kl": 0.4736328125, "learning_rate": 3.7865990990990986e-07, "loss": 0.0005, "reward": 3.2348625659942627, "reward_std": 0.10381998401135206, "rewards/final_reward": 1.3652094540702542, "rewards/mask_iou_reward": 0.6826047270351271, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2348628044128418, "rewards/thk_ans_format_reward": 1.0, "step": 2207, "think_completion_length": 8.875 }, { "clip_ratio": 0.0, "completion_length": 195.28125762939453, "epoch": 7.458684654300169, "grad_norm": 8.91410597058018, "kl": 0.408203125, "learning_rate": 3.783783783783784e-07, "loss": 0.0004, "reward": 3.38576340675354, "reward_std": 0.05992165021598339, "rewards/final_reward": 1.5979352470525372, "rewards/mask_iou_reward": 0.7989676235262686, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3857632279396057, "rewards/thk_ans_format_reward": 1.0, "step": 2208, "think_completion_length": 8.041666666666666 }, { "clip_ratio": 0.0, "completion_length": 123.95833587646484, "epoch": 7.462057335581788, "grad_norm": 25.637237825695134, "kl": 0.509765625, "learning_rate": 3.7809684684684684e-07, "loss": 0.0005, "reward": 3.713690161705017, "reward_std": 0.03531708940863609, "rewards/final_reward": 1.3274247871289895, "rewards/mask_iou_reward": 0.6637123935644947, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.713690161705017, "rewards/thk_ans_format_reward": 1.0, "step": 2209, "think_completion_length": 8.875 }, { "clip_ratio": 0.0, "completion_length": 138.93750762939453, "epoch": 7.465430016863406, "grad_norm": 28.405959846536, "kl": 0.4443359375, "learning_rate": 3.778153153153153e-07, "loss": 0.0004, "reward": 3.681239604949951, "reward_std": 0.06277862191200256, "rewards/final_reward": 1.5637615545390429, "rewards/mask_iou_reward": 0.7818807772695214, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6812394857406616, "rewards/thk_ans_format_reward": 1.0, "step": 2210, "think_completion_length": 8.541666666666668 }, { "clip_ratio": 0.0, "completion_length": 146.42708587646484, "epoch": 7.4688026981450255, "grad_norm": 23.567063540409116, "kl": 0.44921875, "learning_rate": 3.7753378378378376e-07, "loss": 0.0005, "reward": 3.579433560371399, "reward_std": 0.04429387301206589, "rewards/final_reward": 1.4484530347445896, "rewards/mask_iou_reward": 0.7242265173722948, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5794333815574646, "rewards/thk_ans_format_reward": 1.0, "step": 2211, "think_completion_length": 9.416666666666668 }, { "clip_ratio": 0.0, "completion_length": 238.46876525878906, "epoch": 7.472175379426644, "grad_norm": 13.340321865561812, "kl": 0.6357421875, "learning_rate": 3.772522522522522e-07, "loss": 0.0006, "reward": 3.567931294441223, "reward_std": 0.0854704063385725, "rewards/final_reward": 1.5191784787802172, "rewards/mask_iou_reward": 0.7595892393901086, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5679312944412231, "rewards/thk_ans_format_reward": 1.0, "step": 2212, "think_completion_length": 7.375 }, { "clip_ratio": 0.0, "completion_length": 177.92708587646484, "epoch": 7.475548060708263, "grad_norm": 19.779998859375922, "kl": 0.4296875, "learning_rate": 3.7697072072072073e-07, "loss": 0.0004, "reward": 3.3631173372268677, "reward_std": 0.09759453311562538, "rewards/final_reward": 1.1787163009693873, "rewards/mask_iou_reward": 0.5893581504846936, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3631173372268677, "rewards/thk_ans_format_reward": 1.0, "step": 2213, "think_completion_length": 7.041666666666666 }, { "clip_ratio": 0.0, "completion_length": 125.375, "epoch": 7.4789207419898815, "grad_norm": 11.414583103038034, "kl": 0.5224609375, "learning_rate": 3.766891891891892e-07, "loss": 0.0005, "reward": 3.526229500770569, "reward_std": 0.09345915447920561, "rewards/final_reward": 1.2993992131816308, "rewards/mask_iou_reward": 0.6496996065908154, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.526229441165924, "rewards/thk_ans_format_reward": 1.0, "step": 2214, "think_completion_length": 7.625 }, { "clip_ratio": 0.0, "completion_length": 172.45833587646484, "epoch": 7.482293423271501, "grad_norm": 93.76734649917043, "kl": 0.4921875, "learning_rate": 3.7640765765765765e-07, "loss": 0.0005, "reward": 3.3889803886413574, "reward_std": 0.05893061310052872, "rewards/final_reward": 1.1675987035576778, "rewards/mask_iou_reward": 0.5837993517788389, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.388980507850647, "rewards/thk_ans_format_reward": 1.0, "step": 2215, "think_completion_length": 7.708333333333334 }, { "clip_ratio": 0.0, "completion_length": 143.61459350585938, "epoch": 7.48566610455312, "grad_norm": 9.911396228124637, "kl": 0.630859375, "learning_rate": 3.761261261261261e-07, "loss": 0.0006, "reward": 3.6827415227890015, "reward_std": 0.03132602386176586, "rewards/final_reward": 1.7700213586978424, "rewards/mask_iou_reward": 0.8850106793489212, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6827413439750671, "rewards/thk_ans_format_reward": 1.0, "step": 2216, "think_completion_length": 7.333333333333334 }, { "clip_ratio": 0.0, "completion_length": 154.06250762939453, "epoch": 7.4890387858347385, "grad_norm": 10.90871479195432, "kl": 0.4384765625, "learning_rate": 3.758445945945946e-07, "loss": 0.0004, "reward": 3.661782741546631, "reward_std": 0.04503993829712272, "rewards/final_reward": 1.3926323886832095, "rewards/mask_iou_reward": 0.6963161943416047, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.661782443523407, "rewards/thk_ans_format_reward": 1.0, "step": 2217, "think_completion_length": 7.875 }, { "clip_ratio": 0.0, "completion_length": 159.2291717529297, "epoch": 7.492411467116358, "grad_norm": 11.99902117384724, "kl": 0.544921875, "learning_rate": 3.755630630630631e-07, "loss": 0.0006, "reward": 3.598073363304138, "reward_std": 0.08071838691830635, "rewards/final_reward": 1.4846668315120548, "rewards/mask_iou_reward": 0.7423334157560274, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5980734825134277, "rewards/thk_ans_format_reward": 1.0, "step": 2218, "think_completion_length": 8.458333333333334 }, { "clip_ratio": 0.0, "completion_length": 161.23958587646484, "epoch": 7.495784148397976, "grad_norm": 8.12535465891368, "kl": 0.48046875, "learning_rate": 3.7528153153153155e-07, "loss": 0.0005, "reward": 3.6236926317214966, "reward_std": 0.0668908916413784, "rewards/final_reward": 1.5927653023144133, "rewards/mask_iou_reward": 0.7963826511572066, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.623692274093628, "rewards/thk_ans_format_reward": 1.0, "step": 2219, "think_completion_length": 7.416666666666666 }, { "clip_ratio": 0.0, "completion_length": 152.98959350585938, "epoch": 7.499156829679595, "grad_norm": 6.679490208300435, "kl": 0.515625, "learning_rate": 3.75e-07, "loss": 0.0005, "reward": 3.4468973875045776, "reward_std": 0.05854324251413345, "rewards/final_reward": 1.732786586934808, "rewards/mask_iou_reward": 0.866393293467404, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.446897566318512, "rewards/thk_ans_format_reward": 1.0, "step": 2220, "think_completion_length": 7.041666666666667 }, { "clip_ratio": 0.0, "completion_length": 210.89583587646484, "epoch": 7.502529510961214, "grad_norm": 45.747778673697404, "kl": 0.4326171875, "learning_rate": 3.747184684684684e-07, "loss": 0.0004, "reward": 3.609244465827942, "reward_std": 0.06323170848190784, "rewards/final_reward": 1.0002317736868498, "rewards/mask_iou_reward": 0.5001158868434249, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6092444062232971, "rewards/thk_ans_format_reward": 1.0, "step": 2221, "think_completion_length": 8.375 }, { "clip_ratio": 0.0, "completion_length": 192.67708587646484, "epoch": 7.505902192242833, "grad_norm": 32.9355415535747, "kl": 0.439453125, "learning_rate": 3.744369369369369e-07, "loss": 0.0004, "reward": 3.7596893310546875, "reward_std": 0.06496717035770416, "rewards/final_reward": 1.7733527351530844, "rewards/mask_iou_reward": 0.8866763675765422, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7596891522407532, "rewards/thk_ans_format_reward": 1.0, "step": 2222, "think_completion_length": 8.041666666666666 }, { "clip_ratio": 0.0, "completion_length": 225.81250762939453, "epoch": 7.509274873524452, "grad_norm": 18.282761151795988, "kl": 0.4287109375, "learning_rate": 3.741554054054054e-07, "loss": 0.0004, "reward": 3.25011944770813, "reward_std": 0.1788785234093666, "rewards/final_reward": 1.7937153522651648, "rewards/mask_iou_reward": 0.8968576761325824, "rewards/sam_format_reward": 0.9479166865348816, "rewards/sam_reward_func_ultra": 1.354286015033722, "rewards/thk_ans_format_reward": 0.9479166865348816, "step": 2223, "think_completion_length": 7.25 }, { "clip_ratio": 0.0, "completion_length": 169.48958587646484, "epoch": 7.512647554806071, "grad_norm": 11.784405940817232, "kl": 0.5439453125, "learning_rate": 3.7387387387387385e-07, "loss": 0.0005, "reward": 3.707550287246704, "reward_std": 0.06585472077131271, "rewards/final_reward": 1.5084785123889164, "rewards/mask_iou_reward": 0.7542392561944582, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7075501680374146, "rewards/thk_ans_format_reward": 1.0, "step": 2224, "think_completion_length": 7.708333333333334 }, { "clip_ratio": 0.0, "completion_length": 147.2291717529297, "epoch": 7.51602023608769, "grad_norm": 517.8951212717884, "kl": 0.548828125, "learning_rate": 3.735923423423423e-07, "loss": 0.0006, "reward": 3.596574544906616, "reward_std": 0.12409070134162903, "rewards/final_reward": 1.5688267024199112, "rewards/mask_iou_reward": 0.7844133512099556, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5965744256973267, "rewards/thk_ans_format_reward": 1.0, "step": 2225, "think_completion_length": 8.125 }, { "clip_ratio": 0.0, "completion_length": 196.1979217529297, "epoch": 7.519392917369308, "grad_norm": 10.548538981689129, "kl": 0.3935546875, "learning_rate": 3.7331081081081077e-07, "loss": 0.0004, "reward": 3.405883550643921, "reward_std": 0.06507723964750767, "rewards/final_reward": 1.6584953532713012, "rewards/mask_iou_reward": 0.8292476766356506, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.405883550643921, "rewards/thk_ans_format_reward": 1.0, "step": 2226, "think_completion_length": 7.666666666666667 }, { "clip_ratio": 0.0, "completion_length": 126.22916793823242, "epoch": 7.522765598650928, "grad_norm": 7.929043652698762, "kl": 0.4267578125, "learning_rate": 3.7302927927927923e-07, "loss": 0.0004, "reward": 3.7863839864730835, "reward_std": 0.017993359360843897, "rewards/final_reward": 1.435606746177593, "rewards/mask_iou_reward": 0.7178033730887965, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7863839864730835, "rewards/thk_ans_format_reward": 1.0, "step": 2227, "think_completion_length": 8.25 }, { "clip_ratio": 0.0, "completion_length": 185.90625, "epoch": 7.526138279932546, "grad_norm": 8.908029448338619, "kl": 0.4423828125, "learning_rate": 3.7274774774774775e-07, "loss": 0.0004, "reward": 3.4020566940307617, "reward_std": 0.10786337032914162, "rewards/final_reward": 1.4677611612201815, "rewards/mask_iou_reward": 0.7338805806100908, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4020565748214722, "rewards/thk_ans_format_reward": 1.0, "step": 2228, "think_completion_length": 7.583333333333333 }, { "clip_ratio": 0.0, "completion_length": 176.34375762939453, "epoch": 7.529510961214165, "grad_norm": 11.043197813990117, "kl": 0.669921875, "learning_rate": 3.724662162162162e-07, "loss": 0.0007, "reward": 3.0677175521850586, "reward_std": 0.11405624449253082, "rewards/final_reward": 1.335943431280947, "rewards/mask_iou_reward": 0.6679717156404735, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0677174925804138, "rewards/thk_ans_format_reward": 1.0, "step": 2229, "think_completion_length": 7.291666666666666 }, { "clip_ratio": 0.0, "completion_length": 191.3541717529297, "epoch": 7.532883642495785, "grad_norm": 6.752553142615018, "kl": 0.4365234375, "learning_rate": 3.7218468468468467e-07, "loss": 0.0005, "reward": 3.2327537536621094, "reward_std": 0.06896964088082314, "rewards/final_reward": 1.8958117133631451, "rewards/mask_iou_reward": 0.9479058566815726, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2327535152435303, "rewards/thk_ans_format_reward": 1.0, "step": 2230, "think_completion_length": 8.791666666666668 }, { "clip_ratio": 0.0, "completion_length": 140.7916717529297, "epoch": 7.536256323777403, "grad_norm": 5.607542004004971, "kl": 0.4560546875, "learning_rate": 3.7190315315315313e-07, "loss": 0.0004, "reward": 3.449102759361267, "reward_std": 0.02281183283776045, "rewards/final_reward": 1.764716877455844, "rewards/mask_iou_reward": 0.882358438727922, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4491026401519775, "rewards/thk_ans_format_reward": 1.0, "step": 2231, "think_completion_length": 6.208333333333334 }, { "clip_ratio": 0.0, "completion_length": 174.25000762939453, "epoch": 7.539629005059022, "grad_norm": 16.474163586412292, "kl": 0.4541015625, "learning_rate": 3.716216216216216e-07, "loss": 0.0005, "reward": 3.6877329349517822, "reward_std": 0.05621516332030296, "rewards/final_reward": 1.5877485930388506, "rewards/mask_iou_reward": 0.7938742965194253, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6877328157424927, "rewards/thk_ans_format_reward": 1.0, "step": 2232, "think_completion_length": 6.916666666666666 }, { "clip_ratio": 0.0, "completion_length": 146.65625762939453, "epoch": 7.543001686340641, "grad_norm": 85.02315344867714, "kl": 0.431640625, "learning_rate": 3.713400900900901e-07, "loss": 0.0004, "reward": 3.723360538482666, "reward_std": 0.02233183290809393, "rewards/final_reward": 1.5680750089532776, "rewards/mask_iou_reward": 0.7840375044766388, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7233604192733765, "rewards/thk_ans_format_reward": 1.0, "step": 2233, "think_completion_length": 7.208333333333334 }, { "clip_ratio": 0.0, "completion_length": 226.4166717529297, "epoch": 7.54637436762226, "grad_norm": 9.119756514703937, "kl": 0.3876953125, "learning_rate": 3.7105855855855856e-07, "loss": 0.0004, "reward": 3.675152897834778, "reward_std": 0.04308299534022808, "rewards/final_reward": 1.6556049828306034, "rewards/mask_iou_reward": 0.8278024914153017, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.675152838230133, "rewards/thk_ans_format_reward": 1.0, "step": 2234, "think_completion_length": 8.541666666666666 }, { "clip_ratio": 0.0, "completion_length": 250.47917938232422, "epoch": 7.549747048903878, "grad_norm": 21.538090314289647, "kl": 0.4794921875, "learning_rate": 3.70777027027027e-07, "loss": 0.0005, "reward": 3.4606536626815796, "reward_std": 0.2695749457925558, "rewards/final_reward": 1.6050519190494144, "rewards/mask_iou_reward": 0.8025259595247072, "rewards/sam_format_reward": 0.9583333432674408, "rewards/sam_reward_func_ultra": 1.543986976146698, "rewards/thk_ans_format_reward": 0.9583333432674408, "step": 2235, "think_completion_length": 8.291666666666666 }, { "clip_ratio": 0.0, "completion_length": 334.2395935058594, "epoch": 7.5531197301854975, "grad_norm": 13.49060468083085, "kl": 0.42578125, "learning_rate": 3.704954954954955e-07, "loss": 0.0004, "reward": 3.408440351486206, "reward_std": 0.4749724715948105, "rewards/final_reward": 1.6407366601027256, "rewards/mask_iou_reward": 0.8203683300513628, "rewards/sam_format_reward": 0.9583333432674408, "rewards/sam_reward_func_ultra": 1.4917737245559692, "rewards/thk_ans_format_reward": 0.9583333432674408, "step": 2236, "think_completion_length": 6.75 }, { "clip_ratio": 0.0, "completion_length": 186.625, "epoch": 7.556492411467117, "grad_norm": 8.739897004231855, "kl": 0.4140625, "learning_rate": 3.7021396396396395e-07, "loss": 0.0004, "reward": 3.3929598331451416, "reward_std": 0.11936522647738457, "rewards/final_reward": 1.5154707608922555, "rewards/mask_iou_reward": 0.7577353804461278, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3929598331451416, "rewards/thk_ans_format_reward": 1.0, "step": 2237, "think_completion_length": 7.791666666666666 }, { "clip_ratio": 0.0, "completion_length": 213.1041717529297, "epoch": 7.559865092748735, "grad_norm": 5.920106396877477, "kl": 0.4296875, "learning_rate": 3.6993243243243246e-07, "loss": 0.0004, "reward": 3.253367304801941, "reward_std": 0.18102075904607773, "rewards/final_reward": 0.6303426610613139, "rewards/mask_iou_reward": 0.31517133053065693, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2533673644065857, "rewards/thk_ans_format_reward": 1.0, "step": 2238, "think_completion_length": 7.708333333333334 }, { "clip_ratio": 0.0, "completion_length": 229.55208587646484, "epoch": 7.5632377740303545, "grad_norm": 9.261427448407403, "kl": 0.4267578125, "learning_rate": 3.696509009009009e-07, "loss": 0.0005, "reward": 3.3558290004730225, "reward_std": 0.1935933530330658, "rewards/final_reward": 1.7297704820722961, "rewards/mask_iou_reward": 0.8648852410361481, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.355828881263733, "rewards/thk_ans_format_reward": 1.0, "step": 2239, "think_completion_length": 7.208333333333334 }, { "clip_ratio": 0.0, "completion_length": 160.40625, "epoch": 7.566610455311973, "grad_norm": 8.221685272658684, "kl": 0.4716796875, "learning_rate": 3.6936936936936933e-07, "loss": 0.0005, "reward": 3.681155204772949, "reward_std": 0.026889142580330372, "rewards/final_reward": 1.117077239786284, "rewards/mask_iou_reward": 0.558538619893142, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6811550855636597, "rewards/thk_ans_format_reward": 1.0, "step": 2240, "think_completion_length": 8.625 }, { "clip_ratio": 0.0, "completion_length": 223.56250762939453, "epoch": 7.569983136593592, "grad_norm": 18.127114567662925, "kl": 0.4677734375, "learning_rate": 3.690878378378378e-07, "loss": 0.0005, "reward": 3.411197304725647, "reward_std": 0.08188456669449806, "rewards/final_reward": 1.3459034129007557, "rewards/mask_iou_reward": 0.6729517064503778, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4111971855163574, "rewards/thk_ans_format_reward": 1.0, "step": 2241, "think_completion_length": 7.125 }, { "clip_ratio": 0.0, "completion_length": 178.89583587646484, "epoch": 7.5733558178752105, "grad_norm": 12.244729010105242, "kl": 0.76171875, "learning_rate": 3.6880630630630625e-07, "loss": 0.0008, "reward": 3.693773627281189, "reward_std": 0.11770599335432053, "rewards/final_reward": 1.858820377313907, "rewards/mask_iou_reward": 0.9294101886569535, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6937735080718994, "rewards/thk_ans_format_reward": 1.0, "step": 2242, "think_completion_length": 8.5 }, { "clip_ratio": 0.0, "completion_length": 177.14583587646484, "epoch": 7.57672849915683, "grad_norm": 10.033573999591129, "kl": 0.4765625, "learning_rate": 3.6852477477477476e-07, "loss": 0.0005, "reward": 3.378702402114868, "reward_std": 0.1148904599249363, "rewards/final_reward": 1.559600441550097, "rewards/mask_iou_reward": 0.7798002207750485, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3787025213241577, "rewards/thk_ans_format_reward": 1.0, "step": 2243, "think_completion_length": 9.291666666666666 }, { "clip_ratio": 0.0, "completion_length": 196.4166717529297, "epoch": 7.580101180438449, "grad_norm": 6.1941069990035835, "kl": 0.453125, "learning_rate": 3.682432432432432e-07, "loss": 0.0004, "reward": 3.464112401008606, "reward_std": 0.08145070215687156, "rewards/final_reward": 1.6657504073543956, "rewards/mask_iou_reward": 0.8328752036771978, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4641121625900269, "rewards/thk_ans_format_reward": 1.0, "step": 2244, "think_completion_length": 6.75 }, { "clip_ratio": 0.0, "completion_length": 223.6979217529297, "epoch": 7.583473861720067, "grad_norm": 30.140629304396885, "kl": 0.4091796875, "learning_rate": 3.679617117117117e-07, "loss": 0.0004, "reward": 3.6752060651779175, "reward_std": 0.03587649203836918, "rewards/final_reward": 1.2118360177049143, "rewards/mask_iou_reward": 0.6059180088524572, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6752062439918518, "rewards/thk_ans_format_reward": 1.0, "step": 2245, "think_completion_length": 7.833333333333334 }, { "clip_ratio": 0.0, "completion_length": 183.1041717529297, "epoch": 7.586846543001687, "grad_norm": 7.843757555164772, "kl": 0.5078125, "learning_rate": 3.6768018018018015e-07, "loss": 0.0005, "reward": 3.5074230432510376, "reward_std": 0.14821650087833405, "rewards/final_reward": 1.4525049018818945, "rewards/mask_iou_reward": 0.7262524509409473, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5074230432510376, "rewards/thk_ans_format_reward": 1.0, "step": 2246, "think_completion_length": 7.708333333333334 }, { "clip_ratio": 0.0, "completion_length": 185.87500762939453, "epoch": 7.590219224283305, "grad_norm": 37.07479306445186, "kl": 0.4560546875, "learning_rate": 3.673986486486486e-07, "loss": 0.0005, "reward": 3.6342554092407227, "reward_std": 0.12161976844072342, "rewards/final_reward": 1.6356422989189932, "rewards/mask_iou_reward": 0.8178211494594966, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6342553496360779, "rewards/thk_ans_format_reward": 1.0, "step": 2247, "think_completion_length": 7.208333333333334 }, { "clip_ratio": 0.0, "completion_length": 175.45833587646484, "epoch": 7.593591905564924, "grad_norm": 13.198979193087293, "kl": 0.486328125, "learning_rate": 3.671171171171171e-07, "loss": 0.0005, "reward": 3.4941141605377197, "reward_std": 0.12330342456698418, "rewards/final_reward": 1.6368577190460187, "rewards/mask_iou_reward": 0.8184288595230094, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4941142201423645, "rewards/thk_ans_format_reward": 1.0, "step": 2248, "think_completion_length": 7.0 }, { "clip_ratio": 0.0, "completion_length": 173.2604217529297, "epoch": 7.596964586846543, "grad_norm": 14.38327466152256, "kl": 0.458984375, "learning_rate": 3.668355855855856e-07, "loss": 0.0005, "reward": 3.3377726078033447, "reward_std": 0.12106435745954514, "rewards/final_reward": 1.4980455248753946, "rewards/mask_iou_reward": 0.7490227624376973, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3377727270126343, "rewards/thk_ans_format_reward": 1.0, "step": 2249, "think_completion_length": 7.791666666666666 }, { "clip_ratio": 0.0, "completion_length": 145.89584350585938, "epoch": 7.600337268128162, "grad_norm": 17.91416919788979, "kl": 0.763671875, "learning_rate": 3.6655405405405404e-07, "loss": 0.0008, "reward": 3.4967925548553467, "reward_std": 0.039044877514243126, "rewards/final_reward": 1.9765416585709925, "rewards/mask_iou_reward": 0.9882708292854963, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4967926740646362, "rewards/thk_ans_format_reward": 1.0, "step": 2250, "think_completion_length": 8.125 }, { "clip_ratio": 0.0, "completion_length": 226.31251525878906, "epoch": 7.60370994940978, "grad_norm": 7.692315822711692, "kl": 0.4248046875, "learning_rate": 3.662725225225225e-07, "loss": 0.0004, "reward": 3.5274364948272705, "reward_std": 0.24068910256028175, "rewards/final_reward": 1.848031531631317, "rewards/mask_iou_reward": 0.9240157658156585, "rewards/sam_format_reward": 0.9583333432674408, "rewards/sam_reward_func_ultra": 1.6107696294784546, "rewards/thk_ans_format_reward": 0.9583333432674408, "step": 2251, "think_completion_length": 8.291666666666668 }, { "clip_ratio": 0.0, "completion_length": 134.82291793823242, "epoch": 7.6070826306914, "grad_norm": 24.759656039202838, "kl": 0.54296875, "learning_rate": 3.6599099099099096e-07, "loss": 0.0006, "reward": 3.594699501991272, "reward_std": 0.03100848849862814, "rewards/final_reward": 1.705501867635372, "rewards/mask_iou_reward": 0.852750933817686, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5946993827819824, "rewards/thk_ans_format_reward": 1.0, "step": 2252, "think_completion_length": 8.083333333333334 }, { "clip_ratio": 0.0, "completion_length": 119.28125381469727, "epoch": 7.610455311973018, "grad_norm": 12.021081013771163, "kl": 0.4404296875, "learning_rate": 3.657094594594595e-07, "loss": 0.0004, "reward": 3.546002149581909, "reward_std": 0.061210453510284424, "rewards/final_reward": 1.1353577830401829, "rewards/mask_iou_reward": 0.5676788915200914, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5460022687911987, "rewards/thk_ans_format_reward": 1.0, "step": 2253, "think_completion_length": 8.208333333333332 }, { "clip_ratio": 0.0, "completion_length": 162.44791793823242, "epoch": 7.613827993254637, "grad_norm": 11.792513231710862, "kl": 0.525390625, "learning_rate": 3.6542792792792794e-07, "loss": 0.0006, "reward": 3.6590678691864014, "reward_std": 0.03704315610229969, "rewards/final_reward": 1.9048289909962175, "rewards/mask_iou_reward": 0.9524144954981087, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6590678095817566, "rewards/thk_ans_format_reward": 1.0, "step": 2254, "think_completion_length": 9.458333333333334 }, { "clip_ratio": 0.0, "completion_length": 186.7291717529297, "epoch": 7.617200674536257, "grad_norm": 8.792350098249111, "kl": 0.5498046875, "learning_rate": 3.651463963963964e-07, "loss": 0.0006, "reward": 3.502922534942627, "reward_std": 0.029376371763646603, "rewards/final_reward": 1.2339888142029136, "rewards/mask_iou_reward": 0.6169944071014568, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5029225945472717, "rewards/thk_ans_format_reward": 1.0, "step": 2255, "think_completion_length": 8.625 }, { "clip_ratio": 0.0, "completion_length": 162.2916717529297, "epoch": 7.620573355817875, "grad_norm": 16.987057100099697, "kl": 0.583984375, "learning_rate": 3.6486486486486486e-07, "loss": 0.0007, "reward": 3.826703906059265, "reward_std": 0.06088973954319954, "rewards/final_reward": 1.7263040467134716, "rewards/mask_iou_reward": 0.8631520233567358, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.8267039060592651, "rewards/thk_ans_format_reward": 1.0, "step": 2256, "think_completion_length": 7.833333333333333 }, { "clip_ratio": 0.0, "completion_length": 174.96875, "epoch": 7.623946037099494, "grad_norm": 23.764417563866544, "kl": 0.4296875, "learning_rate": 3.645833333333333e-07, "loss": 0.0004, "reward": 3.435308575630188, "reward_std": 0.062496624886989594, "rewards/final_reward": 1.8658596923240005, "rewards/mask_iou_reward": 0.9329298461620003, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4353084564208984, "rewards/thk_ans_format_reward": 1.0, "step": 2257, "think_completion_length": 6.875 }, { "clip_ratio": 0.0, "completion_length": 147.7604217529297, "epoch": 7.627318718381113, "grad_norm": 10.45185473996097, "kl": 0.53515625, "learning_rate": 3.6430180180180183e-07, "loss": 0.0005, "reward": 3.4529281854629517, "reward_std": 0.11622785404324532, "rewards/final_reward": 1.047259343234454, "rewards/mask_iou_reward": 0.523629671617227, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4529281854629517, "rewards/thk_ans_format_reward": 1.0, "step": 2258, "think_completion_length": 8.458333333333332 }, { "clip_ratio": 0.0, "completion_length": 151.76041793823242, "epoch": 7.630691399662732, "grad_norm": 9.249642151291475, "kl": 0.896484375, "learning_rate": 3.640202702702703e-07, "loss": 0.0009, "reward": 3.285198450088501, "reward_std": 0.06478903815150261, "rewards/final_reward": 0.8379335323225162, "rewards/mask_iou_reward": 0.4189667661612581, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2851983308792114, "rewards/thk_ans_format_reward": 1.0, "step": 2259, "think_completion_length": 8.208333333333332 }, { "clip_ratio": 0.0, "completion_length": 125.93750381469727, "epoch": 7.63406408094435, "grad_norm": 10.101942600327556, "kl": 0.46484375, "learning_rate": 3.637387387387387e-07, "loss": 0.0005, "reward": 3.510848045349121, "reward_std": 0.07667689025402069, "rewards/final_reward": 1.0392154602383656, "rewards/mask_iou_reward": 0.5196077301191828, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5108479857444763, "rewards/thk_ans_format_reward": 1.0, "step": 2260, "think_completion_length": 7.75 }, { "clip_ratio": 0.0, "completion_length": 124.71875, "epoch": 7.63743676222597, "grad_norm": 14.927874216703648, "kl": 0.61328125, "learning_rate": 3.6345720720720716e-07, "loss": 0.0006, "reward": 3.684826970100403, "reward_std": 0.05714831594377756, "rewards/final_reward": 1.916309884839134, "rewards/mask_iou_reward": 0.958154942419567, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6848266124725342, "rewards/thk_ans_format_reward": 1.0, "step": 2261, "think_completion_length": 8.958333333333332 }, { "clip_ratio": 0.0, "completion_length": 193.67708587646484, "epoch": 7.640809443507589, "grad_norm": 41.60265407330587, "kl": 0.4482421875, "learning_rate": 3.631756756756756e-07, "loss": 0.0005, "reward": 3.417069673538208, "reward_std": 0.10894013848155737, "rewards/final_reward": 1.7192058449284477, "rewards/mask_iou_reward": 0.8596029224642239, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 1.4274863004684448, "rewards/thk_ans_format_reward": 1.0, "step": 2262, "think_completion_length": 8.291666666666666 }, { "clip_ratio": 0.0, "completion_length": 124.3125, "epoch": 7.644182124789207, "grad_norm": 8.390634669842632, "kl": 0.4814453125, "learning_rate": 3.6289414414414414e-07, "loss": 0.0005, "reward": 3.4856581687927246, "reward_std": 0.14437636360526085, "rewards/final_reward": 1.4299188208280276, "rewards/mask_iou_reward": 0.7149594104140138, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4856582283973694, "rewards/thk_ans_format_reward": 1.0, "step": 2263, "think_completion_length": 8.375 }, { "clip_ratio": 0.0, "completion_length": 140.80209350585938, "epoch": 7.6475548060708265, "grad_norm": 15.907862387941135, "kl": 1.001953125, "learning_rate": 3.626126126126126e-07, "loss": 0.001, "reward": 3.6028738021850586, "reward_std": 0.0683306735008955, "rewards/final_reward": 1.3892481459049233, "rewards/mask_iou_reward": 0.6946240729524616, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6028736233711243, "rewards/thk_ans_format_reward": 1.0, "step": 2264, "think_completion_length": 9.083333333333332 }, { "clip_ratio": 0.0, "completion_length": 149.4479217529297, "epoch": 7.650927487352445, "grad_norm": 11.635154219046655, "kl": 0.4150390625, "learning_rate": 3.6233108108108106e-07, "loss": 0.0004, "reward": 3.636030435562134, "reward_std": 0.054130956530570984, "rewards/final_reward": 1.567502128195345, "rewards/mask_iou_reward": 0.7837510640976725, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.636030375957489, "rewards/thk_ans_format_reward": 1.0, "step": 2265, "think_completion_length": 7.666666666666666 }, { "clip_ratio": 0.0, "completion_length": 155.02083587646484, "epoch": 7.654300168634064, "grad_norm": 13.225457282383763, "kl": 0.529296875, "learning_rate": 3.620495495495495e-07, "loss": 0.0005, "reward": 3.525128722190857, "reward_std": 0.054051365703344345, "rewards/final_reward": 1.942525280419543, "rewards/mask_iou_reward": 0.9712626402097715, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5251285433769226, "rewards/thk_ans_format_reward": 1.0, "step": 2266, "think_completion_length": 9.083333333333332 }, { "clip_ratio": 0.0, "completion_length": 122.9375, "epoch": 7.6576728499156825, "grad_norm": 7.194650640225918, "kl": 0.583984375, "learning_rate": 3.61768018018018e-07, "loss": 0.0006, "reward": 3.3762385845184326, "reward_std": 0.1294691450893879, "rewards/final_reward": 1.1971829463789943, "rewards/mask_iou_reward": 0.5985914731894971, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3762387037277222, "rewards/thk_ans_format_reward": 1.0, "step": 2267, "think_completion_length": 7.958333333333333 }, { "clip_ratio": 0.0, "completion_length": 179.34375762939453, "epoch": 7.661045531197302, "grad_norm": 8.525852262401335, "kl": 0.4208984375, "learning_rate": 3.614864864864865e-07, "loss": 0.0004, "reward": 3.744857430458069, "reward_std": 0.030906444415450096, "rewards/final_reward": 1.4886834060598715, "rewards/mask_iou_reward": 0.7443417030299357, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7448573112487793, "rewards/thk_ans_format_reward": 1.0, "step": 2268, "think_completion_length": 8.583333333333332 }, { "clip_ratio": 0.0, "completion_length": 168.18750762939453, "epoch": 7.664418212478921, "grad_norm": 16.675502796446537, "kl": 0.458984375, "learning_rate": 3.6120495495495495e-07, "loss": 0.0004, "reward": 3.4693796634674072, "reward_std": 0.13334699161350727, "rewards/final_reward": 1.0773209848085428, "rewards/mask_iou_reward": 0.5386604924042714, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4693796038627625, "rewards/thk_ans_format_reward": 1.0, "step": 2269, "think_completion_length": 8.666666666666668 }, { "clip_ratio": 0.0, "completion_length": 197.5729217529297, "epoch": 7.6677908937605395, "grad_norm": 8.558349809131897, "kl": 0.419921875, "learning_rate": 3.609234234234234e-07, "loss": 0.0004, "reward": 3.4815101623535156, "reward_std": 0.07516926433891058, "rewards/final_reward": 1.1391895798047658, "rewards/mask_iou_reward": 0.5695947899023829, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4815101027488708, "rewards/thk_ans_format_reward": 1.0, "step": 2270, "think_completion_length": 8.458333333333334 }, { "clip_ratio": 0.0, "completion_length": 191.62500762939453, "epoch": 7.671163575042159, "grad_norm": 8.90441635462389, "kl": 0.474609375, "learning_rate": 3.606418918918919e-07, "loss": 0.0005, "reward": 3.58591091632843, "reward_std": 0.052635351195931435, "rewards/final_reward": 1.8259667105500643, "rewards/mask_iou_reward": 0.9129833552750322, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5859108567237854, "rewards/thk_ans_format_reward": 1.0, "step": 2271, "think_completion_length": 9.083333333333332 }, { "clip_ratio": 0.0, "completion_length": 141.20833587646484, "epoch": 7.674536256323777, "grad_norm": 18.882738239796144, "kl": 0.6806640625, "learning_rate": 3.6036036036036033e-07, "loss": 0.0007, "reward": 3.450736880302429, "reward_std": 0.12018753960728645, "rewards/final_reward": 1.5825101953239598, "rewards/mask_iou_reward": 0.7912550976619799, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4507368803024292, "rewards/thk_ans_format_reward": 1.0, "step": 2272, "think_completion_length": 6.875 }, { "clip_ratio": 0.0, "completion_length": 165.43750762939453, "epoch": 7.677908937605396, "grad_norm": 8.664488502796393, "kl": 0.4912109375, "learning_rate": 3.6007882882882885e-07, "loss": 0.0005, "reward": 3.5445717573165894, "reward_std": 0.05564088374376297, "rewards/final_reward": 1.3317285713213023, "rewards/mask_iou_reward": 0.6658642856606511, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5445716381072998, "rewards/thk_ans_format_reward": 1.0, "step": 2273, "think_completion_length": 8.875 }, { "clip_ratio": 0.0, "completion_length": 164.58333587646484, "epoch": 7.681281618887015, "grad_norm": 34.33115400832227, "kl": 0.4326171875, "learning_rate": 3.597972972972973e-07, "loss": 0.0005, "reward": 3.6770211458206177, "reward_std": 0.03603087249211967, "rewards/final_reward": 1.1484784869257847, "rewards/mask_iou_reward": 0.5742392434628923, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6770211458206177, "rewards/thk_ans_format_reward": 1.0, "step": 2274, "think_completion_length": 7.791666666666666 }, { "clip_ratio": 0.0, "completion_length": 146.65625, "epoch": 7.684654300168634, "grad_norm": 29.052257351781336, "kl": 0.4716796875, "learning_rate": 3.5951576576576577e-07, "loss": 0.0005, "reward": 3.1839051246643066, "reward_std": 0.17020989954471588, "rewards/final_reward": 1.1850262935917633, "rewards/mask_iou_reward": 0.5925131467958816, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.183905005455017, "rewards/thk_ans_format_reward": 1.0, "step": 2275, "think_completion_length": 9.208333333333332 }, { "clip_ratio": 0.0, "completion_length": 142.09375, "epoch": 7.688026981450253, "grad_norm": 39.52562117914507, "kl": 0.859375, "learning_rate": 3.5923423423423423e-07, "loss": 0.0009, "reward": 3.5432727336883545, "reward_std": 0.08223596028983593, "rewards/final_reward": 1.9002105295025138, "rewards/mask_iou_reward": 0.9501052647512569, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5432727932929993, "rewards/thk_ans_format_reward": 1.0, "step": 2276, "think_completion_length": 8.125 }, { "clip_ratio": 0.0, "completion_length": 146.6666717529297, "epoch": 7.691399662731872, "grad_norm": 16.67426749316441, "kl": 0.529296875, "learning_rate": 3.589527027027027e-07, "loss": 0.0006, "reward": 3.728038787841797, "reward_std": 0.0844535268843174, "rewards/final_reward": 1.647716684034463, "rewards/mask_iou_reward": 0.8238583420172315, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7280389070510864, "rewards/thk_ans_format_reward": 1.0, "step": 2277, "think_completion_length": 8.958333333333332 }, { "clip_ratio": 0.0, "completion_length": 126.72916793823242, "epoch": 7.694772344013491, "grad_norm": 7.016987101308896, "kl": 0.623046875, "learning_rate": 3.5867117117117115e-07, "loss": 0.0006, "reward": 3.1386752128601074, "reward_std": 0.13225263357162476, "rewards/final_reward": 0.19329672988784663, "rewards/mask_iou_reward": 0.09664836494392332, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1386749744415283, "rewards/thk_ans_format_reward": 1.0, "step": 2278, "think_completion_length": 8.25 }, { "clip_ratio": 0.0, "completion_length": 153.5104217529297, "epoch": 7.698145025295109, "grad_norm": 8.83488013053066, "kl": 0.58203125, "learning_rate": 3.5838963963963967e-07, "loss": 0.0006, "reward": 3.4244601726531982, "reward_std": 0.0693696178495884, "rewards/final_reward": 0.7691967355950156, "rewards/mask_iou_reward": 0.3845983677975078, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.424459993839264, "rewards/thk_ans_format_reward": 1.0, "step": 2279, "think_completion_length": 8.458333333333334 }, { "clip_ratio": 0.0, "completion_length": 142.3229217529297, "epoch": 7.701517706576729, "grad_norm": 17.366672119479563, "kl": 0.5244140625, "learning_rate": 3.5810810810810807e-07, "loss": 0.0005, "reward": 3.5201783180236816, "reward_std": 0.07162079215049744, "rewards/final_reward": 1.5854306102070996, "rewards/mask_iou_reward": 0.7927153051035498, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.520177960395813, "rewards/thk_ans_format_reward": 1.0, "step": 2280, "think_completion_length": 7.75 }, { "clip_ratio": 0.0, "completion_length": 135.3229217529297, "epoch": 7.704890387858347, "grad_norm": 11.9320751465435, "kl": 0.4443359375, "learning_rate": 3.5782657657657653e-07, "loss": 0.0005, "reward": 3.6812453269958496, "reward_std": 0.037463925778865814, "rewards/final_reward": 1.7041272989205765, "rewards/mask_iou_reward": 0.8520636494602882, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.68124520778656, "rewards/thk_ans_format_reward": 1.0, "step": 2281, "think_completion_length": 7.583333333333334 }, { "clip_ratio": 0.0, "completion_length": 125.5625, "epoch": 7.708263069139966, "grad_norm": 27.22726471913835, "kl": 0.494140625, "learning_rate": 3.57545045045045e-07, "loss": 0.0005, "reward": 3.6677199602127075, "reward_std": 0.07058407552540302, "rewards/final_reward": 1.2181321919761752, "rewards/mask_iou_reward": 0.6090660959880876, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6677199602127075, "rewards/thk_ans_format_reward": 1.0, "step": 2282, "think_completion_length": 8.375 }, { "clip_ratio": 0.0, "completion_length": 161.09375, "epoch": 7.7116357504215856, "grad_norm": 12.482225650157211, "kl": 0.51171875, "learning_rate": 3.5726351351351346e-07, "loss": 0.0005, "reward": 3.7093077898025513, "reward_std": 0.06627171486616135, "rewards/final_reward": 1.5520777488098996, "rewards/mask_iou_reward": 0.7760388744049498, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7093076705932617, "rewards/thk_ans_format_reward": 1.0, "step": 2283, "think_completion_length": 8.333333333333332 }, { "clip_ratio": 0.0, "completion_length": 150.36459350585938, "epoch": 7.715008431703204, "grad_norm": 7.097840578621795, "kl": 0.451171875, "learning_rate": 3.5698198198198197e-07, "loss": 0.0005, "reward": 3.471312642097473, "reward_std": 0.09290103241801262, "rewards/final_reward": 1.309364779661415, "rewards/mask_iou_reward": 0.6546823898307075, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4713128805160522, "rewards/thk_ans_format_reward": 1.0, "step": 2284, "think_completion_length": 8.541666666666668 }, { "clip_ratio": 0.0, "completion_length": 122.34375381469727, "epoch": 7.718381112984823, "grad_norm": 16.776373577928087, "kl": 1.1015625, "learning_rate": 3.5670045045045043e-07, "loss": 0.0011, "reward": 3.419792890548706, "reward_std": 0.1175164058804512, "rewards/final_reward": 1.0883838774256862, "rewards/mask_iou_reward": 0.5441919387128431, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4197928309440613, "rewards/thk_ans_format_reward": 1.0, "step": 2285, "think_completion_length": 8.583333333333334 }, { "clip_ratio": 0.0, "completion_length": 131.1979217529297, "epoch": 7.721753794266442, "grad_norm": 9.676650940022329, "kl": 0.509765625, "learning_rate": 3.564189189189189e-07, "loss": 0.0006, "reward": 3.515068531036377, "reward_std": 0.15917960554361343, "rewards/final_reward": 1.503861021723187, "rewards/mask_iou_reward": 0.7519305108615935, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5150684714317322, "rewards/thk_ans_format_reward": 1.0, "step": 2286, "think_completion_length": 8.416666666666666 }, { "clip_ratio": 0.0, "completion_length": 139.9479217529297, "epoch": 7.725126475548061, "grad_norm": 20.745195561069806, "kl": 0.55859375, "learning_rate": 3.5613738738738735e-07, "loss": 0.0006, "reward": 3.739573359489441, "reward_std": 0.01828201860189438, "rewards/final_reward": 1.9107846618437416, "rewards/mask_iou_reward": 0.9553923309218708, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7395732998847961, "rewards/thk_ans_format_reward": 1.0, "step": 2287, "think_completion_length": 10.125 }, { "clip_ratio": 0.0, "completion_length": 162.50000762939453, "epoch": 7.728499156829679, "grad_norm": 14.222149311002784, "kl": 0.419921875, "learning_rate": 3.558558558558558e-07, "loss": 0.0004, "reward": 3.4065486192703247, "reward_std": 0.08904951438307762, "rewards/final_reward": 1.4015213427144408, "rewards/mask_iou_reward": 0.7007606713572204, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.40654855966568, "rewards/thk_ans_format_reward": 1.0, "step": 2288, "think_completion_length": 7.333333333333334 }, { "clip_ratio": 0.0, "completion_length": 196.4791717529297, "epoch": 7.7318718381112985, "grad_norm": 8.227714520886847, "kl": 0.416015625, "learning_rate": 3.555743243243243e-07, "loss": 0.0004, "reward": 3.526465058326721, "reward_std": 0.037529608234763145, "rewards/final_reward": 1.8010980479365568, "rewards/mask_iou_reward": 0.9005490239682784, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5264650583267212, "rewards/thk_ans_format_reward": 1.0, "step": 2289, "think_completion_length": 9.0 }, { "clip_ratio": 0.0, "completion_length": 276.58333587646484, "epoch": 7.735244519392918, "grad_norm": 10.560115501176965, "kl": 0.4404296875, "learning_rate": 3.552927927927928e-07, "loss": 0.0005, "reward": 3.440025568008423, "reward_std": 0.27941257879137993, "rewards/final_reward": 1.1915034510081766, "rewards/mask_iou_reward": 0.5957517255040883, "rewards/sam_format_reward": 0.9375, "rewards/sam_reward_func_ultra": 1.565025508403778, "rewards/thk_ans_format_reward": 0.9375, "step": 2290, "think_completion_length": 8.458333333333334 }, { "clip_ratio": 0.0, "completion_length": 132.92708587646484, "epoch": 7.738617200674536, "grad_norm": 35.51630293724726, "kl": 0.572265625, "learning_rate": 3.5501126126126125e-07, "loss": 0.0006, "reward": 3.538991093635559, "reward_std": 0.08621177216991782, "rewards/final_reward": 1.6881863003564102, "rewards/mask_iou_reward": 0.8440931501782051, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.538991093635559, "rewards/thk_ans_format_reward": 1.0, "step": 2291, "think_completion_length": 9.375 }, { "clip_ratio": 0.0, "completion_length": 145.60416793823242, "epoch": 7.7419898819561555, "grad_norm": 9.505637948818205, "kl": 0.46875, "learning_rate": 3.547297297297297e-07, "loss": 0.0005, "reward": 3.70369029045105, "reward_std": 0.08660473302006721, "rewards/final_reward": 1.4734322024985178, "rewards/mask_iou_reward": 0.7367161012492589, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7036903500556946, "rewards/thk_ans_format_reward": 1.0, "step": 2292, "think_completion_length": 9.416666666666668 }, { "clip_ratio": 0.0, "completion_length": 211.9895896911621, "epoch": 7.745362563237774, "grad_norm": 10.484695069513347, "kl": 0.416015625, "learning_rate": 3.5444819819819817e-07, "loss": 0.0004, "reward": 3.4020389318466187, "reward_std": 0.07059112749993801, "rewards/final_reward": 1.0682273746109578, "rewards/mask_iou_reward": 0.5341136873054789, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.402038812637329, "rewards/thk_ans_format_reward": 1.0, "step": 2293, "think_completion_length": 7.875 }, { "clip_ratio": 0.0, "completion_length": 149.90625381469727, "epoch": 7.748735244519393, "grad_norm": 14.513575678757574, "kl": 0.4931640625, "learning_rate": 3.541666666666667e-07, "loss": 0.0005, "reward": 3.4654370546340942, "reward_std": 0.048545608296990395, "rewards/final_reward": 0.7577805668806391, "rewards/mask_iou_reward": 0.37889028344031955, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4654370546340942, "rewards/thk_ans_format_reward": 1.0, "step": 2294, "think_completion_length": 7.458333333333334 }, { "clip_ratio": 0.0, "completion_length": 201.50001525878906, "epoch": 7.7521079258010115, "grad_norm": 41.83666830833403, "kl": 0.630859375, "learning_rate": 3.5388513513513514e-07, "loss": 0.0006, "reward": 3.5073028802871704, "reward_std": 0.12978895753622055, "rewards/final_reward": 1.9100693476261814, "rewards/mask_iou_reward": 0.9550346738130907, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5073029398918152, "rewards/thk_ans_format_reward": 1.0, "step": 2295, "think_completion_length": 8.166666666666666 }, { "clip_ratio": 0.0, "completion_length": 161.2916717529297, "epoch": 7.755480607082631, "grad_norm": 9.347916089328256, "kl": 0.4599609375, "learning_rate": 3.536036036036036e-07, "loss": 0.0005, "reward": 3.6417770385742188, "reward_std": 0.02596164494752884, "rewards/final_reward": 1.6588071187782987, "rewards/mask_iou_reward": 0.8294035593891493, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.641777217388153, "rewards/thk_ans_format_reward": 1.0, "step": 2296, "think_completion_length": 8.458333333333332 }, { "clip_ratio": 0.0, "completion_length": 159.96875, "epoch": 7.75885328836425, "grad_norm": 26.11197213835604, "kl": 0.451171875, "learning_rate": 3.5332207207207206e-07, "loss": 0.0005, "reward": 3.4054592847824097, "reward_std": 0.06146854721009731, "rewards/final_reward": 1.8495260696406592, "rewards/mask_iou_reward": 0.9247630348203296, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4054594039916992, "rewards/thk_ans_format_reward": 1.0, "step": 2297, "think_completion_length": 8.75 }, { "clip_ratio": 0.0, "completion_length": 202.9479217529297, "epoch": 7.762225969645868, "grad_norm": 6.92012139038751, "kl": 0.458984375, "learning_rate": 3.530405405405405e-07, "loss": 0.0005, "reward": 3.5028148889541626, "reward_std": 0.036012555472552776, "rewards/final_reward": 1.466303298975642, "rewards/mask_iou_reward": 0.733151649487821, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5028148293495178, "rewards/thk_ans_format_reward": 1.0, "step": 2298, "think_completion_length": 8.083333333333332 }, { "clip_ratio": 0.0, "completion_length": 145.78125762939453, "epoch": 7.765598650927488, "grad_norm": 8.461246166243656, "kl": 0.431640625, "learning_rate": 3.5275900900900904e-07, "loss": 0.0004, "reward": 3.5506240129470825, "reward_std": 0.07442787801846862, "rewards/final_reward": 1.5512045862225976, "rewards/mask_iou_reward": 0.7756022931112988, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5506239533424377, "rewards/thk_ans_format_reward": 1.0, "step": 2299, "think_completion_length": 8.041666666666668 }, { "clip_ratio": 0.0, "completion_length": 143.75000762939453, "epoch": 7.768971332209106, "grad_norm": 11.326303220722192, "kl": 0.447265625, "learning_rate": 3.5247747747747745e-07, "loss": 0.0004, "reward": 3.489335775375366, "reward_std": 0.025043433532118797, "rewards/final_reward": 1.799468701569588, "rewards/mask_iou_reward": 0.899734350784794, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.489335834980011, "rewards/thk_ans_format_reward": 1.0, "step": 2300, "think_completion_length": 8.291666666666666 }, { "clip_ratio": 0.0, "completion_length": 167.67708587646484, "epoch": 7.772344013490725, "grad_norm": 23.87880415190321, "kl": 0.447265625, "learning_rate": 3.521959459459459e-07, "loss": 0.0004, "reward": 3.2864990234375, "reward_std": 0.07297203643247485, "rewards/final_reward": 0.8304928473778955, "rewards/mask_iou_reward": 0.41524642368894776, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2864991426467896, "rewards/thk_ans_format_reward": 1.0, "step": 2301, "think_completion_length": 7.916666666666667 }, { "clip_ratio": 0.0, "completion_length": 183.64583587646484, "epoch": 7.775716694772344, "grad_norm": 90.40553412999691, "kl": 0.521484375, "learning_rate": 3.5191441441441437e-07, "loss": 0.0006, "reward": 3.360445976257324, "reward_std": 0.05471951887011528, "rewards/final_reward": 1.0188568995105611, "rewards/mask_iou_reward": 0.5094284497552806, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.36044579744339, "rewards/thk_ans_format_reward": 1.0, "step": 2302, "think_completion_length": 8.875 }, { "clip_ratio": 0.0, "completion_length": 157.82291793823242, "epoch": 7.779089376053963, "grad_norm": 12.670352448393869, "kl": 0.52734375, "learning_rate": 3.5163288288288283e-07, "loss": 0.0006, "reward": 3.590890645980835, "reward_std": 0.08140116557478905, "rewards/final_reward": 1.5870530239041354, "rewards/mask_iou_reward": 0.7935265119520677, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5908905267715454, "rewards/thk_ans_format_reward": 1.0, "step": 2303, "think_completion_length": 7.833333333333334 }, { "clip_ratio": 0.0, "completion_length": 224.6041717529297, "epoch": 7.782462057335582, "grad_norm": 16.46874007266962, "kl": 0.4052734375, "learning_rate": 3.5135135135135134e-07, "loss": 0.0004, "reward": 3.6431690454483032, "reward_std": 0.04836719110608101, "rewards/final_reward": 1.8580852428852772, "rewards/mask_iou_reward": 0.9290426214426386, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6431688070297241, "rewards/thk_ans_format_reward": 1.0, "step": 2304, "think_completion_length": 8.208333333333332 }, { "clip_ratio": 0.0, "completion_length": 199.5416717529297, "epoch": 7.785834738617201, "grad_norm": 7.347159424364246, "kl": 0.4130859375, "learning_rate": 3.510698198198198e-07, "loss": 0.0004, "reward": 3.4472914934158325, "reward_std": 0.020406564697623253, "rewards/final_reward": 1.7516853229805487, "rewards/mask_iou_reward": 0.8758426614902743, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4472914934158325, "rewards/thk_ans_format_reward": 1.0, "step": 2305, "think_completion_length": 8.958333333333332 }, { "clip_ratio": 0.0, "completion_length": 196.86458587646484, "epoch": 7.78920741989882, "grad_norm": 19.455204429708385, "kl": 0.4296875, "learning_rate": 3.5078828828828826e-07, "loss": 0.0004, "reward": 3.352233409881592, "reward_std": 0.11754608154296875, "rewards/final_reward": 1.8694441956962824, "rewards/mask_iou_reward": 0.9347220978481412, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3522332906723022, "rewards/thk_ans_format_reward": 1.0, "step": 2306, "think_completion_length": 7.958333333333333 }, { "clip_ratio": 0.0, "completion_length": 153.59375, "epoch": 7.792580101180438, "grad_norm": 13.13219609139006, "kl": 0.9287109375, "learning_rate": 3.505067567567567e-07, "loss": 0.0009, "reward": 3.4515668153762817, "reward_std": 0.1543382704257965, "rewards/final_reward": 1.317968822124398, "rewards/mask_iou_reward": 0.658984411062199, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4515668749809265, "rewards/thk_ans_format_reward": 1.0, "step": 2307, "think_completion_length": 8.833333333333332 }, { "clip_ratio": 0.0, "completion_length": 192.8229217529297, "epoch": 7.795952782462058, "grad_norm": 11.740865831015478, "kl": 0.59765625, "learning_rate": 3.502252252252252e-07, "loss": 0.0006, "reward": 3.5365203619003296, "reward_std": 0.039987629279494286, "rewards/final_reward": 1.9206840736768225, "rewards/mask_iou_reward": 0.9603420368384112, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5365204811096191, "rewards/thk_ans_format_reward": 1.0, "step": 2308, "think_completion_length": 9.666666666666666 }, { "clip_ratio": 0.0, "completion_length": 155.5104217529297, "epoch": 7.799325463743676, "grad_norm": 8.861505071959428, "kl": 0.587890625, "learning_rate": 3.499436936936937e-07, "loss": 0.0006, "reward": 3.3772459030151367, "reward_std": 0.08232882246375084, "rewards/final_reward": 1.546701372882615, "rewards/mask_iou_reward": 0.7733506864413076, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.377245843410492, "rewards/thk_ans_format_reward": 1.0, "step": 2309, "think_completion_length": 8.916666666666666 }, { "clip_ratio": 0.0, "completion_length": 148.89583587646484, "epoch": 7.802698145025295, "grad_norm": 11.730335589806394, "kl": 0.443359375, "learning_rate": 3.4966216216216216e-07, "loss": 0.0004, "reward": 2.9785395860671997, "reward_std": 0.08616751432418823, "rewards/final_reward": 0.6032473239090221, "rewards/mask_iou_reward": 0.30162366195451107, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9785396456718445, "rewards/thk_ans_format_reward": 1.0, "step": 2310, "think_completion_length": 8.791666666666666 }, { "clip_ratio": 0.0, "completion_length": 210.89584350585938, "epoch": 7.806070826306914, "grad_norm": 12.6283200499224, "kl": 0.4033203125, "learning_rate": 3.493806306306306e-07, "loss": 0.0004, "reward": 3.658359169960022, "reward_std": 0.06291536800563335, "rewards/final_reward": 1.621674238125308, "rewards/mask_iou_reward": 0.810837119062654, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6583590507507324, "rewards/thk_ans_format_reward": 1.0, "step": 2311, "think_completion_length": 8.833333333333332 }, { "clip_ratio": 0.0, "completion_length": 178.21875762939453, "epoch": 7.809443507588533, "grad_norm": 6.784044716299772, "kl": 0.4697265625, "learning_rate": 3.490990990990991e-07, "loss": 0.0005, "reward": 3.6008437871932983, "reward_std": 0.03371572960168123, "rewards/final_reward": 1.9011566973384175, "rewards/mask_iou_reward": 0.9505783486692088, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6008437871932983, "rewards/thk_ans_format_reward": 1.0, "step": 2312, "think_completion_length": 9.291666666666668 }, { "clip_ratio": 0.0, "completion_length": 149.30209350585938, "epoch": 7.812816188870151, "grad_norm": 27.340968120934896, "kl": 0.6171875, "learning_rate": 3.4881756756756754e-07, "loss": 0.0006, "reward": 3.232995629310608, "reward_std": 0.08912499528378248, "rewards/final_reward": 0.8706868650440512, "rewards/mask_iou_reward": 0.4353434325220256, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2329955101013184, "rewards/thk_ans_format_reward": 1.0, "step": 2313, "think_completion_length": 7.958333333333334 }, { "clip_ratio": 0.0, "completion_length": 186.89584350585938, "epoch": 7.8161888701517706, "grad_norm": 8.13050302174962, "kl": 0.4091796875, "learning_rate": 3.4853603603603605e-07, "loss": 0.0004, "reward": 3.358641743659973, "reward_std": 0.034882666543126106, "rewards/final_reward": 1.7063491811286031, "rewards/mask_iou_reward": 0.8531745905643016, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.358641803264618, "rewards/thk_ans_format_reward": 1.0, "step": 2314, "think_completion_length": 8.25 }, { "clip_ratio": 0.0, "completion_length": 165.46875762939453, "epoch": 7.81956155143339, "grad_norm": 15.113967431164683, "kl": 0.4560546875, "learning_rate": 3.482545045045045e-07, "loss": 0.0005, "reward": 3.599574327468872, "reward_std": 0.0958711989223957, "rewards/final_reward": 1.6495412259843159, "rewards/mask_iou_reward": 0.8247706129921579, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5995741486549377, "rewards/thk_ans_format_reward": 1.0, "step": 2315, "think_completion_length": 9.416666666666668 }, { "clip_ratio": 0.0, "completion_length": 189.75000762939453, "epoch": 7.822934232715008, "grad_norm": 26.43129837171513, "kl": 0.443359375, "learning_rate": 3.47972972972973e-07, "loss": 0.0004, "reward": 3.4044047594070435, "reward_std": 0.06186537444591522, "rewards/final_reward": 1.8200695665581779, "rewards/mask_iou_reward": 0.9100347832790889, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4044047594070435, "rewards/thk_ans_format_reward": 1.0, "step": 2316, "think_completion_length": 9.333333333333334 }, { "clip_ratio": 0.0, "completion_length": 127.98958587646484, "epoch": 7.8263069139966275, "grad_norm": 10.395822624159122, "kl": 0.5234375, "learning_rate": 3.4769144144144144e-07, "loss": 0.0005, "reward": 3.568729519844055, "reward_std": 0.07372662238776684, "rewards/final_reward": 1.661542540133941, "rewards/mask_iou_reward": 0.8307712700669705, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5687294006347656, "rewards/thk_ans_format_reward": 1.0, "step": 2317, "think_completion_length": 7.583333333333333 }, { "clip_ratio": 0.0, "completion_length": 141.84375, "epoch": 7.829679595278246, "grad_norm": 18.712101724067143, "kl": 0.583984375, "learning_rate": 3.474099099099099e-07, "loss": 0.0006, "reward": 3.6224128007888794, "reward_std": 0.06478509679436684, "rewards/final_reward": 1.5744207148950102, "rewards/mask_iou_reward": 0.7872103574475051, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6224127411842346, "rewards/thk_ans_format_reward": 1.0, "step": 2318, "think_completion_length": 8.375 }, { "clip_ratio": 0.0, "completion_length": 167.12500762939453, "epoch": 7.833052276559865, "grad_norm": 249.85421546912946, "kl": 0.443359375, "learning_rate": 3.471283783783784e-07, "loss": 0.0005, "reward": 3.545035719871521, "reward_std": 0.07090389914810658, "rewards/final_reward": 1.1629425698182438, "rewards/mask_iou_reward": 0.5814712849091219, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5450357794761658, "rewards/thk_ans_format_reward": 1.0, "step": 2319, "think_completion_length": 8.958333333333332 }, { "clip_ratio": 0.0, "completion_length": 171.15625, "epoch": 7.8364249578414835, "grad_norm": 9.60632427652472, "kl": 0.439453125, "learning_rate": 3.468468468468468e-07, "loss": 0.0005, "reward": 3.721726655960083, "reward_std": 0.038542356342077255, "rewards/final_reward": 1.8422141542666353, "rewards/mask_iou_reward": 0.9211070771333176, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7217265963554382, "rewards/thk_ans_format_reward": 1.0, "step": 2320, "think_completion_length": 8.833333333333332 }, { "clip_ratio": 0.0, "completion_length": 193.02083587646484, "epoch": 7.839797639123103, "grad_norm": 8.201411040398995, "kl": 0.43359375, "learning_rate": 3.465653153153153e-07, "loss": 0.0005, "reward": 3.5359808206558228, "reward_std": 0.07264266163110733, "rewards/final_reward": 1.7804632002891472, "rewards/mask_iou_reward": 0.8902316001445736, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5359808206558228, "rewards/thk_ans_format_reward": 1.0, "step": 2321, "think_completion_length": 8.208333333333332 }, { "clip_ratio": 0.0, "completion_length": 207.36458587646484, "epoch": 7.843170320404722, "grad_norm": 10.835979305273261, "kl": 0.3759765625, "learning_rate": 3.4628378378378374e-07, "loss": 0.0004, "reward": 3.568721055984497, "reward_std": 0.04248126968741417, "rewards/final_reward": 1.540176830992082, "rewards/mask_iou_reward": 0.770088415496041, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5687209367752075, "rewards/thk_ans_format_reward": 1.0, "step": 2322, "think_completion_length": 8.5 }, { "clip_ratio": 0.0, "completion_length": 197.7604217529297, "epoch": 7.8465430016863404, "grad_norm": 8.787303455057133, "kl": 0.38671875, "learning_rate": 3.460022522522522e-07, "loss": 0.0004, "reward": 3.5989596843719482, "reward_std": 0.09075170010328293, "rewards/final_reward": 1.2405073472062438, "rewards/mask_iou_reward": 0.6202536736031219, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5989595651626587, "rewards/thk_ans_format_reward": 1.0, "step": 2323, "think_completion_length": 7.333333333333333 }, { "clip_ratio": 0.0, "completion_length": 148.8854217529297, "epoch": 7.84991568296796, "grad_norm": 9.182053652495602, "kl": 0.537109375, "learning_rate": 3.457207207207207e-07, "loss": 0.0006, "reward": 3.511338710784912, "reward_std": 0.07975371927022934, "rewards/final_reward": 1.170090604027916, "rewards/mask_iou_reward": 0.585045302013958, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.511338710784912, "rewards/thk_ans_format_reward": 1.0, "step": 2324, "think_completion_length": 8.916666666666666 }, { "clip_ratio": 0.0, "completion_length": 220.87500762939453, "epoch": 7.853288364249578, "grad_norm": 13.68236372891141, "kl": 0.52734375, "learning_rate": 3.454391891891892e-07, "loss": 0.0005, "reward": 3.566367268562317, "reward_std": 0.07451931945979595, "rewards/final_reward": 0.9658934278265687, "rewards/mask_iou_reward": 0.48294671391328436, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5663670301437378, "rewards/thk_ans_format_reward": 1.0, "step": 2325, "think_completion_length": 8.666666666666666 }, { "clip_ratio": 0.0, "completion_length": 169.92708587646484, "epoch": 7.856661045531197, "grad_norm": 7.843385990135139, "kl": 0.4921875, "learning_rate": 3.4515765765765763e-07, "loss": 0.0005, "reward": 3.4027241468429565, "reward_std": 0.04318516911007464, "rewards/final_reward": 1.8411135688835198, "rewards/mask_iou_reward": 0.9205567844417599, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4027240872383118, "rewards/thk_ans_format_reward": 1.0, "step": 2326, "think_completion_length": 7.875 }, { "clip_ratio": 0.0, "completion_length": 148.83333587646484, "epoch": 7.860033726812816, "grad_norm": 17.641573866474715, "kl": 2.2646484375, "learning_rate": 3.448761261261261e-07, "loss": 0.0023, "reward": 3.287583351135254, "reward_std": 0.070755485445261, "rewards/final_reward": 1.2760556416836235, "rewards/mask_iou_reward": 0.6380278208418118, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2875832915306091, "rewards/thk_ans_format_reward": 1.0, "step": 2327, "think_completion_length": 8.583333333333334 }, { "clip_ratio": 0.0, "completion_length": 219.28125, "epoch": 7.863406408094435, "grad_norm": 11.039443903969092, "kl": 0.404296875, "learning_rate": 3.4459459459459456e-07, "loss": 0.0004, "reward": 3.6915663480758667, "reward_std": 0.10650844499468803, "rewards/final_reward": 1.8212546911576388, "rewards/mask_iou_reward": 0.9106273455788194, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6915662288665771, "rewards/thk_ans_format_reward": 1.0, "step": 2328, "think_completion_length": 8.208333333333334 }, { "clip_ratio": 0.0, "completion_length": 159.93750762939453, "epoch": 7.866779089376054, "grad_norm": 45.22548356284776, "kl": 0.46875, "learning_rate": 3.4431306306306307e-07, "loss": 0.0005, "reward": 3.289353132247925, "reward_std": 0.10025676898658276, "rewards/final_reward": 0.9978360163727829, "rewards/mask_iou_reward": 0.4989180081863914, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2893530130386353, "rewards/thk_ans_format_reward": 1.0, "step": 2329, "think_completion_length": 8.125 }, { "clip_ratio": 0.0, "completion_length": 233.33333587646484, "epoch": 7.870151770657673, "grad_norm": 10.64319600309174, "kl": 0.3994140625, "learning_rate": 3.4403153153153153e-07, "loss": 0.0004, "reward": 3.5440070629119873, "reward_std": 0.03834127727895975, "rewards/final_reward": 1.5940954495231145, "rewards/mask_iou_reward": 0.7970477247615573, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5440071821212769, "rewards/thk_ans_format_reward": 1.0, "step": 2330, "think_completion_length": 9.5 }, { "clip_ratio": 0.0, "completion_length": 199.14583587646484, "epoch": 7.873524451939292, "grad_norm": 9.592694804363731, "kl": 0.62890625, "learning_rate": 3.4375e-07, "loss": 0.0006, "reward": 3.3139032125473022, "reward_std": 0.1449567973613739, "rewards/final_reward": 0.8608355036367921, "rewards/mask_iou_reward": 0.43041775181839603, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3139033317565918, "rewards/thk_ans_format_reward": 1.0, "step": 2331, "think_completion_length": 8.375 }, { "clip_ratio": 0.0, "completion_length": 208.62500762939453, "epoch": 7.87689713322091, "grad_norm": 11.82364466612513, "kl": 0.4453125, "learning_rate": 3.4346846846846845e-07, "loss": 0.0005, "reward": 3.5688596963882446, "reward_std": 0.08483656868338585, "rewards/final_reward": 1.5885996944105767, "rewards/mask_iou_reward": 0.7942998472052883, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.568859577178955, "rewards/thk_ans_format_reward": 1.0, "step": 2332, "think_completion_length": 12.583333333333332 }, { "clip_ratio": 0.0, "completion_length": 162.7604217529297, "epoch": 7.88026981450253, "grad_norm": 11.078307712657724, "kl": 0.4521484375, "learning_rate": 3.431869369369369e-07, "loss": 0.0005, "reward": 3.384488821029663, "reward_std": 0.12777956575155258, "rewards/final_reward": 0.9596014290310367, "rewards/mask_iou_reward": 0.47980071451551837, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3844888806343079, "rewards/thk_ans_format_reward": 1.0, "step": 2333, "think_completion_length": 9.916666666666668 }, { "clip_ratio": 0.0, "completion_length": 138.6041717529297, "epoch": 7.883642495784148, "grad_norm": 13.09620604007018, "kl": 0.544921875, "learning_rate": 3.429054054054054e-07, "loss": 0.0006, "reward": 3.4537363052368164, "reward_std": 0.1276137139648199, "rewards/final_reward": 0.5759846275612032, "rewards/mask_iou_reward": 0.2879923137806016, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4537363052368164, "rewards/thk_ans_format_reward": 1.0, "step": 2334, "think_completion_length": 10.208333333333334 }, { "clip_ratio": 0.0, "completion_length": 231.875, "epoch": 7.887015177065767, "grad_norm": 30.203500843574535, "kl": 0.3544921875, "learning_rate": 3.426238738738739e-07, "loss": 0.0004, "reward": 3.7364327907562256, "reward_std": 0.03459780430421233, "rewards/final_reward": 1.737842492664877, "rewards/mask_iou_reward": 0.8689212463324385, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7364325523376465, "rewards/thk_ans_format_reward": 1.0, "step": 2335, "think_completion_length": 9.166666666666666 }, { "clip_ratio": 0.0, "completion_length": 144.09375762939453, "epoch": 7.8903878583473865, "grad_norm": 12.063816563747155, "kl": 0.5087890625, "learning_rate": 3.4234234234234235e-07, "loss": 0.0005, "reward": 3.5211243629455566, "reward_std": 0.04529313184320927, "rewards/final_reward": 1.802233579994247, "rewards/mask_iou_reward": 0.9011167899971235, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.521124243736267, "rewards/thk_ans_format_reward": 1.0, "step": 2336, "think_completion_length": 8.833333333333334 }, { "clip_ratio": 0.0, "completion_length": 178.61459350585938, "epoch": 7.893760539629005, "grad_norm": 7.861235953625273, "kl": 0.568359375, "learning_rate": 3.420608108108108e-07, "loss": 0.0006, "reward": 3.6086841821670532, "reward_std": 0.03303397446870804, "rewards/final_reward": 1.8981940915356716, "rewards/mask_iou_reward": 0.9490970457678358, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6086838841438293, "rewards/thk_ans_format_reward": 1.0, "step": 2337, "think_completion_length": 11.375 }, { "clip_ratio": 0.0, "completion_length": 160.5625, "epoch": 7.897133220910624, "grad_norm": 17.37265934151549, "kl": 0.4462890625, "learning_rate": 3.4177927927927927e-07, "loss": 0.0005, "reward": 3.3253653049468994, "reward_std": 0.1824297234416008, "rewards/final_reward": 0.9615388733284267, "rewards/mask_iou_reward": 0.48076943666421335, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.325365126132965, "rewards/thk_ans_format_reward": 1.0, "step": 2338, "think_completion_length": 7.916666666666666 }, { "clip_ratio": 0.0, "completion_length": 229.08334350585938, "epoch": 7.900505902192243, "grad_norm": 20.80358081412897, "kl": 0.4697265625, "learning_rate": 3.4149774774774773e-07, "loss": 0.0005, "reward": 3.2458066940307617, "reward_std": 0.19842278212308884, "rewards/final_reward": 1.1941345429684787, "rewards/mask_iou_reward": 0.5970672714842393, "rewards/sam_format_reward": 0.9791666865348816, "rewards/sam_reward_func_ultra": 1.2874733805656433, "rewards/thk_ans_format_reward": 0.9791666865348816, "step": 2339, "think_completion_length": 7.958333333333333 }, { "clip_ratio": 0.0, "completion_length": 175.08333587646484, "epoch": 7.903878583473862, "grad_norm": 15.630954674126164, "kl": 0.53125, "learning_rate": 3.412162162162162e-07, "loss": 0.0005, "reward": 3.4466086626052856, "reward_std": 0.11500649899244308, "rewards/final_reward": 1.46870160960264, "rewards/mask_iou_reward": 0.73435080480132, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4466084241867065, "rewards/thk_ans_format_reward": 1.0, "step": 2340, "think_completion_length": 9.125 }, { "clip_ratio": 0.0, "completion_length": 248.46875762939453, "epoch": 7.90725126475548, "grad_norm": 7.77783215283481, "kl": 0.3798828125, "learning_rate": 3.4093468468468465e-07, "loss": 0.0004, "reward": 3.701847553253174, "reward_std": 0.044463444501161575, "rewards/final_reward": 1.8407206420426043, "rewards/mask_iou_reward": 0.9203603210213022, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7018474340438843, "rewards/thk_ans_format_reward": 1.0, "step": 2341, "think_completion_length": 9.125 }, { "clip_ratio": 0.0, "completion_length": 193.80208587646484, "epoch": 7.9106239460370995, "grad_norm": 7.17664934078942, "kl": 0.4013671875, "learning_rate": 3.406531531531531e-07, "loss": 0.0004, "reward": 3.54721200466156, "reward_std": 0.04046872444450855, "rewards/final_reward": 1.9069610779524413, "rewards/mask_iou_reward": 0.9534805389762206, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5472122430801392, "rewards/thk_ans_format_reward": 1.0, "step": 2342, "think_completion_length": 7.833333333333333 }, { "clip_ratio": 0.0, "completion_length": 181.5625, "epoch": 7.913996627318719, "grad_norm": 14.106045514797199, "kl": 0.3974609375, "learning_rate": 3.4037162162162157e-07, "loss": 0.0004, "reward": 3.5020689964294434, "reward_std": 0.10484147071838379, "rewards/final_reward": 1.9132991048909953, "rewards/mask_iou_reward": 0.9566495524454977, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5020688772201538, "rewards/thk_ans_format_reward": 1.0, "step": 2343, "think_completion_length": 9.208333333333334 }, { "clip_ratio": 0.0, "completion_length": 154.15625762939453, "epoch": 7.917369308600337, "grad_norm": 67.84555169518201, "kl": 0.5234375, "learning_rate": 3.400900900900901e-07, "loss": 0.0005, "reward": 3.628502130508423, "reward_std": 0.040713533759117126, "rewards/final_reward": 1.2533247159523904, "rewards/mask_iou_reward": 0.6266623579761952, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6285019516944885, "rewards/thk_ans_format_reward": 1.0, "step": 2344, "think_completion_length": 7.875 }, { "clip_ratio": 0.0, "completion_length": 178.83334350585938, "epoch": 7.920741989881956, "grad_norm": 33.18071869400635, "kl": 0.494140625, "learning_rate": 3.3980855855855855e-07, "loss": 0.0005, "reward": 3.621130108833313, "reward_std": 0.03444007970392704, "rewards/final_reward": 1.7131870145275485, "rewards/mask_iou_reward": 0.8565935072637743, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6211299896240234, "rewards/thk_ans_format_reward": 1.0, "step": 2345, "think_completion_length": 11.75 }, { "clip_ratio": 0.0, "completion_length": 167.4166717529297, "epoch": 7.924114671163575, "grad_norm": 21.952011179777173, "kl": 0.4580078125, "learning_rate": 3.39527027027027e-07, "loss": 0.0005, "reward": 3.218958258628845, "reward_std": 0.07898985967040062, "rewards/final_reward": 0.5945596088104014, "rewards/mask_iou_reward": 0.2972798044052007, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2189582586288452, "rewards/thk_ans_format_reward": 1.0, "step": 2346, "think_completion_length": 9.583333333333332 }, { "clip_ratio": 0.0, "completion_length": 185.06250762939453, "epoch": 7.927487352445194, "grad_norm": 6.97931756465897, "kl": 0.41796875, "learning_rate": 3.3924549549549547e-07, "loss": 0.0004, "reward": 3.788905382156372, "reward_std": 0.03340917080640793, "rewards/final_reward": 1.8980523459951733, "rewards/mask_iou_reward": 0.9490261729975866, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7889053225517273, "rewards/thk_ans_format_reward": 1.0, "step": 2347, "think_completion_length": 7.791666666666666 }, { "clip_ratio": 0.0, "completion_length": 219.53125, "epoch": 7.9308600337268125, "grad_norm": 15.430277569447743, "kl": 0.4609375, "learning_rate": 3.3896396396396393e-07, "loss": 0.0005, "reward": 3.5355775356292725, "reward_std": 0.04261211957782507, "rewards/final_reward": 1.8832372240822794, "rewards/mask_iou_reward": 0.9416186120411397, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.535577416419983, "rewards/thk_ans_format_reward": 1.0, "step": 2348, "think_completion_length": 9.0 }, { "clip_ratio": 0.0, "completion_length": 278.9270935058594, "epoch": 7.934232715008432, "grad_norm": 8.556554171502322, "kl": 0.373046875, "learning_rate": 3.3868243243243244e-07, "loss": 0.0004, "reward": 3.428499698638916, "reward_std": 0.08785773441195488, "rewards/final_reward": 1.4202347812155347, "rewards/mask_iou_reward": 0.7101173906077674, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4284995198249817, "rewards/thk_ans_format_reward": 1.0, "step": 2349, "think_completion_length": 9.0 }, { "clip_ratio": 0.0, "completion_length": 174.14583587646484, "epoch": 7.937605396290051, "grad_norm": 20.645215597498492, "kl": 0.50390625, "learning_rate": 3.384009009009009e-07, "loss": 0.0005, "reward": 3.360377550125122, "reward_std": 0.16720493882894516, "rewards/final_reward": 1.595247778754447, "rewards/mask_iou_reward": 0.7976238893772235, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3603774905204773, "rewards/thk_ans_format_reward": 1.0, "step": 2350, "think_completion_length": 8.041666666666666 }, { "clip_ratio": 0.0, "completion_length": 213.8854217529297, "epoch": 7.940978077571669, "grad_norm": 32.13190178817014, "kl": 0.541015625, "learning_rate": 3.3811936936936936e-07, "loss": 0.0006, "reward": 3.542497754096985, "reward_std": 0.05872867442667484, "rewards/final_reward": 1.6635123248896204, "rewards/mask_iou_reward": 0.8317561624448102, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.54249769449234, "rewards/thk_ans_format_reward": 1.0, "step": 2351, "think_completion_length": 8.291666666666666 }, { "clip_ratio": 0.0, "completion_length": 189.2916717529297, "epoch": 7.944350758853289, "grad_norm": 10.315882348214277, "kl": 0.4404296875, "learning_rate": 3.378378378378378e-07, "loss": 0.0004, "reward": 3.573422074317932, "reward_std": 0.07881678268313408, "rewards/final_reward": 1.6519748972757755, "rewards/mask_iou_reward": 0.8259874486378878, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5734221935272217, "rewards/thk_ans_format_reward": 1.0, "step": 2352, "think_completion_length": 7.5 }, { "clip_ratio": 0.0, "completion_length": 263.5416793823242, "epoch": 7.947723440134907, "grad_norm": 7.729003667115754, "kl": 0.732421875, "learning_rate": 3.375563063063063e-07, "loss": 0.0007, "reward": 3.356159210205078, "reward_std": 0.08650216832756996, "rewards/final_reward": 1.3735585263480121, "rewards/mask_iou_reward": 0.6867792631740061, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3561591506004333, "rewards/thk_ans_format_reward": 1.0, "step": 2353, "think_completion_length": 9.291666666666668 }, { "clip_ratio": 0.0, "completion_length": 203.52083587646484, "epoch": 7.951096121416526, "grad_norm": 5.855065824425488, "kl": 0.4091796875, "learning_rate": 3.372747747747748e-07, "loss": 0.0006, "reward": 3.6102631092071533, "reward_std": 0.07107937522232533, "rewards/final_reward": 1.53924420063235, "rewards/mask_iou_reward": 0.769622100316175, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6102629899978638, "rewards/thk_ans_format_reward": 1.0, "step": 2354, "think_completion_length": 9.833333333333334 }, { "clip_ratio": 0.0, "completion_length": 269.0416717529297, "epoch": 7.954468802698145, "grad_norm": 22.629742151372223, "kl": 0.41015625, "learning_rate": 3.3699324324324326e-07, "loss": 0.0004, "reward": 2.8923134803771973, "reward_std": 0.15287496149539948, "rewards/final_reward": 0.804916964131339, "rewards/mask_iou_reward": 0.4024584820656695, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.8923133313655853, "rewards/thk_ans_format_reward": 1.0, "step": 2355, "think_completion_length": 8.75 }, { "clip_ratio": 0.0, "completion_length": 197.17709350585938, "epoch": 7.957841483979764, "grad_norm": 9.726923091668409, "kl": 0.4326171875, "learning_rate": 3.367117117117117e-07, "loss": 0.0004, "reward": 3.207701802253723, "reward_std": 0.061932358890771866, "rewards/final_reward": 1.2355819249165891, "rewards/mask_iou_reward": 0.6177909624582946, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2077017426490784, "rewards/thk_ans_format_reward": 1.0, "step": 2356, "think_completion_length": 8.916666666666666 }, { "clip_ratio": 0.0, "completion_length": 210.1041717529297, "epoch": 7.961214165261383, "grad_norm": 6.3255156854871, "kl": 0.4150390625, "learning_rate": 3.364301801801802e-07, "loss": 0.0004, "reward": 3.118018388748169, "reward_std": 0.011369133368134499, "rewards/final_reward": 0.4532947116263412, "rewards/mask_iou_reward": 0.2266473558131706, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1180184483528137, "rewards/thk_ans_format_reward": 1.0, "step": 2357, "think_completion_length": 9.333333333333332 }, { "clip_ratio": 0.0, "completion_length": 242.6354217529297, "epoch": 7.964586846543002, "grad_norm": 6.205063191039621, "kl": 0.443359375, "learning_rate": 3.361486486486486e-07, "loss": 0.0005, "reward": 3.5695269107818604, "reward_std": 0.14852624107152224, "rewards/final_reward": 0.8570310423524563, "rewards/mask_iou_reward": 0.42851552117622815, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 1.5903602242469788, "rewards/thk_ans_format_reward": 0.9895833432674408, "step": 2358, "think_completion_length": 8.041666666666668 }, { "clip_ratio": 0.0, "completion_length": 246.5416717529297, "epoch": 7.967959527824621, "grad_norm": 7.83157245622682, "kl": 0.388671875, "learning_rate": 3.358671171171171e-07, "loss": 0.0004, "reward": 3.6626497507095337, "reward_std": 0.057364363223314285, "rewards/final_reward": 1.9162213234292915, "rewards/mask_iou_reward": 0.9581106617146458, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.662649691104889, "rewards/thk_ans_format_reward": 1.0, "step": 2359, "think_completion_length": 8.333333333333332 }, { "clip_ratio": 0.0, "completion_length": 170.84375762939453, "epoch": 7.971332209106239, "grad_norm": 44.272638856549584, "kl": 0.4501953125, "learning_rate": 3.3558558558558556e-07, "loss": 0.0005, "reward": 3.702326536178589, "reward_std": 0.09001387841999531, "rewards/final_reward": 1.7448316603806895, "rewards/mask_iou_reward": 0.8724158301903447, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7023264169692993, "rewards/thk_ans_format_reward": 1.0, "step": 2360, "think_completion_length": 9.708333333333334 }, { "clip_ratio": 0.0, "completion_length": 242.05209350585938, "epoch": 7.974704890387859, "grad_norm": 11.615592962528046, "kl": 0.3505859375, "learning_rate": 3.35304054054054e-07, "loss": 0.0004, "reward": 3.660933017730713, "reward_std": 0.04490010812878609, "rewards/final_reward": 1.7137576906076042, "rewards/mask_iou_reward": 0.8568788453038021, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6609334349632263, "rewards/thk_ans_format_reward": 1.0, "step": 2361, "think_completion_length": 8.833333333333332 }, { "clip_ratio": 0.0, "completion_length": 271.21875, "epoch": 7.978077571669477, "grad_norm": 21.061501022597145, "kl": 0.39453125, "learning_rate": 3.350225225225225e-07, "loss": 0.0004, "reward": 3.371769428253174, "reward_std": 0.23365569114685059, "rewards/final_reward": 1.7317698156014782, "rewards/mask_iou_reward": 0.8658849078007391, "rewards/sam_format_reward": 0.9791666865348816, "rewards/sam_reward_func_ultra": 1.4134362936019897, "rewards/thk_ans_format_reward": 0.9791666865348816, "step": 2362, "think_completion_length": 8.583333333333332 }, { "clip_ratio": 0.0, "completion_length": 242.0729217529297, "epoch": 7.981450252951096, "grad_norm": 18.606036025398936, "kl": 0.3759765625, "learning_rate": 3.3474099099099094e-07, "loss": 0.0004, "reward": 3.425795316696167, "reward_std": 0.20433041267096996, "rewards/final_reward": 1.7526200055850498, "rewards/mask_iou_reward": 0.8763100027925249, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 1.4466286897659302, "rewards/thk_ans_format_reward": 0.9895833432674408, "step": 2363, "think_completion_length": 9.0 }, { "clip_ratio": 0.0, "completion_length": 231.96875762939453, "epoch": 7.9848229342327155, "grad_norm": 8.152504737474274, "kl": 0.498046875, "learning_rate": 3.3445945945945946e-07, "loss": 0.0005, "reward": 3.575013518333435, "reward_std": 0.14411963429301977, "rewards/final_reward": 1.5466455653104827, "rewards/mask_iou_reward": 0.7733227826552413, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 1.595846951007843, "rewards/thk_ans_format_reward": 0.9895833432674408, "step": 2364, "think_completion_length": 7.75 }, { "clip_ratio": 0.0, "completion_length": 265.6770935058594, "epoch": 7.988195615514334, "grad_norm": 10.160184176734472, "kl": 0.390625, "learning_rate": 3.341779279279279e-07, "loss": 0.0004, "reward": 3.7018014192581177, "reward_std": 0.21830223500728607, "rewards/final_reward": 1.923110668124186, "rewards/mask_iou_reward": 0.961555334062093, "rewards/sam_format_reward": 0.9791666865348816, "rewards/sam_reward_func_ultra": 1.7434679865837097, "rewards/thk_ans_format_reward": 0.9791666865348816, "step": 2365, "think_completion_length": 7.916666666666667 }, { "clip_ratio": 0.0, "completion_length": 150.2604217529297, "epoch": 7.991568296795953, "grad_norm": 9.70498937587903, "kl": 0.59765625, "learning_rate": 3.338963963963964e-07, "loss": 0.0006, "reward": 3.6059956550598145, "reward_std": 0.01825597556307912, "rewards/final_reward": 1.8529482315714163, "rewards/mask_iou_reward": 0.9264741157857082, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6059958934783936, "rewards/thk_ans_format_reward": 1.0, "step": 2366, "think_completion_length": 12.083333333333332 }, { "clip_ratio": 0.0, "completion_length": 198.6979217529297, "epoch": 7.9949409780775715, "grad_norm": 14.530691748123385, "kl": 0.5234375, "learning_rate": 3.3361486486486484e-07, "loss": 0.0005, "reward": 3.6428091526031494, "reward_std": 0.07249030750244856, "rewards/final_reward": 1.7027538774608706, "rewards/mask_iou_reward": 0.8513769387304353, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6428090929985046, "rewards/thk_ans_format_reward": 1.0, "step": 2367, "think_completion_length": 8.458333333333334 }, { "clip_ratio": 0.0, "completion_length": 193.60526657104492, "epoch": 7.998313659359191, "grad_norm": 12.281907589005117, "kl": 0.416015625, "learning_rate": 3.333333333333333e-07, "loss": 0.0004, "reward": 3.605751395225525, "reward_std": 0.11550533585250378, "rewards/final_reward": 1.4464381873724115, "rewards/mask_iou_reward": 0.7232190936862057, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.605751395225525, "rewards/thk_ans_format_reward": 1.0, "step": 2368, "think_completion_length": 8.083333333333332 }, { "clip_ratio": 0.0, "completion_length": 167.7604217529297, "epoch": 8.003372681281618, "grad_norm": 23.744219327400796, "kl": 0.392578125, "learning_rate": 3.330518018018018e-07, "loss": 0.0004, "reward": 3.507601261138916, "reward_std": 0.11289845686405897, "rewards/final_reward": 1.228210963425049, "rewards/mask_iou_reward": 0.6141054817125245, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5076011419296265, "rewards/thk_ans_format_reward": 1.0, "step": 2369, "think_completion_length": 7.708333333333333 }, { "clip_ratio": 0.0, "completion_length": 247.33333587646484, "epoch": 8.006745362563239, "grad_norm": 11.546192868792904, "kl": 0.388671875, "learning_rate": 3.327702702702703e-07, "loss": 0.0004, "reward": 3.7985188961029053, "reward_std": 0.03749396279454231, "rewards/final_reward": 1.867038907222471, "rewards/mask_iou_reward": 0.9335194536112355, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7985187768936157, "rewards/thk_ans_format_reward": 1.0, "step": 2370, "think_completion_length": 7.833333333333334 }, { "clip_ratio": 0.0, "completion_length": 264.2708435058594, "epoch": 8.010118043844857, "grad_norm": 22.293278328225153, "kl": 1.9775390625, "learning_rate": 3.3248873873873874e-07, "loss": 0.002, "reward": 3.674692153930664, "reward_std": 0.03796894662082195, "rewards/final_reward": 1.643982822737366, "rewards/mask_iou_reward": 0.821991411368683, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6746920347213745, "rewards/thk_ans_format_reward": 1.0, "step": 2371, "think_completion_length": 9.625 }, { "clip_ratio": 0.0, "completion_length": 168.9479217529297, "epoch": 8.013490725126475, "grad_norm": 26.55755490987274, "kl": 0.4404296875, "learning_rate": 3.322072072072072e-07, "loss": 0.0004, "reward": 3.422567129135132, "reward_std": 0.06658890098333359, "rewards/final_reward": 1.1253702176690892, "rewards/mask_iou_reward": 0.5626851088345446, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.422567069530487, "rewards/thk_ans_format_reward": 1.0, "step": 2372, "think_completion_length": 9.833333333333332 }, { "clip_ratio": 0.0, "completion_length": 227.20834350585938, "epoch": 8.016863406408094, "grad_norm": 7.147523959066198, "kl": 0.40234375, "learning_rate": 3.3192567567567566e-07, "loss": 0.0004, "reward": 3.740321159362793, "reward_std": 0.21628601849079132, "rewards/final_reward": 1.7931842878703108, "rewards/mask_iou_reward": 0.8965921439351554, "rewards/sam_format_reward": 0.9791666865348816, "rewards/sam_reward_func_ultra": 1.7819878458976746, "rewards/thk_ans_format_reward": 0.9791666865348816, "step": 2373, "think_completion_length": 8.416666666666666 }, { "clip_ratio": 0.0, "completion_length": 216.6979217529297, "epoch": 8.020236087689714, "grad_norm": 8.090296978672066, "kl": 0.619140625, "learning_rate": 3.3164414414414417e-07, "loss": 0.0006, "reward": 3.5261985063552856, "reward_std": 0.16563283652067184, "rewards/final_reward": 1.7265625912685088, "rewards/mask_iou_reward": 0.8632812956342544, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 1.5470316410064697, "rewards/thk_ans_format_reward": 0.9895833432674408, "step": 2374, "think_completion_length": 9.5 }, { "clip_ratio": 0.0, "completion_length": 186.89583587646484, "epoch": 8.023608768971332, "grad_norm": 6.551030017093635, "kl": 0.44921875, "learning_rate": 3.3136261261261263e-07, "loss": 0.0004, "reward": 3.4857009649276733, "reward_std": 0.08584445342421532, "rewards/final_reward": 1.5250340704634133, "rewards/mask_iou_reward": 0.7625170352317067, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.485701084136963, "rewards/thk_ans_format_reward": 1.0, "step": 2375, "think_completion_length": 7.833333333333334 }, { "clip_ratio": 0.0, "completion_length": 243.6041717529297, "epoch": 8.02698145025295, "grad_norm": 8.89636088836023, "kl": 0.4267578125, "learning_rate": 3.310810810810811e-07, "loss": 0.0004, "reward": 3.673058271408081, "reward_std": 0.07457491382956505, "rewards/final_reward": 1.7505070487126124, "rewards/mask_iou_reward": 0.8752535243563062, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.673058271408081, "rewards/thk_ans_format_reward": 1.0, "step": 2376, "think_completion_length": 9.125 }, { "clip_ratio": 0.0, "completion_length": 209.4479217529297, "epoch": 8.03035413153457, "grad_norm": 9.333641288570725, "kl": 0.482421875, "learning_rate": 3.3079954954954955e-07, "loss": 0.0005, "reward": 3.279388904571533, "reward_std": 0.10259271413087845, "rewards/final_reward": 1.165007475258762, "rewards/mask_iou_reward": 0.582503737629381, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2793890237808228, "rewards/thk_ans_format_reward": 1.0, "step": 2377, "think_completion_length": 9.75 }, { "clip_ratio": 0.0, "completion_length": 228.05208587646484, "epoch": 8.03372681281619, "grad_norm": 7.7193007263674795, "kl": 0.4609375, "learning_rate": 3.3051801801801796e-07, "loss": 0.0005, "reward": 3.6911059617996216, "reward_std": 0.03571598511189222, "rewards/final_reward": 1.7758940189951273, "rewards/mask_iou_reward": 0.8879470094975637, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6911059617996216, "rewards/thk_ans_format_reward": 1.0, "step": 2378, "think_completion_length": 8.458333333333334 }, { "clip_ratio": 0.0, "completion_length": 203.80209350585938, "epoch": 8.037099494097808, "grad_norm": 12.554029559797081, "kl": 0.41015625, "learning_rate": 3.302364864864865e-07, "loss": 0.0004, "reward": 3.50697922706604, "reward_std": 0.010762129910290241, "rewards/final_reward": 1.9482879176670094, "rewards/mask_iou_reward": 0.9741439588335047, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.506978988647461, "rewards/thk_ans_format_reward": 1.0, "step": 2379, "think_completion_length": 9.458333333333332 }, { "clip_ratio": 0.0, "completion_length": 208.98958587646484, "epoch": 8.040472175379426, "grad_norm": 25.757316247888088, "kl": 0.3984375, "learning_rate": 3.2995495495495493e-07, "loss": 0.0004, "reward": 3.8017170429229736, "reward_std": 0.05078030563890934, "rewards/final_reward": 1.6788383903437554, "rewards/mask_iou_reward": 0.8394191951718777, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.8017171025276184, "rewards/thk_ans_format_reward": 1.0, "step": 2380, "think_completion_length": 10.041666666666668 }, { "clip_ratio": 0.0, "completion_length": 229.65626525878906, "epoch": 8.043844856661046, "grad_norm": 4.711917028914413, "kl": 0.39453125, "learning_rate": 3.296734234234234e-07, "loss": 0.0004, "reward": 3.4532305002212524, "reward_std": 0.21785828098654747, "rewards/final_reward": 1.2614822442511835, "rewards/mask_iou_reward": 0.6307411221255917, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 1.4636470079421997, "rewards/thk_ans_format_reward": 1.0, "step": 2381, "think_completion_length": 8.125 }, { "clip_ratio": 0.0, "completion_length": 211.1041717529297, "epoch": 8.047217537942664, "grad_norm": 5.786104794251641, "kl": 0.43359375, "learning_rate": 3.2939189189189186e-07, "loss": 0.0004, "reward": 3.568354845046997, "reward_std": 0.23636979144066572, "rewards/final_reward": 1.6667668988522717, "rewards/mask_iou_reward": 0.8333834494261358, "rewards/sam_format_reward": 0.9791666865348816, "rewards/sam_reward_func_ultra": 1.5996047258377075, "rewards/thk_ans_format_reward": 0.9895833432674408, "step": 2382, "think_completion_length": 8.833333333333334 }, { "clip_ratio": 0.0, "completion_length": 237.06250762939453, "epoch": 8.050590219224283, "grad_norm": 11.667658501236074, "kl": 0.3955078125, "learning_rate": 3.291103603603603e-07, "loss": 0.0004, "reward": 3.549164295196533, "reward_std": 0.11421588622033596, "rewards/final_reward": 1.9014355104384648, "rewards/mask_iou_reward": 0.9507177552192324, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5491642951965332, "rewards/thk_ans_format_reward": 1.0, "step": 2383, "think_completion_length": 8.041666666666668 }, { "clip_ratio": 0.0, "completion_length": 213.28125762939453, "epoch": 8.053962900505903, "grad_norm": 52.26829765639469, "kl": 0.408203125, "learning_rate": 3.2882882882882883e-07, "loss": 0.0004, "reward": 3.5316416025161743, "reward_std": 0.07862124592065811, "rewards/final_reward": 1.7800296532453164, "rewards/mask_iou_reward": 0.8900148266226582, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5316415429115295, "rewards/thk_ans_format_reward": 1.0, "step": 2384, "think_completion_length": 9.125 }, { "clip_ratio": 0.0, "completion_length": 228.6041717529297, "epoch": 8.057335581787521, "grad_norm": 19.327738750923807, "kl": 0.556640625, "learning_rate": 3.285472972972973e-07, "loss": 0.0006, "reward": 3.734640121459961, "reward_std": 0.11318856105208397, "rewards/final_reward": 1.855851258239336, "rewards/mask_iou_reward": 0.927925629119668, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7346400022506714, "rewards/thk_ans_format_reward": 1.0, "step": 2385, "think_completion_length": 9.083333333333334 }, { "clip_ratio": 0.0, "completion_length": 182.3229217529297, "epoch": 8.06070826306914, "grad_norm": 6.5165529355463345, "kl": 0.4169921875, "learning_rate": 3.2826576576576575e-07, "loss": 0.0004, "reward": 3.505844473838806, "reward_std": 0.13680755905807018, "rewards/final_reward": 1.2078693786602015, "rewards/mask_iou_reward": 0.6039346893301007, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 1.5266777276992798, "rewards/thk_ans_format_reward": 0.9895833432674408, "step": 2386, "think_completion_length": 9.458333333333332 }, { "clip_ratio": 0.0, "completion_length": 190.1041717529297, "epoch": 8.064080944350758, "grad_norm": 5.597707115359675, "kl": 0.544921875, "learning_rate": 3.279842342342342e-07, "loss": 0.0006, "reward": 3.4991390705108643, "reward_std": 0.07332871481776237, "rewards/final_reward": 0.6863406134033639, "rewards/mask_iou_reward": 0.34317030670168197, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4991388320922852, "rewards/thk_ans_format_reward": 1.0, "step": 2387, "think_completion_length": 8.875 }, { "clip_ratio": 0.0, "completion_length": 192.08333587646484, "epoch": 8.067453625632378, "grad_norm": 30.552181119910045, "kl": 0.447265625, "learning_rate": 3.2770270270270267e-07, "loss": 0.0005, "reward": 3.708919405937195, "reward_std": 0.027788237668573856, "rewards/final_reward": 1.5298693900152363, "rewards/mask_iou_reward": 0.7649346950076181, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7089195251464844, "rewards/thk_ans_format_reward": 1.0, "step": 2388, "think_completion_length": 9.916666666666668 }, { "clip_ratio": 0.0, "completion_length": 185.3229217529297, "epoch": 8.070826306913997, "grad_norm": 5.886357948414976, "kl": 0.509765625, "learning_rate": 3.2742117117117113e-07, "loss": 0.0005, "reward": 3.342664361000061, "reward_std": 0.10686694085597992, "rewards/final_reward": 1.2874507479483295, "rewards/mask_iou_reward": 0.6437253739741647, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3426641821861267, "rewards/thk_ans_format_reward": 1.0, "step": 2389, "think_completion_length": 7.25 }, { "clip_ratio": 0.0, "completion_length": 254.89584350585938, "epoch": 8.074198988195615, "grad_norm": 5.208029616834268, "kl": 0.4287109375, "learning_rate": 3.2713963963963965e-07, "loss": 0.0004, "reward": 3.3730231523513794, "reward_std": 0.2724648416042328, "rewards/final_reward": 1.0485035138599763, "rewards/mask_iou_reward": 0.5242517569299882, "rewards/sam_format_reward": 0.9375, "rewards/sam_reward_func_ultra": 1.4980231523513794, "rewards/thk_ans_format_reward": 0.9375, "step": 2390, "think_completion_length": 7.958333333333334 }, { "clip_ratio": 0.0, "completion_length": 158.6041717529297, "epoch": 8.077571669477235, "grad_norm": 10.381308765291164, "kl": 0.486328125, "learning_rate": 3.268581081081081e-07, "loss": 0.0005, "reward": 3.585293173789978, "reward_std": 0.08048686385154724, "rewards/final_reward": 1.7610035561330326, "rewards/mask_iou_reward": 0.8805017780665163, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5852932333946228, "rewards/thk_ans_format_reward": 1.0, "step": 2391, "think_completion_length": 8.25 }, { "clip_ratio": 0.0, "completion_length": 192.1979217529297, "epoch": 8.080944350758854, "grad_norm": 4.410245364445696, "kl": 0.400390625, "learning_rate": 3.2657657657657657e-07, "loss": 0.0004, "reward": 3.611588478088379, "reward_std": 0.06441009044647217, "rewards/final_reward": 1.56279629364386, "rewards/mask_iou_reward": 0.78139814682193, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6115882992744446, "rewards/thk_ans_format_reward": 1.0, "step": 2392, "think_completion_length": 7.375 }, { "clip_ratio": 0.0, "completion_length": 195.75, "epoch": 8.084317032040472, "grad_norm": 8.748181047939324, "kl": 0.396484375, "learning_rate": 3.2629504504504503e-07, "loss": 0.0004, "reward": 3.7184702157974243, "reward_std": 0.012965178117156029, "rewards/final_reward": 1.8514894083113558, "rewards/mask_iou_reward": 0.9257447041556779, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7184701561927795, "rewards/thk_ans_format_reward": 1.0, "step": 2393, "think_completion_length": 8.708333333333332 }, { "clip_ratio": 0.0, "completion_length": 229.375, "epoch": 8.08768971332209, "grad_norm": 30.253714510471372, "kl": 0.46484375, "learning_rate": 3.260135135135135e-07, "loss": 0.0005, "reward": 3.337326765060425, "reward_std": 0.1405288316309452, "rewards/final_reward": 1.8327282117382686, "rewards/mask_iou_reward": 0.9163641058691343, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3373266458511353, "rewards/thk_ans_format_reward": 1.0, "step": 2394, "think_completion_length": 8.916666666666668 }, { "clip_ratio": 0.0, "completion_length": 211.4375, "epoch": 8.09106239460371, "grad_norm": 5.770519777707653, "kl": 0.3984375, "learning_rate": 3.25731981981982e-07, "loss": 0.0004, "reward": 3.6448590755462646, "reward_std": 0.19882620126008987, "rewards/final_reward": 1.5674474068642676, "rewards/mask_iou_reward": 0.7837237034321338, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 1.665692389011383, "rewards/thk_ans_format_reward": 0.9895833432674408, "step": 2395, "think_completion_length": 8.916666666666668 }, { "clip_ratio": 0.0, "completion_length": 216.45833587646484, "epoch": 8.094435075885329, "grad_norm": 7.792328264252321, "kl": 0.7607421875, "learning_rate": 3.2545045045045046e-07, "loss": 0.0008, "reward": 3.6415023803710938, "reward_std": 0.03474126663058996, "rewards/final_reward": 1.1202452769772973, "rewards/mask_iou_reward": 0.5601226384886486, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6415023803710938, "rewards/thk_ans_format_reward": 1.0, "step": 2396, "think_completion_length": 8.083333333333332 }, { "clip_ratio": 0.0, "completion_length": 225.1354217529297, "epoch": 8.097807757166947, "grad_norm": 11.930417082724471, "kl": 0.390625, "learning_rate": 3.251689189189189e-07, "loss": 0.0004, "reward": 3.4700154066085815, "reward_std": 0.07594737969338894, "rewards/final_reward": 0.6212718691660815, "rewards/mask_iou_reward": 0.31063593458304073, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4700152277946472, "rewards/thk_ans_format_reward": 1.0, "step": 2397, "think_completion_length": 8.708333333333332 }, { "clip_ratio": 0.0, "completion_length": 187.65625762939453, "epoch": 8.101180438448566, "grad_norm": 18.963475126752737, "kl": 0.4609375, "learning_rate": 3.2488738738738733e-07, "loss": 0.0005, "reward": 3.397141933441162, "reward_std": 0.15669412538409233, "rewards/final_reward": 1.7398657181327521, "rewards/mask_iou_reward": 0.8699328590663761, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3971418142318726, "rewards/thk_ans_format_reward": 1.0, "step": 2398, "think_completion_length": 8.75 }, { "clip_ratio": 0.0, "completion_length": 221.36459350585938, "epoch": 8.104553119730186, "grad_norm": 15.023091097131797, "kl": 0.3828125, "learning_rate": 3.246058558558558e-07, "loss": 0.0004, "reward": 3.2796510457992554, "reward_std": 0.11465698108077049, "rewards/final_reward": 1.149373076233073, "rewards/mask_iou_reward": 0.5746865381165365, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2796511054039001, "rewards/thk_ans_format_reward": 1.0, "step": 2399, "think_completion_length": 8.458333333333332 }, { "clip_ratio": 0.0, "completion_length": 179.21875762939453, "epoch": 8.107925801011804, "grad_norm": 129.00479724577653, "kl": 0.44140625, "learning_rate": 3.243243243243243e-07, "loss": 0.0005, "reward": 3.5124967098236084, "reward_std": 0.09804843366146088, "rewards/final_reward": 1.8780440430976026, "rewards/mask_iou_reward": 0.9390220215488013, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5124965906143188, "rewards/thk_ans_format_reward": 1.0, "step": 2400, "think_completion_length": 8.916666666666668 }, { "clip_ratio": 0.0, "completion_length": 185.34375762939453, "epoch": 8.111298482293423, "grad_norm": 5.992840860030975, "kl": 0.408203125, "learning_rate": 3.2404279279279277e-07, "loss": 0.0004, "reward": 3.42751145362854, "reward_std": 0.09829858504235744, "rewards/final_reward": 1.6106350603914932, "rewards/mask_iou_reward": 0.8053175301957466, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4275111556053162, "rewards/thk_ans_format_reward": 1.0, "step": 2401, "think_completion_length": 8.291666666666666 }, { "clip_ratio": 0.0, "completion_length": 169.7916717529297, "epoch": 8.114671163575043, "grad_norm": 8.068307793533288, "kl": 0.4169921875, "learning_rate": 3.2376126126126123e-07, "loss": 0.0004, "reward": 3.227345824241638, "reward_std": 0.08449060097336769, "rewards/final_reward": 1.410210368073459, "rewards/mask_iou_reward": 0.7051051840367295, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2273458242416382, "rewards/thk_ans_format_reward": 1.0, "step": 2402, "think_completion_length": 9.208333333333332 }, { "clip_ratio": 0.0, "completion_length": 170.7291717529297, "epoch": 8.118043844856661, "grad_norm": 10.835281412196219, "kl": 0.458984375, "learning_rate": 3.234797297297297e-07, "loss": 0.0005, "reward": 3.6753695011138916, "reward_std": 0.0677886251360178, "rewards/final_reward": 1.5689224450025796, "rewards/mask_iou_reward": 0.7844612225012898, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.675369381904602, "rewards/thk_ans_format_reward": 1.0, "step": 2403, "think_completion_length": 8.0 }, { "clip_ratio": 0.0, "completion_length": 212.7916717529297, "epoch": 8.12141652613828, "grad_norm": 7.868671257529703, "kl": 0.4091796875, "learning_rate": 3.2319819819819815e-07, "loss": 0.0004, "reward": 3.660378336906433, "reward_std": 0.12548162788152695, "rewards/final_reward": 1.8662698704350658, "rewards/mask_iou_reward": 0.9331349352175329, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6603783965110779, "rewards/thk_ans_format_reward": 1.0, "step": 2404, "think_completion_length": 8.666666666666666 }, { "clip_ratio": 0.0, "completion_length": 186.93750762939453, "epoch": 8.124789207419898, "grad_norm": 8.370326706025851, "kl": 0.6669921875, "learning_rate": 3.2291666666666666e-07, "loss": 0.0007, "reward": 3.3133339881896973, "reward_std": 0.14033591002225876, "rewards/final_reward": 1.7516517047199014, "rewards/mask_iou_reward": 0.8758258523599507, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.313333809375763, "rewards/thk_ans_format_reward": 1.0, "step": 2405, "think_completion_length": 8.375 }, { "clip_ratio": 0.0, "completion_length": 266.87500762939453, "epoch": 8.128161888701518, "grad_norm": 8.330567857586583, "kl": 0.423828125, "learning_rate": 3.226351351351351e-07, "loss": 0.0004, "reward": 3.424866199493408, "reward_std": 0.12922486569732428, "rewards/final_reward": 1.0733441257197325, "rewards/mask_iou_reward": 0.5366720628598662, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 1.4456994533538818, "rewards/thk_ans_format_reward": 0.9895833432674408, "step": 2406, "think_completion_length": 8.833333333333334 }, { "clip_ratio": 0.0, "completion_length": 145.3125, "epoch": 8.131534569983137, "grad_norm": 10.523488546634377, "kl": 0.560546875, "learning_rate": 3.223536036036036e-07, "loss": 0.0006, "reward": 3.84055757522583, "reward_std": 0.08618904370814562, "rewards/final_reward": 1.758160011130204, "rewards/mask_iou_reward": 0.879080005565102, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.8405576944351196, "rewards/thk_ans_format_reward": 1.0, "step": 2407, "think_completion_length": 9.041666666666668 }, { "clip_ratio": 0.0, "completion_length": 152.5, "epoch": 8.134907251264755, "grad_norm": 10.934981146929287, "kl": 0.466796875, "learning_rate": 3.2207207207207205e-07, "loss": 0.0005, "reward": 3.602039098739624, "reward_std": 0.046292152255773544, "rewards/final_reward": 1.8189440665563734, "rewards/mask_iou_reward": 0.9094720332781867, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6020390391349792, "rewards/thk_ans_format_reward": 1.0, "step": 2408, "think_completion_length": 8.416666666666668 }, { "clip_ratio": 0.0, "completion_length": 183.9479217529297, "epoch": 8.138279932546375, "grad_norm": 6.908980900575378, "kl": 0.4326171875, "learning_rate": 3.217905405405405e-07, "loss": 0.0004, "reward": 3.737973690032959, "reward_std": 0.02162565803155303, "rewards/final_reward": 1.873947012729171, "rewards/mask_iou_reward": 0.9369735063645855, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.737973690032959, "rewards/thk_ans_format_reward": 1.0, "step": 2409, "think_completion_length": 7.208333333333334 }, { "clip_ratio": 0.0, "completion_length": 184.9791717529297, "epoch": 8.141652613827993, "grad_norm": 17.50123600151369, "kl": 0.447265625, "learning_rate": 3.21509009009009e-07, "loss": 0.0004, "reward": 3.526793599128723, "reward_std": 0.04617397487163544, "rewards/final_reward": 1.5573754967659243, "rewards/mask_iou_reward": 0.7786877483829622, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5267934203147888, "rewards/thk_ans_format_reward": 1.0, "step": 2410, "think_completion_length": 7.541666666666666 }, { "clip_ratio": 0.0, "completion_length": 190.98958587646484, "epoch": 8.145025295109612, "grad_norm": 10.237101465585567, "kl": 0.4921875, "learning_rate": 3.212274774774775e-07, "loss": 0.0005, "reward": 3.466389775276184, "reward_std": 0.10824509710073471, "rewards/final_reward": 1.0882982205477378, "rewards/mask_iou_reward": 0.5441491102738689, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.466389775276184, "rewards/thk_ans_format_reward": 1.0, "step": 2411, "think_completion_length": 8.5 }, { "clip_ratio": 0.0, "completion_length": 165.30208587646484, "epoch": 8.14839797639123, "grad_norm": 17.59083784655385, "kl": 0.4736328125, "learning_rate": 3.2094594594594594e-07, "loss": 0.0005, "reward": 3.24951171875, "reward_std": 0.05265136994421482, "rewards/final_reward": 0.7422969614193567, "rewards/mask_iou_reward": 0.3711484807096784, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2495114207267761, "rewards/thk_ans_format_reward": 1.0, "step": 2412, "think_completion_length": 8.375 }, { "clip_ratio": 0.0, "completion_length": 163.1041717529297, "epoch": 8.15177065767285, "grad_norm": 9.760394222550351, "kl": 0.4599609375, "learning_rate": 3.206644144144144e-07, "loss": 0.0005, "reward": 3.385643482208252, "reward_std": 0.09001307748258114, "rewards/final_reward": 0.6627619569797258, "rewards/mask_iou_reward": 0.3313809784898629, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3856431245803833, "rewards/thk_ans_format_reward": 1.0, "step": 2413, "think_completion_length": 9.125 }, { "clip_ratio": 0.0, "completion_length": 144.21875381469727, "epoch": 8.155143338954469, "grad_norm": 31.29595285296675, "kl": 0.4208984375, "learning_rate": 3.2038288288288286e-07, "loss": 0.0004, "reward": 3.4562931060791016, "reward_std": 0.05336186848580837, "rewards/final_reward": 1.1441042030131243, "rewards/mask_iou_reward": 0.5720521015065622, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.456292986869812, "rewards/thk_ans_format_reward": 1.0, "step": 2414, "think_completion_length": 10.875 }, { "clip_ratio": 0.0, "completion_length": 164.09375, "epoch": 8.158516020236087, "grad_norm": 9.501872238266467, "kl": 0.87109375, "learning_rate": 3.201013513513514e-07, "loss": 0.0009, "reward": 3.521483302116394, "reward_std": 0.08115924685262144, "rewards/final_reward": 1.6029761419337214, "rewards/mask_iou_reward": 0.8014880709668607, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5214833617210388, "rewards/thk_ans_format_reward": 1.0, "step": 2415, "think_completion_length": 10.666666666666668 }, { "clip_ratio": 0.0, "completion_length": 164.34375, "epoch": 8.161888701517707, "grad_norm": 32.72477260944038, "kl": 0.486328125, "learning_rate": 3.1981981981981984e-07, "loss": 0.0005, "reward": 3.627326726913452, "reward_std": 0.046977970749139786, "rewards/final_reward": 1.6322781141514124, "rewards/mask_iou_reward": 0.8161390570757062, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6273266077041626, "rewards/thk_ans_format_reward": 1.0, "step": 2416, "think_completion_length": 8.75 }, { "clip_ratio": 0.0, "completion_length": 205.55209350585938, "epoch": 8.165261382799326, "grad_norm": 7.131920182603108, "kl": 0.4736328125, "learning_rate": 3.195382882882883e-07, "loss": 0.0005, "reward": 3.7091704607009888, "reward_std": 0.06176206795498729, "rewards/final_reward": 1.550195763691969, "rewards/mask_iou_reward": 0.7750978818459845, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7091703414916992, "rewards/thk_ans_format_reward": 1.0, "step": 2417, "think_completion_length": 7.958333333333333 }, { "clip_ratio": 0.0, "completion_length": 188.02083587646484, "epoch": 8.168634064080944, "grad_norm": 7.346436112306174, "kl": 0.5078125, "learning_rate": 3.192567567567567e-07, "loss": 0.0005, "reward": 3.6605117321014404, "reward_std": 0.04182407818734646, "rewards/final_reward": 1.4259560613112674, "rewards/mask_iou_reward": 0.7129780306556337, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6605116724967957, "rewards/thk_ans_format_reward": 1.0, "step": 2418, "think_completion_length": 9.083333333333332 }, { "clip_ratio": 0.0, "completion_length": 191.48958587646484, "epoch": 8.172006745362562, "grad_norm": 9.91741035767607, "kl": 0.44921875, "learning_rate": 3.1897522522522517e-07, "loss": 0.0005, "reward": 3.6389087438583374, "reward_std": 0.10996793489903212, "rewards/final_reward": 1.8648980222098381, "rewards/mask_iou_reward": 0.9324490111049191, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6389088034629822, "rewards/thk_ans_format_reward": 1.0, "step": 2419, "think_completion_length": 8.375 }, { "clip_ratio": 0.0, "completion_length": 147.7916717529297, "epoch": 8.175379426644183, "grad_norm": 8.771784119719042, "kl": 0.478515625, "learning_rate": 3.186936936936937e-07, "loss": 0.0005, "reward": 3.5804519653320312, "reward_std": 0.019869420444592834, "rewards/final_reward": 1.7407468174759197, "rewards/mask_iou_reward": 0.8703734087379599, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5804519653320312, "rewards/thk_ans_format_reward": 1.0, "step": 2420, "think_completion_length": 9.208333333333332 }, { "clip_ratio": 0.0, "completion_length": 149.58333587646484, "epoch": 8.178752107925801, "grad_norm": 14.461206633706372, "kl": 0.544921875, "learning_rate": 3.1841216216216214e-07, "loss": 0.0006, "reward": 3.332810401916504, "reward_std": 0.05701042152941227, "rewards/final_reward": 1.5631696036833542, "rewards/mask_iou_reward": 0.7815848018416771, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3328104615211487, "rewards/thk_ans_format_reward": 1.0, "step": 2421, "think_completion_length": 8.708333333333332 }, { "clip_ratio": 0.0, "completion_length": 154.9166717529297, "epoch": 8.18212478920742, "grad_norm": 10.900392482313276, "kl": 0.5048828125, "learning_rate": 3.181306306306306e-07, "loss": 0.0005, "reward": 3.417921781539917, "reward_std": 0.1880015730857849, "rewards/final_reward": 1.6576152588454947, "rewards/mask_iou_reward": 0.8288076294227473, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 1.428338348865509, "rewards/thk_ans_format_reward": 1.0, "step": 2422, "think_completion_length": 10.75 }, { "clip_ratio": 0.0, "completion_length": 148.58333587646484, "epoch": 8.18549747048904, "grad_norm": 8.069657839824432, "kl": 0.482421875, "learning_rate": 3.1784909909909906e-07, "loss": 0.0005, "reward": 3.43167781829834, "reward_std": 0.09442893601953983, "rewards/final_reward": 1.5638686557156498, "rewards/mask_iou_reward": 0.7819343278578249, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4316779375076294, "rewards/thk_ans_format_reward": 1.0, "step": 2423, "think_completion_length": 9.875 }, { "clip_ratio": 0.0, "completion_length": 197.0104217529297, "epoch": 8.188870151770658, "grad_norm": 13.19781049934047, "kl": 0.45703125, "learning_rate": 3.175675675675675e-07, "loss": 0.0005, "reward": 3.432652235031128, "reward_std": 0.10366004332900047, "rewards/final_reward": 1.4790426118829503, "rewards/mask_iou_reward": 0.7395213059414751, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.432652235031128, "rewards/thk_ans_format_reward": 1.0, "step": 2424, "think_completion_length": 8.166666666666668 }, { "clip_ratio": 0.0, "completion_length": 161.55208587646484, "epoch": 8.192242833052276, "grad_norm": 8.387087408769968, "kl": 0.48046875, "learning_rate": 3.1728603603603604e-07, "loss": 0.0005, "reward": 3.5953248739242554, "reward_std": 0.04133354127407074, "rewards/final_reward": 1.5849948737706572, "rewards/mask_iou_reward": 0.7924974368853286, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5953248739242554, "rewards/thk_ans_format_reward": 1.0, "step": 2425, "think_completion_length": 9.958333333333332 }, { "clip_ratio": 0.0, "completion_length": 213.1666717529297, "epoch": 8.195615514333895, "grad_norm": 5.757225928522244, "kl": 0.4755859375, "learning_rate": 3.170045045045045e-07, "loss": 0.0005, "reward": 3.656041979789734, "reward_std": 0.04135182220488787, "rewards/final_reward": 1.9078518225754462, "rewards/mask_iou_reward": 0.9539259112877231, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6560419797897339, "rewards/thk_ans_format_reward": 1.0, "step": 2426, "think_completion_length": 8.958333333333334 }, { "clip_ratio": 0.0, "completion_length": 179.40625762939453, "epoch": 8.198988195615515, "grad_norm": 11.15654342910603, "kl": 0.4365234375, "learning_rate": 3.1672297297297296e-07, "loss": 0.0004, "reward": 3.546691417694092, "reward_std": 0.13070277497172356, "rewards/final_reward": 0.9240818825057648, "rewards/mask_iou_reward": 0.4620409412528824, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5466914772987366, "rewards/thk_ans_format_reward": 1.0, "step": 2427, "think_completion_length": 8.375 }, { "clip_ratio": 0.0, "completion_length": 141.75000381469727, "epoch": 8.202360876897133, "grad_norm": 8.195096236049537, "kl": 0.466796875, "learning_rate": 3.164414414414414e-07, "loss": 0.0005, "reward": 3.33832848072052, "reward_std": 0.09735456854104996, "rewards/final_reward": 1.2954900338208892, "rewards/mask_iou_reward": 0.6477450169104446, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.33832848072052, "rewards/thk_ans_format_reward": 1.0, "step": 2428, "think_completion_length": 9.958333333333332 }, { "clip_ratio": 0.0, "completion_length": 127.80208969116211, "epoch": 8.205733558178752, "grad_norm": 10.897348964830934, "kl": 0.5087890625, "learning_rate": 3.161599099099099e-07, "loss": 0.0005, "reward": 3.5560476779937744, "reward_std": 0.1025335043668747, "rewards/final_reward": 1.9635037618405802, "rewards/mask_iou_reward": 0.9817518809202901, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5560476183891296, "rewards/thk_ans_format_reward": 1.0, "step": 2429, "think_completion_length": 8.083333333333334 }, { "clip_ratio": 0.0, "completion_length": 208.57292938232422, "epoch": 8.209106239460372, "grad_norm": 8.79365056371932, "kl": 0.4287109375, "learning_rate": 3.158783783783784e-07, "loss": 0.0004, "reward": 3.04141902923584, "reward_std": 0.10161345452070236, "rewards/final_reward": 0.734848567434719, "rewards/mask_iou_reward": 0.3674242837173595, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.041418969631195, "rewards/thk_ans_format_reward": 1.0, "step": 2430, "think_completion_length": 7.833333333333334 }, { "clip_ratio": 0.0, "completion_length": 209.9791717529297, "epoch": 8.21247892074199, "grad_norm": 8.018953305589752, "kl": 0.4169921875, "learning_rate": 3.1559684684684685e-07, "loss": 0.0004, "reward": 3.687578797340393, "reward_std": 0.07668573036789894, "rewards/final_reward": 1.703764574546751, "rewards/mask_iou_reward": 0.8518822872733754, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6875787377357483, "rewards/thk_ans_format_reward": 1.0, "step": 2431, "think_completion_length": 10.5 }, { "clip_ratio": 0.0, "completion_length": 171.64583587646484, "epoch": 8.215851602023609, "grad_norm": 23.61435432038782, "kl": 0.517578125, "learning_rate": 3.153153153153153e-07, "loss": 0.0005, "reward": 3.406832456588745, "reward_std": 0.11250332370400429, "rewards/final_reward": 1.213014148599159, "rewards/mask_iou_reward": 0.6065070742995795, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4068323373794556, "rewards/thk_ans_format_reward": 1.0, "step": 2432, "think_completion_length": 8.375 }, { "clip_ratio": 0.0, "completion_length": 164.0416717529297, "epoch": 8.219224283305227, "grad_norm": 9.13877086411204, "kl": 0.59375, "learning_rate": 3.150337837837838e-07, "loss": 0.0006, "reward": 3.6608787775039673, "reward_std": 0.06742198672145605, "rewards/final_reward": 1.7380011253940841, "rewards/mask_iou_reward": 0.8690005626970421, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.660878837108612, "rewards/thk_ans_format_reward": 1.0, "step": 2433, "think_completion_length": 9.125 }, { "clip_ratio": 0.0, "completion_length": 189.71875, "epoch": 8.222596964586847, "grad_norm": 14.396642672138428, "kl": 0.439453125, "learning_rate": 3.1475225225225223e-07, "loss": 0.0004, "reward": 3.429533004760742, "reward_std": 0.12928189616650343, "rewards/final_reward": 1.663311666723789, "rewards/mask_iou_reward": 0.8316558333618945, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4295329451560974, "rewards/thk_ans_format_reward": 1.0, "step": 2434, "think_completion_length": 8.0 }, { "clip_ratio": 0.0, "completion_length": 150.70833587646484, "epoch": 8.225969645868465, "grad_norm": 12.580987281110211, "kl": 0.5361328125, "learning_rate": 3.1447072072072075e-07, "loss": 0.0005, "reward": 3.5138970613479614, "reward_std": 0.09798325225710869, "rewards/final_reward": 0.9252272387501513, "rewards/mask_iou_reward": 0.46261361937507567, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5138969421386719, "rewards/thk_ans_format_reward": 1.0, "step": 2435, "think_completion_length": 9.708333333333334 }, { "clip_ratio": 0.0, "completion_length": 129.11458587646484, "epoch": 8.229342327150084, "grad_norm": 7.862194953393018, "kl": 0.556640625, "learning_rate": 3.141891891891892e-07, "loss": 0.0006, "reward": 3.8974932432174683, "reward_std": 0.008425467647612095, "rewards/final_reward": 1.904266355805077, "rewards/mask_iou_reward": 0.9521331779025385, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.8974932432174683, "rewards/thk_ans_format_reward": 1.0, "step": 2436, "think_completion_length": 11.833333333333332 }, { "clip_ratio": 0.0, "completion_length": 159.8854217529297, "epoch": 8.232715008431704, "grad_norm": 7.968289754618249, "kl": 0.5341796875, "learning_rate": 3.1390765765765767e-07, "loss": 0.0005, "reward": 3.715288996696472, "reward_std": 0.08045927435159683, "rewards/final_reward": 1.7203986965591502, "rewards/mask_iou_reward": 0.8601993482795751, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7152891755104065, "rewards/thk_ans_format_reward": 1.0, "step": 2437, "think_completion_length": 9.875 }, { "clip_ratio": 0.0, "completion_length": 212.4791717529297, "epoch": 8.236087689713322, "grad_norm": 8.114872001225699, "kl": 0.3974609375, "learning_rate": 3.136261261261261e-07, "loss": 0.0004, "reward": 3.6249054670333862, "reward_std": 0.07169766910374165, "rewards/final_reward": 1.5403707879258648, "rewards/mask_iou_reward": 0.7701853939629324, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.635322093963623, "rewards/thk_ans_format_reward": 0.9895833432674408, "step": 2438, "think_completion_length": 9.5 }, { "clip_ratio": 0.0, "completion_length": 206.5416717529297, "epoch": 8.23946037099494, "grad_norm": 9.312476476695707, "kl": 0.45703125, "learning_rate": 3.1334459459459454e-07, "loss": 0.0005, "reward": 3.6166598796844482, "reward_std": 0.07187589257955551, "rewards/final_reward": 1.833042004311349, "rewards/mask_iou_reward": 0.9165210021556744, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6166600584983826, "rewards/thk_ans_format_reward": 1.0, "step": 2439, "think_completion_length": 9.625 }, { "clip_ratio": 0.0, "completion_length": 151.25000762939453, "epoch": 8.24283305227656, "grad_norm": 12.932719567705236, "kl": 0.474609375, "learning_rate": 3.1306306306306305e-07, "loss": 0.0005, "reward": 3.3091615438461304, "reward_std": 0.06278246641159058, "rewards/final_reward": 1.414298059959736, "rewards/mask_iou_reward": 0.707149029979868, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3091613054275513, "rewards/thk_ans_format_reward": 1.0, "step": 2440, "think_completion_length": 9.291666666666668 }, { "clip_ratio": 0.0, "completion_length": 161.39584350585938, "epoch": 8.24620573355818, "grad_norm": 6.166095258366224, "kl": 0.4091796875, "learning_rate": 3.127815315315315e-07, "loss": 0.0004, "reward": 3.5161445140838623, "reward_std": 0.12845508754253387, "rewards/final_reward": 1.7061118329359797, "rewards/mask_iou_reward": 0.8530559164679898, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5161446928977966, "rewards/thk_ans_format_reward": 1.0, "step": 2441, "think_completion_length": 9.458333333333332 }, { "clip_ratio": 0.0, "completion_length": 172.00000762939453, "epoch": 8.249578414839798, "grad_norm": 10.115525504943898, "kl": 0.3896484375, "learning_rate": 3.1249999999999997e-07, "loss": 0.0004, "reward": 3.7777761220932007, "reward_std": 0.013954056892544031, "rewards/final_reward": 1.759472750093344, "rewards/mask_iou_reward": 0.879736375046672, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7777761816978455, "rewards/thk_ans_format_reward": 1.0, "step": 2442, "think_completion_length": 11.208333333333332 }, { "clip_ratio": 0.0, "completion_length": 134.4791717529297, "epoch": 8.252951096121416, "grad_norm": 11.21434176611459, "kl": 0.634765625, "learning_rate": 3.1221846846846843e-07, "loss": 0.0007, "reward": 3.4758187532424927, "reward_std": 0.10847053304314613, "rewards/final_reward": 1.401818192682101, "rewards/mask_iou_reward": 0.7009090963410505, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4758188128471375, "rewards/thk_ans_format_reward": 1.0, "step": 2443, "think_completion_length": 9.416666666666668 }, { "clip_ratio": 0.0, "completion_length": 125.40625762939453, "epoch": 8.256323777403036, "grad_norm": 7.883589360397389, "kl": 0.5, "learning_rate": 3.119369369369369e-07, "loss": 0.0005, "reward": 3.708293914794922, "reward_std": 0.01661589415743947, "rewards/final_reward": 1.8312880607156954, "rewards/mask_iou_reward": 0.9156440303578477, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.708293855190277, "rewards/thk_ans_format_reward": 1.0, "step": 2444, "think_completion_length": 8.291666666666668 }, { "clip_ratio": 0.0, "completion_length": 155.02083587646484, "epoch": 8.259696458684655, "grad_norm": 12.16595034169912, "kl": 0.451171875, "learning_rate": 3.116554054054054e-07, "loss": 0.0005, "reward": 3.5598052740097046, "reward_std": 0.045145684853196144, "rewards/final_reward": 1.7660008482587344, "rewards/mask_iou_reward": 0.8830004241293672, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5598055124282837, "rewards/thk_ans_format_reward": 1.0, "step": 2445, "think_completion_length": 12.25 }, { "clip_ratio": 0.0, "completion_length": 127.89583587646484, "epoch": 8.263069139966273, "grad_norm": 56.30875917052071, "kl": 0.541015625, "learning_rate": 3.1137387387387387e-07, "loss": 0.0006, "reward": 3.6041558980941772, "reward_std": 0.05847676005214453, "rewards/final_reward": 1.8159008746661849, "rewards/mask_iou_reward": 0.9079504373330924, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6041561365127563, "rewards/thk_ans_format_reward": 1.0, "step": 2446, "think_completion_length": 9.333333333333332 }, { "clip_ratio": 0.0, "completion_length": 164.14584350585938, "epoch": 8.266441821247891, "grad_norm": 12.722989721471567, "kl": 0.478515625, "learning_rate": 3.1109234234234233e-07, "loss": 0.0005, "reward": 3.610850214958191, "reward_std": 0.030726881697773933, "rewards/final_reward": 1.6853639776575466, "rewards/mask_iou_reward": 0.8426819888287733, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6108500361442566, "rewards/thk_ans_format_reward": 1.0, "step": 2447, "think_completion_length": 9.125 }, { "clip_ratio": 0.0, "completion_length": 156.5208396911621, "epoch": 8.269814502529512, "grad_norm": 8.340943305441208, "kl": 0.486328125, "learning_rate": 3.108108108108108e-07, "loss": 0.0005, "reward": 3.596595287322998, "reward_std": 0.028427790850400925, "rewards/final_reward": 1.315319206376798, "rewards/mask_iou_reward": 0.657659603188399, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5965954065322876, "rewards/thk_ans_format_reward": 1.0, "step": 2448, "think_completion_length": 8.041666666666668 }, { "clip_ratio": 0.0, "completion_length": 157.7916717529297, "epoch": 8.27318718381113, "grad_norm": 13.227968762729217, "kl": 0.44921875, "learning_rate": 3.1052927927927925e-07, "loss": 0.0005, "reward": 3.6097664833068848, "reward_std": 0.13787231594324112, "rewards/final_reward": 1.9214265601311156, "rewards/mask_iou_reward": 0.9607132800655578, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6097664833068848, "rewards/thk_ans_format_reward": 1.0, "step": 2449, "think_completion_length": 8.75 }, { "clip_ratio": 0.0, "completion_length": 157.96875762939453, "epoch": 8.276559865092748, "grad_norm": 10.939981851399066, "kl": 0.4560546875, "learning_rate": 3.1024774774774776e-07, "loss": 0.0005, "reward": 3.5717475414276123, "reward_std": 0.06208985298871994, "rewards/final_reward": 1.2314015001254215, "rewards/mask_iou_reward": 0.6157007500627107, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5717473030090332, "rewards/thk_ans_format_reward": 1.0, "step": 2450, "think_completion_length": 9.416666666666666 }, { "clip_ratio": 0.0, "completion_length": 199.92708587646484, "epoch": 8.279932546374368, "grad_norm": 17.351268699684027, "kl": 0.48046875, "learning_rate": 3.099662162162162e-07, "loss": 0.0005, "reward": 3.4347504377365112, "reward_std": 0.043286630883812904, "rewards/final_reward": 1.8738782380664545, "rewards/mask_iou_reward": 0.9369391190332272, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4347504377365112, "rewards/thk_ans_format_reward": 1.0, "step": 2451, "think_completion_length": 10.458333333333334 }, { "clip_ratio": 0.0, "completion_length": 230.5729217529297, "epoch": 8.283305227655987, "grad_norm": 7.358723536584804, "kl": 0.46875, "learning_rate": 3.096846846846847e-07, "loss": 0.0005, "reward": 3.572574019432068, "reward_std": 0.05550253111869097, "rewards/final_reward": 1.8950182497118977, "rewards/mask_iou_reward": 0.9475091248559488, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5725743770599365, "rewards/thk_ans_format_reward": 1.0, "step": 2452, "think_completion_length": 9.5 }, { "clip_ratio": 0.0, "completion_length": 153.08334350585938, "epoch": 8.286677908937605, "grad_norm": 12.804066129927278, "kl": 0.580078125, "learning_rate": 3.0940315315315315e-07, "loss": 0.0006, "reward": 3.285163164138794, "reward_std": 0.08269466087222099, "rewards/final_reward": 0.24234840012493677, "rewards/mask_iou_reward": 0.12117420006246839, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2851630449295044, "rewards/thk_ans_format_reward": 1.0, "step": 2453, "think_completion_length": 8.791666666666668 }, { "clip_ratio": 0.0, "completion_length": 138.64583587646484, "epoch": 8.290050590219224, "grad_norm": 7.576007898495416, "kl": 0.650390625, "learning_rate": 3.091216216216216e-07, "loss": 0.0007, "reward": 3.386742115020752, "reward_std": 0.043751709163188934, "rewards/final_reward": 0.6861222812405611, "rewards/mask_iou_reward": 0.34306114062028054, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3867419958114624, "rewards/thk_ans_format_reward": 1.0, "step": 2454, "think_completion_length": 9.625 }, { "clip_ratio": 0.0, "completion_length": 178.48959350585938, "epoch": 8.293423271500844, "grad_norm": 28.406237180014656, "kl": 0.4541015625, "learning_rate": 3.088400900900901e-07, "loss": 0.0005, "reward": 3.4539101123809814, "reward_std": 0.07719557732343674, "rewards/final_reward": 1.4805113924994693, "rewards/mask_iou_reward": 0.7402556962497346, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4539099335670471, "rewards/thk_ans_format_reward": 1.0, "step": 2455, "think_completion_length": 8.458333333333332 }, { "clip_ratio": 0.0, "completion_length": 183.11458587646484, "epoch": 8.296795952782462, "grad_norm": 12.882943111223456, "kl": 0.533203125, "learning_rate": 3.085585585585586e-07, "loss": 0.0005, "reward": 3.6638059616088867, "reward_std": 0.0629742294549942, "rewards/final_reward": 1.3295112124943975, "rewards/mask_iou_reward": 0.6647556062471988, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6638058423995972, "rewards/thk_ans_format_reward": 1.0, "step": 2456, "think_completion_length": 9.666666666666668 }, { "clip_ratio": 0.0, "completion_length": 167.14584350585938, "epoch": 8.30016863406408, "grad_norm": 35.69679144791488, "kl": 0.4951171875, "learning_rate": 3.08277027027027e-07, "loss": 0.0005, "reward": 3.436911702156067, "reward_std": 0.14027688652276993, "rewards/final_reward": 1.6058508945621877, "rewards/mask_iou_reward": 0.8029254472810938, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4369115829467773, "rewards/thk_ans_format_reward": 1.0, "step": 2457, "think_completion_length": 8.375 }, { "clip_ratio": 0.0, "completion_length": 155.7708396911621, "epoch": 8.3035413153457, "grad_norm": 18.234730844982323, "kl": 0.5302734375, "learning_rate": 3.0799549549549545e-07, "loss": 0.0005, "reward": 3.6933977603912354, "reward_std": 0.04440005775541067, "rewards/final_reward": 1.8964776895218742, "rewards/mask_iou_reward": 0.9482388447609371, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6933976411819458, "rewards/thk_ans_format_reward": 1.0, "step": 2458, "think_completion_length": 10.041666666666668 }, { "clip_ratio": 0.0, "completion_length": 148.28125762939453, "epoch": 8.306913996627319, "grad_norm": 19.83675276244956, "kl": 0.443359375, "learning_rate": 3.077139639639639e-07, "loss": 0.0004, "reward": 3.2433114051818848, "reward_std": 0.07939034514129162, "rewards/final_reward": 1.090434110862306, "rewards/mask_iou_reward": 0.545217055431153, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2433112859725952, "rewards/thk_ans_format_reward": 1.0, "step": 2459, "think_completion_length": 9.083333333333332 }, { "clip_ratio": 0.0, "completion_length": 182.6979217529297, "epoch": 8.310286677908937, "grad_norm": 7.085741787515577, "kl": 0.51171875, "learning_rate": 3.074324324324324e-07, "loss": 0.0005, "reward": 3.4524821043014526, "reward_std": 0.05678035132586956, "rewards/final_reward": 1.646411504121072, "rewards/mask_iou_reward": 0.823205752060536, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4524821639060974, "rewards/thk_ans_format_reward": 1.0, "step": 2460, "think_completion_length": 8.333333333333332 }, { "clip_ratio": 0.0, "completion_length": 158.3645896911621, "epoch": 8.313659359190556, "grad_norm": 15.709015841909409, "kl": 0.62109375, "learning_rate": 3.071509009009009e-07, "loss": 0.0006, "reward": 3.4738714694976807, "reward_std": 0.07798239542171359, "rewards/final_reward": 1.6209935966105546, "rewards/mask_iou_reward": 0.8104967983052773, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4738715291023254, "rewards/thk_ans_format_reward": 1.0, "step": 2461, "think_completion_length": 9.625 }, { "clip_ratio": 0.0, "completion_length": 201.7604217529297, "epoch": 8.317032040472176, "grad_norm": 18.33658559895609, "kl": 0.44140625, "learning_rate": 3.0686936936936934e-07, "loss": 0.0005, "reward": 3.4468486309051514, "reward_std": 0.017853936180472374, "rewards/final_reward": 1.77218697188322, "rewards/mask_iou_reward": 0.88609348594161, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.446848750114441, "rewards/thk_ans_format_reward": 1.0, "step": 2462, "think_completion_length": 8.75 }, { "clip_ratio": 0.0, "completion_length": 193.53126525878906, "epoch": 8.320404721753794, "grad_norm": 14.760904578386418, "kl": 0.4501953125, "learning_rate": 3.065878378378378e-07, "loss": 0.0005, "reward": 3.0838990211486816, "reward_std": 0.08175505138933659, "rewards/final_reward": 0.858719382503991, "rewards/mask_iou_reward": 0.4293596912519955, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0838988423347473, "rewards/thk_ans_format_reward": 1.0, "step": 2463, "think_completion_length": 9.0 }, { "clip_ratio": 0.0, "completion_length": 173.77083587646484, "epoch": 8.323777403035413, "grad_norm": 14.392272707383267, "kl": 0.71484375, "learning_rate": 3.0630630630630627e-07, "loss": 0.0007, "reward": 3.385775327682495, "reward_std": 0.17238017916679382, "rewards/final_reward": 1.1066655612204888, "rewards/mask_iou_reward": 0.5533327806102444, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3857755064964294, "rewards/thk_ans_format_reward": 1.0, "step": 2464, "think_completion_length": 10.166666666666668 }, { "clip_ratio": 0.0, "completion_length": 157.15625762939453, "epoch": 8.327150084317031, "grad_norm": 20.668715407406307, "kl": 0.466796875, "learning_rate": 3.060247747747748e-07, "loss": 0.0005, "reward": 3.758315682411194, "reward_std": 0.06302101723849773, "rewards/final_reward": 1.625147822078063, "rewards/mask_iou_reward": 0.8125739110390315, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.758315622806549, "rewards/thk_ans_format_reward": 1.0, "step": 2465, "think_completion_length": 8.875 }, { "clip_ratio": 0.0, "completion_length": 178.6875, "epoch": 8.330522765598651, "grad_norm": 10.070725120432538, "kl": 0.583984375, "learning_rate": 3.0574324324324324e-07, "loss": 0.0006, "reward": 3.7680346965789795, "reward_std": 0.05466078221797943, "rewards/final_reward": 1.620782261636709, "rewards/mask_iou_reward": 0.8103911308183545, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.76803457736969, "rewards/thk_ans_format_reward": 1.0, "step": 2466, "think_completion_length": 8.291666666666668 }, { "clip_ratio": 0.0, "completion_length": 158.14583587646484, "epoch": 8.33389544688027, "grad_norm": 10.935838833962107, "kl": 0.49609375, "learning_rate": 3.054617117117117e-07, "loss": 0.0005, "reward": 3.594951868057251, "reward_std": 0.08344347029924393, "rewards/final_reward": 1.5041478962655797, "rewards/mask_iou_reward": 0.7520739481327898, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.59495210647583, "rewards/thk_ans_format_reward": 1.0, "step": 2467, "think_completion_length": 8.166666666666668 }, { "clip_ratio": 0.0, "completion_length": 169.57291793823242, "epoch": 8.337268128161888, "grad_norm": 7.509251515010535, "kl": 0.576171875, "learning_rate": 3.0518018018018016e-07, "loss": 0.0006, "reward": 3.664355993270874, "reward_std": 0.08624411281198263, "rewards/final_reward": 1.2802445932812174, "rewards/mask_iou_reward": 0.6401222966406087, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6643559336662292, "rewards/thk_ans_format_reward": 1.0, "step": 2468, "think_completion_length": 9.583333333333332 }, { "clip_ratio": 0.0, "completion_length": 195.40625762939453, "epoch": 8.340640809443508, "grad_norm": 11.699693204834624, "kl": 0.458984375, "learning_rate": 3.048986486486486e-07, "loss": 0.0005, "reward": 3.586129307746887, "reward_std": 0.13098665326833725, "rewards/final_reward": 1.595433860783781, "rewards/mask_iou_reward": 0.7977169303918905, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5861293077468872, "rewards/thk_ans_format_reward": 1.0, "step": 2469, "think_completion_length": 8.958333333333334 }, { "clip_ratio": 0.0, "completion_length": 149.03125762939453, "epoch": 8.344013490725127, "grad_norm": 12.73540793684107, "kl": 0.6484375, "learning_rate": 3.0461711711711714e-07, "loss": 0.0007, "reward": 3.8514713048934937, "reward_std": 0.08601294551044703, "rewards/final_reward": 1.8853168454128082, "rewards/mask_iou_reward": 0.9426584227064041, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.8514713644981384, "rewards/thk_ans_format_reward": 1.0, "step": 2470, "think_completion_length": 8.5 }, { "clip_ratio": 0.0, "completion_length": 162.83333587646484, "epoch": 8.347386172006745, "grad_norm": 18.984630505291502, "kl": 0.57421875, "learning_rate": 3.043355855855856e-07, "loss": 0.0007, "reward": 3.7675371170043945, "reward_std": 0.08456644229590893, "rewards/final_reward": 1.90508309737051, "rewards/mask_iou_reward": 0.952541548685255, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7675367593765259, "rewards/thk_ans_format_reward": 1.0, "step": 2471, "think_completion_length": 9.583333333333332 }, { "clip_ratio": 0.0, "completion_length": 162.67708587646484, "epoch": 8.350758853288363, "grad_norm": 36.358076957750235, "kl": 0.4814453125, "learning_rate": 3.0405405405405406e-07, "loss": 0.0004, "reward": 3.574353575706482, "reward_std": 0.04178227297961712, "rewards/final_reward": 1.6225584169950786, "rewards/mask_iou_reward": 0.8112792084975393, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5743535161018372, "rewards/thk_ans_format_reward": 1.0, "step": 2472, "think_completion_length": 8.0 }, { "clip_ratio": 0.0, "completion_length": 143.14583587646484, "epoch": 8.354131534569984, "grad_norm": 9.26915726526041, "kl": 0.552734375, "learning_rate": 3.037725225225225e-07, "loss": 0.0006, "reward": 3.664590835571289, "reward_std": 0.08789392560720444, "rewards/final_reward": 1.8925300259644269, "rewards/mask_iou_reward": 0.9462650129822134, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6645907759666443, "rewards/thk_ans_format_reward": 1.0, "step": 2473, "think_completion_length": 9.416666666666666 }, { "clip_ratio": 0.0, "completion_length": 170.56250762939453, "epoch": 8.357504215851602, "grad_norm": 6.854815896384399, "kl": 0.50390625, "learning_rate": 3.03490990990991e-07, "loss": 0.0005, "reward": 3.483082890510559, "reward_std": 0.04504427965730429, "rewards/final_reward": 1.5886423660196636, "rewards/mask_iou_reward": 0.7943211830098318, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.483082890510559, "rewards/thk_ans_format_reward": 1.0, "step": 2474, "think_completion_length": 8.916666666666668 }, { "clip_ratio": 0.0, "completion_length": 176.67708587646484, "epoch": 8.36087689713322, "grad_norm": 52.066697310453165, "kl": 0.5283203125, "learning_rate": 3.032094594594595e-07, "loss": 0.0005, "reward": 3.7876784801483154, "reward_std": 0.0402345466427505, "rewards/final_reward": 1.9005078347031392, "rewards/mask_iou_reward": 0.9502539173515696, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7876786589622498, "rewards/thk_ans_format_reward": 1.0, "step": 2475, "think_completion_length": 7.666666666666667 }, { "clip_ratio": 0.0, "completion_length": 189.80209350585938, "epoch": 8.36424957841484, "grad_norm": 15.253898465155263, "kl": 0.4267578125, "learning_rate": 3.0292792792792795e-07, "loss": 0.0004, "reward": 3.3623496294021606, "reward_std": 0.06954523921012878, "rewards/final_reward": 1.6360745575927615, "rewards/mask_iou_reward": 0.8180372787963808, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3623495697975159, "rewards/thk_ans_format_reward": 1.0, "step": 2476, "think_completion_length": 9.0 }, { "clip_ratio": 0.0, "completion_length": 214.7291717529297, "epoch": 8.367622259696459, "grad_norm": 7.604992373949504, "kl": 0.419921875, "learning_rate": 3.0264639639639636e-07, "loss": 0.0005, "reward": 3.484631061553955, "reward_std": 0.05333420401439071, "rewards/final_reward": 0.7899648588250253, "rewards/mask_iou_reward": 0.39498242941251266, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4846312403678894, "rewards/thk_ans_format_reward": 1.0, "step": 2477, "think_completion_length": 9.0 }, { "clip_ratio": 0.0, "completion_length": 218.48959350585938, "epoch": 8.370994940978077, "grad_norm": 16.13659881531086, "kl": 0.3974609375, "learning_rate": 3.023648648648648e-07, "loss": 0.0004, "reward": 3.7472829818725586, "reward_std": 0.052220143377780914, "rewards/final_reward": 1.8180687068441148, "rewards/mask_iou_reward": 0.9090343534220574, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.747282862663269, "rewards/thk_ans_format_reward": 1.0, "step": 2478, "think_completion_length": 9.916666666666668 }, { "clip_ratio": 0.0, "completion_length": 157.05208587646484, "epoch": 8.374367622259696, "grad_norm": 24.763365338808626, "kl": 0.49609375, "learning_rate": 3.020833333333333e-07, "loss": 0.0006, "reward": 3.612781286239624, "reward_std": 0.0973962377756834, "rewards/final_reward": 1.796999700367992, "rewards/mask_iou_reward": 0.898499850183996, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6127811670303345, "rewards/thk_ans_format_reward": 1.0, "step": 2479, "think_completion_length": 9.541666666666668 }, { "clip_ratio": 0.0, "completion_length": 178.0416717529297, "epoch": 8.377740303541316, "grad_norm": 38.40838261997966, "kl": 0.482421875, "learning_rate": 3.018018018018018e-07, "loss": 0.0005, "reward": 3.380762219429016, "reward_std": 0.2005249634385109, "rewards/final_reward": 1.290711448853035, "rewards/mask_iou_reward": 0.6453557244265175, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 1.3911789059638977, "rewards/thk_ans_format_reward": 1.0, "step": 2480, "think_completion_length": 9.5 }, { "clip_ratio": 0.0, "completion_length": 172.29166793823242, "epoch": 8.381112984822934, "grad_norm": 11.097298558244487, "kl": 0.5908203125, "learning_rate": 3.0152027027027026e-07, "loss": 0.0006, "reward": 3.3576884269714355, "reward_std": 0.041434711776673794, "rewards/final_reward": 0.9189116504533033, "rewards/mask_iou_reward": 0.4594558252266516, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.357688307762146, "rewards/thk_ans_format_reward": 1.0, "step": 2481, "think_completion_length": 8.916666666666666 }, { "clip_ratio": 0.0, "completion_length": 225.95834350585938, "epoch": 8.384485666104553, "grad_norm": 6.519165982877114, "kl": 0.40234375, "learning_rate": 3.012387387387387e-07, "loss": 0.0004, "reward": 3.258583188056946, "reward_std": 0.11860659997910261, "rewards/final_reward": 0.9140629804971686, "rewards/mask_iou_reward": 0.4570314902485843, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2585831880569458, "rewards/thk_ans_format_reward": 1.0, "step": 2482, "think_completion_length": 8.333333333333332 }, { "clip_ratio": 0.0, "completion_length": 174.3854217529297, "epoch": 8.387858347386173, "grad_norm": 8.464867970457298, "kl": 0.45703125, "learning_rate": 3.009572072072072e-07, "loss": 0.0005, "reward": 3.522014021873474, "reward_std": 0.036655642092227936, "rewards/final_reward": 1.9023617745948194, "rewards/mask_iou_reward": 0.9511808872974097, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5220139622688293, "rewards/thk_ans_format_reward": 1.0, "step": 2483, "think_completion_length": 9.375 }, { "clip_ratio": 0.0, "completion_length": 290.43750762939453, "epoch": 8.391231028667791, "grad_norm": 12.517351363030858, "kl": 0.4521484375, "learning_rate": 3.0067567567567564e-07, "loss": 0.0005, "reward": 2.9502170085906982, "reward_std": 0.3760811146348715, "rewards/final_reward": 1.805158868698367, "rewards/mask_iou_reward": 0.9025794343491835, "rewards/sam_format_reward": 0.9166666865348816, "rewards/sam_reward_func_ultra": 1.1168835163116455, "rewards/thk_ans_format_reward": 0.9166666865348816, "step": 2484, "think_completion_length": 9.083333333333334 }, { "clip_ratio": 0.0, "completion_length": 160.61458587646484, "epoch": 8.39460370994941, "grad_norm": 9.591095522414026, "kl": 0.6611328125, "learning_rate": 3.0039414414414415e-07, "loss": 0.0007, "reward": 3.5364558696746826, "reward_std": 0.009379489347338676, "rewards/final_reward": 1.732489508652729, "rewards/mask_iou_reward": 0.8662447543263645, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.536455750465393, "rewards/thk_ans_format_reward": 1.0, "step": 2485, "think_completion_length": 9.416666666666666 }, { "clip_ratio": 0.0, "completion_length": 250.59375762939453, "epoch": 8.397976391231028, "grad_norm": 19.63539543049306, "kl": 0.361328125, "learning_rate": 3.001126126126126e-07, "loss": 0.0004, "reward": 3.6687779426574707, "reward_std": 0.13566308468580246, "rewards/final_reward": 1.8564654166287857, "rewards/mask_iou_reward": 0.9282327083143929, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6687778234481812, "rewards/thk_ans_format_reward": 1.0, "step": 2486, "think_completion_length": 9.208333333333332 }, { "clip_ratio": 0.0, "completion_length": 184.6979217529297, "epoch": 8.401349072512648, "grad_norm": 21.022706427785067, "kl": 0.4287109375, "learning_rate": 2.9983108108108107e-07, "loss": 0.0005, "reward": 3.546531081199646, "reward_std": 0.04343246482312679, "rewards/final_reward": 0.910726898082419, "rewards/mask_iou_reward": 0.4553634490412095, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5465310215950012, "rewards/thk_ans_format_reward": 1.0, "step": 2487, "think_completion_length": 9.666666666666666 }, { "clip_ratio": 0.0, "completion_length": 199.3125, "epoch": 8.404721753794266, "grad_norm": 86.12108620293544, "kl": 0.564453125, "learning_rate": 2.9954954954954953e-07, "loss": 0.0006, "reward": 3.336834669113159, "reward_std": 0.04387115687131882, "rewards/final_reward": 1.6743125925985707, "rewards/mask_iou_reward": 0.8371562962992853, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3368346691131592, "rewards/thk_ans_format_reward": 1.0, "step": 2488, "think_completion_length": 9.0 }, { "clip_ratio": 0.0, "completion_length": 178.70834350585938, "epoch": 8.408094435075885, "grad_norm": 5.607179483122207, "kl": 0.4375, "learning_rate": 2.99268018018018e-07, "loss": 0.0005, "reward": 3.646553158760071, "reward_std": 0.036387352272868156, "rewards/final_reward": 1.8311148539790383, "rewards/mask_iou_reward": 0.9155574269895191, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6465531587600708, "rewards/thk_ans_format_reward": 1.0, "step": 2489, "think_completion_length": 10.583333333333334 }, { "clip_ratio": 0.0, "completion_length": 191.4479217529297, "epoch": 8.411467116357505, "grad_norm": 17.164425299435482, "kl": 0.57421875, "learning_rate": 2.989864864864865e-07, "loss": 0.0006, "reward": 3.4151976108551025, "reward_std": 0.05222295597195625, "rewards/final_reward": 1.0455022682978328, "rewards/mask_iou_reward": 0.5227511341489164, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.415197730064392, "rewards/thk_ans_format_reward": 1.0, "step": 2490, "think_completion_length": 8.208333333333334 }, { "clip_ratio": 0.0, "completion_length": 151.40625762939453, "epoch": 8.414839797639123, "grad_norm": 9.427976224812596, "kl": 0.4296875, "learning_rate": 2.9870495495495497e-07, "loss": 0.0004, "reward": 3.4724154472351074, "reward_std": 0.02874742913991213, "rewards/final_reward": 1.2115602872674582, "rewards/mask_iou_reward": 0.6057801436337291, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.472415566444397, "rewards/thk_ans_format_reward": 1.0, "step": 2491, "think_completion_length": 8.458333333333334 }, { "clip_ratio": 0.0, "completion_length": 191.8125, "epoch": 8.418212478920742, "grad_norm": 9.332438234253871, "kl": 0.40625, "learning_rate": 2.9842342342342343e-07, "loss": 0.0004, "reward": 3.3823347091674805, "reward_std": 0.08106222376227379, "rewards/final_reward": 1.2735577478572697, "rewards/mask_iou_reward": 0.6367788739286349, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3823343515396118, "rewards/thk_ans_format_reward": 1.0, "step": 2492, "think_completion_length": 8.291666666666666 }, { "clip_ratio": 0.0, "completion_length": 184.58333587646484, "epoch": 8.42158516020236, "grad_norm": 5.7462511768993645, "kl": 0.431640625, "learning_rate": 2.981418918918919e-07, "loss": 0.0004, "reward": 3.6817235946655273, "reward_std": 0.07646342925727367, "rewards/final_reward": 1.6200802844913902, "rewards/mask_iou_reward": 0.8100401422456951, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6817233562469482, "rewards/thk_ans_format_reward": 1.0, "step": 2493, "think_completion_length": 8.333333333333332 }, { "clip_ratio": 0.0, "completion_length": 180.8854217529297, "epoch": 8.42495784148398, "grad_norm": 18.36974505848094, "kl": 0.49609375, "learning_rate": 2.9786036036036035e-07, "loss": 0.0005, "reward": 3.3335955142974854, "reward_std": 0.0239328695461154, "rewards/final_reward": 0.4093485643823379, "rewards/mask_iou_reward": 0.20467428219116895, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3335955739021301, "rewards/thk_ans_format_reward": 1.0, "step": 2494, "think_completion_length": 9.041666666666668 }, { "clip_ratio": 0.0, "completion_length": 168.98958587646484, "epoch": 8.428330522765599, "grad_norm": 14.168116351084729, "kl": 0.4853515625, "learning_rate": 2.9757882882882886e-07, "loss": 0.0005, "reward": 3.652899146080017, "reward_std": 0.06641834788024426, "rewards/final_reward": 1.578465960049993, "rewards/mask_iou_reward": 0.7892329800249965, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.652899146080017, "rewards/thk_ans_format_reward": 1.0, "step": 2495, "think_completion_length": 8.541666666666666 }, { "clip_ratio": 0.0, "completion_length": 205.3229217529297, "epoch": 8.431703204047217, "grad_norm": 9.093427773961716, "kl": 0.3779296875, "learning_rate": 2.972972972972973e-07, "loss": 0.0004, "reward": 3.5432502031326294, "reward_std": 0.06956898421049118, "rewards/final_reward": 0.9596189913403599, "rewards/mask_iou_reward": 0.47980949567017994, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5432501435279846, "rewards/thk_ans_format_reward": 1.0, "step": 2496, "think_completion_length": 8.75 }, { "clip_ratio": 0.0, "completion_length": 180.4895896911621, "epoch": 8.435075885328837, "grad_norm": 8.366250293662942, "kl": 0.44921875, "learning_rate": 2.9701576576576573e-07, "loss": 0.0005, "reward": 3.49937105178833, "reward_std": 0.036435868591070175, "rewards/final_reward": 1.5941931373008824, "rewards/mask_iou_reward": 0.7970965686504412, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.499370813369751, "rewards/thk_ans_format_reward": 1.0, "step": 2497, "think_completion_length": 8.958333333333332 }, { "clip_ratio": 0.0, "completion_length": 163.8541717529297, "epoch": 8.438448566610456, "grad_norm": 100.97623268945327, "kl": 0.50390625, "learning_rate": 2.967342342342342e-07, "loss": 0.0005, "reward": 3.4538137912750244, "reward_std": 0.13186132721602917, "rewards/final_reward": 1.4109515061725602, "rewards/mask_iou_reward": 0.7054757530862801, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4538137316703796, "rewards/thk_ans_format_reward": 1.0, "step": 2498, "think_completion_length": 8.666666666666668 }, { "clip_ratio": 0.0, "completion_length": 186.8125, "epoch": 8.441821247892074, "grad_norm": 9.866319728457622, "kl": 0.44921875, "learning_rate": 2.9645270270270265e-07, "loss": 0.0004, "reward": 3.690119504928589, "reward_std": 0.06055077165365219, "rewards/final_reward": 1.778394397487585, "rewards/mask_iou_reward": 0.8891971987437925, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.690119445323944, "rewards/thk_ans_format_reward": 1.0, "step": 2499, "think_completion_length": 7.666666666666667 }, { "clip_ratio": 0.0, "completion_length": 225.7604217529297, "epoch": 8.445193929173692, "grad_norm": 14.757207246280633, "kl": 0.421875, "learning_rate": 2.961711711711711e-07, "loss": 0.0004, "reward": 3.51715350151062, "reward_std": 0.1072116307914257, "rewards/final_reward": 1.4594061525948856, "rewards/mask_iou_reward": 0.7297030762974428, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5171534419059753, "rewards/thk_ans_format_reward": 1.0, "step": 2500, "think_completion_length": 8.875 }, { "clip_ratio": 0.0, "completion_length": 167.4479217529297, "epoch": 8.448566610455313, "grad_norm": 10.616897251939372, "kl": 3.1396484375, "learning_rate": 2.9588963963963963e-07, "loss": 0.0031, "reward": 3.537464737892151, "reward_std": 0.08383292891085148, "rewards/final_reward": 1.5480614030024484, "rewards/mask_iou_reward": 0.7740307015012242, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.537464678287506, "rewards/thk_ans_format_reward": 1.0, "step": 2501, "think_completion_length": 9.208333333333332 }, { "clip_ratio": 0.0, "completion_length": 127.98958969116211, "epoch": 8.451939291736931, "grad_norm": 8.944695462699952, "kl": 0.533203125, "learning_rate": 2.956081081081081e-07, "loss": 0.0005, "reward": 3.5444079637527466, "reward_std": 0.08628739230334759, "rewards/final_reward": 1.2522563293073279, "rewards/mask_iou_reward": 0.6261281646536639, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5444077253341675, "rewards/thk_ans_format_reward": 1.0, "step": 2502, "think_completion_length": 8.958333333333332 }, { "clip_ratio": 0.0, "completion_length": 188.7916717529297, "epoch": 8.45531197301855, "grad_norm": 18.813662646472718, "kl": 0.5625, "learning_rate": 2.9532657657657655e-07, "loss": 0.0006, "reward": 3.4374442100524902, "reward_std": 0.10433689411729574, "rewards/final_reward": 1.61969842826039, "rewards/mask_iou_reward": 0.809849214130195, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4374441504478455, "rewards/thk_ans_format_reward": 1.0, "step": 2503, "think_completion_length": 7.791666666666666 }, { "clip_ratio": 0.0, "completion_length": 164.52083587646484, "epoch": 8.45868465430017, "grad_norm": 14.633252688172275, "kl": 0.4501953125, "learning_rate": 2.95045045045045e-07, "loss": 0.0005, "reward": 3.606281876564026, "reward_std": 0.04830903559923172, "rewards/final_reward": 1.6475285395482937, "rewards/mask_iou_reward": 0.8237642697741469, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6062817573547363, "rewards/thk_ans_format_reward": 1.0, "step": 2504, "think_completion_length": 9.25 }, { "clip_ratio": 0.0, "completion_length": 263.0104217529297, "epoch": 8.462057335581788, "grad_norm": 24.03846680399155, "kl": 0.52734375, "learning_rate": 2.9476351351351347e-07, "loss": 0.0005, "reward": 3.4330817461013794, "reward_std": 0.22954870760440826, "rewards/final_reward": 1.5019694669903112, "rewards/mask_iou_reward": 0.7509847334951556, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 1.453914999961853, "rewards/thk_ans_format_reward": 0.9895833432674408, "step": 2505, "think_completion_length": 9.916666666666668 }, { "clip_ratio": 0.0, "completion_length": 195.20833587646484, "epoch": 8.465430016863406, "grad_norm": 13.402525176205542, "kl": 0.47265625, "learning_rate": 2.94481981981982e-07, "loss": 0.0005, "reward": 3.840874671936035, "reward_std": 0.068333032540977, "rewards/final_reward": 1.80296798970013, "rewards/mask_iou_reward": 0.901483994850065, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.8408745527267456, "rewards/thk_ans_format_reward": 1.0, "step": 2506, "think_completion_length": 8.625 }, { "clip_ratio": 0.0, "completion_length": 177.43750762939453, "epoch": 8.468802698145025, "grad_norm": 9.997875936893529, "kl": 0.455078125, "learning_rate": 2.9420045045045045e-07, "loss": 0.0005, "reward": 3.3141136169433594, "reward_std": 0.06572789885103703, "rewards/final_reward": 1.3835898973357565, "rewards/mask_iou_reward": 0.6917949486678783, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3141136169433594, "rewards/thk_ans_format_reward": 1.0, "step": 2507, "think_completion_length": 8.916666666666666 }, { "clip_ratio": 0.0, "completion_length": 189.34375762939453, "epoch": 8.472175379426645, "grad_norm": 14.379412047558572, "kl": 0.455078125, "learning_rate": 2.939189189189189e-07, "loss": 0.0005, "reward": 3.2447589635849, "reward_std": 0.06407011300325394, "rewards/final_reward": 0.7249371697291964, "rewards/mask_iou_reward": 0.3624685848645982, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2447589635849, "rewards/thk_ans_format_reward": 1.0, "step": 2508, "think_completion_length": 9.083333333333332 }, { "clip_ratio": 0.0, "completion_length": 191.9166717529297, "epoch": 8.475548060708263, "grad_norm": 10.626344525000441, "kl": 0.447265625, "learning_rate": 2.9363738738738737e-07, "loss": 0.0004, "reward": 3.5443687438964844, "reward_std": 0.011294094379991293, "rewards/final_reward": 0.8410213537308809, "rewards/mask_iou_reward": 0.42051067686544047, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.544368863105774, "rewards/thk_ans_format_reward": 1.0, "step": 2509, "think_completion_length": 9.0 }, { "clip_ratio": 0.0, "completion_length": 147.56250762939453, "epoch": 8.478920741989882, "grad_norm": 141.9077108977809, "kl": 0.5576171875, "learning_rate": 2.9335585585585583e-07, "loss": 0.0006, "reward": 3.2660834789276123, "reward_std": 0.027983209118247032, "rewards/final_reward": 1.5384706716644456, "rewards/mask_iou_reward": 0.7692353358322228, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2660833597183228, "rewards/thk_ans_format_reward": 1.0, "step": 2510, "think_completion_length": 8.25 }, { "clip_ratio": 0.0, "completion_length": 224.2604217529297, "epoch": 8.4822934232715, "grad_norm": 7.13545711592528, "kl": 0.39453125, "learning_rate": 2.9307432432432434e-07, "loss": 0.0004, "reward": 3.448287844657898, "reward_std": 0.07316517271101475, "rewards/final_reward": 1.509025737794106, "rewards/mask_iou_reward": 0.754512868897053, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.448287844657898, "rewards/thk_ans_format_reward": 1.0, "step": 2511, "think_completion_length": 8.208333333333334 }, { "clip_ratio": 0.0, "completion_length": 157.8854217529297, "epoch": 8.48566610455312, "grad_norm": 14.17306525874049, "kl": 0.4873046875, "learning_rate": 2.927927927927928e-07, "loss": 0.0005, "reward": 3.374244809150696, "reward_std": 0.042572012171149254, "rewards/final_reward": 1.4661105339791107, "rewards/mask_iou_reward": 0.7330552669895554, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3742445707321167, "rewards/thk_ans_format_reward": 1.0, "step": 2512, "think_completion_length": 8.375 }, { "clip_ratio": 0.0, "completion_length": 164.59375381469727, "epoch": 8.489038785834738, "grad_norm": 8.979587153252908, "kl": 0.49609375, "learning_rate": 2.9251126126126126e-07, "loss": 0.0005, "reward": 3.571820020675659, "reward_std": 0.06202232651412487, "rewards/final_reward": 1.7060769163035236, "rewards/mask_iou_reward": 0.8530384581517618, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5718199014663696, "rewards/thk_ans_format_reward": 1.0, "step": 2513, "think_completion_length": 8.791666666666668 }, { "clip_ratio": 0.0, "completion_length": 141.0208396911621, "epoch": 8.492411467116357, "grad_norm": 17.016517968673256, "kl": 0.521484375, "learning_rate": 2.922297297297297e-07, "loss": 0.0005, "reward": 3.4522387981414795, "reward_std": 0.08139899373054504, "rewards/final_reward": 1.6619377635910786, "rewards/mask_iou_reward": 0.8309688817955393, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4522387981414795, "rewards/thk_ans_format_reward": 1.0, "step": 2514, "think_completion_length": 7.708333333333333 }, { "clip_ratio": 0.0, "completion_length": 152.5416717529297, "epoch": 8.495784148397977, "grad_norm": 7.961331611236981, "kl": 0.59765625, "learning_rate": 2.919481981981982e-07, "loss": 0.0006, "reward": 3.4150949716567993, "reward_std": 0.0757363960146904, "rewards/final_reward": 1.639288865133204, "rewards/mask_iou_reward": 0.819644432566602, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4150949716567993, "rewards/thk_ans_format_reward": 1.0, "step": 2515, "think_completion_length": 10.291666666666668 }, { "clip_ratio": 0.0, "completion_length": 208.15625762939453, "epoch": 8.499156829679595, "grad_norm": 10.052498835832177, "kl": 0.4814453125, "learning_rate": 2.916666666666667e-07, "loss": 0.0005, "reward": 3.7526577711105347, "reward_std": 0.04326079413294792, "rewards/final_reward": 1.6635693312937825, "rewards/mask_iou_reward": 0.8317846656468912, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7526578307151794, "rewards/thk_ans_format_reward": 1.0, "step": 2516, "think_completion_length": 9.166666666666668 }, { "clip_ratio": 0.0, "completion_length": 175.25000762939453, "epoch": 8.502529510961214, "grad_norm": 8.869345441898515, "kl": 0.6494140625, "learning_rate": 2.913851351351351e-07, "loss": 0.0006, "reward": 3.7935723066329956, "reward_std": 0.08686716388911009, "rewards/final_reward": 1.8359953194848295, "rewards/mask_iou_reward": 0.9179976597424148, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7935723066329956, "rewards/thk_ans_format_reward": 1.0, "step": 2517, "think_completion_length": 8.166666666666666 }, { "clip_ratio": 0.0, "completion_length": 217.9479217529297, "epoch": 8.505902192242832, "grad_norm": 9.171440126815991, "kl": 0.3994140625, "learning_rate": 2.9110360360360357e-07, "loss": 0.0004, "reward": 3.4219143390655518, "reward_std": 0.1695508360862732, "rewards/final_reward": 1.7729887482617612, "rewards/mask_iou_reward": 0.8864943741308806, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 1.4531643986701965, "rewards/thk_ans_format_reward": 0.9791666865348816, "step": 2518, "think_completion_length": 8.541666666666666 }, { "clip_ratio": 0.0, "completion_length": 139.90625, "epoch": 8.509274873524452, "grad_norm": 22.261539602959235, "kl": 0.4921875, "learning_rate": 2.9082207207207203e-07, "loss": 0.0005, "reward": 3.3779337406158447, "reward_std": 0.1589372158050537, "rewards/final_reward": 1.3534446510088611, "rewards/mask_iou_reward": 0.6767223255044306, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3779336214065552, "rewards/thk_ans_format_reward": 1.0, "step": 2519, "think_completion_length": 8.375 }, { "clip_ratio": 0.0, "completion_length": 152.17708587646484, "epoch": 8.51264755480607, "grad_norm": 22.363042010028014, "kl": 0.470703125, "learning_rate": 2.905405405405405e-07, "loss": 0.0005, "reward": 3.648926615715027, "reward_std": 0.022691112011671066, "rewards/final_reward": 1.4853223530097646, "rewards/mask_iou_reward": 0.7426611765048823, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6489266157150269, "rewards/thk_ans_format_reward": 1.0, "step": 2520, "think_completion_length": 9.541666666666666 }, { "clip_ratio": 0.0, "completion_length": 139.4479217529297, "epoch": 8.516020236087689, "grad_norm": 139.0103838529845, "kl": 0.447265625, "learning_rate": 2.90259009009009e-07, "loss": 0.0004, "reward": 3.5983364582061768, "reward_std": 0.1036510244011879, "rewards/final_reward": 1.9440492694703408, "rewards/mask_iou_reward": 0.9720246347351704, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5983361601829529, "rewards/thk_ans_format_reward": 1.0, "step": 2521, "think_completion_length": 8.333333333333334 }, { "clip_ratio": 0.0, "completion_length": 163.05208587646484, "epoch": 8.51939291736931, "grad_norm": 6.94058567280915, "kl": 0.4609375, "learning_rate": 2.8997747747747746e-07, "loss": 0.0005, "reward": 3.624302864074707, "reward_std": 0.08075489476323128, "rewards/final_reward": 1.7019580353050605, "rewards/mask_iou_reward": 0.8509790176525303, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6243028044700623, "rewards/thk_ans_format_reward": 1.0, "step": 2522, "think_completion_length": 8.458333333333334 }, { "clip_ratio": 0.0, "completion_length": 169.17708587646484, "epoch": 8.522765598650928, "grad_norm": 10.212955282258918, "kl": 0.5166015625, "learning_rate": 2.896959459459459e-07, "loss": 0.0005, "reward": 3.6895445585250854, "reward_std": 0.03810789994895458, "rewards/final_reward": 1.8911533097852695, "rewards/mask_iou_reward": 0.9455766548926348, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.689544677734375, "rewards/thk_ans_format_reward": 1.0, "step": 2523, "think_completion_length": 8.458333333333334 }, { "clip_ratio": 0.0, "completion_length": 181.84375762939453, "epoch": 8.526138279932546, "grad_norm": 11.602663710945789, "kl": 0.4384765625, "learning_rate": 2.894144144144144e-07, "loss": 0.0004, "reward": 3.2343530654907227, "reward_std": 0.14465375151485205, "rewards/final_reward": 1.6413925943182874, "rewards/mask_iou_reward": 0.8206962971591437, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 1.2551867961883545, "rewards/thk_ans_format_reward": 0.9895833432674408, "step": 2524, "think_completion_length": 8.666666666666666 }, { "clip_ratio": 0.0, "completion_length": 158.40625762939453, "epoch": 8.529510961214164, "grad_norm": 161.6466819493176, "kl": 0.5830078125, "learning_rate": 2.8913288288288284e-07, "loss": 0.0006, "reward": 3.6265861988067627, "reward_std": 0.10466808825731277, "rewards/final_reward": 1.3112621176439139, "rewards/mask_iou_reward": 0.6556310588219569, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6265860795974731, "rewards/thk_ans_format_reward": 1.0, "step": 2525, "think_completion_length": 8.291666666666666 }, { "clip_ratio": 0.0, "completion_length": 218.86458587646484, "epoch": 8.532883642495785, "grad_norm": 7.11750737081999, "kl": 0.4521484375, "learning_rate": 2.8885135135135136e-07, "loss": 0.0005, "reward": 3.681373715400696, "reward_std": 0.049964262172579765, "rewards/final_reward": 1.7365305951596461, "rewards/mask_iou_reward": 0.8682652975798231, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6813737154006958, "rewards/thk_ans_format_reward": 1.0, "step": 2526, "think_completion_length": 9.083333333333334 }, { "clip_ratio": 0.0, "completion_length": 138.37500762939453, "epoch": 8.536256323777403, "grad_norm": 7.352675052372248, "kl": 0.6171875, "learning_rate": 2.885698198198198e-07, "loss": 0.0006, "reward": 3.810825228691101, "reward_std": 0.049029380083084106, "rewards/final_reward": 1.8610386605291138, "rewards/mask_iou_reward": 0.9305193302645569, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.8108253479003906, "rewards/thk_ans_format_reward": 1.0, "step": 2527, "think_completion_length": 7.833333333333333 }, { "clip_ratio": 0.0, "completion_length": 188.25000762939453, "epoch": 8.539629005059021, "grad_norm": 10.908191542281262, "kl": 0.443359375, "learning_rate": 2.882882882882883e-07, "loss": 0.0004, "reward": 3.490597367286682, "reward_std": 0.07156710140407085, "rewards/final_reward": 0.9468506256295272, "rewards/mask_iou_reward": 0.4734253128147636, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4905971884727478, "rewards/thk_ans_format_reward": 1.0, "step": 2528, "think_completion_length": 8.666666666666666 }, { "clip_ratio": 0.0, "completion_length": 174.59375762939453, "epoch": 8.543001686340641, "grad_norm": 11.045030765018707, "kl": 0.4306640625, "learning_rate": 2.8800675675675674e-07, "loss": 0.0004, "reward": 3.5810824632644653, "reward_std": 0.07197471894323826, "rewards/final_reward": 1.613765532250191, "rewards/mask_iou_reward": 0.8068827661250955, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5810824632644653, "rewards/thk_ans_format_reward": 1.0, "step": 2529, "think_completion_length": 8.5 }, { "clip_ratio": 0.0, "completion_length": 175.0416717529297, "epoch": 8.54637436762226, "grad_norm": 14.456941600396206, "kl": 0.458984375, "learning_rate": 2.877252252252252e-07, "loss": 0.0005, "reward": 3.5833380222320557, "reward_std": 0.06957448460161686, "rewards/final_reward": 1.878485730364916, "rewards/mask_iou_reward": 0.939242865182458, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5833380222320557, "rewards/thk_ans_format_reward": 1.0, "step": 2530, "think_completion_length": 9.291666666666668 }, { "clip_ratio": 0.0, "completion_length": 123.53125, "epoch": 8.549747048903878, "grad_norm": 12.664397632704164, "kl": 0.4677734375, "learning_rate": 2.874436936936937e-07, "loss": 0.0005, "reward": 3.670201539993286, "reward_std": 0.023576030042022467, "rewards/final_reward": 1.4009314958362795, "rewards/mask_iou_reward": 0.7004657479181398, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6702014803886414, "rewards/thk_ans_format_reward": 1.0, "step": 2531, "think_completion_length": 8.416666666666668 }, { "clip_ratio": 0.0, "completion_length": 155.9791717529297, "epoch": 8.553119730185497, "grad_norm": 18.977288893600278, "kl": 0.4638671875, "learning_rate": 2.871621621621622e-07, "loss": 0.0005, "reward": 3.491969347000122, "reward_std": 0.059009552001953125, "rewards/final_reward": 1.1136705530124207, "rewards/mask_iou_reward": 0.5568352765062103, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4919692277908325, "rewards/thk_ans_format_reward": 1.0, "step": 2532, "think_completion_length": 7.916666666666666 }, { "clip_ratio": 0.0, "completion_length": 165.7291717529297, "epoch": 8.556492411467117, "grad_norm": 9.423306246424612, "kl": 0.498046875, "learning_rate": 2.8688063063063063e-07, "loss": 0.0005, "reward": 3.7101043462753296, "reward_std": 0.015714637003839016, "rewards/final_reward": 1.8516040221880408, "rewards/mask_iou_reward": 0.9258020110940204, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7101043462753296, "rewards/thk_ans_format_reward": 1.0, "step": 2533, "think_completion_length": 8.208333333333334 }, { "clip_ratio": 0.0, "completion_length": 179.06250762939453, "epoch": 8.559865092748735, "grad_norm": 11.2280568629412, "kl": 0.4375, "learning_rate": 2.865990990990991e-07, "loss": 0.0004, "reward": 3.4110137224197388, "reward_std": 0.11291562020778656, "rewards/final_reward": 1.6769507866752837, "rewards/mask_iou_reward": 0.8384753933376419, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4110137224197388, "rewards/thk_ans_format_reward": 1.0, "step": 2534, "think_completion_length": 8.791666666666666 }, { "clip_ratio": 0.0, "completion_length": 167.6666717529297, "epoch": 8.563237774030354, "grad_norm": 13.206039611412244, "kl": 0.46875, "learning_rate": 2.8631756756756756e-07, "loss": 0.0005, "reward": 3.5476059913635254, "reward_std": 0.077615050598979, "rewards/final_reward": 1.3061237702838464, "rewards/mask_iou_reward": 0.6530618851419232, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5476059317588806, "rewards/thk_ans_format_reward": 1.0, "step": 2535, "think_completion_length": 8.125 }, { "clip_ratio": 0.0, "completion_length": 204.3854217529297, "epoch": 8.566610455311974, "grad_norm": 12.925283052005339, "kl": 0.3955078125, "learning_rate": 2.8603603603603607e-07, "loss": 0.0004, "reward": 3.677824378013611, "reward_std": 0.04426476452499628, "rewards/final_reward": 1.8621604391032591, "rewards/mask_iou_reward": 0.9310802195516296, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.677824318408966, "rewards/thk_ans_format_reward": 1.0, "step": 2536, "think_completion_length": 8.583333333333334 }, { "clip_ratio": 0.0, "completion_length": 178.96875762939453, "epoch": 8.569983136593592, "grad_norm": 26.031777808217022, "kl": 0.3955078125, "learning_rate": 2.857545045045045e-07, "loss": 0.0004, "reward": 3.6277259588241577, "reward_std": 0.07600187882781029, "rewards/final_reward": 1.5448248992903784, "rewards/mask_iou_reward": 0.7724124496451892, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.627725899219513, "rewards/thk_ans_format_reward": 1.0, "step": 2537, "think_completion_length": 7.791666666666666 }, { "clip_ratio": 0.0, "completion_length": 150.6354217529297, "epoch": 8.57335581787521, "grad_norm": 18.72130767765532, "kl": 0.419921875, "learning_rate": 2.8547297297297294e-07, "loss": 0.0005, "reward": 3.756251096725464, "reward_std": 0.05335315503180027, "rewards/final_reward": 1.4595373271868004, "rewards/mask_iou_reward": 0.7297686635934002, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.75625079870224, "rewards/thk_ans_format_reward": 1.0, "step": 2538, "think_completion_length": 8.291666666666666 }, { "clip_ratio": 0.0, "completion_length": 139.375, "epoch": 8.576728499156829, "grad_norm": 88.11159314067041, "kl": 0.6083984375, "learning_rate": 2.851914414414414e-07, "loss": 0.0006, "reward": 3.179754376411438, "reward_std": 0.13116220384836197, "rewards/final_reward": 0.8943421914303193, "rewards/mask_iou_reward": 0.44717109571515967, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1797543168067932, "rewards/thk_ans_format_reward": 1.0, "step": 2539, "think_completion_length": 8.5 }, { "clip_ratio": 0.0, "completion_length": 261.17708587646484, "epoch": 8.580101180438449, "grad_norm": 5.732811017543621, "kl": 0.375, "learning_rate": 2.8490990990990986e-07, "loss": 0.0004, "reward": 3.658676028251648, "reward_std": 0.11672421544790268, "rewards/final_reward": 1.626043902889117, "rewards/mask_iou_reward": 0.8130219514445585, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6690927743911743, "rewards/thk_ans_format_reward": 0.9895833432674408, "step": 2540, "think_completion_length": 8.416666666666666 }, { "clip_ratio": 0.0, "completion_length": 118.67708969116211, "epoch": 8.583473861720067, "grad_norm": 66.56071625946628, "kl": 0.7890625, "learning_rate": 2.8462837837837837e-07, "loss": 0.0008, "reward": 3.4864169359207153, "reward_std": 0.1309407837688923, "rewards/final_reward": 1.6494711168698228, "rewards/mask_iou_reward": 0.8247355584349114, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4864166378974915, "rewards/thk_ans_format_reward": 1.0, "step": 2541, "think_completion_length": 10.75 }, { "clip_ratio": 0.0, "completion_length": 154.3541717529297, "epoch": 8.586846543001686, "grad_norm": 8.568815174969663, "kl": 0.470703125, "learning_rate": 2.8434684684684683e-07, "loss": 0.0005, "reward": 3.5493093729019165, "reward_std": 0.08573894761502743, "rewards/final_reward": 1.477603366021547, "rewards/mask_iou_reward": 0.7388016830107735, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5493090152740479, "rewards/thk_ans_format_reward": 1.0, "step": 2542, "think_completion_length": 8.791666666666668 }, { "clip_ratio": 0.0, "completion_length": 201.17709350585938, "epoch": 8.590219224283306, "grad_norm": 7.298574233572978, "kl": 0.4609375, "learning_rate": 2.840653153153153e-07, "loss": 0.0005, "reward": 3.6566131114959717, "reward_std": 0.11266613006591797, "rewards/final_reward": 1.5176401166274118, "rewards/mask_iou_reward": 0.7588200583137059, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6566131711006165, "rewards/thk_ans_format_reward": 1.0, "step": 2543, "think_completion_length": 8.583333333333334 }, { "clip_ratio": 0.0, "completion_length": 156.14583587646484, "epoch": 8.593591905564924, "grad_norm": 26.55636292637615, "kl": 0.5673828125, "learning_rate": 2.8378378378378376e-07, "loss": 0.0006, "reward": 3.783558487892151, "reward_std": 0.042043750174343586, "rewards/final_reward": 1.9065222306385974, "rewards/mask_iou_reward": 0.9532611153192987, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7835585474967957, "rewards/thk_ans_format_reward": 1.0, "step": 2544, "think_completion_length": 8.583333333333332 }, { "clip_ratio": 0.0, "completion_length": 188.65625762939453, "epoch": 8.596964586846543, "grad_norm": 12.844963607007417, "kl": 0.77734375, "learning_rate": 2.835022522522522e-07, "loss": 0.0008, "reward": 3.539679527282715, "reward_std": 0.04935073805972934, "rewards/final_reward": 1.8597483490103142, "rewards/mask_iou_reward": 0.9298741745051571, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5396792888641357, "rewards/thk_ans_format_reward": 1.0, "step": 2545, "think_completion_length": 8.708333333333332 }, { "clip_ratio": 0.0, "completion_length": 196.8854217529297, "epoch": 8.600337268128161, "grad_norm": 45.98019538397138, "kl": 0.5439453125, "learning_rate": 2.8322072072072073e-07, "loss": 0.0005, "reward": 3.5690609216690063, "reward_std": 0.04167831316590309, "rewards/final_reward": 1.0290717329363943, "rewards/mask_iou_reward": 0.5145358664681972, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5690608620643616, "rewards/thk_ans_format_reward": 1.0, "step": 2546, "think_completion_length": 8.583333333333334 }, { "clip_ratio": 0.0, "completion_length": 245.42708587646484, "epoch": 8.603709949409781, "grad_norm": 6.232417525749554, "kl": 0.4443359375, "learning_rate": 2.829391891891892e-07, "loss": 0.0005, "reward": 3.1595572233200073, "reward_std": 0.10324277426116168, "rewards/final_reward": 1.074434420208405, "rewards/mask_iou_reward": 0.5372172101042025, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 1.180390626192093, "rewards/thk_ans_format_reward": 0.9895833432674408, "step": 2547, "think_completion_length": 8.291666666666666 }, { "clip_ratio": 0.0, "completion_length": 177.84375, "epoch": 8.6070826306914, "grad_norm": 42.12515218147006, "kl": 0.455078125, "learning_rate": 2.8265765765765765e-07, "loss": 0.0005, "reward": 3.5446321964263916, "reward_std": 0.08529717102646828, "rewards/final_reward": 1.8174893080345567, "rewards/mask_iou_reward": 0.9087446540172783, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5446322560310364, "rewards/thk_ans_format_reward": 1.0, "step": 2548, "think_completion_length": 8.583333333333332 }, { "clip_ratio": 0.0, "completion_length": 192.12500762939453, "epoch": 8.610455311973018, "grad_norm": 9.854312839397398, "kl": 0.408203125, "learning_rate": 2.823761261261261e-07, "loss": 0.0004, "reward": 3.767038345336914, "reward_std": 0.06215832382440567, "rewards/final_reward": 1.904067015100479, "rewards/mask_iou_reward": 0.9520335075502395, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.767038345336914, "rewards/thk_ans_format_reward": 1.0, "step": 2549, "think_completion_length": 8.541666666666666 }, { "clip_ratio": 0.0, "completion_length": 148.38541793823242, "epoch": 8.613827993254638, "grad_norm": 6.976886758251197, "kl": 0.568359375, "learning_rate": 2.8209459459459457e-07, "loss": 0.0006, "reward": 3.729358434677124, "reward_std": 0.08664998784661293, "rewards/final_reward": 1.9594386198347054, "rewards/mask_iou_reward": 0.9797193099173527, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7293584942817688, "rewards/thk_ans_format_reward": 1.0, "step": 2550, "think_completion_length": 12.625 }, { "clip_ratio": 0.0, "completion_length": 157.83333587646484, "epoch": 8.617200674536257, "grad_norm": 6.5393567285906995, "kl": 0.43359375, "learning_rate": 2.818130630630631e-07, "loss": 0.0005, "reward": 3.4602993726730347, "reward_std": 0.168155737221241, "rewards/final_reward": 1.905844650653056, "rewards/mask_iou_reward": 0.952922325326528, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 1.470715880393982, "rewards/thk_ans_format_reward": 1.0, "step": 2551, "think_completion_length": 8.666666666666666 }, { "clip_ratio": 0.0, "completion_length": 172.73959350585938, "epoch": 8.620573355817875, "grad_norm": 11.85388614199403, "kl": 0.455078125, "learning_rate": 2.8153153153153155e-07, "loss": 0.0005, "reward": 3.556620478630066, "reward_std": 0.061766088008880615, "rewards/final_reward": 1.7875575724226098, "rewards/mask_iou_reward": 0.8937787862113049, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5566202998161316, "rewards/thk_ans_format_reward": 1.0, "step": 2552, "think_completion_length": 9.125 }, { "clip_ratio": 0.0, "completion_length": 167.0104217529297, "epoch": 8.623946037099493, "grad_norm": 10.547197211481848, "kl": 0.4208984375, "learning_rate": 2.8125e-07, "loss": 0.0004, "reward": 3.4441052675247192, "reward_std": 0.08604156225919724, "rewards/final_reward": 1.3584484672443913, "rewards/mask_iou_reward": 0.6792242336221956, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4441050291061401, "rewards/thk_ans_format_reward": 1.0, "step": 2553, "think_completion_length": 9.166666666666666 }, { "clip_ratio": 0.0, "completion_length": 189.59375762939453, "epoch": 8.627318718381114, "grad_norm": 18.734391005705923, "kl": 0.5078125, "learning_rate": 2.8096846846846847e-07, "loss": 0.0005, "reward": 3.513301134109497, "reward_std": 0.07055116072297096, "rewards/final_reward": 1.6922842308381596, "rewards/mask_iou_reward": 0.8461421154190798, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5133010149002075, "rewards/thk_ans_format_reward": 1.0, "step": 2554, "think_completion_length": 11.166666666666668 }, { "clip_ratio": 0.0, "completion_length": 191.9791717529297, "epoch": 8.630691399662732, "grad_norm": 26.468379779375635, "kl": 0.62109375, "learning_rate": 2.8068693693693693e-07, "loss": 0.0006, "reward": 3.3101227283477783, "reward_std": 0.2818397730588913, "rewards/final_reward": 1.654746762295793, "rewards/mask_iou_reward": 0.8273733811478965, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 1.3309558629989624, "rewards/thk_ans_format_reward": 0.9895833432674408, "step": 2555, "think_completion_length": 9.625 }, { "clip_ratio": 0.0, "completion_length": 189.8229217529297, "epoch": 8.63406408094435, "grad_norm": 5.290031724177615, "kl": 0.5205078125, "learning_rate": 2.804054054054054e-07, "loss": 0.0005, "reward": 3.3882901668548584, "reward_std": 0.04036908410489559, "rewards/final_reward": 1.7354911968642042, "rewards/mask_iou_reward": 0.8677455984321021, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.388290286064148, "rewards/thk_ans_format_reward": 1.0, "step": 2556, "think_completion_length": 7.458333333333334 }, { "clip_ratio": 0.0, "completion_length": 164.7916717529297, "epoch": 8.63743676222597, "grad_norm": 15.860539572750685, "kl": 0.4921875, "learning_rate": 2.8012387387387385e-07, "loss": 0.0005, "reward": 3.6285096406936646, "reward_std": 0.09304303559474647, "rewards/final_reward": 1.8183068185832045, "rewards/mask_iou_reward": 0.9091534092916023, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6285096406936646, "rewards/thk_ans_format_reward": 1.0, "step": 2557, "think_completion_length": 8.791666666666668 }, { "clip_ratio": 0.0, "completion_length": 166.1041717529297, "epoch": 8.640809443507589, "grad_norm": 16.90640893768875, "kl": 0.435546875, "learning_rate": 2.798423423423423e-07, "loss": 0.0005, "reward": 3.600364923477173, "reward_std": 0.056407464668154716, "rewards/final_reward": 1.4578039481564466, "rewards/mask_iou_reward": 0.7289019740782233, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6003649234771729, "rewards/thk_ans_format_reward": 1.0, "step": 2558, "think_completion_length": 8.875 }, { "clip_ratio": 0.0, "completion_length": 143.34375762939453, "epoch": 8.644182124789207, "grad_norm": 11.810250822733689, "kl": 0.458984375, "learning_rate": 2.7956081081081077e-07, "loss": 0.0005, "reward": 3.52646803855896, "reward_std": 0.05152006074786186, "rewards/final_reward": 0.9467531404162066, "rewards/mask_iou_reward": 0.4733765702081033, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5264679193496704, "rewards/thk_ans_format_reward": 1.0, "step": 2559, "think_completion_length": 8.958333333333334 }, { "clip_ratio": 0.0, "completion_length": 151.28125762939453, "epoch": 8.647554806070826, "grad_norm": 5.154818837939803, "kl": 0.4912109375, "learning_rate": 2.7927927927927923e-07, "loss": 0.0006, "reward": 3.6589834690093994, "reward_std": 0.04206683021038771, "rewards/final_reward": 1.6065023994965042, "rewards/mask_iou_reward": 0.8032511997482521, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6589834094047546, "rewards/thk_ans_format_reward": 1.0, "step": 2560, "think_completion_length": 10.125 }, { "clip_ratio": 0.0, "completion_length": 173.4479217529297, "epoch": 8.650927487352446, "grad_norm": 16.38009193888725, "kl": 0.587890625, "learning_rate": 2.7899774774774775e-07, "loss": 0.0006, "reward": 3.7777464389801025, "reward_std": 0.029515139758586884, "rewards/final_reward": 1.896249769299833, "rewards/mask_iou_reward": 0.9481248846499165, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.777746319770813, "rewards/thk_ans_format_reward": 1.0, "step": 2561, "think_completion_length": 9.083333333333332 }, { "clip_ratio": 0.0, "completion_length": 141.3020896911621, "epoch": 8.654300168634064, "grad_norm": 9.35187219045533, "kl": 0.482421875, "learning_rate": 2.787162162162162e-07, "loss": 0.0005, "reward": 3.4067482948303223, "reward_std": 0.11706292629241943, "rewards/final_reward": 1.073244567798334, "rewards/mask_iou_reward": 0.536622283899167, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4067482352256775, "rewards/thk_ans_format_reward": 1.0, "step": 2562, "think_completion_length": 8.125 }, { "clip_ratio": 0.0, "completion_length": 213.65625762939453, "epoch": 8.657672849915683, "grad_norm": 6.688008058376227, "kl": 0.453125, "learning_rate": 2.7843468468468467e-07, "loss": 0.0005, "reward": 3.7152053117752075, "reward_std": 0.023756575770676136, "rewards/final_reward": 1.8820338318907854, "rewards/mask_iou_reward": 0.9410169159453927, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7152053117752075, "rewards/thk_ans_format_reward": 1.0, "step": 2563, "think_completion_length": 9.041666666666666 }, { "clip_ratio": 0.0, "completion_length": 156.6979217529297, "epoch": 8.661045531197303, "grad_norm": 34.29633067802778, "kl": 0.427734375, "learning_rate": 2.7815315315315313e-07, "loss": 0.0006, "reward": 3.733788013458252, "reward_std": 0.02422420820221305, "rewards/final_reward": 1.9275858243600428, "rewards/mask_iou_reward": 0.9637929121800214, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.733788013458252, "rewards/thk_ans_format_reward": 1.0, "step": 2564, "think_completion_length": 8.041666666666668 }, { "clip_ratio": 0.0, "completion_length": 189.1354217529297, "epoch": 8.664418212478921, "grad_norm": 12.887212200092963, "kl": 0.4501953125, "learning_rate": 2.778716216216216e-07, "loss": 0.0005, "reward": 3.491172194480896, "reward_std": 0.07623483892530203, "rewards/final_reward": 1.2990106052827048, "rewards/mask_iou_reward": 0.6495053026413524, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4911721348762512, "rewards/thk_ans_format_reward": 1.0, "step": 2565, "think_completion_length": 8.708333333333334 }, { "clip_ratio": 0.0, "completion_length": 218.6979217529297, "epoch": 8.66779089376054, "grad_norm": 12.075436890130483, "kl": 0.4765625, "learning_rate": 2.775900900900901e-07, "loss": 0.0005, "reward": 3.7700542211532593, "reward_std": 0.09697945602238178, "rewards/final_reward": 1.9443128383259172, "rewards/mask_iou_reward": 0.9721564191629586, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7700544595718384, "rewards/thk_ans_format_reward": 1.0, "step": 2566, "think_completion_length": 10.083333333333334 }, { "clip_ratio": 0.0, "completion_length": 208.09375, "epoch": 8.671163575042158, "grad_norm": 7.363763311581606, "kl": 0.43359375, "learning_rate": 2.7730855855855856e-07, "loss": 0.0004, "reward": 3.4510533809661865, "reward_std": 0.042235566303133965, "rewards/final_reward": 1.744057022524851, "rewards/mask_iou_reward": 0.8720285112624255, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.451053500175476, "rewards/thk_ans_format_reward": 1.0, "step": 2567, "think_completion_length": 8.75 }, { "clip_ratio": 0.0, "completion_length": 152.34375762939453, "epoch": 8.674536256323778, "grad_norm": 13.697326391954496, "kl": 0.583984375, "learning_rate": 2.77027027027027e-07, "loss": 0.0006, "reward": 3.4283299446105957, "reward_std": 0.040075878612697124, "rewards/final_reward": 1.8370620019922566, "rewards/mask_iou_reward": 0.9185310009961283, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.42833012342453, "rewards/thk_ans_format_reward": 1.0, "step": 2568, "think_completion_length": 9.208333333333334 }, { "clip_ratio": 0.0, "completion_length": 186.9479217529297, "epoch": 8.677908937605396, "grad_norm": 11.546432521196792, "kl": 0.455078125, "learning_rate": 2.767454954954955e-07, "loss": 0.0005, "reward": 3.5929055213928223, "reward_std": 0.059562329202890396, "rewards/final_reward": 1.768698927073372, "rewards/mask_iou_reward": 0.884349463536686, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5929054021835327, "rewards/thk_ans_format_reward": 1.0, "step": 2569, "think_completion_length": 8.208333333333332 }, { "clip_ratio": 0.0, "completion_length": 185.3125, "epoch": 8.681281618887015, "grad_norm": 10.9044199056127, "kl": 0.390625, "learning_rate": 2.7646396396396394e-07, "loss": 0.0004, "reward": 3.2602450847625732, "reward_std": 0.03169908095151186, "rewards/final_reward": 1.8138352665157995, "rewards/mask_iou_reward": 0.9069176332578998, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2602448463439941, "rewards/thk_ans_format_reward": 1.0, "step": 2570, "think_completion_length": 8.5 }, { "clip_ratio": 0.0, "completion_length": 156.8541717529297, "epoch": 8.684654300168635, "grad_norm": 54.539015249753554, "kl": 0.5869140625, "learning_rate": 2.7618243243243246e-07, "loss": 0.0006, "reward": 3.5645973682403564, "reward_std": 0.11071610450744629, "rewards/final_reward": 0.8719106954026625, "rewards/mask_iou_reward": 0.43595534770133126, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5645974278450012, "rewards/thk_ans_format_reward": 1.0, "step": 2571, "think_completion_length": 8.791666666666668 }, { "clip_ratio": 0.0, "completion_length": 162.81250762939453, "epoch": 8.688026981450253, "grad_norm": 11.927673383869944, "kl": 0.408203125, "learning_rate": 2.759009009009009e-07, "loss": 0.0006, "reward": 3.6039984226226807, "reward_std": 0.10941345617175102, "rewards/final_reward": 0.9543782866716485, "rewards/mask_iou_reward": 0.47718914333582424, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6039982438087463, "rewards/thk_ans_format_reward": 1.0, "step": 2572, "think_completion_length": 8.916666666666668 }, { "clip_ratio": 0.0, "completion_length": 199.05208587646484, "epoch": 8.691399662731872, "grad_norm": 8.034050467478677, "kl": 0.595703125, "learning_rate": 2.756193693693694e-07, "loss": 0.0006, "reward": 3.7109304666519165, "reward_std": 0.033792685717344284, "rewards/final_reward": 1.7174564503363023, "rewards/mask_iou_reward": 0.8587282251681512, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7109304070472717, "rewards/thk_ans_format_reward": 1.0, "step": 2573, "think_completion_length": 8.625 }, { "clip_ratio": 0.0, "completion_length": 165.0104217529297, "epoch": 8.69477234401349, "grad_norm": 21.422356986248246, "kl": 0.623046875, "learning_rate": 2.7533783783783784e-07, "loss": 0.0006, "reward": 3.3546335697174072, "reward_std": 0.07038544863462448, "rewards/final_reward": 0.7904122383978938, "rewards/mask_iou_reward": 0.3952061191989469, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.354633629322052, "rewards/thk_ans_format_reward": 1.0, "step": 2574, "think_completion_length": 9.958333333333334 }, { "clip_ratio": 0.0, "completion_length": 219.8541717529297, "epoch": 8.69814502529511, "grad_norm": 9.696842487528931, "kl": 0.5712890625, "learning_rate": 2.7505630630630625e-07, "loss": 0.0006, "reward": 3.74173903465271, "reward_std": 0.04419276397675276, "rewards/final_reward": 1.784767222990809, "rewards/mask_iou_reward": 0.8923836114954045, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7417391538619995, "rewards/thk_ans_format_reward": 1.0, "step": 2575, "think_completion_length": 9.541666666666668 }, { "clip_ratio": 0.0, "completion_length": 225.3229217529297, "epoch": 8.701517706576729, "grad_norm": 9.956113940997724, "kl": 0.421875, "learning_rate": 2.7477477477477476e-07, "loss": 0.0004, "reward": 3.691778302192688, "reward_std": 0.03693321347236633, "rewards/final_reward": 1.8514881707197457, "rewards/mask_iou_reward": 0.9257440853598728, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6917780637741089, "rewards/thk_ans_format_reward": 1.0, "step": 2576, "think_completion_length": 9.75 }, { "clip_ratio": 0.0, "completion_length": 200.92709350585938, "epoch": 8.704890387858347, "grad_norm": 19.513661619610637, "kl": 0.47265625, "learning_rate": 2.744932432432432e-07, "loss": 0.0005, "reward": 3.6737890243530273, "reward_std": 0.03834380768239498, "rewards/final_reward": 1.7957259894180986, "rewards/mask_iou_reward": 0.8978629947090493, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6737890243530273, "rewards/thk_ans_format_reward": 1.0, "step": 2577, "think_completion_length": 9.375 }, { "clip_ratio": 0.0, "completion_length": 165.6666717529297, "epoch": 8.708263069139967, "grad_norm": 13.19141830442237, "kl": 0.484375, "learning_rate": 2.742117117117117e-07, "loss": 0.0005, "reward": 3.340681791305542, "reward_std": 0.12751448899507523, "rewards/final_reward": 1.8338263788167821, "rewards/mask_iou_reward": 0.9169131894083911, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3406816124916077, "rewards/thk_ans_format_reward": 1.0, "step": 2578, "think_completion_length": 8.666666666666668 }, { "clip_ratio": 0.0, "completion_length": 167.4166717529297, "epoch": 8.711635750421586, "grad_norm": 16.411047595909793, "kl": 0.4189453125, "learning_rate": 2.7393018018018014e-07, "loss": 0.0004, "reward": 3.5782195329666138, "reward_std": 0.08094577863812447, "rewards/final_reward": 1.4291149300643984, "rewards/mask_iou_reward": 0.7145574650321992, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5782194137573242, "rewards/thk_ans_format_reward": 1.0, "step": 2579, "think_completion_length": 8.583333333333332 }, { "clip_ratio": 0.0, "completion_length": 158.5625, "epoch": 8.715008431703204, "grad_norm": 16.802202255267115, "kl": 0.583984375, "learning_rate": 2.736486486486486e-07, "loss": 0.0006, "reward": 3.513985753059387, "reward_std": 0.13777944818139076, "rewards/final_reward": 1.2669673172997142, "rewards/mask_iou_reward": 0.6334836586498571, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5139857530593872, "rewards/thk_ans_format_reward": 1.0, "step": 2580, "think_completion_length": 10.333333333333332 }, { "clip_ratio": 0.0, "completion_length": 214.81250762939453, "epoch": 8.718381112984822, "grad_norm": 8.00176037412226, "kl": 0.4365234375, "learning_rate": 2.733671171171171e-07, "loss": 0.0005, "reward": 3.552944779396057, "reward_std": 0.09154192451387644, "rewards/final_reward": 1.4492299638504018, "rewards/mask_iou_reward": 0.7246149819252009, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5529447793960571, "rewards/thk_ans_format_reward": 1.0, "step": 2581, "think_completion_length": 10.625 }, { "clip_ratio": 0.0, "completion_length": 184.6666717529297, "epoch": 8.721753794266442, "grad_norm": 10.310261963697636, "kl": 0.42578125, "learning_rate": 2.730855855855856e-07, "loss": 0.0004, "reward": 3.181455135345459, "reward_std": 0.08033962082117796, "rewards/final_reward": 1.543839269099942, "rewards/mask_iou_reward": 0.771919634549971, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1814552545547485, "rewards/thk_ans_format_reward": 1.0, "step": 2582, "think_completion_length": 9.708333333333332 }, { "clip_ratio": 0.0, "completion_length": 178.28125, "epoch": 8.72512647554806, "grad_norm": 18.977015116224848, "kl": 0.421875, "learning_rate": 2.7280405405405404e-07, "loss": 0.0005, "reward": 3.833088994026184, "reward_std": 0.0317679438740015, "rewards/final_reward": 1.8962071656871986, "rewards/mask_iou_reward": 0.9481035828435993, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.833088994026184, "rewards/thk_ans_format_reward": 1.0, "step": 2583, "think_completion_length": 8.708333333333334 }, { "clip_ratio": 0.0, "completion_length": 201.1979217529297, "epoch": 8.72849915682968, "grad_norm": 6.04162016708347, "kl": 0.4169921875, "learning_rate": 2.725225225225225e-07, "loss": 0.0004, "reward": 3.3477823734283447, "reward_std": 0.08656807988882065, "rewards/final_reward": 1.2145620779352024, "rewards/mask_iou_reward": 0.6072810389676012, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3477822542190552, "rewards/thk_ans_format_reward": 1.0, "step": 2584, "think_completion_length": 9.0 }, { "clip_ratio": 0.0, "completion_length": 189.5937614440918, "epoch": 8.7318718381113, "grad_norm": 59.301908875851915, "kl": 0.447265625, "learning_rate": 2.7224099099099096e-07, "loss": 0.0004, "reward": 3.1889774799346924, "reward_std": 0.09784254245460033, "rewards/final_reward": 1.2470772716566363, "rewards/mask_iou_reward": 0.6235386358283181, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1889773607254028, "rewards/thk_ans_format_reward": 1.0, "step": 2585, "think_completion_length": 9.041666666666668 }, { "clip_ratio": 0.0, "completion_length": 189.39583587646484, "epoch": 8.735244519392918, "grad_norm": 11.564177217628647, "kl": 0.427734375, "learning_rate": 2.719594594594595e-07, "loss": 0.0004, "reward": 3.7254035472869873, "reward_std": 0.08181975595653057, "rewards/final_reward": 1.7824145137487353, "rewards/mask_iou_reward": 0.8912072568743676, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7254034876823425, "rewards/thk_ans_format_reward": 1.0, "step": 2586, "think_completion_length": 9.416666666666666 }, { "clip_ratio": 0.0, "completion_length": 164.1666717529297, "epoch": 8.738617200674536, "grad_norm": 9.045044514348094, "kl": 0.5498046875, "learning_rate": 2.7167792792792793e-07, "loss": 0.0006, "reward": 3.5254725217819214, "reward_std": 0.04110686667263508, "rewards/final_reward": 1.1279837635335666, "rewards/mask_iou_reward": 0.5639918817667833, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.525472342967987, "rewards/thk_ans_format_reward": 1.0, "step": 2587, "think_completion_length": 9.666666666666668 }, { "clip_ratio": 0.0, "completion_length": 170.6875, "epoch": 8.741989881956155, "grad_norm": 11.478877926068497, "kl": 0.71875, "learning_rate": 2.713963963963964e-07, "loss": 0.0007, "reward": 3.7656898498535156, "reward_std": 0.08955633267760277, "rewards/final_reward": 1.6858383612881407, "rewards/mask_iou_reward": 0.8429191806440703, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7656898498535156, "rewards/thk_ans_format_reward": 1.0, "step": 2588, "think_completion_length": 9.125 }, { "clip_ratio": 0.0, "completion_length": 201.18750762939453, "epoch": 8.745362563237775, "grad_norm": 16.235873553011174, "kl": 0.4306640625, "learning_rate": 2.7111486486486486e-07, "loss": 0.0004, "reward": 3.3747421503067017, "reward_std": 0.027278369292616844, "rewards/final_reward": 1.9198226083096936, "rewards/mask_iou_reward": 0.9599113041548468, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3747420907020569, "rewards/thk_ans_format_reward": 1.0, "step": 2589, "think_completion_length": 8.125 }, { "clip_ratio": 0.0, "completion_length": 128.64583587646484, "epoch": 8.748735244519393, "grad_norm": 10.47292339165554, "kl": 0.677734375, "learning_rate": 2.708333333333333e-07, "loss": 0.0007, "reward": 3.50990092754364, "reward_std": 0.06432923208922148, "rewards/final_reward": 0.9000077052200236, "rewards/mask_iou_reward": 0.4500038526100118, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5099010467529297, "rewards/thk_ans_format_reward": 1.0, "step": 2590, "think_completion_length": 9.708333333333332 }, { "clip_ratio": 0.0, "completion_length": 212.9791717529297, "epoch": 8.752107925801011, "grad_norm": 17.664010232266858, "kl": 0.408203125, "learning_rate": 2.7055180180180183e-07, "loss": 0.0004, "reward": 3.2131507396698, "reward_std": 0.12909862026572227, "rewards/final_reward": 1.7976802139126753, "rewards/mask_iou_reward": 0.8988401069563376, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2131509184837341, "rewards/thk_ans_format_reward": 1.0, "step": 2591, "think_completion_length": 9.083333333333334 }, { "clip_ratio": 0.0, "completion_length": 186.11459350585938, "epoch": 8.75548060708263, "grad_norm": 9.558168924239965, "kl": 0.548828125, "learning_rate": 2.702702702702703e-07, "loss": 0.0006, "reward": 3.4076467752456665, "reward_std": 0.03746108431369066, "rewards/final_reward": 1.3606280399295545, "rewards/mask_iou_reward": 0.6803140199647773, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4076467156410217, "rewards/thk_ans_format_reward": 1.0, "step": 2592, "think_completion_length": 9.0 }, { "clip_ratio": 0.0, "completion_length": 177.14584350585938, "epoch": 8.75885328836425, "grad_norm": 11.049869173104032, "kl": 0.435546875, "learning_rate": 2.6998873873873875e-07, "loss": 0.0004, "reward": 3.631982922554016, "reward_std": 0.048909788485616446, "rewards/final_reward": 1.874138768786148, "rewards/mask_iou_reward": 0.937069384393074, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6319828629493713, "rewards/thk_ans_format_reward": 1.0, "step": 2593, "think_completion_length": 10.333333333333332 }, { "clip_ratio": 0.0, "completion_length": 192.05208587646484, "epoch": 8.762225969645868, "grad_norm": 15.549417475269248, "kl": 0.494140625, "learning_rate": 2.697072072072072e-07, "loss": 0.0005, "reward": 3.5601353645324707, "reward_std": 0.048820956610143185, "rewards/final_reward": 1.745202148046324, "rewards/mask_iou_reward": 0.872601074023162, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5601353645324707, "rewards/thk_ans_format_reward": 1.0, "step": 2594, "think_completion_length": 9.958333333333332 }, { "clip_ratio": 0.0, "completion_length": 177.0104217529297, "epoch": 8.765598650927487, "grad_norm": 9.690853129284346, "kl": 0.4345703125, "learning_rate": 2.694256756756756e-07, "loss": 0.0004, "reward": 3.626412868499756, "reward_std": 0.020583651028573513, "rewards/final_reward": 1.554970225171132, "rewards/mask_iou_reward": 0.777485112585566, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6264130473136902, "rewards/thk_ans_format_reward": 1.0, "step": 2595, "think_completion_length": 8.375 }, { "clip_ratio": 0.0, "completion_length": 133.79166793823242, "epoch": 8.768971332209107, "grad_norm": 9.419266648136379, "kl": 0.494140625, "learning_rate": 2.6914414414414413e-07, "loss": 0.0006, "reward": 3.422475814819336, "reward_std": 0.03806304559111595, "rewards/final_reward": 1.8769513964133184, "rewards/mask_iou_reward": 0.9384756982066592, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4224757552146912, "rewards/thk_ans_format_reward": 1.0, "step": 2596, "think_completion_length": 9.666666666666668 }, { "clip_ratio": 0.0, "completion_length": 182.4270896911621, "epoch": 8.772344013490725, "grad_norm": 6.032356877197085, "kl": 0.5927734375, "learning_rate": 2.688626126126126e-07, "loss": 0.0006, "reward": 3.665559768676758, "reward_std": 0.031542010605335236, "rewards/final_reward": 1.7459385507383132, "rewards/mask_iou_reward": 0.8729692753691566, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6655599474906921, "rewards/thk_ans_format_reward": 1.0, "step": 2597, "think_completion_length": 8.375 }, { "clip_ratio": 0.0, "completion_length": 153.8541717529297, "epoch": 8.775716694772344, "grad_norm": 12.511204960457997, "kl": 0.4970703125, "learning_rate": 2.6858108108108105e-07, "loss": 0.0005, "reward": 3.4443455934524536, "reward_std": 0.13716903142631054, "rewards/final_reward": 1.4805604503427627, "rewards/mask_iou_reward": 0.7402802251713814, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 1.4547622203826904, "rewards/thk_ans_format_reward": 1.0, "step": 2598, "think_completion_length": 10.625 }, { "clip_ratio": 0.0, "completion_length": 177.1875, "epoch": 8.779089376053962, "grad_norm": 18.14749880774283, "kl": 0.5205078125, "learning_rate": 2.682995495495495e-07, "loss": 0.0005, "reward": 3.4348336458206177, "reward_std": 0.04534833878278732, "rewards/final_reward": 1.6291405471232325, "rewards/mask_iou_reward": 0.8145702735616163, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4348336458206177, "rewards/thk_ans_format_reward": 1.0, "step": 2599, "think_completion_length": 9.75 }, { "clip_ratio": 0.0, "completion_length": 155.5104217529297, "epoch": 8.782462057335582, "grad_norm": 20.271029674111745, "kl": 0.564453125, "learning_rate": 2.68018018018018e-07, "loss": 0.0006, "reward": 3.7123841047286987, "reward_std": 0.052057093009352684, "rewards/final_reward": 1.9473939414920798, "rewards/mask_iou_reward": 0.9736969707460399, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7123839259147644, "rewards/thk_ans_format_reward": 1.0, "step": 2600, "think_completion_length": 12.666666666666668 }, { "clip_ratio": 0.0, "completion_length": 169.6979217529297, "epoch": 8.7858347386172, "grad_norm": 19.986727941177254, "kl": 0.5400390625, "learning_rate": 2.677364864864865e-07, "loss": 0.0005, "reward": 3.5862083435058594, "reward_std": 0.07752817496657372, "rewards/final_reward": 1.8850161595394952, "rewards/mask_iou_reward": 0.9425080797697476, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5862082242965698, "rewards/thk_ans_format_reward": 1.0, "step": 2601, "think_completion_length": 10.208333333333332 }, { "clip_ratio": 0.0, "completion_length": 120.38541793823242, "epoch": 8.789207419898819, "grad_norm": 9.11727486811287, "kl": 0.5859375, "learning_rate": 2.6745495495495495e-07, "loss": 0.0006, "reward": 3.5917123556137085, "reward_std": 0.028876617550849915, "rewards/final_reward": 1.0631040108756813, "rewards/mask_iou_reward": 0.5315520054378406, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.591712474822998, "rewards/thk_ans_format_reward": 1.0, "step": 2602, "think_completion_length": 8.083333333333334 }, { "clip_ratio": 0.0, "completion_length": 162.75000762939453, "epoch": 8.79258010118044, "grad_norm": 11.357046550930207, "kl": 0.4443359375, "learning_rate": 2.671734234234234e-07, "loss": 0.0005, "reward": 3.6925617456436157, "reward_std": 0.11417952738702297, "rewards/final_reward": 1.717848772073134, "rewards/mask_iou_reward": 0.858924386036567, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6925618648529053, "rewards/thk_ans_format_reward": 1.0, "step": 2603, "think_completion_length": 8.5 }, { "clip_ratio": 0.0, "completion_length": 161.2916717529297, "epoch": 8.795952782462058, "grad_norm": 10.95869088214582, "kl": 0.4033203125, "learning_rate": 2.6689189189189187e-07, "loss": 0.0004, "reward": 3.3376840353012085, "reward_std": 0.1011296734213829, "rewards/final_reward": 1.6609974239055063, "rewards/mask_iou_reward": 0.8304987119527532, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3376837968826294, "rewards/thk_ans_format_reward": 1.0, "step": 2604, "think_completion_length": 9.791666666666668 }, { "clip_ratio": 0.0, "completion_length": 139.6041717529297, "epoch": 8.799325463743676, "grad_norm": 23.19978010706122, "kl": 0.4560546875, "learning_rate": 2.6661036036036033e-07, "loss": 0.0005, "reward": 3.5850476026535034, "reward_std": 0.040439434349536896, "rewards/final_reward": 0.8097829479056705, "rewards/mask_iou_reward": 0.40489147395283526, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5850474834442139, "rewards/thk_ans_format_reward": 1.0, "step": 2605, "think_completion_length": 10.666666666666668 }, { "clip_ratio": 0.0, "completion_length": 124.08333587646484, "epoch": 8.802698145025294, "grad_norm": 41.379014866284514, "kl": 0.4853515625, "learning_rate": 2.6632882882882885e-07, "loss": 0.0005, "reward": 3.5459182262420654, "reward_std": 0.05569390393793583, "rewards/final_reward": 1.0564543807372129, "rewards/mask_iou_reward": 0.5282271903686064, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5459181070327759, "rewards/thk_ans_format_reward": 1.0, "step": 2606, "think_completion_length": 7.75 }, { "clip_ratio": 0.0, "completion_length": 171.6666717529297, "epoch": 8.806070826306915, "grad_norm": 8.74549334092815, "kl": 0.4736328125, "learning_rate": 2.660472972972973e-07, "loss": 0.0005, "reward": 3.487770438194275, "reward_std": 0.12233811803162098, "rewards/final_reward": 1.2958860855271568, "rewards/mask_iou_reward": 0.6479430427635784, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 1.4981870651245117, "rewards/thk_ans_format_reward": 1.0, "step": 2607, "think_completion_length": 7.916666666666667 }, { "clip_ratio": 0.0, "completion_length": 169.0104217529297, "epoch": 8.809443507588533, "grad_norm": 13.26405079753535, "kl": 0.572265625, "learning_rate": 2.6576576576576577e-07, "loss": 0.0006, "reward": 3.5517358779907227, "reward_std": 0.04905109805986285, "rewards/final_reward": 1.2092090290589885, "rewards/mask_iou_reward": 0.6046045145294943, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5517358183860779, "rewards/thk_ans_format_reward": 1.0, "step": 2608, "think_completion_length": 9.208333333333334 }, { "clip_ratio": 0.0, "completion_length": 160.78125762939453, "epoch": 8.812816188870151, "grad_norm": 20.005619354745726, "kl": 0.646484375, "learning_rate": 2.6548423423423423e-07, "loss": 0.0007, "reward": 3.5325050354003906, "reward_std": 0.025053212884813547, "rewards/final_reward": 1.002610821867861, "rewards/mask_iou_reward": 0.5013054109339306, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.532505214214325, "rewards/thk_ans_format_reward": 1.0, "step": 2609, "think_completion_length": 9.791666666666668 }, { "clip_ratio": 0.0, "completion_length": 156.28125762939453, "epoch": 8.816188870151771, "grad_norm": 14.765386223784136, "kl": 0.5078125, "learning_rate": 2.652027027027027e-07, "loss": 0.0005, "reward": 3.8309184312820435, "reward_std": 0.014153166441246867, "rewards/final_reward": 1.9386175055923904, "rewards/mask_iou_reward": 0.9693087527961952, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.830918550491333, "rewards/thk_ans_format_reward": 1.0, "step": 2610, "think_completion_length": 9.333333333333334 }, { "clip_ratio": 0.0, "completion_length": 209.5104217529297, "epoch": 8.81956155143339, "grad_norm": 9.850666141328157, "kl": 0.58984375, "learning_rate": 2.6492117117117115e-07, "loss": 0.0006, "reward": 3.3679628372192383, "reward_std": 0.07900388538837433, "rewards/final_reward": 1.8492453060512983, "rewards/mask_iou_reward": 0.9246226530256492, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.367962896823883, "rewards/thk_ans_format_reward": 1.0, "step": 2611, "think_completion_length": 10.833333333333332 }, { "clip_ratio": 0.0, "completion_length": 171.0729217529297, "epoch": 8.822934232715008, "grad_norm": 11.286693578515079, "kl": 0.53125, "learning_rate": 2.6463963963963966e-07, "loss": 0.0005, "reward": 3.205089807510376, "reward_std": 0.10610627755522728, "rewards/final_reward": 0.5328642452260304, "rewards/mask_iou_reward": 0.2664321226130152, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.205089807510376, "rewards/thk_ans_format_reward": 1.0, "step": 2612, "think_completion_length": 10.208333333333332 }, { "clip_ratio": 0.0, "completion_length": 153.92708587646484, "epoch": 8.826306913996627, "grad_norm": 62.562830341723284, "kl": 0.4521484375, "learning_rate": 2.643581081081081e-07, "loss": 0.0005, "reward": 3.6393673419952393, "reward_std": 0.08628643676638603, "rewards/final_reward": 1.7409598350531619, "rewards/mask_iou_reward": 0.8704799175265809, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6393672823905945, "rewards/thk_ans_format_reward": 1.0, "step": 2613, "think_completion_length": 9.0 }, { "clip_ratio": 0.0, "completion_length": 212.30208587646484, "epoch": 8.829679595278247, "grad_norm": 9.169418403340412, "kl": 0.4541015625, "learning_rate": 2.640765765765766e-07, "loss": 0.0004, "reward": 3.658125400543213, "reward_std": 0.06685709208250046, "rewards/final_reward": 1.5693793803364242, "rewards/mask_iou_reward": 0.7846896901682121, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6581252813339233, "rewards/thk_ans_format_reward": 1.0, "step": 2614, "think_completion_length": 9.083333333333332 }, { "clip_ratio": 0.0, "completion_length": 141.4375, "epoch": 8.833052276559865, "grad_norm": 9.424040489571068, "kl": 0.46484375, "learning_rate": 2.63795045045045e-07, "loss": 0.0005, "reward": 3.3040276765823364, "reward_std": 0.051268843933939934, "rewards/final_reward": 1.3129399364091325, "rewards/mask_iou_reward": 0.6564699682045663, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3040276169776917, "rewards/thk_ans_format_reward": 1.0, "step": 2615, "think_completion_length": 7.791666666666667 }, { "clip_ratio": 0.0, "completion_length": 195.2604217529297, "epoch": 8.836424957841484, "grad_norm": 11.799474042076646, "kl": 0.453125, "learning_rate": 2.6351351351351345e-07, "loss": 0.0005, "reward": 3.416573643684387, "reward_std": 0.07485876977443695, "rewards/final_reward": 1.8482420419840249, "rewards/mask_iou_reward": 0.9241210209920124, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4165735244750977, "rewards/thk_ans_format_reward": 1.0, "step": 2616, "think_completion_length": 8.541666666666668 }, { "clip_ratio": 0.0, "completion_length": 161.64583587646484, "epoch": 8.839797639123104, "grad_norm": 7.646928271668927, "kl": 0.5078125, "learning_rate": 2.6323198198198197e-07, "loss": 0.0005, "reward": 3.5402209758758545, "reward_std": 0.05744621530175209, "rewards/final_reward": 1.6883144009827764, "rewards/mask_iou_reward": 0.8441572004913882, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5402206778526306, "rewards/thk_ans_format_reward": 1.0, "step": 2617, "think_completion_length": 8.208333333333334 }, { "clip_ratio": 0.0, "completion_length": 163.36458587646484, "epoch": 8.843170320404722, "grad_norm": 10.752846546517851, "kl": 0.458984375, "learning_rate": 2.6295045045045043e-07, "loss": 0.0005, "reward": 3.4455671310424805, "reward_std": 0.07160164043307304, "rewards/final_reward": 1.8307324658180666, "rewards/mask_iou_reward": 0.9153662329090333, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4455673694610596, "rewards/thk_ans_format_reward": 1.0, "step": 2618, "think_completion_length": 8.666666666666666 }, { "clip_ratio": 0.0, "completion_length": 179.92708587646484, "epoch": 8.84654300168634, "grad_norm": 23.347348826498127, "kl": 0.5146484375, "learning_rate": 2.626689189189189e-07, "loss": 0.0005, "reward": 3.562696099281311, "reward_std": 0.031371730379760265, "rewards/final_reward": 1.3731017182197023, "rewards/mask_iou_reward": 0.6865508591098511, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5626959800720215, "rewards/thk_ans_format_reward": 1.0, "step": 2619, "think_completion_length": 10.083333333333332 }, { "clip_ratio": 0.0, "completion_length": 210.78125762939453, "epoch": 8.849915682967959, "grad_norm": 12.860734234006927, "kl": 0.4619140625, "learning_rate": 2.6238738738738735e-07, "loss": 0.0005, "reward": 3.329040288925171, "reward_std": 0.17197439819574356, "rewards/final_reward": 1.7695295108402456, "rewards/mask_iou_reward": 0.8847647554201228, "rewards/sam_format_reward": 0.9791666865348816, "rewards/sam_reward_func_ultra": 1.370707094669342, "rewards/thk_ans_format_reward": 0.9791666865348816, "step": 2620, "think_completion_length": 8.166666666666666 }, { "clip_ratio": 0.0, "completion_length": 164.98958587646484, "epoch": 8.853288364249579, "grad_norm": 15.301909395105529, "kl": 0.5302734375, "learning_rate": 2.621058558558558e-07, "loss": 0.0006, "reward": 3.573215365409851, "reward_std": 0.04483833443373442, "rewards/final_reward": 1.4895611136624436, "rewards/mask_iou_reward": 0.7447805568312218, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.573215365409851, "rewards/thk_ans_format_reward": 1.0, "step": 2621, "think_completion_length": 9.791666666666668 }, { "clip_ratio": 0.0, "completion_length": 157.46875, "epoch": 8.856661045531197, "grad_norm": 50.53942012992708, "kl": 0.439453125, "learning_rate": 2.618243243243243e-07, "loss": 0.0005, "reward": 3.6168044805526733, "reward_std": 0.12859731912612915, "rewards/final_reward": 1.6656976467758042, "rewards/mask_iou_reward": 0.8328488233879021, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6168044209480286, "rewards/thk_ans_format_reward": 1.0, "step": 2622, "think_completion_length": 8.541666666666666 }, { "clip_ratio": 0.0, "completion_length": 245.21875762939453, "epoch": 8.860033726812816, "grad_norm": 11.774345122272566, "kl": 0.412109375, "learning_rate": 2.615427927927928e-07, "loss": 0.0004, "reward": 3.6533669233322144, "reward_std": 0.12147049978375435, "rewards/final_reward": 1.5555595096657289, "rewards/mask_iou_reward": 0.7777797548328644, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 1.6637837290763855, "rewards/thk_ans_format_reward": 1.0, "step": 2623, "think_completion_length": 9.291666666666666 }, { "clip_ratio": 0.0, "completion_length": 200.34376525878906, "epoch": 8.863406408094434, "grad_norm": 6.525971220926675, "kl": 0.421875, "learning_rate": 2.6126126126126124e-07, "loss": 0.0004, "reward": 3.5324405431747437, "reward_std": 0.020154454745352268, "rewards/final_reward": 1.9465013247374108, "rewards/mask_iou_reward": 0.9732506623687054, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5324404835700989, "rewards/thk_ans_format_reward": 1.0, "step": 2624, "think_completion_length": 9.5 }, { "clip_ratio": 0.0, "completion_length": 226.87501525878906, "epoch": 8.866779089376054, "grad_norm": 81.03369462301879, "kl": 0.443359375, "learning_rate": 2.609797297297297e-07, "loss": 0.0004, "reward": 3.6387441158294678, "reward_std": 0.03701675124466419, "rewards/final_reward": 1.4862304581123538, "rewards/mask_iou_reward": 0.7431152290561769, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6387439966201782, "rewards/thk_ans_format_reward": 1.0, "step": 2625, "think_completion_length": 9.791666666666668 }, { "clip_ratio": 0.0, "completion_length": 222.125, "epoch": 8.870151770657673, "grad_norm": 30.87060923711784, "kl": 0.4853515625, "learning_rate": 2.6069819819819817e-07, "loss": 0.0005, "reward": 3.176121711730957, "reward_std": 0.1424817405641079, "rewards/final_reward": 1.209586082100811, "rewards/mask_iou_reward": 0.6047930410504055, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1761216521263123, "rewards/thk_ans_format_reward": 1.0, "step": 2626, "think_completion_length": 9.0 }, { "clip_ratio": 0.0, "completion_length": 149.1875, "epoch": 8.873524451939291, "grad_norm": 14.645248060791376, "kl": 0.48046875, "learning_rate": 2.604166666666667e-07, "loss": 0.0005, "reward": 3.415264129638672, "reward_std": 0.1616402491927147, "rewards/final_reward": 1.1429228559742883, "rewards/mask_iou_reward": 0.5714614279871442, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4152640104293823, "rewards/thk_ans_format_reward": 1.0, "step": 2627, "think_completion_length": 8.0 }, { "clip_ratio": 0.0, "completion_length": 184.03125762939453, "epoch": 8.876897133220911, "grad_norm": 12.615776582831762, "kl": 0.3984375, "learning_rate": 2.6013513513513514e-07, "loss": 0.0004, "reward": 3.315216302871704, "reward_std": 0.08645356260240078, "rewards/final_reward": 1.722401614164057, "rewards/mask_iou_reward": 0.8612008070820285, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3152162730693817, "rewards/thk_ans_format_reward": 1.0, "step": 2628, "think_completion_length": 9.916666666666668 }, { "clip_ratio": 0.0, "completion_length": 189.0104217529297, "epoch": 8.88026981450253, "grad_norm": 21.727976468632153, "kl": 0.408203125, "learning_rate": 2.598536036036036e-07, "loss": 0.0004, "reward": 3.459446907043457, "reward_std": 0.14448082819581032, "rewards/final_reward": 1.3135695645172558, "rewards/mask_iou_reward": 0.6567847822586279, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4594467282295227, "rewards/thk_ans_format_reward": 1.0, "step": 2629, "think_completion_length": 7.916666666666667 }, { "clip_ratio": 0.0, "completion_length": 201.09375762939453, "epoch": 8.883642495784148, "grad_norm": 7.832814400376301, "kl": 0.4609375, "learning_rate": 2.5957207207207206e-07, "loss": 0.0005, "reward": 3.6723581552505493, "reward_std": 0.11408434621989727, "rewards/final_reward": 1.5644726703356446, "rewards/mask_iou_reward": 0.7822363351678223, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6723580956459045, "rewards/thk_ans_format_reward": 1.0, "step": 2630, "think_completion_length": 11.75 }, { "clip_ratio": 0.0, "completion_length": 244.05208587646484, "epoch": 8.887015177065766, "grad_norm": 32.03645850756957, "kl": 0.4375, "learning_rate": 2.592905405405405e-07, "loss": 0.0004, "reward": 3.4289125204086304, "reward_std": 0.037827394902706146, "rewards/final_reward": 1.3664140757626264, "rewards/mask_iou_reward": 0.6832070378813132, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4289124608039856, "rewards/thk_ans_format_reward": 1.0, "step": 2631, "think_completion_length": 9.833333333333332 }, { "clip_ratio": 0.0, "completion_length": 184.02083587646484, "epoch": 8.890387858347387, "grad_norm": 17.042048133003938, "kl": 0.4990234375, "learning_rate": 2.5900900900900904e-07, "loss": 0.0005, "reward": 3.583601951599121, "reward_std": 0.04651731997728348, "rewards/final_reward": 1.789611237700948, "rewards/mask_iou_reward": 0.894805618850474, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5836020708084106, "rewards/thk_ans_format_reward": 1.0, "step": 2632, "think_completion_length": 12.291666666666668 }, { "clip_ratio": 0.0, "completion_length": 177.8125, "epoch": 8.893760539629005, "grad_norm": 39.833372050013864, "kl": 0.505859375, "learning_rate": 2.587274774774775e-07, "loss": 0.0005, "reward": 3.650436520576477, "reward_std": 0.03200624976307154, "rewards/final_reward": 1.7966636620129028, "rewards/mask_iou_reward": 0.8983318310064514, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6504364609718323, "rewards/thk_ans_format_reward": 1.0, "step": 2633, "think_completion_length": 8.458333333333334 }, { "clip_ratio": 0.0, "completion_length": 339.7083435058594, "epoch": 8.897133220910623, "grad_norm": 29.622911554214088, "kl": 0.3349609375, "learning_rate": 2.5844594594594596e-07, "loss": 0.0003, "reward": 3.3017475605010986, "reward_std": 0.18274864368140697, "rewards/final_reward": 0.8420971193307066, "rewards/mask_iou_reward": 0.4210485596653533, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 1.3225809335708618, "rewards/thk_ans_format_reward": 0.9895833432674408, "step": 2634, "think_completion_length": 9.291666666666668 }, { "clip_ratio": 0.0, "completion_length": 165.46875762939453, "epoch": 8.900505902192243, "grad_norm": 8.892742647552673, "kl": 0.4501953125, "learning_rate": 2.5816441441441436e-07, "loss": 0.0005, "reward": 3.360956907272339, "reward_std": 0.03940213192254305, "rewards/final_reward": 0.9547359003275036, "rewards/mask_iou_reward": 0.4773679501637518, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3609567880630493, "rewards/thk_ans_format_reward": 1.0, "step": 2635, "think_completion_length": 10.125 }, { "clip_ratio": 0.0, "completion_length": 212.80208587646484, "epoch": 8.903878583473862, "grad_norm": 171.80196189316987, "kl": 0.46484375, "learning_rate": 2.578828828828828e-07, "loss": 0.0005, "reward": 3.7178579568862915, "reward_std": 0.04392486624419689, "rewards/final_reward": 1.557018958881521, "rewards/mask_iou_reward": 0.7785094794407605, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.717857837677002, "rewards/thk_ans_format_reward": 1.0, "step": 2636, "think_completion_length": 9.083333333333334 }, { "clip_ratio": 0.0, "completion_length": 201.4166717529297, "epoch": 8.90725126475548, "grad_norm": 39.6809223643521, "kl": 0.50390625, "learning_rate": 2.5760135135135134e-07, "loss": 0.0005, "reward": 3.664846181869507, "reward_std": 0.08971784822642803, "rewards/final_reward": 1.628289576339221, "rewards/mask_iou_reward": 0.8141447881696104, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6648462414741516, "rewards/thk_ans_format_reward": 1.0, "step": 2637, "think_completion_length": 9.041666666666668 }, { "clip_ratio": 0.0, "completion_length": 243.98959350585938, "epoch": 8.910623946037099, "grad_norm": 14.746750635196374, "kl": 0.40234375, "learning_rate": 2.573198198198198e-07, "loss": 0.0004, "reward": 3.678009271621704, "reward_std": 0.04709428362548351, "rewards/final_reward": 1.849764863148052, "rewards/mask_iou_reward": 0.924882431574026, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6780093908309937, "rewards/thk_ans_format_reward": 1.0, "step": 2638, "think_completion_length": 9.541666666666668 }, { "clip_ratio": 0.0, "completion_length": 182.875, "epoch": 8.913996627318719, "grad_norm": 51.01927196117225, "kl": 0.4228515625, "learning_rate": 2.5703828828828826e-07, "loss": 0.0004, "reward": 3.4034098386764526, "reward_std": 0.07623914256691933, "rewards/final_reward": 0.8302799876257294, "rewards/mask_iou_reward": 0.4151399938128647, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4034096598625183, "rewards/thk_ans_format_reward": 1.0, "step": 2639, "think_completion_length": 8.791666666666666 }, { "clip_ratio": 0.0, "completion_length": 199.39583587646484, "epoch": 8.917369308600337, "grad_norm": 12.773784866652818, "kl": 0.431640625, "learning_rate": 2.567567567567567e-07, "loss": 0.0004, "reward": 3.4415433406829834, "reward_std": 0.11167657189071178, "rewards/final_reward": 1.6203662508946173, "rewards/mask_iou_reward": 0.8101831254473086, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4415434002876282, "rewards/thk_ans_format_reward": 1.0, "step": 2640, "think_completion_length": 9.416666666666668 }, { "clip_ratio": 0.0, "completion_length": 220.375, "epoch": 8.920741989881956, "grad_norm": 8.17540569049189, "kl": 0.458984375, "learning_rate": 2.564752252252252e-07, "loss": 0.0005, "reward": 3.6591590642929077, "reward_std": 0.08187056519091129, "rewards/final_reward": 1.334430933250397, "rewards/mask_iou_reward": 0.6672154666251985, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6591590642929077, "rewards/thk_ans_format_reward": 1.0, "step": 2641, "think_completion_length": 8.75 }, { "clip_ratio": 0.0, "completion_length": 140.28125381469727, "epoch": 8.924114671163576, "grad_norm": 11.630567244110326, "kl": 0.560546875, "learning_rate": 2.561936936936937e-07, "loss": 0.0006, "reward": 3.5551464557647705, "reward_std": 0.1242928933352232, "rewards/final_reward": 1.1889093049377917, "rewards/mask_iou_reward": 0.5944546524688958, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5551465153694153, "rewards/thk_ans_format_reward": 1.0, "step": 2642, "think_completion_length": 9.041666666666668 }, { "clip_ratio": 0.0, "completion_length": 165.92708587646484, "epoch": 8.927487352445194, "grad_norm": 8.779738218183537, "kl": 0.4775390625, "learning_rate": 2.5591216216216216e-07, "loss": 0.0005, "reward": 3.487833023071289, "reward_std": 0.08055975451134145, "rewards/final_reward": 1.1506063369972375, "rewards/mask_iou_reward": 0.5753031684986187, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4878326654434204, "rewards/thk_ans_format_reward": 1.0, "step": 2643, "think_completion_length": 8.5 }, { "clip_ratio": 0.0, "completion_length": 207.96875, "epoch": 8.930860033726812, "grad_norm": 6.416920565803001, "kl": 0.4951171875, "learning_rate": 2.556306306306306e-07, "loss": 0.0005, "reward": 3.6678950786590576, "reward_std": 0.034986887127161026, "rewards/final_reward": 1.799880776864869, "rewards/mask_iou_reward": 0.8999403884324345, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6678951978683472, "rewards/thk_ans_format_reward": 1.0, "step": 2644, "think_completion_length": 9.333333333333332 }, { "clip_ratio": 0.0, "completion_length": 156.52083587646484, "epoch": 8.93423271500843, "grad_norm": 8.99742168656809, "kl": 0.501953125, "learning_rate": 2.553490990990991e-07, "loss": 0.0005, "reward": 3.372067928314209, "reward_std": 0.06782113015651703, "rewards/final_reward": 1.8697549752479614, "rewards/mask_iou_reward": 0.9348774876239807, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3720678091049194, "rewards/thk_ans_format_reward": 1.0, "step": 2645, "think_completion_length": 9.458333333333332 }, { "clip_ratio": 0.0, "completion_length": 262.1145935058594, "epoch": 8.937605396290051, "grad_norm": 13.757136143314941, "kl": 0.380859375, "learning_rate": 2.5506756756756754e-07, "loss": 0.0004, "reward": 3.5446051359176636, "reward_std": 0.1319795325398445, "rewards/final_reward": 1.3958622800108376, "rewards/mask_iou_reward": 0.6979311400054188, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 1.5654385089874268, "rewards/thk_ans_format_reward": 0.9895833432674408, "step": 2646, "think_completion_length": 8.291666666666666 }, { "clip_ratio": 0.0, "completion_length": 198.8854217529297, "epoch": 8.94097807757167, "grad_norm": 9.816895266921808, "kl": 0.4501953125, "learning_rate": 2.5478603603603605e-07, "loss": 0.0005, "reward": 3.4566650390625, "reward_std": 0.08659421931952238, "rewards/final_reward": 1.2735341116940226, "rewards/mask_iou_reward": 0.6367670558470113, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4566651582717896, "rewards/thk_ans_format_reward": 1.0, "step": 2647, "think_completion_length": 9.25 }, { "clip_ratio": 0.0, "completion_length": 183.6666717529297, "epoch": 8.944350758853288, "grad_norm": 15.86401428521161, "kl": 0.4638671875, "learning_rate": 2.545045045045045e-07, "loss": 0.0005, "reward": 3.6637405157089233, "reward_std": 0.04727690666913986, "rewards/final_reward": 1.897336213430683, "rewards/mask_iou_reward": 0.9486681067153415, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6637406945228577, "rewards/thk_ans_format_reward": 1.0, "step": 2648, "think_completion_length": 11.0 }, { "clip_ratio": 0.0, "completion_length": 195.55208587646484, "epoch": 8.947723440134908, "grad_norm": 19.73551194433044, "kl": 0.3935546875, "learning_rate": 2.5422297297297297e-07, "loss": 0.0004, "reward": 3.512809634208679, "reward_std": 0.06839705258607864, "rewards/final_reward": 1.4805996022986838, "rewards/mask_iou_reward": 0.7402998011493419, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5128096342086792, "rewards/thk_ans_format_reward": 1.0, "step": 2649, "think_completion_length": 10.625 }, { "clip_ratio": 0.0, "completion_length": 226.02084350585938, "epoch": 8.951096121416526, "grad_norm": 28.12710538047436, "kl": 0.4130859375, "learning_rate": 2.5394144144144143e-07, "loss": 0.0004, "reward": 3.279626727104187, "reward_std": 0.05651633441448212, "rewards/final_reward": 1.340548502695853, "rewards/mask_iou_reward": 0.6702742513479265, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2796268165111542, "rewards/thk_ans_format_reward": 1.0, "step": 2650, "think_completion_length": 9.458333333333332 }, { "clip_ratio": 0.0, "completion_length": 135.40625762939453, "epoch": 8.954468802698145, "grad_norm": 12.8049783585612, "kl": 0.48828125, "learning_rate": 2.536599099099099e-07, "loss": 0.0005, "reward": 3.8202362060546875, "reward_std": 0.01339608570560813, "rewards/final_reward": 1.8661134566366775, "rewards/mask_iou_reward": 0.9330567283183387, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.8202361464500427, "rewards/thk_ans_format_reward": 1.0, "step": 2651, "think_completion_length": 9.0 }, { "clip_ratio": 0.0, "completion_length": 182.72916793823242, "epoch": 8.957841483979763, "grad_norm": 7.0945906774410625, "kl": 0.4033203125, "learning_rate": 2.533783783783784e-07, "loss": 0.0004, "reward": 3.421821355819702, "reward_std": 0.039436303079128265, "rewards/final_reward": 1.819093730521853, "rewards/mask_iou_reward": 0.9095468652609265, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4218213558197021, "rewards/thk_ans_format_reward": 1.0, "step": 2652, "think_completion_length": 10.666666666666666 }, { "clip_ratio": 0.0, "completion_length": 230.3229217529297, "epoch": 8.961214165261383, "grad_norm": 27.860175075426934, "kl": 0.521484375, "learning_rate": 2.5309684684684687e-07, "loss": 0.0005, "reward": 3.437433958053589, "reward_std": 0.12433646619319916, "rewards/final_reward": 1.7747834075878353, "rewards/mask_iou_reward": 0.8873917037939176, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4374338388442993, "rewards/thk_ans_format_reward": 1.0, "step": 2653, "think_completion_length": 8.541666666666666 }, { "clip_ratio": 0.0, "completion_length": 151.01041793823242, "epoch": 8.964586846543002, "grad_norm": 6.465993260335525, "kl": 0.4853515625, "learning_rate": 2.5281531531531533e-07, "loss": 0.0005, "reward": 3.613997220993042, "reward_std": 0.024749555392190814, "rewards/final_reward": 1.8598908330075345, "rewards/mask_iou_reward": 0.9299454165037673, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6139971613883972, "rewards/thk_ans_format_reward": 1.0, "step": 2654, "think_completion_length": 10.541666666666666 }, { "clip_ratio": 0.0, "completion_length": 228.5, "epoch": 8.96795952782462, "grad_norm": 169.99969242460082, "kl": 0.384765625, "learning_rate": 2.5253378378378374e-07, "loss": 0.0004, "reward": 3.538175344467163, "reward_std": 0.07858727127313614, "rewards/final_reward": 1.645226200115244, "rewards/mask_iou_reward": 0.822613100057622, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5381752848625183, "rewards/thk_ans_format_reward": 1.0, "step": 2655, "think_completion_length": 10.291666666666668 }, { "clip_ratio": 0.0, "completion_length": 165.46875, "epoch": 8.97133220910624, "grad_norm": 22.292604120261398, "kl": 0.58984375, "learning_rate": 2.522522522522522e-07, "loss": 0.0006, "reward": 3.0568559169769287, "reward_std": 0.1006831880658865, "rewards/final_reward": 0.5641649094732629, "rewards/mask_iou_reward": 0.28208245473663146, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0568559765815735, "rewards/thk_ans_format_reward": 1.0, "step": 2656, "think_completion_length": 9.083333333333334 }, { "clip_ratio": 0.0, "completion_length": 248.17708587646484, "epoch": 8.974704890387859, "grad_norm": 6.934613809314335, "kl": 0.478515625, "learning_rate": 2.519707207207207e-07, "loss": 0.0005, "reward": 3.728635311126709, "reward_std": 0.02214963547885418, "rewards/final_reward": 1.6594442697257372, "rewards/mask_iou_reward": 0.8297221348628686, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7286354899406433, "rewards/thk_ans_format_reward": 1.0, "step": 2657, "think_completion_length": 10.208333333333334 }, { "clip_ratio": 0.0, "completion_length": 210.92708587646484, "epoch": 8.978077571669477, "grad_norm": 10.436256111361867, "kl": 0.509765625, "learning_rate": 2.5168918918918917e-07, "loss": 0.0005, "reward": 3.538660407066345, "reward_std": 0.029477974399924278, "rewards/final_reward": 1.1480477415799157, "rewards/mask_iou_reward": 0.5740238707899579, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5386605262756348, "rewards/thk_ans_format_reward": 1.0, "step": 2658, "think_completion_length": 9.958333333333332 }, { "clip_ratio": 0.0, "completion_length": 195.14583587646484, "epoch": 8.981450252951095, "grad_norm": 13.813894213545238, "kl": 0.4111328125, "learning_rate": 2.5140765765765763e-07, "loss": 0.0004, "reward": 3.539559245109558, "reward_std": 0.03265107958577573, "rewards/final_reward": 1.871948467193768, "rewards/mask_iou_reward": 0.935974233596884, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.539559245109558, "rewards/thk_ans_format_reward": 1.0, "step": 2659, "think_completion_length": 8.5 }, { "clip_ratio": 0.0, "completion_length": 154.12500381469727, "epoch": 8.984822934232715, "grad_norm": 14.094723839060045, "kl": 0.505859375, "learning_rate": 2.511261261261261e-07, "loss": 0.0005, "reward": 3.61898672580719, "reward_std": 0.029271011240780354, "rewards/final_reward": 1.828193769950737, "rewards/mask_iou_reward": 0.9140968849753685, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6189867854118347, "rewards/thk_ans_format_reward": 1.0, "step": 2660, "think_completion_length": 9.666666666666668 }, { "clip_ratio": 0.0, "completion_length": 191.1666717529297, "epoch": 8.988195615514334, "grad_norm": 14.438567754631755, "kl": 0.4326171875, "learning_rate": 2.5084459459459455e-07, "loss": 0.0005, "reward": 3.5245808362960815, "reward_std": 0.03417748771607876, "rewards/final_reward": 1.5107258270611736, "rewards/mask_iou_reward": 0.7553629135305868, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5245808959007263, "rewards/thk_ans_format_reward": 1.0, "step": 2661, "think_completion_length": 10.0 }, { "clip_ratio": 0.0, "completion_length": 167.33333587646484, "epoch": 8.991568296795952, "grad_norm": 11.380236907302288, "kl": 0.6640625, "learning_rate": 2.5056306306306307e-07, "loss": 0.0007, "reward": 3.6742966175079346, "reward_std": 0.034709298983216286, "rewards/final_reward": 1.6202422930984484, "rewards/mask_iou_reward": 0.8101211465492242, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.674296498298645, "rewards/thk_ans_format_reward": 1.0, "step": 2662, "think_completion_length": 8.416666666666668 }, { "clip_ratio": 0.0, "completion_length": 205.36458587646484, "epoch": 8.994940978077572, "grad_norm": 45.308936467943454, "kl": 0.4951171875, "learning_rate": 2.5028153153153153e-07, "loss": 0.0005, "reward": 3.7181190252304077, "reward_std": 0.016878115944564342, "rewards/final_reward": 1.4298181162040127, "rewards/mask_iou_reward": 0.7149090581020063, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.718118965625763, "rewards/thk_ans_format_reward": 1.0, "step": 2663, "think_completion_length": 9.083333333333334 }, { "clip_ratio": 0.0, "completion_length": 218.2631607055664, "epoch": 8.99831365935919, "grad_norm": 9.649917036875612, "kl": 0.4326171875, "learning_rate": 2.5e-07, "loss": 0.0004, "reward": 3.3768553733825684, "reward_std": 0.05902155674993992, "rewards/final_reward": 1.4501204854085197, "rewards/mask_iou_reward": 0.7250602427042598, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.376855492591858, "rewards/thk_ans_format_reward": 1.0, "step": 2664, "think_completion_length": 9.25 }, { "clip_ratio": 0.0, "completion_length": 145.55208587646484, "epoch": 9.003372681281618, "grad_norm": 15.152185059708602, "kl": 0.51171875, "learning_rate": 2.4971846846846845e-07, "loss": 0.0005, "reward": 3.7161524295806885, "reward_std": 0.052857328206300735, "rewards/final_reward": 1.3831503284858946, "rewards/mask_iou_reward": 0.6915751642429473, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.716152310371399, "rewards/thk_ans_format_reward": 1.0, "step": 2665, "think_completion_length": 8.791666666666666 }, { "clip_ratio": 0.0, "completion_length": 163.61458587646484, "epoch": 9.006745362563239, "grad_norm": 71.87257427370919, "kl": 0.5576171875, "learning_rate": 2.494369369369369e-07, "loss": 0.0006, "reward": 3.7588504552841187, "reward_std": 0.03195140324532986, "rewards/final_reward": 1.950061648307034, "rewards/mask_iou_reward": 0.975030824153517, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7588502168655396, "rewards/thk_ans_format_reward": 1.0, "step": 2666, "think_completion_length": 9.833333333333334 }, { "clip_ratio": 0.0, "completion_length": 137.125, "epoch": 9.010118043844857, "grad_norm": 25.326411709571683, "kl": 0.603515625, "learning_rate": 2.4915540540540537e-07, "loss": 0.0006, "reward": 3.7052032947540283, "reward_std": 0.045940013602375984, "rewards/final_reward": 1.6143038150577902, "rewards/mask_iou_reward": 0.8071519075288951, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7052034139633179, "rewards/thk_ans_format_reward": 1.0, "step": 2667, "think_completion_length": 8.5 }, { "clip_ratio": 0.0, "completion_length": 200.20834350585938, "epoch": 9.013490725126475, "grad_norm": 7.283528282342515, "kl": 0.4326171875, "learning_rate": 2.488738738738739e-07, "loss": 0.0005, "reward": 3.5239779949188232, "reward_std": 0.06010612426325679, "rewards/final_reward": 1.7654436370875735, "rewards/mask_iou_reward": 0.8827218185437867, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5239779353141785, "rewards/thk_ans_format_reward": 1.0, "step": 2668, "think_completion_length": 9.583333333333332 }, { "clip_ratio": 0.0, "completion_length": 192.67708587646484, "epoch": 9.016863406408094, "grad_norm": 16.417099390110156, "kl": 0.4033203125, "learning_rate": 2.4859234234234234e-07, "loss": 0.0004, "reward": 3.594439744949341, "reward_std": 0.0543990321457386, "rewards/final_reward": 1.7274174146954215, "rewards/mask_iou_reward": 0.8637087073477108, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5944397449493408, "rewards/thk_ans_format_reward": 1.0, "step": 2669, "think_completion_length": 10.625 }, { "clip_ratio": 0.0, "completion_length": 162.43750762939453, "epoch": 9.020236087689714, "grad_norm": 8.903204844746314, "kl": 0.5078125, "learning_rate": 2.483108108108108e-07, "loss": 0.0005, "reward": 3.4996542930603027, "reward_std": 0.06193845346570015, "rewards/final_reward": 1.6113397853745253, "rewards/mask_iou_reward": 0.8056698926872626, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4996538162231445, "rewards/thk_ans_format_reward": 1.0, "step": 2670, "think_completion_length": 9.083333333333334 }, { "clip_ratio": 0.0, "completion_length": 146.9166717529297, "epoch": 9.023608768971332, "grad_norm": 9.246171153011565, "kl": 0.513671875, "learning_rate": 2.4802927927927927e-07, "loss": 0.0006, "reward": 3.5686081647872925, "reward_std": 0.12947729974985123, "rewards/final_reward": 1.722881013585082, "rewards/mask_iou_reward": 0.861440506792541, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5686081647872925, "rewards/thk_ans_format_reward": 1.0, "step": 2671, "think_completion_length": 9.333333333333334 }, { "clip_ratio": 0.0, "completion_length": 167.6666717529297, "epoch": 9.02698145025295, "grad_norm": 20.437319408883287, "kl": 0.4892578125, "learning_rate": 2.4774774774774773e-07, "loss": 0.0005, "reward": 3.53087317943573, "reward_std": 0.06894206255674362, "rewards/final_reward": 1.8161740667128283, "rewards/mask_iou_reward": 0.9080870333564142, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5308731198310852, "rewards/thk_ans_format_reward": 1.0, "step": 2672, "think_completion_length": 9.125 }, { "clip_ratio": 0.0, "completion_length": 148.3854217529297, "epoch": 9.03035413153457, "grad_norm": 15.565045926865892, "kl": 0.5185546875, "learning_rate": 2.4746621621621624e-07, "loss": 0.0005, "reward": 3.667048692703247, "reward_std": 0.07611064240336418, "rewards/final_reward": 1.3353405029124257, "rewards/mask_iou_reward": 0.6676702514562128, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6670488119125366, "rewards/thk_ans_format_reward": 1.0, "step": 2673, "think_completion_length": 9.5 }, { "clip_ratio": 0.0, "completion_length": 232.4791717529297, "epoch": 9.03372681281619, "grad_norm": 7.936379232416905, "kl": 0.4296875, "learning_rate": 2.4718468468468465e-07, "loss": 0.0004, "reward": 3.2925609350204468, "reward_std": 0.036365545354783535, "rewards/final_reward": 1.525952263897908, "rewards/mask_iou_reward": 0.762976131948954, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2925609350204468, "rewards/thk_ans_format_reward": 1.0, "step": 2674, "think_completion_length": 8.791666666666666 }, { "clip_ratio": 0.0, "completion_length": 174.09375762939453, "epoch": 9.037099494097808, "grad_norm": 11.974096155908322, "kl": 0.4921875, "learning_rate": 2.469031531531531e-07, "loss": 0.0005, "reward": 3.6752796173095703, "reward_std": 0.009360826574265957, "rewards/final_reward": 1.400129051099373, "rewards/mask_iou_reward": 0.7000645255496865, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6752796173095703, "rewards/thk_ans_format_reward": 1.0, "step": 2675, "think_completion_length": 8.791666666666666 }, { "clip_ratio": 0.0, "completion_length": 157.42709350585938, "epoch": 9.040472175379426, "grad_norm": 7.324533013799514, "kl": 0.58984375, "learning_rate": 2.466216216216216e-07, "loss": 0.0006, "reward": 3.5461130142211914, "reward_std": 0.10013717226684093, "rewards/final_reward": 1.9402012319072974, "rewards/mask_iou_reward": 0.9701006159536487, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5461129546165466, "rewards/thk_ans_format_reward": 1.0, "step": 2676, "think_completion_length": 11.208333333333334 }, { "clip_ratio": 0.0, "completion_length": 151.53125762939453, "epoch": 9.043844856661046, "grad_norm": 19.556279213220137, "kl": 0.4609375, "learning_rate": 2.463400900900901e-07, "loss": 0.0005, "reward": 3.4152863025665283, "reward_std": 0.1509530497714877, "rewards/final_reward": 1.1074901545950122, "rewards/mask_iou_reward": 0.5537450772975061, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.415286362171173, "rewards/thk_ans_format_reward": 1.0, "step": 2677, "think_completion_length": 10.5 }, { "clip_ratio": 0.0, "completion_length": 216.3125, "epoch": 9.047217537942664, "grad_norm": 9.370784542991062, "kl": 0.5458984375, "learning_rate": 2.4605855855855854e-07, "loss": 0.0006, "reward": 3.72617244720459, "reward_std": 0.04560376284644008, "rewards/final_reward": 1.8367343242937455, "rewards/mask_iou_reward": 0.9183671621468728, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.726172387599945, "rewards/thk_ans_format_reward": 1.0, "step": 2678, "think_completion_length": 9.333333333333334 }, { "clip_ratio": 0.0, "completion_length": 109.33333587646484, "epoch": 9.050590219224283, "grad_norm": 20.410965872762333, "kl": 0.564453125, "learning_rate": 2.45777027027027e-07, "loss": 0.0006, "reward": 3.73172664642334, "reward_std": 0.05516933067701757, "rewards/final_reward": 1.829631024851735, "rewards/mask_iou_reward": 0.9148155124258674, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7317265272140503, "rewards/thk_ans_format_reward": 1.0, "step": 2679, "think_completion_length": 9.083333333333332 }, { "clip_ratio": 0.0, "completion_length": 227.8541717529297, "epoch": 9.053962900505903, "grad_norm": 10.325320257959437, "kl": 0.431640625, "learning_rate": 2.4549549549549547e-07, "loss": 0.0004, "reward": 3.636067748069763, "reward_std": 0.08627158403396606, "rewards/final_reward": 1.8044409887507031, "rewards/mask_iou_reward": 0.9022204943753516, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6360676288604736, "rewards/thk_ans_format_reward": 1.0, "step": 2680, "think_completion_length": 8.541666666666668 }, { "clip_ratio": 0.0, "completion_length": 185.1041717529297, "epoch": 9.057335581787521, "grad_norm": 10.811211779463441, "kl": 0.494140625, "learning_rate": 2.45213963963964e-07, "loss": 0.0005, "reward": 3.682226538658142, "reward_std": 0.13119321130216122, "rewards/final_reward": 1.6477921250399699, "rewards/mask_iou_reward": 0.8238960625199849, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6822264790534973, "rewards/thk_ans_format_reward": 1.0, "step": 2681, "think_completion_length": 8.375 }, { "clip_ratio": 0.0, "completion_length": 156.5416717529297, "epoch": 9.06070826306914, "grad_norm": 7.052167029313177, "kl": 0.5390625, "learning_rate": 2.4493243243243244e-07, "loss": 0.0005, "reward": 3.679692506790161, "reward_std": 0.04925611428916454, "rewards/final_reward": 1.473480284634789, "rewards/mask_iou_reward": 0.7367401423173945, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6796923875808716, "rewards/thk_ans_format_reward": 1.0, "step": 2682, "think_completion_length": 8.166666666666666 }, { "clip_ratio": 0.0, "completion_length": 149.0729217529297, "epoch": 9.064080944350758, "grad_norm": 7.20948865847875, "kl": 0.53515625, "learning_rate": 2.446509009009009e-07, "loss": 0.0006, "reward": 3.765621781349182, "reward_std": 0.11311442777514458, "rewards/final_reward": 1.5743881457581685, "rewards/mask_iou_reward": 0.7871940728790843, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7656217813491821, "rewards/thk_ans_format_reward": 1.0, "step": 2683, "think_completion_length": 10.541666666666668 }, { "clip_ratio": 0.0, "completion_length": 168.04166793823242, "epoch": 9.067453625632378, "grad_norm": 9.033882101019975, "kl": 0.45703125, "learning_rate": 2.4436936936936936e-07, "loss": 0.0005, "reward": 3.5185396671295166, "reward_std": 0.03499617241322994, "rewards/final_reward": 1.715690100780282, "rewards/mask_iou_reward": 0.857845050390141, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5185397267341614, "rewards/thk_ans_format_reward": 1.0, "step": 2684, "think_completion_length": 8.958333333333334 }, { "clip_ratio": 0.0, "completion_length": 188.9479217529297, "epoch": 9.070826306913997, "grad_norm": 42.283447515692345, "kl": 0.4150390625, "learning_rate": 2.440878378378378e-07, "loss": 0.0004, "reward": 3.500747799873352, "reward_std": 0.07099348679184914, "rewards/final_reward": 1.4050785705392468, "rewards/mask_iou_reward": 0.7025392852696234, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.500747799873352, "rewards/thk_ans_format_reward": 1.0, "step": 2685, "think_completion_length": 8.125 }, { "clip_ratio": 0.0, "completion_length": 168.71875762939453, "epoch": 9.074198988195615, "grad_norm": 64.2349849457948, "kl": 0.4560546875, "learning_rate": 2.438063063063063e-07, "loss": 0.0005, "reward": 3.7512688636779785, "reward_std": 0.02270980691537261, "rewards/final_reward": 1.7508754565417008, "rewards/mask_iou_reward": 0.8754377282708504, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.751268982887268, "rewards/thk_ans_format_reward": 1.0, "step": 2686, "think_completion_length": 10.041666666666668 }, { "clip_ratio": 0.0, "completion_length": 212.1041717529297, "epoch": 9.077571669477235, "grad_norm": 8.261737297277202, "kl": 0.4921875, "learning_rate": 2.4352477477477474e-07, "loss": 0.0005, "reward": 3.7740758657455444, "reward_std": 0.02103525586426258, "rewards/final_reward": 1.7791438643069668, "rewards/mask_iou_reward": 0.8895719321534834, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7740757465362549, "rewards/thk_ans_format_reward": 1.0, "step": 2687, "think_completion_length": 8.125 }, { "clip_ratio": 0.0, "completion_length": 137.21875, "epoch": 9.080944350758854, "grad_norm": 12.090658556796027, "kl": 0.44921875, "learning_rate": 2.4324324324324326e-07, "loss": 0.0005, "reward": 3.363537311553955, "reward_std": 0.01269416231662035, "rewards/final_reward": 1.3129628747279014, "rewards/mask_iou_reward": 0.6564814373639507, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3635371327400208, "rewards/thk_ans_format_reward": 1.0, "step": 2688, "think_completion_length": 8.208333333333332 }, { "clip_ratio": 0.0, "completion_length": 188.82291793823242, "epoch": 9.084317032040472, "grad_norm": 30.84198680259513, "kl": 0.8203125, "learning_rate": 2.429617117117117e-07, "loss": 0.0008, "reward": 3.442896008491516, "reward_std": 0.030627870932221413, "rewards/final_reward": 1.6634693374207838, "rewards/mask_iou_reward": 0.8317346687103919, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4428958892822266, "rewards/thk_ans_format_reward": 1.0, "step": 2689, "think_completion_length": 10.708333333333332 }, { "clip_ratio": 0.0, "completion_length": 187.45833587646484, "epoch": 9.08768971332209, "grad_norm": 11.753779857493912, "kl": 0.4951171875, "learning_rate": 2.426801801801802e-07, "loss": 0.0005, "reward": 3.4357932806015015, "reward_std": 0.3079192712903023, "rewards/final_reward": 1.3933123160743839, "rewards/mask_iou_reward": 0.6966561580371919, "rewards/sam_format_reward": 0.9791666865348816, "rewards/sam_reward_func_ultra": 1.4670435190200806, "rewards/thk_ans_format_reward": 0.9895833432674408, "step": 2690, "think_completion_length": 8.208333333333334 }, { "clip_ratio": 0.0, "completion_length": 154.38541793823242, "epoch": 9.09106239460371, "grad_norm": 28.493010119573313, "kl": 0.4873046875, "learning_rate": 2.4239864864864864e-07, "loss": 0.0005, "reward": 3.7012124061584473, "reward_std": 0.047339873388409615, "rewards/final_reward": 1.6889713608322674, "rewards/mask_iou_reward": 0.8444856804161337, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7012121677398682, "rewards/thk_ans_format_reward": 1.0, "step": 2691, "think_completion_length": 9.75 }, { "clip_ratio": 0.0, "completion_length": 187.875, "epoch": 9.094435075885329, "grad_norm": 21.79511338790455, "kl": 0.8408203125, "learning_rate": 2.421171171171171e-07, "loss": 0.0009, "reward": 3.6198463439941406, "reward_std": 0.12590062618255615, "rewards/final_reward": 1.6057972566895269, "rewards/mask_iou_reward": 0.8028986283447634, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.619846224784851, "rewards/thk_ans_format_reward": 1.0, "step": 2692, "think_completion_length": 8.916666666666668 }, { "clip_ratio": 0.0, "completion_length": 246.83333587646484, "epoch": 9.097807757166947, "grad_norm": 37.82955624535845, "kl": 0.412109375, "learning_rate": 2.4183558558558556e-07, "loss": 0.0004, "reward": 3.8154425621032715, "reward_std": 0.03506853384897113, "rewards/final_reward": 1.8068316281833143, "rewards/mask_iou_reward": 0.9034158140916572, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.8154423236846924, "rewards/thk_ans_format_reward": 1.0, "step": 2693, "think_completion_length": 8.791666666666666 }, { "clip_ratio": 0.0, "completion_length": 267.59375762939453, "epoch": 9.101180438448566, "grad_norm": 16.82302390912551, "kl": 0.3876953125, "learning_rate": 2.41554054054054e-07, "loss": 0.0004, "reward": 3.6368253231048584, "reward_std": 0.06186164543032646, "rewards/final_reward": 1.6361911395760615, "rewards/mask_iou_reward": 0.8180955697880308, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6368253231048584, "rewards/thk_ans_format_reward": 1.0, "step": 2694, "think_completion_length": 8.166666666666666 }, { "clip_ratio": 0.0, "completion_length": 185.37500762939453, "epoch": 9.104553119730186, "grad_norm": 15.457808180787062, "kl": 0.490234375, "learning_rate": 2.412725225225225e-07, "loss": 0.0005, "reward": 3.486648440361023, "reward_std": 0.03985132835805416, "rewards/final_reward": 1.8332984350297488, "rewards/mask_iou_reward": 0.9166492175148744, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4866483211517334, "rewards/thk_ans_format_reward": 1.0, "step": 2695, "think_completion_length": 8.791666666666668 }, { "clip_ratio": 0.0, "completion_length": 150.48958587646484, "epoch": 9.107925801011804, "grad_norm": 21.822536342612135, "kl": 0.6455078125, "learning_rate": 2.40990990990991e-07, "loss": 0.0006, "reward": 3.6761653423309326, "reward_std": 0.08975771814584732, "rewards/final_reward": 1.7592890844121998, "rewards/mask_iou_reward": 0.8796445422060999, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6761654019355774, "rewards/thk_ans_format_reward": 1.0, "step": 2696, "think_completion_length": 9.875 }, { "clip_ratio": 0.0, "completion_length": 180.09375762939453, "epoch": 9.111298482293423, "grad_norm": 337.1635694325612, "kl": 0.3984375, "learning_rate": 2.4070945945945946e-07, "loss": 0.0004, "reward": 3.569099545478821, "reward_std": 0.08558660000562668, "rewards/final_reward": 1.368445699274106, "rewards/mask_iou_reward": 0.684222849637053, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5690995454788208, "rewards/thk_ans_format_reward": 1.0, "step": 2697, "think_completion_length": 8.166666666666666 }, { "clip_ratio": 0.0, "completion_length": 248.36458587646484, "epoch": 9.114671163575043, "grad_norm": 86.0876614096406, "kl": 0.4296875, "learning_rate": 2.404279279279279e-07, "loss": 0.0005, "reward": 3.6935207843780518, "reward_std": 0.016915190033614635, "rewards/final_reward": 1.8676412256394792, "rewards/mask_iou_reward": 0.9338206128197396, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6935204863548279, "rewards/thk_ans_format_reward": 1.0, "step": 2698, "think_completion_length": 9.5 }, { "clip_ratio": 0.0, "completion_length": 173.23959350585938, "epoch": 9.118043844856661, "grad_norm": 6.3744183320808085, "kl": 0.662109375, "learning_rate": 2.401463963963964e-07, "loss": 0.0007, "reward": 3.4611141681671143, "reward_std": 0.052461449056863785, "rewards/final_reward": 0.9666298071768408, "rewards/mask_iou_reward": 0.4833149035884204, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4611140489578247, "rewards/thk_ans_format_reward": 1.0, "step": 2699, "think_completion_length": 7.583333333333334 }, { "clip_ratio": 0.0, "completion_length": 149.3854217529297, "epoch": 9.12141652613828, "grad_norm": 15.110711837565969, "kl": 0.498046875, "learning_rate": 2.3986486486486484e-07, "loss": 0.0005, "reward": 3.5770293474197388, "reward_std": 0.06012692954391241, "rewards/final_reward": 1.2116396444247322, "rewards/mask_iou_reward": 0.6058198222123661, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5770291090011597, "rewards/thk_ans_format_reward": 1.0, "step": 2700, "think_completion_length": 9.583333333333334 }, { "clip_ratio": 0.0, "completion_length": 290.28126525878906, "epoch": 9.124789207419898, "grad_norm": 12.57204091524218, "kl": 0.396484375, "learning_rate": 2.3958333333333335e-07, "loss": 0.0004, "reward": 3.559865355491638, "reward_std": 0.21300777792930603, "rewards/final_reward": 1.4534144906746593, "rewards/mask_iou_reward": 0.7267072453373297, "rewards/sam_format_reward": 0.9791666865348816, "rewards/sam_reward_func_ultra": 1.6015319228172302, "rewards/thk_ans_format_reward": 0.9791666865348816, "step": 2701, "think_completion_length": 8.875 }, { "clip_ratio": 0.0, "completion_length": 119.19791793823242, "epoch": 9.128161888701518, "grad_norm": 15.569747051910305, "kl": 0.5263671875, "learning_rate": 2.393018018018018e-07, "loss": 0.0005, "reward": 3.6546788215637207, "reward_std": 0.02495476300828159, "rewards/final_reward": 1.3387666501724387, "rewards/mask_iou_reward": 0.6693833250862193, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6546787023544312, "rewards/thk_ans_format_reward": 1.0, "step": 2702, "think_completion_length": 8.333333333333332 }, { "clip_ratio": 0.0, "completion_length": 219.12500762939453, "epoch": 9.131534569983137, "grad_norm": 9.758601611244647, "kl": 0.4072265625, "learning_rate": 2.3902027027027027e-07, "loss": 0.0003, "reward": 3.5538190603256226, "reward_std": 0.03838097210973501, "rewards/final_reward": 1.7076908719734, "rewards/mask_iou_reward": 0.8538454359867, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5538190007209778, "rewards/thk_ans_format_reward": 1.0, "step": 2703, "think_completion_length": 9.5 }, { "clip_ratio": 0.0, "completion_length": 189.8854217529297, "epoch": 9.134907251264755, "grad_norm": 12.736825560495458, "kl": 0.470703125, "learning_rate": 2.3873873873873873e-07, "loss": 0.0005, "reward": 3.607846260070801, "reward_std": 0.016801190562546253, "rewards/final_reward": 1.2867632234165858, "rewards/mask_iou_reward": 0.6433816117082929, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6078462600708008, "rewards/thk_ans_format_reward": 1.0, "step": 2704, "think_completion_length": 8.458333333333334 }, { "clip_ratio": 0.0, "completion_length": 195.15625762939453, "epoch": 9.138279932546375, "grad_norm": 21.990337483598392, "kl": 0.4541015625, "learning_rate": 2.384572072072072e-07, "loss": 0.0005, "reward": 3.675834536552429, "reward_std": 0.04092971049249172, "rewards/final_reward": 1.7989294183946072, "rewards/mask_iou_reward": 0.8994647091973036, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.675834596157074, "rewards/thk_ans_format_reward": 1.0, "step": 2705, "think_completion_length": 7.041666666666667 }, { "clip_ratio": 0.0, "completion_length": 247.65625, "epoch": 9.141652613827993, "grad_norm": 10.407018219762056, "kl": 0.5693359375, "learning_rate": 2.3817567567567568e-07, "loss": 0.0006, "reward": 3.3383116722106934, "reward_std": 0.05316999740898609, "rewards/final_reward": 1.4855893540566543, "rewards/mask_iou_reward": 0.7427946770283271, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3383113145828247, "rewards/thk_ans_format_reward": 1.0, "step": 2706, "think_completion_length": 8.958333333333332 }, { "clip_ratio": 0.0, "completion_length": 161.1666717529297, "epoch": 9.145025295109612, "grad_norm": 8.801464639082292, "kl": 0.46875, "learning_rate": 2.3789414414414414e-07, "loss": 0.0005, "reward": 3.641343593597412, "reward_std": 0.05162101425230503, "rewards/final_reward": 1.8753772703918348, "rewards/mask_iou_reward": 0.9376886351959174, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6413435339927673, "rewards/thk_ans_format_reward": 1.0, "step": 2707, "think_completion_length": 7.583333333333334 }, { "clip_ratio": 0.0, "completion_length": 225.90625762939453, "epoch": 9.14839797639123, "grad_norm": 9.035870813243875, "kl": 0.4296875, "learning_rate": 2.376126126126126e-07, "loss": 0.0005, "reward": 3.6754767894744873, "reward_std": 0.030587462708353996, "rewards/final_reward": 1.6339339723553117, "rewards/mask_iou_reward": 0.8169669861776558, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6754766702651978, "rewards/thk_ans_format_reward": 1.0, "step": 2708, "think_completion_length": 8.583333333333334 }, { "clip_ratio": 0.0, "completion_length": 162.1666717529297, "epoch": 9.15177065767285, "grad_norm": 13.765975270879892, "kl": 0.421875, "learning_rate": 2.3733108108108106e-07, "loss": 0.0004, "reward": 3.6789125204086304, "reward_std": 0.12016797810792923, "rewards/final_reward": 1.8026906448987035, "rewards/mask_iou_reward": 0.9013453224493517, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6789124011993408, "rewards/thk_ans_format_reward": 1.0, "step": 2709, "think_completion_length": 6.916666666666667 }, { "clip_ratio": 0.0, "completion_length": 249.6354217529297, "epoch": 9.155143338954469, "grad_norm": 21.10436404174611, "kl": 0.4052734375, "learning_rate": 2.3704954954954952e-07, "loss": 0.0004, "reward": 3.5580179691314697, "reward_std": 0.047667574137449265, "rewards/final_reward": 1.312607811481132, "rewards/mask_iou_reward": 0.656303905740566, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5580180287361145, "rewards/thk_ans_format_reward": 1.0, "step": 2710, "think_completion_length": 8.541666666666666 }, { "clip_ratio": 0.0, "completion_length": 207.4166717529297, "epoch": 9.158516020236087, "grad_norm": 33.08955034272978, "kl": 0.4453125, "learning_rate": 2.36768018018018e-07, "loss": 0.0004, "reward": 3.8308794498443604, "reward_std": 0.021719856187701225, "rewards/final_reward": 1.7712656060535998, "rewards/mask_iou_reward": 0.8856328030267999, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.830879271030426, "rewards/thk_ans_format_reward": 1.0, "step": 2711, "think_completion_length": 7.166666666666667 }, { "clip_ratio": 0.0, "completion_length": 217.89583587646484, "epoch": 9.161888701517707, "grad_norm": 16.238099343860256, "kl": 0.4501953125, "learning_rate": 2.3648648648648647e-07, "loss": 0.0005, "reward": 3.6413317918777466, "reward_std": 0.15415740525349975, "rewards/final_reward": 1.3824449875031644, "rewards/mask_iou_reward": 0.6912224937515822, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6413318514823914, "rewards/thk_ans_format_reward": 1.0, "step": 2712, "think_completion_length": 8.791666666666668 }, { "clip_ratio": 0.0, "completion_length": 201.89583587646484, "epoch": 9.165261382799326, "grad_norm": 26.11702177124519, "kl": 0.4384765625, "learning_rate": 2.3620495495495493e-07, "loss": 0.0004, "reward": 3.1481897830963135, "reward_std": 0.10512983053922653, "rewards/final_reward": 1.535971815659202, "rewards/mask_iou_reward": 0.767985907829601, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1481897830963135, "rewards/thk_ans_format_reward": 1.0, "step": 2713, "think_completion_length": 11.291666666666668 }, { "clip_ratio": 0.0, "completion_length": 270.0729217529297, "epoch": 9.168634064080944, "grad_norm": 10.88067258384534, "kl": 0.3583984375, "learning_rate": 2.3592342342342342e-07, "loss": 0.0004, "reward": 3.620645046234131, "reward_std": 0.17188102006912231, "rewards/final_reward": 1.874649497316282, "rewards/mask_iou_reward": 0.937324748658141, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 1.6414784789085388, "rewards/thk_ans_format_reward": 0.9895833432674408, "step": 2714, "think_completion_length": 7.458333333333333 }, { "clip_ratio": 0.0, "completion_length": 138.30208587646484, "epoch": 9.172006745362562, "grad_norm": 13.439799881812803, "kl": 0.509765625, "learning_rate": 2.3564189189189188e-07, "loss": 0.0005, "reward": 3.6212422847747803, "reward_std": 0.03326452663168311, "rewards/final_reward": 1.6828357434864154, "rewards/mask_iou_reward": 0.8414178717432077, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6212422847747803, "rewards/thk_ans_format_reward": 1.0, "step": 2715, "think_completion_length": 8.208333333333332 }, { "clip_ratio": 0.0, "completion_length": 137.4166717529297, "epoch": 9.175379426644183, "grad_norm": 38.14283135240575, "kl": 1.384765625, "learning_rate": 2.3536036036036037e-07, "loss": 0.0015, "reward": 3.5311455726623535, "reward_std": 0.08386744372546673, "rewards/final_reward": 1.1419197146860056, "rewards/mask_iou_reward": 0.5709598573430028, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5311453342437744, "rewards/thk_ans_format_reward": 1.0, "step": 2716, "think_completion_length": 10.208333333333332 }, { "clip_ratio": 0.0, "completion_length": 231.98959350585938, "epoch": 9.178752107925801, "grad_norm": 7.467749101139426, "kl": 0.4287109375, "learning_rate": 2.3507882882882883e-07, "loss": 0.0004, "reward": 3.3673723936080933, "reward_std": 0.03370837680995464, "rewards/final_reward": 1.1762894471475038, "rewards/mask_iou_reward": 0.5881447235737519, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3673723638057709, "rewards/thk_ans_format_reward": 1.0, "step": 2717, "think_completion_length": 8.333333333333332 }, { "clip_ratio": 0.0, "completion_length": 172.9791717529297, "epoch": 9.18212478920742, "grad_norm": 21.04843141904536, "kl": 0.400390625, "learning_rate": 2.347972972972973e-07, "loss": 0.0004, "reward": 3.6892040967941284, "reward_std": 0.07573777623474598, "rewards/final_reward": 1.8205007861710938, "rewards/mask_iou_reward": 0.9102503930855469, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.689204216003418, "rewards/thk_ans_format_reward": 1.0, "step": 2718, "think_completion_length": 7.291666666666666 }, { "clip_ratio": 0.0, "completion_length": 207.125, "epoch": 9.18549747048904, "grad_norm": 17.602023385378942, "kl": 0.55078125, "learning_rate": 2.3451576576576575e-07, "loss": 0.0006, "reward": 3.3853163719177246, "reward_std": 0.05290138069540262, "rewards/final_reward": 1.1025753701321557, "rewards/mask_iou_reward": 0.5512876850660778, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3853165805339813, "rewards/thk_ans_format_reward": 1.0, "step": 2719, "think_completion_length": 7.75 }, { "clip_ratio": 0.0, "completion_length": 182.625, "epoch": 9.188870151770658, "grad_norm": 10.662517906773052, "kl": 0.5, "learning_rate": 2.342342342342342e-07, "loss": 0.0005, "reward": 3.4125794172286987, "reward_std": 0.07688865810632706, "rewards/final_reward": 1.5585264025322876, "rewards/mask_iou_reward": 0.7792632012661438, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4125792384147644, "rewards/thk_ans_format_reward": 1.0, "step": 2720, "think_completion_length": 11.083333333333332 }, { "clip_ratio": 0.0, "completion_length": 186.7604217529297, "epoch": 9.192242833052276, "grad_norm": 12.407990721986765, "kl": 0.4931640625, "learning_rate": 2.339527027027027e-07, "loss": 0.0005, "reward": 3.6630319356918335, "reward_std": 0.07096875412389636, "rewards/final_reward": 1.2780035781827974, "rewards/mask_iou_reward": 0.6390017890913987, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6630319356918335, "rewards/thk_ans_format_reward": 1.0, "step": 2721, "think_completion_length": 8.125 }, { "clip_ratio": 0.0, "completion_length": 190.33333587646484, "epoch": 9.195615514333895, "grad_norm": 8.666970300646721, "kl": 0.47265625, "learning_rate": 2.3367117117117116e-07, "loss": 0.0005, "reward": 3.353834390640259, "reward_std": 0.07080530747771263, "rewards/final_reward": 1.5095522708688724, "rewards/mask_iou_reward": 0.7547761354344362, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3538345694541931, "rewards/thk_ans_format_reward": 1.0, "step": 2722, "think_completion_length": 8.75 }, { "clip_ratio": 0.0, "completion_length": 124.01042175292969, "epoch": 9.198988195615515, "grad_norm": 13.157124933134252, "kl": 0.51953125, "learning_rate": 2.3338963963963962e-07, "loss": 0.0005, "reward": 3.3819308280944824, "reward_std": 0.08063776372000575, "rewards/final_reward": 1.5869294039563557, "rewards/mask_iou_reward": 0.7934647019781779, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3819307088851929, "rewards/thk_ans_format_reward": 1.0, "step": 2723, "think_completion_length": 9.458333333333334 }, { "clip_ratio": 0.0, "completion_length": 200.48958587646484, "epoch": 9.202360876897133, "grad_norm": 13.680351131666008, "kl": 0.478515625, "learning_rate": 2.331081081081081e-07, "loss": 0.0005, "reward": 3.3788503408432007, "reward_std": 0.06524365022778511, "rewards/final_reward": 1.132651409256614, "rewards/mask_iou_reward": 0.566325704628307, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3788503110408783, "rewards/thk_ans_format_reward": 1.0, "step": 2724, "think_completion_length": 7.625 }, { "clip_ratio": 0.0, "completion_length": 151.92708587646484, "epoch": 9.205733558178752, "grad_norm": 113.04888297972742, "kl": 1.263671875, "learning_rate": 2.3282657657657657e-07, "loss": 0.0013, "reward": 3.409723162651062, "reward_std": 0.2719406746327877, "rewards/final_reward": 1.6586036546563951, "rewards/mask_iou_reward": 0.8293018273281976, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4097230434417725, "rewards/thk_ans_format_reward": 1.0, "step": 2725, "think_completion_length": 8.208333333333334 }, { "clip_ratio": 0.0, "completion_length": 148.46875762939453, "epoch": 9.209106239460372, "grad_norm": 6.657850170510586, "kl": 0.74609375, "learning_rate": 2.3254504504504505e-07, "loss": 0.0008, "reward": 3.5601396560668945, "reward_std": 0.05424804985523224, "rewards/final_reward": 1.0, "rewards/mask_iou_reward": 0.5, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5601398944854736, "rewards/thk_ans_format_reward": 1.0, "step": 2726, "think_completion_length": 9.791666666666666 }, { "clip_ratio": 0.0, "completion_length": 173.0, "epoch": 9.21247892074199, "grad_norm": 13.024736310820327, "kl": 0.4609375, "learning_rate": 2.3226351351351351e-07, "loss": 0.0004, "reward": 3.525766372680664, "reward_std": 0.078833919018507, "rewards/final_reward": 1.0751320455824074, "rewards/mask_iou_reward": 0.5375660227912037, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5257662534713745, "rewards/thk_ans_format_reward": 1.0, "step": 2727, "think_completion_length": 10.416666666666668 }, { "clip_ratio": 0.0, "completion_length": 158.8229217529297, "epoch": 9.215851602023609, "grad_norm": 10.84058202337327, "kl": 0.5947265625, "learning_rate": 2.3198198198198195e-07, "loss": 0.0006, "reward": 3.80234432220459, "reward_std": 0.049020628444850445, "rewards/final_reward": 1.6783597664770171, "rewards/mask_iou_reward": 0.8391798832385086, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.8023445010185242, "rewards/thk_ans_format_reward": 1.0, "step": 2728, "think_completion_length": 8.0 }, { "clip_ratio": 0.0, "completion_length": 187.8125, "epoch": 9.219224283305227, "grad_norm": 15.706268381170005, "kl": 0.4677734375, "learning_rate": 2.3170045045045044e-07, "loss": 0.0005, "reward": 3.2954295873641968, "reward_std": 0.10786097124218941, "rewards/final_reward": 1.4206121526546707, "rewards/mask_iou_reward": 0.7103060763273353, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.295429527759552, "rewards/thk_ans_format_reward": 1.0, "step": 2729, "think_completion_length": 10.208333333333334 }, { "clip_ratio": 0.0, "completion_length": 191.1875114440918, "epoch": 9.222596964586847, "grad_norm": 16.073526537458733, "kl": 0.564453125, "learning_rate": 2.314189189189189e-07, "loss": 0.0006, "reward": 3.780893921852112, "reward_std": 0.08602484688162804, "rewards/final_reward": 1.6251333088983986, "rewards/mask_iou_reward": 0.8125666544491993, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7808939814567566, "rewards/thk_ans_format_reward": 1.0, "step": 2730, "think_completion_length": 9.041666666666668 }, { "clip_ratio": 0.0, "completion_length": 193.45833587646484, "epoch": 9.225969645868465, "grad_norm": 8.961890235000222, "kl": 0.583984375, "learning_rate": 2.3113738738738738e-07, "loss": 0.0006, "reward": 3.4740850925445557, "reward_std": 0.06211280822753906, "rewards/final_reward": 1.6200055105987259, "rewards/mask_iou_reward": 0.8100027552993629, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4740851521492004, "rewards/thk_ans_format_reward": 1.0, "step": 2731, "think_completion_length": 8.041666666666666 }, { "clip_ratio": 0.0, "completion_length": 161.07291793823242, "epoch": 9.229342327150084, "grad_norm": 11.245455267084438, "kl": 0.50390625, "learning_rate": 2.3085585585585584e-07, "loss": 0.0005, "reward": 3.4567354917526245, "reward_std": 0.02357149589806795, "rewards/final_reward": 1.7550627231620988, "rewards/mask_iou_reward": 0.8775313615810494, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.456735610961914, "rewards/thk_ans_format_reward": 1.0, "step": 2732, "think_completion_length": 8.916666666666668 }, { "clip_ratio": 0.0, "completion_length": 208.6666717529297, "epoch": 9.232715008431704, "grad_norm": 47.63244965917098, "kl": 0.412109375, "learning_rate": 2.305743243243243e-07, "loss": 0.0004, "reward": 3.6387423276901245, "reward_std": 0.0591295319609344, "rewards/final_reward": 1.6464311243950158, "rewards/mask_iou_reward": 0.8232155621975079, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6387420296669006, "rewards/thk_ans_format_reward": 1.0, "step": 2733, "think_completion_length": 9.666666666666668 }, { "clip_ratio": 0.0, "completion_length": 224.59375, "epoch": 9.236087689713322, "grad_norm": 9.854372652070786, "kl": 0.4228515625, "learning_rate": 2.302927927927928e-07, "loss": 0.0004, "reward": 3.661292791366577, "reward_std": 0.06245612911880016, "rewards/final_reward": 1.718917708254288, "rewards/mask_iou_reward": 0.859458854127144, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6612926721572876, "rewards/thk_ans_format_reward": 1.0, "step": 2734, "think_completion_length": 7.583333333333333 }, { "clip_ratio": 0.0, "completion_length": 206.3125, "epoch": 9.23946037099494, "grad_norm": 9.775384666157215, "kl": 0.49609375, "learning_rate": 2.3001126126126125e-07, "loss": 0.0005, "reward": 3.6032204627990723, "reward_std": 0.07593077456112951, "rewards/final_reward": 1.0478658552790578, "rewards/mask_iou_reward": 0.5239329276395289, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6032204627990723, "rewards/thk_ans_format_reward": 1.0, "step": 2735, "think_completion_length": 9.125 }, { "clip_ratio": 0.0, "completion_length": 255.12500762939453, "epoch": 9.24283305227656, "grad_norm": 9.93721364648544, "kl": 0.3740234375, "learning_rate": 2.2972972972972974e-07, "loss": 0.0004, "reward": 3.5557074546813965, "reward_std": 0.05211344361305237, "rewards/final_reward": 1.4102751917452139, "rewards/mask_iou_reward": 0.7051375958726069, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.555707573890686, "rewards/thk_ans_format_reward": 1.0, "step": 2736, "think_completion_length": 9.0 }, { "clip_ratio": 0.0, "completion_length": 138.85416793823242, "epoch": 9.24620573355818, "grad_norm": 16.21290070522786, "kl": 0.669921875, "learning_rate": 2.294481981981982e-07, "loss": 0.0007, "reward": 3.4519814252853394, "reward_std": 0.07387526426464319, "rewards/final_reward": 1.8517275535386748, "rewards/mask_iou_reward": 0.9258637767693374, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4519813656806946, "rewards/thk_ans_format_reward": 1.0, "step": 2737, "think_completion_length": 9.708333333333334 }, { "clip_ratio": 0.0, "completion_length": 150.8229217529297, "epoch": 9.249578414839798, "grad_norm": 17.709552106732847, "kl": 0.5380859375, "learning_rate": 2.2916666666666663e-07, "loss": 0.0006, "reward": 3.5484334230422974, "reward_std": 0.02219875669106841, "rewards/final_reward": 1.6846610932227493, "rewards/mask_iou_reward": 0.8423305466113746, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5484334230422974, "rewards/thk_ans_format_reward": 1.0, "step": 2738, "think_completion_length": 8.833333333333332 }, { "clip_ratio": 0.0, "completion_length": 148.62500762939453, "epoch": 9.252951096121416, "grad_norm": 51.63817221274276, "kl": 0.4375, "learning_rate": 2.2888513513513512e-07, "loss": 0.0004, "reward": 3.5409849882125854, "reward_std": 0.050369157921522856, "rewards/final_reward": 1.7093165381275603, "rewards/mask_iou_reward": 0.8546582690637802, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5409849286079407, "rewards/thk_ans_format_reward": 1.0, "step": 2739, "think_completion_length": 8.083333333333334 }, { "clip_ratio": 0.0, "completion_length": 195.48959350585938, "epoch": 9.256323777403036, "grad_norm": 44.18713123663508, "kl": 0.451171875, "learning_rate": 2.2860360360360358e-07, "loss": 0.0005, "reward": 3.589063048362732, "reward_std": 0.07845005393028259, "rewards/final_reward": 1.8372715492254943, "rewards/mask_iou_reward": 0.9186357746127471, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5890628695487976, "rewards/thk_ans_format_reward": 1.0, "step": 2740, "think_completion_length": 9.0 }, { "clip_ratio": 0.0, "completion_length": 208.25, "epoch": 9.259696458684655, "grad_norm": 11.948237225096927, "kl": 0.5185546875, "learning_rate": 2.2832207207207207e-07, "loss": 0.0005, "reward": 3.4584085941314697, "reward_std": 0.042058190330863, "rewards/final_reward": 1.5690370451956515, "rewards/mask_iou_reward": 0.7845185225978257, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4584084153175354, "rewards/thk_ans_format_reward": 1.0, "step": 2741, "think_completion_length": 11.5 }, { "clip_ratio": 0.0, "completion_length": 223.45833587646484, "epoch": 9.263069139966273, "grad_norm": 7.820089401290567, "kl": 0.49609375, "learning_rate": 2.2804054054054053e-07, "loss": 0.0005, "reward": 3.644398331642151, "reward_std": 0.02735021524131298, "rewards/final_reward": 1.2831713487682677, "rewards/mask_iou_reward": 0.6415856743841338, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6443983912467957, "rewards/thk_ans_format_reward": 1.0, "step": 2742, "think_completion_length": 7.833333333333333 }, { "clip_ratio": 0.0, "completion_length": 180.0729217529297, "epoch": 9.266441821247891, "grad_norm": 11.92523700648404, "kl": 0.44921875, "learning_rate": 2.27759009009009e-07, "loss": 0.0004, "reward": 3.547249913215637, "reward_std": 0.08835725113749504, "rewards/final_reward": 0.8956250313878287, "rewards/mask_iou_reward": 0.44781251569391434, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5472498536109924, "rewards/thk_ans_format_reward": 1.0, "step": 2743, "think_completion_length": 9.708333333333332 }, { "clip_ratio": 0.0, "completion_length": 171.80208587646484, "epoch": 9.269814502529512, "grad_norm": 17.071450563721925, "kl": 0.5263671875, "learning_rate": 2.2747747747747748e-07, "loss": 0.0005, "reward": 3.250998616218567, "reward_std": 0.10396800190210342, "rewards/final_reward": 1.5332373470798437, "rewards/mask_iou_reward": 0.7666186735399219, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.250998616218567, "rewards/thk_ans_format_reward": 1.0, "step": 2744, "think_completion_length": 8.666666666666668 }, { "clip_ratio": 0.0, "completion_length": 179.5104217529297, "epoch": 9.27318718381113, "grad_norm": 17.36090561387625, "kl": 0.4482421875, "learning_rate": 2.2719594594594594e-07, "loss": 0.0005, "reward": 3.337352991104126, "reward_std": 0.14889592677354813, "rewards/final_reward": 1.15980387890831, "rewards/mask_iou_reward": 0.579901939454155, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3373527526855469, "rewards/thk_ans_format_reward": 1.0, "step": 2745, "think_completion_length": 9.833333333333332 }, { "clip_ratio": 0.0, "completion_length": 184.02083587646484, "epoch": 9.276559865092748, "grad_norm": 76.89275415294323, "kl": 0.45703125, "learning_rate": 2.2691441441441443e-07, "loss": 0.0005, "reward": 3.601949453353882, "reward_std": 0.18171671777963638, "rewards/final_reward": 1.4770694682232217, "rewards/mask_iou_reward": 0.7385347341116109, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6019494533538818, "rewards/thk_ans_format_reward": 1.0, "step": 2746, "think_completion_length": 7.791666666666666 }, { "clip_ratio": 0.0, "completion_length": 180.33333587646484, "epoch": 9.279932546374368, "grad_norm": 7.111772882118619, "kl": 0.4208984375, "learning_rate": 2.2663288288288289e-07, "loss": 0.0005, "reward": 3.4016687870025635, "reward_std": 0.014272671192884445, "rewards/final_reward": 1.836040758464693, "rewards/mask_iou_reward": 0.9180203792323465, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4016689360141754, "rewards/thk_ans_format_reward": 1.0, "step": 2747, "think_completion_length": 8.625 }, { "clip_ratio": 0.0, "completion_length": 152.61458587646484, "epoch": 9.283305227655987, "grad_norm": 16.122805060508597, "kl": 0.45703125, "learning_rate": 2.2635135135135132e-07, "loss": 0.0005, "reward": 3.6596243381500244, "reward_std": 0.11477926932275295, "rewards/final_reward": 1.8422299682834904, "rewards/mask_iou_reward": 0.9211149841417452, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6596242785453796, "rewards/thk_ans_format_reward": 1.0, "step": 2748, "think_completion_length": 8.375 }, { "clip_ratio": 0.0, "completion_length": 169.90625762939453, "epoch": 9.286677908937605, "grad_norm": 11.169039142915206, "kl": 0.5107421875, "learning_rate": 2.260698198198198e-07, "loss": 0.0005, "reward": 3.5329428911209106, "reward_std": 0.05546556948684156, "rewards/final_reward": 1.790726196815894, "rewards/mask_iou_reward": 0.895363098407947, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5329428911209106, "rewards/thk_ans_format_reward": 1.0, "step": 2749, "think_completion_length": 11.041666666666668 }, { "clip_ratio": 0.0, "completion_length": 242.0416717529297, "epoch": 9.290050590219224, "grad_norm": 12.772723235514924, "kl": 1.099609375, "learning_rate": 2.2578828828828827e-07, "loss": 0.0011, "reward": 3.4059635400772095, "reward_std": 0.1593819446861744, "rewards/final_reward": 1.0051670460292021, "rewards/mask_iou_reward": 0.5025835230146011, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 1.426796793937683, "rewards/thk_ans_format_reward": 0.9895833432674408, "step": 2750, "think_completion_length": 9.458333333333334 }, { "clip_ratio": 0.0, "completion_length": 238.76042938232422, "epoch": 9.293423271500844, "grad_norm": 7.808991961017135, "kl": 0.423828125, "learning_rate": 2.2550675675675673e-07, "loss": 0.0004, "reward": 3.221948981285095, "reward_std": 0.03692587744444609, "rewards/final_reward": 1.4581314656441513, "rewards/mask_iou_reward": 0.7290657328220757, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2219487726688385, "rewards/thk_ans_format_reward": 1.0, "step": 2751, "think_completion_length": 8.666666666666668 }, { "clip_ratio": 0.0, "completion_length": 241.23959350585938, "epoch": 9.296795952782462, "grad_norm": 15.10335540799134, "kl": 0.4306640625, "learning_rate": 2.2522522522522522e-07, "loss": 0.0004, "reward": 3.3186607360839844, "reward_std": 0.06642237678170204, "rewards/final_reward": 0.3598344031115008, "rewards/mask_iou_reward": 0.1799172015557504, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3186606764793396, "rewards/thk_ans_format_reward": 1.0, "step": 2752, "think_completion_length": 9.25 }, { "clip_ratio": 0.0, "completion_length": 138.6666717529297, "epoch": 9.30016863406408, "grad_norm": 38.570647147712435, "kl": 0.4482421875, "learning_rate": 2.2494369369369368e-07, "loss": 0.0005, "reward": 3.66131055355072, "reward_std": 0.03528841398656368, "rewards/final_reward": 1.517077571028143, "rewards/mask_iou_reward": 0.7585387855140715, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6613103747367859, "rewards/thk_ans_format_reward": 1.0, "step": 2753, "think_completion_length": 12.0 }, { "clip_ratio": 0.0, "completion_length": 168.54166793823242, "epoch": 9.3035413153457, "grad_norm": 24.013114970469317, "kl": 0.478515625, "learning_rate": 2.2466216216216216e-07, "loss": 0.0005, "reward": 3.2862175703048706, "reward_std": 0.11558353528380394, "rewards/final_reward": 0.9459682073556034, "rewards/mask_iou_reward": 0.4729841036778017, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2862175107002258, "rewards/thk_ans_format_reward": 1.0, "step": 2754, "think_completion_length": 9.5 }, { "clip_ratio": 0.0, "completion_length": 158.78125762939453, "epoch": 9.306913996627319, "grad_norm": 7.600629988537983, "kl": 0.5322265625, "learning_rate": 2.2438063063063062e-07, "loss": 0.0005, "reward": 3.2242766618728638, "reward_std": 0.03124239854514599, "rewards/final_reward": 1.2069206340078744, "rewards/mask_iou_reward": 0.6034603170039372, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.224276602268219, "rewards/thk_ans_format_reward": 1.0, "step": 2755, "think_completion_length": 9.708333333333332 }, { "clip_ratio": 0.0, "completion_length": 163.72916793823242, "epoch": 9.310286677908937, "grad_norm": 14.613051056094477, "kl": 0.50390625, "learning_rate": 2.2409909909909909e-07, "loss": 0.0005, "reward": 3.601539373397827, "reward_std": 0.0633353553712368, "rewards/final_reward": 1.6655733024676027, "rewards/mask_iou_reward": 0.8327866512338014, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.601539433002472, "rewards/thk_ans_format_reward": 1.0, "step": 2756, "think_completion_length": 8.375 }, { "clip_ratio": 0.0, "completion_length": 254.5104217529297, "epoch": 9.313659359190556, "grad_norm": 11.469524143937166, "kl": 0.408203125, "learning_rate": 2.2381756756756757e-07, "loss": 0.0004, "reward": 3.648510694503784, "reward_std": 0.024022470228374004, "rewards/final_reward": 1.8617895600621224, "rewards/mask_iou_reward": 0.9308947800310612, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6485106945037842, "rewards/thk_ans_format_reward": 1.0, "step": 2757, "think_completion_length": 9.833333333333334 }, { "clip_ratio": 0.0, "completion_length": 173.59375, "epoch": 9.317032040472176, "grad_norm": 12.709447015602692, "kl": 0.474609375, "learning_rate": 2.23536036036036e-07, "loss": 0.0005, "reward": 3.4784998893737793, "reward_std": 0.1642679050564766, "rewards/final_reward": 1.806289599758458, "rewards/mask_iou_reward": 0.903144799879229, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4784998893737793, "rewards/thk_ans_format_reward": 1.0, "step": 2758, "think_completion_length": 8.666666666666666 }, { "clip_ratio": 0.0, "completion_length": 200.6979217529297, "epoch": 9.320404721753794, "grad_norm": 12.901017739216863, "kl": 0.4248046875, "learning_rate": 2.232545045045045e-07, "loss": 0.0004, "reward": 3.3456382751464844, "reward_std": 0.17587218433618546, "rewards/final_reward": 1.1098193639560703, "rewards/mask_iou_reward": 0.5549096819780351, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3456381559371948, "rewards/thk_ans_format_reward": 1.0, "step": 2759, "think_completion_length": 12.0 }, { "clip_ratio": 0.0, "completion_length": 149.09375381469727, "epoch": 9.323777403035413, "grad_norm": 9.370012619189097, "kl": 0.4833984375, "learning_rate": 2.2297297297297295e-07, "loss": 0.0005, "reward": 3.3864450454711914, "reward_std": 0.07419527135789394, "rewards/final_reward": 1.264501399628911, "rewards/mask_iou_reward": 0.6322506998144555, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3864449858665466, "rewards/thk_ans_format_reward": 1.0, "step": 2760, "think_completion_length": 9.458333333333332 }, { "clip_ratio": 0.0, "completion_length": 331.3020935058594, "epoch": 9.327150084317031, "grad_norm": 13.10499968912817, "kl": 0.4248046875, "learning_rate": 2.2269144144144141e-07, "loss": 0.0004, "reward": 3.5883008241653442, "reward_std": 0.2940108925104141, "rewards/final_reward": 1.4497035366533044, "rewards/mask_iou_reward": 0.7248517683266522, "rewards/sam_format_reward": 0.9583333432674408, "rewards/sam_reward_func_ultra": 1.671634018421173, "rewards/thk_ans_format_reward": 0.9583333432674408, "step": 2761, "think_completion_length": 9.333333333333332 }, { "clip_ratio": 0.0, "completion_length": 216.1875, "epoch": 9.330522765598651, "grad_norm": 34.535691879703776, "kl": 0.41015625, "learning_rate": 2.224099099099099e-07, "loss": 0.0004, "reward": 3.440687894821167, "reward_std": 0.026611979119479656, "rewards/final_reward": 1.379231231386878, "rewards/mask_iou_reward": 0.689615615693439, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4406880140304565, "rewards/thk_ans_format_reward": 1.0, "step": 2762, "think_completion_length": 10.166666666666666 }, { "clip_ratio": 0.0, "completion_length": 205.7916717529297, "epoch": 9.33389544688027, "grad_norm": 10.049491198654804, "kl": 0.4208984375, "learning_rate": 2.2212837837837836e-07, "loss": 0.0004, "reward": 3.5684189796447754, "reward_std": 0.08044159226119518, "rewards/final_reward": 1.3640238839079284, "rewards/mask_iou_reward": 0.6820119419539642, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.568418800830841, "rewards/thk_ans_format_reward": 1.0, "step": 2763, "think_completion_length": 9.916666666666668 }, { "clip_ratio": 0.0, "completion_length": 148.70833587646484, "epoch": 9.337268128161888, "grad_norm": 45.7431214634519, "kl": 0.7724609375, "learning_rate": 2.2184684684684685e-07, "loss": 0.0008, "reward": 3.304787039756775, "reward_std": 0.056283093988895416, "rewards/final_reward": 1.3249543151857697, "rewards/mask_iou_reward": 0.6624771575928848, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3047870993614197, "rewards/thk_ans_format_reward": 1.0, "step": 2764, "think_completion_length": 10.208333333333332 }, { "clip_ratio": 0.0, "completion_length": 178.2083396911621, "epoch": 9.340640809443508, "grad_norm": 16.427846702225523, "kl": 0.453125, "learning_rate": 2.215653153153153e-07, "loss": 0.0005, "reward": 3.6365526914596558, "reward_std": 0.04116538679227233, "rewards/final_reward": 1.459079772140508, "rewards/mask_iou_reward": 0.729539886070254, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6365526914596558, "rewards/thk_ans_format_reward": 1.0, "step": 2765, "think_completion_length": 10.625 }, { "clip_ratio": 0.0, "completion_length": 159.75000762939453, "epoch": 9.344013490725127, "grad_norm": 30.64628568079336, "kl": 0.619140625, "learning_rate": 2.2128378378378377e-07, "loss": 0.0007, "reward": 3.6885385513305664, "reward_std": 0.03211810206994414, "rewards/final_reward": 1.9013723725850795, "rewards/mask_iou_reward": 0.9506861862925398, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6885384321212769, "rewards/thk_ans_format_reward": 1.0, "step": 2766, "think_completion_length": 8.708333333333334 }, { "clip_ratio": 0.0, "completion_length": 231.13542938232422, "epoch": 9.347386172006745, "grad_norm": 17.744072847836915, "kl": 0.3955078125, "learning_rate": 2.2100225225225226e-07, "loss": 0.0004, "reward": 3.4503098726272583, "reward_std": 0.08233339712023735, "rewards/final_reward": 1.1525913602996831, "rewards/mask_iou_reward": 0.5762956801498416, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4503095149993896, "rewards/thk_ans_format_reward": 1.0, "step": 2767, "think_completion_length": 9.375 }, { "clip_ratio": 0.0, "completion_length": 235.9375, "epoch": 9.350758853288363, "grad_norm": 10.819239423457798, "kl": 0.3896484375, "learning_rate": 2.207207207207207e-07, "loss": 0.0004, "reward": 3.7137391567230225, "reward_std": 0.07989808917045593, "rewards/final_reward": 1.4908883016966383, "rewards/mask_iou_reward": 0.7454441508483192, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7137390971183777, "rewards/thk_ans_format_reward": 1.0, "step": 2768, "think_completion_length": 8.75 }, { "clip_ratio": 0.0, "completion_length": 277.3229293823242, "epoch": 9.354131534569984, "grad_norm": 9.131795084672026, "kl": 0.798828125, "learning_rate": 2.2043918918918918e-07, "loss": 0.0008, "reward": 3.4504722356796265, "reward_std": 0.025778494775295258, "rewards/final_reward": 1.4927454592099887, "rewards/mask_iou_reward": 0.7463727296049943, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4504724144935608, "rewards/thk_ans_format_reward": 1.0, "step": 2769, "think_completion_length": 9.166666666666666 }, { "clip_ratio": 0.0, "completion_length": 240.6041717529297, "epoch": 9.357504215851602, "grad_norm": 10.494561667187618, "kl": 0.4384765625, "learning_rate": 2.2015765765765764e-07, "loss": 0.0004, "reward": 3.1785553693771362, "reward_std": 0.13898254744708538, "rewards/final_reward": 1.6997683199459952, "rewards/mask_iou_reward": 0.8498841599729976, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1785553693771362, "rewards/thk_ans_format_reward": 1.0, "step": 2770, "think_completion_length": 8.0 }, { "clip_ratio": 0.0, "completion_length": 135.375, "epoch": 9.36087689713322, "grad_norm": 18.847084911989935, "kl": 0.96875, "learning_rate": 2.198761261261261e-07, "loss": 0.001, "reward": 3.3483951091766357, "reward_std": 0.0744620319455862, "rewards/final_reward": 1.822842056260184, "rewards/mask_iou_reward": 0.911421028130092, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3483951687812805, "rewards/thk_ans_format_reward": 1.0, "step": 2771, "think_completion_length": 7.291666666666666 }, { "clip_ratio": 0.0, "completion_length": 203.40625, "epoch": 9.36424957841484, "grad_norm": 13.94947088934637, "kl": 0.431640625, "learning_rate": 2.195945945945946e-07, "loss": 0.0004, "reward": 3.551551580429077, "reward_std": 0.10540284961462021, "rewards/final_reward": 1.421445187031141, "rewards/mask_iou_reward": 0.7107225935155705, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5515515208244324, "rewards/thk_ans_format_reward": 1.0, "step": 2772, "think_completion_length": 10.916666666666666 }, { "clip_ratio": 0.0, "completion_length": 190.05208587646484, "epoch": 9.367622259696459, "grad_norm": 13.877847201874827, "kl": 0.4296875, "learning_rate": 2.1931306306306305e-07, "loss": 0.0004, "reward": 3.6659048795700073, "reward_std": 0.030204295529983938, "rewards/final_reward": 1.4251490346438225, "rewards/mask_iou_reward": 0.7125745173219112, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6659048795700073, "rewards/thk_ans_format_reward": 1.0, "step": 2773, "think_completion_length": 8.875 }, { "clip_ratio": 0.0, "completion_length": 169.1979217529297, "epoch": 9.370994940978077, "grad_norm": 14.637147856784619, "kl": 0.5185546875, "learning_rate": 2.1903153153153154e-07, "loss": 0.0006, "reward": 3.6994014978408813, "reward_std": 0.028351569548249245, "rewards/final_reward": 1.422695676590725, "rewards/mask_iou_reward": 0.7113478382953625, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.699401617050171, "rewards/thk_ans_format_reward": 1.0, "step": 2774, "think_completion_length": 9.083333333333332 }, { "clip_ratio": 0.0, "completion_length": 265.4166717529297, "epoch": 9.374367622259696, "grad_norm": 9.120025231728167, "kl": 0.634765625, "learning_rate": 2.1875e-07, "loss": 0.0006, "reward": 3.551692247390747, "reward_std": 0.04547809809446335, "rewards/final_reward": 1.16152331109033, "rewards/mask_iou_reward": 0.580761655545165, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.551692008972168, "rewards/thk_ans_format_reward": 1.0, "step": 2775, "think_completion_length": 9.25 }, { "clip_ratio": 0.0, "completion_length": 120.25, "epoch": 9.377740303541316, "grad_norm": 9.89069404329003, "kl": 0.4990234375, "learning_rate": 2.1846846846846846e-07, "loss": 0.0005, "reward": 3.7603938579559326, "reward_std": 0.013669957872480154, "rewards/final_reward": 1.9294720336901374, "rewards/mask_iou_reward": 0.9647360168450687, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7603938579559326, "rewards/thk_ans_format_reward": 1.0, "step": 2776, "think_completion_length": 8.25 }, { "clip_ratio": 0.0, "completion_length": 225.2604217529297, "epoch": 9.381112984822934, "grad_norm": 8.331803620841779, "kl": 0.4443359375, "learning_rate": 2.1818693693693694e-07, "loss": 0.0004, "reward": 3.568936824798584, "reward_std": 0.21196496207267046, "rewards/final_reward": 1.3753243020169517, "rewards/mask_iou_reward": 0.6876621510084758, "rewards/sam_format_reward": 0.9791666865348816, "rewards/sam_reward_func_ultra": 1.6106035709381104, "rewards/thk_ans_format_reward": 0.9791666865348816, "step": 2777, "think_completion_length": 8.541666666666666 }, { "clip_ratio": 0.0, "completion_length": 149.27083587646484, "epoch": 9.384485666104553, "grad_norm": 10.35051680882014, "kl": 0.455078125, "learning_rate": 2.1790540540540538e-07, "loss": 0.0005, "reward": 3.6697943210601807, "reward_std": 0.011216352228075266, "rewards/final_reward": 1.5587942518005602, "rewards/mask_iou_reward": 0.7793971259002801, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6697942018508911, "rewards/thk_ans_format_reward": 1.0, "step": 2778, "think_completion_length": 9.916666666666668 }, { "clip_ratio": 0.0, "completion_length": 213.28125762939453, "epoch": 9.387858347386173, "grad_norm": 15.491498623038067, "kl": 0.4501953125, "learning_rate": 2.1762387387387387e-07, "loss": 0.0005, "reward": 3.496561288833618, "reward_std": 0.05198364332318306, "rewards/final_reward": 1.6413194591458895, "rewards/mask_iou_reward": 0.8206597295729448, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4965611696243286, "rewards/thk_ans_format_reward": 1.0, "step": 2779, "think_completion_length": 8.625 }, { "clip_ratio": 0.0, "completion_length": 185.39584350585938, "epoch": 9.391231028667791, "grad_norm": 13.157649856445605, "kl": 0.4296875, "learning_rate": 2.1734234234234233e-07, "loss": 0.0004, "reward": 3.51545250415802, "reward_std": 0.05822751484811306, "rewards/final_reward": 1.3395250521672077, "rewards/mask_iou_reward": 0.6697625260836039, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5154521465301514, "rewards/thk_ans_format_reward": 1.0, "step": 2780, "think_completion_length": 8.375 }, { "clip_ratio": 0.0, "completion_length": 160.11459350585938, "epoch": 9.39460370994941, "grad_norm": 10.604900553990017, "kl": 0.4404296875, "learning_rate": 2.170608108108108e-07, "loss": 0.0005, "reward": 3.6119425296783447, "reward_std": 0.06541701033711433, "rewards/final_reward": 1.712878393204974, "rewards/mask_iou_reward": 0.856439196602487, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6119424104690552, "rewards/thk_ans_format_reward": 1.0, "step": 2781, "think_completion_length": 9.625 }, { "clip_ratio": 0.0, "completion_length": 200.34375, "epoch": 9.397976391231028, "grad_norm": 10.330735606148046, "kl": 0.486328125, "learning_rate": 2.1677927927927927e-07, "loss": 0.0005, "reward": 3.646028161048889, "reward_std": 0.1866953857243061, "rewards/final_reward": 1.9406015070581604, "rewards/mask_iou_reward": 0.9703007535290802, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 1.6668614745140076, "rewards/thk_ans_format_reward": 0.9895833432674408, "step": 2782, "think_completion_length": 10.166666666666666 }, { "clip_ratio": 0.0, "completion_length": 235.0729217529297, "epoch": 9.401349072512648, "grad_norm": 82.46974246026812, "kl": 0.572265625, "learning_rate": 2.1649774774774774e-07, "loss": 0.0006, "reward": 3.3745267391204834, "reward_std": 0.10908135771751404, "rewards/final_reward": 1.268176157405094, "rewards/mask_iou_reward": 0.634088078702547, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3849433660507202, "rewards/thk_ans_format_reward": 0.9895833432674408, "step": 2783, "think_completion_length": 8.708333333333332 }, { "clip_ratio": 0.0, "completion_length": 296.03125, "epoch": 9.404721753794266, "grad_norm": 30.945234479281133, "kl": 0.375, "learning_rate": 2.1621621621621622e-07, "loss": 0.0004, "reward": 3.5293290615081787, "reward_std": 0.0759376734495163, "rewards/final_reward": 1.6069022349327682, "rewards/mask_iou_reward": 0.8034511174663841, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.529329001903534, "rewards/thk_ans_format_reward": 1.0, "step": 2784, "think_completion_length": 8.125 }, { "clip_ratio": 0.0, "completion_length": 143.02083587646484, "epoch": 9.408094435075885, "grad_norm": 22.46595324259033, "kl": 0.501953125, "learning_rate": 2.1593468468468468e-07, "loss": 0.0005, "reward": 3.8743661642074585, "reward_std": 0.05244017764925957, "rewards/final_reward": 1.8538225025823447, "rewards/mask_iou_reward": 0.9269112512911724, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.8743661642074585, "rewards/thk_ans_format_reward": 1.0, "step": 2785, "think_completion_length": 9.458333333333332 }, { "clip_ratio": 0.0, "completion_length": 194.25, "epoch": 9.411467116357505, "grad_norm": 87.8717800260948, "kl": 0.4267578125, "learning_rate": 2.1565315315315314e-07, "loss": 0.0004, "reward": 3.36142361164093, "reward_std": 0.08803194761276245, "rewards/final_reward": 1.5342241975445043, "rewards/mask_iou_reward": 0.7671120987722522, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3614235520362854, "rewards/thk_ans_format_reward": 1.0, "step": 2786, "think_completion_length": 8.166666666666666 }, { "clip_ratio": 0.0, "completion_length": 205.875, "epoch": 9.414839797639123, "grad_norm": 69.32235015398565, "kl": 0.5048828125, "learning_rate": 2.1537162162162163e-07, "loss": 0.0005, "reward": 3.302255868911743, "reward_std": 0.07581621408462524, "rewards/final_reward": 1.2662602907992375, "rewards/mask_iou_reward": 0.6331301453996188, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3022557497024536, "rewards/thk_ans_format_reward": 1.0, "step": 2787, "think_completion_length": 10.875 }, { "clip_ratio": 0.0, "completion_length": 206.50000762939453, "epoch": 9.418212478920742, "grad_norm": 22.671197228263072, "kl": 0.482421875, "learning_rate": 2.1509009009009006e-07, "loss": 0.0005, "reward": 3.691166639328003, "reward_std": 0.029686040244996548, "rewards/final_reward": 1.8484224625982473, "rewards/mask_iou_reward": 0.9242112312991236, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6911665201187134, "rewards/thk_ans_format_reward": 1.0, "step": 2788, "think_completion_length": 9.083333333333332 }, { "clip_ratio": 0.0, "completion_length": 191.30208587646484, "epoch": 9.42158516020236, "grad_norm": 20.21814340136336, "kl": 0.42578125, "learning_rate": 2.1480855855855855e-07, "loss": 0.0004, "reward": 3.7652477025985718, "reward_std": 0.060860181925818324, "rewards/final_reward": 1.9556798290556365, "rewards/mask_iou_reward": 0.9778399145278183, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7652477025985718, "rewards/thk_ans_format_reward": 1.0, "step": 2789, "think_completion_length": 8.458333333333332 }, { "clip_ratio": 0.0, "completion_length": 186.4479217529297, "epoch": 9.42495784148398, "grad_norm": 72.0763048743931, "kl": 0.525390625, "learning_rate": 2.14527027027027e-07, "loss": 0.0005, "reward": 3.4164334535598755, "reward_std": 0.1100648082792759, "rewards/final_reward": 1.1242582255843374, "rewards/mask_iou_reward": 0.5621291127921687, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.416433334350586, "rewards/thk_ans_format_reward": 1.0, "step": 2790, "think_completion_length": 9.291666666666666 }, { "clip_ratio": 0.0, "completion_length": 176.77084350585938, "epoch": 9.428330522765599, "grad_norm": 13.376003044819086, "kl": 0.498046875, "learning_rate": 2.1424549549549547e-07, "loss": 0.0005, "reward": 3.608410954475403, "reward_std": 0.12914259731769562, "rewards/final_reward": 1.7236305317824567, "rewards/mask_iou_reward": 0.8618152658912284, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6084111332893372, "rewards/thk_ans_format_reward": 1.0, "step": 2791, "think_completion_length": 8.291666666666666 }, { "clip_ratio": 0.0, "completion_length": 178.4479217529297, "epoch": 9.431703204047217, "grad_norm": 7.346062847254678, "kl": 0.5341796875, "learning_rate": 2.1396396396396396e-07, "loss": 0.0007, "reward": 3.62905216217041, "reward_std": 0.049922844395041466, "rewards/final_reward": 1.8607454231881027, "rewards/mask_iou_reward": 0.9303727115940513, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6290519833564758, "rewards/thk_ans_format_reward": 1.0, "step": 2792, "think_completion_length": 7.666666666666666 }, { "clip_ratio": 0.0, "completion_length": 154.5208396911621, "epoch": 9.435075885328837, "grad_norm": 17.302456866640764, "kl": 0.47265625, "learning_rate": 2.1368243243243242e-07, "loss": 0.0005, "reward": 3.407977819442749, "reward_std": 0.23191232979297638, "rewards/final_reward": 1.5567038813895737, "rewards/mask_iou_reward": 0.7783519406947869, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4079777002334595, "rewards/thk_ans_format_reward": 1.0, "step": 2793, "think_completion_length": 8.291666666666666 }, { "clip_ratio": 0.0, "completion_length": 143.0104217529297, "epoch": 9.438448566610456, "grad_norm": 23.319498306812275, "kl": 0.4443359375, "learning_rate": 2.134009009009009e-07, "loss": 0.0004, "reward": 3.6632840633392334, "reward_std": 0.04057050496339798, "rewards/final_reward": 1.3894538766016005, "rewards/mask_iou_reward": 0.6947269383008002, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6632840633392334, "rewards/thk_ans_format_reward": 1.0, "step": 2794, "think_completion_length": 8.791666666666666 }, { "clip_ratio": 0.0, "completion_length": 157.3854217529297, "epoch": 9.441821247892074, "grad_norm": 34.32798971403974, "kl": 0.6640625, "learning_rate": 2.1311936936936937e-07, "loss": 0.0007, "reward": 3.824118494987488, "reward_std": 0.09926417097449303, "rewards/final_reward": 1.7717287731157554, "rewards/mask_iou_reward": 0.8858643865578777, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.8241184949874878, "rewards/thk_ans_format_reward": 1.0, "step": 2795, "think_completion_length": 8.333333333333334 }, { "clip_ratio": 0.0, "completion_length": 197.2916717529297, "epoch": 9.445193929173692, "grad_norm": 12.333120877458516, "kl": 0.4375, "learning_rate": 2.1283783783783783e-07, "loss": 0.0005, "reward": 3.581265926361084, "reward_std": 0.08018996939063072, "rewards/final_reward": 1.5700802326638046, "rewards/mask_iou_reward": 0.7850401163319023, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5812659859657288, "rewards/thk_ans_format_reward": 1.0, "step": 2796, "think_completion_length": 8.375 }, { "clip_ratio": 0.0, "completion_length": 222.71875762939453, "epoch": 9.448566610455313, "grad_norm": 7.065641300682204, "kl": 0.4169921875, "learning_rate": 2.1255630630630632e-07, "loss": 0.0005, "reward": 3.70440411567688, "reward_std": 0.04049869813024998, "rewards/final_reward": 1.2983284435416518, "rewards/mask_iou_reward": 0.6491642217708259, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7044039964675903, "rewards/thk_ans_format_reward": 1.0, "step": 2797, "think_completion_length": 6.875 }, { "clip_ratio": 0.0, "completion_length": 149.28125, "epoch": 9.451939291736931, "grad_norm": 13.797851066630797, "kl": 0.4619140625, "learning_rate": 2.1227477477477475e-07, "loss": 0.0005, "reward": 3.374970555305481, "reward_std": 0.0377837847918272, "rewards/final_reward": 1.3635178205191425, "rewards/mask_iou_reward": 0.6817589102595712, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.374970555305481, "rewards/thk_ans_format_reward": 1.0, "step": 2798, "think_completion_length": 8.708333333333332 }, { "clip_ratio": 0.0, "completion_length": 224.58333587646484, "epoch": 9.45531197301855, "grad_norm": 7.334236819259594, "kl": 0.4619140625, "learning_rate": 2.1199324324324324e-07, "loss": 0.0005, "reward": 3.2692062854766846, "reward_std": 0.084585752338171, "rewards/final_reward": 1.1545577914849081, "rewards/mask_iou_reward": 0.5772788957424541, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2692063450813293, "rewards/thk_ans_format_reward": 1.0, "step": 2799, "think_completion_length": 7.333333333333334 }, { "clip_ratio": 0.0, "completion_length": 254.36459350585938, "epoch": 9.45868465430017, "grad_norm": 23.920565501545674, "kl": 0.525390625, "learning_rate": 2.117117117117117e-07, "loss": 0.0005, "reward": 3.405628800392151, "reward_std": 0.1159110739827156, "rewards/final_reward": 1.7630714442007784, "rewards/mask_iou_reward": 0.8815357221003892, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4056288003921509, "rewards/thk_ans_format_reward": 1.0, "step": 2800, "think_completion_length": 8.208333333333334 }, { "clip_ratio": 0.0, "completion_length": 235.03126525878906, "epoch": 9.462057335581788, "grad_norm": 7.875361715270865, "kl": 0.5126953125, "learning_rate": 2.1143018018018016e-07, "loss": 0.0005, "reward": 3.4100340604782104, "reward_std": 0.08821703493595123, "rewards/final_reward": 1.370184435113929, "rewards/mask_iou_reward": 0.6850922175569645, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4100341200828552, "rewards/thk_ans_format_reward": 1.0, "step": 2801, "think_completion_length": 8.791666666666668 }, { "clip_ratio": 0.0, "completion_length": 179.67708587646484, "epoch": 9.465430016863406, "grad_norm": 19.085261433748908, "kl": 0.5966796875, "learning_rate": 2.1114864864864865e-07, "loss": 0.0006, "reward": 3.590638756752014, "reward_std": 0.023143062833696604, "rewards/final_reward": 1.7881143745703603, "rewards/mask_iou_reward": 0.8940571872851801, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5906386971473694, "rewards/thk_ans_format_reward": 1.0, "step": 2802, "think_completion_length": 7.833333333333334 }, { "clip_ratio": 0.0, "completion_length": 130.93750381469727, "epoch": 9.468802698145025, "grad_norm": 13.034191044866834, "kl": 0.423828125, "learning_rate": 2.108671171171171e-07, "loss": 0.0004, "reward": 3.4855542182922363, "reward_std": 0.11600197479128838, "rewards/final_reward": 1.338612547361113, "rewards/mask_iou_reward": 0.6693062736805565, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4855543375015259, "rewards/thk_ans_format_reward": 1.0, "step": 2803, "think_completion_length": 9.75 }, { "clip_ratio": 0.0, "completion_length": 173.84375, "epoch": 9.472175379426645, "grad_norm": 16.930757784970062, "kl": 0.470703125, "learning_rate": 2.1058558558558557e-07, "loss": 0.0005, "reward": 3.5270018577575684, "reward_std": 0.05098412372171879, "rewards/final_reward": 1.841535809184157, "rewards/mask_iou_reward": 0.9207679045920785, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.527001678943634, "rewards/thk_ans_format_reward": 1.0, "step": 2804, "think_completion_length": 8.5 }, { "clip_ratio": 0.0, "completion_length": 141.59375, "epoch": 9.475548060708263, "grad_norm": 8.068372827679651, "kl": 0.591796875, "learning_rate": 2.1030405405405406e-07, "loss": 0.0006, "reward": 3.5839741230010986, "reward_std": 0.021892188116908073, "rewards/final_reward": 1.6267602993609627, "rewards/mask_iou_reward": 0.8133801496804813, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.583974003791809, "rewards/thk_ans_format_reward": 1.0, "step": 2805, "think_completion_length": 8.291666666666668 }, { "clip_ratio": 0.0, "completion_length": 156.20833587646484, "epoch": 9.478920741989882, "grad_norm": 8.689718251800521, "kl": 0.4111328125, "learning_rate": 2.1002252252252252e-07, "loss": 0.0004, "reward": 3.6123223304748535, "reward_std": 0.030155442655086517, "rewards/final_reward": 1.8596118805407436, "rewards/mask_iou_reward": 0.9298059402703718, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6123226284980774, "rewards/thk_ans_format_reward": 1.0, "step": 2806, "think_completion_length": 8.125 }, { "clip_ratio": 0.0, "completion_length": 146.31250762939453, "epoch": 9.4822934232715, "grad_norm": 8.01553579629774, "kl": 0.447265625, "learning_rate": 2.09740990990991e-07, "loss": 0.0005, "reward": 3.1493096351623535, "reward_std": 0.18032991886138916, "rewards/final_reward": 1.1444689445247709, "rewards/mask_iou_reward": 0.5722344722623854, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1493096947669983, "rewards/thk_ans_format_reward": 1.0, "step": 2807, "think_completion_length": 8.5 }, { "clip_ratio": 0.0, "completion_length": 97.47916793823242, "epoch": 9.48566610455312, "grad_norm": 6.552411445556528, "kl": 0.5888671875, "learning_rate": 2.0945945945945944e-07, "loss": 0.0006, "reward": 3.522453546524048, "reward_std": 0.012052702717483044, "rewards/final_reward": 1.1182514968418769, "rewards/mask_iou_reward": 0.5591257484209384, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5224534273147583, "rewards/thk_ans_format_reward": 1.0, "step": 2808, "think_completion_length": 8.958333333333332 }, { "clip_ratio": 0.0, "completion_length": 171.71875762939453, "epoch": 9.489038785834738, "grad_norm": 8.233794743221564, "kl": 0.4638671875, "learning_rate": 2.091779279279279e-07, "loss": 0.0005, "reward": 3.684836983680725, "reward_std": 0.10470253601670265, "rewards/final_reward": 1.3776622207917923, "rewards/mask_iou_reward": 0.6888311103958962, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6848370432853699, "rewards/thk_ans_format_reward": 1.0, "step": 2809, "think_completion_length": 9.833333333333332 }, { "clip_ratio": 0.0, "completion_length": 176.4479217529297, "epoch": 9.492411467116357, "grad_norm": 23.72330348754612, "kl": 0.458984375, "learning_rate": 2.0889639639639638e-07, "loss": 0.0005, "reward": 3.4966185092926025, "reward_std": 0.057080830447375774, "rewards/final_reward": 1.0069303965521077, "rewards/mask_iou_reward": 0.5034651982760538, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.496618628501892, "rewards/thk_ans_format_reward": 1.0, "step": 2810, "think_completion_length": 7.833333333333333 }, { "clip_ratio": 0.0, "completion_length": 180.43750762939453, "epoch": 9.495784148397977, "grad_norm": 15.26687033533583, "kl": 0.591796875, "learning_rate": 2.0861486486486485e-07, "loss": 0.0006, "reward": 3.5150842666625977, "reward_std": 0.1310267341323197, "rewards/final_reward": 1.6641343179725157, "rewards/mask_iou_reward": 0.8320671589862578, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 1.5359174609184265, "rewards/thk_ans_format_reward": 0.9895833432674408, "step": 2811, "think_completion_length": 10.875 }, { "clip_ratio": 0.0, "completion_length": 252.14583587646484, "epoch": 9.499156829679595, "grad_norm": 8.620305372098384, "kl": 0.44140625, "learning_rate": 2.0833333333333333e-07, "loss": 0.0005, "reward": 3.5341700315475464, "reward_std": 0.049245577305555344, "rewards/final_reward": 1.544416112337022, "rewards/mask_iou_reward": 0.772208056168511, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.534170150756836, "rewards/thk_ans_format_reward": 1.0, "step": 2812, "think_completion_length": 8.0 }, { "clip_ratio": 0.0, "completion_length": 213.86458587646484, "epoch": 9.502529510961214, "grad_norm": 6.127603181830918, "kl": 0.5126953125, "learning_rate": 2.080518018018018e-07, "loss": 0.0005, "reward": 3.625041961669922, "reward_std": 0.041364286094903946, "rewards/final_reward": 1.5758643227687703, "rewards/mask_iou_reward": 0.7879321613843852, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6250417828559875, "rewards/thk_ans_format_reward": 1.0, "step": 2813, "think_completion_length": 8.833333333333332 }, { "clip_ratio": 0.0, "completion_length": 216.66667938232422, "epoch": 9.505902192242832, "grad_norm": 14.304108783799528, "kl": 0.5126953125, "learning_rate": 2.0777027027027025e-07, "loss": 0.0005, "reward": 3.7484500408172607, "reward_std": 0.051718422677367926, "rewards/final_reward": 1.8292283794214241, "rewards/mask_iou_reward": 0.9146141897107121, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.748449981212616, "rewards/thk_ans_format_reward": 1.0, "step": 2814, "think_completion_length": 9.75 }, { "clip_ratio": 0.0, "completion_length": 222.4479217529297, "epoch": 9.509274873524452, "grad_norm": 9.773661468760594, "kl": 0.3955078125, "learning_rate": 2.0748873873873874e-07, "loss": 0.0004, "reward": 3.4886139631271362, "reward_std": 0.045159148052334785, "rewards/final_reward": 1.7602771452812136, "rewards/mask_iou_reward": 0.8801385726406068, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4886139631271362, "rewards/thk_ans_format_reward": 1.0, "step": 2815, "think_completion_length": 7.166666666666666 }, { "clip_ratio": 0.0, "completion_length": 196.98958587646484, "epoch": 9.51264755480607, "grad_norm": 16.820619286046863, "kl": 0.4951171875, "learning_rate": 2.072072072072072e-07, "loss": 0.0006, "reward": 3.7381627559661865, "reward_std": 0.11359736323356628, "rewards/final_reward": 1.5969598305950474, "rewards/mask_iou_reward": 0.7984799152975237, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.738162636756897, "rewards/thk_ans_format_reward": 1.0, "step": 2816, "think_completion_length": 7.083333333333333 }, { "clip_ratio": 0.0, "completion_length": 169.42709350585938, "epoch": 9.516020236087689, "grad_norm": 25.73561044687546, "kl": 0.587890625, "learning_rate": 2.069256756756757e-07, "loss": 0.0006, "reward": 3.377044200897217, "reward_std": 0.0334283453412354, "rewards/final_reward": 1.0100619557690784, "rewards/mask_iou_reward": 0.5050309778845392, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3770442605018616, "rewards/thk_ans_format_reward": 1.0, "step": 2817, "think_completion_length": 8.375 }, { "clip_ratio": 0.0, "completion_length": 245.8541717529297, "epoch": 9.51939291736931, "grad_norm": 24.29405298701604, "kl": 0.4287109375, "learning_rate": 2.0664414414414412e-07, "loss": 0.0005, "reward": 3.7982823848724365, "reward_std": 0.2190344613045454, "rewards/final_reward": 1.7568705996030087, "rewards/mask_iou_reward": 0.8784352998015044, "rewards/sam_format_reward": 0.9791666865348816, "rewards/sam_reward_func_ultra": 1.8399492502212524, "rewards/thk_ans_format_reward": 0.9791666865348816, "step": 2818, "think_completion_length": 8.291666666666668 }, { "clip_ratio": 0.0, "completion_length": 151.05208587646484, "epoch": 9.522765598650928, "grad_norm": 53.99049145280786, "kl": 0.470703125, "learning_rate": 2.0636261261261258e-07, "loss": 0.0005, "reward": 3.6919091939926147, "reward_std": 0.023173667024821043, "rewards/final_reward": 1.84371278506089, "rewards/mask_iou_reward": 0.921856392530445, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6919094324111938, "rewards/thk_ans_format_reward": 1.0, "step": 2819, "think_completion_length": 8.0 }, { "clip_ratio": 0.0, "completion_length": 187.1354217529297, "epoch": 9.526138279932546, "grad_norm": 16.178033795092553, "kl": 0.431640625, "learning_rate": 2.0608108108108107e-07, "loss": 0.0004, "reward": 3.308905839920044, "reward_std": 0.08162033371627331, "rewards/final_reward": 1.4741759558412735, "rewards/mask_iou_reward": 0.7370879779206367, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.308905839920044, "rewards/thk_ans_format_reward": 1.0, "step": 2820, "think_completion_length": 8.5 }, { "clip_ratio": 0.0, "completion_length": 231.23959350585938, "epoch": 9.529510961214164, "grad_norm": 8.992478435504813, "kl": 0.3759765625, "learning_rate": 2.0579954954954953e-07, "loss": 0.0004, "reward": 3.4609246253967285, "reward_std": 0.02671785280108452, "rewards/final_reward": 0.9214808888032776, "rewards/mask_iou_reward": 0.4607404444016388, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4609246253967285, "rewards/thk_ans_format_reward": 1.0, "step": 2821, "think_completion_length": 8.583333333333334 }, { "clip_ratio": 0.0, "completion_length": 175.65625762939453, "epoch": 9.532883642495785, "grad_norm": 44.005912402987626, "kl": 0.4619140625, "learning_rate": 2.0551801801801802e-07, "loss": 0.0005, "reward": 3.5948944091796875, "reward_std": 0.012664198991842568, "rewards/final_reward": 1.7858823723189787, "rewards/mask_iou_reward": 0.8929411861594894, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.594894289970398, "rewards/thk_ans_format_reward": 1.0, "step": 2822, "think_completion_length": 6.916666666666667 }, { "clip_ratio": 0.0, "completion_length": 346.7916793823242, "epoch": 9.536256323777403, "grad_norm": 50.977032139791866, "kl": 0.5048828125, "learning_rate": 2.0523648648648648e-07, "loss": 0.0005, "reward": 3.54253351688385, "reward_std": 0.6379697918891907, "rewards/final_reward": 1.8271490412164877, "rewards/mask_iou_reward": 0.9135745206082438, "rewards/sam_format_reward": 0.9375000298023224, "rewards/sam_reward_func_ultra": 1.6675333976745605, "rewards/thk_ans_format_reward": 0.9375000298023224, "step": 2823, "think_completion_length": 9.125 }, { "clip_ratio": 0.0, "completion_length": 191.56250762939453, "epoch": 9.539629005059021, "grad_norm": 11.056068929360753, "kl": 0.41796875, "learning_rate": 2.0495495495495494e-07, "loss": 0.0004, "reward": 3.621427297592163, "reward_std": 0.030739820562303066, "rewards/final_reward": 1.3949359143596283, "rewards/mask_iou_reward": 0.6974679571798141, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6214274168014526, "rewards/thk_ans_format_reward": 1.0, "step": 2824, "think_completion_length": 7.666666666666667 }, { "clip_ratio": 0.0, "completion_length": 199.46875762939453, "epoch": 9.543001686340641, "grad_norm": 19.432115157877316, "kl": 0.4228515625, "learning_rate": 2.0467342342342343e-07, "loss": 0.0004, "reward": 3.6139813661575317, "reward_std": 0.07440846040844917, "rewards/final_reward": 1.8043686765361797, "rewards/mask_iou_reward": 0.9021843382680899, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6139812469482422, "rewards/thk_ans_format_reward": 1.0, "step": 2825, "think_completion_length": 9.541666666666668 }, { "clip_ratio": 0.0, "completion_length": 223.39583587646484, "epoch": 9.54637436762226, "grad_norm": 30.093602692816805, "kl": 0.4404296875, "learning_rate": 2.043918918918919e-07, "loss": 0.0005, "reward": 3.518091917037964, "reward_std": 0.22877466678619385, "rewards/final_reward": 1.5167956450892555, "rewards/mask_iou_reward": 0.7583978225446277, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 1.5389251112937927, "rewards/thk_ans_format_reward": 0.9895833432674408, "step": 2826, "think_completion_length": 7.791666666666666 }, { "clip_ratio": 0.0, "completion_length": 250.0729217529297, "epoch": 9.549747048903878, "grad_norm": 8.053954032812966, "kl": 0.3759765625, "learning_rate": 2.0411036036036035e-07, "loss": 0.0004, "reward": 3.590845227241516, "reward_std": 0.020092520862817764, "rewards/final_reward": 1.6205377049667196, "rewards/mask_iou_reward": 0.8102688524833598, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5908452272415161, "rewards/thk_ans_format_reward": 1.0, "step": 2827, "think_completion_length": 9.541666666666666 }, { "clip_ratio": 0.0, "completion_length": 168.43750762939453, "epoch": 9.553119730185497, "grad_norm": 6.808401299666237, "kl": 0.48828125, "learning_rate": 2.038288288288288e-07, "loss": 0.0005, "reward": 3.514745831489563, "reward_std": 0.057424647733569145, "rewards/final_reward": 0.5211337043794396, "rewards/mask_iou_reward": 0.2605668521897198, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5147458910942078, "rewards/thk_ans_format_reward": 1.0, "step": 2828, "think_completion_length": 9.041666666666668 }, { "clip_ratio": 0.0, "completion_length": 236.30208587646484, "epoch": 9.556492411467117, "grad_norm": 12.857766432793152, "kl": 0.4296875, "learning_rate": 2.0354729729729727e-07, "loss": 0.0005, "reward": 3.6339670419692993, "reward_std": 0.162049344740808, "rewards/final_reward": 1.6951763072555914, "rewards/mask_iou_reward": 0.8475881536277957, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 1.6548004150390625, "rewards/thk_ans_format_reward": 0.9895833432674408, "step": 2829, "think_completion_length": 8.041666666666668 }, { "clip_ratio": 0.0, "completion_length": 229.33333587646484, "epoch": 9.559865092748735, "grad_norm": 8.916261260749389, "kl": 0.3984375, "learning_rate": 2.0326576576576576e-07, "loss": 0.0005, "reward": 3.8546788692474365, "reward_std": 0.035455340053886175, "rewards/final_reward": 1.8457946564614258, "rewards/mask_iou_reward": 0.9228973282307129, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.8546786308288574, "rewards/thk_ans_format_reward": 1.0, "step": 2830, "think_completion_length": 8.5 }, { "clip_ratio": 0.0, "completion_length": 231.01042938232422, "epoch": 9.563237774030354, "grad_norm": 89.02326475916136, "kl": 0.439453125, "learning_rate": 2.0298423423423422e-07, "loss": 0.0004, "reward": 3.3965485095977783, "reward_std": 0.07727420050650835, "rewards/final_reward": 1.6133975362772532, "rewards/mask_iou_reward": 0.8066987681386266, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3965486288070679, "rewards/thk_ans_format_reward": 1.0, "step": 2831, "think_completion_length": 8.583333333333334 }, { "clip_ratio": 0.0, "completion_length": 266.48958587646484, "epoch": 9.566610455311974, "grad_norm": 21.9967390475209, "kl": 0.4501953125, "learning_rate": 2.027027027027027e-07, "loss": 0.0005, "reward": 3.4815778732299805, "reward_std": 0.0476123932749033, "rewards/final_reward": 1.590671112034137, "rewards/mask_iou_reward": 0.7953355560170685, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4815776944160461, "rewards/thk_ans_format_reward": 1.0, "step": 2832, "think_completion_length": 9.083333333333334 }, { "clip_ratio": 0.0, "completion_length": 259.56250762939453, "epoch": 9.569983136593592, "grad_norm": 10.440515270065333, "kl": 0.60546875, "learning_rate": 2.0242117117117117e-07, "loss": 0.0006, "reward": 3.641559362411499, "reward_std": 0.08363806898705661, "rewards/final_reward": 1.3991671911885808, "rewards/mask_iou_reward": 0.6995835955942904, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.641559362411499, "rewards/thk_ans_format_reward": 1.0, "step": 2833, "think_completion_length": 9.75 }, { "clip_ratio": 0.0, "completion_length": 218.10417938232422, "epoch": 9.57335581787521, "grad_norm": 8.531738541716415, "kl": 0.5087890625, "learning_rate": 2.0213963963963963e-07, "loss": 0.0005, "reward": 3.0582525730133057, "reward_std": 0.03069372847676277, "rewards/final_reward": 1.8473170777391852, "rewards/mask_iou_reward": 0.9236585388695926, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0582523941993713, "rewards/thk_ans_format_reward": 1.0, "step": 2834, "think_completion_length": 9.875 }, { "clip_ratio": 0.0, "completion_length": 213.4479217529297, "epoch": 9.576728499156829, "grad_norm": 8.21043348175991, "kl": 0.703125, "learning_rate": 2.0185810810810811e-07, "loss": 0.0007, "reward": 3.729646682739258, "reward_std": 0.06171860918402672, "rewards/final_reward": 1.3267377948958567, "rewards/mask_iou_reward": 0.6633688974479284, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7296466827392578, "rewards/thk_ans_format_reward": 1.0, "step": 2835, "think_completion_length": 9.208333333333332 }, { "clip_ratio": 0.0, "completion_length": 296.2708435058594, "epoch": 9.580101180438449, "grad_norm": 10.642499161128272, "kl": 0.427734375, "learning_rate": 2.0157657657657657e-07, "loss": 0.0004, "reward": 3.362258553504944, "reward_std": 0.16495228372514248, "rewards/final_reward": 1.7112586410634867, "rewards/mask_iou_reward": 0.8556293205317433, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 1.3830917477607727, "rewards/thk_ans_format_reward": 0.9895833432674408, "step": 2836, "think_completion_length": 6.916666666666666 }, { "clip_ratio": 0.0, "completion_length": 196.20833587646484, "epoch": 9.583473861720067, "grad_norm": 21.840827938504045, "kl": 0.513671875, "learning_rate": 2.0129504504504503e-07, "loss": 0.0005, "reward": 3.5530951023101807, "reward_std": 0.05677308700978756, "rewards/final_reward": 1.543489324604837, "rewards/mask_iou_reward": 0.7717446623024184, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5530951023101807, "rewards/thk_ans_format_reward": 1.0, "step": 2837, "think_completion_length": 8.5 }, { "clip_ratio": 0.0, "completion_length": 191.30208587646484, "epoch": 9.586846543001686, "grad_norm": 9.55288922944407, "kl": 0.462890625, "learning_rate": 2.010135135135135e-07, "loss": 0.0005, "reward": 3.3496075868606567, "reward_std": 0.05496904905885458, "rewards/final_reward": 0.8730091994708987, "rewards/mask_iou_reward": 0.43650459973544936, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3496074676513672, "rewards/thk_ans_format_reward": 1.0, "step": 2838, "think_completion_length": 11.833333333333332 }, { "clip_ratio": 0.0, "completion_length": 224.7916717529297, "epoch": 9.590219224283306, "grad_norm": 18.349280040949843, "kl": 0.412109375, "learning_rate": 2.0073198198198196e-07, "loss": 0.0005, "reward": 3.684578537940979, "reward_std": 0.17463991791009903, "rewards/final_reward": 1.3727932052574463, "rewards/mask_iou_reward": 0.6863966026287232, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 1.7054117321968079, "rewards/thk_ans_format_reward": 0.9895833432674408, "step": 2839, "think_completion_length": 10.333333333333332 }, { "clip_ratio": 0.0, "completion_length": 163.36458587646484, "epoch": 9.593591905564924, "grad_norm": 13.069291248829193, "kl": 0.525390625, "learning_rate": 2.0045045045045044e-07, "loss": 0.0005, "reward": 3.7611004114151, "reward_std": 0.10698830150067806, "rewards/final_reward": 1.5477931511291438, "rewards/mask_iou_reward": 0.7738965755645719, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7611003518104553, "rewards/thk_ans_format_reward": 1.0, "step": 2840, "think_completion_length": 9.666666666666668 }, { "clip_ratio": 0.0, "completion_length": 194.17708587646484, "epoch": 9.596964586846543, "grad_norm": 11.268109832702788, "kl": 0.42578125, "learning_rate": 2.001689189189189e-07, "loss": 0.0004, "reward": 3.5092090368270874, "reward_std": 0.08594032749533653, "rewards/final_reward": 1.531189337710352, "rewards/mask_iou_reward": 0.765594668855176, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.509209156036377, "rewards/thk_ans_format_reward": 1.0, "step": 2841, "think_completion_length": 10.208333333333332 }, { "clip_ratio": 0.0, "completion_length": 236.30209350585938, "epoch": 9.600337268128161, "grad_norm": 7.4274243871513805, "kl": 0.4140625, "learning_rate": 1.998873873873874e-07, "loss": 0.0005, "reward": 3.5415148735046387, "reward_std": 0.046683117747306824, "rewards/final_reward": 1.481277049260404, "rewards/mask_iou_reward": 0.740638524630202, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5415149927139282, "rewards/thk_ans_format_reward": 1.0, "step": 2842, "think_completion_length": 8.125 }, { "clip_ratio": 0.0, "completion_length": 203.87500762939453, "epoch": 9.603709949409781, "grad_norm": 55.56419412559832, "kl": 0.4609375, "learning_rate": 1.9960585585585585e-07, "loss": 0.0005, "reward": 3.7100802659988403, "reward_std": 0.023584270384162664, "rewards/final_reward": 1.7788794468594284, "rewards/mask_iou_reward": 0.8894397234297142, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7100803852081299, "rewards/thk_ans_format_reward": 1.0, "step": 2843, "think_completion_length": 7.666666666666667 }, { "clip_ratio": 0.0, "completion_length": 168.81250762939453, "epoch": 9.6070826306914, "grad_norm": 52.155557130505954, "kl": 0.4541015625, "learning_rate": 1.993243243243243e-07, "loss": 0.0005, "reward": 3.713410258293152, "reward_std": 0.026679479517042637, "rewards/final_reward": 1.7559925178717823, "rewards/mask_iou_reward": 0.8779962589358912, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7134102582931519, "rewards/thk_ans_format_reward": 1.0, "step": 2844, "think_completion_length": 8.5 }, { "clip_ratio": 0.0, "completion_length": 195.9583396911621, "epoch": 9.610455311973018, "grad_norm": 13.12615115215815, "kl": 0.46875, "learning_rate": 1.990427927927928e-07, "loss": 0.0005, "reward": 3.2496907711029053, "reward_std": 0.22381212748587132, "rewards/final_reward": 1.5477438783253565, "rewards/mask_iou_reward": 0.7738719391626783, "rewards/sam_format_reward": 0.9791666865348816, "rewards/sam_reward_func_ultra": 1.291357159614563, "rewards/thk_ans_format_reward": 0.9791666865348816, "step": 2845, "think_completion_length": 10.125 }, { "clip_ratio": 0.0, "completion_length": 244.58334350585938, "epoch": 9.613827993254638, "grad_norm": 12.077068644217759, "kl": 0.4794921875, "learning_rate": 1.9876126126126126e-07, "loss": 0.0005, "reward": 3.672512650489807, "reward_std": 0.04940144717693329, "rewards/final_reward": 1.7971136101756204, "rewards/mask_iou_reward": 0.8985568050878102, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.672512710094452, "rewards/thk_ans_format_reward": 1.0, "step": 2846, "think_completion_length": 8.666666666666666 }, { "clip_ratio": 0.0, "completion_length": 183.86459350585938, "epoch": 9.617200674536257, "grad_norm": 11.875814882323452, "kl": 0.4873046875, "learning_rate": 1.9847972972972972e-07, "loss": 0.0005, "reward": 3.702423334121704, "reward_std": 0.0695484783500433, "rewards/final_reward": 1.546965553843143, "rewards/mask_iou_reward": 0.7734827769215715, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7024234533309937, "rewards/thk_ans_format_reward": 1.0, "step": 2847, "think_completion_length": 8.75 }, { "clip_ratio": 0.0, "completion_length": 207.41667938232422, "epoch": 9.620573355817875, "grad_norm": 12.888691948386171, "kl": 0.427734375, "learning_rate": 1.9819819819819818e-07, "loss": 0.0004, "reward": 3.688524842262268, "reward_std": 0.06056614965200424, "rewards/final_reward": 1.271105252712622, "rewards/mask_iou_reward": 0.635552626356311, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.688524842262268, "rewards/thk_ans_format_reward": 1.0, "step": 2848, "think_completion_length": 8.708333333333334 }, { "clip_ratio": 0.0, "completion_length": 172.73958587646484, "epoch": 9.623946037099493, "grad_norm": 9.132160917417677, "kl": 0.4716796875, "learning_rate": 1.9791666666666664e-07, "loss": 0.0005, "reward": 3.5782171487808228, "reward_std": 0.056133901700377464, "rewards/final_reward": 1.1767378869727714, "rewards/mask_iou_reward": 0.5883689434863857, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.578217089176178, "rewards/thk_ans_format_reward": 1.0, "step": 2849, "think_completion_length": 9.125 }, { "clip_ratio": 0.0, "completion_length": 237.5104217529297, "epoch": 9.627318718381114, "grad_norm": 34.526748684702255, "kl": 0.4560546875, "learning_rate": 1.9763513513513513e-07, "loss": 0.0005, "reward": 3.475613832473755, "reward_std": 0.07276524603366852, "rewards/final_reward": 1.910589242117398, "rewards/mask_iou_reward": 0.955294621058699, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4756136536598206, "rewards/thk_ans_format_reward": 1.0, "step": 2850, "think_completion_length": 8.75 }, { "clip_ratio": 0.0, "completion_length": 199.1041717529297, "epoch": 9.630691399662732, "grad_norm": 17.432998355345177, "kl": 0.43359375, "learning_rate": 1.973536036036036e-07, "loss": 0.0004, "reward": 3.3042174577713013, "reward_std": 0.11458679661154747, "rewards/final_reward": 0.8685805073799024, "rewards/mask_iou_reward": 0.4342902536899512, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3042174577713013, "rewards/thk_ans_format_reward": 1.0, "step": 2851, "think_completion_length": 7.583333333333334 }, { "clip_ratio": 0.0, "completion_length": 247.40625762939453, "epoch": 9.63406408094435, "grad_norm": 16.700568944178123, "kl": 0.3974609375, "learning_rate": 1.9707207207207208e-07, "loss": 0.0004, "reward": 3.670613646507263, "reward_std": 0.06477308459579945, "rewards/final_reward": 1.623943956731143, "rewards/mask_iou_reward": 0.8119719783655714, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6706136465072632, "rewards/thk_ans_format_reward": 1.0, "step": 2852, "think_completion_length": 8.458333333333332 }, { "clip_ratio": 0.0, "completion_length": 278.875, "epoch": 9.63743676222597, "grad_norm": 7.654529656910214, "kl": 0.365234375, "learning_rate": 1.9679054054054054e-07, "loss": 0.0004, "reward": 3.3408288955688477, "reward_std": 0.19411547109484673, "rewards/final_reward": 1.2078860906808835, "rewards/mask_iou_reward": 0.6039430453404417, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 1.3616620898246765, "rewards/thk_ans_format_reward": 0.9895833432674408, "step": 2853, "think_completion_length": 7.625 }, { "clip_ratio": 0.0, "completion_length": 187.83333587646484, "epoch": 9.640809443507589, "grad_norm": 9.992895153761978, "kl": 0.4052734375, "learning_rate": 1.96509009009009e-07, "loss": 0.0004, "reward": 3.546285390853882, "reward_std": 0.07918055914342403, "rewards/final_reward": 1.3308878379968134, "rewards/mask_iou_reward": 0.6654439189984067, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5462854504585266, "rewards/thk_ans_format_reward": 1.0, "step": 2854, "think_completion_length": 9.833333333333332 }, { "clip_ratio": 0.0, "completion_length": 169.2291717529297, "epoch": 9.644182124789207, "grad_norm": 22.997248674417847, "kl": 0.533203125, "learning_rate": 1.9622747747747749e-07, "loss": 0.0005, "reward": 3.7459217309951782, "reward_std": 0.029171346686780453, "rewards/final_reward": 1.7417187497924966, "rewards/mask_iou_reward": 0.8708593748962483, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7459213733673096, "rewards/thk_ans_format_reward": 1.0, "step": 2855, "think_completion_length": 8.541666666666668 }, { "clip_ratio": 0.0, "completion_length": 221.6354217529297, "epoch": 9.647554806070826, "grad_norm": 35.438827198570095, "kl": 0.3837890625, "learning_rate": 1.9594594594594595e-07, "loss": 0.0004, "reward": 3.6280492544174194, "reward_std": 0.028105991892516613, "rewards/final_reward": 1.9573374531014816, "rewards/mask_iou_reward": 0.9786687265507408, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6280492544174194, "rewards/thk_ans_format_reward": 1.0, "step": 2856, "think_completion_length": 9.166666666666666 }, { "clip_ratio": 0.0, "completion_length": 153.42708587646484, "epoch": 9.650927487352446, "grad_norm": 25.10256945414261, "kl": 0.4345703125, "learning_rate": 1.956644144144144e-07, "loss": 0.0004, "reward": 3.5365242958068848, "reward_std": 0.05269638076424599, "rewards/final_reward": 1.9359144661557814, "rewards/mask_iou_reward": 0.9679572330778907, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5365243554115295, "rewards/thk_ans_format_reward": 1.0, "step": 2857, "think_completion_length": 9.666666666666668 }, { "clip_ratio": 0.0, "completion_length": 211.3541717529297, "epoch": 9.654300168634064, "grad_norm": 11.130338599723647, "kl": 0.5693359375, "learning_rate": 1.9538288288288287e-07, "loss": 0.0006, "reward": 3.5365134477615356, "reward_std": 0.01926427148282528, "rewards/final_reward": 0.9418421923795386, "rewards/mask_iou_reward": 0.4709210961897693, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5365132093429565, "rewards/thk_ans_format_reward": 1.0, "step": 2858, "think_completion_length": 8.375 }, { "clip_ratio": 0.0, "completion_length": 225.69792938232422, "epoch": 9.657672849915683, "grad_norm": 41.97861008305546, "kl": 0.5126953125, "learning_rate": 1.9510135135135133e-07, "loss": 0.0005, "reward": 3.6388747692108154, "reward_std": 0.08557260315865278, "rewards/final_reward": 1.4884645818165254, "rewards/mask_iou_reward": 0.7442322909082627, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6388747692108154, "rewards/thk_ans_format_reward": 1.0, "step": 2859, "think_completion_length": 9.791666666666666 }, { "clip_ratio": 0.0, "completion_length": 140.8229217529297, "epoch": 9.661045531197303, "grad_norm": 9.662696333050443, "kl": 0.66015625, "learning_rate": 1.9481981981981982e-07, "loss": 0.0006, "reward": 3.6072356700897217, "reward_std": 0.057722508907318115, "rewards/final_reward": 1.7999765534732362, "rewards/mask_iou_reward": 0.8999882767366181, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.607235610485077, "rewards/thk_ans_format_reward": 1.0, "step": 2860, "think_completion_length": 10.875 }, { "clip_ratio": 0.0, "completion_length": 205.0625, "epoch": 9.664418212478921, "grad_norm": 11.981101585335209, "kl": 0.478515625, "learning_rate": 1.9453828828828828e-07, "loss": 0.0005, "reward": 3.332722783088684, "reward_std": 0.09129737317562103, "rewards/final_reward": 0.5991891207919265, "rewards/mask_iou_reward": 0.29959456039596327, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3327226638793945, "rewards/thk_ans_format_reward": 1.0, "step": 2861, "think_completion_length": 8.291666666666666 }, { "clip_ratio": 0.0, "completion_length": 240.3229217529297, "epoch": 9.66779089376054, "grad_norm": 25.87603734036553, "kl": 0.4580078125, "learning_rate": 1.9425675675675674e-07, "loss": 0.0005, "reward": 3.6100000143051147, "reward_std": 0.07859287038445473, "rewards/final_reward": 1.7322674504974922, "rewards/mask_iou_reward": 0.8661337252487461, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6100000143051147, "rewards/thk_ans_format_reward": 1.0, "step": 2862, "think_completion_length": 9.166666666666668 }, { "clip_ratio": 0.0, "completion_length": 152.7604217529297, "epoch": 9.671163575042158, "grad_norm": 7.350777072868899, "kl": 0.5625, "learning_rate": 1.9397522522522522e-07, "loss": 0.0006, "reward": 3.685990810394287, "reward_std": 0.03065543156117201, "rewards/final_reward": 1.5288908551946414, "rewards/mask_iou_reward": 0.7644454275973207, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6859906911849976, "rewards/thk_ans_format_reward": 1.0, "step": 2863, "think_completion_length": 8.666666666666668 }, { "clip_ratio": 0.0, "completion_length": 134.2395896911621, "epoch": 9.674536256323778, "grad_norm": 14.105106198347636, "kl": 0.611328125, "learning_rate": 1.9369369369369368e-07, "loss": 0.0006, "reward": 3.8575843572616577, "reward_std": 0.021873501129448414, "rewards/final_reward": 1.7784202895456058, "rewards/mask_iou_reward": 0.8892101447728029, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.8575843572616577, "rewards/thk_ans_format_reward": 1.0, "step": 2864, "think_completion_length": 8.166666666666668 }, { "clip_ratio": 0.0, "completion_length": 205.75000762939453, "epoch": 9.677908937605396, "grad_norm": 9.671511209060515, "kl": 0.47265625, "learning_rate": 1.9341216216216217e-07, "loss": 0.0004, "reward": 3.6749985218048096, "reward_std": 0.062141310423612595, "rewards/final_reward": 1.9276742195950716, "rewards/mask_iou_reward": 0.9638371097975358, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6749984622001648, "rewards/thk_ans_format_reward": 1.0, "step": 2865, "think_completion_length": 10.083333333333334 }, { "clip_ratio": 0.0, "completion_length": 217.23959350585938, "epoch": 9.681281618887015, "grad_norm": 14.355187256720177, "kl": 0.443359375, "learning_rate": 1.9313063063063063e-07, "loss": 0.0004, "reward": 3.312918186187744, "reward_std": 0.07452259492129087, "rewards/final_reward": 1.587132035400053, "rewards/mask_iou_reward": 0.7935660177000265, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3129181861877441, "rewards/thk_ans_format_reward": 1.0, "step": 2866, "think_completion_length": 8.5 }, { "clip_ratio": 0.0, "completion_length": 198.8229217529297, "epoch": 9.684654300168635, "grad_norm": 21.42101266336042, "kl": 0.4296875, "learning_rate": 1.9284909909909907e-07, "loss": 0.0004, "reward": 3.5092891454696655, "reward_std": 0.0639540646225214, "rewards/final_reward": 1.7018374552469127, "rewards/mask_iou_reward": 0.8509187276234563, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5092891454696655, "rewards/thk_ans_format_reward": 1.0, "step": 2867, "think_completion_length": 9.5 }, { "clip_ratio": 0.0, "completion_length": 136.625, "epoch": 9.688026981450253, "grad_norm": 9.391503997617093, "kl": 0.513671875, "learning_rate": 1.9256756756756755e-07, "loss": 0.0006, "reward": 3.828369140625, "reward_std": 0.05252628936432302, "rewards/final_reward": 1.9016076696050992, "rewards/mask_iou_reward": 0.9508038348025496, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.828369140625, "rewards/thk_ans_format_reward": 1.0, "step": 2868, "think_completion_length": 7.958333333333333 }, { "clip_ratio": 0.0, "completion_length": 209.89583587646484, "epoch": 9.691399662731872, "grad_norm": 9.51695567109779, "kl": 0.576171875, "learning_rate": 1.9228603603603601e-07, "loss": 0.0006, "reward": 3.6162601709365845, "reward_std": 0.14732644706964493, "rewards/final_reward": 1.7979722921278647, "rewards/mask_iou_reward": 0.8989861460639323, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6162601113319397, "rewards/thk_ans_format_reward": 1.0, "step": 2869, "think_completion_length": 9.625 }, { "clip_ratio": 0.0, "completion_length": 246.69792938232422, "epoch": 9.69477234401349, "grad_norm": 13.885753410600678, "kl": 0.4423828125, "learning_rate": 1.920045045045045e-07, "loss": 0.0005, "reward": 3.571479082107544, "reward_std": 0.037937651155516505, "rewards/final_reward": 1.94194025944619, "rewards/mask_iou_reward": 0.970970129723095, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5714789032936096, "rewards/thk_ans_format_reward": 1.0, "step": 2870, "think_completion_length": 7.958333333333334 }, { "clip_ratio": 0.0, "completion_length": 228.64584350585938, "epoch": 9.69814502529511, "grad_norm": 9.751534245853538, "kl": 0.4345703125, "learning_rate": 1.9172297297297296e-07, "loss": 0.0005, "reward": 3.57540225982666, "reward_std": 0.08446568250656128, "rewards/final_reward": 1.8323340722062373, "rewards/mask_iou_reward": 0.9161670361031187, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5754019618034363, "rewards/thk_ans_format_reward": 1.0, "step": 2871, "think_completion_length": 8.375 }, { "clip_ratio": 0.0, "completion_length": 238.43751525878906, "epoch": 9.701517706576729, "grad_norm": 25.327767980045827, "kl": 0.427734375, "learning_rate": 1.9144144144144142e-07, "loss": 0.0004, "reward": 3.1767791509628296, "reward_std": 0.16962359100580215, "rewards/final_reward": 1.242573685966323, "rewards/mask_iou_reward": 0.6212868429831615, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1767792701721191, "rewards/thk_ans_format_reward": 1.0, "step": 2872, "think_completion_length": 8.916666666666666 }, { "clip_ratio": 0.0, "completion_length": 223.0729217529297, "epoch": 9.704890387858347, "grad_norm": 9.41024703721106, "kl": 0.435546875, "learning_rate": 1.911599099099099e-07, "loss": 0.0005, "reward": 3.851110577583313, "reward_std": 0.03305862098932266, "rewards/final_reward": 1.9446028234279495, "rewards/mask_iou_reward": 0.9723014117139748, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.8511103391647339, "rewards/thk_ans_format_reward": 1.0, "step": 2873, "think_completion_length": 9.708333333333334 }, { "clip_ratio": 0.0, "completion_length": 139.83333587646484, "epoch": 9.708263069139967, "grad_norm": 25.116639383545913, "kl": 0.595703125, "learning_rate": 1.9087837837837837e-07, "loss": 0.0007, "reward": 3.62956440448761, "reward_std": 0.04598809592425823, "rewards/final_reward": 1.8718766468069932, "rewards/mask_iou_reward": 0.9359383234034966, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6295644640922546, "rewards/thk_ans_format_reward": 1.0, "step": 2874, "think_completion_length": 8.583333333333334 }, { "clip_ratio": 0.0, "completion_length": 219.02083587646484, "epoch": 9.711635750421586, "grad_norm": 8.665427467336604, "kl": 0.6279296875, "learning_rate": 1.9059684684684686e-07, "loss": 0.0006, "reward": 3.5676958560943604, "reward_std": 0.0411482872441411, "rewards/final_reward": 1.9172117945198663, "rewards/mask_iou_reward": 0.9586058972599332, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5676957964897156, "rewards/thk_ans_format_reward": 1.0, "step": 2875, "think_completion_length": 7.958333333333333 }, { "clip_ratio": 0.0, "completion_length": 127.20833587646484, "epoch": 9.715008431703204, "grad_norm": 8.656622653908304, "kl": 0.474609375, "learning_rate": 1.9031531531531532e-07, "loss": 0.0005, "reward": 3.753931999206543, "reward_std": 0.015061838086694479, "rewards/final_reward": 1.7427191758720917, "rewards/mask_iou_reward": 0.8713595879360458, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.753931999206543, "rewards/thk_ans_format_reward": 1.0, "step": 2876, "think_completion_length": 9.833333333333332 }, { "clip_ratio": 0.0, "completion_length": 204.7604217529297, "epoch": 9.718381112984822, "grad_norm": 18.87539927772862, "kl": 0.451171875, "learning_rate": 1.9003378378378375e-07, "loss": 0.0007, "reward": 3.5150372982025146, "reward_std": 0.033334359526634216, "rewards/final_reward": 1.7786438688846506, "rewards/mask_iou_reward": 0.8893219344423253, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5150372982025146, "rewards/thk_ans_format_reward": 1.0, "step": 2877, "think_completion_length": 9.666666666666668 }, { "clip_ratio": 0.0, "completion_length": 185.80209350585938, "epoch": 9.721753794266442, "grad_norm": 11.569865957597344, "kl": 0.423828125, "learning_rate": 1.8975225225225224e-07, "loss": 0.0004, "reward": 3.5578192472457886, "reward_std": 0.025620101019740105, "rewards/final_reward": 1.266151811013885, "rewards/mask_iou_reward": 0.6330759055069425, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5578192472457886, "rewards/thk_ans_format_reward": 1.0, "step": 2878, "think_completion_length": 9.416666666666668 }, { "clip_ratio": 0.0, "completion_length": 153.55209350585938, "epoch": 9.72512647554806, "grad_norm": 12.922203046169475, "kl": 0.4345703125, "learning_rate": 1.894707207207207e-07, "loss": 0.0004, "reward": 3.7492141723632812, "reward_std": 0.03508290648460388, "rewards/final_reward": 1.8595868449740705, "rewards/mask_iou_reward": 0.9297934224870352, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7492140531539917, "rewards/thk_ans_format_reward": 1.0, "step": 2879, "think_completion_length": 10.333333333333332 }, { "clip_ratio": 0.0, "completion_length": 229.35417938232422, "epoch": 9.72849915682968, "grad_norm": 18.81680975098738, "kl": 0.4453125, "learning_rate": 1.891891891891892e-07, "loss": 0.0004, "reward": 3.4972580671310425, "reward_std": 0.03990233689546585, "rewards/final_reward": 1.6644943242104895, "rewards/mask_iou_reward": 0.8322471621052447, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4972580671310425, "rewards/thk_ans_format_reward": 1.0, "step": 2880, "think_completion_length": 8.458333333333332 }, { "clip_ratio": 0.0, "completion_length": 214.30208587646484, "epoch": 9.7318718381113, "grad_norm": 15.639340031886851, "kl": 0.529296875, "learning_rate": 1.8890765765765765e-07, "loss": 0.0005, "reward": 3.573089361190796, "reward_std": 0.023514626547694206, "rewards/final_reward": 1.8379977343968616, "rewards/mask_iou_reward": 0.9189988671984308, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5730892419815063, "rewards/thk_ans_format_reward": 1.0, "step": 2881, "think_completion_length": 8.125 }, { "clip_ratio": 0.0, "completion_length": 230.45834350585938, "epoch": 9.735244519392918, "grad_norm": 55.216362046610456, "kl": 0.46875, "learning_rate": 1.886261261261261e-07, "loss": 0.0005, "reward": 3.632011890411377, "reward_std": 0.04148270934820175, "rewards/final_reward": 1.554612728702804, "rewards/mask_iou_reward": 0.777306364351402, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6320120096206665, "rewards/thk_ans_format_reward": 1.0, "step": 2882, "think_completion_length": 8.708333333333332 }, { "clip_ratio": 0.0, "completion_length": 173.61458587646484, "epoch": 9.738617200674536, "grad_norm": 8.776607905670236, "kl": 0.50390625, "learning_rate": 1.883445945945946e-07, "loss": 0.0006, "reward": 3.7969205379486084, "reward_std": 0.03961823880672455, "rewards/final_reward": 1.7294312710622033, "rewards/mask_iou_reward": 0.8647156355311016, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7969204783439636, "rewards/thk_ans_format_reward": 1.0, "step": 2883, "think_completion_length": 7.583333333333333 }, { "clip_ratio": 0.0, "completion_length": 180.6041717529297, "epoch": 9.741989881956155, "grad_norm": 28.578509588903785, "kl": 0.4267578125, "learning_rate": 1.8806306306306306e-07, "loss": 0.0004, "reward": 3.512947916984558, "reward_std": 0.12944239377975464, "rewards/final_reward": 1.3326931978912158, "rewards/mask_iou_reward": 0.6663465989456079, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.512947916984558, "rewards/thk_ans_format_reward": 1.0, "step": 2884, "think_completion_length": 8.041666666666666 }, { "clip_ratio": 0.0, "completion_length": 181.50000762939453, "epoch": 9.745362563237775, "grad_norm": 7.72589885581016, "kl": 0.4501953125, "learning_rate": 1.8778153153153154e-07, "loss": 0.0004, "reward": 3.344777226448059, "reward_std": 0.04612966813147068, "rewards/final_reward": 1.068333717229333, "rewards/mask_iou_reward": 0.5341668586146665, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3447771072387695, "rewards/thk_ans_format_reward": 1.0, "step": 2885, "think_completion_length": 9.208333333333332 }, { "clip_ratio": 0.0, "completion_length": 152.02083587646484, "epoch": 9.748735244519393, "grad_norm": 25.71317419883287, "kl": 0.595703125, "learning_rate": 1.875e-07, "loss": 0.0006, "reward": 3.7086650133132935, "reward_std": 0.08883501403033733, "rewards/final_reward": 1.6490717348582409, "rewards/mask_iou_reward": 0.8245358674291204, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7086647152900696, "rewards/thk_ans_format_reward": 1.0, "step": 2886, "think_completion_length": 9.541666666666668 }, { "clip_ratio": 0.0, "completion_length": 162.40625381469727, "epoch": 9.752107925801011, "grad_norm": 17.403028726088174, "kl": 0.55078125, "learning_rate": 1.8721846846846844e-07, "loss": 0.0006, "reward": 3.697953701019287, "reward_std": 0.0342103186994791, "rewards/final_reward": 1.7548838832095166, "rewards/mask_iou_reward": 0.8774419416047583, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6979536414146423, "rewards/thk_ans_format_reward": 1.0, "step": 2887, "think_completion_length": 7.416666666666666 }, { "clip_ratio": 0.0, "completion_length": 226.89584350585938, "epoch": 9.75548060708263, "grad_norm": 13.441340996428666, "kl": 0.513671875, "learning_rate": 1.8693693693693693e-07, "loss": 0.0005, "reward": 3.58347225189209, "reward_std": 0.03695745766162872, "rewards/final_reward": 1.780572817810726, "rewards/mask_iou_reward": 0.890286408905363, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5834720730781555, "rewards/thk_ans_format_reward": 1.0, "step": 2888, "think_completion_length": 9.958333333333334 }, { "clip_ratio": 0.0, "completion_length": 122.28125381469727, "epoch": 9.75885328836425, "grad_norm": 9.806673786228945, "kl": 0.57421875, "learning_rate": 1.8665540540540539e-07, "loss": 0.0006, "reward": 3.58203661441803, "reward_std": 0.12312077358365059, "rewards/final_reward": 0.9904166333088189, "rewards/mask_iou_reward": 0.49520831665440945, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5820364356040955, "rewards/thk_ans_format_reward": 1.0, "step": 2889, "think_completion_length": 9.75 }, { "clip_ratio": 0.0, "completion_length": 190.4166717529297, "epoch": 9.762225969645868, "grad_norm": 8.634095090279637, "kl": 0.4501953125, "learning_rate": 1.8637387387387387e-07, "loss": 0.0005, "reward": 3.577873945236206, "reward_std": 0.0438346890732646, "rewards/final_reward": 1.4734708922293716, "rewards/mask_iou_reward": 0.7367354461146858, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5778738856315613, "rewards/thk_ans_format_reward": 1.0, "step": 2890, "think_completion_length": 10.208333333333332 }, { "clip_ratio": 0.0, "completion_length": 210.78126525878906, "epoch": 9.765598650927487, "grad_norm": 21.725535355634502, "kl": 0.509765625, "learning_rate": 1.8609234234234233e-07, "loss": 0.0005, "reward": 3.6148117780685425, "reward_std": 0.06591962184756994, "rewards/final_reward": 1.7778298342514938, "rewards/mask_iou_reward": 0.8889149171257469, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6148120760917664, "rewards/thk_ans_format_reward": 1.0, "step": 2891, "think_completion_length": 8.583333333333332 }, { "clip_ratio": 0.0, "completion_length": 162.20833587646484, "epoch": 9.768971332209107, "grad_norm": 17.3936906485533, "kl": 0.5234375, "learning_rate": 1.858108108108108e-07, "loss": 0.0005, "reward": 3.6338586807250977, "reward_std": 0.09483909234404564, "rewards/final_reward": 1.4055704259070454, "rewards/mask_iou_reward": 0.7027852129535227, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6338586807250977, "rewards/thk_ans_format_reward": 1.0, "step": 2892, "think_completion_length": 9.625 }, { "clip_ratio": 0.0, "completion_length": 132.25000762939453, "epoch": 9.772344013490725, "grad_norm": 53.145981829869896, "kl": 0.560546875, "learning_rate": 1.8552927927927928e-07, "loss": 0.0006, "reward": 3.6119606494903564, "reward_std": 0.09925832878798246, "rewards/final_reward": 1.2578269223932477, "rewards/mask_iou_reward": 0.6289134611966238, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.611960530281067, "rewards/thk_ans_format_reward": 1.0, "step": 2893, "think_completion_length": 8.416666666666666 }, { "clip_ratio": 0.0, "completion_length": 154.75000762939453, "epoch": 9.775716694772344, "grad_norm": 22.358863679413318, "kl": 0.6171875, "learning_rate": 1.8524774774774774e-07, "loss": 0.0006, "reward": 3.8267993927001953, "reward_std": 0.020386284217238426, "rewards/final_reward": 1.9695081912671557, "rewards/mask_iou_reward": 0.9847540956335779, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.8267992734909058, "rewards/thk_ans_format_reward": 1.0, "step": 2894, "think_completion_length": 12.541666666666668 }, { "clip_ratio": 0.0, "completion_length": 224.75000762939453, "epoch": 9.779089376053962, "grad_norm": 10.042252567866315, "kl": 0.431640625, "learning_rate": 1.8496621621621623e-07, "loss": 0.0005, "reward": 3.6202865839004517, "reward_std": 0.02719450183212757, "rewards/final_reward": 1.4129175307550255, "rewards/mask_iou_reward": 0.7064587653775127, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6202865242958069, "rewards/thk_ans_format_reward": 1.0, "step": 2895, "think_completion_length": 8.416666666666668 }, { "clip_ratio": 0.0, "completion_length": 178.4791717529297, "epoch": 9.782462057335582, "grad_norm": 8.299297282864869, "kl": 0.66015625, "learning_rate": 1.8468468468468466e-07, "loss": 0.0007, "reward": 3.6827521324157715, "reward_std": 0.04479054640978575, "rewards/final_reward": 1.7186386513465273, "rewards/mask_iou_reward": 0.8593193256732636, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6827520728111267, "rewards/thk_ans_format_reward": 1.0, "step": 2896, "think_completion_length": 8.5 }, { "clip_ratio": 0.0, "completion_length": 120.45833587646484, "epoch": 9.7858347386172, "grad_norm": 11.577478643284216, "kl": 0.623046875, "learning_rate": 1.8440315315315313e-07, "loss": 0.0006, "reward": 3.799902319908142, "reward_std": 0.042380135506391525, "rewards/final_reward": 1.901752685896824, "rewards/mask_iou_reward": 0.950876342948412, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7999022603034973, "rewards/thk_ans_format_reward": 1.0, "step": 2897, "think_completion_length": 6.916666666666667 }, { "clip_ratio": 0.0, "completion_length": 203.625, "epoch": 9.789207419898819, "grad_norm": 19.355628986540445, "kl": 0.5244140625, "learning_rate": 1.841216216216216e-07, "loss": 0.0005, "reward": 3.282261848449707, "reward_std": 0.09237036108970642, "rewards/final_reward": 1.6548112822568366, "rewards/mask_iou_reward": 0.8274056411284183, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.282261848449707, "rewards/thk_ans_format_reward": 1.0, "step": 2898, "think_completion_length": 7.625 }, { "clip_ratio": 0.0, "completion_length": 212.14583587646484, "epoch": 9.79258010118044, "grad_norm": 22.404219362540285, "kl": 0.4814453125, "learning_rate": 1.8384009009009007e-07, "loss": 0.0005, "reward": 3.1129021644592285, "reward_std": 0.04305828921496868, "rewards/final_reward": 0.548002698119196, "rewards/mask_iou_reward": 0.274001349059598, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1129021644592285, "rewards/thk_ans_format_reward": 1.0, "step": 2899, "think_completion_length": 10.166666666666668 }, { "clip_ratio": 0.0, "completion_length": 161.8854217529297, "epoch": 9.795952782462058, "grad_norm": 12.759625247955837, "kl": 0.5048828125, "learning_rate": 1.8355855855855856e-07, "loss": 0.0005, "reward": 3.7565758228302, "reward_std": 0.08746011555194855, "rewards/final_reward": 1.6317371237065799, "rewards/mask_iou_reward": 0.8158685618532899, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7565756440162659, "rewards/thk_ans_format_reward": 1.0, "step": 2900, "think_completion_length": 10.375 }, { "clip_ratio": 0.0, "completion_length": 169.1875, "epoch": 9.799325463743676, "grad_norm": 22.181261915354742, "kl": 0.44140625, "learning_rate": 1.8327702702702702e-07, "loss": 0.0004, "reward": 3.4462321996688843, "reward_std": 0.045475758612155914, "rewards/final_reward": 1.6241891651138398, "rewards/mask_iou_reward": 0.8120945825569199, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4462321400642395, "rewards/thk_ans_format_reward": 1.0, "step": 2901, "think_completion_length": 7.041666666666667 }, { "clip_ratio": 0.0, "completion_length": 181.90625, "epoch": 9.802698145025294, "grad_norm": 19.883643169696228, "kl": 0.4521484375, "learning_rate": 1.8299549549549548e-07, "loss": 0.0005, "reward": 3.6311463117599487, "reward_std": 0.02114281803369522, "rewards/final_reward": 1.7492453606934104, "rewards/mask_iou_reward": 0.8746226803467052, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6311463713645935, "rewards/thk_ans_format_reward": 1.0, "step": 2902, "think_completion_length": 9.0 }, { "clip_ratio": 0.0, "completion_length": 191.5, "epoch": 9.806070826306915, "grad_norm": 12.52322065751209, "kl": 0.458984375, "learning_rate": 1.8271396396396397e-07, "loss": 0.0005, "reward": 3.7113521099090576, "reward_std": 0.12051836773753166, "rewards/final_reward": 1.8746252562926338, "rewards/mask_iou_reward": 0.9373126281463169, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7113521099090576, "rewards/thk_ans_format_reward": 1.0, "step": 2903, "think_completion_length": 9.25 }, { "clip_ratio": 0.0, "completion_length": 130.96875762939453, "epoch": 9.809443507588533, "grad_norm": 27.936075958457913, "kl": 0.494140625, "learning_rate": 1.8243243243243243e-07, "loss": 0.0005, "reward": 3.5085314512252808, "reward_std": 0.03908315673470497, "rewards/final_reward": 1.9092109378444695, "rewards/mask_iou_reward": 0.9546054689222347, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5085315108299255, "rewards/thk_ans_format_reward": 1.0, "step": 2904, "think_completion_length": 8.25 }, { "clip_ratio": 0.0, "completion_length": 179.68750762939453, "epoch": 9.812816188870151, "grad_norm": 20.574499633313156, "kl": 0.623046875, "learning_rate": 1.8215090090090092e-07, "loss": 0.0008, "reward": 3.7213168144226074, "reward_std": 0.09398959390819073, "rewards/final_reward": 1.4996546278202438, "rewards/mask_iou_reward": 0.7498273139101219, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7213165760040283, "rewards/thk_ans_format_reward": 1.0, "step": 2905, "think_completion_length": 11.75 }, { "clip_ratio": 0.0, "completion_length": 194.25000762939453, "epoch": 9.816188870151771, "grad_norm": 14.292492911019973, "kl": 0.5546875, "learning_rate": 1.8186936936936935e-07, "loss": 0.0005, "reward": 3.7173796892166138, "reward_std": 0.06229967065155506, "rewards/final_reward": 1.7996702310086092, "rewards/mask_iou_reward": 0.8998351155043046, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7173798084259033, "rewards/thk_ans_format_reward": 1.0, "step": 2906, "think_completion_length": 8.166666666666668 }, { "clip_ratio": 0.0, "completion_length": 170.4375, "epoch": 9.81956155143339, "grad_norm": 8.865503077281648, "kl": 0.4423828125, "learning_rate": 1.815878378378378e-07, "loss": 0.0005, "reward": 3.446324944496155, "reward_std": 0.04598255269229412, "rewards/final_reward": 1.935048211500431, "rewards/mask_iou_reward": 0.9675241057502155, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.44632488489151, "rewards/thk_ans_format_reward": 1.0, "step": 2907, "think_completion_length": 8.791666666666666 }, { "clip_ratio": 0.0, "completion_length": 176.92709350585938, "epoch": 9.822934232715008, "grad_norm": 11.956988910613449, "kl": 0.470703125, "learning_rate": 1.813063063063063e-07, "loss": 0.0005, "reward": 3.64016330242157, "reward_std": 0.026767144328914583, "rewards/final_reward": 1.8412667576827553, "rewards/mask_iou_reward": 0.9206333788413776, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.640163242816925, "rewards/thk_ans_format_reward": 1.0, "step": 2908, "think_completion_length": 8.041666666666666 }, { "clip_ratio": 0.0, "completion_length": 110.36458587646484, "epoch": 9.826306913996627, "grad_norm": 35.14376581025715, "kl": 0.630859375, "learning_rate": 1.8102477477477476e-07, "loss": 0.0006, "reward": 3.6144161224365234, "reward_std": 0.1693898644298315, "rewards/final_reward": 1.640366187628075, "rewards/mask_iou_reward": 0.8201830938140375, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 1.6248327493667603, "rewards/thk_ans_format_reward": 1.0, "step": 2909, "think_completion_length": 8.916666666666668 }, { "clip_ratio": 0.0, "completion_length": 234.56251525878906, "epoch": 9.829679595278247, "grad_norm": 10.483673875517827, "kl": 0.5126953125, "learning_rate": 1.8074324324324325e-07, "loss": 0.0005, "reward": 3.6567115783691406, "reward_std": 0.03177372459322214, "rewards/final_reward": 1.540711419191794, "rewards/mask_iou_reward": 0.770355709595897, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.656711459159851, "rewards/thk_ans_format_reward": 1.0, "step": 2910, "think_completion_length": 8.791666666666668 }, { "clip_ratio": 0.0, "completion_length": 192.1979217529297, "epoch": 9.833052276559865, "grad_norm": 12.243313598862233, "kl": 0.482421875, "learning_rate": 1.804617117117117e-07, "loss": 0.0005, "reward": 3.477978467941284, "reward_std": 0.054888444021344185, "rewards/final_reward": 1.393397765441062, "rewards/mask_iou_reward": 0.696698882720531, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4779785871505737, "rewards/thk_ans_format_reward": 1.0, "step": 2911, "think_completion_length": 10.208333333333334 }, { "clip_ratio": 0.0, "completion_length": 199.83333587646484, "epoch": 9.836424957841484, "grad_norm": 32.98155743117736, "kl": 0.44921875, "learning_rate": 1.8018018018018017e-07, "loss": 0.0004, "reward": 3.407157063484192, "reward_std": 0.038790177553892136, "rewards/final_reward": 1.738950939104507, "rewards/mask_iou_reward": 0.8694754695522535, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4071571826934814, "rewards/thk_ans_format_reward": 1.0, "step": 2912, "think_completion_length": 7.958333333333334 }, { "clip_ratio": 0.0, "completion_length": 180.625, "epoch": 9.839797639123104, "grad_norm": 40.04921195295062, "kl": 0.408203125, "learning_rate": 1.7989864864864865e-07, "loss": 0.0004, "reward": 3.4153835773468018, "reward_std": 0.022413354832679033, "rewards/final_reward": 1.0282049835428524, "rewards/mask_iou_reward": 0.5141024917714262, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4153834581375122, "rewards/thk_ans_format_reward": 1.0, "step": 2913, "think_completion_length": 10.416666666666668 }, { "clip_ratio": 0.0, "completion_length": 159.42708587646484, "epoch": 9.843170320404722, "grad_norm": 10.017276834975831, "kl": 0.599609375, "learning_rate": 1.7961711711711712e-07, "loss": 0.0007, "reward": 3.7833425998687744, "reward_std": 0.03610672801733017, "rewards/final_reward": 1.9598054072300441, "rewards/mask_iou_reward": 0.9799027036150221, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7833424806594849, "rewards/thk_ans_format_reward": 1.0, "step": 2914, "think_completion_length": 7.666666666666667 }, { "clip_ratio": 0.0, "completion_length": 175.95833587646484, "epoch": 9.84654300168634, "grad_norm": 46.39224566897612, "kl": 0.564453125, "learning_rate": 1.7933558558558558e-07, "loss": 0.0006, "reward": 3.693509101867676, "reward_std": 0.023674975149333477, "rewards/final_reward": 1.8687781753558061, "rewards/mask_iou_reward": 0.9343890876779031, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6935092210769653, "rewards/thk_ans_format_reward": 1.0, "step": 2915, "think_completion_length": 8.0 }, { "clip_ratio": 0.0, "completion_length": 203.20833587646484, "epoch": 9.849915682967959, "grad_norm": 33.12544783333367, "kl": 0.4609375, "learning_rate": 1.7905405405405404e-07, "loss": 0.0005, "reward": 3.379317879676819, "reward_std": 0.07394935376942158, "rewards/final_reward": 0.4523041336614774, "rewards/mask_iou_reward": 0.2261520668307387, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.379317820072174, "rewards/thk_ans_format_reward": 1.0, "step": 2916, "think_completion_length": 8.083333333333334 }, { "clip_ratio": 0.0, "completion_length": 133.8541717529297, "epoch": 9.853288364249579, "grad_norm": 5.957307232631729, "kl": 0.50390625, "learning_rate": 1.787725225225225e-07, "loss": 0.0005, "reward": 3.7431968450546265, "reward_std": 0.021535064559429884, "rewards/final_reward": 1.7233288962292024, "rewards/mask_iou_reward": 0.8616644481146012, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7431967854499817, "rewards/thk_ans_format_reward": 1.0, "step": 2917, "think_completion_length": 8.5 }, { "clip_ratio": 0.0, "completion_length": 171.33333587646484, "epoch": 9.856661045531197, "grad_norm": 8.247865879330005, "kl": 0.548828125, "learning_rate": 1.7849099099099098e-07, "loss": 0.0006, "reward": 3.375833749771118, "reward_std": 0.03770332410931587, "rewards/final_reward": 1.6320474866468297, "rewards/mask_iou_reward": 0.8160237433234149, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.375833809375763, "rewards/thk_ans_format_reward": 1.0, "step": 2918, "think_completion_length": 9.0 }, { "clip_ratio": 0.0, "completion_length": 173.96875762939453, "epoch": 9.860033726812816, "grad_norm": 8.409200834188237, "kl": 0.4375, "learning_rate": 1.7820945945945945e-07, "loss": 0.0004, "reward": 3.7715072631835938, "reward_std": 0.02342725871130824, "rewards/final_reward": 1.5038488635778524, "rewards/mask_iou_reward": 0.7519244317889262, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7715071439743042, "rewards/thk_ans_format_reward": 1.0, "step": 2919, "think_completion_length": 7.666666666666666 }, { "clip_ratio": 0.0, "completion_length": 136.28125762939453, "epoch": 9.863406408094434, "grad_norm": 27.130241868912144, "kl": 0.5166015625, "learning_rate": 1.779279279279279e-07, "loss": 0.0005, "reward": 3.368473529815674, "reward_std": 0.07650148123502731, "rewards/final_reward": 1.604985981721935, "rewards/mask_iou_reward": 0.8024929908609675, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.368473470211029, "rewards/thk_ans_format_reward": 1.0, "step": 2920, "think_completion_length": 8.416666666666668 }, { "clip_ratio": 0.0, "completion_length": 161.73959350585938, "epoch": 9.866779089376054, "grad_norm": 12.110615569131403, "kl": 0.984375, "learning_rate": 1.776463963963964e-07, "loss": 0.001, "reward": 3.820041060447693, "reward_std": 0.02060036826878786, "rewards/final_reward": 1.7444984203992941, "rewards/mask_iou_reward": 0.8722492101996471, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.8200412392616272, "rewards/thk_ans_format_reward": 1.0, "step": 2921, "think_completion_length": 8.166666666666668 }, { "clip_ratio": 0.0, "completion_length": 163.6875, "epoch": 9.870151770657673, "grad_norm": 10.618169954242731, "kl": 0.484375, "learning_rate": 1.7736486486486485e-07, "loss": 0.0005, "reward": 3.7311623096466064, "reward_std": 0.0554316071793437, "rewards/final_reward": 1.8880639926998695, "rewards/mask_iou_reward": 0.9440319963499347, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.731162190437317, "rewards/thk_ans_format_reward": 1.0, "step": 2922, "think_completion_length": 9.25 }, { "clip_ratio": 0.0, "completion_length": 191.43750762939453, "epoch": 9.873524451939291, "grad_norm": 10.306036874313062, "kl": 0.45703125, "learning_rate": 1.7708333333333334e-07, "loss": 0.0005, "reward": 3.725971221923828, "reward_std": 0.049082960933446884, "rewards/final_reward": 1.800602803022311, "rewards/mask_iou_reward": 0.9003014015111555, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7259709239006042, "rewards/thk_ans_format_reward": 1.0, "step": 2923, "think_completion_length": 6.708333333333334 }, { "clip_ratio": 0.0, "completion_length": 149.9479217529297, "epoch": 9.876897133220911, "grad_norm": 11.546465496663332, "kl": 0.4404296875, "learning_rate": 1.768018018018018e-07, "loss": 0.0005, "reward": 3.609371066093445, "reward_std": 0.03989543952047825, "rewards/final_reward": 1.694492264856553, "rewards/mask_iou_reward": 0.8472461324282765, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6093710660934448, "rewards/thk_ans_format_reward": 1.0, "step": 2924, "think_completion_length": 8.958333333333332 }, { "clip_ratio": 0.0, "completion_length": 129.8229217529297, "epoch": 9.88026981450253, "grad_norm": 29.196481852946373, "kl": 0.4521484375, "learning_rate": 1.7652027027027026e-07, "loss": 0.0005, "reward": 3.4165241718292236, "reward_std": 0.04025060310959816, "rewards/final_reward": 1.485056108184279, "rewards/mask_iou_reward": 0.7425280540921395, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4165241718292236, "rewards/thk_ans_format_reward": 1.0, "step": 2925, "think_completion_length": 8.125 }, { "clip_ratio": 0.0, "completion_length": 190.37500762939453, "epoch": 9.883642495784148, "grad_norm": 11.587141178290794, "kl": 0.4248046875, "learning_rate": 1.7623873873873872e-07, "loss": 0.0004, "reward": 3.633195996284485, "reward_std": 0.05130454897880554, "rewards/final_reward": 1.6711122317909908, "rewards/mask_iou_reward": 0.8355561158954954, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6331957578659058, "rewards/thk_ans_format_reward": 1.0, "step": 2926, "think_completion_length": 8.208333333333332 }, { "clip_ratio": 0.0, "completion_length": 149.08333587646484, "epoch": 9.887015177065766, "grad_norm": 13.206269452632558, "kl": 0.5595703125, "learning_rate": 1.7595720720720718e-07, "loss": 0.0006, "reward": 3.688322424888611, "reward_std": 0.03490753611549735, "rewards/final_reward": 1.8829596973409148, "rewards/mask_iou_reward": 0.9414798486704574, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6883226037025452, "rewards/thk_ans_format_reward": 1.0, "step": 2927, "think_completion_length": 11.791666666666668 }, { "clip_ratio": 0.0, "completion_length": 157.20833587646484, "epoch": 9.890387858347387, "grad_norm": 15.933006268489812, "kl": 0.509765625, "learning_rate": 1.7567567567567567e-07, "loss": 0.0006, "reward": 3.5859495401382446, "reward_std": 0.09474263805896044, "rewards/final_reward": 1.5611196697849357, "rewards/mask_iou_reward": 0.7805598348924678, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5859493613243103, "rewards/thk_ans_format_reward": 1.0, "step": 2928, "think_completion_length": 8.0 }, { "clip_ratio": 0.0, "completion_length": 182.3541717529297, "epoch": 9.893760539629005, "grad_norm": 20.903893582274613, "kl": 0.498046875, "learning_rate": 1.7539414414414413e-07, "loss": 0.0005, "reward": 3.428972840309143, "reward_std": 0.10967486724257469, "rewards/final_reward": 1.555474882501772, "rewards/mask_iou_reward": 0.777737441250886, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4289729595184326, "rewards/thk_ans_format_reward": 1.0, "step": 2929, "think_completion_length": 7.458333333333334 }, { "clip_ratio": 0.0, "completion_length": 169.93750762939453, "epoch": 9.897133220910623, "grad_norm": 17.340904195009937, "kl": 0.421875, "learning_rate": 1.751126126126126e-07, "loss": 0.0005, "reward": 3.2104990482330322, "reward_std": 0.03169256402179599, "rewards/final_reward": 0.706509706649572, "rewards/mask_iou_reward": 0.353254853324786, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2104989886283875, "rewards/thk_ans_format_reward": 1.0, "step": 2930, "think_completion_length": 7.791666666666667 }, { "clip_ratio": 0.0, "completion_length": 147.9166717529297, "epoch": 9.900505902192243, "grad_norm": 9.984017440437876, "kl": 0.533203125, "learning_rate": 1.7483108108108108e-07, "loss": 0.0005, "reward": 3.6238770484924316, "reward_std": 0.04825960611924529, "rewards/final_reward": 1.497253118569068, "rewards/mask_iou_reward": 0.748626559284534, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.623877227306366, "rewards/thk_ans_format_reward": 1.0, "step": 2931, "think_completion_length": 8.583333333333334 }, { "clip_ratio": 0.0, "completion_length": 190.27083587646484, "epoch": 9.903878583473862, "grad_norm": 6.951641531264174, "kl": 0.5078125, "learning_rate": 1.7454954954954954e-07, "loss": 0.0005, "reward": 3.6145602464675903, "reward_std": 0.09818883240222931, "rewards/final_reward": 1.7906074611252016, "rewards/mask_iou_reward": 0.8953037305626008, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6145601868629456, "rewards/thk_ans_format_reward": 1.0, "step": 2932, "think_completion_length": 8.541666666666668 }, { "clip_ratio": 0.0, "completion_length": 148.37500762939453, "epoch": 9.90725126475548, "grad_norm": 76.02270592629036, "kl": 0.5341796875, "learning_rate": 1.7426801801801803e-07, "loss": 0.0005, "reward": 3.306624174118042, "reward_std": 0.14644039422273636, "rewards/final_reward": 0.8189659901344464, "rewards/mask_iou_reward": 0.4094829950672232, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3066240549087524, "rewards/thk_ans_format_reward": 1.0, "step": 2933, "think_completion_length": 8.625 }, { "clip_ratio": 0.0, "completion_length": 195.55208587646484, "epoch": 9.910623946037099, "grad_norm": 8.56338879773896, "kl": 0.4306640625, "learning_rate": 1.739864864864865e-07, "loss": 0.0004, "reward": 3.6486175060272217, "reward_std": 0.026789831928908825, "rewards/final_reward": 1.8038686547675227, "rewards/mask_iou_reward": 0.9019343273837613, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6486175060272217, "rewards/thk_ans_format_reward": 1.0, "step": 2934, "think_completion_length": 8.625 }, { "clip_ratio": 0.0, "completion_length": 173.59375762939453, "epoch": 9.913996627318719, "grad_norm": 11.418503877009623, "kl": 0.400390625, "learning_rate": 1.7370495495495495e-07, "loss": 0.0004, "reward": 3.7190651893615723, "reward_std": 0.13147340714931488, "rewards/final_reward": 1.2865869158245113, "rewards/mask_iou_reward": 0.6432934579122557, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7190650701522827, "rewards/thk_ans_format_reward": 1.0, "step": 2935, "think_completion_length": 7.875 }, { "clip_ratio": 0.0, "completion_length": 209.71875, "epoch": 9.917369308600337, "grad_norm": 9.971262476586999, "kl": 0.45703125, "learning_rate": 1.734234234234234e-07, "loss": 0.0004, "reward": 3.8314409255981445, "reward_std": 0.03323422558605671, "rewards/final_reward": 1.8260853344315668, "rewards/mask_iou_reward": 0.9130426672157834, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.8314408659934998, "rewards/thk_ans_format_reward": 1.0, "step": 2936, "think_completion_length": 7.75 }, { "clip_ratio": 0.0, "completion_length": 160.18750381469727, "epoch": 9.920741989881956, "grad_norm": 36.2502426585584, "kl": 0.498046875, "learning_rate": 1.7314189189189187e-07, "loss": 0.0005, "reward": 3.436210036277771, "reward_std": 0.152422234416008, "rewards/final_reward": 1.5981619230412196, "rewards/mask_iou_reward": 0.7990809615206098, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 1.4466266632080078, "rewards/thk_ans_format_reward": 1.0, "step": 2937, "think_completion_length": 8.958333333333334 }, { "clip_ratio": 0.0, "completion_length": 147.5729217529297, "epoch": 9.924114671163576, "grad_norm": 11.851306195065153, "kl": 0.498046875, "learning_rate": 1.7286036036036036e-07, "loss": 0.0005, "reward": 3.2320234775543213, "reward_std": 0.08792375959455967, "rewards/final_reward": 0.45280834392977154, "rewards/mask_iou_reward": 0.22640417196488577, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2320235967636108, "rewards/thk_ans_format_reward": 1.0, "step": 2938, "think_completion_length": 8.75 }, { "clip_ratio": 0.0, "completion_length": 150.84375762939453, "epoch": 9.927487352445194, "grad_norm": 15.982465512064199, "kl": 0.453125, "learning_rate": 1.7257882882882882e-07, "loss": 0.0005, "reward": 3.6887292861938477, "reward_std": 0.017841395922005177, "rewards/final_reward": 1.4235046997267258, "rewards/mask_iou_reward": 0.7117523498633629, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6887291073799133, "rewards/thk_ans_format_reward": 1.0, "step": 2939, "think_completion_length": 10.541666666666668 }, { "clip_ratio": 0.0, "completion_length": 156.0104217529297, "epoch": 9.930860033726812, "grad_norm": 14.291848001764228, "kl": 0.498046875, "learning_rate": 1.7229729729729728e-07, "loss": 0.0005, "reward": 3.6745972633361816, "reward_std": 0.06178950145840645, "rewards/final_reward": 1.9635444240660527, "rewards/mask_iou_reward": 0.9817722120330263, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6745970845222473, "rewards/thk_ans_format_reward": 1.0, "step": 2940, "think_completion_length": 9.416666666666668 }, { "clip_ratio": 0.0, "completion_length": 185.2604217529297, "epoch": 9.93423271500843, "grad_norm": 7.356592155605947, "kl": 0.453125, "learning_rate": 1.7201576576576577e-07, "loss": 0.0005, "reward": 3.3794697523117065, "reward_std": 0.09687015041708946, "rewards/final_reward": 1.695792669363521, "rewards/mask_iou_reward": 0.8478963346817605, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3794699907302856, "rewards/thk_ans_format_reward": 1.0, "step": 2941, "think_completion_length": 7.916666666666667 }, { "clip_ratio": 0.0, "completion_length": 284.0416717529297, "epoch": 9.937605396290051, "grad_norm": 11.633500137625838, "kl": 0.3662109375, "learning_rate": 1.7173423423423423e-07, "loss": 0.0004, "reward": 3.1571407318115234, "reward_std": 0.03151166997849941, "rewards/final_reward": 0.9415822823880424, "rewards/mask_iou_reward": 0.4707911411940212, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.157140702009201, "rewards/thk_ans_format_reward": 1.0, "step": 2942, "think_completion_length": 8.666666666666668 }, { "clip_ratio": 0.0, "completion_length": 157.42708587646484, "epoch": 9.94097807757167, "grad_norm": 16.193175894939944, "kl": 0.61328125, "learning_rate": 1.714527027027027e-07, "loss": 0.0006, "reward": 3.3838253021240234, "reward_std": 0.1029960885643959, "rewards/final_reward": 1.0003497415716365, "rewards/mask_iou_reward": 0.5001748707858182, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3838250637054443, "rewards/thk_ans_format_reward": 1.0, "step": 2943, "think_completion_length": 11.125 }, { "clip_ratio": 0.0, "completion_length": 140.8958396911621, "epoch": 9.944350758853288, "grad_norm": 10.13797614381608, "kl": 0.505859375, "learning_rate": 1.7117117117117117e-07, "loss": 0.0005, "reward": 3.6523609161376953, "reward_std": 0.044180636294186115, "rewards/final_reward": 1.8727462066479181, "rewards/mask_iou_reward": 0.9363731033239591, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.652361273765564, "rewards/thk_ans_format_reward": 1.0, "step": 2944, "think_completion_length": 9.208333333333332 }, { "clip_ratio": 0.0, "completion_length": 159.67709350585938, "epoch": 9.947723440134908, "grad_norm": 11.285243604780446, "kl": 0.51171875, "learning_rate": 1.7088963963963963e-07, "loss": 0.0005, "reward": 3.1645601987838745, "reward_std": 0.09775098785758018, "rewards/final_reward": 1.036894984210309, "rewards/mask_iou_reward": 0.5184474921051545, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.164560079574585, "rewards/thk_ans_format_reward": 1.0, "step": 2945, "think_completion_length": 8.708333333333332 }, { "clip_ratio": 0.0, "completion_length": 181.43750762939453, "epoch": 9.951096121416526, "grad_norm": 10.555231941128286, "kl": 0.435546875, "learning_rate": 1.706081081081081e-07, "loss": 0.0004, "reward": 3.799447178840637, "reward_std": 0.022393792401999235, "rewards/final_reward": 1.7339816399490353, "rewards/mask_iou_reward": 0.8669908199745177, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7994474172592163, "rewards/thk_ans_format_reward": 1.0, "step": 2946, "think_completion_length": 9.125 }, { "clip_ratio": 0.0, "completion_length": 242.68750762939453, "epoch": 9.954468802698145, "grad_norm": 8.845005544933374, "kl": 0.4521484375, "learning_rate": 1.7032657657657656e-07, "loss": 0.0005, "reward": 3.6235469579696655, "reward_std": 0.055221643298864365, "rewards/final_reward": 1.281981455757465, "rewards/mask_iou_reward": 0.6409907278787325, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6235471963882446, "rewards/thk_ans_format_reward": 1.0, "step": 2947, "think_completion_length": 9.083333333333334 }, { "clip_ratio": 0.0, "completion_length": 153.89584350585938, "epoch": 9.957841483979763, "grad_norm": 11.47169436446601, "kl": 0.5224609375, "learning_rate": 1.7004504504504504e-07, "loss": 0.0005, "reward": 3.5237936973571777, "reward_std": 0.07607119157910347, "rewards/final_reward": 1.8342230228232848, "rewards/mask_iou_reward": 0.9171115114116424, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5237935781478882, "rewards/thk_ans_format_reward": 1.0, "step": 2948, "think_completion_length": 8.625 }, { "clip_ratio": 0.0, "completion_length": 211.1875, "epoch": 9.961214165261383, "grad_norm": 10.261729370598339, "kl": 0.43359375, "learning_rate": 1.697635135135135e-07, "loss": 0.0004, "reward": 3.6379356384277344, "reward_std": 0.03940104506909847, "rewards/final_reward": 1.620287095778117, "rewards/mask_iou_reward": 0.8101435478890585, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.637935757637024, "rewards/thk_ans_format_reward": 1.0, "step": 2949, "think_completion_length": 8.833333333333332 }, { "clip_ratio": 0.0, "completion_length": 169.5416717529297, "epoch": 9.964586846543002, "grad_norm": 17.48974843978608, "kl": 0.97265625, "learning_rate": 1.6948198198198196e-07, "loss": 0.001, "reward": 3.606302499771118, "reward_std": 0.04735631635412574, "rewards/final_reward": 1.796730760126513, "rewards/mask_iou_reward": 0.8983653800632565, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6063026189804077, "rewards/thk_ans_format_reward": 1.0, "step": 2950, "think_completion_length": 9.458333333333332 }, { "clip_ratio": 0.0, "completion_length": 243.3854217529297, "epoch": 9.96795952782462, "grad_norm": 10.82864820514153, "kl": 0.4345703125, "learning_rate": 1.6920045045045045e-07, "loss": 0.0004, "reward": 3.7164015769958496, "reward_std": 0.11439871042966843, "rewards/final_reward": 1.6110125597812006, "rewards/mask_iou_reward": 0.8055062798906003, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.71640145778656, "rewards/thk_ans_format_reward": 1.0, "step": 2951, "think_completion_length": 7.791666666666666 }, { "clip_ratio": 0.0, "completion_length": 168.20833587646484, "epoch": 9.97133220910624, "grad_norm": 110.90492820482815, "kl": 0.470703125, "learning_rate": 1.689189189189189e-07, "loss": 0.0005, "reward": 3.732366681098938, "reward_std": 0.0235447958111763, "rewards/final_reward": 1.9120111381690408, "rewards/mask_iou_reward": 0.9560055690845204, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7323666214942932, "rewards/thk_ans_format_reward": 1.0, "step": 2952, "think_completion_length": 7.583333333333334 }, { "clip_ratio": 0.0, "completion_length": 279.6458435058594, "epoch": 9.974704890387859, "grad_norm": 9.008389810735736, "kl": 1.67578125, "learning_rate": 1.686373873873874e-07, "loss": 0.0017, "reward": 3.518873453140259, "reward_std": 0.03557255119085312, "rewards/final_reward": 1.5394322548166186, "rewards/mask_iou_reward": 0.7697161274083093, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.518873393535614, "rewards/thk_ans_format_reward": 1.0, "step": 2953, "think_completion_length": 7.291666666666666 }, { "clip_ratio": 0.0, "completion_length": 229.45834350585938, "epoch": 9.978077571669477, "grad_norm": 14.853078093798148, "kl": 0.4912109375, "learning_rate": 1.6835585585585586e-07, "loss": 0.0005, "reward": 3.7388641834259033, "reward_std": 0.048480624333024025, "rewards/final_reward": 1.8573203732843733, "rewards/mask_iou_reward": 0.9286601866421866, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7388643026351929, "rewards/thk_ans_format_reward": 1.0, "step": 2954, "think_completion_length": 7.541666666666666 }, { "clip_ratio": 0.0, "completion_length": 185.39583587646484, "epoch": 9.981450252951095, "grad_norm": 22.784244760850683, "kl": 0.857421875, "learning_rate": 1.680743243243243e-07, "loss": 0.0009, "reward": 3.455596923828125, "reward_std": 0.05335235968232155, "rewards/final_reward": 1.6826988393083862, "rewards/mask_iou_reward": 0.8413494196541931, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4555969834327698, "rewards/thk_ans_format_reward": 1.0, "step": 2955, "think_completion_length": 7.791666666666666 }, { "clip_ratio": 0.0, "completion_length": 192.03125, "epoch": 9.984822934232715, "grad_norm": 22.288546418231373, "kl": 0.4384765625, "learning_rate": 1.6779279279279278e-07, "loss": 0.0004, "reward": 3.4251439571380615, "reward_std": 0.028437476605176926, "rewards/final_reward": 1.1720492580818076, "rewards/mask_iou_reward": 0.5860246290409038, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4251437187194824, "rewards/thk_ans_format_reward": 1.0, "step": 2956, "think_completion_length": 8.125 }, { "clip_ratio": 0.0, "completion_length": 183.03125, "epoch": 9.988195615514334, "grad_norm": 16.51769126088518, "kl": 0.60546875, "learning_rate": 1.6751126126126124e-07, "loss": 0.0006, "reward": 3.3870999813079834, "reward_std": 0.05538425035774708, "rewards/final_reward": 1.507213691171632, "rewards/mask_iou_reward": 0.753606845585816, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3870996832847595, "rewards/thk_ans_format_reward": 1.0, "step": 2957, "think_completion_length": 8.791666666666668 }, { "clip_ratio": 0.0, "completion_length": 193.875, "epoch": 9.991568296795952, "grad_norm": 7.731847744255573, "kl": 0.517578125, "learning_rate": 1.6722972972972973e-07, "loss": 0.0005, "reward": 3.5086575746536255, "reward_std": 0.038238752633333206, "rewards/final_reward": 1.5398872895969526, "rewards/mask_iou_reward": 0.7699436447984763, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5086576342582703, "rewards/thk_ans_format_reward": 1.0, "step": 2958, "think_completion_length": 8.25 }, { "clip_ratio": 0.0, "completion_length": 209.33333587646484, "epoch": 9.994940978077572, "grad_norm": 31.29226900190095, "kl": 0.40234375, "learning_rate": 1.669481981981982e-07, "loss": 0.0004, "reward": 3.445936441421509, "reward_std": 0.21446402929723263, "rewards/final_reward": 1.4543522509606437, "rewards/mask_iou_reward": 0.7271761254803218, "rewards/sam_format_reward": 0.96875, "rewards/sam_reward_func_ultra": 1.4771862030029297, "rewards/thk_ans_format_reward": 1.0, "step": 2959, "think_completion_length": 8.25 }, { "clip_ratio": 0.0, "completion_length": 178.78948211669922, "epoch": 9.99831365935919, "grad_norm": 13.639265248226923, "kl": 0.5166015625, "learning_rate": 1.6666666666666665e-07, "loss": 0.0005, "reward": 3.4749454259872437, "reward_std": 0.021927848923951387, "rewards/final_reward": 1.6298833637258023, "rewards/mask_iou_reward": 0.8149416818629012, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4749454855918884, "rewards/thk_ans_format_reward": 1.0, "step": 2960, "think_completion_length": 8.125 }, { "clip_ratio": 0.0, "completion_length": 175.18750762939453, "epoch": 10.003372681281618, "grad_norm": 11.074702040879929, "kl": 0.517578125, "learning_rate": 1.6638513513513514e-07, "loss": 0.0005, "reward": 3.554487109184265, "reward_std": 0.057201748713850975, "rewards/final_reward": 1.7276102054288849, "rewards/mask_iou_reward": 0.8638051027144424, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.55448716878891, "rewards/thk_ans_format_reward": 1.0, "step": 2961, "think_completion_length": 8.083333333333334 }, { "clip_ratio": 0.0, "completion_length": 136.3958396911621, "epoch": 10.006745362563239, "grad_norm": 17.21456270645261, "kl": 0.5576171875, "learning_rate": 1.661036036036036e-07, "loss": 0.0005, "reward": 3.784704804420471, "reward_std": 0.0637154346331954, "rewards/final_reward": 1.7564609223970806, "rewards/mask_iou_reward": 0.8782304611985403, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.784704566001892, "rewards/thk_ans_format_reward": 1.0, "step": 2962, "think_completion_length": 9.125 }, { "clip_ratio": 0.0, "completion_length": 140.1354217529297, "epoch": 10.010118043844857, "grad_norm": 10.966578836490807, "kl": 0.4501953125, "learning_rate": 1.6582207207207209e-07, "loss": 0.0005, "reward": 3.706210970878601, "reward_std": 0.034264068119227886, "rewards/final_reward": 1.6498285462680125, "rewards/mask_iou_reward": 0.8249142731340062, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.706210970878601, "rewards/thk_ans_format_reward": 1.0, "step": 2963, "think_completion_length": 8.583333333333332 }, { "clip_ratio": 0.0, "completion_length": 236.33333587646484, "epoch": 10.013490725126475, "grad_norm": 14.586749604154246, "kl": 0.400390625, "learning_rate": 1.6554054054054055e-07, "loss": 0.0005, "reward": 3.7034146785736084, "reward_std": 0.15973122231662273, "rewards/final_reward": 1.8500299913430904, "rewards/mask_iou_reward": 0.9250149956715452, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 1.7242479920387268, "rewards/thk_ans_format_reward": 0.9895833432674408, "step": 2964, "think_completion_length": 9.0 }, { "clip_ratio": 0.0, "completion_length": 188.98958587646484, "epoch": 10.016863406408094, "grad_norm": 16.797745597040446, "kl": 0.5888671875, "learning_rate": 1.6525900900900898e-07, "loss": 0.0006, "reward": 3.473345994949341, "reward_std": 0.02174593461677432, "rewards/final_reward": 1.1856811101943596, "rewards/mask_iou_reward": 0.5928405550971798, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4733460545539856, "rewards/thk_ans_format_reward": 1.0, "step": 2965, "think_completion_length": 8.833333333333334 }, { "clip_ratio": 0.0, "completion_length": 172.33334350585938, "epoch": 10.020236087689714, "grad_norm": 18.3615049832474, "kl": 0.443359375, "learning_rate": 1.6497747747747747e-07, "loss": 0.0004, "reward": 3.4337416887283325, "reward_std": 0.05834187474101782, "rewards/final_reward": 1.1960408761287606, "rewards/mask_iou_reward": 0.5980204380643803, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4337416887283325, "rewards/thk_ans_format_reward": 1.0, "step": 2966, "think_completion_length": 9.708333333333334 }, { "clip_ratio": 0.0, "completion_length": 182.86458587646484, "epoch": 10.023608768971332, "grad_norm": 17.822461269329146, "kl": 0.68359375, "learning_rate": 1.6469594594594593e-07, "loss": 0.0007, "reward": 3.599363684654236, "reward_std": 0.01909334654919803, "rewards/final_reward": 1.160729292363, "rewards/mask_iou_reward": 0.5803646461815, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5993636846542358, "rewards/thk_ans_format_reward": 1.0, "step": 2967, "think_completion_length": 8.291666666666666 }, { "clip_ratio": 0.0, "completion_length": 169.39583587646484, "epoch": 10.02698145025295, "grad_norm": 6.870113350227844, "kl": 0.638671875, "learning_rate": 1.6441441441441442e-07, "loss": 0.0007, "reward": 3.8057702779769897, "reward_std": 0.014783780090510845, "rewards/final_reward": 1.8299943991686436, "rewards/mask_iou_reward": 0.9149971995843218, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.805770218372345, "rewards/thk_ans_format_reward": 1.0, "step": 2968, "think_completion_length": 7.916666666666666 }, { "clip_ratio": 0.0, "completion_length": 192.67708587646484, "epoch": 10.03035413153457, "grad_norm": 15.001810746884246, "kl": 0.423828125, "learning_rate": 1.6413288288288288e-07, "loss": 0.0004, "reward": 3.303891658782959, "reward_std": 0.12750976346433163, "rewards/final_reward": 1.1526862911700408, "rewards/mask_iou_reward": 0.5763431455850204, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.303891658782959, "rewards/thk_ans_format_reward": 1.0, "step": 2969, "think_completion_length": 9.458333333333332 }, { "clip_ratio": 0.0, "completion_length": 277.38543701171875, "epoch": 10.03372681281619, "grad_norm": 7.303224734033114, "kl": 0.3857421875, "learning_rate": 1.6385135135135134e-07, "loss": 0.0004, "reward": 3.4886913299560547, "reward_std": 0.1877976879477501, "rewards/final_reward": 1.896774348900549, "rewards/mask_iou_reward": 0.9483871744502745, "rewards/sam_format_reward": 0.9791666865348816, "rewards/sam_reward_func_ultra": 1.5303580164909363, "rewards/thk_ans_format_reward": 0.9791666865348816, "step": 2970, "think_completion_length": 7.75 }, { "clip_ratio": 0.0, "completion_length": 240.88542938232422, "epoch": 10.037099494097808, "grad_norm": 8.140977371174358, "kl": 0.4140625, "learning_rate": 1.6356981981981982e-07, "loss": 0.0004, "reward": 3.4109930992126465, "reward_std": 0.14596521109342575, "rewards/final_reward": 1.4359857698723864, "rewards/mask_iou_reward": 0.7179928849361932, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 1.4318262338638306, "rewards/thk_ans_format_reward": 0.9895833432674408, "step": 2971, "think_completion_length": 9.666666666666668 }, { "clip_ratio": 0.0, "completion_length": 212.1250114440918, "epoch": 10.040472175379426, "grad_norm": 6.908211064573147, "kl": 0.4951171875, "learning_rate": 1.6328828828828828e-07, "loss": 0.0005, "reward": 3.5527108907699585, "reward_std": 0.0504262950271368, "rewards/final_reward": 1.452446585897007, "rewards/mask_iou_reward": 0.7262232929485035, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5527108311653137, "rewards/thk_ans_format_reward": 1.0, "step": 2972, "think_completion_length": 8.75 }, { "clip_ratio": 0.0, "completion_length": 165.09375762939453, "epoch": 10.043844856661046, "grad_norm": 17.422370241399438, "kl": 0.5751953125, "learning_rate": 1.6300675675675674e-07, "loss": 0.0006, "reward": 3.5222601890563965, "reward_std": 0.0661549512296915, "rewards/final_reward": 1.777932428648302, "rewards/mask_iou_reward": 0.888966214324151, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5222598910331726, "rewards/thk_ans_format_reward": 1.0, "step": 2973, "think_completion_length": 9.083333333333334 }, { "clip_ratio": 0.0, "completion_length": 204.7291717529297, "epoch": 10.047217537942664, "grad_norm": 14.254621038601908, "kl": 0.419921875, "learning_rate": 1.6272522522522523e-07, "loss": 0.0004, "reward": 3.6620055437088013, "reward_std": 0.08311491832137108, "rewards/final_reward": 1.4122443571993455, "rewards/mask_iou_reward": 0.7061221785996727, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6620052456855774, "rewards/thk_ans_format_reward": 1.0, "step": 2974, "think_completion_length": 7.291666666666666 }, { "clip_ratio": 0.0, "completion_length": 209.56250762939453, "epoch": 10.050590219224283, "grad_norm": 16.099298514297175, "kl": 0.4072265625, "learning_rate": 1.6244369369369367e-07, "loss": 0.0004, "reward": 3.5003554821014404, "reward_std": 0.03283052425831556, "rewards/final_reward": 1.4058345564704786, "rewards/mask_iou_reward": 0.7029172782352393, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.500355303287506, "rewards/thk_ans_format_reward": 1.0, "step": 2975, "think_completion_length": 7.625 }, { "clip_ratio": 0.0, "completion_length": 147.55208587646484, "epoch": 10.053962900505903, "grad_norm": 13.845801977509861, "kl": 0.61328125, "learning_rate": 1.6216216216216215e-07, "loss": 0.0006, "reward": 3.2513362169265747, "reward_std": 0.11728479154407978, "rewards/final_reward": 0.5600943317194658, "rewards/mask_iou_reward": 0.2800471658597329, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.251335859298706, "rewards/thk_ans_format_reward": 1.0, "step": 2976, "think_completion_length": 7.666666666666666 }, { "clip_ratio": 0.0, "completion_length": 135.75, "epoch": 10.057335581787521, "grad_norm": 9.462435762142102, "kl": 0.494140625, "learning_rate": 1.6188063063063061e-07, "loss": 0.0005, "reward": 3.6380168199539185, "reward_std": 0.014404607936739922, "rewards/final_reward": 1.2512799839296525, "rewards/mask_iou_reward": 0.6256399919648262, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6380165815353394, "rewards/thk_ans_format_reward": 1.0, "step": 2977, "think_completion_length": 7.75 }, { "clip_ratio": 0.0, "completion_length": 155.4791717529297, "epoch": 10.06070826306914, "grad_norm": 32.37701361935778, "kl": 0.642578125, "learning_rate": 1.6159909909909907e-07, "loss": 0.0007, "reward": 3.2058398723602295, "reward_std": 0.07977872295305133, "rewards/final_reward": 0.7515505715066793, "rewards/mask_iou_reward": 0.3757752857533396, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2058398127555847, "rewards/thk_ans_format_reward": 1.0, "step": 2978, "think_completion_length": 7.75 }, { "clip_ratio": 0.0, "completion_length": 179.9791717529297, "epoch": 10.064080944350758, "grad_norm": 101.75506967696498, "kl": 0.509765625, "learning_rate": 1.6131756756756756e-07, "loss": 0.0005, "reward": 3.602941393852234, "reward_std": 0.10243973135948181, "rewards/final_reward": 1.7877489590511702, "rewards/mask_iou_reward": 0.8938744795255851, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.602941632270813, "rewards/thk_ans_format_reward": 1.0, "step": 2979, "think_completion_length": 8.041666666666666 }, { "clip_ratio": 0.0, "completion_length": 225.62500762939453, "epoch": 10.067453625632378, "grad_norm": 11.296191395336175, "kl": 0.427734375, "learning_rate": 1.6103603603603602e-07, "loss": 0.0005, "reward": 3.6917929649353027, "reward_std": 0.09627137146890163, "rewards/final_reward": 1.882786193694157, "rewards/mask_iou_reward": 0.9413930968470785, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.691792905330658, "rewards/thk_ans_format_reward": 1.0, "step": 2980, "think_completion_length": 7.25 }, { "clip_ratio": 0.0, "completion_length": 193.7604217529297, "epoch": 10.070826306913997, "grad_norm": 9.002201625690828, "kl": 0.4228515625, "learning_rate": 1.607545045045045e-07, "loss": 0.0004, "reward": 3.6802202463150024, "reward_std": 0.05004505813121796, "rewards/final_reward": 1.4352498487281913, "rewards/mask_iou_reward": 0.7176249243640956, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6802199482917786, "rewards/thk_ans_format_reward": 1.0, "step": 2981, "think_completion_length": 10.083333333333334 }, { "clip_ratio": 0.0, "completion_length": 180.65625762939453, "epoch": 10.074198988195615, "grad_norm": 7.67063156372946, "kl": 0.4111328125, "learning_rate": 1.6047297297297297e-07, "loss": 0.0006, "reward": 3.744957447052002, "reward_std": 0.021554138511419296, "rewards/final_reward": 1.5525201778527098, "rewards/mask_iou_reward": 0.7762600889263549, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.744957447052002, "rewards/thk_ans_format_reward": 1.0, "step": 2982, "think_completion_length": 6.916666666666666 }, { "clip_ratio": 0.0, "completion_length": 202.5416717529297, "epoch": 10.077571669477235, "grad_norm": 20.886407622595403, "kl": 0.4521484375, "learning_rate": 1.6019144144144143e-07, "loss": 0.0005, "reward": 3.314300298690796, "reward_std": 0.13691934198141098, "rewards/final_reward": 1.1969343026669517, "rewards/mask_iou_reward": 0.5984671513334758, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3143000602722168, "rewards/thk_ans_format_reward": 1.0, "step": 2983, "think_completion_length": 8.125 }, { "clip_ratio": 0.0, "completion_length": 159.09375762939453, "epoch": 10.080944350758854, "grad_norm": 16.196788087359803, "kl": 0.607421875, "learning_rate": 1.5990990990990992e-07, "loss": 0.0006, "reward": 3.558018445968628, "reward_std": 0.027575062587857246, "rewards/final_reward": 1.7583389896541204, "rewards/mask_iou_reward": 0.8791694948270602, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5580185651779175, "rewards/thk_ans_format_reward": 1.0, "step": 2984, "think_completion_length": 7.208333333333333 }, { "clip_ratio": 0.0, "completion_length": 194.7812614440918, "epoch": 10.084317032040472, "grad_norm": 10.59394553028089, "kl": 0.435546875, "learning_rate": 1.5962837837837835e-07, "loss": 0.0004, "reward": 3.549351930618286, "reward_std": 0.057122400030493736, "rewards/final_reward": 1.7864545330082315, "rewards/mask_iou_reward": 0.8932272665041158, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5493518710136414, "rewards/thk_ans_format_reward": 1.0, "step": 2985, "think_completion_length": 9.291666666666668 }, { "clip_ratio": 0.0, "completion_length": 256.0104217529297, "epoch": 10.08768971332209, "grad_norm": 7.993621357063377, "kl": 0.58203125, "learning_rate": 1.5934684684684684e-07, "loss": 0.0006, "reward": 3.766826868057251, "reward_std": 0.034018273930996656, "rewards/final_reward": 1.9693942374379159, "rewards/mask_iou_reward": 0.9846971187189579, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7668269276618958, "rewards/thk_ans_format_reward": 1.0, "step": 2986, "think_completion_length": 9.458333333333334 }, { "clip_ratio": 0.0, "completion_length": 299.40626525878906, "epoch": 10.09106239460371, "grad_norm": 7.8579908910789, "kl": 0.474609375, "learning_rate": 1.590653153153153e-07, "loss": 0.0006, "reward": 3.545803666114807, "reward_std": 0.16862626932561398, "rewards/final_reward": 1.7307227523857664, "rewards/mask_iou_reward": 0.8653613761928832, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 1.5666367411613464, "rewards/thk_ans_format_reward": 0.9895833432674408, "step": 2987, "think_completion_length": 7.583333333333334 }, { "clip_ratio": 0.0, "completion_length": 279.4166717529297, "epoch": 10.094435075885329, "grad_norm": 14.888977164594754, "kl": 0.392578125, "learning_rate": 1.5878378378378376e-07, "loss": 0.0004, "reward": 3.4926047325134277, "reward_std": 0.1254997132346034, "rewards/final_reward": 1.6567365575695758, "rewards/mask_iou_reward": 0.8283682787847879, "rewards/sam_format_reward": 0.9791666865348816, "rewards/sam_reward_func_ultra": 1.534271240234375, "rewards/thk_ans_format_reward": 0.9791666865348816, "step": 2988, "think_completion_length": 9.75 }, { "clip_ratio": 0.0, "completion_length": 163.0520896911621, "epoch": 10.097807757166947, "grad_norm": 178.17919910050486, "kl": 0.509765625, "learning_rate": 1.5850225225225225e-07, "loss": 0.0005, "reward": 3.5052605867385864, "reward_std": 0.07205817103385925, "rewards/final_reward": 1.3663548258724365, "rewards/mask_iou_reward": 0.6831774129362183, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5052603483200073, "rewards/thk_ans_format_reward": 1.0, "step": 2989, "think_completion_length": 8.416666666666668 }, { "clip_ratio": 0.0, "completion_length": 193.7604217529297, "epoch": 10.101180438448566, "grad_norm": 9.404698816959156, "kl": 0.515625, "learning_rate": 1.582207207207207e-07, "loss": 0.0005, "reward": 3.4773871898651123, "reward_std": 0.04026375897228718, "rewards/final_reward": 1.5072943995416463, "rewards/mask_iou_reward": 0.7536471997708232, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4773871302604675, "rewards/thk_ans_format_reward": 1.0, "step": 2990, "think_completion_length": 6.416666666666666 }, { "clip_ratio": 0.0, "completion_length": 208.90625, "epoch": 10.104553119730186, "grad_norm": 17.635410219594927, "kl": 0.564453125, "learning_rate": 1.579391891891892e-07, "loss": 0.0006, "reward": 3.370635747909546, "reward_std": 0.08598719723522663, "rewards/final_reward": 0.9948692949078439, "rewards/mask_iou_reward": 0.49743464745392196, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3706358075141907, "rewards/thk_ans_format_reward": 1.0, "step": 2991, "think_completion_length": 8.916666666666666 }, { "clip_ratio": 0.0, "completion_length": 220.53125762939453, "epoch": 10.107925801011804, "grad_norm": 8.498665489249783, "kl": 0.416015625, "learning_rate": 1.5765765765765766e-07, "loss": 0.0004, "reward": 3.434711217880249, "reward_std": 0.053263518027961254, "rewards/final_reward": 1.2998215907954818, "rewards/mask_iou_reward": 0.6499107953977409, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.434711217880249, "rewards/thk_ans_format_reward": 1.0, "step": 2992, "think_completion_length": 8.541666666666666 }, { "clip_ratio": 0.0, "completion_length": 151.14583587646484, "epoch": 10.111298482293423, "grad_norm": 11.28021036658622, "kl": 0.4296875, "learning_rate": 1.5737612612612612e-07, "loss": 0.0004, "reward": 3.796360492706299, "reward_std": 0.03567369282245636, "rewards/final_reward": 1.616571069132243, "rewards/mask_iou_reward": 0.8082855345661215, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7963602542877197, "rewards/thk_ans_format_reward": 1.0, "step": 2993, "think_completion_length": 7.666666666666666 }, { "clip_ratio": 0.0, "completion_length": 170.89583587646484, "epoch": 10.114671163575043, "grad_norm": 12.73797474710979, "kl": 0.5234375, "learning_rate": 1.570945945945946e-07, "loss": 0.0005, "reward": 3.5808873176574707, "reward_std": 0.07881723530590534, "rewards/final_reward": 0.8961576623113501, "rewards/mask_iou_reward": 0.44807883115567504, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5808873772621155, "rewards/thk_ans_format_reward": 1.0, "step": 2994, "think_completion_length": 7.833333333333334 }, { "clip_ratio": 0.0, "completion_length": 198.98958587646484, "epoch": 10.118043844856661, "grad_norm": 9.283367900830749, "kl": 0.4599609375, "learning_rate": 1.5681306306306304e-07, "loss": 0.0005, "reward": 3.6719605922698975, "reward_std": 0.039013393223285675, "rewards/final_reward": 1.5600923645952776, "rewards/mask_iou_reward": 0.7800461822976388, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6719606518745422, "rewards/thk_ans_format_reward": 1.0, "step": 2995, "think_completion_length": 8.5 }, { "clip_ratio": 0.0, "completion_length": 258.6666717529297, "epoch": 10.12141652613828, "grad_norm": 18.461367215836585, "kl": 0.474609375, "learning_rate": 1.5653153153153153e-07, "loss": 0.0005, "reward": 3.6122279167175293, "reward_std": 0.05605051852762699, "rewards/final_reward": 1.843798595685787, "rewards/mask_iou_reward": 0.9218992978428935, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6122277975082397, "rewards/thk_ans_format_reward": 1.0, "step": 2996, "think_completion_length": 9.375 }, { "clip_ratio": 0.0, "completion_length": 163.61458587646484, "epoch": 10.124789207419898, "grad_norm": 13.510683556899686, "kl": 0.4072265625, "learning_rate": 1.5624999999999999e-07, "loss": 0.0004, "reward": 3.553600788116455, "reward_std": 0.14882715791463852, "rewards/final_reward": 1.5099988524276768, "rewards/mask_iou_reward": 0.7549994262138384, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.553600788116455, "rewards/thk_ans_format_reward": 1.0, "step": 2997, "think_completion_length": 7.0 }, { "clip_ratio": 0.0, "completion_length": 194.4479217529297, "epoch": 10.128161888701518, "grad_norm": 7.768656628795247, "kl": 0.416015625, "learning_rate": 1.5596846846846845e-07, "loss": 0.0004, "reward": 3.483713388442993, "reward_std": 0.061230381950736046, "rewards/final_reward": 1.5944935107421987, "rewards/mask_iou_reward": 0.7972467553710993, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4837132096290588, "rewards/thk_ans_format_reward": 1.0, "step": 2998, "think_completion_length": 7.833333333333334 }, { "clip_ratio": 0.0, "completion_length": 185.2604217529297, "epoch": 10.131534569983137, "grad_norm": 14.741656546568667, "kl": 0.4453125, "learning_rate": 1.5568693693693693e-07, "loss": 0.0005, "reward": 3.785600185394287, "reward_std": 0.042826587334275246, "rewards/final_reward": 1.6696943980484984, "rewards/mask_iou_reward": 0.8348471990242492, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7856003642082214, "rewards/thk_ans_format_reward": 1.0, "step": 2999, "think_completion_length": 6.958333333333334 }, { "clip_ratio": 0.0, "completion_length": 174.2604217529297, "epoch": 10.134907251264755, "grad_norm": 11.287609971331385, "kl": 0.4287109375, "learning_rate": 1.554054054054054e-07, "loss": 0.0004, "reward": 3.588198661804199, "reward_std": 0.009367643389850855, "rewards/final_reward": 1.8614195241644378, "rewards/mask_iou_reward": 0.9307097620822189, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.588198959827423, "rewards/thk_ans_format_reward": 1.0, "step": 3000, "think_completion_length": 7.416666666666666 }, { "clip_ratio": 0.0, "completion_length": 205.0416717529297, "epoch": 10.138279932546375, "grad_norm": 7.047347437674352, "kl": 0.4853515625, "learning_rate": 1.5512387387387388e-07, "loss": 0.0005, "reward": 3.660353422164917, "reward_std": 0.06875004037283361, "rewards/final_reward": 1.5753046420376315, "rewards/mask_iou_reward": 0.7876523210188158, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.660353183746338, "rewards/thk_ans_format_reward": 1.0, "step": 3001, "think_completion_length": 7.666666666666666 }, { "clip_ratio": 0.0, "completion_length": 208.80208587646484, "epoch": 10.141652613827993, "grad_norm": 7.245173276493627, "kl": 0.48828125, "learning_rate": 1.5484234234234234e-07, "loss": 0.0005, "reward": 3.671100616455078, "reward_std": 0.06982170045375824, "rewards/final_reward": 1.6626594398675416, "rewards/mask_iou_reward": 0.8313297199337708, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6711003184318542, "rewards/thk_ans_format_reward": 1.0, "step": 3002, "think_completion_length": 8.0 }, { "clip_ratio": 0.0, "completion_length": 171.93750762939453, "epoch": 10.145025295109612, "grad_norm": 12.782446082021838, "kl": 0.4755859375, "learning_rate": 1.545608108108108e-07, "loss": 0.0005, "reward": 3.6925946474075317, "reward_std": 0.07178288232535124, "rewards/final_reward": 1.834792823104551, "rewards/mask_iou_reward": 0.9173964115522755, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6925946474075317, "rewards/thk_ans_format_reward": 1.0, "step": 3003, "think_completion_length": 8.625 }, { "clip_ratio": 0.0, "completion_length": 273.70833587646484, "epoch": 10.14839797639123, "grad_norm": 8.797747104877757, "kl": 0.4453125, "learning_rate": 1.542792792792793e-07, "loss": 0.0004, "reward": 3.409364938735962, "reward_std": 0.044542797608301044, "rewards/final_reward": 0.7013575445373311, "rewards/mask_iou_reward": 0.35067877226866556, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4093648195266724, "rewards/thk_ans_format_reward": 1.0, "step": 3004, "think_completion_length": 8.791666666666668 }, { "clip_ratio": 0.0, "completion_length": 164.2604217529297, "epoch": 10.15177065767285, "grad_norm": 16.59870925962645, "kl": 0.533203125, "learning_rate": 1.5399774774774772e-07, "loss": 0.0005, "reward": 3.5411399602890015, "reward_std": 0.027776396833360195, "rewards/final_reward": 1.9392752603293033, "rewards/mask_iou_reward": 0.9696376301646517, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5411400198936462, "rewards/thk_ans_format_reward": 1.0, "step": 3005, "think_completion_length": 8.583333333333332 }, { "clip_ratio": 0.0, "completion_length": 182.21875, "epoch": 10.155143338954469, "grad_norm": 13.806504308080143, "kl": 0.849609375, "learning_rate": 1.537162162162162e-07, "loss": 0.0008, "reward": 3.625385046005249, "reward_std": 0.11380976252257824, "rewards/final_reward": 1.1961639545209342, "rewards/mask_iou_reward": 0.5980819772604671, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6253849864006042, "rewards/thk_ans_format_reward": 1.0, "step": 3006, "think_completion_length": 9.0 }, { "clip_ratio": 0.0, "completion_length": 257.11458587646484, "epoch": 10.158516020236087, "grad_norm": 46.019880167512845, "kl": 0.43359375, "learning_rate": 1.5343468468468467e-07, "loss": 0.0004, "reward": 3.5516542196273804, "reward_std": 0.09062372241169214, "rewards/final_reward": 1.5300205463825334, "rewards/mask_iou_reward": 0.7650102731912667, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5516541600227356, "rewards/thk_ans_format_reward": 1.0, "step": 3007, "think_completion_length": 10.375 }, { "clip_ratio": 0.0, "completion_length": 183.8541717529297, "epoch": 10.161888701517707, "grad_norm": 210.24459797416944, "kl": 0.4365234375, "learning_rate": 1.5315315315315313e-07, "loss": 0.0004, "reward": 3.5649502277374268, "reward_std": 0.07269263081252575, "rewards/final_reward": 1.65368951076421, "rewards/mask_iou_reward": 0.826844755382105, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.564950168132782, "rewards/thk_ans_format_reward": 1.0, "step": 3008, "think_completion_length": 7.083333333333334 }, { "clip_ratio": 0.0, "completion_length": 272.1979217529297, "epoch": 10.165261382799326, "grad_norm": 7.431795164683937, "kl": 0.4365234375, "learning_rate": 1.5287162162162162e-07, "loss": 0.0004, "reward": 3.682637095451355, "reward_std": 0.04478604253381491, "rewards/final_reward": 1.4957267679750599, "rewards/mask_iou_reward": 0.7478633839875299, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6826370358467102, "rewards/thk_ans_format_reward": 1.0, "step": 3009, "think_completion_length": 8.791666666666668 }, { "clip_ratio": 0.0, "completion_length": 237.09375762939453, "epoch": 10.168634064080944, "grad_norm": 43.951720601436165, "kl": 0.5146484375, "learning_rate": 1.5259009009009008e-07, "loss": 0.0005, "reward": 3.2419824600219727, "reward_std": 0.09567523375153542, "rewards/final_reward": 0.9779962089804097, "rewards/mask_iou_reward": 0.48899810449020487, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2419825792312622, "rewards/thk_ans_format_reward": 1.0, "step": 3010, "think_completion_length": 8.041666666666666 }, { "clip_ratio": 0.0, "completion_length": 190.45833587646484, "epoch": 10.172006745362562, "grad_norm": 21.70274754464807, "kl": 0.51953125, "learning_rate": 1.5230855855855857e-07, "loss": 0.0005, "reward": 3.8406925201416016, "reward_std": 0.01246106019243598, "rewards/final_reward": 1.8198003630804593, "rewards/mask_iou_reward": 0.9099001815402297, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.840692400932312, "rewards/thk_ans_format_reward": 1.0, "step": 3011, "think_completion_length": 9.833333333333332 }, { "clip_ratio": 0.0, "completion_length": 285.3854217529297, "epoch": 10.175379426644183, "grad_norm": 7.894902107360492, "kl": 0.4482421875, "learning_rate": 1.5202702702702703e-07, "loss": 0.0005, "reward": 3.2574455738067627, "reward_std": 0.18548056297004223, "rewards/final_reward": 1.2295448944124066, "rewards/mask_iou_reward": 0.6147724472062033, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 1.2782787680625916, "rewards/thk_ans_format_reward": 0.9895833432674408, "step": 3012, "think_completion_length": 7.666666666666666 }, { "clip_ratio": 0.0, "completion_length": 205.6875, "epoch": 10.178752107925801, "grad_norm": 10.25010181038996, "kl": 0.494140625, "learning_rate": 1.517454954954955e-07, "loss": 0.0006, "reward": 3.657282590866089, "reward_std": 0.06546662375330925, "rewards/final_reward": 1.2590024112232103, "rewards/mask_iou_reward": 0.6295012056116052, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.657282531261444, "rewards/thk_ans_format_reward": 1.0, "step": 3013, "think_completion_length": 8.5 }, { "clip_ratio": 0.0, "completion_length": 204.58333587646484, "epoch": 10.18212478920742, "grad_norm": 19.823039707702044, "kl": 0.4228515625, "learning_rate": 1.5146396396396398e-07, "loss": 0.0005, "reward": 3.6551074981689453, "reward_std": 0.08102907240390778, "rewards/final_reward": 1.8555531520368094, "rewards/mask_iou_reward": 0.9277765760184047, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6551074385643005, "rewards/thk_ans_format_reward": 1.0, "step": 3014, "think_completion_length": 8.125 }, { "clip_ratio": 0.0, "completion_length": 158.84375762939453, "epoch": 10.18549747048904, "grad_norm": 11.426374871385816, "kl": 0.50390625, "learning_rate": 1.511824324324324e-07, "loss": 0.0005, "reward": 3.4826070070266724, "reward_std": 0.0319385826587677, "rewards/final_reward": 1.738464134583586, "rewards/mask_iou_reward": 0.869232067291793, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4826070666313171, "rewards/thk_ans_format_reward": 1.0, "step": 3015, "think_completion_length": 7.833333333333334 }, { "clip_ratio": 0.0, "completion_length": 258.4375, "epoch": 10.188870151770658, "grad_norm": 20.16353455629868, "kl": 0.38671875, "learning_rate": 1.509009009009009e-07, "loss": 0.0004, "reward": 3.5612874031066895, "reward_std": 0.09555951785296202, "rewards/final_reward": 1.8304799898021402, "rewards/mask_iou_reward": 0.9152399949010701, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5612870454788208, "rewards/thk_ans_format_reward": 1.0, "step": 3016, "think_completion_length": 9.75 }, { "clip_ratio": 0.0, "completion_length": 152.9166717529297, "epoch": 10.192242833052276, "grad_norm": 28.516210393580096, "kl": 0.47265625, "learning_rate": 1.5061936936936936e-07, "loss": 0.0005, "reward": 3.6769092082977295, "reward_std": 0.05226367339491844, "rewards/final_reward": 1.5773681028693793, "rewards/mask_iou_reward": 0.7886840514346897, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6769092679023743, "rewards/thk_ans_format_reward": 1.0, "step": 3017, "think_completion_length": 8.166666666666668 }, { "clip_ratio": 0.0, "completion_length": 240.47917938232422, "epoch": 10.195615514333895, "grad_norm": 67.81277800194884, "kl": 0.4306640625, "learning_rate": 1.5033783783783782e-07, "loss": 0.0005, "reward": 3.811589479446411, "reward_std": 0.010986678651534021, "rewards/final_reward": 1.8891695953724157, "rewards/mask_iou_reward": 0.9445847976862078, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.811589539051056, "rewards/thk_ans_format_reward": 1.0, "step": 3018, "think_completion_length": 8.458333333333334 }, { "clip_ratio": 0.0, "completion_length": 264.03125762939453, "epoch": 10.198988195615515, "grad_norm": 8.865567503921177, "kl": 0.765625, "learning_rate": 1.500563063063063e-07, "loss": 0.0008, "reward": 3.419390916824341, "reward_std": 0.026064681820571423, "rewards/final_reward": 1.5951678983941053, "rewards/mask_iou_reward": 0.7975839491970527, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4193909168243408, "rewards/thk_ans_format_reward": 1.0, "step": 3019, "think_completion_length": 8.083333333333332 }, { "clip_ratio": 0.0, "completion_length": 276.40625, "epoch": 10.202360876897133, "grad_norm": 14.18369248827003, "kl": 0.3935546875, "learning_rate": 1.4977477477477477e-07, "loss": 0.0004, "reward": 3.5527989864349365, "reward_std": 0.03511756705120206, "rewards/final_reward": 0.9831977676554682, "rewards/mask_iou_reward": 0.4915988838277341, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5527989268302917, "rewards/thk_ans_format_reward": 1.0, "step": 3020, "think_completion_length": 8.0 }, { "clip_ratio": 0.0, "completion_length": 194.02083587646484, "epoch": 10.205733558178752, "grad_norm": 9.654467905221043, "kl": 0.443359375, "learning_rate": 1.4949324324324325e-07, "loss": 0.0005, "reward": 3.4940335750579834, "reward_std": 0.09076207876205444, "rewards/final_reward": 1.1036258644813064, "rewards/mask_iou_reward": 0.5518129322406532, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4940335154533386, "rewards/thk_ans_format_reward": 1.0, "step": 3021, "think_completion_length": 8.625 }, { "clip_ratio": 0.0, "completion_length": 127.00000381469727, "epoch": 10.209106239460372, "grad_norm": 15.997006681536114, "kl": 0.515625, "learning_rate": 1.4921171171171171e-07, "loss": 0.0005, "reward": 3.1683356761932373, "reward_std": 0.02166743017733097, "rewards/final_reward": 1.7799907009308857, "rewards/mask_iou_reward": 0.8899953504654429, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1683356165885925, "rewards/thk_ans_format_reward": 1.0, "step": 3022, "think_completion_length": 8.208333333333332 }, { "clip_ratio": 0.0, "completion_length": 171.4791717529297, "epoch": 10.21247892074199, "grad_norm": 10.312279577192898, "kl": 0.4375, "learning_rate": 1.4893018018018018e-07, "loss": 0.0005, "reward": 3.736538052558899, "reward_std": 0.024791957112029195, "rewards/final_reward": 1.8820995461035657, "rewards/mask_iou_reward": 0.9410497730517828, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7365379929542542, "rewards/thk_ans_format_reward": 1.0, "step": 3023, "think_completion_length": 8.875 }, { "clip_ratio": 0.0, "completion_length": 288.25, "epoch": 10.215851602023609, "grad_norm": 10.412018438289754, "kl": 0.4609375, "learning_rate": 1.4864864864864866e-07, "loss": 0.0005, "reward": 3.5170916318893433, "reward_std": 0.3517511487007141, "rewards/final_reward": 1.2948631458501156, "rewards/mask_iou_reward": 0.6474315729250578, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 1.5483417510986328, "rewards/thk_ans_format_reward": 0.9791666865348816, "step": 3024, "think_completion_length": 8.083333333333334 }, { "clip_ratio": 0.0, "completion_length": 191.48958587646484, "epoch": 10.219224283305227, "grad_norm": 115.1120642139752, "kl": 1.458984375, "learning_rate": 1.483671171171171e-07, "loss": 0.0015, "reward": 3.6020342111587524, "reward_std": 0.035571375861763954, "rewards/final_reward": 1.3863584342528519, "rewards/mask_iou_reward": 0.6931792171264259, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6020342707633972, "rewards/thk_ans_format_reward": 1.0, "step": 3025, "think_completion_length": 7.916666666666667 }, { "clip_ratio": 0.0, "completion_length": 260.36458587646484, "epoch": 10.222596964586847, "grad_norm": 14.95920819273015, "kl": 0.421875, "learning_rate": 1.4808558558558556e-07, "loss": 0.0004, "reward": 3.603550434112549, "reward_std": 0.057018641382455826, "rewards/final_reward": 1.2303422133345536, "rewards/mask_iou_reward": 0.6151711066672768, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6035504341125488, "rewards/thk_ans_format_reward": 1.0, "step": 3026, "think_completion_length": 7.458333333333334 }, { "clip_ratio": 0.0, "completion_length": 224.41667938232422, "epoch": 10.225969645868465, "grad_norm": 14.760725731411785, "kl": 0.4404296875, "learning_rate": 1.4780405405405404e-07, "loss": 0.0005, "reward": 3.7295358180999756, "reward_std": 0.037195175886154175, "rewards/final_reward": 1.8347441322552864, "rewards/mask_iou_reward": 0.9173720661276432, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7295357584953308, "rewards/thk_ans_format_reward": 1.0, "step": 3027, "think_completion_length": 7.958333333333334 }, { "clip_ratio": 0.0, "completion_length": 205.5729217529297, "epoch": 10.229342327150084, "grad_norm": 6.346323784178766, "kl": 0.421875, "learning_rate": 1.475225225225225e-07, "loss": 0.0004, "reward": 3.4006311893463135, "reward_std": 0.10144811868667603, "rewards/final_reward": 1.8824980037471182, "rewards/mask_iou_reward": 0.9412490018735591, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.400631308555603, "rewards/thk_ans_format_reward": 1.0, "step": 3028, "think_completion_length": 9.916666666666668 }, { "clip_ratio": 0.0, "completion_length": 299.78126525878906, "epoch": 10.232715008431704, "grad_norm": 8.824476487222075, "kl": 0.4111328125, "learning_rate": 1.47240990990991e-07, "loss": 0.0004, "reward": 3.7033588886260986, "reward_std": 0.06088071269914508, "rewards/final_reward": 1.5659035082472332, "rewards/mask_iou_reward": 0.7829517541236166, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.703358769416809, "rewards/thk_ans_format_reward": 1.0, "step": 3029, "think_completion_length": 7.791666666666666 }, { "clip_ratio": 0.0, "completion_length": 246.12501525878906, "epoch": 10.236087689713322, "grad_norm": 28.21267807554526, "kl": 0.505859375, "learning_rate": 1.4695945945945945e-07, "loss": 0.0005, "reward": 3.7472550868988037, "reward_std": 0.04748426750302315, "rewards/final_reward": 1.8960582219169015, "rewards/mask_iou_reward": 0.9480291109584508, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7472549080848694, "rewards/thk_ans_format_reward": 1.0, "step": 3030, "think_completion_length": 7.25 }, { "clip_ratio": 0.0, "completion_length": 223.87501525878906, "epoch": 10.23946037099494, "grad_norm": 10.76581095317364, "kl": 0.490234375, "learning_rate": 1.4667792792792791e-07, "loss": 0.0005, "reward": 3.740869402885437, "reward_std": 0.22409842908382416, "rewards/final_reward": 1.678593825559922, "rewards/mask_iou_reward": 0.839296912779961, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 1.7617026567459106, "rewards/thk_ans_format_reward": 0.9895833432674408, "step": 3031, "think_completion_length": 8.166666666666668 }, { "clip_ratio": 0.0, "completion_length": 233.1979217529297, "epoch": 10.24283305227656, "grad_norm": 16.11696078361264, "kl": 0.525390625, "learning_rate": 1.463963963963964e-07, "loss": 0.0005, "reward": 3.488149046897888, "reward_std": 0.017415442038327456, "rewards/final_reward": 1.104710881954887, "rewards/mask_iou_reward": 0.5523554409774435, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4881488680839539, "rewards/thk_ans_format_reward": 1.0, "step": 3032, "think_completion_length": 9.083333333333332 }, { "clip_ratio": 0.0, "completion_length": 224.1041717529297, "epoch": 10.24620573355818, "grad_norm": 11.79782727085532, "kl": 0.6591796875, "learning_rate": 1.4611486486486486e-07, "loss": 0.0007, "reward": 3.353591799736023, "reward_std": 0.05060257390141487, "rewards/final_reward": 1.440081338540764, "rewards/mask_iou_reward": 0.720040669270382, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.353591799736023, "rewards/thk_ans_format_reward": 1.0, "step": 3033, "think_completion_length": 7.5 }, { "clip_ratio": 0.0, "completion_length": 218.42708587646484, "epoch": 10.249578414839798, "grad_norm": 34.85221516597678, "kl": 9.57421875, "learning_rate": 1.4583333333333335e-07, "loss": 0.0096, "reward": 3.6186007261276245, "reward_std": 0.06899136863648891, "rewards/final_reward": 1.5653077673713776, "rewards/mask_iou_reward": 0.7826538836856888, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6186007261276245, "rewards/thk_ans_format_reward": 1.0, "step": 3034, "think_completion_length": 7.375 }, { "clip_ratio": 0.0, "completion_length": 292.8958435058594, "epoch": 10.252951096121416, "grad_norm": 45.66579665600695, "kl": 0.4111328125, "learning_rate": 1.4555180180180178e-07, "loss": 0.0004, "reward": 3.5985703468322754, "reward_std": 0.24263647571206093, "rewards/final_reward": 1.8425728631798113, "rewards/mask_iou_reward": 0.9212864315899056, "rewards/sam_format_reward": 0.9583333432674408, "rewards/sam_reward_func_ultra": 1.681903898715973, "rewards/thk_ans_format_reward": 0.9583333432674408, "step": 3035, "think_completion_length": 8.833333333333332 }, { "clip_ratio": 0.0, "completion_length": 175.02083587646484, "epoch": 10.256323777403036, "grad_norm": 7.415837993379524, "kl": 0.4443359375, "learning_rate": 1.4527027027027024e-07, "loss": 0.0004, "reward": 3.685833692550659, "reward_std": 0.03945900313556194, "rewards/final_reward": 1.7368624821102698, "rewards/mask_iou_reward": 0.8684312410551349, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6858336925506592, "rewards/thk_ans_format_reward": 1.0, "step": 3036, "think_completion_length": 8.333333333333332 }, { "clip_ratio": 0.0, "completion_length": 235.93751525878906, "epoch": 10.259696458684655, "grad_norm": 30.27222983383401, "kl": 0.5341796875, "learning_rate": 1.4498873873873873e-07, "loss": 0.0005, "reward": 3.559749484062195, "reward_std": 0.08500716462731361, "rewards/final_reward": 1.6927849118778742, "rewards/mask_iou_reward": 0.8463924559389371, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.559749722480774, "rewards/thk_ans_format_reward": 1.0, "step": 3037, "think_completion_length": 8.875 }, { "clip_ratio": 0.0, "completion_length": 274.7916793823242, "epoch": 10.263069139966273, "grad_norm": 5.858934619023522, "kl": 0.701171875, "learning_rate": 1.447072072072072e-07, "loss": 0.0007, "reward": 3.5214637517929077, "reward_std": 0.06163910590112209, "rewards/final_reward": 1.7095983083281925, "rewards/mask_iou_reward": 0.8547991541640962, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5214638113975525, "rewards/thk_ans_format_reward": 1.0, "step": 3038, "think_completion_length": 8.291666666666668 }, { "clip_ratio": 0.0, "completion_length": 176.00000762939453, "epoch": 10.266441821247891, "grad_norm": 13.128488544669443, "kl": 0.4150390625, "learning_rate": 1.4442567567567568e-07, "loss": 0.0004, "reward": 3.762415289878845, "reward_std": 0.098273616284132, "rewards/final_reward": 1.842667643260338, "rewards/mask_iou_reward": 0.921333821630169, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7624154090881348, "rewards/thk_ans_format_reward": 1.0, "step": 3039, "think_completion_length": 8.666666666666668 }, { "clip_ratio": 0.0, "completion_length": 265.5833435058594, "epoch": 10.269814502529512, "grad_norm": 10.121474630049313, "kl": 0.376953125, "learning_rate": 1.4414414414414414e-07, "loss": 0.0004, "reward": 3.6652374267578125, "reward_std": 0.03870641253888607, "rewards/final_reward": 1.2309389639257047, "rewards/mask_iou_reward": 0.6154694819628523, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6652374267578125, "rewards/thk_ans_format_reward": 1.0, "step": 3040, "think_completion_length": 7.25 }, { "clip_ratio": 0.0, "completion_length": 238.5729217529297, "epoch": 10.27318718381113, "grad_norm": 10.438654044764187, "kl": 0.416015625, "learning_rate": 1.438626126126126e-07, "loss": 0.0004, "reward": 3.6193418502807617, "reward_std": 0.019348585978150368, "rewards/final_reward": 1.921007736284884, "rewards/mask_iou_reward": 0.960503868142442, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6193416714668274, "rewards/thk_ans_format_reward": 1.0, "step": 3041, "think_completion_length": 8.5 }, { "clip_ratio": 0.0, "completion_length": 225.88542938232422, "epoch": 10.276559865092748, "grad_norm": 9.229400078665387, "kl": 0.4716796875, "learning_rate": 1.435810810810811e-07, "loss": 0.0005, "reward": 3.881032109260559, "reward_std": 0.015071406960487366, "rewards/final_reward": 1.8651157299403511, "rewards/mask_iou_reward": 0.9325578649701756, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.8810319304466248, "rewards/thk_ans_format_reward": 1.0, "step": 3042, "think_completion_length": 8.208333333333334 }, { "clip_ratio": 0.0, "completion_length": 150.72916793823242, "epoch": 10.279932546374368, "grad_norm": 20.614520400940542, "kl": 0.73046875, "learning_rate": 1.4329954954954955e-07, "loss": 0.0007, "reward": 3.632157564163208, "reward_std": 0.046529789455235004, "rewards/final_reward": 1.5362756707926812, "rewards/mask_iou_reward": 0.7681378353963406, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6321576237678528, "rewards/thk_ans_format_reward": 1.0, "step": 3043, "think_completion_length": 9.208333333333332 }, { "clip_ratio": 0.0, "completion_length": 237.1041717529297, "epoch": 10.283305227655987, "grad_norm": 36.98989263520269, "kl": 0.4287109375, "learning_rate": 1.4301801801801803e-07, "loss": 0.0004, "reward": 3.5096585750579834, "reward_std": 0.1018081046640873, "rewards/final_reward": 1.855999401330896, "rewards/mask_iou_reward": 0.927999700665448, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5096585750579834, "rewards/thk_ans_format_reward": 1.0, "step": 3044, "think_completion_length": 9.958333333333334 }, { "clip_ratio": 0.0, "completion_length": 215.46875762939453, "epoch": 10.286677908937605, "grad_norm": 10.131238284617314, "kl": 0.45703125, "learning_rate": 1.4273648648648647e-07, "loss": 0.0005, "reward": 3.261106252670288, "reward_std": 0.050902172923088074, "rewards/final_reward": 1.1564071824493518, "rewards/mask_iou_reward": 0.5782035912246759, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2611061930656433, "rewards/thk_ans_format_reward": 1.0, "step": 3045, "think_completion_length": 7.875 }, { "clip_ratio": 0.0, "completion_length": 173.3229217529297, "epoch": 10.290050590219224, "grad_norm": 13.902812917861246, "kl": 0.56640625, "learning_rate": 1.4245495495495493e-07, "loss": 0.0006, "reward": 3.7212787866592407, "reward_std": 0.07197993621230125, "rewards/final_reward": 1.8479431196911142, "rewards/mask_iou_reward": 0.9239715598455571, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7212787866592407, "rewards/thk_ans_format_reward": 1.0, "step": 3046, "think_completion_length": 13.666666666666666 }, { "clip_ratio": 0.0, "completion_length": 271.71875, "epoch": 10.293423271500844, "grad_norm": 14.873126860968421, "kl": 0.4013671875, "learning_rate": 1.4217342342342342e-07, "loss": 0.0004, "reward": 3.8307950496673584, "reward_std": 0.01546748448163271, "rewards/final_reward": 1.8101105255828047, "rewards/mask_iou_reward": 0.9050552627914024, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.8307951092720032, "rewards/thk_ans_format_reward": 1.0, "step": 3047, "think_completion_length": 8.291666666666666 }, { "clip_ratio": 0.0, "completion_length": 190.7916717529297, "epoch": 10.296795952782462, "grad_norm": 20.70107362318775, "kl": 0.431640625, "learning_rate": 1.4189189189189188e-07, "loss": 0.0004, "reward": 3.799539804458618, "reward_std": 0.04087031399831176, "rewards/final_reward": 1.880673807035448, "rewards/mask_iou_reward": 0.940336903517724, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.799539864063263, "rewards/thk_ans_format_reward": 1.0, "step": 3048, "think_completion_length": 7.208333333333334 }, { "clip_ratio": 0.0, "completion_length": 198.0104217529297, "epoch": 10.30016863406408, "grad_norm": 20.29669481716131, "kl": 0.4267578125, "learning_rate": 1.4161036036036036e-07, "loss": 0.0004, "reward": 3.5044933557510376, "reward_std": 0.03597615472972393, "rewards/final_reward": 1.813689756326931, "rewards/mask_iou_reward": 0.9068448781634655, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5044932961463928, "rewards/thk_ans_format_reward": 1.0, "step": 3049, "think_completion_length": 8.541666666666666 }, { "clip_ratio": 0.0, "completion_length": 200.45834350585938, "epoch": 10.3035413153457, "grad_norm": 6.06329398522273, "kl": 0.578125, "learning_rate": 1.4132882882882883e-07, "loss": 0.0006, "reward": 3.7332570552825928, "reward_std": 0.058670297265052795, "rewards/final_reward": 1.9510471461607337, "rewards/mask_iou_reward": 0.9755235730803669, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7332569360733032, "rewards/thk_ans_format_reward": 1.0, "step": 3050, "think_completion_length": 9.208333333333332 }, { "clip_ratio": 0.0, "completion_length": 192.8541717529297, "epoch": 10.306913996627319, "grad_norm": 10.59607026615506, "kl": 0.611328125, "learning_rate": 1.4104729729729729e-07, "loss": 0.0006, "reward": 3.7314302921295166, "reward_std": 0.02940399432554841, "rewards/final_reward": 1.8294265843891298, "rewards/mask_iou_reward": 0.9147132921945649, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7314303517341614, "rewards/thk_ans_format_reward": 1.0, "step": 3051, "think_completion_length": 8.25 }, { "clip_ratio": 0.0, "completion_length": 240.97917938232422, "epoch": 10.310286677908937, "grad_norm": 7.163739321310357, "kl": 0.4052734375, "learning_rate": 1.4076576576576577e-07, "loss": 0.0004, "reward": 3.285438656806946, "reward_std": 0.0969182513654232, "rewards/final_reward": 1.2372289659372262, "rewards/mask_iou_reward": 0.6186144829686131, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2854386568069458, "rewards/thk_ans_format_reward": 1.0, "step": 3052, "think_completion_length": 8.875 }, { "clip_ratio": 0.0, "completion_length": 176.7291717529297, "epoch": 10.313659359190556, "grad_norm": 19.472550548175086, "kl": 0.5625, "learning_rate": 1.4048423423423423e-07, "loss": 0.0006, "reward": 3.4311933517456055, "reward_std": 0.12055841088294983, "rewards/final_reward": 1.6633807218495948, "rewards/mask_iou_reward": 0.8316903609247974, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4311934113502502, "rewards/thk_ans_format_reward": 1.0, "step": 3053, "think_completion_length": 8.25 }, { "clip_ratio": 0.0, "completion_length": 194.79167938232422, "epoch": 10.317032040472176, "grad_norm": 51.19403605734523, "kl": 0.4560546875, "learning_rate": 1.402027027027027e-07, "loss": 0.0005, "reward": 3.355466604232788, "reward_std": 0.02344503626227379, "rewards/final_reward": 1.869085082618021, "rewards/mask_iou_reward": 0.9345425413090105, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3554664850234985, "rewards/thk_ans_format_reward": 1.0, "step": 3054, "think_completion_length": 7.458333333333334 }, { "clip_ratio": 0.0, "completion_length": 199.4166717529297, "epoch": 10.320404721753794, "grad_norm": 12.938744913511657, "kl": 0.484375, "learning_rate": 1.3992117117117116e-07, "loss": 0.0005, "reward": 3.5117716789245605, "reward_std": 0.015957183204591274, "rewards/final_reward": 0.5727209760785165, "rewards/mask_iou_reward": 0.28636048803925823, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5117716193199158, "rewards/thk_ans_format_reward": 1.0, "step": 3055, "think_completion_length": 8.208333333333332 }, { "clip_ratio": 0.0, "completion_length": 204.70833587646484, "epoch": 10.323777403035413, "grad_norm": 11.810036001437068, "kl": 0.501953125, "learning_rate": 1.3963963963963962e-07, "loss": 0.0005, "reward": 3.472335457801819, "reward_std": 0.04017342161387205, "rewards/final_reward": 1.7242630804990866, "rewards/mask_iou_reward": 0.8621315402495433, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4723355174064636, "rewards/thk_ans_format_reward": 1.0, "step": 3056, "think_completion_length": 10.916666666666668 }, { "clip_ratio": 0.0, "completion_length": 134.6979217529297, "epoch": 10.327150084317031, "grad_norm": 28.163535898988556, "kl": 0.466796875, "learning_rate": 1.393581081081081e-07, "loss": 0.0005, "reward": 3.6833776235580444, "reward_std": 0.056855410104617476, "rewards/final_reward": 1.755651434260051, "rewards/mask_iou_reward": 0.8778257171300256, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6833775639533997, "rewards/thk_ans_format_reward": 1.0, "step": 3057, "think_completion_length": 8.75 }, { "clip_ratio": 0.0, "completion_length": 156.8229217529297, "epoch": 10.330522765598651, "grad_norm": 11.691020876666066, "kl": 0.8681640625, "learning_rate": 1.3907657657657656e-07, "loss": 0.0009, "reward": 3.5328752994537354, "reward_std": 0.02728255931288004, "rewards/final_reward": 1.775632264751895, "rewards/mask_iou_reward": 0.8878161323759475, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5328753590583801, "rewards/thk_ans_format_reward": 1.0, "step": 3058, "think_completion_length": 10.041666666666668 }, { "clip_ratio": 0.0, "completion_length": 181.78125, "epoch": 10.33389544688027, "grad_norm": 5.488915685669901, "kl": 0.74609375, "learning_rate": 1.3879504504504505e-07, "loss": 0.0007, "reward": 3.5203044414520264, "reward_std": 0.03370634466409683, "rewards/final_reward": 1.976008900677117, "rewards/mask_iou_reward": 0.9880044503385585, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.520304560661316, "rewards/thk_ans_format_reward": 1.0, "step": 3059, "think_completion_length": 11.625 }, { "clip_ratio": 0.0, "completion_length": 227.02083587646484, "epoch": 10.337268128161888, "grad_norm": 14.039207196930784, "kl": 0.609375, "learning_rate": 1.385135135135135e-07, "loss": 0.0006, "reward": 3.555195689201355, "reward_std": 0.12395616993308067, "rewards/final_reward": 1.818213667140335, "rewards/mask_iou_reward": 0.9091068335701675, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5551958084106445, "rewards/thk_ans_format_reward": 1.0, "step": 3060, "think_completion_length": 7.625 }, { "clip_ratio": 0.0, "completion_length": 204.59375762939453, "epoch": 10.340640809443508, "grad_norm": 11.364492284892538, "kl": 0.4423828125, "learning_rate": 1.3823198198198197e-07, "loss": 0.0004, "reward": 3.6343125104904175, "reward_std": 0.049136221408843994, "rewards/final_reward": 1.7915940520924123, "rewards/mask_iou_reward": 0.8957970260462061, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.634312391281128, "rewards/thk_ans_format_reward": 1.0, "step": 3061, "think_completion_length": 8.875 }, { "clip_ratio": 0.0, "completion_length": 229.9166717529297, "epoch": 10.344013490725127, "grad_norm": 7.810797873979519, "kl": 0.396484375, "learning_rate": 1.3795045045045046e-07, "loss": 0.0005, "reward": 3.295444369316101, "reward_std": 0.08025751262903214, "rewards/final_reward": 1.177657295365734, "rewards/mask_iou_reward": 0.588828647682867, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2954444289207458, "rewards/thk_ans_format_reward": 1.0, "step": 3062, "think_completion_length": 8.208333333333332 }, { "clip_ratio": 0.0, "completion_length": 159.68750762939453, "epoch": 10.347386172006745, "grad_norm": 14.246952677980223, "kl": 0.53515625, "learning_rate": 1.3766891891891892e-07, "loss": 0.0005, "reward": 3.6882766485214233, "reward_std": 0.087204210460186, "rewards/final_reward": 1.7750797740027533, "rewards/mask_iou_reward": 0.8875398870013766, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6882765889167786, "rewards/thk_ans_format_reward": 1.0, "step": 3063, "think_completion_length": 7.416666666666666 }, { "clip_ratio": 0.0, "completion_length": 185.50000762939453, "epoch": 10.350758853288363, "grad_norm": 44.254324086618205, "kl": 0.3994140625, "learning_rate": 1.3738738738738738e-07, "loss": 0.0004, "reward": 3.6730443239212036, "reward_std": 0.037104617804288864, "rewards/final_reward": 1.861958143782565, "rewards/mask_iou_reward": 0.9309790718912825, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6730441451072693, "rewards/thk_ans_format_reward": 1.0, "step": 3064, "think_completion_length": 9.625 }, { "clip_ratio": 0.0, "completion_length": 271.78126525878906, "epoch": 10.354131534569984, "grad_norm": 18.577352776682563, "kl": 0.5625, "learning_rate": 1.3710585585585584e-07, "loss": 0.0006, "reward": 3.7525107860565186, "reward_std": 0.03455898258835077, "rewards/final_reward": 1.7476212503404103, "rewards/mask_iou_reward": 0.8738106251702051, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7525106072425842, "rewards/thk_ans_format_reward": 1.0, "step": 3065, "think_completion_length": 8.875 }, { "clip_ratio": 0.0, "completion_length": 191.30208587646484, "epoch": 10.357504215851602, "grad_norm": 7.837123501383387, "kl": 0.4873046875, "learning_rate": 1.368243243243243e-07, "loss": 0.0005, "reward": 3.599125623703003, "reward_std": 0.03124680370092392, "rewards/final_reward": 1.267670592041667, "rewards/mask_iou_reward": 0.6338352960208335, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5991255640983582, "rewards/thk_ans_format_reward": 1.0, "step": 3066, "think_completion_length": 7.25 }, { "clip_ratio": 0.0, "completion_length": 216.2604217529297, "epoch": 10.36087689713322, "grad_norm": 12.620081651732837, "kl": 0.5390625, "learning_rate": 1.365427927927928e-07, "loss": 0.0006, "reward": 3.6787497997283936, "reward_std": 0.059250480961054564, "rewards/final_reward": 1.6039802302769672, "rewards/mask_iou_reward": 0.8019901151384836, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.678749680519104, "rewards/thk_ans_format_reward": 1.0, "step": 3067, "think_completion_length": 8.458333333333332 }, { "clip_ratio": 0.0, "completion_length": 184.84375, "epoch": 10.36424957841484, "grad_norm": 10.729647212051022, "kl": 0.3984375, "learning_rate": 1.3626126126126125e-07, "loss": 0.0004, "reward": 3.7765551805496216, "reward_std": 0.0969116073101759, "rewards/final_reward": 1.8530384947731258, "rewards/mask_iou_reward": 0.9265192473865629, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7765551805496216, "rewards/thk_ans_format_reward": 1.0, "step": 3068, "think_completion_length": 8.666666666666668 }, { "clip_ratio": 0.0, "completion_length": 171.67708587646484, "epoch": 10.367622259696459, "grad_norm": 17.869347979918643, "kl": 0.6171875, "learning_rate": 1.3597972972972974e-07, "loss": 0.0006, "reward": 3.801684260368347, "reward_std": 0.03484427556395531, "rewards/final_reward": 1.7825355703083081, "rewards/mask_iou_reward": 0.8912677851541541, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.8016842603683472, "rewards/thk_ans_format_reward": 1.0, "step": 3069, "think_completion_length": 8.541666666666666 }, { "clip_ratio": 0.0, "completion_length": 222.02084350585938, "epoch": 10.370994940978077, "grad_norm": 9.546367466466828, "kl": 0.443359375, "learning_rate": 1.356981981981982e-07, "loss": 0.0005, "reward": 3.4482284784317017, "reward_std": 0.1556754820048809, "rewards/final_reward": 1.4857851477246928, "rewards/mask_iou_reward": 0.7428925738623464, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4482285380363464, "rewards/thk_ans_format_reward": 1.0, "step": 3070, "think_completion_length": 8.916666666666666 }, { "clip_ratio": 0.0, "completion_length": 268.125, "epoch": 10.374367622259696, "grad_norm": 24.998556434514093, "kl": 0.4150390625, "learning_rate": 1.3541666666666666e-07, "loss": 0.0004, "reward": 3.7703176736831665, "reward_std": 0.019884683191776276, "rewards/final_reward": 1.695647588416222, "rewards/mask_iou_reward": 0.847823794208111, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.770317792892456, "rewards/thk_ans_format_reward": 1.0, "step": 3071, "think_completion_length": 8.166666666666668 }, { "clip_ratio": 0.0, "completion_length": 207.15625, "epoch": 10.377740303541316, "grad_norm": 10.177827488944438, "kl": 0.46875, "learning_rate": 1.3513513513513515e-07, "loss": 0.0005, "reward": 3.3514381647109985, "reward_std": 0.1550460159778595, "rewards/final_reward": 1.6597564033637382, "rewards/mask_iou_reward": 0.8298782016818691, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3514381647109985, "rewards/thk_ans_format_reward": 1.0, "step": 3072, "think_completion_length": 9.666666666666668 }, { "clip_ratio": 0.0, "completion_length": 216.32292938232422, "epoch": 10.381112984822934, "grad_norm": 8.509582773501355, "kl": 0.447265625, "learning_rate": 1.348536036036036e-07, "loss": 0.0005, "reward": 3.4990919828414917, "reward_std": 0.07601478323340416, "rewards/final_reward": 1.8712923387097589, "rewards/mask_iou_reward": 0.9356461693548794, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4990919828414917, "rewards/thk_ans_format_reward": 1.0, "step": 3073, "think_completion_length": 7.416666666666666 }, { "clip_ratio": 0.0, "completion_length": 284.7708435058594, "epoch": 10.384485666104553, "grad_norm": 5.405843324927532, "kl": 0.36328125, "learning_rate": 1.3457207207207207e-07, "loss": 0.0004, "reward": 3.723816156387329, "reward_std": 0.04175476357340813, "rewards/final_reward": 1.6051074212975824, "rewards/mask_iou_reward": 0.8025537106487912, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7238160967826843, "rewards/thk_ans_format_reward": 1.0, "step": 3074, "think_completion_length": 8.375 }, { "clip_ratio": 0.0, "completion_length": 210.0729217529297, "epoch": 10.387858347386173, "grad_norm": 5.879979396345877, "kl": 0.49609375, "learning_rate": 1.3429054054054053e-07, "loss": 0.0005, "reward": 3.746949791908264, "reward_std": 0.0503030139952898, "rewards/final_reward": 1.7305502081549002, "rewards/mask_iou_reward": 0.8652751040774501, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.746949553489685, "rewards/thk_ans_format_reward": 1.0, "step": 3075, "think_completion_length": 8.458333333333332 }, { "clip_ratio": 0.0, "completion_length": 224.6354217529297, "epoch": 10.391231028667791, "grad_norm": 26.78246561208562, "kl": 0.3935546875, "learning_rate": 1.34009009009009e-07, "loss": 0.0005, "reward": 3.5735831260681152, "reward_std": 0.04332230752333999, "rewards/final_reward": 1.248773449844379, "rewards/mask_iou_reward": 0.6243867249221895, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5735830664634705, "rewards/thk_ans_format_reward": 1.0, "step": 3076, "think_completion_length": 7.5 }, { "clip_ratio": 0.0, "completion_length": 194.43750762939453, "epoch": 10.39460370994941, "grad_norm": 10.001276634062897, "kl": 0.4296875, "learning_rate": 1.3372747747747748e-07, "loss": 0.0004, "reward": 3.7090967893600464, "reward_std": 0.03287575836293399, "rewards/final_reward": 1.9403098107212258, "rewards/mask_iou_reward": 0.9701549053606129, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7090969681739807, "rewards/thk_ans_format_reward": 1.0, "step": 3077, "think_completion_length": 8.333333333333334 }, { "clip_ratio": 0.0, "completion_length": 172.25000762939453, "epoch": 10.397976391231028, "grad_norm": 18.211080017805312, "kl": 0.49609375, "learning_rate": 1.3344594594594594e-07, "loss": 0.0005, "reward": 3.458000421524048, "reward_std": 0.05489188525825739, "rewards/final_reward": 1.5067666567106284, "rewards/mask_iou_reward": 0.7533833283553142, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4580003023147583, "rewards/thk_ans_format_reward": 1.0, "step": 3078, "think_completion_length": 9.458333333333334 }, { "clip_ratio": 0.0, "completion_length": 224.8541717529297, "epoch": 10.401349072512648, "grad_norm": 14.014103313482549, "kl": 0.4716796875, "learning_rate": 1.3316441441441442e-07, "loss": 0.0005, "reward": 3.5899453163146973, "reward_std": 0.056886350736021996, "rewards/final_reward": 1.6582750067291117, "rewards/mask_iou_reward": 0.8291375033645558, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5899450778961182, "rewards/thk_ans_format_reward": 1.0, "step": 3079, "think_completion_length": 8.166666666666666 }, { "clip_ratio": 0.0, "completion_length": 172.2604217529297, "epoch": 10.404721753794266, "grad_norm": 11.468827152465424, "kl": 0.681640625, "learning_rate": 1.3288288288288288e-07, "loss": 0.0007, "reward": 3.7649770975112915, "reward_std": 0.07069988921284676, "rewards/final_reward": 1.8338776917010735, "rewards/mask_iou_reward": 0.9169388458505368, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7649770379066467, "rewards/thk_ans_format_reward": 1.0, "step": 3080, "think_completion_length": 7.791666666666666 }, { "clip_ratio": 0.0, "completion_length": 171.6458396911621, "epoch": 10.408094435075885, "grad_norm": 11.085719211351236, "kl": 0.4833984375, "learning_rate": 1.3260135135135134e-07, "loss": 0.0005, "reward": 3.5876041650772095, "reward_std": 0.19347361475229263, "rewards/final_reward": 1.6700500913049008, "rewards/mask_iou_reward": 0.8350250456524504, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 1.5980209112167358, "rewards/thk_ans_format_reward": 1.0, "step": 3081, "think_completion_length": 7.625 }, { "clip_ratio": 0.0, "completion_length": 203.11458587646484, "epoch": 10.411467116357505, "grad_norm": 38.16946217127505, "kl": 0.4169921875, "learning_rate": 1.3231981981981983e-07, "loss": 0.0004, "reward": 3.5083699226379395, "reward_std": 0.045381806790828705, "rewards/final_reward": 1.7874092424230517, "rewards/mask_iou_reward": 0.8937046212115258, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5083699822425842, "rewards/thk_ans_format_reward": 1.0, "step": 3082, "think_completion_length": 8.75 }, { "clip_ratio": 0.0, "completion_length": 212.33334350585938, "epoch": 10.414839797639123, "grad_norm": 10.40371028482117, "kl": 0.564453125, "learning_rate": 1.320382882882883e-07, "loss": 0.0006, "reward": 3.4012542963027954, "reward_std": 0.13047372177243233, "rewards/final_reward": 1.448548234038224, "rewards/mask_iou_reward": 0.724274117019112, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 1.4220874905586243, "rewards/thk_ans_format_reward": 0.9895833432674408, "step": 3083, "think_completion_length": 9.083333333333334 }, { "clip_ratio": 0.0, "completion_length": 176.75, "epoch": 10.418212478920742, "grad_norm": 12.777325204116849, "kl": 0.517578125, "learning_rate": 1.3175675675675673e-07, "loss": 0.0005, "reward": 3.7277798652648926, "reward_std": 0.05290667526423931, "rewards/final_reward": 1.7855710697263598, "rewards/mask_iou_reward": 0.8927855348631799, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.727779746055603, "rewards/thk_ans_format_reward": 1.0, "step": 3084, "think_completion_length": 7.916666666666667 }, { "clip_ratio": 0.0, "completion_length": 140.33334350585938, "epoch": 10.42158516020236, "grad_norm": 27.100025984014987, "kl": 0.4765625, "learning_rate": 1.3147522522522521e-07, "loss": 0.0005, "reward": 3.613362193107605, "reward_std": 0.05754461046308279, "rewards/final_reward": 1.8879014683702529, "rewards/mask_iou_reward": 0.9439507341851264, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6133623123168945, "rewards/thk_ans_format_reward": 1.0, "step": 3085, "think_completion_length": 7.625 }, { "clip_ratio": 0.0, "completion_length": 176.6875, "epoch": 10.42495784148398, "grad_norm": 10.037161786953995, "kl": 0.505859375, "learning_rate": 1.3119369369369367e-07, "loss": 0.0005, "reward": 3.4091343879699707, "reward_std": 0.09373841434717178, "rewards/final_reward": 1.3407313291318463, "rewards/mask_iou_reward": 0.6703656645659232, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4091344475746155, "rewards/thk_ans_format_reward": 1.0, "step": 3086, "think_completion_length": 8.125 }, { "clip_ratio": 0.0, "completion_length": 199.6041717529297, "epoch": 10.428330522765599, "grad_norm": 12.974808226706712, "kl": 0.4169921875, "learning_rate": 1.3091216216216216e-07, "loss": 0.0004, "reward": 3.6260504722595215, "reward_std": 0.042093608528375626, "rewards/final_reward": 1.736496195289474, "rewards/mask_iou_reward": 0.868248097644737, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6260504722595215, "rewards/thk_ans_format_reward": 1.0, "step": 3087, "think_completion_length": 7.791666666666667 }, { "clip_ratio": 0.0, "completion_length": 172.95833587646484, "epoch": 10.431703204047217, "grad_norm": 11.04435290865121, "kl": 0.42578125, "learning_rate": 1.3063063063063062e-07, "loss": 0.0005, "reward": 3.579022169113159, "reward_std": 0.05499119684100151, "rewards/final_reward": 1.669786111592614, "rewards/mask_iou_reward": 0.834893055796307, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5790221095085144, "rewards/thk_ans_format_reward": 1.0, "step": 3088, "think_completion_length": 8.25 }, { "clip_ratio": 0.0, "completion_length": 129.1458396911621, "epoch": 10.435075885328837, "grad_norm": 11.159996455611521, "kl": 0.59765625, "learning_rate": 1.3034909909909908e-07, "loss": 0.0006, "reward": 3.652455687522888, "reward_std": 0.02265701163560152, "rewards/final_reward": 1.8173958249268871, "rewards/mask_iou_reward": 0.9086979124634436, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6524556279182434, "rewards/thk_ans_format_reward": 1.0, "step": 3089, "think_completion_length": 8.583333333333332 }, { "clip_ratio": 0.0, "completion_length": 175.67708587646484, "epoch": 10.438448566610456, "grad_norm": 8.753344860212746, "kl": 0.5224609375, "learning_rate": 1.3006756756756757e-07, "loss": 0.0005, "reward": 3.7730822563171387, "reward_std": 0.030583191197365522, "rewards/final_reward": 1.6578078450498204, "rewards/mask_iou_reward": 0.8289039225249102, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7730821371078491, "rewards/thk_ans_format_reward": 1.0, "step": 3090, "think_completion_length": 8.25 }, { "clip_ratio": 0.0, "completion_length": 250.59376525878906, "epoch": 10.441821247892074, "grad_norm": 12.035103836623776, "kl": 0.4619140625, "learning_rate": 1.2978603603603603e-07, "loss": 0.0007, "reward": 3.6348942518234253, "reward_std": 0.057315885089337826, "rewards/final_reward": 1.4426078250224328, "rewards/mask_iou_reward": 0.7213039125112164, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6348941922187805, "rewards/thk_ans_format_reward": 1.0, "step": 3091, "think_completion_length": 7.916666666666666 }, { "clip_ratio": 0.0, "completion_length": 183.6354217529297, "epoch": 10.445193929173692, "grad_norm": 21.070982930474848, "kl": 0.61328125, "learning_rate": 1.2950450450450452e-07, "loss": 0.0006, "reward": 3.4380534887313843, "reward_std": 0.04184722830541432, "rewards/final_reward": 1.6863444507629848, "rewards/mask_iou_reward": 0.8431722253814924, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.43805330991745, "rewards/thk_ans_format_reward": 1.0, "step": 3092, "think_completion_length": 9.0 }, { "clip_ratio": 0.0, "completion_length": 192.65625, "epoch": 10.448566610455313, "grad_norm": 18.121357835908366, "kl": 0.66015625, "learning_rate": 1.2922297297297298e-07, "loss": 0.0007, "reward": 3.4254337549209595, "reward_std": 0.05125655606389046, "rewards/final_reward": 1.0593367921198502, "rewards/mask_iou_reward": 0.5296683960599251, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4254336953163147, "rewards/thk_ans_format_reward": 1.0, "step": 3093, "think_completion_length": 8.875 }, { "clip_ratio": 0.0, "completion_length": 203.55208587646484, "epoch": 10.451939291736931, "grad_norm": 6.635819206062496, "kl": 0.52734375, "learning_rate": 1.289414414414414e-07, "loss": 0.0005, "reward": 3.7229537963867188, "reward_std": 0.030687447171658278, "rewards/final_reward": 1.7149159311178637, "rewards/mask_iou_reward": 0.8574579655589318, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7229537963867188, "rewards/thk_ans_format_reward": 1.0, "step": 3094, "think_completion_length": 8.791666666666668 }, { "clip_ratio": 0.0, "completion_length": 156.67708587646484, "epoch": 10.45531197301855, "grad_norm": 36.00135551494365, "kl": 0.46875, "learning_rate": 1.286599099099099e-07, "loss": 0.0005, "reward": 3.5245972871780396, "reward_std": 0.048061273992061615, "rewards/final_reward": 1.2067712422147152, "rewards/mask_iou_reward": 0.6033856211073576, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.52459716796875, "rewards/thk_ans_format_reward": 1.0, "step": 3095, "think_completion_length": 8.541666666666668 }, { "clip_ratio": 0.0, "completion_length": 206.8125, "epoch": 10.45868465430017, "grad_norm": 6.8862624024723935, "kl": 0.5234375, "learning_rate": 1.2837837837837836e-07, "loss": 0.0005, "reward": 3.550014853477478, "reward_std": 0.036302256397902966, "rewards/final_reward": 0.8265123227011708, "rewards/mask_iou_reward": 0.4132561613505854, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5500149130821228, "rewards/thk_ans_format_reward": 1.0, "step": 3096, "think_completion_length": 8.333333333333334 }, { "clip_ratio": 0.0, "completion_length": 174.46875762939453, "epoch": 10.462057335581788, "grad_norm": 35.856713973040975, "kl": 0.501953125, "learning_rate": 1.2809684684684685e-07, "loss": 0.0005, "reward": 3.73908531665802, "reward_std": 0.0826911348849535, "rewards/final_reward": 1.7057263593186214, "rewards/mask_iou_reward": 0.8528631796593107, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7390851378440857, "rewards/thk_ans_format_reward": 1.0, "step": 3097, "think_completion_length": 7.583333333333334 }, { "clip_ratio": 0.0, "completion_length": 214.90626525878906, "epoch": 10.465430016863406, "grad_norm": 7.584187280860042, "kl": 0.4541015625, "learning_rate": 1.278153153153153e-07, "loss": 0.0005, "reward": 3.591671943664551, "reward_std": 0.13332638936117291, "rewards/final_reward": 1.9272358695258596, "rewards/mask_iou_reward": 0.9636179347629298, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 1.6125050783157349, "rewards/thk_ans_format_reward": 0.9895833432674408, "step": 3098, "think_completion_length": 8.916666666666668 }, { "clip_ratio": 0.0, "completion_length": 229.86459350585938, "epoch": 10.468802698145025, "grad_norm": 13.897238219395529, "kl": 0.4208984375, "learning_rate": 1.2753378378378377e-07, "loss": 0.0004, "reward": 3.6895748376846313, "reward_std": 0.024271592497825623, "rewards/final_reward": 1.8354064804276868, "rewards/mask_iou_reward": 0.9177032402138434, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6895748972892761, "rewards/thk_ans_format_reward": 1.0, "step": 3099, "think_completion_length": 8.458333333333334 }, { "clip_ratio": 0.0, "completion_length": 245.40626525878906, "epoch": 10.472175379426645, "grad_norm": 47.21938702989025, "kl": 0.3671875, "learning_rate": 1.2725225225225226e-07, "loss": 0.0003, "reward": 3.7118654251098633, "reward_std": 0.031610630452632904, "rewards/final_reward": 1.547019004124051, "rewards/mask_iou_reward": 0.7735095020620255, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7118653059005737, "rewards/thk_ans_format_reward": 1.0, "step": 3100, "think_completion_length": 7.791666666666667 }, { "clip_ratio": 0.0, "completion_length": 120.85417175292969, "epoch": 10.475548060708263, "grad_norm": 11.341914701976625, "kl": 0.4951171875, "learning_rate": 1.2697072072072072e-07, "loss": 0.0005, "reward": 3.369482636451721, "reward_std": 0.024935521185398102, "rewards/final_reward": 1.9687640793498886, "rewards/mask_iou_reward": 0.9843820396749443, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.369482398033142, "rewards/thk_ans_format_reward": 1.0, "step": 3101, "think_completion_length": 7.583333333333333 }, { "clip_ratio": 0.0, "completion_length": 250.89583587646484, "epoch": 10.478920741989882, "grad_norm": 8.101147783530548, "kl": 0.3388671875, "learning_rate": 1.266891891891892e-07, "loss": 0.0004, "reward": 3.565877318382263, "reward_std": 0.029876575339585543, "rewards/final_reward": 1.7556893977145163, "rewards/mask_iou_reward": 0.8778446988572581, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5658771991729736, "rewards/thk_ans_format_reward": 1.0, "step": 3102, "think_completion_length": 9.333333333333332 }, { "clip_ratio": 0.0, "completion_length": 221.30208587646484, "epoch": 10.4822934232715, "grad_norm": 12.216823149622934, "kl": 0.4443359375, "learning_rate": 1.2640765765765766e-07, "loss": 0.0005, "reward": 3.601265549659729, "reward_std": 0.07581897638738155, "rewards/final_reward": 1.9240982076614976, "rewards/mask_iou_reward": 0.9620491038307488, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6012657284736633, "rewards/thk_ans_format_reward": 1.0, "step": 3103, "think_completion_length": 7.5 }, { "clip_ratio": 0.0, "completion_length": 180.9791717529297, "epoch": 10.48566610455312, "grad_norm": 10.304927264889335, "kl": 0.4453125, "learning_rate": 1.261261261261261e-07, "loss": 0.0005, "reward": 3.7941430807113647, "reward_std": 0.11498380824923515, "rewards/final_reward": 1.7165435770248911, "rewards/mask_iou_reward": 0.8582717885124456, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 1.8045597076416016, "rewards/thk_ans_format_reward": 1.0, "step": 3104, "think_completion_length": 8.25 }, { "clip_ratio": 0.0, "completion_length": 199.86459350585938, "epoch": 10.489038785834738, "grad_norm": 8.780323726364982, "kl": 0.5126953125, "learning_rate": 1.2584459459459459e-07, "loss": 0.0005, "reward": 3.632111072540283, "reward_std": 0.09831107221543789, "rewards/final_reward": 1.5393308961893373, "rewards/mask_iou_reward": 0.7696654480946686, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6321112513542175, "rewards/thk_ans_format_reward": 1.0, "step": 3105, "think_completion_length": 8.458333333333332 }, { "clip_ratio": 0.0, "completion_length": 291.00001525878906, "epoch": 10.492411467116357, "grad_norm": 8.055355884111894, "kl": 0.3359375, "learning_rate": 1.2556306306306305e-07, "loss": 0.0003, "reward": 3.6069531440734863, "reward_std": 0.03117097169160843, "rewards/final_reward": 1.7884215969975852, "rewards/mask_iou_reward": 0.8942107984987926, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6069526672363281, "rewards/thk_ans_format_reward": 1.0, "step": 3106, "think_completion_length": 7.541666666666667 }, { "clip_ratio": 0.0, "completion_length": 200.92709350585938, "epoch": 10.495784148397977, "grad_norm": 10.36770739560577, "kl": 0.4267578125, "learning_rate": 1.2528153153153153e-07, "loss": 0.0004, "reward": 3.5954439640045166, "reward_std": 0.015956447925418615, "rewards/final_reward": 1.8034319018309988, "rewards/mask_iou_reward": 0.9017159509154994, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5954439043998718, "rewards/thk_ans_format_reward": 1.0, "step": 3107, "think_completion_length": 8.333333333333334 }, { "clip_ratio": 0.0, "completion_length": 197.40625762939453, "epoch": 10.499156829679595, "grad_norm": 18.19336016512648, "kl": 0.4248046875, "learning_rate": 1.25e-07, "loss": 0.0005, "reward": 3.6256226301193237, "reward_std": 0.046171706169843674, "rewards/final_reward": 1.875848951219222, "rewards/mask_iou_reward": 0.937924475609611, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6256226301193237, "rewards/thk_ans_format_reward": 1.0, "step": 3108, "think_completion_length": 7.625 }, { "clip_ratio": 0.0, "completion_length": 189.55209350585938, "epoch": 10.502529510961214, "grad_norm": 9.518348775740566, "kl": 0.4599609375, "learning_rate": 1.2471846846846846e-07, "loss": 0.0004, "reward": 3.7291589975357056, "reward_std": 0.021761665120720863, "rewards/final_reward": 1.7267013531189652, "rewards/mask_iou_reward": 0.8633506765594826, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7291589379310608, "rewards/thk_ans_format_reward": 1.0, "step": 3109, "think_completion_length": 7.375 }, { "clip_ratio": 0.0, "completion_length": 178.87500762939453, "epoch": 10.505902192242832, "grad_norm": 6.22035865216061, "kl": 0.5078125, "learning_rate": 1.2443693693693694e-07, "loss": 0.0005, "reward": 3.3314541578292847, "reward_std": 0.13443849235773087, "rewards/final_reward": 1.6590253812713955, "rewards/mask_iou_reward": 0.8295126906356978, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3314539790153503, "rewards/thk_ans_format_reward": 1.0, "step": 3110, "think_completion_length": 9.708333333333332 }, { "clip_ratio": 0.0, "completion_length": 270.56251525878906, "epoch": 10.509274873524452, "grad_norm": 6.872467141781061, "kl": 0.46875, "learning_rate": 1.241554054054054e-07, "loss": 0.0005, "reward": 3.4900680780410767, "reward_std": 0.038352854549884796, "rewards/final_reward": 1.909817570806116, "rewards/mask_iou_reward": 0.954908785403058, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4900680184364319, "rewards/thk_ans_format_reward": 1.0, "step": 3111, "think_completion_length": 8.708333333333334 }, { "clip_ratio": 0.0, "completion_length": 296.56251525878906, "epoch": 10.51264755480607, "grad_norm": 10.280674378090223, "kl": 0.4111328125, "learning_rate": 1.2387387387387386e-07, "loss": 0.0004, "reward": 3.6083072423934937, "reward_std": 0.04970341920852661, "rewards/final_reward": 1.6760755903575955, "rewards/mask_iou_reward": 0.8380377951787977, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.608307123184204, "rewards/thk_ans_format_reward": 1.0, "step": 3112, "think_completion_length": 7.375 }, { "clip_ratio": 0.0, "completion_length": 264.1354217529297, "epoch": 10.516020236087689, "grad_norm": 23.28563797167097, "kl": 0.431640625, "learning_rate": 1.2359234234234232e-07, "loss": 0.0005, "reward": 3.634291648864746, "reward_std": 0.02619549073278904, "rewards/final_reward": 1.748686872196122, "rewards/mask_iou_reward": 0.874343436098061, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6342918276786804, "rewards/thk_ans_format_reward": 1.0, "step": 3113, "think_completion_length": 7.75 }, { "clip_ratio": 0.0, "completion_length": 202.7916717529297, "epoch": 10.51939291736931, "grad_norm": 10.582982961174917, "kl": 0.42578125, "learning_rate": 1.233108108108108e-07, "loss": 0.0004, "reward": 3.5773465633392334, "reward_std": 0.08526142570190132, "rewards/final_reward": 1.6190797931129668, "rewards/mask_iou_reward": 0.8095398965564834, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5773464441299438, "rewards/thk_ans_format_reward": 1.0, "step": 3114, "think_completion_length": 8.041666666666668 }, { "clip_ratio": 0.0, "completion_length": 253.47917938232422, "epoch": 10.522765598650928, "grad_norm": 11.808159257005988, "kl": 0.4267578125, "learning_rate": 1.2302927927927927e-07, "loss": 0.0004, "reward": 3.7226184606552124, "reward_std": 0.024431753903627396, "rewards/final_reward": 1.778847208369364, "rewards/mask_iou_reward": 0.889423604184682, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7226186394691467, "rewards/thk_ans_format_reward": 1.0, "step": 3115, "think_completion_length": 8.291666666666666 }, { "clip_ratio": 0.0, "completion_length": 183.65625, "epoch": 10.526138279932546, "grad_norm": 50.688113567662526, "kl": 0.634765625, "learning_rate": 1.2274774774774773e-07, "loss": 0.0006, "reward": 3.64953076839447, "reward_std": 0.04483196325600147, "rewards/final_reward": 1.8453000319146462, "rewards/mask_iou_reward": 0.9226500159573231, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6495305895805359, "rewards/thk_ans_format_reward": 1.0, "step": 3116, "think_completion_length": 7.541666666666666 }, { "clip_ratio": 0.0, "completion_length": 159.28125762939453, "epoch": 10.529510961214164, "grad_norm": 8.422268951168364, "kl": 0.44921875, "learning_rate": 1.2246621621621622e-07, "loss": 0.0005, "reward": 3.286513924598694, "reward_std": 0.055416141636669636, "rewards/final_reward": 0.2872778971173338, "rewards/mask_iou_reward": 0.1436389485586669, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2865139245986938, "rewards/thk_ans_format_reward": 1.0, "step": 3117, "think_completion_length": 7.791666666666666 }, { "clip_ratio": 0.0, "completion_length": 168.50000762939453, "epoch": 10.532883642495785, "grad_norm": 10.599310623729721, "kl": 0.5302734375, "learning_rate": 1.2218468468468468e-07, "loss": 0.0006, "reward": 3.5024014711380005, "reward_std": 0.028407627250999212, "rewards/final_reward": 1.8209042963764035, "rewards/mask_iou_reward": 0.9104521481882017, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5024012327194214, "rewards/thk_ans_format_reward": 1.0, "step": 3118, "think_completion_length": 8.791666666666666 }, { "clip_ratio": 0.0, "completion_length": 192.3854217529297, "epoch": 10.536256323777403, "grad_norm": 34.94987635831581, "kl": 0.47265625, "learning_rate": 1.2190315315315314e-07, "loss": 0.0005, "reward": 3.6050848960876465, "reward_std": 0.1102399006485939, "rewards/final_reward": 1.8166443892282316, "rewards/mask_iou_reward": 0.9083221946141158, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.605084776878357, "rewards/thk_ans_format_reward": 1.0, "step": 3119, "think_completion_length": 7.833333333333333 }, { "clip_ratio": 0.0, "completion_length": 202.70833587646484, "epoch": 10.539629005059021, "grad_norm": 8.738350824887798, "kl": 0.439453125, "learning_rate": 1.2162162162162163e-07, "loss": 0.0005, "reward": 3.62838351726532, "reward_std": 0.025765706785023212, "rewards/final_reward": 1.4891728952793306, "rewards/mask_iou_reward": 0.7445864476396653, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6283833384513855, "rewards/thk_ans_format_reward": 1.0, "step": 3120, "think_completion_length": 7.291666666666666 }, { "clip_ratio": 0.0, "completion_length": 268.3854217529297, "epoch": 10.543001686340641, "grad_norm": 15.598197022156395, "kl": 0.37109375, "learning_rate": 1.213400900900901e-07, "loss": 0.0004, "reward": 3.7006378173828125, "reward_std": 0.040458193980157375, "rewards/final_reward": 1.4425537032359959, "rewards/mask_iou_reward": 0.7212768516179979, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.700637698173523, "rewards/thk_ans_format_reward": 1.0, "step": 3121, "think_completion_length": 8.0 }, { "clip_ratio": 0.0, "completion_length": 186.27083587646484, "epoch": 10.54637436762226, "grad_norm": 13.870187339867892, "kl": 0.599609375, "learning_rate": 1.2105855855855855e-07, "loss": 0.0006, "reward": 3.5342659950256348, "reward_std": 0.02030755707528442, "rewards/final_reward": 1.8798160791428677, "rewards/mask_iou_reward": 0.9399080395714339, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5342658758163452, "rewards/thk_ans_format_reward": 1.0, "step": 3122, "think_completion_length": 7.125 }, { "clip_ratio": 0.0, "completion_length": 200.4479217529297, "epoch": 10.549747048903878, "grad_norm": 21.325236325585628, "kl": 0.4404296875, "learning_rate": 1.20777027027027e-07, "loss": 0.0005, "reward": 3.6597487926483154, "reward_std": 0.07287406735122204, "rewards/final_reward": 1.4427094025292027, "rewards/mask_iou_reward": 0.7213547012646013, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6597489714622498, "rewards/thk_ans_format_reward": 1.0, "step": 3123, "think_completion_length": 9.0 }, { "clip_ratio": 0.0, "completion_length": 238.30209350585938, "epoch": 10.553119730185497, "grad_norm": 75.3481537657436, "kl": 1.54296875, "learning_rate": 1.204954954954955e-07, "loss": 0.0016, "reward": 3.491469383239746, "reward_std": 0.054703426314517856, "rewards/final_reward": 1.8018433956168733, "rewards/mask_iou_reward": 0.9009216978084367, "rewards/sam_format_reward": 0.9791666865348816, "rewards/sam_reward_func_ultra": 1.5123026967048645, "rewards/thk_ans_format_reward": 1.0, "step": 3124, "think_completion_length": 6.833333333333333 }, { "clip_ratio": 0.0, "completion_length": 167.1041717529297, "epoch": 10.556492411467117, "grad_norm": 10.292393267531867, "kl": 0.47265625, "learning_rate": 1.2021396396396396e-07, "loss": 0.0005, "reward": 3.3591725826263428, "reward_std": 0.023047425784170628, "rewards/final_reward": 1.831021463665678, "rewards/mask_iou_reward": 0.915510731832839, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3591725826263428, "rewards/thk_ans_format_reward": 1.0, "step": 3125, "think_completion_length": 7.666666666666667 }, { "clip_ratio": 0.0, "completion_length": 263.2916717529297, "epoch": 10.559865092748735, "grad_norm": 39.62698803266399, "kl": 0.3984375, "learning_rate": 1.1993243243243242e-07, "loss": 0.0004, "reward": 3.537477135658264, "reward_std": 0.06482739746570587, "rewards/final_reward": 1.289090077875543, "rewards/mask_iou_reward": 0.6445450389377715, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5374772548675537, "rewards/thk_ans_format_reward": 1.0, "step": 3126, "think_completion_length": 7.75 }, { "clip_ratio": 0.0, "completion_length": 274.15626525878906, "epoch": 10.563237774030354, "grad_norm": 9.282324146659736, "kl": 0.3828125, "learning_rate": 1.196509009009009e-07, "loss": 0.0004, "reward": 3.5415862798690796, "reward_std": 0.09624642692506313, "rewards/final_reward": 1.480154818709382, "rewards/mask_iou_reward": 0.740077409354691, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5415863990783691, "rewards/thk_ans_format_reward": 1.0, "step": 3127, "think_completion_length": 7.541666666666666 }, { "clip_ratio": 0.0, "completion_length": 160.15625762939453, "epoch": 10.566610455311974, "grad_norm": 9.707291647146798, "kl": 0.4462890625, "learning_rate": 1.1936936936936937e-07, "loss": 0.0005, "reward": 3.565924644470215, "reward_std": 0.030608173459768295, "rewards/final_reward": 1.7547236881395065, "rewards/mask_iou_reward": 0.8773618440697533, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5659246444702148, "rewards/thk_ans_format_reward": 1.0, "step": 3128, "think_completion_length": 8.708333333333332 }, { "clip_ratio": 0.0, "completion_length": 177.6979217529297, "epoch": 10.569983136593592, "grad_norm": 18.0298815257855, "kl": 0.5205078125, "learning_rate": 1.1908783783783784e-07, "loss": 0.0005, "reward": 3.5914461612701416, "reward_std": 0.02858224418014288, "rewards/final_reward": 1.4344778908627398, "rewards/mask_iou_reward": 0.7172389454313699, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5914462804794312, "rewards/thk_ans_format_reward": 1.0, "step": 3129, "think_completion_length": 8.416666666666666 }, { "clip_ratio": 0.0, "completion_length": 235.4166717529297, "epoch": 10.57335581787521, "grad_norm": 6.43292906131864, "kl": 0.4423828125, "learning_rate": 1.188063063063063e-07, "loss": 0.0005, "reward": 3.5320615768432617, "reward_std": 0.0571563933044672, "rewards/final_reward": 1.7172090882626136, "rewards/mask_iou_reward": 0.8586045441313068, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5320618152618408, "rewards/thk_ans_format_reward": 1.0, "step": 3130, "think_completion_length": 8.041666666666666 }, { "clip_ratio": 0.0, "completion_length": 198.62500762939453, "epoch": 10.576728499156829, "grad_norm": 18.8093852353866, "kl": 0.50390625, "learning_rate": 1.1852477477477476e-07, "loss": 0.0005, "reward": 3.671657681465149, "reward_std": 0.030323058366775513, "rewards/final_reward": 1.4895206263686176, "rewards/mask_iou_reward": 0.7447603131843088, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6716576218605042, "rewards/thk_ans_format_reward": 1.0, "step": 3131, "think_completion_length": 7.541666666666667 }, { "clip_ratio": 0.0, "completion_length": 209.36459350585938, "epoch": 10.580101180438449, "grad_norm": 16.060535556302863, "kl": 0.4130859375, "learning_rate": 1.1824324324324324e-07, "loss": 0.0004, "reward": 3.5097408294677734, "reward_std": 0.0780508778989315, "rewards/final_reward": 1.3697000814567954, "rewards/mask_iou_reward": 0.6848500407283977, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5097407102584839, "rewards/thk_ans_format_reward": 1.0, "step": 3132, "think_completion_length": 8.083333333333332 }, { "clip_ratio": 0.0, "completion_length": 213.9791717529297, "epoch": 10.583473861720067, "grad_norm": 22.222040156683512, "kl": 0.619140625, "learning_rate": 1.1796171171171171e-07, "loss": 0.0006, "reward": 3.2384536266326904, "reward_std": 0.21653933450579643, "rewards/final_reward": 1.4502180401317724, "rewards/mask_iou_reward": 0.7251090200658862, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2384536862373352, "rewards/thk_ans_format_reward": 1.0, "step": 3133, "think_completion_length": 7.583333333333334 }, { "clip_ratio": 0.0, "completion_length": 227.875, "epoch": 10.586846543001686, "grad_norm": 12.796673567815091, "kl": 0.3876953125, "learning_rate": 1.1768018018018018e-07, "loss": 0.0004, "reward": 3.5177581310272217, "reward_std": 0.04781328607350588, "rewards/final_reward": 1.688266574289527, "rewards/mask_iou_reward": 0.8441332871447635, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.517758071422577, "rewards/thk_ans_format_reward": 1.0, "step": 3134, "think_completion_length": 8.083333333333334 }, { "clip_ratio": 0.0, "completion_length": 150.9791717529297, "epoch": 10.590219224283306, "grad_norm": 37.131481922445516, "kl": 0.630859375, "learning_rate": 1.1739864864864864e-07, "loss": 0.0006, "reward": 3.4619154930114746, "reward_std": 0.03177159791812301, "rewards/final_reward": 1.81192326704147, "rewards/mask_iou_reward": 0.905961633520735, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4619154930114746, "rewards/thk_ans_format_reward": 1.0, "step": 3135, "think_completion_length": 8.083333333333332 }, { "clip_ratio": 0.0, "completion_length": 249.83334350585938, "epoch": 10.593591905564924, "grad_norm": 9.559126232506571, "kl": 0.52734375, "learning_rate": 1.171171171171171e-07, "loss": 0.0005, "reward": 3.460838556289673, "reward_std": 0.030008337926119566, "rewards/final_reward": 0.8600520619291397, "rewards/mask_iou_reward": 0.43002603096456987, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4608385562896729, "rewards/thk_ans_format_reward": 1.0, "step": 3136, "think_completion_length": 7.416666666666666 }, { "clip_ratio": 0.0, "completion_length": 233.61458587646484, "epoch": 10.596964586846543, "grad_norm": 8.868938047375126, "kl": 0.4970703125, "learning_rate": 1.1683558558558558e-07, "loss": 0.0005, "reward": 3.5875900983810425, "reward_std": 0.06950164213776588, "rewards/final_reward": 0.9352956134800028, "rewards/mask_iou_reward": 0.4676478067400014, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5875900983810425, "rewards/thk_ans_format_reward": 1.0, "step": 3137, "think_completion_length": 8.125 }, { "clip_ratio": 0.0, "completion_length": 176.34375762939453, "epoch": 10.600337268128161, "grad_norm": 8.631692924608684, "kl": 0.4736328125, "learning_rate": 1.1655405405405405e-07, "loss": 0.0005, "reward": 3.4805089235305786, "reward_std": 0.03367413394153118, "rewards/final_reward": 1.632930184323556, "rewards/mask_iou_reward": 0.816465092161778, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4805086851119995, "rewards/thk_ans_format_reward": 1.0, "step": 3138, "think_completion_length": 7.666666666666666 }, { "clip_ratio": 0.0, "completion_length": 231.97917938232422, "epoch": 10.603709949409781, "grad_norm": 6.632640004769775, "kl": 0.4287109375, "learning_rate": 1.1627252252252253e-07, "loss": 0.0004, "reward": 3.681812882423401, "reward_std": 0.1951524093747139, "rewards/final_reward": 1.7462302154525413, "rewards/mask_iou_reward": 0.8731151077262707, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 1.7026461362838745, "rewards/thk_ans_format_reward": 0.9895833432674408, "step": 3139, "think_completion_length": 9.791666666666668 }, { "clip_ratio": 0.0, "completion_length": 202.40625762939453, "epoch": 10.6070826306914, "grad_norm": 13.731958002321711, "kl": 0.4052734375, "learning_rate": 1.1599099099099097e-07, "loss": 0.0004, "reward": 3.4694260358810425, "reward_std": 0.05269887298345566, "rewards/final_reward": 1.4713394746301809, "rewards/mask_iou_reward": 0.7356697373150904, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4694259762763977, "rewards/thk_ans_format_reward": 1.0, "step": 3140, "think_completion_length": 8.208333333333332 }, { "clip_ratio": 0.0, "completion_length": 146.3958396911621, "epoch": 10.610455311973018, "grad_norm": 10.443905821748844, "kl": 0.546875, "learning_rate": 1.1570945945945945e-07, "loss": 0.0005, "reward": 3.3069279193878174, "reward_std": 0.3599608391523361, "rewards/final_reward": 1.0115353345672449, "rewards/mask_iou_reward": 0.5057676672836224, "rewards/sam_format_reward": 0.9791666865348816, "rewards/sam_reward_func_ultra": 1.3381779789924622, "rewards/thk_ans_format_reward": 0.9895833432674408, "step": 3141, "think_completion_length": 7.75 }, { "clip_ratio": 0.0, "completion_length": 255.55209350585938, "epoch": 10.613827993254638, "grad_norm": 21.849604022714967, "kl": 0.3603515625, "learning_rate": 1.1542792792792792e-07, "loss": 0.0004, "reward": 3.516273021697998, "reward_std": 0.15535355359315872, "rewards/final_reward": 1.7520223331517997, "rewards/mask_iou_reward": 0.8760111665758998, "rewards/sam_format_reward": 0.9791666865348816, "rewards/sam_reward_func_ultra": 1.537106454372406, "rewards/thk_ans_format_reward": 1.0, "step": 3142, "think_completion_length": 7.958333333333333 }, { "clip_ratio": 0.0, "completion_length": 281.87501525878906, "epoch": 10.617200674536257, "grad_norm": 7.274680848470261, "kl": 0.5224609375, "learning_rate": 1.151463963963964e-07, "loss": 0.0005, "reward": 3.563242197036743, "reward_std": 0.04035542719066143, "rewards/final_reward": 1.5929854370163006, "rewards/mask_iou_reward": 0.7964927185081503, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5632421374320984, "rewards/thk_ans_format_reward": 1.0, "step": 3143, "think_completion_length": 9.458333333333334 }, { "clip_ratio": 0.0, "completion_length": 262.5520935058594, "epoch": 10.620573355817875, "grad_norm": 8.76195986955573, "kl": 0.4267578125, "learning_rate": 1.1486486486486487e-07, "loss": 0.0006, "reward": 3.703770875930786, "reward_std": 0.03697956167161465, "rewards/final_reward": 1.3946098700720109, "rewards/mask_iou_reward": 0.6973049350360054, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7037709951400757, "rewards/thk_ans_format_reward": 1.0, "step": 3144, "think_completion_length": 6.875 }, { "clip_ratio": 0.0, "completion_length": 162.2916717529297, "epoch": 10.623946037099493, "grad_norm": 9.26128589112025, "kl": 0.55078125, "learning_rate": 1.1458333333333332e-07, "loss": 0.0006, "reward": 3.501655697822571, "reward_std": 0.09267310053110123, "rewards/final_reward": 1.7073892721831088, "rewards/mask_iou_reward": 0.8536946360915544, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5016555190086365, "rewards/thk_ans_format_reward": 1.0, "step": 3145, "think_completion_length": 7.625 }, { "clip_ratio": 0.0, "completion_length": 172.3958396911621, "epoch": 10.627318718381114, "grad_norm": 20.42773432434273, "kl": 0.560546875, "learning_rate": 1.1430180180180179e-07, "loss": 0.0006, "reward": 3.547227621078491, "reward_std": 0.09544646553695202, "rewards/final_reward": 1.2804396471081816, "rewards/mask_iou_reward": 0.6402198235540908, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5472275614738464, "rewards/thk_ans_format_reward": 1.0, "step": 3146, "think_completion_length": 8.875 }, { "clip_ratio": 0.0, "completion_length": 267.40626525878906, "epoch": 10.630691399662732, "grad_norm": 9.38414902098105, "kl": 0.51171875, "learning_rate": 1.1402027027027026e-07, "loss": 0.0005, "reward": 3.6407686471939087, "reward_std": 0.13431363552808762, "rewards/final_reward": 1.5476860661383145, "rewards/mask_iou_reward": 0.7738430330691572, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 1.6616019010543823, "rewards/thk_ans_format_reward": 0.9895833432674408, "step": 3147, "think_completion_length": 7.708333333333334 }, { "clip_ratio": 0.0, "completion_length": 223.2916717529297, "epoch": 10.63406408094435, "grad_norm": 16.44221325891791, "kl": 0.3955078125, "learning_rate": 1.1373873873873874e-07, "loss": 0.0004, "reward": 3.2191805839538574, "reward_std": 0.10380381904542446, "rewards/final_reward": 1.538585252347514, "rewards/mask_iou_reward": 0.769292626173757, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.219180703163147, "rewards/thk_ans_format_reward": 1.0, "step": 3148, "think_completion_length": 9.208333333333334 }, { "clip_ratio": 0.0, "completion_length": 224.3125, "epoch": 10.63743676222597, "grad_norm": 15.05638678051703, "kl": 0.48828125, "learning_rate": 1.1345720720720721e-07, "loss": 0.0005, "reward": 3.862180471420288, "reward_std": 0.011594453826546669, "rewards/final_reward": 1.9617679351859052, "rewards/mask_iou_reward": 0.9808839675929526, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.8621803522109985, "rewards/thk_ans_format_reward": 1.0, "step": 3149, "think_completion_length": 7.875 }, { "clip_ratio": 0.0, "completion_length": 209.8958396911621, "epoch": 10.640809443507589, "grad_norm": 22.34933047326768, "kl": 0.4013671875, "learning_rate": 1.1317567567567566e-07, "loss": 0.0004, "reward": 3.4726303815841675, "reward_std": 0.023849932476878166, "rewards/final_reward": 1.2154385044698301, "rewards/mask_iou_reward": 0.6077192522349151, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4726303815841675, "rewards/thk_ans_format_reward": 1.0, "step": 3150, "think_completion_length": 8.541666666666666 }, { "clip_ratio": 0.0, "completion_length": 239.9375, "epoch": 10.644182124789207, "grad_norm": 5.441981994440659, "kl": 0.4072265625, "learning_rate": 1.1289414414414413e-07, "loss": 0.0004, "reward": 3.5731762647628784, "reward_std": 0.16858915518969297, "rewards/final_reward": 1.7494747987237824, "rewards/mask_iou_reward": 0.8747373993618912, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 1.5940097570419312, "rewards/thk_ans_format_reward": 0.9895833432674408, "step": 3151, "think_completion_length": 10.166666666666668 }, { "clip_ratio": 0.0, "completion_length": 179.0104217529297, "epoch": 10.647554806070826, "grad_norm": 8.798895286882274, "kl": 0.654296875, "learning_rate": 1.1261261261261261e-07, "loss": 0.0006, "reward": 3.7292591333389282, "reward_std": 0.09326490294188261, "rewards/final_reward": 1.831878290939525, "rewards/mask_iou_reward": 0.9159391454697625, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7292590141296387, "rewards/thk_ans_format_reward": 1.0, "step": 3152, "think_completion_length": 9.875 }, { "clip_ratio": 0.0, "completion_length": 180.6979217529297, "epoch": 10.650927487352446, "grad_norm": 12.936342160811696, "kl": 0.482421875, "learning_rate": 1.1233108108108108e-07, "loss": 0.0005, "reward": 3.400019407272339, "reward_std": 0.02918088808655739, "rewards/final_reward": 1.4276408117049701, "rewards/mask_iou_reward": 0.7138204058524851, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.400019347667694, "rewards/thk_ans_format_reward": 1.0, "step": 3153, "think_completion_length": 8.416666666666668 }, { "clip_ratio": 0.0, "completion_length": 182.23958587646484, "epoch": 10.654300168634064, "grad_norm": 45.72291023072974, "kl": 0.49609375, "learning_rate": 1.1204954954954954e-07, "loss": 0.0005, "reward": 3.5731505155563354, "reward_std": 0.04934038035571575, "rewards/final_reward": 1.5378148013277484, "rewards/mask_iou_reward": 0.7689074006638742, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5731505155563354, "rewards/thk_ans_format_reward": 1.0, "step": 3154, "think_completion_length": 8.833333333333334 }, { "clip_ratio": 0.0, "completion_length": 306.5416717529297, "epoch": 10.657672849915683, "grad_norm": 13.974923581463589, "kl": 0.455078125, "learning_rate": 1.11768018018018e-07, "loss": 0.0005, "reward": 3.72958767414093, "reward_std": 0.13264624774456024, "rewards/final_reward": 1.822597720406296, "rewards/mask_iou_reward": 0.911298860203148, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 1.7504209280014038, "rewards/thk_ans_format_reward": 0.9895833432674408, "step": 3155, "think_completion_length": 7.75 }, { "clip_ratio": 0.0, "completion_length": 200.3125, "epoch": 10.661045531197303, "grad_norm": 20.442499355497326, "kl": 0.4052734375, "learning_rate": 1.1148648648648648e-07, "loss": 0.0004, "reward": 3.708630681037903, "reward_std": 0.06668695248663425, "rewards/final_reward": 1.805878203669689, "rewards/mask_iou_reward": 0.9029391018348445, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7086305618286133, "rewards/thk_ans_format_reward": 1.0, "step": 3156, "think_completion_length": 8.583333333333332 }, { "clip_ratio": 0.0, "completion_length": 190.9166717529297, "epoch": 10.664418212478921, "grad_norm": 15.684658816394188, "kl": 0.671875, "learning_rate": 1.1120495495495495e-07, "loss": 0.0007, "reward": 3.311765432357788, "reward_std": 0.09275224432349205, "rewards/final_reward": 1.5564693854632141, "rewards/mask_iou_reward": 0.7782346927316071, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.322182059288025, "rewards/thk_ans_format_reward": 0.9895833432674408, "step": 3157, "think_completion_length": 8.25 }, { "clip_ratio": 0.0, "completion_length": 174.17709350585938, "epoch": 10.66779089376054, "grad_norm": 24.875937326115956, "kl": 0.4521484375, "learning_rate": 1.1092342342342342e-07, "loss": 0.0005, "reward": 3.5257372856140137, "reward_std": 0.021635837852954865, "rewards/final_reward": 1.0622315912320346, "rewards/mask_iou_reward": 0.5311157956160173, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5257372856140137, "rewards/thk_ans_format_reward": 1.0, "step": 3158, "think_completion_length": 8.333333333333332 }, { "clip_ratio": 0.0, "completion_length": 184.43750762939453, "epoch": 10.671163575042158, "grad_norm": 48.47509905804193, "kl": 0.466796875, "learning_rate": 1.1064189189189189e-07, "loss": 0.0005, "reward": 3.70083487033844, "reward_std": 0.017503976356238127, "rewards/final_reward": 1.8391874891535551, "rewards/mask_iou_reward": 0.9195937445767776, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7008347511291504, "rewards/thk_ans_format_reward": 1.0, "step": 3159, "think_completion_length": 8.041666666666666 }, { "clip_ratio": 0.0, "completion_length": 191.6041717529297, "epoch": 10.674536256323778, "grad_norm": 9.61768093844924, "kl": 0.458984375, "learning_rate": 1.1036036036036035e-07, "loss": 0.0005, "reward": 3.6513434648513794, "reward_std": 0.03646405367180705, "rewards/final_reward": 1.7044724619966742, "rewards/mask_iou_reward": 0.8522362309983371, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.651343584060669, "rewards/thk_ans_format_reward": 1.0, "step": 3160, "think_completion_length": 7.416666666666666 }, { "clip_ratio": 0.0, "completion_length": 164.1041717529297, "epoch": 10.677908937605396, "grad_norm": 19.17738559099291, "kl": 0.6015625, "learning_rate": 1.1007882882882882e-07, "loss": 0.0006, "reward": 3.6847745180130005, "reward_std": 0.03435724973678589, "rewards/final_reward": 1.9092129905471467, "rewards/mask_iou_reward": 0.9546064952735733, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6847743391990662, "rewards/thk_ans_format_reward": 1.0, "step": 3161, "think_completion_length": 8.333333333333334 }, { "clip_ratio": 0.0, "completion_length": 195.25, "epoch": 10.681281618887015, "grad_norm": 10.967767931797544, "kl": 0.505859375, "learning_rate": 1.097972972972973e-07, "loss": 0.0005, "reward": 3.380388021469116, "reward_std": 0.025232050102204084, "rewards/final_reward": 1.4506534061370655, "rewards/mask_iou_reward": 0.7253267030685328, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3803880214691162, "rewards/thk_ans_format_reward": 1.0, "step": 3162, "think_completion_length": 7.25 }, { "clip_ratio": 0.0, "completion_length": 221.28125762939453, "epoch": 10.684654300168635, "grad_norm": 7.730518808963188, "kl": 0.40234375, "learning_rate": 1.0951576576576577e-07, "loss": 0.0004, "reward": 3.6407421827316284, "reward_std": 0.06835777265951037, "rewards/final_reward": 1.7174296933986817, "rewards/mask_iou_reward": 0.8587148466993408, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.640742301940918, "rewards/thk_ans_format_reward": 1.0, "step": 3163, "think_completion_length": 7.0 }, { "clip_ratio": 0.0, "completion_length": 203.61459350585938, "epoch": 10.688026981450253, "grad_norm": 15.784759543118883, "kl": 0.646484375, "learning_rate": 1.0923423423423423e-07, "loss": 0.0007, "reward": 3.557579278945923, "reward_std": 0.04200829612091184, "rewards/final_reward": 1.5002643919718692, "rewards/mask_iou_reward": 0.7501321959859346, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.557579219341278, "rewards/thk_ans_format_reward": 1.0, "step": 3164, "think_completion_length": 7.333333333333334 }, { "clip_ratio": 0.0, "completion_length": 255.72917938232422, "epoch": 10.691399662731872, "grad_norm": 12.029126664888803, "kl": 0.4169921875, "learning_rate": 1.0895270270270269e-07, "loss": 0.0004, "reward": 3.4169652462005615, "reward_std": 0.07824656739830971, "rewards/final_reward": 1.0356403614267473, "rewards/mask_iou_reward": 0.5178201807133737, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4169652462005615, "rewards/thk_ans_format_reward": 1.0, "step": 3165, "think_completion_length": 9.5 }, { "clip_ratio": 0.0, "completion_length": 194.6666717529297, "epoch": 10.69477234401349, "grad_norm": 39.3048031368233, "kl": 0.4921875, "learning_rate": 1.0867117117117116e-07, "loss": 0.0005, "reward": 3.767988443374634, "reward_std": 0.04218120127916336, "rewards/final_reward": 1.7434309960952667, "rewards/mask_iou_reward": 0.8717154980476334, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7679883241653442, "rewards/thk_ans_format_reward": 1.0, "step": 3166, "think_completion_length": 8.125 }, { "clip_ratio": 0.0, "completion_length": 171.5104217529297, "epoch": 10.69814502529511, "grad_norm": 17.339184068767512, "kl": 0.4638671875, "learning_rate": 1.0838963963963964e-07, "loss": 0.0005, "reward": 3.464288115501404, "reward_std": 0.03953620605170727, "rewards/final_reward": 1.3581066537637718, "rewards/mask_iou_reward": 0.6790533268818859, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4642879366874695, "rewards/thk_ans_format_reward": 1.0, "step": 3167, "think_completion_length": 8.75 }, { "clip_ratio": 0.0, "completion_length": 256.2291717529297, "epoch": 10.701517706576729, "grad_norm": 12.181885012295425, "kl": 0.427734375, "learning_rate": 1.0810810810810811e-07, "loss": 0.0004, "reward": 3.3465646505355835, "reward_std": 0.03928683325648308, "rewards/final_reward": 1.4744314939278178, "rewards/mask_iou_reward": 0.7372157469639089, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3465644121170044, "rewards/thk_ans_format_reward": 1.0, "step": 3168, "think_completion_length": 9.041666666666668 }, { "clip_ratio": 0.0, "completion_length": 228.61458587646484, "epoch": 10.704890387858347, "grad_norm": 9.592572877067397, "kl": 0.416015625, "learning_rate": 1.0782657657657657e-07, "loss": 0.0004, "reward": 3.708613395690918, "reward_std": 0.07097115181386471, "rewards/final_reward": 1.5417674942840909, "rewards/mask_iou_reward": 0.7708837471420454, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7086135149002075, "rewards/thk_ans_format_reward": 1.0, "step": 3169, "think_completion_length": 7.958333333333333 }, { "clip_ratio": 0.0, "completion_length": 190.4479217529297, "epoch": 10.708263069139967, "grad_norm": 37.75207061018355, "kl": 0.5849609375, "learning_rate": 1.0754504504504503e-07, "loss": 0.0006, "reward": 3.46931254863739, "reward_std": 0.08026197552680969, "rewards/final_reward": 1.5688330174525815, "rewards/mask_iou_reward": 0.7844165087262908, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4693125486373901, "rewards/thk_ans_format_reward": 1.0, "step": 3170, "think_completion_length": 8.083333333333332 }, { "clip_ratio": 0.0, "completion_length": 294.3333435058594, "epoch": 10.711635750421586, "grad_norm": 10.453179838498924, "kl": 0.419921875, "learning_rate": 1.072635135135135e-07, "loss": 0.0004, "reward": 3.502511143684387, "reward_std": 0.03253740817308426, "rewards/final_reward": 1.1187443726913366, "rewards/mask_iou_reward": 0.5593721863456683, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5025108456611633, "rewards/thk_ans_format_reward": 1.0, "step": 3171, "think_completion_length": 7.958333333333333 }, { "clip_ratio": 0.0, "completion_length": 255.51042938232422, "epoch": 10.715008431703204, "grad_norm": 19.590390528265072, "kl": 0.412109375, "learning_rate": 1.0698198198198198e-07, "loss": 0.0004, "reward": 3.530317544937134, "reward_std": 0.06995473802089691, "rewards/final_reward": 1.922661907681869, "rewards/mask_iou_reward": 0.9613309538409345, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.530317783355713, "rewards/thk_ans_format_reward": 1.0, "step": 3172, "think_completion_length": 9.041666666666666 }, { "clip_ratio": 0.0, "completion_length": 195.93750762939453, "epoch": 10.718381112984822, "grad_norm": 7.326455739047058, "kl": 0.494140625, "learning_rate": 1.0670045045045045e-07, "loss": 0.0005, "reward": 3.727341413497925, "reward_std": 0.05604278179816902, "rewards/final_reward": 1.7987221130595397, "rewards/mask_iou_reward": 0.8993610565297698, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.72734135389328, "rewards/thk_ans_format_reward": 1.0, "step": 3173, "think_completion_length": 8.083333333333334 }, { "clip_ratio": 0.0, "completion_length": 263.1770935058594, "epoch": 10.721753794266442, "grad_norm": 9.937929320228857, "kl": 0.802734375, "learning_rate": 1.0641891891891891e-07, "loss": 0.0008, "reward": 3.434737205505371, "reward_std": 0.12638374976813793, "rewards/final_reward": 1.7686646256373062, "rewards/mask_iou_reward": 0.8843323128186531, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4347370266914368, "rewards/thk_ans_format_reward": 1.0, "step": 3174, "think_completion_length": 8.208333333333334 }, { "clip_ratio": 0.0, "completion_length": 234.39584350585938, "epoch": 10.72512647554806, "grad_norm": 8.874852737333692, "kl": 0.3779296875, "learning_rate": 1.0613738738738738e-07, "loss": 0.0004, "reward": 3.7087361812591553, "reward_std": 0.018423269502818584, "rewards/final_reward": 1.6229624286472206, "rewards/mask_iou_reward": 0.8114812143236103, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7087361812591553, "rewards/thk_ans_format_reward": 1.0, "step": 3175, "think_completion_length": 8.583333333333334 }, { "clip_ratio": 0.0, "completion_length": 236.03125, "epoch": 10.72849915682968, "grad_norm": 21.65028430383011, "kl": 0.4404296875, "learning_rate": 1.0585585585585585e-07, "loss": 0.0006, "reward": 3.6942501068115234, "reward_std": 0.03157848212867975, "rewards/final_reward": 1.9245843890419088, "rewards/mask_iou_reward": 0.9622921945209544, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6942499279975891, "rewards/thk_ans_format_reward": 1.0, "step": 3176, "think_completion_length": 7.958333333333333 }, { "clip_ratio": 0.0, "completion_length": 154.34375762939453, "epoch": 10.7318718381113, "grad_norm": 19.97482282089851, "kl": 0.53125, "learning_rate": 1.0557432432432432e-07, "loss": 0.0006, "reward": 3.7493603229522705, "reward_std": 0.030578995821997523, "rewards/final_reward": 1.8926272902165633, "rewards/mask_iou_reward": 0.9463136451082816, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7493602633476257, "rewards/thk_ans_format_reward": 1.0, "step": 3177, "think_completion_length": 7.916666666666666 }, { "clip_ratio": 0.0, "completion_length": 204.27083587646484, "epoch": 10.735244519392918, "grad_norm": 7.7444598309142565, "kl": 0.5029296875, "learning_rate": 1.0529279279279278e-07, "loss": 0.0005, "reward": 3.5413291454315186, "reward_std": 0.029289917089045048, "rewards/final_reward": 0.9951512429044618, "rewards/mask_iou_reward": 0.4975756214522309, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5413289666175842, "rewards/thk_ans_format_reward": 1.0, "step": 3178, "think_completion_length": 8.5 }, { "clip_ratio": 0.0, "completion_length": 299.7395935058594, "epoch": 10.738617200674536, "grad_norm": 6.478442215378928, "kl": 0.3828125, "learning_rate": 1.0501126126126126e-07, "loss": 0.0004, "reward": 3.5706183910369873, "reward_std": 0.05361626110970974, "rewards/final_reward": 1.269704174686019, "rewards/mask_iou_reward": 0.6348520873430095, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5706182718276978, "rewards/thk_ans_format_reward": 1.0, "step": 3179, "think_completion_length": 8.5 }, { "clip_ratio": 0.0, "completion_length": 215.4479217529297, "epoch": 10.741989881956155, "grad_norm": 16.661016023497513, "kl": 0.4423828125, "learning_rate": 1.0472972972972972e-07, "loss": 0.0005, "reward": 3.549037218093872, "reward_std": 0.13535232981666923, "rewards/final_reward": 1.5239766400728418, "rewards/mask_iou_reward": 0.7619883200364209, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 1.5698702931404114, "rewards/thk_ans_format_reward": 0.9895833432674408, "step": 3180, "think_completion_length": 8.125 }, { "clip_ratio": 0.0, "completion_length": 224.70834350585938, "epoch": 10.745362563237775, "grad_norm": 18.01798808332277, "kl": 0.423828125, "learning_rate": 1.0444819819819819e-07, "loss": 0.0004, "reward": 3.6769832372665405, "reward_std": 0.030254771932959557, "rewards/final_reward": 1.7641558989696924, "rewards/mask_iou_reward": 0.8820779494848462, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6769834756851196, "rewards/thk_ans_format_reward": 1.0, "step": 3181, "think_completion_length": 8.541666666666668 }, { "clip_ratio": 0.0, "completion_length": 272.0520935058594, "epoch": 10.748735244519393, "grad_norm": 9.840387283943757, "kl": 0.3984375, "learning_rate": 1.0416666666666667e-07, "loss": 0.0004, "reward": 3.1989909410476685, "reward_std": 0.2475245175883174, "rewards/final_reward": 0.6654427980785749, "rewards/mask_iou_reward": 0.33272139903928744, "rewards/sam_format_reward": 0.96875, "rewards/sam_reward_func_ultra": 1.2614907622337341, "rewards/thk_ans_format_reward": 0.96875, "step": 3182, "think_completion_length": 9.833333333333332 }, { "clip_ratio": 0.0, "completion_length": 255.875, "epoch": 10.752107925801011, "grad_norm": 10.436477239154478, "kl": 0.416015625, "learning_rate": 1.0388513513513513e-07, "loss": 0.0004, "reward": 3.53597092628479, "reward_std": 0.055690947920084, "rewards/final_reward": 1.8742546162403326, "rewards/mask_iou_reward": 0.9371273081201663, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.53597092628479, "rewards/thk_ans_format_reward": 1.0, "step": 3183, "think_completion_length": 8.666666666666666 }, { "clip_ratio": 0.0, "completion_length": 239.3854217529297, "epoch": 10.75548060708263, "grad_norm": 12.405938651548457, "kl": 0.4375, "learning_rate": 1.036036036036036e-07, "loss": 0.0005, "reward": 3.2055702209472656, "reward_std": 0.20975297689437866, "rewards/final_reward": 1.6398424581666908, "rewards/mask_iou_reward": 0.8199212290833454, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 1.226403534412384, "rewards/thk_ans_format_reward": 0.9895833432674408, "step": 3184, "think_completion_length": 7.25 }, { "clip_ratio": 0.0, "completion_length": 240.37501525878906, "epoch": 10.75885328836425, "grad_norm": 11.512690924025932, "kl": 0.4453125, "learning_rate": 1.0332207207207206e-07, "loss": 0.0005, "reward": 3.782678008079529, "reward_std": 0.040368370711803436, "rewards/final_reward": 1.7962749476994122, "rewards/mask_iou_reward": 0.8981374738497061, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7826780080795288, "rewards/thk_ans_format_reward": 1.0, "step": 3185, "think_completion_length": 7.708333333333333 }, { "clip_ratio": 0.0, "completion_length": 226.61459350585938, "epoch": 10.762225969645868, "grad_norm": 9.533661368134151, "kl": 0.6845703125, "learning_rate": 1.0304054054054054e-07, "loss": 0.0007, "reward": 3.674705743789673, "reward_std": 0.06669612042605877, "rewards/final_reward": 1.4661596776198316, "rewards/mask_iou_reward": 0.7330798388099158, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6747060418128967, "rewards/thk_ans_format_reward": 1.0, "step": 3186, "think_completion_length": 7.916666666666667 }, { "clip_ratio": 0.0, "completion_length": 183.8854217529297, "epoch": 10.765598650927487, "grad_norm": 21.614612511269403, "kl": 0.673828125, "learning_rate": 1.0275900900900901e-07, "loss": 0.0007, "reward": 3.7622073888778687, "reward_std": 0.008181184297427535, "rewards/final_reward": 1.815139171588172, "rewards/mask_iou_reward": 0.907569585794086, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7622073292732239, "rewards/thk_ans_format_reward": 1.0, "step": 3187, "think_completion_length": 8.791666666666668 }, { "clip_ratio": 0.0, "completion_length": 300.0416793823242, "epoch": 10.768971332209107, "grad_norm": 12.895675518609572, "kl": 0.423828125, "learning_rate": 1.0247747747747747e-07, "loss": 0.0004, "reward": 3.4585185050964355, "reward_std": 0.22953240387141705, "rewards/final_reward": 1.7792611975583754, "rewards/mask_iou_reward": 0.8896305987791877, "rewards/sam_format_reward": 0.9791666865348816, "rewards/sam_reward_func_ultra": 1.5001851320266724, "rewards/thk_ans_format_reward": 0.9791666865348816, "step": 3188, "think_completion_length": 7.166666666666667 }, { "clip_ratio": 0.0, "completion_length": 191.3229217529297, "epoch": 10.772344013490725, "grad_norm": 10.760076478630971, "kl": 0.5849609375, "learning_rate": 1.0219594594594594e-07, "loss": 0.0006, "reward": 3.7292808294296265, "reward_std": 0.03788667544722557, "rewards/final_reward": 1.426709951626853, "rewards/mask_iou_reward": 0.7133549758134266, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.729280948638916, "rewards/thk_ans_format_reward": 1.0, "step": 3189, "think_completion_length": 8.541666666666666 }, { "clip_ratio": 0.0, "completion_length": 186.89584350585938, "epoch": 10.775716694772344, "grad_norm": 6.787950901772287, "kl": 0.4140625, "learning_rate": 1.019144144144144e-07, "loss": 0.0004, "reward": 3.651894688606262, "reward_std": 0.0521730100736022, "rewards/final_reward": 1.769749352345033, "rewards/mask_iou_reward": 0.8848746761725165, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6518943905830383, "rewards/thk_ans_format_reward": 1.0, "step": 3190, "think_completion_length": 8.125 }, { "clip_ratio": 0.0, "completion_length": 230.70834350585938, "epoch": 10.779089376053962, "grad_norm": 11.302821962201145, "kl": 0.3984375, "learning_rate": 1.0163288288288288e-07, "loss": 0.0004, "reward": 3.6362640857696533, "reward_std": 0.030033452436327934, "rewards/final_reward": 1.6840505229700709, "rewards/mask_iou_reward": 0.8420252614850354, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.636264145374298, "rewards/thk_ans_format_reward": 1.0, "step": 3191, "think_completion_length": 8.333333333333332 }, { "clip_ratio": 0.0, "completion_length": 178.5416717529297, "epoch": 10.782462057335582, "grad_norm": 15.81904763170142, "kl": 0.55078125, "learning_rate": 1.0135135135135135e-07, "loss": 0.0006, "reward": 3.6694321632385254, "reward_std": 0.06063992343842983, "rewards/final_reward": 1.3317782285626767, "rewards/mask_iou_reward": 0.6658891142813383, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.669431984424591, "rewards/thk_ans_format_reward": 1.0, "step": 3192, "think_completion_length": 8.166666666666666 }, { "clip_ratio": 0.0, "completion_length": 206.61458587646484, "epoch": 10.7858347386172, "grad_norm": 7.475758488939929, "kl": 0.3955078125, "learning_rate": 1.0106981981981981e-07, "loss": 0.0004, "reward": 3.729905366897583, "reward_std": 0.06045639142394066, "rewards/final_reward": 1.7604984407554976, "rewards/mask_iou_reward": 0.8802492203777488, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7299054265022278, "rewards/thk_ans_format_reward": 1.0, "step": 3193, "think_completion_length": 7.791666666666666 }, { "clip_ratio": 0.0, "completion_length": 252.33334350585938, "epoch": 10.789207419898819, "grad_norm": 11.706376543671285, "kl": 0.47265625, "learning_rate": 1.0078828828828829e-07, "loss": 0.0005, "reward": 3.769546627998352, "reward_std": 0.03965230449102819, "rewards/final_reward": 1.83340238694567, "rewards/mask_iou_reward": 0.916701193472835, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7695463299751282, "rewards/thk_ans_format_reward": 1.0, "step": 3194, "think_completion_length": 7.375 }, { "clip_ratio": 0.0, "completion_length": 146.89583587646484, "epoch": 10.79258010118044, "grad_norm": 17.883562068634024, "kl": 0.5537109375, "learning_rate": 1.0050675675675675e-07, "loss": 0.0006, "reward": 3.3498950004577637, "reward_std": 0.05078345909714699, "rewards/final_reward": 1.6377353144867908, "rewards/mask_iou_reward": 0.8188676572433954, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3498948812484741, "rewards/thk_ans_format_reward": 1.0, "step": 3195, "think_completion_length": 8.416666666666668 }, { "clip_ratio": 0.0, "completion_length": 234.5416717529297, "epoch": 10.795952782462058, "grad_norm": 12.160900753006624, "kl": 0.45703125, "learning_rate": 1.0022522522522522e-07, "loss": 0.0005, "reward": 3.6239691972732544, "reward_std": 0.04034661874175072, "rewards/final_reward": 1.172895157250723, "rewards/mask_iou_reward": 0.5864475786253615, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6239690780639648, "rewards/thk_ans_format_reward": 1.0, "step": 3196, "think_completion_length": 7.75 }, { "clip_ratio": 0.0, "completion_length": 184.98958587646484, "epoch": 10.799325463743676, "grad_norm": 22.684056229809475, "kl": 0.501953125, "learning_rate": 9.99436936936937e-08, "loss": 0.0005, "reward": 3.7545530796051025, "reward_std": 0.01758536882698536, "rewards/final_reward": 1.8126713424449878, "rewards/mask_iou_reward": 0.9063356712224939, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.754552960395813, "rewards/thk_ans_format_reward": 1.0, "step": 3197, "think_completion_length": 8.708333333333334 }, { "clip_ratio": 0.0, "completion_length": 162.9166717529297, "epoch": 10.802698145025294, "grad_norm": 6.930106395906022, "kl": 0.408203125, "learning_rate": 9.966216216216216e-08, "loss": 0.0004, "reward": 3.507596015930176, "reward_std": 0.03860746696591377, "rewards/final_reward": 1.4387048334123578, "rewards/mask_iou_reward": 0.7193524167061789, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5075958967208862, "rewards/thk_ans_format_reward": 1.0, "step": 3198, "think_completion_length": 8.666666666666666 }, { "clip_ratio": 0.0, "completion_length": 264.5, "epoch": 10.806070826306915, "grad_norm": 34.761760766300156, "kl": 0.587890625, "learning_rate": 9.938063063063063e-08, "loss": 0.0007, "reward": 3.4861974716186523, "reward_std": 0.04883183538913727, "rewards/final_reward": 1.9181291083170278, "rewards/mask_iou_reward": 0.9590645541585139, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4861974716186523, "rewards/thk_ans_format_reward": 1.0, "step": 3199, "think_completion_length": 7.208333333333334 }, { "clip_ratio": 0.0, "completion_length": 205.0729217529297, "epoch": 10.809443507588533, "grad_norm": 6.418198060708881, "kl": 0.462890625, "learning_rate": 9.909909909909909e-08, "loss": 0.0005, "reward": 3.2320475578308105, "reward_std": 0.06442866101861, "rewards/final_reward": 1.0330291851997446, "rewards/mask_iou_reward": 0.5165145925998723, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2320473790168762, "rewards/thk_ans_format_reward": 1.0, "step": 3200, "think_completion_length": 7.791666666666666 }, { "clip_ratio": 0.0, "completion_length": 287.2604217529297, "epoch": 10.812816188870151, "grad_norm": 6.785660820670892, "kl": 0.443359375, "learning_rate": 9.881756756756756e-08, "loss": 0.0006, "reward": 3.643518805503845, "reward_std": 0.019335764925926924, "rewards/final_reward": 1.7463516779887074, "rewards/mask_iou_reward": 0.8731758389943537, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6435189247131348, "rewards/thk_ans_format_reward": 1.0, "step": 3201, "think_completion_length": 7.458333333333334 }, { "clip_ratio": 0.0, "completion_length": 197.5312614440918, "epoch": 10.816188870151771, "grad_norm": 33.51514881034473, "kl": 0.435546875, "learning_rate": 9.853603603603604e-08, "loss": 0.0004, "reward": 3.4961583614349365, "reward_std": 0.024693522602319717, "rewards/final_reward": 0.7044322796740056, "rewards/mask_iou_reward": 0.3522161398370028, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4961583018302917, "rewards/thk_ans_format_reward": 1.0, "step": 3202, "think_completion_length": 9.791666666666668 }, { "clip_ratio": 0.0, "completion_length": 237.06250762939453, "epoch": 10.81956155143339, "grad_norm": 14.447403026749459, "kl": 0.53125, "learning_rate": 9.82545045045045e-08, "loss": 0.0005, "reward": 3.3684170246124268, "reward_std": 0.035486179403960705, "rewards/final_reward": 1.9557854709075924, "rewards/mask_iou_reward": 0.9778927354537962, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3684170246124268, "rewards/thk_ans_format_reward": 1.0, "step": 3203, "think_completion_length": 9.083333333333334 }, { "clip_ratio": 0.0, "completion_length": 261.4479217529297, "epoch": 10.822934232715008, "grad_norm": 11.104553870128184, "kl": 0.44921875, "learning_rate": 9.797297297297297e-08, "loss": 0.0005, "reward": 3.383490204811096, "reward_std": 0.04366124048829079, "rewards/final_reward": 1.634455664503657, "rewards/mask_iou_reward": 0.8172278322518285, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3834902048110962, "rewards/thk_ans_format_reward": 1.0, "step": 3204, "think_completion_length": 8.833333333333334 }, { "clip_ratio": 0.0, "completion_length": 167.3229217529297, "epoch": 10.826306913996627, "grad_norm": 15.877695572152604, "kl": 0.423828125, "learning_rate": 9.769144144144143e-08, "loss": 0.0006, "reward": 3.6447088718414307, "reward_std": 0.0836187805980444, "rewards/final_reward": 1.851138693242559, "rewards/mask_iou_reward": 0.9255693466212795, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6447087526321411, "rewards/thk_ans_format_reward": 1.0, "step": 3205, "think_completion_length": 7.916666666666666 }, { "clip_ratio": 0.0, "completion_length": 189.89583587646484, "epoch": 10.829679595278247, "grad_norm": 39.2943027631783, "kl": 0.4541015625, "learning_rate": 9.740990990990991e-08, "loss": 0.0005, "reward": 3.7319670915603638, "reward_std": 0.07651904597878456, "rewards/final_reward": 1.3600613865001712, "rewards/mask_iou_reward": 0.6800306932500856, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7319666743278503, "rewards/thk_ans_format_reward": 1.0, "step": 3206, "think_completion_length": 8.541666666666666 }, { "clip_ratio": 0.0, "completion_length": 215.67708587646484, "epoch": 10.833052276559865, "grad_norm": 11.942067417307154, "kl": 0.4384765625, "learning_rate": 9.712837837837837e-08, "loss": 0.0004, "reward": 3.27489697933197, "reward_std": 0.05321320705115795, "rewards/final_reward": 1.5223945434338848, "rewards/mask_iou_reward": 0.7611972717169424, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2748970985412598, "rewards/thk_ans_format_reward": 1.0, "step": 3207, "think_completion_length": 7.333333333333334 }, { "clip_ratio": 0.0, "completion_length": 263.5729293823242, "epoch": 10.836424957841484, "grad_norm": 9.369178782769083, "kl": 0.4482421875, "learning_rate": 9.684684684684684e-08, "loss": 0.0005, "reward": 3.5060927867889404, "reward_std": 0.09241212159395218, "rewards/final_reward": 1.0254573640507907, "rewards/mask_iou_reward": 0.5127286820253953, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.50609290599823, "rewards/thk_ans_format_reward": 1.0, "step": 3208, "think_completion_length": 8.083333333333332 }, { "clip_ratio": 0.0, "completion_length": 181.81250762939453, "epoch": 10.839797639123104, "grad_norm": 6.76604175667331, "kl": 0.3876953125, "learning_rate": 9.656531531531532e-08, "loss": 0.0004, "reward": 3.7473455667495728, "reward_std": 0.021959856152534485, "rewards/final_reward": 1.7867562628767018, "rewards/mask_iou_reward": 0.8933781314383509, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7473454475402832, "rewards/thk_ans_format_reward": 1.0, "step": 3209, "think_completion_length": 10.208333333333332 }, { "clip_ratio": 0.0, "completion_length": 268.3541717529297, "epoch": 10.843170320404722, "grad_norm": 7.365789573300768, "kl": 0.3984375, "learning_rate": 9.628378378378378e-08, "loss": 0.0004, "reward": 3.7011457681655884, "reward_std": 0.03818492125719786, "rewards/final_reward": 1.1432885171328353, "rewards/mask_iou_reward": 0.5716442585664177, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7011457085609436, "rewards/thk_ans_format_reward": 1.0, "step": 3210, "think_completion_length": 7.291666666666667 }, { "clip_ratio": 0.0, "completion_length": 249.85417938232422, "epoch": 10.84654300168634, "grad_norm": 13.904530784966804, "kl": 0.4072265625, "learning_rate": 9.600225225225225e-08, "loss": 0.0004, "reward": 3.4692060947418213, "reward_std": 0.018061704467982054, "rewards/final_reward": 0.9882531894496005, "rewards/mask_iou_reward": 0.49412659472480025, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4692060947418213, "rewards/thk_ans_format_reward": 1.0, "step": 3211, "think_completion_length": 9.333333333333334 }, { "clip_ratio": 0.0, "completion_length": 245.5416717529297, "epoch": 10.849915682967959, "grad_norm": 70.01853729733192, "kl": 0.412109375, "learning_rate": 9.572072072072071e-08, "loss": 0.0004, "reward": 3.4471957683563232, "reward_std": 0.09547746926546097, "rewards/final_reward": 1.5687760234479442, "rewards/mask_iou_reward": 0.7843880117239721, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4471957683563232, "rewards/thk_ans_format_reward": 1.0, "step": 3212, "think_completion_length": 7.125 }, { "clip_ratio": 0.0, "completion_length": 217.45834350585938, "epoch": 10.853288364249579, "grad_norm": 38.89937225809673, "kl": 0.419921875, "learning_rate": 9.543918918918919e-08, "loss": 0.0004, "reward": 3.6407090425491333, "reward_std": 0.12232569698244333, "rewards/final_reward": 1.54928817900061, "rewards/mask_iou_reward": 0.774644089500305, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 1.6511258482933044, "rewards/thk_ans_format_reward": 1.0, "step": 3213, "think_completion_length": 8.416666666666666 }, { "clip_ratio": 0.0, "completion_length": 223.7604217529297, "epoch": 10.856661045531197, "grad_norm": 13.272328594438857, "kl": 0.560546875, "learning_rate": 9.515765765765766e-08, "loss": 0.0006, "reward": 3.628928542137146, "reward_std": 0.013613590504974127, "rewards/final_reward": 1.6047447176640486, "rewards/mask_iou_reward": 0.8023723588320243, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6289284229278564, "rewards/thk_ans_format_reward": 1.0, "step": 3214, "think_completion_length": 8.458333333333334 }, { "clip_ratio": 0.0, "completion_length": 155.6979217529297, "epoch": 10.860033726812816, "grad_norm": 10.236418470637137, "kl": 0.4541015625, "learning_rate": 9.487612612612612e-08, "loss": 0.0005, "reward": 3.5637048482894897, "reward_std": 0.03039349429309368, "rewards/final_reward": 1.8540802816023476, "rewards/mask_iou_reward": 0.9270401408011738, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5637049674987793, "rewards/thk_ans_format_reward": 1.0, "step": 3215, "think_completion_length": 9.083333333333334 }, { "clip_ratio": 0.0, "completion_length": 201.23958587646484, "epoch": 10.863406408094434, "grad_norm": 28.92028211701786, "kl": 0.427734375, "learning_rate": 9.45945945945946e-08, "loss": 0.0004, "reward": 3.6250351667404175, "reward_std": 0.07762327417731285, "rewards/final_reward": 1.8546084172688548, "rewards/mask_iou_reward": 0.9273042086344274, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6250353455543518, "rewards/thk_ans_format_reward": 1.0, "step": 3216, "think_completion_length": 8.291666666666666 }, { "clip_ratio": 0.0, "completion_length": 195.31250762939453, "epoch": 10.866779089376054, "grad_norm": 8.611578128472187, "kl": 0.4150390625, "learning_rate": 9.431306306306305e-08, "loss": 0.0004, "reward": 3.6757365465164185, "reward_std": 0.05385137163102627, "rewards/final_reward": 1.6966801983814146, "rewards/mask_iou_reward": 0.8483400991907073, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.675736427307129, "rewards/thk_ans_format_reward": 1.0, "step": 3217, "think_completion_length": 9.666666666666666 }, { "clip_ratio": 0.0, "completion_length": 227.90626525878906, "epoch": 10.870151770657673, "grad_norm": 22.430200048234568, "kl": 0.41796875, "learning_rate": 9.403153153153153e-08, "loss": 0.0004, "reward": 3.767126202583313, "reward_std": 0.021601593121886253, "rewards/final_reward": 1.73027651916644, "rewards/mask_iou_reward": 0.86513825958322, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7671259045600891, "rewards/thk_ans_format_reward": 1.0, "step": 3218, "think_completion_length": 7.375 }, { "clip_ratio": 0.0, "completion_length": 178.3229217529297, "epoch": 10.873524451939291, "grad_norm": 11.13731742848097, "kl": 0.4287109375, "learning_rate": 9.375e-08, "loss": 0.0004, "reward": 3.4959752559661865, "reward_std": 0.08527533710002899, "rewards/final_reward": 1.5524004259183637, "rewards/mask_iou_reward": 0.7762002129591818, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4959751963615417, "rewards/thk_ans_format_reward": 1.0, "step": 3219, "think_completion_length": 6.958333333333334 }, { "clip_ratio": 0.0, "completion_length": 246.56250762939453, "epoch": 10.876897133220911, "grad_norm": 16.294777386618534, "kl": 0.5146484375, "learning_rate": 9.346846846846846e-08, "loss": 0.0005, "reward": 3.5129069089889526, "reward_std": 0.16785257682204247, "rewards/final_reward": 1.2444732205540479, "rewards/mask_iou_reward": 0.6222366102770239, "rewards/sam_format_reward": 0.9791666865348816, "rewards/sam_reward_func_ultra": 1.5545734763145447, "rewards/thk_ans_format_reward": 0.9791666865348816, "step": 3220, "think_completion_length": 9.791666666666666 }, { "clip_ratio": 0.0, "completion_length": 269.46875762939453, "epoch": 10.88026981450253, "grad_norm": 15.48641745142567, "kl": 0.3671875, "learning_rate": 9.318693693693694e-08, "loss": 0.0004, "reward": 3.4492322206497192, "reward_std": 0.009585548657923937, "rewards/final_reward": 1.4472096435305972, "rewards/mask_iou_reward": 0.7236048217652986, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4492321610450745, "rewards/thk_ans_format_reward": 1.0, "step": 3221, "think_completion_length": 7.75 }, { "clip_ratio": 0.0, "completion_length": 256.0729293823242, "epoch": 10.883642495784148, "grad_norm": 12.46701648794007, "kl": 0.431640625, "learning_rate": 9.29054054054054e-08, "loss": 0.0004, "reward": 3.465346574783325, "reward_std": 0.09596022218465805, "rewards/final_reward": 1.506819034084586, "rewards/mask_iou_reward": 0.753409517042293, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.46534663438797, "rewards/thk_ans_format_reward": 1.0, "step": 3222, "think_completion_length": 7.916666666666667 }, { "clip_ratio": 0.0, "completion_length": 299.61458587646484, "epoch": 10.887015177065766, "grad_norm": 9.487352623402131, "kl": 0.357421875, "learning_rate": 9.262387387387387e-08, "loss": 0.0004, "reward": 3.685336470603943, "reward_std": 0.048621442168951035, "rewards/final_reward": 1.708998655660046, "rewards/mask_iou_reward": 0.854499327830023, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6853366494178772, "rewards/thk_ans_format_reward": 1.0, "step": 3223, "think_completion_length": 8.208333333333334 }, { "clip_ratio": 0.0, "completion_length": 304.15625, "epoch": 10.890387858347387, "grad_norm": 29.683667017829382, "kl": 0.4052734375, "learning_rate": 9.234234234234233e-08, "loss": 0.0004, "reward": 3.2169747352600098, "reward_std": 0.06005913391709328, "rewards/final_reward": 1.1622017621002811, "rewards/mask_iou_reward": 0.5811008810501406, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.216974675655365, "rewards/thk_ans_format_reward": 1.0, "step": 3224, "think_completion_length": 7.625 }, { "clip_ratio": 0.0, "completion_length": 179.8854217529297, "epoch": 10.893760539629005, "grad_norm": 9.668247469756977, "kl": 0.4580078125, "learning_rate": 9.20608108108108e-08, "loss": 0.0005, "reward": 3.6324193477630615, "reward_std": 0.039188480004668236, "rewards/final_reward": 1.5502014532756223, "rewards/mask_iou_reward": 0.7751007266378112, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6324193477630615, "rewards/thk_ans_format_reward": 1.0, "step": 3225, "think_completion_length": 8.708333333333334 }, { "clip_ratio": 0.0, "completion_length": 198.0729217529297, "epoch": 10.897133220910623, "grad_norm": 15.305182881846388, "kl": 0.88671875, "learning_rate": 9.177927927927928e-08, "loss": 0.0009, "reward": 3.5677725076675415, "reward_std": 0.042133665177971125, "rewards/final_reward": 1.9255321917009562, "rewards/mask_iou_reward": 0.9627660958504781, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5677724480628967, "rewards/thk_ans_format_reward": 1.0, "step": 3226, "think_completion_length": 8.375 }, { "clip_ratio": 0.0, "completion_length": 294.2604293823242, "epoch": 10.900505902192243, "grad_norm": 8.05123745901817, "kl": 0.43359375, "learning_rate": 9.149774774774774e-08, "loss": 0.0004, "reward": 3.1885026693344116, "reward_std": 0.04743565432727337, "rewards/final_reward": 0.5120157429678219, "rewards/mask_iou_reward": 0.25600787148391096, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1885027289390564, "rewards/thk_ans_format_reward": 1.0, "step": 3227, "think_completion_length": 8.083333333333332 }, { "clip_ratio": 0.0, "completion_length": 201.875, "epoch": 10.903878583473862, "grad_norm": 19.24892607651357, "kl": 0.5078125, "learning_rate": 9.121621621621621e-08, "loss": 0.0005, "reward": 3.751962900161743, "reward_std": 0.01671873265877366, "rewards/final_reward": 1.7371263028198993, "rewards/mask_iou_reward": 0.8685631514099497, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.751962959766388, "rewards/thk_ans_format_reward": 1.0, "step": 3228, "think_completion_length": 7.791666666666666 }, { "clip_ratio": 0.0, "completion_length": 206.55209350585938, "epoch": 10.90725126475548, "grad_norm": 9.79467029988201, "kl": 0.486328125, "learning_rate": 9.093468468468468e-08, "loss": 0.0005, "reward": 3.695330023765564, "reward_std": 0.07344387657940388, "rewards/final_reward": 1.7112524455751048, "rewards/mask_iou_reward": 0.8556262227875524, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.695330023765564, "rewards/thk_ans_format_reward": 1.0, "step": 3229, "think_completion_length": 7.666666666666667 }, { "clip_ratio": 0.0, "completion_length": 284.6145935058594, "epoch": 10.910623946037099, "grad_norm": 13.314238932481985, "kl": 0.4599609375, "learning_rate": 9.065315315315315e-08, "loss": 0.0005, "reward": 3.8808305263519287, "reward_std": 0.013991189189255238, "rewards/final_reward": 1.9198786636199257, "rewards/mask_iou_reward": 0.9599393318099628, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.8808305263519287, "rewards/thk_ans_format_reward": 1.0, "step": 3230, "think_completion_length": 8.416666666666668 }, { "clip_ratio": 0.0, "completion_length": 208.09376525878906, "epoch": 10.913996627318719, "grad_norm": 7.402782423189984, "kl": 0.462890625, "learning_rate": 9.037162162162162e-08, "loss": 0.0005, "reward": 2.9830663204193115, "reward_std": 0.13797349110245705, "rewards/final_reward": 0.8877441586400034, "rewards/mask_iou_reward": 0.4438720793200017, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.98306605219841, "rewards/thk_ans_format_reward": 1.0, "step": 3231, "think_completion_length": 7.916666666666667 }, { "clip_ratio": 0.0, "completion_length": 307.4166717529297, "epoch": 10.917369308600337, "grad_norm": 8.39626969225335, "kl": 0.49609375, "learning_rate": 9.009009009009008e-08, "loss": 0.0005, "reward": 3.6167478561401367, "reward_std": 0.026893689297139645, "rewards/final_reward": 1.6967432154165967, "rewards/mask_iou_reward": 0.8483716077082983, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6167477369308472, "rewards/thk_ans_format_reward": 1.0, "step": 3232, "think_completion_length": 8.291666666666666 }, { "clip_ratio": 0.0, "completion_length": 219.6666717529297, "epoch": 10.920741989881956, "grad_norm": 11.167540183524496, "kl": 0.443359375, "learning_rate": 8.980855855855856e-08, "loss": 0.0004, "reward": 3.337292790412903, "reward_std": 0.04990806803107262, "rewards/final_reward": 1.5319708460289454, "rewards/mask_iou_reward": 0.7659854230144727, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3372925519943237, "rewards/thk_ans_format_reward": 1.0, "step": 3233, "think_completion_length": 7.916666666666667 }, { "clip_ratio": 0.0, "completion_length": 163.3229217529297, "epoch": 10.924114671163576, "grad_norm": 9.234633088741644, "kl": 0.4169921875, "learning_rate": 8.952702702702702e-08, "loss": 0.0004, "reward": 3.564087986946106, "reward_std": 0.05904023256152868, "rewards/final_reward": 1.6774216256025858, "rewards/mask_iou_reward": 0.8387108128012929, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5640875697135925, "rewards/thk_ans_format_reward": 1.0, "step": 3234, "think_completion_length": 7.5 }, { "clip_ratio": 0.0, "completion_length": 159.93750762939453, "epoch": 10.927487352445194, "grad_norm": 22.966644031125828, "kl": 0.4521484375, "learning_rate": 8.924549549549549e-08, "loss": 0.0005, "reward": 3.5395623445510864, "reward_std": 0.012699170969426632, "rewards/final_reward": 1.8294896969313919, "rewards/mask_iou_reward": 0.9147448484656959, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5395622253417969, "rewards/thk_ans_format_reward": 1.0, "step": 3235, "think_completion_length": 9.25 }, { "clip_ratio": 0.0, "completion_length": 217.5729217529297, "epoch": 10.930860033726812, "grad_norm": 10.42284799278542, "kl": 0.369140625, "learning_rate": 8.896396396396395e-08, "loss": 0.0004, "reward": 3.5979576110839844, "reward_std": 0.008746222592890263, "rewards/final_reward": 1.8873318631194849, "rewards/mask_iou_reward": 0.9436659315597424, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.59795743227005, "rewards/thk_ans_format_reward": 1.0, "step": 3236, "think_completion_length": 7.833333333333334 }, { "clip_ratio": 0.0, "completion_length": 220.9479217529297, "epoch": 10.93423271500843, "grad_norm": 8.054077747626835, "kl": 0.4912109375, "learning_rate": 8.868243243243243e-08, "loss": 0.0005, "reward": 3.634762167930603, "reward_std": 0.05479666404426098, "rewards/final_reward": 1.427444284547413, "rewards/mask_iou_reward": 0.7137221422737065, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6347622275352478, "rewards/thk_ans_format_reward": 1.0, "step": 3237, "think_completion_length": 7.791666666666667 }, { "clip_ratio": 0.0, "completion_length": 199.46875762939453, "epoch": 10.937605396290051, "grad_norm": 11.689568398021407, "kl": 0.4052734375, "learning_rate": 8.84009009009009e-08, "loss": 0.0004, "reward": 3.4657106399536133, "reward_std": 0.06903432868421078, "rewards/final_reward": 1.5770540530338935, "rewards/mask_iou_reward": 0.7885270265169467, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4657106399536133, "rewards/thk_ans_format_reward": 1.0, "step": 3238, "think_completion_length": 6.916666666666667 }, { "clip_ratio": 0.0, "completion_length": 216.21875762939453, "epoch": 10.94097807757167, "grad_norm": 7.9066782689942805, "kl": 0.3916015625, "learning_rate": 8.811936936936936e-08, "loss": 0.0004, "reward": 3.693625569343567, "reward_std": 0.02968922909349203, "rewards/final_reward": 1.8096445212847971, "rewards/mask_iou_reward": 0.9048222606423986, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6936253905296326, "rewards/thk_ans_format_reward": 1.0, "step": 3239, "think_completion_length": 7.916666666666667 }, { "clip_ratio": 0.0, "completion_length": 171.3229217529297, "epoch": 10.944350758853288, "grad_norm": 5.704007915875586, "kl": 0.4609375, "learning_rate": 8.783783783783784e-08, "loss": 0.0005, "reward": 3.5016400814056396, "reward_std": 0.01643510302528739, "rewards/final_reward": 1.6616097302980894, "rewards/mask_iou_reward": 0.8308048651490447, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5016400814056396, "rewards/thk_ans_format_reward": 1.0, "step": 3240, "think_completion_length": 8.875 }, { "clip_ratio": 0.0, "completion_length": 171.6354217529297, "epoch": 10.947723440134908, "grad_norm": 10.243122206364419, "kl": 0.484375, "learning_rate": 8.75563063063063e-08, "loss": 0.0005, "reward": 3.5590054988861084, "reward_std": 0.15231713093817234, "rewards/final_reward": 1.5732570087525417, "rewards/mask_iou_reward": 0.7866285043762709, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5590054392814636, "rewards/thk_ans_format_reward": 1.0, "step": 3241, "think_completion_length": 8.458333333333332 }, { "clip_ratio": 0.0, "completion_length": 162.8229217529297, "epoch": 10.951096121416526, "grad_norm": 19.269768147583175, "kl": 0.4482421875, "learning_rate": 8.727477477477477e-08, "loss": 0.0005, "reward": 3.5184816122055054, "reward_std": 0.03059364575892687, "rewards/final_reward": 1.3453870798808751, "rewards/mask_iou_reward": 0.6726935399404376, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.518481731414795, "rewards/thk_ans_format_reward": 1.0, "step": 3242, "think_completion_length": 9.083333333333332 }, { "clip_ratio": 0.0, "completion_length": 237.37500762939453, "epoch": 10.954468802698145, "grad_norm": 10.50987455275937, "kl": 0.4052734375, "learning_rate": 8.699324324324324e-08, "loss": 0.0004, "reward": 3.3991010189056396, "reward_std": 0.04655653005465865, "rewards/final_reward": 1.1448598582338887, "rewards/mask_iou_reward": 0.5724299291169443, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3991011381149292, "rewards/thk_ans_format_reward": 1.0, "step": 3243, "think_completion_length": 7.625 }, { "clip_ratio": 0.0, "completion_length": 240.83333587646484, "epoch": 10.957841483979763, "grad_norm": 45.08296764599884, "kl": 0.3984375, "learning_rate": 8.67117117117117e-08, "loss": 0.0005, "reward": 3.770294427871704, "reward_std": 0.01194569538347423, "rewards/final_reward": 1.8231030989413408, "rewards/mask_iou_reward": 0.9115515494706704, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7702943086624146, "rewards/thk_ans_format_reward": 1.0, "step": 3244, "think_completion_length": 8.458333333333334 }, { "clip_ratio": 0.0, "completion_length": 271.125, "epoch": 10.961214165261383, "grad_norm": 8.5253164523504, "kl": 0.376953125, "learning_rate": 8.643018018018018e-08, "loss": 0.0004, "reward": 3.5307878255844116, "reward_std": 0.09277350455522537, "rewards/final_reward": 1.573530974288559, "rewards/mask_iou_reward": 0.7867654871442795, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5307878851890564, "rewards/thk_ans_format_reward": 1.0, "step": 3245, "think_completion_length": 9.333333333333334 }, { "clip_ratio": 0.0, "completion_length": 232.82292938232422, "epoch": 10.964586846543002, "grad_norm": 11.467629355388848, "kl": 0.453125, "learning_rate": 8.614864864864864e-08, "loss": 0.0005, "reward": 3.6759188175201416, "reward_std": 0.02997300773859024, "rewards/final_reward": 1.6248022016955932, "rewards/mask_iou_reward": 0.8124011008477966, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6759187579154968, "rewards/thk_ans_format_reward": 1.0, "step": 3246, "think_completion_length": 8.208333333333334 }, { "clip_ratio": 0.0, "completion_length": 217.6666717529297, "epoch": 10.96795952782462, "grad_norm": 13.907193326709237, "kl": 1.087890625, "learning_rate": 8.586711711711711e-08, "loss": 0.0011, "reward": 3.539788007736206, "reward_std": 0.21456366777420044, "rewards/final_reward": 1.7829225695129152, "rewards/mask_iou_reward": 0.8914612847564576, "rewards/sam_format_reward": 0.9791666865348816, "rewards/sam_reward_func_ultra": 1.560621440410614, "rewards/thk_ans_format_reward": 1.0, "step": 3247, "think_completion_length": 9.25 }, { "clip_ratio": 0.0, "completion_length": 211.06250762939453, "epoch": 10.97133220910624, "grad_norm": 16.606124292610147, "kl": 0.3974609375, "learning_rate": 8.558558558558559e-08, "loss": 0.0004, "reward": 3.6476298570632935, "reward_std": 0.05834665335714817, "rewards/final_reward": 1.5603483165007168, "rewards/mask_iou_reward": 0.7801741582503584, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6476297974586487, "rewards/thk_ans_format_reward": 1.0, "step": 3248, "think_completion_length": 7.416666666666666 }, { "clip_ratio": 0.0, "completion_length": 210.35417938232422, "epoch": 10.974704890387859, "grad_norm": 11.609692382916647, "kl": 0.40625, "learning_rate": 8.530405405405405e-08, "loss": 0.0004, "reward": 3.69338595867157, "reward_std": 0.027596603147685528, "rewards/final_reward": 1.9251977772616073, "rewards/mask_iou_reward": 0.9625988886308037, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6933859586715698, "rewards/thk_ans_format_reward": 1.0, "step": 3249, "think_completion_length": 7.916666666666666 }, { "clip_ratio": 0.0, "completion_length": 258.3645935058594, "epoch": 10.978077571669477, "grad_norm": 21.16551017125066, "kl": 0.3671875, "learning_rate": 8.502252252252252e-08, "loss": 0.0004, "reward": 3.619927167892456, "reward_std": 0.03946511447429657, "rewards/final_reward": 1.2778173729229194, "rewards/mask_iou_reward": 0.6389086864614597, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6199271082878113, "rewards/thk_ans_format_reward": 1.0, "step": 3250, "think_completion_length": 8.5 }, { "clip_ratio": 0.0, "completion_length": 242.28125, "epoch": 10.981450252951095, "grad_norm": 6.630093534026958, "kl": 0.498046875, "learning_rate": 8.474099099099098e-08, "loss": 0.0005, "reward": 3.6429604291915894, "reward_std": 0.06296277791261673, "rewards/final_reward": 1.7986877345795302, "rewards/mask_iou_reward": 0.8993438672897651, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.642960548400879, "rewards/thk_ans_format_reward": 1.0, "step": 3251, "think_completion_length": 8.458333333333334 }, { "clip_ratio": 0.0, "completion_length": 303.8333435058594, "epoch": 10.984822934232715, "grad_norm": 11.65935010806574, "kl": 0.66015625, "learning_rate": 8.445945945945946e-08, "loss": 0.0007, "reward": 3.341140866279602, "reward_std": 0.11203420907258987, "rewards/final_reward": 0.977063955266588, "rewards/mask_iou_reward": 0.488531977633294, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3411406874656677, "rewards/thk_ans_format_reward": 1.0, "step": 3252, "think_completion_length": 7.208333333333333 }, { "clip_ratio": 0.0, "completion_length": 189.45834350585938, "epoch": 10.988195615514334, "grad_norm": 10.670823671803728, "kl": 0.4453125, "learning_rate": 8.417792792792793e-08, "loss": 0.0004, "reward": 3.72513210773468, "reward_std": 0.07078396715223789, "rewards/final_reward": 1.7892148373927257, "rewards/mask_iou_reward": 0.8946074186963628, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.725132167339325, "rewards/thk_ans_format_reward": 1.0, "step": 3253, "think_completion_length": 8.166666666666668 }, { "clip_ratio": 0.0, "completion_length": 178.52083587646484, "epoch": 10.991568296795952, "grad_norm": 6.109386082964309, "kl": 0.46875, "learning_rate": 8.389639639639639e-08, "loss": 0.0005, "reward": 3.374544382095337, "reward_std": 0.054962567053735256, "rewards/final_reward": 1.930871790359452, "rewards/mask_iou_reward": 0.965435895179726, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3745442032814026, "rewards/thk_ans_format_reward": 1.0, "step": 3254, "think_completion_length": 9.416666666666666 }, { "clip_ratio": 0.0, "completion_length": 234.81250762939453, "epoch": 10.994940978077572, "grad_norm": 10.063378755116721, "kl": 0.5302734375, "learning_rate": 8.361486486486486e-08, "loss": 0.0005, "reward": 3.614203453063965, "reward_std": 0.06491642817854881, "rewards/final_reward": 1.5505027246245475, "rewards/mask_iou_reward": 0.7752513623122738, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6142034530639648, "rewards/thk_ans_format_reward": 1.0, "step": 3255, "think_completion_length": 9.333333333333334 }, { "clip_ratio": 0.0, "completion_length": 90.28947257995605, "epoch": 10.99831365935919, "grad_norm": 8.135456862235158, "kl": 0.546875, "learning_rate": 8.333333333333333e-08, "loss": 0.0006, "reward": 3.6233062744140625, "reward_std": 0.04351498291362077, "rewards/final_reward": 1.9861655112622487, "rewards/mask_iou_reward": 0.9930827556311244, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6233062744140625, "rewards/thk_ans_format_reward": 1.0, "step": 3256, "think_completion_length": 9.916666666666668 }, { "clip_ratio": 0.0, "completion_length": 189.67709350585938, "epoch": 11.003372681281618, "grad_norm": 11.557014185942567, "kl": 0.560546875, "learning_rate": 8.30518018018018e-08, "loss": 0.0006, "reward": 3.7738869190216064, "reward_std": 0.05592325050383806, "rewards/final_reward": 1.7459996031354077, "rewards/mask_iou_reward": 0.8729998015677038, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7738869786262512, "rewards/thk_ans_format_reward": 1.0, "step": 3257, "think_completion_length": 7.291666666666667 }, { "clip_ratio": 0.0, "completion_length": 285.7083435058594, "epoch": 11.006745362563239, "grad_norm": 7.8662512856552, "kl": 0.3935546875, "learning_rate": 8.277027027027027e-08, "loss": 0.0004, "reward": 3.5249884128570557, "reward_std": 0.03571249917149544, "rewards/final_reward": 1.5830277205064258, "rewards/mask_iou_reward": 0.7915138602532129, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5249884724617004, "rewards/thk_ans_format_reward": 1.0, "step": 3258, "think_completion_length": 7.125 }, { "clip_ratio": 0.0, "completion_length": 218.02083587646484, "epoch": 11.010118043844857, "grad_norm": 18.362126856228976, "kl": 0.71484375, "learning_rate": 8.248873873873873e-08, "loss": 0.0007, "reward": 3.383101224899292, "reward_std": 0.0536690279841423, "rewards/final_reward": 1.5824458142464324, "rewards/mask_iou_reward": 0.7912229071232162, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3831011056900024, "rewards/thk_ans_format_reward": 1.0, "step": 3259, "think_completion_length": 8.166666666666666 }, { "clip_ratio": 0.0, "completion_length": 255.6979217529297, "epoch": 11.013490725126475, "grad_norm": 7.679861782392985, "kl": 0.3681640625, "learning_rate": 8.220720720720721e-08, "loss": 0.0004, "reward": 3.794712543487549, "reward_std": 0.045380293391644955, "rewards/final_reward": 1.7493822493573408, "rewards/mask_iou_reward": 0.8746911246786704, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7947124242782593, "rewards/thk_ans_format_reward": 1.0, "step": 3260, "think_completion_length": 8.291666666666666 }, { "clip_ratio": 0.0, "completion_length": 209.08333587646484, "epoch": 11.016863406408094, "grad_norm": 5.712072509389033, "kl": 0.615234375, "learning_rate": 8.192567567567567e-08, "loss": 0.0006, "reward": 3.6091835498809814, "reward_std": 0.12349607236683369, "rewards/final_reward": 1.7861388123293191, "rewards/mask_iou_reward": 0.8930694061646596, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 1.6196001768112183, "rewards/thk_ans_format_reward": 1.0, "step": 3261, "think_completion_length": 8.375 }, { "clip_ratio": 0.0, "completion_length": 222.67708587646484, "epoch": 11.020236087689714, "grad_norm": 9.664881769651082, "kl": 0.5009765625, "learning_rate": 8.164414414414414e-08, "loss": 0.0005, "reward": 3.8026552200317383, "reward_std": 0.012866603909060359, "rewards/final_reward": 1.479182740000682, "rewards/mask_iou_reward": 0.739591370000341, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.8026551604270935, "rewards/thk_ans_format_reward": 1.0, "step": 3262, "think_completion_length": 10.125 }, { "clip_ratio": 0.0, "completion_length": 295.2395935058594, "epoch": 11.023608768971332, "grad_norm": 14.431790656745983, "kl": 0.38671875, "learning_rate": 8.136261261261262e-08, "loss": 0.0004, "reward": 3.31676983833313, "reward_std": 0.3980730175971985, "rewards/final_reward": 1.1245725328677882, "rewards/mask_iou_reward": 0.5622862664338941, "rewards/sam_format_reward": 0.9270833432674408, "rewards/sam_reward_func_ultra": 1.4626030325889587, "rewards/thk_ans_format_reward": 0.9270833432674408, "step": 3263, "think_completion_length": 8.041666666666666 }, { "clip_ratio": 0.0, "completion_length": 226.92708587646484, "epoch": 11.02698145025295, "grad_norm": 7.97145047005376, "kl": 0.45703125, "learning_rate": 8.108108108108108e-08, "loss": 0.0005, "reward": 3.676851272583008, "reward_std": 0.02242056792601943, "rewards/final_reward": 1.827183155940764, "rewards/mask_iou_reward": 0.913591577970382, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6768511533737183, "rewards/thk_ans_format_reward": 1.0, "step": 3264, "think_completion_length": 7.375 }, { "clip_ratio": 0.0, "completion_length": 205.1041717529297, "epoch": 11.03035413153457, "grad_norm": 15.361060325537107, "kl": 0.546875, "learning_rate": 8.079954954954954e-08, "loss": 0.0005, "reward": 3.680310845375061, "reward_std": 0.046101706102490425, "rewards/final_reward": 1.5770611709529159, "rewards/mask_iou_reward": 0.7885305854764579, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.680310845375061, "rewards/thk_ans_format_reward": 1.0, "step": 3265, "think_completion_length": 10.833333333333332 }, { "clip_ratio": 0.0, "completion_length": 260.39583587646484, "epoch": 11.03372681281619, "grad_norm": 7.5963750674538515, "kl": 0.4228515625, "learning_rate": 8.051801801801801e-08, "loss": 0.0004, "reward": 3.7898802757263184, "reward_std": 0.031077871099114418, "rewards/final_reward": 1.8079273683003763, "rewards/mask_iou_reward": 0.9039636841501881, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.789880096912384, "rewards/thk_ans_format_reward": 1.0, "step": 3266, "think_completion_length": 7.833333333333334 }, { "clip_ratio": 0.0, "completion_length": 196.71875, "epoch": 11.037099494097808, "grad_norm": 50.02483132654234, "kl": 0.505859375, "learning_rate": 8.023648648648649e-08, "loss": 0.0005, "reward": 3.673597812652588, "reward_std": 0.03365712705999613, "rewards/final_reward": 1.5130002342504598, "rewards/mask_iou_reward": 0.7565001171252299, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6735975742340088, "rewards/thk_ans_format_reward": 1.0, "step": 3267, "think_completion_length": 7.791666666666666 }, { "clip_ratio": 0.0, "completion_length": 359.8333435058594, "epoch": 11.040472175379426, "grad_norm": 7.948491490121137, "kl": 0.3974609375, "learning_rate": 7.995495495495496e-08, "loss": 0.0004, "reward": 3.448967218399048, "reward_std": 0.11523477360606194, "rewards/final_reward": 1.6464742945714734, "rewards/mask_iou_reward": 0.8232371472857367, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4489670991897583, "rewards/thk_ans_format_reward": 1.0, "step": 3268, "think_completion_length": 9.208333333333332 }, { "clip_ratio": 0.0, "completion_length": 168.12500762939453, "epoch": 11.043844856661046, "grad_norm": 30.844776065741474, "kl": 0.4765625, "learning_rate": 7.967342342342342e-08, "loss": 0.0005, "reward": 3.5432692766189575, "reward_std": 0.07160472683608532, "rewards/final_reward": 1.6734057054448193, "rewards/mask_iou_reward": 0.8367028527224096, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5432690382003784, "rewards/thk_ans_format_reward": 1.0, "step": 3269, "think_completion_length": 8.458333333333332 }, { "clip_ratio": 0.0, "completion_length": 213.05208587646484, "epoch": 11.047217537942664, "grad_norm": 13.632646077084331, "kl": 0.7734375, "learning_rate": 7.939189189189188e-08, "loss": 0.0008, "reward": 3.6073343753814697, "reward_std": 0.07992910593748093, "rewards/final_reward": 1.7809643798983177, "rewards/mask_iou_reward": 0.8904821899491588, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.607334315776825, "rewards/thk_ans_format_reward": 1.0, "step": 3270, "think_completion_length": 8.125 }, { "clip_ratio": 0.0, "completion_length": 170.91666793823242, "epoch": 11.050590219224283, "grad_norm": 12.258278428753142, "kl": 0.796875, "learning_rate": 7.911036036036035e-08, "loss": 0.0008, "reward": 3.5184515714645386, "reward_std": 0.047073401510715485, "rewards/final_reward": 1.6241069163450614, "rewards/mask_iou_reward": 0.8120534581725307, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5184515714645386, "rewards/thk_ans_format_reward": 1.0, "step": 3271, "think_completion_length": 8.5 }, { "clip_ratio": 0.0, "completion_length": 325.8645935058594, "epoch": 11.053962900505903, "grad_norm": 8.700745630019178, "kl": 0.380859375, "learning_rate": 7.882882882882883e-08, "loss": 0.0004, "reward": 3.5366759300231934, "reward_std": 0.08967574685811996, "rewards/final_reward": 0.9761992976696359, "rewards/mask_iou_reward": 0.48809964883481793, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.536676049232483, "rewards/thk_ans_format_reward": 1.0, "step": 3272, "think_completion_length": 9.166666666666668 }, { "clip_ratio": 0.0, "completion_length": 268.2604293823242, "epoch": 11.057335581787521, "grad_norm": 7.473400543514334, "kl": 0.51953125, "learning_rate": 7.85472972972973e-08, "loss": 0.0005, "reward": 3.5971490144729614, "reward_std": 0.069289181381464, "rewards/final_reward": 1.7362259460742102, "rewards/mask_iou_reward": 0.8681129730371051, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5971489548683167, "rewards/thk_ans_format_reward": 1.0, "step": 3273, "think_completion_length": 7.708333333333333 }, { "clip_ratio": 0.0, "completion_length": 202.84375762939453, "epoch": 11.06070826306914, "grad_norm": 6.5430731406634886, "kl": 0.4736328125, "learning_rate": 7.826576576576576e-08, "loss": 0.0005, "reward": 3.5351314544677734, "reward_std": 0.05387053173035383, "rewards/final_reward": 1.3752052736546478, "rewards/mask_iou_reward": 0.6876026368273239, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.535131573677063, "rewards/thk_ans_format_reward": 1.0, "step": 3274, "think_completion_length": 9.166666666666666 }, { "clip_ratio": 0.0, "completion_length": 215.93751525878906, "epoch": 11.064080944350758, "grad_norm": 59.65870522669481, "kl": 0.517578125, "learning_rate": 7.798423423423422e-08, "loss": 0.0005, "reward": 3.680536389350891, "reward_std": 0.06550450623035431, "rewards/final_reward": 1.6033873661332025, "rewards/mask_iou_reward": 0.8016936830666013, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6805359721183777, "rewards/thk_ans_format_reward": 1.0, "step": 3275, "think_completion_length": 9.041666666666668 }, { "clip_ratio": 0.0, "completion_length": 199.95834350585938, "epoch": 11.067453625632378, "grad_norm": 10.050849213572555, "kl": 0.4326171875, "learning_rate": 7.77027027027027e-08, "loss": 0.0004, "reward": 3.075374126434326, "reward_std": 0.06078692898154259, "rewards/final_reward": 1.414640110385101, "rewards/mask_iou_reward": 0.7073200551925505, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0753739774227142, "rewards/thk_ans_format_reward": 1.0, "step": 3276, "think_completion_length": 8.833333333333332 }, { "clip_ratio": 0.0, "completion_length": 238.78125762939453, "epoch": 11.070826306913997, "grad_norm": 11.056552105553564, "kl": 0.392578125, "learning_rate": 7.742117117117117e-08, "loss": 0.0004, "reward": 3.5583964586257935, "reward_std": 0.06431005522608757, "rewards/final_reward": 1.4149002911814856, "rewards/mask_iou_reward": 0.7074501455907428, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5583963990211487, "rewards/thk_ans_format_reward": 1.0, "step": 3277, "think_completion_length": 9.75 }, { "clip_ratio": 0.0, "completion_length": 210.53125762939453, "epoch": 11.074198988195615, "grad_norm": 8.529640320104521, "kl": 0.4326171875, "learning_rate": 7.713963963963965e-08, "loss": 0.0004, "reward": 3.541915774345398, "reward_std": 0.037605963414534926, "rewards/final_reward": 1.51284828093512, "rewards/mask_iou_reward": 0.75642414046756, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5419156551361084, "rewards/thk_ans_format_reward": 1.0, "step": 3278, "think_completion_length": 8.083333333333332 }, { "clip_ratio": 0.0, "completion_length": 190.33333587646484, "epoch": 11.077571669477235, "grad_norm": 27.685500769109286, "kl": 0.505859375, "learning_rate": 7.68581081081081e-08, "loss": 0.0005, "reward": 3.642976999282837, "reward_std": 0.029503632336854935, "rewards/final_reward": 1.4340308792184384, "rewards/mask_iou_reward": 0.7170154396092192, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6429771184921265, "rewards/thk_ans_format_reward": 1.0, "step": 3279, "think_completion_length": 8.791666666666668 }, { "clip_ratio": 0.0, "completion_length": 216.34375762939453, "epoch": 11.080944350758854, "grad_norm": 6.816602663960299, "kl": 0.3935546875, "learning_rate": 7.657657657657657e-08, "loss": 0.0004, "reward": 3.582181930541992, "reward_std": 0.05259714089334011, "rewards/final_reward": 1.7713748625896824, "rewards/mask_iou_reward": 0.8856874312948412, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5821818113327026, "rewards/thk_ans_format_reward": 1.0, "step": 3280, "think_completion_length": 7.833333333333334 }, { "clip_ratio": 0.0, "completion_length": 257.15626525878906, "epoch": 11.084317032040472, "grad_norm": 8.972118747155667, "kl": 0.6015625, "learning_rate": 7.629504504504504e-08, "loss": 0.0006, "reward": 3.2960952520370483, "reward_std": 0.03791832132264972, "rewards/final_reward": 1.9528624409568343, "rewards/mask_iou_reward": 0.9764312204784171, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2960952520370483, "rewards/thk_ans_format_reward": 1.0, "step": 3281, "think_completion_length": 7.416666666666667 }, { "clip_ratio": 0.0, "completion_length": 221.06251525878906, "epoch": 11.08768971332209, "grad_norm": 9.61726929093108, "kl": 0.55078125, "learning_rate": 7.601351351351351e-08, "loss": 0.0006, "reward": 3.6938302516937256, "reward_std": 0.028953220695257187, "rewards/final_reward": 1.4376917790190762, "rewards/mask_iou_reward": 0.7188458895095381, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6938302516937256, "rewards/thk_ans_format_reward": 1.0, "step": 3282, "think_completion_length": 7.666666666666666 }, { "clip_ratio": 0.0, "completion_length": 258.84375762939453, "epoch": 11.09106239460371, "grad_norm": 6.908669230016509, "kl": 0.392578125, "learning_rate": 7.573198198198199e-08, "loss": 0.0004, "reward": 3.4922462701797485, "reward_std": 0.07157362625002861, "rewards/final_reward": 1.9423061070144874, "rewards/mask_iou_reward": 0.9711530535072437, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4922462105751038, "rewards/thk_ans_format_reward": 1.0, "step": 3283, "think_completion_length": 7.291666666666666 }, { "clip_ratio": 0.0, "completion_length": 212.03125762939453, "epoch": 11.094435075885329, "grad_norm": 69.26979033185252, "kl": 0.474609375, "learning_rate": 7.545045045045045e-08, "loss": 0.0005, "reward": 3.3932985067367554, "reward_std": 0.014708156697452068, "rewards/final_reward": 1.5756231422508065, "rewards/mask_iou_reward": 0.7878115711254032, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3932985067367554, "rewards/thk_ans_format_reward": 1.0, "step": 3284, "think_completion_length": 8.333333333333332 }, { "clip_ratio": 0.0, "completion_length": 290.30208587646484, "epoch": 11.097807757166947, "grad_norm": 15.734960613548125, "kl": 0.4365234375, "learning_rate": 7.516891891891891e-08, "loss": 0.0005, "reward": 3.662890672683716, "reward_std": 0.012950449250638485, "rewards/final_reward": 1.5119907518770903, "rewards/mask_iou_reward": 0.7559953759385452, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6628906726837158, "rewards/thk_ans_format_reward": 1.0, "step": 3285, "think_completion_length": 7.625 }, { "clip_ratio": 0.0, "completion_length": 283.5416717529297, "epoch": 11.101180438448566, "grad_norm": 9.099008238943991, "kl": 0.3857421875, "learning_rate": 7.488738738738738e-08, "loss": 0.0004, "reward": 3.594428062438965, "reward_std": 0.06632101535797119, "rewards/final_reward": 1.7235100982074392, "rewards/mask_iou_reward": 0.8617550491037196, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5944279432296753, "rewards/thk_ans_format_reward": 1.0, "step": 3286, "think_completion_length": 9.75 }, { "clip_ratio": 0.0, "completion_length": 259.48958587646484, "epoch": 11.104553119730186, "grad_norm": 7.084162835272017, "kl": 0.59375, "learning_rate": 7.460585585585586e-08, "loss": 0.0006, "reward": 3.198632597923279, "reward_std": 0.3177741765975952, "rewards/final_reward": 1.4057838586232614, "rewards/mask_iou_reward": 0.7028919293116307, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 1.2194659411907196, "rewards/thk_ans_format_reward": 0.9895833432674408, "step": 3287, "think_completion_length": 8.75 }, { "clip_ratio": 0.0, "completion_length": 223.7604217529297, "epoch": 11.107925801011804, "grad_norm": 17.248231136349567, "kl": 0.4658203125, "learning_rate": 7.432432432432433e-08, "loss": 0.0005, "reward": 3.525164246559143, "reward_std": 0.020461719017475843, "rewards/final_reward": 1.2088079626138248, "rewards/mask_iou_reward": 0.6044039813069124, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5251641869544983, "rewards/thk_ans_format_reward": 1.0, "step": 3288, "think_completion_length": 8.041666666666666 }, { "clip_ratio": 0.0, "completion_length": 262.1666717529297, "epoch": 11.111298482293423, "grad_norm": 7.706524760091841, "kl": 0.408203125, "learning_rate": 7.404279279279278e-08, "loss": 0.0004, "reward": 3.6560596227645874, "reward_std": 0.06422694679349661, "rewards/final_reward": 1.8228721948733961, "rewards/mask_iou_reward": 0.9114360974366981, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6560596227645874, "rewards/thk_ans_format_reward": 1.0, "step": 3289, "think_completion_length": 8.208333333333334 }, { "clip_ratio": 0.0, "completion_length": 250.94792938232422, "epoch": 11.114671163575043, "grad_norm": 5.03298899344414, "kl": 0.3583984375, "learning_rate": 7.376126126126125e-08, "loss": 0.0003, "reward": 3.7929335832595825, "reward_std": 0.031620634719729424, "rewards/final_reward": 1.9703793094418138, "rewards/mask_iou_reward": 0.9851896547209069, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7929337620735168, "rewards/thk_ans_format_reward": 1.0, "step": 3290, "think_completion_length": 8.291666666666666 }, { "clip_ratio": 0.0, "completion_length": 201.90625762939453, "epoch": 11.118043844856661, "grad_norm": 16.76234833828291, "kl": 0.779296875, "learning_rate": 7.347972972972973e-08, "loss": 0.0008, "reward": 3.5754377841949463, "reward_std": 0.22860531508922577, "rewards/final_reward": 1.5805011760570107, "rewards/mask_iou_reward": 0.7902505880285053, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 1.5962709784507751, "rewards/thk_ans_format_reward": 0.9895833432674408, "step": 3291, "think_completion_length": 7.916666666666667 }, { "clip_ratio": 0.0, "completion_length": 201.40625762939453, "epoch": 11.12141652613828, "grad_norm": 8.834695585041631, "kl": 0.4853515625, "learning_rate": 7.31981981981982e-08, "loss": 0.0005, "reward": 3.5796273946762085, "reward_std": 0.08974376507103443, "rewards/final_reward": 1.8597737950833935, "rewards/mask_iou_reward": 0.9298868975416967, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5796274542808533, "rewards/thk_ans_format_reward": 1.0, "step": 3292, "think_completion_length": 9.083333333333332 }, { "clip_ratio": 0.0, "completion_length": 158.33333587646484, "epoch": 11.124789207419898, "grad_norm": 17.70647786369016, "kl": 0.6015625, "learning_rate": 7.291666666666667e-08, "loss": 0.0006, "reward": 3.784427046775818, "reward_std": 0.040433993737678975, "rewards/final_reward": 1.9819054048092806, "rewards/mask_iou_reward": 0.9909527024046403, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.784426987171173, "rewards/thk_ans_format_reward": 1.0, "step": 3293, "think_completion_length": 9.833333333333332 }, { "clip_ratio": 0.0, "completion_length": 270.9479293823242, "epoch": 11.128161888701518, "grad_norm": 7.106594817415373, "kl": 0.4462890625, "learning_rate": 7.263513513513512e-08, "loss": 0.0004, "reward": 3.288295030593872, "reward_std": 0.24038275331258774, "rewards/final_reward": 1.5676557711252865, "rewards/mask_iou_reward": 0.7838278855626433, "rewards/sam_format_reward": 0.9479166865348816, "rewards/sam_reward_func_ultra": 1.3924616575241089, "rewards/thk_ans_format_reward": 0.9479166865348816, "step": 3294, "think_completion_length": 7.875 }, { "clip_ratio": 0.0, "completion_length": 229.77084350585938, "epoch": 11.131534569983137, "grad_norm": 10.746438770092233, "kl": 0.427734375, "learning_rate": 7.23536036036036e-08, "loss": 0.0005, "reward": 3.652415633201599, "reward_std": 0.2226637750864029, "rewards/final_reward": 1.8436673039719813, "rewards/mask_iou_reward": 0.9218336519859907, "rewards/sam_format_reward": 0.9895833432674408, "rewards/sam_reward_func_ultra": 1.6732491850852966, "rewards/thk_ans_format_reward": 0.9895833432674408, "step": 3295, "think_completion_length": 7.541666666666667 }, { "clip_ratio": 0.0, "completion_length": 230.75, "epoch": 11.134907251264755, "grad_norm": 15.824855680054148, "kl": 0.3994140625, "learning_rate": 7.207207207207207e-08, "loss": 0.0004, "reward": 3.43550705909729, "reward_std": 0.08496489748358727, "rewards/final_reward": 1.5848493301955808, "rewards/mask_iou_reward": 0.7924246650977904, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4355069994926453, "rewards/thk_ans_format_reward": 1.0, "step": 3296, "think_completion_length": 8.25 }, { "clip_ratio": 0.0, "completion_length": 266.6145935058594, "epoch": 11.138279932546375, "grad_norm": 7.5556161914264806, "kl": 0.384765625, "learning_rate": 7.179054054054054e-08, "loss": 0.0003, "reward": 3.502914309501648, "reward_std": 0.07663140445947647, "rewards/final_reward": 1.8889766537903219, "rewards/mask_iou_reward": 0.9444883268951609, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5029143691062927, "rewards/thk_ans_format_reward": 1.0, "step": 3297, "think_completion_length": 7.208333333333334 }, { "clip_ratio": 0.0, "completion_length": 248.23959350585938, "epoch": 11.141652613827993, "grad_norm": 7.059335996561547, "kl": 0.4306640625, "learning_rate": 7.150900900900902e-08, "loss": 0.0004, "reward": 3.641920804977417, "reward_std": 0.04107053391635418, "rewards/final_reward": 1.7453527965394877, "rewards/mask_iou_reward": 0.8726763982697439, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6419206857681274, "rewards/thk_ans_format_reward": 1.0, "step": 3298, "think_completion_length": 7.958333333333333 }, { "clip_ratio": 0.0, "completion_length": 232.83333587646484, "epoch": 11.145025295109612, "grad_norm": 12.348962116367375, "kl": 0.40625, "learning_rate": 7.122747747747746e-08, "loss": 0.0004, "reward": 3.6770007610321045, "reward_std": 0.02028821548447013, "rewards/final_reward": 1.6679152904367895, "rewards/mask_iou_reward": 0.8339576452183948, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6770007610321045, "rewards/thk_ans_format_reward": 1.0, "step": 3299, "think_completion_length": 7.375 }, { "clip_ratio": 0.0, "completion_length": 177.86459350585938, "epoch": 11.14839797639123, "grad_norm": 6.210183344320372, "kl": 0.4912109375, "learning_rate": 7.094594594594594e-08, "loss": 0.0005, "reward": 3.6773531436920166, "reward_std": 0.029986443929374218, "rewards/final_reward": 1.5234544784468844, "rewards/mask_iou_reward": 0.7617272392234422, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6773530840873718, "rewards/thk_ans_format_reward": 1.0, "step": 3300, "think_completion_length": 7.958333333333334 } ], "logging_steps": 1.0, "max_steps": 3552, "num_input_tokens_seen": 0, "num_train_epochs": 12, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 12, "trial_name": null, "trial_params": null }