diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,23386 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.0, + "eval_steps": 100.0, + "global_step": 4166, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 324.0, + "completions/mean_length": 188.1875, + "completions/min_length": 13.0, + "epoch": 0.00048007681228996637, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.25846678018569946, + "kl": 0.0, + "learning_rate": 4.784688995215311e-08, + "loss": 7.450580596923828e-09, + "memory(GiB)": 22.98, + "reward": -0.44574999809265137, + "reward_std": 0.21814244985580444, + "rewards/MMContentORM/mean": -0.8299999833106995, + "rewards/MMContentORM/std": 0.5199999809265137, + "rewards/MMFormatORM/mean": 0.12187499552965164, + "rewards/MMFormatORM/std": 0.262023389339447, + "rewards/MMRubricORM/mean": -0.8125, + "rewards/MMRubricORM/std": 0.40311288833618164, + "step": 1, + "train_speed(iter/s)": 0.047076 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 365.0, + "completions/mean_length": 166.34375, + "completions/min_length": 10.25, + "epoch": 0.002400384061449832, + "frac_reward_zero_std": 0.65625, + "grad_norm": 0.5505234599113464, + "kl": 0.0014767646789550781, + "learning_rate": 2.3923444976076555e-07, + "loss": 5.9054447774542496e-05, + "memory(GiB)": 23.69, + "reward": -0.40181251987814903, + "reward_std": 0.2141649704426527, + "rewards/MMContentORM/mean": -0.7381249889731407, + "rewards/MMContentORM/std": 0.5696750730276108, + "rewards/MMFormatORM/mean": 0.1320312451571226, + "rewards/MMFormatORM/std": 0.25288669392466545, + "rewards/MMRubricORM/mean": -0.796875, + "rewards/MMRubricORM/std": 0.3890564441680908, + "step": 5, + "train_speed(iter/s)": 0.069193 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0125, + "completions/max_length": 496.2, + "completions/mean_length": 196.9625, + "completions/min_length": 12.6, + "epoch": 0.004800768122899664, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.19138023257255554, + "kl": 0.003989028930664063, + "learning_rate": 4.784688995215311e-07, + "loss": 0.0001598534407094121, + "memory(GiB)": 23.69, + "reward": -0.35715001821517944, + "reward_std": 0.25731615722179413, + "rewards/MMContentORM/mean": -0.6659999847412109, + "rewards/MMContentORM/std": 0.6672868490219116, + "rewards/MMFormatORM/mean": 0.15437499880790712, + "rewards/MMFormatORM/std": 0.2739557534456253, + "rewards/MMRubricORM/mean": -0.7625, + "rewards/MMRubricORM/std": 0.42147040367126465, + "step": 10, + "train_speed(iter/s)": 0.066915 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 336.4, + "completions/mean_length": 175.4625, + "completions/min_length": 10.6, + "epoch": 0.007201152184349496, + "frac_reward_zero_std": 0.675, + "grad_norm": 0.1383572220802307, + "kl": 0.0017303466796875, + "learning_rate": 7.177033492822967e-07, + "loss": 6.930254749022424e-05, + "memory(GiB)": 23.69, + "reward": -0.43625002503395083, + "reward_std": 0.23157747238874435, + "rewards/MMContentORM/mean": -0.7774999976158142, + "rewards/MMContentORM/std": 0.5322124093770981, + "rewards/MMFormatORM/mean": 0.1056249976158142, + "rewards/MMFormatORM/std": 0.23594435155391694, + "rewards/MMRubricORM/mean": -0.8375, + "rewards/MMRubricORM/std": 0.36299130916595457, + "step": 15, + "train_speed(iter/s)": 0.072542 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 315.0, + "completions/mean_length": 180.6, + "completions/min_length": 11.2, + "epoch": 0.009601536245799328, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.6662951707839966, + "kl": 0.0012393951416015624, + "learning_rate": 9.569377990430622e-07, + "loss": 4.959976649843156e-05, + "memory(GiB)": 23.69, + "reward": -0.40305001139640806, + "reward_std": 0.25151788890361787, + "rewards/MMContentORM/mean": -0.7519999861717224, + "rewards/MMContentORM/std": 0.5646255791187287, + "rewards/MMFormatORM/mean": 0.13812499791383742, + "rewards/MMFormatORM/std": 0.2656771123409271, + "rewards/MMRubricORM/mean": -0.7875, + "rewards/MMRubricORM/std": 0.4087340235710144, + "step": 20, + "train_speed(iter/s)": 0.075755 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 308.6, + "completions/mean_length": 165.925, + "completions/min_length": 10.6, + "epoch": 0.01200192030724916, + "frac_reward_zero_std": 0.575, + "grad_norm": 1.417803406715393, + "kl": 0.00196990966796875, + "learning_rate": 1.196172248803828e-06, + "loss": 7.890671258792281e-05, + "memory(GiB)": 24.1, + "reward": -0.3179000109434128, + "reward_std": 0.2674277901649475, + "rewards/MMContentORM/mean": -0.6160000085830688, + "rewards/MMContentORM/std": 0.691703325510025, + "rewards/MMFormatORM/mean": 0.17749999463558197, + "rewards/MMFormatORM/std": 0.28609572947025297, + "rewards/MMRubricORM/mean": -0.7125, + "rewards/MMRubricORM/std": 0.45950802564620974, + "step": 25, + "train_speed(iter/s)": 0.077874 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 372.0, + "completions/mean_length": 176.1375, + "completions/min_length": 10.0, + "epoch": 0.014402304368698993, + "frac_reward_zero_std": 0.55, + "grad_norm": 0.6226190328598022, + "kl": 0.001837921142578125, + "learning_rate": 1.4354066985645934e-06, + "loss": 7.35294190235436e-05, + "memory(GiB)": 24.1, + "reward": -0.3812000215053558, + "reward_std": 0.28241844177246095, + "rewards/MMContentORM/mean": -0.7405000030994415, + "rewards/MMContentORM/std": 0.5169497162103653, + "rewards/MMFormatORM/mean": 0.16249999552965164, + "rewards/MMFormatORM/std": 0.28217866122722624, + "rewards/MMRubricORM/mean": -0.75, + "rewards/MMRubricORM/std": 0.43412102460861207, + "step": 30, + "train_speed(iter/s)": 0.07745 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 318.6, + "completions/mean_length": 159.9625, + "completions/min_length": 12.2, + "epoch": 0.016802688430148822, + "frac_reward_zero_std": 0.65, + "grad_norm": 0.14647576212882996, + "kl": 0.0017627716064453126, + "learning_rate": 1.6746411483253591e-06, + "loss": 7.049270207062363e-05, + "memory(GiB)": 24.1, + "reward": -0.41890002489089967, + "reward_std": 0.22358716428279876, + "rewards/MMContentORM/mean": -0.7735000014305115, + "rewards/MMContentORM/std": 0.48501716256141664, + "rewards/MMFormatORM/mean": 0.12624999843537807, + "rewards/MMFormatORM/std": 0.23877365738153458, + "rewards/MMRubricORM/mean": -0.8, + "rewards/MMRubricORM/std": 0.3904210150241852, + "step": 35, + "train_speed(iter/s)": 0.078437 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 307.8, + "completions/mean_length": 173.625, + "completions/min_length": 13.2, + "epoch": 0.019203072491598656, + "frac_reward_zero_std": 0.55, + "grad_norm": 0.4236033260822296, + "kl": 0.00177764892578125, + "learning_rate": 1.9138755980861244e-06, + "loss": 7.109665311872959e-05, + "memory(GiB)": 24.1, + "reward": -0.3528000235557556, + "reward_std": 0.28255987763404844, + "rewards/MMContentORM/mean": -0.6695000052452087, + "rewards/MMContentORM/std": 0.6945539474487304, + "rewards/MMFormatORM/mean": 0.16249999552965164, + "rewards/MMFormatORM/std": 0.28217866122722624, + "rewards/MMRubricORM/mean": -0.75, + "rewards/MMRubricORM/std": 0.43412102460861207, + "step": 40, + "train_speed(iter/s)": 0.079708 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 348.8, + "completions/mean_length": 168.5875, + "completions/min_length": 11.4, + "epoch": 0.02160345655304849, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.08703174442052841, + "kl": 0.00247039794921875, + "learning_rate": 2.15311004784689e-06, + "loss": 9.898855350911617e-05, + "memory(GiB)": 24.1, + "reward": -0.44075002074241637, + "reward_std": 0.22521351724863053, + "rewards/MMContentORM/mean": -0.8174999952316284, + "rewards/MMContentORM/std": 0.4400706171989441, + "rewards/MMFormatORM/mean": 0.12187499701976776, + "rewards/MMFormatORM/std": 0.25194679796695707, + "rewards/MMRubricORM/mean": -0.8125, + "rewards/MMRubricORM/std": 0.3876104474067688, + "step": 45, + "train_speed(iter/s)": 0.080264 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 326.2, + "completions/mean_length": 170.0875, + "completions/min_length": 11.6, + "epoch": 0.02400384061449832, + "frac_reward_zero_std": 0.65, + "grad_norm": 0.3127768933773041, + "kl": 0.0022594451904296873, + "learning_rate": 2.392344497607656e-06, + "loss": 9.052451932802796e-05, + "memory(GiB)": 24.1, + "reward": -0.4175000250339508, + "reward_std": 0.20364675521850586, + "rewards/MMContentORM/mean": -0.7824999928474426, + "rewards/MMContentORM/std": 0.4635924696922302, + "rewards/MMFormatORM/mean": 0.132499997317791, + "rewards/MMFormatORM/std": 0.26173200011253356, + "rewards/MMRubricORM/mean": -0.7875, + "rewards/MMRubricORM/std": 0.4147436022758484, + "step": 50, + "train_speed(iter/s)": 0.080866 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 401.2, + "completions/mean_length": 197.925, + "completions/min_length": 14.6, + "epoch": 0.026404224675948152, + "frac_reward_zero_std": 0.55, + "grad_norm": 0.17192748188972473, + "kl": 0.0035968780517578124, + "learning_rate": 2.631578947368421e-06, + "loss": 0.0001437270431779325, + "memory(GiB)": 24.55, + "reward": -0.3665000259876251, + "reward_std": 0.28623682260513306, + "rewards/MMContentORM/mean": -0.6874999880790711, + "rewards/MMContentORM/std": 0.6308155179023742, + "rewards/MMFormatORM/mean": 0.15249999463558198, + "rewards/MMFormatORM/std": 0.27434429824352263, + "rewards/MMRubricORM/mean": -0.7625, + "rewards/MMRubricORM/std": 0.4253008782863617, + "step": 55, + "train_speed(iter/s)": 0.080304 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0125, + "completions/max_length": 543.4, + "completions/mean_length": 194.5625, + "completions/min_length": 11.6, + "epoch": 0.028804608737397985, + "frac_reward_zero_std": 0.7, + "grad_norm": 0.14923764765262604, + "kl": 0.004736709594726563, + "learning_rate": 2.870813397129187e-06, + "loss": 0.00018961232854053378, + "memory(GiB)": 24.55, + "reward": -0.48050001859664915, + "reward_std": 0.1689985252916813, + "rewards/MMContentORM/mean": -0.8699999928474427, + "rewards/MMContentORM/std": 0.3361044704914093, + "rewards/MMFormatORM/mean": 0.09374999701976776, + "rewards/MMFormatORM/std": 0.22575461566448213, + "rewards/MMRubricORM/mean": -0.85, + "rewards/MMRubricORM/std": 0.3601807415485382, + "step": 60, + "train_speed(iter/s)": 0.077823 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 282.4, + "completions/mean_length": 165.6375, + "completions/min_length": 12.2, + "epoch": 0.031204992798847815, + "frac_reward_zero_std": 0.6, + "grad_norm": 0.9673174023628235, + "kl": 0.014190292358398438, + "learning_rate": 3.1100478468899525e-06, + "loss": 0.000567801995202899, + "memory(GiB)": 24.55, + "reward": -0.3531000196933746, + "reward_std": 0.21906168013811111, + "rewards/MMContentORM/mean": -0.6989999890327454, + "rewards/MMContentORM/std": 0.570724368095398, + "rewards/MMFormatORM/mean": 0.17874999791383744, + "rewards/MMFormatORM/std": 0.29591297507286074, + "rewards/MMRubricORM/mean": -0.725, + "rewards/MMRubricORM/std": 0.45525074005126953, + "step": 65, + "train_speed(iter/s)": 0.078973 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 356.2, + "completions/mean_length": 165.6125, + "completions/min_length": 13.6, + "epoch": 0.033605376860297645, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.19349712133407593, + "kl": 0.01680755615234375, + "learning_rate": 3.3492822966507182e-06, + "loss": 0.0006725039333105087, + "memory(GiB)": 24.55, + "reward": -0.41095001697540284, + "reward_std": 0.22394072413444518, + "rewards/MMContentORM/mean": -0.7554999947547912, + "rewards/MMContentORM/std": 0.5833674430847168, + "rewards/MMFormatORM/mean": 0.12812499552965165, + "rewards/MMFormatORM/std": 0.25294241309165955, + "rewards/MMRubricORM/mean": -0.8, + "rewards/MMRubricORM/std": 0.39294117093086245, + "step": 70, + "train_speed(iter/s)": 0.079238 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0125, + "completions/max_length": 529.2, + "completions/mean_length": 185.7875, + "completions/min_length": 11.2, + "epoch": 0.03600576092174748, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.9495518803596497, + "kl": 0.05297927856445313, + "learning_rate": 3.5885167464114835e-06, + "loss": 0.0021199073642492296, + "memory(GiB)": 24.55, + "reward": -0.3796000242233276, + "reward_std": 0.2705390602350235, + "rewards/MMContentORM/mean": -0.6914999961853028, + "rewards/MMContentORM/std": 0.5877212882041931, + "rewards/MMFormatORM/mean": 0.13624999821186065, + "rewards/MMFormatORM/std": 0.2286323994398117, + "rewards/MMRubricORM/mean": -0.7875, + "rewards/MMRubricORM/std": 0.3555411517620087, + "step": 75, + "train_speed(iter/s)": 0.077333 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 309.4, + "completions/mean_length": 157.9625, + "completions/min_length": 10.0, + "epoch": 0.03840614498319731, + "frac_reward_zero_std": 0.6, + "grad_norm": 0.26823142170906067, + "kl": 0.060577392578125, + "learning_rate": 3.827751196172249e-06, + "loss": 0.002423027902841568, + "memory(GiB)": 24.55, + "reward": -0.387850022315979, + "reward_std": 0.27301393151283265, + "rewards/MMContentORM/mean": -0.706499969959259, + "rewards/MMContentORM/std": 0.6382155597209931, + "rewards/MMFormatORM/mean": 0.13062499612569808, + "rewards/MMFormatORM/std": 0.25994100272655485, + "rewards/MMRubricORM/mean": -0.7875, + "rewards/MMRubricORM/std": 0.41726375818252565, + "step": 80, + "train_speed(iter/s)": 0.077772 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 343.2, + "completions/mean_length": 165.1375, + "completions/min_length": 11.2, + "epoch": 0.04080652904464714, + "frac_reward_zero_std": 0.575, + "grad_norm": 1.0264372825622559, + "kl": 0.06542510986328125, + "learning_rate": 4.066985645933015e-06, + "loss": 0.002620968222618103, + "memory(GiB)": 24.55, + "reward": -0.3707500219345093, + "reward_std": 0.27471098899841306, + "rewards/MMContentORM/mean": -0.6924999833106995, + "rewards/MMContentORM/std": 0.6197769522666932, + "rewards/MMFormatORM/mean": 0.14687499552965164, + "rewards/MMFormatORM/std": 0.2623553693294525, + "rewards/MMRubricORM/mean": -0.7625, + "rewards/MMRubricORM/std": 0.41832817196846006, + "step": 85, + "train_speed(iter/s)": 0.078137 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 341.4, + "completions/mean_length": 179.8625, + "completions/min_length": 11.8, + "epoch": 0.04320691310609698, + "frac_reward_zero_std": 0.65, + "grad_norm": 0.17547202110290527, + "kl": 0.02538909912109375, + "learning_rate": 4.30622009569378e-06, + "loss": 0.0010150117799639703, + "memory(GiB)": 24.55, + "reward": -0.4253000199794769, + "reward_std": 0.2470631130039692, + "rewards/MMContentORM/mean": -0.7644999980926513, + "rewards/MMContentORM/std": 0.5263380289077759, + "rewards/MMFormatORM/mean": 0.11374999582767487, + "rewards/MMFormatORM/std": 0.24558367133140563, + "rewards/MMRubricORM/mean": -0.825, + "rewards/MMRubricORM/std": 0.3778210341930389, + "step": 90, + "train_speed(iter/s)": 0.078557 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 353.8, + "completions/mean_length": 198.0625, + "completions/min_length": 10.0, + "epoch": 0.04560729716754681, + "frac_reward_zero_std": 0.525, + "grad_norm": 0.3727310597896576, + "kl": 0.06304931640625, + "learning_rate": 4.5454545454545455e-06, + "loss": 0.0025252360850572587, + "memory(GiB)": 24.55, + "reward": -0.35370001196861267, + "reward_std": 0.32187501192092893, + "rewards/MMContentORM/mean": -0.6679999887943268, + "rewards/MMContentORM/std": 0.599762350320816, + "rewards/MMFormatORM/mean": 0.15874999314546584, + "rewards/MMFormatORM/std": 0.2781087428331375, + "rewards/MMRubricORM/mean": -0.75, + "rewards/MMRubricORM/std": 0.43412102460861207, + "step": 95, + "train_speed(iter/s)": 0.078826 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 343.2, + "completions/mean_length": 178.225, + "completions/min_length": 10.6, + "epoch": 0.04800768122899664, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.5053905248641968, + "kl": 0.058896636962890624, + "learning_rate": 4.784688995215312e-06, + "loss": 0.0023545216768980025, + "memory(GiB)": 24.55, + "reward": -0.366600027680397, + "reward_std": 0.25964961051940916, + "rewards/MMContentORM/mean": -0.6715000033378601, + "rewards/MMContentORM/std": 0.6754477977752685, + "rewards/MMFormatORM/mean": 0.1424999937415123, + "rewards/MMFormatORM/std": 0.2643744289875031, + "rewards/MMRubricORM/mean": -0.775, + "rewards/MMRubricORM/std": 0.4129913091659546, + "step": 100, + "train_speed(iter/s)": 0.079147 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 329.6, + "completions/mean_length": 183.025, + "completions/min_length": 38.8, + "epoch": 0.050408065290446474, + "frac_reward_zero_std": 0.675, + "grad_norm": 0.7233589887619019, + "kl": 0.03177032470703125, + "learning_rate": 5.023923444976077e-06, + "loss": 0.0012724055908620358, + "memory(GiB)": 24.55, + "reward": -0.4313500225543976, + "reward_std": 0.21149563789367676, + "rewards/MMContentORM/mean": -0.793999993801117, + "rewards/MMContentORM/std": 0.48824622631073, + "rewards/MMFormatORM/mean": 0.12187499701976776, + "rewards/MMFormatORM/std": 0.255849027633667, + "rewards/MMRubricORM/mean": -0.8125, + "rewards/MMRubricORM/std": 0.39361388683319093, + "step": 105, + "train_speed(iter/s)": 0.078492 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 350.4, + "completions/mean_length": 205.525, + "completions/min_length": 40.4, + "epoch": 0.052808449351896304, + "frac_reward_zero_std": 0.575, + "grad_norm": 0.656991720199585, + "kl": 0.00919189453125, + "learning_rate": 5.263157894736842e-06, + "loss": 0.00036728212144225835, + "memory(GiB)": 24.55, + "reward": -0.3993500292301178, + "reward_std": 0.2595789015293121, + "rewards/MMContentORM/mean": -0.7514999866485595, + "rewards/MMContentORM/std": 0.5262986779212951, + "rewards/MMFormatORM/mean": 0.14062499403953552, + "rewards/MMFormatORM/std": 0.26713907420635224, + "rewards/MMRubricORM/mean": -0.775, + "rewards/MMRubricORM/std": 0.4210435926914215, + "step": 110, + "train_speed(iter/s)": 0.078426 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 378.8, + "completions/mean_length": 198.6875, + "completions/min_length": 40.8, + "epoch": 0.055208833413346134, + "frac_reward_zero_std": 0.7, + "grad_norm": 0.16790097951889038, + "kl": 0.0282012939453125, + "learning_rate": 5.502392344497608e-06, + "loss": 0.0011287719011306764, + "memory(GiB)": 24.55, + "reward": -0.36190002262592313, + "reward_std": 0.12529932260513305, + "rewards/MMContentORM/mean": -0.7134999990463257, + "rewards/MMContentORM/std": 0.5213123708963394, + "rewards/MMFormatORM/mean": 0.17124999314546585, + "rewards/MMFormatORM/std": 0.2684005439281464, + "rewards/MMRubricORM/mean": -0.725, + "rewards/MMRubricORM/std": 0.42883480787277223, + "step": 115, + "train_speed(iter/s)": 0.078525 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 300.8, + "completions/mean_length": 191.825, + "completions/min_length": 18.8, + "epoch": 0.05760921747479597, + "frac_reward_zero_std": 0.425, + "grad_norm": 0.7576951384544373, + "kl": 0.027909088134765624, + "learning_rate": 5.741626794258374e-06, + "loss": 0.001117511186748743, + "memory(GiB)": 24.55, + "reward": -0.26640002727508544, + "reward_std": 0.39357563853263855, + "rewards/MMContentORM/mean": -0.5684999823570251, + "rewards/MMContentORM/std": 0.7017473936080932, + "rewards/MMFormatORM/mean": 0.22749999165534973, + "rewards/MMFormatORM/std": 0.3158136546611786, + "rewards/MMRubricORM/mean": -0.65, + "rewards/MMRubricORM/std": 0.4858671844005585, + "step": 120, + "train_speed(iter/s)": 0.079177 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 331.4, + "completions/mean_length": 199.3375, + "completions/min_length": 21.6, + "epoch": 0.0600096015362458, + "frac_reward_zero_std": 0.65, + "grad_norm": 0.35645198822021484, + "kl": 0.01103973388671875, + "learning_rate": 5.98086124401914e-06, + "loss": 0.0004417818039655685, + "memory(GiB)": 24.59, + "reward": -0.3517500251531601, + "reward_std": 0.2542048916220665, + "rewards/MMContentORM/mean": -0.6524999976158142, + "rewards/MMContentORM/std": 0.6953619718551636, + "rewards/MMFormatORM/mean": 0.15437499582767486, + "rewards/MMFormatORM/std": 0.25867260694503785, + "rewards/MMRubricORM/mean": -0.7625, + "rewards/MMRubricORM/std": 0.397957855463028, + "step": 125, + "train_speed(iter/s)": 0.079515 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0125, + "completions/max_length": 460.8, + "completions/mean_length": 206.5125, + "completions/min_length": 23.6, + "epoch": 0.06240998559769563, + "frac_reward_zero_std": 0.65, + "grad_norm": 0.7836592197418213, + "kl": 0.046865081787109374, + "learning_rate": 6.220095693779905e-06, + "loss": 0.0018781695514917373, + "memory(GiB)": 24.59, + "reward": -0.4076500177383423, + "reward_std": 0.17062486261129378, + "rewards/MMContentORM/mean": -0.7634999871253967, + "rewards/MMContentORM/std": 0.5306057691574096, + "rewards/MMFormatORM/mean": 0.1381249964237213, + "rewards/MMFormatORM/std": 0.26958334147930146, + "rewards/MMRubricORM/mean": -0.7875, + "rewards/MMRubricORM/std": 0.4147436022758484, + "step": 130, + "train_speed(iter/s)": 0.079017 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0125, + "completions/max_length": 485.4, + "completions/mean_length": 211.1625, + "completions/min_length": 60.6, + "epoch": 0.06481036965914547, + "frac_reward_zero_std": 0.425, + "grad_norm": 0.19905580580234528, + "kl": 0.051471710205078125, + "learning_rate": 6.459330143540671e-06, + "loss": 0.0020540472120046615, + "memory(GiB)": 24.59, + "reward": -0.2741000235080719, + "reward_std": 0.37745361328125, + "rewards/MMContentORM/mean": -0.5589999914169311, + "rewards/MMContentORM/std": 0.7385274767875671, + "rewards/MMFormatORM/mean": 0.2112499952316284, + "rewards/MMFormatORM/std": 0.3126032888889313, + "rewards/MMRubricORM/mean": -0.675, + "rewards/MMRubricORM/std": 0.4809281527996063, + "step": 135, + "train_speed(iter/s)": 0.078216 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0125, + "completions/max_length": 450.8, + "completions/mean_length": 202.375, + "completions/min_length": 13.4, + "epoch": 0.06721075372059529, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.833666205406189, + "kl": 0.051274871826171874, + "learning_rate": 6.6985645933014365e-06, + "loss": 0.002046111598610878, + "memory(GiB)": 24.59, + "reward": -0.2925000175833702, + "reward_std": 0.2917522594332695, + "rewards/MMContentORM/mean": -0.5849999874830246, + "rewards/MMContentORM/std": 0.7148973345756531, + "rewards/MMFormatORM/mean": 0.19749999642372132, + "rewards/MMFormatORM/std": 0.2828102707862854, + "rewards/MMRubricORM/mean": -0.6875, + "rewards/MMRubricORM/std": 0.44643059372901917, + "step": 140, + "train_speed(iter/s)": 0.077881 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 320.0, + "completions/mean_length": 165.1875, + "completions/min_length": 37.8, + "epoch": 0.06961113778204513, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.9144070148468018, + "kl": 0.05808563232421875, + "learning_rate": 6.937799043062201e-06, + "loss": 0.0023345451802015303, + "memory(GiB)": 24.59, + "reward": -0.39740002155303955, + "reward_std": 0.22047589719295502, + "rewards/MMContentORM/mean": -0.7284999907016754, + "rewards/MMContentORM/std": 0.6163743019104004, + "rewards/MMFormatORM/mean": 0.1287499949336052, + "rewards/MMFormatORM/std": 0.24967178106307983, + "rewards/MMRubricORM/mean": -0.7875, + "rewards/MMRubricORM/std": 0.40525074005126954, + "step": 145, + "train_speed(iter/s)": 0.078142 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 320.0, + "completions/mean_length": 189.1125, + "completions/min_length": 38.2, + "epoch": 0.07201152184349496, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.23582716286182404, + "kl": 0.036444091796875, + "learning_rate": 7.177033492822967e-06, + "loss": 0.001458549778908491, + "memory(GiB)": 24.59, + "reward": -0.29195002317428587, + "reward_std": 0.23907281160354615, + "rewards/MMContentORM/mean": -0.6305000007152557, + "rewards/MMContentORM/std": 0.6437041282653808, + "rewards/MMFormatORM/mean": 0.22562499046325685, + "rewards/MMFormatORM/std": 0.3097758531570435, + "rewards/MMRubricORM/mean": -0.65, + "rewards/MMRubricORM/std": 0.48037723302841184, + "step": 150, + "train_speed(iter/s)": 0.078351 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.025, + "completions/max_length": 485.6, + "completions/mean_length": 236.125, + "completions/min_length": 72.2, + "epoch": 0.07441190590494479, + "frac_reward_zero_std": 0.45, + "grad_norm": 0.1894054263830185, + "kl": 0.00907440185546875, + "learning_rate": 7.416267942583732e-06, + "loss": 0.00036348355934023857, + "memory(GiB)": 24.59, + "reward": -0.29550001621246336, + "reward_std": 0.35963451862335205, + "rewards/MMContentORM/mean": -0.6124999940395355, + "rewards/MMContentORM/std": 0.6703195393085479, + "rewards/MMFormatORM/mean": 0.21124999225139618, + "rewards/MMFormatORM/std": 0.3112755298614502, + "rewards/MMRubricORM/mean": -0.675, + "rewards/MMRubricORM/std": 0.47888544797897337, + "step": 155, + "train_speed(iter/s)": 0.077809 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 330.2, + "completions/mean_length": 194.05, + "completions/min_length": 17.0, + "epoch": 0.07681228996639462, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.16335317492485046, + "kl": 0.023895263671875, + "learning_rate": 7.655502392344498e-06, + "loss": 0.0009563345462083817, + "memory(GiB)": 24.59, + "reward": -0.24430001378059388, + "reward_std": 0.26785205602645873, + "rewards/MMContentORM/mean": -0.56700000166893, + "rewards/MMContentORM/std": 0.6720305800437927, + "rewards/MMFormatORM/mean": 0.2562499925494194, + "rewards/MMFormatORM/std": 0.31278570294380187, + "rewards/MMRubricORM/mean": -0.6, + "rewards/MMRubricORM/std": 0.486371648311615, + "step": 160, + "train_speed(iter/s)": 0.078087 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 455.0, + "completions/mean_length": 211.7875, + "completions/min_length": 60.6, + "epoch": 0.07921267402784446, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.5669417381286621, + "kl": 0.015301513671875, + "learning_rate": 7.894736842105265e-06, + "loss": 0.0006132687442004681, + "memory(GiB)": 24.59, + "reward": -0.22980001866817473, + "reward_std": 0.2708218902349472, + "rewards/MMContentORM/mean": -0.584499990940094, + "rewards/MMContentORM/std": 0.6051075398921967, + "rewards/MMFormatORM/mean": 0.2849999874830246, + "rewards/MMFormatORM/std": 0.32614828944206237, + "rewards/MMRubricORM/mean": -0.55, + "rewards/MMRubricORM/std": 0.5098386645317078, + "step": 165, + "train_speed(iter/s)": 0.077789 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 350.8, + "completions/mean_length": 204.05, + "completions/min_length": 56.0, + "epoch": 0.08161305808929428, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.23664671182632446, + "kl": 0.054852294921875, + "learning_rate": 8.13397129186603e-06, + "loss": 0.002194448187947273, + "memory(GiB)": 24.59, + "reward": -0.17945002168416976, + "reward_std": 0.2834791004657745, + "rewards/MMContentORM/mean": -0.48049999326467513, + "rewards/MMContentORM/std": 0.6853928565979004, + "rewards/MMFormatORM/mean": 0.3006249964237213, + "rewards/MMFormatORM/std": 0.3232024133205414, + "rewards/MMRubricORM/mean": -0.5375, + "rewards/MMRubricORM/std": 0.49723449945449827, + "step": 170, + "train_speed(iter/s)": 0.077962 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 313.4, + "completions/mean_length": 180.9375, + "completions/min_length": 19.4, + "epoch": 0.08401344215074412, + "frac_reward_zero_std": 0.475, + "grad_norm": 0.41499343514442444, + "kl": 0.0380462646484375, + "learning_rate": 8.373205741626795e-06, + "loss": 0.0015261590480804444, + "memory(GiB)": 24.67, + "reward": -0.2555500268936157, + "reward_std": 0.271882563829422, + "rewards/MMContentORM/mean": -0.5519999861717224, + "rewards/MMContentORM/std": 0.7190791845321656, + "rewards/MMFormatORM/mean": 0.2318749874830246, + "rewards/MMFormatORM/std": 0.31184685230255127, + "rewards/MMRubricORM/mean": -0.6375, + "rewards/MMRubricORM/std": 0.48463451862335205, + "step": 175, + "train_speed(iter/s)": 0.078217 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 351.8, + "completions/mean_length": 192.8875, + "completions/min_length": 40.8, + "epoch": 0.08641382621219396, + "frac_reward_zero_std": 0.3, + "grad_norm": 0.4990822970867157, + "kl": 0.023101806640625, + "learning_rate": 8.61244019138756e-06, + "loss": 0.000923317763954401, + "memory(GiB)": 24.67, + "reward": -0.20000003054738044, + "reward_std": 0.35242201685905455, + "rewards/MMContentORM/mean": -0.5174999952316284, + "rewards/MMContentORM/std": 0.6820277512073517, + "rewards/MMFormatORM/mean": 0.29249999225139617, + "rewards/MMFormatORM/std": 0.32704830169677734, + "rewards/MMRubricORM/mean": -0.55, + "rewards/MMRubricORM/std": 0.5031512618064881, + "step": 180, + "train_speed(iter/s)": 0.078339 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 338.8, + "completions/mean_length": 203.6, + "completions/min_length": 40.0, + "epoch": 0.08881421027364378, + "frac_reward_zero_std": 0.225, + "grad_norm": 0.43722620606422424, + "kl": 0.02008056640625, + "learning_rate": 8.851674641148326e-06, + "loss": 0.0008031532168388366, + "memory(GiB)": 24.67, + "reward": -0.12450002208352089, + "reward_std": 0.3882016271352768, + "rewards/MMContentORM/mean": -0.41499999687075617, + "rewards/MMContentORM/std": 0.637773585319519, + "rewards/MMFormatORM/mean": 0.3412499904632568, + "rewards/MMFormatORM/std": 0.3158136546611786, + "rewards/MMRubricORM/mean": -0.475, + "rewards/MMRubricORM/std": 0.4858671844005585, + "step": 185, + "train_speed(iter/s)": 0.078522 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 312.6, + "completions/mean_length": 192.125, + "completions/min_length": 35.8, + "epoch": 0.09121459433509362, + "frac_reward_zero_std": 0.4, + "grad_norm": 0.2395554482936859, + "kl": 0.04338836669921875, + "learning_rate": 9.090909090909091e-06, + "loss": 0.0017313847318291664, + "memory(GiB)": 24.67, + "reward": -0.1971000224351883, + "reward_std": 0.31183409988880156, + "rewards/MMContentORM/mean": -0.5064999997615814, + "rewards/MMContentORM/std": 0.7074662327766419, + "rewards/MMFormatORM/mean": 0.2887499928474426, + "rewards/MMFormatORM/std": 0.32622864842414856, + "rewards/MMRubricORM/mean": -0.55, + "rewards/MMRubricORM/std": 0.5080508947372436, + "step": 190, + "train_speed(iter/s)": 0.078772 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 322.8, + "completions/mean_length": 195.0, + "completions/min_length": 90.4, + "epoch": 0.09361497839654345, + "frac_reward_zero_std": 0.2, + "grad_norm": 0.25950706005096436, + "kl": 0.0434844970703125, + "learning_rate": 9.330143540669856e-06, + "loss": 0.001740964502096176, + "memory(GiB)": 24.67, + "reward": -0.04445001631975174, + "reward_std": 0.4817518353462219, + "rewards/MMContentORM/mean": -0.27049999833106997, + "rewards/MMContentORM/std": 0.8492022752761841, + "rewards/MMFormatORM/mean": 0.3718749940395355, + "rewards/MMFormatORM/std": 0.3288069784641266, + "rewards/MMRubricORM/mean": -0.425, + "rewards/MMRubricORM/std": 0.5082185864448547, + "step": 195, + "train_speed(iter/s)": 0.079029 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 307.0, + "completions/mean_length": 196.225, + "completions/min_length": 99.2, + "epoch": 0.09601536245799328, + "frac_reward_zero_std": 0.3, + "grad_norm": 0.26332148909568787, + "kl": 0.057666015625, + "learning_rate": 9.569377990430623e-06, + "loss": 0.002305997908115387, + "memory(GiB)": 24.67, + "reward": -0.13745001405477525, + "reward_std": 0.4148595631122589, + "rewards/MMContentORM/mean": -0.38799999952316283, + "rewards/MMContentORM/std": 0.7763695597648621, + "rewards/MMFormatORM/mean": 0.30687499344348906, + "rewards/MMFormatORM/std": 0.320049911737442, + "rewards/MMRubricORM/mean": -0.525, + "rewards/MMRubricORM/std": 0.49438175559043884, + "step": 200, + "train_speed(iter/s)": 0.079255 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0125, + "completions/max_length": 436.8, + "completions/mean_length": 201.1375, + "completions/min_length": 57.4, + "epoch": 0.09841574651944311, + "frac_reward_zero_std": 0.3, + "grad_norm": 0.20934279263019562, + "kl": 0.0318572998046875, + "learning_rate": 9.808612440191389e-06, + "loss": 0.0012754278257489204, + "memory(GiB)": 25.29, + "reward": -0.09840002059936523, + "reward_std": 0.3428053617477417, + "rewards/MMContentORM/mean": -0.3834999889135361, + "rewards/MMContentORM/std": 0.7341944694519043, + "rewards/MMFormatORM/mean": 0.3562499850988388, + "rewards/MMFormatORM/std": 0.3010324537754059, + "rewards/MMRubricORM/mean": -0.4375, + "rewards/MMRubricORM/std": 0.4770470380783081, + "step": 205, + "train_speed(iter/s)": 0.078583 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 266.6, + "completions/mean_length": 185.1375, + "completions/min_length": 86.8, + "epoch": 0.10081613058089295, + "frac_reward_zero_std": 0.225, + "grad_norm": 0.22751331329345703, + "kl": 0.0099822998046875, + "learning_rate": 9.99999842417629e-06, + "loss": 0.00039928192272782327, + "memory(GiB)": 25.29, + "reward": -0.013300008326768874, + "reward_std": 0.367412693798542, + "rewards/MMContentORM/mean": -0.2519999980926514, + "rewards/MMContentORM/std": 0.6957788646221161, + "rewards/MMFormatORM/mean": 0.40624999105930326, + "rewards/MMFormatORM/std": 0.29864728450775146, + "rewards/MMRubricORM/mean": -0.375, + "rewards/MMRubricORM/std": 0.45945738554000853, + "step": 210, + "train_speed(iter/s)": 0.078979 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 372.0, + "completions/mean_length": 203.25, + "completions/min_length": 119.4, + "epoch": 0.10321651464234277, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.2543454170227051, + "kl": 0.01162109375, + "learning_rate": 9.999943270450725e-06, + "loss": 0.00046498142182826996, + "memory(GiB)": 25.29, + "reward": -0.09350001960992813, + "reward_std": 0.3010860651731491, + "rewards/MMContentORM/mean": -0.4375, + "rewards/MMContentORM/std": 0.6415534257888794, + "rewards/MMFormatORM/mean": 0.39124998450279236, + "rewards/MMFormatORM/std": 0.30987287759780885, + "rewards/MMRubricORM/mean": -0.375, + "rewards/MMRubricORM/std": 0.4858671844005585, + "step": 215, + "train_speed(iter/s)": 0.078963 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 298.2, + "completions/mean_length": 192.45, + "completions/min_length": 76.8, + "epoch": 0.10561689870379261, + "frac_reward_zero_std": 0.175, + "grad_norm": 0.5995835661888123, + "kl": 0.02928466796875, + "learning_rate": 9.999809326532929e-06, + "loss": 0.0011718601919710637, + "memory(GiB)": 25.29, + "reward": 0.18744998872280122, + "reward_std": 0.29281292855739594, + "rewards/MMContentORM/mean": 0.08300001323223113, + "rewards/MMContentORM/std": 0.809050726890564, + "rewards/MMFormatORM/mean": 0.4981249749660492, + "rewards/MMFormatORM/std": 0.2565573215484619, + "rewards/MMRubricORM/mean": -0.225, + "rewards/MMRubricORM/std": 0.3916578650474548, + "step": 220, + "train_speed(iter/s)": 0.079297 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.2, + "completions/mean_length": 191.175, + "completions/min_length": 101.8, + "epoch": 0.10801728276524244, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.2489888072013855, + "kl": 0.01142578125, + "learning_rate": 9.99959659453362e-06, + "loss": 0.0004573634825646877, + "memory(GiB)": 25.29, + "reward": 0.07849998809397221, + "reward_std": 0.29528780579566954, + "rewards/MMContentORM/mean": -0.17499998956918716, + "rewards/MMContentORM/std": 0.7530360221862793, + "rewards/MMFormatORM/mean": 0.4899999797344208, + "rewards/MMFormatORM/std": 0.23433216214179992, + "rewards/MMRubricORM/mean": -0.2375, + "rewards/MMRubricORM/std": 0.36226795315742494, + "step": 225, + "train_speed(iter/s)": 0.079619 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 327.0, + "completions/mean_length": 197.075, + "completions/min_length": 113.0, + "epoch": 0.11041766682669227, + "frac_reward_zero_std": 0.15, + "grad_norm": 0.27176177501678467, + "kl": 0.012841796875, + "learning_rate": 9.999305077805077e-06, + "loss": 0.0005132704041898251, + "memory(GiB)": 25.29, + "reward": -0.02635001763701439, + "reward_std": 0.40764704942703245, + "rewards/MMContentORM/mean": -0.2990000039339066, + "rewards/MMContentORM/std": 0.7697360038757324, + "rewards/MMFormatORM/mean": 0.41437498331069944, + "rewards/MMFormatORM/std": 0.31634018421173093, + "rewards/MMRubricORM/mean": -0.3625, + "rewards/MMRubricORM/std": 0.486677223443985, + "step": 230, + "train_speed(iter/s)": 0.079753 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 275.4, + "completions/mean_length": 189.825, + "completions/min_length": 85.0, + "epoch": 0.1128180508881421, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.3836219310760498, + "kl": 0.0144561767578125, + "learning_rate": 9.99893478094108e-06, + "loss": 0.0005777373909950257, + "memory(GiB)": 25.29, + "reward": 0.07359998375177383, + "reward_std": 0.32229926288127897, + "rewards/MMContentORM/mean": -0.14599999487400056, + "rewards/MMContentORM/std": 0.7706803798675537, + "rewards/MMFormatORM/mean": 0.4674999833106995, + "rewards/MMFormatORM/std": 0.28921514451503755, + "rewards/MMRubricORM/mean": -0.275, + "rewards/MMRubricORM/std": 0.4454106867313385, + "step": 235, + "train_speed(iter/s)": 0.079954 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 334.6, + "completions/mean_length": 192.375, + "completions/min_length": 102.6, + "epoch": 0.11521843494959194, + "frac_reward_zero_std": 0.1, + "grad_norm": 0.3361359238624573, + "kl": 0.01556396484375, + "learning_rate": 9.99848570977685e-06, + "loss": 0.0006220159120857716, + "memory(GiB)": 25.29, + "reward": 0.0773499846458435, + "reward_std": 0.34245182275772096, + "rewards/MMContentORM/mean": -0.15099999755620958, + "rewards/MMContentORM/std": 0.7413495063781739, + "rewards/MMFormatORM/mean": 0.47562498450279234, + "rewards/MMFormatORM/std": 0.2913523316383362, + "rewards/MMRubricORM/mean": -0.2625, + "rewards/MMRubricORM/std": 0.4509934544563293, + "step": 240, + "train_speed(iter/s)": 0.079982 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 300.2, + "completions/mean_length": 196.525, + "completions/min_length": 130.8, + "epoch": 0.11761881901104176, + "frac_reward_zero_std": 0.175, + "grad_norm": 0.23139292001724243, + "kl": 0.0150390625, + "learning_rate": 9.997957871388948e-06, + "loss": 0.0006011344958096743, + "memory(GiB)": 25.29, + "reward": 0.22974997647106649, + "reward_std": 0.27442815005779264, + "rewards/MMContentORM/mean": 0.13999999761581422, + "rewards/MMContentORM/std": 0.7938369989395142, + "rewards/MMFormatORM/mean": 0.5281249761581421, + "rewards/MMFormatORM/std": 0.22127365171909333, + "rewards/MMRubricORM/mean": -0.1875, + "rewards/MMRubricORM/std": 0.3404210150241852, + "step": 245, + "train_speed(iter/s)": 0.0802 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 298.8, + "completions/mean_length": 190.8875, + "completions/min_length": 96.8, + "epoch": 0.1200192030724916, + "frac_reward_zero_std": 0.275, + "grad_norm": 0.24103079736232758, + "kl": 0.058306884765625, + "learning_rate": 9.997351274095165e-06, + "loss": 0.002327635698020458, + "memory(GiB)": 25.29, + "reward": 0.14489998891949654, + "reward_std": 0.31650099754333494, + "rewards/MMContentORM/mean": -0.028999996185302735, + "rewards/MMContentORM/std": 0.7646125912666321, + "rewards/MMFormatORM/mean": 0.5037499785423278, + "rewards/MMFormatORM/std": 0.2736783236265182, + "rewards/MMRubricORM/mean": -0.225, + "rewards/MMRubricORM/std": 0.4210435926914215, + "step": 250, + "train_speed(iter/s)": 0.080485 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 282.2, + "completions/mean_length": 196.875, + "completions/min_length": 126.4, + "epoch": 0.12241958713394142, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.2732993960380554, + "kl": 0.0129638671875, + "learning_rate": 9.996665927454393e-06, + "loss": 0.0005180831998586654, + "memory(GiB)": 25.29, + "reward": 0.13094998374581338, + "reward_std": 0.3363707005977631, + "rewards/MMContentORM/mean": -0.10699999555945397, + "rewards/MMContentORM/std": 0.732841408252716, + "rewards/MMFormatORM/mean": 0.5281249821186066, + "rewards/MMFormatORM/std": 0.2597552388906479, + "rewards/MMRubricORM/mean": -0.1875, + "rewards/MMRubricORM/std": 0.3996234655380249, + "step": 255, + "train_speed(iter/s)": 0.080761 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 291.0, + "completions/mean_length": 192.6625, + "completions/min_length": 101.6, + "epoch": 0.12481997119539126, + "frac_reward_zero_std": 0.2, + "grad_norm": 0.2951659858226776, + "kl": 0.03148193359375, + "learning_rate": 9.995901842266476e-06, + "loss": 0.0012587737292051315, + "memory(GiB)": 25.29, + "reward": 0.1078499898314476, + "reward_std": 0.3215214520692825, + "rewards/MMContentORM/mean": -0.1535000056028366, + "rewards/MMContentORM/std": 0.7672501325607299, + "rewards/MMFormatORM/mean": 0.5168749809265136, + "rewards/MMFormatORM/std": 0.25745911300182345, + "rewards/MMRubricORM/mean": -0.1875, + "rewards/MMRubricORM/std": 0.3996234655380249, + "step": 260, + "train_speed(iter/s)": 0.081034 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 298.4, + "completions/mean_length": 199.075, + "completions/min_length": 118.2, + "epoch": 0.12722035525684108, + "frac_reward_zero_std": 0.275, + "grad_norm": 0.23530304431915283, + "kl": 0.0120758056640625, + "learning_rate": 9.99505903057203e-06, + "loss": 0.00048267128877341745, + "memory(GiB)": 25.29, + "reward": 0.20024997591972352, + "reward_std": 0.22662772685289384, + "rewards/MMContentORM/mean": 0.012500005960464477, + "rewards/MMContentORM/std": 0.6804582595825195, + "rewards/MMFormatORM/mean": 0.5568749785423279, + "rewards/MMFormatORM/std": 0.19259803593158722, + "rewards/MMRubricORM/mean": -0.1375, + "rewards/MMRubricORM/std": 0.29467830061912537, + "step": 265, + "train_speed(iter/s)": 0.081248 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 312.6, + "completions/mean_length": 188.1125, + "completions/min_length": 134.8, + "epoch": 0.12962073931829093, + "frac_reward_zero_std": 0.175, + "grad_norm": 0.2173798680305481, + "kl": 0.013104248046875, + "learning_rate": 9.994137505652267e-06, + "loss": 0.0005250374786555767, + "memory(GiB)": 25.29, + "reward": 0.21479999721050264, + "reward_std": 0.28807530701160433, + "rewards/MMContentORM/mean": 0.05450000464916229, + "rewards/MMContentORM/std": 0.7564670324325562, + "rewards/MMFormatORM/mean": 0.5512499809265137, + "rewards/MMFormatORM/std": 0.22316529154777526, + "rewards/MMRubricORM/mean": -0.1375, + "rewards/MMRubricORM/std": 0.34438174962997437, + "step": 270, + "train_speed(iter/s)": 0.081407 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 274.2, + "completions/mean_length": 189.925, + "completions/min_length": 126.8, + "epoch": 0.13202112337974076, + "frac_reward_zero_std": 0.3, + "grad_norm": 0.23718853294849396, + "kl": 0.015252685546875, + "learning_rate": 9.993137282028777e-06, + "loss": 0.0006098361685872078, + "memory(GiB)": 25.29, + "reward": 0.22029998302459716, + "reward_std": 0.29967186152935027, + "rewards/MMContentORM/mean": 0.12200000137090683, + "rewards/MMContentORM/std": 0.8326894640922546, + "rewards/MMFormatORM/mean": 0.5224999785423279, + "rewards/MMFormatORM/std": 0.25347527861595154, + "rewards/MMRubricORM/mean": -0.1875, + "rewards/MMRubricORM/std": 0.39013060331344607, + "step": 275, + "train_speed(iter/s)": 0.081663 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0125, + "completions/max_length": 467.6, + "completions/mean_length": 205.3, + "completions/min_length": 126.8, + "epoch": 0.13442150744119058, + "frac_reward_zero_std": 0.2, + "grad_norm": 0.27481362223625183, + "kl": 0.013482666015625, + "learning_rate": 9.992058375463302e-06, + "loss": 0.0005398368928581476, + "memory(GiB)": 25.29, + "reward": 0.15504998862743377, + "reward_std": 0.3667762905359268, + "rewards/MMContentORM/mean": -0.03049999326467514, + "rewards/MMContentORM/std": 0.7381291270256043, + "rewards/MMFormatORM/mean": 0.5181249678134918, + "rewards/MMFormatORM/std": 0.2611870527267456, + "rewards/MMRubricORM/mean": -0.2, + "rewards/MMRubricORM/std": 0.4024340331554413, + "step": 280, + "train_speed(iter/s)": 0.081362 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 274.4, + "completions/mean_length": 183.0, + "completions/min_length": 125.6, + "epoch": 0.13682189150264043, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.33459168672561646, + "kl": 0.013397216796875, + "learning_rate": 9.990900802957484e-06, + "loss": 0.0005357235670089722, + "memory(GiB)": 25.29, + "reward": 0.17229999005794525, + "reward_std": 0.3010860651731491, + "rewards/MMContentORM/mean": -0.07550000250339509, + "rewards/MMContentORM/std": 0.6900161981582642, + "rewards/MMFormatORM/mean": 0.568749976158142, + "rewards/MMFormatORM/std": 0.1936162531375885, + "rewards/MMRubricORM/mean": -0.125, + "rewards/MMRubricORM/std": 0.2978711724281311, + "step": 285, + "train_speed(iter/s)": 0.081635 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 277.8, + "completions/mean_length": 193.6875, + "completions/min_length": 125.6, + "epoch": 0.13922227556409025, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.22289112210273743, + "kl": 0.015679931640625, + "learning_rate": 9.989664582752603e-06, + "loss": 0.0006269993260502815, + "memory(GiB)": 25.29, + "reward": 0.20594998747110366, + "reward_std": 0.2612759530544281, + "rewards/MMContentORM/mean": 0.055499997735023496, + "rewards/MMContentORM/std": 0.7748138785362244, + "rewards/MMFormatORM/mean": 0.5406249761581421, + "rewards/MMFormatORM/std": 0.2212115779519081, + "rewards/MMRubricORM/mean": -0.1625, + "rewards/MMRubricORM/std": 0.3172485947608948, + "step": 290, + "train_speed(iter/s)": 0.081907 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 327.2, + "completions/mean_length": 200.9875, + "completions/min_length": 134.4, + "epoch": 0.14162265962554008, + "frac_reward_zero_std": 0.1, + "grad_norm": 0.26251521706581116, + "kl": 0.015625, + "learning_rate": 9.988349734329284e-06, + "loss": 0.0006249185651540756, + "memory(GiB)": 25.29, + "reward": 0.2513999938964844, + "reward_std": 0.24013345837593078, + "rewards/MMContentORM/mean": 0.08850000128149986, + "rewards/MMContentORM/std": 0.7766122579574585, + "rewards/MMFormatORM/mean": 0.5837499737739563, + "rewards/MMFormatORM/std": 0.16351408362388611, + "rewards/MMRubricORM/mean": -0.0875, + "rewards/MMRubricORM/std": 0.24893558621406556, + "step": 295, + "train_speed(iter/s)": 0.08196 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 367.0, + "completions/mean_length": 191.7, + "completions/min_length": 116.6, + "epoch": 0.14402304368698993, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.28246110677719116, + "kl": 0.017596435546875, + "learning_rate": 9.986956278407198e-06, + "loss": 0.0007036954164505004, + "memory(GiB)": 25.29, + "reward": 0.13574998527765275, + "reward_std": 0.30907638669013976, + "rewards/MMContentORM/mean": -0.049999994784593584, + "rewards/MMContentORM/std": 0.7848005771636963, + "rewards/MMFormatORM/mean": 0.5018749833106995, + "rewards/MMFormatORM/std": 0.2665435582399368, + "rewards/MMRubricORM/mean": -0.225, + "rewards/MMRubricORM/std": 0.4095080256462097, + "step": 300, + "train_speed(iter/s)": 0.081952 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 254.6, + "completions/mean_length": 182.3, + "completions/min_length": 105.6, + "epoch": 0.14642342774843975, + "frac_reward_zero_std": 0.2, + "grad_norm": 0.6548066139221191, + "kl": 0.02274169921875, + "learning_rate": 9.985484236944723e-06, + "loss": 0.0009119081310927868, + "memory(GiB)": 25.29, + "reward": 0.2615999788045883, + "reward_std": 0.2737917542457581, + "rewards/MMContentORM/mean": 0.12650000676512718, + "rewards/MMContentORM/std": 0.7763458490371704, + "rewards/MMFormatORM/mean": 0.5774999856948853, + "rewards/MMFormatORM/std": 0.19905767738819122, + "rewards/MMRubricORM/mean": -0.1, + "rewards/MMRubricORM/std": 0.2989355862140656, + "step": 305, + "train_speed(iter/s)": 0.081951 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 286.2, + "completions/mean_length": 189.0875, + "completions/min_length": 117.0, + "epoch": 0.14882381180988957, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.8391085863113403, + "kl": 0.0144775390625, + "learning_rate": 9.983933633138607e-06, + "loss": 0.000579320639371872, + "memory(GiB)": 25.29, + "reward": 0.2854999780654907, + "reward_std": 0.23037539422512054, + "rewards/MMContentORM/mean": 0.14999999850988388, + "rewards/MMContentORM/std": 0.6774580955505372, + "rewards/MMFormatORM/mean": 0.6012499809265137, + "rewards/MMFormatORM/std": 0.12930812537670136, + "rewards/MMRubricORM/mean": -0.075, + "rewards/MMRubricORM/std": 0.19893558621406554, + "step": 310, + "train_speed(iter/s)": 0.082155 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 324.4, + "completions/mean_length": 205.75, + "completions/min_length": 138.6, + "epoch": 0.15122419587133942, + "frac_reward_zero_std": 0.175, + "grad_norm": 0.4532962143421173, + "kl": 0.014453125, + "learning_rate": 9.982304491423607e-06, + "loss": 0.0005786891095340251, + "memory(GiB)": 25.29, + "reward": 0.2162499874830246, + "reward_std": 0.26664996445178984, + "rewards/MMContentORM/mean": 0.019999995827674866, + "rewards/MMContentORM/std": 0.7253228902816773, + "rewards/MMFormatORM/mean": 0.5768749833106994, + "rewards/MMFormatORM/std": 0.2062115788459778, + "rewards/MMRubricORM/mean": -0.1125, + "rewards/MMRubricORM/std": 0.3172485947608948, + "step": 315, + "train_speed(iter/s)": 0.082254 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 257.2, + "completions/mean_length": 186.875, + "completions/min_length": 129.8, + "epoch": 0.15362457993278925, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.3340102732181549, + "kl": 0.017840576171875, + "learning_rate": 9.980596837472085e-06, + "loss": 0.000713213300332427, + "memory(GiB)": 25.29, + "reward": 0.2187499910593033, + "reward_std": 0.20993999540805816, + "rewards/MMContentORM/mean": 0.030000004172325134, + "rewards/MMContentORM/std": 0.716671884059906, + "rewards/MMFormatORM/mean": 0.5731249749660492, + "rewards/MMFormatORM/std": 0.19444467574357988, + "rewards/MMRubricORM/mean": -0.1125, + "rewards/MMRubricORM/std": 0.27606874108314516, + "step": 320, + "train_speed(iter/s)": 0.082482 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 290.2, + "completions/mean_length": 190.875, + "completions/min_length": 119.0, + "epoch": 0.15602496399423907, + "frac_reward_zero_std": 0.175, + "grad_norm": 0.297242671251297, + "kl": 0.016180419921875, + "learning_rate": 9.978810698193628e-06, + "loss": 0.0006479379255324603, + "memory(GiB)": 25.29, + "reward": 0.36794998943805696, + "reward_std": 0.2478409305214882, + "rewards/MMContentORM/mean": 0.37049999833106995, + "rewards/MMContentORM/std": 0.6727034986019135, + "rewards/MMFormatORM/mean": 0.5931249797344208, + "rewards/MMFormatORM/std": 0.1350412219762802, + "rewards/MMRubricORM/mean": -0.0875, + "rewards/MMRubricORM/std": 0.2077557325363159, + "step": 325, + "train_speed(iter/s)": 0.082645 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 299.4, + "completions/mean_length": 188.0875, + "completions/min_length": 105.4, + "epoch": 0.15842534805568892, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.25825080275535583, + "kl": 0.016131591796875, + "learning_rate": 9.976946101734607e-06, + "loss": 0.0006450886372476816, + "memory(GiB)": 25.29, + "reward": 0.32744998335838316, + "reward_std": 0.2085257887840271, + "rewards/MMContentORM/mean": 0.2405000112950802, + "rewards/MMContentORM/std": 0.7213131546974182, + "rewards/MMFormatORM/mean": 0.6093749761581421, + "rewards/MMFormatORM/std": 0.12130690813064575, + "rewards/MMRubricORM/mean": -0.0625, + "rewards/MMRubricORM/std": 0.18662601709365845, + "step": 330, + "train_speed(iter/s)": 0.082761 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.2, + "completions/mean_length": 188.65, + "completions/min_length": 133.8, + "epoch": 0.16082573211713874, + "frac_reward_zero_std": 0.15, + "grad_norm": 0.2867213189601898, + "kl": 0.01766357421875, + "learning_rate": 9.975003077477733e-06, + "loss": 0.0007068701088428497, + "memory(GiB)": 25.29, + "reward": 0.3085499942302704, + "reward_std": 0.19721208810806273, + "rewards/MMContentORM/mean": 0.22200000584125518, + "rewards/MMContentORM/std": 0.6932164669036865, + "rewards/MMFormatORM/mean": 0.5931249737739563, + "rewards/MMFormatORM/std": 0.16571036279201506, + "rewards/MMRubricORM/mean": -0.0875, + "rewards/MMRubricORM/std": 0.25493902564048765, + "step": 335, + "train_speed(iter/s)": 0.082968 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 277.0, + "completions/mean_length": 188.6, + "completions/min_length": 118.8, + "epoch": 0.16322611617858857, + "frac_reward_zero_std": 0.35, + "grad_norm": 0.21676421165466309, + "kl": 0.0164306640625, + "learning_rate": 9.97298165604161e-06, + "loss": 0.0006582758855074644, + "memory(GiB)": 25.29, + "reward": 0.2879999876022339, + "reward_std": 0.1766352742910385, + "rewards/MMContentORM/mean": 0.1849999964237213, + "rewards/MMContentORM/std": 0.7507146120071411, + "rewards/MMFormatORM/mean": 0.5849999904632568, + "rewards/MMFormatORM/std": 0.19430812299251557, + "rewards/MMRubricORM/mean": -0.1, + "rewards/MMRubricORM/std": 0.2989355862140656, + "step": 340, + "train_speed(iter/s)": 0.083168 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 296.4, + "completions/mean_length": 185.2, + "completions/min_length": 111.8, + "epoch": 0.16562650024003842, + "frac_reward_zero_std": 0.2, + "grad_norm": 0.21741819381713867, + "kl": 0.017755126953125, + "learning_rate": 9.970881869280231e-06, + "loss": 0.0007105268072336912, + "memory(GiB)": 25.29, + "reward": 0.31114999651908876, + "reward_std": 0.21616254448890687, + "rewards/MMContentORM/mean": 0.22849999815225602, + "rewards/MMContentORM/std": 0.7147838234901428, + "rewards/MMFormatORM/mean": 0.5931249737739563, + "rewards/MMFormatORM/std": 0.16571036279201506, + "rewards/MMRubricORM/mean": -0.0875, + "rewards/MMRubricORM/std": 0.25493902564048765, + "step": 345, + "train_speed(iter/s)": 0.08331 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 294.8, + "completions/mean_length": 193.775, + "completions/min_length": 107.8, + "epoch": 0.16802688430148824, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.25716090202331543, + "kl": 0.01529541015625, + "learning_rate": 9.968703750282498e-06, + "loss": 0.0006120001431554556, + "memory(GiB)": 25.29, + "reward": 0.3012999713420868, + "reward_std": 0.24819448292255403, + "rewards/MMContentORM/mean": 0.18950000554323196, + "rewards/MMContentORM/std": 0.7667634725570679, + "rewards/MMFormatORM/mean": 0.6012499928474426, + "rewards/MMFormatORM/std": 0.17440344989299775, + "rewards/MMRubricORM/mean": -0.075, + "rewards/MMRubricORM/std": 0.2683130085468292, + "step": 350, + "train_speed(iter/s)": 0.083431 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 338.2, + "completions/mean_length": 199.3625, + "completions/min_length": 131.2, + "epoch": 0.17042726836293806, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.29177579283714294, + "kl": 0.016534423828125, + "learning_rate": 9.966447333371679e-06, + "loss": 0.0006617675069719553, + "memory(GiB)": 25.29, + "reward": 0.3361999988555908, + "reward_std": 0.2548412889242172, + "rewards/MMContentORM/mean": 0.2804999977350235, + "rewards/MMContentORM/std": 0.7203467965126038, + "rewards/MMFormatORM/mean": 0.5974999785423278, + "rewards/MMFormatORM/std": 0.1303652733564377, + "rewards/MMRubricORM/mean": -0.075, + "rewards/MMRubricORM/std": 0.19893558621406554, + "step": 355, + "train_speed(iter/s)": 0.083425 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 304.0, + "completions/mean_length": 194.0125, + "completions/min_length": 119.0, + "epoch": 0.1728276524243879, + "frac_reward_zero_std": 0.275, + "grad_norm": 0.22960415482521057, + "kl": 0.015142822265625, + "learning_rate": 9.964112654104881e-06, + "loss": 0.0006059727631509304, + "memory(GiB)": 25.29, + "reward": 0.18509998098015784, + "reward_std": 0.19671710431575776, + "rewards/MMContentORM/mean": -0.043500003218650815, + "rewards/MMContentORM/std": 0.6924860835075378, + "rewards/MMFormatORM/mean": 0.5687499880790711, + "rewards/MMFormatORM/std": 0.17440344989299775, + "rewards/MMRubricORM/mean": -0.125, + "rewards/MMRubricORM/std": 0.2683130085468292, + "step": 360, + "train_speed(iter/s)": 0.083524 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 339.6, + "completions/mean_length": 199.275, + "completions/min_length": 126.8, + "epoch": 0.17522803648583773, + "frac_reward_zero_std": 0.2, + "grad_norm": 0.22960397601127625, + "kl": 0.014129638671875, + "learning_rate": 9.961699749272491e-06, + "loss": 0.00056455098092556, + "memory(GiB)": 25.29, + "reward": 0.28239999413490297, + "reward_std": 0.15443212017416955, + "rewards/MMContentORM/mean": 0.13350000753998756, + "rewards/MMContentORM/std": 0.694654929637909, + "rewards/MMFormatORM/mean": 0.6037499666213989, + "rewards/MMFormatORM/std": 0.1227274090051651, + "rewards/MMRubricORM/mean": -0.0625, + "rewards/MMRubricORM/std": 0.18662601709365845, + "step": 365, + "train_speed(iter/s)": 0.083611 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0125, + "completions/max_length": 420.8, + "completions/mean_length": 195.0375, + "completions/min_length": 104.8, + "epoch": 0.17762842054728756, + "frac_reward_zero_std": 0.275, + "grad_norm": 0.247096449136734, + "kl": 0.0181640625, + "learning_rate": 9.959208656897584e-06, + "loss": 0.000726937735453248, + "memory(GiB)": 25.29, + "reward": 0.3057999789714813, + "reward_std": 0.21651609390974044, + "rewards/MMContentORM/mean": 0.22949999421834946, + "rewards/MMContentORM/std": 0.7364683985710144, + "rewards/MMFormatORM/mean": 0.5849999845027923, + "rewards/MMFormatORM/std": 0.16754122078418732, + "rewards/MMRubricORM/mean": -0.1, + "rewards/MMRubricORM/std": 0.2577557325363159, + "step": 370, + "train_speed(iter/s)": 0.08347 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.6, + "completions/mean_length": 181.1, + "completions/min_length": 111.0, + "epoch": 0.1800288046087374, + "frac_reward_zero_std": 0.35, + "grad_norm": 0.2202872931957245, + "kl": 0.01561279296875, + "learning_rate": 9.956639416235337e-06, + "loss": 0.0006248470395803452, + "memory(GiB)": 25.29, + "reward": 0.3359999775886536, + "reward_std": 0.12784490436315538, + "rewards/MMContentORM/mean": 0.2475000001490116, + "rewards/MMContentORM/std": 0.6804847836494445, + "rewards/MMFormatORM/mean": 0.6174999833106994, + "rewards/MMFormatORM/std": 0.10940345227718354, + "rewards/MMRubricORM/mean": -0.05, + "rewards/MMRubricORM/std": 0.16831300854682923, + "step": 375, + "train_speed(iter/s)": 0.083633 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 297.0, + "completions/mean_length": 188.625, + "completions/min_length": 126.0, + "epoch": 0.18242918867018723, + "frac_reward_zero_std": 0.1, + "grad_norm": 0.3163186311721802, + "kl": 0.02537841796875, + "learning_rate": 9.953992067772402e-06, + "loss": 0.0010158225893974304, + "memory(GiB)": 25.29, + "reward": 0.3191999852657318, + "reward_std": 0.2754888117313385, + "rewards/MMContentORM/mean": 0.2705000042915344, + "rewards/MMContentORM/std": 0.7479893922805786, + "rewards/MMFormatORM/mean": 0.5774999737739563, + "rewards/MMFormatORM/std": 0.19774004817008972, + "rewards/MMRubricORM/mean": -0.1, + "rewards/MMRubricORM/std": 0.2989355862140656, + "step": 380, + "train_speed(iter/s)": 0.083784 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 286.0, + "completions/mean_length": 196.3625, + "completions/min_length": 104.0, + "epoch": 0.18482957273163705, + "frac_reward_zero_std": 0.075, + "grad_norm": 0.2652575373649597, + "kl": 0.025384521484375, + "learning_rate": 9.95126665322627e-06, + "loss": 0.001015142910182476, + "memory(GiB)": 25.29, + "reward": 0.33874998092651365, + "reward_std": 0.2068287432193756, + "rewards/MMContentORM/mean": 0.28500000238418577, + "rewards/MMContentORM/std": 0.7165241241455078, + "rewards/MMFormatORM/mean": 0.5993749856948852, + "rewards/MMFormatORM/std": 0.16130690574645995, + "rewards/MMRubricORM/mean": -0.075, + "rewards/MMRubricORM/std": 0.23662601709365844, + "step": 385, + "train_speed(iter/s)": 0.083924 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 299.8, + "completions/mean_length": 184.35, + "completions/min_length": 107.2, + "epoch": 0.1872299567930869, + "frac_reward_zero_std": 0.175, + "grad_norm": 0.25149068236351013, + "kl": 0.024176025390625, + "learning_rate": 9.948463215544617e-06, + "loss": 0.0009666066616773605, + "memory(GiB)": 25.29, + "reward": 0.36579999327659607, + "reward_std": 0.18809040486812592, + "rewards/MMContentORM/mean": 0.32200001180171967, + "rewards/MMContentORM/std": 0.6846403241157532, + "rewards/MMFormatORM/mean": 0.6174999833106994, + "rewards/MMFormatORM/std": 0.08490467071533203, + "rewards/MMRubricORM/mean": -0.05, + "rewards/MMRubricORM/std": 0.13062257766723634, + "step": 390, + "train_speed(iter/s)": 0.083997 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 260.0, + "completions/mean_length": 171.7125, + "completions/min_length": 89.8, + "epoch": 0.18963034085453673, + "frac_reward_zero_std": 0.225, + "grad_norm": 0.2679647207260132, + "kl": 0.023345947265625, + "learning_rate": 9.945581798904623e-06, + "loss": 0.0009329639375209809, + "memory(GiB)": 25.29, + "reward": 0.2854999899864197, + "reward_std": 0.24409326910972595, + "rewards/MMContentORM/mean": 0.18250000327825547, + "rewards/MMContentORM/std": 0.7319893360137939, + "rewards/MMFormatORM/mean": 0.5812499761581421, + "rewards/MMFormatORM/std": 0.1992675095796585, + "rewards/MMRubricORM/mean": -0.1, + "rewards/MMRubricORM/std": 0.3049390256404877, + "step": 395, + "train_speed(iter/s)": 0.084235 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 250.0, + "completions/mean_length": 182.8, + "completions/min_length": 118.4, + "epoch": 0.19203072491598655, + "frac_reward_zero_std": 0.3, + "grad_norm": 0.283568799495697, + "kl": 0.022515869140625, + "learning_rate": 9.942622448712276e-06, + "loss": 0.0009008722379803657, + "memory(GiB)": 25.29, + "reward": 0.3700499892234802, + "reward_std": 0.1973535120487213, + "rewards/MMContentORM/mean": 0.34700000286102295, + "rewards/MMContentORM/std": 0.7341425061225891, + "rewards/MMFormatORM/mean": 0.6093749880790711, + "rewards/MMFormatORM/std": 0.11740466952323914, + "rewards/MMRubricORM/mean": -0.0625, + "rewards/MMRubricORM/std": 0.18062257766723633, + "step": 400, + "train_speed(iter/s)": 0.0845 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 314.8, + "completions/mean_length": 192.4875, + "completions/min_length": 123.8, + "epoch": 0.1944311089774364, + "frac_reward_zero_std": 0.225, + "grad_norm": 0.23822158575057983, + "kl": 0.020452880859375, + "learning_rate": 9.93958521160166e-06, + "loss": 0.0008180794306099415, + "memory(GiB)": 25.29, + "reward": 0.3346499800682068, + "reward_std": 0.20442457497119904, + "rewards/MMContentORM/mean": 0.2585000067949295, + "rewards/MMContentORM/std": 0.7207361459732056, + "rewards/MMFormatORM/mean": 0.6093749880790711, + "rewards/MMFormatORM/std": 0.14190345108509064, + "rewards/MMRubricORM/mean": -0.0625, + "rewards/MMRubricORM/std": 0.21831300854682922, + "step": 405, + "train_speed(iter/s)": 0.084282 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 273.0, + "completions/mean_length": 183.3375, + "completions/min_length": 130.0, + "epoch": 0.19683149303888622, + "frac_reward_zero_std": 0.325, + "grad_norm": 0.21869473159313202, + "kl": 0.017327880859375, + "learning_rate": 9.936470135434219e-06, + "loss": 0.000694124260917306, + "memory(GiB)": 25.29, + "reward": 0.3697000026702881, + "reward_std": 0.18314065933227539, + "rewards/MMContentORM/mean": 0.3479999989271164, + "rewards/MMContentORM/std": 0.7096795082092285, + "rewards/MMFormatORM/mean": 0.6074999928474426, + "rewards/MMFormatORM/std": 0.12490466833114625, + "rewards/MMRubricORM/mean": -0.0625, + "rewards/MMRubricORM/std": 0.18062257766723633, + "step": 410, + "train_speed(iter/s)": 0.084451 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 319.4, + "completions/mean_length": 188.8625, + "completions/min_length": 132.2, + "epoch": 0.19923187710033605, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.26058775186538696, + "kl": 0.019140625, + "learning_rate": 9.933277269297995e-06, + "loss": 0.0007644101046025753, + "memory(GiB)": 25.29, + "reward": 0.3062999933958054, + "reward_std": 0.23659793436527252, + "rewards/MMContentORM/mean": 0.20200001299381257, + "rewards/MMContentORM/std": 0.7170636773109436, + "rewards/MMFormatORM/mean": 0.6012499809265137, + "rewards/MMFormatORM/std": 0.12930812537670136, + "rewards/MMRubricORM/mean": -0.075, + "rewards/MMRubricORM/std": 0.19893558621406554, + "step": 415, + "train_speed(iter/s)": 0.084554 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 267.0, + "completions/mean_length": 182.9875, + "completions/min_length": 107.2, + "epoch": 0.2016322611617859, + "frac_reward_zero_std": 0.275, + "grad_norm": 0.2722574472427368, + "kl": 0.024627685546875, + "learning_rate": 9.930006663506872e-06, + "loss": 0.0009830674156546594, + "memory(GiB)": 25.29, + "reward": 0.393399977684021, + "reward_std": 0.2146776258945465, + "rewards/MMContentORM/mean": 0.3910000085830688, + "rewards/MMContentORM/std": 0.68272864818573, + "rewards/MMFormatORM/mean": 0.6174999713897705, + "rewards/MMFormatORM/std": 0.08880690932273864, + "rewards/MMRubricORM/mean": -0.05, + "rewards/MMRubricORM/std": 0.13662601709365846, + "step": 420, + "train_speed(iter/s)": 0.084714 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 294.6, + "completions/mean_length": 192.0, + "completions/min_length": 112.0, + "epoch": 0.20403264522323572, + "frac_reward_zero_std": 0.325, + "grad_norm": 0.25141969323158264, + "kl": 0.017279052734375, + "learning_rate": 9.926658369599761e-06, + "loss": 0.0006905121728777886, + "memory(GiB)": 25.29, + "reward": 0.3258499801158905, + "reward_std": 0.20725299715995787, + "rewards/MMContentORM/mean": 0.23650000989437103, + "rewards/MMContentORM/std": 0.7136297464370728, + "rewards/MMFormatORM/mean": 0.6093749880790711, + "rewards/MMFormatORM/std": 0.14190345108509064, + "rewards/MMRubricORM/mean": -0.0625, + "rewards/MMRubricORM/std": 0.21831300854682922, + "step": 425, + "train_speed(iter/s)": 0.084795 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 273.8, + "completions/mean_length": 189.6625, + "completions/min_length": 128.2, + "epoch": 0.20643302928468554, + "frac_reward_zero_std": 0.325, + "grad_norm": 0.23711970448493958, + "kl": 0.022698974609375, + "learning_rate": 9.923232440339811e-06, + "loss": 0.0009088035672903061, + "memory(GiB)": 25.29, + "reward": 0.40454998016357424, + "reward_std": 0.1615738956257701, + "rewards/MMContentORM/mean": 0.4044999837875366, + "rewards/MMContentORM/std": 0.6727037191390991, + "rewards/MMFormatORM/mean": 0.6256249904632568, + "rewards/MMFormatORM/std": 0.09749999642372131, + "rewards/MMRubricORM/mean": -0.0375, + "rewards/MMRubricORM/std": 0.15, + "step": 430, + "train_speed(iter/s)": 0.084935 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 270.4, + "completions/mean_length": 196.9, + "completions/min_length": 134.6, + "epoch": 0.2088334133461354, + "frac_reward_zero_std": 0.3, + "grad_norm": 0.2575157582759857, + "kl": 0.021795654296875, + "learning_rate": 9.919728929713555e-06, + "loss": 0.0008713678456842899, + "memory(GiB)": 25.29, + "reward": 0.3168999910354614, + "reward_std": 0.22896117568016053, + "rewards/MMContentORM/mean": 0.2859999895095825, + "rewards/MMContentORM/std": 0.7667613625526428, + "rewards/MMFormatORM/mean": 0.5687499880790711, + "rewards/MMFormatORM/std": 0.18971401453018188, + "rewards/MMRubricORM/mean": -0.125, + "rewards/MMRubricORM/std": 0.291867733001709, + "step": 435, + "train_speed(iter/s)": 0.085105 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 298.6, + "completions/mean_length": 203.775, + "completions/min_length": 126.6, + "epoch": 0.21123379740758522, + "frac_reward_zero_std": 0.325, + "grad_norm": 0.19185078144073486, + "kl": 0.014251708984375, + "learning_rate": 9.916147892930075e-06, + "loss": 0.0005701377056539058, + "memory(GiB)": 25.29, + "reward": 0.3422499805688858, + "reward_std": 0.19254517406225205, + "rewards/MMContentORM/mean": 0.27750000208616254, + "rewards/MMContentORM/std": 0.7132205009460449, + "rewards/MMFormatORM/mean": 0.6093749880790711, + "rewards/MMFormatORM/std": 0.14190345108509064, + "rewards/MMRubricORM/mean": -0.0625, + "rewards/MMRubricORM/std": 0.21831300854682922, + "step": 440, + "train_speed(iter/s)": 0.085168 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0125, + "completions/max_length": 464.2, + "completions/mean_length": 220.8625, + "completions/min_length": 117.4, + "epoch": 0.21363418146903504, + "frac_reward_zero_std": 0.35, + "grad_norm": 0.3216782808303833, + "kl": 0.016217041015625, + "learning_rate": 9.912489386420127e-06, + "loss": 0.0006480277515947819, + "memory(GiB)": 25.29, + "reward": 0.38044998943805697, + "reward_std": 0.19636355340480804, + "rewards/MMContentORM/mean": 0.4055000066757202, + "rewards/MMContentORM/std": 0.6969575762748719, + "rewards/MMFormatORM/mean": 0.5893749892711639, + "rewards/MMFormatORM/std": 0.17063776403665543, + "rewards/MMRubricORM/mean": -0.0875, + "rewards/MMRubricORM/std": 0.23944272398948668, + "step": 445, + "train_speed(iter/s)": 0.084893 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 313.8, + "completions/mean_length": 193.45, + "completions/min_length": 100.2, + "epoch": 0.2160345655304849, + "frac_reward_zero_std": 0.325, + "grad_norm": 0.2631596326828003, + "kl": 0.01585693359375, + "learning_rate": 9.908753467835252e-06, + "loss": 0.000633768830448389, + "memory(GiB)": 25.29, + "reward": 0.31024998873472215, + "reward_std": 0.13823937475681305, + "rewards/MMContentORM/mean": 0.19750000834465026, + "rewards/MMContentORM/std": 0.6008442759513855, + "rewards/MMFormatORM/mean": 0.6093749761581421, + "rewards/MMFormatORM/std": 0.12130690813064575, + "rewards/MMRubricORM/mean": -0.0625, + "rewards/MMRubricORM/std": 0.18662601709365845, + "step": 450, + "train_speed(iter/s)": 0.084948 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 274.2, + "completions/mean_length": 197.2625, + "completions/min_length": 127.0, + "epoch": 0.2184349495919347, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.2569175660610199, + "kl": 0.01932373046875, + "learning_rate": 9.904940196046867e-06, + "loss": 0.0007727490272372961, + "memory(GiB)": 25.29, + "reward": 0.3388499915599823, + "reward_std": 0.20187898278236388, + "rewards/MMContentORM/mean": 0.326500004529953, + "rewards/MMContentORM/std": 0.7351299285888672, + "rewards/MMFormatORM/mean": 0.5768749713897705, + "rewards/MMFormatORM/std": 0.1856150358915329, + "rewards/MMRubricORM/mean": -0.1125, + "rewards/MMRubricORM/std": 0.285561603307724, + "step": 455, + "train_speed(iter/s)": 0.085103 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.6, + "completions/mean_length": 192.1625, + "completions/min_length": 112.2, + "epoch": 0.22083533365338454, + "frac_reward_zero_std": 0.225, + "grad_norm": 0.295173704624176, + "kl": 0.023602294921875, + "learning_rate": 9.901049631145336e-06, + "loss": 0.0009442863985896111, + "memory(GiB)": 25.29, + "reward": 0.3670499801635742, + "reward_std": 0.22988041043281554, + "rewards/MMContentORM/mean": 0.33950000405311587, + "rewards/MMContentORM/std": 0.7248481631278991, + "rewards/MMFormatORM/mean": 0.6093749880790711, + "rewards/MMFormatORM/std": 0.14190345108509064, + "rewards/MMRubricORM/mean": -0.0625, + "rewards/MMRubricORM/std": 0.21831300854682922, + "step": 460, + "train_speed(iter/s)": 0.085231 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 322.6, + "completions/mean_length": 198.8625, + "completions/min_length": 134.2, + "epoch": 0.2232357177148344, + "frac_reward_zero_std": 0.425, + "grad_norm": 0.18804140388965607, + "kl": 0.01673583984375, + "learning_rate": 9.897081834439026e-06, + "loss": 0.0006706462241709233, + "memory(GiB)": 25.29, + "reward": 0.3739499866962433, + "reward_std": 0.16638222634792327, + "rewards/MMContentORM/mean": 0.3279999911785126, + "rewards/MMContentORM/std": 0.6578051447868347, + "rewards/MMFormatORM/mean": 0.6256249904632568, + "rewards/MMFormatORM/std": 0.09749999642372131, + "rewards/MMRubricORM/mean": -0.0375, + "rewards/MMRubricORM/std": 0.15, + "step": 465, + "train_speed(iter/s)": 0.085203 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0125, + "completions/max_length": 437.2, + "completions/mean_length": 207.525, + "completions/min_length": 127.8, + "epoch": 0.2256361017762842, + "frac_reward_zero_std": 0.325, + "grad_norm": 0.2759953439235687, + "kl": 0.017724609375, + "learning_rate": 9.89303686845334e-06, + "loss": 0.0007088197395205498, + "memory(GiB)": 25.29, + "reward": 0.33759998679161074, + "reward_std": 0.26276087909936907, + "rewards/MMContentORM/mean": 0.35399999022483825, + "rewards/MMContentORM/std": 0.7571944236755371, + "rewards/MMFormatORM/mean": 0.5587499856948852, + "rewards/MMFormatORM/std": 0.2141388863325119, + "rewards/MMRubricORM/mean": -0.1375, + "rewards/MMRubricORM/std": 0.3288854479789734, + "step": 470, + "train_speed(iter/s)": 0.084976 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 292.0, + "completions/mean_length": 202.15, + "completions/min_length": 133.0, + "epoch": 0.22803648583773403, + "frac_reward_zero_std": 0.3, + "grad_norm": 0.24971622228622437, + "kl": 0.016107177734375, + "learning_rate": 9.888914796929732e-06, + "loss": 0.000644554104655981, + "memory(GiB)": 25.29, + "reward": 0.42414999604225156, + "reward_std": 0.18872679471969606, + "rewards/MMContentORM/mean": 0.45350001454353334, + "rewards/MMContentORM/std": 0.6859857916831971, + "rewards/MMFormatORM/mean": 0.6256249785423279, + "rewards/MMFormatORM/std": 0.07690345346927643, + "rewards/MMRubricORM/mean": -0.0375, + "rewards/MMRubricORM/std": 0.11831300854682922, + "step": 475, + "train_speed(iter/s)": 0.085055 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 303.4, + "completions/mean_length": 203.1875, + "completions/min_length": 129.2, + "epoch": 0.23043686989918388, + "frac_reward_zero_std": 0.4, + "grad_norm": 0.1769513338804245, + "kl": 0.0209716796875, + "learning_rate": 9.884715684824698e-06, + "loss": 0.000839579850435257, + "memory(GiB)": 25.29, + "reward": 0.38434997797012327, + "reward_std": 0.15351288318634032, + "rewards/MMContentORM/mean": 0.39899998605251313, + "rewards/MMContentORM/std": 0.7158730506896973, + "rewards/MMFormatORM/mean": 0.5993749916553497, + "rewards/MMFormatORM/std": 0.1306377649307251, + "rewards/MMRubricORM/mean": -0.075, + "rewards/MMRubricORM/std": 0.1894427239894867, + "step": 480, + "train_speed(iter/s)": 0.085104 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 263.6, + "completions/mean_length": 194.1875, + "completions/min_length": 132.0, + "epoch": 0.2328372539606337, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.23804667592048645, + "kl": 0.014935302734375, + "learning_rate": 9.880439598308759e-06, + "loss": 0.0005985048599541187, + "memory(GiB)": 25.29, + "reward": 0.44059998989105226, + "reward_std": 0.19487863630056382, + "rewards/MMContentORM/mean": 0.5090000003576278, + "rewards/MMContentORM/std": 0.6222364962100982, + "rewards/MMFormatORM/mean": 0.6174999833106994, + "rewards/MMFormatORM/std": 0.10940345227718354, + "rewards/MMRubricORM/mean": -0.05, + "rewards/MMRubricORM/std": 0.16831300854682923, + "step": 485, + "train_speed(iter/s)": 0.085257 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 318.2, + "completions/mean_length": 205.2375, + "completions/min_length": 136.2, + "epoch": 0.23523763802208353, + "frac_reward_zero_std": 0.45, + "grad_norm": 0.2561754882335663, + "kl": 0.013427734375, + "learning_rate": 9.876086604765416e-06, + "loss": 0.0005371436476707458, + "memory(GiB)": 25.29, + "reward": 0.36444997787475586, + "reward_std": 0.1216930739581585, + "rewards/MMContentORM/mean": 0.3330000042915344, + "rewards/MMContentORM/std": 0.6553688704967499, + "rewards/MMFormatORM/mean": 0.6093749761581421, + "rewards/MMFormatORM/std": 0.12130690813064575, + "rewards/MMRubricORM/mean": -0.0625, + "rewards/MMRubricORM/std": 0.18662601709365845, + "step": 490, + "train_speed(iter/s)": 0.085298 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 270.2, + "completions/mean_length": 195.65, + "completions/min_length": 130.2, + "epoch": 0.23763802208353338, + "frac_reward_zero_std": 0.4, + "grad_norm": 0.14273209869861603, + "kl": 0.019012451171875, + "learning_rate": 9.871656772790088e-06, + "loss": 0.0007593894377350807, + "memory(GiB)": 25.29, + "reward": 0.3240499943494797, + "reward_std": 0.17599887698888778, + "rewards/MMContentORM/mean": 0.26449999809265134, + "rewards/MMContentORM/std": 0.7485530972480774, + "rewards/MMFormatORM/mean": 0.5893749833106995, + "rewards/MMFormatORM/std": 0.14441428184509278, + "rewards/MMRubricORM/mean": -0.0875, + "rewards/MMRubricORM/std": 0.21124515533447266, + "step": 495, + "train_speed(iter/s)": 0.085438 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 338.0, + "completions/mean_length": 206.625, + "completions/min_length": 114.4, + "epoch": 0.2400384061449832, + "frac_reward_zero_std": 0.4, + "grad_norm": 0.1166425347328186, + "kl": 0.02442626953125, + "learning_rate": 9.86715017218903e-06, + "loss": 0.0009763010777533055, + "memory(GiB)": 25.29, + "reward": 0.41664999127388, + "reward_std": 0.15973542779684066, + "rewards/MMContentORM/mean": 0.4634999930858612, + "rewards/MMContentORM/std": 0.678302276134491, + "rewards/MMFormatORM/mean": 0.6093749880790711, + "rewards/MMFormatORM/std": 0.14190345108509064, + "rewards/MMRubricORM/mean": -0.0625, + "rewards/MMRubricORM/std": 0.21831300854682922, + "step": 500, + "train_speed(iter/s)": 0.085421 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 289.0, + "completions/mean_length": 203.2125, + "completions/min_length": 135.0, + "epoch": 0.24243879020643302, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.1965927928686142, + "kl": 0.01986083984375, + "learning_rate": 9.862566873978227e-06, + "loss": 0.000794212706387043, + "memory(GiB)": 25.29, + "reward": 0.4267499804496765, + "reward_std": 0.12225875928997994, + "rewards/MMContentORM/mean": 0.4600000023841858, + "rewards/MMContentORM/std": 0.6565978765487671, + "rewards/MMFormatORM/mean": 0.6256249904632568, + "rewards/MMFormatORM/std": 0.09749999642372131, + "rewards/MMRubricORM/mean": -0.0375, + "rewards/MMRubricORM/std": 0.15, + "step": 505, + "train_speed(iter/s)": 0.085301 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 309.2, + "completions/mean_length": 196.475, + "completions/min_length": 123.0, + "epoch": 0.24483917426788285, + "frac_reward_zero_std": 0.35, + "grad_norm": 0.19395841658115387, + "kl": 0.017010498046875, + "learning_rate": 9.857906950382297e-06, + "loss": 0.0006808775477111339, + "memory(GiB)": 25.29, + "reward": 0.3696999788284302, + "reward_std": 0.15372501760721208, + "rewards/MMContentORM/mean": 0.36050000339746474, + "rewards/MMContentORM/std": 0.6960474014282226, + "rewards/MMFormatORM/mean": 0.6012499809265137, + "rewards/MMFormatORM/std": 0.12930812537670136, + "rewards/MMRubricORM/mean": -0.075, + "rewards/MMRubricORM/std": 0.19893558621406554, + "step": 510, + "train_speed(iter/s)": 0.085386 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 295.0, + "completions/mean_length": 211.9125, + "completions/min_length": 141.8, + "epoch": 0.2472395583293327, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.3381326496601105, + "kl": 0.027838134765625, + "learning_rate": 9.853170474833323e-06, + "loss": 0.0011151479557156563, + "memory(GiB)": 25.29, + "reward": 0.37864998877048495, + "reward_std": 0.23044609874486924, + "rewards/MMContentORM/mean": 0.4135000079870224, + "rewards/MMContentORM/std": 0.7175926685333252, + "rewards/MMFormatORM/mean": 0.5831249773502349, + "rewards/MMFormatORM/std": 0.16790457367897033, + "rewards/MMRubricORM/mean": -0.1, + "rewards/MMRubricORM/std": 0.2577557325363159, + "step": 515, + "train_speed(iter/s)": 0.085432 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 314.0, + "completions/mean_length": 208.3, + "completions/min_length": 109.2, + "epoch": 0.24963994239078252, + "frac_reward_zero_std": 0.4, + "grad_norm": 0.20441797375679016, + "kl": 0.021405029296875, + "learning_rate": 9.848357521969716e-06, + "loss": 0.0008581820875406265, + "memory(GiB)": 25.29, + "reward": 0.36869998276233673, + "reward_std": 0.2739331744611263, + "rewards/MMContentORM/mean": 0.41549999415874483, + "rewards/MMContentORM/std": 0.7344144463539124, + "rewards/MMFormatORM/mean": 0.568749976158142, + "rewards/MMFormatORM/std": 0.1936162531375885, + "rewards/MMRubricORM/mean": -0.125, + "rewards/MMRubricORM/std": 0.2978711724281311, + "step": 520, + "train_speed(iter/s)": 0.085467 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 287.2, + "completions/mean_length": 201.225, + "completions/min_length": 113.4, + "epoch": 0.25204032645223234, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.18100592494010925, + "kl": 0.030682373046875, + "learning_rate": 9.843468167635034e-06, + "loss": 0.0012254069559276104, + "memory(GiB)": 25.63, + "reward": 0.3865999817848206, + "reward_std": 0.2231628954410553, + "rewards/MMContentORM/mean": 0.41899999380111697, + "rewards/MMContentORM/std": 0.7231726169586181, + "rewards/MMFormatORM/mean": 0.5912499785423279, + "rewards/MMFormatORM/std": 0.18667025864124298, + "rewards/MMRubricORM/mean": -0.0875, + "rewards/MMRubricORM/std": 0.2866260170936584, + "step": 525, + "train_speed(iter/s)": 0.085558 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 295.0, + "completions/mean_length": 192.2625, + "completions/min_length": 113.8, + "epoch": 0.25444071051368217, + "frac_reward_zero_std": 0.55, + "grad_norm": 0.21060362458229065, + "kl": 0.02144775390625, + "learning_rate": 9.838502488876785e-06, + "loss": 0.0008578533306717873, + "memory(GiB)": 25.63, + "reward": 0.38974998593330384, + "reward_std": 0.11646047895774245, + "rewards/MMContentORM/mean": 0.425, + "rewards/MMContentORM/std": 0.7169785857200622, + "rewards/MMFormatORM/mean": 0.5931249856948853, + "rewards/MMFormatORM/std": 0.13730934262275696, + "rewards/MMRubricORM/mean": -0.0875, + "rewards/MMRubricORM/std": 0.21124515533447266, + "step": 530, + "train_speed(iter/s)": 0.085629 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 291.6, + "completions/mean_length": 196.6625, + "completions/min_length": 139.8, + "epoch": 0.25684109457513205, + "frac_reward_zero_std": 0.425, + "grad_norm": 0.215934157371521, + "kl": 0.017657470703125, + "learning_rate": 9.833460563945213e-06, + "loss": 0.0007070350926369429, + "memory(GiB)": 25.63, + "reward": 0.4114499926567078, + "reward_std": 0.18292852416634559, + "rewards/MMContentORM/mean": 0.45050002038478854, + "rewards/MMContentORM/std": 0.679290497303009, + "rewards/MMFormatORM/mean": 0.6093749880790711, + "rewards/MMFormatORM/std": 0.14190345108509064, + "rewards/MMRubricORM/mean": -0.0625, + "rewards/MMRubricORM/std": 0.21831300854682922, + "step": 535, + "train_speed(iter/s)": 0.085733 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 302.6, + "completions/mean_length": 199.0875, + "completions/min_length": 125.4, + "epoch": 0.25924147863658187, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.3002188503742218, + "kl": 0.01728515625, + "learning_rate": 9.828342472292063e-06, + "loss": 0.0006916997022926808, + "memory(GiB)": 25.63, + "reward": 0.3853999853134155, + "reward_std": 0.1479267368093133, + "rewards/MMContentORM/mean": 0.37100000232458114, + "rewards/MMContentORM/std": 0.6368636965751648, + "rewards/MMFormatORM/mean": 0.6174999833106994, + "rewards/MMFormatORM/std": 0.10940345227718354, + "rewards/MMRubricORM/mean": -0.05, + "rewards/MMRubricORM/std": 0.16831300854682923, + "step": 540, + "train_speed(iter/s)": 0.085766 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.4, + "completions/mean_length": 202.4125, + "completions/min_length": 129.8, + "epoch": 0.2616418626980317, + "frac_reward_zero_std": 0.325, + "grad_norm": 0.1911313533782959, + "kl": 0.019146728515625, + "learning_rate": 9.823148294569342e-06, + "loss": 0.0007662074174731969, + "memory(GiB)": 25.63, + "reward": 0.31649998128414153, + "reward_std": 0.2351837173104286, + "rewards/MMContentORM/mean": 0.25999999046325684, + "rewards/MMContentORM/std": 0.7403302311897277, + "rewards/MMFormatORM/mean": 0.5812499761581421, + "rewards/MMFormatORM/std": 0.17399703860282897, + "rewards/MMRubricORM/mean": -0.1, + "rewards/MMRubricORM/std": 0.2672485947608948, + "step": 545, + "train_speed(iter/s)": 0.085865 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0125, + "completions/max_length": 440.2, + "completions/mean_length": 210.1125, + "completions/min_length": 121.8, + "epoch": 0.2640422467594815, + "frac_reward_zero_std": 0.575, + "grad_norm": 0.13542525470256805, + "kl": 0.019854736328125, + "learning_rate": 9.817878112628026e-06, + "loss": 0.0007948323152959346, + "memory(GiB)": 25.63, + "reward": 0.4270999848842621, + "reward_std": 0.14835099875926971, + "rewards/MMContentORM/mean": 0.5040000081062317, + "rewards/MMContentORM/std": 0.6937057256698609, + "rewards/MMFormatORM/mean": 0.6012499809265137, + "rewards/MMFormatORM/std": 0.15380690693855287, + "rewards/MMRubricORM/mean": -0.075, + "rewards/MMRubricORM/std": 0.23662601709365844, + "step": 550, + "train_speed(iter/s)": 0.08564 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 295.2, + "completions/mean_length": 191.6125, + "completions/min_length": 107.2, + "epoch": 0.26644263082093134, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.24546104669570923, + "kl": 0.0204833984375, + "learning_rate": 9.812532009516787e-06, + "loss": 0.000820968858897686, + "memory(GiB)": 25.63, + "reward": 0.4259999990463257, + "reward_std": 0.14651251956820488, + "rewards/MMContentORM/mean": 0.47249999046325686, + "rewards/MMContentORM/std": 0.6199936449527741, + "rewards/MMFormatORM/mean": 0.6174999713897705, + "rewards/MMFormatORM/std": 0.08880690932273864, + "rewards/MMRubricORM/mean": -0.05, + "rewards/MMRubricORM/std": 0.13662601709365846, + "step": 555, + "train_speed(iter/s)": 0.085693 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 282.0, + "completions/mean_length": 201.3375, + "completions/min_length": 147.8, + "epoch": 0.26884301488238116, + "frac_reward_zero_std": 0.475, + "grad_norm": 0.19194868206977844, + "kl": 0.0193115234375, + "learning_rate": 9.807110069480682e-06, + "loss": 0.0007728527300059796, + "memory(GiB)": 25.63, + "reward": 0.4393999844789505, + "reward_std": 0.15315932929515838, + "rewards/MMContentORM/mean": 0.5384999871253967, + "rewards/MMContentORM/std": 0.678757655620575, + "rewards/MMFormatORM/mean": 0.5974999785423278, + "rewards/MMFormatORM/std": 0.10973276048898697, + "rewards/MMRubricORM/mean": -0.075, + "rewards/MMRubricORM/std": 0.14574271440505981, + "step": 560, + "train_speed(iter/s)": 0.085797 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 287.8, + "completions/mean_length": 198.1875, + "completions/min_length": 138.0, + "epoch": 0.27124339894383104, + "frac_reward_zero_std": 0.35, + "grad_norm": 0.2646057605743408, + "kl": 0.02037353515625, + "learning_rate": 9.801612377959817e-06, + "loss": 0.0008142871782183647, + "memory(GiB)": 25.63, + "reward": 0.49279999136924746, + "reward_std": 0.07127636531367898, + "rewards/MMContentORM/mean": 0.5819999873638153, + "rewards/MMContentORM/std": 0.6154716610908508, + "rewards/MMFormatORM/mean": 0.6499999761581421, + "rewards/MMFormatORM/std": 0.0, + "rewards/MMRubricORM/mean": 0.0, + "rewards/MMRubricORM/std": 0.0, + "step": 565, + "train_speed(iter/s)": 0.085862 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 293.8, + "completions/mean_length": 194.2625, + "completions/min_length": 130.8, + "epoch": 0.27364378300528086, + "frac_reward_zero_std": 0.45, + "grad_norm": 0.25992026925086975, + "kl": 0.020379638671875, + "learning_rate": 9.796039021588011e-06, + "loss": 0.0008148624561727047, + "memory(GiB)": 25.63, + "reward": 0.3938499867916107, + "reward_std": 0.1806657761335373, + "rewards/MMContentORM/mean": 0.40650000274181364, + "rewards/MMContentORM/std": 0.6918909192085266, + "rewards/MMFormatORM/mean": 0.6093749880790711, + "rewards/MMFormatORM/std": 0.11740466952323914, + "rewards/MMRubricORM/mean": -0.0625, + "rewards/MMRubricORM/std": 0.18062257766723633, + "step": 570, + "train_speed(iter/s)": 0.085903 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0125, + "completions/max_length": 435.8, + "completions/mean_length": 207.45, + "completions/min_length": 110.0, + "epoch": 0.2760441670667307, + "frac_reward_zero_std": 0.325, + "grad_norm": 0.19565346837043762, + "kl": 0.13974609375, + "learning_rate": 9.790390088191423e-06, + "loss": 0.005578663945198059, + "memory(GiB)": 25.63, + "reward": 0.36904999017715456, + "reward_std": 0.21177847310900688, + "rewards/MMContentORM/mean": 0.40199999809265136, + "rewards/MMContentORM/std": 0.7141570091247559, + "rewards/MMFormatORM/mean": 0.5768749713897705, + "rewards/MMFormatORM/std": 0.1856150358915329, + "rewards/MMRubricORM/mean": -0.1125, + "rewards/MMRubricORM/std": 0.285561603307724, + "step": 575, + "train_speed(iter/s)": 0.085739 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0125, + "completions/max_length": 444.2, + "completions/mean_length": 208.9625, + "completions/min_length": 132.0, + "epoch": 0.2784445511281805, + "frac_reward_zero_std": 0.45, + "grad_norm": 0.1956750750541687, + "kl": 0.026104736328125, + "learning_rate": 9.784665666787176e-06, + "loss": 0.0010431693866848946, + "memory(GiB)": 25.63, + "reward": 0.5113999903202057, + "reward_std": 0.13548165708780288, + "rewards/MMContentORM/mean": 0.6860000014305114, + "rewards/MMContentORM/std": 0.5733676970005035, + "rewards/MMFormatORM/mean": 0.6174999952316285, + "rewards/MMFormatORM/std": 0.12999999523162842, + "rewards/MMRubricORM/mean": -0.05, + "rewards/MMRubricORM/std": 0.2, + "step": 580, + "train_speed(iter/s)": 0.085557 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 285.2, + "completions/mean_length": 198.375, + "completions/min_length": 128.6, + "epoch": 0.28084493518963033, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.1519889384508133, + "kl": 0.02601318359375, + "learning_rate": 9.778865847581941e-06, + "loss": 0.0010399827733635902, + "memory(GiB)": 25.63, + "reward": 0.3574499785900116, + "reward_std": 0.22040518671274184, + "rewards/MMContentORM/mean": 0.3730000019073486, + "rewards/MMContentORM/std": 0.7398205995559692, + "rewards/MMFormatORM/mean": 0.5768749952316284, + "rewards/MMFormatORM/std": 0.19223275780677795, + "rewards/MMRubricORM/mean": -0.1125, + "rewards/MMRubricORM/std": 0.29574271440505984, + "step": 585, + "train_speed(iter/s)": 0.085641 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 294.2, + "completions/mean_length": 198.475, + "completions/min_length": 120.0, + "epoch": 0.28324531925108015, + "frac_reward_zero_std": 0.475, + "grad_norm": 0.3285408318042755, + "kl": 0.02108154296875, + "learning_rate": 9.772990721970534e-06, + "loss": 0.0008435861207544803, + "memory(GiB)": 25.63, + "reward": 0.4222499907016754, + "reward_std": 0.10656098783947528, + "rewards/MMContentORM/mean": 0.4525000065565109, + "rewards/MMContentORM/std": 0.6456537485122681, + "rewards/MMFormatORM/mean": 0.621874988079071, + "rewards/MMFormatORM/std": 0.11249999552965165, + "rewards/MMRubricORM/mean": -0.0375, + "rewards/MMRubricORM/std": 0.15, + "step": 590, + "train_speed(iter/s)": 0.085706 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 288.8, + "completions/mean_length": 194.6, + "completions/min_length": 110.8, + "epoch": 0.28564570331253003, + "frac_reward_zero_std": 0.55, + "grad_norm": 0.26459255814552307, + "kl": 0.02178955078125, + "learning_rate": 9.767040382534456e-06, + "loss": 0.000872167106717825, + "memory(GiB)": 25.63, + "reward": 0.4603499710559845, + "reward_std": 0.0990656575653702, + "rewards/MMContentORM/mean": 0.5440000176429749, + "rewards/MMContentORM/std": 0.6181544482707977, + "rewards/MMFormatORM/mean": 0.6256249785423279, + "rewards/MMFormatORM/std": 0.07690345346927643, + "rewards/MMRubricORM/mean": -0.0375, + "rewards/MMRubricORM/std": 0.11831300854682922, + "step": 595, + "train_speed(iter/s)": 0.085788 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 290.8, + "completions/mean_length": 206.9, + "completions/min_length": 118.2, + "epoch": 0.28804608737397985, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.24745135009288788, + "kl": 0.018096923828125, + "learning_rate": 9.761014923040453e-06, + "loss": 0.0007242465391755104, + "memory(GiB)": 25.63, + "reward": 0.45274998545646666, + "reward_std": 0.096237235609442, + "rewards/MMContentORM/mean": 0.5249999940395356, + "rewards/MMContentORM/std": 0.6533244967460632, + "rewards/MMFormatORM/mean": 0.6256249785423279, + "rewards/MMFormatORM/std": 0.07690345346927643, + "rewards/MMRubricORM/mean": -0.0375, + "rewards/MMRubricORM/std": 0.11831300854682922, + "step": 600, + "train_speed(iter/s)": 0.085872 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 310.8, + "completions/mean_length": 201.075, + "completions/min_length": 126.2, + "epoch": 0.2904464714354297, + "frac_reward_zero_std": 0.35, + "grad_norm": 0.27756911516189575, + "kl": 0.0199951171875, + "learning_rate": 9.754914438439021e-06, + "loss": 0.0007998712360858917, + "memory(GiB)": 25.63, + "reward": 0.4426999866962433, + "reward_std": 0.14665394686162472, + "rewards/MMContentORM/mean": 0.5430000066757202, + "rewards/MMContentORM/std": 0.6288729965686798, + "rewards/MMFormatORM/mean": 0.6012499809265137, + "rewards/MMFormatORM/std": 0.12930812537670136, + "rewards/MMRubricORM/mean": -0.075, + "rewards/MMRubricORM/std": 0.19893558621406554, + "step": 605, + "train_speed(iter/s)": 0.085721 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 334.2, + "completions/mean_length": 209.825, + "completions/min_length": 141.2, + "epoch": 0.2928468554968795, + "frac_reward_zero_std": 0.4, + "grad_norm": 0.24841666221618652, + "kl": 0.01837158203125, + "learning_rate": 9.748739024862923e-06, + "loss": 0.0007352313958108425, + "memory(GiB)": 25.63, + "reward": 0.4437499940395355, + "reward_std": 0.1993333987891674, + "rewards/MMContentORM/mean": 0.5800000041723251, + "rewards/MMContentORM/std": 0.6180064260959626, + "rewards/MMFormatORM/mean": 0.5793749749660492, + "rewards/MMFormatORM/std": 0.11520133018493653, + "rewards/MMRubricORM/mean": -0.1, + "rewards/MMRubricORM/std": 0.17888544797897338, + "step": 610, + "train_speed(iter/s)": 0.085709 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 313.0, + "completions/mean_length": 214.075, + "completions/min_length": 147.6, + "epoch": 0.2952472395583293, + "frac_reward_zero_std": 0.425, + "grad_norm": 0.31791460514068604, + "kl": 0.0159912109375, + "learning_rate": 9.74248877962567e-06, + "loss": 0.0006397653836756944, + "memory(GiB)": 25.63, + "reward": 0.40454997420310973, + "reward_std": 0.10670241061598063, + "rewards/MMContentORM/mean": 0.40449999570846557, + "rewards/MMContentORM/std": 0.6732450246810913, + "rewards/MMFormatORM/mean": 0.6256249904632568, + "rewards/MMFormatORM/std": 0.09749999642372131, + "rewards/MMRubricORM/mean": -0.0375, + "rewards/MMRubricORM/std": 0.15, + "step": 615, + "train_speed(iter/s)": 0.085735 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0125, + "completions/max_length": 474.4, + "completions/mean_length": 219.6375, + "completions/min_length": 134.8, + "epoch": 0.29764762361977914, + "frac_reward_zero_std": 0.3, + "grad_norm": 0.252468466758728, + "kl": 0.019781494140625, + "learning_rate": 9.73616380121998e-06, + "loss": 0.0007909733802080154, + "memory(GiB)": 25.63, + "reward": 0.3432499796152115, + "reward_std": 0.24713381975889206, + "rewards/MMContentORM/mean": 0.3699999928474426, + "rewards/MMContentORM/std": 0.7536191344261169, + "rewards/MMFormatORM/mean": 0.5568749785423279, + "rewards/MMFormatORM/std": 0.2235463410615921, + "rewards/MMRubricORM/mean": -0.1375, + "rewards/MMRubricORM/std": 0.34438174962997437, + "step": 620, + "train_speed(iter/s)": 0.085518 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 329.8, + "completions/mean_length": 212.825, + "completions/min_length": 125.0, + "epoch": 0.300048007681229, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.21043799817562103, + "kl": 0.017236328125, + "learning_rate": 9.729764189316239e-06, + "loss": 0.0006894416641443968, + "memory(GiB)": 25.63, + "reward": 0.46269998550415037, + "reward_std": 0.17041273787617683, + "rewards/MMContentORM/mean": 0.5930000126361847, + "rewards/MMContentORM/std": 0.6541426777839661, + "rewards/MMFormatORM/mean": 0.6012499809265137, + "rewards/MMFormatORM/std": 0.15380690693855287, + "rewards/MMRubricORM/mean": -0.075, + "rewards/MMRubricORM/std": 0.23662601709365844, + "step": 625, + "train_speed(iter/s)": 0.085515 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 334.0, + "completions/mean_length": 209.8125, + "completions/min_length": 139.8, + "epoch": 0.30244839174267885, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.18214058876037598, + "kl": 0.01575927734375, + "learning_rate": 9.72329004476092e-06, + "loss": 0.0006303795147687197, + "memory(GiB)": 25.63, + "reward": 0.4604499816894531, + "reward_std": 0.13300678343512118, + "rewards/MMContentORM/mean": 0.5730000138282776, + "rewards/MMContentORM/std": 0.6502374410629272, + "rewards/MMFormatORM/mean": 0.6093749880790711, + "rewards/MMFormatORM/std": 0.14190345108509064, + "rewards/MMRubricORM/mean": -0.0625, + "rewards/MMRubricORM/std": 0.21831300854682922, + "step": 630, + "train_speed(iter/s)": 0.085502 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0125, + "completions/max_length": 438.6, + "completions/mean_length": 216.5875, + "completions/min_length": 127.6, + "epoch": 0.30484877580412867, + "frac_reward_zero_std": 0.35, + "grad_norm": 0.538560152053833, + "kl": 0.01651611328125, + "learning_rate": 9.716741469575003e-06, + "loss": 0.00066067217849195, + "memory(GiB)": 25.63, + "reward": 0.3206499844789505, + "reward_std": 0.21290984600782395, + "rewards/MMContentORM/mean": 0.2810000032186508, + "rewards/MMContentORM/std": 0.7222515106201172, + "rewards/MMFormatORM/mean": 0.576874977350235, + "rewards/MMFormatORM/std": 0.17944467663764954, + "rewards/MMRubricORM/mean": -0.1125, + "rewards/MMRubricORM/std": 0.27606874108314516, + "step": 635, + "train_speed(iter/s)": 0.085343 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 318.0, + "completions/mean_length": 209.8875, + "completions/min_length": 135.0, + "epoch": 0.3072491598655785, + "frac_reward_zero_std": 0.35, + "grad_norm": 0.22361965477466583, + "kl": 0.017071533203125, + "learning_rate": 9.710118566952355e-06, + "loss": 0.0006829463876783848, + "memory(GiB)": 25.63, + "reward": 0.34789999127388, + "reward_std": 0.1711198389530182, + "rewards/MMContentORM/mean": 0.36350000500679014, + "rewards/MMContentORM/std": 0.7551613569259643, + "rewards/MMFormatORM/mean": 0.5687499821186066, + "rewards/MMFormatORM/std": 0.18744589388370514, + "rewards/MMRubricORM/mean": -0.125, + "rewards/MMRubricORM/std": 0.28837831020355226, + "step": 640, + "train_speed(iter/s)": 0.085374 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 284.8, + "completions/mean_length": 200.7875, + "completions/min_length": 137.6, + "epoch": 0.3096495439270283, + "frac_reward_zero_std": 0.35, + "grad_norm": 0.2611067295074463, + "kl": 0.017279052734375, + "learning_rate": 9.703421441258116e-06, + "loss": 0.0006911037024110555, + "memory(GiB)": 25.63, + "reward": 0.5186999857425689, + "reward_std": 0.09008539766073227, + "rewards/MMContentORM/mean": 0.6755000114440918, + "rewards/MMContentORM/std": 0.5361402273178101, + "rewards/MMFormatORM/mean": 0.6337499856948853, + "rewards/MMFormatORM/std": 0.06499999761581421, + "rewards/MMRubricORM/mean": -0.025, + "rewards/MMRubricORM/std": 0.1, + "step": 645, + "train_speed(iter/s)": 0.085458 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 320.2, + "completions/mean_length": 209.5625, + "completions/min_length": 123.0, + "epoch": 0.31204992798847814, + "frac_reward_zero_std": 0.425, + "grad_norm": 0.217108815908432, + "kl": 0.01778564453125, + "learning_rate": 9.696650198027045e-06, + "loss": 0.0007126822136342525, + "memory(GiB)": 25.63, + "reward": 0.35569998621940613, + "reward_std": 0.15895759630948306, + "rewards/MMContentORM/mean": 0.3830000050365925, + "rewards/MMContentORM/std": 0.7334277391433716, + "rewards/MMFormatORM/mean": 0.568749976158142, + "rewards/MMFormatORM/std": 0.1936162531375885, + "rewards/MMRubricORM/mean": -0.125, + "rewards/MMRubricORM/std": 0.2978711724281311, + "step": 650, + "train_speed(iter/s)": 0.085492 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 324.0, + "completions/mean_length": 217.1375, + "completions/min_length": 142.2, + "epoch": 0.314450312049928, + "frac_reward_zero_std": 0.325, + "grad_norm": 0.20862624049186707, + "kl": 0.016278076171875, + "learning_rate": 9.689804943961868e-06, + "loss": 0.0006509024649858474, + "memory(GiB)": 25.63, + "reward": 0.3771999955177307, + "reward_std": 0.2166575163602829, + "rewards/MMContentORM/mean": 0.4080000162124634, + "rewards/MMContentORM/std": 0.7419367551803588, + "rewards/MMFormatORM/mean": 0.5849999785423279, + "rewards/MMFormatORM/std": 0.19821036159992217, + "rewards/MMRubricORM/mean": -0.1, + "rewards/MMRubricORM/std": 0.3049390256404877, + "step": 655, + "train_speed(iter/s)": 0.085518 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 322.4, + "completions/mean_length": 214.8125, + "completions/min_length": 146.4, + "epoch": 0.31685069611137784, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.10907348245382309, + "kl": 0.01588134765625, + "learning_rate": 9.682885786931581e-06, + "loss": 0.0006347180809825659, + "memory(GiB)": 25.63, + "reward": 0.481499981880188, + "reward_std": 0.08216580778826028, + "rewards/MMContentORM/mean": 0.5825000107288361, + "rewards/MMContentORM/std": 0.592711991071701, + "rewards/MMFormatORM/mean": 0.6337499856948853, + "rewards/MMFormatORM/std": 0.06499999761581421, + "rewards/MMRubricORM/mean": -0.025, + "rewards/MMRubricORM/std": 0.1, + "step": 660, + "train_speed(iter/s)": 0.085548 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 275.8, + "completions/mean_length": 199.2, + "completions/min_length": 142.8, + "epoch": 0.31925108017282766, + "frac_reward_zero_std": 0.35, + "grad_norm": 0.24605858325958252, + "kl": 0.01693115234375, + "learning_rate": 9.675892835969767e-06, + "loss": 0.0006764709949493408, + "memory(GiB)": 25.63, + "reward": 0.45569999814033507, + "reward_std": 0.10677312165498734, + "rewards/MMContentORM/mean": 0.5180000066757202, + "rewards/MMContentORM/std": 0.6405679821968079, + "rewards/MMFormatORM/mean": 0.6337499856948853, + "rewards/MMFormatORM/std": 0.06499999761581421, + "rewards/MMRubricORM/mean": -0.025, + "rewards/MMRubricORM/std": 0.1, + "step": 665, + "train_speed(iter/s)": 0.08562 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 304.2, + "completions/mean_length": 206.3625, + "completions/min_length": 142.4, + "epoch": 0.3216514642342775, + "frac_reward_zero_std": 0.55, + "grad_norm": 0.13116636872291565, + "kl": 0.014251708984375, + "learning_rate": 9.668826201272866e-06, + "loss": 0.0005692524835467338, + "memory(GiB)": 25.63, + "reward": 0.5005499720573425, + "reward_std": 0.08916615936905145, + "rewards/MMContentORM/mean": 0.6444999992847442, + "rewards/MMContentORM/std": 0.4851596847176552, + "rewards/MMFormatORM/mean": 0.6256249904632568, + "rewards/MMFormatORM/std": 0.09749999642372131, + "rewards/MMRubricORM/mean": -0.0375, + "rewards/MMRubricORM/std": 0.15, + "step": 670, + "train_speed(iter/s)": 0.085665 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 290.8, + "completions/mean_length": 200.4875, + "completions/min_length": 114.8, + "epoch": 0.3240518482957273, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.21502132713794708, + "kl": 0.01915283203125, + "learning_rate": 9.66168599419844e-06, + "loss": 0.0007658099755644798, + "memory(GiB)": 25.63, + "reward": 0.4354999840259552, + "reward_std": 0.0936209331266582, + "rewards/MMContentORM/mean": 0.5250000029802322, + "rewards/MMContentORM/std": 0.6535530805587768, + "rewards/MMFormatORM/mean": 0.6012499868869782, + "rewards/MMFormatORM/std": 0.12313776612281799, + "rewards/MMRubricORM/mean": -0.075, + "rewards/MMRubricORM/std": 0.1894427239894867, + "step": 675, + "train_speed(iter/s)": 0.08572 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 306.8, + "completions/mean_length": 201.55, + "completions/min_length": 131.2, + "epoch": 0.32645223235717713, + "frac_reward_zero_std": 0.4, + "grad_norm": 0.1893157660961151, + "kl": 0.01766357421875, + "learning_rate": 9.654472327263426e-06, + "loss": 0.0007074634078890086, + "memory(GiB)": 25.63, + "reward": 0.425249981880188, + "reward_std": 0.1939593806862831, + "rewards/MMContentORM/mean": 0.4850000083446503, + "rewards/MMContentORM/std": 0.6584623217582702, + "rewards/MMFormatORM/mean": 0.6093749880790711, + "rewards/MMFormatORM/std": 0.14190345108509064, + "rewards/MMRubricORM/mean": -0.0625, + "rewards/MMRubricORM/std": 0.21831300854682922, + "step": 680, + "train_speed(iter/s)": 0.085754 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 337.0, + "completions/mean_length": 213.975, + "completions/min_length": 132.0, + "epoch": 0.32885261641862695, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.18459384143352509, + "kl": 0.015380859375, + "learning_rate": 9.647185314142354e-06, + "loss": 0.0006157746538519859, + "memory(GiB)": 25.63, + "reward": 0.3858999848365784, + "reward_std": 0.12006673291325569, + "rewards/MMContentORM/mean": 0.34350000619888305, + "rewards/MMContentORM/std": 0.6572001695632934, + "rewards/MMFormatORM/mean": 0.6337499856948853, + "rewards/MMFormatORM/std": 0.06499999761581421, + "rewards/MMRubricORM/mean": -0.025, + "rewards/MMRubricORM/std": 0.1, + "step": 685, + "train_speed(iter/s)": 0.085759 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 325.0, + "completions/mean_length": 203.2625, + "completions/min_length": 140.4, + "epoch": 0.33125300048007683, + "frac_reward_zero_std": 0.475, + "grad_norm": 0.25200966000556946, + "kl": 0.0184814453125, + "learning_rate": 9.63982506966556e-06, + "loss": 0.0007389162667095662, + "memory(GiB)": 25.63, + "reward": 0.43359999656677245, + "reward_std": 0.13519881889224053, + "rewards/MMContentORM/mean": 0.4915000021457672, + "rewards/MMContentORM/std": 0.6082589268684387, + "rewards/MMFormatORM/mean": 0.6174999833106994, + "rewards/MMFormatORM/std": 0.10940345227718354, + "rewards/MMRubricORM/mean": -0.05, + "rewards/MMRubricORM/std": 0.16831300854682923, + "step": 690, + "train_speed(iter/s)": 0.085756 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 299.8, + "completions/mean_length": 196.825, + "completions/min_length": 116.2, + "epoch": 0.33365338454152665, + "frac_reward_zero_std": 0.45, + "grad_norm": 0.21117174625396729, + "kl": 0.019537353515625, + "learning_rate": 9.632391709817374e-06, + "loss": 0.0007822229526937008, + "memory(GiB)": 25.63, + "reward": 0.4227499783039093, + "reward_std": 0.15464425683021546, + "rewards/MMContentORM/mean": 0.5075000166893006, + "rewards/MMContentORM/std": 0.7225377321243286, + "rewards/MMFormatORM/mean": 0.5931249856948853, + "rewards/MMFormatORM/std": 0.16180812418460847, + "rewards/MMRubricORM/mean": -0.0875, + "rewards/MMRubricORM/std": 0.24893558621406556, + "step": 695, + "train_speed(iter/s)": 0.085769 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 300.8, + "completions/mean_length": 199.0875, + "completions/min_length": 121.6, + "epoch": 0.3360537686029765, + "frac_reward_zero_std": 0.4, + "grad_norm": 0.17211079597473145, + "kl": 0.018890380859375, + "learning_rate": 9.624885351734296e-06, + "loss": 0.000755119789391756, + "memory(GiB)": 25.63, + "reward": 0.503549975156784, + "reward_std": 0.1313097208738327, + "rewards/MMContentORM/mean": 0.6520000100135803, + "rewards/MMContentORM/std": 0.5663350522518158, + "rewards/MMFormatORM/mean": 0.6256249904632568, + "rewards/MMFormatORM/std": 0.09749999642372131, + "rewards/MMRubricORM/mean": -0.0375, + "rewards/MMRubricORM/std": 0.15, + "step": 700, + "train_speed(iter/s)": 0.085818 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 291.6, + "completions/mean_length": 184.925, + "completions/min_length": 108.6, + "epoch": 0.3384541526644263, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.20221319794654846, + "kl": 0.023406982421875, + "learning_rate": 9.617306113703148e-06, + "loss": 0.0009360792115330696, + "memory(GiB)": 25.63, + "reward": 0.4147499859333038, + "reward_std": 0.1776959329843521, + "rewards/MMContentORM/mean": 0.46249999701976774, + "rewards/MMContentORM/std": 0.6942957758903503, + "rewards/MMFormatORM/mean": 0.6056249737739563, + "rewards/MMFormatORM/std": 0.12368168532848359, + "rewards/MMRubricORM/mean": -0.0625, + "rewards/MMRubricORM/std": 0.18662601709365845, + "step": 705, + "train_speed(iter/s)": 0.085714 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 271.6, + "completions/mean_length": 193.7125, + "completions/min_length": 135.2, + "epoch": 0.3408545367258761, + "frac_reward_zero_std": 0.475, + "grad_norm": 0.18443672358989716, + "kl": 0.02154541015625, + "learning_rate": 9.60965411515921e-06, + "loss": 0.0008609195239841938, + "memory(GiB)": 25.63, + "reward": 0.41414997577667234, + "reward_std": 0.08068087929859757, + "rewards/MMContentORM/mean": 0.42850000858306886, + "rewards/MMContentORM/std": 0.6322312831878663, + "rewards/MMFormatORM/mean": 0.6256249904632568, + "rewards/MMFormatORM/std": 0.09749999642372131, + "rewards/MMRubricORM/mean": -0.0375, + "rewards/MMRubricORM/std": 0.15, + "step": 710, + "train_speed(iter/s)": 0.085788 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0125, + "completions/max_length": 424.2, + "completions/mean_length": 213.925, + "completions/min_length": 115.0, + "epoch": 0.34325492078732595, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.25778675079345703, + "kl": 0.037451171875, + "learning_rate": 9.601929476684335e-06, + "loss": 0.0014980776235461236, + "memory(GiB)": 25.63, + "reward": 0.37259999513626096, + "reward_std": 0.20449528098106384, + "rewards/MMContentORM/mean": 0.3964999854564667, + "rewards/MMContentORM/std": 0.737775981426239, + "rewards/MMFormatORM/mean": 0.5849999785423279, + "rewards/MMFormatORM/std": 0.19821036159992217, + "rewards/MMRubricORM/mean": -0.1, + "rewards/MMRubricORM/std": 0.3049390256404877, + "step": 715, + "train_speed(iter/s)": 0.085686 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 302.6, + "completions/mean_length": 198.0375, + "completions/min_length": 122.4, + "epoch": 0.3456553048487758, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.21450744569301605, + "kl": 0.01873779296875, + "learning_rate": 9.594132320005056e-06, + "loss": 0.0007497821934521198, + "memory(GiB)": 25.63, + "reward": 0.4351499855518341, + "reward_std": 0.11151074110530317, + "rewards/MMContentORM/mean": 0.48100000619888306, + "rewards/MMContentORM/std": 0.6345597028732299, + "rewards/MMFormatORM/mean": 0.6256249785423279, + "rewards/MMFormatORM/std": 0.07690345346927643, + "rewards/MMRubricORM/mean": -0.0375, + "rewards/MMRubricORM/std": 0.11831300854682922, + "step": 720, + "train_speed(iter/s)": 0.08572 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 332.6, + "completions/mean_length": 203.5, + "completions/min_length": 127.2, + "epoch": 0.34805568891022565, + "frac_reward_zero_std": 0.6, + "grad_norm": 0.17188891768455505, + "kl": 0.017034912109375, + "learning_rate": 9.58626276799066e-06, + "loss": 0.0006807168014347553, + "memory(GiB)": 25.63, + "reward": 0.46594999432563783, + "reward_std": 0.10245977491140365, + "rewards/MMContentORM/mean": 0.5580000162124634, + "rewards/MMContentORM/std": 0.5856799840927124, + "rewards/MMFormatORM/mean": 0.6256249785423279, + "rewards/MMFormatORM/std": 0.07690345346927643, + "rewards/MMRubricORM/mean": -0.0375, + "rewards/MMRubricORM/std": 0.11831300854682922, + "step": 725, + "train_speed(iter/s)": 0.085717 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 307.8, + "completions/mean_length": 207.4625, + "completions/min_length": 142.6, + "epoch": 0.35045607297167547, + "frac_reward_zero_std": 0.4, + "grad_norm": 0.2247178703546524, + "kl": 0.01895751953125, + "learning_rate": 9.57832094465126e-06, + "loss": 0.000757955340668559, + "memory(GiB)": 25.63, + "reward": 0.4785999894142151, + "reward_std": 0.1292591169476509, + "rewards/MMContentORM/mean": 0.6039999961853028, + "rewards/MMContentORM/std": 0.6318910479545593, + "rewards/MMFormatORM/mean": 0.6174999833106994, + "rewards/MMFormatORM/std": 0.10940345227718354, + "rewards/MMRubricORM/mean": -0.05, + "rewards/MMRubricORM/std": 0.16831300854682923, + "step": 730, + "train_speed(iter/s)": 0.085751 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 294.4, + "completions/mean_length": 204.5125, + "completions/min_length": 130.6, + "epoch": 0.3528564570331253, + "frac_reward_zero_std": 0.425, + "grad_norm": 0.2592087686061859, + "kl": 0.02066650390625, + "learning_rate": 9.57030697513583e-06, + "loss": 0.0008267030119895935, + "memory(GiB)": 25.63, + "reward": 0.4224999785423279, + "reward_std": 0.15726054804399608, + "rewards/MMContentORM/mean": 0.49250001907348634, + "rewards/MMContentORM/std": 0.6467598974704742, + "rewards/MMFormatORM/mean": 0.6012499868869782, + "rewards/MMFormatORM/std": 0.12313776612281799, + "rewards/MMRubricORM/mean": -0.075, + "rewards/MMRubricORM/std": 0.1894427239894867, + "step": 735, + "train_speed(iter/s)": 0.085797 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 337.2, + "completions/mean_length": 204.025, + "completions/min_length": 127.4, + "epoch": 0.3552568410945751, + "frac_reward_zero_std": 0.55, + "grad_norm": 0.1708785593509674, + "kl": 0.0177734375, + "learning_rate": 9.562220985730246e-06, + "loss": 0.0007100693415850401, + "memory(GiB)": 25.63, + "reward": 0.4913999915122986, + "reward_std": 0.11002581561915577, + "rewards/MMContentORM/mean": 0.6110000014305115, + "rewards/MMContentORM/std": 0.6001002073287964, + "rewards/MMFormatORM/mean": 0.6299999833106995, + "rewards/MMFormatORM/std": 0.07999999672174454, + "rewards/MMRubricORM/mean": -0.025, + "rewards/MMRubricORM/std": 0.1, + "step": 740, + "train_speed(iter/s)": 0.0858 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 371.4, + "completions/mean_length": 213.3375, + "completions/min_length": 134.6, + "epoch": 0.35765722515602494, + "frac_reward_zero_std": 0.525, + "grad_norm": 0.13782000541687012, + "kl": 0.01622314453125, + "learning_rate": 9.554063103855285e-06, + "loss": 0.0006494280882179737, + "memory(GiB)": 25.63, + "reward": 0.48459997177124026, + "reward_std": 0.1038032690063119, + "rewards/MMContentORM/mean": 0.5940000116825104, + "rewards/MMContentORM/std": 0.5801396131515503, + "rewards/MMFormatORM/mean": 0.6299999833106995, + "rewards/MMFormatORM/std": 0.06737477481365203, + "rewards/MMRubricORM/mean": -0.025, + "rewards/MMRubricORM/std": 0.1, + "step": 745, + "train_speed(iter/s)": 0.085736 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0125, + "completions/max_length": 459.6, + "completions/mean_length": 222.775, + "completions/min_length": 120.6, + "epoch": 0.3600576092174748, + "frac_reward_zero_std": 0.525, + "grad_norm": 0.1946718990802765, + "kl": 0.0191650390625, + "learning_rate": 9.54583345806462e-06, + "loss": 0.0007668656297028064, + "memory(GiB)": 25.63, + "reward": 0.4473499894142151, + "reward_std": 0.10330830663442611, + "rewards/MMContentORM/mean": 0.5115000009536743, + "rewards/MMContentORM/std": 0.6476596593856812, + "rewards/MMFormatORM/mean": 0.6256249904632568, + "rewards/MMFormatORM/std": 0.09749999642372131, + "rewards/MMRubricORM/mean": -0.0375, + "rewards/MMRubricORM/std": 0.15, + "step": 750, + "train_speed(iter/s)": 0.085561 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0125, + "completions/max_length": 450.4, + "completions/mean_length": 226.075, + "completions/min_length": 133.0, + "epoch": 0.36245799327892464, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.18249185383319855, + "kl": 0.019720458984375, + "learning_rate": 9.537532178042796e-06, + "loss": 0.0007876944728195667, + "memory(GiB)": 25.63, + "reward": 0.38464999198913574, + "reward_std": 0.1914138063788414, + "rewards/MMContentORM/mean": 0.41600000858306885, + "rewards/MMContentORM/std": 0.7192264080047608, + "rewards/MMFormatORM/mean": 0.5893749833106995, + "rewards/MMFormatORM/std": 0.1641829013824463, + "rewards/MMRubricORM/mean": -0.0875, + "rewards/MMRubricORM/std": 0.24893558621406556, + "step": 755, + "train_speed(iter/s)": 0.085399 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0125, + "completions/max_length": 504.0, + "completions/mean_length": 236.925, + "completions/min_length": 153.0, + "epoch": 0.36485837734037446, + "frac_reward_zero_std": 0.45, + "grad_norm": 0.09791433811187744, + "kl": 0.01590576171875, + "learning_rate": 9.529159394603192e-06, + "loss": 0.0006361880339682102, + "memory(GiB)": 25.63, + "reward": 0.46654998064041137, + "reward_std": 0.18250426054000854, + "rewards/MMContentORM/mean": 0.6170000016689301, + "rewards/MMContentORM/std": 0.6278144896030426, + "rewards/MMFormatORM/mean": 0.5931249856948853, + "rewards/MMFormatORM/std": 0.18630690574645997, + "rewards/MMRubricORM/mean": -0.0875, + "rewards/MMRubricORM/std": 0.2866260170936584, + "step": 760, + "train_speed(iter/s)": 0.085164 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 322.0, + "completions/mean_length": 210.0375, + "completions/min_length": 142.8, + "epoch": 0.3672587614018243, + "frac_reward_zero_std": 0.6, + "grad_norm": 0.20352379977703094, + "kl": 0.01798095703125, + "learning_rate": 9.520715239685943e-06, + "loss": 0.0007194386795163155, + "memory(GiB)": 25.63, + "reward": 0.441599977016449, + "reward_std": 0.09956062764395028, + "rewards/MMContentORM/mean": 0.5115000009536743, + "rewards/MMContentORM/std": 0.6773199915885926, + "rewards/MMFormatORM/mean": 0.6174999833106994, + "rewards/MMFormatORM/std": 0.10940345227718354, + "rewards/MMRubricORM/mean": -0.05, + "rewards/MMRubricORM/std": 0.16831300854682923, + "step": 765, + "train_speed(iter/s)": 0.085188 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 288.2, + "completions/mean_length": 214.0125, + "completions/min_length": 154.8, + "epoch": 0.3696591454632741, + "frac_reward_zero_std": 0.575, + "grad_norm": 0.19353719055652618, + "kl": 0.01456298828125, + "learning_rate": 9.512199846355879e-06, + "loss": 0.0005822981242090463, + "memory(GiB)": 25.63, + "reward": 0.4994999825954437, + "reward_std": 0.11610694080591202, + "rewards/MMContentORM/mean": 0.6275000095367431, + "rewards/MMContentORM/std": 0.5707884192466736, + "rewards/MMFormatORM/mean": 0.6337499737739563, + "rewards/MMFormatORM/std": 0.04440345466136932, + "rewards/MMRubricORM/mean": -0.025, + "rewards/MMRubricORM/std": 0.06831300854682923, + "step": 770, + "train_speed(iter/s)": 0.085248 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 329.0, + "completions/mean_length": 222.7, + "completions/min_length": 154.4, + "epoch": 0.37205952952472393, + "frac_reward_zero_std": 0.55, + "grad_norm": 0.1961638182401657, + "kl": 0.01519775390625, + "learning_rate": 9.503613348800418e-06, + "loss": 0.0006085673347115516, + "memory(GiB)": 25.63, + "reward": 0.44324998259544374, + "reward_std": 0.14813887765631079, + "rewards/MMContentORM/mean": 0.5300000160932541, + "rewards/MMContentORM/std": 0.637897276878357, + "rewards/MMFormatORM/mean": 0.6093749880790711, + "rewards/MMFormatORM/std": 0.14190345108509064, + "rewards/MMRubricORM/mean": -0.0625, + "rewards/MMRubricORM/std": 0.21831300854682922, + "step": 775, + "train_speed(iter/s)": 0.085193 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0125, + "completions/max_length": 497.0, + "completions/mean_length": 242.0875, + "completions/min_length": 145.8, + "epoch": 0.3744599135861738, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.21977363526821136, + "kl": 0.01630859375, + "learning_rate": 9.494955882327455e-06, + "loss": 0.0006526447832584381, + "memory(GiB)": 25.63, + "reward": 0.42324999570846555, + "reward_std": 0.11490485058166086, + "rewards/MMContentORM/mean": 0.4800000011920929, + "rewards/MMContentORM/std": 0.5872390195727348, + "rewards/MMFormatORM/mean": 0.6093749761581421, + "rewards/MMFormatORM/std": 0.12130690813064575, + "rewards/MMRubricORM/mean": -0.0625, + "rewards/MMRubricORM/std": 0.18662601709365845, + "step": 780, + "train_speed(iter/s)": 0.084974 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 308.2, + "completions/mean_length": 211.6, + "completions/min_length": 129.8, + "epoch": 0.37686029764762363, + "frac_reward_zero_std": 0.55, + "grad_norm": 0.18094761669635773, + "kl": 0.019195556640625, + "learning_rate": 9.486227583363225e-06, + "loss": 0.0007680790033191443, + "memory(GiB)": 25.63, + "reward": 0.49619998335838317, + "reward_std": 0.1111571803689003, + "rewards/MMContentORM/mean": 0.6480000019073486, + "rewards/MMContentORM/std": 0.6220699548721313, + "rewards/MMFormatORM/mean": 0.6174999833106994, + "rewards/MMFormatORM/std": 0.10940345227718354, + "rewards/MMRubricORM/mean": -0.05, + "rewards/MMRubricORM/std": 0.16831300854682923, + "step": 785, + "train_speed(iter/s)": 0.084985 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 307.4, + "completions/mean_length": 212.1875, + "completions/min_length": 132.8, + "epoch": 0.37926068170907346, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.2051294595003128, + "kl": 0.019140625, + "learning_rate": 9.47742858945016e-06, + "loss": 0.0007654055021703243, + "memory(GiB)": 25.63, + "reward": 0.42904997169971465, + "reward_std": 0.13145114853978157, + "rewards/MMContentORM/mean": 0.49449999928474425, + "rewards/MMContentORM/std": 0.6315191209316253, + "rewards/MMFormatORM/mean": 0.6093749761581421, + "rewards/MMFormatORM/std": 0.12130690813064575, + "rewards/MMRubricORM/mean": -0.0625, + "rewards/MMRubricORM/std": 0.18662601709365845, + "step": 790, + "train_speed(iter/s)": 0.084986 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.025, + "completions/max_length": 616.6, + "completions/mean_length": 243.05, + "completions/min_length": 138.0, + "epoch": 0.3816610657705233, + "frac_reward_zero_std": 0.4, + "grad_norm": 0.14649176597595215, + "kl": 0.022198486328125, + "learning_rate": 9.468559039244718e-06, + "loss": 0.000887654721736908, + "memory(GiB)": 25.63, + "reward": 0.46064999103546145, + "reward_std": 0.15181582197546958, + "rewards/MMContentORM/mean": 0.5735000014305115, + "rewards/MMContentORM/std": 0.6020529091358184, + "rewards/MMFormatORM/mean": 0.6093749880790711, + "rewards/MMFormatORM/std": 0.14190345108509064, + "rewards/MMRubricORM/mean": -0.0625, + "rewards/MMRubricORM/std": 0.21831300854682922, + "step": 795, + "train_speed(iter/s)": 0.084659 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 306.4, + "completions/mean_length": 213.4625, + "completions/min_length": 121.0, + "epoch": 0.3840614498319731, + "frac_reward_zero_std": 0.575, + "grad_norm": 0.17191362380981445, + "kl": 0.01591796875, + "learning_rate": 9.459619072515196e-06, + "loss": 0.0006367039866745472, + "memory(GiB)": 25.63, + "reward": 0.46714999675750735, + "reward_std": 0.07474118582904339, + "rewards/MMContentORM/mean": 0.5484999895095826, + "rewards/MMContentORM/std": 0.654664158821106, + "rewards/MMFormatORM/mean": 0.6318749904632568, + "rewards/MMFormatORM/std": 0.07249999642372132, + "rewards/MMRubricORM/mean": -0.025, + "rewards/MMRubricORM/std": 0.1, + "step": 800, + "train_speed(iter/s)": 0.084706 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 318.0, + "completions/mean_length": 216.2375, + "completions/min_length": 132.4, + "epoch": 0.3864618338934229, + "frac_reward_zero_std": 0.35, + "grad_norm": 0.18298013508319855, + "kl": 0.018414306640625, + "learning_rate": 9.450608830139537e-06, + "loss": 0.0007364887278527021, + "memory(GiB)": 25.63, + "reward": 0.4408999800682068, + "reward_std": 0.16447303146123887, + "rewards/MMContentORM/mean": 0.5385000050067902, + "rewards/MMContentORM/std": 0.686086630821228, + "rewards/MMFormatORM/mean": 0.6012499928474426, + "rewards/MMFormatORM/std": 0.17440344989299775, + "rewards/MMRubricORM/mean": -0.075, + "rewards/MMRubricORM/std": 0.2683130085468292, + "step": 805, + "train_speed(iter/s)": 0.084608 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0125, + "completions/max_length": 457.8, + "completions/mean_length": 218.7375, + "completions/min_length": 117.6, + "epoch": 0.3888622179548728, + "frac_reward_zero_std": 0.525, + "grad_norm": 0.09511148929595947, + "kl": 0.022802734375, + "learning_rate": 9.44152845410309e-06, + "loss": 0.0009122312068939209, + "memory(GiB)": 25.63, + "reward": 0.43959996700286863, + "reward_std": 0.11992530548013747, + "rewards/MMContentORM/mean": 0.5065000057220459, + "rewards/MMContentORM/std": 0.6312320232391357, + "rewards/MMFormatORM/mean": 0.6174999833106994, + "rewards/MMFormatORM/std": 0.08490467071533203, + "rewards/MMRubricORM/mean": -0.05, + "rewards/MMRubricORM/std": 0.13062257766723634, + "step": 810, + "train_speed(iter/s)": 0.084476 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 308.4, + "completions/mean_length": 215.85, + "completions/min_length": 147.6, + "epoch": 0.3912626020163226, + "frac_reward_zero_std": 0.6, + "grad_norm": 0.13439743220806122, + "kl": 0.0202392578125, + "learning_rate": 9.4323780874964e-06, + "loss": 0.0008096899837255478, + "memory(GiB)": 25.63, + "reward": 0.40035000443458557, + "reward_std": 0.16298811305314304, + "rewards/MMContentORM/mean": 0.451500004529953, + "rewards/MMContentORM/std": 0.6973459839820861, + "rewards/MMFormatORM/mean": 0.5931249916553497, + "rewards/MMFormatORM/std": 0.1556377649307251, + "rewards/MMRubricORM/mean": -0.0875, + "rewards/MMRubricORM/std": 0.23944272398948668, + "step": 815, + "train_speed(iter/s)": 0.084482 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 322.6, + "completions/mean_length": 211.5375, + "completions/min_length": 136.8, + "epoch": 0.39366298607777245, + "frac_reward_zero_std": 0.55, + "grad_norm": 0.21310864388942719, + "kl": 0.017193603515625, + "learning_rate": 9.42315787451293e-06, + "loss": 0.0006876428611576557, + "memory(GiB)": 25.63, + "reward": 0.4637999773025513, + "reward_std": 0.11851109359413385, + "rewards/MMContentORM/mean": 0.5670000195503235, + "rewards/MMContentORM/std": 0.6650908708572387, + "rewards/MMFormatORM/mean": 0.6174999833106994, + "rewards/MMFormatORM/std": 0.10940345227718354, + "rewards/MMRubricORM/mean": -0.05, + "rewards/MMRubricORM/std": 0.16831300854682923, + "step": 820, + "train_speed(iter/s)": 0.084493 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.025, + "completions/max_length": 609.6, + "completions/mean_length": 235.6875, + "completions/min_length": 147.2, + "epoch": 0.39606337013922227, + "frac_reward_zero_std": 0.45, + "grad_norm": 0.1284942924976349, + "kl": 0.016168212890625, + "learning_rate": 9.413867960446796e-06, + "loss": 0.0006466972175985575, + "memory(GiB)": 25.63, + "reward": 0.4465499848127365, + "reward_std": 0.12763277366757392, + "rewards/MMContentORM/mean": 0.5420000106096268, + "rewards/MMContentORM/std": 0.6290184378623962, + "rewards/MMFormatORM/mean": 0.6056249856948852, + "rewards/MMFormatORM/std": 0.11977944672107696, + "rewards/MMRubricORM/mean": -0.0625, + "rewards/MMRubricORM/std": 0.18062257766723633, + "step": 825, + "train_speed(iter/s)": 0.084165 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 290.8, + "completions/mean_length": 207.575, + "completions/min_length": 131.4, + "epoch": 0.3984637542006721, + "frac_reward_zero_std": 0.65, + "grad_norm": 0.1951877474784851, + "kl": 0.015869140625, + "learning_rate": 9.404508491690484e-06, + "loss": 0.0006350751966238022, + "memory(GiB)": 25.63, + "reward": 0.5316999852657318, + "reward_std": 0.07452905047684907, + "rewards/MMContentORM/mean": 0.7080000102519989, + "rewards/MMContentORM/std": 0.4912826240062714, + "rewards/MMFormatORM/mean": 0.6337499856948853, + "rewards/MMFormatORM/std": 0.06499999761581421, + "rewards/MMRubricORM/mean": -0.025, + "rewards/MMRubricORM/std": 0.1, + "step": 830, + "train_speed(iter/s)": 0.084201 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 320.0, + "completions/mean_length": 213.35, + "completions/min_length": 149.4, + "epoch": 0.4008641382621219, + "frac_reward_zero_std": 0.6, + "grad_norm": 0.17311322689056396, + "kl": 0.0205078125, + "learning_rate": 9.395079615732539e-06, + "loss": 0.0008202603086829186, + "memory(GiB)": 25.63, + "reward": 0.47449998259544374, + "reward_std": 0.10154052944853902, + "rewards/MMContentORM/mean": 0.5689999997615814, + "rewards/MMContentORM/std": 0.6329957902431488, + "rewards/MMFormatORM/mean": 0.6299999833106995, + "rewards/MMFormatORM/std": 0.09440345466136932, + "rewards/MMRubricORM/mean": -0.025500000268220902, + "rewards/MMRubricORM/std": 0.16631300747394562, + "step": 835, + "train_speed(iter/s)": 0.084227 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 299.0, + "completions/mean_length": 199.025, + "completions/min_length": 104.8, + "epoch": 0.4032645223235718, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.14815396070480347, + "kl": 0.022406005859375, + "learning_rate": 9.385581481155233e-06, + "loss": 0.0008968940936028957, + "memory(GiB)": 25.63, + "reward": 0.4544999897480011, + "reward_std": 0.09220672622323037, + "rewards/MMContentORM/mean": 0.5724999904632568, + "rewards/MMContentORM/std": 0.6493830382823944, + "rewards/MMFormatORM/mean": 0.6012499809265137, + "rewards/MMFormatORM/std": 0.12930812537670136, + "rewards/MMRubricORM/mean": -0.075, + "rewards/MMRubricORM/std": 0.19893558621406554, + "step": 840, + "train_speed(iter/s)": 0.08428 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 304.4, + "completions/mean_length": 212.925, + "completions/min_length": 118.2, + "epoch": 0.4056649063850216, + "frac_reward_zero_std": 0.4, + "grad_norm": 0.22879934310913086, + "kl": 0.02987060546875, + "learning_rate": 9.376014237632233e-06, + "loss": 0.0011936011724174023, + "memory(GiB)": 25.63, + "reward": 0.43914997577667236, + "reward_std": 0.1680792823433876, + "rewards/MMContentORM/mean": 0.5485000014305115, + "rewards/MMContentORM/std": 0.6784831821918488, + "rewards/MMFormatORM/mean": 0.5931249856948853, + "rewards/MMFormatORM/std": 0.18630690574645997, + "rewards/MMRubricORM/mean": -0.0875, + "rewards/MMRubricORM/std": 0.2866260170936584, + "step": 845, + "train_speed(iter/s)": 0.084318 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 334.0, + "completions/mean_length": 213.9875, + "completions/min_length": 133.4, + "epoch": 0.40806529044647144, + "frac_reward_zero_std": 0.575, + "grad_norm": 0.0645090639591217, + "kl": 0.016571044921875, + "learning_rate": 9.366378035926244e-06, + "loss": 0.0006628448609262705, + "memory(GiB)": 25.63, + "reward": 0.38159998059272765, + "reward_std": 0.10861159779597074, + "rewards/MMContentORM/mean": 0.3615000039339066, + "rewards/MMContentORM/std": 0.677151370048523, + "rewards/MMFormatORM/mean": 0.6174999833106994, + "rewards/MMFormatORM/std": 0.08490467071533203, + "rewards/MMRubricORM/mean": -0.05, + "rewards/MMRubricORM/std": 0.13062257766723634, + "step": 850, + "train_speed(iter/s)": 0.084327 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 323.0, + "completions/mean_length": 222.175, + "completions/min_length": 147.2, + "epoch": 0.41046567450792126, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.2151261568069458, + "kl": 0.015704345703125, + "learning_rate": 9.356673027886624e-06, + "loss": 0.0006273643113672734, + "memory(GiB)": 25.63, + "reward": 0.4602999806404114, + "reward_std": 0.1168140321969986, + "rewards/MMContentORM/mean": 0.5820000171661377, + "rewards/MMContentORM/std": 0.683591103553772, + "rewards/MMFormatORM/mean": 0.5999999880790711, + "rewards/MMFormatORM/std": 0.12756490409374238, + "rewards/MMRubricORM/mean": -0.0625, + "rewards/MMRubricORM/std": 0.18062257766723633, + "step": 855, + "train_speed(iter/s)": 0.084361 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 319.2, + "completions/mean_length": 218.55, + "completions/min_length": 114.6, + "epoch": 0.4128660585693711, + "frac_reward_zero_std": 0.55, + "grad_norm": 0.12868833541870117, + "kl": 0.0200927734375, + "learning_rate": 9.346899366447e-06, + "loss": 0.0008026616647839546, + "memory(GiB)": 25.63, + "reward": 0.4429999828338623, + "reward_std": 0.16150318831205368, + "rewards/MMContentORM/mean": 0.5725000083446503, + "rewards/MMContentORM/std": 0.7186736941337586, + "rewards/MMFormatORM/mean": 0.5849999904632568, + "rewards/MMFormatORM/std": 0.19430812299251557, + "rewards/MMRubricORM/mean": -0.1, + "rewards/MMRubricORM/std": 0.2989355862140656, + "step": 860, + "train_speed(iter/s)": 0.084399 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 320.2, + "completions/mean_length": 224.7875, + "completions/min_length": 154.4, + "epoch": 0.4152664426308209, + "frac_reward_zero_std": 0.45, + "grad_norm": 0.24704837799072266, + "kl": 0.015057373046875, + "learning_rate": 9.337057205622848e-06, + "loss": 0.0006027618423104286, + "memory(GiB)": 25.63, + "reward": 0.4193499803543091, + "reward_std": 0.1847669929265976, + "rewards/MMContentORM/mean": 0.49900001287460327, + "rewards/MMContentORM/std": 0.7176998615264892, + "rewards/MMFormatORM/mean": 0.5931249856948853, + "rewards/MMFormatORM/std": 0.18630690574645997, + "rewards/MMRubricORM/mean": -0.0875, + "rewards/MMRubricORM/std": 0.2866260170936584, + "step": 865, + "train_speed(iter/s)": 0.084412 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0125, + "completions/max_length": 449.6, + "completions/mean_length": 232.175, + "completions/min_length": 159.2, + "epoch": 0.4176668266922708, + "frac_reward_zero_std": 0.525, + "grad_norm": 0.21097196638584137, + "kl": 0.016717529296875, + "learning_rate": 9.327146700509082e-06, + "loss": 0.0006690716370940208, + "memory(GiB)": 25.63, + "reward": 0.46244998574256896, + "reward_std": 0.14983592703938484, + "rewards/MMContentORM/mean": 0.6105000078678131, + "rewards/MMContentORM/std": 0.5413160175085068, + "rewards/MMFormatORM/mean": 0.5893749892711639, + "rewards/MMFormatORM/std": 0.17063776403665543, + "rewards/MMRubricORM/mean": -0.0875, + "rewards/MMRubricORM/std": 0.23944272398948668, + "step": 870, + "train_speed(iter/s)": 0.08432 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 376.8, + "completions/mean_length": 226.425, + "completions/min_length": 158.4, + "epoch": 0.4200672107537206, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.11343076825141907, + "kl": 0.01474609375, + "learning_rate": 9.317168007277589e-06, + "loss": 0.0005900030490010976, + "memory(GiB)": 25.63, + "reward": 0.38004998564720155, + "reward_std": 0.11193500086665154, + "rewards/MMContentORM/mean": 0.371999990940094, + "rewards/MMContentORM/std": 0.7004570722579956, + "rewards/MMFormatORM/mean": 0.6093749880790711, + "rewards/MMFormatORM/std": 0.14190345108509064, + "rewards/MMRubricORM/mean": -0.0625, + "rewards/MMRubricORM/std": 0.21831300854682922, + "step": 875, + "train_speed(iter/s)": 0.084275 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 323.0, + "completions/mean_length": 211.6375, + "completions/min_length": 137.8, + "epoch": 0.42246759481517043, + "frac_reward_zero_std": 0.55, + "grad_norm": 0.17401404678821564, + "kl": 0.01837158203125, + "learning_rate": 9.307121283174788e-06, + "loss": 0.0007351872511208058, + "memory(GiB)": 25.63, + "reward": 0.38199999928474426, + "reward_std": 0.18780755996704102, + "rewards/MMContentORM/mean": 0.42000000476837157, + "rewards/MMContentORM/std": 0.7346604287624359, + "rewards/MMFormatORM/mean": 0.5849999904632568, + "rewards/MMFormatORM/std": 0.16980934143066406, + "rewards/MMRubricORM/mean": -0.1, + "rewards/MMRubricORM/std": 0.2612451553344727, + "step": 880, + "train_speed(iter/s)": 0.084282 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 335.2, + "completions/mean_length": 225.7625, + "completions/min_length": 164.0, + "epoch": 0.42486797887662026, + "frac_reward_zero_std": 0.425, + "grad_norm": 0.23540575802326202, + "kl": 0.01390380859375, + "learning_rate": 9.297006686519139e-06, + "loss": 0.0005556363612413406, + "memory(GiB)": 25.63, + "reward": 0.40919997692108157, + "reward_std": 0.18893892914056779, + "rewards/MMContentORM/mean": 0.4755000114440918, + "rewards/MMContentORM/std": 0.7272086381912232, + "rewards/MMFormatORM/mean": 0.5912499725818634, + "rewards/MMFormatORM/std": 0.13540457487106322, + "rewards/MMRubricORM/mean": -0.0875, + "rewards/MMRubricORM/std": 0.2077557325363159, + "step": 885, + "train_speed(iter/s)": 0.084274 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0125, + "completions/max_length": 536.0, + "completions/mean_length": 235.975, + "completions/min_length": 157.6, + "epoch": 0.4272683629380701, + "frac_reward_zero_std": 0.475, + "grad_norm": 0.13821138441562653, + "kl": 0.017333984375, + "learning_rate": 9.286824376698653e-06, + "loss": 0.0006932040210813284, + "memory(GiB)": 27.09, + "reward": 0.3673999786376953, + "reward_std": 0.20407101474702358, + "rewards/MMContentORM/mean": 0.4410000145435333, + "rewards/MMContentORM/std": 0.7875102996826172, + "rewards/MMFormatORM/mean": 0.5524999797344208, + "rewards/MMFormatORM/std": 0.23184934854507447, + "rewards/MMRubricORM/mean": -0.15, + "rewards/MMRubricORM/std": 0.35669131875038146, + "step": 890, + "train_speed(iter/s)": 0.084032 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 363.0, + "completions/mean_length": 227.0, + "completions/min_length": 132.4, + "epoch": 0.4296687469995199, + "frac_reward_zero_std": 0.325, + "grad_norm": 0.22291657328605652, + "kl": 0.020013427734375, + "learning_rate": 9.276574514168382e-06, + "loss": 0.000801488570868969, + "memory(GiB)": 27.09, + "reward": 0.4104499936103821, + "reward_std": 0.21955665349960327, + "rewards/MMContentORM/mean": 0.4930000126361847, + "rewards/MMContentORM/std": 0.7277546286582947, + "rewards/MMFormatORM/mean": 0.5831249833106995, + "rewards/MMFormatORM/std": 0.19391306340694428, + "rewards/MMRubricORM/mean": -0.1, + "rewards/MMRubricORM/std": 0.2989355862140656, + "step": 895, + "train_speed(iter/s)": 0.084014 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 323.2, + "completions/mean_length": 227.45, + "completions/min_length": 148.0, + "epoch": 0.4320691310609698, + "frac_reward_zero_std": 0.475, + "grad_norm": 0.1956370770931244, + "kl": 0.01566162109375, + "learning_rate": 9.266257260447883e-06, + "loss": 0.0006269800476729869, + "memory(GiB)": 27.09, + "reward": 0.41879996508359907, + "reward_std": 0.08725697756744921, + "rewards/MMContentORM/mean": 0.4420000076293945, + "rewards/MMContentORM/std": 0.6016733646392822, + "rewards/MMFormatORM/mean": 0.6237499833106994, + "rewards/MMFormatORM/std": 0.05990467071533203, + "rewards/MMRubricORM/mean": -0.0375, + "rewards/MMRubricORM/std": 0.08062257766723632, + "step": 900, + "train_speed(iter/s)": 0.084041 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 345.8, + "completions/mean_length": 223.0875, + "completions/min_length": 156.8, + "epoch": 0.4344695151224196, + "frac_reward_zero_std": 0.6, + "grad_norm": 0.2335289567708969, + "kl": 0.017236328125, + "learning_rate": 9.255872778118686e-06, + "loss": 0.0006896716542541981, + "memory(GiB)": 27.09, + "reward": 0.4828499794006348, + "reward_std": 0.12324871122837067, + "rewards/MMContentORM/mean": 0.628999999165535, + "rewards/MMContentORM/std": 0.5791173458099366, + "rewards/MMFormatORM/mean": 0.6093749880790711, + "rewards/MMFormatORM/std": 0.14190345108509064, + "rewards/MMRubricORM/mean": -0.0625, + "rewards/MMRubricORM/std": 0.21831300854682922, + "step": 905, + "train_speed(iter/s)": 0.083925 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 331.6, + "completions/mean_length": 228.55, + "completions/min_length": 155.0, + "epoch": 0.4368698991838694, + "frac_reward_zero_std": 0.65, + "grad_norm": 0.11608735471963882, + "kl": 0.0136962890625, + "learning_rate": 9.245421230821717e-06, + "loss": 0.0005476945545524359, + "memory(GiB)": 27.09, + "reward": 0.4916999876499176, + "reward_std": 0.06264965860173106, + "rewards/MMContentORM/mean": 0.6080000042915344, + "rewards/MMContentORM/std": 0.5947914302349091, + "rewards/MMFormatORM/mean": 0.6337499856948853, + "rewards/MMFormatORM/std": 0.06499999761581421, + "rewards/MMRubricORM/mean": -0.025, + "rewards/MMRubricORM/std": 0.1, + "step": 910, + "train_speed(iter/s)": 0.083928 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0125, + "completions/max_length": 437.0, + "completions/mean_length": 233.6375, + "completions/min_length": 158.6, + "epoch": 0.43927028324531925, + "frac_reward_zero_std": 0.45, + "grad_norm": 0.19499173760414124, + "kl": 0.013690185546875, + "learning_rate": 9.234902783254726e-06, + "loss": 0.0005476208403706551, + "memory(GiB)": 27.09, + "reward": 0.46409996747970583, + "reward_std": 0.11554124504327774, + "rewards/MMContentORM/mean": 0.5715000033378601, + "rewards/MMContentORM/std": 0.5974150598049164, + "rewards/MMFormatORM/mean": 0.6137499809265137, + "rewards/MMFormatORM/std": 0.11046060025691987, + "rewards/MMRubricORM/mean": -0.05, + "rewards/MMRubricORM/std": 0.16831300854682923, + "step": 915, + "train_speed(iter/s)": 0.083822 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 317.6, + "completions/mean_length": 219.725, + "completions/min_length": 143.0, + "epoch": 0.44167066730676907, + "frac_reward_zero_std": 0.675, + "grad_norm": 0.07724174112081528, + "kl": 0.01375732421875, + "learning_rate": 9.224317601169699e-06, + "loss": 0.000550596509128809, + "memory(GiB)": 27.09, + "reward": 0.5028999745845795, + "reward_std": 0.06095260072033852, + "rewards/MMContentORM/mean": 0.635999995470047, + "rewards/MMContentORM/std": 0.5765919387340546, + "rewards/MMFormatORM/mean": 0.6337499856948853, + "rewards/MMFormatORM/std": 0.06499999761581421, + "rewards/MMRubricORM/mean": -0.025, + "rewards/MMRubricORM/std": 0.1, + "step": 920, + "train_speed(iter/s)": 0.083811 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 377.6, + "completions/mean_length": 217.275, + "completions/min_length": 134.8, + "epoch": 0.4440710513682189, + "frac_reward_zero_std": 0.55, + "grad_norm": 0.19694066047668457, + "kl": 0.01658935546875, + "learning_rate": 9.213665851370232e-06, + "loss": 0.0006623049266636372, + "memory(GiB)": 27.09, + "reward": 0.449649965763092, + "reward_std": 0.1215516519267112, + "rewards/MMContentORM/mean": 0.5460000038146973, + "rewards/MMContentORM/std": 0.6028219342231751, + "rewards/MMFormatORM/mean": 0.6093749761581421, + "rewards/MMFormatORM/std": 0.09680812656879426, + "rewards/MMRubricORM/mean": -0.0625, + "rewards/MMRubricORM/std": 0.14893558621406555, + "step": 925, + "train_speed(iter/s)": 0.083789 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0125, + "completions/max_length": 440.8, + "completions/mean_length": 213.4625, + "completions/min_length": 139.2, + "epoch": 0.4464714354296688, + "frac_reward_zero_std": 0.575, + "grad_norm": 0.16958890855312347, + "kl": 0.012957763671875, + "learning_rate": 9.202947701708915e-06, + "loss": 0.000518304156139493, + "memory(GiB)": 27.09, + "reward": 0.4152499794960022, + "reward_std": 0.15832121148705483, + "rewards/MMContentORM/mean": 0.49250001907348634, + "rewards/MMContentORM/std": 0.70787513256073, + "rewards/MMFormatORM/mean": 0.5893749713897705, + "rewards/MMFormatORM/std": 0.1807103618979454, + "rewards/MMRubricORM/mean": -0.0875, + "rewards/MMRubricORM/std": 0.25493902564048765, + "step": 930, + "train_speed(iter/s)": 0.083692 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 322.8, + "completions/mean_length": 208.15, + "completions/min_length": 133.4, + "epoch": 0.4488718194911186, + "frac_reward_zero_std": 0.575, + "grad_norm": 0.16020964086055756, + "kl": 0.01607666015625, + "learning_rate": 9.192163321084678e-06, + "loss": 0.0006430365610867739, + "memory(GiB)": 27.09, + "reward": 0.4014999806880951, + "reward_std": 0.1008334287442267, + "rewards/MMContentORM/mean": 0.44000001847743986, + "rewards/MMContentORM/std": 0.648642772436142, + "rewards/MMFormatORM/mean": 0.6012499809265137, + "rewards/MMFormatORM/std": 0.15380690693855287, + "rewards/MMRubricORM/mean": -0.075, + "rewards/MMRubricORM/std": 0.23662601709365844, + "step": 935, + "train_speed(iter/s)": 0.083729 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 310.8, + "completions/mean_length": 207.1375, + "completions/min_length": 117.0, + "epoch": 0.4512722035525684, + "frac_reward_zero_std": 0.55, + "grad_norm": 1.190772294998169, + "kl": 0.0478271484375, + "learning_rate": 9.181312879440129e-06, + "loss": 0.0019131312146782875, + "memory(GiB)": 27.09, + "reward": 0.45269997119903566, + "reward_std": 0.12133952155709267, + "rewards/MMContentORM/mean": 0.5680000096559524, + "rewards/MMContentORM/std": 0.6713850498199463, + "rewards/MMFormatORM/mean": 0.6012499809265137, + "rewards/MMFormatORM/std": 0.15380690693855287, + "rewards/MMRubricORM/mean": -0.075, + "rewards/MMRubricORM/std": 0.23662601709365844, + "step": 940, + "train_speed(iter/s)": 0.083772 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 263.8, + "completions/mean_length": 203.5625, + "completions/min_length": 139.4, + "epoch": 0.45367258761401824, + "frac_reward_zero_std": 0.575, + "grad_norm": 0.15727227926254272, + "kl": 0.015093994140625, + "learning_rate": 9.170396547758892e-06, + "loss": 0.0006036899052560329, + "memory(GiB)": 27.09, + "reward": 0.4320499897003174, + "reward_std": 0.10684382803738117, + "rewards/MMContentORM/mean": 0.5020000040531158, + "rewards/MMContentORM/std": 0.688554298877716, + "rewards/MMFormatORM/mean": 0.6093749880790711, + "rewards/MMFormatORM/std": 0.11740466952323914, + "rewards/MMRubricORM/mean": -0.0625, + "rewards/MMRubricORM/std": 0.18062257766723633, + "step": 945, + "train_speed(iter/s)": 0.083855 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 349.8, + "completions/mean_length": 222.275, + "completions/min_length": 149.2, + "epoch": 0.45607297167546806, + "frac_reward_zero_std": 0.4, + "grad_norm": 0.20384089648723602, + "kl": 0.01500244140625, + "learning_rate": 9.159414498062889e-06, + "loss": 0.0005995483603328467, + "memory(GiB)": 27.09, + "reward": 0.3853499710559845, + "reward_std": 0.2594374790787697, + "rewards/MMContentORM/mean": 0.4714999973773956, + "rewards/MMContentORM/std": 0.7658512353897095, + "rewards/MMFormatORM/mean": 0.5606249809265137, + "rewards/MMFormatORM/std": 0.22611625194549562, + "rewards/MMRubricORM/mean": -0.1375, + "rewards/MMRubricORM/std": 0.3478711724281311, + "step": 950, + "train_speed(iter/s)": 0.083831 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 336.2, + "completions/mean_length": 223.95, + "completions/min_length": 147.4, + "epoch": 0.4584733557369179, + "frac_reward_zero_std": 0.725, + "grad_norm": 0.17926473915576935, + "kl": 0.014312744140625, + "learning_rate": 9.148366903409645e-06, + "loss": 0.0005721227265894413, + "memory(GiB)": 27.09, + "reward": 0.43774998784065244, + "reward_std": 0.0767210841178894, + "rewards/MMContentORM/mean": 0.4875000238418579, + "rewards/MMContentORM/std": 0.6453867673873901, + "rewards/MMFormatORM/mean": 0.6256249904632568, + "rewards/MMFormatORM/std": 0.09749999642372131, + "rewards/MMRubricORM/mean": -0.0375, + "rewards/MMRubricORM/std": 0.15, + "step": 955, + "train_speed(iter/s)": 0.08384 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 316.6, + "completions/mean_length": 209.1375, + "completions/min_length": 123.0, + "epoch": 0.46087373979836777, + "frac_reward_zero_std": 0.55, + "grad_norm": 0.17937743663787842, + "kl": 0.015667724609375, + "learning_rate": 9.137253937889556e-06, + "loss": 0.0006268246099352837, + "memory(GiB)": 27.09, + "reward": 0.41344997882843015, + "reward_std": 0.19254517555236816, + "rewards/MMContentORM/mean": 0.5130000054836273, + "rewards/MMContentORM/std": 0.7074776887893677, + "rewards/MMFormatORM/mean": 0.5768749833106994, + "rewards/MMFormatORM/std": 0.2062115788459778, + "rewards/MMRubricORM/mean": -0.1125, + "rewards/MMRubricORM/std": 0.3172485947608948, + "step": 960, + "train_speed(iter/s)": 0.083871 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 319.2, + "completions/mean_length": 208.1375, + "completions/min_length": 128.8, + "epoch": 0.4632741238598176, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.15272051095962524, + "kl": 0.016375732421875, + "learning_rate": 9.12607577662315e-06, + "loss": 0.0006551730446517467, + "memory(GiB)": 27.09, + "reward": 0.4290499806404114, + "reward_std": 0.1823628380894661, + "rewards/MMContentORM/mean": 0.5520000040531159, + "rewards/MMContentORM/std": 0.6928182065486908, + "rewards/MMFormatORM/mean": 0.5768749952316284, + "rewards/MMFormatORM/std": 0.20230934023857117, + "rewards/MMRubricORM/mean": -0.1125, + "rewards/MMRubricORM/std": 0.31124515533447267, + "step": 965, + "train_speed(iter/s)": 0.083892 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 328.6, + "completions/mean_length": 219.725, + "completions/min_length": 142.0, + "epoch": 0.4656745079212674, + "frac_reward_zero_std": 0.55, + "grad_norm": 0.20706000924110413, + "kl": 0.01566162109375, + "learning_rate": 9.114832595758315e-06, + "loss": 0.0006271812599152327, + "memory(GiB)": 27.09, + "reward": 0.4845999896526337, + "reward_std": 0.1179454043507576, + "rewards/MMContentORM/mean": 0.6515000104904175, + "rewards/MMContentORM/std": 0.6177137017250061, + "rewards/MMFormatORM/mean": 0.5974999666213989, + "rewards/MMFormatORM/std": 0.1561816841363907, + "rewards/MMRubricORM/mean": -0.075, + "rewards/MMRubricORM/std": 0.23662601709365844, + "step": 970, + "train_speed(iter/s)": 0.083908 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 319.6, + "completions/mean_length": 220.25, + "completions/min_length": 135.8, + "epoch": 0.46807489198271723, + "frac_reward_zero_std": 0.475, + "grad_norm": 0.15420930087566376, + "kl": 0.029974365234375, + "learning_rate": 9.103524572467542e-06, + "loss": 0.0012021941132843495, + "memory(GiB)": 27.09, + "reward": 0.39144999980926515, + "reward_std": 0.18759542852640151, + "rewards/MMContentORM/mean": 0.43499999344348905, + "rewards/MMContentORM/std": 0.7306297183036804, + "rewards/MMFormatORM/mean": 0.5893749713897705, + "rewards/MMFormatORM/std": 0.20309771001338958, + "rewards/MMRubricORM/mean": -0.09149999916553497, + "rewards/MMRubricORM/std": 0.3158472299575806, + "step": 975, + "train_speed(iter/s)": 0.083941 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 363.8, + "completions/mean_length": 213.75, + "completions/min_length": 141.8, + "epoch": 0.47047527604416706, + "frac_reward_zero_std": 0.725, + "grad_norm": 0.15446196496486664, + "kl": 0.016387939453125, + "learning_rate": 9.092151884945117e-06, + "loss": 0.0006551665253937244, + "memory(GiB)": 27.09, + "reward": 0.5428999841213227, + "reward_std": 0.055295750661753115, + "rewards/MMContentORM/mean": 0.735999995470047, + "rewards/MMContentORM/std": 0.4234260804951191, + "rewards/MMFormatORM/mean": 0.6337499856948853, + "rewards/MMFormatORM/std": 0.06499999761581421, + "rewards/MMRubricORM/mean": -0.025, + "rewards/MMRubricORM/std": 0.1, + "step": 980, + "train_speed(iter/s)": 0.083929 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 292.6, + "completions/mean_length": 211.8125, + "completions/min_length": 134.4, + "epoch": 0.4728756601056169, + "frac_reward_zero_std": 0.575, + "grad_norm": 0.10230781883001328, + "kl": 0.017529296875, + "learning_rate": 9.080714712404322e-06, + "loss": 0.0007016819901764393, + "memory(GiB)": 27.09, + "reward": 0.3961999833583832, + "reward_std": 0.22203153222799302, + "rewards/MMContentORM/mean": 0.4880000114440918, + "rewards/MMContentORM/std": 0.6864187598228455, + "rewards/MMFormatORM/mean": 0.5649999797344207, + "rewards/MMFormatORM/std": 0.18773135244846345, + "rewards/MMRubricORM/mean": -0.125, + "rewards/MMRubricORM/std": 0.28837831020355226, + "step": 985, + "train_speed(iter/s)": 0.08397 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0125, + "completions/max_length": 472.8, + "completions/mean_length": 230.55, + "completions/min_length": 149.6, + "epoch": 0.47527604416706676, + "frac_reward_zero_std": 0.6, + "grad_norm": 0.14536498486995697, + "kl": 0.014703369140625, + "learning_rate": 9.069213235074606e-06, + "loss": 0.0005882191471755505, + "memory(GiB)": 27.09, + "reward": 0.47779998779296873, + "reward_std": 0.11624835301190614, + "rewards/MMContentORM/mean": 0.6019999861717225, + "rewards/MMContentORM/std": 0.646269428730011, + "rewards/MMFormatORM/mean": 0.6174999713897705, + "rewards/MMFormatORM/std": 0.08880690932273864, + "rewards/MMRubricORM/mean": -0.05, + "rewards/MMRubricORM/std": 0.13662601709365846, + "step": 990, + "train_speed(iter/s)": 0.083826 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 293.2, + "completions/mean_length": 213.1, + "completions/min_length": 143.2, + "epoch": 0.4776764282285166, + "frac_reward_zero_std": 0.6, + "grad_norm": 0.22323811054229736, + "kl": 0.01553955078125, + "learning_rate": 9.057647634198745e-06, + "loss": 0.0006211692001670599, + "memory(GiB)": 27.09, + "reward": 0.4356500029563904, + "reward_std": 0.13456242978572847, + "rewards/MMContentORM/mean": 0.511000007390976, + "rewards/MMContentORM/std": 0.6877371788024902, + "rewards/MMFormatORM/mean": 0.6093749880790711, + "rewards/MMFormatORM/std": 0.11740466952323914, + "rewards/MMRubricORM/mean": -0.0625, + "rewards/MMRubricORM/std": 0.18062257766723633, + "step": 995, + "train_speed(iter/s)": 0.083869 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 341.4, + "completions/mean_length": 207.8875, + "completions/min_length": 126.6, + "epoch": 0.4800768122899664, + "frac_reward_zero_std": 0.65, + "grad_norm": 0.12058038264513016, + "kl": 0.01517333984375, + "learning_rate": 9.046018092029991e-06, + "loss": 0.0006069786846637726, + "memory(GiB)": 27.09, + "reward": 0.38174998164176943, + "reward_std": 0.11306637614034117, + "rewards/MMContentORM/mean": 0.40499998927116393, + "rewards/MMContentORM/std": 0.7353096008300781, + "rewards/MMFormatORM/mean": 0.5931249856948853, + "rewards/MMFormatORM/std": 0.16180812418460847, + "rewards/MMRubricORM/mean": -0.0875, + "rewards/MMRubricORM/std": 0.24893558621406556, + "step": 1000, + "train_speed(iter/s)": 0.083877 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 301.4, + "completions/mean_length": 219.125, + "completions/min_length": 153.2, + "epoch": 0.4824771963514162, + "frac_reward_zero_std": 0.7, + "grad_norm": 0.1304665207862854, + "kl": 0.0144775390625, + "learning_rate": 9.034324791829198e-06, + "loss": 0.0005794113036245108, + "memory(GiB)": 27.09, + "reward": 0.583199965953827, + "reward_std": 0.028849952155724168, + "rewards/MMContentORM/mean": 0.8079999923706055, + "rewards/MMContentORM/std": 0.34920589849352834, + "rewards/MMFormatORM/mean": 0.6499999761581421, + "rewards/MMFormatORM/std": 0.0, + "rewards/MMRubricORM/mean": 0.0, + "rewards/MMRubricORM/std": 0.0, + "step": 1005, + "train_speed(iter/s)": 0.083816 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 302.6, + "completions/mean_length": 210.7125, + "completions/min_length": 124.8, + "epoch": 0.48487758041286605, + "frac_reward_zero_std": 0.65, + "grad_norm": 0.20664307475090027, + "kl": 0.018109130859375, + "learning_rate": 9.022567917861929e-06, + "loss": 0.0007231380324810744, + "memory(GiB)": 27.09, + "reward": 0.36904996633529663, + "reward_std": 0.08435783945024014, + "rewards/MMContentORM/mean": 0.4019999861717224, + "rewards/MMContentORM/std": 0.7406057715415955, + "rewards/MMFormatORM/mean": 0.5768749713897705, + "rewards/MMFormatORM/std": 0.1856150358915329, + "rewards/MMRubricORM/mean": -0.1125, + "rewards/MMRubricORM/std": 0.285561603307724, + "step": 1010, + "train_speed(iter/s)": 0.083848 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 298.8, + "completions/mean_length": 209.65, + "completions/min_length": 125.8, + "epoch": 0.48727796447431587, + "frac_reward_zero_std": 0.55, + "grad_norm": 0.18518085777759552, + "kl": 0.014801025390625, + "learning_rate": 9.010747655395558e-06, + "loss": 0.0005913883913308382, + "memory(GiB)": 27.09, + "reward": 0.45489998161792755, + "reward_std": 0.11144002974033355, + "rewards/MMContentORM/mean": 0.5610000133514405, + "rewards/MMContentORM/std": 0.6329753637313843, + "rewards/MMFormatORM/mean": 0.6074999809265137, + "rewards/MMFormatORM/std": 0.11700960993766785, + "rewards/MMRubricORM/mean": -0.0625, + "rewards/MMRubricORM/std": 0.18062257766723633, + "step": 1015, + "train_speed(iter/s)": 0.083883 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0125, + "completions/max_length": 442.2, + "completions/mean_length": 222.925, + "completions/min_length": 125.8, + "epoch": 0.4896783485357657, + "frac_reward_zero_std": 0.55, + "grad_norm": 0.14311504364013672, + "kl": 0.01640625, + "learning_rate": 8.998864190696349e-06, + "loss": 0.0006562491878867149, + "memory(GiB)": 27.09, + "reward": 0.4841999769210815, + "reward_std": 0.12699637860059737, + "rewards/MMContentORM/mean": 0.6180000066757202, + "rewards/MMContentORM/std": 0.6347463011741639, + "rewards/MMFormatORM/mean": 0.6174999952316285, + "rewards/MMFormatORM/std": 0.12999999523162842, + "rewards/MMRubricORM/mean": -0.05, + "rewards/MMRubricORM/std": 0.2, + "step": 1020, + "train_speed(iter/s)": 0.083779 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 322.2, + "completions/mean_length": 212.7125, + "completions/min_length": 141.4, + "epoch": 0.4920787325972156, + "frac_reward_zero_std": 0.675, + "grad_norm": 0.2222282737493515, + "kl": 0.015057373046875, + "learning_rate": 8.986917711026519e-06, + "loss": 0.0006025471724569797, + "memory(GiB)": 27.09, + "reward": 0.5089999794960022, + "reward_std": 0.12529932723846288, + "rewards/MMContentORM/mean": 0.6800000011920929, + "rewards/MMContentORM/std": 0.49175867438316345, + "rewards/MMFormatORM/mean": 0.6174999833106994, + "rewards/MMFormatORM/std": 0.08490467071533203, + "rewards/MMRubricORM/mean": -0.05, + "rewards/MMRubricORM/std": 0.13062257766723634, + "step": 1025, + "train_speed(iter/s)": 0.083808 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 327.0, + "completions/mean_length": 217.975, + "completions/min_length": 131.0, + "epoch": 0.4944791166586654, + "frac_reward_zero_std": 0.65, + "grad_norm": 0.17176951467990875, + "kl": 0.01639404296875, + "learning_rate": 8.974908404641294e-06, + "loss": 0.0006549724377691746, + "memory(GiB)": 27.09, + "reward": 0.3853999853134155, + "reward_std": 0.16772572994232177, + "rewards/MMContentORM/mean": 0.4284999996423721, + "rewards/MMContentORM/std": 0.7090068340301514, + "rewards/MMFormatORM/mean": 0.5849999845027923, + "rewards/MMFormatORM/std": 0.16754122078418732, + "rewards/MMRubricORM/mean": -0.1, + "rewards/MMRubricORM/std": 0.2577557325363159, + "step": 1030, + "train_speed(iter/s)": 0.083807 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 290.0, + "completions/mean_length": 204.675, + "completions/min_length": 144.2, + "epoch": 0.4968795007201152, + "frac_reward_zero_std": 0.65, + "grad_norm": 0.1412537693977356, + "kl": 0.016650390625, + "learning_rate": 8.962836460785929e-06, + "loss": 0.0006653706543147564, + "memory(GiB)": 27.09, + "reward": 0.48119999170303346, + "reward_std": 0.09107534990180284, + "rewards/MMContentORM/mean": 0.6105000078678131, + "rewards/MMContentORM/std": 0.6093651533126831, + "rewards/MMFormatORM/mean": 0.6174999833106994, + "rewards/MMFormatORM/std": 0.10940345227718354, + "rewards/MMRubricORM/mean": -0.05, + "rewards/MMRubricORM/std": 0.16831300854682923, + "step": 1035, + "train_speed(iter/s)": 0.083855 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 331.6, + "completions/mean_length": 202.1125, + "completions/min_length": 99.2, + "epoch": 0.49927988478156504, + "frac_reward_zero_std": 0.6, + "grad_norm": 0.16436995565891266, + "kl": 0.0226806640625, + "learning_rate": 8.950702069692739e-06, + "loss": 0.0009060959331691265, + "memory(GiB)": 27.09, + "reward": 0.46224998235702514, + "reward_std": 0.11858180016279221, + "rewards/MMContentORM/mean": 0.5775000125169754, + "rewards/MMContentORM/std": 0.6317477941513061, + "rewards/MMFormatORM/mean": 0.6093749761581421, + "rewards/MMFormatORM/std": 0.09680812656879426, + "rewards/MMRubricORM/mean": -0.0625, + "rewards/MMRubricORM/std": 0.14893558621406555, + "step": 1040, + "train_speed(iter/s)": 0.083873 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 297.6, + "completions/mean_length": 205.5125, + "completions/min_length": 109.2, + "epoch": 0.5016802688430149, + "frac_reward_zero_std": 0.65, + "grad_norm": 0.1451932191848755, + "kl": 0.118011474609375, + "learning_rate": 8.938505422578095e-06, + "loss": 0.004709529504179954, + "memory(GiB)": 27.09, + "reward": 0.46809998750686643, + "reward_std": 0.14071424752473832, + "rewards/MMContentORM/mean": 0.5939999878406524, + "rewards/MMContentORM/std": 0.6745672464370728, + "rewards/MMFormatORM/mean": 0.6074999809265137, + "rewards/MMFormatORM/std": 0.12880690693855285, + "rewards/MMRubricORM/mean": -0.0625, + "rewards/MMRubricORM/std": 0.18662601709365845, + "step": 1045, + "train_speed(iter/s)": 0.083921 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0125, + "completions/max_length": 468.0, + "completions/mean_length": 218.65, + "completions/min_length": 135.8, + "epoch": 0.5040806529044647, + "frac_reward_zero_std": 0.55, + "grad_norm": 0.24477499723434448, + "kl": 0.019097900390625, + "learning_rate": 8.92624671163941e-06, + "loss": 0.0007639925926923752, + "memory(GiB)": 27.09, + "reward": 0.5066499829292297, + "reward_std": 0.12579429522156715, + "rewards/MMContentORM/mean": 0.6884999990463256, + "rewards/MMContentORM/std": 0.60511314868927, + "rewards/MMFormatORM/mean": 0.6093749880790711, + "rewards/MMFormatORM/std": 0.14190345108509064, + "rewards/MMRubricORM/mean": -0.0625, + "rewards/MMRubricORM/std": 0.21831300854682922, + "step": 1050, + "train_speed(iter/s)": 0.08379 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.2, + "completions/mean_length": 196.3125, + "completions/min_length": 116.0, + "epoch": 0.5064810369659145, + "frac_reward_zero_std": 0.7, + "grad_norm": 0.09241645038127899, + "kl": 0.020648193359375, + "learning_rate": 8.913926130052116e-06, + "loss": 0.0008254698477685452, + "memory(GiB)": 27.09, + "reward": 0.48494998812675477, + "reward_std": 0.08577205466572195, + "rewards/MMContentORM/mean": 0.6054999828338623, + "rewards/MMContentORM/std": 0.5610681354999543, + "rewards/MMFormatORM/mean": 0.6256249785423279, + "rewards/MMFormatORM/std": 0.07690345346927643, + "rewards/MMRubricORM/mean": -0.0375, + "rewards/MMRubricORM/std": 0.11831300854682922, + "step": 1055, + "train_speed(iter/s)": 0.083853 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 305.2, + "completions/mean_length": 206.4125, + "completions/min_length": 138.6, + "epoch": 0.5088814210273643, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.1564904898405075, + "kl": 0.016485595703125, + "learning_rate": 8.901543871966614e-06, + "loss": 0.0006593840662389994, + "memory(GiB)": 27.09, + "reward": 0.46794998049736025, + "reward_std": 0.10472251027822495, + "rewards/MMContentORM/mean": 0.5630000084638596, + "rewards/MMContentORM/std": 0.6074943840503693, + "rewards/MMFormatORM/mean": 0.6256249904632568, + "rewards/MMFormatORM/std": 0.09749999642372131, + "rewards/MMRubricORM/mean": -0.0375, + "rewards/MMRubricORM/std": 0.15, + "step": 1060, + "train_speed(iter/s)": 0.083877 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 269.4, + "completions/mean_length": 198.475, + "completions/min_length": 135.0, + "epoch": 0.5112818050888142, + "frac_reward_zero_std": 0.7, + "grad_norm": 0.16738565266132355, + "kl": 0.016766357421875, + "learning_rate": 8.889100132505217e-06, + "loss": 0.0006704972125589848, + "memory(GiB)": 27.09, + "reward": 0.48009997606277466, + "reward_std": 0.07410478852689266, + "rewards/MMContentORM/mean": 0.6365000009536743, + "rewards/MMContentORM/std": 0.5494045548141002, + "rewards/MMFormatORM/mean": 0.6012499749660491, + "rewards/MMFormatORM/std": 0.10254122316837311, + "rewards/MMRubricORM/mean": -0.075, + "rewards/MMRubricORM/std": 0.15775573253631592, + "step": 1065, + "train_speed(iter/s)": 0.083925 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0125, + "completions/max_length": 457.8, + "completions/mean_length": 219.9375, + "completions/min_length": 134.6, + "epoch": 0.5136821891502641, + "frac_reward_zero_std": 0.725, + "grad_norm": 0.19380617141723633, + "kl": 0.014617919921875, + "learning_rate": 8.876595107759075e-06, + "loss": 0.000584835559129715, + "memory(GiB)": 27.09, + "reward": 0.5560999870300293, + "reward_std": 0.0694378862157464, + "rewards/MMContentORM/mean": 0.7690000057220459, + "rewards/MMContentORM/std": 0.4652357272803783, + "rewards/MMFormatORM/mean": 0.6337499856948853, + "rewards/MMFormatORM/std": 0.06499999761581421, + "rewards/MMRubricORM/mean": -0.025, + "rewards/MMRubricORM/std": 0.1, + "step": 1070, + "train_speed(iter/s)": 0.083827 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 340.6, + "completions/mean_length": 212.1625, + "completions/min_length": 127.2, + "epoch": 0.5160825732117139, + "frac_reward_zero_std": 0.7, + "grad_norm": 0.12439420074224472, + "kl": 0.01439208984375, + "learning_rate": 8.86402899478508e-06, + "loss": 0.0005762668326497078, + "memory(GiB)": 27.09, + "reward": 0.5047999918460846, + "reward_std": 0.0550129035487771, + "rewards/MMContentORM/mean": 0.6694999992847442, + "rewards/MMContentORM/std": 0.551321929693222, + "rewards/MMFormatORM/mean": 0.6174999833106994, + "rewards/MMFormatORM/std": 0.10940345227718354, + "rewards/MMRubricORM/mean": -0.05, + "rewards/MMRubricORM/std": 0.16831300854682923, + "step": 1075, + "train_speed(iter/s)": 0.083826 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 288.8, + "completions/mean_length": 200.025, + "completions/min_length": 134.0, + "epoch": 0.5184829572731637, + "frac_reward_zero_std": 0.575, + "grad_norm": 0.20139308273792267, + "kl": 0.0151123046875, + "learning_rate": 8.851401991602776e-06, + "loss": 0.0006052942015230655, + "memory(GiB)": 27.09, + "reward": 0.49969996213912965, + "reward_std": 0.06660945881158113, + "rewards/MMContentORM/mean": 0.628000020980835, + "rewards/MMContentORM/std": 0.5635871171951294, + "rewards/MMFormatORM/mean": 0.6337499856948853, + "rewards/MMFormatORM/std": 0.06499999761581421, + "rewards/MMRubricORM/mean": -0.025, + "rewards/MMRubricORM/std": 0.1, + "step": 1080, + "train_speed(iter/s)": 0.083879 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 340.2, + "completions/mean_length": 219.8, + "completions/min_length": 130.8, + "epoch": 0.5208833413346136, + "frac_reward_zero_std": 0.7, + "grad_norm": 0.1856120079755783, + "kl": 0.015875244140625, + "learning_rate": 8.838714297191222e-06, + "loss": 0.0006359885912388564, + "memory(GiB)": 27.09, + "reward": 0.5098999857902526, + "reward_std": 0.08160011963918805, + "rewards/MMContentORM/mean": 0.6534999966621399, + "rewards/MMContentORM/std": 0.5234884560108185, + "rewards/MMFormatORM/mean": 0.6337499856948853, + "rewards/MMFormatORM/std": 0.06499999761581421, + "rewards/MMRubricORM/mean": -0.025, + "rewards/MMRubricORM/std": 0.1, + "step": 1085, + "train_speed(iter/s)": 0.083889 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 364.6, + "completions/mean_length": 212.625, + "completions/min_length": 128.8, + "epoch": 0.5232837253960634, + "frac_reward_zero_std": 0.85, + "grad_norm": 0.07770657539367676, + "kl": 0.0150390625, + "learning_rate": 8.82596611148586e-06, + "loss": 0.0006017507985234261, + "memory(GiB)": 27.09, + "reward": 0.4991499662399292, + "reward_std": 0.03245619940571487, + "rewards/MMContentORM/mean": 0.6410000085830688, + "rewards/MMContentORM/std": 0.5666637182235718, + "rewards/MMFormatORM/mean": 0.6256249785423279, + "rewards/MMFormatORM/std": 0.05240467190742493, + "rewards/MMRubricORM/mean": -0.0375, + "rewards/MMRubricORM/std": 0.08062257766723632, + "step": 1090, + "train_speed(iter/s)": 0.083867 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 341.6, + "completions/mean_length": 213.2, + "completions/min_length": 132.2, + "epoch": 0.5256841094575132, + "frac_reward_zero_std": 0.65, + "grad_norm": 0.14051133394241333, + "kl": 0.017071533203125, + "learning_rate": 8.81315763537537e-06, + "loss": 0.000683901971206069, + "memory(GiB)": 27.09, + "reward": 0.39309998154640197, + "reward_std": 0.13378459885716437, + "rewards/MMContentORM/mean": 0.41899998784065245, + "rewards/MMContentORM/std": 0.7018602132797241, + "rewards/MMFormatORM/mean": 0.6012499928474426, + "rewards/MMFormatORM/std": 0.14990466833114624, + "rewards/MMRubricORM/mean": -0.075, + "rewards/MMRubricORM/std": 0.23062257766723632, + "step": 1095, + "train_speed(iter/s)": 0.083859 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 371.2, + "completions/mean_length": 225.4375, + "completions/min_length": 138.0, + "epoch": 0.528084493518963, + "frac_reward_zero_std": 0.55, + "grad_norm": 0.1682557910680771, + "kl": 0.01475830078125, + "learning_rate": 8.8002890706985e-06, + "loss": 0.000590839795768261, + "memory(GiB)": 27.09, + "reward": 0.46239997148513795, + "reward_std": 0.13746155560947954, + "rewards/MMContentORM/mean": 0.5634999990463256, + "rewards/MMContentORM/std": 0.6250258028507233, + "rewards/MMFormatORM/mean": 0.6174999833106994, + "rewards/MMFormatORM/std": 0.10940345227718354, + "rewards/MMRubricORM/mean": -0.05, + "rewards/MMRubricORM/std": 0.16831300854682923, + "step": 1100, + "train_speed(iter/s)": 0.083827 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 323.2, + "completions/mean_length": 211.5, + "completions/min_length": 137.2, + "epoch": 0.5304848775804129, + "frac_reward_zero_std": 0.6, + "grad_norm": 0.10628636926412582, + "kl": 0.020086669921875, + "learning_rate": 8.787360620240891e-06, + "loss": 0.0008035540580749512, + "memory(GiB)": 27.09, + "reward": 0.4173499792814255, + "reward_std": 0.16496800733730196, + "rewards/MMContentORM/mean": 0.4939999938011169, + "rewards/MMContentORM/std": 0.6666546583175659, + "rewards/MMFormatORM/mean": 0.5931249856948853, + "rewards/MMFormatORM/std": 0.13730934262275696, + "rewards/MMRubricORM/mean": -0.0875, + "rewards/MMRubricORM/std": 0.21124515533447266, + "step": 1105, + "train_speed(iter/s)": 0.083764 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 351.8, + "completions/mean_length": 221.6875, + "completions/min_length": 120.4, + "epoch": 0.5328852616418627, + "frac_reward_zero_std": 0.6, + "grad_norm": 0.2164710909128189, + "kl": 0.01485595703125, + "learning_rate": 8.77437248773187e-06, + "loss": 0.0005937457084655762, + "memory(GiB)": 27.09, + "reward": 0.4448999762535095, + "reward_std": 0.11822825372219085, + "rewards/MMContentORM/mean": 0.5485000073909759, + "rewards/MMContentORM/std": 0.6824892044067383, + "rewards/MMFormatORM/mean": 0.6012499928474426, + "rewards/MMFormatORM/std": 0.14990466833114624, + "rewards/MMRubricORM/mean": -0.075, + "rewards/MMRubricORM/std": 0.23062257766723632, + "step": 1110, + "train_speed(iter/s)": 0.08375 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 333.2, + "completions/mean_length": 226.6375, + "completions/min_length": 122.2, + "epoch": 0.5352856457033125, + "frac_reward_zero_std": 0.45, + "grad_norm": 0.17362850904464722, + "kl": 0.020477294921875, + "learning_rate": 8.761324877841254e-06, + "loss": 0.0008191258646547794, + "memory(GiB)": 27.09, + "reward": 0.43039997220039367, + "reward_std": 0.17041273415088654, + "rewards/MMContentORM/mean": 0.5735000014305115, + "rewards/MMContentORM/std": 0.7222278237342834, + "rewards/MMFormatORM/mean": 0.5649999737739563, + "rewards/MMFormatORM/std": 0.21917218267917632, + "rewards/MMRubricORM/mean": -0.125, + "rewards/MMRubricORM/std": 0.335561603307724, + "step": 1115, + "train_speed(iter/s)": 0.083757 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 377.6, + "completions/mean_length": 223.8625, + "completions/min_length": 119.4, + "epoch": 0.5376860297647623, + "frac_reward_zero_std": 0.45, + "grad_norm": 0.1994379311800003, + "kl": 0.01700439453125, + "learning_rate": 8.748217996176112e-06, + "loss": 0.0006800967268645764, + "memory(GiB)": 27.09, + "reward": 0.39449998140335085, + "reward_std": 0.26742778718471527, + "rewards/MMContentORM/mean": 0.48000000715255736, + "rewards/MMContentORM/std": 0.7163052916526794, + "rewards/MMFormatORM/mean": 0.5687499821186066, + "rewards/MMFormatORM/std": 0.18744589388370514, + "rewards/MMRubricORM/mean": -0.125, + "rewards/MMRubricORM/std": 0.28837831020355226, + "step": 1120, + "train_speed(iter/s)": 0.08372 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 323.4, + "completions/mean_length": 216.15, + "completions/min_length": 138.6, + "epoch": 0.5400864138262121, + "frac_reward_zero_std": 0.525, + "grad_norm": 0.07599830627441406, + "kl": 0.01529541015625, + "learning_rate": 8.735052049277535e-06, + "loss": 0.0006118299439549446, + "memory(GiB)": 27.09, + "reward": 0.42814998626708983, + "reward_std": 0.18024151921272277, + "rewards/MMContentORM/mean": 0.5210000097751617, + "rewards/MMContentORM/std": 0.6903650641441346, + "rewards/MMFormatORM/mean": 0.5931249856948853, + "rewards/MMFormatORM/std": 0.16180812418460847, + "rewards/MMRubricORM/mean": -0.0875, + "rewards/MMRubricORM/std": 0.24893558621406556, + "step": 1125, + "train_speed(iter/s)": 0.083741 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 400.4, + "completions/mean_length": 228.1375, + "completions/min_length": 141.6, + "epoch": 0.5424867978876621, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.11949385702610016, + "kl": 0.01396484375, + "learning_rate": 8.721827244617371e-06, + "loss": 0.000558951823040843, + "memory(GiB)": 27.09, + "reward": 0.4563499987125397, + "reward_std": 0.1478560298681259, + "rewards/MMContentORM/mean": 0.5789999961853027, + "rewards/MMContentORM/std": 0.6697975873947144, + "rewards/MMFormatORM/mean": 0.5993749856948852, + "rewards/MMFormatORM/std": 0.15370826721191405, + "rewards/MMRubricORM/mean": -0.075, + "rewards/MMRubricORM/std": 0.23662601709365844, + "step": 1130, + "train_speed(iter/s)": 0.083701 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 303.4, + "completions/mean_length": 216.7125, + "completions/min_length": 152.6, + "epoch": 0.5448871819491119, + "frac_reward_zero_std": 0.7, + "grad_norm": 0.1708482950925827, + "kl": 0.0151611328125, + "learning_rate": 8.708543790594966e-06, + "loss": 0.0006066753529012203, + "memory(GiB)": 27.09, + "reward": 0.532749992609024, + "reward_std": 0.06851864554919303, + "rewards/MMContentORM/mean": 0.7250000238418579, + "rewards/MMContentORM/std": 0.4237551301717758, + "rewards/MMFormatORM/mean": 0.6256249904632568, + "rewards/MMFormatORM/std": 0.09749999642372131, + "rewards/MMRubricORM/mean": -0.0375, + "rewards/MMRubricORM/std": 0.15, + "step": 1135, + "train_speed(iter/s)": 0.083719 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 331.0, + "completions/mean_length": 214.9625, + "completions/min_length": 138.4, + "epoch": 0.5472875660105617, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.14022567868232727, + "kl": 0.013623046875, + "learning_rate": 8.695201896533875e-06, + "loss": 0.0005450892262160778, + "memory(GiB)": 27.09, + "reward": 0.4738999783992767, + "reward_std": 0.15061374502256514, + "rewards/MMContentORM/mean": 0.6210000067949295, + "rewards/MMContentORM/std": 0.549624501913786, + "rewards/MMFormatORM/mean": 0.6012499809265137, + "rewards/MMFormatORM/std": 0.15380690693855287, + "rewards/MMRubricORM/mean": -0.075, + "rewards/MMRubricORM/std": 0.23662601709365844, + "step": 1140, + "train_speed(iter/s)": 0.083726 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 329.0, + "completions/mean_length": 217.35, + "completions/min_length": 123.6, + "epoch": 0.5496879500720115, + "frac_reward_zero_std": 0.6, + "grad_norm": 0.1822585016489029, + "kl": 0.018511962890625, + "learning_rate": 8.681801772678564e-06, + "loss": 0.0007403687573969364, + "memory(GiB)": 27.09, + "reward": 0.3406499683856964, + "reward_std": 0.20272752242162823, + "rewards/MMContentORM/mean": 0.33100001215934755, + "rewards/MMContentORM/std": 0.737173342704773, + "rewards/MMFormatORM/mean": 0.576874977350235, + "rewards/MMFormatORM/std": 0.17944467663764954, + "rewards/MMRubricORM/mean": -0.1125, + "rewards/MMRubricORM/std": 0.27606874108314516, + "step": 1145, + "train_speed(iter/s)": 0.083719 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 316.0, + "completions/mean_length": 212.4125, + "completions/min_length": 141.6, + "epoch": 0.5520883341334614, + "frac_reward_zero_std": 0.6, + "grad_norm": 0.16844458878040314, + "kl": 0.0210693359375, + "learning_rate": 8.668343630191094e-06, + "loss": 0.0008432833477854728, + "memory(GiB)": 27.09, + "reward": 0.48644999265670774, + "reward_std": 0.1396535847336054, + "rewards/MMContentORM/mean": 0.6380000114440918, + "rewards/MMContentORM/std": 0.5572850041091442, + "rewards/MMFormatORM/mean": 0.6093749761581421, + "rewards/MMFormatORM/std": 0.12130690813064575, + "rewards/MMRubricORM/mean": -0.0625, + "rewards/MMRubricORM/std": 0.18662601709365845, + "step": 1150, + "train_speed(iter/s)": 0.08373 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 314.4, + "completions/mean_length": 211.1125, + "completions/min_length": 141.8, + "epoch": 0.5544887181949112, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.17075812816619873, + "kl": 0.01673583984375, + "learning_rate": 8.654827681147798e-06, + "loss": 0.0006688498891890049, + "memory(GiB)": 27.09, + "reward": 0.4780499696731567, + "reward_std": 0.11589480005204678, + "rewards/MMContentORM/mean": 0.57950000166893, + "rewards/MMContentORM/std": 0.6060647606849671, + "rewards/MMFormatORM/mean": 0.6281249761581421, + "rewards/MMFormatORM/std": 0.06690345257520676, + "rewards/MMRubricORM/mean": -0.025, + "rewards/MMRubricORM/std": 0.06831300854682923, + "step": 1155, + "train_speed(iter/s)": 0.083765 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 321.8, + "completions/mean_length": 212.925, + "completions/min_length": 129.6, + "epoch": 0.556889102256361, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.18922355771064758, + "kl": 0.015997314453125, + "learning_rate": 8.641254138535937e-06, + "loss": 0.0006405468098819256, + "memory(GiB)": 27.09, + "reward": 0.4417499840259552, + "reward_std": 0.146866075694561, + "rewards/MMContentORM/mean": 0.5550000250339509, + "rewards/MMContentORM/std": 0.6111162975430489, + "rewards/MMFormatORM/mean": 0.5931249737739563, + "rewards/MMFormatORM/std": 0.14121158123016359, + "rewards/MMRubricORM/mean": -0.0875, + "rewards/MMRubricORM/std": 0.21724859476089478, + "step": 1160, + "train_speed(iter/s)": 0.08377 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0125, + "completions/max_length": 466.2, + "completions/mean_length": 219.0375, + "completions/min_length": 125.6, + "epoch": 0.5592894863178108, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.09351535886526108, + "kl": 0.014825439453125, + "learning_rate": 8.627623216250345e-06, + "loss": 0.0005931487306952476, + "memory(GiB)": 27.09, + "reward": 0.4562999814748764, + "reward_std": 0.08669129339978099, + "rewards/MMContentORM/mean": 0.5520000055432319, + "rewards/MMContentORM/std": 0.5042887216433882, + "rewards/MMFormatORM/mean": 0.6137499809265137, + "rewards/MMFormatORM/std": 0.09990466982126237, + "rewards/MMRubricORM/mean": -0.05, + "rewards/MMRubricORM/std": 0.13062257766723634, + "step": 1165, + "train_speed(iter/s)": 0.083689 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 301.8, + "completions/mean_length": 198.775, + "completions/min_length": 119.8, + "epoch": 0.5616898703792607, + "frac_reward_zero_std": 0.725, + "grad_norm": 0.14243614673614502, + "kl": 0.0142822265625, + "learning_rate": 8.613935129090055e-06, + "loss": 0.0005715936422348022, + "memory(GiB)": 27.09, + "reward": 0.5254999697208405, + "reward_std": 0.06462955782189965, + "rewards/MMContentORM/mean": 0.6925000309944153, + "rewards/MMContentORM/std": 0.53197683095932, + "rewards/MMFormatORM/mean": 0.6337499856948853, + "rewards/MMFormatORM/std": 0.06499999761581421, + "rewards/MMRubricORM/mean": -0.025, + "rewards/MMRubricORM/std": 0.1, + "step": 1170, + "train_speed(iter/s)": 0.083721 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 341.0, + "completions/mean_length": 208.5875, + "completions/min_length": 140.6, + "epoch": 0.5640902544407105, + "frac_reward_zero_std": 0.775, + "grad_norm": 0.08023513108491898, + "kl": 0.016717529296875, + "learning_rate": 8.60019009275492e-06, + "loss": 0.000668759923428297, + "memory(GiB)": 27.09, + "reward": 0.4964999794960022, + "reward_std": 0.054164377762936054, + "rewards/MMContentORM/mean": 0.620000010728836, + "rewards/MMContentORM/std": 0.5584413051605225, + "rewards/MMFormatORM/mean": 0.6337499856948853, + "rewards/MMFormatORM/std": 0.06499999761581421, + "rewards/MMRubricORM/mean": -0.025, + "rewards/MMRubricORM/std": 0.1, + "step": 1175, + "train_speed(iter/s)": 0.083728 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0125, + "completions/max_length": 470.0, + "completions/mean_length": 220.7125, + "completions/min_length": 119.2, + "epoch": 0.5664906385021603, + "frac_reward_zero_std": 0.675, + "grad_norm": 0.2373812049627304, + "kl": 0.019677734375, + "learning_rate": 8.586388323842207e-06, + "loss": 0.0007880028337240219, + "memory(GiB)": 27.09, + "reward": 0.39134998321533204, + "reward_std": 0.16157390028238297, + "rewards/MMContentORM/mean": 0.4289999961853027, + "rewards/MMContentORM/std": 0.7077797532081604, + "rewards/MMFormatORM/mean": 0.5931249856948853, + "rewards/MMFormatORM/std": 0.18630690574645997, + "rewards/MMRubricORM/mean": -0.0875, + "rewards/MMRubricORM/std": 0.2866260170936584, + "step": 1180, + "train_speed(iter/s)": 0.083613 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 322.6, + "completions/mean_length": 201.75, + "completions/min_length": 114.4, + "epoch": 0.5688910225636101, + "frac_reward_zero_std": 0.725, + "grad_norm": 0.23980411887168884, + "kl": 0.01798095703125, + "learning_rate": 8.57253003984319e-06, + "loss": 0.0007191254291683436, + "memory(GiB)": 27.09, + "reward": 0.5243999719619751, + "reward_std": 0.07297341881785542, + "rewards/MMContentORM/mean": 0.7185000061988831, + "rewards/MMContentORM/std": 0.5680613338947296, + "rewards/MMFormatORM/mean": 0.6174999833106994, + "rewards/MMFormatORM/std": 0.10940345227718354, + "rewards/MMRubricORM/mean": -0.05, + "rewards/MMRubricORM/std": 0.16831300854682923, + "step": 1185, + "train_speed(iter/s)": 0.083637 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 306.2, + "completions/mean_length": 204.625, + "completions/min_length": 149.6, + "epoch": 0.5712914066250601, + "frac_reward_zero_std": 0.725, + "grad_norm": 0.12727871537208557, + "kl": 0.014764404296875, + "learning_rate": 8.558615459139717e-06, + "loss": 0.0005905915051698685, + "memory(GiB)": 27.09, + "reward": 0.4597499847412109, + "reward_std": 0.11066221240907907, + "rewards/MMContentORM/mean": 0.5424999952316284, + "rewards/MMContentORM/std": 0.6061853706836701, + "rewards/MMFormatORM/mean": 0.6256249785423279, + "rewards/MMFormatORM/std": 0.07690345346927643, + "rewards/MMRubricORM/mean": -0.0375, + "rewards/MMRubricORM/std": 0.11831300854682922, + "step": 1190, + "train_speed(iter/s)": 0.083665 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 348.0, + "completions/mean_length": 218.9125, + "completions/min_length": 149.8, + "epoch": 0.5736917906865099, + "frac_reward_zero_std": 0.575, + "grad_norm": 0.15900759398937225, + "kl": 0.0161865234375, + "learning_rate": 8.544644801000777e-06, + "loss": 0.0006472207140177488, + "memory(GiB)": 27.09, + "reward": 0.409499979019165, + "reward_std": 0.14325983561575412, + "rewards/MMContentORM/mean": 0.46000003516674043, + "rewards/MMContentORM/std": 0.5570301927626133, + "rewards/MMFormatORM/mean": 0.6012499690055847, + "rewards/MMFormatORM/std": 0.13321036398410796, + "rewards/MMRubricORM/mean": -0.075, + "rewards/MMRubricORM/std": 0.20493902564048766, + "step": 1195, + "train_speed(iter/s)": 0.083651 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 307.4, + "completions/mean_length": 206.2, + "completions/min_length": 123.6, + "epoch": 0.5760921747479597, + "frac_reward_zero_std": 0.65, + "grad_norm": 0.009572326205670834, + "kl": 0.034039306640625, + "learning_rate": 8.53061828557903e-06, + "loss": 0.001363489031791687, + "memory(GiB)": 27.09, + "reward": 0.4724999785423279, + "reward_std": 0.15542207062244415, + "rewards/MMContentORM/mean": 0.6175000131130218, + "rewards/MMContentORM/std": 0.6357908546924591, + "rewards/MMFormatORM/mean": 0.6012499809265137, + "rewards/MMFormatORM/std": 0.15380690693855287, + "rewards/MMRubricORM/mean": -0.075, + "rewards/MMRubricORM/std": 0.23662601709365844, + "step": 1200, + "train_speed(iter/s)": 0.083674 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0125, + "completions/max_length": 449.6, + "completions/mean_length": 214.8375, + "completions/min_length": 136.4, + "epoch": 0.5784925588094095, + "frac_reward_zero_std": 0.775, + "grad_norm": 0.08620696514844894, + "kl": 0.015582275390625, + "learning_rate": 8.51653613390736e-06, + "loss": 0.0006235348992049694, + "memory(GiB)": 27.09, + "reward": 0.4412499874830246, + "reward_std": 0.11023794980719685, + "rewards/MMContentORM/mean": 0.5250000119209289, + "rewards/MMContentORM/std": 0.6168273031711579, + "rewards/MMFormatORM/mean": 0.6093749880790711, + "rewards/MMFormatORM/std": 0.11740466952323914, + "rewards/MMRubricORM/mean": -0.0625, + "rewards/MMRubricORM/std": 0.18062257766723633, + "step": 1205, + "train_speed(iter/s)": 0.083513 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 341.6, + "completions/mean_length": 208.2875, + "completions/min_length": 119.8, + "epoch": 0.5808929428708594, + "frac_reward_zero_std": 0.7, + "grad_norm": 0.25394561886787415, + "kl": 0.0171142578125, + "learning_rate": 8.502398567895369e-06, + "loss": 0.0006845718715339899, + "memory(GiB)": 27.09, + "reward": 0.4345999777317047, + "reward_std": 0.09871211070567369, + "rewards/MMContentORM/mean": 0.4939999908208847, + "rewards/MMContentORM/std": 0.6138741195201873, + "rewards/MMFormatORM/mean": 0.6174999713897705, + "rewards/MMFormatORM/std": 0.08880690932273864, + "rewards/MMRubricORM/mean": -0.05, + "rewards/MMRubricORM/std": 0.13662601709365846, + "step": 1210, + "train_speed(iter/s)": 0.083519 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 305.4, + "completions/mean_length": 209.825, + "completions/min_length": 141.4, + "epoch": 0.5832933269323092, + "frac_reward_zero_std": 0.675, + "grad_norm": 0.17071519792079926, + "kl": 0.018353271484375, + "learning_rate": 8.488205810325892e-06, + "loss": 0.0007337843533605337, + "memory(GiB)": 27.09, + "reward": 0.49284998178482053, + "reward_std": 0.13003694042563438, + "rewards/MMContentORM/mean": 0.6540000021457673, + "rewards/MMContentORM/std": 0.5871677160263061, + "rewards/MMFormatORM/mean": 0.6093749880790711, + "rewards/MMFormatORM/std": 0.14190345108509064, + "rewards/MMRubricORM/mean": -0.0625, + "rewards/MMRubricORM/std": 0.21831300854682922, + "step": 1215, + "train_speed(iter/s)": 0.083533 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 297.2, + "completions/mean_length": 207.375, + "completions/min_length": 124.6, + "epoch": 0.585693710993759, + "frac_reward_zero_std": 0.725, + "grad_norm": 0.17592158913612366, + "kl": 0.0169677734375, + "learning_rate": 8.473958084851487e-06, + "loss": 0.000678650476038456, + "memory(GiB)": 27.09, + "reward": 0.5602999925613403, + "reward_std": 0.06406386941671371, + "rewards/MMContentORM/mean": 0.7795000076293945, + "rewards/MMContentORM/std": 0.4123713135719299, + "rewards/MMFormatORM/mean": 0.6337499856948853, + "rewards/MMFormatORM/std": 0.06499999761581421, + "rewards/MMRubricORM/mean": -0.025, + "rewards/MMRubricORM/std": 0.1, + "step": 1220, + "train_speed(iter/s)": 0.083569 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 315.4, + "completions/mean_length": 204.0375, + "completions/min_length": 137.8, + "epoch": 0.5880940950552088, + "frac_reward_zero_std": 0.675, + "grad_norm": 0.1302787959575653, + "kl": 0.01573486328125, + "learning_rate": 8.459655615990908e-06, + "loss": 0.000629202276468277, + "memory(GiB)": 27.09, + "reward": 0.45289998650550845, + "reward_std": 0.1441083623562008, + "rewards/MMContentORM/mean": 0.5685000061988831, + "rewards/MMContentORM/std": 0.6687065124511719, + "rewards/MMFormatORM/mean": 0.6012499809265137, + "rewards/MMFormatORM/std": 0.12930812537670136, + "rewards/MMRubricORM/mean": -0.075, + "rewards/MMRubricORM/std": 0.19893558621406554, + "step": 1225, + "train_speed(iter/s)": 0.083593 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 301.6, + "completions/mean_length": 209.125, + "completions/min_length": 138.2, + "epoch": 0.5904944791166586, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.08660220354795456, + "kl": 0.01583251953125, + "learning_rate": 8.445298629125566e-06, + "loss": 0.0006336371414363384, + "memory(GiB)": 27.09, + "reward": 0.5385999858379364, + "reward_std": 0.09079250784125178, + "rewards/MMContentORM/mean": 0.7539999961853028, + "rewards/MMContentORM/std": 0.5569849014282227, + "rewards/MMFormatORM/mean": 0.6174999833106994, + "rewards/MMFormatORM/std": 0.10940345227718354, + "rewards/MMRubricORM/mean": -0.05, + "rewards/MMRubricORM/std": 0.16831300854682923, + "step": 1230, + "train_speed(iter/s)": 0.083624 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.8, + "completions/mean_length": 207.4375, + "completions/min_length": 133.8, + "epoch": 0.5928948631781085, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.10941971838474274, + "kl": 0.013848876953125, + "learning_rate": 8.430887350495978e-06, + "loss": 0.0005538208410143852, + "memory(GiB)": 27.09, + "reward": 0.5291499614715576, + "reward_std": 0.0521137666888535, + "rewards/MMContentORM/mean": 0.7159999907016754, + "rewards/MMContentORM/std": 0.41987812891602516, + "rewards/MMFormatORM/mean": 0.6256249785423279, + "rewards/MMFormatORM/std": 0.07690345346927643, + "rewards/MMRubricORM/mean": -0.0375, + "rewards/MMRubricORM/std": 0.11831300854682922, + "step": 1235, + "train_speed(iter/s)": 0.083672 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 284.0, + "completions/mean_length": 207.2, + "completions/min_length": 133.4, + "epoch": 0.5952952472395583, + "frac_reward_zero_std": 0.65, + "grad_norm": 0.17035551369190216, + "kl": 0.016778564453125, + "learning_rate": 8.416422007198204e-06, + "loss": 0.0006709801964461803, + "memory(GiB)": 27.09, + "reward": 0.49859996438026427, + "reward_std": 0.09135819533839822, + "rewards/MMContentORM/mean": 0.6540000200271606, + "rewards/MMContentORM/std": 0.6145297229290009, + "rewards/MMFormatORM/mean": 0.6174999833106994, + "rewards/MMFormatORM/std": 0.10940345227718354, + "rewards/MMRubricORM/mean": -0.05, + "rewards/MMRubricORM/std": 0.16831300854682923, + "step": 1240, + "train_speed(iter/s)": 0.083718 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0125, + "completions/max_length": 468.6, + "completions/mean_length": 221.25, + "completions/min_length": 141.2, + "epoch": 0.5976956313010081, + "frac_reward_zero_std": 0.575, + "grad_norm": 0.16948705911636353, + "kl": 0.016485595703125, + "learning_rate": 8.401902827180267e-06, + "loss": 0.0006599447224289179, + "memory(GiB)": 27.09, + "reward": 0.4580999851226807, + "reward_std": 0.10960154831409455, + "rewards/MMContentORM/mean": 0.5815000176429749, + "rewards/MMContentORM/std": 0.6139387130737305, + "rewards/MMFormatORM/mean": 0.6012499809265137, + "rewards/MMFormatORM/std": 0.15380690693855287, + "rewards/MMRubricORM/mean": -0.075, + "rewards/MMRubricORM/std": 0.23662601709365844, + "step": 1245, + "train_speed(iter/s)": 0.083619 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 347.2, + "completions/mean_length": 225.325, + "completions/min_length": 134.2, + "epoch": 0.600096015362458, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.10282408446073532, + "kl": 0.0154541015625, + "learning_rate": 8.387330039238558e-06, + "loss": 0.000617855554446578, + "memory(GiB)": 27.09, + "reward": 0.38104998469352724, + "reward_std": 0.2287490501999855, + "rewards/MMContentORM/mean": 0.4320000112056732, + "rewards/MMContentORM/std": 0.749615466594696, + "rewards/MMFormatORM/mean": 0.5768749833106994, + "rewards/MMFormatORM/std": 0.2062115788459778, + "rewards/MMRubricORM/mean": -0.1125, + "rewards/MMRubricORM/std": 0.3172485947608948, + "step": 1250, + "train_speed(iter/s)": 0.083602 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 448.0, + "completions/mean_length": 237.125, + "completions/min_length": 158.0, + "epoch": 0.6024963994239079, + "frac_reward_zero_std": 0.6, + "grad_norm": 0.13665515184402466, + "kl": 0.0227783203125, + "learning_rate": 8.372703873014236e-06, + "loss": 0.0009101461619138718, + "memory(GiB)": 27.09, + "reward": 0.4327999770641327, + "reward_std": 0.1585333364084363, + "rewards/MMContentORM/mean": 0.5470000147819519, + "rewards/MMContentORM/std": 0.6171560496091842, + "rewards/MMFormatORM/mean": 0.5849999785423279, + "rewards/MMFormatORM/std": 0.1737115800380707, + "rewards/MMRubricORM/mean": -0.1, + "rewards/MMRubricORM/std": 0.2672485947608948, + "step": 1255, + "train_speed(iter/s)": 0.083549 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 338.8, + "completions/mean_length": 221.9875, + "completions/min_length": 144.6, + "epoch": 0.6048967834853577, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.1299554705619812, + "kl": 0.01610107421875, + "learning_rate": 8.358024558989606e-06, + "loss": 0.0006435022689402104, + "memory(GiB)": 27.09, + "reward": 0.49649999141693113, + "reward_std": 0.0552957511274144, + "rewards/MMContentORM/mean": 0.6199999928474427, + "rewards/MMContentORM/std": 0.6000278711318969, + "rewards/MMFormatORM/mean": 0.6337499856948853, + "rewards/MMFormatORM/std": 0.06499999761581421, + "rewards/MMRubricORM/mean": -0.025, + "rewards/MMRubricORM/std": 0.1, + "step": 1260, + "train_speed(iter/s)": 0.083545 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0125, + "completions/max_length": 459.4, + "completions/mean_length": 225.875, + "completions/min_length": 150.6, + "epoch": 0.6072971675468075, + "frac_reward_zero_std": 0.675, + "grad_norm": 0.12043029069900513, + "kl": 0.014471435546875, + "learning_rate": 8.34329232848449e-06, + "loss": 0.0005782137159258127, + "memory(GiB)": 27.09, + "reward": 0.4554999887943268, + "reward_std": 0.1602303996682167, + "rewards/MMContentORM/mean": 0.5749999940395355, + "rewards/MMContentORM/std": 0.6556944012641907, + "rewards/MMFormatORM/mean": 0.6012499928474426, + "rewards/MMFormatORM/std": 0.14990466833114624, + "rewards/MMRubricORM/mean": -0.075, + "rewards/MMRubricORM/std": 0.23062257766723632, + "step": 1265, + "train_speed(iter/s)": 0.083458 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0125, + "completions/max_length": 459.0, + "completions/mean_length": 224.525, + "completions/min_length": 138.0, + "epoch": 0.6096975516082573, + "frac_reward_zero_std": 0.7, + "grad_norm": 0.15134023129940033, + "kl": 0.017626953125, + "learning_rate": 8.328507413652569e-06, + "loss": 0.0007050371263176203, + "memory(GiB)": 27.09, + "reward": 0.45614997744560243, + "reward_std": 0.14799745231866837, + "rewards/MMContentORM/mean": 0.5910000026226043, + "rewards/MMContentORM/std": 0.6682988286018372, + "rewards/MMFormatORM/mean": 0.5931249976158142, + "rewards/MMFormatORM/std": 0.18240466713905334, + "rewards/MMRubricORM/mean": -0.0875, + "rewards/MMRubricORM/std": 0.2806225776672363, + "step": 1270, + "train_speed(iter/s)": 0.083379 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 318.4, + "completions/mean_length": 216.9125, + "completions/min_length": 131.2, + "epoch": 0.6120979356697072, + "frac_reward_zero_std": 0.675, + "grad_norm": 0.1947038173675537, + "kl": 0.017083740234375, + "learning_rate": 8.313670047477751e-06, + "loss": 0.0006824467331171036, + "memory(GiB)": 27.09, + "reward": 0.43594998121261597, + "reward_std": 0.1567655718419701, + "rewards/MMContentORM/mean": 0.5405000030994416, + "rewards/MMContentORM/std": 0.6641241073608398, + "rewards/MMFormatORM/mean": 0.5931249856948853, + "rewards/MMFormatORM/std": 0.13730934262275696, + "rewards/MMRubricORM/mean": -0.0875, + "rewards/MMRubricORM/std": 0.21124515533447266, + "step": 1275, + "train_speed(iter/s)": 0.083394 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0125, + "completions/max_length": 455.8, + "completions/mean_length": 234.675, + "completions/min_length": 138.6, + "epoch": 0.614498319731157, + "frac_reward_zero_std": 0.65, + "grad_norm": 0.14940251410007477, + "kl": 0.0174072265625, + "learning_rate": 8.29878046377047e-06, + "loss": 0.0006969640962779522, + "memory(GiB)": 27.09, + "reward": 0.5129499852657318, + "reward_std": 0.10443966835737228, + "rewards/MMContentORM/mean": 0.6755000114440918, + "rewards/MMContentORM/std": 0.5826462268829345, + "rewards/MMFormatORM/mean": 0.6256249785423279, + "rewards/MMFormatORM/std": 0.07690345346927643, + "rewards/MMRubricORM/mean": -0.0375, + "rewards/MMRubricORM/std": 0.11831300854682922, + "step": 1280, + "train_speed(iter/s)": 0.083311 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 316.0, + "completions/mean_length": 224.5375, + "completions/min_length": 146.6, + "epoch": 0.6168987037926068, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.23286378383636475, + "kl": 0.01444091796875, + "learning_rate": 8.283838897164022e-06, + "loss": 0.0005787207745015621, + "memory(GiB)": 27.09, + "reward": 0.5347499787807465, + "reward_std": 0.0849235224770382, + "rewards/MMContentORM/mean": 0.7300000071525574, + "rewards/MMContentORM/std": 0.5398510098457336, + "rewards/MMFormatORM/mean": 0.6256249785423279, + "rewards/MMFormatORM/std": 0.07690345346927643, + "rewards/MMRubricORM/mean": -0.0375, + "rewards/MMRubricORM/std": 0.11831300854682922, + "step": 1285, + "train_speed(iter/s)": 0.08333 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 361.8, + "completions/mean_length": 215.95, + "completions/min_length": 139.2, + "epoch": 0.6192990878540566, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.16865944862365723, + "kl": 0.01298828125, + "learning_rate": 8.268845583110863e-06, + "loss": 0.0005195950157940388, + "memory(GiB)": 27.09, + "reward": 0.5143999695777893, + "reward_std": 0.09899494871497154, + "rewards/MMContentORM/mean": 0.693500018119812, + "rewards/MMContentORM/std": 0.5922718286514282, + "rewards/MMFormatORM/mean": 0.6174999952316285, + "rewards/MMFormatORM/std": 0.12999999523162842, + "rewards/MMRubricORM/mean": -0.05, + "rewards/MMRubricORM/std": 0.2, + "step": 1290, + "train_speed(iter/s)": 0.083314 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 300.6, + "completions/mean_length": 209.5, + "completions/min_length": 135.2, + "epoch": 0.6216994719155065, + "frac_reward_zero_std": 0.6, + "grad_norm": 0.12279748171567917, + "kl": 0.016485595703125, + "learning_rate": 8.253800757878886e-06, + "loss": 0.0006598389707505703, + "memory(GiB)": 27.09, + "reward": 0.41129997968673704, + "reward_std": 0.16164461448788642, + "rewards/MMContentORM/mean": 0.46449999809265136, + "rewards/MMContentORM/std": 0.6922466158866882, + "rewards/MMFormatORM/mean": 0.6012499809265137, + "rewards/MMFormatORM/std": 0.15380690693855287, + "rewards/MMRubricORM/mean": -0.075, + "rewards/MMRubricORM/std": 0.23662601709365844, + "step": 1295, + "train_speed(iter/s)": 0.083344 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 328.8, + "completions/mean_length": 217.825, + "completions/min_length": 136.2, + "epoch": 0.6240998559769563, + "frac_reward_zero_std": 0.8, + "grad_norm": 0.1909555196762085, + "kl": 0.02734375, + "learning_rate": 8.238704658547722e-06, + "loss": 0.0010941483080387116, + "memory(GiB)": 27.09, + "reward": 0.5067499876022339, + "reward_std": 0.09284311935771257, + "rewards/MMContentORM/mean": 0.6600000023841858, + "rewards/MMContentORM/std": 0.5655688047409058, + "rewards/MMFormatORM/mean": 0.6256249785423279, + "rewards/MMFormatORM/std": 0.07690345346927643, + "rewards/MMRubricORM/mean": -0.0375, + "rewards/MMRubricORM/std": 0.11831300854682922, + "step": 1300, + "train_speed(iter/s)": 0.083366 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 316.6, + "completions/mean_length": 210.3875, + "completions/min_length": 144.0, + "epoch": 0.6265002400384061, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.0646437555551529, + "kl": 0.015008544921875, + "learning_rate": 8.223557523004982e-06, + "loss": 0.0006002359557896852, + "memory(GiB)": 27.09, + "reward": 0.431849992275238, + "reward_std": 0.11052078779321164, + "rewards/MMContentORM/mean": 0.5015000164508819, + "rewards/MMContentORM/std": 0.6876445889472962, + "rewards/MMFormatORM/mean": 0.6093749761581421, + "rewards/MMFormatORM/std": 0.12130690813064575, + "rewards/MMRubricORM/mean": -0.0625, + "rewards/MMRubricORM/std": 0.18662601709365845, + "step": 1305, + "train_speed(iter/s)": 0.083302 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 339.0, + "completions/mean_length": 222.225, + "completions/min_length": 157.6, + "epoch": 0.628900624099856, + "frac_reward_zero_std": 0.7, + "grad_norm": 0.06356562674045563, + "kl": 0.014605712890625, + "learning_rate": 8.208359589942515e-06, + "loss": 0.000583806075155735, + "memory(GiB)": 27.09, + "reward": 0.46749998927116393, + "reward_std": 0.10931870595086365, + "rewards/MMContentORM/mean": 0.5799999982118607, + "rewards/MMContentORM/std": 0.5833412051200867, + "rewards/MMFormatORM/mean": 0.6137499809265137, + "rewards/MMFormatORM/std": 0.08727944791316986, + "rewards/MMRubricORM/mean": -0.05, + "rewards/MMRubricORM/std": 0.13062257766723634, + "step": 1310, + "train_speed(iter/s)": 0.083321 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 344.2, + "completions/mean_length": 220.925, + "completions/min_length": 134.4, + "epoch": 0.6313010081613059, + "frac_reward_zero_std": 0.7, + "grad_norm": 0.18615718185901642, + "kl": 0.016546630859375, + "learning_rate": 8.193111098852654e-06, + "loss": 0.0006628592498600483, + "memory(GiB)": 27.09, + "reward": 0.4991499900817871, + "reward_std": 0.06738727213814855, + "rewards/MMContentORM/mean": 0.6410000026226044, + "rewards/MMContentORM/std": 0.49751891270279885, + "rewards/MMFormatORM/mean": 0.6256249904632568, + "rewards/MMFormatORM/std": 0.09749999642372131, + "rewards/MMRubricORM/mean": -0.0375, + "rewards/MMRubricORM/std": 0.15, + "step": 1315, + "train_speed(iter/s)": 0.083324 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 308.6, + "completions/mean_length": 214.4125, + "completions/min_length": 132.6, + "epoch": 0.6337013922227557, + "frac_reward_zero_std": 0.6, + "grad_norm": 0.1614212691783905, + "kl": 0.012420654296875, + "learning_rate": 8.177812290024438e-06, + "loss": 0.000497491005808115, + "memory(GiB)": 27.09, + "reward": 0.4735999882221222, + "reward_std": 0.10012631714344025, + "rewards/MMContentORM/mean": 0.5914999902248382, + "rewards/MMContentORM/std": 0.6162684261798859, + "rewards/MMFormatORM/mean": 0.6174999952316285, + "rewards/MMFormatORM/std": 0.12999999523162842, + "rewards/MMRubricORM/mean": -0.05, + "rewards/MMRubricORM/std": 0.2, + "step": 1320, + "train_speed(iter/s)": 0.083354 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 373.0, + "completions/mean_length": 214.65, + "completions/min_length": 152.4, + "epoch": 0.6361017762842055, + "frac_reward_zero_std": 0.675, + "grad_norm": 0.08849076926708221, + "kl": 0.014666748046875, + "learning_rate": 8.162463404539812e-06, + "loss": 0.0005868059583008289, + "memory(GiB)": 27.09, + "reward": 0.48304998874664307, + "reward_std": 0.12183449864387512, + "rewards/MMContentORM/mean": 0.6295000195503235, + "rewards/MMContentORM/std": 0.6145186185836792, + "rewards/MMFormatORM/mean": 0.6093749761581421, + "rewards/MMFormatORM/std": 0.12130690813064575, + "rewards/MMRubricORM/mean": -0.0625, + "rewards/MMRubricORM/std": 0.18662601709365845, + "step": 1325, + "train_speed(iter/s)": 0.083338 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 325.6, + "completions/mean_length": 221.7125, + "completions/min_length": 138.0, + "epoch": 0.6385021603456553, + "frac_reward_zero_std": 0.675, + "grad_norm": 0.29121577739715576, + "kl": 0.018963623046875, + "learning_rate": 8.147064684269854e-06, + "loss": 0.0007598603144288063, + "memory(GiB)": 27.09, + "reward": 0.4774999737739563, + "reward_std": 0.13194613000378014, + "rewards/MMContentORM/mean": 0.6175000011920929, + "rewards/MMContentORM/std": 0.5826177567243576, + "rewards/MMFormatORM/mean": 0.6074999928474426, + "rewards/MMFormatORM/std": 0.12490466833114625, + "rewards/MMRubricORM/mean": -0.0625, + "rewards/MMRubricORM/std": 0.18062257766723633, + "step": 1330, + "train_speed(iter/s)": 0.083346 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 368.2, + "completions/mean_length": 220.975, + "completions/min_length": 138.8, + "epoch": 0.6409025444071051, + "frac_reward_zero_std": 0.7, + "grad_norm": 0.1789148896932602, + "kl": 0.01461181640625, + "learning_rate": 8.131616371870941e-06, + "loss": 0.0005845078732818366, + "memory(GiB)": 27.09, + "reward": 0.48419997096061707, + "reward_std": 0.09079250679351389, + "rewards/MMContentORM/mean": 0.6180000066757202, + "rewards/MMContentORM/std": 0.5878942906856537, + "rewards/MMFormatORM/mean": 0.6174999833106994, + "rewards/MMFormatORM/std": 0.08490467071533203, + "rewards/MMRubricORM/mean": -0.05, + "rewards/MMRubricORM/std": 0.13062257766723634, + "step": 1335, + "train_speed(iter/s)": 0.083336 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 294.0, + "completions/mean_length": 215.825, + "completions/min_length": 145.6, + "epoch": 0.643302928468555, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.12938667833805084, + "kl": 0.01439208984375, + "learning_rate": 8.116118710780936e-06, + "loss": 0.0005751181393861771, + "memory(GiB)": 27.09, + "reward": 0.45569998025894165, + "reward_std": 0.15994756268337368, + "rewards/MMContentORM/mean": 0.5754999995231629, + "rewards/MMContentORM/std": 0.6518269121646881, + "rewards/MMFormatORM/mean": 0.6012499809265137, + "rewards/MMFormatORM/std": 0.15380690693855287, + "rewards/MMRubricORM/mean": -0.075, + "rewards/MMRubricORM/std": 0.23662601709365844, + "step": 1340, + "train_speed(iter/s)": 0.083378 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 296.0, + "completions/mean_length": 214.35, + "completions/min_length": 154.2, + "epoch": 0.6457033125300048, + "frac_reward_zero_std": 0.675, + "grad_norm": 0.1686517745256424, + "kl": 0.013623046875, + "learning_rate": 8.100571945215349e-06, + "loss": 0.0005452525801956654, + "memory(GiB)": 27.09, + "reward": 0.4809499800205231, + "reward_std": 0.06936717408243567, + "rewards/MMContentORM/mean": 0.5954999804496766, + "rewards/MMContentORM/std": 0.5270328655838966, + "rewards/MMFormatORM/mean": 0.6256249904632568, + "rewards/MMFormatORM/std": 0.09749999642372131, + "rewards/MMRubricORM/mean": -0.0375, + "rewards/MMRubricORM/std": 0.15, + "step": 1345, + "train_speed(iter/s)": 0.083404 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 346.6, + "completions/mean_length": 224.625, + "completions/min_length": 151.4, + "epoch": 0.6481036965914546, + "frac_reward_zero_std": 0.65, + "grad_norm": 0.06384813040494919, + "kl": 0.014715576171875, + "learning_rate": 8.08497632016349e-06, + "loss": 0.0005893761292099953, + "memory(GiB)": 27.09, + "reward": 0.4773499846458435, + "reward_std": 0.12423865795135498, + "rewards/MMContentORM/mean": 0.6190000057220459, + "rewards/MMContentORM/std": 0.6496911168098449, + "rewards/MMFormatORM/mean": 0.6056249856948852, + "rewards/MMFormatORM/std": 0.15690345019102098, + "rewards/MMRubricORM/mean": -0.0625, + "rewards/MMRubricORM/std": 0.21831300854682922, + "step": 1350, + "train_speed(iter/s)": 0.083396 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 336.2, + "completions/mean_length": 216.75, + "completions/min_length": 128.2, + "epoch": 0.6505040806529044, + "frac_reward_zero_std": 0.65, + "grad_norm": 0.16834478080272675, + "kl": 0.018731689453125, + "learning_rate": 8.069332081384604e-06, + "loss": 0.0007483120542019605, + "memory(GiB)": 27.09, + "reward": 0.4377999842166901, + "reward_std": 0.0825900660827756, + "rewards/MMContentORM/mean": 0.559500002861023, + "rewards/MMContentORM/std": 0.7009612798690796, + "rewards/MMFormatORM/mean": 0.5849999845027923, + "rewards/MMFormatORM/std": 0.16754122078418732, + "rewards/MMRubricORM/mean": -0.1, + "rewards/MMRubricORM/std": 0.2577557325363159, + "step": 1355, + "train_speed(iter/s)": 0.0834 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 341.8, + "completions/mean_length": 222.05, + "completions/min_length": 152.6, + "epoch": 0.6529044647143543, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.17165519297122955, + "kl": 0.02119140625, + "learning_rate": 8.053639475404008e-06, + "loss": 0.0008492187596857547, + "memory(GiB)": 27.09, + "reward": 0.45319998264312744, + "reward_std": 0.17027131617069244, + "rewards/MMContentORM/mean": 0.5979999959468841, + "rewards/MMContentORM/std": 0.6957221150398254, + "rewards/MMFormatORM/mean": 0.5849999904632568, + "rewards/MMFormatORM/std": 0.19430812299251557, + "rewards/MMRubricORM/mean": -0.1, + "rewards/MMRubricORM/std": 0.2989355862140656, + "step": 1360, + "train_speed(iter/s)": 0.083404 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 334.6, + "completions/mean_length": 220.6125, + "completions/min_length": 154.6, + "epoch": 0.6553048487758041, + "frac_reward_zero_std": 0.7, + "grad_norm": 0.16907748579978943, + "kl": 0.0153076171875, + "learning_rate": 8.037898749509193e-06, + "loss": 0.0006130572408437728, + "memory(GiB)": 27.09, + "reward": 0.4759499728679657, + "reward_std": 0.06738727379124612, + "rewards/MMContentORM/mean": 0.5830000102519989, + "rewards/MMContentORM/std": 0.5839648485183716, + "rewards/MMFormatORM/mean": 0.6256249904632568, + "rewards/MMFormatORM/std": 0.09749999642372131, + "rewards/MMRubricORM/mean": -0.0375, + "rewards/MMRubricORM/std": 0.15, + "step": 1365, + "train_speed(iter/s)": 0.083414 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 302.6, + "completions/mean_length": 207.2625, + "completions/min_length": 122.0, + "epoch": 0.6577052328372539, + "frac_reward_zero_std": 0.65, + "grad_norm": 0.16428223252296448, + "kl": 0.019390869140625, + "learning_rate": 8.022110151745939e-06, + "loss": 0.0007753587327897548, + "memory(GiB)": 27.09, + "reward": 0.39994998574256896, + "reward_std": 0.12126881405711173, + "rewards/MMContentORM/mean": 0.45050000548362734, + "rewards/MMContentORM/std": 0.7288543343544006, + "rewards/MMFormatORM/mean": 0.5931249856948853, + "rewards/MMFormatORM/std": 0.16180812418460847, + "rewards/MMRubricORM/mean": -0.0875, + "rewards/MMRubricORM/std": 0.24893558621406556, + "step": 1370, + "train_speed(iter/s)": 0.083439 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 319.4, + "completions/mean_length": 215.875, + "completions/min_length": 144.8, + "epoch": 0.6601056168987038, + "frac_reward_zero_std": 0.7, + "grad_norm": 0.0659169927239418, + "kl": 0.01590576171875, + "learning_rate": 8.006273930914397e-06, + "loss": 0.0006364564411342144, + "memory(GiB)": 27.09, + "reward": 0.5269499838352203, + "reward_std": 0.06936717077624052, + "rewards/MMContentORM/mean": 0.7104999959468842, + "rewards/MMContentORM/std": 0.42932928130030634, + "rewards/MMFormatORM/mean": 0.6256249785423279, + "rewards/MMFormatORM/std": 0.07690345346927643, + "rewards/MMRubricORM/mean": -0.0375, + "rewards/MMRubricORM/std": 0.11831300854682922, + "step": 1375, + "train_speed(iter/s)": 0.083445 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 301.6, + "completions/mean_length": 223.675, + "completions/min_length": 133.8, + "epoch": 0.6625060009601537, + "frac_reward_zero_std": 0.775, + "grad_norm": 0.06429751962423325, + "kl": 0.018243408203125, + "learning_rate": 7.990390336565179e-06, + "loss": 0.0007286330219358206, + "memory(GiB)": 27.09, + "reward": 0.48864997625350953, + "reward_std": 0.09581296914257109, + "rewards/MMContentORM/mean": 0.643500006198883, + "rewards/MMContentORM/std": 0.5242509357631207, + "rewards/MMFormatORM/mean": 0.6093749761581421, + "rewards/MMFormatORM/std": 0.12130690813064575, + "rewards/MMRubricORM/mean": -0.0625, + "rewards/MMRubricORM/std": 0.18662601709365845, + "step": 1380, + "train_speed(iter/s)": 0.083473 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 317.8, + "completions/mean_length": 216.2125, + "completions/min_length": 117.8, + "epoch": 0.6649063850216035, + "frac_reward_zero_std": 0.725, + "grad_norm": 0.11517506837844849, + "kl": 0.02274169921875, + "learning_rate": 7.97445961899541e-06, + "loss": 0.0009097927249968052, + "memory(GiB)": 27.09, + "reward": 0.5096500039100647, + "reward_std": 0.1130663748132065, + "rewards/MMContentORM/mean": 0.6960000216960907, + "rewards/MMContentORM/std": 0.44011374935507774, + "rewards/MMFormatORM/mean": 0.6093749880790711, + "rewards/MMFormatORM/std": 0.11740466952323914, + "rewards/MMRubricORM/mean": -0.0625, + "rewards/MMRubricORM/std": 0.18062257766723633, + "step": 1385, + "train_speed(iter/s)": 0.083494 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 309.2, + "completions/mean_length": 206.375, + "completions/min_length": 116.0, + "epoch": 0.6673067690830533, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.27704310417175293, + "kl": 0.021282958984375, + "learning_rate": 7.958482029244803e-06, + "loss": 0.0008504557423293591, + "memory(GiB)": 27.09, + "reward": 0.37329998016357424, + "reward_std": 0.16235171258449554, + "rewards/MMContentORM/mean": 0.42700001001358034, + "rewards/MMContentORM/std": 0.7586719036102295, + "rewards/MMFormatORM/mean": 0.5687499940395355, + "rewards/MMFormatORM/std": 0.20804243683815002, + "rewards/MMRubricORM/mean": -0.125, + "rewards/MMRubricORM/std": 0.32006530165672303, + "step": 1390, + "train_speed(iter/s)": 0.083518 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 442.4, + "completions/mean_length": 225.675, + "completions/min_length": 130.6, + "epoch": 0.6697071531445031, + "frac_reward_zero_std": 0.675, + "grad_norm": 0.09009167551994324, + "kl": 0.013970947265625, + "learning_rate": 7.942457819091686e-06, + "loss": 0.0005581377539783716, + "memory(GiB)": 27.09, + "reward": 0.46669996380805967, + "reward_std": 0.07254915833473205, + "rewards/MMContentORM/mean": 0.5455000042915344, + "rewards/MMContentORM/std": 0.6198044538497924, + "rewards/MMFormatORM/mean": 0.6337499856948853, + "rewards/MMFormatORM/std": 0.06499999761581421, + "rewards/MMRubricORM/mean": -0.025, + "rewards/MMRubricORM/std": 0.1, + "step": 1395, + "train_speed(iter/s)": 0.083434 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 304.8, + "completions/mean_length": 206.9875, + "completions/min_length": 137.2, + "epoch": 0.672107537205953, + "frac_reward_zero_std": 0.675, + "grad_norm": 0.22825519740581512, + "kl": 0.014642333984375, + "learning_rate": 7.926387241049045e-06, + "loss": 0.0005855937954038382, + "memory(GiB)": 27.09, + "reward": 0.48020000457763673, + "reward_std": 0.11455129608511924, + "rewards/MMContentORM/mean": 0.6080000162124634, + "rewards/MMContentORM/std": 0.5722609221935272, + "rewards/MMFormatORM/mean": 0.6174999833106994, + "rewards/MMFormatORM/std": 0.08490467071533203, + "rewards/MMRubricORM/mean": -0.05, + "rewards/MMRubricORM/std": 0.13062257766723634, + "step": 1400, + "train_speed(iter/s)": 0.083454 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 357.8, + "completions/mean_length": 216.625, + "completions/min_length": 137.0, + "epoch": 0.6745079212674028, + "frac_reward_zero_std": 0.65, + "grad_norm": 0.19864213466644287, + "kl": 0.014227294921875, + "learning_rate": 7.910270548360537e-06, + "loss": 0.0005694822408258915, + "memory(GiB)": 27.09, + "reward": 0.3801499783992767, + "reward_std": 0.15648273127153517, + "rewards/MMContentORM/mean": 0.40100000500679017, + "rewards/MMContentORM/std": 0.7135852456092835, + "rewards/MMFormatORM/mean": 0.5931249856948853, + "rewards/MMFormatORM/std": 0.16180812418460847, + "rewards/MMRubricORM/mean": -0.0875, + "rewards/MMRubricORM/std": 0.24893558621406556, + "step": 1405, + "train_speed(iter/s)": 0.083378 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 314.6, + "completions/mean_length": 213.2375, + "completions/min_length": 123.8, + "epoch": 0.6769083053288526, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.13676653802394867, + "kl": 0.015936279296875, + "learning_rate": 7.89410799499651e-06, + "loss": 0.0006383438128978014, + "memory(GiB)": 27.09, + "reward": 0.4782499849796295, + "reward_std": 0.08407499901950359, + "rewards/MMContentORM/mean": 0.6174999952316285, + "rewards/MMContentORM/std": 0.6308860540390014, + "rewards/MMFormatORM/mean": 0.6093749761581421, + "rewards/MMFormatORM/std": 0.12130690813064575, + "rewards/MMRubricORM/mean": -0.0625, + "rewards/MMRubricORM/std": 0.18662601709365845, + "step": 1410, + "train_speed(iter/s)": 0.083388 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 305.6, + "completions/mean_length": 219.2125, + "completions/min_length": 160.4, + "epoch": 0.6793086893903024, + "frac_reward_zero_std": 0.65, + "grad_norm": 0.21563780307769775, + "kl": 0.01568603515625, + "learning_rate": 7.877899835649988e-06, + "loss": 0.0006277403328567744, + "memory(GiB)": 27.09, + "reward": 0.49304999113082887, + "reward_std": 0.13088545948266983, + "rewards/MMContentORM/mean": 0.654500013589859, + "rewards/MMContentORM/std": 0.5817828834056854, + "rewards/MMFormatORM/mean": 0.6093749880790711, + "rewards/MMFormatORM/std": 0.11740466952323914, + "rewards/MMRubricORM/mean": -0.0625, + "rewards/MMRubricORM/std": 0.18062257766723633, + "step": 1415, + "train_speed(iter/s)": 0.083413 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 326.2, + "completions/mean_length": 214.4375, + "completions/min_length": 154.2, + "epoch": 0.6817090734517522, + "frac_reward_zero_std": 0.675, + "grad_norm": 0.08068116009235382, + "kl": 0.015985107421875, + "learning_rate": 7.86164632573267e-06, + "loss": 0.0006394727155566215, + "memory(GiB)": 27.09, + "reward": 0.5065499901771545, + "reward_std": 0.08464068165048957, + "rewards/MMContentORM/mean": 0.659500002861023, + "rewards/MMContentORM/std": 0.6008779644966126, + "rewards/MMFormatORM/mean": 0.6256249904632568, + "rewards/MMFormatORM/std": 0.09749999642372131, + "rewards/MMRubricORM/mean": -0.0375, + "rewards/MMRubricORM/std": 0.15, + "step": 1420, + "train_speed(iter/s)": 0.083412 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 311.8, + "completions/mean_length": 211.925, + "completions/min_length": 142.2, + "epoch": 0.6841094575132021, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.46003836393356323, + "kl": 0.015850830078125, + "learning_rate": 7.845347721370894e-06, + "loss": 0.0006344554014503956, + "memory(GiB)": 27.09, + "reward": 0.47069998979568484, + "reward_std": 0.0646295606624335, + "rewards/MMContentORM/mean": 0.5554999947547913, + "rewards/MMContentORM/std": 0.6328672289848327, + "rewards/MMFormatORM/mean": 0.6337499856948853, + "rewards/MMFormatORM/std": 0.06499999761581421, + "rewards/MMRubricORM/mean": -0.025, + "rewards/MMRubricORM/std": 0.1, + "step": 1425, + "train_speed(iter/s)": 0.083437 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 332.0, + "completions/mean_length": 209.475, + "completions/min_length": 115.6, + "epoch": 0.6865098415746519, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.11785315722227097, + "kl": 0.020574951171875, + "learning_rate": 7.82900427940161e-06, + "loss": 0.0008225988596677781, + "memory(GiB)": 27.09, + "reward": 0.45914997458457946, + "reward_std": 0.1380979523062706, + "rewards/MMContentORM/mean": 0.5985000252723693, + "rewards/MMContentORM/std": 0.6026765942573548, + "rewards/MMFormatORM/mean": 0.5931249976158142, + "rewards/MMFormatORM/std": 0.18240466713905334, + "rewards/MMRubricORM/mean": -0.0875, + "rewards/MMRubricORM/std": 0.2806225776672363, + "step": 1430, + "train_speed(iter/s)": 0.083462 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 294.4, + "completions/mean_length": 205.8375, + "completions/min_length": 148.4, + "epoch": 0.6889102256361018, + "frac_reward_zero_std": 0.7, + "grad_norm": 0.16498683393001556, + "kl": 0.016790771484375, + "learning_rate": 7.812616257368324e-06, + "loss": 0.0006715003866702319, + "memory(GiB)": 27.09, + "reward": 0.4596499800682068, + "reward_std": 0.13795653358101845, + "rewards/MMContentORM/mean": 0.5710000157356262, + "rewards/MMContentORM/std": 0.6740511536598206, + "rewards/MMFormatORM/mean": 0.6093749761581421, + "rewards/MMFormatORM/std": 0.12130690813064575, + "rewards/MMRubricORM/mean": -0.0625, + "rewards/MMRubricORM/std": 0.18662601709365845, + "step": 1435, + "train_speed(iter/s)": 0.083488 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 359.0, + "completions/mean_length": 225.9875, + "completions/min_length": 151.2, + "epoch": 0.6913106096975516, + "frac_reward_zero_std": 0.55, + "grad_norm": 0.11214323341846466, + "kl": 0.02467041015625, + "learning_rate": 7.79618391351705e-06, + "loss": 0.0009878157638013364, + "memory(GiB)": 27.09, + "reward": 0.48644998073577883, + "reward_std": 0.15662415251135825, + "rewards/MMContentORM/mean": 0.6380000293254853, + "rewards/MMContentORM/std": 0.5628958165645599, + "rewards/MMFormatORM/mean": 0.6093749880790711, + "rewards/MMFormatORM/std": 0.11740466952323914, + "rewards/MMRubricORM/mean": -0.0625, + "rewards/MMRubricORM/std": 0.18062257766723633, + "step": 1440, + "train_speed(iter/s)": 0.083475 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 348.6, + "completions/mean_length": 220.3, + "completions/min_length": 129.6, + "epoch": 0.6937109937590015, + "frac_reward_zero_std": 0.675, + "grad_norm": 0.10853149741888046, + "kl": 0.0424072265625, + "learning_rate": 7.779707506792232e-06, + "loss": 0.001695425808429718, + "memory(GiB)": 27.09, + "reward": 0.5110499858856201, + "reward_std": 0.12240018071606755, + "rewards/MMContentORM/mean": 0.6995000064373016, + "rewards/MMContentORM/std": 0.4983797550201416, + "rewards/MMFormatORM/mean": 0.6093749761581421, + "rewards/MMFormatORM/std": 0.12130690813064575, + "rewards/MMRubricORM/mean": -0.0625, + "rewards/MMRubricORM/std": 0.18662601709365845, + "step": 1445, + "train_speed(iter/s)": 0.083475 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0125, + "completions/max_length": 565.4, + "completions/mean_length": 235.8375, + "completions/min_length": 151.4, + "epoch": 0.6961113778204513, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.06722735613584518, + "kl": 0.01473388671875, + "learning_rate": 7.763187296832664e-06, + "loss": 0.0005892225075513124, + "memory(GiB)": 27.09, + "reward": 0.48484996557235716, + "reward_std": 0.11080363169312477, + "rewards/MMContentORM/mean": 0.634000015258789, + "rewards/MMContentORM/std": 0.5153150960803032, + "rewards/MMFormatORM/mean": 0.6093749761581421, + "rewards/MMFormatORM/std": 0.12130690813064575, + "rewards/MMRubricORM/mean": -0.0625, + "rewards/MMRubricORM/std": 0.18662601709365845, + "step": 1450, + "train_speed(iter/s)": 0.083401 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0125, + "completions/max_length": 451.2, + "completions/mean_length": 219.5625, + "completions/min_length": 131.4, + "epoch": 0.6985117618819011, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.14000947773456573, + "kl": 0.0172607421875, + "learning_rate": 7.746623543967406e-06, + "loss": 0.0006907809525728226, + "memory(GiB)": 27.09, + "reward": 0.5349499821662903, + "reward_std": 0.06767011939082294, + "rewards/MMContentORM/mean": 0.7305000185966491, + "rewards/MMContentORM/std": 0.5024080984294415, + "rewards/MMFormatORM/mean": 0.6256249904632568, + "rewards/MMFormatORM/std": 0.09749999642372131, + "rewards/MMRubricORM/mean": -0.0375, + "rewards/MMRubricORM/std": 0.15, + "step": 1455, + "train_speed(iter/s)": 0.083335 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 315.6, + "completions/mean_length": 214.1, + "completions/min_length": 147.0, + "epoch": 0.7009121459433509, + "frac_reward_zero_std": 0.7, + "grad_norm": 0.1414344757795334, + "kl": 0.019146728515625, + "learning_rate": 7.730016509211672e-06, + "loss": 0.0007662178948521614, + "memory(GiB)": 27.09, + "reward": 0.5028499722480774, + "reward_std": 0.11080363541841506, + "rewards/MMContentORM/mean": 0.6790000140666962, + "rewards/MMContentORM/std": 0.5045298062264919, + "rewards/MMFormatORM/mean": 0.6093749761581421, + "rewards/MMFormatORM/std": 0.12130690813064575, + "rewards/MMRubricORM/mean": -0.0625, + "rewards/MMRubricORM/std": 0.18662601709365845, + "step": 1460, + "train_speed(iter/s)": 0.083355 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 296.4, + "completions/mean_length": 209.0125, + "completions/min_length": 142.8, + "epoch": 0.7033125300048008, + "frac_reward_zero_std": 0.675, + "grad_norm": 0.10340370237827301, + "kl": 0.016058349609375, + "learning_rate": 7.713366454262724e-06, + "loss": 0.0006422744132578373, + "memory(GiB)": 27.09, + "reward": 0.4182499825954437, + "reward_std": 0.10146982565056532, + "rewards/MMContentORM/mean": 0.4675000041723251, + "rewards/MMContentORM/std": 0.6833849430084229, + "rewards/MMFormatORM/mean": 0.6093749761581421, + "rewards/MMFormatORM/std": 0.12130690813064575, + "rewards/MMRubricORM/mean": -0.0625, + "rewards/MMRubricORM/std": 0.18662601709365845, + "step": 1465, + "train_speed(iter/s)": 0.083393 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 316.2, + "completions/mean_length": 210.225, + "completions/min_length": 135.8, + "epoch": 0.7057129140662506, + "frac_reward_zero_std": 0.8, + "grad_norm": 0.076369509100914, + "kl": 0.016009521484375, + "learning_rate": 7.696673641495747e-06, + "loss": 0.0006410168949514628, + "memory(GiB)": 27.09, + "reward": 0.4791999697685242, + "reward_std": 0.09729789346456527, + "rewards/MMContentORM/mean": 0.6054999947547912, + "rewards/MMContentORM/std": 0.6232686996459961, + "rewards/MMFormatORM/mean": 0.6174999952316285, + "rewards/MMFormatORM/std": 0.12999999523162842, + "rewards/MMRubricORM/mean": -0.05, + "rewards/MMRubricORM/std": 0.2, + "step": 1470, + "train_speed(iter/s)": 0.083415 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 327.4, + "completions/mean_length": 211.225, + "completions/min_length": 125.2, + "epoch": 0.7081132981277004, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.15498086810112, + "kl": 0.016680908203125, + "learning_rate": 7.679938333959709e-06, + "loss": 0.0006680141203105449, + "memory(GiB)": 27.09, + "reward": 0.5133999764919281, + "reward_std": 0.11455130055546761, + "rewards/MMContentORM/mean": 0.6910000026226044, + "rewards/MMContentORM/std": 0.5742799043655396, + "rewards/MMFormatORM/mean": 0.6174999952316285, + "rewards/MMFormatORM/std": 0.12999999523162842, + "rewards/MMRubricORM/mean": -0.05, + "rewards/MMRubricORM/std": 0.2, + "step": 1475, + "train_speed(iter/s)": 0.083418 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 285.6, + "completions/mean_length": 209.6875, + "completions/min_length": 144.0, + "epoch": 0.7105136821891502, + "frac_reward_zero_std": 0.775, + "grad_norm": 0.16933397948741913, + "kl": 0.018133544921875, + "learning_rate": 7.663160795373221e-06, + "loss": 0.0007249978370964527, + "memory(GiB)": 27.09, + "reward": 0.5215499997138977, + "reward_std": 0.06682158990297467, + "rewards/MMContentORM/mean": 0.697000014781952, + "rewards/MMContentORM/std": 0.49735930785536764, + "rewards/MMFormatORM/mean": 0.6256249904632568, + "rewards/MMFormatORM/std": 0.09749999642372131, + "rewards/MMRubricORM/mean": -0.0375, + "rewards/MMRubricORM/std": 0.15, + "step": 1480, + "train_speed(iter/s)": 0.083447 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 351.2, + "completions/mean_length": 214.2625, + "completions/min_length": 126.0, + "epoch": 0.7129140662506, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.37113457918167114, + "kl": 0.031427001953125, + "learning_rate": 7.64634129012038e-06, + "loss": 0.0012564392760396003, + "memory(GiB)": 27.09, + "reward": 0.4060499906539917, + "reward_std": 0.15563419908285142, + "rewards/MMContentORM/mean": 0.4945000231266022, + "rewards/MMContentORM/std": 0.71776123046875, + "rewards/MMFormatORM/mean": 0.5768749713897705, + "rewards/MMFormatORM/std": 0.1856150358915329, + "rewards/MMRubricORM/mean": -0.1125, + "rewards/MMRubricORM/std": 0.285561603307724, + "step": 1485, + "train_speed(iter/s)": 0.083439 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 321.2, + "completions/mean_length": 209.775, + "completions/min_length": 127.6, + "epoch": 0.7153144503120499, + "frac_reward_zero_std": 0.675, + "grad_norm": 0.16112570464611053, + "kl": 0.01470947265625, + "learning_rate": 7.629480083246607e-06, + "loss": 0.0005889590363949537, + "memory(GiB)": 27.09, + "reward": 0.4991499841213226, + "reward_std": 0.07926666894927621, + "rewards/MMContentORM/mean": 0.6409999907016755, + "rewards/MMContentORM/std": 0.5213750995695591, + "rewards/MMFormatORM/mean": 0.6256249904632568, + "rewards/MMFormatORM/std": 0.09749999642372131, + "rewards/MMRubricORM/mean": -0.0375, + "rewards/MMRubricORM/std": 0.15, + "step": 1490, + "train_speed(iter/s)": 0.083445 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 350.0, + "completions/mean_length": 211.7375, + "completions/min_length": 126.8, + "epoch": 0.7177148343734998, + "frac_reward_zero_std": 0.775, + "grad_norm": 0.47146689891815186, + "kl": 0.03209228515625, + "learning_rate": 7.61257744045446e-06, + "loss": 0.0012816525064408778, + "memory(GiB)": 27.09, + "reward": 0.4565499722957611, + "reward_std": 0.04914391748607159, + "rewards/MMContentORM/mean": 0.5345000147819519, + "rewards/MMContentORM/std": 0.6424328684806824, + "rewards/MMFormatORM/mean": 0.6256249785423279, + "rewards/MMFormatORM/std": 0.07690345346927643, + "rewards/MMRubricORM/mean": -0.0375, + "rewards/MMRubricORM/std": 0.11831300854682922, + "step": 1495, + "train_speed(iter/s)": 0.083444 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 315.0, + "completions/mean_length": 205.65, + "completions/min_length": 124.8, + "epoch": 0.7201152184349496, + "frac_reward_zero_std": 0.7, + "grad_norm": 0.1686468869447708, + "kl": 0.016204833984375, + "learning_rate": 7.595633628099459e-06, + "loss": 0.0006481107324361801, + "memory(GiB)": 27.09, + "reward": 0.44354997873306273, + "reward_std": 0.16298812627792358, + "rewards/MMContentORM/mean": 0.5594999849796295, + "rewards/MMContentORM/std": 0.6292815625667572, + "rewards/MMFormatORM/mean": 0.5931249856948853, + "rewards/MMFormatORM/std": 0.16180812418460847, + "rewards/MMRubricORM/mean": -0.0875, + "rewards/MMRubricORM/std": 0.24893558621406556, + "step": 1500, + "train_speed(iter/s)": 0.083468 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0125, + "completions/max_length": 491.6, + "completions/mean_length": 228.2625, + "completions/min_length": 140.2, + "epoch": 0.7225156024963995, + "frac_reward_zero_std": 0.675, + "grad_norm": 0.16463321447372437, + "kl": 0.014453125, + "learning_rate": 7.578648913185877e-06, + "loss": 0.0005780975334346294, + "memory(GiB)": 27.09, + "reward": 0.4644499897956848, + "reward_std": 0.1550685167312622, + "rewards/MMContentORM/mean": 0.640500009059906, + "rewards/MMContentORM/std": 0.6791411757469177, + "rewards/MMFormatORM/mean": 0.5768749713897705, + "rewards/MMFormatORM/std": 0.2101138174533844, + "rewards/MMRubricORM/mean": -0.1125, + "rewards/MMRubricORM/std": 0.3232520341873169, + "step": 1505, + "train_speed(iter/s)": 0.083311 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 272.6, + "completions/mean_length": 198.4125, + "completions/min_length": 133.2, + "epoch": 0.7249159865578493, + "frac_reward_zero_std": 0.7, + "grad_norm": 0.10300061106681824, + "kl": 0.01964111328125, + "learning_rate": 7.561623563362541e-06, + "loss": 0.0007859501987695694, + "memory(GiB)": 27.09, + "reward": 0.54544997215271, + "reward_std": 0.05904341547284275, + "rewards/MMContentORM/mean": 0.728000009059906, + "rewards/MMContentORM/std": 0.4972465097904205, + "rewards/MMFormatORM/mean": 0.6418749809265136, + "rewards/MMFormatORM/std": 0.032499998807907104, + "rewards/MMRubricORM/mean": -0.0125, + "rewards/MMRubricORM/std": 0.05, + "step": 1510, + "train_speed(iter/s)": 0.083355 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 344.4, + "completions/mean_length": 214.025, + "completions/min_length": 122.0, + "epoch": 0.7273163706192991, + "frac_reward_zero_std": 0.725, + "grad_norm": 0.004715959541499615, + "kl": 0.0142822265625, + "learning_rate": 7.5445578469186135e-06, + "loss": 0.0005710616242140532, + "memory(GiB)": 27.09, + "reward": 0.4336499750614166, + "reward_std": 0.10401540845632554, + "rewards/MMContentORM/mean": 0.5060000061988831, + "rewards/MMContentORM/std": 0.6872617721557617, + "rewards/MMFormatORM/mean": 0.6093749880790711, + "rewards/MMFormatORM/std": 0.14190345108509064, + "rewards/MMRubricORM/mean": -0.0625, + "rewards/MMRubricORM/std": 0.21831300854682922, + "step": 1515, + "train_speed(iter/s)": 0.083352 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0125, + "completions/max_length": 456.6, + "completions/mean_length": 219.925, + "completions/min_length": 135.4, + "epoch": 0.7297167546807489, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.1058562770485878, + "kl": 0.0159423828125, + "learning_rate": 7.527452032779361e-06, + "loss": 0.0006374444346874952, + "memory(GiB)": 27.09, + "reward": 0.5062999784946441, + "reward_std": 0.12628927137702703, + "rewards/MMContentORM/mean": 0.7019999921321869, + "rewards/MMContentORM/std": 0.45146496072411535, + "rewards/MMFormatORM/mean": 0.6012499809265137, + "rewards/MMFormatORM/std": 0.12930812537670136, + "rewards/MMRubricORM/mean": -0.075, + "rewards/MMRubricORM/std": 0.19893558621406554, + "step": 1520, + "train_speed(iter/s)": 0.083288 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0125, + "completions/max_length": 456.0, + "completions/mean_length": 212.125, + "completions/min_length": 101.8, + "epoch": 0.7321171387421987, + "frac_reward_zero_std": 0.725, + "grad_norm": 0.1445714682340622, + "kl": 0.021337890625, + "learning_rate": 7.510306390501919e-06, + "loss": 0.000853828527033329, + "memory(GiB)": 27.09, + "reward": 0.4123999834060669, + "reward_std": 0.15980613380670547, + "rewards/MMContentORM/mean": 0.5535000085830688, + "rewards/MMContentORM/std": 0.7459115505218505, + "rewards/MMFormatORM/mean": 0.5524999856948852, + "rewards/MMFormatORM/std": 0.23411746919155121, + "rewards/MMRubricORM/mean": -0.15, + "rewards/MMRubricORM/std": 0.3601807415485382, + "step": 1525, + "train_speed(iter/s)": 0.083215 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 326.2, + "completions/mean_length": 202.3625, + "completions/min_length": 116.6, + "epoch": 0.7345175228036486, + "frac_reward_zero_std": 0.7, + "grad_norm": 0.20122231543064117, + "kl": 0.0188720703125, + "learning_rate": 7.493121190271044e-06, + "loss": 0.000754462881013751, + "memory(GiB)": 27.09, + "reward": 0.4821499824523926, + "reward_std": 0.1300369380041957, + "rewards/MMContentORM/mean": 0.6559999942779541, + "rewards/MMContentORM/std": 0.6435462713241578, + "rewards/MMFormatORM/mean": 0.5931249737739563, + "rewards/MMFormatORM/std": 0.16571036279201506, + "rewards/MMRubricORM/mean": -0.0875, + "rewards/MMRubricORM/std": 0.25493902564048765, + "step": 1530, + "train_speed(iter/s)": 0.083228 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 313.2, + "completions/mean_length": 214.9875, + "completions/min_length": 138.0, + "epoch": 0.7369179068650984, + "frac_reward_zero_std": 0.55, + "grad_norm": 0.22068944573402405, + "kl": 0.03699951171875, + "learning_rate": 7.475896702894854e-06, + "loss": 0.0014766624197363853, + "memory(GiB)": 27.09, + "reward": 0.4285499930381775, + "reward_std": 0.1915552258491516, + "rewards/MMContentORM/mean": 0.5220000118017196, + "rewards/MMContentORM/std": 0.6519321262836456, + "rewards/MMFormatORM/mean": 0.5931249737739563, + "rewards/MMFormatORM/std": 0.16571036279201506, + "rewards/MMRubricORM/mean": -0.0875, + "rewards/MMRubricORM/std": 0.25493902564048765, + "step": 1535, + "train_speed(iter/s)": 0.083241 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 304.8, + "completions/mean_length": 199.75, + "completions/min_length": 128.6, + "epoch": 0.7393182909265482, + "frac_reward_zero_std": 0.775, + "grad_norm": 0.0786973237991333, + "kl": 0.015472412109375, + "learning_rate": 7.458633199800562e-06, + "loss": 0.000618355255573988, + "memory(GiB)": 27.09, + "reward": 0.48274998664855956, + "reward_std": 0.06569022093899549, + "rewards/MMContentORM/mean": 0.6000000178813935, + "rewards/MMContentORM/std": 0.5159270875155926, + "rewards/MMFormatORM/mean": 0.6256249785423279, + "rewards/MMFormatORM/std": 0.07690345346927643, + "rewards/MMRubricORM/mean": -0.0375, + "rewards/MMRubricORM/std": 0.11831300854682922, + "step": 1540, + "train_speed(iter/s)": 0.083273 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 340.2, + "completions/mean_length": 209.1, + "completions/min_length": 136.6, + "epoch": 0.741718674987998, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.16483508050441742, + "kl": 0.019476318359375, + "learning_rate": 7.4413309530302e-06, + "loss": 0.0007791164331138134, + "memory(GiB)": 27.09, + "reward": 0.4971999883651733, + "reward_std": 0.17324115931987763, + "rewards/MMContentORM/mean": 0.6955000162124634, + "rewards/MMContentORM/std": 0.601590758562088, + "rewards/MMFormatORM/mean": 0.5912499845027923, + "rewards/MMFormatORM/std": 0.15600111782550813, + "rewards/MMRubricORM/mean": -0.0875, + "rewards/MMRubricORM/std": 0.23944272398948668, + "step": 1545, + "train_speed(iter/s)": 0.083282 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 321.8, + "completions/mean_length": 209.175, + "completions/min_length": 144.8, + "epoch": 0.7441190590494479, + "frac_reward_zero_std": 0.725, + "grad_norm": 0.11053567379713058, + "kl": 0.01842041015625, + "learning_rate": 7.423990235236331e-06, + "loss": 0.0007370706647634506, + "memory(GiB)": 27.09, + "reward": 0.48004999160766604, + "reward_std": 0.13569379299879075, + "rewards/MMContentORM/mean": 0.622000002861023, + "rewards/MMContentORM/std": 0.6143165111541748, + "rewards/MMFormatORM/mean": 0.6093749880790711, + "rewards/MMFormatORM/std": 0.14190345108509064, + "rewards/MMRubricORM/mean": -0.0625, + "rewards/MMRubricORM/std": 0.21831300854682922, + "step": 1550, + "train_speed(iter/s)": 0.083297 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 308.8, + "completions/mean_length": 203.4375, + "completions/min_length": 141.0, + "epoch": 0.7465194431108978, + "frac_reward_zero_std": 0.6, + "grad_norm": 0.10629149526357651, + "kl": 0.0184814453125, + "learning_rate": 7.406611319677756e-06, + "loss": 0.000739166047424078, + "memory(GiB)": 27.09, + "reward": 0.4108499825000763, + "reward_std": 0.200747612118721, + "rewards/MMContentORM/mean": 0.5065000057220459, + "rewards/MMContentORM/std": 0.7548076272010803, + "rewards/MMFormatORM/mean": 0.5768749833106994, + "rewards/MMFormatORM/std": 0.2062115788459778, + "rewards/MMRubricORM/mean": -0.1125, + "rewards/MMRubricORM/std": 0.3172485947608948, + "step": 1555, + "train_speed(iter/s)": 0.083321 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 261.2, + "completions/mean_length": 197.025, + "completions/min_length": 133.8, + "epoch": 0.7489198271723476, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.1347576528787613, + "kl": 0.01641845703125, + "learning_rate": 7.389194480215198e-06, + "loss": 0.0006570426747202873, + "memory(GiB)": 27.09, + "reward": 0.44274998307228086, + "reward_std": 0.11108647137880326, + "rewards/MMContentORM/mean": 0.557500010728836, + "rewards/MMContentORM/std": 0.6644225358963013, + "rewards/MMFormatORM/mean": 0.5931249737739563, + "rewards/MMFormatORM/std": 0.16571036279201506, + "rewards/MMRubricORM/mean": -0.0875, + "rewards/MMRubricORM/std": 0.25493902564048765, + "step": 1560, + "train_speed(iter/s)": 0.083368 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 285.2, + "completions/mean_length": 200.275, + "completions/min_length": 116.2, + "epoch": 0.7513202112337974, + "frac_reward_zero_std": 0.8, + "grad_norm": 0.12396983802318573, + "kl": 0.0153076171875, + "learning_rate": 7.3717399913069995e-06, + "loss": 0.0006124400533735752, + "memory(GiB)": 27.09, + "reward": 0.5360499680042267, + "reward_std": 0.03330472994130105, + "rewards/MMContentORM/mean": 0.7045000195503235, + "rewards/MMContentORM/std": 0.44892730191349983, + "rewards/MMFormatORM/mean": 0.6418749809265136, + "rewards/MMFormatORM/std": 0.032499998807907104, + "rewards/MMRubricORM/mean": -0.0125, + "rewards/MMRubricORM/std": 0.05, + "step": 1565, + "train_speed(iter/s)": 0.083404 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0125, + "completions/max_length": 431.0, + "completions/mean_length": 225.5, + "completions/min_length": 150.2, + "epoch": 0.7537205952952473, + "frac_reward_zero_std": 0.7, + "grad_norm": 0.1431017369031906, + "kl": 0.017681884765625, + "learning_rate": 7.354248128004788e-06, + "loss": 0.000707083148881793, + "memory(GiB)": 27.09, + "reward": 0.45769999623298646, + "reward_std": 0.09899494783021509, + "rewards/MMContentORM/mean": 0.5805000126361847, + "rewards/MMContentORM/std": 0.6738178968429566, + "rewards/MMFormatORM/mean": 0.6012499809265137, + "rewards/MMFormatORM/std": 0.12930812537670136, + "rewards/MMRubricORM/mean": -0.075, + "rewards/MMRubricORM/std": 0.19893558621406554, + "step": 1570, + "train_speed(iter/s)": 0.083352 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 306.4, + "completions/mean_length": 211.7625, + "completions/min_length": 108.4, + "epoch": 0.7561209793566971, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.11345090717077255, + "kl": 0.015887451171875, + "learning_rate": 7.336719165949144e-06, + "loss": 0.0006354267243295908, + "memory(GiB)": 27.09, + "reward": 0.444299989938736, + "reward_std": 0.14212846513837576, + "rewards/MMContentORM/mean": 0.5470000028610229, + "rewards/MMContentORM/std": 0.6608891606330871, + "rewards/MMFormatORM/mean": 0.6012499809265137, + "rewards/MMFormatORM/std": 0.15380690693855287, + "rewards/MMRubricORM/mean": -0.075, + "rewards/MMRubricORM/std": 0.23662601709365844, + "step": 1575, + "train_speed(iter/s)": 0.083381 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 331.6, + "completions/mean_length": 209.275, + "completions/min_length": 143.4, + "epoch": 0.7585213634181469, + "frac_reward_zero_std": 0.7, + "grad_norm": 0.00587793905287981, + "kl": 0.0219482421875, + "learning_rate": 7.319153381365261e-06, + "loss": 0.0008785548619925976, + "memory(GiB)": 27.09, + "reward": 0.5002999901771545, + "reward_std": 0.13307749554514886, + "rewards/MMContentORM/mean": 0.6869999945163727, + "rewards/MMContentORM/std": 0.5800727725028991, + "rewards/MMFormatORM/mean": 0.6012499809265137, + "rewards/MMFormatORM/std": 0.12930812537670136, + "rewards/MMRubricORM/mean": -0.075, + "rewards/MMRubricORM/std": 0.19893558621406554, + "step": 1580, + "train_speed(iter/s)": 0.083389 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 288.6, + "completions/mean_length": 203.225, + "completions/min_length": 140.6, + "epoch": 0.7609217474795967, + "frac_reward_zero_std": 0.725, + "grad_norm": 0.20501793920993805, + "kl": 0.02083740234375, + "learning_rate": 7.301551051058586e-06, + "loss": 0.0008345272392034531, + "memory(GiB)": 27.09, + "reward": 0.4959499776363373, + "reward_std": 0.06682158932089806, + "rewards/MMContentORM/mean": 0.6330000102519989, + "rewards/MMContentORM/std": 0.5427005112171173, + "rewards/MMFormatORM/mean": 0.6256249785423279, + "rewards/MMFormatORM/std": 0.07690345346927643, + "rewards/MMRubricORM/mean": -0.0375, + "rewards/MMRubricORM/std": 0.11831300854682922, + "step": 1585, + "train_speed(iter/s)": 0.083423 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 343.6, + "completions/mean_length": 212.8375, + "completions/min_length": 142.0, + "epoch": 0.7633221315410466, + "frac_reward_zero_std": 0.7, + "grad_norm": 0.06863018125295639, + "kl": 0.0158203125, + "learning_rate": 7.283912452410468e-06, + "loss": 0.0006327041424810887, + "memory(GiB)": 27.09, + "reward": 0.43779999017715454, + "reward_std": 0.10097484942525625, + "rewards/MMContentORM/mean": 0.5019999980926514, + "rewards/MMContentORM/std": 0.6665767431259155, + "rewards/MMFormatORM/mean": 0.6174999952316285, + "rewards/MMFormatORM/std": 0.12999999523162842, + "rewards/MMRubricORM/mean": -0.05, + "rewards/MMRubricORM/std": 0.2, + "step": 1590, + "train_speed(iter/s)": 0.08342 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 319.0, + "completions/mean_length": 209.3875, + "completions/min_length": 140.2, + "epoch": 0.7657225156024964, + "frac_reward_zero_std": 0.675, + "grad_norm": 0.11942637711763382, + "kl": 0.014971923828125, + "learning_rate": 7.266237863373772e-06, + "loss": 0.0005985536612570286, + "memory(GiB)": 27.09, + "reward": 0.48544998168945314, + "reward_std": 0.10486393286846578, + "rewards/MMContentORM/mean": 0.6105000019073487, + "rewards/MMContentORM/std": 0.5260161735117436, + "rewards/MMFormatORM/mean": 0.6218749761581421, + "rewards/MMFormatORM/std": 0.09190345257520675, + "rewards/MMRubricORM/mean": -0.0375, + "rewards/MMRubricORM/std": 0.11831300854682922, + "step": 1595, + "train_speed(iter/s)": 0.083437 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 289.2, + "completions/mean_length": 201.3875, + "completions/min_length": 132.4, + "epoch": 0.7681228996639462, + "frac_reward_zero_std": 0.8, + "grad_norm": 0.07546590268611908, + "kl": 0.01690673828125, + "learning_rate": 7.248527562468513e-06, + "loss": 0.0006768060848116875, + "memory(GiB)": 27.09, + "reward": 0.4923499941825867, + "reward_std": 0.07417550361715257, + "rewards/MMContentORM/mean": 0.6239999949932098, + "rewards/MMContentORM/std": 0.5938864171504974, + "rewards/MMFormatORM/mean": 0.6256249904632568, + "rewards/MMFormatORM/std": 0.09749999642372131, + "rewards/MMRubricORM/mean": -0.0375, + "rewards/MMRubricORM/std": 0.15, + "step": 1600, + "train_speed(iter/s)": 0.083478 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 291.8, + "completions/mean_length": 197.5375, + "completions/min_length": 132.2, + "epoch": 0.770523283725396, + "frac_reward_zero_std": 0.775, + "grad_norm": 0.007782844360917807, + "kl": 0.015924072265625, + "learning_rate": 7.230781828777462e-06, + "loss": 0.0006374673917889595, + "memory(GiB)": 27.09, + "reward": 0.5460999727249145, + "reward_std": 0.03719381578266621, + "rewards/MMContentORM/mean": 0.7440000057220459, + "rewards/MMContentORM/std": 0.44936863109469416, + "rewards/MMFormatORM/mean": 0.6337499856948853, + "rewards/MMFormatORM/std": 0.06499999761581421, + "rewards/MMRubricORM/mean": -0.025, + "rewards/MMRubricORM/std": 0.1, + "step": 1605, + "train_speed(iter/s)": 0.083452 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.025, + "completions/max_length": 578.0, + "completions/mean_length": 221.8625, + "completions/min_length": 121.6, + "epoch": 0.7729236677868458, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.005166975781321526, + "kl": 0.016876220703125, + "learning_rate": 7.213000941941743e-06, + "loss": 0.0006743951700627804, + "memory(GiB)": 27.09, + "reward": 0.4998499691486359, + "reward_std": 0.10373256290331483, + "rewards/MMContentORM/mean": 0.6714999973773956, + "rewards/MMContentORM/std": 0.600242418050766, + "rewards/MMFormatORM/mean": 0.6093749761581421, + "rewards/MMFormatORM/std": 0.12130690813064575, + "rewards/MMRubricORM/mean": -0.0625, + "rewards/MMRubricORM/std": 0.18662601709365845, + "step": 1610, + "train_speed(iter/s)": 0.08332 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 315.0, + "completions/mean_length": 209.65, + "completions/min_length": 140.8, + "epoch": 0.7753240518482958, + "frac_reward_zero_std": 0.675, + "grad_norm": 0.07150600850582123, + "kl": 0.01649169921875, + "learning_rate": 7.195185182156437e-06, + "loss": 0.0006602241192013025, + "memory(GiB)": 27.09, + "reward": 0.47224998474121094, + "reward_std": 0.1037325656041503, + "rewards/MMContentORM/mean": 0.6025000065565109, + "rewards/MMContentORM/std": 0.5951342463493348, + "rewards/MMFormatORM/mean": 0.6093749880790711, + "rewards/MMFormatORM/std": 0.11740466952323914, + "rewards/MMRubricORM/mean": -0.0625, + "rewards/MMRubricORM/std": 0.18062257766723633, + "step": 1615, + "train_speed(iter/s)": 0.083336 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/mean_length": 205.8625, + "completions/min_length": 139.0, + "epoch": 0.7777244359097456, + "frac_reward_zero_std": 0.675, + "grad_norm": 0.15874557197093964, + "kl": 0.0163330078125, + "learning_rate": 7.177334830166151e-06, + "loss": 0.0006535663735121489, + "memory(GiB)": 27.09, + "reward": 0.4278499722480774, + "reward_std": 0.11278353529050947, + "rewards/MMContentORM/mean": 0.4915000081062317, + "rewards/MMContentORM/std": 0.6723409533500672, + "rewards/MMFormatORM/mean": 0.6093749761581421, + "rewards/MMFormatORM/std": 0.12130690813064575, + "rewards/MMRubricORM/mean": -0.0625, + "rewards/MMRubricORM/std": 0.18662601709365845, + "step": 1620, + "train_speed(iter/s)": 0.083371 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 348.2, + "completions/mean_length": 205.5875, + "completions/min_length": 109.4, + "epoch": 0.7801248199711954, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.03448270633816719, + "kl": 0.03670654296875, + "learning_rate": 7.159450167260613e-06, + "loss": 0.0014746349304914474, + "memory(GiB)": 27.09, + "reward": 0.4049499869346619, + "reward_std": 0.1109450563788414, + "rewards/MMContentORM/mean": 0.4629999935626984, + "rewards/MMContentORM/std": 0.6945199608802796, + "rewards/MMFormatORM/mean": 0.5931249856948853, + "rewards/MMFormatORM/std": 0.12723276019096375, + "rewards/MMRubricORM/mean": -0.0875, + "rewards/MMRubricORM/std": 0.1957427144050598, + "step": 1625, + "train_speed(iter/s)": 0.083357 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 313.0, + "completions/mean_length": 200.1125, + "completions/min_length": 129.6, + "epoch": 0.7825252040326452, + "frac_reward_zero_std": 0.775, + "grad_norm": 0.007031524088233709, + "kl": 0.0147705078125, + "learning_rate": 7.141531475270227e-06, + "loss": 0.0005904654040932656, + "memory(GiB)": 27.09, + "reward": 0.45404996871948244, + "reward_std": 0.13682516813278198, + "rewards/MMContentORM/mean": 0.5569999933242797, + "rewards/MMContentORM/std": 0.6599100232124329, + "rewards/MMFormatORM/mean": 0.6093749880790711, + "rewards/MMFormatORM/std": 0.14190345108509064, + "rewards/MMRubricORM/mean": -0.0625, + "rewards/MMRubricORM/std": 0.21831300854682922, + "step": 1630, + "train_speed(iter/s)": 0.08337 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 299.0, + "completions/mean_length": 199.3, + "completions/min_length": 117.0, + "epoch": 0.7849255880940951, + "frac_reward_zero_std": 0.7, + "grad_norm": 0.28181007504463196, + "kl": 0.027081298828125, + "learning_rate": 7.123579036561634e-06, + "loss": 0.0010821642354130745, + "memory(GiB)": 27.09, + "reward": 0.4140499770641327, + "reward_std": 0.17317044883966445, + "rewards/MMContentORM/mean": 0.5144999921321869, + "rewards/MMContentORM/std": 0.6127165146172047, + "rewards/MMFormatORM/mean": 0.5768749713897705, + "rewards/MMFormatORM/std": 0.1856150358915329, + "rewards/MMRubricORM/mean": -0.1125, + "rewards/MMRubricORM/std": 0.285561603307724, + "step": 1635, + "train_speed(iter/s)": 0.083396 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 412.6, + "completions/mean_length": 211.725, + "completions/min_length": 125.8, + "epoch": 0.7873259721555449, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.07750914245843887, + "kl": 0.03267822265625, + "learning_rate": 7.1055931340332605e-06, + "loss": 0.0013033310882747174, + "memory(GiB)": 27.09, + "reward": 0.40509998202323916, + "reward_std": 0.1790394376264885, + "rewards/MMContentORM/mean": 0.5065000236034394, + "rewards/MMContentORM/std": 0.7263549327850342, + "rewards/MMFormatORM/mean": 0.568749976158142, + "rewards/MMFormatORM/std": 0.1590408891439438, + "rewards/MMRubricORM/mean": -0.125, + "rewards/MMRubricORM/std": 0.24467830061912538, + "step": 1640, + "train_speed(iter/s)": 0.083359 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 324.0, + "completions/mean_length": 206.0625, + "completions/min_length": 132.2, + "epoch": 0.7897263562169947, + "frac_reward_zero_std": 0.65, + "grad_norm": 0.21835249662399292, + "kl": 0.015789794921875, + "learning_rate": 7.0875740511108695e-06, + "loss": 0.0006318403407931328, + "memory(GiB)": 27.09, + "reward": 0.39799998998641967, + "reward_std": 0.1074802316725254, + "rewards/MMContentORM/mean": 0.4025000035762787, + "rewards/MMContentORM/std": 0.6922868967056275, + "rewards/MMFormatORM/mean": 0.6174999833106994, + "rewards/MMFormatORM/std": 0.10940345227718354, + "rewards/MMRubricORM/mean": -0.05, + "rewards/MMRubricORM/std": 0.16831300854682923, + "step": 1645, + "train_speed(iter/s)": 0.083353 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 291.2, + "completions/mean_length": 210.0875, + "completions/min_length": 118.4, + "epoch": 0.7921267402784445, + "frac_reward_zero_std": 0.65, + "grad_norm": 0.18537920713424683, + "kl": 0.01591796875, + "learning_rate": 7.06952207174308e-06, + "loss": 0.0006365090608596802, + "memory(GiB)": 27.09, + "reward": 0.47984997630119325, + "reward_std": 0.14333054379094393, + "rewards/MMContentORM/mean": 0.6215000152587891, + "rewards/MMContentORM/std": 0.5664967365562916, + "rewards/MMFormatORM/mean": 0.6093749761581421, + "rewards/MMFormatORM/std": 0.12130690813064575, + "rewards/MMRubricORM/mean": -0.0625, + "rewards/MMRubricORM/std": 0.18662601709365845, + "step": 1650, + "train_speed(iter/s)": 0.083384 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 365.2, + "completions/mean_length": 217.1375, + "completions/min_length": 146.6, + "epoch": 0.7945271243398944, + "frac_reward_zero_std": 0.65, + "grad_norm": 0.15769225358963013, + "kl": 0.020892333984375, + "learning_rate": 7.051437480396907e-06, + "loss": 0.0008358799852430821, + "memory(GiB)": 27.09, + "reward": 0.4181499779224396, + "reward_std": 0.16383664608001708, + "rewards/MMContentORM/mean": 0.4960000038146973, + "rewards/MMContentORM/std": 0.7032395720481872, + "rewards/MMFormatORM/mean": 0.5931249856948853, + "rewards/MMFormatORM/std": 0.18630690574645997, + "rewards/MMRubricORM/mean": -0.0875, + "rewards/MMRubricORM/std": 0.2866260170936584, + "step": 1655, + "train_speed(iter/s)": 0.083365 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 329.6, + "completions/mean_length": 208.675, + "completions/min_length": 115.8, + "epoch": 0.7969275084013442, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.18478098511695862, + "kl": 0.014483642578125, + "learning_rate": 7.03332056205327e-06, + "loss": 0.0005793534219264984, + "memory(GiB)": 27.09, + "reward": 0.4719999849796295, + "reward_std": 0.09107535546645522, + "rewards/MMContentORM/mean": 0.5875000059604645, + "rewards/MMContentORM/std": 0.6219356417655945, + "rewards/MMFormatORM/mean": 0.6174999833106994, + "rewards/MMFormatORM/std": 0.10940345227718354, + "rewards/MMRubricORM/mean": -0.05, + "rewards/MMRubricORM/std": 0.16831300854682923, + "step": 1660, + "train_speed(iter/s)": 0.083375 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0125, + "completions/max_length": 460.6, + "completions/mean_length": 215.6625, + "completions/min_length": 116.6, + "epoch": 0.799327892462794, + "frac_reward_zero_std": 0.65, + "grad_norm": 0.006300389766693115, + "kl": 0.0181884765625, + "learning_rate": 7.015171602202502e-06, + "loss": 0.0007266091182827949, + "memory(GiB)": 27.09, + "reward": 0.4193999707698822, + "reward_std": 0.1954443134367466, + "rewards/MMContentORM/mean": 0.5135000150650739, + "rewards/MMContentORM/std": 0.5563328020274639, + "rewards/MMFormatORM/mean": 0.5849999845027923, + "rewards/MMFormatORM/std": 0.1430424392223358, + "rewards/MMRubricORM/mean": -0.1, + "rewards/MMRubricORM/std": 0.22006530165672303, + "step": 1665, + "train_speed(iter/s)": 0.083317 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 293.2, + "completions/mean_length": 203.8, + "completions/min_length": 130.6, + "epoch": 0.8017282765242438, + "frac_reward_zero_std": 0.55, + "grad_norm": 0.11067415028810501, + "kl": 0.01673583984375, + "learning_rate": 6.996990886839856e-06, + "loss": 0.0006691563874483108, + "memory(GiB)": 27.09, + "reward": 0.42014997601509096, + "reward_std": 0.14545186161994933, + "rewards/MMContentORM/mean": 0.546000012755394, + "rewards/MMContentORM/std": 0.7207041382789612, + "rewards/MMFormatORM/mean": 0.5668749928474426, + "rewards/MMFormatORM/std": 0.2141141563653946, + "rewards/MMRubricORM/mean": -0.125, + "rewards/MMRubricORM/std": 0.3295581638813019, + "step": 1670, + "train_speed(iter/s)": 0.083354 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 308.6, + "completions/mean_length": 203.7, + "completions/min_length": 150.8, + "epoch": 0.8041286605856938, + "frac_reward_zero_std": 0.6, + "grad_norm": 0.10222038626670837, + "kl": 0.014013671875, + "learning_rate": 6.978778702460994e-06, + "loss": 0.0005606257822364568, + "memory(GiB)": 27.09, + "reward": 0.3979499816894531, + "reward_std": 0.12621856052428485, + "rewards/MMContentORM/mean": 0.38800000548362734, + "rewards/MMContentORM/std": 0.6839123487472534, + "rewards/MMFormatORM/mean": 0.6256249904632568, + "rewards/MMFormatORM/std": 0.09749999642372131, + "rewards/MMRubricORM/mean": -0.0375, + "rewards/MMRubricORM/std": 0.15, + "step": 1675, + "train_speed(iter/s)": 0.083378 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 277.8, + "completions/mean_length": 196.65, + "completions/min_length": 111.2, + "epoch": 0.8065290446471436, + "frac_reward_zero_std": 0.6, + "grad_norm": 0.15431593358516693, + "kl": 0.017578125, + "learning_rate": 6.9605353360574745e-06, + "loss": 0.0007030891254544258, + "memory(GiB)": 27.09, + "reward": 0.4542999804019928, + "reward_std": 0.09135819002985954, + "rewards/MMContentORM/mean": 0.5720000088214874, + "rewards/MMContentORM/std": 0.6552067339420319, + "rewards/MMFormatORM/mean": 0.6012499809265137, + "rewards/MMFormatORM/std": 0.15380690693855287, + "rewards/MMRubricORM/mean": -0.075, + "rewards/MMRubricORM/std": 0.23662601709365844, + "step": 1680, + "train_speed(iter/s)": 0.083423 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 365.0, + "completions/mean_length": 207.6375, + "completions/min_length": 120.4, + "epoch": 0.8089294287085934, + "frac_reward_zero_std": 0.725, + "grad_norm": 0.19521333277225494, + "kl": 0.018505859375, + "learning_rate": 6.9422610751122276e-06, + "loss": 0.0007405009120702744, + "memory(GiB)": 27.09, + "reward": 0.4330499887466431, + "reward_std": 0.10316687764134258, + "rewards/MMContentORM/mean": 0.504500013589859, + "rewards/MMContentORM/std": 0.6536614775657654, + "rewards/MMFormatORM/mean": 0.6093749761581421, + "rewards/MMFormatORM/std": 0.12130690813064575, + "rewards/MMRubricORM/mean": -0.0625, + "rewards/MMRubricORM/std": 0.18662601709365845, + "step": 1685, + "train_speed(iter/s)": 0.083404 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 263.2, + "completions/mean_length": 188.7375, + "completions/min_length": 112.4, + "epoch": 0.8113298127700432, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.12136948853731155, + "kl": 0.01881103515625, + "learning_rate": 6.923956207595028e-06, + "loss": 0.000752145517617464, + "memory(GiB)": 27.09, + "reward": 0.47574997544288633, + "reward_std": 0.06710443496704102, + "rewards/MMContentORM/mean": 0.5825000107288361, + "rewards/MMContentORM/std": 0.5429187417030334, + "rewards/MMFormatORM/mean": 0.6256249904632568, + "rewards/MMFormatORM/std": 0.09749999642372131, + "rewards/MMRubricORM/mean": -0.0375, + "rewards/MMRubricORM/std": 0.15, + "step": 1690, + "train_speed(iter/s)": 0.083447 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 287.6, + "completions/mean_length": 198.775, + "completions/min_length": 138.2, + "epoch": 0.8137301968314931, + "frac_reward_zero_std": 0.775, + "grad_norm": 0.17351680994033813, + "kl": 0.02083740234375, + "learning_rate": 6.905621021957953e-06, + "loss": 0.0008334385231137276, + "memory(GiB)": 27.09, + "reward": 0.4406499922275543, + "reward_std": 0.07785245906561614, + "rewards/MMContentORM/mean": 0.5235000073909759, + "rewards/MMContentORM/std": 0.6393161118030548, + "rewards/MMFormatORM/mean": 0.6093749761581421, + "rewards/MMFormatORM/std": 0.09680812656879426, + "rewards/MMRubricORM/mean": -0.0625, + "rewards/MMRubricORM/std": 0.14893558621406555, + "step": 1695, + "train_speed(iter/s)": 0.083453 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 290.2, + "completions/mean_length": 199.2375, + "completions/min_length": 135.0, + "epoch": 0.8161305808929429, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.15735894441604614, + "kl": 0.015130615234375, + "learning_rate": 6.887255807130844e-06, + "loss": 0.00060483543202281, + "memory(GiB)": 27.09, + "reward": 0.479749983549118, + "reward_std": 0.07672108160331845, + "rewards/MMContentORM/mean": 0.5925000190734864, + "rewards/MMContentORM/std": 0.5261063687503338, + "rewards/MMFormatORM/mean": 0.6256249904632568, + "rewards/MMFormatORM/std": 0.09749999642372131, + "rewards/MMRubricORM/mean": -0.0375, + "rewards/MMRubricORM/std": 0.15, + "step": 1700, + "train_speed(iter/s)": 0.083481 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 297.8, + "completions/mean_length": 207.0, + "completions/min_length": 138.0, + "epoch": 0.8185309649543927, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.0804097130894661, + "kl": 0.0136962890625, + "learning_rate": 6.868860852516742e-06, + "loss": 0.0005484659224748612, + "memory(GiB)": 27.09, + "reward": 0.42814998626708983, + "reward_std": 0.09652007222175599, + "rewards/MMContentORM/mean": 0.4634999930858612, + "rewards/MMContentORM/std": 0.6531470894813538, + "rewards/MMFormatORM/mean": 0.6256249785423279, + "rewards/MMFormatORM/std": 0.07690345346927643, + "rewards/MMRubricORM/mean": -0.0375, + "rewards/MMRubricORM/std": 0.11831300854682922, + "step": 1705, + "train_speed(iter/s)": 0.083426 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 316.2, + "completions/mean_length": 209.5125, + "completions/min_length": 132.2, + "epoch": 0.8209313490158425, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.0899352878332138, + "kl": 0.017169189453125, + "learning_rate": 6.85043644798734e-06, + "loss": 0.0006862088106572628, + "memory(GiB)": 27.09, + "reward": 0.38989998400211334, + "reward_std": 0.15853333994746208, + "rewards/MMContentORM/mean": 0.41100001335144043, + "rewards/MMContentORM/std": 0.6764008283615113, + "rewards/MMFormatORM/mean": 0.6012499868869782, + "rewards/MMFormatORM/std": 0.12313776612281799, + "rewards/MMRubricORM/mean": -0.075, + "rewards/MMRubricORM/std": 0.1894427239894867, + "step": 1710, + "train_speed(iter/s)": 0.083445 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 347.2, + "completions/mean_length": 208.825, + "completions/min_length": 143.2, + "epoch": 0.8233317330772923, + "frac_reward_zero_std": 0.675, + "grad_norm": 0.18190357089042664, + "kl": 0.015631103515625, + "learning_rate": 6.831982883878406e-06, + "loss": 0.0006255049258470536, + "memory(GiB)": 27.09, + "reward": 0.4102999925613403, + "reward_std": 0.12628926811739802, + "rewards/MMContentORM/mean": 0.46200000643730166, + "rewards/MMContentORM/std": 0.688849925994873, + "rewards/MMFormatORM/mean": 0.6012499809265137, + "rewards/MMFormatORM/std": 0.15380690693855287, + "rewards/MMRubricORM/mean": -0.075, + "rewards/MMRubricORM/std": 0.23662601709365844, + "step": 1715, + "train_speed(iter/s)": 0.083444 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 272.4, + "completions/mean_length": 202.925, + "completions/min_length": 106.0, + "epoch": 0.8257321171387422, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.2049115151166916, + "kl": 0.015179443359375, + "learning_rate": 6.8135004509852135e-06, + "loss": 0.0006078362464904785, + "memory(GiB)": 27.09, + "reward": 0.4344499886035919, + "reward_std": 0.07756961137056351, + "rewards/MMContentORM/mean": 0.5080000042915345, + "rewards/MMContentORM/std": 0.6811013698577881, + "rewards/MMFormatORM/mean": 0.6093749880790711, + "rewards/MMFormatORM/std": 0.14190345108509064, + "rewards/MMRubricORM/mean": -0.0625, + "rewards/MMRubricORM/std": 0.21831300854682922, + "step": 1720, + "train_speed(iter/s)": 0.083479 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 299.4, + "completions/mean_length": 215.35, + "completions/min_length": 131.8, + "epoch": 0.828132501200192, + "frac_reward_zero_std": 0.65, + "grad_norm": 0.1609988808631897, + "kl": 0.01397705078125, + "learning_rate": 6.794989440557954e-06, + "loss": 0.0005596654955297709, + "memory(GiB)": 27.09, + "reward": 0.45239998698234557, + "reward_std": 0.08994398396462203, + "rewards/MMContentORM/mean": 0.5384999930858612, + "rewards/MMContentORM/std": 0.6075566828250885, + "rewards/MMFormatORM/mean": 0.6174999833106994, + "rewards/MMFormatORM/std": 0.10940345227718354, + "rewards/MMRubricORM/mean": -0.05, + "rewards/MMRubricORM/std": 0.16831300854682923, + "step": 1725, + "train_speed(iter/s)": 0.083487 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0125, + "completions/max_length": 453.0, + "completions/mean_length": 216.5125, + "completions/min_length": 127.8, + "epoch": 0.8305328852616418, + "frac_reward_zero_std": 0.725, + "grad_norm": 0.15101298689842224, + "kl": 0.016229248046875, + "learning_rate": 6.776450144297152e-06, + "loss": 0.0006488990969955921, + "memory(GiB)": 27.09, + "reward": 0.4963999569416046, + "reward_std": 0.10521748885512353, + "rewards/MMContentORM/mean": 0.6484999895095825, + "rewards/MMContentORM/std": 0.5765063345432282, + "rewards/MMFormatORM/mean": 0.6174999713897705, + "rewards/MMFormatORM/std": 0.08880690932273864, + "rewards/MMRubricORM/mean": -0.05, + "rewards/MMRubricORM/std": 0.13662601709365846, + "step": 1730, + "train_speed(iter/s)": 0.08343 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 328.2, + "completions/mean_length": 203.075, + "completions/min_length": 117.8, + "epoch": 0.8329332693230916, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.18309247493743896, + "kl": 0.03756103515625, + "learning_rate": 6.757882854349065e-06, + "loss": 0.0015039796009659768, + "memory(GiB)": 27.09, + "reward": 0.47954997420310974, + "reward_std": 0.07700393050909042, + "rewards/MMContentORM/mean": 0.5920000076293945, + "rewards/MMContentORM/std": 0.6195413947105408, + "rewards/MMFormatORM/mean": 0.6256249785423279, + "rewards/MMFormatORM/std": 0.07690345346927643, + "rewards/MMRubricORM/mean": -0.0375, + "rewards/MMRubricORM/std": 0.11831300854682922, + "step": 1735, + "train_speed(iter/s)": 0.083452 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 287.4, + "completions/mean_length": 209.7625, + "completions/min_length": 123.6, + "epoch": 0.8353336533845416, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.16256070137023926, + "kl": 0.0221435546875, + "learning_rate": 6.739287863301082e-06, + "loss": 0.0008845901116728783, + "memory(GiB)": 27.09, + "reward": 0.4170499801635742, + "reward_std": 0.1987677127122879, + "rewards/MMContentORM/mean": 0.5220000147819519, + "rewards/MMContentORM/std": 0.6289721466600895, + "rewards/MMFormatORM/mean": 0.576874977350235, + "rewards/MMFormatORM/std": 0.17944467663764954, + "rewards/MMRubricORM/mean": -0.1125, + "rewards/MMRubricORM/std": 0.27606874108314516, + "step": 1740, + "train_speed(iter/s)": 0.083476 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 297.2, + "completions/mean_length": 205.825, + "completions/min_length": 140.0, + "epoch": 0.8377340374459914, + "frac_reward_zero_std": 0.65, + "grad_norm": 0.152576744556427, + "kl": 0.0164794921875, + "learning_rate": 6.720665464177109e-06, + "loss": 0.0006592854391783476, + "memory(GiB)": 27.09, + "reward": 0.4217999815940857, + "reward_std": 0.1360473409295082, + "rewards/MMContentORM/mean": 0.4944999933242798, + "rewards/MMContentORM/std": 0.6864893198013305, + "rewards/MMFormatORM/mean": 0.5974999785423278, + "rewards/MMFormatORM/std": 0.16880690604448317, + "rewards/MMRubricORM/mean": -0.075, + "rewards/MMRubricORM/std": 0.23662601709365844, + "step": 1745, + "train_speed(iter/s)": 0.083496 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 310.8, + "completions/mean_length": 209.375, + "completions/min_length": 137.4, + "epoch": 0.8401344215074412, + "frac_reward_zero_std": 0.6, + "grad_norm": 0.20455506443977356, + "kl": 0.017779541015625, + "learning_rate": 6.702015950432958e-06, + "loss": 0.0007104447111487388, + "memory(GiB)": 27.09, + "reward": 0.44764997959136965, + "reward_std": 0.1215516522526741, + "rewards/MMContentORM/mean": 0.5409999847412109, + "rewards/MMContentORM/std": 0.6508532583713531, + "rewards/MMFormatORM/mean": 0.6093749880790711, + "rewards/MMFormatORM/std": 0.14190345108509064, + "rewards/MMRubricORM/mean": -0.0625, + "rewards/MMRubricORM/std": 0.21831300854682922, + "step": 1750, + "train_speed(iter/s)": 0.083506 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 319.4, + "completions/mean_length": 218.05, + "completions/min_length": 141.4, + "epoch": 0.842534805568891, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.12836699187755585, + "kl": 0.01434326171875, + "learning_rate": 6.6833396159517206e-06, + "loss": 0.0005732546094805002, + "memory(GiB)": 27.09, + "reward": 0.49699997901916504, + "reward_std": 0.08909545510541647, + "rewards/MMContentORM/mean": 0.650000023841858, + "rewards/MMContentORM/std": 0.5488846890628338, + "rewards/MMFormatORM/mean": 0.6174999833106994, + "rewards/MMFormatORM/std": 0.08490467071533203, + "rewards/MMRubricORM/mean": -0.05, + "rewards/MMRubricORM/std": 0.13062257766723634, + "step": 1755, + "train_speed(iter/s)": 0.083526 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 370.4, + "completions/mean_length": 215.375, + "completions/min_length": 154.8, + "epoch": 0.8449351896303409, + "frac_reward_zero_std": 0.6, + "grad_norm": 0.11277302354574203, + "kl": 0.0169677734375, + "learning_rate": 6.66463675503913e-06, + "loss": 0.000678945379331708, + "memory(GiB)": 27.09, + "reward": 0.3402999937534332, + "reward_std": 0.1948786199092865, + "rewards/MMContentORM/mean": 0.3445000022649765, + "rewards/MMContentORM/std": 0.7665389060974122, + "rewards/MMFormatORM/mean": 0.5687499880790711, + "rewards/MMFormatORM/std": 0.2142127960920334, + "rewards/MMRubricORM/mean": -0.125, + "rewards/MMRubricORM/std": 0.3295581638813019, + "step": 1760, + "train_speed(iter/s)": 0.083508 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 347.0, + "completions/mean_length": 209.9125, + "completions/min_length": 116.8, + "epoch": 0.8473355736917907, + "frac_reward_zero_std": 0.65, + "grad_norm": 0.1689959019422531, + "kl": 0.0170654296875, + "learning_rate": 6.645907662418933e-06, + "loss": 0.0006823433097451926, + "memory(GiB)": 27.09, + "reward": 0.45354996919631957, + "reward_std": 0.17373612970113755, + "rewards/MMContentORM/mean": 0.5845000147819519, + "rewards/MMContentORM/std": 0.6833672761917114, + "rewards/MMFormatORM/mean": 0.5931249856948853, + "rewards/MMFormatORM/std": 0.18630690574645997, + "rewards/MMRubricORM/mean": -0.0875, + "rewards/MMRubricORM/std": 0.2866260170936584, + "step": 1765, + "train_speed(iter/s)": 0.083507 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 300.8, + "completions/mean_length": 207.425, + "completions/min_length": 135.0, + "epoch": 0.8497359577532405, + "frac_reward_zero_std": 0.7, + "grad_norm": 0.1972796618938446, + "kl": 0.015631103515625, + "learning_rate": 6.627152633228238e-06, + "loss": 0.0006257255561649799, + "memory(GiB)": 27.09, + "reward": 0.4673499882221222, + "reward_std": 0.09425733387470245, + "rewards/MMContentORM/mean": 0.5615000009536744, + "rewards/MMContentORM/std": 0.6091739594936371, + "rewards/MMFormatORM/mean": 0.6256249785423279, + "rewards/MMFormatORM/std": 0.07690345346927643, + "rewards/MMRubricORM/mean": -0.0375, + "rewards/MMRubricORM/std": 0.11831300854682922, + "step": 1770, + "train_speed(iter/s)": 0.08352 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 319.2, + "completions/mean_length": 213.6875, + "completions/min_length": 140.4, + "epoch": 0.8521363418146903, + "frac_reward_zero_std": 0.45, + "grad_norm": 0.12356596440076828, + "kl": 0.020318603515625, + "learning_rate": 6.608371963012872e-06, + "loss": 0.00081367501989007, + "memory(GiB)": 27.09, + "reward": 0.3578499734401703, + "reward_std": 0.21842527836561204, + "rewards/MMContentORM/mean": 0.4065000042319298, + "rewards/MMContentORM/std": 0.7466981053352356, + "rewards/MMFormatORM/mean": 0.5568749845027924, + "rewards/MMFormatORM/std": 0.19573256969451905, + "rewards/MMRubricORM/mean": -0.1375, + "rewards/MMRubricORM/std": 0.30068787932395935, + "step": 1775, + "train_speed(iter/s)": 0.083538 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 309.2, + "completions/mean_length": 218.6375, + "completions/min_length": 138.6, + "epoch": 0.8545367258761402, + "frac_reward_zero_std": 0.7, + "grad_norm": 0.0733184739947319, + "kl": 0.01422119140625, + "learning_rate": 6.589565947722711e-06, + "loss": 0.0005693596322089434, + "memory(GiB)": 27.09, + "reward": 0.5736999750137329, + "reward_std": 0.06547808428294957, + "rewards/MMContentORM/mean": 0.8130000233650208, + "rewards/MMContentORM/std": 0.38130461126565934, + "rewards/MMFormatORM/mean": 0.6337499737739563, + "rewards/MMFormatORM/std": 0.04440345466136932, + "rewards/MMRubricORM/mean": -0.025, + "rewards/MMRubricORM/std": 0.06831300854682923, + "step": 1780, + "train_speed(iter/s)": 0.08354 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0125, + "completions/max_length": 469.0, + "completions/mean_length": 224.125, + "completions/min_length": 131.4, + "epoch": 0.85693710993759, + "frac_reward_zero_std": 0.475, + "grad_norm": 0.12514179944992065, + "kl": 0.017791748046875, + "learning_rate": 6.570734883707036e-06, + "loss": 0.0007113578729331493, + "memory(GiB)": 27.09, + "reward": 0.45974999070167544, + "reward_std": 0.15931115644052624, + "rewards/MMContentORM/mean": 0.6, + "rewards/MMContentORM/std": 0.6618961155414581, + "rewards/MMFormatORM/mean": 0.5931249737739563, + "rewards/MMFormatORM/std": 0.16571036279201506, + "rewards/MMRubricORM/mean": -0.0875, + "rewards/MMRubricORM/std": 0.25493902564048765, + "step": 1785, + "train_speed(iter/s)": 0.083463 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 319.0, + "completions/mean_length": 212.8, + "completions/min_length": 120.2, + "epoch": 0.8593374939990398, + "frac_reward_zero_std": 0.675, + "grad_norm": 0.20723937451839447, + "kl": 0.01754150390625, + "learning_rate": 6.5518790677098385e-06, + "loss": 0.0007023832760751248, + "memory(GiB)": 27.09, + "reward": 0.4733999729156494, + "reward_std": 0.12190521762240678, + "rewards/MMContentORM/mean": 0.5909999907016754, + "rewards/MMContentORM/std": 0.6185662746429443, + "rewards/MMFormatORM/mean": 0.6174999833106994, + "rewards/MMFormatORM/std": 0.10940345227718354, + "rewards/MMRubricORM/mean": -0.05, + "rewards/MMRubricORM/std": 0.16831300854682923, + "step": 1790, + "train_speed(iter/s)": 0.083484 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 367.8, + "completions/mean_length": 220.6125, + "completions/min_length": 132.4, + "epoch": 0.8617378780604896, + "frac_reward_zero_std": 0.675, + "grad_norm": 0.1408216804265976, + "kl": 0.013970947265625, + "learning_rate": 6.532998796865169e-06, + "loss": 0.0005585259757936, + "memory(GiB)": 27.09, + "reward": 0.4389999687671661, + "reward_std": 0.0987121019512415, + "rewards/MMContentORM/mean": 0.5050000041723252, + "rewards/MMContentORM/std": 0.6495143830776214, + "rewards/MMFormatORM/mean": 0.6174999833106994, + "rewards/MMFormatORM/std": 0.10940345227718354, + "rewards/MMRubricORM/mean": -0.05, + "rewards/MMRubricORM/std": 0.16831300854682923, + "step": 1795, + "train_speed(iter/s)": 0.083474 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 293.2, + "completions/mean_length": 207.2125, + "completions/min_length": 116.4, + "epoch": 0.8641382621219396, + "frac_reward_zero_std": 0.55, + "grad_norm": 0.26537343859672546, + "kl": 0.01622314453125, + "learning_rate": 6.5140943686924316e-06, + "loss": 0.0006490823347121477, + "memory(GiB)": 27.09, + "reward": 0.4860499739646912, + "reward_std": 0.1266428239643574, + "rewards/MMContentORM/mean": 0.6245000183582305, + "rewards/MMContentORM/std": 0.5797793388366699, + "rewards/MMFormatORM/mean": 0.6156249880790711, + "rewards/MMFormatORM/std": 0.13036334812641143, + "rewards/MMRubricORM/mean": -0.05, + "rewards/MMRubricORM/std": 0.2, + "step": 1800, + "train_speed(iter/s)": 0.083505 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 325.8, + "completions/mean_length": 209.7875, + "completions/min_length": 118.0, + "epoch": 0.8665386461833894, + "frac_reward_zero_std": 0.825, + "grad_norm": 0.08358591049909592, + "kl": 0.01490478515625, + "learning_rate": 6.495166081091716e-06, + "loss": 0.0005963623523712158, + "memory(GiB)": 27.09, + "reward": 0.4978999674320221, + "reward_std": 0.05218447903171182, + "rewards/MMContentORM/mean": 0.623499995470047, + "rewards/MMContentORM/std": 0.543954461812973, + "rewards/MMFormatORM/mean": 0.6337499856948853, + "rewards/MMFormatORM/std": 0.06499999761581421, + "rewards/MMRubricORM/mean": -0.025, + "rewards/MMRubricORM/std": 0.1, + "step": 1805, + "train_speed(iter/s)": 0.083454 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 308.2, + "completions/mean_length": 210.3375, + "completions/min_length": 136.0, + "epoch": 0.8689390302448392, + "frac_reward_zero_std": 0.525, + "grad_norm": 0.16976818442344666, + "kl": 0.014532470703125, + "learning_rate": 6.476214232339088e-06, + "loss": 0.0005812739953398704, + "memory(GiB)": 27.09, + "reward": 0.3728999882936478, + "reward_std": 0.1367544449865818, + "rewards/MMContentORM/mean": 0.36850000321865084, + "rewards/MMContentORM/std": 0.7042155861854553, + "rewards/MMFormatORM/mean": 0.6012499868869782, + "rewards/MMFormatORM/std": 0.12313776612281799, + "rewards/MMRubricORM/mean": -0.075, + "rewards/MMRubricORM/std": 0.1894427239894867, + "step": 1810, + "train_speed(iter/s)": 0.083466 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 311.6, + "completions/mean_length": 202.7375, + "completions/min_length": 110.6, + "epoch": 0.871339414306289, + "frac_reward_zero_std": 0.725, + "grad_norm": 0.13253819942474365, + "kl": 0.02115478515625, + "learning_rate": 6.457239121081898e-06, + "loss": 0.0008474783971905708, + "memory(GiB)": 27.09, + "reward": 0.4685999810695648, + "reward_std": 0.10832875426858664, + "rewards/MMContentORM/mean": 0.5790000200271607, + "rewards/MMContentORM/std": 0.600553035736084, + "rewards/MMFormatORM/mean": 0.6174999833106994, + "rewards/MMFormatORM/std": 0.08490467071533203, + "rewards/MMRubricORM/mean": -0.05, + "rewards/MMRubricORM/std": 0.13062257766723634, + "step": 1815, + "train_speed(iter/s)": 0.083464 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 317.0, + "completions/mean_length": 201.4625, + "completions/min_length": 92.6, + "epoch": 0.8737397983677389, + "frac_reward_zero_std": 0.475, + "grad_norm": 0.22328393161296844, + "kl": 0.03316650390625, + "learning_rate": 6.43824104633407e-06, + "loss": 0.0013257008045911788, + "memory(GiB)": 27.09, + "reward": 0.4241999924182892, + "reward_std": 0.20619233280885965, + "rewards/MMContentORM/mean": 0.5254999876022339, + "rewards/MMContentORM/std": 0.6518503844738006, + "rewards/MMFormatORM/mean": 0.5849999904632568, + "rewards/MMFormatORM/std": 0.16980934143066406, + "rewards/MMRubricORM/mean": -0.1, + "rewards/MMRubricORM/std": 0.2612451553344727, + "step": 1820, + "train_speed(iter/s)": 0.083486 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 324.8, + "completions/mean_length": 211.9125, + "completions/min_length": 138.2, + "epoch": 0.8761401824291887, + "frac_reward_zero_std": 0.775, + "grad_norm": 0.13617774844169617, + "kl": 0.0123046875, + "learning_rate": 6.419220307471395e-06, + "loss": 0.0004924539476633072, + "memory(GiB)": 27.09, + "reward": 0.5035999953746796, + "reward_std": 0.07198347002267838, + "rewards/MMContentORM/mean": 0.6665000081062317, + "rewards/MMContentORM/std": 0.6048071205615997, + "rewards/MMFormatORM/mean": 0.6174999833106994, + "rewards/MMFormatORM/std": 0.10940345227718354, + "rewards/MMRubricORM/mean": -0.05, + "rewards/MMRubricORM/std": 0.16831300854682923, + "step": 1825, + "train_speed(iter/s)": 0.083493 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 285.6, + "completions/mean_length": 208.5125, + "completions/min_length": 145.6, + "epoch": 0.8785405664906385, + "frac_reward_zero_std": 0.775, + "grad_norm": 0.10531944036483765, + "kl": 0.013299560546875, + "learning_rate": 6.400177204226809e-06, + "loss": 0.0005324467085301877, + "memory(GiB)": 27.09, + "reward": 0.5384999752044678, + "reward_std": 0.06321534756571054, + "rewards/MMContentORM/mean": 0.725, + "rewards/MMContentORM/std": 0.5201015174388885, + "rewards/MMFormatORM/mean": 0.6337499856948853, + "rewards/MMFormatORM/std": 0.06499999761581421, + "rewards/MMRubricORM/mean": -0.025, + "rewards/MMRubricORM/std": 0.1, + "step": 1830, + "train_speed(iter/s)": 0.083512 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 328.0, + "completions/mean_length": 208.0375, + "completions/min_length": 131.2, + "epoch": 0.8809409505520883, + "frac_reward_zero_std": 0.55, + "grad_norm": 0.3434266448020935, + "kl": 0.020574951171875, + "learning_rate": 6.381112036685666e-06, + "loss": 0.0008229421451687813, + "memory(GiB)": 27.09, + "reward": 0.42379998564720156, + "reward_std": 0.10040915869176388, + "rewards/MMContentORM/mean": 0.46700001060962676, + "rewards/MMContentORM/std": 0.6455421566963195, + "rewards/MMFormatORM/mean": 0.6174999833106994, + "rewards/MMFormatORM/std": 0.10940345227718354, + "rewards/MMRubricORM/mean": -0.05, + "rewards/MMRubricORM/std": 0.16831300854682923, + "step": 1835, + "train_speed(iter/s)": 0.08351 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0125, + "completions/max_length": 485.6, + "completions/mean_length": 216.975, + "completions/min_length": 137.4, + "epoch": 0.8833413346135381, + "frac_reward_zero_std": 0.65, + "grad_norm": 0.17209282517433167, + "kl": 0.0139007568359375, + "learning_rate": 6.36202510528102e-06, + "loss": 0.0005555123090744019, + "memory(GiB)": 27.09, + "reward": 0.475549989938736, + "reward_std": 0.09567154424730688, + "rewards/MMContentORM/mean": 0.5819999992847442, + "rewards/MMContentORM/std": 0.6257418870925904, + "rewards/MMFormatORM/mean": 0.6256249785423279, + "rewards/MMFormatORM/std": 0.07690345346927643, + "rewards/MMRubricORM/mean": -0.0375, + "rewards/MMRubricORM/std": 0.11831300854682922, + "step": 1840, + "train_speed(iter/s)": 0.08343 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 367.4, + "completions/mean_length": 212.0, + "completions/min_length": 143.4, + "epoch": 0.885741718674988, + "frac_reward_zero_std": 0.725, + "grad_norm": 0.20552751421928406, + "kl": 0.0138427734375, + "learning_rate": 6.342916710788882e-06, + "loss": 0.0005536759272217751, + "memory(GiB)": 27.09, + "reward": 0.4509999692440033, + "reward_std": 0.11624835580587387, + "rewards/MMContentORM/mean": 0.5349999845027924, + "rewards/MMContentORM/std": 0.6666475296020508, + "rewards/MMFormatORM/mean": 0.6174999833106994, + "rewards/MMFormatORM/std": 0.10940345227718354, + "rewards/MMRubricORM/mean": -0.05, + "rewards/MMRubricORM/std": 0.16831300854682923, + "step": 1845, + "train_speed(iter/s)": 0.083409 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 321.4, + "completions/mean_length": 212.4875, + "completions/min_length": 132.4, + "epoch": 0.8881421027364378, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.14356966316699982, + "kl": 0.012799072265625, + "learning_rate": 6.323787154323484e-06, + "loss": 0.0005117998458445072, + "memory(GiB)": 27.09, + "reward": 0.5021499991416931, + "reward_std": 0.08520636514294892, + "rewards/MMContentORM/mean": 0.6485000073909759, + "rewards/MMContentORM/std": 0.5814681231975556, + "rewards/MMFormatORM/mean": 0.6256249904632568, + "rewards/MMFormatORM/std": 0.09749999642372131, + "rewards/MMRubricORM/mean": -0.0375, + "rewards/MMRubricORM/std": 0.15, + "step": 1850, + "train_speed(iter/s)": 0.083424 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 318.2, + "completions/mean_length": 211.4875, + "completions/min_length": 145.2, + "epoch": 0.8905424867978876, + "frac_reward_zero_std": 0.775, + "grad_norm": 0.07189600169658661, + "kl": 0.014239501953125, + "learning_rate": 6.304636737332534e-06, + "loss": 0.0005696343258023262, + "memory(GiB)": 27.09, + "reward": 0.42489999532699585, + "reward_std": 0.13166328519582748, + "rewards/MMContentORM/mean": 0.4985000014305115, + "rewards/MMContentORM/std": 0.6740443706512451, + "rewards/MMFormatORM/mean": 0.6012499928474426, + "rewards/MMFormatORM/std": 0.14990466833114624, + "rewards/MMRubricORM/mean": -0.075, + "rewards/MMRubricORM/std": 0.23062257766723632, + "step": 1855, + "train_speed(iter/s)": 0.083428 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 298.0, + "completions/mean_length": 200.425, + "completions/min_length": 128.0, + "epoch": 0.8929428708593375, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.12281981855630875, + "kl": 0.016815185546875, + "learning_rate": 6.285465761592459e-06, + "loss": 0.0006720408797264099, + "memory(GiB)": 27.09, + "reward": 0.485349977016449, + "reward_std": 0.06371032111346722, + "rewards/MMContentORM/mean": 0.5940000057220459, + "rewards/MMContentORM/std": 0.5279915370047092, + "rewards/MMFormatORM/mean": 0.6318749785423279, + "rewards/MMFormatORM/std": 0.06536335051059723, + "rewards/MMRubricORM/mean": -0.025, + "rewards/MMRubricORM/std": 0.1, + "step": 1860, + "train_speed(iter/s)": 0.083449 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 387.2, + "completions/mean_length": 224.4125, + "completions/min_length": 148.8, + "epoch": 0.8953432549207874, + "frac_reward_zero_std": 0.7, + "grad_norm": 0.09118235856294632, + "kl": 0.0157958984375, + "learning_rate": 6.266274529203663e-06, + "loss": 0.0006318187341094017, + "memory(GiB)": 27.09, + "reward": 0.50444997549057, + "reward_std": 0.06908432939089834, + "rewards/MMContentORM/mean": 0.6829999923706055, + "rewards/MMContentORM/std": 0.4913133792579174, + "rewards/MMFormatORM/mean": 0.6093749821186065, + "rewards/MMFormatORM/std": 0.09063776731491088, + "rewards/MMRubricORM/mean": -0.0625, + "rewards/MMRubricORM/std": 0.1394427239894867, + "step": 1865, + "train_speed(iter/s)": 0.083431 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0125, + "completions/max_length": 524.6, + "completions/mean_length": 226.525, + "completions/min_length": 128.0, + "epoch": 0.8977436389822372, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.10997837036848068, + "kl": 0.01796875, + "learning_rate": 6.247063342585753e-06, + "loss": 0.0007188735064119101, + "memory(GiB)": 27.09, + "reward": 0.45734997391700744, + "reward_std": 0.12932982593774794, + "rewards/MMContentORM/mean": 0.5939999997615815, + "rewards/MMContentORM/std": 0.6364952743053436, + "rewards/MMFormatORM/mean": 0.5931249856948853, + "rewards/MMFormatORM/std": 0.18630690574645997, + "rewards/MMRubricORM/mean": -0.0875, + "rewards/MMRubricORM/std": 0.2866260170936584, + "step": 1870, + "train_speed(iter/s)": 0.083354 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 297.0, + "completions/mean_length": 196.575, + "completions/min_length": 120.2, + "epoch": 0.900144023043687, + "frac_reward_zero_std": 0.775, + "grad_norm": 0.15118958055973053, + "kl": 0.0134521484375, + "learning_rate": 6.227832504472782e-06, + "loss": 0.0005381078924983739, + "memory(GiB)": 27.09, + "reward": 0.4734999716281891, + "reward_std": 0.060104073002003135, + "rewards/MMContentORM/mean": 0.5625000059604645, + "rewards/MMContentORM/std": 0.6234762132167816, + "rewards/MMFormatORM/mean": 0.6337499856948853, + "rewards/MMFormatORM/std": 0.06499999761581421, + "rewards/MMRubricORM/mean": -0.025, + "rewards/MMRubricORM/std": 0.1, + "step": 1875, + "train_speed(iter/s)": 0.083341 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 302.6, + "completions/mean_length": 204.7375, + "completions/min_length": 124.2, + "epoch": 0.9025444071051368, + "frac_reward_zero_std": 0.85, + "grad_norm": 0.07756619155406952, + "kl": 0.014874267578125, + "learning_rate": 6.208582317908473e-06, + "loss": 0.000595169048756361, + "memory(GiB)": 27.09, + "reward": 0.49734997749328613, + "reward_std": 0.06653874590992928, + "rewards/MMContentORM/mean": 0.6364999890327454, + "rewards/MMContentORM/std": 0.5963007152080536, + "rewards/MMFormatORM/mean": 0.6256249904632568, + "rewards/MMFormatORM/std": 0.09749999642372131, + "rewards/MMRubricORM/mean": -0.0375, + "rewards/MMRubricORM/std": 0.15, + "step": 1880, + "train_speed(iter/s)": 0.083348 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 313.0, + "completions/mean_length": 205.8, + "completions/min_length": 96.0, + "epoch": 0.9049447911665867, + "frac_reward_zero_std": 0.775, + "grad_norm": 0.17429836094379425, + "kl": 0.018310546875, + "learning_rate": 6.1893130862414506e-06, + "loss": 0.0007323648314923048, + "memory(GiB)": 27.09, + "reward": 0.48079999089241027, + "reward_std": 0.029981326917186378, + "rewards/MMContentORM/mean": 0.551999980211258, + "rewards/MMContentORM/std": 0.5579077005386353, + "rewards/MMFormatORM/mean": 0.6499999761581421, + "rewards/MMFormatORM/std": 0.0, + "rewards/MMRubricORM/mean": 0.0, + "rewards/MMRubricORM/std": 0.0, + "step": 1885, + "train_speed(iter/s)": 0.08336 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 331.2, + "completions/mean_length": 206.85, + "completions/min_length": 128.4, + "epoch": 0.9073451752280365, + "frac_reward_zero_std": 0.65, + "grad_norm": 0.011451794765889645, + "kl": 0.0149658203125, + "learning_rate": 6.1700251131204525e-06, + "loss": 0.0005986175034195185, + "memory(GiB)": 27.09, + "reward": 0.5000999927520752, + "reward_std": 0.08075158959254622, + "rewards/MMContentORM/mean": 0.6290000021457672, + "rewards/MMContentORM/std": 0.5125225283205509, + "rewards/MMFormatORM/mean": 0.6337499856948853, + "rewards/MMFormatORM/std": 0.06499999761581421, + "rewards/MMRubricORM/mean": -0.025, + "rewards/MMRubricORM/std": 0.1, + "step": 1890, + "train_speed(iter/s)": 0.08337 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 284.0, + "completions/mean_length": 192.5375, + "completions/min_length": 97.6, + "epoch": 0.9097455592894863, + "frac_reward_zero_std": 0.675, + "grad_norm": 0.3059964179992676, + "kl": 0.030767822265625, + "learning_rate": 6.1507187024895475e-06, + "loss": 0.0012321647256612778, + "memory(GiB)": 27.09, + "reward": 0.4602999806404114, + "reward_std": 0.14212846592999995, + "rewards/MMContentORM/mean": 0.5870000183582306, + "rewards/MMContentORM/std": 0.6352852940559387, + "rewards/MMFormatORM/mean": 0.6012499809265137, + "rewards/MMFormatORM/std": 0.12930812537670136, + "rewards/MMRubricORM/mean": -0.075, + "rewards/MMRubricORM/std": 0.19893558621406554, + "step": 1895, + "train_speed(iter/s)": 0.083396 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 284.6, + "completions/mean_length": 197.9125, + "completions/min_length": 130.6, + "epoch": 0.9121459433509361, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.16474473476409912, + "kl": 0.014501953125, + "learning_rate": 6.131394158583351e-06, + "loss": 0.0005803803913295269, + "memory(GiB)": 27.09, + "reward": 0.421099978685379, + "reward_std": 0.1438255153596401, + "rewards/MMContentORM/mean": 0.489000004529953, + "rewards/MMContentORM/std": 0.6867013454437256, + "rewards/MMFormatORM/mean": 0.6012499809265137, + "rewards/MMFormatORM/std": 0.15380690693855287, + "rewards/MMRubricORM/mean": -0.075, + "rewards/MMRubricORM/std": 0.23662601709365844, + "step": 1900, + "train_speed(iter/s)": 0.083417 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 307.6, + "completions/mean_length": 209.8875, + "completions/min_length": 146.2, + "epoch": 0.914546327412386, + "frac_reward_zero_std": 0.725, + "grad_norm": 0.12378682941198349, + "kl": 0.01851806640625, + "learning_rate": 6.112051785922221e-06, + "loss": 0.0007428077049553394, + "memory(GiB)": 27.09, + "reward": 0.4398999661207199, + "reward_std": 0.09984347894787789, + "rewards/MMContentORM/mean": 0.5235000193119049, + "rewards/MMContentORM/std": 0.6574730277061462, + "rewards/MMFormatORM/mean": 0.6074999928474426, + "rewards/MMFormatORM/std": 0.12490466833114625, + "rewards/MMRubricORM/mean": -0.0625, + "rewards/MMRubricORM/std": 0.18062257766723633, + "step": 1905, + "train_speed(iter/s)": 0.083366 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0125, + "completions/max_length": 474.0, + "completions/mean_length": 222.4625, + "completions/min_length": 128.4, + "epoch": 0.9169467114738358, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.1761566698551178, + "kl": 0.013580322265625, + "learning_rate": 6.092691889307469e-06, + "loss": 0.0005431583616882562, + "memory(GiB)": 27.09, + "reward": 0.48289998769760134, + "reward_std": 0.14580541402101516, + "rewards/MMContentORM/mean": 0.631000018119812, + "rewards/MMContentORM/std": 0.6453944146633148, + "rewards/MMFormatORM/mean": 0.6074999928474426, + "rewards/MMFormatORM/std": 0.14940344989299775, + "rewards/MMRubricORM/mean": -0.0625, + "rewards/MMRubricORM/std": 0.21831300854682922, + "step": 1910, + "train_speed(iter/s)": 0.083304 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 258.4, + "completions/mean_length": 194.9, + "completions/min_length": 118.0, + "epoch": 0.9193470955352856, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.09627247601747513, + "kl": 0.017974853515625, + "learning_rate": 6.073314773816553e-06, + "loss": 0.0007188072893768549, + "memory(GiB)": 27.09, + "reward": 0.5188999831676483, + "reward_std": 0.05529574886895716, + "rewards/MMContentORM/mean": 0.6759999990463257, + "rewards/MMContentORM/std": 0.5570424318313598, + "rewards/MMFormatORM/mean": 0.6337499856948853, + "rewards/MMFormatORM/std": 0.06499999761581421, + "rewards/MMRubricORM/mean": -0.025, + "rewards/MMRubricORM/std": 0.1, + "step": 1915, + "train_speed(iter/s)": 0.083336 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 290.4, + "completions/mean_length": 201.7375, + "completions/min_length": 143.8, + "epoch": 0.9217474795967355, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.1766253411769867, + "kl": 0.016607666015625, + "learning_rate": 6.053920744798267e-06, + "loss": 0.0006644959561526775, + "memory(GiB)": 27.09, + "reward": 0.5365999698638916, + "reward_std": 0.04567909436300397, + "rewards/MMContentORM/mean": 0.7490000247955322, + "rewards/MMContentORM/std": 0.5337904691696167, + "rewards/MMFormatORM/mean": 0.6174999713897705, + "rewards/MMFormatORM/std": 0.08880690932273864, + "rewards/MMRubricORM/mean": -0.05, + "rewards/MMRubricORM/std": 0.13662601709365846, + "step": 1920, + "train_speed(iter/s)": 0.08337 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 302.4, + "completions/mean_length": 205.6875, + "completions/min_length": 146.4, + "epoch": 0.9241478636581854, + "frac_reward_zero_std": 0.55, + "grad_norm": 0.17899306118488312, + "kl": 0.017431640625, + "learning_rate": 6.034510107867933e-06, + "loss": 0.0006970945280045271, + "memory(GiB)": 27.09, + "reward": 0.417499977350235, + "reward_std": 0.1641901969909668, + "rewards/MMContentORM/mean": 0.47999998927116394, + "rewards/MMContentORM/std": 0.6940834045410156, + "rewards/MMFormatORM/mean": 0.6012499928474426, + "rewards/MMFormatORM/std": 0.14990466833114624, + "rewards/MMRubricORM/mean": -0.075, + "rewards/MMRubricORM/std": 0.23062257766723632, + "step": 1925, + "train_speed(iter/s)": 0.083381 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0125, + "completions/max_length": 443.6, + "completions/mean_length": 226.2625, + "completions/min_length": 141.4, + "epoch": 0.9265482477196352, + "frac_reward_zero_std": 0.65, + "grad_norm": 0.10172195732593536, + "kl": 0.0140380859375, + "learning_rate": 6.015083168902586e-06, + "loss": 0.0005614136345684529, + "memory(GiB)": 27.09, + "reward": 0.4858999788761139, + "reward_std": 0.08160012271255254, + "rewards/MMContentORM/mean": 0.5935000061988831, + "rewards/MMContentORM/std": 0.6041896402835846, + "rewards/MMFormatORM/mean": 0.6337499856948853, + "rewards/MMFormatORM/std": 0.06499999761581421, + "rewards/MMRubricORM/mean": -0.025, + "rewards/MMRubricORM/std": 0.1, + "step": 1930, + "train_speed(iter/s)": 0.083321 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 283.2, + "completions/mean_length": 207.8375, + "completions/min_length": 122.4, + "epoch": 0.928948631781085, + "frac_reward_zero_std": 0.65, + "grad_norm": 0.11313877999782562, + "kl": 0.015618896484375, + "learning_rate": 5.995640234036149e-06, + "loss": 0.0006248398683965206, + "memory(GiB)": 27.09, + "reward": 0.4472499847412109, + "reward_std": 0.08761052712798119, + "rewards/MMContentORM/mean": 0.5150000095367432, + "rewards/MMContentORM/std": 0.6490139365196228, + "rewards/MMFormatORM/mean": 0.6218749761581421, + "rewards/MMFormatORM/std": 0.09190345257520675, + "rewards/MMRubricORM/mean": -0.0375, + "rewards/MMRubricORM/std": 0.11831300854682922, + "step": 1935, + "train_speed(iter/s)": 0.083355 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 294.0, + "completions/mean_length": 210.6625, + "completions/min_length": 144.2, + "epoch": 0.9313490158425348, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.16516950726509094, + "kl": 0.0192626953125, + "learning_rate": 5.9761816096546135e-06, + "loss": 0.0007695911917835474, + "memory(GiB)": 27.09, + "reward": 0.5396999716758728, + "reward_std": 0.05034599886275828, + "rewards/MMContentORM/mean": 0.715500009059906, + "rewards/MMContentORM/std": 0.4298131003975868, + "rewards/MMFormatORM/mean": 0.6399999856948853, + "rewards/MMFormatORM/std": 0.03999999761581421, + "rewards/MMRubricORM/mean": -0.0125, + "rewards/MMRubricORM/std": 0.05, + "step": 1940, + "train_speed(iter/s)": 0.08336 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 290.6, + "completions/mean_length": 206.6875, + "completions/min_length": 145.4, + "epoch": 0.9337493999039846, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.07769346237182617, + "kl": 0.01458740234375, + "learning_rate": 5.956707602391209e-06, + "loss": 0.0005837498232722282, + "memory(GiB)": 27.09, + "reward": 0.5033499836921692, + "reward_std": 0.04334564357995987, + "rewards/MMContentORM/mean": 0.6265000075101852, + "rewards/MMContentORM/std": 0.4830021485686302, + "rewards/MMFormatORM/mean": 0.6381249785423279, + "rewards/MMFormatORM/std": 0.04749999791383743, + "rewards/MMRubricORM/mean": -0.0125, + "rewards/MMRubricORM/std": 0.05, + "step": 1945, + "train_speed(iter/s)": 0.083377 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 313.2, + "completions/mean_length": 206.6125, + "completions/min_length": 136.2, + "epoch": 0.9361497839654345, + "frac_reward_zero_std": 0.7, + "grad_norm": 0.2008439302444458, + "kl": 0.015899658203125, + "learning_rate": 5.937218519121575e-06, + "loss": 0.0006357332691550255, + "memory(GiB)": 27.09, + "reward": 0.5125499844551087, + "reward_std": 0.06936717720236629, + "rewards/MMContentORM/mean": 0.6745000123977661, + "rewards/MMContentORM/std": 0.5375838339328766, + "rewards/MMFormatORM/mean": 0.6256249904632568, + "rewards/MMFormatORM/std": 0.09749999642372131, + "rewards/MMRubricORM/mean": -0.0375, + "rewards/MMRubricORM/std": 0.15, + "step": 1950, + "train_speed(iter/s)": 0.083384 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 321.6, + "completions/mean_length": 209.15, + "completions/min_length": 121.4, + "epoch": 0.9385501680268843, + "frac_reward_zero_std": 0.8, + "grad_norm": 0.006351741962134838, + "kl": 0.013763427734375, + "learning_rate": 5.917714666958917e-06, + "loss": 0.0005507726222276687, + "memory(GiB)": 27.09, + "reward": 0.5436999797821045, + "reward_std": 0.04624478132463992, + "rewards/MMContentORM/mean": 0.7380000114440918, + "rewards/MMContentORM/std": 0.4596146807074547, + "rewards/MMFormatORM/mean": 0.6337499856948853, + "rewards/MMFormatORM/std": 0.06499999761581421, + "rewards/MMRubricORM/mean": -0.025, + "rewards/MMRubricORM/std": 0.1, + "step": 1955, + "train_speed(iter/s)": 0.083393 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 270.4, + "completions/mean_length": 200.225, + "completions/min_length": 137.2, + "epoch": 0.9409505520883341, + "frac_reward_zero_std": 0.7, + "grad_norm": 0.11096161603927612, + "kl": 0.01678466796875, + "learning_rate": 5.8981963532491746e-06, + "loss": 0.000671281386166811, + "memory(GiB)": 27.09, + "reward": 0.47979997396469115, + "reward_std": 0.06363960476592183, + "rewards/MMContentORM/mean": 0.6070000171661377, + "rewards/MMContentORM/std": 0.6155083239078522, + "rewards/MMFormatORM/mean": 0.6174999713897705, + "rewards/MMFormatORM/std": 0.08880690932273864, + "rewards/MMRubricORM/mean": -0.05, + "rewards/MMRubricORM/std": 0.13662601709365846, + "step": 1960, + "train_speed(iter/s)": 0.083427 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 344.4, + "completions/mean_length": 217.85, + "completions/min_length": 153.4, + "epoch": 0.9433509361497839, + "frac_reward_zero_std": 0.7, + "grad_norm": 0.1260797083377838, + "kl": 0.01507568359375, + "learning_rate": 5.878663885566178e-06, + "loss": 0.0006023185327649116, + "memory(GiB)": 27.09, + "reward": 0.49534996747970583, + "reward_std": 0.09369164705276489, + "rewards/MMContentORM/mean": 0.6315000057220459, + "rewards/MMContentORM/std": 0.604107654094696, + "rewards/MMFormatORM/mean": 0.6256249904632568, + "rewards/MMFormatORM/std": 0.09749999642372131, + "rewards/MMRubricORM/mean": -0.0375, + "rewards/MMRubricORM/std": 0.15, + "step": 1965, + "train_speed(iter/s)": 0.083409 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 271.8, + "completions/mean_length": 201.225, + "completions/min_length": 153.8, + "epoch": 0.9457513202112338, + "frac_reward_zero_std": 0.725, + "grad_norm": 0.15319658815860748, + "kl": 0.017413330078125, + "learning_rate": 5.859117571706791e-06, + "loss": 0.0006967600900679826, + "memory(GiB)": 27.09, + "reward": 0.5093499839305877, + "reward_std": 0.07728676870465279, + "rewards/MMContentORM/mean": 0.6664999902248383, + "rewards/MMContentORM/std": 0.5927111029624939, + "rewards/MMFormatORM/mean": 0.6256249904632568, + "rewards/MMFormatORM/std": 0.09749999642372131, + "rewards/MMRubricORM/mean": -0.0375, + "rewards/MMRubricORM/std": 0.15, + "step": 1970, + "train_speed(iter/s)": 0.083437 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 315.0, + "completions/mean_length": 212.5125, + "completions/min_length": 143.6, + "epoch": 0.9481517042726836, + "frac_reward_zero_std": 0.7, + "grad_norm": 0.21331176161766052, + "kl": 0.014532470703125, + "learning_rate": 5.83955771968608e-06, + "loss": 0.0005808803252875805, + "memory(GiB)": 27.09, + "reward": 0.5360999882221222, + "reward_std": 0.05642712083645165, + "rewards/MMContentORM/mean": 0.7190000295639039, + "rewards/MMContentORM/std": 0.4764695011079311, + "rewards/MMFormatORM/mean": 0.6337499737739563, + "rewards/MMFormatORM/std": 0.04440345466136932, + "rewards/MMRubricORM/mean": -0.025, + "rewards/MMRubricORM/std": 0.06831300854682923, + "step": 1975, + "train_speed(iter/s)": 0.083446 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 328.4, + "completions/mean_length": 208.9375, + "completions/min_length": 147.0, + "epoch": 0.9505520883341335, + "frac_reward_zero_std": 0.725, + "grad_norm": 0.12605807185173035, + "kl": 0.015679931640625, + "learning_rate": 5.819984637732436e-06, + "loss": 0.0006278078071773052, + "memory(GiB)": 27.09, + "reward": 0.5486499905586243, + "reward_std": 0.03471893714740872, + "rewards/MMContentORM/mean": 0.7360000014305115, + "rewards/MMContentORM/std": 0.49632705450057985, + "rewards/MMFormatORM/mean": 0.6418749809265136, + "rewards/MMFormatORM/std": 0.032499998807907104, + "rewards/MMRubricORM/mean": -0.0125, + "rewards/MMRubricORM/std": 0.05, + "step": 1980, + "train_speed(iter/s)": 0.083456 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 341.0, + "completions/mean_length": 218.3625, + "completions/min_length": 130.4, + "epoch": 0.9529524723955833, + "frac_reward_zero_std": 0.7, + "grad_norm": 0.10039184242486954, + "kl": 0.015283203125, + "learning_rate": 5.80039863428274e-06, + "loss": 0.0006117623299360276, + "memory(GiB)": 27.09, + "reward": 0.548499995470047, + "reward_std": 0.08810550635680556, + "rewards/MMContentORM/mean": 0.7500000119209289, + "rewards/MMContentORM/std": 0.4995552241802216, + "rewards/MMFormatORM/mean": 0.6337499737739563, + "rewards/MMFormatORM/std": 0.04440345466136932, + "rewards/MMRubricORM/mean": -0.025, + "rewards/MMRubricORM/std": 0.06831300854682923, + "step": 1985, + "train_speed(iter/s)": 0.083456 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 305.6, + "completions/mean_length": 206.6, + "completions/min_length": 143.0, + "epoch": 0.9553528564570332, + "frac_reward_zero_std": 0.825, + "grad_norm": 0.16859173774719238, + "kl": 0.015777587890625, + "learning_rate": 5.780800017977491e-06, + "loss": 0.0006312967278063297, + "memory(GiB)": 27.09, + "reward": 0.5292499780654907, + "reward_std": 0.04065863774158061, + "rewards/MMContentORM/mean": 0.6875, + "rewards/MMContentORM/std": 0.5327123403549194, + "rewards/MMFormatORM/mean": 0.6418749809265136, + "rewards/MMFormatORM/std": 0.032499998807907104, + "rewards/MMRubricORM/mean": -0.0125, + "rewards/MMRubricORM/std": 0.05, + "step": 1990, + "train_speed(iter/s)": 0.083474 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 301.6, + "completions/mean_length": 211.175, + "completions/min_length": 141.4, + "epoch": 0.957753240518483, + "frac_reward_zero_std": 0.775, + "grad_norm": 0.06966093927621841, + "kl": 0.020062255859375, + "learning_rate": 5.761189097655937e-06, + "loss": 0.0008020093664526939, + "memory(GiB)": 27.09, + "reward": 0.43149998784065247, + "reward_std": 0.10479322522878647, + "rewards/MMContentORM/mean": 0.5149999976158142, + "rewards/MMContentORM/std": 0.6905157566070557, + "rewards/MMFormatORM/mean": 0.6012499928474426, + "rewards/MMFormatORM/std": 0.17440344989299775, + "rewards/MMRubricORM/mean": -0.075, + "rewards/MMRubricORM/std": 0.2683130085468292, + "step": 1995, + "train_speed(iter/s)": 0.083493 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 305.2, + "completions/mean_length": 213.875, + "completions/min_length": 141.2, + "epoch": 0.9601536245799328, + "frac_reward_zero_std": 0.65, + "grad_norm": 0.13645286858081818, + "kl": 0.019488525390625, + "learning_rate": 5.7415661823512245e-06, + "loss": 0.0007798057049512863, + "memory(GiB)": 27.09, + "reward": 0.46374998688697816, + "reward_std": 0.17232191623188556, + "rewards/MMContentORM/mean": 0.6100000143051147, + "rewards/MMContentORM/std": 0.59096859395504, + "rewards/MMFormatORM/mean": 0.5931249856948853, + "rewards/MMFormatORM/std": 0.16180812418460847, + "rewards/MMRubricORM/mean": -0.0875, + "rewards/MMRubricORM/std": 0.24893558621406556, + "step": 2000, + "train_speed(iter/s)": 0.083497 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 315.4, + "completions/mean_length": 209.8375, + "completions/min_length": 144.2, + "epoch": 0.9625540086413826, + "frac_reward_zero_std": 0.675, + "grad_norm": 0.10858377814292908, + "kl": 0.017437744140625, + "learning_rate": 5.721931581285514e-06, + "loss": 0.000697833951562643, + "memory(GiB)": 27.09, + "reward": 0.4571499764919281, + "reward_std": 0.1461589643266052, + "rewards/MMContentORM/mean": 0.5935000061988831, + "rewards/MMContentORM/std": 0.6896682381629944, + "rewards/MMFormatORM/mean": 0.5931249856948853, + "rewards/MMFormatORM/std": 0.16180812418460847, + "rewards/MMRubricORM/mean": -0.0875, + "rewards/MMRubricORM/std": 0.24893558621406556, + "step": 2005, + "train_speed(iter/s)": 0.08346 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 290.2, + "completions/mean_length": 212.15, + "completions/min_length": 127.2, + "epoch": 0.9649543927028325, + "frac_reward_zero_std": 0.7, + "grad_norm": 0.22721628844738007, + "kl": 0.016845703125, + "learning_rate": 5.702285603865115e-06, + "loss": 0.0006736557465046644, + "memory(GiB)": 27.09, + "reward": 0.5219999849796295, + "reward_std": 0.10691454559564591, + "rewards/MMContentORM/mean": 0.7125000119209289, + "rewards/MMContentORM/std": 0.590934443473816, + "rewards/MMFormatORM/mean": 0.6174999833106994, + "rewards/MMFormatORM/std": 0.10940345227718354, + "rewards/MMRubricORM/mean": -0.05, + "rewards/MMRubricORM/std": 0.16831300854682923, + "step": 2010, + "train_speed(iter/s)": 0.083488 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 319.0, + "completions/mean_length": 216.8, + "completions/min_length": 152.2, + "epoch": 0.9673547767642823, + "frac_reward_zero_std": 0.65, + "grad_norm": 0.15032155811786652, + "kl": 0.01512451171875, + "learning_rate": 5.682628559675609e-06, + "loss": 0.0006046965718269348, + "memory(GiB)": 27.09, + "reward": 0.4427499771118164, + "reward_std": 0.07700392529368401, + "rewards/MMContentORM/mean": 0.4999999850988388, + "rewards/MMContentORM/std": 0.6438470005989074, + "rewards/MMFormatORM/mean": 0.6256249904632568, + "rewards/MMFormatORM/std": 0.09749999642372131, + "rewards/MMRubricORM/mean": -0.0375, + "rewards/MMRubricORM/std": 0.15, + "step": 2015, + "train_speed(iter/s)": 0.083505 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 361.4, + "completions/mean_length": 219.5875, + "completions/min_length": 122.4, + "epoch": 0.9697551608257321, + "frac_reward_zero_std": 0.55, + "grad_norm": 0.17557469010353088, + "kl": 0.015576171875, + "learning_rate": 5.662960758476965e-06, + "loss": 0.0006231794133782387, + "memory(GiB)": 27.09, + "reward": 0.5015999853610993, + "reward_std": 0.08145869905129074, + "rewards/MMContentORM/mean": 0.6490000009536743, + "rewards/MMContentORM/std": 0.5806757628917694, + "rewards/MMFormatORM/mean": 0.6237499833106994, + "rewards/MMFormatORM/std": 0.08440345227718353, + "rewards/MMRubricORM/mean": -0.0375, + "rewards/MMRubricORM/std": 0.11831300854682922, + "step": 2020, + "train_speed(iter/s)": 0.083499 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 298.0, + "completions/mean_length": 211.525, + "completions/min_length": 150.0, + "epoch": 0.9721555448871819, + "frac_reward_zero_std": 0.675, + "grad_norm": 0.1798633337020874, + "kl": 0.016705322265625, + "learning_rate": 5.6432825101986725e-06, + "loss": 0.0006679143756628036, + "memory(GiB)": 27.09, + "reward": 0.49559998512268066, + "reward_std": 0.12614785209298135, + "rewards/MMContentORM/mean": 0.6465000152587891, + "rewards/MMContentORM/std": 0.632487416267395, + "rewards/MMFormatORM/mean": 0.6174999833106994, + "rewards/MMFormatORM/std": 0.10940345227718354, + "rewards/MMRubricORM/mean": -0.05, + "rewards/MMRubricORM/std": 0.16831300854682923, + "step": 2025, + "train_speed(iter/s)": 0.083521 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 322.6, + "completions/mean_length": 212.1125, + "completions/min_length": 137.8, + "epoch": 0.9745559289486317, + "frac_reward_zero_std": 0.7, + "grad_norm": 0.09048443287611008, + "kl": 0.01593017578125, + "learning_rate": 5.623594124934836e-06, + "loss": 0.0006376095581799746, + "memory(GiB)": 27.09, + "reward": 0.5258999943733216, + "reward_std": 0.03973939623683691, + "rewards/MMContentORM/mean": 0.6935000121593475, + "rewards/MMContentORM/std": 0.48069806694984435, + "rewards/MMFormatORM/mean": 0.6337499856948853, + "rewards/MMFormatORM/std": 0.06499999761581421, + "rewards/MMRubricORM/mean": -0.025, + "rewards/MMRubricORM/std": 0.1, + "step": 2030, + "train_speed(iter/s)": 0.083527 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0125, + "completions/max_length": 439.4, + "completions/mean_length": 219.675, + "completions/min_length": 147.2, + "epoch": 0.9769563130100816, + "frac_reward_zero_std": 0.8, + "grad_norm": 0.13792569935321808, + "kl": 0.01651611328125, + "learning_rate": 5.603895912939312e-06, + "loss": 0.0006604710128158331, + "memory(GiB)": 27.09, + "reward": 0.41819998621940613, + "reward_std": 0.08061017030850053, + "rewards/MMContentORM/mean": 0.453000009059906, + "rewards/MMContentORM/std": 0.6981132864952088, + "rewards/MMFormatORM/mean": 0.6174999833106994, + "rewards/MMFormatORM/std": 0.10940345227718354, + "rewards/MMRubricORM/mean": -0.05, + "rewards/MMRubricORM/std": 0.16831300854682923, + "step": 2035, + "train_speed(iter/s)": 0.083488 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 390.4, + "completions/mean_length": 225.675, + "completions/min_length": 144.2, + "epoch": 0.9793566970715314, + "frac_reward_zero_std": 0.6, + "grad_norm": 0.1497989296913147, + "kl": 0.01591796875, + "learning_rate": 5.584188184620803e-06, + "loss": 0.0006368092261254787, + "memory(GiB)": 27.09, + "reward": 0.43914997577667236, + "reward_std": 0.12904698103666307, + "rewards/MMContentORM/mean": 0.5485000073909759, + "rewards/MMContentORM/std": 0.6871413588523865, + "rewards/MMFormatORM/mean": 0.5931249976158142, + "rewards/MMFormatORM/std": 0.18240466713905334, + "rewards/MMRubricORM/mean": -0.0875, + "rewards/MMRubricORM/std": 0.2806225776672363, + "step": 2040, + "train_speed(iter/s)": 0.083459 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 297.4, + "completions/mean_length": 220.75, + "completions/min_length": 157.8, + "epoch": 0.9817570811329813, + "frac_reward_zero_std": 0.575, + "grad_norm": 0.13917264342308044, + "kl": 0.01572265625, + "learning_rate": 5.564471250537974e-06, + "loss": 0.0006287385243922472, + "memory(GiB)": 27.09, + "reward": 0.4864999830722809, + "reward_std": 0.07113494109362364, + "rewards/MMContentORM/mean": 0.5950000047683716, + "rewards/MMContentORM/std": 0.5641628682613373, + "rewards/MMFormatORM/mean": 0.6337499737739563, + "rewards/MMFormatORM/std": 0.04440345466136932, + "rewards/MMRubricORM/mean": -0.025, + "rewards/MMRubricORM/std": 0.06831300854682923, + "step": 2045, + "train_speed(iter/s)": 0.083475 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 317.2, + "completions/mean_length": 207.475, + "completions/min_length": 135.4, + "epoch": 0.9841574651944311, + "frac_reward_zero_std": 0.6, + "grad_norm": 0.14633502066135406, + "kl": 0.01990966796875, + "learning_rate": 5.544745421394554e-06, + "loss": 0.0007959839887917042, + "memory(GiB)": 27.09, + "reward": 0.5193999826908111, + "reward_std": 0.10493464283645153, + "rewards/MMContentORM/mean": 0.7060000181198121, + "rewards/MMContentORM/std": 0.5711513638496399, + "rewards/MMFormatORM/mean": 0.6174999952316285, + "rewards/MMFormatORM/std": 0.12999999523162842, + "rewards/MMRubricORM/mean": -0.05, + "rewards/MMRubricORM/std": 0.2, + "step": 2050, + "train_speed(iter/s)": 0.083485 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 350.4, + "completions/mean_length": 219.575, + "completions/min_length": 128.2, + "epoch": 0.986557849255881, + "frac_reward_zero_std": 0.65, + "grad_norm": 0.14857056736946106, + "kl": 0.01473388671875, + "learning_rate": 5.525011008034444e-06, + "loss": 0.00058915582485497, + "memory(GiB)": 27.09, + "reward": 0.44309998750686647, + "reward_std": 0.08216580655425787, + "rewards/MMContentORM/mean": 0.4865000069141388, + "rewards/MMContentORM/std": 0.6088324308395385, + "rewards/MMFormatORM/mean": 0.6337499737739563, + "rewards/MMFormatORM/std": 0.04440345466136932, + "rewards/MMRubricORM/mean": -0.025, + "rewards/MMRubricORM/std": 0.06831300854682923, + "step": 2055, + "train_speed(iter/s)": 0.083477 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 284.8, + "completions/mean_length": 202.225, + "completions/min_length": 140.0, + "epoch": 0.9889582333173308, + "frac_reward_zero_std": 0.575, + "grad_norm": 0.2547079622745514, + "kl": 0.01441650390625, + "learning_rate": 5.505268321436819e-06, + "loss": 0.0005766792222857475, + "memory(GiB)": 27.09, + "reward": 0.4755499839782715, + "reward_std": 0.08888332126662135, + "rewards/MMContentORM/mean": 0.5820000112056732, + "rewards/MMContentORM/std": 0.5622067280113697, + "rewards/MMFormatORM/mean": 0.6256249904632568, + "rewards/MMFormatORM/std": 0.09749999642372131, + "rewards/MMRubricORM/mean": -0.0375, + "rewards/MMRubricORM/std": 0.15, + "step": 2060, + "train_speed(iter/s)": 0.083503 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 308.4, + "completions/mean_length": 209.1, + "completions/min_length": 127.8, + "epoch": 0.9913586173787806, + "frac_reward_zero_std": 0.675, + "grad_norm": 0.08104149997234344, + "kl": 0.018072509765625, + "learning_rate": 5.485517672711221e-06, + "loss": 0.0007230919785797596, + "memory(GiB)": 27.09, + "reward": 0.47184998393058775, + "reward_std": 0.10203550313599408, + "rewards/MMContentORM/mean": 0.6015000164508819, + "rewards/MMContentORM/std": 0.5722574293613434, + "rewards/MMFormatORM/mean": 0.6093749761581421, + "rewards/MMFormatORM/std": 0.12130690813064575, + "rewards/MMRubricORM/mean": -0.0625, + "rewards/MMRubricORM/std": 0.18662601709365845, + "step": 2065, + "train_speed(iter/s)": 0.083517 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 330.0, + "completions/mean_length": 208.3, + "completions/min_length": 137.4, + "epoch": 0.9937590014402304, + "frac_reward_zero_std": 0.875, + "grad_norm": 0.12099120765924454, + "kl": 0.0142578125, + "learning_rate": 5.465759373092664e-06, + "loss": 0.0005701714660972356, + "memory(GiB)": 27.09, + "reward": 0.5222999691963196, + "reward_std": 0.03521391893737018, + "rewards/MMContentORM/mean": 0.6845000147819519, + "rewards/MMContentORM/std": 0.5714206457138061, + "rewards/MMFormatORM/mean": 0.6337499737739563, + "rewards/MMFormatORM/std": 0.04440345466136932, + "rewards/MMRubricORM/mean": -0.025, + "rewards/MMRubricORM/std": 0.06831300854682923, + "step": 2070, + "train_speed(iter/s)": 0.083524 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 305.6, + "completions/mean_length": 210.7875, + "completions/min_length": 146.0, + "epoch": 0.9961593855016803, + "frac_reward_zero_std": 0.6, + "grad_norm": 0.1514410525560379, + "kl": 0.017779541015625, + "learning_rate": 5.445993733936725e-06, + "loss": 0.0007106260396540165, + "memory(GiB)": 27.09, + "reward": 0.48714996576309205, + "reward_std": 0.09906565884593874, + "rewards/MMContentORM/mean": 0.611000019311905, + "rewards/MMContentORM/std": 0.5427431344985962, + "rewards/MMFormatORM/mean": 0.6256249785423279, + "rewards/MMFormatORM/std": 0.07690345346927643, + "rewards/MMRubricORM/mean": -0.0375, + "rewards/MMRubricORM/std": 0.11831300854682922, + "step": 2075, + "train_speed(iter/s)": 0.083534 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 288.0, + "completions/mean_length": 209.0375, + "completions/min_length": 130.2, + "epoch": 0.9985597695631301, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.12781541049480438, + "kl": 0.018701171875, + "learning_rate": 5.426221066714641e-06, + "loss": 0.0007481152191758156, + "memory(GiB)": 27.09, + "reward": 0.511499959230423, + "reward_std": 0.06349818790331482, + "rewards/MMContentORM/mean": 0.6575000166893006, + "rewards/MMContentORM/std": 0.5641934812068939, + "rewards/MMFormatORM/mean": 0.6337499737739563, + "rewards/MMFormatORM/std": 0.04440345466136932, + "rewards/MMRubricORM/mean": -0.025, + "rewards/MMRubricORM/std": 0.06831300854682923, + "step": 2080, + "train_speed(iter/s)": 0.083547 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 323.0, + "completions/mean_length": 215.65, + "completions/min_length": 134.6, + "epoch": 1.00096015362458, + "frac_reward_zero_std": 0.675, + "grad_norm": 0.004740873351693153, + "kl": 0.0146728515625, + "learning_rate": 5.406441683008395e-06, + "loss": 0.000586447911337018, + "memory(GiB)": 27.09, + "reward": 0.42514997720718384, + "reward_std": 0.11504627112299204, + "rewards/MMContentORM/mean": 0.5135000109672546, + "rewards/MMContentORM/std": 0.6854106187820435, + "rewards/MMFormatORM/mean": 0.5931249737739563, + "rewards/MMFormatORM/std": 0.14121158123016359, + "rewards/MMRubricORM/mean": -0.0875, + "rewards/MMRubricORM/std": 0.21724859476089478, + "step": 2085, + "train_speed(iter/s)": 0.083547 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 313.8, + "completions/mean_length": 222.7, + "completions/min_length": 157.2, + "epoch": 1.0033605376860297, + "frac_reward_zero_std": 0.675, + "grad_norm": 0.11715701222419739, + "kl": 0.01546630859375, + "learning_rate": 5.386655894505816e-06, + "loss": 0.0006185109727084636, + "memory(GiB)": 27.09, + "reward": 0.467849987745285, + "reward_std": 0.11221784176304936, + "rewards/MMContentORM/mean": 0.5915000081062317, + "rewards/MMContentORM/std": 0.6085148751735687, + "rewards/MMFormatORM/mean": 0.6093749761581421, + "rewards/MMFormatORM/std": 0.09680812656879426, + "rewards/MMRubricORM/mean": -0.0625, + "rewards/MMRubricORM/std": 0.14893558621406555, + "step": 2090, + "train_speed(iter/s)": 0.083558 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 297.0, + "completions/mean_length": 205.6875, + "completions/min_length": 148.0, + "epoch": 1.0057609217474797, + "frac_reward_zero_std": 0.725, + "grad_norm": 0.1382003277540207, + "kl": 0.014483642578125, + "learning_rate": 5.366864012995654e-06, + "loss": 0.0005779881961643696, + "memory(GiB)": 27.09, + "reward": 0.5312999904155731, + "reward_std": 0.06321534309536218, + "rewards/MMContentORM/mean": 0.7070000171661377, + "rewards/MMContentORM/std": 0.48905040323734283, + "rewards/MMFormatORM/mean": 0.6337499856948853, + "rewards/MMFormatORM/std": 0.06499999761581421, + "rewards/MMRubricORM/mean": -0.025, + "rewards/MMRubricORM/std": 0.1, + "step": 2095, + "train_speed(iter/s)": 0.083579 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.025, + "completions/max_length": 616.0, + "completions/mean_length": 230.3375, + "completions/min_length": 131.6, + "epoch": 1.0081613058089294, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.23635810613632202, + "kl": 0.0135009765625, + "learning_rate": 5.347066350362678e-06, + "loss": 0.0005402253940701484, + "memory(GiB)": 27.09, + "reward": 0.4407999932765961, + "reward_std": 0.11045007631182671, + "rewards/MMContentORM/mean": 0.5095000088214874, + "rewards/MMContentORM/std": 0.6070198595523835, + "rewards/MMFormatORM/mean": 0.6174999833106994, + "rewards/MMFormatORM/std": 0.10940345227718354, + "rewards/MMRubricORM/mean": -0.05, + "rewards/MMRubricORM/std": 0.16831300854682923, + "step": 2100, + "train_speed(iter/s)": 0.083454 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 311.4, + "completions/mean_length": 209.8375, + "completions/min_length": 126.0, + "epoch": 1.0105616898703793, + "frac_reward_zero_std": 0.6, + "grad_norm": 0.1306273341178894, + "kl": 0.014874267578125, + "learning_rate": 5.327263218582758e-06, + "loss": 0.0005949225276708602, + "memory(GiB)": 27.09, + "reward": 0.42854997515678406, + "reward_std": 0.12537002861499785, + "rewards/MMContentORM/mean": 0.496999990940094, + "rewards/MMContentORM/std": 0.6504930973052978, + "rewards/MMFormatORM/mean": 0.6056249976158142, + "rewards/MMFormatORM/std": 0.16487477123737335, + "rewards/MMRubricORM/mean": -0.0625, + "rewards/MMRubricORM/std": 0.25, + "step": 2105, + "train_speed(iter/s)": 0.08341 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 330.0, + "completions/mean_length": 221.3875, + "completions/min_length": 156.0, + "epoch": 1.012962073931829, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.18639177083969116, + "kl": 0.02562255859375, + "learning_rate": 5.307454929717944e-06, + "loss": 0.0010251142084598541, + "memory(GiB)": 27.09, + "reward": 0.43019998669624326, + "reward_std": 0.19318157732486724, + "rewards/MMContentORM/mean": 0.540500009059906, + "rewards/MMContentORM/std": 0.7170865178108216, + "rewards/MMFormatORM/mean": 0.5849999904632568, + "rewards/MMFormatORM/std": 0.19430812299251557, + "rewards/MMRubricORM/mean": -0.1, + "rewards/MMRubricORM/std": 0.2989355862140656, + "step": 2110, + "train_speed(iter/s)": 0.08342 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 318.8, + "completions/mean_length": 222.8375, + "completions/min_length": 147.4, + "epoch": 1.015362457993279, + "frac_reward_zero_std": 0.575, + "grad_norm": 0.10656420886516571, + "kl": 0.01639404296875, + "learning_rate": 5.28764179591156e-06, + "loss": 0.0006556062027812005, + "memory(GiB)": 27.09, + "reward": 0.4640499770641327, + "reward_std": 0.1481388673186302, + "rewards/MMContentORM/mean": 0.5820000290870666, + "rewards/MMContentORM/std": 0.6475385546684265, + "rewards/MMFormatORM/mean": 0.6093749880790711, + "rewards/MMFormatORM/std": 0.14190345108509064, + "rewards/MMRubricORM/mean": -0.0625, + "rewards/MMRubricORM/std": 0.21831300854682922, + "step": 2115, + "train_speed(iter/s)": 0.083433 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 312.8, + "completions/mean_length": 216.5875, + "completions/min_length": 150.4, + "epoch": 1.0177628420547287, + "frac_reward_zero_std": 0.675, + "grad_norm": 0.11035939306020737, + "kl": 0.017095947265625, + "learning_rate": 5.267824129383267e-06, + "loss": 0.000684003159403801, + "memory(GiB)": 27.09, + "reward": 0.5336999654769897, + "reward_std": 0.05642711999826133, + "rewards/MMContentORM/mean": 0.7130000233650208, + "rewards/MMContentORM/std": 0.5324123561382293, + "rewards/MMFormatORM/mean": 0.6337499856948853, + "rewards/MMFormatORM/std": 0.06499999761581421, + "rewards/MMRubricORM/mean": -0.025, + "rewards/MMRubricORM/std": 0.1, + "step": 2120, + "train_speed(iter/s)": 0.083443 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 306.8, + "completions/mean_length": 222.8125, + "completions/min_length": 165.8, + "epoch": 1.0201632261161786, + "frac_reward_zero_std": 0.575, + "grad_norm": 0.12353075295686722, + "kl": 0.01419677734375, + "learning_rate": 5.248002242424164e-06, + "loss": 0.0005672593601047992, + "memory(GiB)": 27.09, + "reward": 0.5149999797344208, + "reward_std": 0.11851109731942415, + "rewards/MMContentORM/mean": 0.6949999928474426, + "rewards/MMContentORM/std": 0.583082401752472, + "rewards/MMFormatORM/mean": 0.6174999833106994, + "rewards/MMFormatORM/std": 0.10940345227718354, + "rewards/MMRubricORM/mean": -0.05, + "rewards/MMRubricORM/std": 0.16831300854682923, + "step": 2125, + "train_speed(iter/s)": 0.08345 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 358.8, + "completions/mean_length": 225.7, + "completions/min_length": 148.6, + "epoch": 1.0225636101776283, + "frac_reward_zero_std": 0.7, + "grad_norm": 0.1546584665775299, + "kl": 0.01431884765625, + "learning_rate": 5.228176447391848e-06, + "loss": 0.0005729184485971928, + "memory(GiB)": 27.09, + "reward": 0.4828499734401703, + "reward_std": 0.1130663676187396, + "rewards/MMContentORM/mean": 0.6290000081062317, + "rewards/MMContentORM/std": 0.6190735220909118, + "rewards/MMFormatORM/mean": 0.6093749761581421, + "rewards/MMFormatORM/std": 0.09680812656879426, + "rewards/MMRubricORM/mean": -0.0625, + "rewards/MMRubricORM/std": 0.14893558621406555, + "step": 2130, + "train_speed(iter/s)": 0.083434 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 309.6, + "completions/mean_length": 212.1125, + "completions/min_length": 127.6, + "epoch": 1.0249639942390782, + "frac_reward_zero_std": 0.55, + "grad_norm": 0.15606635808944702, + "kl": 0.012115478515625, + "learning_rate": 5.208347056705506e-06, + "loss": 0.0004839696455746889, + "memory(GiB)": 27.09, + "reward": 0.4859499931335449, + "reward_std": 0.10585388457402586, + "rewards/MMContentORM/mean": 0.6080000042915344, + "rewards/MMContentORM/std": 0.5991616785526276, + "rewards/MMFormatORM/mean": 0.6256249904632568, + "rewards/MMFormatORM/std": 0.09749999642372131, + "rewards/MMRubricORM/mean": -0.0375, + "rewards/MMRubricORM/std": 0.15, + "step": 2135, + "train_speed(iter/s)": 0.083448 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 400.6, + "completions/mean_length": 233.0375, + "completions/min_length": 124.6, + "epoch": 1.0273643783005282, + "frac_reward_zero_std": 0.725, + "grad_norm": 0.10234855115413666, + "kl": 0.017791748046875, + "learning_rate": 5.188514382840984e-06, + "loss": 0.0007121129892766476, + "memory(GiB)": 27.09, + "reward": 0.42114998698234557, + "reward_std": 0.16525085866451264, + "rewards/MMContentORM/mean": 0.5035000085830689, + "rewards/MMContentORM/std": 0.714464795589447, + "rewards/MMFormatORM/mean": 0.5931249856948853, + "rewards/MMFormatORM/std": 0.18630690574645997, + "rewards/MMRubricORM/mean": -0.0875, + "rewards/MMRubricORM/std": 0.2866260170936584, + "step": 2140, + "train_speed(iter/s)": 0.08342 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0125, + "completions/max_length": 463.4, + "completions/mean_length": 223.875, + "completions/min_length": 127.6, + "epoch": 1.029764762361978, + "frac_reward_zero_std": 0.475, + "grad_norm": 0.2525361478328705, + "kl": 0.02169189453125, + "learning_rate": 5.168678738325863e-06, + "loss": 0.0008673015981912613, + "memory(GiB)": 27.09, + "reward": 0.4111499905586243, + "reward_std": 0.17882730215787887, + "rewards/MMContentORM/mean": 0.47850002646446227, + "rewards/MMContentORM/std": 0.7203184485435485, + "rewards/MMFormatORM/mean": 0.5931249976158142, + "rewards/MMFormatORM/std": 0.18240466713905334, + "rewards/MMRubricORM/mean": -0.0875, + "rewards/MMRubricORM/std": 0.2806225776672363, + "step": 2145, + "train_speed(iter/s)": 0.083368 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 304.6, + "completions/mean_length": 216.4375, + "completions/min_length": 143.0, + "epoch": 1.0321651464234278, + "frac_reward_zero_std": 0.6, + "grad_norm": 0.11111954599618912, + "kl": 0.020556640625, + "learning_rate": 5.148840435734542e-06, + "loss": 0.0008218312636017799, + "memory(GiB)": 27.09, + "reward": 0.41229996979236605, + "reward_std": 0.22641559094190597, + "rewards/MMContentORM/mean": 0.5245000123977661, + "rewards/MMContentORM/std": 0.6923137307167053, + "rewards/MMFormatORM/mean": 0.5687499940395355, + "rewards/MMFormatORM/std": 0.20804243683815002, + "rewards/MMRubricORM/mean": -0.125, + "rewards/MMRubricORM/std": 0.32006530165672303, + "step": 2150, + "train_speed(iter/s)": 0.083382 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 328.6, + "completions/mean_length": 219.275, + "completions/min_length": 136.8, + "epoch": 1.0345655304848775, + "frac_reward_zero_std": 0.675, + "grad_norm": 0.20036152005195618, + "kl": 0.01781005859375, + "learning_rate": 5.128999787683301e-06, + "loss": 0.0007116260938346386, + "memory(GiB)": 27.09, + "reward": 0.46249998211860655, + "reward_std": 0.11525840454269201, + "rewards/MMContentORM/mean": 0.5799999892711639, + "rewards/MMContentORM/std": 0.6871018171310425, + "rewards/MMFormatORM/mean": 0.6074999809265137, + "rewards/MMFormatORM/std": 0.12120826840400696, + "rewards/MMRubricORM/mean": -0.0625, + "rewards/MMRubricORM/std": 0.18662601709365845, + "step": 2155, + "train_speed(iter/s)": 0.083386 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 302.6, + "completions/mean_length": 213.925, + "completions/min_length": 129.0, + "epoch": 1.0369659145463275, + "frac_reward_zero_std": 0.6, + "grad_norm": 0.22052158415317535, + "kl": 0.021417236328125, + "learning_rate": 5.109157106825382e-06, + "loss": 0.0008573445491492748, + "memory(GiB)": 27.09, + "reward": 0.4300999820232391, + "reward_std": 0.1565534472465515, + "rewards/MMContentORM/mean": 0.5115000009536743, + "rewards/MMContentORM/std": 0.7336692571640014, + "rewards/MMFormatORM/mean": 0.6012499809265137, + "rewards/MMFormatORM/std": 0.12930812537670136, + "rewards/MMRubricORM/mean": -0.075, + "rewards/MMRubricORM/std": 0.19893558621406554, + "step": 2160, + "train_speed(iter/s)": 0.083405 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 306.6, + "completions/mean_length": 213.975, + "completions/min_length": 139.0, + "epoch": 1.0393662986077772, + "frac_reward_zero_std": 0.575, + "grad_norm": 0.1396639049053192, + "kl": 0.014166259765625, + "learning_rate": 5.089312705846059e-06, + "loss": 0.0005671509075909853, + "memory(GiB)": 27.09, + "reward": 0.48359997272491456, + "reward_std": 0.11030865609645843, + "rewards/MMContentORM/mean": 0.6164999902248383, + "rewards/MMContentORM/std": 0.6159408092498779, + "rewards/MMFormatORM/mean": 0.6174999833106994, + "rewards/MMFormatORM/std": 0.10940345227718354, + "rewards/MMRubricORM/mean": -0.05, + "rewards/MMRubricORM/std": 0.16831300854682923, + "step": 2165, + "train_speed(iter/s)": 0.083423 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.025, + "completions/max_length": 455.0, + "completions/mean_length": 234.1125, + "completions/min_length": 146.0, + "epoch": 1.0417666826692271, + "frac_reward_zero_std": 0.65, + "grad_norm": 0.1445954293012619, + "kl": 0.015972900390625, + "learning_rate": 5.069466897457716e-06, + "loss": 0.0006388931069523096, + "memory(GiB)": 27.09, + "reward": 0.4729499876499176, + "reward_std": 0.08181225277949125, + "rewards/MMContentORM/mean": 0.5754999935626983, + "rewards/MMContentORM/std": 0.6227695643901825, + "rewards/MMFormatORM/mean": 0.6256249785423279, + "rewards/MMFormatORM/std": 0.05240467190742493, + "rewards/MMRubricORM/mean": -0.0375, + "rewards/MMRubricORM/std": 0.08062257766723632, + "step": 2170, + "train_speed(iter/s)": 0.083381 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 309.8, + "completions/mean_length": 215.3875, + "completions/min_length": 138.2, + "epoch": 1.0441670667306768, + "frac_reward_zero_std": 0.725, + "grad_norm": 0.1612766534090042, + "kl": 0.01553955078125, + "learning_rate": 5.049619994394913e-06, + "loss": 0.0006216233130544424, + "memory(GiB)": 27.09, + "reward": 0.5203999698162078, + "reward_std": 0.09107535094954074, + "rewards/MMContentORM/mean": 0.7085000276565552, + "rewards/MMContentORM/std": 0.5656350731849671, + "rewards/MMFormatORM/mean": 0.6174999952316285, + "rewards/MMFormatORM/std": 0.12999999523162842, + "rewards/MMRubricORM/mean": -0.05, + "rewards/MMRubricORM/std": 0.2, + "step": 2175, + "train_speed(iter/s)": 0.083399 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 361.6, + "completions/mean_length": 226.025, + "completions/min_length": 151.8, + "epoch": 1.0465674507921268, + "frac_reward_zero_std": 0.725, + "grad_norm": 0.08171720802783966, + "kl": 0.016748046875, + "learning_rate": 5.029772309409458e-06, + "loss": 0.000670594209805131, + "memory(GiB)": 27.09, + "reward": 0.48629997968673705, + "reward_std": 0.04709330874029547, + "rewards/MMContentORM/mean": 0.5945000052452087, + "rewards/MMContentORM/std": 0.6320461511611939, + "rewards/MMFormatORM/mean": 0.6337499856948853, + "rewards/MMFormatORM/std": 0.06499999761581421, + "rewards/MMRubricORM/mean": -0.025, + "rewards/MMRubricORM/std": 0.1, + "step": 2180, + "train_speed(iter/s)": 0.083388 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 311.8, + "completions/mean_length": 226.75, + "completions/min_length": 144.4, + "epoch": 1.0489678348535765, + "frac_reward_zero_std": 0.725, + "grad_norm": 0.06735818833112717, + "kl": 0.01427001953125, + "learning_rate": 5.009924155265484e-06, + "loss": 0.0005706480704247951, + "memory(GiB)": 27.09, + "reward": 0.4983999729156494, + "reward_std": 0.08202438042499124, + "rewards/MMContentORM/mean": 0.653499984741211, + "rewards/MMContentORM/std": 0.600805139541626, + "rewards/MMFormatORM/mean": 0.6174999833106994, + "rewards/MMFormatORM/std": 0.10940345227718354, + "rewards/MMRubricORM/mean": -0.05, + "rewards/MMRubricORM/std": 0.16831300854682923, + "step": 2185, + "train_speed(iter/s)": 0.083397 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0125, + "completions/max_length": 522.6, + "completions/mean_length": 237.8, + "completions/min_length": 159.2, + "epoch": 1.0513682189150264, + "frac_reward_zero_std": 0.65, + "grad_norm": 0.2018902599811554, + "kl": 0.0145751953125, + "learning_rate": 4.9900758447345156e-06, + "loss": 0.0005828267894685268, + "memory(GiB)": 27.09, + "reward": 0.5095999836921692, + "reward_std": 0.09220672026276588, + "rewards/MMContentORM/mean": 0.6815000116825104, + "rewards/MMContentORM/std": 0.4883635245263577, + "rewards/MMFormatORM/mean": 0.6174999713897705, + "rewards/MMFormatORM/std": 0.08880690932273864, + "rewards/MMRubricORM/mean": -0.05, + "rewards/MMRubricORM/std": 0.13662601709365846, + "step": 2190, + "train_speed(iter/s)": 0.083317 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 305.6, + "completions/mean_length": 220.4125, + "completions/min_length": 160.6, + "epoch": 1.0537686029764763, + "frac_reward_zero_std": 0.65, + "grad_norm": 0.11942193657159805, + "kl": 0.01512451171875, + "learning_rate": 4.9702276905905435e-06, + "loss": 0.0006050709635019302, + "memory(GiB)": 27.09, + "reward": 0.5128499686717987, + "reward_std": 0.1390879033599049, + "rewards/MMContentORM/mean": 0.7040000319480896, + "rewards/MMContentORM/std": 0.5877701699733734, + "rewards/MMFormatORM/mean": 0.6093749761581421, + "rewards/MMFormatORM/std": 0.12130690813064575, + "rewards/MMRubricORM/mean": -0.0625, + "rewards/MMRubricORM/std": 0.18662601709365845, + "step": 2195, + "train_speed(iter/s)": 0.083333 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 309.6, + "completions/mean_length": 221.0, + "completions/min_length": 124.4, + "epoch": 1.056168987037926, + "frac_reward_zero_std": 0.65, + "grad_norm": 0.13161082565784454, + "kl": 0.014984130859375, + "learning_rate": 4.950380005605088e-06, + "loss": 0.0005988968070596457, + "memory(GiB)": 27.09, + "reward": 0.4813499927520752, + "reward_std": 0.07785245627164841, + "rewards/MMContentORM/mean": 0.5965000092983246, + "rewards/MMContentORM/std": 0.6116379499435425, + "rewards/MMFormatORM/mean": 0.6256249904632568, + "rewards/MMFormatORM/std": 0.09749999642372131, + "rewards/MMRubricORM/mean": -0.0375, + "rewards/MMRubricORM/std": 0.15, + "step": 2200, + "train_speed(iter/s)": 0.083343 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0125, + "completions/max_length": 454.6, + "completions/mean_length": 228.3125, + "completions/min_length": 129.4, + "epoch": 1.058569371099376, + "frac_reward_zero_std": 0.6, + "grad_norm": 0.1304662823677063, + "kl": 0.024444580078125, + "learning_rate": 4.9305331025422845e-06, + "loss": 0.0009780921041965484, + "memory(GiB)": 27.09, + "reward": 0.47104998826980593, + "reward_std": 0.08350930837914347, + "rewards/MMContentORM/mean": 0.5995000153779984, + "rewards/MMContentORM/std": 0.6014656841754913, + "rewards/MMFormatORM/mean": 0.6093749761581421, + "rewards/MMFormatORM/std": 0.09680812656879426, + "rewards/MMRubricORM/mean": -0.0625, + "rewards/MMRubricORM/std": 0.14893558621406555, + "step": 2205, + "train_speed(iter/s)": 0.083246 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0125, + "completions/max_length": 476.8, + "completions/mean_length": 229.775, + "completions/min_length": 156.0, + "epoch": 1.0609697551608257, + "frac_reward_zero_std": 0.6, + "grad_norm": 0.06856456398963928, + "kl": 0.017633056640625, + "learning_rate": 4.9106872941539424e-06, + "loss": 0.0007049092557281256, + "memory(GiB)": 27.09, + "reward": 0.42249998450279236, + "reward_std": 0.13392602608073503, + "rewards/MMContentORM/mean": 0.49250001311302183, + "rewards/MMContentORM/std": 0.633867347240448, + "rewards/MMFormatORM/mean": 0.6012499809265137, + "rewards/MMFormatORM/std": 0.12930812537670136, + "rewards/MMRubricORM/mean": -0.075, + "rewards/MMRubricORM/std": 0.19893558621406554, + "step": 2210, + "train_speed(iter/s)": 0.08319 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 312.4, + "completions/mean_length": 217.775, + "completions/min_length": 120.0, + "epoch": 1.0633701392222756, + "frac_reward_zero_std": 0.525, + "grad_norm": 0.1842622011899948, + "kl": 0.022515869140625, + "learning_rate": 4.8908428931746195e-06, + "loss": 0.0009016531519591808, + "memory(GiB)": 27.09, + "reward": 0.38274996876716616, + "reward_std": 0.20951574475038798, + "rewards/MMContentORM/mean": 0.4525000035762787, + "rewards/MMContentORM/std": 0.7619948863983155, + "rewards/MMFormatORM/mean": 0.5668749690055848, + "rewards/MMFormatORM/std": 0.19322119355201722, + "rewards/MMRubricORM/mean": -0.125, + "rewards/MMRubricORM/std": 0.2978711724281311, + "step": 2215, + "train_speed(iter/s)": 0.083199 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 323.0, + "completions/mean_length": 217.6875, + "completions/min_length": 138.0, + "epoch": 1.0657705232837253, + "frac_reward_zero_std": 0.825, + "grad_norm": 0.1208593100309372, + "kl": 0.014208984375, + "learning_rate": 4.871000212316701e-06, + "loss": 0.0005678186193108558, + "memory(GiB)": 27.09, + "reward": 0.514799976348877, + "reward_std": 0.011879390012472868, + "rewards/MMContentORM/mean": 0.6370000064373016, + "rewards/MMContentORM/std": 0.5178160190582275, + "rewards/MMFormatORM/mean": 0.6499999761581421, + "rewards/MMFormatORM/std": 0.0, + "rewards/MMRubricORM/mean": 0.0, + "rewards/MMRubricORM/std": 0.0, + "step": 2220, + "train_speed(iter/s)": 0.083209 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 303.4, + "completions/mean_length": 216.4875, + "completions/min_length": 151.2, + "epoch": 1.0681709073451753, + "frac_reward_zero_std": 0.725, + "grad_norm": 0.1443309485912323, + "kl": 0.013623046875, + "learning_rate": 4.851159564265459e-06, + "loss": 0.0005446367897093296, + "memory(GiB)": 27.09, + "reward": 0.41879999041557314, + "reward_std": 0.1261478431522846, + "rewards/MMContentORM/mean": 0.45449999570846555, + "rewards/MMContentORM/std": 0.6680951356887818, + "rewards/MMFormatORM/mean": 0.6174999833106994, + "rewards/MMFormatORM/std": 0.08490467071533203, + "rewards/MMRubricORM/mean": -0.05, + "rewards/MMRubricORM/std": 0.13062257766723634, + "step": 2225, + "train_speed(iter/s)": 0.083226 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 377.0, + "completions/mean_length": 227.0125, + "completions/min_length": 129.6, + "epoch": 1.070571291406625, + "frac_reward_zero_std": 0.575, + "grad_norm": 0.10511992871761322, + "kl": 0.01990966796875, + "learning_rate": 4.831321261674138e-06, + "loss": 0.0007973327301442623, + "memory(GiB)": 27.09, + "reward": 0.4790499657392502, + "reward_std": 0.20272751227021218, + "rewards/MMContentORM/mean": 0.6770000040531159, + "rewards/MMContentORM/std": 0.5581120260059833, + "rewards/MMFormatORM/mean": 0.5768749833106994, + "rewards/MMFormatORM/std": 0.17163621485233307, + "rewards/MMRubricORM/mean": -0.1125, + "rewards/MMRubricORM/std": 0.26405572295188906, + "step": 2230, + "train_speed(iter/s)": 0.08321 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 337.2, + "completions/mean_length": 217.4875, + "completions/min_length": 141.2, + "epoch": 1.072971675468075, + "frac_reward_zero_std": 0.55, + "grad_norm": 0.23232097923755646, + "kl": 0.01639404296875, + "learning_rate": 4.811485617159018e-06, + "loss": 0.0006561274174600839, + "memory(GiB)": 27.09, + "reward": 0.4388999938964844, + "reward_std": 0.14424977898597718, + "rewards/MMContentORM/mean": 0.5335000038146973, + "rewards/MMContentORM/std": 0.7047018647193909, + "rewards/MMFormatORM/mean": 0.6012499809265137, + "rewards/MMFormatORM/std": 0.15380690693855287, + "rewards/MMRubricORM/mean": -0.075, + "rewards/MMRubricORM/std": 0.23662601709365844, + "step": 2235, + "train_speed(iter/s)": 0.083209 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 357.8, + "completions/mean_length": 219.5125, + "completions/min_length": 151.4, + "epoch": 1.0753720595295246, + "frac_reward_zero_std": 0.725, + "grad_norm": 0.08917021751403809, + "kl": 0.015106201171875, + "learning_rate": 4.791652943294496e-06, + "loss": 0.0006044380366802216, + "memory(GiB)": 27.09, + "reward": 0.5175999701023102, + "reward_std": 0.11653119549155236, + "rewards/MMContentORM/mean": 0.7015000104904174, + "rewards/MMContentORM/std": 0.5376629948616027, + "rewards/MMFormatORM/mean": 0.6174999833106994, + "rewards/MMFormatORM/std": 0.10940345227718354, + "rewards/MMRubricORM/mean": -0.05, + "rewards/MMRubricORM/std": 0.16831300854682923, + "step": 2240, + "train_speed(iter/s)": 0.0832 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 319.2, + "completions/mean_length": 214.7125, + "completions/min_length": 126.2, + "epoch": 1.0777724435909746, + "frac_reward_zero_std": 0.675, + "grad_norm": 0.06148146465420723, + "kl": 0.0146240234375, + "learning_rate": 4.771823552608153e-06, + "loss": 0.0005852002650499344, + "memory(GiB)": 27.09, + "reward": 0.461699965596199, + "reward_std": 0.12600642547477037, + "rewards/MMContentORM/mean": 0.5905000075697899, + "rewards/MMContentORM/std": 0.5648605763912201, + "rewards/MMFormatORM/mean": 0.6012499868869782, + "rewards/MMFormatORM/std": 0.12313776612281799, + "rewards/MMRubricORM/mean": -0.075, + "rewards/MMRubricORM/std": 0.1894427239894867, + "step": 2245, + "train_speed(iter/s)": 0.083202 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 294.8, + "completions/mean_length": 219.975, + "completions/min_length": 158.6, + "epoch": 1.0801728276524245, + "frac_reward_zero_std": 0.675, + "grad_norm": 0.1749095767736435, + "kl": 0.019097900390625, + "learning_rate": 4.751997757575837e-06, + "loss": 0.0007640034891664982, + "memory(GiB)": 27.09, + "reward": 0.4941999793052673, + "reward_std": 0.10550032928586006, + "rewards/MMContentORM/mean": 0.6430000066757202, + "rewards/MMContentORM/std": 0.5990769028663635, + "rewards/MMFormatORM/mean": 0.6174999833106994, + "rewards/MMFormatORM/std": 0.10940345227718354, + "rewards/MMRubricORM/mean": -0.05, + "rewards/MMRubricORM/std": 0.16831300854682923, + "step": 2250, + "train_speed(iter/s)": 0.083217 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 336.4, + "completions/mean_length": 215.775, + "completions/min_length": 141.8, + "epoch": 1.0825732117138742, + "frac_reward_zero_std": 0.775, + "grad_norm": 0.12527073919773102, + "kl": 0.013714599609375, + "learning_rate": 4.732175870616734e-06, + "loss": 0.0005480511114001274, + "memory(GiB)": 27.09, + "reward": 0.5476999878883362, + "reward_std": 0.054164378554560244, + "rewards/MMContentORM/mean": 0.7479999840259552, + "rewards/MMContentORM/std": 0.419530663639307, + "rewards/MMFormatORM/mean": 0.6337499856948853, + "rewards/MMFormatORM/std": 0.06499999761581421, + "rewards/MMRubricORM/mean": -0.025, + "rewards/MMRubricORM/std": 0.1, + "step": 2255, + "train_speed(iter/s)": 0.083218 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 329.8, + "completions/mean_length": 209.775, + "completions/min_length": 131.2, + "epoch": 1.0849735957753242, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.00807945616543293, + "kl": 0.015020751953125, + "learning_rate": 4.71235820408844e-06, + "loss": 0.0006013516336679459, + "memory(GiB)": 27.09, + "reward": 0.5140499889850616, + "reward_std": 0.05989194584544748, + "rewards/MMContentORM/mean": 0.6495000064373017, + "rewards/MMContentORM/std": 0.4968143880367279, + "rewards/MMFormatORM/mean": 0.6418749809265136, + "rewards/MMFormatORM/std": 0.032499998807907104, + "rewards/MMRubricORM/mean": -0.0125, + "rewards/MMRubricORM/std": 0.05, + "step": 2260, + "train_speed(iter/s)": 0.08322 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 328.4, + "completions/mean_length": 217.525, + "completions/min_length": 136.4, + "epoch": 1.0873739798367739, + "frac_reward_zero_std": 0.575, + "grad_norm": 0.1432449072599411, + "kl": 0.01522216796875, + "learning_rate": 4.692545070282057e-06, + "loss": 0.0006086730398237705, + "memory(GiB)": 27.09, + "reward": 0.4586499869823456, + "reward_std": 0.15577562851831317, + "rewards/MMContentORM/mean": 0.5684999942779541, + "rewards/MMContentORM/std": 0.6647186577320099, + "rewards/MMFormatORM/mean": 0.6093749761581421, + "rewards/MMFormatORM/std": 0.12130690813064575, + "rewards/MMRubricORM/mean": -0.0625, + "rewards/MMRubricORM/std": 0.18662601709365845, + "step": 2265, + "train_speed(iter/s)": 0.083209 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 300.4, + "completions/mean_length": 214.075, + "completions/min_length": 138.8, + "epoch": 1.0897743638982238, + "frac_reward_zero_std": 0.6, + "grad_norm": 0.16632573306560516, + "kl": 0.013531494140625, + "learning_rate": 4.672736781417244e-06, + "loss": 0.000541134737432003, + "memory(GiB)": 27.09, + "reward": 0.521150004863739, + "reward_std": 0.1035911375656724, + "rewards/MMContentORM/mean": 0.6959999978542328, + "rewards/MMContentORM/std": 0.4113076165318489, + "rewards/MMFormatORM/mean": 0.6256249785423279, + "rewards/MMFormatORM/std": 0.07690345346927643, + "rewards/MMRubricORM/mean": -0.0375, + "rewards/MMRubricORM/std": 0.11831300854682922, + "step": 2270, + "train_speed(iter/s)": 0.083217 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 312.4, + "completions/mean_length": 212.2875, + "completions/min_length": 131.8, + "epoch": 1.0921747479596735, + "frac_reward_zero_std": 0.6, + "grad_norm": 0.10571814328432083, + "kl": 0.017926025390625, + "learning_rate": 4.652933649637323e-06, + "loss": 0.0007169050164520741, + "memory(GiB)": 27.09, + "reward": 0.48269999623298643, + "reward_std": 0.07311483474913985, + "rewards/MMContentORM/mean": 0.5855000197887421, + "rewards/MMContentORM/std": 0.5871885895729065, + "rewards/MMFormatORM/mean": 0.6337499856948853, + "rewards/MMFormatORM/std": 0.06499999761581421, + "rewards/MMRubricORM/mean": -0.025, + "rewards/MMRubricORM/std": 0.1, + "step": 2275, + "train_speed(iter/s)": 0.08323 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 340.4, + "completions/mean_length": 211.2625, + "completions/min_length": 140.2, + "epoch": 1.0945751320211234, + "frac_reward_zero_std": 0.675, + "grad_norm": 0.21430432796478271, + "kl": 0.015948486328125, + "learning_rate": 4.633135987004349e-06, + "loss": 0.0006380814127624034, + "memory(GiB)": 27.09, + "reward": 0.5692499935626983, + "reward_std": 0.02199101869482547, + "rewards/MMContentORM/mean": 0.7749999940395356, + "rewards/MMContentORM/std": 0.3264094144105911, + "rewards/MMFormatORM/mean": 0.6481249809265137, + "rewards/MMFormatORM/std": 0.007499998807907105, + "rewards/MMRubricORM/mean": 0.0, + "rewards/MMRubricORM/std": 0.0, + "step": 2280, + "train_speed(iter/s)": 0.083226 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 310.8, + "completions/mean_length": 211.85, + "completions/min_length": 151.2, + "epoch": 1.0969755160825732, + "frac_reward_zero_std": 0.55, + "grad_norm": 0.18268615007400513, + "kl": 0.01839599609375, + "learning_rate": 4.613344105494186e-06, + "loss": 0.0007361322641372681, + "memory(GiB)": 27.09, + "reward": 0.5117499828338623, + "reward_std": 0.12593571692705155, + "rewards/MMContentORM/mean": 0.7049999952316284, + "rewards/MMContentORM/std": 0.5719310343265533, + "rewards/MMFormatORM/mean": 0.6056249856948852, + "rewards/MMFormatORM/std": 0.15690345019102098, + "rewards/MMRubricORM/mean": -0.0625, + "rewards/MMRubricORM/std": 0.21831300854682922, + "step": 2285, + "train_speed(iter/s)": 0.083239 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 360.2, + "completions/mean_length": 219.5375, + "completions/min_length": 148.2, + "epoch": 1.099375900144023, + "frac_reward_zero_std": 0.6, + "grad_norm": 0.16588640213012695, + "kl": 0.017315673828125, + "learning_rate": 4.593558316991606e-06, + "loss": 0.000692180311307311, + "memory(GiB)": 27.09, + "reward": 0.4636499762535095, + "reward_std": 0.1040154074318707, + "rewards/MMContentORM/mean": 0.581000006198883, + "rewards/MMContentORM/std": 0.6416810989379883, + "rewards/MMFormatORM/mean": 0.6093749761581421, + "rewards/MMFormatORM/std": 0.09680812656879426, + "rewards/MMRubricORM/mean": -0.0625, + "rewards/MMRubricORM/std": 0.14893558621406555, + "step": 2290, + "train_speed(iter/s)": 0.083227 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0125, + "completions/max_length": 475.2, + "completions/mean_length": 220.275, + "completions/min_length": 109.0, + "epoch": 1.1017762842054728, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.1836349368095398, + "kl": 0.024261474609375, + "learning_rate": 4.57377893328536e-06, + "loss": 0.0009730796329677105, + "memory(GiB)": 27.09, + "reward": 0.4206999778747559, + "reward_std": 0.19699994921684266, + "rewards/MMContentORM/mean": 0.5455000042915344, + "rewards/MMContentORM/std": 0.7305709242820739, + "rewards/MMFormatORM/mean": 0.568749976158142, + "rewards/MMFormatORM/std": 0.21811503469944, + "rewards/MMRubricORM/mean": -0.125, + "rewards/MMRubricORM/std": 0.335561603307724, + "step": 2295, + "train_speed(iter/s)": 0.083164 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 324.0, + "completions/mean_length": 216.825, + "completions/min_length": 143.8, + "epoch": 1.1041766682669227, + "frac_reward_zero_std": 0.725, + "grad_norm": 0.13757243752479553, + "kl": 0.01383056640625, + "learning_rate": 4.554006266063276e-06, + "loss": 0.0005533020943403244, + "memory(GiB)": 27.09, + "reward": 0.49374998807907106, + "reward_std": 0.0761553971329704, + "rewards/MMContentORM/mean": 0.6274999976158142, + "rewards/MMContentORM/std": 0.6148638248443603, + "rewards/MMFormatORM/mean": 0.6256249785423279, + "rewards/MMFormatORM/std": 0.07690345346927643, + "rewards/MMRubricORM/mean": -0.0375, + "rewards/MMRubricORM/std": 0.11831300854682922, + "step": 2300, + "train_speed(iter/s)": 0.083171 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 299.2, + "completions/mean_length": 205.6125, + "completions/min_length": 133.4, + "epoch": 1.1065770523283724, + "frac_reward_zero_std": 0.7, + "grad_norm": 0.11595190316438675, + "kl": 0.017413330078125, + "learning_rate": 4.534240626907338e-06, + "loss": 0.0006965134758502245, + "memory(GiB)": 27.09, + "reward": 0.520749980211258, + "reward_std": 0.062296105083078146, + "rewards/MMContentORM/mean": 0.6950000166893006, + "rewards/MMContentORM/std": 0.579815822839737, + "rewards/MMFormatORM/mean": 0.6256249904632568, + "rewards/MMFormatORM/std": 0.09749999642372131, + "rewards/MMRubricORM/mean": -0.0375, + "rewards/MMRubricORM/std": 0.15, + "step": 2305, + "train_speed(iter/s)": 0.083141 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 340.0, + "completions/mean_length": 216.1375, + "completions/min_length": 147.8, + "epoch": 1.1089774363898224, + "frac_reward_zero_std": 0.65, + "grad_norm": 0.14087656140327454, + "kl": 0.0158203125, + "learning_rate": 4.51448232728878e-06, + "loss": 0.0006328361108899116, + "memory(GiB)": 27.09, + "reward": 0.4798499882221222, + "reward_std": 0.06993285585194826, + "rewards/MMContentORM/mean": 0.6215000033378602, + "rewards/MMContentORM/std": 0.5286586560308933, + "rewards/MMFormatORM/mean": 0.6093749761581421, + "rewards/MMFormatORM/std": 0.12130690813064575, + "rewards/MMRubricORM/mean": -0.0625, + "rewards/MMRubricORM/std": 0.18662601709365845, + "step": 2310, + "train_speed(iter/s)": 0.083141 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 307.0, + "completions/mean_length": 211.375, + "completions/min_length": 140.0, + "epoch": 1.1113778204512723, + "frac_reward_zero_std": 0.725, + "grad_norm": 0.1471603959798813, + "kl": 0.0154541015625, + "learning_rate": 4.494731678563182e-06, + "loss": 0.0006185553036630153, + "memory(GiB)": 27.09, + "reward": 0.482699978351593, + "reward_std": 0.03804234203416854, + "rewards/MMContentORM/mean": 0.5855000197887421, + "rewards/MMContentORM/std": 0.5835969924926758, + "rewards/MMFormatORM/mean": 0.6337499856948853, + "rewards/MMFormatORM/std": 0.06499999761581421, + "rewards/MMRubricORM/mean": -0.025, + "rewards/MMRubricORM/std": 0.1, + "step": 2315, + "train_speed(iter/s)": 0.083155 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 282.8, + "completions/mean_length": 208.625, + "completions/min_length": 132.4, + "epoch": 1.113778204512722, + "frac_reward_zero_std": 0.65, + "grad_norm": 0.15146136283874512, + "kl": 0.016741943359375, + "learning_rate": 4.474988991965556e-06, + "loss": 0.0006692257709801197, + "memory(GiB)": 27.09, + "reward": 0.47709997892379763, + "reward_std": 0.11455130190588533, + "rewards/MMContentORM/mean": 0.6165000081062317, + "rewards/MMContentORM/std": 0.6526979386806488, + "rewards/MMFormatORM/mean": 0.6074999928474426, + "rewards/MMFormatORM/std": 0.12490466833114625, + "rewards/MMRubricORM/mean": -0.0625, + "rewards/MMRubricORM/std": 0.18062257766723633, + "step": 2320, + "train_speed(iter/s)": 0.083179 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.4, + "completions/mean_length": 208.6375, + "completions/min_length": 136.4, + "epoch": 1.116178588574172, + "frac_reward_zero_std": 0.6, + "grad_norm": 0.11009304225444794, + "kl": 0.01671142578125, + "learning_rate": 4.455254578605447e-06, + "loss": 0.0006676350720226764, + "memory(GiB)": 27.09, + "reward": 0.499949985742569, + "reward_std": 0.053386559477075934, + "rewards/MMContentORM/mean": 0.6429999947547913, + "rewards/MMContentORM/std": 0.557970917224884, + "rewards/MMFormatORM/mean": 0.6256249785423279, + "rewards/MMFormatORM/std": 0.07690345346927643, + "rewards/MMRubricORM/mean": -0.0375, + "rewards/MMRubricORM/std": 0.11831300854682922, + "step": 2325, + "train_speed(iter/s)": 0.0832 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 328.6, + "completions/mean_length": 215.6625, + "completions/min_length": 134.6, + "epoch": 1.1185789726356217, + "frac_reward_zero_std": 0.6, + "grad_norm": 0.15660890936851501, + "kl": 0.016387939453125, + "learning_rate": 4.435528749462026e-06, + "loss": 0.0006562436930835247, + "memory(GiB)": 27.09, + "reward": 0.4931999921798706, + "reward_std": 0.11030865758657456, + "rewards/MMContentORM/mean": 0.6404999971389771, + "rewards/MMContentORM/std": 0.6184715509414673, + "rewards/MMFormatORM/mean": 0.6174999833106994, + "rewards/MMFormatORM/std": 0.10940345227718354, + "rewards/MMRubricORM/mean": -0.05, + "rewards/MMRubricORM/std": 0.16831300854682923, + "step": 2330, + "train_speed(iter/s)": 0.083202 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 372.6, + "completions/mean_length": 209.925, + "completions/min_length": 132.2, + "epoch": 1.1209793566970716, + "frac_reward_zero_std": 0.65, + "grad_norm": 0.005811932031065226, + "kl": 0.0166015625, + "learning_rate": 4.415811815379198e-06, + "loss": 0.000663516204804182, + "memory(GiB)": 27.09, + "reward": 0.5019499778747558, + "reward_std": 0.10755094066262245, + "rewards/MMContentORM/mean": 0.6479999959468842, + "rewards/MMContentORM/std": 0.5736066222190856, + "rewards/MMFormatORM/mean": 0.6256249785423279, + "rewards/MMFormatORM/std": 0.07690345346927643, + "rewards/MMRubricORM/mean": -0.0375, + "rewards/MMRubricORM/std": 0.11831300854682922, + "step": 2335, + "train_speed(iter/s)": 0.083186 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 306.2, + "completions/mean_length": 204.1875, + "completions/min_length": 136.4, + "epoch": 1.1233797407585213, + "frac_reward_zero_std": 0.725, + "grad_norm": 0.159584641456604, + "kl": 0.018511962890625, + "learning_rate": 4.396104087060689e-06, + "loss": 0.0007406437769532203, + "memory(GiB)": 27.09, + "reward": 0.45104997754096987, + "reward_std": 0.08619631510227918, + "rewards/MMContentORM/mean": 0.5495000213384629, + "rewards/MMContentORM/std": 0.5832443118095398, + "rewards/MMFormatORM/mean": 0.6093749761581421, + "rewards/MMFormatORM/std": 0.12130690813064575, + "rewards/MMRubricORM/mean": -0.0625, + "rewards/MMRubricORM/std": 0.18662601709365845, + "step": 2340, + "train_speed(iter/s)": 0.083202 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 366.2, + "completions/mean_length": 221.7875, + "completions/min_length": 137.0, + "epoch": 1.1257801248199713, + "frac_reward_zero_std": 0.6, + "grad_norm": 0.13437344133853912, + "kl": 0.017962646484375, + "learning_rate": 4.376405875065165e-06, + "loss": 0.0007189226802438497, + "memory(GiB)": 27.09, + "reward": 0.4833999931812286, + "reward_std": 0.12416795073077083, + "rewards/MMContentORM/mean": 0.6160000026226043, + "rewards/MMContentORM/std": 0.6286175012588501, + "rewards/MMFormatORM/mean": 0.6174999833106994, + "rewards/MMFormatORM/std": 0.10940345227718354, + "rewards/MMRubricORM/mean": -0.05, + "rewards/MMRubricORM/std": 0.16831300854682923, + "step": 2345, + "train_speed(iter/s)": 0.083187 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 313.0, + "completions/mean_length": 213.275, + "completions/min_length": 141.4, + "epoch": 1.128180508881421, + "frac_reward_zero_std": 0.675, + "grad_norm": 0.14463329315185547, + "kl": 0.0177001953125, + "learning_rate": 4.35671748980133e-06, + "loss": 0.0007080785930156708, + "memory(GiB)": 27.09, + "reward": 0.5173499882221222, + "reward_std": 0.06088189166039228, + "rewards/MMContentORM/mean": 0.6864999771118164, + "rewards/MMContentORM/std": 0.5560923993587494, + "rewards/MMFormatORM/mean": 0.6256249904632568, + "rewards/MMFormatORM/std": 0.09749999642372131, + "rewards/MMRubricORM/mean": -0.0375, + "rewards/MMRubricORM/std": 0.15, + "step": 2350, + "train_speed(iter/s)": 0.083196 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 320.6, + "completions/mean_length": 217.05, + "completions/min_length": 154.4, + "epoch": 1.130580892942871, + "frac_reward_zero_std": 0.725, + "grad_norm": 0.0970078557729721, + "kl": 0.013482666015625, + "learning_rate": 4.337039241523034e-06, + "loss": 0.0005393566098064184, + "memory(GiB)": 27.09, + "reward": 0.5178999722003936, + "reward_std": 0.04709331314079464, + "rewards/MMContentORM/mean": 0.6734999895095826, + "rewards/MMContentORM/std": 0.4659044176340103, + "rewards/MMFormatORM/mean": 0.6337499856948853, + "rewards/MMFormatORM/std": 0.06499999761581421, + "rewards/MMRubricORM/mean": -0.025, + "rewards/MMRubricORM/std": 0.1, + "step": 2355, + "train_speed(iter/s)": 0.08321 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 291.2, + "completions/mean_length": 209.6875, + "completions/min_length": 142.0, + "epoch": 1.1329812770043206, + "frac_reward_zero_std": 0.575, + "grad_norm": 0.16535647213459015, + "kl": 0.02020263671875, + "learning_rate": 4.3173714403243926e-06, + "loss": 0.0008074231445789337, + "memory(GiB)": 27.09, + "reward": 0.4483999729156494, + "reward_std": 0.1086115974234417, + "rewards/MMContentORM/mean": 0.5285000026226043, + "rewards/MMContentORM/std": 0.6377276480197906, + "rewards/MMFormatORM/mean": 0.6174999952316285, + "rewards/MMFormatORM/std": 0.12999999523162842, + "rewards/MMRubricORM/mean": -0.05, + "rewards/MMRubricORM/std": 0.2, + "step": 2360, + "train_speed(iter/s)": 0.08323 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 291.4, + "completions/mean_length": 208.6125, + "completions/min_length": 157.8, + "epoch": 1.1353816610657705, + "frac_reward_zero_std": 0.55, + "grad_norm": 0.1501249223947525, + "kl": 0.01739501953125, + "learning_rate": 4.2977143961348846e-06, + "loss": 0.0006959887687116861, + "memory(GiB)": 27.09, + "reward": 0.3825499892234802, + "reward_std": 0.1649680064059794, + "rewards/MMContentORM/mean": 0.4069999992847443, + "rewards/MMContentORM/std": 0.696259343624115, + "rewards/MMFormatORM/mean": 0.5931249856948853, + "rewards/MMFormatORM/std": 0.16180812418460847, + "rewards/MMRubricORM/mean": -0.0875, + "rewards/MMRubricORM/std": 0.24893558621406556, + "step": 2365, + "train_speed(iter/s)": 0.083254 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 294.4, + "completions/mean_length": 211.1875, + "completions/min_length": 141.6, + "epoch": 1.1377820451272203, + "frac_reward_zero_std": 0.525, + "grad_norm": 0.12787005305290222, + "kl": 0.016790771484375, + "learning_rate": 4.278068418714488e-06, + "loss": 0.000671594263985753, + "memory(GiB)": 27.09, + "reward": 0.4533999919891357, + "reward_std": 0.10776307452470064, + "rewards/MMContentORM/mean": 0.5409999907016754, + "rewards/MMContentORM/std": 0.589427363872528, + "rewards/MMFormatORM/mean": 0.6174999833106994, + "rewards/MMFormatORM/std": 0.10940345227718354, + "rewards/MMRubricORM/mean": -0.05, + "rewards/MMRubricORM/std": 0.16831300854682923, + "step": 2370, + "train_speed(iter/s)": 0.083268 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 320.6, + "completions/mean_length": 211.275, + "completions/min_length": 157.4, + "epoch": 1.1401824291886702, + "frac_reward_zero_std": 0.7, + "grad_norm": 0.08513356745243073, + "kl": 0.01527099609375, + "learning_rate": 4.258433817648778e-06, + "loss": 0.0006108290050178766, + "memory(GiB)": 27.09, + "reward": 0.5273999691009521, + "reward_std": 0.08343859422020614, + "rewards/MMContentORM/mean": 0.7134999930858612, + "rewards/MMContentORM/std": 0.4296311870217323, + "rewards/MMFormatORM/mean": 0.6237499833106994, + "rewards/MMFormatORM/std": 0.08440345227718353, + "rewards/MMRubricORM/mean": -0.0375, + "rewards/MMRubricORM/std": 0.11831300854682922, + "step": 2375, + "train_speed(iter/s)": 0.08327 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 328.0, + "completions/mean_length": 219.425, + "completions/min_length": 137.0, + "epoch": 1.1425828132501201, + "frac_reward_zero_std": 0.55, + "grad_norm": 0.14258858561515808, + "kl": 0.0161865234375, + "learning_rate": 4.238810902344065e-06, + "loss": 0.0006477432791143656, + "memory(GiB)": 27.09, + "reward": 0.45419996976852417, + "reward_std": 0.11737972022965551, + "rewards/MMContentORM/mean": 0.5430000007152558, + "rewards/MMContentORM/std": 0.6464896261692047, + "rewards/MMFormatORM/mean": 0.6174999833106994, + "rewards/MMFormatORM/std": 0.10940345227718354, + "rewards/MMRubricORM/mean": -0.05, + "rewards/MMRubricORM/std": 0.16831300854682923, + "step": 2380, + "train_speed(iter/s)": 0.083263 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 320.0, + "completions/mean_length": 213.075, + "completions/min_length": 133.4, + "epoch": 1.1449831973115698, + "frac_reward_zero_std": 0.675, + "grad_norm": 0.1284060925245285, + "kl": 0.01661376953125, + "learning_rate": 4.219199982022512e-06, + "loss": 0.0006643535569310188, + "memory(GiB)": 27.09, + "reward": 0.5382999777793884, + "reward_std": 0.0634981878567487, + "rewards/MMContentORM/mean": 0.7245000004768372, + "rewards/MMContentORM/std": 0.5121561586856842, + "rewards/MMFormatORM/mean": 0.6337499856948853, + "rewards/MMFormatORM/std": 0.06499999761581421, + "rewards/MMRubricORM/mean": -0.025, + "rewards/MMRubricORM/std": 0.1, + "step": 2385, + "train_speed(iter/s)": 0.083274 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 306.8, + "completions/mean_length": 217.0625, + "completions/min_length": 140.4, + "epoch": 1.1473835813730198, + "frac_reward_zero_std": 0.725, + "grad_norm": 0.14643016457557678, + "kl": 0.014398193359375, + "learning_rate": 4.199601365717259e-06, + "loss": 0.0005763438530266285, + "memory(GiB)": 27.09, + "reward": 0.506199985742569, + "reward_std": 0.033658286277204755, + "rewards/MMContentORM/mean": 0.6154999971389771, + "rewards/MMContentORM/std": 0.5775948464870453, + "rewards/MMFormatORM/mean": 0.6499999761581421, + "rewards/MMFormatORM/std": 0.0, + "rewards/MMRubricORM/mean": 0.0, + "rewards/MMRubricORM/std": 0.0, + "step": 2390, + "train_speed(iter/s)": 0.083288 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0125, + "completions/max_length": 458.2, + "completions/mean_length": 222.6375, + "completions/min_length": 136.6, + "epoch": 1.1497839654344695, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.16194605827331543, + "kl": 0.019708251953125, + "learning_rate": 4.180015362267564e-06, + "loss": 0.0007893730886280537, + "memory(GiB)": 27.09, + "reward": 0.4298999786376953, + "reward_std": 0.14891668558120727, + "rewards/MMContentORM/mean": 0.510999983549118, + "rewards/MMContentORM/std": 0.6927963614463806, + "rewards/MMFormatORM/mean": 0.6012499809265137, + "rewards/MMFormatORM/std": 0.15380690693855287, + "rewards/MMRubricORM/mean": -0.075, + "rewards/MMRubricORM/std": 0.23662601709365844, + "step": 2395, + "train_speed(iter/s)": 0.083235 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 335.2, + "completions/mean_length": 219.5625, + "completions/min_length": 139.8, + "epoch": 1.1521843494959194, + "frac_reward_zero_std": 0.6, + "grad_norm": 0.16206054389476776, + "kl": 0.016741943359375, + "learning_rate": 4.160442280313923e-06, + "loss": 0.0006699077785015106, + "memory(GiB)": 27.09, + "reward": 0.45619996786117556, + "reward_std": 0.11398561298847198, + "rewards/MMContentORM/mean": 0.5354999899864197, + "rewards/MMContentORM/std": 0.6260799109935761, + "rewards/MMFormatORM/mean": 0.6237499833106994, + "rewards/MMFormatORM/std": 0.07680481374263763, + "rewards/MMRubricORM/mean": -0.0375, + "rewards/MMRubricORM/std": 0.11831300854682922, + "step": 2400, + "train_speed(iter/s)": 0.08323 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 325.8, + "completions/mean_length": 203.65, + "completions/min_length": 100.8, + "epoch": 1.1545847335573691, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.10279816389083862, + "kl": 0.021893310546875, + "learning_rate": 4.14088242829321e-06, + "loss": 0.0008754994720220566, + "memory(GiB)": 27.09, + "reward": 0.4607499837875366, + "reward_std": 0.09623723030090332, + "rewards/MMContentORM/mean": 0.5450000107288361, + "rewards/MMContentORM/std": 0.5940271973609924, + "rewards/MMFormatORM/mean": 0.6256249904632568, + "rewards/MMFormatORM/std": 0.09749999642372131, + "rewards/MMRubricORM/mean": -0.0375, + "rewards/MMRubricORM/std": 0.15, + "step": 2405, + "train_speed(iter/s)": 0.083192 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 325.8, + "completions/mean_length": 207.1875, + "completions/min_length": 132.4, + "epoch": 1.156985117618819, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.17465510964393616, + "kl": 0.015850830078125, + "learning_rate": 4.121336114433825e-06, + "loss": 0.0006340592168271542, + "memory(GiB)": 27.09, + "reward": 0.39219998121261596, + "reward_std": 0.09814641983248293, + "rewards/MMContentORM/mean": 0.38800000250339506, + "rewards/MMContentORM/std": 0.6825608611106873, + "rewards/MMFormatORM/mean": 0.6174999833106994, + "rewards/MMFormatORM/std": 0.10940345227718354, + "rewards/MMRubricORM/mean": -0.05, + "rewards/MMRubricORM/std": 0.16831300854682923, + "step": 2410, + "train_speed(iter/s)": 0.0832 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 300.6, + "completions/mean_length": 209.975, + "completions/min_length": 150.6, + "epoch": 1.1593855016802688, + "frac_reward_zero_std": 0.675, + "grad_norm": 0.07339280098676682, + "kl": 0.021881103515625, + "learning_rate": 4.101803646750826e-06, + "loss": 0.0008749545551836491, + "memory(GiB)": 27.09, + "reward": 0.45749999284744264, + "reward_std": 0.13420886893291026, + "rewards/MMContentORM/mean": 0.5799999952316284, + "rewards/MMContentORM/std": 0.5645999349653721, + "rewards/MMFormatORM/mean": 0.6012499809265137, + "rewards/MMFormatORM/std": 0.10480934381484985, + "rewards/MMRubricORM/mean": -0.075, + "rewards/MMRubricORM/std": 0.16124515533447265, + "step": 2415, + "train_speed(iter/s)": 0.083212 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 379.0, + "completions/mean_length": 220.1, + "completions/min_length": 119.6, + "epoch": 1.1617858857417187, + "frac_reward_zero_std": 0.675, + "grad_norm": 0.17658497393131256, + "kl": 0.021868896484375, + "learning_rate": 4.082285333041085e-06, + "loss": 0.0008741414174437522, + "memory(GiB)": 27.09, + "reward": 0.5191999733448028, + "reward_std": 0.061801125714555386, + "rewards/MMContentORM/mean": 0.7055000066757202, + "rewards/MMContentORM/std": 0.5510667979717254, + "rewards/MMFormatORM/mean": 0.6174999713897705, + "rewards/MMFormatORM/std": 0.08880690932273864, + "rewards/MMRubricORM/mean": -0.05, + "rewards/MMRubricORM/std": 0.13662601709365846, + "step": 2420, + "train_speed(iter/s)": 0.083195 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 332.6, + "completions/mean_length": 217.0625, + "completions/min_length": 155.4, + "epoch": 1.1641862698031684, + "frac_reward_zero_std": 0.6, + "grad_norm": 0.23948155343532562, + "kl": 0.022308349609375, + "learning_rate": 4.062781480878426e-06, + "loss": 0.0008926920592784882, + "memory(GiB)": 27.09, + "reward": 0.42619999051094054, + "reward_std": 0.1429769902024418, + "rewards/MMContentORM/mean": 0.5305000066757202, + "rewards/MMContentORM/std": 0.6429137587547302, + "rewards/MMFormatORM/mean": 0.5849999785423279, + "rewards/MMFormatORM/std": 0.1737115800380707, + "rewards/MMRubricORM/mean": -0.1, + "rewards/MMRubricORM/std": 0.2672485947608948, + "step": 2425, + "train_speed(iter/s)": 0.083198 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 353.2, + "completions/mean_length": 212.85, + "completions/min_length": 139.4, + "epoch": 1.1665866538646184, + "frac_reward_zero_std": 0.65, + "grad_norm": 0.22126245498657227, + "kl": 0.015814208984375, + "learning_rate": 4.0432923976087915e-06, + "loss": 0.00063277967274189, + "memory(GiB)": 27.09, + "reward": 0.4893499791622162, + "reward_std": 0.07785245273262262, + "rewards/MMContentORM/mean": 0.6165000140666962, + "rewards/MMContentORM/std": 0.594619619846344, + "rewards/MMFormatORM/mean": 0.6256249904632568, + "rewards/MMFormatORM/std": 0.09749999642372131, + "rewards/MMRubricORM/mean": -0.0375, + "rewards/MMRubricORM/std": 0.15, + "step": 2430, + "train_speed(iter/s)": 0.083199 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 302.6, + "completions/mean_length": 210.85, + "completions/min_length": 119.6, + "epoch": 1.168987037926068, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.1325448900461197, + "kl": 0.027337646484375, + "learning_rate": 4.02381839034539e-06, + "loss": 0.0010928992182016374, + "memory(GiB)": 27.09, + "reward": 0.4188999950885773, + "reward_std": 0.14410835653543472, + "rewards/MMContentORM/mean": 0.4834999918937683, + "rewards/MMContentORM/std": 0.6920648813247681, + "rewards/MMFormatORM/mean": 0.6012499928474426, + "rewards/MMFormatORM/std": 0.17440344989299775, + "rewards/MMRubricORM/mean": -0.075, + "rewards/MMRubricORM/std": 0.2683130085468292, + "step": 2435, + "train_speed(iter/s)": 0.083209 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 336.8, + "completions/mean_length": 218.55, + "completions/min_length": 134.8, + "epoch": 1.171387421987518, + "frac_reward_zero_std": 0.675, + "grad_norm": 0.10885204374790192, + "kl": 0.0170654296875, + "learning_rate": 4.004359765963852e-06, + "loss": 0.0006822014227509498, + "memory(GiB)": 27.09, + "reward": 0.4416999638080597, + "reward_std": 0.15089658349752427, + "rewards/MMContentORM/mean": 0.540499997138977, + "rewards/MMContentORM/std": 0.6798429071903229, + "rewards/MMFormatORM/mean": 0.6012499928474426, + "rewards/MMFormatORM/std": 0.14990466833114624, + "rewards/MMRubricORM/mean": -0.075, + "rewards/MMRubricORM/std": 0.23062257766723632, + "step": 2440, + "train_speed(iter/s)": 0.083214 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 301.6, + "completions/mean_length": 212.0625, + "completions/min_length": 143.8, + "epoch": 1.173787806048968, + "frac_reward_zero_std": 0.675, + "grad_norm": 0.10752154886722565, + "kl": 0.0148681640625, + "learning_rate": 3.984916831097416e-06, + "loss": 0.0005946123506873846, + "memory(GiB)": 27.09, + "reward": 0.46099998354911803, + "reward_std": 0.11511698234826326, + "rewards/MMContentORM/mean": 0.5599999904632569, + "rewards/MMContentORM/std": 0.6491201877593994, + "rewards/MMFormatORM/mean": 0.6174999833106994, + "rewards/MMFormatORM/std": 0.10940345227718354, + "rewards/MMRubricORM/mean": -0.05, + "rewards/MMRubricORM/std": 0.16831300854682923, + "step": 2445, + "train_speed(iter/s)": 0.083234 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.025, + "completions/max_length": 588.4, + "completions/mean_length": 236.825, + "completions/min_length": 135.4, + "epoch": 1.1761881901104176, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.200127974152565, + "kl": 0.017510986328125, + "learning_rate": 3.965489892132067e-06, + "loss": 0.0007008564192801714, + "memory(GiB)": 27.09, + "reward": 0.419449982047081, + "reward_std": 0.19212091341614723, + "rewards/MMContentORM/mean": 0.5280000180006027, + "rewards/MMContentORM/std": 0.7196203231811523, + "rewards/MMFormatORM/mean": 0.5768749952316284, + "rewards/MMFormatORM/std": 0.16249999403953552, + "rewards/MMRubricORM/mean": -0.1125, + "rewards/MMRubricORM/std": 0.25, + "step": 2450, + "train_speed(iter/s)": 0.083144 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 290.8, + "completions/mean_length": 215.9375, + "completions/min_length": 165.4, + "epoch": 1.1785885741718676, + "frac_reward_zero_std": 0.55, + "grad_norm": 0.1706075668334961, + "kl": 0.0157958984375, + "learning_rate": 3.9460792552017345e-06, + "loss": 0.0006313313730061054, + "memory(GiB)": 27.09, + "reward": 0.4721999764442444, + "reward_std": 0.0885297678411007, + "rewards/MMContentORM/mean": 0.5880000114440918, + "rewards/MMContentORM/std": 0.6373610079288483, + "rewards/MMFormatORM/mean": 0.6174999833106994, + "rewards/MMFormatORM/std": 0.10940345227718354, + "rewards/MMRubricORM/mean": -0.05, + "rewards/MMRubricORM/std": 0.16831300854682923, + "step": 2455, + "train_speed(iter/s)": 0.083165 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 315.8, + "completions/mean_length": 211.5625, + "completions/min_length": 119.2, + "epoch": 1.1809889582333173, + "frac_reward_zero_std": 0.725, + "grad_norm": 0.13533374667167664, + "kl": 0.015020751953125, + "learning_rate": 3.9266852261834474e-06, + "loss": 0.0006013016682118177, + "memory(GiB)": 27.09, + "reward": 0.49929999113082885, + "reward_std": 0.055861435388214885, + "rewards/MMContentORM/mean": 0.6270000100135803, + "rewards/MMContentORM/std": 0.5022149316966533, + "rewards/MMFormatORM/mean": 0.6337499856948853, + "rewards/MMFormatORM/std": 0.06499999761581421, + "rewards/MMRubricORM/mean": -0.025, + "rewards/MMRubricORM/std": 0.1, + "step": 2460, + "train_speed(iter/s)": 0.083172 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 321.0, + "completions/mean_length": 219.2375, + "completions/min_length": 145.2, + "epoch": 1.1833893422947672, + "frac_reward_zero_std": 0.6, + "grad_norm": 0.10274124890565872, + "kl": 0.01768798828125, + "learning_rate": 3.9073081106925314e-06, + "loss": 0.0007067923899739981, + "memory(GiB)": 27.09, + "reward": 0.4602999687194824, + "reward_std": 0.14382552150636913, + "rewards/MMContentORM/mean": 0.5870000004768372, + "rewards/MMContentORM/std": 0.646770179271698, + "rewards/MMFormatORM/mean": 0.6012499809265137, + "rewards/MMFormatORM/std": 0.15380690693855287, + "rewards/MMRubricORM/mean": -0.075, + "rewards/MMRubricORM/std": 0.23662601709365844, + "step": 2465, + "train_speed(iter/s)": 0.083177 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 393.4, + "completions/mean_length": 222.5875, + "completions/min_length": 159.4, + "epoch": 1.185789726356217, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.14314013719558716, + "kl": 0.013629150390625, + "learning_rate": 3.887948214077782e-06, + "loss": 0.0005453084595501423, + "memory(GiB)": 27.09, + "reward": 0.43144997358322146, + "reward_std": 0.12409724295139313, + "rewards/MMContentORM/mean": 0.47549999356269834, + "rewards/MMContentORM/std": 0.6934274673461914, + "rewards/MMFormatORM/mean": 0.621874988079071, + "rewards/MMFormatORM/std": 0.11249999552965165, + "rewards/MMRubricORM/mean": -0.0375, + "rewards/MMRubricORM/std": 0.15, + "step": 2470, + "train_speed(iter/s)": 0.083159 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 349.8, + "completions/mean_length": 223.725, + "completions/min_length": 145.8, + "epoch": 1.1881901104176669, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.004896758124232292, + "kl": 0.01409912109375, + "learning_rate": 3.8686058414166504e-06, + "loss": 0.0005641079042106867, + "memory(GiB)": 27.09, + "reward": 0.4801999807357788, + "reward_std": 0.04567909836769104, + "rewards/MMContentORM/mean": 0.6080000057816506, + "rewards/MMContentORM/std": 0.5569998919963837, + "rewards/MMFormatORM/mean": 0.6174999833106994, + "rewards/MMFormatORM/std": 0.10940345227718354, + "rewards/MMRubricORM/mean": -0.05, + "rewards/MMRubricORM/std": 0.16831300854682923, + "step": 2475, + "train_speed(iter/s)": 0.083132 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0125, + "completions/max_length": 483.2, + "completions/mean_length": 228.875, + "completions/min_length": 141.6, + "epoch": 1.1905904944791166, + "frac_reward_zero_std": 0.6, + "grad_norm": 0.18111135065555573, + "kl": 0.018408203125, + "learning_rate": 3.849281297510454e-06, + "loss": 0.0007354037370532751, + "memory(GiB)": 27.09, + "reward": 0.41744997948408125, + "reward_std": 0.17444324921816587, + "rewards/MMContentORM/mean": 0.4980000019073486, + "rewards/MMContentORM/std": 0.6551562428474427, + "rewards/MMFormatORM/mean": 0.5893749833106995, + "rewards/MMFormatORM/std": 0.14223275929689408, + "rewards/MMRubricORM/mean": -0.0875, + "rewards/MMRubricORM/std": 0.1957427144050598, + "step": 2480, + "train_speed(iter/s)": 0.083083 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 301.0, + "completions/mean_length": 217.55, + "completions/min_length": 147.0, + "epoch": 1.1929908785405665, + "frac_reward_zero_std": 0.4, + "grad_norm": 0.2514013350009918, + "kl": 0.01690673828125, + "learning_rate": 3.829974886879549e-06, + "loss": 0.0006760565564036369, + "memory(GiB)": 27.09, + "reward": 0.3862499862909317, + "reward_std": 0.19424223005771638, + "rewards/MMContentORM/mean": 0.4199999988079071, + "rewards/MMContentORM/std": 0.7334108114242553, + "rewards/MMFormatORM/mean": 0.5893749713897705, + "rewards/MMFormatORM/std": 0.1641829013824463, + "rewards/MMRubricORM/mean": -0.0875, + "rewards/MMRubricORM/std": 0.24893558621406556, + "step": 2485, + "train_speed(iter/s)": 0.083096 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 323.2, + "completions/mean_length": 218.5625, + "completions/min_length": 153.8, + "epoch": 1.1953912626020164, + "frac_reward_zero_std": 0.65, + "grad_norm": 0.1668197512626648, + "kl": 0.014886474609375, + "learning_rate": 3.8106869137585507e-06, + "loss": 0.0005956954322755337, + "memory(GiB)": 27.09, + "reward": 0.48804997801780703, + "reward_std": 0.10062129367142916, + "rewards/MMContentORM/mean": 0.6295000076293945, + "rewards/MMContentORM/std": 0.5976639330387116, + "rewards/MMFormatORM/mean": 0.6156249880790711, + "rewards/MMFormatORM/std": 0.11690345108509063, + "rewards/MMRubricORM/mean": -0.05, + "rewards/MMRubricORM/std": 0.16831300854682923, + "step": 2490, + "train_speed(iter/s)": 0.0831 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 283.0, + "completions/mean_length": 209.3625, + "completions/min_length": 132.4, + "epoch": 1.1977916466634662, + "frac_reward_zero_std": 0.575, + "grad_norm": 0.127728670835495, + "kl": 0.02237548828125, + "learning_rate": 3.791417682091527e-06, + "loss": 0.0008956640027463436, + "memory(GiB)": 27.09, + "reward": 0.4219499886035919, + "reward_std": 0.1320168349891901, + "rewards/MMContentORM/mean": 0.505500003695488, + "rewards/MMContentORM/std": 0.6720031678676606, + "rewards/MMFormatORM/mean": 0.5931249856948853, + "rewards/MMFormatORM/std": 0.16180812418460847, + "rewards/MMRubricORM/mean": -0.0875, + "rewards/MMRubricORM/std": 0.24893558621406556, + "step": 2495, + "train_speed(iter/s)": 0.083119 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0125, + "completions/max_length": 453.2, + "completions/mean_length": 226.9875, + "completions/min_length": 155.8, + "epoch": 1.2001920307249159, + "frac_reward_zero_std": 0.6, + "grad_norm": 0.05174791067838669, + "kl": 0.0158203125, + "learning_rate": 3.7721674955272204e-06, + "loss": 0.0006332498509436846, + "memory(GiB)": 27.09, + "reward": 0.41349998116493225, + "reward_std": 0.19813132584095, + "rewards/MMContentORM/mean": 0.5149999976158142, + "rewards/MMContentORM/std": 0.6243453428149224, + "rewards/MMFormatORM/mean": 0.574999988079071, + "rewards/MMFormatORM/std": 0.17490466833114623, + "rewards/MMRubricORM/mean": -0.1125, + "rewards/MMRubricORM/std": 0.27006530165672304, + "step": 2500, + "train_speed(iter/s)": 0.083078 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 321.8, + "completions/mean_length": 216.775, + "completions/min_length": 137.4, + "epoch": 1.2025924147863658, + "frac_reward_zero_std": 0.7, + "grad_norm": 1.2531236410140991, + "kl": 0.06446533203125, + "learning_rate": 3.75293665741425e-06, + "loss": 0.002578136883676052, + "memory(GiB)": 27.09, + "reward": 0.36609998643398284, + "reward_std": 0.14919953048229218, + "rewards/MMContentORM/mean": 0.351500004529953, + "rewards/MMContentORM/std": 0.6848280310630799, + "rewards/MMFormatORM/mean": 0.6012499809265137, + "rewards/MMFormatORM/std": 0.15380690693855287, + "rewards/MMRubricORM/mean": -0.075, + "rewards/MMRubricORM/std": 0.23662601709365844, + "step": 2505, + "train_speed(iter/s)": 0.083051 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 264.2, + "completions/mean_length": 205.3, + "completions/min_length": 144.8, + "epoch": 1.2049927988478157, + "frac_reward_zero_std": 0.575, + "grad_norm": 0.17887341976165771, + "kl": 0.05023193359375, + "learning_rate": 3.7337254707963382e-06, + "loss": 0.0020127676427364348, + "memory(GiB)": 27.09, + "reward": 0.47439998388290405, + "reward_std": 0.07014498831704259, + "rewards/MMContentORM/mean": 0.593500018119812, + "rewards/MMContentORM/std": 0.6442030429840088, + "rewards/MMFormatORM/mean": 0.6174999833106994, + "rewards/MMFormatORM/std": 0.10940345227718354, + "rewards/MMRubricORM/mean": -0.05, + "rewards/MMRubricORM/std": 0.16831300854682923, + "step": 2510, + "train_speed(iter/s)": 0.083075 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 309.2, + "completions/mean_length": 218.0875, + "completions/min_length": 142.6, + "epoch": 1.2073931829092654, + "frac_reward_zero_std": 0.8, + "grad_norm": 0.14247167110443115, + "kl": 0.014263916015625, + "learning_rate": 3.714534238407543e-06, + "loss": 0.0005710631608963013, + "memory(GiB)": 27.09, + "reward": 0.5075999617576599, + "reward_std": 0.08768124505877495, + "rewards/MMContentORM/mean": 0.6765000104904175, + "rewards/MMContentORM/std": 0.5244777373969555, + "rewards/MMFormatORM/mean": 0.6174999833106994, + "rewards/MMFormatORM/std": 0.10940345227718354, + "rewards/MMRubricORM/mean": -0.05, + "rewards/MMRubricORM/std": 0.16831300854682923, + "step": 2515, + "train_speed(iter/s)": 0.083085 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 326.6, + "completions/mean_length": 210.3625, + "completions/min_length": 126.8, + "epoch": 1.2097935669707154, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.09162472188472748, + "kl": 0.01552734375, + "learning_rate": 3.695363262667468e-06, + "loss": 0.0006212275475263596, + "memory(GiB)": 27.09, + "reward": 0.4254499673843384, + "reward_std": 0.13145115275401623, + "rewards/MMContentORM/mean": 0.4855000078678131, + "rewards/MMContentORM/std": 0.6825006127357482, + "rewards/MMFormatORM/mean": 0.6093749761581421, + "rewards/MMFormatORM/std": 0.12130690813064575, + "rewards/MMRubricORM/mean": -0.0625, + "rewards/MMRubricORM/std": 0.18662601709365845, + "step": 2520, + "train_speed(iter/s)": 0.083091 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 323.6, + "completions/mean_length": 209.475, + "completions/min_length": 122.4, + "epoch": 1.212193951032165, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.1547505259513855, + "kl": 0.016241455078125, + "learning_rate": 3.6762128456765167e-06, + "loss": 0.0006503340788185597, + "memory(GiB)": 27.09, + "reward": 0.4827499806880951, + "reward_std": 0.0962372301146388, + "rewards/MMContentORM/mean": 0.6000000059604644, + "rewards/MMContentORM/std": 0.6012049317359924, + "rewards/MMFormatORM/mean": 0.6256249785423279, + "rewards/MMFormatORM/std": 0.07690345346927643, + "rewards/MMRubricORM/mean": -0.0375, + "rewards/MMRubricORM/std": 0.11831300854682922, + "step": 2525, + "train_speed(iter/s)": 0.083092 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 361.6, + "completions/mean_length": 210.95, + "completions/min_length": 120.6, + "epoch": 1.214594335093615, + "frac_reward_zero_std": 0.6, + "grad_norm": 0.13495810329914093, + "kl": 0.01824951171875, + "learning_rate": 3.657083289211119e-06, + "loss": 0.0007304124068468809, + "memory(GiB)": 27.09, + "reward": 0.5092499613761902, + "reward_std": 0.1385222177952528, + "rewards/MMContentORM/mean": 0.6825000166893005, + "rewards/MMContentORM/std": 0.595128345489502, + "rewards/MMFormatORM/mean": 0.6156249880790711, + "rewards/MMFormatORM/std": 0.13036334812641143, + "rewards/MMRubricORM/mean": -0.05, + "rewards/MMRubricORM/std": 0.2, + "step": 2530, + "train_speed(iter/s)": 0.083084 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 328.6, + "completions/mean_length": 217.025, + "completions/min_length": 131.6, + "epoch": 1.2169947191550647, + "frac_reward_zero_std": 0.675, + "grad_norm": 0.2071637213230133, + "kl": 0.014971923828125, + "learning_rate": 3.637974894718981e-06, + "loss": 0.0005985705181956291, + "memory(GiB)": 27.09, + "reward": 0.5091499626636505, + "reward_std": 0.0951058566570282, + "rewards/MMContentORM/mean": 0.6659999966621399, + "rewards/MMContentORM/std": 0.5581172168254852, + "rewards/MMFormatORM/mean": 0.6256249904632568, + "rewards/MMFormatORM/std": 0.09749999642372131, + "rewards/MMRubricORM/mean": -0.0375, + "rewards/MMRubricORM/std": 0.15, + "step": 2535, + "train_speed(iter/s)": 0.083093 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 380.0, + "completions/mean_length": 217.8125, + "completions/min_length": 125.4, + "epoch": 1.2193951032165147, + "frac_reward_zero_std": 0.65, + "grad_norm": 0.13724660873413086, + "kl": 0.017254638671875, + "learning_rate": 3.6188879633143363e-06, + "loss": 0.0006901083514094352, + "memory(GiB)": 27.09, + "reward": 0.4133999764919281, + "reward_std": 0.16942277988418936, + "rewards/MMContentORM/mean": 0.4985000193119049, + "rewards/MMContentORM/std": 0.6871892631053924, + "rewards/MMFormatORM/mean": 0.5849999904632568, + "rewards/MMFormatORM/std": 0.16980934143066406, + "rewards/MMRubricORM/mean": -0.1, + "rewards/MMRubricORM/std": 0.2612451553344727, + "step": 2540, + "train_speed(iter/s)": 0.083084 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0125, + "completions/max_length": 459.0, + "completions/mean_length": 222.625, + "completions/min_length": 129.4, + "epoch": 1.2217954872779644, + "frac_reward_zero_std": 0.725, + "grad_norm": 0.004516011103987694, + "kl": 0.015435791015625, + "learning_rate": 3.5998227957731925e-06, + "loss": 0.0006180405616760254, + "memory(GiB)": 27.09, + "reward": 0.4870499849319458, + "reward_std": 0.10429824888706207, + "rewards/MMContentORM/mean": 0.6394999861717224, + "rewards/MMContentORM/std": 0.6266542971134186, + "rewards/MMFormatORM/mean": 0.6093749880790711, + "rewards/MMFormatORM/std": 0.14190345108509064, + "rewards/MMRubricORM/mean": -0.0625, + "rewards/MMRubricORM/std": 0.21831300854682922, + "step": 2545, + "train_speed(iter/s)": 0.083042 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 365.0, + "completions/mean_length": 215.125, + "completions/min_length": 152.6, + "epoch": 1.2241958713394143, + "frac_reward_zero_std": 0.6, + "grad_norm": 0.1654830425977707, + "kl": 0.014935302734375, + "learning_rate": 3.580779692528606e-06, + "loss": 0.0005975381471216678, + "memory(GiB)": 27.09, + "reward": 0.45944997668266296, + "reward_std": 0.12353154704906047, + "rewards/MMContentORM/mean": 0.5705000042915345, + "rewards/MMContentORM/std": 0.6434941411018371, + "rewards/MMFormatORM/mean": 0.6093749880790711, + "rewards/MMFormatORM/std": 0.11740466952323914, + "rewards/MMRubricORM/mean": -0.0625, + "rewards/MMRubricORM/std": 0.18062257766723633, + "step": 2550, + "train_speed(iter/s)": 0.083034 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 289.4, + "completions/mean_length": 214.4125, + "completions/min_length": 145.4, + "epoch": 1.2265962554008643, + "frac_reward_zero_std": 0.65, + "grad_norm": 0.13121533393859863, + "kl": 0.015008544921875, + "learning_rate": 3.56175895366593e-06, + "loss": 0.0006003345362842083, + "memory(GiB)": 27.09, + "reward": 0.4941499769687653, + "reward_std": 0.09482301429379732, + "rewards/MMContentORM/mean": 0.6285000085830689, + "rewards/MMContentORM/std": 0.6090345978736877, + "rewards/MMFormatORM/mean": 0.6256249904632568, + "rewards/MMFormatORM/std": 0.09749999642372131, + "rewards/MMRubricORM/mean": -0.0375, + "rewards/MMRubricORM/std": 0.15, + "step": 2555, + "train_speed(iter/s)": 0.083055 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 361.8, + "completions/mean_length": 217.95, + "completions/min_length": 139.8, + "epoch": 1.228996639462314, + "frac_reward_zero_std": 0.65, + "grad_norm": 0.15669004619121552, + "kl": 0.017437744140625, + "learning_rate": 3.542760878918104e-06, + "loss": 0.0006977845449000597, + "memory(GiB)": 27.09, + "reward": 0.4942499816417694, + "reward_std": 0.11334921540692448, + "rewards/MMContentORM/mean": 0.657500022649765, + "rewards/MMContentORM/std": 0.5437608852982521, + "rewards/MMFormatORM/mean": 0.6093749880790711, + "rewards/MMFormatORM/std": 0.14190345108509064, + "rewards/MMRubricORM/mean": -0.0625, + "rewards/MMRubricORM/std": 0.21831300854682922, + "step": 2560, + "train_speed(iter/s)": 0.083051 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 308.8, + "completions/mean_length": 208.3375, + "completions/min_length": 120.2, + "epoch": 1.2313970235237637, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.151866614818573, + "kl": 0.021160888671875, + "learning_rate": 3.5237857676609146e-06, + "loss": 0.0008449718356132507, + "memory(GiB)": 27.09, + "reward": 0.49779998064041137, + "reward_std": 0.08174154479056597, + "rewards/MMContentORM/mean": 0.6519999980926514, + "rewards/MMContentORM/std": 0.4489804258570075, + "rewards/MMFormatORM/mean": 0.6174999833106994, + "rewards/MMFormatORM/std": 0.10940345227718354, + "rewards/MMRubricORM/mean": -0.05, + "rewards/MMRubricORM/std": 0.16831300854682923, + "step": 2565, + "train_speed(iter/s)": 0.083065 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0125, + "completions/max_length": 457.8, + "completions/mean_length": 224.7875, + "completions/min_length": 153.8, + "epoch": 1.2337974075852136, + "frac_reward_zero_std": 0.725, + "grad_norm": 0.10642041265964508, + "kl": 0.019122314453125, + "learning_rate": 3.504833918908285e-06, + "loss": 0.0007648383732885122, + "memory(GiB)": 27.09, + "reward": 0.4542999804019928, + "reward_std": 0.15117942336946727, + "rewards/MMContentORM/mean": 0.5719999969005585, + "rewards/MMContentORM/std": 0.6259439706802368, + "rewards/MMFormatORM/mean": 0.6012499749660491, + "rewards/MMFormatORM/std": 0.10254122316837311, + "rewards/MMRubricORM/mean": -0.075, + "rewards/MMRubricORM/std": 0.15775573253631592, + "step": 2570, + "train_speed(iter/s)": 0.083017 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 367.0, + "completions/mean_length": 212.3125, + "completions/min_length": 137.8, + "epoch": 1.2361977916466635, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.14486433565616608, + "kl": 0.015093994140625, + "learning_rate": 3.485905631307569e-06, + "loss": 0.0006040884181857109, + "memory(GiB)": 27.09, + "reward": 0.5060999631881714, + "reward_std": 0.14354267725721, + "rewards/MMContentORM/mean": 0.7014999866485596, + "rewards/MMContentORM/std": 0.598384690284729, + "rewards/MMFormatORM/mean": 0.6012499809265137, + "rewards/MMFormatORM/std": 0.15380690693855287, + "rewards/MMRubricORM/mean": -0.075, + "rewards/MMRubricORM/std": 0.23662601709365844, + "step": 2575, + "train_speed(iter/s)": 0.083015 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 309.2, + "completions/mean_length": 209.325, + "completions/min_length": 137.4, + "epoch": 1.2385981757081133, + "frac_reward_zero_std": 0.65, + "grad_norm": 0.15318423509597778, + "kl": 0.01575927734375, + "learning_rate": 3.4670012031348322e-06, + "loss": 0.0006300311535596848, + "memory(GiB)": 27.09, + "reward": 0.47869997620582583, + "reward_std": 0.1590990237891674, + "rewards/MMContentORM/mean": 0.6330000102519989, + "rewards/MMContentORM/std": 0.6443586707115173, + "rewards/MMFormatORM/mean": 0.6012499809265137, + "rewards/MMFormatORM/std": 0.15380690693855287, + "rewards/MMRubricORM/mean": -0.075, + "rewards/MMRubricORM/std": 0.23662601709365844, + "step": 2580, + "train_speed(iter/s)": 0.083024 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0125, + "completions/max_length": 444.8, + "completions/mean_length": 217.175, + "completions/min_length": 122.4, + "epoch": 1.2409985597695632, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.21531488001346588, + "kl": 0.01881103515625, + "learning_rate": 3.448120932290162e-06, + "loss": 0.0007515028119087219, + "memory(GiB)": 27.09, + "reward": 0.5039499998092651, + "reward_std": 0.09963134194258601, + "rewards/MMContentORM/mean": 0.653000009059906, + "rewards/MMContentORM/std": 0.6097694873809815, + "rewards/MMFormatORM/mean": 0.6256249785423279, + "rewards/MMFormatORM/std": 0.07690345346927643, + "rewards/MMRubricORM/mean": -0.0375, + "rewards/MMRubricORM/std": 0.11831300854682922, + "step": 2585, + "train_speed(iter/s)": 0.082982 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 302.8, + "completions/mean_length": 211.8375, + "completions/min_length": 133.4, + "epoch": 1.243398943831013, + "frac_reward_zero_std": 0.725, + "grad_norm": 0.09009666740894318, + "kl": 0.064111328125, + "learning_rate": 3.4292651162929646e-06, + "loss": 0.0025743709877133368, + "memory(GiB)": 27.09, + "reward": 0.4833499789237976, + "reward_std": 0.08520637114997953, + "rewards/MMContentORM/mean": 0.6015000224113465, + "rewards/MMContentORM/std": 0.5234865859150887, + "rewards/MMFormatORM/mean": 0.6256249904632568, + "rewards/MMFormatORM/std": 0.09749999642372131, + "rewards/MMRubricORM/mean": -0.0375, + "rewards/MMRubricORM/std": 0.15, + "step": 2590, + "train_speed(iter/s)": 0.082999 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 318.2, + "completions/mean_length": 213.6375, + "completions/min_length": 128.0, + "epoch": 1.2457993278924628, + "frac_reward_zero_std": 0.725, + "grad_norm": 0.11567575484514236, + "kl": 0.02685546875, + "learning_rate": 3.4104340522772892e-06, + "loss": 0.0010737581178545952, + "memory(GiB)": 27.09, + "reward": 0.4908499777317047, + "reward_std": 0.128339883685112, + "rewards/MMContentORM/mean": 0.6490000128746033, + "rewards/MMContentORM/std": 0.6057178854942322, + "rewards/MMFormatORM/mean": 0.6093749880790711, + "rewards/MMFormatORM/std": 0.14190345108509064, + "rewards/MMRubricORM/mean": -0.0625, + "rewards/MMRubricORM/std": 0.21831300854682922, + "step": 2595, + "train_speed(iter/s)": 0.083011 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0125, + "completions/max_length": 487.2, + "completions/mean_length": 232.225, + "completions/min_length": 159.8, + "epoch": 1.2481997119539125, + "frac_reward_zero_std": 0.575, + "grad_norm": 0.19209064543247223, + "kl": 0.01651611328125, + "learning_rate": 3.391628036987131e-06, + "loss": 0.0006616008933633566, + "memory(GiB)": 27.09, + "reward": 0.49179998636245725, + "reward_std": 0.10097484849393368, + "rewards/MMContentORM/mean": 0.6370000183582306, + "rewards/MMContentORM/std": 0.5937303185462952, + "rewards/MMFormatORM/mean": 0.6174999833106994, + "rewards/MMFormatORM/std": 0.10940345227718354, + "rewards/MMRubricORM/mean": -0.05, + "rewards/MMRubricORM/std": 0.16831300854682923, + "step": 2600, + "train_speed(iter/s)": 0.082964 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 291.4, + "completions/mean_length": 206.225, + "completions/min_length": 122.6, + "epoch": 1.2506000960153625, + "frac_reward_zero_std": 0.65, + "grad_norm": 0.15095524489879608, + "kl": 0.03800048828125, + "learning_rate": 3.3728473667717624e-06, + "loss": 0.0015181325376033782, + "memory(GiB)": 27.09, + "reward": 0.47339999079704287, + "reward_std": 0.12077383659780025, + "rewards/MMContentORM/mean": 0.5910000026226043, + "rewards/MMContentORM/std": 0.6444690108299256, + "rewards/MMFormatORM/mean": 0.6174999833106994, + "rewards/MMFormatORM/std": 0.10940345227718354, + "rewards/MMRubricORM/mean": -0.05, + "rewards/MMRubricORM/std": 0.16831300854682923, + "step": 2605, + "train_speed(iter/s)": 0.08294 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 314.0, + "completions/mean_length": 215.8875, + "completions/min_length": 140.6, + "epoch": 1.2530004800768122, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.1393103450536728, + "kl": 0.013934326171875, + "learning_rate": 3.3540923375810687e-06, + "loss": 0.0005574138835072517, + "memory(GiB)": 27.09, + "reward": 0.5067499876022339, + "reward_std": 0.07643824107944966, + "rewards/MMContentORM/mean": 0.6599999904632569, + "rewards/MMContentORM/std": 0.5702946484088898, + "rewards/MMFormatORM/mean": 0.6256249785423279, + "rewards/MMFormatORM/std": 0.07690345346927643, + "rewards/MMRubricORM/mean": -0.0375, + "rewards/MMRubricORM/std": 0.11831300854682922, + "step": 2610, + "train_speed(iter/s)": 0.082956 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 293.0, + "completions/mean_length": 205.475, + "completions/min_length": 120.6, + "epoch": 1.2554008641382621, + "frac_reward_zero_std": 0.55, + "grad_norm": 0.166556254029274, + "kl": 0.01988525390625, + "learning_rate": 3.3353632449608703e-06, + "loss": 0.0007944651879370213, + "memory(GiB)": 27.09, + "reward": 0.3788499802350998, + "reward_std": 0.20470740869641305, + "rewards/MMContentORM/mean": 0.4265000134706497, + "rewards/MMContentORM/std": 0.7319641351699829, + "rewards/MMFormatORM/mean": 0.576874977350235, + "rewards/MMFormatORM/std": 0.17944467663764954, + "rewards/MMRubricORM/mean": -0.1125, + "rewards/MMRubricORM/std": 0.27606874108314516, + "step": 2615, + "train_speed(iter/s)": 0.082975 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 290.4, + "completions/mean_length": 211.975, + "completions/min_length": 123.8, + "epoch": 1.257801248199712, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.17242266237735748, + "kl": 0.03348388671875, + "learning_rate": 3.3166603840482815e-06, + "loss": 0.0013363593257963657, + "memory(GiB)": 27.09, + "reward": 0.4672499895095825, + "reward_std": 0.14361337879672648, + "rewards/MMContentORM/mean": 0.5900000095367431, + "rewards/MMContentORM/std": 0.5261604383587837, + "rewards/MMFormatORM/mean": 0.6093749761581421, + "rewards/MMFormatORM/std": 0.12130690813064575, + "rewards/MMRubricORM/mean": -0.0625, + "rewards/MMRubricORM/std": 0.18662601709365845, + "step": 2620, + "train_speed(iter/s)": 0.08299 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 301.8, + "completions/mean_length": 214.225, + "completions/min_length": 145.2, + "epoch": 1.2602016322611618, + "frac_reward_zero_std": 0.675, + "grad_norm": 0.12951968610286713, + "kl": 0.016705322265625, + "learning_rate": 3.297984049567041e-06, + "loss": 0.0006685989443212748, + "memory(GiB)": 27.09, + "reward": 0.4681499719619751, + "reward_std": 0.12480434402823448, + "rewards/MMContentORM/mean": 0.6210000038146972, + "rewards/MMContentORM/std": 0.6814538955688476, + "rewards/MMFormatORM/mean": 0.5931249856948853, + "rewards/MMFormatORM/std": 0.16180812418460847, + "rewards/MMRubricORM/mean": -0.0875, + "rewards/MMRubricORM/std": 0.24893558621406556, + "step": 2625, + "train_speed(iter/s)": 0.083002 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 316.8, + "completions/mean_length": 208.0875, + "completions/min_length": 115.2, + "epoch": 1.2626020163226115, + "frac_reward_zero_std": 0.7, + "grad_norm": 0.0806797593832016, + "kl": 0.017120361328125, + "learning_rate": 3.2793345358228935e-06, + "loss": 0.000685088150203228, + "memory(GiB)": 27.09, + "reward": 0.4675999879837036, + "reward_std": 0.09291383468080312, + "rewards/MMContentORM/mean": 0.6089999973773956, + "rewards/MMContentORM/std": 0.6673885464668274, + "rewards/MMFormatORM/mean": 0.5974999785423278, + "rewards/MMFormatORM/std": 0.1443081244826317, + "rewards/MMRubricORM/mean": -0.075, + "rewards/MMRubricORM/std": 0.19893558621406554, + "step": 2630, + "train_speed(iter/s)": 0.083009 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 320.4, + "completions/mean_length": 225.5125, + "completions/min_length": 153.2, + "epoch": 1.2650024003840614, + "frac_reward_zero_std": 0.65, + "grad_norm": 0.1779165416955948, + "kl": 0.016461181640625, + "learning_rate": 3.2607121366989216e-06, + "loss": 0.0006584211252629757, + "memory(GiB)": 27.09, + "reward": 0.4672499716281891, + "reward_std": 0.10175266489386559, + "rewards/MMContentORM/mean": 0.5775000095367432, + "rewards/MMContentORM/std": 0.6095366299152374, + "rewards/MMFormatORM/mean": 0.6156249880790711, + "rewards/MMFormatORM/std": 0.11690345108509063, + "rewards/MMRubricORM/mean": -0.05, + "rewards/MMRubricORM/std": 0.16831300854682923, + "step": 2635, + "train_speed(iter/s)": 0.083015 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 346.8, + "completions/mean_length": 221.1875, + "completions/min_length": 122.8, + "epoch": 1.2674027844455114, + "frac_reward_zero_std": 0.7, + "grad_norm": 0.17611438035964966, + "kl": 0.022784423828125, + "learning_rate": 3.242117145650936e-06, + "loss": 0.0009104921482503414, + "memory(GiB)": 27.09, + "reward": 0.4905499994754791, + "reward_std": 0.13272394693922251, + "rewards/MMContentORM/mean": 0.6645000100135803, + "rewards/MMContentORM/std": 0.5864324927330017, + "rewards/MMFormatORM/mean": 0.5993749856948852, + "rewards/MMFormatORM/std": 0.12920948565006257, + "rewards/MMRubricORM/mean": -0.075, + "rewards/MMRubricORM/std": 0.19893558621406554, + "step": 2640, + "train_speed(iter/s)": 0.083009 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 296.0, + "completions/mean_length": 212.8875, + "completions/min_length": 133.4, + "epoch": 1.269803168506961, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.1824599802494049, + "kl": 0.016510009765625, + "learning_rate": 3.2235498557028487e-06, + "loss": 0.0006610351148992777, + "memory(GiB)": 27.09, + "reward": 0.3902499735355377, + "reward_std": 0.1405021134763956, + "rewards/MMContentORM/mean": 0.3975000023841858, + "rewards/MMContentORM/std": 0.6953428506851196, + "rewards/MMFormatORM/mean": 0.6093749761581421, + "rewards/MMFormatORM/std": 0.12130690813064575, + "rewards/MMRubricORM/mean": -0.0625, + "rewards/MMRubricORM/std": 0.18662601709365845, + "step": 2645, + "train_speed(iter/s)": 0.083028 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0125, + "completions/max_length": 534.6, + "completions/mean_length": 226.7, + "completions/min_length": 137.4, + "epoch": 1.272203552568411, + "frac_reward_zero_std": 0.575, + "grad_norm": 0.17883527278900146, + "kl": 0.01495361328125, + "learning_rate": 3.2050105594420463e-06, + "loss": 0.0005979948677122593, + "memory(GiB)": 27.09, + "reward": 0.4455499768257141, + "reward_std": 0.17939299046993257, + "rewards/MMContentORM/mean": 0.5645000159740448, + "rewards/MMContentORM/std": 0.68505859375, + "rewards/MMFormatORM/mean": 0.5931249856948853, + "rewards/MMFormatORM/std": 0.18630690574645997, + "rewards/MMRubricORM/mean": -0.0875, + "rewards/MMRubricORM/std": 0.2866260170936584, + "step": 2650, + "train_speed(iter/s)": 0.082961 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0125, + "completions/max_length": 457.2, + "completions/mean_length": 217.825, + "completions/min_length": 132.0, + "epoch": 1.2746039366298607, + "frac_reward_zero_std": 0.55, + "grad_norm": 0.15864062309265137, + "kl": 0.01883544921875, + "learning_rate": 3.186499549014788e-06, + "loss": 0.0007538811769336462, + "memory(GiB)": 27.09, + "reward": 0.3846499860286713, + "reward_std": 0.18024151772260666, + "rewards/MMContentORM/mean": 0.4284999966621399, + "rewards/MMContentORM/std": 0.7736505270004272, + "rewards/MMFormatORM/mean": 0.5831249833106995, + "rewards/MMFormatORM/std": 0.19467147588729858, + "rewards/MMRubricORM/mean": -0.1, + "rewards/MMRubricORM/std": 0.2989355862140656, + "step": 2655, + "train_speed(iter/s)": 0.08292 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 318.2, + "completions/mean_length": 220.7875, + "completions/min_length": 154.4, + "epoch": 1.2770043206913106, + "frac_reward_zero_std": 0.65, + "grad_norm": 0.20636998116970062, + "kl": 0.0220458984375, + "learning_rate": 3.168017116121594e-06, + "loss": 0.0008822778239846229, + "memory(GiB)": 27.09, + "reward": 0.5081999719142913, + "reward_std": 0.09531799275428057, + "rewards/MMContentORM/mean": 0.6780000150203704, + "rewards/MMContentORM/std": 0.5256944436579942, + "rewards/MMFormatORM/mean": 0.6174999833106994, + "rewards/MMFormatORM/std": 0.10940345227718354, + "rewards/MMRubricORM/mean": -0.05, + "rewards/MMRubricORM/std": 0.16831300854682923, + "step": 2660, + "train_speed(iter/s)": 0.082931 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 306.8, + "completions/mean_length": 215.725, + "completions/min_length": 121.8, + "epoch": 1.2794047047527606, + "frac_reward_zero_std": 0.575, + "grad_norm": 0.10092300921678543, + "kl": 0.023101806640625, + "learning_rate": 3.149563552012662e-06, + "loss": 0.0009249597787857056, + "memory(GiB)": 27.09, + "reward": 0.47155000567436217, + "reward_std": 0.15959399938583374, + "rewards/MMContentORM/mean": 0.6170000016689301, + "rewards/MMContentORM/std": 0.6519209682941437, + "rewards/MMFormatORM/mean": 0.5993749856948852, + "rewards/MMFormatORM/std": 0.17476680278778076, + "rewards/MMRubricORM/mean": -0.075, + "rewards/MMRubricORM/std": 0.2683130085468292, + "step": 2665, + "train_speed(iter/s)": 0.08294 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0125, + "completions/max_length": 444.0, + "completions/mean_length": 218.575, + "completions/min_length": 132.6, + "epoch": 1.2818050888142103, + "frac_reward_zero_std": 0.65, + "grad_norm": 0.16917936503887177, + "kl": 0.01629638671875, + "learning_rate": 3.1311391474832596e-06, + "loss": 0.0006522711366415024, + "memory(GiB)": 27.09, + "reward": 0.4537999749183655, + "reward_std": 0.0890954568516463, + "rewards/MMContentORM/mean": 0.542000013589859, + "rewards/MMContentORM/std": 0.6675113797187805, + "rewards/MMFormatORM/mean": 0.6174999833106994, + "rewards/MMFormatORM/std": 0.10940345227718354, + "rewards/MMRubricORM/mean": -0.05, + "rewards/MMRubricORM/std": 0.16831300854682923, + "step": 2670, + "train_speed(iter/s)": 0.082909 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 314.2, + "completions/mean_length": 205.0625, + "completions/min_length": 136.0, + "epoch": 1.28420547287566, + "frac_reward_zero_std": 0.675, + "grad_norm": 0.13570167124271393, + "kl": 0.0154052734375, + "learning_rate": 3.1127441928691575e-06, + "loss": 0.0006159848999232054, + "memory(GiB)": 27.09, + "reward": 0.4889999747276306, + "reward_std": 0.09192388076335192, + "rewards/MMContentORM/mean": 0.6299999952316284, + "rewards/MMContentORM/std": 0.5938115835189819, + "rewards/MMFormatORM/mean": 0.6174999833106994, + "rewards/MMFormatORM/std": 0.10940345227718354, + "rewards/MMRubricORM/mean": -0.05, + "rewards/MMRubricORM/std": 0.16831300854682923, + "step": 2675, + "train_speed(iter/s)": 0.082919 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 306.6, + "completions/mean_length": 207.45, + "completions/min_length": 135.2, + "epoch": 1.28660585693711, + "frac_reward_zero_std": 0.675, + "grad_norm": 0.11782620847225189, + "kl": 0.018231201171875, + "learning_rate": 3.0943789780420473e-06, + "loss": 0.0007292300462722779, + "memory(GiB)": 27.09, + "reward": 0.41379998326301576, + "reward_std": 0.06477098376490176, + "rewards/MMContentORM/mean": 0.4419999986886978, + "rewards/MMContentORM/std": 0.6502050161361694, + "rewards/MMFormatORM/mean": 0.6174999952316285, + "rewards/MMFormatORM/std": 0.12999999523162842, + "rewards/MMRubricORM/mean": -0.05, + "rewards/MMRubricORM/std": 0.2, + "step": 2680, + "train_speed(iter/s)": 0.08293 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 351.2, + "completions/mean_length": 218.4, + "completions/min_length": 146.4, + "epoch": 1.2890062409985599, + "frac_reward_zero_std": 0.675, + "grad_norm": 0.1357801854610443, + "kl": 0.01868896484375, + "learning_rate": 3.0760437924049723e-06, + "loss": 0.0007480094209313393, + "memory(GiB)": 27.09, + "reward": 0.4612999796867371, + "reward_std": 0.14127993900328875, + "rewards/MMContentORM/mean": 0.5895000040531159, + "rewards/MMContentORM/std": 0.5788779146969318, + "rewards/MMFormatORM/mean": 0.6012499809265137, + "rewards/MMFormatORM/std": 0.15380690693855287, + "rewards/MMRubricORM/mean": -0.075, + "rewards/MMRubricORM/std": 0.23662601709365844, + "step": 2685, + "train_speed(iter/s)": 0.082928 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 326.4, + "completions/mean_length": 213.9875, + "completions/min_length": 152.0, + "epoch": 1.2914066250600096, + "frac_reward_zero_std": 0.575, + "grad_norm": 0.18375363945960999, + "kl": 0.015374755859375, + "learning_rate": 3.0577389248877737e-06, + "loss": 0.0006146729923784733, + "memory(GiB)": 27.09, + "reward": 0.42199998497962954, + "reward_std": 0.130673336237669, + "rewards/MMContentORM/mean": 0.4625000059604645, + "rewards/MMContentORM/std": 0.6796015799045563, + "rewards/MMFormatORM/mean": 0.6174999952316285, + "rewards/MMFormatORM/std": 0.12999999523162842, + "rewards/MMRubricORM/mean": -0.05, + "rewards/MMRubricORM/std": 0.2, + "step": 2690, + "train_speed(iter/s)": 0.082933 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 265.8, + "completions/mean_length": 197.325, + "completions/min_length": 133.8, + "epoch": 1.2938070091214593, + "frac_reward_zero_std": 0.675, + "grad_norm": 0.15211942791938782, + "kl": 0.0177978515625, + "learning_rate": 3.0394646639425276e-06, + "loss": 0.0007108286954462528, + "memory(GiB)": 27.09, + "reward": 0.4626499831676483, + "reward_std": 0.11787469983100891, + "rewards/MMContentORM/mean": 0.5785000056028367, + "rewards/MMContentORM/std": 0.6342697024345398, + "rewards/MMFormatORM/mean": 0.6093749880790711, + "rewards/MMFormatORM/std": 0.14190345108509064, + "rewards/MMRubricORM/mean": -0.0625, + "rewards/MMRubricORM/std": 0.21831300854682922, + "step": 2695, + "train_speed(iter/s)": 0.082962 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 309.8, + "completions/mean_length": 215.3875, + "completions/min_length": 126.2, + "epoch": 1.2962073931829092, + "frac_reward_zero_std": 0.65, + "grad_norm": 0.1444334089756012, + "kl": 0.0158203125, + "learning_rate": 3.021221297539007e-06, + "loss": 0.000632589589804411, + "memory(GiB)": 27.09, + "reward": 0.49159998297691343, + "reward_std": 0.10861160224303604, + "rewards/MMContentORM/mean": 0.6365000009536743, + "rewards/MMContentORM/std": 0.5993415236473083, + "rewards/MMFormatORM/mean": 0.6174999833106994, + "rewards/MMFormatORM/std": 0.08490467071533203, + "rewards/MMRubricORM/mean": -0.05, + "rewards/MMRubricORM/std": 0.13062257766723634, + "step": 2700, + "train_speed(iter/s)": 0.08297 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 317.4, + "completions/mean_length": 217.225, + "completions/min_length": 136.2, + "epoch": 1.2986077772443592, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.1874951273202896, + "kl": 0.01707763671875, + "learning_rate": 3.0030091131601458e-06, + "loss": 0.0006825667340308427, + "memory(GiB)": 27.09, + "reward": 0.43964999318122866, + "reward_std": 0.22167797833681108, + "rewards/MMContentORM/mean": 0.5785000145435333, + "rewards/MMContentORM/std": 0.7030053317546845, + "rewards/MMFormatORM/mean": 0.576874989271164, + "rewards/MMFormatORM/std": 0.20004121959209442, + "rewards/MMRubricORM/mean": -0.1125, + "rewards/MMRubricORM/std": 0.30775573253631594, + "step": 2705, + "train_speed(iter/s)": 0.08295 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 310.4, + "completions/mean_length": 214.0625, + "completions/min_length": 147.0, + "epoch": 1.3010081613058089, + "frac_reward_zero_std": 0.675, + "grad_norm": 0.13891303539276123, + "kl": 0.015338134765625, + "learning_rate": 2.984828397797499e-06, + "loss": 0.0006134298164397478, + "memory(GiB)": 27.09, + "reward": 0.4906999826431274, + "reward_std": 0.15061374455690385, + "rewards/MMContentORM/mean": 0.6630000233650207, + "rewards/MMContentORM/std": 0.5227236907929182, + "rewards/MMFormatORM/mean": 0.6012499809265137, + "rewards/MMFormatORM/std": 0.12930812537670136, + "rewards/MMRubricORM/mean": -0.075, + "rewards/MMRubricORM/std": 0.19893558621406554, + "step": 2710, + "train_speed(iter/s)": 0.082958 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 340.2, + "completions/mean_length": 212.925, + "completions/min_length": 128.6, + "epoch": 1.3034085453672588, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.22613218426704407, + "kl": 0.018865966796875, + "learning_rate": 2.966679437946732e-06, + "loss": 0.0007538828998804092, + "memory(GiB)": 27.09, + "reward": 0.4224499940872192, + "reward_std": 0.13795652985572815, + "rewards/MMContentORM/mean": 0.47800001204013826, + "rewards/MMContentORM/std": 0.6876972198486329, + "rewards/MMFormatORM/mean": 0.6093749880790711, + "rewards/MMFormatORM/std": 0.14190345108509064, + "rewards/MMRubricORM/mean": -0.0625, + "rewards/MMRubricORM/std": 0.21831300854682922, + "step": 2715, + "train_speed(iter/s)": 0.08295 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 320.6, + "completions/mean_length": 211.9875, + "completions/min_length": 144.6, + "epoch": 1.3058089294287085, + "frac_reward_zero_std": 0.725, + "grad_norm": 0.1565045714378357, + "kl": 0.018109130859375, + "learning_rate": 2.948562519603093e-06, + "loss": 0.0007246591150760651, + "memory(GiB)": 27.09, + "reward": 0.41704997420310974, + "reward_std": 0.11052079051733017, + "rewards/MMContentORM/mean": 0.4644999921321869, + "rewards/MMContentORM/std": 0.6703303098678589, + "rewards/MMFormatORM/mean": 0.6093749761581421, + "rewards/MMFormatORM/std": 0.12130690813064575, + "rewards/MMRubricORM/mean": -0.0625, + "rewards/MMRubricORM/std": 0.18662601709365845, + "step": 2720, + "train_speed(iter/s)": 0.082952 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 340.0, + "completions/mean_length": 219.45, + "completions/min_length": 151.0, + "epoch": 1.3082093134901585, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.13537226617336273, + "kl": 0.015643310546875, + "learning_rate": 2.930477928256921e-06, + "loss": 0.0006261279806494713, + "memory(GiB)": 27.09, + "reward": 0.4939499914646149, + "reward_std": 0.11151073649525642, + "rewards/MMContentORM/mean": 0.628000009059906, + "rewards/MMContentORM/std": 0.6397946953773499, + "rewards/MMFormatORM/mean": 0.6256249904632568, + "rewards/MMFormatORM/std": 0.09749999642372131, + "rewards/MMRubricORM/mean": -0.0375, + "rewards/MMRubricORM/std": 0.15, + "step": 2725, + "train_speed(iter/s)": 0.082952 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 298.2, + "completions/mean_length": 209.675, + "completions/min_length": 131.4, + "epoch": 1.3106096975516084, + "frac_reward_zero_std": 0.6, + "grad_norm": 0.148203507065773, + "kl": 0.014532470703125, + "learning_rate": 2.912425948889134e-06, + "loss": 0.000581054575741291, + "memory(GiB)": 27.09, + "reward": 0.4759999871253967, + "reward_std": 0.11653119549155236, + "rewards/MMContentORM/mean": 0.5975000083446502, + "rewards/MMContentORM/std": 0.6301550388336181, + "rewards/MMFormatORM/mean": 0.6174999833106994, + "rewards/MMFormatORM/std": 0.10940345227718354, + "rewards/MMRubricORM/mean": -0.05, + "rewards/MMRubricORM/std": 0.16831300854682923, + "step": 2730, + "train_speed(iter/s)": 0.082965 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 314.6, + "completions/mean_length": 213.9, + "completions/min_length": 133.6, + "epoch": 1.313010081613058, + "frac_reward_zero_std": 0.575, + "grad_norm": 0.2361445128917694, + "kl": 0.024627685546875, + "learning_rate": 2.894406865966739e-06, + "loss": 0.0009845934808254242, + "memory(GiB)": 27.09, + "reward": 0.4808999836444855, + "reward_std": 0.1356230785138905, + "rewards/MMContentORM/mean": 0.6385000109672546, + "rewards/MMContentORM/std": 0.6562785744667053, + "rewards/MMFormatORM/mean": 0.6012499809265137, + "rewards/MMFormatORM/std": 0.15380690693855287, + "rewards/MMRubricORM/mean": -0.075, + "rewards/MMRubricORM/std": 0.23662601709365844, + "step": 2735, + "train_speed(iter/s)": 0.082973 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 311.6, + "completions/mean_length": 215.15, + "completions/min_length": 137.4, + "epoch": 1.3154104656745078, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.13266721367835999, + "kl": 0.013671875, + "learning_rate": 2.876420963438369e-06, + "loss": 0.000547263352200389, + "memory(GiB)": 27.09, + "reward": 0.4798999786376953, + "reward_std": 0.12119809612631798, + "rewards/MMContentORM/mean": 0.6360000133514404, + "rewards/MMContentORM/std": 0.6224928319454193, + "rewards/MMFormatORM/mean": 0.6012499690055847, + "rewards/MMFormatORM/std": 0.13321036398410796, + "rewards/MMRubricORM/mean": -0.075, + "rewards/MMRubricORM/std": 0.20493902564048766, + "step": 2740, + "train_speed(iter/s)": 0.08299 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 295.4, + "completions/mean_length": 208.075, + "completions/min_length": 129.4, + "epoch": 1.3178108497359577, + "frac_reward_zero_std": 0.725, + "grad_norm": 0.14808304607868195, + "kl": 0.015924072265625, + "learning_rate": 2.8584685247297735e-06, + "loss": 0.0006361417472362518, + "memory(GiB)": 27.09, + "reward": 0.455049991607666, + "reward_std": 0.09468159638345242, + "rewards/MMContentORM/mean": 0.559500002861023, + "rewards/MMContentORM/std": 0.5647126242518425, + "rewards/MMFormatORM/mean": 0.6093749761581421, + "rewards/MMFormatORM/std": 0.12130690813064575, + "rewards/MMRubricORM/mean": -0.0625, + "rewards/MMRubricORM/std": 0.18662601709365845, + "step": 2745, + "train_speed(iter/s)": 0.083004 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 370.0, + "completions/mean_length": 217.9125, + "completions/min_length": 148.2, + "epoch": 1.3202112337974077, + "frac_reward_zero_std": 0.65, + "grad_norm": 0.20360088348388672, + "kl": 0.015631103515625, + "learning_rate": 2.840549832739388e-06, + "loss": 0.0006251013837754726, + "memory(GiB)": 27.09, + "reward": 0.4665999889373779, + "reward_std": 0.091923877899535, + "rewards/MMContentORM/mean": 0.5739999890327454, + "rewards/MMContentORM/std": 0.6347232937812806, + "rewards/MMFormatORM/mean": 0.6174999833106994, + "rewards/MMFormatORM/std": 0.10940345227718354, + "rewards/MMRubricORM/mean": -0.05, + "rewards/MMRubricORM/std": 0.16831300854682923, + "step": 2750, + "train_speed(iter/s)": 0.082996 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 320.0, + "completions/mean_length": 214.2875, + "completions/min_length": 142.2, + "epoch": 1.3226116178588574, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.14281363785266876, + "kl": 0.01690673828125, + "learning_rate": 2.822665169833851e-06, + "loss": 0.000676287803798914, + "memory(GiB)": 27.09, + "reward": 0.4568999886512756, + "reward_std": 0.047376152616925536, + "rewards/MMContentORM/mean": 0.5210000097751617, + "rewards/MMContentORM/std": 0.6158773601055145, + "rewards/MMFormatORM/mean": 0.6337499737739563, + "rewards/MMFormatORM/std": 0.04440345466136932, + "rewards/MMRubricORM/mean": -0.025, + "rewards/MMRubricORM/std": 0.06831300854682923, + "step": 2755, + "train_speed(iter/s)": 0.083002 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0125, + "completions/max_length": 486.8, + "completions/mean_length": 232.45, + "completions/min_length": 141.2, + "epoch": 1.3250120019203073, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.2492820769548416, + "kl": 0.020391845703125, + "learning_rate": 2.8048148178435666e-06, + "loss": 0.0008158944547176361, + "memory(GiB)": 27.09, + "reward": 0.3429499715566635, + "reward_std": 0.19226232618093492, + "rewards/MMContentORM/mean": 0.36550000309944153, + "rewards/MMContentORM/std": 0.7640251278877258, + "rewards/MMFormatORM/mean": 0.5606249749660492, + "rewards/MMFormatORM/std": 0.22384813129901887, + "rewards/MMRubricORM/mean": -0.1375, + "rewards/MMRubricORM/std": 0.34438174962997437, + "step": 2760, + "train_speed(iter/s)": 0.082952 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 297.6, + "completions/mean_length": 214.4625, + "completions/min_length": 140.2, + "epoch": 1.327412385981757, + "frac_reward_zero_std": 0.725, + "grad_norm": 0.1096256673336029, + "kl": 0.01571044921875, + "learning_rate": 2.7869990580582584e-06, + "loss": 0.0006275205872952938, + "memory(GiB)": 27.09, + "reward": 0.5268999934196472, + "reward_std": 0.06434671822935342, + "rewards/MMContentORM/mean": 0.6960000038146973, + "rewards/MMContentORM/std": 0.5165394425392151, + "rewards/MMFormatORM/mean": 0.6337499856948853, + "rewards/MMFormatORM/std": 0.06499999761581421, + "rewards/MMRubricORM/mean": -0.025, + "rewards/MMRubricORM/std": 0.1, + "step": 2765, + "train_speed(iter/s)": 0.082961 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 296.6, + "completions/mean_length": 212.7, + "completions/min_length": 152.4, + "epoch": 1.329812770043207, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.16367603838443756, + "kl": 0.01417236328125, + "learning_rate": 2.769218171222538e-06, + "loss": 0.0005671579390764236, + "memory(GiB)": 27.09, + "reward": 0.4968499898910522, + "reward_std": 0.1402192786335945, + "rewards/MMContentORM/mean": 0.6639999985694885, + "rewards/MMContentORM/std": 0.5903096914291381, + "rewards/MMFormatORM/mean": 0.6093749880790711, + "rewards/MMFormatORM/std": 0.14190345108509064, + "rewards/MMRubricORM/mean": -0.0625, + "rewards/MMRubricORM/std": 0.21831300854682922, + "step": 2770, + "train_speed(iter/s)": 0.08298 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 303.6, + "completions/mean_length": 211.125, + "completions/min_length": 140.4, + "epoch": 1.3322131541046567, + "frac_reward_zero_std": 0.7, + "grad_norm": 0.200842946767807, + "kl": 0.03021240234375, + "learning_rate": 2.7514724375314866e-06, + "loss": 0.0012100426480174064, + "memory(GiB)": 27.09, + "reward": 0.5299499869346619, + "reward_std": 0.06965001099742948, + "rewards/MMContentORM/mean": 0.7180000185966492, + "rewards/MMContentORM/std": 0.4512696675956249, + "rewards/MMFormatORM/mean": 0.6256249785423279, + "rewards/MMFormatORM/std": 0.07690345346927643, + "rewards/MMRubricORM/mean": -0.0375, + "rewards/MMRubricORM/std": 0.11831300854682922, + "step": 2775, + "train_speed(iter/s)": 0.082989 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0125, + "completions/max_length": 428.0, + "completions/mean_length": 215.7125, + "completions/min_length": 139.8, + "epoch": 1.3346135381661066, + "frac_reward_zero_std": 0.65, + "grad_norm": 0.1629835069179535, + "kl": 0.01583251953125, + "learning_rate": 2.733762136626229e-06, + "loss": 0.0006339491344988346, + "memory(GiB)": 27.09, + "reward": 0.46864998936653135, + "reward_std": 0.1297540941275656, + "rewards/MMContentORM/mean": 0.5935000061988831, + "rewards/MMContentORM/std": 0.6146703898906708, + "rewards/MMFormatORM/mean": 0.6093749761581421, + "rewards/MMFormatORM/std": 0.12130690813064575, + "rewards/MMRubricORM/mean": -0.0625, + "rewards/MMRubricORM/std": 0.18662601709365845, + "step": 2780, + "train_speed(iter/s)": 0.082955 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 325.2, + "completions/mean_length": 217.0375, + "completions/min_length": 124.8, + "epoch": 1.3370139222275563, + "frac_reward_zero_std": 0.575, + "grad_norm": 0.20240968465805054, + "kl": 0.0284912109375, + "learning_rate": 2.7160875475895336e-06, + "loss": 0.0011421437375247478, + "memory(GiB)": 27.09, + "reward": 0.4283499836921692, + "reward_std": 0.19070670306682586, + "rewards/MMContentORM/mean": 0.5214999914169312, + "rewards/MMContentORM/std": 0.6971070170402527, + "rewards/MMFormatORM/mean": 0.5931249737739563, + "rewards/MMFormatORM/std": 0.16571036279201506, + "rewards/MMRubricORM/mean": -0.0875, + "rewards/MMRubricORM/std": 0.25493902564048765, + "step": 2785, + "train_speed(iter/s)": 0.082956 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 303.0, + "completions/mean_length": 211.175, + "completions/min_length": 94.6, + "epoch": 1.3394143062890063, + "frac_reward_zero_std": 0.65, + "grad_norm": 0.21979370713233948, + "kl": 0.018133544921875, + "learning_rate": 2.6984489489414123e-06, + "loss": 0.0007254761178046465, + "memory(GiB)": 27.09, + "reward": 0.46289997100830077, + "reward_std": 0.1593818672001362, + "rewards/MMContentORM/mean": 0.5935000061988831, + "rewards/MMContentORM/std": 0.6262759625911712, + "rewards/MMFormatORM/mean": 0.6012499928474426, + "rewards/MMFormatORM/std": 0.14990466833114624, + "rewards/MMRubricORM/mean": -0.075, + "rewards/MMRubricORM/std": 0.23062257766723632, + "step": 2790, + "train_speed(iter/s)": 0.082967 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 288.6, + "completions/mean_length": 213.75, + "completions/min_length": 154.8, + "epoch": 1.3418146903504562, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.07046330720186234, + "kl": 0.018560791015625, + "learning_rate": 2.680846618634741e-06, + "loss": 0.0007423401810228824, + "memory(GiB)": 27.09, + "reward": 0.4933999955654144, + "reward_std": 0.07155919813085347, + "rewards/MMContentORM/mean": 0.6410000085830688, + "rewards/MMContentORM/std": 0.557767578959465, + "rewards/MMFormatORM/mean": 0.6174999713897705, + "rewards/MMFormatORM/std": 0.08880690932273864, + "rewards/MMRubricORM/mean": -0.05, + "rewards/MMRubricORM/std": 0.13662601709365846, + "step": 2795, + "train_speed(iter/s)": 0.082976 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 333.8, + "completions/mean_length": 221.375, + "completions/min_length": 156.6, + "epoch": 1.344215074411906, + "frac_reward_zero_std": 0.65, + "grad_norm": 0.08354189991950989, + "kl": 0.01629638671875, + "learning_rate": 2.6632808340508577e-06, + "loss": 0.0006520752795040607, + "memory(GiB)": 27.09, + "reward": 0.4364499866962433, + "reward_std": 0.12155165586154908, + "rewards/MMContentORM/mean": 0.5130000054836273, + "rewards/MMContentORM/std": 0.6738922476768494, + "rewards/MMFormatORM/mean": 0.6093749761581421, + "rewards/MMFormatORM/std": 0.12130690813064575, + "rewards/MMRubricORM/mean": -0.0625, + "rewards/MMRubricORM/std": 0.18662601709365845, + "step": 2800, + "train_speed(iter/s)": 0.08298 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 311.8, + "completions/mean_length": 212.125, + "completions/min_length": 136.2, + "epoch": 1.3466154584733556, + "frac_reward_zero_std": 0.65, + "grad_norm": 0.07269278913736343, + "kl": 0.013916015625, + "learning_rate": 2.6457518719952126e-06, + "loss": 0.0005559300072491169, + "memory(GiB)": 27.09, + "reward": 0.4854999780654907, + "reward_std": 0.08329717591404914, + "rewards/MMContentORM/mean": 0.5925000071525574, + "rewards/MMContentORM/std": 0.5689398109912872, + "rewards/MMFormatORM/mean": 0.6337499856948853, + "rewards/MMFormatORM/std": 0.06499999761581421, + "rewards/MMRubricORM/mean": -0.025, + "rewards/MMRubricORM/std": 0.1, + "step": 2805, + "train_speed(iter/s)": 0.082961 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 299.8, + "completions/mean_length": 207.2375, + "completions/min_length": 144.4, + "epoch": 1.3490158425348056, + "frac_reward_zero_std": 0.675, + "grad_norm": 0.21120710670948029, + "kl": 0.016229248046875, + "learning_rate": 2.628260008693e-06, + "loss": 0.0006484090350568295, + "memory(GiB)": 27.09, + "reward": 0.4767999827861786, + "reward_std": 0.12105667740106582, + "rewards/MMContentORM/mean": 0.5995000123977661, + "rewards/MMContentORM/std": 0.6707588911056519, + "rewards/MMFormatORM/mean": 0.6174999952316285, + "rewards/MMFormatORM/std": 0.12999999523162842, + "rewards/MMRubricORM/mean": -0.05, + "rewards/MMRubricORM/std": 0.2, + "step": 2810, + "train_speed(iter/s)": 0.082981 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 328.2, + "completions/mean_length": 215.175, + "completions/min_length": 147.6, + "epoch": 1.3514162265962555, + "frac_reward_zero_std": 0.6, + "grad_norm": 0.17420639097690582, + "kl": 0.013153076171875, + "learning_rate": 2.6108055197848013e-06, + "loss": 0.0005263995379209518, + "memory(GiB)": 27.09, + "reward": 0.49479998350143434, + "reward_std": 0.1250164821743965, + "rewards/MMContentORM/mean": 0.6445000052452088, + "rewards/MMContentORM/std": 0.6272825956344604, + "rewards/MMFormatORM/mean": 0.6174999952316285, + "rewards/MMFormatORM/std": 0.12999999523162842, + "rewards/MMRubricORM/mean": -0.05, + "rewards/MMRubricORM/std": 0.2, + "step": 2815, + "train_speed(iter/s)": 0.082983 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 298.0, + "completions/mean_length": 211.6875, + "completions/min_length": 148.4, + "epoch": 1.3538166106577052, + "frac_reward_zero_std": 0.675, + "grad_norm": 0.13600178062915802, + "kl": 0.015380859375, + "learning_rate": 2.5933886803222453e-06, + "loss": 0.0006146400235593319, + "memory(GiB)": 27.09, + "reward": 0.5410999894142151, + "reward_std": 0.07990306429564953, + "rewards/MMContentORM/mean": 0.7315000176429749, + "rewards/MMContentORM/std": 0.4983065962791443, + "rewards/MMFormatORM/mean": 0.6337499856948853, + "rewards/MMFormatORM/std": 0.06499999761581421, + "rewards/MMRubricORM/mean": -0.025, + "rewards/MMRubricORM/std": 0.1, + "step": 2820, + "train_speed(iter/s)": 0.082998 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 323.6, + "completions/mean_length": 209.3125, + "completions/min_length": 116.4, + "epoch": 1.3562169947191551, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.11442702263593674, + "kl": 0.017779541015625, + "learning_rate": 2.5760097647636695e-06, + "loss": 0.0007101839408278465, + "memory(GiB)": 27.09, + "reward": 0.48674996495246886, + "reward_std": 0.1194303346797824, + "rewards/MMContentORM/mean": 0.6099999904632568, + "rewards/MMContentORM/std": 0.6200049042701721, + "rewards/MMFormatORM/mean": 0.6256249904632568, + "rewards/MMFormatORM/std": 0.09749999642372131, + "rewards/MMRubricORM/mean": -0.0375, + "rewards/MMRubricORM/std": 0.15, + "step": 2825, + "train_speed(iter/s)": 0.083002 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 283.4, + "completions/mean_length": 210.4875, + "completions/min_length": 157.0, + "epoch": 1.3586173787806048, + "frac_reward_zero_std": 0.575, + "grad_norm": 0.1676364243030548, + "kl": 0.02005615234375, + "learning_rate": 2.558669046969802e-06, + "loss": 0.0008024025708436966, + "memory(GiB)": 27.09, + "reward": 0.4595999836921692, + "reward_std": 0.1962928393855691, + "rewards/MMContentORM/mean": 0.614000004529953, + "rewards/MMContentORM/std": 0.6589715838432312, + "rewards/MMFormatORM/mean": 0.5849999785423279, + "rewards/MMFormatORM/std": 0.1737115800380707, + "rewards/MMRubricORM/mean": -0.1, + "rewards/MMRubricORM/std": 0.2672485947608948, + "step": 2830, + "train_speed(iter/s)": 0.083023 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 287.6, + "completions/mean_length": 207.925, + "completions/min_length": 124.2, + "epoch": 1.3610177628420548, + "frac_reward_zero_std": 0.65, + "grad_norm": 0.11557711660861969, + "kl": 0.016168212890625, + "learning_rate": 2.541366800199441e-06, + "loss": 0.0006470034830272197, + "memory(GiB)": 27.09, + "reward": 0.4768499732017517, + "reward_std": 0.1373908487148583, + "rewards/MMContentORM/mean": 0.6140000224113464, + "rewards/MMContentORM/std": 0.6126730859279632, + "rewards/MMFormatORM/mean": 0.6093749880790711, + "rewards/MMFormatORM/std": 0.14190345108509064, + "rewards/MMRubricORM/mean": -0.0625, + "rewards/MMRubricORM/std": 0.21831300854682922, + "step": 2835, + "train_speed(iter/s)": 0.083045 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0125, + "completions/max_length": 455.6, + "completions/mean_length": 221.3125, + "completions/min_length": 119.6, + "epoch": 1.3634181469035045, + "frac_reward_zero_std": 0.525, + "grad_norm": 0.17624257504940033, + "kl": 0.021038818359375, + "learning_rate": 2.524103297105147e-06, + "loss": 0.0008411226794123649, + "memory(GiB)": 27.09, + "reward": 0.4879499852657318, + "reward_std": 0.17599886879324914, + "rewards/MMContentORM/mean": 0.6705000042915344, + "rewards/MMContentORM/std": 0.5964474260807038, + "rewards/MMFormatORM/mean": 0.5931249916553497, + "rewards/MMFormatORM/std": 0.1556377649307251, + "rewards/MMRubricORM/mean": -0.0875, + "rewards/MMRubricORM/std": 0.23944272398948668, + "step": 2840, + "train_speed(iter/s)": 0.083004 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 330.4, + "completions/mean_length": 212.0625, + "completions/min_length": 137.4, + "epoch": 1.3658185309649544, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.11209463328123093, + "kl": 0.015325927734375, + "learning_rate": 2.5068788097289563e-06, + "loss": 0.0006131676957011223, + "memory(GiB)": 27.09, + "reward": 0.508549964427948, + "reward_std": 0.09256027387455106, + "rewards/MMContentORM/mean": 0.6645000219345093, + "rewards/MMContentORM/std": 0.49717583805322646, + "rewards/MMFormatORM/mean": 0.6256249785423279, + "rewards/MMFormatORM/std": 0.07690345346927643, + "rewards/MMRubricORM/mean": -0.0375, + "rewards/MMRubricORM/std": 0.11831300854682922, + "step": 2845, + "train_speed(iter/s)": 0.083015 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 326.0, + "completions/mean_length": 207.475, + "completions/min_length": 144.2, + "epoch": 1.3682189150264041, + "frac_reward_zero_std": 0.675, + "grad_norm": 0.14943933486938477, + "kl": 0.015374755859375, + "learning_rate": 2.4896936094980813e-06, + "loss": 0.0006145826540887356, + "memory(GiB)": 27.09, + "reward": 0.51869997382164, + "reward_std": 0.07311484031379223, + "rewards/MMContentORM/mean": 0.6755000114440918, + "rewards/MMContentORM/std": 0.5316406607627868, + "rewards/MMFormatORM/mean": 0.6337499856948853, + "rewards/MMFormatORM/std": 0.06499999761581421, + "rewards/MMRubricORM/mean": -0.025, + "rewards/MMRubricORM/std": 0.1, + "step": 2850, + "train_speed(iter/s)": 0.083019 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 301.6, + "completions/mean_length": 212.0875, + "completions/min_length": 142.2, + "epoch": 1.370619299087854, + "frac_reward_zero_std": 0.675, + "grad_norm": 0.13996316492557526, + "kl": 0.016644287109375, + "learning_rate": 2.47254796722064e-06, + "loss": 0.000665975920855999, + "memory(GiB)": 27.09, + "reward": 0.477649986743927, + "reward_std": 0.11419774293899536, + "rewards/MMContentORM/mean": 0.6160000085830688, + "rewards/MMContentORM/std": 0.6154688060283661, + "rewards/MMFormatORM/mean": 0.6093749880790711, + "rewards/MMFormatORM/std": 0.14190345108509064, + "rewards/MMRubricORM/mean": -0.0625, + "rewards/MMRubricORM/std": 0.21831300854682922, + "step": 2855, + "train_speed(iter/s)": 0.083038 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 289.8, + "completions/mean_length": 202.125, + "completions/min_length": 120.2, + "epoch": 1.373019683149304, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.09467165917158127, + "kl": 0.019873046875, + "learning_rate": 2.455442153081388e-06, + "loss": 0.0007954918779432774, + "memory(GiB)": 27.09, + "reward": 0.4916999876499176, + "reward_std": 0.07297342019155621, + "rewards/MMContentORM/mean": 0.6655000030994416, + "rewards/MMContentORM/std": 0.5675058551132679, + "rewards/MMFormatORM/mean": 0.6012499928474426, + "rewards/MMFormatORM/std": 0.14990466833114624, + "rewards/MMRubricORM/mean": -0.075, + "rewards/MMRubricORM/std": 0.23062257766723632, + "step": 2860, + "train_speed(iter/s)": 0.083051 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 283.4, + "completions/mean_length": 204.1, + "completions/min_length": 111.6, + "epoch": 1.3754200672107537, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.1891569048166275, + "kl": 0.01641845703125, + "learning_rate": 2.4383764366374608e-06, + "loss": 0.0006566672120243311, + "memory(GiB)": 27.09, + "reward": 0.5054999768733979, + "reward_std": 0.05897270615678281, + "rewards/MMContentORM/mean": 0.6424999952316284, + "rewards/MMContentORM/std": 0.5048037022352219, + "rewards/MMFormatORM/mean": 0.6337499856948853, + "rewards/MMFormatORM/std": 0.06499999761581421, + "rewards/MMRubricORM/mean": -0.025, + "rewards/MMRubricORM/std": 0.1, + "step": 2865, + "train_speed(iter/s)": 0.083069 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 324.0, + "completions/mean_length": 197.075, + "completions/min_length": 130.0, + "epoch": 1.3778204512722034, + "frac_reward_zero_std": 0.65, + "grad_norm": 0.1894003450870514, + "kl": 0.016998291015625, + "learning_rate": 2.4213510868141253e-06, + "loss": 0.0006809456273913384, + "memory(GiB)": 27.09, + "reward": 0.46944997906684877, + "reward_std": 0.14615897387266158, + "rewards/MMContentORM/mean": 0.5954999923706055, + "rewards/MMContentORM/std": 0.6364932656288147, + "rewards/MMFormatORM/mean": 0.6093749761581421, + "rewards/MMFormatORM/std": 0.12130690813064575, + "rewards/MMRubricORM/mean": -0.0625, + "rewards/MMRubricORM/std": 0.18662601709365845, + "step": 2870, + "train_speed(iter/s)": 0.083076 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0125, + "completions/max_length": 437.6, + "completions/mean_length": 216.4625, + "completions/min_length": 132.4, + "epoch": 1.3802208353336534, + "frac_reward_zero_std": 0.775, + "grad_norm": 0.16716767847537994, + "kl": 2.116851806640625, + "learning_rate": 2.404366371900541e-06, + "loss": 0.08460500240325927, + "memory(GiB)": 27.09, + "reward": 0.47039997577667236, + "reward_std": 0.08881261080969124, + "rewards/MMContentORM/mean": 0.5835000097751617, + "rewards/MMContentORM/std": 0.5921829402446747, + "rewards/MMFormatORM/mean": 0.6174999833106994, + "rewards/MMFormatORM/std": 0.10940345227718354, + "rewards/MMRubricORM/mean": -0.05, + "rewards/MMRubricORM/std": 0.16831300854682923, + "step": 2875, + "train_speed(iter/s)": 0.083038 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0125, + "completions/max_length": 434.2, + "completions/mean_length": 214.5625, + "completions/min_length": 143.2, + "epoch": 1.3826212193951033, + "frac_reward_zero_std": 0.6, + "grad_norm": 0.22403933107852936, + "kl": 0.01729736328125, + "learning_rate": 2.387422559545539e-06, + "loss": 0.0006910515949130058, + "memory(GiB)": 27.09, + "reward": 0.4853999733924866, + "reward_std": 0.1501894833520055, + "rewards/MMContentORM/mean": 0.6535000264644623, + "rewards/MMContentORM/std": 0.5772003047168255, + "rewards/MMFormatORM/mean": 0.5974999785423278, + "rewards/MMFormatORM/std": 0.13168290257453918, + "rewards/MMRubricORM/mean": -0.075, + "rewards/MMRubricORM/std": 0.19893558621406554, + "step": 2880, + "train_speed(iter/s)": 0.082999 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 366.0, + "completions/mean_length": 218.0125, + "completions/min_length": 145.6, + "epoch": 1.385021603456553, + "frac_reward_zero_std": 0.65, + "grad_norm": 0.1463712602853775, + "kl": 0.01533203125, + "learning_rate": 2.3705199167533933e-06, + "loss": 0.0006130510475486517, + "memory(GiB)": 27.09, + "reward": 0.5021499812602996, + "reward_std": 0.08464068132452666, + "rewards/MMContentORM/mean": 0.6485000193119049, + "rewards/MMContentORM/std": 0.5746123373508454, + "rewards/MMFormatORM/mean": 0.6256249785423279, + "rewards/MMFormatORM/std": 0.07690345346927643, + "rewards/MMRubricORM/mean": -0.0375, + "rewards/MMRubricORM/std": 0.11831300854682922, + "step": 2885, + "train_speed(iter/s)": 0.082987 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 299.2, + "completions/mean_length": 221.3625, + "completions/min_length": 130.8, + "epoch": 1.387421987518003, + "frac_reward_zero_std": 0.65, + "grad_norm": 0.14541618525981903, + "kl": 0.022015380859375, + "learning_rate": 2.35365870987962e-06, + "loss": 0.000881551206111908, + "memory(GiB)": 27.09, + "reward": 0.4464499831199646, + "reward_std": 0.12112738967407495, + "rewards/MMContentORM/mean": 0.5380000054836274, + "rewards/MMContentORM/std": 0.587418507039547, + "rewards/MMFormatORM/mean": 0.6093749880790711, + "rewards/MMFormatORM/std": 0.11740466952323914, + "rewards/MMRubricORM/mean": -0.0625, + "rewards/MMRubricORM/std": 0.18062257766723633, + "step": 2890, + "train_speed(iter/s)": 0.082996 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 301.8, + "completions/mean_length": 207.3625, + "completions/min_length": 114.0, + "epoch": 1.3898223715794527, + "frac_reward_zero_std": 0.675, + "grad_norm": 0.006435598712414503, + "kl": 0.112371826171875, + "learning_rate": 2.336839204626781e-06, + "loss": 0.00448373295366764, + "memory(GiB)": 27.09, + "reward": 0.48819997906684875, + "reward_std": 0.0975807286798954, + "rewards/MMContentORM/mean": 0.6280000001192093, + "rewards/MMContentORM/std": 0.5371371787041426, + "rewards/MMFormatORM/mean": 0.6174999833106994, + "rewards/MMFormatORM/std": 0.10940345227718354, + "rewards/MMRubricORM/mean": -0.05, + "rewards/MMRubricORM/std": 0.16831300854682923, + "step": 2895, + "train_speed(iter/s)": 0.083007 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 341.8, + "completions/mean_length": 216.225, + "completions/min_length": 128.0, + "epoch": 1.3922227556409026, + "frac_reward_zero_std": 0.6, + "grad_norm": 0.12930913269519806, + "kl": 0.031195068359375, + "learning_rate": 2.320061666040293e-06, + "loss": 0.0012484462931752205, + "memory(GiB)": 27.09, + "reward": 0.4482999861240387, + "reward_std": 0.13816866455599666, + "rewards/MMContentORM/mean": 0.5569999933242797, + "rewards/MMContentORM/std": 0.6566318869590759, + "rewards/MMFormatORM/mean": 0.6012499809265137, + "rewards/MMFormatORM/std": 0.15380690693855287, + "rewards/MMRubricORM/mean": -0.075, + "rewards/MMRubricORM/std": 0.23662601709365844, + "step": 2900, + "train_speed(iter/s)": 0.083008 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 381.6, + "completions/mean_length": 225.3125, + "completions/min_length": 159.4, + "epoch": 1.3946231397023523, + "frac_reward_zero_std": 0.725, + "grad_norm": 0.11358582973480225, + "kl": 0.016644287109375, + "learning_rate": 2.303326358504254e-06, + "loss": 0.0006654250435531139, + "memory(GiB)": 27.09, + "reward": 0.5058999717235565, + "reward_std": 0.08046875060535967, + "rewards/MMContentORM/mean": 0.643500006198883, + "rewards/MMContentORM/std": 0.5736395001411438, + "rewards/MMFormatORM/mean": 0.6337499856948853, + "rewards/MMFormatORM/std": 0.06499999761581421, + "rewards/MMRubricORM/mean": -0.025, + "rewards/MMRubricORM/std": 0.1, + "step": 2905, + "train_speed(iter/s)": 0.082965 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 290.8, + "completions/mean_length": 208.175, + "completions/min_length": 143.8, + "epoch": 1.3970235237638022, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.11047236621379852, + "kl": 0.017657470703125, + "learning_rate": 2.286633545737275e-06, + "loss": 0.0007063564844429493, + "memory(GiB)": 27.09, + "reward": 0.444299989938736, + "reward_std": 0.12515790089964868, + "rewards/MMContentORM/mean": 0.5469999969005584, + "rewards/MMContentORM/std": 0.6875294208526611, + "rewards/MMFormatORM/mean": 0.6012499809265137, + "rewards/MMFormatORM/std": 0.15380690693855287, + "rewards/MMRubricORM/mean": -0.075, + "rewards/MMRubricORM/std": 0.23662601709365844, + "step": 2910, + "train_speed(iter/s)": 0.08298 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 306.0, + "completions/mean_length": 213.2125, + "completions/min_length": 118.8, + "epoch": 1.399423907825252, + "frac_reward_zero_std": 0.6, + "grad_norm": 0.18031013011932373, + "kl": 0.017578125, + "learning_rate": 2.2699834907883284e-06, + "loss": 0.0007039817050099373, + "memory(GiB)": 27.09, + "reward": 0.4786999821662903, + "reward_std": 0.13420886383391917, + "rewards/MMContentORM/mean": 0.6330000162124634, + "rewards/MMContentORM/std": 0.6391359031200409, + "rewards/MMFormatORM/mean": 0.6012499809265137, + "rewards/MMFormatORM/std": 0.12930812537670136, + "rewards/MMRubricORM/mean": -0.075, + "rewards/MMRubricORM/std": 0.19893558621406554, + "step": 2915, + "train_speed(iter/s)": 0.082991 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 333.8, + "completions/mean_length": 220.05, + "completions/min_length": 153.6, + "epoch": 1.4018242918867019, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.18326731026172638, + "kl": 0.019744873046875, + "learning_rate": 2.2533764560325956e-06, + "loss": 0.0007898284122347832, + "memory(GiB)": 27.09, + "reward": 0.4050499856472015, + "reward_std": 0.2015961468219757, + "rewards/MMContentORM/mean": 0.49200000166893004, + "rewards/MMContentORM/std": 0.7459113121032714, + "rewards/MMFormatORM/mean": 0.5768749833106994, + "rewards/MMFormatORM/std": 0.2062115788459778, + "rewards/MMRubricORM/mean": -0.1125, + "rewards/MMRubricORM/std": 0.3172485947608948, + "step": 2920, + "train_speed(iter/s)": 0.082989 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 363.4, + "completions/mean_length": 210.2875, + "completions/min_length": 125.4, + "epoch": 1.4042246759481518, + "frac_reward_zero_std": 0.6, + "grad_norm": 0.12701493501663208, + "kl": 0.0131103515625, + "learning_rate": 2.236812703167337e-06, + "loss": 0.0005245218984782696, + "memory(GiB)": 27.09, + "reward": 0.4417999804019928, + "reward_std": 0.15414927080273627, + "rewards/MMContentORM/mean": 0.5445000112056733, + "rewards/MMContentORM/std": 0.6644897401332855, + "rewards/MMFormatORM/mean": 0.5974999785423278, + "rewards/MMFormatORM/std": 0.11980934292078019, + "rewards/MMRubricORM/mean": -0.075, + "rewards/MMRubricORM/std": 0.16124515533447265, + "step": 2925, + "train_speed(iter/s)": 0.082983 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 315.8, + "completions/mean_length": 211.0, + "completions/min_length": 129.4, + "epoch": 1.4066250600096015, + "frac_reward_zero_std": 0.575, + "grad_norm": 0.14272558689117432, + "kl": 0.0160888671875, + "learning_rate": 2.2202924932077703e-06, + "loss": 0.0006431899964809418, + "memory(GiB)": 27.09, + "reward": 0.44374998211860656, + "reward_std": 0.1390879049897194, + "rewards/MMContentORM/mean": 0.5599999845027923, + "rewards/MMContentORM/std": 0.6930801510810852, + "rewards/MMFormatORM/mean": 0.5931249976158142, + "rewards/MMFormatORM/std": 0.18240466713905334, + "rewards/MMRubricORM/mean": -0.0875, + "rewards/MMRubricORM/std": 0.2806225776672363, + "step": 2930, + "train_speed(iter/s)": 0.082994 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 315.0, + "completions/mean_length": 213.025, + "completions/min_length": 138.4, + "epoch": 1.4090254440710512, + "frac_reward_zero_std": 0.65, + "grad_norm": 0.1295127421617508, + "kl": 0.01728515625, + "learning_rate": 2.2038160864829516e-06, + "loss": 0.0006916459649801254, + "memory(GiB)": 27.09, + "reward": 0.4630999803543091, + "reward_std": 0.1404314052313566, + "rewards/MMContentORM/mean": 0.5940000176429748, + "rewards/MMContentORM/std": 0.6665144979953765, + "rewards/MMFormatORM/mean": 0.6012499809265137, + "rewards/MMFormatORM/std": 0.15380690693855287, + "rewards/MMRubricORM/mean": -0.075, + "rewards/MMRubricORM/std": 0.23662601709365844, + "step": 2935, + "train_speed(iter/s)": 0.083004 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 353.8, + "completions/mean_length": 216.4, + "completions/min_length": 145.4, + "epoch": 1.4114258281325012, + "frac_reward_zero_std": 0.675, + "grad_norm": 0.1919974982738495, + "kl": 0.013800048828125, + "learning_rate": 2.1873837426316775e-06, + "loss": 0.0005520004779100418, + "memory(GiB)": 27.09, + "reward": 0.4706499934196472, + "reward_std": 0.1478560283780098, + "rewards/MMContentORM/mean": 0.5984999895095825, + "rewards/MMContentORM/std": 0.6424413204193116, + "rewards/MMFormatORM/mean": 0.609375, + "rewards/MMFormatORM/std": 0.16249999403953552, + "rewards/MMRubricORM/mean": -0.0625, + "rewards/MMRubricORM/std": 0.25, + "step": 2940, + "train_speed(iter/s)": 0.082999 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 343.8, + "completions/mean_length": 210.3, + "completions/min_length": 140.8, + "epoch": 1.413826212193951, + "frac_reward_zero_std": 0.65, + "grad_norm": 0.1101124957203865, + "kl": 0.01229248046875, + "learning_rate": 2.1709957205983904e-06, + "loss": 0.0004918764345347882, + "memory(GiB)": 27.09, + "reward": 0.5490499854087829, + "reward_std": 0.0635689014568925, + "rewards/MMContentORM/mean": 0.7369999945163727, + "rewards/MMContentORM/std": 0.4961404323577881, + "rewards/MMFormatORM/mean": 0.6418749809265136, + "rewards/MMFormatORM/std": 0.032499998807907104, + "rewards/MMRubricORM/mean": -0.0125, + "rewards/MMRubricORM/std": 0.05, + "step": 2945, + "train_speed(iter/s)": 0.082996 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 307.8, + "completions/mean_length": 212.0375, + "completions/min_length": 128.2, + "epoch": 1.4162265962554008, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.15772999823093414, + "kl": 0.01820068359375, + "learning_rate": 2.1546522786291055e-06, + "loss": 0.000728009082376957, + "memory(GiB)": 27.09, + "reward": 0.5127999722957611, + "reward_std": 0.09956063730642199, + "rewards/MMContentORM/mean": 0.6895000219345093, + "rewards/MMContentORM/std": 0.5332017622888088, + "rewards/MMFormatORM/mean": 0.6174999833106994, + "rewards/MMFormatORM/std": 0.10940345227718354, + "rewards/MMRubricORM/mean": -0.05, + "rewards/MMRubricORM/std": 0.16831300854682923, + "step": 2950, + "train_speed(iter/s)": 0.082998 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 301.8, + "completions/mean_length": 197.4125, + "completions/min_length": 136.6, + "epoch": 1.4186269803168507, + "frac_reward_zero_std": 0.825, + "grad_norm": 0.005250279791653156, + "kl": 0.0145751953125, + "learning_rate": 2.138353674267332e-06, + "loss": 0.0005831093527376652, + "memory(GiB)": 27.09, + "reward": 0.4632999837398529, + "reward_std": 0.05529574602842331, + "rewards/MMContentORM/mean": 0.5370000183582306, + "rewards/MMContentORM/std": 0.5351093679666519, + "rewards/MMFormatORM/mean": 0.6337499856948853, + "rewards/MMFormatORM/std": 0.06499999761581421, + "rewards/MMRubricORM/mean": -0.025, + "rewards/MMRubricORM/std": 0.1, + "step": 2955, + "train_speed(iter/s)": 0.083011 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 292.4, + "completions/mean_length": 206.375, + "completions/min_length": 134.0, + "epoch": 1.4210273643783005, + "frac_reward_zero_std": 0.675, + "grad_norm": 0.15277594327926636, + "kl": 0.015472412109375, + "learning_rate": 2.1221001643500124e-06, + "loss": 0.0006176586262881756, + "memory(GiB)": 27.09, + "reward": 0.5094499945640564, + "reward_std": 0.05168950129300356, + "rewards/MMContentORM/mean": 0.6379999995231629, + "rewards/MMContentORM/std": 0.5326842725276947, + "rewards/MMFormatORM/mean": 0.6418749809265136, + "rewards/MMFormatORM/std": 0.032499998807907104, + "rewards/MMRubricORM/mean": -0.0125, + "rewards/MMRubricORM/std": 0.05, + "step": 2960, + "train_speed(iter/s)": 0.083029 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 347.0, + "completions/mean_length": 210.15, + "completions/min_length": 145.8, + "epoch": 1.4234277484397504, + "frac_reward_zero_std": 0.65, + "grad_norm": 0.11770177632570267, + "kl": 0.0151611328125, + "learning_rate": 2.1058920050034916e-06, + "loss": 0.0006076030433177948, + "memory(GiB)": 27.09, + "reward": 0.45579997897148133, + "reward_std": 0.13378460630774497, + "rewards/MMContentORM/mean": 0.546999990940094, + "rewards/MMContentORM/std": 0.6460906147956849, + "rewards/MMFormatORM/mean": 0.6174999833106994, + "rewards/MMFormatORM/std": 0.10940345227718354, + "rewards/MMRubricORM/mean": -0.05, + "rewards/MMRubricORM/std": 0.16831300854682923, + "step": 2965, + "train_speed(iter/s)": 0.08303 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 294.0, + "completions/mean_length": 208.5375, + "completions/min_length": 137.4, + "epoch": 1.4258281325012003, + "frac_reward_zero_std": 0.575, + "grad_norm": 0.18440692126750946, + "kl": 0.019158935546875, + "learning_rate": 2.089729451639464e-06, + "loss": 0.0007669050246477127, + "memory(GiB)": 27.09, + "reward": 0.4253999888896942, + "reward_std": 0.18809040188789367, + "rewards/MMContentORM/mean": 0.5285000085830689, + "rewards/MMContentORM/std": 0.6971115171909332, + "rewards/MMFormatORM/mean": 0.5849999845027923, + "rewards/MMFormatORM/std": 0.16754122078418732, + "rewards/MMRubricORM/mean": -0.1, + "rewards/MMRubricORM/std": 0.2577557325363159, + "step": 2970, + "train_speed(iter/s)": 0.083041 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 325.0, + "completions/mean_length": 207.1125, + "completions/min_length": 120.8, + "epoch": 1.42822851656265, + "frac_reward_zero_std": 0.725, + "grad_norm": 0.08875837922096252, + "kl": 0.020159912109375, + "learning_rate": 2.0736127589509574e-06, + "loss": 0.0008051252923905849, + "memory(GiB)": 27.09, + "reward": 0.4783999741077423, + "reward_std": 0.09956063062418252, + "rewards/MMContentORM/mean": 0.6034999847412109, + "rewards/MMContentORM/std": 0.5641872756183147, + "rewards/MMFormatORM/mean": 0.6174999952316285, + "rewards/MMFormatORM/std": 0.12999999523162842, + "rewards/MMRubricORM/mean": -0.05, + "rewards/MMRubricORM/std": 0.2, + "step": 2975, + "train_speed(iter/s)": 0.083048 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 307.2, + "completions/mean_length": 213.2375, + "completions/min_length": 142.4, + "epoch": 1.4306289006240998, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.005619046278297901, + "kl": 0.016546630859375, + "learning_rate": 2.057542180908314e-06, + "loss": 0.0006620488129556179, + "memory(GiB)": 27.09, + "reward": 0.4491499841213226, + "reward_std": 0.0645588494837284, + "rewards/MMContentORM/mean": 0.5159999966621399, + "rewards/MMContentORM/std": 0.6301424145698548, + "rewards/MMFormatORM/mean": 0.6256249785423279, + "rewards/MMFormatORM/std": 0.07690345346927643, + "rewards/MMRubricORM/mean": -0.0375, + "rewards/MMRubricORM/std": 0.11831300854682922, + "step": 2980, + "train_speed(iter/s)": 0.083063 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 343.8, + "completions/mean_length": 204.6625, + "completions/min_length": 116.0, + "epoch": 1.4330292846855497, + "frac_reward_zero_std": 0.675, + "grad_norm": 0.2883993685245514, + "kl": 0.045916748046875, + "learning_rate": 2.0415179707551972e-06, + "loss": 0.0018356535583734512, + "memory(GiB)": 27.09, + "reward": 0.48004997372627256, + "reward_std": 0.11250068647786975, + "rewards/MMContentORM/mean": 0.6220000118017197, + "rewards/MMContentORM/std": 0.5794163227081299, + "rewards/MMFormatORM/mean": 0.6093749761581421, + "rewards/MMFormatORM/std": 0.12130690813064575, + "rewards/MMRubricORM/mean": -0.0625, + "rewards/MMRubricORM/std": 0.18662601709365845, + "step": 2985, + "train_speed(iter/s)": 0.083059 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 328.8, + "completions/mean_length": 212.075, + "completions/min_length": 142.8, + "epoch": 1.4354296687469996, + "frac_reward_zero_std": 0.8, + "grad_norm": 0.005611935164779425, + "kl": 0.01571044921875, + "learning_rate": 2.025540381004592e-06, + "loss": 0.0006283918395638466, + "memory(GiB)": 27.09, + "reward": 0.5766499638557434, + "reward_std": 0.03358757034875452, + "rewards/MMContentORM/mean": 0.806000006198883, + "rewards/MMContentORM/std": 0.41324327513575554, + "rewards/MMFormatORM/mean": 0.6418749809265136, + "rewards/MMFormatORM/std": 0.032499998807907104, + "rewards/MMRubricORM/mean": -0.0125, + "rewards/MMRubricORM/std": 0.05, + "step": 2990, + "train_speed(iter/s)": 0.083062 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 311.2, + "completions/mean_length": 210.15, + "completions/min_length": 140.6, + "epoch": 1.4378300528084493, + "frac_reward_zero_std": 0.7, + "grad_norm": 0.16042460501194, + "kl": 0.019024658203125, + "learning_rate": 2.009609663434823e-06, + "loss": 0.0007612261921167374, + "memory(GiB)": 27.09, + "reward": 0.42854997515678406, + "reward_std": 0.059184834850020705, + "rewards/MMContentORM/mean": 0.46450000554323195, + "rewards/MMContentORM/std": 0.6028416275978088, + "rewards/MMFormatORM/mean": 0.6256249785423279, + "rewards/MMFormatORM/std": 0.07690345346927643, + "rewards/MMRubricORM/mean": -0.0375, + "rewards/MMRubricORM/std": 0.11831300854682922, + "step": 2995, + "train_speed(iter/s)": 0.083066 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 325.2, + "completions/mean_length": 213.075, + "completions/min_length": 123.8, + "epoch": 1.4402304368698993, + "frac_reward_zero_std": 0.55, + "grad_norm": 0.17296624183654785, + "kl": 0.014404296875, + "learning_rate": 1.9937260690856038e-06, + "loss": 0.0005766497924923897, + "memory(GiB)": 27.09, + "reward": 0.5021999835968017, + "reward_std": 0.1513208493590355, + "rewards/MMContentORM/mean": 0.6630000114440918, + "rewards/MMContentORM/std": 0.6147877216339112, + "rewards/MMFormatORM/mean": 0.6174999952316285, + "rewards/MMFormatORM/std": 0.12999999523162842, + "rewards/MMRubricORM/mean": -0.05, + "rewards/MMRubricORM/std": 0.2, + "step": 3000, + "train_speed(iter/s)": 0.083074 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 356.2, + "completions/mean_length": 213.15, + "completions/min_length": 137.2, + "epoch": 1.442630820931349, + "frac_reward_zero_std": 0.6, + "grad_norm": 0.16241255402565002, + "kl": 0.01876220703125, + "learning_rate": 1.977889848254063e-06, + "loss": 0.0007500813342630863, + "memory(GiB)": 27.09, + "reward": 0.4236499905586243, + "reward_std": 0.1413506418466568, + "rewards/MMContentORM/mean": 0.4810000091791153, + "rewards/MMContentORM/std": 0.6403470635414124, + "rewards/MMFormatORM/mean": 0.6093749761581421, + "rewards/MMFormatORM/std": 0.12130690813064575, + "rewards/MMRubricORM/mean": -0.0625, + "rewards/MMRubricORM/std": 0.18662601709365845, + "step": 3005, + "train_speed(iter/s)": 0.08304 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 298.4, + "completions/mean_length": 212.275, + "completions/min_length": 142.6, + "epoch": 1.445031204992799, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.06250451505184174, + "kl": 0.01522216796875, + "learning_rate": 1.962101250490809e-06, + "loss": 0.0006091888062655926, + "memory(GiB)": 27.09, + "reward": 0.4913999855518341, + "reward_std": 0.1162483523832634, + "rewards/MMContentORM/mean": 0.6360000133514404, + "rewards/MMContentORM/std": 0.5707429587841034, + "rewards/MMFormatORM/mean": 0.6174999833106994, + "rewards/MMFormatORM/std": 0.10940345227718354, + "rewards/MMRubricORM/mean": -0.05, + "rewards/MMRubricORM/std": 0.16831300854682923, + "step": 3010, + "train_speed(iter/s)": 0.08305 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 298.4, + "completions/mean_length": 214.075, + "completions/min_length": 128.6, + "epoch": 1.4474315890542486, + "frac_reward_zero_std": 0.7, + "grad_norm": 0.16592533886432648, + "kl": 0.014422607421875, + "learning_rate": 1.946360524595992e-06, + "loss": 0.0005768738687038421, + "memory(GiB)": 27.09, + "reward": 0.4861999869346619, + "reward_std": 0.09022682073991746, + "rewards/MMContentORM/mean": 0.6230000138282776, + "rewards/MMContentORM/std": 0.49670754447579385, + "rewards/MMFormatORM/mean": 0.6174999713897705, + "rewards/MMFormatORM/std": 0.08880690932273864, + "rewards/MMRubricORM/mean": -0.05, + "rewards/MMRubricORM/std": 0.13662601709365846, + "step": 3015, + "train_speed(iter/s)": 0.083063 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 314.0, + "completions/mean_length": 208.425, + "completions/min_length": 125.8, + "epoch": 1.4498319731156986, + "frac_reward_zero_std": 0.725, + "grad_norm": 0.11563282459974289, + "kl": 0.017535400390625, + "learning_rate": 1.930667918615396e-06, + "loss": 0.0007019482553005218, + "memory(GiB)": 27.09, + "reward": 0.5023999869823456, + "reward_std": 0.08259006794542074, + "rewards/MMContentORM/mean": 0.6634999990463257, + "rewards/MMContentORM/std": 0.5144065268337726, + "rewards/MMFormatORM/mean": 0.6174999833106994, + "rewards/MMFormatORM/std": 0.10940345227718354, + "rewards/MMRubricORM/mean": -0.05, + "rewards/MMRubricORM/std": 0.16831300854682923, + "step": 3020, + "train_speed(iter/s)": 0.083066 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 297.2, + "completions/mean_length": 217.95, + "completions/min_length": 129.4, + "epoch": 1.4522323571771483, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.1004854291677475, + "kl": 0.016015625, + "learning_rate": 1.915023679836513e-06, + "loss": 0.0006412723101675511, + "memory(GiB)": 27.09, + "reward": 0.44114998579025266, + "reward_std": 0.08266078755259514, + "rewards/MMContentORM/mean": 0.4959999889135361, + "rewards/MMContentORM/std": 0.6346112012863159, + "rewards/MMFormatORM/mean": 0.6256249785423279, + "rewards/MMFormatORM/std": 0.07690345346927643, + "rewards/MMRubricORM/mean": -0.0375, + "rewards/MMRubricORM/std": 0.11831300854682922, + "step": 3025, + "train_speed(iter/s)": 0.083076 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.2, + "completions/mean_length": 208.8, + "completions/min_length": 140.2, + "epoch": 1.4546327412385982, + "frac_reward_zero_std": 0.7, + "grad_norm": 0.07408447563648224, + "kl": 0.017010498046875, + "learning_rate": 1.8994280547846516e-06, + "loss": 0.0006803128868341446, + "memory(GiB)": 27.09, + "reward": 0.48314996957778933, + "reward_std": 0.07813529700506479, + "rewards/MMContentORM/mean": 0.6009999990463257, + "rewards/MMContentORM/std": 0.6228325486183166, + "rewards/MMFormatORM/mean": 0.6256249904632568, + "rewards/MMFormatORM/std": 0.09749999642372131, + "rewards/MMRubricORM/mean": -0.0375, + "rewards/MMRubricORM/std": 0.15, + "step": 3030, + "train_speed(iter/s)": 0.083097 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 303.0, + "completions/mean_length": 204.5, + "completions/min_length": 130.6, + "epoch": 1.4570331253000481, + "frac_reward_zero_std": 0.675, + "grad_norm": 0.12042010575532913, + "kl": 0.015240478515625, + "learning_rate": 1.8838812892190655e-06, + "loss": 0.000609145499765873, + "memory(GiB)": 27.09, + "reward": 0.5214499771595001, + "reward_std": 0.053386559383943676, + "rewards/MMContentORM/mean": 0.6680000066757202, + "rewards/MMContentORM/std": 0.47634573876857755, + "rewards/MMFormatORM/mean": 0.6418749809265136, + "rewards/MMFormatORM/std": 0.032499998807907104, + "rewards/MMRubricORM/mean": -0.0125, + "rewards/MMRubricORM/std": 0.05, + "step": 3035, + "train_speed(iter/s)": 0.083108 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 371.4, + "completions/mean_length": 220.7875, + "completions/min_length": 149.4, + "epoch": 1.4594335093614978, + "frac_reward_zero_std": 0.725, + "grad_norm": 0.14619703590869904, + "kl": 0.01563720703125, + "learning_rate": 1.8683836281290608e-06, + "loss": 0.0006260167807340622, + "memory(GiB)": 27.09, + "reward": 0.5266999781131745, + "reward_std": 0.08329718094319105, + "rewards/MMContentORM/mean": 0.728000009059906, + "rewards/MMContentORM/std": 0.5765750944614411, + "rewards/MMFormatORM/mean": 0.6137499928474426, + "rewards/MMFormatORM/std": 0.14499999433755875, + "rewards/MMRubricORM/mean": -0.05, + "rewards/MMRubricORM/std": 0.2, + "step": 3040, + "train_speed(iter/s)": 0.083102 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 329.8, + "completions/mean_length": 221.7625, + "completions/min_length": 155.8, + "epoch": 1.4618338934229476, + "frac_reward_zero_std": 0.7, + "grad_norm": 0.12591391801834106, + "kl": 0.013836669921875, + "learning_rate": 1.8529353157301477e-06, + "loss": 0.0005534658208489418, + "memory(GiB)": 27.09, + "reward": 0.4464499831199646, + "reward_std": 0.11136931926012039, + "rewards/MMContentORM/mean": 0.5380000054836274, + "rewards/MMContentORM/std": 0.6489728450775146, + "rewards/MMFormatORM/mean": 0.609375, + "rewards/MMFormatORM/std": 0.16249999403953552, + "rewards/MMRubricORM/mean": -0.0625, + "rewards/MMRubricORM/std": 0.25, + "step": 3045, + "train_speed(iter/s)": 0.083104 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 316.8, + "completions/mean_length": 219.4625, + "completions/min_length": 134.8, + "epoch": 1.4642342774843975, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.24321743845939636, + "kl": 0.030645751953125, + "learning_rate": 1.8375365954601882e-06, + "loss": 0.001224792841821909, + "memory(GiB)": 27.09, + "reward": 0.4084999799728394, + "reward_std": 0.11978388726711273, + "rewards/MMContentORM/mean": 0.45750000774860383, + "rewards/MMContentORM/std": 0.7140669703483582, + "rewards/MMFormatORM/mean": 0.6012499928474426, + "rewards/MMFormatORM/std": 0.17440344989299775, + "rewards/MMRubricORM/mean": -0.075, + "rewards/MMRubricORM/std": 0.2683130085468292, + "step": 3050, + "train_speed(iter/s)": 0.08311 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 311.0, + "completions/mean_length": 207.55, + "completions/min_length": 138.4, + "epoch": 1.4666346615458474, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.17153875529766083, + "kl": 0.018304443359375, + "learning_rate": 1.8221877099755635e-06, + "loss": 0.0007329397834837436, + "memory(GiB)": 27.09, + "reward": 0.48879997730255126, + "reward_std": 0.09842926461715251, + "rewards/MMContentORM/mean": 0.6295000195503235, + "rewards/MMContentORM/std": 0.6123105943202972, + "rewards/MMFormatORM/mean": 0.6174999833106994, + "rewards/MMFormatORM/std": 0.10940345227718354, + "rewards/MMRubricORM/mean": -0.05, + "rewards/MMRubricORM/std": 0.16831300854682923, + "step": 3055, + "train_speed(iter/s)": 0.083119 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 339.4, + "completions/mean_length": 212.4125, + "completions/min_length": 142.8, + "epoch": 1.4690350456072971, + "frac_reward_zero_std": 0.6, + "grad_norm": 0.18333663046360016, + "kl": 0.01534423828125, + "learning_rate": 1.8068889011473472e-06, + "loss": 0.0006133603863418102, + "memory(GiB)": 27.09, + "reward": 0.4755499720573425, + "reward_std": 0.10585388541221619, + "rewards/MMContentORM/mean": 0.5819999992847442, + "rewards/MMContentORM/std": 0.6341395020484925, + "rewards/MMFormatORM/mean": 0.6256249785423279, + "rewards/MMFormatORM/std": 0.07690345346927643, + "rewards/MMRubricORM/mean": -0.0375, + "rewards/MMRubricORM/std": 0.11831300854682922, + "step": 3060, + "train_speed(iter/s)": 0.083124 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 290.0, + "completions/mean_length": 203.5125, + "completions/min_length": 121.0, + "epoch": 1.471435429668747, + "frac_reward_zero_std": 0.675, + "grad_norm": 0.26711151003837585, + "kl": 0.0223388671875, + "learning_rate": 1.7916404100574858e-06, + "loss": 0.0008931753225624561, + "memory(GiB)": 27.09, + "reward": 0.4269499838352203, + "reward_std": 0.13951216414570808, + "rewards/MMContentORM/mean": 0.5180000185966491, + "rewards/MMContentORM/std": 0.7072037339210511, + "rewards/MMFormatORM/mean": 0.5931249737739563, + "rewards/MMFormatORM/std": 0.16571036279201506, + "rewards/MMRubricORM/mean": -0.0875, + "rewards/MMRubricORM/std": 0.25493902564048765, + "step": 3065, + "train_speed(iter/s)": 0.083143 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 306.8, + "completions/mean_length": 204.775, + "completions/min_length": 127.8, + "epoch": 1.4738358137301968, + "frac_reward_zero_std": 0.725, + "grad_norm": 0.11704003810882568, + "kl": 0.015966796875, + "learning_rate": 1.77644247699502e-06, + "loss": 0.0006390390917658806, + "memory(GiB)": 27.09, + "reward": 0.5196999788284302, + "reward_std": 0.038890871894545855, + "rewards/MMContentORM/mean": 0.678000009059906, + "rewards/MMContentORM/std": 0.5452348232269287, + "rewards/MMFormatORM/mean": 0.6337499737739563, + "rewards/MMFormatORM/std": 0.04440345466136932, + "rewards/MMRubricORM/mean": -0.025, + "rewards/MMRubricORM/std": 0.06831300854682923, + "step": 3070, + "train_speed(iter/s)": 0.083156 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 302.2, + "completions/mean_length": 212.1375, + "completions/min_length": 147.4, + "epoch": 1.4762361977916467, + "frac_reward_zero_std": 0.775, + "grad_norm": 0.11178287863731384, + "kl": 0.015142822265625, + "learning_rate": 1.7612953414522787e-06, + "loss": 0.0006058240309357643, + "memory(GiB)": 27.09, + "reward": 0.5580999732017518, + "reward_std": 0.07170062698423862, + "rewards/MMContentORM/mean": 0.7740000009536743, + "rewards/MMContentORM/std": 0.47003708481788636, + "rewards/MMFormatORM/mean": 0.6337499856948853, + "rewards/MMFormatORM/std": 0.06499999761581421, + "rewards/MMRubricORM/mean": -0.025, + "rewards/MMRubricORM/std": 0.1, + "step": 3075, + "train_speed(iter/s)": 0.08317 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 320.2, + "completions/mean_length": 215.875, + "completions/min_length": 129.0, + "epoch": 1.4786365818530964, + "frac_reward_zero_std": 0.65, + "grad_norm": 0.1516324132680893, + "kl": 0.014453125, + "learning_rate": 1.7461992421211144e-06, + "loss": 0.0005788389593362808, + "memory(GiB)": 27.09, + "reward": 0.4489999830722809, + "reward_std": 0.09079250320792198, + "rewards/MMContentORM/mean": 0.5300000131130218, + "rewards/MMContentORM/std": 0.6716732859611512, + "rewards/MMFormatORM/mean": 0.6174999952316285, + "rewards/MMFormatORM/std": 0.12999999523162842, + "rewards/MMRubricORM/mean": -0.05, + "rewards/MMRubricORM/std": 0.2, + "step": 3080, + "train_speed(iter/s)": 0.083181 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 341.0, + "completions/mean_length": 215.9125, + "completions/min_length": 125.6, + "epoch": 1.4810369659145464, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.1026253029704094, + "kl": 0.01588134765625, + "learning_rate": 1.7311544168891397e-06, + "loss": 0.0006352938711643219, + "memory(GiB)": 27.09, + "reward": 0.49314998388290404, + "reward_std": 0.09171175360679626, + "rewards/MMContentORM/mean": 0.6260000169277191, + "rewards/MMContentORM/std": 0.5888873279094696, + "rewards/MMFormatORM/mean": 0.6256249785423279, + "rewards/MMFormatORM/std": 0.07690345346927643, + "rewards/MMRubricORM/mean": -0.0375, + "rewards/MMRubricORM/std": 0.11831300854682922, + "step": 3085, + "train_speed(iter/s)": 0.083182 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 325.0, + "completions/mean_length": 222.55, + "completions/min_length": 147.4, + "epoch": 1.483437349975996, + "frac_reward_zero_std": 0.6, + "grad_norm": 0.19577515125274658, + "kl": 0.46920166015625, + "learning_rate": 1.7161611028359776e-06, + "loss": 0.01879151463508606, + "memory(GiB)": 27.09, + "reward": 0.4272999823093414, + "reward_std": 0.1520279485033825, + "rewards/MMContentORM/mean": 0.504500013589859, + "rewards/MMContentORM/std": 0.6994093418121338, + "rewards/MMFormatORM/mean": 0.6012499809265137, + "rewards/MMFormatORM/std": 0.12930812537670136, + "rewards/MMRubricORM/mean": -0.075, + "rewards/MMRubricORM/std": 0.19893558621406554, + "step": 3090, + "train_speed(iter/s)": 0.08317 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0125, + "completions/max_length": 476.0, + "completions/mean_length": 229.6, + "completions/min_length": 144.4, + "epoch": 1.485837734037446, + "frac_reward_zero_std": 0.65, + "grad_norm": 0.17094756662845612, + "kl": 0.022589111328125, + "learning_rate": 1.701219536229531e-06, + "loss": 0.0009042560122907162, + "memory(GiB)": 27.09, + "reward": 0.41979997158050536, + "reward_std": 0.1699884652160108, + "rewards/MMContentORM/mean": 0.5145000159740448, + "rewards/MMContentORM/std": 0.6915717840194702, + "rewards/MMFormatORM/mean": 0.5849999845027923, + "rewards/MMFormatORM/std": 0.1430424392223358, + "rewards/MMRubricORM/mean": -0.1, + "rewards/MMRubricORM/std": 0.22006530165672303, + "step": 3095, + "train_speed(iter/s)": 0.083129 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0125, + "completions/max_length": 444.0, + "completions/mean_length": 228.7625, + "completions/min_length": 142.6, + "epoch": 1.488238118098896, + "frac_reward_zero_std": 0.65, + "grad_norm": 0.06433889269828796, + "kl": 0.019317626953125, + "learning_rate": 1.686329952522251e-06, + "loss": 0.0007725300267338753, + "memory(GiB)": 27.09, + "reward": 0.4120999813079834, + "reward_std": 0.23122391402721404, + "rewards/MMContentORM/mean": 0.5240000247955322, + "rewards/MMContentORM/std": 0.7343219518661499, + "rewards/MMFormatORM/mean": 0.5687499880790711, + "rewards/MMFormatORM/std": 0.2142127960920334, + "rewards/MMRubricORM/mean": -0.125, + "rewards/MMRubricORM/std": 0.3295581638813019, + "step": 3100, + "train_speed(iter/s)": 0.083097 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 334.4, + "completions/mean_length": 215.2625, + "completions/min_length": 146.6, + "epoch": 1.4906385021603457, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.07017713785171509, + "kl": 0.014739990234375, + "learning_rate": 1.6714925863474317e-06, + "loss": 0.000588908651843667, + "memory(GiB)": 27.09, + "reward": 0.43324996829032897, + "reward_std": 0.1584626256953925, + "rewards/MMContentORM/mean": 0.5625000119209289, + "rewards/MMContentORM/std": 0.7162049651145935, + "rewards/MMFormatORM/mean": 0.5768749833106994, + "rewards/MMFormatORM/std": 0.2062115788459778, + "rewards/MMRubricORM/mean": -0.1125, + "rewards/MMRubricORM/std": 0.3172485947608948, + "step": 3105, + "train_speed(iter/s)": 0.083075 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 309.6, + "completions/mean_length": 209.9375, + "completions/min_length": 151.6, + "epoch": 1.4930388862217954, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.1349526196718216, + "kl": 0.015740966796875, + "learning_rate": 1.6567076715155118e-06, + "loss": 0.0006291633006185293, + "memory(GiB)": 27.09, + "reward": 0.4481499850749969, + "reward_std": 0.08690342083573341, + "rewards/MMContentORM/mean": 0.5135000020265579, + "rewards/MMContentORM/std": 0.6392456710338592, + "rewards/MMFormatORM/mean": 0.6256249904632568, + "rewards/MMFormatORM/std": 0.09749999642372131, + "rewards/MMRubricORM/mean": -0.0375, + "rewards/MMRubricORM/std": 0.15, + "step": 3110, + "train_speed(iter/s)": 0.083088 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 302.2, + "completions/mean_length": 206.9, + "completions/min_length": 143.6, + "epoch": 1.4954392702832453, + "frac_reward_zero_std": 0.675, + "grad_norm": 0.15179786086082458, + "kl": 0.021075439453125, + "learning_rate": 1.6419754410103949e-06, + "loss": 0.0008435728028416634, + "memory(GiB)": 27.09, + "reward": 0.42634997963905336, + "reward_std": 0.1286227189935744, + "rewards/MMContentORM/mean": 0.5165000140666962, + "rewards/MMContentORM/std": 0.6922470927238464, + "rewards/MMFormatORM/mean": 0.5931249856948853, + "rewards/MMFormatORM/std": 0.16180812418460847, + "rewards/MMRubricORM/mean": -0.0875, + "rewards/MMRubricORM/std": 0.24893558621406556, + "step": 3115, + "train_speed(iter/s)": 0.083093 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 334.2, + "completions/mean_length": 218.2875, + "completions/min_length": 119.0, + "epoch": 1.4978396543446952, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.21460042893886566, + "kl": 0.032769775390625, + "learning_rate": 1.6272961269857657e-06, + "loss": 0.0013143711723387242, + "memory(GiB)": 27.09, + "reward": 0.43154999017715456, + "reward_std": 0.14884596914052964, + "rewards/MMContentORM/mean": 0.5295000314712525, + "rewards/MMContentORM/std": 0.7493190169334412, + "rewards/MMFormatORM/mean": 0.5931249856948853, + "rewards/MMFormatORM/std": 0.18630690574645997, + "rewards/MMRubricORM/mean": -0.0875, + "rewards/MMRubricORM/std": 0.2866260170936584, + "step": 3120, + "train_speed(iter/s)": 0.083091 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 303.2, + "completions/mean_length": 208.25, + "completions/min_length": 130.8, + "epoch": 1.500240038406145, + "frac_reward_zero_std": 0.7, + "grad_norm": 0.12087953835725784, + "kl": 0.01927490234375, + "learning_rate": 1.6126699607614427e-06, + "loss": 0.0007707193493843078, + "memory(GiB)": 27.09, + "reward": 0.5104999840259552, + "reward_std": 0.06943788453936577, + "rewards/MMContentORM/mean": 0.6549999952316284, + "rewards/MMContentORM/std": 0.5868445634841919, + "rewards/MMFormatORM/mean": 0.6337499856948853, + "rewards/MMFormatORM/std": 0.06499999761581421, + "rewards/MMRubricORM/mean": -0.025, + "rewards/MMRubricORM/std": 0.1, + "step": 3125, + "train_speed(iter/s)": 0.083097 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 307.8, + "completions/mean_length": 209.8625, + "completions/min_length": 131.4, + "epoch": 1.5026404224675947, + "frac_reward_zero_std": 0.725, + "grad_norm": 0.0041847084648907185, + "kl": 0.01480712890625, + "learning_rate": 1.5980971728197342e-06, + "loss": 0.0005915745161473752, + "memory(GiB)": 27.09, + "reward": 0.5453999817371369, + "reward_std": 0.10776307303458452, + "rewards/MMContentORM/mean": 0.7710000157356263, + "rewards/MMContentORM/std": 0.4581117108464241, + "rewards/MMFormatORM/mean": 0.6174999833106994, + "rewards/MMFormatORM/std": 0.10940345227718354, + "rewards/MMRubricORM/mean": -0.05, + "rewards/MMRubricORM/std": 0.16831300854682923, + "step": 3130, + "train_speed(iter/s)": 0.083106 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 330.4, + "completions/mean_length": 219.2625, + "completions/min_length": 140.8, + "epoch": 1.5050408065290446, + "frac_reward_zero_std": 0.7, + "grad_norm": 0.10535841435194016, + "kl": 0.013946533203125, + "learning_rate": 1.583577992801797e-06, + "loss": 0.0005582999438047409, + "memory(GiB)": 27.09, + "reward": 0.5231499969959259, + "reward_std": 0.07700392559636385, + "rewards/MMContentORM/mean": 0.7010000109672546, + "rewards/MMContentORM/std": 0.5700598895549774, + "rewards/MMFormatORM/mean": 0.6256249785423279, + "rewards/MMFormatORM/std": 0.07690345346927643, + "rewards/MMRubricORM/mean": -0.0375, + "rewards/MMRubricORM/std": 0.11831300854682922, + "step": 3135, + "train_speed(iter/s)": 0.083109 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 300.6, + "completions/mean_length": 214.6, + "completions/min_length": 149.8, + "epoch": 1.5074411905904945, + "frac_reward_zero_std": 0.575, + "grad_norm": 0.0759124755859375, + "kl": 0.015350341796875, + "learning_rate": 1.5691126495040238e-06, + "loss": 0.000614680303260684, + "memory(GiB)": 27.09, + "reward": 0.47774999141693114, + "reward_std": 0.1819385740207508, + "rewards/MMContentORM/mean": 0.6450000107288361, + "rewards/MMContentORM/std": 0.619084757566452, + "rewards/MMFormatORM/mean": 0.5931249737739563, + "rewards/MMFormatORM/std": 0.14121158123016359, + "rewards/MMRubricORM/mean": -0.0875, + "rewards/MMRubricORM/std": 0.21724859476089478, + "step": 3140, + "train_speed(iter/s)": 0.083108 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 293.2, + "completions/mean_length": 209.8875, + "completions/min_length": 139.0, + "epoch": 1.5098415746519445, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.12398523837327957, + "kl": 0.015985107421875, + "learning_rate": 1.5547013708744347e-06, + "loss": 0.0006400375626981258, + "memory(GiB)": 27.09, + "reward": 0.39759998619556425, + "reward_std": 0.1776252317475155, + "rewards/MMContentORM/mean": 0.45900002419948577, + "rewards/MMContentORM/std": 0.7010067760944366, + "rewards/MMFormatORM/mean": 0.5849999725818634, + "rewards/MMFormatORM/std": 0.14694467782974244, + "rewards/MMRubricORM/mean": -0.1, + "rewards/MMRubricORM/std": 0.22606874108314515, + "step": 3145, + "train_speed(iter/s)": 0.083126 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 365.6, + "completions/mean_length": 219.8125, + "completions/min_length": 149.6, + "epoch": 1.5122419587133942, + "frac_reward_zero_std": 0.7, + "grad_norm": 0.13619284331798553, + "kl": 0.0173095703125, + "learning_rate": 1.5403443840090943e-06, + "loss": 0.0006929846480488777, + "memory(GiB)": 27.09, + "reward": 0.5145999729633332, + "reward_std": 0.0552957494975999, + "rewards/MMContentORM/mean": 0.6940000057220459, + "rewards/MMContentORM/std": 0.4436347268521786, + "rewards/MMFormatORM/mean": 0.6174999833106994, + "rewards/MMFormatORM/std": 0.08490467071533203, + "rewards/MMRubricORM/mean": -0.05, + "rewards/MMRubricORM/std": 0.13062257766723634, + "step": 3150, + "train_speed(iter/s)": 0.083117 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 311.4, + "completions/mean_length": 210.0, + "completions/min_length": 124.0, + "epoch": 1.5146423427748439, + "frac_reward_zero_std": 0.7, + "grad_norm": 0.1929241418838501, + "kl": 0.015277099609375, + "learning_rate": 1.5260419151485151e-06, + "loss": 0.0006110362242907286, + "memory(GiB)": 27.09, + "reward": 0.4635499775409698, + "reward_std": 0.12282444722950459, + "rewards/MMContentORM/mean": 0.584500002861023, + "rewards/MMContentORM/std": 0.5167646646499634, + "rewards/MMFormatORM/mean": 0.6056249856948852, + "rewards/MMFormatORM/std": 0.13240466862916947, + "rewards/MMRubricORM/mean": -0.0625, + "rewards/MMRubricORM/std": 0.18062257766723633, + "step": 3155, + "train_speed(iter/s)": 0.083126 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 382.4, + "completions/mean_length": 225.3375, + "completions/min_length": 132.8, + "epoch": 1.5170427268362938, + "frac_reward_zero_std": 0.55, + "grad_norm": 0.18700656294822693, + "kl": 0.017669677734375, + "learning_rate": 1.511794189674109e-06, + "loss": 0.0007067018188536168, + "memory(GiB)": 27.09, + "reward": 0.4364499807357788, + "reward_std": 0.20866721048951148, + "rewards/MMContentORM/mean": 0.5704999923706054, + "rewards/MMContentORM/std": 0.605513896048069, + "rewards/MMFormatORM/mean": 0.5768749833106994, + "rewards/MMFormatORM/std": 0.14713743329048157, + "rewards/MMRubricORM/mean": -0.1125, + "rewards/MMRubricORM/std": 0.22636529207229614, + "step": 3160, + "train_speed(iter/s)": 0.083113 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 329.8, + "completions/mean_length": 209.6875, + "completions/min_length": 121.6, + "epoch": 1.5194431108977438, + "frac_reward_zero_std": 0.675, + "grad_norm": 0.15477411448955536, + "kl": 0.014892578125, + "learning_rate": 1.4976014321046323e-06, + "loss": 0.000595424510538578, + "memory(GiB)": 27.09, + "reward": 0.49574996829032897, + "reward_std": 0.09482301846146583, + "rewards/MMContentORM/mean": 0.6325000166893006, + "rewards/MMContentORM/std": 0.5926263153553009, + "rewards/MMFormatORM/mean": 0.6256249904632568, + "rewards/MMFormatORM/std": 0.09749999642372131, + "rewards/MMRubricORM/mean": -0.0375, + "rewards/MMRubricORM/std": 0.15, + "step": 3165, + "train_speed(iter/s)": 0.083112 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 308.0, + "completions/mean_length": 209.625, + "completions/min_length": 135.4, + "epoch": 1.5218434949591935, + "frac_reward_zero_std": 0.7, + "grad_norm": 0.14993643760681152, + "kl": 0.013580322265625, + "learning_rate": 1.4834638660926403e-06, + "loss": 0.0005426953546702861, + "memory(GiB)": 27.09, + "reward": 0.44014999866485593, + "reward_std": 0.0772867701947689, + "rewards/MMContentORM/mean": 0.4934999912977219, + "rewards/MMContentORM/std": 0.6292442440986633, + "rewards/MMFormatORM/mean": 0.6256249904632568, + "rewards/MMFormatORM/std": 0.09749999642372131, + "rewards/MMRubricORM/mean": -0.0375, + "rewards/MMRubricORM/std": 0.15, + "step": 3170, + "train_speed(iter/s)": 0.083121 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 346.6, + "completions/mean_length": 217.0125, + "completions/min_length": 132.2, + "epoch": 1.5242438790206432, + "frac_reward_zero_std": 0.575, + "grad_norm": 0.181192085146904, + "kl": 0.0142333984375, + "learning_rate": 1.4693817144209699e-06, + "loss": 0.0005688410252332688, + "memory(GiB)": 27.09, + "reward": 0.48604997992515564, + "reward_std": 0.11419774182140827, + "rewards/MMContentORM/mean": 0.6370000183582306, + "rewards/MMContentORM/std": 0.6059823155403137, + "rewards/MMFormatORM/mean": 0.6093749761581421, + "rewards/MMFormatORM/std": 0.12130690813064575, + "rewards/MMRubricORM/mean": -0.0625, + "rewards/MMRubricORM/std": 0.18662601709365845, + "step": 3175, + "train_speed(iter/s)": 0.083117 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.025, + "completions/max_length": 462.0, + "completions/mean_length": 227.6375, + "completions/min_length": 130.4, + "epoch": 1.5266442630820931, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.12335384637117386, + "kl": 0.0125244140625, + "learning_rate": 1.4553551989992238e-06, + "loss": 0.0005013378337025643, + "memory(GiB)": 27.09, + "reward": 0.5401999652385712, + "reward_std": 0.10210621654987335, + "rewards/MMContentORM/mean": 0.7580000162124634, + "rewards/MMContentORM/std": 0.5495529055595398, + "rewards/MMFormatORM/mean": 0.6174999833106994, + "rewards/MMFormatORM/std": 0.10940345227718354, + "rewards/MMRubricORM/mean": -0.05, + "rewards/MMRubricORM/std": 0.16831300854682923, + "step": 3180, + "train_speed(iter/s)": 0.083082 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 302.8, + "completions/mean_length": 212.6125, + "completions/min_length": 139.2, + "epoch": 1.529044647143543, + "frac_reward_zero_std": 0.7, + "grad_norm": 0.14260004460811615, + "kl": 0.02462158203125, + "learning_rate": 1.4413845408602838e-06, + "loss": 0.0009833592921495438, + "memory(GiB)": 27.09, + "reward": 0.4681999832391739, + "reward_std": 0.07283199802041054, + "rewards/MMContentORM/mean": 0.5780000060796737, + "rewards/MMContentORM/std": 0.587296724319458, + "rewards/MMFormatORM/mean": 0.6174999713897705, + "rewards/MMFormatORM/std": 0.08880690932273864, + "rewards/MMRubricORM/mean": -0.05, + "rewards/MMRubricORM/std": 0.13662601709365846, + "step": 3185, + "train_speed(iter/s)": 0.083092 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 332.6, + "completions/mean_length": 221.975, + "completions/min_length": 147.8, + "epoch": 1.5314450312049928, + "frac_reward_zero_std": 0.7, + "grad_norm": 0.1370711326599121, + "kl": 0.015081787109375, + "learning_rate": 1.427469960156812e-06, + "loss": 0.0006033728364855051, + "memory(GiB)": 27.09, + "reward": 0.4931999921798706, + "reward_std": 0.07085209367796778, + "rewards/MMContentORM/mean": 0.6405000150203705, + "rewards/MMContentORM/std": 0.6245416283607483, + "rewards/MMFormatORM/mean": 0.6174999833106994, + "rewards/MMFormatORM/std": 0.08490467071533203, + "rewards/MMRubricORM/mean": -0.05, + "rewards/MMRubricORM/std": 0.13062257766723634, + "step": 3190, + "train_speed(iter/s)": 0.083091 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 307.8, + "completions/mean_length": 206.65, + "completions/min_length": 127.6, + "epoch": 1.5338454152664425, + "frac_reward_zero_std": 0.525, + "grad_norm": 0.17985497415065765, + "kl": 0.018963623046875, + "learning_rate": 1.4136116761577935e-06, + "loss": 0.0007579845376312732, + "memory(GiB)": 27.09, + "reward": 0.5177499890327454, + "reward_std": 0.09425732623785735, + "rewards/MMContentORM/mean": 0.6875000119209289, + "rewards/MMContentORM/std": 0.5862172305583954, + "rewards/MMFormatORM/mean": 0.6256249904632568, + "rewards/MMFormatORM/std": 0.09749999642372131, + "rewards/MMRubricORM/mean": -0.0375, + "rewards/MMRubricORM/std": 0.15, + "step": 3195, + "train_speed(iter/s)": 0.0831 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 380.2, + "completions/mean_length": 223.8, + "completions/min_length": 136.0, + "epoch": 1.5362457993278924, + "frac_reward_zero_std": 0.725, + "grad_norm": 0.11441948264837265, + "kl": 0.01436767578125, + "learning_rate": 1.3998099072450811e-06, + "loss": 0.0005743363872170448, + "memory(GiB)": 27.09, + "reward": 0.4908499836921692, + "reward_std": 0.11193500682711602, + "rewards/MMContentORM/mean": 0.6490000128746033, + "rewards/MMContentORM/std": 0.5852620244026184, + "rewards/MMFormatORM/mean": 0.6093749880790711, + "rewards/MMFormatORM/std": 0.11740466952323914, + "rewards/MMRubricORM/mean": -0.0625, + "rewards/MMRubricORM/std": 0.18062257766723633, + "step": 3200, + "train_speed(iter/s)": 0.083089 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0125, + "completions/max_length": 501.4, + "completions/mean_length": 236.55, + "completions/min_length": 149.6, + "epoch": 1.5386461833893423, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.11317011713981628, + "kl": 0.0179931640625, + "learning_rate": 1.386064870909946e-06, + "loss": 0.0007195640355348587, + "memory(GiB)": 27.09, + "reward": 0.4382499873638153, + "reward_std": 0.12918841242790222, + "rewards/MMContentORM/mean": 0.5174999952316284, + "rewards/MMContentORM/std": 0.6612499058246613, + "rewards/MMFormatORM/mean": 0.6093749880790711, + "rewards/MMFormatORM/std": 0.14190345108509064, + "rewards/MMRubricORM/mean": -0.0625, + "rewards/MMRubricORM/std": 0.21831300854682922, + "step": 3205, + "train_speed(iter/s)": 0.083004 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 348.2, + "completions/mean_length": 218.575, + "completions/min_length": 139.2, + "epoch": 1.5410465674507923, + "frac_reward_zero_std": 0.55, + "grad_norm": 0.14002180099487305, + "kl": 0.014068603515625, + "learning_rate": 1.3723767837496571e-06, + "loss": 0.0005637550726532936, + "memory(GiB)": 27.09, + "reward": 0.43539997935295105, + "reward_std": 0.12756206155754626, + "rewards/MMContentORM/mean": 0.49599999785423277, + "rewards/MMContentORM/std": 0.6566171884536743, + "rewards/MMFormatORM/mean": 0.6174999952316285, + "rewards/MMFormatORM/std": 0.12999999523162842, + "rewards/MMRubricORM/mean": -0.05, + "rewards/MMRubricORM/std": 0.2, + "step": 3210, + "train_speed(iter/s)": 0.083004 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 296.8, + "completions/mean_length": 207.0125, + "completions/min_length": 145.0, + "epoch": 1.543446951512242, + "frac_reward_zero_std": 0.725, + "grad_norm": 0.17006883025169373, + "kl": 0.016265869140625, + "learning_rate": 1.3587458614640648e-06, + "loss": 0.0006507603451609611, + "memory(GiB)": 27.09, + "reward": 0.5275500059127808, + "reward_std": 0.07700392962433397, + "rewards/MMContentORM/mean": 0.7120000004768372, + "rewards/MMContentORM/std": 0.5522327601909638, + "rewards/MMFormatORM/mean": 0.6256249785423279, + "rewards/MMFormatORM/std": 0.07690345346927643, + "rewards/MMRubricORM/mean": -0.0375, + "rewards/MMRubricORM/std": 0.11831300854682922, + "step": 3215, + "train_speed(iter/s)": 0.08302 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0125, + "completions/max_length": 454.8, + "completions/mean_length": 222.8125, + "completions/min_length": 118.6, + "epoch": 1.5458473355736917, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.19454053044319153, + "kl": 0.02646484375, + "learning_rate": 1.3451723188522043e-06, + "loss": 0.0010566259734332561, + "memory(GiB)": 27.09, + "reward": 0.3924499750137329, + "reward_std": 0.19169664829969407, + "rewards/MMContentORM/mean": 0.46050000190734863, + "rewards/MMContentORM/std": 0.7452011108398438, + "rewards/MMFormatORM/mean": 0.5768749833106994, + "rewards/MMFormatORM/std": 0.2062115788459778, + "rewards/MMRubricORM/mean": -0.1125, + "rewards/MMRubricORM/std": 0.3172485947608948, + "step": 3220, + "train_speed(iter/s)": 0.082975 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0125, + "completions/max_length": 472.6, + "completions/mean_length": 222.15, + "completions/min_length": 142.2, + "epoch": 1.5482477196351416, + "frac_reward_zero_std": 0.475, + "grad_norm": 0.19158095121383667, + "kl": 0.0150146484375, + "learning_rate": 1.331656369808908e-06, + "loss": 0.0006003158167004585, + "memory(GiB)": 27.09, + "reward": 0.4775499701499939, + "reward_std": 0.1092479906976223, + "rewards/MMContentORM/mean": 0.6195000171661377, + "rewards/MMContentORM/std": 0.5317566640675068, + "rewards/MMFormatORM/mean": 0.6056249737739563, + "rewards/MMFormatORM/std": 0.11180812567472458, + "rewards/MMRubricORM/mean": -0.0625, + "rewards/MMRubricORM/std": 0.14893558621406555, + "step": 3225, + "train_speed(iter/s)": 0.082941 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 323.6, + "completions/mean_length": 211.1875, + "completions/min_length": 118.8, + "epoch": 1.5506481036965916, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.1315494179725647, + "kl": 0.01778564453125, + "learning_rate": 1.318198227321436e-06, + "loss": 0.000711181340739131, + "memory(GiB)": 27.09, + "reward": 0.5287999749183655, + "reward_std": 0.011313705006614327, + "rewards/MMContentORM/mean": 0.6720000028610229, + "rewards/MMContentORM/std": 0.5012042224407196, + "rewards/MMFormatORM/mean": 0.6499999761581421, + "rewards/MMFormatORM/std": 0.0, + "rewards/MMRubricORM/mean": 0.0, + "rewards/MMRubricORM/std": 0.0, + "step": 3230, + "train_speed(iter/s)": 0.082949 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 334.6, + "completions/mean_length": 216.4625, + "completions/min_length": 109.6, + "epoch": 1.5530484877580413, + "frac_reward_zero_std": 0.575, + "grad_norm": 0.15574294328689575, + "kl": 0.020379638671875, + "learning_rate": 1.3047981034661245e-06, + "loss": 0.0008146503940224647, + "memory(GiB)": 27.09, + "reward": 0.46579996943473817, + "reward_std": 0.16772572547197342, + "rewards/MMContentORM/mean": 0.62950000166893, + "rewards/MMContentORM/std": 0.683431351184845, + "rewards/MMFormatORM/mean": 0.5849999785423279, + "rewards/MMFormatORM/std": 0.19821036159992217, + "rewards/MMRubricORM/mean": -0.1, + "rewards/MMRubricORM/std": 0.3049390256404877, + "step": 3235, + "train_speed(iter/s)": 0.082953 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 313.2, + "completions/mean_length": 217.525, + "completions/min_length": 132.6, + "epoch": 1.555448871819491, + "frac_reward_zero_std": 0.65, + "grad_norm": 0.20693257451057434, + "kl": 0.02076416015625, + "learning_rate": 1.2914562094050343e-06, + "loss": 0.0008313735015690327, + "memory(GiB)": 27.09, + "reward": 0.47934995889663695, + "reward_std": 0.09482302069664002, + "rewards/MMContentORM/mean": 0.5915000021457673, + "rewards/MMContentORM/std": 0.5874766707420349, + "rewards/MMFormatORM/mean": 0.6256249785423279, + "rewards/MMFormatORM/std": 0.07690345346927643, + "rewards/MMRubricORM/mean": -0.0375, + "rewards/MMRubricORM/std": 0.11831300854682922, + "step": 3240, + "train_speed(iter/s)": 0.082959 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 312.8, + "completions/mean_length": 214.3125, + "completions/min_length": 146.0, + "epoch": 1.557849255880941, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.17259366810321808, + "kl": 0.01396484375, + "learning_rate": 1.2781727553826307e-06, + "loss": 0.0005578281357884407, + "memory(GiB)": 27.09, + "reward": 0.5059499919414521, + "reward_std": 0.06173042135778815, + "rewards/MMContentORM/mean": 0.658000010251999, + "rewards/MMContentORM/std": 0.44256684333086016, + "rewards/MMFormatORM/mean": 0.6256249785423279, + "rewards/MMFormatORM/std": 0.05240467190742493, + "rewards/MMRubricORM/mean": -0.0375, + "rewards/MMRubricORM/std": 0.08062257766723632, + "step": 3245, + "train_speed(iter/s)": 0.082967 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 300.2, + "completions/mean_length": 201.55, + "completions/min_length": 137.4, + "epoch": 1.5602496399423909, + "frac_reward_zero_std": 0.7, + "grad_norm": 0.066941037774086, + "kl": 0.016973876953125, + "learning_rate": 1.264947950722467e-06, + "loss": 0.0006793485023081303, + "memory(GiB)": 27.09, + "reward": 0.535649973154068, + "reward_std": 0.12777419239282609, + "rewards/MMContentORM/mean": 0.7610000252723694, + "rewards/MMContentORM/std": 0.5375386297702789, + "rewards/MMFormatORM/mean": 0.6093749880790711, + "rewards/MMFormatORM/std": 0.11740466952323914, + "rewards/MMRubricORM/mean": -0.0625, + "rewards/MMRubricORM/std": 0.18062257766723633, + "step": 3250, + "train_speed(iter/s)": 0.082973 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 328.0, + "completions/mean_length": 228.2, + "completions/min_length": 154.6, + "epoch": 1.5626500240038406, + "frac_reward_zero_std": 0.7, + "grad_norm": 0.09117994457483292, + "kl": 0.012646484375, + "learning_rate": 1.2517820038238893e-06, + "loss": 0.0005060765892267227, + "memory(GiB)": 27.09, + "reward": 0.4888499915599823, + "reward_std": 0.09454017840325832, + "rewards/MMContentORM/mean": 0.6439999967813492, + "rewards/MMContentORM/std": 0.5739769160747528, + "rewards/MMFormatORM/mean": 0.6093749880790711, + "rewards/MMFormatORM/std": 0.11740466952323914, + "rewards/MMRubricORM/mean": -0.0625, + "rewards/MMRubricORM/std": 0.18062257766723633, + "step": 3255, + "train_speed(iter/s)": 0.082976 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.6, + "completions/mean_length": 197.9125, + "completions/min_length": 121.4, + "epoch": 1.5650504080652905, + "frac_reward_zero_std": 0.55, + "grad_norm": 0.2288537472486496, + "kl": 0.017327880859375, + "learning_rate": 1.2386751221587478e-06, + "loss": 0.0006930924020707608, + "memory(GiB)": 27.09, + "reward": 0.3989499807357788, + "reward_std": 0.1745846627280116, + "rewards/MMContentORM/mean": 0.4479999840259552, + "rewards/MMContentORM/std": 0.7100707769393921, + "rewards/MMFormatORM/mean": 0.5931249856948853, + "rewards/MMFormatORM/std": 0.16180812418460847, + "rewards/MMRubricORM/mean": -0.0875, + "rewards/MMRubricORM/std": 0.24893558621406556, + "step": 3260, + "train_speed(iter/s)": 0.082991 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0125, + "completions/max_length": 465.4, + "completions/mean_length": 225.6875, + "completions/min_length": 145.0, + "epoch": 1.5674507921267402, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.004609475843608379, + "kl": 0.01778564453125, + "learning_rate": 1.2256275122681304e-06, + "loss": 0.0007108909543603658, + "memory(GiB)": 27.09, + "reward": 0.4348500072956085, + "reward_std": 0.10231834650039673, + "rewards/MMContentORM/mean": 0.5090000003576278, + "rewards/MMContentORM/std": 0.6001178443431854, + "rewards/MMFormatORM/mean": 0.6093749880790711, + "rewards/MMFormatORM/std": 0.11740466952323914, + "rewards/MMRubricORM/mean": -0.0625, + "rewards/MMRubricORM/std": 0.18062257766723633, + "step": 3265, + "train_speed(iter/s)": 0.082956 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 302.8, + "completions/mean_length": 213.2625, + "completions/min_length": 151.2, + "epoch": 1.5698511761881901, + "frac_reward_zero_std": 0.575, + "grad_norm": 0.18112541735172272, + "kl": 0.01341552734375, + "learning_rate": 1.2126393797591112e-06, + "loss": 0.0005367286503314972, + "memory(GiB)": 27.09, + "reward": 0.4611499786376953, + "reward_std": 0.07771103186532854, + "rewards/MMContentORM/mean": 0.5460000038146973, + "rewards/MMContentORM/std": 0.6368870377540589, + "rewards/MMFormatORM/mean": 0.6256249785423279, + "rewards/MMFormatORM/std": 0.07690345346927643, + "rewards/MMRubricORM/mean": -0.0375, + "rewards/MMRubricORM/std": 0.11831300854682922, + "step": 3270, + "train_speed(iter/s)": 0.082961 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 294.6, + "completions/mean_length": 203.625, + "completions/min_length": 121.0, + "epoch": 1.57225156024964, + "frac_reward_zero_std": 0.675, + "grad_norm": 0.1886938065290451, + "kl": 0.016790771484375, + "learning_rate": 1.1997109293015015e-06, + "loss": 0.0006717256270349026, + "memory(GiB)": 27.09, + "reward": 0.4238499701023102, + "reward_std": 0.12975409450009465, + "rewards/MMContentORM/mean": 0.48149998784065245, + "rewards/MMContentORM/std": 0.6749303579330445, + "rewards/MMFormatORM/mean": 0.6093749880790711, + "rewards/MMFormatORM/std": 0.11740466952323914, + "rewards/MMRubricORM/mean": -0.0625, + "rewards/MMRubricORM/std": 0.18062257766723633, + "step": 3275, + "train_speed(iter/s)": 0.082972 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 325.4, + "completions/mean_length": 217.0375, + "completions/min_length": 127.0, + "epoch": 1.5746519443110898, + "frac_reward_zero_std": 0.7, + "grad_norm": 0.1641152799129486, + "kl": 0.017828369140625, + "learning_rate": 1.1868423646246323e-06, + "loss": 0.0007128944620490074, + "memory(GiB)": 27.09, + "reward": 0.5149999856948853, + "reward_std": 0.09135819021612405, + "rewards/MMContentORM/mean": 0.695000022649765, + "rewards/MMContentORM/std": 0.49438799545168877, + "rewards/MMFormatORM/mean": 0.6174999833106994, + "rewards/MMFormatORM/std": 0.10940345227718354, + "rewards/MMRubricORM/mean": -0.05, + "rewards/MMRubricORM/std": 0.16831300854682923, + "step": 3280, + "train_speed(iter/s)": 0.082969 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 289.0, + "completions/mean_length": 208.25, + "completions/min_length": 136.2, + "epoch": 1.5770523283725395, + "frac_reward_zero_std": 0.725, + "grad_norm": 0.13112139701843262, + "kl": 0.016351318359375, + "learning_rate": 1.1740338885141422e-06, + "loss": 0.0006545517593622207, + "memory(GiB)": 27.09, + "reward": 0.4730999946594238, + "reward_std": 0.11455129862297327, + "rewards/MMContentORM/mean": 0.6190000176429749, + "rewards/MMContentORM/std": 0.6247550487518311, + "rewards/MMFormatORM/mean": 0.6012499868869782, + "rewards/MMFormatORM/std": 0.12313776612281799, + "rewards/MMRubricORM/mean": -0.075, + "rewards/MMRubricORM/std": 0.1894427239894867, + "step": 3285, + "train_speed(iter/s)": 0.082985 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 351.2, + "completions/mean_length": 207.8, + "completions/min_length": 140.6, + "epoch": 1.5794527124339894, + "frac_reward_zero_std": 0.7, + "grad_norm": 0.18867741525173187, + "kl": 0.01796875, + "learning_rate": 1.16128570280878e-06, + "loss": 0.0007181556895375252, + "memory(GiB)": 27.09, + "reward": 0.45814998745918273, + "reward_std": 0.1034497192595154, + "rewards/MMContentORM/mean": 0.5959999918937683, + "rewards/MMContentORM/std": 0.6685267508029937, + "rewards/MMFormatORM/mean": 0.5931249856948853, + "rewards/MMFormatORM/std": 0.16180812418460847, + "rewards/MMRubricORM/mean": -0.0875, + "rewards/MMRubricORM/std": 0.24893558621406556, + "step": 3290, + "train_speed(iter/s)": 0.082978 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 304.6, + "completions/mean_length": 207.7875, + "completions/min_length": 142.6, + "epoch": 1.5818530964954394, + "frac_reward_zero_std": 0.65, + "grad_norm": 0.2017498016357422, + "kl": 0.017095947265625, + "learning_rate": 1.1485980083972242e-06, + "loss": 0.0006845677271485329, + "memory(GiB)": 27.09, + "reward": 0.4696999967098236, + "reward_std": 0.09899494857527316, + "rewards/MMContentORM/mean": 0.6105000078678131, + "rewards/MMContentORM/std": 0.6631593823432922, + "rewards/MMFormatORM/mean": 0.6012499809265137, + "rewards/MMFormatORM/std": 0.15380690693855287, + "rewards/MMRubricORM/mean": -0.075, + "rewards/MMRubricORM/std": 0.23662601709365844, + "step": 3295, + "train_speed(iter/s)": 0.082991 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/mean_length": 209.1, + "completions/min_length": 145.0, + "epoch": 1.584253480556889, + "frac_reward_zero_std": 0.725, + "grad_norm": 0.1147226095199585, + "kl": 0.01207275390625, + "learning_rate": 1.1359710052149191e-06, + "loss": 0.00048305182717740537, + "memory(GiB)": 27.09, + "reward": 0.536549985408783, + "reward_std": 0.05296229436062276, + "rewards/MMContentORM/mean": 0.7220000147819519, + "rewards/MMContentORM/std": 0.5010363392531871, + "rewards/MMFormatORM/mean": 0.6318749904632568, + "rewards/MMFormatORM/std": 0.07249999642372132, + "rewards/MMRubricORM/mean": -0.025, + "rewards/MMRubricORM/std": 0.1, + "step": 3300, + "train_speed(iter/s)": 0.083011 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.2, + "completions/mean_length": 204.5875, + "completions/min_length": 114.2, + "epoch": 1.5866538646183388, + "frac_reward_zero_std": 0.6, + "grad_norm": 0.22295618057250977, + "kl": 0.020770263671875, + "learning_rate": 1.123404892240927e-06, + "loss": 0.0008308948017656803, + "memory(GiB)": 27.09, + "reward": 0.4197499752044678, + "reward_std": 0.14799744696356357, + "rewards/MMContentORM/mean": 0.5, + "rewards/MMContentORM/std": 0.672368848323822, + "rewards/MMFormatORM/mean": 0.5931249737739563, + "rewards/MMFormatORM/std": 0.16571036279201506, + "rewards/MMRubricORM/mean": -0.0875, + "rewards/MMRubricORM/std": 0.25493902564048765, + "step": 3305, + "train_speed(iter/s)": 0.082994 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 389.0, + "completions/mean_length": 214.7875, + "completions/min_length": 132.8, + "epoch": 1.5890542486797887, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.08081990480422974, + "kl": 0.016156005859375, + "learning_rate": 1.110899867494784e-06, + "loss": 0.0006469148676842452, + "memory(GiB)": 27.09, + "reward": 0.4746500015258789, + "reward_std": 0.11108647771179676, + "rewards/MMContentORM/mean": 0.6085000157356262, + "rewards/MMContentORM/std": 0.6200405597686768, + "rewards/MMFormatORM/mean": 0.6093749761581421, + "rewards/MMFormatORM/std": 0.09680812656879426, + "rewards/MMRubricORM/mean": -0.0625, + "rewards/MMRubricORM/std": 0.14893558621406555, + "step": 3310, + "train_speed(iter/s)": 0.082984 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 339.8, + "completions/mean_length": 217.85, + "completions/min_length": 144.8, + "epoch": 1.5914546327412387, + "frac_reward_zero_std": 0.575, + "grad_norm": 0.20784571766853333, + "kl": 0.018511962890625, + "learning_rate": 1.0984561280333867e-06, + "loss": 0.0007408755365759135, + "memory(GiB)": 27.09, + "reward": 0.48464998602867126, + "reward_std": 0.1252286109374836, + "rewards/MMContentORM/mean": 0.6210000038146972, + "rewards/MMContentORM/std": 0.5666950985789299, + "rewards/MMFormatORM/mean": 0.615624976158142, + "rewards/MMFormatORM/std": 0.10976680517196655, + "rewards/MMRubricORM/mean": -0.05, + "rewards/MMRubricORM/std": 0.16831300854682923, + "step": 3315, + "train_speed(iter/s)": 0.082985 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.4, + "completions/mean_length": 200.8625, + "completions/min_length": 88.8, + "epoch": 1.5938550168026886, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.11155827343463898, + "kl": 0.035028076171875, + "learning_rate": 1.0860738699478852e-06, + "loss": 0.001404472440481186, + "memory(GiB)": 27.09, + "reward": 0.4386999785900116, + "reward_std": 0.1506137415766716, + "rewards/MMContentORM/mean": 0.5330000042915344, + "rewards/MMContentORM/std": 0.6952720165252686, + "rewards/MMFormatORM/mean": 0.6012499928474426, + "rewards/MMFormatORM/std": 0.17440344989299775, + "rewards/MMRubricORM/mean": -0.075, + "rewards/MMRubricORM/std": 0.2683130085468292, + "step": 3320, + "train_speed(iter/s)": 0.083002 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0125, + "completions/max_length": 521.0, + "completions/mean_length": 233.6, + "completions/min_length": 137.4, + "epoch": 1.5962554008641383, + "frac_reward_zero_std": 0.6, + "grad_norm": 0.1469486504793167, + "kl": 0.017987060546875, + "learning_rate": 1.0737532883605916e-06, + "loss": 0.0007191974669694901, + "memory(GiB)": 27.09, + "reward": 0.4517999827861786, + "reward_std": 0.19148451760411261, + "rewards/MMContentORM/mean": 0.5945000112056732, + "rewards/MMContentORM/std": 0.6807171523571014, + "rewards/MMFormatORM/mean": 0.5849999845027923, + "rewards/MMFormatORM/std": 0.16754122078418732, + "rewards/MMRubricORM/mean": -0.1, + "rewards/MMRubricORM/std": 0.2577557325363159, + "step": 3325, + "train_speed(iter/s)": 0.082945 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 302.0, + "completions/mean_length": 215.8875, + "completions/min_length": 132.0, + "epoch": 1.598655784925588, + "frac_reward_zero_std": 0.7, + "grad_norm": 0.13840670883655548, + "kl": 0.015753173828125, + "learning_rate": 1.0614945774219082e-06, + "loss": 0.0006300761830061674, + "memory(GiB)": 27.09, + "reward": 0.501199996471405, + "reward_std": 0.11200571209192275, + "rewards/MMContentORM/mean": 0.6605000019073486, + "rewards/MMContentORM/std": 0.5745877206325531, + "rewards/MMFormatORM/mean": 0.6174999833106994, + "rewards/MMFormatORM/std": 0.10940345227718354, + "rewards/MMRubricORM/mean": -0.05, + "rewards/MMRubricORM/std": 0.16831300854682923, + "step": 3330, + "train_speed(iter/s)": 0.082953 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 363.6, + "completions/mean_length": 220.6125, + "completions/min_length": 135.4, + "epoch": 1.601056168987038, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.1523619294166565, + "kl": 0.014703369140625, + "learning_rate": 1.049297930307262e-06, + "loss": 0.0005889554508030414, + "memory(GiB)": 27.09, + "reward": 0.39344998002052306, + "reward_std": 0.12918840944767, + "rewards/MMContentORM/mean": 0.4055000126361847, + "rewards/MMContentORM/std": 0.696004319190979, + "rewards/MMFormatORM/mean": 0.6093749880790711, + "rewards/MMFormatORM/std": 0.14190345108509064, + "rewards/MMRubricORM/mean": -0.0625, + "rewards/MMRubricORM/std": 0.21831300854682922, + "step": 3335, + "train_speed(iter/s)": 0.082948 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 328.4, + "completions/mean_length": 211.5125, + "completions/min_length": 128.0, + "epoch": 1.6034565530484879, + "frac_reward_zero_std": 0.775, + "grad_norm": 0.09966878592967987, + "kl": 0.017742919921875, + "learning_rate": 1.037163539214072e-06, + "loss": 0.0007098756264895201, + "memory(GiB)": 27.09, + "reward": 0.4519999802112579, + "reward_std": 0.08032733157742769, + "rewards/MMContentORM/mean": 0.537500011920929, + "rewards/MMContentORM/std": 0.5419799767434597, + "rewards/MMFormatORM/mean": 0.6174999833106994, + "rewards/MMFormatORM/std": 0.10940345227718354, + "rewards/MMRubricORM/mean": -0.05, + "rewards/MMRubricORM/std": 0.16831300854682923, + "step": 3340, + "train_speed(iter/s)": 0.082952 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 303.4, + "completions/mean_length": 207.275, + "completions/min_length": 120.8, + "epoch": 1.6058569371099376, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.15431775152683258, + "kl": 0.018145751953125, + "learning_rate": 1.0250915953587088e-06, + "loss": 0.0007263108156621457, + "memory(GiB)": 27.09, + "reward": 0.4955999791622162, + "reward_std": 0.07580183688551187, + "rewards/MMContentORM/mean": 0.6215000033378602, + "rewards/MMContentORM/std": 0.5888689577579498, + "rewards/MMFormatORM/mean": 0.6299999833106995, + "rewards/MMFormatORM/std": 0.06737477481365203, + "rewards/MMRubricORM/mean": -0.025, + "rewards/MMRubricORM/std": 0.1, + "step": 3345, + "train_speed(iter/s)": 0.082961 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 348.4, + "completions/mean_length": 223.1125, + "completions/min_length": 131.8, + "epoch": 1.6082573211713873, + "frac_reward_zero_std": 0.675, + "grad_norm": 0.08975112438201904, + "kl": 0.01865234375, + "learning_rate": 1.013082288973481e-06, + "loss": 0.0007464576978236437, + "memory(GiB)": 27.09, + "reward": 0.4766999840736389, + "reward_std": 0.15117942318320274, + "rewards/MMContentORM/mean": 0.628000009059906, + "rewards/MMContentORM/std": 0.6414366006851197, + "rewards/MMFormatORM/mean": 0.6012499928474426, + "rewards/MMFormatORM/std": 0.14990466833114624, + "rewards/MMRubricORM/mean": -0.075, + "rewards/MMRubricORM/std": 0.23062257766723632, + "step": 3350, + "train_speed(iter/s)": 0.082961 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 319.2, + "completions/mean_length": 213.5625, + "completions/min_length": 143.2, + "epoch": 1.6106577052328372, + "frac_reward_zero_std": 0.7, + "grad_norm": 0.1311890333890915, + "kl": 0.01627197265625, + "learning_rate": 1.0011358093036527e-06, + "loss": 0.0006509319879114628, + "memory(GiB)": 27.09, + "reward": 0.46369996666908264, + "reward_std": 0.08089300859719514, + "rewards/MMContentORM/mean": 0.595499986410141, + "rewards/MMContentORM/std": 0.6696452021598815, + "rewards/MMFormatORM/mean": 0.6012499809265137, + "rewards/MMFormatORM/std": 0.12930812537670136, + "rewards/MMRubricORM/mean": -0.075, + "rewards/MMRubricORM/std": 0.19893558621406554, + "step": 3355, + "train_speed(iter/s)": 0.082963 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 297.4, + "completions/mean_length": 210.25, + "completions/min_length": 101.8, + "epoch": 1.6130580892942872, + "frac_reward_zero_std": 0.725, + "grad_norm": 0.12645810842514038, + "kl": 0.0197021484375, + "learning_rate": 9.89252344604444e-07, + "loss": 0.000787085946649313, + "memory(GiB)": 27.09, + "reward": 0.4845999777317047, + "reward_std": 0.11115718111395836, + "rewards/MMContentORM/mean": 0.6190000116825104, + "rewards/MMContentORM/std": 0.6094013214111328, + "rewards/MMFormatORM/mean": 0.6174999833106994, + "rewards/MMFormatORM/std": 0.10940345227718354, + "rewards/MMRubricORM/mean": -0.05, + "rewards/MMRubricORM/std": 0.16831300854682923, + "step": 3360, + "train_speed(iter/s)": 0.082972 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 332.0, + "completions/mean_length": 212.8875, + "completions/min_length": 153.4, + "epoch": 1.615458473355737, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.07932203263044357, + "kl": 0.014056396484375, + "learning_rate": 9.774320821380734e-07, + "loss": 0.0005630974192172289, + "memory(GiB)": 27.09, + "reward": 0.43234997391700747, + "reward_std": 0.0758725541876629, + "rewards/MMContentORM/mean": 0.47400000095367434, + "rewards/MMContentORM/std": 0.6691441416740418, + "rewards/MMFormatORM/mean": 0.6256249904632568, + "rewards/MMFormatORM/std": 0.09749999642372131, + "rewards/MMRubricORM/mean": -0.0375, + "rewards/MMRubricORM/std": 0.15, + "step": 3365, + "train_speed(iter/s)": 0.08298 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 329.0, + "completions/mean_length": 212.225, + "completions/min_length": 151.6, + "epoch": 1.6178588574171866, + "frac_reward_zero_std": 0.7, + "grad_norm": 0.1142314150929451, + "kl": 0.014459228515625, + "learning_rate": 9.656752081708031e-07, + "loss": 0.0005782804451882839, + "memory(GiB)": 27.09, + "reward": 0.43564997911453246, + "reward_std": 0.10175266563892364, + "rewards/MMContentORM/mean": 0.510999995470047, + "rewards/MMContentORM/std": 0.6983252167701721, + "rewards/MMFormatORM/mean": 0.609375, + "rewards/MMFormatORM/std": 0.16249999403953552, + "rewards/MMRubricORM/mean": -0.0625, + "rewards/MMRubricORM/std": 0.25, + "step": 3370, + "train_speed(iter/s)": 0.082987 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0125, + "completions/max_length": 461.6, + "completions/mean_length": 228.4, + "completions/min_length": 124.0, + "epoch": 1.6202592414786365, + "frac_reward_zero_std": 0.6, + "grad_norm": 0.15813876688480377, + "kl": 0.02144775390625, + "learning_rate": 9.539819079700096e-07, + "loss": 0.0008579882793128491, + "memory(GiB)": 27.09, + "reward": 0.4178499698638916, + "reward_std": 0.11221784348599613, + "rewards/MMContentORM/mean": 0.4665000081062317, + "rewards/MMContentORM/std": 0.6567980706691742, + "rewards/MMFormatORM/mean": 0.6093749821186065, + "rewards/MMFormatORM/std": 0.09063776731491088, + "rewards/MMRubricORM/mean": -0.0625, + "rewards/MMRubricORM/std": 0.1394427239894867, + "step": 3375, + "train_speed(iter/s)": 0.082951 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 318.4, + "completions/mean_length": 212.875, + "completions/min_length": 125.2, + "epoch": 1.6226596255400865, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.28574997186660767, + "kl": 0.0211181640625, + "learning_rate": 9.423523658012568e-07, + "loss": 0.0008457589894533158, + "memory(GiB)": 27.09, + "reward": 0.4936999797821045, + "reward_std": 0.13618875967804342, + "rewards/MMContentORM/mean": 0.6705000221729278, + "rewards/MMContentORM/std": 0.5642464995384217, + "rewards/MMFormatORM/mean": 0.6012499749660491, + "rewards/MMFormatORM/std": 0.10254122316837311, + "rewards/MMRubricORM/mean": -0.075, + "rewards/MMRubricORM/std": 0.15775573253631592, + "step": 3380, + "train_speed(iter/s)": 0.082959 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 318.6, + "completions/mean_length": 209.95, + "completions/min_length": 126.0, + "epoch": 1.6250600096015364, + "frac_reward_zero_std": 0.725, + "grad_norm": 0.11802355945110321, + "kl": 0.012750244140625, + "learning_rate": 9.30786764925396e-07, + "loss": 0.000509438058361411, + "memory(GiB)": 27.09, + "reward": 0.5361499786376953, + "reward_std": 0.05635640830732882, + "rewards/MMContentORM/mean": 0.7210000157356262, + "rewards/MMContentORM/std": 0.5421726107597351, + "rewards/MMFormatORM/mean": 0.6318749904632568, + "rewards/MMFormatORM/std": 0.07249999642372132, + "rewards/MMRubricORM/mean": -0.025, + "rewards/MMRubricORM/std": 0.1, + "step": 3385, + "train_speed(iter/s)": 0.082966 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 288.6, + "completions/mean_length": 208.0, + "completions/min_length": 138.8, + "epoch": 1.6274603936629861, + "frac_reward_zero_std": 0.7, + "grad_norm": 0.006833571009337902, + "kl": 0.015826416015625, + "learning_rate": 9.192852875956787e-07, + "loss": 0.0006330645643174649, + "memory(GiB)": 27.09, + "reward": 0.4655999720096588, + "reward_std": 0.1080459140241146, + "rewards/MMContentORM/mean": 0.571500027179718, + "rewards/MMContentORM/std": 0.6459370970726013, + "rewards/MMFormatORM/mean": 0.6174999833106994, + "rewards/MMFormatORM/std": 0.10940345227718354, + "rewards/MMRubricORM/mean": -0.05, + "rewards/MMRubricORM/std": 0.16831300854682923, + "step": 3390, + "train_speed(iter/s)": 0.082977 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 362.0, + "completions/mean_length": 229.1375, + "completions/min_length": 148.4, + "epoch": 1.6298607777244358, + "frac_reward_zero_std": 0.725, + "grad_norm": 0.005902289412915707, + "kl": 0.0166748046875, + "learning_rate": 9.078481150548824e-07, + "loss": 0.0006678791251033545, + "memory(GiB)": 27.09, + "reward": 0.42405000030994416, + "reward_std": 0.11476342976093293, + "rewards/MMContentORM/mean": 0.4819999933242798, + "rewards/MMContentORM/std": 0.671118414402008, + "rewards/MMFormatORM/mean": 0.6093749880790711, + "rewards/MMFormatORM/std": 0.11740466952323914, + "rewards/MMRubricORM/mean": -0.0625, + "rewards/MMRubricORM/std": 0.18062257766723633, + "step": 3395, + "train_speed(iter/s)": 0.08297 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 314.2, + "completions/mean_length": 217.4, + "completions/min_length": 134.8, + "epoch": 1.6322611617858858, + "frac_reward_zero_std": 0.675, + "grad_norm": 0.15501824021339417, + "kl": 0.016632080078125, + "learning_rate": 8.964754275324589e-07, + "loss": 0.0006651143543422222, + "memory(GiB)": 27.09, + "reward": 0.4716499984264374, + "reward_std": 0.07870098501443863, + "rewards/MMContentORM/mean": 0.6010000109672546, + "rewards/MMContentORM/std": 0.6552097082138062, + "rewards/MMFormatORM/mean": 0.6093749880790711, + "rewards/MMFormatORM/std": 0.14190345108509064, + "rewards/MMRubricORM/mean": -0.0625, + "rewards/MMRubricORM/std": 0.21831300854682922, + "step": 3400, + "train_speed(iter/s)": 0.082979 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 459.4, + "completions/mean_length": 236.1, + "completions/min_length": 152.0, + "epoch": 1.6346615458473357, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.07511002570390701, + "kl": 0.01353759765625, + "learning_rate": 8.851674042416852e-07, + "loss": 0.0005421666894108057, + "memory(GiB)": 27.09, + "reward": 0.4685499846935272, + "reward_std": 0.10839946605265141, + "rewards/MMContentORM/mean": 0.5644999861717224, + "rewards/MMContentORM/std": 0.5619328938424587, + "rewards/MMFormatORM/mean": 0.6256249904632568, + "rewards/MMFormatORM/std": 0.09749999642372131, + "rewards/MMRubricORM/mean": -0.0375, + "rewards/MMRubricORM/std": 0.15, + "step": 3405, + "train_speed(iter/s)": 0.082918 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 296.8, + "completions/mean_length": 204.75, + "completions/min_length": 130.6, + "epoch": 1.6370619299087854, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.1756453961133957, + "kl": 0.0164794921875, + "learning_rate": 8.739242233768519e-07, + "loss": 0.0006595761980861426, + "memory(GiB)": 27.09, + "reward": 0.4861499845981598, + "reward_std": 0.14361338005401195, + "rewards/MMContentORM/mean": 0.6535000085830689, + "rewards/MMContentORM/std": 0.6205046653747559, + "rewards/MMFormatORM/mean": 0.5993749856948852, + "rewards/MMFormatORM/std": 0.13680812418460847, + "rewards/MMRubricORM/mean": -0.075, + "rewards/MMRubricORM/std": 0.19893558621406554, + "step": 3410, + "train_speed(iter/s)": 0.082932 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 336.8, + "completions/mean_length": 216.9875, + "completions/min_length": 124.0, + "epoch": 1.6394623139702351, + "frac_reward_zero_std": 0.575, + "grad_norm": 0.06857043504714966, + "kl": 0.031121826171875, + "learning_rate": 8.627460621104444e-07, + "loss": 0.001243231911212206, + "memory(GiB)": 27.09, + "reward": 0.46214998364448545, + "reward_std": 0.17797877669800072, + "rewards/MMContentORM/mean": 0.6060000151395798, + "rewards/MMContentORM/std": 0.5733154647052288, + "rewards/MMFormatORM/mean": 0.5931249856948853, + "rewards/MMFormatORM/std": 0.16180812418460847, + "rewards/MMRubricORM/mean": -0.0875, + "rewards/MMRubricORM/std": 0.24893558621406556, + "step": 3415, + "train_speed(iter/s)": 0.082938 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 288.0, + "completions/mean_length": 213.0625, + "completions/min_length": 142.6, + "epoch": 1.641862698031685, + "frac_reward_zero_std": 0.575, + "grad_norm": 0.1507033109664917, + "kl": 0.01561279296875, + "learning_rate": 8.516330965903564e-07, + "loss": 0.0006242851726710796, + "memory(GiB)": 27.09, + "reward": 0.4748999834060669, + "reward_std": 0.14523972067981958, + "rewards/MMContentORM/mean": 0.6235000133514405, + "rewards/MMContentORM/std": 0.6498092889785767, + "rewards/MMFormatORM/mean": 0.6012499809265137, + "rewards/MMFormatORM/std": 0.15380690693855287, + "rewards/MMRubricORM/mean": -0.075, + "rewards/MMRubricORM/std": 0.23662601709365844, + "step": 3420, + "train_speed(iter/s)": 0.08295 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 324.2, + "completions/mean_length": 220.0, + "completions/min_length": 117.2, + "epoch": 1.644263082093135, + "frac_reward_zero_std": 0.65, + "grad_norm": 0.1359294056892395, + "kl": 0.0308837890625, + "learning_rate": 8.405855019371123e-07, + "loss": 0.0012361595407128334, + "memory(GiB)": 27.09, + "reward": 0.3967999845743179, + "reward_std": 0.17083699703216554, + "rewards/MMContentORM/mean": 0.4570000171661377, + "rewards/MMContentORM/std": 0.6885712265968322, + "rewards/MMFormatORM/mean": 0.5849999964237214, + "rewards/MMFormatORM/std": 0.1881377637386322, + "rewards/MMRubricORM/mean": -0.1, + "rewards/MMRubricORM/std": 0.28944272398948667, + "step": 3425, + "train_speed(iter/s)": 0.082954 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.6, + "completions/mean_length": 209.8, + "completions/min_length": 152.6, + "epoch": 1.6466634661545847, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.08136511594057083, + "kl": 0.397613525390625, + "learning_rate": 8.296034522411078e-07, + "loss": 0.01587701141834259, + "memory(GiB)": 27.09, + "reward": 0.555299985408783, + "reward_std": 0.0552957494975999, + "rewards/MMContentORM/mean": 0.7670000076293946, + "rewards/MMContentORM/std": 0.4387725330889225, + "rewards/MMFormatORM/mean": 0.6337499856948853, + "rewards/MMFormatORM/std": 0.06499999761581421, + "rewards/MMRubricORM/mean": -0.025, + "rewards/MMRubricORM/std": 0.1, + "step": 3430, + "train_speed(iter/s)": 0.082969 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 291.6, + "completions/mean_length": 210.5875, + "completions/min_length": 129.6, + "epoch": 1.6490638502160344, + "frac_reward_zero_std": 0.45, + "grad_norm": 0.18326374888420105, + "kl": 0.018804931640625, + "learning_rate": 8.186871205598712e-07, + "loss": 0.0007523265201598405, + "memory(GiB)": 27.09, + "reward": 0.4282499849796295, + "reward_std": 0.175716033577919, + "rewards/MMContentORM/mean": 0.5500000059604645, + "rewards/MMContentORM/std": 0.6840834498405457, + "rewards/MMFormatORM/mean": 0.5768749833106994, + "rewards/MMFormatORM/std": 0.17163621485233307, + "rewards/MMRubricORM/mean": -0.1125, + "rewards/MMRubricORM/std": 0.26405572295188906, + "step": 3435, + "train_speed(iter/s)": 0.082985 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 340.8, + "completions/mean_length": 220.225, + "completions/min_length": 150.8, + "epoch": 1.6514642342774843, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.11703846603631973, + "kl": 0.0169677734375, + "learning_rate": 8.078366789153241e-07, + "loss": 0.0006778911687433719, + "memory(GiB)": 27.09, + "reward": 0.4566999852657318, + "reward_std": 0.10988439926877618, + "rewards/MMContentORM/mean": 0.5529999971389771, + "rewards/MMContentORM/std": 0.635290002822876, + "rewards/MMFormatORM/mean": 0.6137499809265137, + "rewards/MMFormatORM/std": 0.11046060025691987, + "rewards/MMRubricORM/mean": -0.05, + "rewards/MMRubricORM/std": 0.16831300854682923, + "step": 3440, + "train_speed(iter/s)": 0.082986 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 312.8, + "completions/mean_length": 221.65, + "completions/min_length": 146.4, + "epoch": 1.6538646183389343, + "frac_reward_zero_std": 0.575, + "grad_norm": 0.17084509134292603, + "kl": 0.015997314453125, + "learning_rate": 7.970522982910856e-07, + "loss": 0.0006411694921553135, + "memory(GiB)": 27.09, + "reward": 0.4217999815940857, + "reward_std": 0.1796051269862801, + "rewards/MMContentORM/mean": 0.5195000052452088, + "rewards/MMContentORM/std": 0.6755177021026612, + "rewards/MMFormatORM/mean": 0.5849999845027923, + "rewards/MMFormatORM/std": 0.16754122078418732, + "rewards/MMRubricORM/mean": -0.1, + "rewards/MMRubricORM/std": 0.2577557325363159, + "step": 3445, + "train_speed(iter/s)": 0.082994 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 329.0, + "completions/mean_length": 222.2875, + "completions/min_length": 165.8, + "epoch": 1.6562650024003842, + "frac_reward_zero_std": 0.7, + "grad_norm": 0.17295852303504944, + "kl": 0.01490478515625, + "learning_rate": 7.863341486297682e-07, + "loss": 0.0005966905970126391, + "memory(GiB)": 27.09, + "reward": 0.4945999622344971, + "reward_std": 0.09814641983248293, + "rewards/MMContentORM/mean": 0.6440000176429749, + "rewards/MMContentORM/std": 0.5497657291591167, + "rewards/MMFormatORM/mean": 0.6174999833106994, + "rewards/MMFormatORM/std": 0.10940345227718354, + "rewards/MMRubricORM/mean": -0.05, + "rewards/MMRubricORM/std": 0.16831300854682923, + "step": 3450, + "train_speed(iter/s)": 0.082998 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 370.6, + "completions/mean_length": 221.95, + "completions/min_length": 137.8, + "epoch": 1.658665386461834, + "frac_reward_zero_std": 0.65, + "grad_norm": 0.14504937827587128, + "kl": 0.01641845703125, + "learning_rate": 7.756823988303025e-07, + "loss": 0.0006571163889020681, + "memory(GiB)": 27.09, + "reward": 0.4818499743938446, + "reward_std": 0.08124656807631254, + "rewards/MMContentORM/mean": 0.6264999985694886, + "rewards/MMContentORM/std": 0.6504538416862488, + "rewards/MMFormatORM/mean": 0.6093749880790711, + "rewards/MMFormatORM/std": 0.14190345108509064, + "rewards/MMRubricORM/mean": -0.0625, + "rewards/MMRubricORM/std": 0.21831300854682922, + "step": 3455, + "train_speed(iter/s)": 0.08299 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 312.0, + "completions/mean_length": 209.3625, + "completions/min_length": 125.2, + "epoch": 1.6610657705232836, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.14114868640899658, + "kl": 0.017041015625, + "learning_rate": 7.650972167452752e-07, + "loss": 0.0006805134937167168, + "memory(GiB)": 27.09, + "reward": 0.476099956035614, + "reward_std": 0.13675445076078177, + "rewards/MMContentORM/mean": 0.626500004529953, + "rewards/MMContentORM/std": 0.6218415260314941, + "rewards/MMFormatORM/mean": 0.6012499809265137, + "rewards/MMFormatORM/std": 0.12930812537670136, + "rewards/MMRubricORM/mean": -0.075, + "rewards/MMRubricORM/std": 0.19893558621406554, + "step": 3460, + "train_speed(iter/s)": 0.082999 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 308.6, + "completions/mean_length": 210.4875, + "completions/min_length": 126.6, + "epoch": 1.6634661545847336, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.21840178966522217, + "kl": 0.016424560546875, + "learning_rate": 7.545787691782847e-07, + "loss": 0.0006578662432730198, + "memory(GiB)": 27.09, + "reward": 0.3567999839782715, + "reward_std": 0.17606958658434452, + "rewards/MMContentORM/mean": 0.38949999660253526, + "rewards/MMContentORM/std": 0.7360579133033752, + "rewards/MMFormatORM/mean": 0.5649999618530274, + "rewards/MMFormatORM/std": 0.18591444790363312, + "rewards/MMRubricORM/mean": -0.125, + "rewards/MMRubricORM/std": 0.2823687314987183, + "step": 3465, + "train_speed(iter/s)": 0.083009 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 335.6, + "completions/mean_length": 217.7875, + "completions/min_length": 152.2, + "epoch": 1.6658665386461835, + "frac_reward_zero_std": 0.825, + "grad_norm": 0.13330751657485962, + "kl": 0.011822509765625, + "learning_rate": 7.441272218813156e-07, + "loss": 0.0004730843007564545, + "memory(GiB)": 27.09, + "reward": 0.5238499701023102, + "reward_std": 0.03981010988354683, + "rewards/MMContentORM/mean": 0.6740000247955322, + "rewards/MMContentORM/std": 0.5537904977798462, + "rewards/MMFormatORM/mean": 0.6418749809265136, + "rewards/MMFormatORM/std": 0.032499998807907104, + "rewards/MMRubricORM/mean": -0.0125, + "rewards/MMRubricORM/std": 0.05, + "step": 3470, + "train_speed(iter/s)": 0.083019 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 356.4, + "completions/mean_length": 227.1375, + "completions/min_length": 125.0, + "epoch": 1.6682669227076332, + "frac_reward_zero_std": 0.725, + "grad_norm": 0.19898498058319092, + "kl": 0.01395263671875, + "learning_rate": 7.337427395521173e-07, + "loss": 0.0005571233108639717, + "memory(GiB)": 27.09, + "reward": 0.5003499805927276, + "reward_std": 0.08888331830967218, + "rewards/MMContentORM/mean": 0.6440000176429749, + "rewards/MMContentORM/std": 0.6199796617031097, + "rewards/MMFormatORM/mean": 0.6256249785423279, + "rewards/MMFormatORM/std": 0.07690345346927643, + "rewards/MMRubricORM/mean": -0.0375, + "rewards/MMRubricORM/std": 0.11831300854682922, + "step": 3475, + "train_speed(iter/s)": 0.083012 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 351.0, + "completions/mean_length": 221.5875, + "completions/min_length": 139.8, + "epoch": 1.670667306769083, + "frac_reward_zero_std": 0.65, + "grad_norm": 0.15592429041862488, + "kl": 0.01651611328125, + "learning_rate": 7.234254858316187e-07, + "loss": 0.0006610705517232418, + "memory(GiB)": 27.09, + "reward": 0.4636499762535095, + "reward_std": 0.10514677353203297, + "rewards/MMContentORM/mean": 0.581000006198883, + "rewards/MMContentORM/std": 0.6593972444534302, + "rewards/MMFormatORM/mean": 0.6093749880790711, + "rewards/MMFormatORM/std": 0.14190345108509064, + "rewards/MMRubricORM/mean": -0.0625, + "rewards/MMRubricORM/std": 0.21831300854682922, + "step": 3480, + "train_speed(iter/s)": 0.083008 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 297.8, + "completions/mean_length": 206.2125, + "completions/min_length": 124.2, + "epoch": 1.6730676908305329, + "frac_reward_zero_std": 0.675, + "grad_norm": 0.10853126645088196, + "kl": 0.013946533203125, + "learning_rate": 7.13175623301347e-07, + "loss": 0.0005579915829002857, + "memory(GiB)": 27.09, + "reward": 0.5001499652862549, + "reward_std": 0.08407499315217137, + "rewards/MMContentORM/mean": 0.6435000181198121, + "rewards/MMContentORM/std": 0.5942914664745331, + "rewards/MMFormatORM/mean": 0.6256249904632568, + "rewards/MMFormatORM/std": 0.09749999642372131, + "rewards/MMRubricORM/mean": -0.0375, + "rewards/MMRubricORM/std": 0.15, + "step": 3485, + "train_speed(iter/s)": 0.083022 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 318.2, + "completions/mean_length": 208.8, + "completions/min_length": 114.4, + "epoch": 1.6754680748919828, + "frac_reward_zero_std": 0.65, + "grad_norm": 0.26080384850502014, + "kl": 0.019244384765625, + "learning_rate": 7.02993313480862e-07, + "loss": 0.0007686344906687737, + "memory(GiB)": 27.09, + "reward": 0.5148499727249145, + "reward_std": 0.1034497192595154, + "rewards/MMContentORM/mean": 0.7090000152587891, + "rewards/MMContentORM/std": 0.5670624554157258, + "rewards/MMFormatORM/mean": 0.6093749761581421, + "rewards/MMFormatORM/std": 0.12130690813064575, + "rewards/MMRubricORM/mean": -0.0625, + "rewards/MMRubricORM/std": 0.18662601709365845, + "step": 3490, + "train_speed(iter/s)": 0.08303 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 306.4, + "completions/mean_length": 218.8125, + "completions/min_length": 144.0, + "epoch": 1.6778684589534325, + "frac_reward_zero_std": 0.7, + "grad_norm": 0.16066478192806244, + "kl": 0.015631103515625, + "learning_rate": 6.928787168252132e-07, + "loss": 0.0006245138123631477, + "memory(GiB)": 27.09, + "reward": 0.45314998030662534, + "reward_std": 0.11943033039569854, + "rewards/MMContentORM/mean": 0.5835000038146972, + "rewards/MMContentORM/std": 0.6094723448157311, + "rewards/MMFormatORM/mean": 0.5931249737739563, + "rewards/MMFormatORM/std": 0.16571036279201506, + "rewards/MMRubricORM/mean": -0.0875, + "rewards/MMRubricORM/std": 0.25493902564048765, + "step": 3495, + "train_speed(iter/s)": 0.083039 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 344.4, + "completions/mean_length": 216.3875, + "completions/min_length": 134.0, + "epoch": 1.6802688430148822, + "frac_reward_zero_std": 0.825, + "grad_norm": 0.00513013731688261, + "kl": 0.015216064453125, + "learning_rate": 6.828319927224114e-07, + "loss": 0.0006088857538998127, + "memory(GiB)": 27.09, + "reward": 0.5228999733924866, + "reward_std": 0.05359869406092912, + "rewards/MMContentORM/mean": 0.6860000133514405, + "rewards/MMContentORM/std": 0.5610509395599366, + "rewards/MMFormatORM/mean": 0.6337499856948853, + "rewards/MMFormatORM/std": 0.06499999761581421, + "rewards/MMRubricORM/mean": -0.025, + "rewards/MMRubricORM/std": 0.1, + "step": 3500, + "train_speed(iter/s)": 0.083035 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 363.6, + "completions/mean_length": 215.85, + "completions/min_length": 95.2, + "epoch": 1.6826692270763322, + "frac_reward_zero_std": 0.675, + "grad_norm": 0.17233267426490784, + "kl": 0.021417236328125, + "learning_rate": 6.728532994909203e-07, + "loss": 0.0008568591438233852, + "memory(GiB)": 27.09, + "reward": 0.401749986410141, + "reward_std": 0.15421998733654618, + "rewards/MMContentORM/mean": 0.4550000071525574, + "rewards/MMContentORM/std": 0.6894314765930176, + "rewards/MMFormatORM/mean": 0.5931249856948853, + "rewards/MMFormatORM/std": 0.16180812418460847, + "rewards/MMRubricORM/mean": -0.0875, + "rewards/MMRubricORM/std": 0.24893558621406556, + "step": 3505, + "train_speed(iter/s)": 0.082993 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 317.2, + "completions/mean_length": 211.375, + "completions/min_length": 151.2, + "epoch": 1.685069611137782, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.14501921832561493, + "kl": 0.01634521484375, + "learning_rate": 6.629427943771532e-07, + "loss": 0.0006534026004374027, + "memory(GiB)": 27.09, + "reward": 0.49959996342658997, + "reward_std": 0.10917728263884782, + "rewards/MMContentORM/mean": 0.6565000057220459, + "rewards/MMContentORM/std": 0.628342616558075, + "rewards/MMFormatORM/mean": 0.6174999833106994, + "rewards/MMFormatORM/std": 0.10940345227718354, + "rewards/MMRubricORM/mean": -0.05, + "rewards/MMRubricORM/std": 0.16831300854682923, + "step": 3510, + "train_speed(iter/s)": 0.082999 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 293.2, + "completions/mean_length": 209.375, + "completions/min_length": 115.8, + "epoch": 1.687469995199232, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.15510372817516327, + "kl": 0.0161865234375, + "learning_rate": 6.531006335530016e-07, + "loss": 0.0006463156081736088, + "memory(GiB)": 27.09, + "reward": 0.4707499802112579, + "reward_std": 0.06851864596828819, + "rewards/MMContentORM/mean": 0.5699999809265137, + "rewards/MMContentORM/std": 0.5937826454639434, + "rewards/MMFormatORM/mean": 0.6256249904632568, + "rewards/MMFormatORM/std": 0.09749999642372131, + "rewards/MMRubricORM/mean": -0.0375, + "rewards/MMRubricORM/std": 0.15, + "step": 3515, + "train_speed(iter/s)": 0.083008 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 348.2, + "completions/mean_length": 223.975, + "completions/min_length": 132.6, + "epoch": 1.6898703792606817, + "frac_reward_zero_std": 0.65, + "grad_norm": 0.18824124336242676, + "kl": 0.013726806640625, + "learning_rate": 6.433269721133767e-07, + "loss": 0.0005491763353347778, + "memory(GiB)": 27.09, + "reward": 0.4896999835968018, + "reward_std": 0.08810550197958947, + "rewards/MMContentORM/mean": 0.602999997138977, + "rewards/MMContentORM/std": 0.617827194929123, + "rewards/MMFormatORM/mean": 0.6337499737739563, + "rewards/MMFormatORM/std": 0.04440345466136932, + "rewards/MMRubricORM/mean": -0.025, + "rewards/MMRubricORM/std": 0.06831300854682923, + "step": 3520, + "train_speed(iter/s)": 0.083 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 308.0, + "completions/mean_length": 209.875, + "completions/min_length": 147.0, + "epoch": 1.6922707633221314, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.18868878483772278, + "kl": 0.018115234375, + "learning_rate": 6.336219640737568e-07, + "loss": 0.0007253088988363743, + "memory(GiB)": 27.09, + "reward": 0.4806499779224396, + "reward_std": 0.12635998169425874, + "rewards/MMContentORM/mean": 0.6235000014305114, + "rewards/MMContentORM/std": 0.6232763528823853, + "rewards/MMFormatORM/mean": 0.6093749761581421, + "rewards/MMFormatORM/std": 0.09680812656879426, + "rewards/MMRubricORM/mean": -0.0625, + "rewards/MMRubricORM/std": 0.14893558621406555, + "step": 3525, + "train_speed(iter/s)": 0.083009 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 300.8, + "completions/mean_length": 211.3375, + "completions/min_length": 137.4, + "epoch": 1.6946711473835814, + "frac_reward_zero_std": 0.65, + "grad_norm": 0.15571913123130798, + "kl": 0.01719970703125, + "learning_rate": 6.23985762367768e-07, + "loss": 0.0006875310558825731, + "memory(GiB)": 27.09, + "reward": 0.47434998154640196, + "reward_std": 0.08322646701708436, + "rewards/MMContentORM/mean": 0.5790000081062316, + "rewards/MMContentORM/std": 0.6246312737464905, + "rewards/MMFormatORM/mean": 0.6256249785423279, + "rewards/MMFormatORM/std": 0.07690345346927643, + "rewards/MMRubricORM/mean": -0.0375, + "rewards/MMRubricORM/std": 0.11831300854682922, + "step": 3530, + "train_speed(iter/s)": 0.083019 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 316.8, + "completions/mean_length": 213.8125, + "completions/min_length": 136.2, + "epoch": 1.6970715314450313, + "frac_reward_zero_std": 0.65, + "grad_norm": 0.14880269765853882, + "kl": 0.0153564453125, + "learning_rate": 6.144185188447682e-07, + "loss": 0.000614521512761712, + "memory(GiB)": 27.09, + "reward": 0.5258000135421753, + "reward_std": 0.08343859082087875, + "rewards/MMContentORM/mean": 0.7220000326633453, + "rewards/MMContentORM/std": 0.4758839137852192, + "rewards/MMFormatORM/mean": 0.6174999833106994, + "rewards/MMFormatORM/std": 0.08490467071533203, + "rewards/MMRubricORM/mean": -0.05, + "rewards/MMRubricORM/std": 0.13062257766723634, + "step": 3535, + "train_speed(iter/s)": 0.083024 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 343.8, + "completions/mean_length": 220.625, + "completions/min_length": 151.2, + "epoch": 1.699471915506481, + "frac_reward_zero_std": 0.675, + "grad_norm": 0.17675665020942688, + "kl": 0.01483154296875, + "learning_rate": 6.049203842674628e-07, + "loss": 0.0005933211185038089, + "memory(GiB)": 27.09, + "reward": 0.4919499814510345, + "reward_std": 0.08209509402513504, + "rewards/MMContentORM/mean": 0.6230000138282776, + "rewards/MMContentORM/std": 0.6088876247406005, + "rewards/MMFormatORM/mean": 0.6256249904632568, + "rewards/MMFormatORM/std": 0.09749999642372131, + "rewards/MMRubricORM/mean": -0.0375, + "rewards/MMRubricORM/std": 0.15, + "step": 3540, + "train_speed(iter/s)": 0.083022 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 309.2, + "completions/mean_length": 214.075, + "completions/min_length": 130.6, + "epoch": 1.7018722995679307, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.17425678670406342, + "kl": 0.015087890625, + "learning_rate": 5.954915083095164e-07, + "loss": 0.0006034282967448234, + "memory(GiB)": 27.09, + "reward": 0.42609999179840086, + "reward_std": 0.1848377011716366, + "rewards/MMContentORM/mean": 0.5015000104904175, + "rewards/MMContentORM/std": 0.7074744701385498, + "rewards/MMFormatORM/mean": 0.6012499809265137, + "rewards/MMFormatORM/std": 0.15380690693855287, + "rewards/MMRubricORM/mean": -0.075, + "rewards/MMRubricORM/std": 0.23662601709365844, + "step": 3545, + "train_speed(iter/s)": 0.083032 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 318.2, + "completions/mean_length": 213.4125, + "completions/min_length": 126.6, + "epoch": 1.7042726836293807, + "frac_reward_zero_std": 0.575, + "grad_norm": 0.19085994362831116, + "kl": 0.01416015625, + "learning_rate": 5.86132039553205e-07, + "loss": 0.000567510724067688, + "memory(GiB)": 27.09, + "reward": 0.41705000400543213, + "reward_std": 0.14219917133450508, + "rewards/MMContentORM/mean": 0.4645000100135803, + "rewards/MMContentORM/std": 0.6884559154510498, + "rewards/MMFormatORM/mean": 0.6093749761581421, + "rewards/MMFormatORM/std": 0.12130690813064575, + "rewards/MMRubricORM/mean": -0.0625, + "rewards/MMRubricORM/std": 0.18662601709365845, + "step": 3550, + "train_speed(iter/s)": 0.083032 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 345.2, + "completions/mean_length": 222.175, + "completions/min_length": 140.8, + "epoch": 1.7066730676908306, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.14451654255390167, + "kl": 0.016253662109375, + "learning_rate": 5.768421254870721e-07, + "loss": 0.0006507723592221737, + "memory(GiB)": 27.09, + "reward": 0.4542999863624573, + "reward_std": 0.08343860041350126, + "rewards/MMContentORM/mean": 0.5719999969005585, + "rewards/MMContentORM/std": 0.46381150707602503, + "rewards/MMFormatORM/mean": 0.6012499749660491, + "rewards/MMFormatORM/std": 0.10254122316837311, + "rewards/MMRubricORM/mean": -0.075, + "rewards/MMRubricORM/std": 0.15775573253631592, + "step": 3555, + "train_speed(iter/s)": 0.083035 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 305.2, + "completions/mean_length": 211.3375, + "completions/min_length": 101.4, + "epoch": 1.7090734517522803, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.10909511148929596, + "kl": 0.018408203125, + "learning_rate": 5.676219125036008e-07, + "loss": 0.0007358456961810589, + "memory(GiB)": 27.09, + "reward": 0.4575499951839447, + "reward_std": 0.17769593372941017, + "rewards/MMContentORM/mean": 0.5944999873638153, + "rewards/MMContentORM/std": 0.6650677740573883, + "rewards/MMFormatORM/mean": 0.5931249737739563, + "rewards/MMFormatORM/std": 0.14121158123016359, + "rewards/MMRubricORM/mean": -0.0875, + "rewards/MMRubricORM/std": 0.21724859476089478, + "step": 3560, + "train_speed(iter/s)": 0.083043 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 343.6, + "completions/mean_length": 219.425, + "completions/min_length": 124.2, + "epoch": 1.7114738358137302, + "frac_reward_zero_std": 0.7, + "grad_norm": 0.20709584653377533, + "kl": 0.017572021484375, + "learning_rate": 5.584715458969103e-07, + "loss": 0.0007023838814347982, + "memory(GiB)": 27.09, + "reward": 0.4612499952316284, + "reward_std": 0.10458109080791474, + "rewards/MMContentORM/mean": 0.575000011920929, + "rewards/MMContentORM/std": 0.5526260115206242, + "rewards/MMFormatORM/mean": 0.6093749761581421, + "rewards/MMFormatORM/std": 0.12130690813064575, + "rewards/MMRubricORM/mean": -0.0625, + "rewards/MMRubricORM/std": 0.18662601709365845, + "step": 3565, + "train_speed(iter/s)": 0.083037 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 343.8, + "completions/mean_length": 209.3125, + "completions/min_length": 125.0, + "epoch": 1.71387421987518, + "frac_reward_zero_std": 0.65, + "grad_norm": 0.07817840576171875, + "kl": 0.021575927734375, + "learning_rate": 5.493911698604648e-07, + "loss": 0.0008630914613604546, + "memory(GiB)": 27.09, + "reward": 0.4372999846935272, + "reward_std": 0.12600643069017678, + "rewards/MMContentORM/mean": 0.5294999957084656, + "rewards/MMContentORM/std": 0.6551137328147888, + "rewards/MMFormatORM/mean": 0.6012499809265137, + "rewards/MMFormatORM/std": 0.15380690693855287, + "rewards/MMRubricORM/mean": -0.075, + "rewards/MMRubricORM/std": 0.23662601709365844, + "step": 3570, + "train_speed(iter/s)": 0.083038 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 299.2, + "completions/mean_length": 208.3125, + "completions/min_length": 128.4, + "epoch": 1.71627460393663, + "frac_reward_zero_std": 0.8, + "grad_norm": 0.1356513947248459, + "kl": 0.015533447265625, + "learning_rate": 5.403809274848048e-07, + "loss": 0.0006216357462108136, + "memory(GiB)": 27.09, + "reward": 0.5266999721527099, + "reward_std": 0.05444721775129437, + "rewards/MMContentORM/mean": 0.6955000042915345, + "rewards/MMContentORM/std": 0.5412708878517151, + "rewards/MMFormatORM/mean": 0.6337499737739563, + "rewards/MMFormatORM/std": 0.04440345466136932, + "rewards/MMRubricORM/mean": -0.025, + "rewards/MMRubricORM/std": 0.06831300854682923, + "step": 3575, + "train_speed(iter/s)": 0.083049 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 283.0, + "completions/mean_length": 205.125, + "completions/min_length": 124.2, + "epoch": 1.7186749879980798, + "frac_reward_zero_std": 0.65, + "grad_norm": 0.19772173464298248, + "kl": 0.0181396484375, + "learning_rate": 5.314409607552845e-07, + "loss": 0.0007258410565555096, + "memory(GiB)": 27.09, + "reward": 0.4547999739646912, + "reward_std": 0.18611050322651862, + "rewards/MMContentORM/mean": 0.6020000219345093, + "rewards/MMContentORM/std": 0.6907771944999694, + "rewards/MMFormatORM/mean": 0.5849999785423279, + "rewards/MMFormatORM/std": 0.1737115800380707, + "rewards/MMRubricORM/mean": -0.1, + "rewards/MMRubricORM/std": 0.2672485947608948, + "step": 3580, + "train_speed(iter/s)": 0.083061 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 305.0, + "completions/mean_length": 212.6, + "completions/min_length": 129.2, + "epoch": 1.7210753720595295, + "frac_reward_zero_std": 0.775, + "grad_norm": 0.19165228307247162, + "kl": 0.016009521484375, + "learning_rate": 5.225714105498414e-07, + "loss": 0.0006398680619895458, + "memory(GiB)": 27.09, + "reward": 0.4763499915599823, + "reward_std": 0.07304412834346294, + "rewards/MMContentORM/mean": 0.5839999854564667, + "rewards/MMContentORM/std": 0.5775633066892624, + "rewards/MMFormatORM/mean": 0.6256249785423279, + "rewards/MMFormatORM/std": 0.07690345346927643, + "rewards/MMRubricORM/mean": -0.0375, + "rewards/MMRubricORM/std": 0.11831300854682922, + "step": 3585, + "train_speed(iter/s)": 0.083072 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 351.8, + "completions/mean_length": 220.3, + "completions/min_length": 126.0, + "epoch": 1.7234757561209793, + "frac_reward_zero_std": 0.55, + "grad_norm": 0.29149454832077026, + "kl": 0.015185546875, + "learning_rate": 5.137724166367763e-07, + "loss": 0.0006067929789423943, + "memory(GiB)": 27.09, + "reward": 0.3497999906539917, + "reward_std": 0.2576697215437889, + "rewards/MMContentORM/mean": 0.39700001031160354, + "rewards/MMContentORM/std": 0.7802090883255005, + "rewards/MMFormatORM/mean": 0.5524999856948852, + "rewards/MMFormatORM/std": 0.224040886759758, + "rewards/MMRubricORM/mean": -0.15, + "rewards/MMRubricORM/std": 0.34467830061912536, + "step": 3590, + "train_speed(iter/s)": 0.083072 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 285.6, + "completions/mean_length": 212.625, + "completions/min_length": 145.8, + "epoch": 1.7258761401824292, + "frac_reward_zero_std": 0.775, + "grad_norm": 0.12896399199962616, + "kl": 0.014410400390625, + "learning_rate": 5.050441176725468e-07, + "loss": 0.0005758726038038731, + "memory(GiB)": 27.09, + "reward": 0.46919997930526736, + "reward_std": 0.05953839020803571, + "rewards/MMContentORM/mean": 0.5805000185966491, + "rewards/MMContentORM/std": 0.6441609025001526, + "rewards/MMFormatORM/mean": 0.6174999833106994, + "rewards/MMFormatORM/std": 0.10940345227718354, + "rewards/MMRubricORM/mean": -0.05, + "rewards/MMRubricORM/std": 0.16831300854682923, + "step": 3595, + "train_speed(iter/s)": 0.083085 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 360.2, + "completions/mean_length": 220.2375, + "completions/min_length": 122.2, + "epoch": 1.7282765242438791, + "frac_reward_zero_std": 0.6, + "grad_norm": 0.4579542875289917, + "kl": 0.030523681640625, + "learning_rate": 4.96386651199583e-07, + "loss": 0.0012195698916912078, + "memory(GiB)": 27.09, + "reward": 0.4777999818325043, + "reward_std": 0.1202081507537514, + "rewards/MMContentORM/mean": 0.6020000159740448, + "rewards/MMContentORM/std": 0.638184130191803, + "rewards/MMFormatORM/mean": 0.6174999833106994, + "rewards/MMFormatORM/std": 0.08490467071533203, + "rewards/MMRubricORM/mean": -0.05, + "rewards/MMRubricORM/std": 0.13062257766723634, + "step": 3600, + "train_speed(iter/s)": 0.08308 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 377.0, + "completions/mean_length": 222.575, + "completions/min_length": 138.0, + "epoch": 1.7306769083053288, + "frac_reward_zero_std": 0.725, + "grad_norm": 0.005367351695895195, + "kl": 0.0135009765625, + "learning_rate": 4.878001536441213e-07, + "loss": 0.0005398348905146122, + "memory(GiB)": 27.09, + "reward": 0.49624998569488527, + "reward_std": 0.043204221641644835, + "rewards/MMContentORM/mean": 0.6049999952316284, + "rewards/MMContentORM/std": 0.5939936757087707, + "rewards/MMFormatORM/mean": 0.6418749809265136, + "rewards/MMFormatORM/std": 0.032499998807907104, + "rewards/MMRubricORM/mean": -0.0125, + "rewards/MMRubricORM/std": 0.05, + "step": 3605, + "train_speed(iter/s)": 0.083045 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 311.0, + "completions/mean_length": 211.625, + "completions/min_length": 125.4, + "epoch": 1.7330772923667785, + "frac_reward_zero_std": 0.725, + "grad_norm": 0.13431750237941742, + "kl": 0.014501953125, + "learning_rate": 4.792847603140587e-07, + "loss": 0.000580282649025321, + "memory(GiB)": 27.09, + "reward": 0.48459998369216917, + "reward_std": 0.09192387647926807, + "rewards/MMContentORM/mean": 0.6190000116825104, + "rewards/MMContentORM/std": 0.6144611597061157, + "rewards/MMFormatORM/mean": 0.6174999833106994, + "rewards/MMFormatORM/std": 0.10940345227718354, + "rewards/MMRubricORM/mean": -0.05, + "rewards/MMRubricORM/std": 0.16831300854682923, + "step": 3610, + "train_speed(iter/s)": 0.083051 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 326.0, + "completions/mean_length": 216.1125, + "completions/min_length": 127.0, + "epoch": 1.7354776764282285, + "frac_reward_zero_std": 0.575, + "grad_norm": 0.14910034835338593, + "kl": 0.03046875, + "learning_rate": 4.7084060539681066e-07, + "loss": 0.0012202151119709015, + "memory(GiB)": 27.09, + "reward": 0.37774998843669894, + "reward_std": 0.15733125656843186, + "rewards/MMContentORM/mean": 0.4525000035762787, + "rewards/MMContentORM/std": 0.7233627915382386, + "rewards/MMFormatORM/mean": 0.5606249868869781, + "rewards/MMFormatORM/std": 0.21994589269161224, + "rewards/MMRubricORM/mean": -0.1375, + "rewards/MMRubricORM/std": 0.33837831020355225, + "step": 3615, + "train_speed(iter/s)": 0.08305 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 310.2, + "completions/mean_length": 217.2375, + "completions/min_length": 139.0, + "epoch": 1.7378780604896784, + "frac_reward_zero_std": 0.55, + "grad_norm": 0.12053893506526947, + "kl": 0.017779541015625, + "learning_rate": 4.624678219572043e-07, + "loss": 0.0007117808330804109, + "memory(GiB)": 27.09, + "reward": 0.4127999782562256, + "reward_std": 0.17083699852228165, + "rewards/MMContentORM/mean": 0.49699999690055846, + "rewards/MMContentORM/std": 0.7366461873054504, + "rewards/MMFormatORM/mean": 0.5849999785423279, + "rewards/MMFormatORM/std": 0.19821036159992217, + "rewards/MMRubricORM/mean": -0.1, + "rewards/MMRubricORM/std": 0.3049390256404877, + "step": 3620, + "train_speed(iter/s)": 0.083055 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 345.2, + "completions/mean_length": 223.6625, + "completions/min_length": 124.0, + "epoch": 1.7402784445511283, + "frac_reward_zero_std": 0.725, + "grad_norm": 0.17625129222869873, + "kl": 0.0965087890625, + "learning_rate": 4.5416654193538245e-07, + "loss": 0.003856099024415016, + "memory(GiB)": 27.09, + "reward": 0.45974999070167544, + "reward_std": 0.15648272663820534, + "rewards/MMContentORM/mean": 0.6000000178813935, + "rewards/MMContentORM/std": 0.6247810423374176, + "rewards/MMFormatORM/mean": 0.5931249856948853, + "rewards/MMFormatORM/std": 0.16180812418460847, + "rewards/MMRubricORM/mean": -0.0875, + "rewards/MMRubricORM/std": 0.24893558621406556, + "step": 3625, + "train_speed(iter/s)": 0.083045 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 341.8, + "completions/mean_length": 218.475, + "completions/min_length": 143.6, + "epoch": 1.742678828612578, + "frac_reward_zero_std": 0.675, + "grad_norm": 0.15371958911418915, + "kl": 0.064898681640625, + "learning_rate": 4.459368961447169e-07, + "loss": 0.002591692842543125, + "memory(GiB)": 27.09, + "reward": 0.47674998044967654, + "reward_std": 0.08266077786684037, + "rewards/MMContentORM/mean": 0.5849999964237214, + "rewards/MMContentORM/std": 0.6398021399974823, + "rewards/MMFormatORM/mean": 0.6256249904632568, + "rewards/MMFormatORM/std": 0.09749999642372131, + "rewards/MMRubricORM/mean": -0.0375, + "rewards/MMRubricORM/std": 0.15, + "step": 3630, + "train_speed(iter/s)": 0.083041 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 406.4, + "completions/mean_length": 218.6625, + "completions/min_length": 125.0, + "epoch": 1.7450792126740278, + "frac_reward_zero_std": 0.525, + "grad_norm": 0.1884879767894745, + "kl": 0.022528076171875, + "learning_rate": 4.3777901426975465e-07, + "loss": 0.0009016599506139755, + "memory(GiB)": 27.09, + "reward": 0.4047499895095825, + "reward_std": 0.13470384031534194, + "rewards/MMContentORM/mean": 0.4875000238418579, + "rewards/MMContentORM/std": 0.7453335165977478, + "rewards/MMFormatORM/mean": 0.576874977350235, + "rewards/MMFormatORM/std": 0.17944467663764954, + "rewards/MMRubricORM/mean": -0.10500000119209289, + "rewards/MMRubricORM/std": 0.28511459827423097, + "step": 3635, + "train_speed(iter/s)": 0.083021 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0125, + "completions/max_length": 442.8, + "completions/mean_length": 224.3125, + "completions/min_length": 135.2, + "epoch": 1.7474795967354777, + "frac_reward_zero_std": 0.725, + "grad_norm": 0.13846170902252197, + "kl": 0.021331787109375, + "learning_rate": 4.2969302486417064e-07, + "loss": 0.0008540621027350425, + "memory(GiB)": 27.09, + "reward": 0.41774998903274535, + "reward_std": 0.09086322523653508, + "rewards/MMContentORM/mean": 0.4375, + "rewards/MMContentORM/std": 0.6664343476295471, + "rewards/MMFormatORM/mean": 0.6256249904632568, + "rewards/MMFormatORM/std": 0.09749999642372131, + "rewards/MMRubricORM/mean": -0.0375, + "rewards/MMRubricORM/std": 0.15, + "step": 3640, + "train_speed(iter/s)": 0.082997 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 329.4, + "completions/mean_length": 217.0375, + "completions/min_length": 117.4, + "epoch": 1.7498799807969276, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.18284378945827484, + "kl": 0.0134521484375, + "learning_rate": 4.2167905534874153e-07, + "loss": 0.0005382131785154343, + "memory(GiB)": 27.09, + "reward": 0.5565499782562255, + "reward_std": 0.07672108276747167, + "rewards/MMContentORM/mean": 0.7845000267028809, + "rewards/MMContentORM/std": 0.42741707861423495, + "rewards/MMFormatORM/mean": 0.6256249785423279, + "rewards/MMFormatORM/std": 0.07690345346927643, + "rewards/MMRubricORM/mean": -0.0375, + "rewards/MMRubricORM/std": 0.11831300854682922, + "step": 3645, + "train_speed(iter/s)": 0.082998 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 336.0, + "completions/mean_length": 223.3625, + "completions/min_length": 120.6, + "epoch": 1.7522803648583773, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.08318620175123215, + "kl": 0.019390869140625, + "learning_rate": 4.1373723200934136e-07, + "loss": 0.0007759532425552606, + "memory(GiB)": 27.09, + "reward": 0.5463499784469604, + "reward_std": 0.0758725541876629, + "rewards/MMContentORM/mean": 0.7590000092983246, + "rewards/MMContentORM/std": 0.4582708589732647, + "rewards/MMFormatORM/mean": 0.6256249785423279, + "rewards/MMFormatORM/std": 0.07690345346927643, + "rewards/MMRubricORM/mean": -0.0375, + "rewards/MMRubricORM/std": 0.11831300854682922, + "step": 3650, + "train_speed(iter/s)": 0.083003 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 315.8, + "completions/mean_length": 210.5, + "completions/min_length": 139.2, + "epoch": 1.754680748919827, + "frac_reward_zero_std": 0.775, + "grad_norm": 0.1796911358833313, + "kl": 0.01278076171875, + "learning_rate": 4.0586767999494514e-07, + "loss": 0.0005109596066176891, + "memory(GiB)": 27.09, + "reward": 0.5610999763011932, + "reward_std": 0.055012908577919004, + "rewards/MMContentORM/mean": 0.7815000295639039, + "rewards/MMContentORM/std": 0.3631765726953745, + "rewards/MMFormatORM/mean": 0.6337499737739563, + "rewards/MMFormatORM/std": 0.04440345466136932, + "rewards/MMRubricORM/mean": -0.025, + "rewards/MMRubricORM/std": 0.06831300854682923, + "step": 3655, + "train_speed(iter/s)": 0.083007 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 348.2, + "completions/mean_length": 216.5625, + "completions/min_length": 132.6, + "epoch": 1.757081132981277, + "frac_reward_zero_std": 0.6, + "grad_norm": 0.08192643523216248, + "kl": 0.014990234375, + "learning_rate": 3.980705233156662e-07, + "loss": 0.0005991185083985328, + "memory(GiB)": 27.09, + "reward": 0.5434999465942383, + "reward_std": 0.05840701770503074, + "rewards/MMContentORM/mean": 0.7375, + "rewards/MMContentORM/std": 0.5282024204730987, + "rewards/MMFormatORM/mean": 0.6337499856948853, + "rewards/MMFormatORM/std": 0.06499999761581421, + "rewards/MMRubricORM/mean": -0.025, + "rewards/MMRubricORM/std": 0.1, + "step": 3660, + "train_speed(iter/s)": 0.083004 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 295.8, + "completions/mean_length": 211.125, + "completions/min_length": 156.6, + "epoch": 1.759481517042727, + "frac_reward_zero_std": 0.7, + "grad_norm": 0.11645707488059998, + "kl": 0.0146240234375, + "learning_rate": 3.903458848407915e-07, + "loss": 0.0005853664129972458, + "memory(GiB)": 27.09, + "reward": 0.48039997220039365, + "reward_std": 0.0814586978405714, + "rewards/MMContentORM/mean": 0.6085000038146973, + "rewards/MMContentORM/std": 0.6318390727043152, + "rewards/MMFormatORM/mean": 0.6174999952316285, + "rewards/MMFormatORM/std": 0.12999999523162842, + "rewards/MMRubricORM/mean": -0.05, + "rewards/MMRubricORM/std": 0.2, + "step": 3665, + "train_speed(iter/s)": 0.083016 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0125, + "completions/max_length": 457.4, + "completions/mean_length": 228.6125, + "completions/min_length": 123.4, + "epoch": 1.7618819011041766, + "frac_reward_zero_std": 0.575, + "grad_norm": 0.20300821959972382, + "kl": 0.01519775390625, + "learning_rate": 3.8269388629685266e-07, + "loss": 0.0006076143123209477, + "memory(GiB)": 27.09, + "reward": 0.436849981546402, + "reward_std": 0.10740951672196389, + "rewards/MMContentORM/mean": 0.489000004529953, + "rewards/MMContentORM/std": 0.6648125410079956, + "rewards/MMFormatORM/mean": 0.6218749761581421, + "rewards/MMFormatORM/std": 0.09190345257520675, + "rewards/MMRubricORM/mean": -0.0375, + "rewards/MMRubricORM/std": 0.11831300854682922, + "step": 3670, + "train_speed(iter/s)": 0.082985 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0125, + "completions/max_length": 451.8, + "completions/mean_length": 222.0625, + "completions/min_length": 150.2, + "epoch": 1.7642822851656264, + "frac_reward_zero_std": 0.675, + "grad_norm": 0.07441962510347366, + "kl": 0.0164794921875, + "learning_rate": 3.7511464826570476e-07, + "loss": 0.0006591953337192535, + "memory(GiB)": 27.09, + "reward": 0.43319997787475584, + "reward_std": 0.14212846159934997, + "rewards/MMContentORM/mean": 0.5480000078678131, + "rewards/MMContentORM/std": 0.7250023484230042, + "rewards/MMFormatORM/mean": 0.5849999904632568, + "rewards/MMFormatORM/std": 0.19430812299251557, + "rewards/MMRubricORM/mean": -0.1, + "rewards/MMRubricORM/std": 0.2989355862140656, + "step": 3675, + "train_speed(iter/s)": 0.082957 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0125, + "completions/max_length": 519.4, + "completions/mean_length": 235.7625, + "completions/min_length": 150.4, + "epoch": 1.7666826692270763, + "frac_reward_zero_std": 0.8, + "grad_norm": 0.07605559378862381, + "kl": 0.019140625, + "learning_rate": 3.676082901826267e-07, + "loss": 0.0007654134184122086, + "memory(GiB)": 27.09, + "reward": 0.4716499924659729, + "reward_std": 0.06908433209173381, + "rewards/MMContentORM/mean": 0.6010000109672546, + "rewards/MMContentORM/std": 0.6110691726207733, + "rewards/MMFormatORM/mean": 0.6093749880790711, + "rewards/MMFormatORM/std": 0.11740466952323914, + "rewards/MMRubricORM/mean": -0.0625, + "rewards/MMRubricORM/std": 0.18062257766723633, + "step": 3680, + "train_speed(iter/s)": 0.082914 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 310.0, + "completions/mean_length": 215.6375, + "completions/min_length": 129.6, + "epoch": 1.7690830532885262, + "frac_reward_zero_std": 0.775, + "grad_norm": 0.12616391479969025, + "kl": 0.0119415283203125, + "learning_rate": 3.601749303344415e-07, + "loss": 0.000477463286370039, + "memory(GiB)": 27.09, + "reward": 0.530249971151352, + "reward_std": 0.04150716739241034, + "rewards/MMContentORM/mean": 0.6900000095367431, + "rewards/MMContentORM/std": 0.5334938883781433, + "rewards/MMFormatORM/mean": 0.6418749809265136, + "rewards/MMFormatORM/std": 0.032499998807907104, + "rewards/MMRubricORM/mean": -0.0125, + "rewards/MMRubricORM/std": 0.05, + "step": 3685, + "train_speed(iter/s)": 0.082921 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 288.4, + "completions/mean_length": 211.1875, + "completions/min_length": 137.8, + "epoch": 1.7714834373499762, + "frac_reward_zero_std": 0.725, + "grad_norm": 0.14239180088043213, + "kl": 1.172998046875, + "learning_rate": 3.528146858576464e-07, + "loss": 0.0469234973192215, + "memory(GiB)": 27.09, + "reward": 0.5906499743461608, + "reward_std": 0.024536601221188902, + "rewards/MMContentORM/mean": 0.8409999966621399, + "rewards/MMContentORM/std": 0.2608364664018154, + "rewards/MMFormatORM/mean": 0.6418749809265136, + "rewards/MMFormatORM/std": 0.032499998807907104, + "rewards/MMRubricORM/mean": -0.0125, + "rewards/MMRubricORM/std": 0.05, + "step": 3690, + "train_speed(iter/s)": 0.082933 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0125, + "completions/max_length": 444.0, + "completions/mean_length": 225.4375, + "completions/min_length": 155.2, + "epoch": 1.7738838214114259, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.1946728527545929, + "kl": 0.014678955078125, + "learning_rate": 3.4552767273657416e-07, + "loss": 0.0005875344388186932, + "memory(GiB)": 27.09, + "reward": 0.4611999809741974, + "reward_std": 0.07240773178637028, + "rewards/MMContentORM/mean": 0.5605000138282776, + "rewards/MMContentORM/std": 0.6589470744132996, + "rewards/MMFormatORM/mean": 0.6174999833106994, + "rewards/MMFormatORM/std": 0.10940345227718354, + "rewards/MMRubricORM/mean": -0.05, + "rewards/MMRubricORM/std": 0.16831300854682923, + "step": 3695, + "train_speed(iter/s)": 0.082904 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 288.2, + "completions/mean_length": 204.9, + "completions/min_length": 115.4, + "epoch": 1.7762842054728756, + "frac_reward_zero_std": 0.675, + "grad_norm": 0.0957147628068924, + "kl": 0.017193603515625, + "learning_rate": 3.383140058015605e-07, + "loss": 0.0006867312826216221, + "memory(GiB)": 27.09, + "reward": 0.4408499926328659, + "reward_std": 0.15040160596836358, + "rewards/MMContentORM/mean": 0.5564999967813492, + "rewards/MMContentORM/std": 0.6231798827648163, + "rewards/MMFormatORM/mean": 0.5893749713897705, + "rewards/MMFormatORM/std": 0.1667675107717514, + "rewards/MMRubricORM/mean": -0.0875, + "rewards/MMRubricORM/std": 0.25493902564048765, + "step": 3700, + "train_speed(iter/s)": 0.082917 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 310.6, + "completions/mean_length": 209.525, + "completions/min_length": 106.4, + "epoch": 1.7786845895343255, + "frac_reward_zero_std": 0.65, + "grad_norm": 0.16963821649551392, + "kl": 0.039617919921875, + "learning_rate": 3.3117379872713573e-07, + "loss": 0.0015896432101726531, + "memory(GiB)": 27.09, + "reward": 0.49789999723434447, + "reward_std": 0.1166726142168045, + "rewards/MMContentORM/mean": 0.6810000121593476, + "rewards/MMContentORM/std": 0.6037951707839966, + "rewards/MMFormatORM/mean": 0.6012499928474426, + "rewards/MMFormatORM/std": 0.17440344989299775, + "rewards/MMRubricORM/mean": -0.075, + "rewards/MMRubricORM/std": 0.2683130085468292, + "step": 3705, + "train_speed(iter/s)": 0.082899 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 316.0, + "completions/mean_length": 218.6375, + "completions/min_length": 135.0, + "epoch": 1.7810849735957754, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.09570092707872391, + "kl": 0.014276123046875, + "learning_rate": 3.2410716403023404e-07, + "loss": 0.0005716872867196799, + "memory(GiB)": 27.09, + "reward": 0.42004998922348025, + "reward_std": 0.12551144529134034, + "rewards/MMContentORM/mean": 0.471999990940094, + "rewards/MMContentORM/std": 0.6835508227348328, + "rewards/MMFormatORM/mean": 0.6093749880790711, + "rewards/MMFormatORM/std": 0.14190345108509064, + "rewards/MMRubricORM/mean": -0.0625, + "rewards/MMRubricORM/std": 0.21831300854682922, + "step": 3710, + "train_speed(iter/s)": 0.082911 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 323.2, + "completions/mean_length": 211.375, + "completions/min_length": 131.2, + "epoch": 1.7834853576572252, + "frac_reward_zero_std": 0.725, + "grad_norm": 0.07856486737728119, + "kl": 0.01624755859375, + "learning_rate": 3.1711421306841903e-07, + "loss": 0.0006491564214229584, + "memory(GiB)": 27.09, + "reward": 0.5439499855041504, + "reward_std": 0.08831763297785074, + "rewards/MMContentORM/mean": 0.753000020980835, + "rewards/MMContentORM/std": 0.47651802077889444, + "rewards/MMFormatORM/mean": 0.6256249785423279, + "rewards/MMFormatORM/std": 0.07690345346927643, + "rewards/MMRubricORM/mean": -0.0375, + "rewards/MMRubricORM/std": 0.11831300854682922, + "step": 3715, + "train_speed(iter/s)": 0.082911 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 306.4, + "completions/mean_length": 217.9125, + "completions/min_length": 134.4, + "epoch": 1.7858857417186749, + "frac_reward_zero_std": 0.725, + "grad_norm": 0.11409315466880798, + "kl": 0.01444091796875, + "learning_rate": 3.101950560381339e-07, + "loss": 0.0005774036049842835, + "memory(GiB)": 27.09, + "reward": 0.5369499802589417, + "reward_std": 0.08577205196488649, + "rewards/MMContentORM/mean": 0.7355000257492066, + "rewards/MMContentORM/std": 0.44938567504286764, + "rewards/MMFormatORM/mean": 0.6256249904632568, + "rewards/MMFormatORM/std": 0.09749999642372131, + "rewards/MMRubricORM/mean": -0.0375, + "rewards/MMRubricORM/std": 0.15, + "step": 3720, + "train_speed(iter/s)": 0.082918 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 339.0, + "completions/mean_length": 220.0125, + "completions/min_length": 132.8, + "epoch": 1.7882861257801248, + "frac_reward_zero_std": 0.65, + "grad_norm": 0.13937775790691376, + "kl": 0.015142822265625, + "learning_rate": 3.033498019729553e-07, + "loss": 0.0006057361606508493, + "memory(GiB)": 27.09, + "reward": 0.4632499754428864, + "reward_std": 0.12211733981966973, + "rewards/MMContentORM/mean": 0.5800000250339508, + "rewards/MMContentORM/std": 0.6220081090927124, + "rewards/MMFormatORM/mean": 0.6093749880790711, + "rewards/MMFormatORM/std": 0.14190345108509064, + "rewards/MMRubricORM/mean": -0.0625, + "rewards/MMRubricORM/std": 0.21831300854682922, + "step": 3725, + "train_speed(iter/s)": 0.082918 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 300.8, + "completions/mean_length": 220.2125, + "completions/min_length": 151.6, + "epoch": 1.7906865098415747, + "frac_reward_zero_std": 0.725, + "grad_norm": 0.12366022914648056, + "kl": 0.0125, + "learning_rate": 2.965785587418857e-07, + "loss": 0.0005001377779990434, + "memory(GiB)": 27.09, + "reward": 0.5290499746799469, + "reward_std": 0.07488261461257935, + "rewards/MMContentORM/mean": 0.6870000004768372, + "rewards/MMContentORM/std": 0.5478822708129882, + "rewards/MMFormatORM/mean": 0.6418749809265136, + "rewards/MMFormatORM/std": 0.032499998807907104, + "rewards/MMRubricORM/mean": -0.0125, + "rewards/MMRubricORM/std": 0.05, + "step": 3730, + "train_speed(iter/s)": 0.082923 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 319.6, + "completions/mean_length": 220.45, + "completions/min_length": 144.8, + "epoch": 1.7930868939030244, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.23606330156326294, + "kl": 0.01500244140625, + "learning_rate": 2.898814330476457e-07, + "loss": 0.0006001268513500691, + "memory(GiB)": 27.09, + "reward": 0.42784997820854187, + "reward_std": 0.18462557792663575, + "rewards/MMContentORM/mean": 0.5490000188350678, + "rewards/MMContentORM/std": 0.7097955226898194, + "rewards/MMFormatORM/mean": 0.5768749833106994, + "rewards/MMFormatORM/std": 0.2062115788459778, + "rewards/MMRubricORM/mean": -0.1125, + "rewards/MMRubricORM/std": 0.3172485947608948, + "step": 3735, + "train_speed(iter/s)": 0.082928 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 298.4, + "completions/mean_length": 210.825, + "completions/min_length": 127.2, + "epoch": 1.7954872779644742, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.1833251416683197, + "kl": 0.017486572265625, + "learning_rate": 2.8325853042499796e-07, + "loss": 0.000699461530894041, + "memory(GiB)": 27.09, + "reward": 0.42219996452331543, + "reward_std": 0.21411193013191224, + "rewards/MMContentORM/mean": 0.5205000072717667, + "rewards/MMContentORM/std": 0.6509887754917145, + "rewards/MMFormatORM/mean": 0.5849999845027923, + "rewards/MMFormatORM/std": 0.1430424392223358, + "rewards/MMRubricORM/mean": -0.1, + "rewards/MMRubricORM/std": 0.22006530165672303, + "step": 3740, + "train_speed(iter/s)": 0.082937 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 427.8, + "completions/mean_length": 221.6375, + "completions/min_length": 134.6, + "epoch": 1.797887662025924, + "frac_reward_zero_std": 0.65, + "grad_norm": 0.09696277230978012, + "kl": 0.02608642578125, + "learning_rate": 2.7670995523908007e-07, + "loss": 0.001044764183461666, + "memory(GiB)": 27.09, + "reward": 0.45574997663497924, + "reward_std": 0.14347196728922426, + "rewards/MMContentORM/mean": 0.5900000095367431, + "rewards/MMContentORM/std": 0.580910587310791, + "rewards/MMFormatORM/mean": 0.5931249856948853, + "rewards/MMFormatORM/std": 0.16180812418460847, + "rewards/MMRubricORM/mean": -0.0875, + "rewards/MMRubricORM/std": 0.24893558621406556, + "step": 3745, + "train_speed(iter/s)": 0.082918 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0125, + "completions/max_length": 490.8, + "completions/mean_length": 221.175, + "completions/min_length": 134.0, + "epoch": 1.800288046087374, + "frac_reward_zero_std": 0.675, + "grad_norm": 0.1732310652732849, + "kl": 0.0142822265625, + "learning_rate": 2.702358106837616e-07, + "loss": 0.0005715820007026196, + "memory(GiB)": 27.09, + "reward": 0.48334997296333315, + "reward_std": 0.07615540148690343, + "rewards/MMContentORM/mean": 0.6015000104904175, + "rewards/MMContentORM/std": 0.6337794065475464, + "rewards/MMFormatORM/mean": 0.6256249904632568, + "rewards/MMFormatORM/std": 0.09749999642372131, + "rewards/MMRubricORM/mean": -0.0375, + "rewards/MMRubricORM/std": 0.15, + "step": 3750, + "train_speed(iter/s)": 0.082881 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 336.2, + "completions/mean_length": 212.7, + "completions/min_length": 121.8, + "epoch": 1.802688430148824, + "frac_reward_zero_std": 0.575, + "grad_norm": 0.15966768562793732, + "kl": 0.017254638671875, + "learning_rate": 2.63836198780022e-07, + "loss": 0.0006905402522534132, + "memory(GiB)": 27.09, + "reward": 0.4441499710083008, + "reward_std": 0.11745043210685253, + "rewards/MMContentORM/mean": 0.5359999895095825, + "rewards/MMContentORM/std": 0.6547886967658997, + "rewards/MMFormatORM/mean": 0.6056249737739563, + "rewards/MMFormatORM/std": 0.13630690723657607, + "rewards/MMRubricORM/mean": -0.0625, + "rewards/MMRubricORM/std": 0.18662601709365845, + "step": 3755, + "train_speed(iter/s)": 0.082883 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 313.4, + "completions/mean_length": 210.7375, + "completions/min_length": 136.6, + "epoch": 1.8050888142102737, + "frac_reward_zero_std": 0.525, + "grad_norm": 0.17423182725906372, + "kl": 0.02149658203125, + "learning_rate": 2.575112203743313e-07, + "loss": 0.0008604388684034347, + "memory(GiB)": 27.09, + "reward": 0.44089998602867125, + "reward_std": 0.18257496803998946, + "rewards/MMContentORM/mean": 0.5385000109672546, + "rewards/MMContentORM/std": 0.6908077597618103, + "rewards/MMFormatORM/mean": 0.6012499928474426, + "rewards/MMFormatORM/std": 0.17440344989299775, + "rewards/MMRubricORM/mean": -0.075, + "rewards/MMRubricORM/std": 0.2683130085468292, + "step": 3760, + "train_speed(iter/s)": 0.082888 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 328.8, + "completions/mean_length": 217.0125, + "completions/min_length": 142.4, + "epoch": 1.8074891982717234, + "frac_reward_zero_std": 0.65, + "grad_norm": 0.17498211562633514, + "kl": 0.0125, + "learning_rate": 2.51260975137077e-07, + "loss": 0.000500024575740099, + "memory(GiB)": 27.09, + "reward": 0.5402999818325043, + "reward_std": 0.08216580227017403, + "rewards/MMContentORM/mean": 0.72950000166893, + "rewards/MMContentORM/std": 0.4286257430911064, + "rewards/MMFormatORM/mean": 0.6337499737739563, + "rewards/MMFormatORM/std": 0.04440345466136932, + "rewards/MMRubricORM/mean": -0.025, + "rewards/MMRubricORM/std": 0.06831300854682923, + "step": 3765, + "train_speed(iter/s)": 0.08289 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 320.4, + "completions/mean_length": 211.875, + "completions/min_length": 135.8, + "epoch": 1.8098895823331733, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.1608082801103592, + "kl": 0.014971923828125, + "learning_rate": 2.4508556156097983e-07, + "loss": 0.0005985355004668236, + "memory(GiB)": 27.09, + "reward": 0.49359997510910036, + "reward_std": 0.05670996003318578, + "rewards/MMContentORM/mean": 0.6414999842643738, + "rewards/MMContentORM/std": 0.6027493834495544, + "rewards/MMFormatORM/mean": 0.6174999833106994, + "rewards/MMFormatORM/std": 0.10940345227718354, + "rewards/MMRubricORM/mean": -0.05, + "rewards/MMRubricORM/std": 0.16831300854682923, + "step": 3770, + "train_speed(iter/s)": 0.082893 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 286.4, + "completions/mean_length": 214.9375, + "completions/min_length": 139.6, + "epoch": 1.8122899663946233, + "frac_reward_zero_std": 0.725, + "grad_norm": 0.08479262888431549, + "kl": 0.022100830078125, + "learning_rate": 2.3898507695954807e-07, + "loss": 0.0008829880505800247, + "memory(GiB)": 27.09, + "reward": 0.5497499763965606, + "reward_std": 0.06936717466451228, + "rewards/MMContentORM/mean": 0.7675000071525574, + "rewards/MMContentORM/std": 0.3865751329809427, + "rewards/MMFormatORM/mean": 0.6256249785423279, + "rewards/MMFormatORM/std": 0.07690345346927643, + "rewards/MMRubricORM/mean": -0.0375, + "rewards/MMRubricORM/std": 0.11831300854682922, + "step": 3775, + "train_speed(iter/s)": 0.082901 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0125, + "completions/max_length": 458.8, + "completions/mean_length": 231.45, + "completions/min_length": 149.0, + "epoch": 1.814690350456073, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.20971421897411346, + "kl": 0.014501953125, + "learning_rate": 2.3295961746554464e-07, + "loss": 0.0005800392478704453, + "memory(GiB)": 27.09, + "reward": 0.45644997954368594, + "reward_std": 0.12833987697958946, + "rewards/MMContentORM/mean": 0.5630000114440918, + "rewards/MMContentORM/std": 0.6289644300937652, + "rewards/MMFormatORM/mean": 0.6093749761581421, + "rewards/MMFormatORM/std": 0.12130690813064575, + "rewards/MMRubricORM/mean": -0.0625, + "rewards/MMRubricORM/std": 0.18662601709365845, + "step": 3780, + "train_speed(iter/s)": 0.082871 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 284.0, + "completions/mean_length": 209.8125, + "completions/min_length": 124.8, + "epoch": 1.8170907345175227, + "frac_reward_zero_std": 0.725, + "grad_norm": 0.1644957810640335, + "kl": 0.022076416015625, + "learning_rate": 2.2700927802946748e-07, + "loss": 0.0008836163207888604, + "memory(GiB)": 27.09, + "reward": 0.5015999794006347, + "reward_std": 0.09899494738783687, + "rewards/MMContentORM/mean": 0.6615000128746032, + "rewards/MMContentORM/std": 0.5929094016551971, + "rewards/MMFormatORM/mean": 0.6174999833106994, + "rewards/MMFormatORM/std": 0.10940345227718354, + "rewards/MMRubricORM/mean": -0.05, + "rewards/MMRubricORM/std": 0.16831300854682923, + "step": 3785, + "train_speed(iter/s)": 0.082885 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 370.2, + "completions/mean_length": 221.825, + "completions/min_length": 137.6, + "epoch": 1.8194911185789726, + "frac_reward_zero_std": 0.65, + "grad_norm": 0.14464011788368225, + "kl": 0.015411376953125, + "learning_rate": 2.211341524180599e-07, + "loss": 0.0006168725434690714, + "memory(GiB)": 27.09, + "reward": 0.3968499720096588, + "reward_std": 0.16454374492168428, + "rewards/MMContentORM/mean": 0.471500039100647, + "rewards/MMContentORM/std": 0.7294471979141235, + "rewards/MMFormatORM/mean": 0.576874989271164, + "rewards/MMFormatORM/std": 0.20004121959209442, + "rewards/MMRubricORM/mean": -0.1125, + "rewards/MMRubricORM/std": 0.30775573253631594, + "step": 3790, + "train_speed(iter/s)": 0.08288 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.025, + "completions/max_length": 570.2, + "completions/mean_length": 231.825, + "completions/min_length": 119.8, + "epoch": 1.8218915026404225, + "frac_reward_zero_std": 0.725, + "grad_norm": 0.1182423084974289, + "kl": 0.018719482421875, + "learning_rate": 2.1533433321282548e-07, + "loss": 0.0007486558984965086, + "memory(GiB)": 27.09, + "reward": 0.41229996681213377, + "reward_std": 0.1336431846022606, + "rewards/MMContentORM/mean": 0.46700000762939453, + "rewards/MMContentORM/std": 0.6891016006469727, + "rewards/MMFormatORM/mean": 0.6012499809265137, + "rewards/MMFormatORM/std": 0.15380690693855287, + "rewards/MMRubricORM/mean": -0.075, + "rewards/MMRubricORM/std": 0.23662601709365844, + "step": 3795, + "train_speed(iter/s)": 0.082825 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 340.6, + "completions/mean_length": 214.1, + "completions/min_length": 126.0, + "epoch": 1.8242918867018723, + "frac_reward_zero_std": 0.6, + "grad_norm": 0.13948172330856323, + "kl": 0.019952392578125, + "learning_rate": 2.096099118085776e-07, + "loss": 0.0007983671501278877, + "memory(GiB)": 27.09, + "reward": 0.5301999688148499, + "reward_std": 0.10040915980935097, + "rewards/MMContentORM/mean": 0.7330000162124634, + "rewards/MMContentORM/std": 0.554861056804657, + "rewards/MMFormatORM/mean": 0.6174999833106994, + "rewards/MMFormatORM/std": 0.10940345227718354, + "rewards/MMRubricORM/mean": -0.05, + "rewards/MMRubricORM/std": 0.16831300854682923, + "step": 3800, + "train_speed(iter/s)": 0.082827 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.6, + "completions/mean_length": 209.8625, + "completions/min_length": 152.0, + "epoch": 1.826692270763322, + "frac_reward_zero_std": 0.525, + "grad_norm": 0.18707101047039032, + "kl": 0.013671875, + "learning_rate": 2.039609784119906e-07, + "loss": 0.0005472441203892231, + "memory(GiB)": 27.09, + "reward": 0.4373499691486359, + "reward_std": 0.0874691043049097, + "rewards/MMContentORM/mean": 0.4865000069141388, + "rewards/MMContentORM/std": 0.6470930695533752, + "rewards/MMFormatORM/mean": 0.6256249785423279, + "rewards/MMFormatORM/std": 0.07690345346927643, + "rewards/MMRubricORM/mean": -0.0375, + "rewards/MMRubricORM/std": 0.11831300854682922, + "step": 3805, + "train_speed(iter/s)": 0.082815 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 287.4, + "completions/mean_length": 209.6125, + "completions/min_length": 127.2, + "epoch": 1.829092654824772, + "frac_reward_zero_std": 0.85, + "grad_norm": 0.07913219183683395, + "kl": 0.01658935546875, + "learning_rate": 1.983876220401848e-07, + "loss": 0.0006637333892285824, + "memory(GiB)": 27.09, + "reward": 0.539849978685379, + "reward_std": 0.032456200616434214, + "rewards/MMContentORM/mean": 0.7139999866485596, + "rewards/MMContentORM/std": 0.5111204564571381, + "rewards/MMFormatORM/mean": 0.6418749809265136, + "rewards/MMFormatORM/std": 0.032499998807907104, + "rewards/MMRubricORM/mean": -0.0125, + "rewards/MMRubricORM/std": 0.05, + "step": 3810, + "train_speed(iter/s)": 0.082827 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 343.0, + "completions/mean_length": 218.7125, + "completions/min_length": 125.2, + "epoch": 1.8314930388862218, + "frac_reward_zero_std": 0.725, + "grad_norm": 0.10300786793231964, + "kl": 0.033880615234375, + "learning_rate": 1.9288993051932047e-07, + "loss": 0.0013558823615312577, + "memory(GiB)": 27.09, + "reward": 0.518399977684021, + "reward_std": 0.11200571432709694, + "rewards/MMContentORM/mean": 0.7035000085830688, + "rewards/MMContentORM/std": 0.5992733359336853, + "rewards/MMFormatORM/mean": 0.6174999833106994, + "rewards/MMFormatORM/std": 0.10940345227718354, + "rewards/MMRubricORM/mean": -0.05, + "rewards/MMRubricORM/std": 0.16831300854682923, + "step": 3815, + "train_speed(iter/s)": 0.082827 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 297.6, + "completions/mean_length": 216.15, + "completions/min_length": 150.2, + "epoch": 1.8338934229476718, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.20239703357219696, + "kl": 0.0139892578125, + "learning_rate": 1.8746799048321386e-07, + "loss": 0.0005595901049673558, + "memory(GiB)": 27.09, + "reward": 0.5116499781608581, + "reward_std": 0.08534778701141477, + "rewards/MMContentORM/mean": 0.6435000181198121, + "rewards/MMContentORM/std": 0.5361906588077545, + "rewards/MMFormatORM/mean": 0.6418749809265136, + "rewards/MMFormatORM/std": 0.032499998807907104, + "rewards/MMRubricORM/mean": -0.0125, + "rewards/MMRubricORM/std": 0.05, + "step": 3820, + "train_speed(iter/s)": 0.082836 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 297.0, + "completions/mean_length": 215.5125, + "completions/min_length": 115.8, + "epoch": 1.8362938070091215, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.1797892451286316, + "kl": 0.015447998046875, + "learning_rate": 1.8212188737197657e-07, + "loss": 0.0006183533929288388, + "memory(GiB)": 27.09, + "reward": 0.4594499826431274, + "reward_std": 0.16086678504943847, + "rewards/MMContentORM/mean": 0.5705000162124634, + "rewards/MMContentORM/std": 0.6871401906013489, + "rewards/MMFormatORM/mean": 0.6093749880790711, + "rewards/MMFormatORM/std": 0.14190345108509064, + "rewards/MMRubricORM/mean": -0.0625, + "rewards/MMRubricORM/std": 0.21831300854682922, + "step": 3825, + "train_speed(iter/s)": 0.082844 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0125, + "completions/max_length": 494.8, + "completions/mean_length": 227.725, + "completions/min_length": 144.0, + "epoch": 1.8386941910705712, + "frac_reward_zero_std": 0.775, + "grad_norm": 0.10412738472223282, + "kl": 0.016107177734375, + "learning_rate": 1.7685170543065955e-07, + "loss": 0.0006441749632358551, + "memory(GiB)": 27.09, + "reward": 0.4660999894142151, + "reward_std": 0.0735391038004309, + "rewards/MMContentORM/mean": 0.601500004529953, + "rewards/MMContentORM/std": 0.6630040287971497, + "rewards/MMFormatORM/mean": 0.6012499809265137, + "rewards/MMFormatORM/std": 0.15380690693855287, + "rewards/MMRubricORM/mean": -0.075, + "rewards/MMRubricORM/std": 0.23662601709365844, + "step": 3830, + "train_speed(iter/s)": 0.082814 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 332.4, + "completions/mean_length": 218.1375, + "completions/min_length": 154.8, + "epoch": 1.8410945751320211, + "frac_reward_zero_std": 0.55, + "grad_norm": 0.1359497755765915, + "kl": 0.0163330078125, + "learning_rate": 1.7165752770793742e-07, + "loss": 0.0006539277732372284, + "memory(GiB)": 27.09, + "reward": 0.4121499717235565, + "reward_std": 0.17345329225063325, + "rewards/MMContentORM/mean": 0.4809999972581863, + "rewards/MMContentORM/std": 0.6978591680526733, + "rewards/MMFormatORM/mean": 0.5931249856948853, + "rewards/MMFormatORM/std": 0.16180812418460847, + "rewards/MMRubricORM/mean": -0.0875, + "rewards/MMRubricORM/std": 0.24893558621406556, + "step": 3835, + "train_speed(iter/s)": 0.082817 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 311.2, + "completions/mean_length": 205.2, + "completions/min_length": 118.4, + "epoch": 1.843494959193471, + "frac_reward_zero_std": 0.575, + "grad_norm": 0.11098845303058624, + "kl": 0.019683837890625, + "learning_rate": 1.665394360547895e-07, + "loss": 0.0007876243442296981, + "memory(GiB)": 27.09, + "reward": 0.4245999872684479, + "reward_std": 0.1688570961356163, + "rewards/MMContentORM/mean": 0.526499992609024, + "rewards/MMContentORM/std": 0.7182626962661743, + "rewards/MMFormatORM/mean": 0.5849999785423279, + "rewards/MMFormatORM/std": 0.19821036159992217, + "rewards/MMRubricORM/mean": -0.1, + "rewards/MMRubricORM/std": 0.3049390256404877, + "step": 3840, + "train_speed(iter/s)": 0.082826 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 342.2, + "completions/mean_length": 214.1875, + "completions/min_length": 125.2, + "epoch": 1.8458953432549208, + "frac_reward_zero_std": 0.675, + "grad_norm": 0.2774093449115753, + "kl": 0.01912841796875, + "learning_rate": 1.6149751112321643e-07, + "loss": 0.0007657586131244898, + "memory(GiB)": 27.09, + "reward": 0.43994998931884766, + "reward_std": 0.11943033430725336, + "rewards/MMContentORM/mean": 0.5505000114440918, + "rewards/MMContentORM/std": 0.5636135444045067, + "rewards/MMFormatORM/mean": 0.5931249737739563, + "rewards/MMFormatORM/std": 0.14121158123016359, + "rewards/MMRubricORM/mean": -0.0875, + "rewards/MMRubricORM/std": 0.21724859476089478, + "step": 3845, + "train_speed(iter/s)": 0.082827 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0125, + "completions/max_length": 480.0, + "completions/mean_length": 232.475, + "completions/min_length": 149.8, + "epoch": 1.8482957273163705, + "frac_reward_zero_std": 0.675, + "grad_norm": 0.09440695494413376, + "kl": 0.0136474609375, + "learning_rate": 1.565318323649667e-07, + "loss": 0.0005458991043269634, + "memory(GiB)": 27.09, + "reward": 0.4410999774932861, + "reward_std": 0.15004805505741386, + "rewards/MMContentORM/mean": 0.539000004529953, + "rewards/MMContentORM/std": 0.6789550423622132, + "rewards/MMFormatORM/mean": 0.6012499809265137, + "rewards/MMFormatORM/std": 0.15380690693855287, + "rewards/MMRubricORM/mean": -0.075, + "rewards/MMRubricORM/std": 0.23662601709365844, + "step": 3850, + "train_speed(iter/s)": 0.082796 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 389.2, + "completions/mean_length": 222.3, + "completions/min_length": 140.8, + "epoch": 1.8506961113778204, + "frac_reward_zero_std": 0.7, + "grad_norm": 0.08402004837989807, + "kl": 0.01473388671875, + "learning_rate": 1.5164247803028443e-07, + "loss": 0.0005890860687941313, + "memory(GiB)": 27.09, + "reward": 0.4956999808549881, + "reward_std": 0.06378102500457317, + "rewards/MMContentORM/mean": 0.6180000007152557, + "rewards/MMContentORM/std": 0.4799440011382103, + "rewards/MMFormatORM/mean": 0.6337499737739563, + "rewards/MMFormatORM/std": 0.04440345466136932, + "rewards/MMRubricORM/mean": -0.025, + "rewards/MMRubricORM/std": 0.06831300854682923, + "step": 3855, + "train_speed(iter/s)": 0.082787 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 308.2, + "completions/mean_length": 215.3875, + "completions/min_length": 143.4, + "epoch": 1.8530964954392704, + "frac_reward_zero_std": 0.775, + "grad_norm": 0.10355502367019653, + "kl": 0.012347412109375, + "learning_rate": 1.4682952516667848e-07, + "loss": 0.0004940344952046871, + "memory(GiB)": 27.09, + "reward": 0.5468499898910523, + "reward_std": 0.03330472691450268, + "rewards/MMContentORM/mean": 0.731499993801117, + "rewards/MMContentORM/std": 0.5080301821231842, + "rewards/MMFormatORM/mean": 0.6418749809265136, + "rewards/MMFormatORM/std": 0.032499998807907104, + "rewards/MMRubricORM/mean": -0.0125, + "rewards/MMRubricORM/std": 0.05, + "step": 3860, + "train_speed(iter/s)": 0.082793 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 322.6, + "completions/mean_length": 211.8875, + "completions/min_length": 146.2, + "epoch": 1.85549687950072, + "frac_reward_zero_std": 0.65, + "grad_norm": 0.11793594062328339, + "kl": 0.014239501953125, + "learning_rate": 1.4209304961770364e-07, + "loss": 0.0005696051754057408, + "memory(GiB)": 27.09, + "reward": 0.4437499940395355, + "reward_std": 0.08237794116139412, + "rewards/MMContentORM/mean": 0.5025000140070915, + "rewards/MMContentORM/std": 0.5952349126338958, + "rewards/MMFormatORM/mean": 0.6256249904632568, + "rewards/MMFormatORM/std": 0.09749999642372131, + "rewards/MMRubricORM/mean": -0.0375, + "rewards/MMRubricORM/std": 0.15, + "step": 3865, + "train_speed(iter/s)": 0.082799 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 332.6, + "completions/mean_length": 208.3625, + "completions/min_length": 135.8, + "epoch": 1.85789726356217, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.14964303374290466, + "kl": 0.01424560546875, + "learning_rate": 1.374331260217726e-07, + "loss": 0.0005695806816220283, + "memory(GiB)": 27.09, + "reward": 0.5307499766349792, + "reward_std": 0.08266077996231616, + "rewards/MMContentORM/mean": 0.7200000047683716, + "rewards/MMContentORM/std": 0.48706189841032027, + "rewards/MMFormatORM/mean": 0.6256249785423279, + "rewards/MMFormatORM/std": 0.07690345346927643, + "rewards/MMRubricORM/mean": -0.0375, + "rewards/MMRubricORM/std": 0.11831300854682922, + "step": 3870, + "train_speed(iter/s)": 0.082801 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 298.8, + "completions/mean_length": 207.7375, + "completions/min_length": 128.4, + "epoch": 1.8602976476236197, + "frac_reward_zero_std": 0.725, + "grad_norm": 0.13720768690109253, + "kl": 0.014617919921875, + "learning_rate": 1.32849827810973e-07, + "loss": 0.0005841460078954697, + "memory(GiB)": 27.09, + "reward": 0.46409997940063474, + "reward_std": 0.13901719748973845, + "rewards/MMContentORM/mean": 0.5965000092983246, + "rewards/MMContentORM/std": 0.6498379826545715, + "rewards/MMFormatORM/mean": 0.6012499809265137, + "rewards/MMFormatORM/std": 0.12930812537670136, + "rewards/MMRubricORM/mean": -0.075, + "rewards/MMRubricORM/std": 0.19893558621406554, + "step": 3875, + "train_speed(iter/s)": 0.08281 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 338.2, + "completions/mean_length": 214.3625, + "completions/min_length": 151.8, + "epoch": 1.8626980316850696, + "frac_reward_zero_std": 0.6, + "grad_norm": 0.20279008150100708, + "kl": 0.015106201171875, + "learning_rate": 1.2834322720991332e-07, + "loss": 0.0006037722807377577, + "memory(GiB)": 27.09, + "reward": 0.5412999749183655, + "reward_std": 0.049638888845220205, + "rewards/MMContentORM/mean": 0.7320000052452087, + "rewards/MMContentORM/std": 0.46277309134602546, + "rewards/MMFormatORM/mean": 0.6337499856948853, + "rewards/MMFormatORM/std": 0.06499999761581421, + "rewards/MMRubricORM/mean": -0.025, + "rewards/MMRubricORM/std": 0.1, + "step": 3880, + "train_speed(iter/s)": 0.082805 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.2, + "completions/mean_length": 211.4375, + "completions/min_length": 141.4, + "epoch": 1.8650984157465196, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.08262795954942703, + "kl": 0.0156005859375, + "learning_rate": 1.2391339523458502e-07, + "loss": 0.0006241547875106334, + "memory(GiB)": 27.09, + "reward": 0.46244998574256896, + "reward_std": 0.11985459551215172, + "rewards/MMContentORM/mean": 0.578000009059906, + "rewards/MMContentORM/std": 0.6259812593460083, + "rewards/MMFormatORM/mean": 0.6093749880790711, + "rewards/MMFormatORM/std": 0.14190345108509064, + "rewards/MMRubricORM/mean": -0.0625, + "rewards/MMRubricORM/std": 0.21831300854682922, + "step": 3885, + "train_speed(iter/s)": 0.082824 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 345.6, + "completions/mean_length": 223.2, + "completions/min_length": 161.6, + "epoch": 1.8674987998079693, + "frac_reward_zero_std": 0.675, + "grad_norm": 0.18606510758399963, + "kl": 0.015350341796875, + "learning_rate": 1.1956040169124217e-07, + "loss": 0.0006135111209005118, + "memory(GiB)": 27.09, + "reward": 0.4125999629497528, + "reward_std": 0.13562307790853084, + "rewards/MMContentORM/mean": 0.4965000033378601, + "rewards/MMContentORM/std": 0.707841980457306, + "rewards/MMFormatORM/mean": 0.5849999785423279, + "rewards/MMFormatORM/std": 0.1737115800380707, + "rewards/MMRubricORM/mean": -0.1, + "rewards/MMRubricORM/std": 0.2672485947608948, + "step": 3890, + "train_speed(iter/s)": 0.082825 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 328.2, + "completions/mean_length": 217.7625, + "completions/min_length": 150.2, + "epoch": 1.869899183869419, + "frac_reward_zero_std": 0.65, + "grad_norm": 0.08665505051612854, + "kl": 0.01368408203125, + "learning_rate": 1.1528431517530414e-07, + "loss": 0.0005479637067764998, + "memory(GiB)": 27.09, + "reward": 0.5135999739170074, + "reward_std": 0.10069200224243105, + "rewards/MMContentORM/mean": 0.6915000200271606, + "rewards/MMContentORM/std": 0.5325116083025933, + "rewards/MMFormatORM/mean": 0.6174999833106994, + "rewards/MMFormatORM/std": 0.10940345227718354, + "rewards/MMRubricORM/mean": -0.05, + "rewards/MMRubricORM/std": 0.16831300854682923, + "step": 3895, + "train_speed(iter/s)": 0.082832 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 321.0, + "completions/mean_length": 208.8375, + "completions/min_length": 129.4, + "epoch": 1.872299567930869, + "frac_reward_zero_std": 0.7, + "grad_norm": 0.10476063936948776, + "kl": 0.024139404296875, + "learning_rate": 1.1108520307027026e-07, + "loss": 0.0009668363258242607, + "memory(GiB)": 27.09, + "reward": 0.4625499784946442, + "reward_std": 0.15648272782564163, + "rewards/MMContentORM/mean": 0.6070000171661377, + "rewards/MMContentORM/std": 0.6449923276901245, + "rewards/MMFormatORM/mean": 0.5931249856948853, + "rewards/MMFormatORM/std": 0.18630690574645997, + "rewards/MMRubricORM/mean": -0.0875, + "rewards/MMRubricORM/std": 0.2866260170936584, + "step": 3900, + "train_speed(iter/s)": 0.082839 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 309.8, + "completions/mean_length": 210.8875, + "completions/min_length": 110.4, + "epoch": 1.8746999519923189, + "frac_reward_zero_std": 0.775, + "grad_norm": 0.15975040197372437, + "kl": 0.014141845703125, + "learning_rate": 1.0696313154666016e-07, + "loss": 0.0005653574131429196, + "memory(GiB)": 27.09, + "reward": 0.5340999722480774, + "reward_std": 0.060952600184828044, + "rewards/MMContentORM/mean": 0.714000004529953, + "rewards/MMContentORM/std": 0.5186110436916351, + "rewards/MMFormatORM/mean": 0.6337499856948853, + "rewards/MMFormatORM/std": 0.06499999761581421, + "rewards/MMRubricORM/mean": -0.025, + "rewards/MMRubricORM/std": 0.1, + "step": 3905, + "train_speed(iter/s)": 0.082817 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 287.8, + "completions/mean_length": 211.0375, + "completions/min_length": 136.2, + "epoch": 1.8771003360537686, + "frac_reward_zero_std": 0.675, + "grad_norm": 0.18092653155326843, + "kl": 0.01676025390625, + "learning_rate": 1.0291816556097455e-07, + "loss": 0.000670450646430254, + "memory(GiB)": 27.09, + "reward": 0.5369499742984771, + "reward_std": 0.08633773510809988, + "rewards/MMContentORM/mean": 0.7355000138282776, + "rewards/MMContentORM/std": 0.5291013896465302, + "rewards/MMFormatORM/mean": 0.6256249785423279, + "rewards/MMFormatORM/std": 0.07690345346927643, + "rewards/MMRubricORM/mean": -0.0375, + "rewards/MMRubricORM/std": 0.11831300854682922, + "step": 3910, + "train_speed(iter/s)": 0.082831 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 310.4, + "completions/mean_length": 211.275, + "completions/min_length": 114.2, + "epoch": 1.8795007201152183, + "frac_reward_zero_std": 0.65, + "grad_norm": 0.2313622385263443, + "kl": 0.01806640625, + "learning_rate": 9.895036885466503e-08, + "loss": 0.000722192507237196, + "memory(GiB)": 27.09, + "reward": 0.4598999798297882, + "reward_std": 0.1432598352432251, + "rewards/MMContentORM/mean": 0.5860000014305115, + "rewards/MMContentORM/std": 0.6623120665550232, + "rewards/MMFormatORM/mean": 0.6012499809265137, + "rewards/MMFormatORM/std": 0.15380690693855287, + "rewards/MMRubricORM/mean": -0.075, + "rewards/MMRubricORM/std": 0.23662601709365844, + "step": 3915, + "train_speed(iter/s)": 0.082841 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 336.0, + "completions/mean_length": 208.8875, + "completions/min_length": 138.6, + "epoch": 1.8819011041766682, + "frac_reward_zero_std": 0.675, + "grad_norm": 0.13324828445911407, + "kl": 0.016552734375, + "learning_rate": 9.505980395313364e-08, + "loss": 0.0006618403363972903, + "memory(GiB)": 27.09, + "reward": 0.5237999677658081, + "reward_std": 0.10606601641047746, + "rewards/MMContentORM/mean": 0.717000025510788, + "rewards/MMContentORM/std": 0.45795624777674676, + "rewards/MMFormatORM/mean": 0.6174999713897705, + "rewards/MMFormatORM/std": 0.08880690932273864, + "rewards/MMRubricORM/mean": -0.05, + "rewards/MMRubricORM/std": 0.13662601709365846, + "step": 3920, + "train_speed(iter/s)": 0.08284 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 334.2, + "completions/mean_length": 211.4875, + "completions/min_length": 127.4, + "epoch": 1.8843014882381182, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.2311268001794815, + "kl": 0.02252197265625, + "learning_rate": 9.124653216474766e-08, + "loss": 0.0009013652801513672, + "memory(GiB)": 27.09, + "reward": 0.43879998922348024, + "reward_std": 0.22344573587179184, + "rewards/MMContentORM/mean": 0.5945000112056732, + "rewards/MMContentORM/std": 0.6892549335956574, + "rewards/MMFormatORM/mean": 0.5649999797344207, + "rewards/MMFormatORM/std": 0.21041721403598784, + "rewards/MMRubricORM/mean": -0.125, + "rewards/MMRubricORM/std": 0.32006530165672303, + "step": 3925, + "train_speed(iter/s)": 0.082839 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 340.2, + "completions/mean_length": 223.475, + "completions/min_length": 154.2, + "epoch": 1.886701872299568, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.16043339669704437, + "kl": 0.01510009765625, + "learning_rate": 8.751061357987367e-08, + "loss": 0.0006042405962944031, + "memory(GiB)": 27.09, + "reward": 0.5149499654769898, + "reward_std": 0.08633773569017648, + "rewards/MMContentORM/mean": 0.6805000185966492, + "rewards/MMContentORM/std": 0.5845986545085907, + "rewards/MMFormatORM/mean": 0.6256249904632568, + "rewards/MMFormatORM/std": 0.09749999642372131, + "rewards/MMRubricORM/mean": -0.0375, + "rewards/MMRubricORM/std": 0.15, + "step": 3930, + "train_speed(iter/s)": 0.082841 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 322.6, + "completions/mean_length": 222.7875, + "completions/min_length": 160.2, + "epoch": 1.8891022563610178, + "frac_reward_zero_std": 0.7, + "grad_norm": 0.07241669297218323, + "kl": 0.012701416015625, + "learning_rate": 8.385210706992608e-08, + "loss": 0.000508300494402647, + "memory(GiB)": 27.09, + "reward": 0.49414998292922974, + "reward_std": 0.07785245187114924, + "rewards/MMContentORM/mean": 0.6285000085830689, + "rewards/MMContentORM/std": 0.6101788878440857, + "rewards/MMFormatORM/mean": 0.6256249785423279, + "rewards/MMFormatORM/std": 0.07690345346927643, + "rewards/MMRubricORM/mean": -0.0375, + "rewards/MMRubricORM/std": 0.11831300854682922, + "step": 3935, + "train_speed(iter/s)": 0.082847 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 324.2, + "completions/mean_length": 216.7625, + "completions/min_length": 136.4, + "epoch": 1.8915026404224675, + "frac_reward_zero_std": 0.675, + "grad_norm": 0.13122500479221344, + "kl": 0.016265869140625, + "learning_rate": 8.027107028644621e-08, + "loss": 0.0006506592035293579, + "memory(GiB)": 27.09, + "reward": 0.4648999750614166, + "reward_std": 0.1226123157190159, + "rewards/MMContentORM/mean": 0.5860000073909759, + "rewards/MMContentORM/std": 0.645108425617218, + "rewards/MMFormatORM/mean": 0.6074999809265137, + "rewards/MMFormatORM/std": 0.14226680397987365, + "rewards/MMRubricORM/mean": -0.0625, + "rewards/MMRubricORM/std": 0.21831300854682922, + "step": 3940, + "train_speed(iter/s)": 0.082855 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 344.2, + "completions/mean_length": 215.7375, + "completions/min_length": 136.6, + "epoch": 1.8939030244839175, + "frac_reward_zero_std": 0.675, + "grad_norm": 0.10751134157180786, + "kl": 0.015667724609375, + "learning_rate": 7.676755966018967e-08, + "loss": 0.0006263887509703637, + "memory(GiB)": 27.09, + "reward": 0.49424999952316284, + "reward_std": 0.11278352783992887, + "rewards/MMContentORM/mean": 0.6574999928474426, + "rewards/MMContentORM/std": 0.5534313529729843, + "rewards/MMFormatORM/mean": 0.6093749880790711, + "rewards/MMFormatORM/std": 0.14190345108509064, + "rewards/MMRubricORM/mean": -0.0625, + "rewards/MMRubricORM/std": 0.21831300854682922, + "step": 3945, + "train_speed(iter/s)": 0.082853 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 334.0, + "completions/mean_length": 221.6625, + "completions/min_length": 154.8, + "epoch": 1.8963034085453674, + "frac_reward_zero_std": 0.65, + "grad_norm": 0.20207758247852325, + "kl": 0.014703369140625, + "learning_rate": 7.334163040023823e-08, + "loss": 0.000587776442989707, + "memory(GiB)": 27.09, + "reward": 0.5048999905586242, + "reward_std": 0.05699280113913119, + "rewards/MMContentORM/mean": 0.6285000026226044, + "rewards/MMContentORM/std": 0.5722138583660126, + "rewards/MMFormatORM/mean": 0.6399999856948853, + "rewards/MMFormatORM/std": 0.03999999761581421, + "rewards/MMRubricORM/mean": -0.0125, + "rewards/MMRubricORM/std": 0.05, + "step": 3950, + "train_speed(iter/s)": 0.082857 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0125, + "completions/max_length": 452.6, + "completions/mean_length": 224.375, + "completions/min_length": 155.0, + "epoch": 1.898703792606817, + "frac_reward_zero_std": 0.725, + "grad_norm": 0.1518968939781189, + "kl": 0.016015625, + "learning_rate": 6.999333649312933e-08, + "loss": 0.000639676209539175, + "memory(GiB)": 27.09, + "reward": 0.4759999752044678, + "reward_std": 0.11539982631802559, + "rewards/MMContentORM/mean": 0.5974999904632569, + "rewards/MMContentORM/std": 0.6229348480701447, + "rewards/MMFormatORM/mean": 0.6174999833106994, + "rewards/MMFormatORM/std": 0.10940345227718354, + "rewards/MMRubricORM/mean": -0.05, + "rewards/MMRubricORM/std": 0.16831300854682923, + "step": 3955, + "train_speed(iter/s)": 0.082832 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 299.4, + "completions/mean_length": 207.525, + "completions/min_length": 128.0, + "epoch": 1.9011041766682668, + "frac_reward_zero_std": 0.7, + "grad_norm": 0.2073817253112793, + "kl": 0.019140625, + "learning_rate": 6.672273070200464e-08, + "loss": 0.0007669827900826931, + "memory(GiB)": 27.09, + "reward": 0.5025999784469605, + "reward_std": 0.08174153957515955, + "rewards/MMContentORM/mean": 0.6640000104904175, + "rewards/MMContentORM/std": 0.534474528580904, + "rewards/MMFormatORM/mean": 0.6174999833106994, + "rewards/MMFormatORM/std": 0.10940345227718354, + "rewards/MMRubricORM/mean": -0.05, + "rewards/MMRubricORM/std": 0.16831300854682923, + "step": 3960, + "train_speed(iter/s)": 0.082842 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0125, + "completions/max_length": 469.4, + "completions/mean_length": 225.8875, + "completions/min_length": 138.0, + "epoch": 1.9035045607297167, + "frac_reward_zero_std": 0.675, + "grad_norm": 0.11310164630413055, + "kl": 0.025762939453125, + "learning_rate": 6.352986456578224e-08, + "loss": 0.0010307587683200837, + "memory(GiB)": 27.09, + "reward": 0.4900999844074249, + "reward_std": 0.08315575905144215, + "rewards/MMContentORM/mean": 0.6615000009536743, + "rewards/MMContentORM/std": 0.6141018033027649, + "rewards/MMFormatORM/mean": 0.6012499868869782, + "rewards/MMFormatORM/std": 0.12313776612281799, + "rewards/MMRubricORM/mean": -0.075, + "rewards/MMRubricORM/std": 0.1894427239894867, + "step": 3965, + "train_speed(iter/s)": 0.082814 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 293.0, + "completions/mean_length": 207.1125, + "completions/min_length": 125.6, + "epoch": 1.9059049447911667, + "frac_reward_zero_std": 0.725, + "grad_norm": 0.1447732150554657, + "kl": 0.018310546875, + "learning_rate": 6.041478839834025e-08, + "loss": 0.0007323446683585644, + "memory(GiB)": 27.09, + "reward": 0.4383999824523926, + "reward_std": 0.1336431846022606, + "rewards/MMContentORM/mean": 0.5610000193119049, + "rewards/MMContentORM/std": 0.6906715393066406, + "rewards/MMFormatORM/mean": 0.5849999904632568, + "rewards/MMFormatORM/std": 0.19430812299251557, + "rewards/MMRubricORM/mean": -0.1, + "rewards/MMRubricORM/std": 0.2989355862140656, + "step": 3970, + "train_speed(iter/s)": 0.082821 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 328.2, + "completions/mean_length": 213.025, + "completions/min_length": 144.0, + "epoch": 1.9083053288526164, + "frac_reward_zero_std": 0.7, + "grad_norm": 0.19179855287075043, + "kl": 0.013043212890625, + "learning_rate": 5.7377551287724484e-08, + "loss": 0.000521748187020421, + "memory(GiB)": 27.09, + "reward": 0.48369997143745425, + "reward_std": 0.05642711967229843, + "rewards/MMContentORM/mean": 0.5879999935626984, + "rewards/MMContentORM/std": 0.5680493891239167, + "rewards/MMFormatORM/mean": 0.6337499856948853, + "rewards/MMFormatORM/std": 0.06499999761581421, + "rewards/MMRubricORM/mean": -0.025, + "rewards/MMRubricORM/std": 0.1, + "step": 3975, + "train_speed(iter/s)": 0.082824 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 323.4, + "completions/mean_length": 207.3, + "completions/min_length": 132.0, + "epoch": 1.910705712914066, + "frac_reward_zero_std": 0.725, + "grad_norm": 0.11744437366724014, + "kl": 0.0150634765625, + "learning_rate": 5.4418201095377544e-08, + "loss": 0.000602102093398571, + "memory(GiB)": 27.09, + "reward": 0.5488999843597412, + "reward_std": 0.06491240309551358, + "rewards/MMContentORM/mean": 0.7509999990463256, + "rewards/MMContentORM/std": 0.4212790600955486, + "rewards/MMFormatORM/mean": 0.6337499856948853, + "rewards/MMFormatORM/std": 0.06499999761581421, + "rewards/MMRubricORM/mean": -0.025, + "rewards/MMRubricORM/std": 0.1, + "step": 3980, + "train_speed(iter/s)": 0.082831 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 346.2, + "completions/mean_length": 211.7875, + "completions/min_length": 119.4, + "epoch": 1.913106096975516, + "frac_reward_zero_std": 0.575, + "grad_norm": 0.1685052067041397, + "kl": 0.018121337890625, + "learning_rate": 5.153678445538324e-08, + "loss": 0.0007251160684973001, + "memory(GiB)": 27.09, + "reward": 0.3905999720096588, + "reward_std": 0.2324967123568058, + "rewards/MMContentORM/mean": 0.48649999499320984, + "rewards/MMContentORM/std": 0.7621617078781128, + "rewards/MMFormatORM/mean": 0.5587499678134918, + "rewards/MMFormatORM/std": 0.19895429015159607, + "rewards/MMRubricORM/mean": -0.1375, + "rewards/MMRubricORM/std": 0.30669131875038147, + "step": 3985, + "train_speed(iter/s)": 0.08283 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 334.0, + "completions/mean_length": 230.875, + "completions/min_length": 150.2, + "epoch": 1.915506481036966, + "frac_reward_zero_std": 0.675, + "grad_norm": 0.17441634833812714, + "kl": 0.015863037109375, + "learning_rate": 4.873334677373054e-08, + "loss": 0.0006344456225633622, + "memory(GiB)": 27.09, + "reward": 0.4771999716758728, + "reward_std": 0.11483414098620415, + "rewards/MMContentORM/mean": 0.6005000114440918, + "rewards/MMContentORM/std": 0.6646744608879089, + "rewards/MMFormatORM/mean": 0.6174999713897705, + "rewards/MMFormatORM/std": 0.08880690932273864, + "rewards/MMRubricORM/mean": -0.05, + "rewards/MMRubricORM/std": 0.13662601709365846, + "step": 3990, + "train_speed(iter/s)": 0.082834 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 340.0, + "completions/mean_length": 215.3625, + "completions/min_length": 106.6, + "epoch": 1.917906865098416, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.10349483042955399, + "kl": 0.06673583984375, + "learning_rate": 4.600793222759858e-08, + "loss": 0.002681119553744793, + "memory(GiB)": 27.09, + "reward": 0.512749969959259, + "reward_std": 0.08492352233733982, + "rewards/MMContentORM/mean": 0.675000011920929, + "rewards/MMContentORM/std": 0.5857814848423004, + "rewards/MMFormatORM/mean": 0.6256249785423279, + "rewards/MMFormatORM/std": 0.07690345346927643, + "rewards/MMRubricORM/mean": -0.0375, + "rewards/MMRubricORM/std": 0.11831300854682922, + "step": 3995, + "train_speed(iter/s)": 0.082835 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 349.8, + "completions/mean_length": 219.95, + "completions/min_length": 121.2, + "epoch": 1.9203072491598656, + "frac_reward_zero_std": 0.6, + "grad_norm": 0.13771557807922363, + "kl": 0.01771240234375, + "learning_rate": 4.33605837646639e-08, + "loss": 0.0007083784788846969, + "memory(GiB)": 27.09, + "reward": 0.4103999733924866, + "reward_std": 0.1940301053225994, + "rewards/MMContentORM/mean": 0.49100000262260435, + "rewards/MMContentORM/std": 0.6967435419559479, + "rewards/MMFormatORM/mean": 0.5849999785423279, + "rewards/MMFormatORM/std": 0.1737115800380707, + "rewards/MMRubricORM/mean": -0.1, + "rewards/MMRubricORM/std": 0.2672485947608948, + "step": 4000, + "train_speed(iter/s)": 0.082832 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 296.2, + "completions/mean_length": 210.9625, + "completions/min_length": 129.8, + "epoch": 1.9227076332213153, + "frac_reward_zero_std": 0.7, + "grad_norm": 0.16067036986351013, + "kl": 0.013800048828125, + "learning_rate": 4.079134310241706e-08, + "loss": 0.000552175985649228, + "memory(GiB)": 27.09, + "reward": 0.4725499749183655, + "reward_std": 0.11066220700740814, + "rewards/MMContentORM/mean": 0.5745000123977662, + "rewards/MMContentORM/std": 0.6102168440818787, + "rewards/MMFormatORM/mean": 0.6256249785423279, + "rewards/MMFormatORM/std": 0.07690345346927643, + "rewards/MMRubricORM/mean": -0.0375, + "rewards/MMRubricORM/std": 0.11831300854682922, + "step": 4005, + "train_speed(iter/s)": 0.082769 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 332.8, + "completions/mean_length": 220.4125, + "completions/min_length": 142.4, + "epoch": 1.9251080172827653, + "frac_reward_zero_std": 0.6, + "grad_norm": 0.18261726200580597, + "kl": 0.01759033203125, + "learning_rate": 3.8300250727510423e-08, + "loss": 0.0007028756663203239, + "memory(GiB)": 27.09, + "reward": 0.43639997243881223, + "reward_std": 0.1443912021815777, + "rewards/MMContentORM/mean": 0.5559999942779541, + "rewards/MMContentORM/std": 0.6905377149581909, + "rewards/MMFormatORM/mean": 0.5849999845027923, + "rewards/MMFormatORM/std": 0.16754122078418732, + "rewards/MMRubricORM/mean": -0.1, + "rewards/MMRubricORM/std": 0.2577557325363159, + "step": 4010, + "train_speed(iter/s)": 0.082768 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 291.4, + "completions/mean_length": 205.5875, + "completions/min_length": 133.2, + "epoch": 1.9275084013442152, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.12302592396736145, + "kl": 0.014111328125, + "learning_rate": 3.588734589511977e-08, + "loss": 0.0005644991528242826, + "memory(GiB)": 27.09, + "reward": 0.5111499905586243, + "reward_std": 0.08379215330351145, + "rewards/MMContentORM/mean": 0.6710000097751617, + "rewards/MMContentORM/std": 0.5448502898216248, + "rewards/MMFormatORM/mean": 0.6256249785423279, + "rewards/MMFormatORM/std": 0.07690345346927643, + "rewards/MMRubricORM/mean": -0.0375, + "rewards/MMRubricORM/std": 0.11831300854682922, + "step": 4015, + "train_speed(iter/s)": 0.082782 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 337.2, + "completions/mean_length": 221.45, + "completions/min_length": 142.6, + "epoch": 1.929908785405665, + "frac_reward_zero_std": 0.675, + "grad_norm": 0.16786423325538635, + "kl": 0.0176513671875, + "learning_rate": 3.3552666628323126e-08, + "loss": 0.0007057101465761662, + "memory(GiB)": 27.09, + "reward": 0.521749985218048, + "reward_std": 0.14743175983894616, + "rewards/MMContentORM/mean": 0.7550000071525573, + "rewards/MMContentORM/std": 0.5161954037845135, + "rewards/MMFormatORM/mean": 0.5931249856948853, + "rewards/MMFormatORM/std": 0.16180812418460847, + "rewards/MMRubricORM/mean": -0.0875, + "rewards/MMRubricORM/std": 0.24893558621406556, + "step": 4020, + "train_speed(iter/s)": 0.082782 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 317.8, + "completions/mean_length": 215.6625, + "completions/min_length": 116.0, + "epoch": 1.9323091694671146, + "frac_reward_zero_std": 0.6, + "grad_norm": 0.36218705773353577, + "kl": 0.017657470703125, + "learning_rate": 3.1296249717504e-08, + "loss": 0.0007065317593514919, + "memory(GiB)": 27.09, + "reward": 0.45779996514320376, + "reward_std": 0.11285423804074526, + "rewards/MMContentORM/mean": 0.5720000088214874, + "rewards/MMContentORM/std": 0.6419292092323303, + "rewards/MMFormatORM/mean": 0.6037499904632568, + "rewards/MMFormatORM/std": 0.14527987241744994, + "rewards/MMRubricORM/mean": -0.0625, + "rewards/MMRubricORM/std": 0.21831300854682922, + "step": 4025, + "train_speed(iter/s)": 0.082785 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 315.4, + "completions/mean_length": 222.45, + "completions/min_length": 143.2, + "epoch": 1.9347095535285646, + "frac_reward_zero_std": 0.8, + "grad_norm": 0.10553177446126938, + "kl": 0.015179443359375, + "learning_rate": 2.91181307197691e-08, + "loss": 0.0006072814110666513, + "memory(GiB)": 27.09, + "reward": 0.49004998803138733, + "reward_std": 0.09835855364799499, + "rewards/MMContentORM/mean": 0.647000002861023, + "rewards/MMContentORM/std": 0.6379193365573883, + "rewards/MMFormatORM/mean": 0.609375, + "rewards/MMFormatORM/std": 0.16249999403953552, + "rewards/MMRubricORM/mean": -0.0625, + "rewards/MMRubricORM/std": 0.25, + "step": 4030, + "train_speed(iter/s)": 0.082792 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 338.8, + "completions/mean_length": 208.3625, + "completions/min_length": 128.0, + "epoch": 1.9371099375900145, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.12949110567569733, + "kl": 0.022711181640625, + "learning_rate": 2.7018343958392092e-08, + "loss": 0.000908501259982586, + "memory(GiB)": 27.09, + "reward": 0.41979997158050536, + "reward_std": 0.18243354400619866, + "rewards/MMContentORM/mean": 0.5145000040531158, + "rewards/MMContentORM/std": 0.7357254981994629, + "rewards/MMFormatORM/mean": 0.5849999904632568, + "rewards/MMFormatORM/std": 0.16980934143066406, + "rewards/MMRubricORM/mean": -0.1, + "rewards/MMRubricORM/std": 0.2612451553344727, + "step": 4035, + "train_speed(iter/s)": 0.082792 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 369.8, + "completions/mean_length": 224.0875, + "completions/min_length": 159.6, + "epoch": 1.9395103216514642, + "frac_reward_zero_std": 0.7, + "grad_norm": 0.13331717252731323, + "kl": 0.0154541015625, + "learning_rate": 2.499692252226793e-08, + "loss": 0.0006182675249874591, + "memory(GiB)": 27.09, + "reward": 0.4871499836444855, + "reward_std": 0.06908432978671045, + "rewards/MMContentORM/mean": 0.6109999895095826, + "rewards/MMContentORM/std": 0.6157109498977661, + "rewards/MMFormatORM/mean": 0.6256249904632568, + "rewards/MMFormatORM/std": 0.09749999642372131, + "rewards/MMRubricORM/mean": -0.0375, + "rewards/MMRubricORM/std": 0.15, + "step": 4040, + "train_speed(iter/s)": 0.082788 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 322.6, + "completions/mean_length": 209.9125, + "completions/min_length": 138.4, + "epoch": 1.941910705712914, + "frac_reward_zero_std": 0.7, + "grad_norm": 0.19583547115325928, + "kl": 0.014129638671875, + "learning_rate": 2.3053898265395503e-08, + "loss": 0.0005656382068991661, + "memory(GiB)": 27.09, + "reward": 0.48734999299049375, + "reward_std": 0.07954950779676437, + "rewards/MMContentORM/mean": 0.6115000009536743, + "rewards/MMContentORM/std": 0.6199671626091003, + "rewards/MMFormatORM/mean": 0.6256249785423279, + "rewards/MMFormatORM/std": 0.07690345346927643, + "rewards/MMRubricORM/mean": -0.0375, + "rewards/MMRubricORM/std": 0.11831300854682922, + "step": 4045, + "train_speed(iter/s)": 0.082796 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 312.6, + "completions/mean_length": 218.2625, + "completions/min_length": 139.0, + "epoch": 1.9443110897743638, + "frac_reward_zero_std": 0.725, + "grad_norm": 0.12564736604690552, + "kl": 0.014019775390625, + "learning_rate": 2.1189301806372463e-08, + "loss": 0.0005606723949313164, + "memory(GiB)": 27.09, + "reward": 0.5056499779224396, + "reward_std": 0.040092954062856734, + "rewards/MMContentORM/mean": 0.6284999966621398, + "rewards/MMContentORM/std": 0.495737274736166, + "rewards/MMFormatORM/mean": 0.6418749809265136, + "rewards/MMFormatORM/std": 0.032499998807907104, + "rewards/MMRubricORM/mean": -0.0125, + "rewards/MMRubricORM/std": 0.05, + "step": 4050, + "train_speed(iter/s)": 0.082803 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 303.4, + "completions/mean_length": 209.225, + "completions/min_length": 111.8, + "epoch": 1.9467114738358138, + "frac_reward_zero_std": 0.7, + "grad_norm": 0.09130167961120605, + "kl": 0.019110107421875, + "learning_rate": 1.940316252791563e-08, + "loss": 0.0007643857039511204, + "memory(GiB)": 27.09, + "reward": 0.49664999842643737, + "reward_std": 0.11900607645511627, + "rewards/MMContentORM/mean": 0.6690000057220459, + "rewards/MMContentORM/std": 0.6181300818920136, + "rewards/MMFormatORM/mean": 0.6056249737739563, + "rewards/MMFormatORM/std": 0.1592322736978531, + "rewards/MMRubricORM/mean": -0.06599999964237213, + "rewards/MMRubricORM/std": 0.2501555383205414, + "step": 4055, + "train_speed(iter/s)": 0.08281 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 316.6, + "completions/mean_length": 217.1625, + "completions/min_length": 143.8, + "epoch": 1.9491118578972637, + "frac_reward_zero_std": 0.6, + "grad_norm": 0.15874595940113068, + "kl": 0.015081787109375, + "learning_rate": 1.7695508576395237e-08, + "loss": 0.0006026300135999918, + "memory(GiB)": 27.09, + "reward": 0.4948499917984009, + "reward_std": 0.1311683064326644, + "rewards/MMContentORM/mean": 0.6590000033378601, + "rewards/MMContentORM/std": 0.5976063251495362, + "rewards/MMFormatORM/mean": 0.6093749821186065, + "rewards/MMFormatORM/std": 0.09063776731491088, + "rewards/MMRubricORM/mean": -0.0625, + "rewards/MMRubricORM/std": 0.1394427239894867, + "step": 4060, + "train_speed(iter/s)": 0.082818 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 315.0, + "completions/mean_length": 215.8125, + "completions/min_length": 141.2, + "epoch": 1.9515122419587134, + "frac_reward_zero_std": 0.675, + "grad_norm": 0.14566867053508759, + "kl": 0.012939453125, + "learning_rate": 1.6066366861393068e-08, + "loss": 0.000517718493938446, + "memory(GiB)": 27.09, + "reward": 0.5360999763011932, + "reward_std": 0.08697413904592395, + "rewards/MMContentORM/mean": 0.7190000057220459, + "rewards/MMContentORM/std": 0.5173257470130921, + "rewards/MMFormatORM/mean": 0.6337499856948853, + "rewards/MMFormatORM/std": 0.06499999761581421, + "rewards/MMRubricORM/mean": -0.025, + "rewards/MMRubricORM/std": 0.1, + "step": 4065, + "train_speed(iter/s)": 0.082824 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 332.6, + "completions/mean_length": 214.15, + "completions/min_length": 143.2, + "epoch": 1.9539126260201631, + "frac_reward_zero_std": 0.575, + "grad_norm": 0.17086957395076752, + "kl": 0.018682861328125, + "learning_rate": 1.4515763055278354e-08, + "loss": 0.0007456324063241481, + "memory(GiB)": 27.09, + "reward": 0.4182999789714813, + "reward_std": 0.10818733535706997, + "rewards/MMContentORM/mean": 0.48199999928474424, + "rewards/MMContentORM/std": 0.6929208874702454, + "rewards/MMFormatORM/mean": 0.6012499809265137, + "rewards/MMFormatORM/std": 0.15380690693855287, + "rewards/MMRubricORM/mean": -0.075, + "rewards/MMRubricORM/std": 0.23662601709365844, + "step": 4070, + "train_speed(iter/s)": 0.082825 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 322.0, + "completions/mean_length": 212.15, + "completions/min_length": 144.8, + "epoch": 1.956313010081613, + "frac_reward_zero_std": 0.8, + "grad_norm": 0.11877346783876419, + "kl": 0.013311767578125, + "learning_rate": 1.3043721592803093e-08, + "loss": 0.0005328983068466186, + "memory(GiB)": 27.09, + "reward": 0.5734499812126159, + "reward_std": 0.040941482339985666, + "rewards/MMContentORM/mean": 0.7980000138282776, + "rewards/MMContentORM/std": 0.42314670234918594, + "rewards/MMFormatORM/mean": 0.6418749809265136, + "rewards/MMFormatORM/std": 0.032499998807907104, + "rewards/MMRubricORM/mean": -0.0125, + "rewards/MMRubricORM/std": 0.05, + "step": 4075, + "train_speed(iter/s)": 0.082833 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 331.8, + "completions/mean_length": 224.875, + "completions/min_length": 161.8, + "epoch": 1.958713394143063, + "frac_reward_zero_std": 0.675, + "grad_norm": 0.13452792167663574, + "kl": 0.0132568359375, + "learning_rate": 1.1650265670716255e-08, + "loss": 0.0005301388446241617, + "memory(GiB)": 27.09, + "reward": 0.5045499742031098, + "reward_std": 0.0688014852348715, + "rewards/MMContentORM/mean": 0.6545000076293945, + "rewards/MMContentORM/std": 0.592427009344101, + "rewards/MMFormatORM/mean": 0.6256249785423279, + "rewards/MMFormatORM/std": 0.07690345346927643, + "rewards/MMRubricORM/mean": -0.0375, + "rewards/MMRubricORM/std": 0.11831300854682922, + "step": 4080, + "train_speed(iter/s)": 0.082833 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 327.6, + "completions/mean_length": 217.6875, + "completions/min_length": 136.8, + "epoch": 1.9611137782045127, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.12339378893375397, + "kl": 0.014471435546875, + "learning_rate": 1.0335417247398505e-08, + "loss": 0.000578406685963273, + "memory(GiB)": 27.09, + "reward": 0.49459999799728394, + "reward_std": 0.06547808232717216, + "rewards/MMContentORM/mean": 0.6440000057220459, + "rewards/MMContentORM/std": 0.5158215515315533, + "rewards/MMFormatORM/mean": 0.6174999833106994, + "rewards/MMFormatORM/std": 0.10940345227718354, + "rewards/MMRubricORM/mean": -0.05, + "rewards/MMRubricORM/std": 0.16831300854682923, + "step": 4085, + "train_speed(iter/s)": 0.082838 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 292.8, + "completions/mean_length": 208.3125, + "completions/min_length": 130.6, + "epoch": 1.9635141622659624, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.12326997518539429, + "kl": 0.015618896484375, + "learning_rate": 9.099197042517493e-09, + "loss": 0.0006246047094464302, + "memory(GiB)": 27.09, + "reward": 0.4959499776363373, + "reward_std": 0.07756961362902075, + "rewards/MMContentORM/mean": 0.6329999923706054, + "rewards/MMContentORM/std": 0.6094204008579254, + "rewards/MMFormatORM/mean": 0.6256249785423279, + "rewards/MMFormatORM/std": 0.07690345346927643, + "rewards/MMRubricORM/mean": -0.0375, + "rewards/MMRubricORM/std": 0.11831300854682922, + "step": 4090, + "train_speed(iter/s)": 0.082846 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 311.6, + "completions/mean_length": 207.375, + "completions/min_length": 121.8, + "epoch": 1.9659145463274124, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.16281642019748688, + "kl": 0.016046142578125, + "learning_rate": 7.941624536699221e-09, + "loss": 0.0006411905866116286, + "memory(GiB)": 27.09, + "reward": 0.5397499680519104, + "reward_std": 0.06823580265045166, + "rewards/MMContentORM/mean": 0.7425000071525574, + "rewards/MMContentORM/std": 0.47391852661967276, + "rewards/MMFormatORM/mean": 0.6256249904632568, + "rewards/MMFormatORM/std": 0.09749999642372131, + "rewards/MMRubricORM/mean": -0.0375, + "rewards/MMRubricORM/std": 0.15, + "step": 4095, + "train_speed(iter/s)": 0.082849 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 305.2, + "completions/mean_length": 206.6, + "completions/min_length": 125.0, + "epoch": 1.9683149303888623, + "frac_reward_zero_std": 0.55, + "grad_norm": 0.1633000671863556, + "kl": 0.0170166015625, + "learning_rate": 6.8627179712232875e-09, + "loss": 0.0006804309785366058, + "memory(GiB)": 27.09, + "reward": 0.47594999670982363, + "reward_std": 0.10245976857841015, + "rewards/MMContentORM/mean": 0.5830000042915344, + "rewards/MMContentORM/std": 0.6251704752445221, + "rewards/MMFormatORM/mean": 0.6256249785423279, + "rewards/MMFormatORM/std": 0.07690345346927643, + "rewards/MMRubricORM/mean": -0.0375, + "rewards/MMRubricORM/std": 0.11831300854682922, + "step": 4100, + "train_speed(iter/s)": 0.082859 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 301.6, + "completions/mean_length": 211.1, + "completions/min_length": 149.2, + "epoch": 1.970715314450312, + "frac_reward_zero_std": 0.725, + "grad_norm": 0.16619880497455597, + "kl": 0.02249755859375, + "learning_rate": 5.862494347733672e-09, + "loss": 0.0008998697623610497, + "memory(GiB)": 27.09, + "reward": 0.5654999852180481, + "reward_std": 0.047093309834599494, + "rewards/MMContentORM/mean": 0.7925000190734863, + "rewards/MMContentORM/std": 0.4002851232886314, + "rewards/MMFormatORM/mean": 0.6337499856948853, + "rewards/MMFormatORM/std": 0.06499999761581421, + "rewards/MMRubricORM/mean": -0.025, + "rewards/MMRubricORM/std": 0.1, + "step": 4105, + "train_speed(iter/s)": 0.082845 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 323.0, + "completions/mean_length": 211.35, + "completions/min_length": 129.2, + "epoch": 1.9731156985117617, + "frac_reward_zero_std": 0.775, + "grad_norm": 0.11561845242977142, + "kl": 0.026605224609375, + "learning_rate": 4.9409694279711765e-09, + "loss": 0.0010593479499220848, + "memory(GiB)": 27.09, + "reward": 0.5204499781131744, + "reward_std": 0.026516501186415554, + "rewards/MMContentORM/mean": 0.653000020980835, + "rewards/MMContentORM/std": 0.4970328502357006, + "rewards/MMFormatORM/mean": 0.6481249809265137, + "rewards/MMFormatORM/std": 0.007499998807907105, + "rewards/MMRubricORM/mean": 0.0, + "rewards/MMRubricORM/std": 0.0, + "step": 4110, + "train_speed(iter/s)": 0.082852 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 315.0, + "completions/mean_length": 213.1625, + "completions/min_length": 146.6, + "epoch": 1.9755160825732117, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.18078495562076569, + "kl": 0.017230224609375, + "learning_rate": 4.098157733525842e-09, + "loss": 0.0006890918128192424, + "memory(GiB)": 27.09, + "reward": 0.472899979352951, + "reward_std": 0.1401485550450161, + "rewards/MMContentORM/mean": 0.6185000002384186, + "rewards/MMContentORM/std": 0.6067943811416626, + "rewards/MMFormatORM/mean": 0.6012499809265137, + "rewards/MMFormatORM/std": 0.12930812537670136, + "rewards/MMRubricORM/mean": -0.075, + "rewards/MMRubricORM/std": 0.19893558621406554, + "step": 4115, + "train_speed(iter/s)": 0.082855 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 346.0, + "completions/mean_length": 211.8375, + "completions/min_length": 115.8, + "epoch": 1.9779164666346616, + "frac_reward_zero_std": 0.575, + "grad_norm": 0.14201927185058594, + "kl": 0.029730224609375, + "learning_rate": 3.3340725456071364e-09, + "loss": 0.0011919239535927773, + "memory(GiB)": 27.09, + "reward": 0.4554999828338623, + "reward_std": 0.1658872556872666, + "rewards/MMContentORM/mean": 0.5750000178813934, + "rewards/MMContentORM/std": 0.6658959984779358, + "rewards/MMFormatORM/mean": 0.6012499809265137, + "rewards/MMFormatORM/std": 0.15380690693855287, + "rewards/MMRubricORM/mean": -0.075, + "rewards/MMRubricORM/std": 0.23662601709365844, + "step": 4120, + "train_speed(iter/s)": 0.082848 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 317.0, + "completions/mean_length": 218.8875, + "completions/min_length": 137.2, + "epoch": 1.9803168506961115, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.19586387276649475, + "kl": 0.016632080078125, + "learning_rate": 2.6487259048357803e-09, + "loss": 0.000665505975484848, + "memory(GiB)": 27.09, + "reward": 0.47424999475479124, + "reward_std": 0.15803836286067963, + "rewards/MMContentORM/mean": 0.6074999988079071, + "rewards/MMContentORM/std": 0.6695436835289001, + "rewards/MMFormatORM/mean": 0.6093749880790711, + "rewards/MMFormatORM/std": 0.14190345108509064, + "rewards/MMRubricORM/mean": -0.0625, + "rewards/MMRubricORM/std": 0.21831300854682922, + "step": 4125, + "train_speed(iter/s)": 0.08285 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 356.2, + "completions/mean_length": 221.2, + "completions/min_length": 129.6, + "epoch": 1.9827172347575612, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.10946598649024963, + "kl": 0.015472412109375, + "learning_rate": 2.0421286110533513e-09, + "loss": 0.0006184926256537438, + "memory(GiB)": 27.09, + "reward": 0.5523499727249146, + "reward_std": 0.086054896004498, + "rewards/MMContentORM/mean": 0.7740000247955322, + "rewards/MMContentORM/std": 0.44600327536463735, + "rewards/MMFormatORM/mean": 0.6256249904632568, + "rewards/MMFormatORM/std": 0.09749999642372131, + "rewards/MMRubricORM/mean": -0.0375, + "rewards/MMRubricORM/std": 0.15, + "step": 4130, + "train_speed(iter/s)": 0.082848 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 303.0, + "completions/mean_length": 213.225, + "completions/min_length": 137.4, + "epoch": 1.985117618819011, + "frac_reward_zero_std": 0.775, + "grad_norm": 0.07553966343402863, + "kl": 0.01331787109375, + "learning_rate": 1.5142902231513045e-09, + "loss": 0.0005324673838913441, + "memory(GiB)": 27.09, + "reward": 0.5357499957084656, + "reward_std": 0.06823580311611295, + "rewards/MMContentORM/mean": 0.732500022649765, + "rewards/MMContentORM/std": 0.44778469279408456, + "rewards/MMFormatORM/mean": 0.6256249785423279, + "rewards/MMFormatORM/std": 0.07690345346927643, + "rewards/MMRubricORM/mean": -0.0375, + "rewards/MMRubricORM/std": 0.11831300854682922, + "step": 4135, + "train_speed(iter/s)": 0.082857 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 315.0, + "completions/mean_length": 214.9375, + "completions/min_length": 135.4, + "epoch": 1.9875180028804609, + "frac_reward_zero_std": 0.725, + "grad_norm": 0.09850002825260162, + "kl": 0.016082763671875, + "learning_rate": 1.0652190589210965e-09, + "loss": 0.0006438469514250756, + "memory(GiB)": 27.09, + "reward": 0.44769997596740724, + "reward_std": 0.1412799373269081, + "rewards/MMContentORM/mean": 0.5554999947547913, + "rewards/MMContentORM/std": 0.6557976067066192, + "rewards/MMFormatORM/mean": 0.6012499809265137, + "rewards/MMFormatORM/std": 0.15380690693855287, + "rewards/MMRubricORM/mean": -0.075, + "rewards/MMRubricORM/std": 0.23662601709365844, + "step": 4140, + "train_speed(iter/s)": 0.082857 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 319.8, + "completions/mean_length": 210.8875, + "completions/min_length": 134.0, + "epoch": 1.9899183869419108, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.15459950268268585, + "kl": 0.020318603515625, + "learning_rate": 6.949221949248408e-10, + "loss": 0.0008131683804094792, + "memory(GiB)": 27.09, + "reward": 0.4830499768257141, + "reward_std": 0.11561195463873446, + "rewards/MMContentORM/mean": 0.6294999957084656, + "rewards/MMContentORM/std": 0.6491626858711242, + "rewards/MMFormatORM/mean": 0.6093749880790711, + "rewards/MMFormatORM/std": 0.14190345108509064, + "rewards/MMRubricORM/mean": -0.0625, + "rewards/MMRubricORM/std": 0.21831300854682922, + "step": 4145, + "train_speed(iter/s)": 0.082861 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 300.4, + "completions/mean_length": 214.8375, + "completions/min_length": 132.2, + "epoch": 1.9923187710033605, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.1465650200843811, + "kl": 0.01693115234375, + "learning_rate": 4.0340546638040213e-10, + "loss": 0.0006770275533199311, + "memory(GiB)": 27.09, + "reward": 0.436549985408783, + "reward_std": 0.15535135762766003, + "rewards/MMContentORM/mean": 0.54200000166893, + "rewards/MMContentORM/std": 0.6834682941436767, + "rewards/MMFormatORM/mean": 0.5931249797344208, + "rewards/MMFormatORM/std": 0.1350412219762802, + "rewards/MMRubricORM/mean": -0.0875, + "rewards/MMRubricORM/std": 0.2077557325363159, + "step": 4150, + "train_speed(iter/s)": 0.082874 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0125, + "completions/max_length": 492.8, + "completions/mean_length": 222.3125, + "completions/min_length": 123.2, + "epoch": 1.9947191550648102, + "frac_reward_zero_std": 0.675, + "grad_norm": 0.08558174967765808, + "kl": 0.0202880859375, + "learning_rate": 1.9067346707202227e-10, + "loss": 0.0008112492971122265, + "memory(GiB)": 27.09, + "reward": 0.4795499801635742, + "reward_std": 0.16411948413588107, + "rewards/MMContentORM/mean": 0.6495000183582306, + "rewards/MMContentORM/std": 0.5548809096217155, + "rewards/MMFormatORM/mean": 0.5931249856948853, + "rewards/MMFormatORM/std": 0.16180812418460847, + "rewards/MMRubricORM/mean": -0.0875, + "rewards/MMRubricORM/std": 0.24893558621406556, + "step": 4155, + "train_speed(iter/s)": 0.082842 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 325.4, + "completions/mean_length": 214.2625, + "completions/min_length": 151.0, + "epoch": 1.9971195391262602, + "frac_reward_zero_std": 0.7, + "grad_norm": 0.1454610973596573, + "kl": 0.015252685546875, + "learning_rate": 5.672954927593566e-11, + "loss": 0.0006094192154705525, + "memory(GiB)": 27.09, + "reward": 0.5129999816417694, + "reward_std": 0.08400428430177272, + "rewards/MMContentORM/mean": 0.6900000154972077, + "rewards/MMContentORM/std": 0.49128730222582817, + "rewards/MMFormatORM/mean": 0.6174999833106994, + "rewards/MMFormatORM/std": 0.10940345227718354, + "rewards/MMRubricORM/mean": -0.05, + "rewards/MMRubricORM/std": 0.16831300854682923, + "step": 4160, + "train_speed(iter/s)": 0.082842 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 307.2, + "completions/mean_length": 208.1375, + "completions/min_length": 121.0, + "epoch": 1.99951992318771, + "frac_reward_zero_std": 0.725, + "grad_norm": 0.14134319126605988, + "kl": 0.018817138671875, + "learning_rate": 1.5758237104090968e-12, + "loss": 0.0007528647780418396, + "memory(GiB)": 27.09, + "reward": 0.478799968957901, + "reward_std": 0.0989949492039159, + "rewards/MMContentORM/mean": 0.6045000076293945, + "rewards/MMContentORM/std": 0.6013319611549377, + "rewards/MMFormatORM/mean": 0.6174999713897705, + "rewards/MMFormatORM/std": 0.08880690932273864, + "rewards/MMRubricORM/mean": -0.05, + "rewards/MMRubricORM/std": 0.13662601709365846, + "step": 4165, + "train_speed(iter/s)": 0.082852 + } + ], + "logging_steps": 5, + "max_steps": 4166, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}