{ "epoch": 1.0, "eval_kl/ref_to_policy/chosen": -17.542104721069336, "eval_kl/ref_to_policy/mean": 39.619388580322266, "eval_kl/ref_to_policy/rejected": 96.7808837890625, "eval_logits/chosen": -1.413936734199524, "eval_logits/rejected": -1.7039226293563843, "eval_logps/chosen": -1506.69384765625, "eval_logps/rejected": -1620.5035400390625, "eval_loss": 4.123634338378906, "eval_nll_loss": 0.9376209378242493, "eval_rewards/accuracies": 0.9414893388748169, "eval_rewards/chosen": 0.17542102932929993, "eval_rewards/margins": 1.1432298421859741, "eval_rewards/rejected": -0.967808723449707, "eval_runtime": 112.3415, "eval_samples_per_second": 3.347, "eval_steps_per_second": 1.673, "total_flos": 0.0, "train_loss": 4.881054094737366, "train_runtime": 20257.6196, "train_samples": 5824, "train_samples_per_second": 0.551, "train_steps_per_second": 0.034 }