{ "epoch": 1, "n_tokens": 5002240, "global_step": 4885, "training_metrics": { "train/loss": 2.515625, "train/contrastive": 2.421875, "train/recons_loss": 0.65234375, "train/balance_loss": 3.84375, "train/balance_loss_contrastive": 2.796875, "train/balance_loss_recons": 1.0390625, "train/contrastive_std": 3.296875, "train/recons_std": 0.1279296875, "train/contrastive_min": 0.10791015625, "train/contrastive_max": 7.0, "train/recons_min": 0.546875, "train/recons_max": 0.89453125, "train/Qwen3_0.6B_layer_2": 0.89453125, "train/Qwen3_0.6B_layer_4": 0.58984375, "train/Qwen3_1.7B_layer_2": 0.578125, "train/Qwen3_1.7B_layer_4": 0.69140625, "train/Qwen3_4B_layer_2": 0.546875, "train/Qwen3_4B_layer_4": 0.61328125, "train/contrastives": null, "train/epoch": 1, "train/n_tokens": 5002240, "train/step": 4885 }, "eval_metrics": { "global_step": 4885, "n_tokens": 5002240, "kl_divergence": { "Qwen3_0.6B_layer_2_to_Qwen3_0.6B_layer_2": 9.649429321289062, "Qwen3_0.6B_layer_2_to_Qwen3_0.6B_layer_4": 8.296281814575195, "Qwen3_0.6B_layer_2_to_Qwen3_1.7B_layer_2": 8.075584411621094, "Qwen3_0.6B_layer_2_to_Qwen3_1.7B_layer_4": 8.38884162902832, "Qwen3_0.6B_layer_2_to_Qwen3_4B_layer_2": 8.30383014678955, "Qwen3_0.6B_layer_2_to_Qwen3_4B_layer_4": 8.307902336120605, "Qwen3_0.6B_layer_2_to_uniform": 9.070773124694824, "Qwen3_0.6B_layer_4_to_Qwen3_0.6B_layer_2": 5.658719062805176, "Qwen3_0.6B_layer_4_to_Qwen3_0.6B_layer_4": 2.32064151763916, "Qwen3_0.6B_layer_4_to_Qwen3_1.7B_layer_2": 2.324888229370117, "Qwen3_0.6B_layer_4_to_Qwen3_1.7B_layer_4": 2.3024439811706543, "Qwen3_0.6B_layer_4_to_Qwen3_4B_layer_2": 2.259655714035034, "Qwen3_0.6B_layer_4_to_Qwen3_4B_layer_4": 2.363274574279785, "Qwen3_0.6B_layer_4_to_uniform": 9.070773124694824, "Qwen3_1.7B_layer_2_to_Qwen3_0.6B_layer_2": 12.87409782409668, "Qwen3_1.7B_layer_2_to_Qwen3_0.6B_layer_4": 6.341550350189209, "Qwen3_1.7B_layer_2_to_Qwen3_1.7B_layer_2": 5.935274600982666, "Qwen3_1.7B_layer_2_to_Qwen3_1.7B_layer_4": 6.224505424499512, "Qwen3_1.7B_layer_2_to_Qwen3_4B_layer_2": 6.194956302642822, "Qwen3_1.7B_layer_2_to_Qwen3_4B_layer_4": 6.298384189605713, "Qwen3_1.7B_layer_2_to_uniform": 9.88111686706543, "Qwen3_1.7B_layer_4_to_Qwen3_0.6B_layer_2": 5.595909595489502, "Qwen3_1.7B_layer_4_to_Qwen3_0.6B_layer_4": 2.6820993423461914, "Qwen3_1.7B_layer_4_to_Qwen3_1.7B_layer_2": 2.6489415168762207, "Qwen3_1.7B_layer_4_to_Qwen3_1.7B_layer_4": 2.6376867294311523, "Qwen3_1.7B_layer_4_to_Qwen3_4B_layer_2": 2.660701274871826, "Qwen3_1.7B_layer_4_to_Qwen3_4B_layer_4": 2.7599291801452637, "Qwen3_1.7B_layer_4_to_uniform": 9.88111686706543, "Qwen3_4B_layer_2_to_Qwen3_0.6B_layer_2": 7.069366455078125, "Qwen3_4B_layer_2_to_Qwen3_0.6B_layer_4": 2.4930596351623535, "Qwen3_4B_layer_2_to_Qwen3_1.7B_layer_2": 2.338548421859741, "Qwen3_4B_layer_2_to_Qwen3_1.7B_layer_4": 2.4058313369750977, "Qwen3_4B_layer_2_to_Qwen3_4B_layer_2": 2.42093825340271, "Qwen3_4B_layer_2_to_Qwen3_4B_layer_4": 2.3004584312438965, "Qwen3_4B_layer_2_to_uniform": 10.104096412658691, "Qwen3_4B_layer_4_to_Qwen3_0.6B_layer_2": 6.583094120025635, "Qwen3_4B_layer_4_to_Qwen3_0.6B_layer_4": 3.2262253761291504, "Qwen3_4B_layer_4_to_Qwen3_1.7B_layer_2": 3.1968085765838623, "Qwen3_4B_layer_4_to_Qwen3_1.7B_layer_4": 3.333820343017578, "Qwen3_4B_layer_4_to_Qwen3_4B_layer_2": 3.3650805950164795, "Qwen3_4B_layer_4_to_Qwen3_4B_layer_4": 3.0115933418273926, "Qwen3_4B_layer_4_to_uniform": 10.104096412658691 }, "mae_hidden_states": { "Qwen3_0.6B_layer_2_to_Qwen3_0.6B_layer_2": 3.5094382762908936, "Qwen3_0.6B_layer_4_to_Qwen3_0.6B_layer_2": 2.214106798171997, "Qwen3_1.7B_layer_2_to_Qwen3_0.6B_layer_2": 2.274066925048828, "Qwen3_1.7B_layer_4_to_Qwen3_0.6B_layer_2": 2.3673298358917236, "Qwen3_4B_layer_2_to_Qwen3_0.6B_layer_2": 2.390550136566162, "Qwen3_4B_layer_4_to_Qwen3_0.6B_layer_2": 2.258884906768799, "Qwen3_0.6B_layer_2_to_Qwen3_0.6B_layer_4": 3.714834690093994, "Qwen3_0.6B_layer_4_to_Qwen3_0.6B_layer_4": 1.0173228979110718, "Qwen3_1.7B_layer_2_to_Qwen3_0.6B_layer_4": 1.0794646739959717, "Qwen3_1.7B_layer_4_to_Qwen3_0.6B_layer_4": 1.068953275680542, "Qwen3_4B_layer_2_to_Qwen3_0.6B_layer_4": 1.0350369215011597, "Qwen3_4B_layer_4_to_Qwen3_0.6B_layer_4": 1.035721778869629, "Qwen3_0.6B_layer_2_to_Qwen3_1.7B_layer_2": 3.772433042526245, "Qwen3_0.6B_layer_4_to_Qwen3_1.7B_layer_2": 1.0048426389694214, "Qwen3_1.7B_layer_2_to_Qwen3_1.7B_layer_2": 1.02411949634552, "Qwen3_1.7B_layer_4_to_Qwen3_1.7B_layer_2": 1.0449342727661133, "Qwen3_4B_layer_2_to_Qwen3_1.7B_layer_2": 1.0301669836044312, "Qwen3_4B_layer_4_to_Qwen3_1.7B_layer_2": 1.027207612991333, "Qwen3_0.6B_layer_2_to_Qwen3_1.7B_layer_4": 2.8531670570373535, "Qwen3_0.6B_layer_4_to_Qwen3_1.7B_layer_4": 1.316890001296997, "Qwen3_1.7B_layer_2_to_Qwen3_1.7B_layer_4": 1.3466463088989258, "Qwen3_1.7B_layer_4_to_Qwen3_1.7B_layer_4": 1.3155006170272827, "Qwen3_4B_layer_2_to_Qwen3_1.7B_layer_4": 1.3215066194534302, "Qwen3_4B_layer_4_to_Qwen3_1.7B_layer_4": 1.3193069696426392, "Qwen3_0.6B_layer_2_to_Qwen3_4B_layer_2": 2.85866117477417, "Qwen3_0.6B_layer_4_to_Qwen3_4B_layer_2": 1.078108787536621, "Qwen3_1.7B_layer_2_to_Qwen3_4B_layer_2": 1.092585802078247, "Qwen3_1.7B_layer_4_to_Qwen3_4B_layer_2": 1.0788657665252686, "Qwen3_4B_layer_2_to_Qwen3_4B_layer_2": 1.037369728088379, "Qwen3_4B_layer_4_to_Qwen3_4B_layer_2": 1.057699203491211, "Qwen3_0.6B_layer_2_to_Qwen3_4B_layer_4": 3.228097438812256, "Qwen3_0.6B_layer_4_to_Qwen3_4B_layer_4": 1.1583912372589111, "Qwen3_1.7B_layer_2_to_Qwen3_4B_layer_4": 1.1884386539459229, "Qwen3_1.7B_layer_4_to_Qwen3_4B_layer_4": 1.1869480609893799, "Qwen3_4B_layer_2_to_Qwen3_4B_layer_4": 1.176148772239685, "Qwen3_4B_layer_4_to_Qwen3_4B_layer_4": 1.147931694984436 }, "alignment": { "Qwen3_0.6B_layer_2_to_Qwen3_0.6B_layer_4": { "mse": 0.9609375, "mean_cosine_similarity": 0.5078125, "std_cosine_similarity": 0.2080078125, "mean_l2_distance": 49.25, "std_l2_distance": 10.375, "mean_dimension_correlation": 0.656490707397461, "std_dimension_correlation": 0.0958554699318414, "linear_cka": 0.78125 }, "Qwen3_0.6B_layer_2_to_Qwen3_1.7B_layer_2": { "mse": 0.9609375, "mean_cosine_similarity": 0.50390625, "std_cosine_similarity": 0.208984375, "mean_l2_distance": 49.25, "std_l2_distance": 10.4375, "mean_dimension_correlation": 0.6543472290039063, "std_dimension_correlation": 0.09762869101522263, "linear_cka": 0.78125 }, "Qwen3_0.6B_layer_2_to_Qwen3_1.7B_layer_4": { "mse": 0.9609375, "mean_cosine_similarity": 0.5078125, "std_cosine_similarity": 0.2080078125, "mean_l2_distance": 49.25, "std_l2_distance": 10.375, "mean_dimension_correlation": 0.6573211669921875, "std_dimension_correlation": 0.09694915743193826, "linear_cka": 0.76953125 }, "Qwen3_0.6B_layer_2_to_Qwen3_4B_layer_2": { "mse": 0.97265625, "mean_cosine_similarity": 0.498046875, "std_cosine_similarity": 0.20703125, "mean_l2_distance": 49.75, "std_l2_distance": 10.25, "mean_dimension_correlation": 0.6492362976074219, "std_dimension_correlation": 0.09888349567655164, "linear_cka": 0.78125 }, "Qwen3_0.6B_layer_2_to_Qwen3_4B_layer_4": { "mse": 0.96484375, "mean_cosine_similarity": 0.50390625, "std_cosine_similarity": 0.2099609375, "mean_l2_distance": 49.5, "std_l2_distance": 10.4375, "mean_dimension_correlation": 0.6535564422607422, "std_dimension_correlation": 0.09991360994754159, "linear_cka": 0.7734375 }, "Qwen3_0.6B_layer_4_to_Qwen3_0.6B_layer_2": { "mse": 0.9609375, "mean_cosine_similarity": 0.5078125, "std_cosine_similarity": 0.2080078125, "mean_l2_distance": 49.25, "std_l2_distance": 10.375, "mean_dimension_correlation": 0.6565372467041015, "std_dimension_correlation": 0.09589836585224641, "linear_cka": 0.78125 }, "Qwen3_0.6B_layer_4_to_Qwen3_1.7B_layer_2": { "mse": 0.39453125, "mean_cosine_similarity": 0.8671875, "std_cosine_similarity": 0.212890625, "mean_l2_distance": 20.25, "std_l2_distance": 16.875, "mean_dimension_correlation": 0.8475028991699218, "std_dimension_correlation": 0.0568063510608605, "linear_cka": 0.984375 }, "Qwen3_0.6B_layer_4_to_Qwen3_1.7B_layer_4": { "mse": 0.388671875, "mean_cosine_similarity": 0.87109375, "std_cosine_similarity": 0.2138671875, "mean_l2_distance": 19.875, "std_l2_distance": 17.0, "mean_dimension_correlation": 0.8486709594726562, "std_dimension_correlation": 0.05617761489018725, "linear_cka": 0.984375 }, "Qwen3_0.6B_layer_4_to_Qwen3_4B_layer_2": { "mse": 0.404296875, "mean_cosine_similarity": 0.859375, "std_cosine_similarity": 0.23046875, "mean_l2_distance": 20.75, "std_l2_distance": 17.75, "mean_dimension_correlation": 0.837774658203125, "std_dimension_correlation": 0.06036525651201483, "linear_cka": 0.984375 }, "Qwen3_0.6B_layer_4_to_Qwen3_4B_layer_4": { "mse": 0.3984375, "mean_cosine_similarity": 0.859375, "std_cosine_similarity": 0.2294921875, "mean_l2_distance": 20.375, "std_l2_distance": 17.75, "mean_dimension_correlation": 0.8405960083007813, "std_dimension_correlation": 0.06082946585887382, "linear_cka": 0.98828125 }, "Qwen3_1.7B_layer_2_to_Qwen3_0.6B_layer_2": { "mse": 0.9609375, "mean_cosine_similarity": 0.50390625, "std_cosine_similarity": 0.208984375, "mean_l2_distance": 49.25, "std_l2_distance": 10.4375, "mean_dimension_correlation": 0.6544017791748047, "std_dimension_correlation": 0.09771714306924677, "linear_cka": 0.78125 }, "Qwen3_1.7B_layer_2_to_Qwen3_0.6B_layer_4": { "mse": 0.39453125, "mean_cosine_similarity": 0.8671875, "std_cosine_similarity": 0.212890625, "mean_l2_distance": 20.25, "std_l2_distance": 16.875, "mean_dimension_correlation": 0.8475006103515625, "std_dimension_correlation": 0.05680066543049433, "linear_cka": 0.984375 }, "Qwen3_1.7B_layer_2_to_Qwen3_1.7B_layer_4": { "mse": 0.38671875, "mean_cosine_similarity": 0.86328125, "std_cosine_similarity": 0.2255859375, "mean_l2_distance": 19.875, "std_l2_distance": 17.75, "mean_dimension_correlation": 0.8454933166503906, "std_dimension_correlation": 0.05831595958380969, "linear_cka": 0.9921875 }, "Qwen3_1.7B_layer_2_to_Qwen3_4B_layer_2": { "mse": 0.396484375, "mean_cosine_similarity": 0.859375, "std_cosine_similarity": 0.2275390625, "mean_l2_distance": 20.25, "std_l2_distance": 17.75, "mean_dimension_correlation": 0.840838623046875, "std_dimension_correlation": 0.06208702710996684, "linear_cka": 0.984375 }, "Qwen3_1.7B_layer_2_to_Qwen3_4B_layer_4": { "mse": 0.37890625, "mean_cosine_similarity": 0.875, "std_cosine_similarity": 0.208984375, "mean_l2_distance": 19.375, "std_l2_distance": 16.875, "mean_dimension_correlation": 0.8549148559570312, "std_dimension_correlation": 0.05559449933392313, "linear_cka": 0.98828125 }, "Qwen3_1.7B_layer_4_to_Qwen3_0.6B_layer_2": { "mse": 0.9609375, "mean_cosine_similarity": 0.5078125, "std_cosine_similarity": 0.2080078125, "mean_l2_distance": 49.25, "std_l2_distance": 10.375, "mean_dimension_correlation": 0.6572914123535156, "std_dimension_correlation": 0.09700915659232981, "linear_cka": 0.76953125 }, "Qwen3_1.7B_layer_4_to_Qwen3_0.6B_layer_4": { "mse": 0.388671875, "mean_cosine_similarity": 0.87109375, "std_cosine_similarity": 0.2138671875, "mean_l2_distance": 19.875, "std_l2_distance": 17.0, "mean_dimension_correlation": 0.8486358642578125, "std_dimension_correlation": 0.05619399135067562, "linear_cka": 0.984375 }, "Qwen3_1.7B_layer_4_to_Qwen3_1.7B_layer_2": { "mse": 0.38671875, "mean_cosine_similarity": 0.86328125, "std_cosine_similarity": 0.2255859375, "mean_l2_distance": 19.875, "std_l2_distance": 17.75, "mean_dimension_correlation": 0.8454788208007813, "std_dimension_correlation": 0.05838818713320479, "linear_cka": 0.9921875 }, "Qwen3_1.7B_layer_4_to_Qwen3_4B_layer_2": { "mse": 0.396484375, "mean_cosine_similarity": 0.86328125, "std_cosine_similarity": 0.2255859375, "mean_l2_distance": 20.375, "std_l2_distance": 17.625, "mean_dimension_correlation": 0.8422294616699219, "std_dimension_correlation": 0.058264678286021505, "linear_cka": 0.984375 }, "Qwen3_1.7B_layer_4_to_Qwen3_4B_layer_4": { "mse": 0.37890625, "mean_cosine_similarity": 0.87109375, "std_cosine_similarity": 0.216796875, "mean_l2_distance": 19.5, "std_l2_distance": 17.25, "mean_dimension_correlation": 0.8510147094726562, "std_dimension_correlation": 0.05579116262302701, "linear_cka": 0.98828125 }, "Qwen3_4B_layer_2_to_Qwen3_0.6B_layer_2": { "mse": 0.97265625, "mean_cosine_similarity": 0.498046875, "std_cosine_similarity": 0.20703125, "mean_l2_distance": 49.75, "std_l2_distance": 10.25, "mean_dimension_correlation": 0.6493141174316406, "std_dimension_correlation": 0.09894686111103042, "linear_cka": 0.78125 }, "Qwen3_4B_layer_2_to_Qwen3_0.6B_layer_4": { "mse": 0.404296875, "mean_cosine_similarity": 0.859375, "std_cosine_similarity": 0.23046875, "mean_l2_distance": 20.75, "std_l2_distance": 17.75, "mean_dimension_correlation": 0.837823486328125, "std_dimension_correlation": 0.060340047867445096, "linear_cka": 0.984375 }, "Qwen3_4B_layer_2_to_Qwen3_1.7B_layer_2": { "mse": 0.396484375, "mean_cosine_similarity": 0.859375, "std_cosine_similarity": 0.2275390625, "mean_l2_distance": 20.25, "std_l2_distance": 17.75, "mean_dimension_correlation": 0.8408378601074219, "std_dimension_correlation": 0.06205699551364581, "linear_cka": 0.984375 }, "Qwen3_4B_layer_2_to_Qwen3_1.7B_layer_4": { "mse": 0.396484375, "mean_cosine_similarity": 0.86328125, "std_cosine_similarity": 0.2255859375, "mean_l2_distance": 20.375, "std_l2_distance": 17.625, "mean_dimension_correlation": 0.8422599792480469, "std_dimension_correlation": 0.05822862763556895, "linear_cka": 0.984375 }, "Qwen3_4B_layer_2_to_Qwen3_4B_layer_4": { "mse": 0.384765625, "mean_cosine_similarity": 0.86328125, "std_cosine_similarity": 0.2314453125, "mean_l2_distance": 19.75, "std_l2_distance": 18.25, "mean_dimension_correlation": 0.842401123046875, "std_dimension_correlation": 0.06118176940514725, "linear_cka": 0.98828125 }, "Qwen3_4B_layer_4_to_Qwen3_0.6B_layer_2": { "mse": 0.96484375, "mean_cosine_similarity": 0.50390625, "std_cosine_similarity": 0.2099609375, "mean_l2_distance": 49.5, "std_l2_distance": 10.4375, "mean_dimension_correlation": 0.6536331176757812, "std_dimension_correlation": 0.09992718456511775, "linear_cka": 0.7734375 }, "Qwen3_4B_layer_4_to_Qwen3_0.6B_layer_4": { "mse": 0.3984375, "mean_cosine_similarity": 0.859375, "std_cosine_similarity": 0.2294921875, "mean_l2_distance": 20.375, "std_l2_distance": 17.75, "mean_dimension_correlation": 0.8406883239746094, "std_dimension_correlation": 0.06084754257089133, "linear_cka": 0.98828125 }, "Qwen3_4B_layer_4_to_Qwen3_1.7B_layer_2": { "mse": 0.37890625, "mean_cosine_similarity": 0.875, "std_cosine_similarity": 0.208984375, "mean_l2_distance": 19.375, "std_l2_distance": 16.875, "mean_dimension_correlation": 0.8549110412597656, "std_dimension_correlation": 0.05561355528314661, "linear_cka": 0.98828125 }, "Qwen3_4B_layer_4_to_Qwen3_1.7B_layer_4": { "mse": 0.37890625, "mean_cosine_similarity": 0.87109375, "std_cosine_similarity": 0.216796875, "mean_l2_distance": 19.5, "std_l2_distance": 17.25, "mean_dimension_correlation": 0.851116943359375, "std_dimension_correlation": 0.05580195396667784, "linear_cka": 0.98828125 }, "Qwen3_4B_layer_4_to_Qwen3_4B_layer_2": { "mse": 0.384765625, "mean_cosine_similarity": 0.86328125, "std_cosine_similarity": 0.2314453125, "mean_l2_distance": 19.75, "std_l2_distance": 18.25, "mean_dimension_correlation": 0.8424659729003906, "std_dimension_correlation": 0.06116790445559799, "linear_cka": 0.98828125 }, "avg_mse": 0.5819010416666667, "std_mse": 0.27032309960351786, "avg_mean_cosine_similarity": 0.744921875, "std_mean_cosine_similarity": 0.17021541336381552, "avg_std_cosine_similarity": 0.21764322916666667, "std_std_cosine_similarity": 0.00918084175662793, "avg_mean_l2_distance": 29.825, "std_mean_l2_distance": 13.846163728628952, "avg_std_l2_distance": 15.116666666666667, "std_std_l2_distance": 3.3721335564034565, "avg_mean_dimension_correlation": 0.781509501139323, "std_mean_dimension_correlation": 0.09012204602156858, "avg_std_dimension_correlation": 0.07165274636880824, "std_std_dimension_correlation": 0.018653236358734372, "avg_linear_cka": 0.9169270833333333, "std_linear_cka": 0.09876420102706185 } } }