{ "epoch": 1, "n_tokens": 3001344, "global_step": 2931, "training_metrics": { "train/loss": 2.515625, "train/contrastive": 2.421875, "train/recons_loss": 0.671875, "train/balance_loss": 3.75, "train/balance_loss_contrastive": 2.71875, "train/balance_loss_recons": 1.0390625, "train/contrastive_std": 3.25, "train/recons_std": 0.138671875, "train/contrastive_min": 0.146484375, "train/contrastive_max": 6.9375, "train/recons_min": 0.56640625, "train/recons_max": 0.9375, "train/Qwen3_0.6B_layer_2": 0.9375, "train/Qwen3_0.6B_layer_4": 0.59765625, "train/Qwen3_1.7B_layer_2": 0.59375, "train/Qwen3_1.7B_layer_4": 0.703125, "train/Qwen3_4B_layer_2": 0.56640625, "train/Qwen3_4B_layer_4": 0.6328125, "train/contrastives": null, "train/epoch": 1, "train/n_tokens": 3001344, "train/step": 2931 }, "eval_metrics": { "global_step": 2931, "n_tokens": 3001344, "kl_divergence": { "Qwen3_0.6B_layer_2_to_Qwen3_0.6B_layer_2": 11.318835258483887, "Qwen3_0.6B_layer_2_to_Qwen3_0.6B_layer_4": 9.138021469116211, "Qwen3_0.6B_layer_2_to_Qwen3_1.7B_layer_2": 9.61973762512207, "Qwen3_0.6B_layer_2_to_Qwen3_1.7B_layer_4": 9.007281303405762, "Qwen3_0.6B_layer_2_to_Qwen3_4B_layer_2": 8.960853576660156, "Qwen3_0.6B_layer_2_to_Qwen3_4B_layer_4": 8.789403915405273, "Qwen3_0.6B_layer_2_to_uniform": 9.070773124694824, "Qwen3_0.6B_layer_4_to_Qwen3_0.6B_layer_2": 7.3046698570251465, "Qwen3_0.6B_layer_4_to_Qwen3_0.6B_layer_4": 2.55082368850708, "Qwen3_0.6B_layer_4_to_Qwen3_1.7B_layer_2": 2.5602962970733643, "Qwen3_0.6B_layer_4_to_Qwen3_1.7B_layer_4": 2.592942714691162, "Qwen3_0.6B_layer_4_to_Qwen3_4B_layer_2": 2.588857650756836, "Qwen3_0.6B_layer_4_to_Qwen3_4B_layer_4": 2.6625943183898926, "Qwen3_0.6B_layer_4_to_uniform": 9.070773124694824, "Qwen3_1.7B_layer_2_to_Qwen3_0.6B_layer_2": 10.131369590759277, "Qwen3_1.7B_layer_2_to_Qwen3_0.6B_layer_4": 5.891963481903076, "Qwen3_1.7B_layer_2_to_Qwen3_1.7B_layer_2": 6.430274963378906, "Qwen3_1.7B_layer_2_to_Qwen3_1.7B_layer_4": 6.0684638023376465, "Qwen3_1.7B_layer_2_to_Qwen3_4B_layer_2": 5.9689507484436035, "Qwen3_1.7B_layer_2_to_Qwen3_4B_layer_4": 6.356847286224365, "Qwen3_1.7B_layer_2_to_uniform": 9.88111686706543, "Qwen3_1.7B_layer_4_to_Qwen3_0.6B_layer_2": 8.19615364074707, "Qwen3_1.7B_layer_4_to_Qwen3_0.6B_layer_4": 2.8310694694519043, "Qwen3_1.7B_layer_4_to_Qwen3_1.7B_layer_2": 2.7546491622924805, "Qwen3_1.7B_layer_4_to_Qwen3_1.7B_layer_4": 2.7474663257598877, "Qwen3_1.7B_layer_4_to_Qwen3_4B_layer_2": 2.857220411300659, "Qwen3_1.7B_layer_4_to_Qwen3_4B_layer_4": 2.925436019897461, "Qwen3_1.7B_layer_4_to_uniform": 9.88111686706543, "Qwen3_4B_layer_2_to_Qwen3_0.6B_layer_2": 7.565979957580566, "Qwen3_4B_layer_2_to_Qwen3_0.6B_layer_4": 2.9663586616516113, "Qwen3_4B_layer_2_to_Qwen3_1.7B_layer_2": 2.719478130340576, "Qwen3_4B_layer_2_to_Qwen3_1.7B_layer_4": 2.741952657699585, "Qwen3_4B_layer_2_to_Qwen3_4B_layer_2": 2.7755935192108154, "Qwen3_4B_layer_2_to_Qwen3_4B_layer_4": 2.7375831604003906, "Qwen3_4B_layer_2_to_uniform": 10.104096412658691, "Qwen3_4B_layer_4_to_Qwen3_0.6B_layer_2": 7.4653778076171875, "Qwen3_4B_layer_4_to_Qwen3_0.6B_layer_4": 3.67035174369812, "Qwen3_4B_layer_4_to_Qwen3_1.7B_layer_2": 3.566011905670166, "Qwen3_4B_layer_4_to_Qwen3_1.7B_layer_4": 3.7160496711730957, "Qwen3_4B_layer_4_to_Qwen3_4B_layer_2": 3.552424907684326, "Qwen3_4B_layer_4_to_Qwen3_4B_layer_4": 3.459855556488037, "Qwen3_4B_layer_4_to_uniform": 10.104096412658691 }, "mae_hidden_states": { "Qwen3_0.6B_layer_2_to_Qwen3_0.6B_layer_2": 15.485275268554688, "Qwen3_0.6B_layer_4_to_Qwen3_0.6B_layer_2": 22.359243392944336, "Qwen3_1.7B_layer_2_to_Qwen3_0.6B_layer_2": 21.841341018676758, "Qwen3_1.7B_layer_4_to_Qwen3_0.6B_layer_2": 20.851577758789062, "Qwen3_4B_layer_2_to_Qwen3_0.6B_layer_2": 23.41849136352539, "Qwen3_4B_layer_4_to_Qwen3_0.6B_layer_2": 22.13389015197754, "Qwen3_0.6B_layer_2_to_Qwen3_0.6B_layer_4": 8.68209457397461, "Qwen3_0.6B_layer_4_to_Qwen3_0.6B_layer_4": 1.0910885334014893, "Qwen3_1.7B_layer_2_to_Qwen3_0.6B_layer_4": 1.0663363933563232, "Qwen3_1.7B_layer_4_to_Qwen3_0.6B_layer_4": 1.1295608282089233, "Qwen3_4B_layer_2_to_Qwen3_0.6B_layer_4": 1.096497654914856, "Qwen3_4B_layer_4_to_Qwen3_0.6B_layer_4": 1.0976781845092773, "Qwen3_0.6B_layer_2_to_Qwen3_1.7B_layer_2": 9.745889663696289, "Qwen3_0.6B_layer_4_to_Qwen3_1.7B_layer_2": 1.073387622833252, "Qwen3_1.7B_layer_2_to_Qwen3_1.7B_layer_2": 1.0651912689208984, "Qwen3_1.7B_layer_4_to_Qwen3_1.7B_layer_2": 1.1097475290298462, "Qwen3_4B_layer_2_to_Qwen3_1.7B_layer_2": 1.102055311203003, "Qwen3_4B_layer_4_to_Qwen3_1.7B_layer_2": 1.1042507886886597, "Qwen3_0.6B_layer_2_to_Qwen3_1.7B_layer_4": 6.085488319396973, "Qwen3_0.6B_layer_4_to_Qwen3_1.7B_layer_4": 1.443469762802124, "Qwen3_1.7B_layer_2_to_Qwen3_1.7B_layer_4": 1.407573938369751, "Qwen3_1.7B_layer_4_to_Qwen3_1.7B_layer_4": 1.394163966178894, "Qwen3_4B_layer_2_to_Qwen3_1.7B_layer_4": 1.4274914264678955, "Qwen3_4B_layer_4_to_Qwen3_1.7B_layer_4": 1.423639178276062, "Qwen3_0.6B_layer_2_to_Qwen3_4B_layer_2": 6.723683834075928, "Qwen3_0.6B_layer_4_to_Qwen3_4B_layer_2": 1.2199777364730835, "Qwen3_1.7B_layer_2_to_Qwen3_4B_layer_2": 1.1646456718444824, "Qwen3_1.7B_layer_4_to_Qwen3_4B_layer_2": 1.1640838384628296, "Qwen3_4B_layer_2_to_Qwen3_4B_layer_2": 1.1155877113342285, "Qwen3_4B_layer_4_to_Qwen3_4B_layer_2": 1.1568272113800049, "Qwen3_0.6B_layer_2_to_Qwen3_4B_layer_4": 8.499415397644043, "Qwen3_0.6B_layer_4_to_Qwen3_4B_layer_4": 1.2985743284225464, "Qwen3_1.7B_layer_2_to_Qwen3_4B_layer_4": 1.2958557605743408, "Qwen3_1.7B_layer_4_to_Qwen3_4B_layer_4": 1.2903549671173096, "Qwen3_4B_layer_2_to_Qwen3_4B_layer_4": 1.2823046445846558, "Qwen3_4B_layer_4_to_Qwen3_4B_layer_4": 1.2616506814956665 }, "alignment": { "Qwen3_0.6B_layer_2_to_Qwen3_0.6B_layer_4": { "mse": 1.421875, "mean_cosine_similarity": -0.03369140625, "std_cosine_similarity": 0.109375, "mean_l2_distance": 72.5, "std_l2_distance": 3.90625, "mean_dimension_correlation": 0.254237837344408, "std_dimension_correlation": 0.16181929189675745, "linear_cka": 0.5859375 }, "Qwen3_0.6B_layer_2_to_Qwen3_1.7B_layer_2": { "mse": 1.421875, "mean_cosine_similarity": -0.0284423828125, "std_cosine_similarity": 0.10888671875, "mean_l2_distance": 72.5, "std_l2_distance": 3.890625, "mean_dimension_correlation": 0.25683254674077033, "std_dimension_correlation": 0.16029215327593901, "linear_cka": 0.57421875 }, "Qwen3_0.6B_layer_2_to_Qwen3_1.7B_layer_4": { "mse": 1.4140625, "mean_cosine_similarity": -0.0252685546875, "std_cosine_similarity": 0.1083984375, "mean_l2_distance": 72.5, "std_l2_distance": 3.875, "mean_dimension_correlation": 0.25395019352436066, "std_dimension_correlation": 0.15926056622745546, "linear_cka": 0.578125 }, "Qwen3_0.6B_layer_2_to_Qwen3_4B_layer_2": { "mse": 1.421875, "mean_cosine_similarity": -0.03271484375, "std_cosine_similarity": 0.1064453125, "mean_l2_distance": 72.5, "std_l2_distance": 3.796875, "mean_dimension_correlation": 0.24886183738708495, "std_dimension_correlation": 0.15849261736593726, "linear_cka": 0.55859375 }, "Qwen3_0.6B_layer_2_to_Qwen3_4B_layer_4": { "mse": 1.421875, "mean_cosine_similarity": -0.033203125, "std_cosine_similarity": 0.109375, "mean_l2_distance": 72.5, "std_l2_distance": 3.890625, "mean_dimension_correlation": 0.256584095954895, "std_dimension_correlation": 0.15873214025442897, "linear_cka": 0.57421875 }, "Qwen3_0.6B_layer_4_to_Qwen3_0.6B_layer_2": { "mse": 1.4296875, "mean_cosine_similarity": -0.03369140625, "std_cosine_similarity": 0.109375, "mean_l2_distance": 72.5, "std_l2_distance": 3.90625, "mean_dimension_correlation": 0.2542317323386669, "std_dimension_correlation": 0.16183266276519212, "linear_cka": 0.5859375 }, "Qwen3_0.6B_layer_4_to_Qwen3_1.7B_layer_2": { "mse": 0.734375, "mean_cosine_similarity": 0.65625, "std_cosine_similarity": 0.28515625, "mean_l2_distance": 37.25, "std_l2_distance": 19.5, "mean_dimension_correlation": 0.6187647342681885, "std_dimension_correlation": 0.11470426666838326, "linear_cka": 0.984375 }, "Qwen3_0.6B_layer_4_to_Qwen3_1.7B_layer_4": { "mse": 0.7265625, "mean_cosine_similarity": 0.66015625, "std_cosine_similarity": 0.279296875, "mean_l2_distance": 37.0, "std_l2_distance": 19.25, "mean_dimension_correlation": 0.6220208525657653, "std_dimension_correlation": 0.11040039509848326, "linear_cka": 0.984375 }, "Qwen3_0.6B_layer_4_to_Qwen3_4B_layer_2": { "mse": 0.76171875, "mean_cosine_similarity": 0.62890625, "std_cosine_similarity": 0.302734375, "mean_l2_distance": 38.75, "std_l2_distance": 20.125, "mean_dimension_correlation": 0.592758321762085, "std_dimension_correlation": 0.11886540241980308, "linear_cka": 0.98046875 }, "Qwen3_0.6B_layer_4_to_Qwen3_4B_layer_4": { "mse": 0.74609375, "mean_cosine_similarity": 0.63671875, "std_cosine_similarity": 0.302734375, "mean_l2_distance": 38.25, "std_l2_distance": 20.25, "mean_dimension_correlation": 0.6037769317626953, "std_dimension_correlation": 0.11647753822253991, "linear_cka": 0.984375 }, "Qwen3_1.7B_layer_2_to_Qwen3_0.6B_layer_2": { "mse": 1.4296875, "mean_cosine_similarity": -0.0284423828125, "std_cosine_similarity": 0.10888671875, "mean_l2_distance": 72.5, "std_l2_distance": 3.890625, "mean_dimension_correlation": 0.25684744566679, "std_dimension_correlation": 0.16032274573798164, "linear_cka": 0.57421875 }, "Qwen3_1.7B_layer_2_to_Qwen3_0.6B_layer_4": { "mse": 0.734375, "mean_cosine_similarity": 0.65625, "std_cosine_similarity": 0.28515625, "mean_l2_distance": 37.25, "std_l2_distance": 19.5, "mean_dimension_correlation": 0.6187384128570557, "std_dimension_correlation": 0.11471572089316741, "linear_cka": 0.984375 }, "Qwen3_1.7B_layer_2_to_Qwen3_1.7B_layer_4": { "mse": 0.734375, "mean_cosine_similarity": 0.6484375, "std_cosine_similarity": 0.30078125, "mean_l2_distance": 37.5, "std_l2_distance": 20.375, "mean_dimension_correlation": 0.6119367599487304, "std_dimension_correlation": 0.1157440646478159, "linear_cka": 0.99609375 }, "Qwen3_1.7B_layer_2_to_Qwen3_4B_layer_2": { "mse": 0.75390625, "mean_cosine_similarity": 0.63671875, "std_cosine_similarity": 0.30078125, "mean_l2_distance": 38.25, "std_l2_distance": 20.375, "mean_dimension_correlation": 0.5996460914611816, "std_dimension_correlation": 0.11944124129277625, "linear_cka": 0.98046875 }, "Qwen3_1.7B_layer_2_to_Qwen3_4B_layer_4": { "mse": 0.70703125, "mean_cosine_similarity": 0.67578125, "std_cosine_similarity": 0.27734375, "mean_l2_distance": 36.0, "std_l2_distance": 19.5, "mean_dimension_correlation": 0.638215160369873, "std_dimension_correlation": 0.10975611081697591, "linear_cka": 0.984375 }, "Qwen3_1.7B_layer_4_to_Qwen3_0.6B_layer_2": { "mse": 1.4140625, "mean_cosine_similarity": -0.0252685546875, "std_cosine_similarity": 0.1083984375, "mean_l2_distance": 72.5, "std_l2_distance": 3.875, "mean_dimension_correlation": 0.25395837128162385, "std_dimension_correlation": 0.15926372552177567, "linear_cka": 0.578125 }, "Qwen3_1.7B_layer_4_to_Qwen3_0.6B_layer_4": { "mse": 0.7265625, "mean_cosine_similarity": 0.66015625, "std_cosine_similarity": 0.279296875, "mean_l2_distance": 37.0, "std_l2_distance": 19.25, "mean_dimension_correlation": 0.6219659209251404, "std_dimension_correlation": 0.11032863879333923, "linear_cka": 0.984375 }, "Qwen3_1.7B_layer_4_to_Qwen3_1.7B_layer_2": { "mse": 0.734375, "mean_cosine_similarity": 0.6484375, "std_cosine_similarity": 0.30078125, "mean_l2_distance": 37.5, "std_l2_distance": 20.375, "mean_dimension_correlation": 0.6119108200073242, "std_dimension_correlation": 0.1157383378132106, "linear_cka": 0.99609375 }, "Qwen3_1.7B_layer_4_to_Qwen3_4B_layer_2": { "mse": 0.7578125, "mean_cosine_similarity": 0.6328125, "std_cosine_similarity": 0.298828125, "mean_l2_distance": 38.5, "std_l2_distance": 20.125, "mean_dimension_correlation": 0.5979020118713378, "std_dimension_correlation": 0.1151705814719715, "linear_cka": 0.984375 }, "Qwen3_1.7B_layer_4_to_Qwen3_4B_layer_4": { "mse": 0.71875, "mean_cosine_similarity": 0.6640625, "std_cosine_similarity": 0.28125, "mean_l2_distance": 36.75, "std_l2_distance": 19.5, "mean_dimension_correlation": 0.6274345874786377, "std_dimension_correlation": 0.11253210388812478, "linear_cka": 0.98046875 }, "Qwen3_4B_layer_2_to_Qwen3_0.6B_layer_2": { "mse": 1.4296875, "mean_cosine_similarity": -0.03271484375, "std_cosine_similarity": 0.1064453125, "mean_l2_distance": 72.5, "std_l2_distance": 3.796875, "mean_dimension_correlation": 0.24887723177671434, "std_dimension_correlation": 0.15850834600861563, "linear_cka": 0.55859375 }, "Qwen3_4B_layer_2_to_Qwen3_0.6B_layer_4": { "mse": 0.76171875, "mean_cosine_similarity": 0.62890625, "std_cosine_similarity": 0.302734375, "mean_l2_distance": 38.75, "std_l2_distance": 20.125, "mean_dimension_correlation": 0.5927883148193359, "std_dimension_correlation": 0.11887853166661289, "linear_cka": 0.98046875 }, "Qwen3_4B_layer_2_to_Qwen3_1.7B_layer_2": { "mse": 0.75390625, "mean_cosine_similarity": 0.63671875, "std_cosine_similarity": 0.30078125, "mean_l2_distance": 38.25, "std_l2_distance": 20.375, "mean_dimension_correlation": 0.5995779991149902, "std_dimension_correlation": 0.1193691675179003, "linear_cka": 0.98046875 }, "Qwen3_4B_layer_2_to_Qwen3_1.7B_layer_4": { "mse": 0.7578125, "mean_cosine_similarity": 0.6328125, "std_cosine_similarity": 0.298828125, "mean_l2_distance": 38.5, "std_l2_distance": 20.125, "mean_dimension_correlation": 0.5978128433227539, "std_dimension_correlation": 0.11512506347641102, "linear_cka": 0.984375 }, "Qwen3_4B_layer_2_to_Qwen3_4B_layer_4": { "mse": 0.75390625, "mean_cosine_similarity": 0.62890625, "std_cosine_similarity": 0.3046875, "mean_l2_distance": 38.5, "std_l2_distance": 20.625, "mean_dimension_correlation": 0.5955796241760254, "std_dimension_correlation": 0.11906185378925987, "linear_cka": 0.98828125 }, "Qwen3_4B_layer_4_to_Qwen3_0.6B_layer_2": { "mse": 1.4296875, "mean_cosine_similarity": -0.033203125, "std_cosine_similarity": 0.109375, "mean_l2_distance": 72.5, "std_l2_distance": 3.890625, "mean_dimension_correlation": 0.2565764158964157, "std_dimension_correlation": 0.1587071816624074, "linear_cka": 0.57421875 }, "Qwen3_4B_layer_4_to_Qwen3_0.6B_layer_4": { "mse": 0.74609375, "mean_cosine_similarity": 0.63671875, "std_cosine_similarity": 0.302734375, "mean_l2_distance": 38.25, "std_l2_distance": 20.25, "mean_dimension_correlation": 0.6037120819091797, "std_dimension_correlation": 0.11639985412027169, "linear_cka": 0.984375 }, "Qwen3_4B_layer_4_to_Qwen3_1.7B_layer_2": { "mse": 0.70703125, "mean_cosine_similarity": 0.67578125, "std_cosine_similarity": 0.27734375, "mean_l2_distance": 36.0, "std_l2_distance": 19.5, "mean_dimension_correlation": 0.6382188320159912, "std_dimension_correlation": 0.10972459865917429, "linear_cka": 0.984375 }, "Qwen3_4B_layer_4_to_Qwen3_1.7B_layer_4": { "mse": 0.71875, "mean_cosine_similarity": 0.6640625, "std_cosine_similarity": 0.28125, "mean_l2_distance": 36.75, "std_l2_distance": 19.5, "mean_dimension_correlation": 0.6273346900939941, "std_dimension_correlation": 0.1124933006393999, "linear_cka": 0.98046875 }, "Qwen3_4B_layer_4_to_Qwen3_4B_layer_2": { "mse": 0.75390625, "mean_cosine_similarity": 0.62890625, "std_cosine_similarity": 0.3046875, "mean_l2_distance": 38.5, "std_l2_distance": 20.625, "mean_dimension_correlation": 0.5955384254455567, "std_dimension_correlation": 0.11905433194805992, "linear_cka": 0.98828125 }, "avg_mse": 0.9674479166666666, "std_mse": 0.32276016873074226, "avg_mean_cosine_similarity": 0.4210286458333333, "std_mean_cosine_similarity": 0.31965592389258923, "avg_std_cosine_similarity": 0.23173828125, "std_std_cosine_similarity": 0.08757208624967457, "avg_mean_l2_distance": 49.28333333333333, "std_mean_l2_distance": 16.43189648890907, "avg_std_l2_distance": 14.598958333333334, "std_std_l2_distance": 7.594291533045653, "avg_mean_dimension_correlation": 0.49188637080291897, "std_mean_dimension_correlation": 0.16857047305704442, "avg_std_dimension_correlation": 0.1300404178186724, "std_std_dimension_correlation": 0.021172306029780062, "avg_linear_cka": 0.8479166666666667, "std_linear_cka": 0.19363585907609904 } } }