{ "epoch": 1, "n_tokens": 2000896, "global_step": 1954, "training_metrics": { "train/loss": 2.609375, "train/contrastive": 2.5, "train/recons_loss": 0.71484375, "train/balance_loss": 3.671875, "train/balance_loss_contrastive": 2.625, "train/balance_loss_recons": 1.046875, "train/contrastive_std": 3.234375, "train/recons_std": 0.171875, "train/contrastive_min": 0.224609375, "train/contrastive_max": 7.0, "train/recons_min": 0.5859375, "train/recons_max": 1.046875, "train/Qwen3_0.6B_layer_2": 1.046875, "train/Qwen3_0.6B_layer_4": 0.63671875, "train/Qwen3_1.7B_layer_2": 0.62109375, "train/Qwen3_1.7B_layer_4": 0.75, "train/Qwen3_4B_layer_2": 0.5859375, "train/Qwen3_4B_layer_4": 0.65234375, "train/contrastives": null, "train/epoch": 1, "train/n_tokens": 2000896, "train/step": 1954 }, "eval_metrics": { "global_step": 1954, "n_tokens": 2000896, "kl_divergence": { "Qwen3_0.6B_layer_2_to_Qwen3_0.6B_layer_2": 11.937955856323242, "Qwen3_0.6B_layer_2_to_Qwen3_0.6B_layer_4": 7.9807448387146, "Qwen3_0.6B_layer_2_to_Qwen3_1.7B_layer_2": 7.929330348968506, "Qwen3_0.6B_layer_2_to_Qwen3_1.7B_layer_4": 7.9499993324279785, "Qwen3_0.6B_layer_2_to_Qwen3_4B_layer_2": 8.048929214477539, "Qwen3_0.6B_layer_2_to_Qwen3_4B_layer_4": 7.740438938140869, "Qwen3_0.6B_layer_2_to_uniform": 9.070773124694824, "Qwen3_0.6B_layer_4_to_Qwen3_0.6B_layer_2": 10.935715675354004, "Qwen3_0.6B_layer_4_to_Qwen3_0.6B_layer_4": 3.0637950897216797, "Qwen3_0.6B_layer_4_to_Qwen3_1.7B_layer_2": 3.0137126445770264, "Qwen3_0.6B_layer_4_to_Qwen3_1.7B_layer_4": 2.9935271739959717, "Qwen3_0.6B_layer_4_to_Qwen3_4B_layer_2": 3.0705885887145996, "Qwen3_0.6B_layer_4_to_Qwen3_4B_layer_4": 3.0900540351867676, "Qwen3_0.6B_layer_4_to_uniform": 9.070773124694824, "Qwen3_1.7B_layer_2_to_Qwen3_0.6B_layer_2": 9.36762523651123, "Qwen3_1.7B_layer_2_to_Qwen3_0.6B_layer_4": 6.4384565353393555, "Qwen3_1.7B_layer_2_to_Qwen3_1.7B_layer_2": 6.606346130371094, "Qwen3_1.7B_layer_2_to_Qwen3_1.7B_layer_4": 6.644039154052734, "Qwen3_1.7B_layer_2_to_Qwen3_4B_layer_2": 6.400282859802246, "Qwen3_1.7B_layer_2_to_Qwen3_4B_layer_4": 6.326376438140869, "Qwen3_1.7B_layer_2_to_uniform": 9.88111686706543, "Qwen3_1.7B_layer_4_to_Qwen3_0.6B_layer_2": 12.973535537719727, "Qwen3_1.7B_layer_4_to_Qwen3_0.6B_layer_4": 2.9450173377990723, "Qwen3_1.7B_layer_4_to_Qwen3_1.7B_layer_2": 3.160464286804199, "Qwen3_1.7B_layer_4_to_Qwen3_1.7B_layer_4": 2.980670928955078, "Qwen3_1.7B_layer_4_to_Qwen3_4B_layer_2": 3.0805249214172363, "Qwen3_1.7B_layer_4_to_Qwen3_4B_layer_4": 3.193880319595337, "Qwen3_1.7B_layer_4_to_uniform": 9.88111686706543, "Qwen3_4B_layer_2_to_Qwen3_0.6B_layer_2": 7.039875030517578, "Qwen3_4B_layer_2_to_Qwen3_0.6B_layer_4": 2.9839415550231934, "Qwen3_4B_layer_2_to_Qwen3_1.7B_layer_2": 2.829629421234131, "Qwen3_4B_layer_2_to_Qwen3_1.7B_layer_4": 2.9013402462005615, "Qwen3_4B_layer_2_to_Qwen3_4B_layer_2": 2.699265241622925, "Qwen3_4B_layer_2_to_Qwen3_4B_layer_4": 2.6922624111175537, "Qwen3_4B_layer_2_to_uniform": 10.104096412658691, "Qwen3_4B_layer_4_to_Qwen3_0.6B_layer_2": 7.345184326171875, "Qwen3_4B_layer_4_to_Qwen3_0.6B_layer_4": 3.6716532707214355, "Qwen3_4B_layer_4_to_Qwen3_1.7B_layer_2": 3.481139659881592, "Qwen3_4B_layer_4_to_Qwen3_1.7B_layer_4": 3.7798919677734375, "Qwen3_4B_layer_4_to_Qwen3_4B_layer_2": 3.6100268363952637, "Qwen3_4B_layer_4_to_Qwen3_4B_layer_4": 3.4427032470703125, "Qwen3_4B_layer_4_to_uniform": 10.104096412658691 }, "mae_hidden_states": { "Qwen3_0.6B_layer_2_to_Qwen3_0.6B_layer_2": 3.8776168823242188, "Qwen3_0.6B_layer_4_to_Qwen3_0.6B_layer_2": 1.3933213949203491, "Qwen3_1.7B_layer_2_to_Qwen3_0.6B_layer_2": 1.383978009223938, "Qwen3_1.7B_layer_4_to_Qwen3_0.6B_layer_2": 1.3917444944381714, "Qwen3_4B_layer_2_to_Qwen3_0.6B_layer_2": 1.38372004032135, "Qwen3_4B_layer_4_to_Qwen3_0.6B_layer_2": 1.364012598991394, "Qwen3_0.6B_layer_2_to_Qwen3_0.6B_layer_4": 4.228211402893066, "Qwen3_0.6B_layer_4_to_Qwen3_0.6B_layer_4": 1.2656141519546509, "Qwen3_1.7B_layer_2_to_Qwen3_0.6B_layer_4": 1.232149362564087, "Qwen3_1.7B_layer_4_to_Qwen3_0.6B_layer_4": 1.249756097793579, "Qwen3_4B_layer_2_to_Qwen3_0.6B_layer_4": 1.2561695575714111, "Qwen3_4B_layer_4_to_Qwen3_0.6B_layer_4": 1.2504116296768188, "Qwen3_0.6B_layer_2_to_Qwen3_1.7B_layer_2": 7.011510372161865, "Qwen3_0.6B_layer_4_to_Qwen3_1.7B_layer_2": 1.1570074558258057, "Qwen3_1.7B_layer_2_to_Qwen3_1.7B_layer_2": 1.123882532119751, "Qwen3_1.7B_layer_4_to_Qwen3_1.7B_layer_2": 1.1837798357009888, "Qwen3_4B_layer_2_to_Qwen3_1.7B_layer_2": 1.1635627746582031, "Qwen3_4B_layer_4_to_Qwen3_1.7B_layer_2": 1.1612880229949951, "Qwen3_0.6B_layer_2_to_Qwen3_1.7B_layer_4": 3.041527032852173, "Qwen3_0.6B_layer_4_to_Qwen3_1.7B_layer_4": 1.6266475915908813, "Qwen3_1.7B_layer_2_to_Qwen3_1.7B_layer_4": 1.6221141815185547, "Qwen3_1.7B_layer_4_to_Qwen3_1.7B_layer_4": 1.5699771642684937, "Qwen3_4B_layer_2_to_Qwen3_1.7B_layer_4": 1.6569658517837524, "Qwen3_4B_layer_4_to_Qwen3_1.7B_layer_4": 1.6420214176177979, "Qwen3_0.6B_layer_2_to_Qwen3_4B_layer_2": 8.062718391418457, "Qwen3_0.6B_layer_4_to_Qwen3_4B_layer_2": 1.3707829713821411, "Qwen3_1.7B_layer_2_to_Qwen3_4B_layer_2": 1.2881925106048584, "Qwen3_1.7B_layer_4_to_Qwen3_4B_layer_2": 1.2928853034973145, "Qwen3_4B_layer_2_to_Qwen3_4B_layer_2": 1.213749885559082, "Qwen3_4B_layer_4_to_Qwen3_4B_layer_2": 1.2994227409362793, "Qwen3_0.6B_layer_2_to_Qwen3_4B_layer_4": 7.752654075622559, "Qwen3_0.6B_layer_4_to_Qwen3_4B_layer_4": 1.4750916957855225, "Qwen3_1.7B_layer_2_to_Qwen3_4B_layer_4": 1.4292265176773071, "Qwen3_1.7B_layer_4_to_Qwen3_4B_layer_4": 1.4410502910614014, "Qwen3_4B_layer_2_to_Qwen3_4B_layer_4": 1.4525748491287231, "Qwen3_4B_layer_4_to_Qwen3_4B_layer_4": 1.3758857250213623 }, "alignment": { "Qwen3_0.6B_layer_2_to_Qwen3_0.6B_layer_4": { "mse": 1.3515625, "mean_cosine_similarity": 0.08251953125, "std_cosine_similarity": 0.05322265625, "mean_l2_distance": 68.5, "std_l2_distance": 2.0, "mean_dimension_correlation": 0.062478048354387285, "std_dimension_correlation": 0.18795068082189698, "linear_cka": 0.6640625 }, "Qwen3_0.6B_layer_2_to_Qwen3_1.7B_layer_2": { "mse": 1.3515625, "mean_cosine_similarity": 0.07666015625, "std_cosine_similarity": 0.0595703125, "mean_l2_distance": 68.5, "std_l2_distance": 2.25, "mean_dimension_correlation": 0.06342285592108965, "std_dimension_correlation": 0.18283416080924633, "linear_cka": 0.6640625 }, "Qwen3_0.6B_layer_2_to_Qwen3_1.7B_layer_4": { "mse": 1.3359375, "mean_cosine_similarity": 0.0859375, "std_cosine_similarity": 0.057373046875, "mean_l2_distance": 68.5, "std_l2_distance": 2.171875, "mean_dimension_correlation": 0.06745534756919369, "std_dimension_correlation": 0.1829785716985573, "linear_cka": 0.65234375 }, "Qwen3_0.6B_layer_2_to_Qwen3_4B_layer_2": { "mse": 1.359375, "mean_cosine_similarity": 0.0654296875, "std_cosine_similarity": 0.051025390625, "mean_l2_distance": 69.0, "std_l2_distance": 1.9140625, "mean_dimension_correlation": 0.0574939165264368, "std_dimension_correlation": 0.1857303062299582, "linear_cka": 0.66015625 }, "Qwen3_0.6B_layer_2_to_Qwen3_4B_layer_4": { "mse": 1.3515625, "mean_cosine_similarity": 0.0703125, "std_cosine_similarity": 0.0615234375, "mean_l2_distance": 69.0, "std_l2_distance": 2.3125, "mean_dimension_correlation": 0.060864800889976325, "std_dimension_correlation": 0.18444010568801722, "linear_cka": 0.66015625 }, "Qwen3_0.6B_layer_4_to_Qwen3_0.6B_layer_2": { "mse": 1.3515625, "mean_cosine_similarity": 0.08251953125, "std_cosine_similarity": 0.05322265625, "mean_l2_distance": 68.5, "std_l2_distance": 2.0, "mean_dimension_correlation": 0.062481947243213654, "std_dimension_correlation": 0.18797583962876385, "linear_cka": 0.6640625 }, "Qwen3_0.6B_layer_4_to_Qwen3_1.7B_layer_2": { "mse": 1.15625, "mean_cosine_similarity": 0.31640625, "std_cosine_similarity": 0.10888671875, "mean_l2_distance": 59.0, "std_l2_distance": 4.84375, "mean_dimension_correlation": 0.29400850236415865, "std_dimension_correlation": 0.16629643255508064, "linear_cka": 0.984375 }, "Qwen3_0.6B_layer_4_to_Qwen3_1.7B_layer_4": { "mse": 1.140625, "mean_cosine_similarity": 0.333984375, "std_cosine_similarity": 0.1083984375, "mean_l2_distance": 58.25, "std_l2_distance": 4.84375, "mean_dimension_correlation": 0.3036854453384876, "std_dimension_correlation": 0.1640714460889962, "linear_cka": 0.98046875 }, "Qwen3_0.6B_layer_4_to_Qwen3_4B_layer_2": { "mse": 1.1953125, "mean_cosine_similarity": 0.2734375, "std_cosine_similarity": 0.109375, "mean_l2_distance": 61.0, "std_l2_distance": 4.6875, "mean_dimension_correlation": 0.2504168091341853, "std_dimension_correlation": 0.16876857548561683, "linear_cka": 0.9765625 }, "Qwen3_0.6B_layer_4_to_Qwen3_4B_layer_4": { "mse": 1.1796875, "mean_cosine_similarity": 0.2890625, "std_cosine_similarity": 0.10791015625, "mean_l2_distance": 60.25, "std_l2_distance": 4.6875, "mean_dimension_correlation": 0.2720076544210315, "std_dimension_correlation": 0.16885400186672725, "linear_cka": 0.98828125 }, "Qwen3_1.7B_layer_2_to_Qwen3_0.6B_layer_2": { "mse": 1.3515625, "mean_cosine_similarity": 0.07666015625, "std_cosine_similarity": 0.0595703125, "mean_l2_distance": 68.5, "std_l2_distance": 2.25, "mean_dimension_correlation": 0.06342689506709576, "std_dimension_correlation": 0.18284259799255337, "linear_cka": 0.6640625 }, "Qwen3_1.7B_layer_2_to_Qwen3_0.6B_layer_4": { "mse": 1.15625, "mean_cosine_similarity": 0.31640625, "std_cosine_similarity": 0.10888671875, "mean_l2_distance": 59.0, "std_l2_distance": 4.84375, "mean_dimension_correlation": 0.29399659037590026, "std_dimension_correlation": 0.16628270680485127, "linear_cka": 0.984375 }, "Qwen3_1.7B_layer_2_to_Qwen3_1.7B_layer_4": { "mse": 1.171875, "mean_cosine_similarity": 0.302734375, "std_cosine_similarity": 0.1162109375, "mean_l2_distance": 59.5, "std_l2_distance": 5.09375, "mean_dimension_correlation": 0.2821845322847366, "std_dimension_correlation": 0.1658260527951995, "linear_cka": 0.984375 }, "Qwen3_1.7B_layer_2_to_Qwen3_4B_layer_2": { "mse": 1.1875, "mean_cosine_similarity": 0.279296875, "std_cosine_similarity": 0.1083984375, "mean_l2_distance": 60.75, "std_l2_distance": 4.6875, "mean_dimension_correlation": 0.2564893037080765, "std_dimension_correlation": 0.1700593172594885, "linear_cka": 0.98828125 }, "Qwen3_1.7B_layer_2_to_Qwen3_4B_layer_4": { "mse": 1.125, "mean_cosine_similarity": 0.3515625, "std_cosine_similarity": 0.107421875, "mean_l2_distance": 57.5, "std_l2_distance": 4.875, "mean_dimension_correlation": 0.32521353638730943, "std_dimension_correlation": 0.16142420298822246, "linear_cka": 0.98828125 }, "Qwen3_1.7B_layer_4_to_Qwen3_0.6B_layer_2": { "mse": 1.3359375, "mean_cosine_similarity": 0.0859375, "std_cosine_similarity": 0.057373046875, "mean_l2_distance": 68.5, "std_l2_distance": 2.171875, "mean_dimension_correlation": 0.06747776636620983, "std_dimension_correlation": 0.18297839209554984, "linear_cka": 0.65234375 }, "Qwen3_1.7B_layer_4_to_Qwen3_0.6B_layer_4": { "mse": 1.140625, "mean_cosine_similarity": 0.333984375, "std_cosine_similarity": 0.1083984375, "mean_l2_distance": 58.25, "std_l2_distance": 4.84375, "mean_dimension_correlation": 0.3036851711571217, "std_dimension_correlation": 0.16406197803154954, "linear_cka": 0.98046875 }, "Qwen3_1.7B_layer_4_to_Qwen3_1.7B_layer_2": { "mse": 1.171875, "mean_cosine_similarity": 0.302734375, "std_cosine_similarity": 0.1162109375, "mean_l2_distance": 59.5, "std_l2_distance": 5.09375, "mean_dimension_correlation": 0.2821806937456131, "std_dimension_correlation": 0.16583393871178487, "linear_cka": 0.984375 }, "Qwen3_1.7B_layer_4_to_Qwen3_4B_layer_2": { "mse": 1.1953125, "mean_cosine_similarity": 0.267578125, "std_cosine_similarity": 0.099609375, "mean_l2_distance": 61.0, "std_l2_distance": 4.25, "mean_dimension_correlation": 0.25347145795822146, "std_dimension_correlation": 0.1676427892496719, "linear_cka": 0.9765625 }, "Qwen3_1.7B_layer_4_to_Qwen3_4B_layer_4": { "mse": 1.140625, "mean_cosine_similarity": 0.3359375, "std_cosine_similarity": 0.10302734375, "mean_l2_distance": 58.25, "std_l2_distance": 4.5625, "mean_dimension_correlation": 0.3062414702028036, "std_dimension_correlation": 0.16808821448337685, "linear_cka": 0.984375 }, "Qwen3_4B_layer_2_to_Qwen3_0.6B_layer_2": { "mse": 1.359375, "mean_cosine_similarity": 0.0654296875, "std_cosine_similarity": 0.051025390625, "mean_l2_distance": 69.0, "std_l2_distance": 1.9140625, "mean_dimension_correlation": 0.05748535506427288, "std_dimension_correlation": 0.18570980538433127, "linear_cka": 0.66015625 }, "Qwen3_4B_layer_2_to_Qwen3_0.6B_layer_4": { "mse": 1.1953125, "mean_cosine_similarity": 0.2734375, "std_cosine_similarity": 0.109375, "mean_l2_distance": 61.0, "std_l2_distance": 4.6875, "mean_dimension_correlation": 0.25039467196911575, "std_dimension_correlation": 0.16873641618577465, "linear_cka": 0.9765625 }, "Qwen3_4B_layer_2_to_Qwen3_1.7B_layer_2": { "mse": 1.1875, "mean_cosine_similarity": 0.279296875, "std_cosine_similarity": 0.1083984375, "mean_l2_distance": 60.75, "std_l2_distance": 4.6875, "mean_dimension_correlation": 0.25649560913443564, "std_dimension_correlation": 0.17006732480563802, "linear_cka": 0.98828125 }, "Qwen3_4B_layer_2_to_Qwen3_1.7B_layer_4": { "mse": 1.1953125, "mean_cosine_similarity": 0.267578125, "std_cosine_similarity": 0.099609375, "mean_l2_distance": 61.0, "std_l2_distance": 4.25, "mean_dimension_correlation": 0.2534836530685425, "std_dimension_correlation": 0.16762209134956418, "linear_cka": 0.9765625 }, "Qwen3_4B_layer_2_to_Qwen3_4B_layer_4": { "mse": 1.1953125, "mean_cosine_similarity": 0.267578125, "std_cosine_similarity": 0.1044921875, "mean_l2_distance": 61.0, "std_l2_distance": 4.46875, "mean_dimension_correlation": 0.2458049923181534, "std_dimension_correlation": 0.16614846928894367, "linear_cka": 0.984375 }, "Qwen3_4B_layer_4_to_Qwen3_0.6B_layer_2": { "mse": 1.3515625, "mean_cosine_similarity": 0.0703125, "std_cosine_similarity": 0.0615234375, "mean_l2_distance": 69.0, "std_l2_distance": 2.3125, "mean_dimension_correlation": 0.060881674400297923, "std_dimension_correlation": 0.18442433029309818, "linear_cka": 0.66015625 }, "Qwen3_4B_layer_4_to_Qwen3_0.6B_layer_4": { "mse": 1.1796875, "mean_cosine_similarity": 0.2890625, "std_cosine_similarity": 0.10791015625, "mean_l2_distance": 60.25, "std_l2_distance": 4.6875, "mean_dimension_correlation": 0.27202143501490356, "std_dimension_correlation": 0.16887910421184668, "linear_cka": 0.98828125 }, "Qwen3_4B_layer_4_to_Qwen3_1.7B_layer_2": { "mse": 1.125, "mean_cosine_similarity": 0.3515625, "std_cosine_similarity": 0.107421875, "mean_l2_distance": 57.5, "std_l2_distance": 4.875, "mean_dimension_correlation": 0.32520583919249474, "std_dimension_correlation": 0.16142301505759077, "linear_cka": 0.98828125 }, "Qwen3_4B_layer_4_to_Qwen3_1.7B_layer_4": { "mse": 1.140625, "mean_cosine_similarity": 0.3359375, "std_cosine_similarity": 0.10302734375, "mean_l2_distance": 58.25, "std_l2_distance": 4.5625, "mean_dimension_correlation": 0.30626075267791747, "std_dimension_correlation": 0.16814174967857434, "linear_cka": 0.984375 }, "Qwen3_4B_layer_4_to_Qwen3_4B_layer_2": { "mse": 1.1953125, "mean_cosine_similarity": 0.267578125, "std_cosine_similarity": 0.1044921875, "mean_l2_distance": 61.0, "std_l2_distance": 4.46875, "mean_dimension_correlation": 0.24580164328217508, "std_dimension_correlation": 0.16616246993231076, "linear_cka": 0.984375 }, "avg_mse": 1.2291666666666667, "std_mse": 0.08795763263576896, "avg_mean_cosine_similarity": 0.2265625, "std_mean_cosine_similarity": 0.10912461458526285, "avg_std_cosine_similarity": 0.0904296875, "std_std_cosine_similarity": 0.02430735142446207, "avg_mean_l2_distance": 62.666666666666664, "std_mean_l2_distance": 4.391911757866827, "avg_std_l2_distance": 3.8432291666666667, "std_std_l2_distance": 1.2284684192595492, "avg_mean_dimension_correlation": 0.20675061237125192, "std_mean_dimension_correlation": 0.10433772738239133, "avg_std_dimension_correlation": 0.17274183624909276, "std_std_dimension_correlation": 0.008813959193590607, "avg_linear_cka": 0.87578125, "std_linear_cka": 0.15252860046015415 } } }