{ "epoch": 1, "n_tokens": 7003136, "global_step": 6839, "training_metrics": { "train/loss": 2.546875, "train/contrastive": 2.453125, "train/recons_loss": 0.5859375, "train/balance_loss": 3.875, "train/balance_loss_contrastive": 2.859375, "train/balance_loss_recons": 1.015625, "train/contrastive_std": 3.390625, "train/recons_std": 0.07177734375, "train/contrastive_min": 0.0849609375, "train/contrastive_max": 7.1875, "train/recons_min": 0.49609375, "train/recons_max": 0.68359375, "train/Qwen3_0.6B_layer_2": 0.68359375, "train/Qwen3_0.6B_layer_4": 0.55859375, "train/Qwen3_1.7B_layer_2": 0.5390625, "train/Qwen3_1.7B_layer_4": 0.65625, "train/Qwen3_4B_layer_2": 0.49609375, "train/Qwen3_4B_layer_4": 0.57421875, "train/contrastives": null, "train/epoch": 1, "train/n_tokens": 7003136, "train/step": 6839 }, "eval_metrics": { "global_step": 6839, "n_tokens": 7003136, "kl_divergence": { "Qwen3_0.6B_layer_2_to_Qwen3_0.6B_layer_2": 7.436762809753418, "Qwen3_0.6B_layer_2_to_Qwen3_0.6B_layer_4": 7.0658464431762695, "Qwen3_0.6B_layer_2_to_Qwen3_1.7B_layer_2": 7.481184482574463, "Qwen3_0.6B_layer_2_to_Qwen3_1.7B_layer_4": 7.2030930519104, "Qwen3_0.6B_layer_2_to_Qwen3_4B_layer_2": 7.240965366363525, "Qwen3_0.6B_layer_2_to_Qwen3_4B_layer_4": 7.4969305992126465, "Qwen3_0.6B_layer_2_to_uniform": 9.070773124694824, "Qwen3_0.6B_layer_4_to_Qwen3_0.6B_layer_2": 2.213552474975586, "Qwen3_0.6B_layer_4_to_Qwen3_0.6B_layer_4": 2.2100319862365723, "Qwen3_0.6B_layer_4_to_Qwen3_1.7B_layer_2": 2.3322105407714844, "Qwen3_0.6B_layer_4_to_Qwen3_1.7B_layer_4": 2.236471652984619, "Qwen3_0.6B_layer_4_to_Qwen3_4B_layer_2": 2.2212750911712646, "Qwen3_0.6B_layer_4_to_Qwen3_4B_layer_4": 2.308291435241699, "Qwen3_0.6B_layer_4_to_uniform": 9.070773124694824, "Qwen3_1.7B_layer_2_to_Qwen3_0.6B_layer_2": 5.611138343811035, "Qwen3_1.7B_layer_2_to_Qwen3_0.6B_layer_4": 5.940827369689941, "Qwen3_1.7B_layer_2_to_Qwen3_1.7B_layer_2": 6.0901408195495605, "Qwen3_1.7B_layer_2_to_Qwen3_1.7B_layer_4": 6.319899559020996, "Qwen3_1.7B_layer_2_to_Qwen3_4B_layer_2": 6.241495609283447, "Qwen3_1.7B_layer_2_to_Qwen3_4B_layer_4": 6.30285120010376, "Qwen3_1.7B_layer_2_to_uniform": 9.88111686706543, "Qwen3_1.7B_layer_4_to_Qwen3_0.6B_layer_2": 2.640443801879883, "Qwen3_1.7B_layer_4_to_Qwen3_0.6B_layer_4": 2.575563907623291, "Qwen3_1.7B_layer_4_to_Qwen3_1.7B_layer_2": 2.661689043045044, "Qwen3_1.7B_layer_4_to_Qwen3_1.7B_layer_4": 2.5703558921813965, "Qwen3_1.7B_layer_4_to_Qwen3_4B_layer_2": 2.555753707885742, "Qwen3_1.7B_layer_4_to_Qwen3_4B_layer_4": 2.7401225566864014, "Qwen3_1.7B_layer_4_to_uniform": 9.88111686706543, "Qwen3_4B_layer_2_to_Qwen3_0.6B_layer_2": 2.5014476776123047, "Qwen3_4B_layer_2_to_Qwen3_0.6B_layer_4": 2.3176639080047607, "Qwen3_4B_layer_2_to_Qwen3_1.7B_layer_2": 2.024665355682373, "Qwen3_4B_layer_2_to_Qwen3_1.7B_layer_4": 2.2373054027557373, "Qwen3_4B_layer_2_to_Qwen3_4B_layer_2": 2.237010955810547, "Qwen3_4B_layer_2_to_Qwen3_4B_layer_4": 2.1157214641571045, "Qwen3_4B_layer_2_to_uniform": 10.104096412658691, "Qwen3_4B_layer_4_to_Qwen3_0.6B_layer_2": 3.1809849739074707, "Qwen3_4B_layer_4_to_Qwen3_0.6B_layer_4": 3.101025104522705, "Qwen3_4B_layer_4_to_Qwen3_1.7B_layer_2": 2.89253568649292, "Qwen3_4B_layer_4_to_Qwen3_1.7B_layer_4": 3.0308847427368164, "Qwen3_4B_layer_4_to_Qwen3_4B_layer_2": 3.043917179107666, "Qwen3_4B_layer_4_to_Qwen3_4B_layer_4": 2.7547640800476074, "Qwen3_4B_layer_4_to_uniform": 10.104096412658691 }, "mae_hidden_states": { "Qwen3_0.6B_layer_2_to_Qwen3_0.6B_layer_2": 1.3392760753631592, "Qwen3_0.6B_layer_4_to_Qwen3_0.6B_layer_2": 1.2929567098617554, "Qwen3_1.7B_layer_2_to_Qwen3_0.6B_layer_2": 1.3323848247528076, "Qwen3_1.7B_layer_4_to_Qwen3_0.6B_layer_2": 1.3409219980239868, "Qwen3_4B_layer_2_to_Qwen3_0.6B_layer_2": 1.3193416595458984, "Qwen3_4B_layer_4_to_Qwen3_0.6B_layer_2": 1.3225743770599365, "Qwen3_0.6B_layer_2_to_Qwen3_0.6B_layer_4": 0.9926112294197083, "Qwen3_0.6B_layer_4_to_Qwen3_0.6B_layer_4": 0.9511648416519165, "Qwen3_1.7B_layer_2_to_Qwen3_0.6B_layer_4": 0.9907838106155396, "Qwen3_1.7B_layer_4_to_Qwen3_0.6B_layer_4": 1.0008413791656494, "Qwen3_4B_layer_2_to_Qwen3_0.6B_layer_4": 0.9629853367805481, "Qwen3_4B_layer_4_to_Qwen3_0.6B_layer_4": 0.9665719270706177, "Qwen3_0.6B_layer_2_to_Qwen3_1.7B_layer_2": 0.9707884788513184, "Qwen3_0.6B_layer_4_to_Qwen3_1.7B_layer_2": 0.9194974899291992, "Qwen3_1.7B_layer_2_to_Qwen3_1.7B_layer_2": 0.9195350408554077, "Qwen3_1.7B_layer_4_to_Qwen3_1.7B_layer_2": 0.9375813603401184, "Qwen3_4B_layer_2_to_Qwen3_1.7B_layer_2": 0.9312270283699036, "Qwen3_4B_layer_4_to_Qwen3_1.7B_layer_2": 0.9314996600151062, "Qwen3_0.6B_layer_2_to_Qwen3_1.7B_layer_4": 1.2487989664077759, "Qwen3_0.6B_layer_4_to_Qwen3_1.7B_layer_4": 1.2148901224136353, "Qwen3_1.7B_layer_2_to_Qwen3_1.7B_layer_4": 1.2322641611099243, "Qwen3_1.7B_layer_4_to_Qwen3_1.7B_layer_4": 1.2202345132827759, "Qwen3_4B_layer_2_to_Qwen3_1.7B_layer_4": 1.212104320526123, "Qwen3_4B_layer_4_to_Qwen3_1.7B_layer_4": 1.2153819799423218, "Qwen3_0.6B_layer_2_to_Qwen3_4B_layer_2": 0.9843270182609558, "Qwen3_0.6B_layer_4_to_Qwen3_4B_layer_2": 0.9733158349990845, "Qwen3_1.7B_layer_2_to_Qwen3_4B_layer_2": 0.9586274027824402, "Qwen3_1.7B_layer_4_to_Qwen3_4B_layer_2": 0.9659514427185059, "Qwen3_4B_layer_2_to_Qwen3_4B_layer_2": 0.9286855459213257, "Qwen3_4B_layer_4_to_Qwen3_4B_layer_2": 0.941694438457489, "Qwen3_0.6B_layer_2_to_Qwen3_4B_layer_4": 1.0964877605438232, "Qwen3_0.6B_layer_4_to_Qwen3_4B_layer_4": 1.0766807794570923, "Qwen3_1.7B_layer_2_to_Qwen3_4B_layer_4": 1.0825374126434326, "Qwen3_1.7B_layer_4_to_Qwen3_4B_layer_4": 1.0911656618118286, "Qwen3_4B_layer_2_to_Qwen3_4B_layer_4": 1.0675519704818726, "Qwen3_4B_layer_4_to_Qwen3_4B_layer_4": 1.0499017238616943 }, "alignment": { "Qwen3_0.6B_layer_2_to_Qwen3_0.6B_layer_4": { "mse": 0.37890625, "mean_cosine_similarity": 0.89453125, "std_cosine_similarity": 0.1748046875, "mean_l2_distance": 19.375, "std_l2_distance": 13.5, "mean_dimension_correlation": 0.88087158203125, "std_dimension_correlation": 0.03891650382660663, "linear_cka": 0.96875 }, "Qwen3_0.6B_layer_2_to_Qwen3_1.7B_layer_2": { "mse": 0.384765625, "mean_cosine_similarity": 0.890625, "std_cosine_similarity": 0.1806640625, "mean_l2_distance": 19.625, "std_l2_distance": 13.8125, "mean_dimension_correlation": 0.8772735595703125, "std_dimension_correlation": 0.040613200120000074, "linear_cka": 0.96875 }, "Qwen3_0.6B_layer_2_to_Qwen3_1.7B_layer_4": { "mse": 0.380859375, "mean_cosine_similarity": 0.89453125, "std_cosine_similarity": 0.1728515625, "mean_l2_distance": 19.5, "std_l2_distance": 13.25, "mean_dimension_correlation": 0.8814620971679688, "std_dimension_correlation": 0.03743361738689456, "linear_cka": 0.96875 }, "Qwen3_0.6B_layer_2_to_Qwen3_4B_layer_2": { "mse": 0.380859375, "mean_cosine_similarity": 0.890625, "std_cosine_similarity": 0.1787109375, "mean_l2_distance": 19.625, "std_l2_distance": 13.6875, "mean_dimension_correlation": 0.8784408569335938, "std_dimension_correlation": 0.03935897183860133, "linear_cka": 0.96875 }, "Qwen3_0.6B_layer_2_to_Qwen3_4B_layer_4": { "mse": 0.37890625, "mean_cosine_similarity": 0.89453125, "std_cosine_similarity": 0.177734375, "mean_l2_distance": 19.5, "std_l2_distance": 13.5625, "mean_dimension_correlation": 0.880218505859375, "std_dimension_correlation": 0.03827009184402313, "linear_cka": 0.96875 }, "Qwen3_0.6B_layer_4_to_Qwen3_0.6B_layer_2": { "mse": 0.37890625, "mean_cosine_similarity": 0.89453125, "std_cosine_similarity": 0.1748046875, "mean_l2_distance": 19.375, "std_l2_distance": 13.5, "mean_dimension_correlation": 0.8809066772460937, "std_dimension_correlation": 0.038947862226715424, "linear_cka": 0.96875 }, "Qwen3_0.6B_layer_4_to_Qwen3_1.7B_layer_2": { "mse": 0.3125, "mean_cosine_similarity": 0.9140625, "std_cosine_similarity": 0.171875, "mean_l2_distance": 16.0, "std_l2_distance": 14.1875, "mean_dimension_correlation": 0.900384521484375, "std_dimension_correlation": 0.03780791654866101, "linear_cka": 0.984375 }, "Qwen3_0.6B_layer_4_to_Qwen3_1.7B_layer_4": { "mse": 0.30859375, "mean_cosine_similarity": 0.9140625, "std_cosine_similarity": 0.171875, "mean_l2_distance": 15.75, "std_l2_distance": 14.1875, "mean_dimension_correlation": 0.9015731811523438, "std_dimension_correlation": 0.037591582433134804, "linear_cka": 0.984375 }, "Qwen3_0.6B_layer_4_to_Qwen3_4B_layer_2": { "mse": 0.314453125, "mean_cosine_similarity": 0.91015625, "std_cosine_similarity": 0.18359375, "mean_l2_distance": 16.125, "std_l2_distance": 14.75, "mean_dimension_correlation": 0.8960525512695312, "std_dimension_correlation": 0.0391310064588162, "linear_cka": 0.984375 }, "Qwen3_0.6B_layer_4_to_Qwen3_4B_layer_4": { "mse": 0.3125, "mean_cosine_similarity": 0.91015625, "std_cosine_similarity": 0.1826171875, "mean_l2_distance": 15.9375, "std_l2_distance": 14.75, "mean_dimension_correlation": 0.8973068237304688, "std_dimension_correlation": 0.040051008367224694, "linear_cka": 0.984375 }, "Qwen3_1.7B_layer_2_to_Qwen3_0.6B_layer_2": { "mse": 0.384765625, "mean_cosine_similarity": 0.890625, "std_cosine_similarity": 0.1806640625, "mean_l2_distance": 19.625, "std_l2_distance": 13.8125, "mean_dimension_correlation": 0.877227783203125, "std_dimension_correlation": 0.04062615493819983, "linear_cka": 0.96875 }, "Qwen3_1.7B_layer_2_to_Qwen3_0.6B_layer_4": { "mse": 0.3125, "mean_cosine_similarity": 0.9140625, "std_cosine_similarity": 0.171875, "mean_l2_distance": 16.0, "std_l2_distance": 14.1875, "mean_dimension_correlation": 0.9004119873046875, "std_dimension_correlation": 0.037833126797664623, "linear_cka": 0.984375 }, "Qwen3_1.7B_layer_2_to_Qwen3_1.7B_layer_4": { "mse": 0.302734375, "mean_cosine_similarity": 0.9140625, "std_cosine_similarity": 0.181640625, "mean_l2_distance": 15.5625, "std_l2_distance": 14.875, "mean_dimension_correlation": 0.8998611450195313, "std_dimension_correlation": 0.03902960080299192, "linear_cka": 0.984375 }, "Qwen3_1.7B_layer_2_to_Qwen3_4B_layer_2": { "mse": 0.306640625, "mean_cosine_similarity": 0.91015625, "std_cosine_similarity": 0.1826171875, "mean_l2_distance": 15.6875, "std_l2_distance": 14.9375, "mean_dimension_correlation": 0.8979522705078125, "std_dimension_correlation": 0.04081550083822763, "linear_cka": 0.984375 }, "Qwen3_1.7B_layer_2_to_Qwen3_4B_layer_4": { "mse": 0.296875, "mean_cosine_similarity": 0.91796875, "std_cosine_similarity": 0.1689453125, "mean_l2_distance": 15.1875, "std_l2_distance": 14.1875, "mean_dimension_correlation": 0.9056289672851563, "std_dimension_correlation": 0.037629372700621964, "linear_cka": 0.984375 }, "Qwen3_1.7B_layer_4_to_Qwen3_0.6B_layer_2": { "mse": 0.380859375, "mean_cosine_similarity": 0.89453125, "std_cosine_similarity": 0.1728515625, "mean_l2_distance": 19.5, "std_l2_distance": 13.25, "mean_dimension_correlation": 0.881549072265625, "std_dimension_correlation": 0.037422879211554134, "linear_cka": 0.96875 }, "Qwen3_1.7B_layer_4_to_Qwen3_0.6B_layer_4": { "mse": 0.30859375, "mean_cosine_similarity": 0.9140625, "std_cosine_similarity": 0.171875, "mean_l2_distance": 15.75, "std_l2_distance": 14.1875, "mean_dimension_correlation": 0.9015762329101562, "std_dimension_correlation": 0.03752165553415462, "linear_cka": 0.984375 }, "Qwen3_1.7B_layer_4_to_Qwen3_1.7B_layer_2": { "mse": 0.302734375, "mean_cosine_similarity": 0.9140625, "std_cosine_similarity": 0.181640625, "mean_l2_distance": 15.5625, "std_l2_distance": 14.875, "mean_dimension_correlation": 0.8998123168945312, "std_dimension_correlation": 0.03903639037032718, "linear_cka": 0.984375 }, "Qwen3_1.7B_layer_4_to_Qwen3_4B_layer_2": { "mse": 0.30859375, "mean_cosine_similarity": 0.9140625, "std_cosine_similarity": 0.1806640625, "mean_l2_distance": 15.8125, "std_l2_distance": 14.75, "mean_dimension_correlation": 0.8988906860351562, "std_dimension_correlation": 0.038209415172214704, "linear_cka": 0.9921875 }, "Qwen3_1.7B_layer_4_to_Qwen3_4B_layer_4": { "mse": 0.298828125, "mean_cosine_similarity": 0.91796875, "std_cosine_similarity": 0.1748046875, "mean_l2_distance": 15.3125, "std_l2_distance": 14.4375, "mean_dimension_correlation": 0.9034744262695312, "std_dimension_correlation": 0.03725401269453125, "linear_cka": 0.9921875 }, "Qwen3_4B_layer_2_to_Qwen3_0.6B_layer_2": { "mse": 0.380859375, "mean_cosine_similarity": 0.890625, "std_cosine_similarity": 0.1787109375, "mean_l2_distance": 19.625, "std_l2_distance": 13.6875, "mean_dimension_correlation": 0.8785064697265625, "std_dimension_correlation": 0.0393290152752092, "linear_cka": 0.96875 }, "Qwen3_4B_layer_2_to_Qwen3_0.6B_layer_4": { "mse": 0.314453125, "mean_cosine_similarity": 0.91015625, "std_cosine_similarity": 0.18359375, "mean_l2_distance": 16.125, "std_l2_distance": 14.75, "mean_dimension_correlation": 0.8960662841796875, "std_dimension_correlation": 0.039119882279186446, "linear_cka": 0.984375 }, "Qwen3_4B_layer_2_to_Qwen3_1.7B_layer_2": { "mse": 0.306640625, "mean_cosine_similarity": 0.91015625, "std_cosine_similarity": 0.1826171875, "mean_l2_distance": 15.6875, "std_l2_distance": 14.9375, "mean_dimension_correlation": 0.8979080200195313, "std_dimension_correlation": 0.04092076296017269, "linear_cka": 0.984375 }, "Qwen3_4B_layer_2_to_Qwen3_1.7B_layer_4": { "mse": 0.30859375, "mean_cosine_similarity": 0.9140625, "std_cosine_similarity": 0.1806640625, "mean_l2_distance": 15.8125, "std_l2_distance": 14.75, "mean_dimension_correlation": 0.8988739013671875, "std_dimension_correlation": 0.03830457081641783, "linear_cka": 0.9921875 }, "Qwen3_4B_layer_2_to_Qwen3_4B_layer_4": { "mse": 0.29296875, "mean_cosine_similarity": 0.9140625, "std_cosine_similarity": 0.1845703125, "mean_l2_distance": 15.0, "std_l2_distance": 15.1875, "mean_dimension_correlation": 0.9007888793945312, "std_dimension_correlation": 0.03971286220373166, "linear_cka": 0.9921875 }, "Qwen3_4B_layer_4_to_Qwen3_0.6B_layer_2": { "mse": 0.37890625, "mean_cosine_similarity": 0.89453125, "std_cosine_similarity": 0.177734375, "mean_l2_distance": 19.5, "std_l2_distance": 13.5625, "mean_dimension_correlation": 0.8801483154296875, "std_dimension_correlation": 0.038289717180632434, "linear_cka": 0.96875 }, "Qwen3_4B_layer_4_to_Qwen3_0.6B_layer_4": { "mse": 0.3125, "mean_cosine_similarity": 0.91015625, "std_cosine_similarity": 0.1826171875, "mean_l2_distance": 15.9375, "std_l2_distance": 14.75, "mean_dimension_correlation": 0.8973480224609375, "std_dimension_correlation": 0.0400727561743654, "linear_cka": 0.984375 }, "Qwen3_4B_layer_4_to_Qwen3_1.7B_layer_2": { "mse": 0.296875, "mean_cosine_similarity": 0.91796875, "std_cosine_similarity": 0.1689453125, "mean_l2_distance": 15.1875, "std_l2_distance": 14.1875, "mean_dimension_correlation": 0.9055801391601562, "std_dimension_correlation": 0.0376429470480909, "linear_cka": 0.984375 }, "Qwen3_4B_layer_4_to_Qwen3_1.7B_layer_4": { "mse": 0.298828125, "mean_cosine_similarity": 0.91796875, "std_cosine_similarity": 0.1748046875, "mean_l2_distance": 15.3125, "std_l2_distance": 14.4375, "mean_dimension_correlation": 0.9034759521484375, "std_dimension_correlation": 0.03727643893244147, "linear_cka": 0.9921875 }, "Qwen3_4B_layer_4_to_Qwen3_4B_layer_2": { "mse": 0.29296875, "mean_cosine_similarity": 0.9140625, "std_cosine_similarity": 0.1845703125, "mean_l2_distance": 15.0, "std_l2_distance": 15.1875, "mean_dimension_correlation": 0.9008010864257813, "std_dimension_correlation": 0.03971378852358426, "linear_cka": 0.9921875 }, "avg_mse": 0.33059895833333336, "std_mse": 0.0360100791855751, "avg_mean_cosine_similarity": 0.9067708333333333, "std_mean_cosine_similarity": 0.010072437294694645, "avg_std_cosine_similarity": 0.17786458333333333, "std_std_cosine_similarity": 0.0048221052570680215, "avg_mean_l2_distance": 16.933333333333334, "std_mean_l2_distance": 1.8555191696365978, "avg_std_l2_distance": 14.270833333333334, "std_std_l2_distance": 0.5816941254263752, "avg_mean_dimension_correlation": 0.8933457438151041, "std_mean_dimension_correlation": 0.009972265018242275, "avg_std_dimension_correlation": 0.0387960870501666, "std_std_dimension_correlation": 0.0011301372196425136, "avg_linear_cka": 0.9807291666666667, "std_linear_cka": 0.008960755486502733 } } }