Commit
8061c39
·
verified ·
1 Parent(s): ef46d74

Upload folder using huggingface_hub

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +27 -0
  2. checkpoint_tokens_1000448.pt +3 -0
  3. checkpoint_tokens_2000896.pt +3 -0
  4. checkpoint_tokens_3001344.pt +3 -0
  5. checkpoint_tokens_4001792.pt +3 -0
  6. checkpoint_tokens_5002240.pt +3 -0
  7. checkpoint_tokens_6002688.pt +3 -0
  8. checkpoint_tokens_7003136.pt +3 -0
  9. checkpoint_tokens_8003584.pt +3 -0
  10. checkpoint_tokens_9004032.pt +3 -0
  11. evaluation/metrics.json +404 -0
  12. evaluation/metrics_tokens_1000448.json +433 -0
  13. evaluation/metrics_tokens_2000896.json +433 -0
  14. evaluation/metrics_tokens_3001344.json +433 -0
  15. evaluation/metrics_tokens_4001792.json +433 -0
  16. evaluation/metrics_tokens_5002240.json +433 -0
  17. evaluation/metrics_tokens_6002688.json +433 -0
  18. evaluation/metrics_tokens_7003136.json +433 -0
  19. evaluation/metrics_tokens_8003584.json +433 -0
  20. evaluation/metrics_tokens_9004032.json +433 -0
  21. evaluation/plots/kl_divergences_step_1954_tokens_2000896.png +3 -0
  22. evaluation/plots/kl_divergences_step_2931_tokens_3001344.png +3 -0
  23. evaluation/plots/kl_divergences_step_3908_tokens_4001792.png +3 -0
  24. evaluation/plots/kl_divergences_step_4885_tokens_5002240.png +3 -0
  25. evaluation/plots/kl_divergences_step_5862_tokens_6002688.png +3 -0
  26. evaluation/plots/kl_divergences_step_6839_tokens_7003136.png +3 -0
  27. evaluation/plots/kl_divergences_step_7816_tokens_8003584.png +3 -0
  28. evaluation/plots/kl_divergences_step_8793_tokens_9004032.png +3 -0
  29. evaluation/plots/kl_divergences_step_977_tokens_1000448.png +3 -0
  30. evaluation/plots/mae_hidden_states_step_1954_tokens_2000896.png +3 -0
  31. evaluation/plots/mae_hidden_states_step_2931_tokens_3001344.png +3 -0
  32. evaluation/plots/mae_hidden_states_step_3908_tokens_4001792.png +3 -0
  33. evaluation/plots/mae_hidden_states_step_4885_tokens_5002240.png +3 -0
  34. evaluation/plots/mae_hidden_states_step_5862_tokens_6002688.png +3 -0
  35. evaluation/plots/mae_hidden_states_step_6839_tokens_7003136.png +3 -0
  36. evaluation/plots/mae_hidden_states_step_7816_tokens_8003584.png +3 -0
  37. evaluation/plots/mae_hidden_states_step_8793_tokens_9004032.png +3 -0
  38. evaluation/plots/mae_hidden_states_step_977_tokens_1000448.png +3 -0
  39. evaluation/plots/multi_dataset_alignment_step_1954_tokens_2000896.png +3 -0
  40. evaluation/plots/multi_dataset_alignment_step_2931_tokens_3001344.png +3 -0
  41. evaluation/plots/multi_dataset_alignment_step_3908_tokens_4001792.png +3 -0
  42. evaluation/plots/multi_dataset_alignment_step_4885_tokens_5002240.png +3 -0
  43. evaluation/plots/multi_dataset_alignment_step_5862_tokens_6002688.png +3 -0
  44. evaluation/plots/multi_dataset_alignment_step_6839_tokens_7003136.png +3 -0
  45. evaluation/plots/multi_dataset_alignment_step_7816_tokens_8003584.png +3 -0
  46. evaluation/plots/multi_dataset_alignment_step_8793_tokens_9004032.png +3 -0
  47. evaluation/plots/multi_dataset_alignment_step_977_tokens_1000448.png +3 -0
  48. metrics_tokens_1000448.json +24 -0
  49. metrics_tokens_2000896.json +24 -0
  50. metrics_tokens_3001344.json +24 -0
.gitattributes CHANGED
@@ -33,3 +33,30 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ evaluation/plots/kl_divergences_step_1954_tokens_2000896.png filter=lfs diff=lfs merge=lfs -text
37
+ evaluation/plots/kl_divergences_step_2931_tokens_3001344.png filter=lfs diff=lfs merge=lfs -text
38
+ evaluation/plots/kl_divergences_step_3908_tokens_4001792.png filter=lfs diff=lfs merge=lfs -text
39
+ evaluation/plots/kl_divergences_step_4885_tokens_5002240.png filter=lfs diff=lfs merge=lfs -text
40
+ evaluation/plots/kl_divergences_step_5862_tokens_6002688.png filter=lfs diff=lfs merge=lfs -text
41
+ evaluation/plots/kl_divergences_step_6839_tokens_7003136.png filter=lfs diff=lfs merge=lfs -text
42
+ evaluation/plots/kl_divergences_step_7816_tokens_8003584.png filter=lfs diff=lfs merge=lfs -text
43
+ evaluation/plots/kl_divergences_step_8793_tokens_9004032.png filter=lfs diff=lfs merge=lfs -text
44
+ evaluation/plots/kl_divergences_step_977_tokens_1000448.png filter=lfs diff=lfs merge=lfs -text
45
+ evaluation/plots/mae_hidden_states_step_1954_tokens_2000896.png filter=lfs diff=lfs merge=lfs -text
46
+ evaluation/plots/mae_hidden_states_step_2931_tokens_3001344.png filter=lfs diff=lfs merge=lfs -text
47
+ evaluation/plots/mae_hidden_states_step_3908_tokens_4001792.png filter=lfs diff=lfs merge=lfs -text
48
+ evaluation/plots/mae_hidden_states_step_4885_tokens_5002240.png filter=lfs diff=lfs merge=lfs -text
49
+ evaluation/plots/mae_hidden_states_step_5862_tokens_6002688.png filter=lfs diff=lfs merge=lfs -text
50
+ evaluation/plots/mae_hidden_states_step_6839_tokens_7003136.png filter=lfs diff=lfs merge=lfs -text
51
+ evaluation/plots/mae_hidden_states_step_7816_tokens_8003584.png filter=lfs diff=lfs merge=lfs -text
52
+ evaluation/plots/mae_hidden_states_step_8793_tokens_9004032.png filter=lfs diff=lfs merge=lfs -text
53
+ evaluation/plots/mae_hidden_states_step_977_tokens_1000448.png filter=lfs diff=lfs merge=lfs -text
54
+ evaluation/plots/multi_dataset_alignment_step_1954_tokens_2000896.png filter=lfs diff=lfs merge=lfs -text
55
+ evaluation/plots/multi_dataset_alignment_step_2931_tokens_3001344.png filter=lfs diff=lfs merge=lfs -text
56
+ evaluation/plots/multi_dataset_alignment_step_3908_tokens_4001792.png filter=lfs diff=lfs merge=lfs -text
57
+ evaluation/plots/multi_dataset_alignment_step_4885_tokens_5002240.png filter=lfs diff=lfs merge=lfs -text
58
+ evaluation/plots/multi_dataset_alignment_step_5862_tokens_6002688.png filter=lfs diff=lfs merge=lfs -text
59
+ evaluation/plots/multi_dataset_alignment_step_6839_tokens_7003136.png filter=lfs diff=lfs merge=lfs -text
60
+ evaluation/plots/multi_dataset_alignment_step_7816_tokens_8003584.png filter=lfs diff=lfs merge=lfs -text
61
+ evaluation/plots/multi_dataset_alignment_step_8793_tokens_9004032.png filter=lfs diff=lfs merge=lfs -text
62
+ evaluation/plots/multi_dataset_alignment_step_977_tokens_1000448.png filter=lfs diff=lfs merge=lfs -text
checkpoint_tokens_1000448.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:453c2d47e1d2491cfae40e414c0fd4a8cc8084a134bdf62c821738e55bd7779b
3
+ size 5559210371
checkpoint_tokens_2000896.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:01a142575fb2cd4748c8acf842ac4e80b2a9dda0795dc62e567d0cd8dcef92cd
3
+ size 5559210371
checkpoint_tokens_3001344.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:62d4fc045c1ea757d769ad61629ba6c4da447c8f9904795d5baae9e055c5ad1f
3
+ size 5559210371
checkpoint_tokens_4001792.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:39f094260f810c1a31195dc3ceb2728f4c4fc1523cddddf6e103a0816f210187
3
+ size 5559210371
checkpoint_tokens_5002240.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9cddc6b26bb6c2790c7a7a53416c5bd57a960e99d4c45f5f7b251733788e75f9
3
+ size 5559210371
checkpoint_tokens_6002688.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0581e9e4ff1d9e165bfb5e961d61ea249458419912b27cfa440b74e21e23d08a
3
+ size 5559210371
checkpoint_tokens_7003136.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fde3154cede539bbb9c2c0cbb5c2771d247f455ee91547d8f200d9ad2e862cc5
3
+ size 5559210371
checkpoint_tokens_8003584.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a1f49201248a6845238afe71798950191df59c1d6c7e9730f96de728fa2e46f8
3
+ size 5559210371
checkpoint_tokens_9004032.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3059b5873c7f8ddb7438bb51b05b2abf11a007981a222e870b545c93f7e28a10
3
+ size 5559210371
evaluation/metrics.json ADDED
@@ -0,0 +1,404 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "global_step": 8793,
3
+ "n_tokens": 9004032,
4
+ "kl_divergence": {
5
+ "Qwen3_0.6B_layer_2_to_Qwen3_0.6B_layer_2": 6.006870746612549,
6
+ "Qwen3_0.6B_layer_2_to_Qwen3_0.6B_layer_4": 6.394875526428223,
7
+ "Qwen3_0.6B_layer_2_to_Qwen3_1.7B_layer_2": 5.912027359008789,
8
+ "Qwen3_0.6B_layer_2_to_Qwen3_1.7B_layer_4": 6.115749359130859,
9
+ "Qwen3_0.6B_layer_2_to_Qwen3_4B_layer_2": 6.151121616363525,
10
+ "Qwen3_0.6B_layer_2_to_Qwen3_4B_layer_4": 6.175347805023193,
11
+ "Qwen3_0.6B_layer_2_to_uniform": 9.070773124694824,
12
+ "Qwen3_0.6B_layer_4_to_Qwen3_0.6B_layer_2": 2.124427318572998,
13
+ "Qwen3_0.6B_layer_4_to_Qwen3_0.6B_layer_4": 2.121898651123047,
14
+ "Qwen3_0.6B_layer_4_to_Qwen3_1.7B_layer_2": 2.1416893005371094,
15
+ "Qwen3_0.6B_layer_4_to_Qwen3_1.7B_layer_4": 2.1991913318634033,
16
+ "Qwen3_0.6B_layer_4_to_Qwen3_4B_layer_2": 2.14923357963562,
17
+ "Qwen3_0.6B_layer_4_to_Qwen3_4B_layer_4": 2.216580390930176,
18
+ "Qwen3_0.6B_layer_4_to_uniform": 9.070773124694824,
19
+ "Qwen3_1.7B_layer_2_to_Qwen3_0.6B_layer_2": 5.9164533615112305,
20
+ "Qwen3_1.7B_layer_2_to_Qwen3_0.6B_layer_4": 5.773134708404541,
21
+ "Qwen3_1.7B_layer_2_to_Qwen3_1.7B_layer_2": 6.090576171875,
22
+ "Qwen3_1.7B_layer_2_to_Qwen3_1.7B_layer_4": 5.982679843902588,
23
+ "Qwen3_1.7B_layer_2_to_Qwen3_4B_layer_2": 6.148589134216309,
24
+ "Qwen3_1.7B_layer_2_to_Qwen3_4B_layer_4": 6.272560119628906,
25
+ "Qwen3_1.7B_layer_2_to_uniform": 9.88111686706543,
26
+ "Qwen3_1.7B_layer_4_to_Qwen3_0.6B_layer_2": 2.3844406604766846,
27
+ "Qwen3_1.7B_layer_4_to_Qwen3_0.6B_layer_4": 2.425341844558716,
28
+ "Qwen3_1.7B_layer_4_to_Qwen3_1.7B_layer_2": 2.334113597869873,
29
+ "Qwen3_1.7B_layer_4_to_Qwen3_1.7B_layer_4": 2.3682360649108887,
30
+ "Qwen3_1.7B_layer_4_to_Qwen3_4B_layer_2": 2.3439788818359375,
31
+ "Qwen3_1.7B_layer_4_to_Qwen3_4B_layer_4": 2.5122714042663574,
32
+ "Qwen3_1.7B_layer_4_to_uniform": 9.88111686706543,
33
+ "Qwen3_4B_layer_2_to_Qwen3_0.6B_layer_2": 2.4646520614624023,
34
+ "Qwen3_4B_layer_2_to_Qwen3_0.6B_layer_4": 2.1960129737854004,
35
+ "Qwen3_4B_layer_2_to_Qwen3_1.7B_layer_2": 1.9887456893920898,
36
+ "Qwen3_4B_layer_2_to_Qwen3_1.7B_layer_4": 2.074134111404419,
37
+ "Qwen3_4B_layer_2_to_Qwen3_4B_layer_2": 2.142500638961792,
38
+ "Qwen3_4B_layer_2_to_Qwen3_4B_layer_4": 1.9460818767547607,
39
+ "Qwen3_4B_layer_2_to_uniform": 10.104096412658691,
40
+ "Qwen3_4B_layer_4_to_Qwen3_0.6B_layer_2": 3.4264042377471924,
41
+ "Qwen3_4B_layer_4_to_Qwen3_0.6B_layer_4": 3.065612554550171,
42
+ "Qwen3_4B_layer_4_to_Qwen3_1.7B_layer_2": 2.84149169921875,
43
+ "Qwen3_4B_layer_4_to_Qwen3_1.7B_layer_4": 3.0016493797302246,
44
+ "Qwen3_4B_layer_4_to_Qwen3_4B_layer_2": 2.982909679412842,
45
+ "Qwen3_4B_layer_4_to_Qwen3_4B_layer_4": 2.7882883548736572,
46
+ "Qwen3_4B_layer_4_to_uniform": 10.104096412658691
47
+ },
48
+ "mae_hidden_states": {
49
+ "Qwen3_0.6B_layer_2_to_Qwen3_0.6B_layer_2": 1.144361138343811,
50
+ "Qwen3_0.6B_layer_4_to_Qwen3_0.6B_layer_2": 1.1407876014709473,
51
+ "Qwen3_1.7B_layer_2_to_Qwen3_0.6B_layer_2": 1.1702628135681152,
52
+ "Qwen3_1.7B_layer_4_to_Qwen3_0.6B_layer_2": 1.169557809829712,
53
+ "Qwen3_4B_layer_2_to_Qwen3_0.6B_layer_2": 1.164795160293579,
54
+ "Qwen3_4B_layer_4_to_Qwen3_0.6B_layer_2": 1.165663480758667,
55
+ "Qwen3_0.6B_layer_2_to_Qwen3_0.6B_layer_4": 0.9478356242179871,
56
+ "Qwen3_0.6B_layer_4_to_Qwen3_0.6B_layer_4": 0.9305350184440613,
57
+ "Qwen3_1.7B_layer_2_to_Qwen3_0.6B_layer_4": 0.9448918104171753,
58
+ "Qwen3_1.7B_layer_4_to_Qwen3_0.6B_layer_4": 0.9919092059135437,
59
+ "Qwen3_4B_layer_2_to_Qwen3_0.6B_layer_4": 0.9386879801750183,
60
+ "Qwen3_4B_layer_4_to_Qwen3_0.6B_layer_4": 0.9315637350082397,
61
+ "Qwen3_0.6B_layer_2_to_Qwen3_1.7B_layer_2": 0.9601666331291199,
62
+ "Qwen3_0.6B_layer_4_to_Qwen3_1.7B_layer_2": 0.8851673007011414,
63
+ "Qwen3_1.7B_layer_2_to_Qwen3_1.7B_layer_2": 0.8906123042106628,
64
+ "Qwen3_1.7B_layer_4_to_Qwen3_1.7B_layer_2": 0.8979656100273132,
65
+ "Qwen3_4B_layer_2_to_Qwen3_1.7B_layer_2": 0.8988674283027649,
66
+ "Qwen3_4B_layer_4_to_Qwen3_1.7B_layer_2": 0.900534451007843,
67
+ "Qwen3_0.6B_layer_2_to_Qwen3_1.7B_layer_4": 1.154961109161377,
68
+ "Qwen3_0.6B_layer_4_to_Qwen3_1.7B_layer_4": 1.1417714357376099,
69
+ "Qwen3_1.7B_layer_2_to_Qwen3_1.7B_layer_4": 1.147143840789795,
70
+ "Qwen3_1.7B_layer_4_to_Qwen3_1.7B_layer_4": 1.1556771993637085,
71
+ "Qwen3_4B_layer_2_to_Qwen3_1.7B_layer_4": 1.14786696434021,
72
+ "Qwen3_4B_layer_4_to_Qwen3_1.7B_layer_4": 1.148809790611267,
73
+ "Qwen3_0.6B_layer_2_to_Qwen3_4B_layer_2": 0.9560009837150574,
74
+ "Qwen3_0.6B_layer_4_to_Qwen3_4B_layer_2": 0.9207914471626282,
75
+ "Qwen3_1.7B_layer_2_to_Qwen3_4B_layer_2": 0.9233508110046387,
76
+ "Qwen3_1.7B_layer_4_to_Qwen3_4B_layer_2": 0.93439781665802,
77
+ "Qwen3_4B_layer_2_to_Qwen3_4B_layer_2": 0.894271194934845,
78
+ "Qwen3_4B_layer_4_to_Qwen3_4B_layer_2": 0.9094542264938354,
79
+ "Qwen3_0.6B_layer_2_to_Qwen3_4B_layer_4": 1.068742275238037,
80
+ "Qwen3_0.6B_layer_4_to_Qwen3_4B_layer_4": 1.0256458520889282,
81
+ "Qwen3_1.7B_layer_2_to_Qwen3_4B_layer_4": 1.0398327112197876,
82
+ "Qwen3_1.7B_layer_4_to_Qwen3_4B_layer_4": 1.056915283203125,
83
+ "Qwen3_4B_layer_2_to_Qwen3_4B_layer_4": 1.0318653583526611,
84
+ "Qwen3_4B_layer_4_to_Qwen3_4B_layer_4": 1.016662359237671
85
+ },
86
+ "alignment": {
87
+ "Qwen3_0.6B_layer_2_to_Qwen3_0.6B_layer_4": {
88
+ "mse": 0.328125,
89
+ "mean_cosine_similarity": 0.921875,
90
+ "std_cosine_similarity": 0.1474609375,
91
+ "mean_l2_distance": 16.875,
92
+ "std_l2_distance": 11.6875,
93
+ "mean_dimension_correlation": 0.911785888671875,
94
+ "std_dimension_correlation": 0.029143728520497736,
95
+ "linear_cka": 0.97265625
96
+ },
97
+ "Qwen3_0.6B_layer_2_to_Qwen3_1.7B_layer_2": {
98
+ "mse": 0.333984375,
99
+ "mean_cosine_similarity": 0.91796875,
100
+ "std_cosine_similarity": 0.1533203125,
101
+ "mean_l2_distance": 17.0,
102
+ "std_l2_distance": 12.0,
103
+ "mean_dimension_correlation": 0.908795166015625,
104
+ "std_dimension_correlation": 0.030533706065498583,
105
+ "linear_cka": 0.97265625
106
+ },
107
+ "Qwen3_0.6B_layer_2_to_Qwen3_1.7B_layer_4": {
108
+ "mse": 0.333984375,
109
+ "mean_cosine_similarity": 0.921875,
110
+ "std_cosine_similarity": 0.1455078125,
111
+ "mean_l2_distance": 17.0,
112
+ "std_l2_distance": 11.4375,
113
+ "mean_dimension_correlation": 0.9114944458007812,
114
+ "std_dimension_correlation": 0.02791911734622361,
115
+ "linear_cka": 0.97265625
116
+ },
117
+ "Qwen3_0.6B_layer_2_to_Qwen3_4B_layer_2": {
118
+ "mse": 0.333984375,
119
+ "mean_cosine_similarity": 0.91796875,
120
+ "std_cosine_similarity": 0.150390625,
121
+ "mean_l2_distance": 17.0,
122
+ "std_l2_distance": 11.8125,
123
+ "mean_dimension_correlation": 0.910113525390625,
124
+ "std_dimension_correlation": 0.029502623650495614,
125
+ "linear_cka": 0.97265625
126
+ },
127
+ "Qwen3_0.6B_layer_2_to_Qwen3_4B_layer_4": {
128
+ "mse": 0.330078125,
129
+ "mean_cosine_similarity": 0.921875,
130
+ "std_cosine_similarity": 0.1513671875,
131
+ "mean_l2_distance": 17.0,
132
+ "std_l2_distance": 11.75,
133
+ "mean_dimension_correlation": 0.910638427734375,
134
+ "std_dimension_correlation": 0.029250348976490634,
135
+ "linear_cka": 0.97265625
136
+ },
137
+ "Qwen3_0.6B_layer_4_to_Qwen3_0.6B_layer_2": {
138
+ "mse": 0.328125,
139
+ "mean_cosine_similarity": 0.921875,
140
+ "std_cosine_similarity": 0.1474609375,
141
+ "mean_l2_distance": 16.875,
142
+ "std_l2_distance": 11.6875,
143
+ "mean_dimension_correlation": 0.9118026733398438,
144
+ "std_dimension_correlation": 0.02911178290188815,
145
+ "linear_cka": 0.97265625
146
+ },
147
+ "Qwen3_0.6B_layer_4_to_Qwen3_1.7B_layer_2": {
148
+ "mse": 0.259765625,
149
+ "mean_cosine_similarity": 0.9375,
150
+ "std_cosine_similarity": 0.1484375,
151
+ "mean_l2_distance": 13.3125,
152
+ "std_l2_distance": 12.4375,
153
+ "mean_dimension_correlation": 0.9281707763671875,
154
+ "std_dimension_correlation": 0.027613594267907524,
155
+ "linear_cka": 0.984375
156
+ },
157
+ "Qwen3_0.6B_layer_4_to_Qwen3_1.7B_layer_4": {
158
+ "mse": 0.25390625,
159
+ "mean_cosine_similarity": 0.94140625,
160
+ "std_cosine_similarity": 0.1474609375,
161
+ "mean_l2_distance": 13.0625,
162
+ "std_l2_distance": 12.375,
163
+ "mean_dimension_correlation": 0.929296875,
164
+ "std_dimension_correlation": 0.027428457660098507,
165
+ "linear_cka": 0.984375
166
+ },
167
+ "Qwen3_0.6B_layer_4_to_Qwen3_4B_layer_2": {
168
+ "mse": 0.26171875,
169
+ "mean_cosine_similarity": 0.9375,
170
+ "std_cosine_similarity": 0.1572265625,
171
+ "mean_l2_distance": 13.375,
172
+ "std_l2_distance": 12.8125,
173
+ "mean_dimension_correlation": 0.9258895874023437,
174
+ "std_dimension_correlation": 0.02807925327640673,
175
+ "linear_cka": 0.984375
176
+ },
177
+ "Qwen3_0.6B_layer_4_to_Qwen3_4B_layer_4": {
178
+ "mse": 0.2578125,
179
+ "mean_cosine_similarity": 0.9375,
180
+ "std_cosine_similarity": 0.15625,
181
+ "mean_l2_distance": 13.1875,
182
+ "std_l2_distance": 12.875,
183
+ "mean_dimension_correlation": 0.9261764526367188,
184
+ "std_dimension_correlation": 0.029308957111961503,
185
+ "linear_cka": 0.984375
186
+ },
187
+ "Qwen3_1.7B_layer_2_to_Qwen3_0.6B_layer_2": {
188
+ "mse": 0.333984375,
189
+ "mean_cosine_similarity": 0.91796875,
190
+ "std_cosine_similarity": 0.1533203125,
191
+ "mean_l2_distance": 17.0,
192
+ "std_l2_distance": 12.0,
193
+ "mean_dimension_correlation": 0.9088623046875,
194
+ "std_dimension_correlation": 0.030521200956836466,
195
+ "linear_cka": 0.97265625
196
+ },
197
+ "Qwen3_1.7B_layer_2_to_Qwen3_0.6B_layer_4": {
198
+ "mse": 0.259765625,
199
+ "mean_cosine_similarity": 0.9375,
200
+ "std_cosine_similarity": 0.1484375,
201
+ "mean_l2_distance": 13.3125,
202
+ "std_l2_distance": 12.4375,
203
+ "mean_dimension_correlation": 0.9282363891601563,
204
+ "std_dimension_correlation": 0.02761614875613791,
205
+ "linear_cka": 0.984375
206
+ },
207
+ "Qwen3_1.7B_layer_2_to_Qwen3_1.7B_layer_4": {
208
+ "mse": 0.248046875,
209
+ "mean_cosine_similarity": 0.9375,
210
+ "std_cosine_similarity": 0.15625,
211
+ "mean_l2_distance": 12.6875,
212
+ "std_l2_distance": 13.0,
213
+ "mean_dimension_correlation": 0.9286865234375,
214
+ "std_dimension_correlation": 0.028394499325967187,
215
+ "linear_cka": 1.0
216
+ },
217
+ "Qwen3_1.7B_layer_2_to_Qwen3_4B_layer_2": {
218
+ "mse": 0.251953125,
219
+ "mean_cosine_similarity": 0.9375,
220
+ "std_cosine_similarity": 0.1572265625,
221
+ "mean_l2_distance": 12.875,
222
+ "std_l2_distance": 13.0,
223
+ "mean_dimension_correlation": 0.9273910522460938,
224
+ "std_dimension_correlation": 0.029792982191153054,
225
+ "linear_cka": 1.0
226
+ },
227
+ "Qwen3_1.7B_layer_2_to_Qwen3_4B_layer_4": {
228
+ "mse": 0.2451171875,
229
+ "mean_cosine_similarity": 0.94140625,
230
+ "std_cosine_similarity": 0.146484375,
231
+ "mean_l2_distance": 12.5625,
232
+ "std_l2_distance": 12.4375,
233
+ "mean_dimension_correlation": 0.9318832397460938,
234
+ "std_dimension_correlation": 0.02779797256144542,
235
+ "linear_cka": 0.984375
236
+ },
237
+ "Qwen3_1.7B_layer_4_to_Qwen3_0.6B_layer_2": {
238
+ "mse": 0.333984375,
239
+ "mean_cosine_similarity": 0.921875,
240
+ "std_cosine_similarity": 0.1455078125,
241
+ "mean_l2_distance": 17.0,
242
+ "std_l2_distance": 11.4375,
243
+ "mean_dimension_correlation": 0.9115066528320312,
244
+ "std_dimension_correlation": 0.02783942438110558,
245
+ "linear_cka": 0.97265625
246
+ },
247
+ "Qwen3_1.7B_layer_4_to_Qwen3_0.6B_layer_4": {
248
+ "mse": 0.25390625,
249
+ "mean_cosine_similarity": 0.94140625,
250
+ "std_cosine_similarity": 0.1474609375,
251
+ "mean_l2_distance": 13.0625,
252
+ "std_l2_distance": 12.375,
253
+ "mean_dimension_correlation": 0.929364013671875,
254
+ "std_dimension_correlation": 0.027418246966595963,
255
+ "linear_cka": 0.984375
256
+ },
257
+ "Qwen3_1.7B_layer_4_to_Qwen3_1.7B_layer_2": {
258
+ "mse": 0.248046875,
259
+ "mean_cosine_similarity": 0.9375,
260
+ "std_cosine_similarity": 0.15625,
261
+ "mean_l2_distance": 12.6875,
262
+ "std_l2_distance": 13.0,
263
+ "mean_dimension_correlation": 0.9286041259765625,
264
+ "std_dimension_correlation": 0.028414978282929146,
265
+ "linear_cka": 1.0
266
+ },
267
+ "Qwen3_1.7B_layer_4_to_Qwen3_4B_layer_2": {
268
+ "mse": 0.25390625,
269
+ "mean_cosine_similarity": 0.9375,
270
+ "std_cosine_similarity": 0.154296875,
271
+ "mean_l2_distance": 13.0,
272
+ "std_l2_distance": 12.8125,
273
+ "mean_dimension_correlation": 0.927911376953125,
274
+ "std_dimension_correlation": 0.027630500633115,
275
+ "linear_cka": 0.984375
276
+ },
277
+ "Qwen3_1.7B_layer_4_to_Qwen3_4B_layer_4": {
278
+ "mse": 0.24609375,
279
+ "mean_cosine_similarity": 0.94140625,
280
+ "std_cosine_similarity": 0.1513671875,
281
+ "mean_l2_distance": 12.6875,
282
+ "std_l2_distance": 12.625,
283
+ "mean_dimension_correlation": 0.9304595947265625,
284
+ "std_dimension_correlation": 0.027486156310205404,
285
+ "linear_cka": 0.984375
286
+ },
287
+ "Qwen3_4B_layer_2_to_Qwen3_0.6B_layer_2": {
288
+ "mse": 0.333984375,
289
+ "mean_cosine_similarity": 0.91796875,
290
+ "std_cosine_similarity": 0.150390625,
291
+ "mean_l2_distance": 17.0,
292
+ "std_l2_distance": 11.8125,
293
+ "mean_dimension_correlation": 0.910162353515625,
294
+ "std_dimension_correlation": 0.029493359043523553,
295
+ "linear_cka": 0.97265625
296
+ },
297
+ "Qwen3_4B_layer_2_to_Qwen3_0.6B_layer_4": {
298
+ "mse": 0.26171875,
299
+ "mean_cosine_similarity": 0.9375,
300
+ "std_cosine_similarity": 0.1572265625,
301
+ "mean_l2_distance": 13.375,
302
+ "std_l2_distance": 12.8125,
303
+ "mean_dimension_correlation": 0.9259017944335938,
304
+ "std_dimension_correlation": 0.02807177297712858,
305
+ "linear_cka": 0.984375
306
+ },
307
+ "Qwen3_4B_layer_2_to_Qwen3_1.7B_layer_2": {
308
+ "mse": 0.251953125,
309
+ "mean_cosine_similarity": 0.9375,
310
+ "std_cosine_similarity": 0.1572265625,
311
+ "mean_l2_distance": 12.875,
312
+ "std_l2_distance": 13.0,
313
+ "mean_dimension_correlation": 0.9274307250976562,
314
+ "std_dimension_correlation": 0.029833198285711512,
315
+ "linear_cka": 1.0
316
+ },
317
+ "Qwen3_4B_layer_2_to_Qwen3_1.7B_layer_4": {
318
+ "mse": 0.25390625,
319
+ "mean_cosine_similarity": 0.9375,
320
+ "std_cosine_similarity": 0.154296875,
321
+ "mean_l2_distance": 13.0,
322
+ "std_l2_distance": 12.8125,
323
+ "mean_dimension_correlation": 0.9279266357421875,
324
+ "std_dimension_correlation": 0.027642045164903522,
325
+ "linear_cka": 0.984375
326
+ },
327
+ "Qwen3_4B_layer_2_to_Qwen3_4B_layer_4": {
328
+ "mse": 0.236328125,
329
+ "mean_cosine_similarity": 0.94140625,
330
+ "std_cosine_similarity": 0.158203125,
331
+ "mean_l2_distance": 12.125,
332
+ "std_l2_distance": 13.25,
333
+ "mean_dimension_correlation": 0.93009033203125,
334
+ "std_dimension_correlation": 0.029206890514525005,
335
+ "linear_cka": 1.0
336
+ },
337
+ "Qwen3_4B_layer_4_to_Qwen3_0.6B_layer_2": {
338
+ "mse": 0.330078125,
339
+ "mean_cosine_similarity": 0.921875,
340
+ "std_cosine_similarity": 0.1513671875,
341
+ "mean_l2_distance": 17.0,
342
+ "std_l2_distance": 11.75,
343
+ "mean_dimension_correlation": 0.910687255859375,
344
+ "std_dimension_correlation": 0.02925704219094372,
345
+ "linear_cka": 0.97265625
346
+ },
347
+ "Qwen3_4B_layer_4_to_Qwen3_0.6B_layer_4": {
348
+ "mse": 0.2578125,
349
+ "mean_cosine_similarity": 0.9375,
350
+ "std_cosine_similarity": 0.15625,
351
+ "mean_l2_distance": 13.1875,
352
+ "std_l2_distance": 12.875,
353
+ "mean_dimension_correlation": 0.9263031005859375,
354
+ "std_dimension_correlation": 0.0292820509917565,
355
+ "linear_cka": 0.984375
356
+ },
357
+ "Qwen3_4B_layer_4_to_Qwen3_1.7B_layer_2": {
358
+ "mse": 0.2451171875,
359
+ "mean_cosine_similarity": 0.94140625,
360
+ "std_cosine_similarity": 0.146484375,
361
+ "mean_l2_distance": 12.5625,
362
+ "std_l2_distance": 12.4375,
363
+ "mean_dimension_correlation": 0.9319442749023438,
364
+ "std_dimension_correlation": 0.02774844077379742,
365
+ "linear_cka": 0.984375
366
+ },
367
+ "Qwen3_4B_layer_4_to_Qwen3_1.7B_layer_4": {
368
+ "mse": 0.24609375,
369
+ "mean_cosine_similarity": 0.94140625,
370
+ "std_cosine_similarity": 0.1513671875,
371
+ "mean_l2_distance": 12.6875,
372
+ "std_l2_distance": 12.625,
373
+ "mean_dimension_correlation": 0.9305908203125,
374
+ "std_dimension_correlation": 0.027491914687628318,
375
+ "linear_cka": 0.984375
376
+ },
377
+ "Qwen3_4B_layer_4_to_Qwen3_4B_layer_2": {
378
+ "mse": 0.236328125,
379
+ "mean_cosine_similarity": 0.94140625,
380
+ "std_cosine_similarity": 0.158203125,
381
+ "mean_l2_distance": 12.125,
382
+ "std_l2_distance": 13.25,
383
+ "mean_dimension_correlation": 0.9300765991210938,
384
+ "std_dimension_correlation": 0.029167152542476576,
385
+ "linear_cka": 1.0
386
+ },
387
+ "avg_mse": 0.2783203125,
388
+ "std_mse": 0.03847375333442295,
389
+ "avg_mean_cosine_similarity": 0.9328125,
390
+ "std_mean_cosine_similarity": 0.009043622580304863,
391
+ "avg_std_cosine_similarity": 0.15208333333333332,
392
+ "std_std_cosine_similarity": 0.004211187924165684,
393
+ "avg_mean_l2_distance": 14.25,
394
+ "std_mean_l2_distance": 1.9497596005661826,
395
+ "avg_std_l2_distance": 12.420833333333333,
396
+ "std_std_l2_distance": 0.5426913385054979,
397
+ "avg_mean_dimension_correlation": 0.9226060994466144,
398
+ "std_mean_dimension_correlation": 0.00864781898546122,
399
+ "avg_std_dimension_correlation": 0.02859991824384515,
400
+ "std_std_dimension_correlation": 0.00095096984819575,
401
+ "avg_linear_cka": 0.98359375,
402
+ "std_linear_cka": 0.009695057535930357
403
+ }
404
+ }
evaluation/metrics_tokens_1000448.json ADDED
@@ -0,0 +1,433 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 1,
3
+ "n_tokens": 1000448,
4
+ "global_step": 977,
5
+ "training_metrics": {
6
+ "train/loss": 0.3984375,
7
+ "train/contrastive": 0.310546875,
8
+ "train/recons_loss": 0.68359375,
9
+ "train/balance_loss": 2.015625,
10
+ "train/balance_loss_contrastive": 1.0,
11
+ "train/balance_loss_recons": 1.015625,
12
+ "train/contrastive_std": 0.015625,
13
+ "train/recons_std": 0.0888671875,
14
+ "train/contrastive_min": 0.27734375,
15
+ "train/contrastive_max": 0.3359375,
16
+ "train/recons_min": 0.58203125,
17
+ "train/recons_max": 0.84375,
18
+ "train/Qwen3_0.6B_layer_2": 0.58203125,
19
+ "train/Qwen3_0.6B_layer_4": 0.69140625,
20
+ "train/Qwen3_1.7B_layer_2": 0.65234375,
21
+ "train/Qwen3_1.7B_layer_4": 0.84375,
22
+ "train/Qwen3_4B_layer_2": 0.63671875,
23
+ "train/Qwen3_4B_layer_4": 0.6953125,
24
+ "train/contrastives": null,
25
+ "train/epoch": 1,
26
+ "train/n_tokens": 1000448,
27
+ "train/step": 977
28
+ },
29
+ "eval_metrics": {
30
+ "global_step": 977,
31
+ "n_tokens": 1000448,
32
+ "kl_divergence": {
33
+ "Qwen3_0.6B_layer_2_to_Qwen3_0.6B_layer_2": 6.733099460601807,
34
+ "Qwen3_0.6B_layer_2_to_Qwen3_0.6B_layer_4": 7.016953945159912,
35
+ "Qwen3_0.6B_layer_2_to_Qwen3_1.7B_layer_2": 6.4851484298706055,
36
+ "Qwen3_0.6B_layer_2_to_Qwen3_1.7B_layer_4": 6.2205095291137695,
37
+ "Qwen3_0.6B_layer_2_to_Qwen3_4B_layer_2": 6.60541296005249,
38
+ "Qwen3_0.6B_layer_2_to_Qwen3_4B_layer_4": 6.584229946136475,
39
+ "Qwen3_0.6B_layer_2_to_uniform": 9.070773124694824,
40
+ "Qwen3_0.6B_layer_4_to_Qwen3_0.6B_layer_2": 3.0528693199157715,
41
+ "Qwen3_0.6B_layer_4_to_Qwen3_0.6B_layer_4": 3.0396716594696045,
42
+ "Qwen3_0.6B_layer_4_to_Qwen3_1.7B_layer_2": 3.070789337158203,
43
+ "Qwen3_0.6B_layer_4_to_Qwen3_1.7B_layer_4": 3.154045820236206,
44
+ "Qwen3_0.6B_layer_4_to_Qwen3_4B_layer_2": 3.1686503887176514,
45
+ "Qwen3_0.6B_layer_4_to_Qwen3_4B_layer_4": 3.2403674125671387,
46
+ "Qwen3_0.6B_layer_4_to_uniform": 9.070773124694824,
47
+ "Qwen3_1.7B_layer_2_to_Qwen3_0.6B_layer_2": 5.6706085205078125,
48
+ "Qwen3_1.7B_layer_2_to_Qwen3_0.6B_layer_4": 5.970404624938965,
49
+ "Qwen3_1.7B_layer_2_to_Qwen3_1.7B_layer_2": 6.1444926261901855,
50
+ "Qwen3_1.7B_layer_2_to_Qwen3_1.7B_layer_4": 5.856858253479004,
51
+ "Qwen3_1.7B_layer_2_to_Qwen3_4B_layer_2": 6.129464149475098,
52
+ "Qwen3_1.7B_layer_2_to_Qwen3_4B_layer_4": 5.904314994812012,
53
+ "Qwen3_1.7B_layer_2_to_uniform": 9.88111686706543,
54
+ "Qwen3_1.7B_layer_4_to_Qwen3_0.6B_layer_2": 3.781726360321045,
55
+ "Qwen3_1.7B_layer_4_to_Qwen3_0.6B_layer_4": 3.783668041229248,
56
+ "Qwen3_1.7B_layer_4_to_Qwen3_1.7B_layer_2": 3.837904453277588,
57
+ "Qwen3_1.7B_layer_4_to_Qwen3_1.7B_layer_4": 3.7967495918273926,
58
+ "Qwen3_1.7B_layer_4_to_Qwen3_4B_layer_2": 3.835503101348877,
59
+ "Qwen3_1.7B_layer_4_to_Qwen3_4B_layer_4": 3.970118761062622,
60
+ "Qwen3_1.7B_layer_4_to_uniform": 9.88111686706543,
61
+ "Qwen3_4B_layer_2_to_Qwen3_0.6B_layer_2": 3.5546975135803223,
62
+ "Qwen3_4B_layer_2_to_Qwen3_0.6B_layer_4": 3.486462116241455,
63
+ "Qwen3_4B_layer_2_to_Qwen3_1.7B_layer_2": 3.2824597358703613,
64
+ "Qwen3_4B_layer_2_to_Qwen3_1.7B_layer_4": 3.3095145225524902,
65
+ "Qwen3_4B_layer_2_to_Qwen3_4B_layer_2": 3.164924383163452,
66
+ "Qwen3_4B_layer_2_to_Qwen3_4B_layer_4": 3.2683002948760986,
67
+ "Qwen3_4B_layer_2_to_uniform": 10.104096412658691,
68
+ "Qwen3_4B_layer_4_to_Qwen3_0.6B_layer_2": 5.2000813484191895,
69
+ "Qwen3_4B_layer_4_to_Qwen3_0.6B_layer_4": 4.317432403564453,
70
+ "Qwen3_4B_layer_4_to_Qwen3_1.7B_layer_2": 4.843194007873535,
71
+ "Qwen3_4B_layer_4_to_Qwen3_1.7B_layer_4": 4.330343723297119,
72
+ "Qwen3_4B_layer_4_to_Qwen3_4B_layer_2": 4.750157356262207,
73
+ "Qwen3_4B_layer_4_to_Qwen3_4B_layer_4": 4.742369174957275,
74
+ "Qwen3_4B_layer_4_to_uniform": 10.104096412658691
75
+ },
76
+ "mae_hidden_states": {
77
+ "Qwen3_0.6B_layer_2_to_Qwen3_0.6B_layer_2": 1.140511155128479,
78
+ "Qwen3_0.6B_layer_4_to_Qwen3_0.6B_layer_2": 1.1872466802597046,
79
+ "Qwen3_1.7B_layer_2_to_Qwen3_0.6B_layer_2": 1.1824793815612793,
80
+ "Qwen3_1.7B_layer_4_to_Qwen3_0.6B_layer_2": 1.1870241165161133,
81
+ "Qwen3_4B_layer_2_to_Qwen3_0.6B_layer_2": 1.1805082559585571,
82
+ "Qwen3_4B_layer_4_to_Qwen3_0.6B_layer_2": 1.1967138051986694,
83
+ "Qwen3_0.6B_layer_2_to_Qwen3_0.6B_layer_4": 1.3885185718536377,
84
+ "Qwen3_0.6B_layer_4_to_Qwen3_0.6B_layer_4": 1.3479934930801392,
85
+ "Qwen3_1.7B_layer_2_to_Qwen3_0.6B_layer_4": 1.3934825658798218,
86
+ "Qwen3_1.7B_layer_4_to_Qwen3_0.6B_layer_4": 1.4097040891647339,
87
+ "Qwen3_4B_layer_2_to_Qwen3_0.6B_layer_4": 1.3858134746551514,
88
+ "Qwen3_4B_layer_4_to_Qwen3_0.6B_layer_4": 1.3670704364776611,
89
+ "Qwen3_0.6B_layer_2_to_Qwen3_1.7B_layer_2": 1.3714081048965454,
90
+ "Qwen3_0.6B_layer_4_to_Qwen3_1.7B_layer_2": 1.335449457168579,
91
+ "Qwen3_1.7B_layer_2_to_Qwen3_1.7B_layer_2": 1.274694561958313,
92
+ "Qwen3_1.7B_layer_4_to_Qwen3_1.7B_layer_2": 1.3633638620376587,
93
+ "Qwen3_4B_layer_2_to_Qwen3_1.7B_layer_2": 1.3621580600738525,
94
+ "Qwen3_4B_layer_4_to_Qwen3_1.7B_layer_2": 1.3528505563735962,
95
+ "Qwen3_0.6B_layer_2_to_Qwen3_1.7B_layer_4": 2.7587032318115234,
96
+ "Qwen3_0.6B_layer_4_to_Qwen3_1.7B_layer_4": 2.6864445209503174,
97
+ "Qwen3_1.7B_layer_2_to_Qwen3_1.7B_layer_4": 2.717672824859619,
98
+ "Qwen3_1.7B_layer_4_to_Qwen3_1.7B_layer_4": 2.55366587638855,
99
+ "Qwen3_4B_layer_2_to_Qwen3_1.7B_layer_4": 2.7207069396972656,
100
+ "Qwen3_4B_layer_4_to_Qwen3_1.7B_layer_4": 2.698686361312866,
101
+ "Qwen3_0.6B_layer_2_to_Qwen3_4B_layer_2": 1.694230079650879,
102
+ "Qwen3_0.6B_layer_4_to_Qwen3_4B_layer_2": 1.7634074687957764,
103
+ "Qwen3_1.7B_layer_2_to_Qwen3_4B_layer_2": 1.6623401641845703,
104
+ "Qwen3_1.7B_layer_4_to_Qwen3_4B_layer_2": 1.6645656824111938,
105
+ "Qwen3_4B_layer_2_to_Qwen3_4B_layer_2": 1.5185480117797852,
106
+ "Qwen3_4B_layer_4_to_Qwen3_4B_layer_2": 1.6429026126861572,
107
+ "Qwen3_0.6B_layer_2_to_Qwen3_4B_layer_4": 1.6580111980438232,
108
+ "Qwen3_0.6B_layer_4_to_Qwen3_4B_layer_4": 1.689562439918518,
109
+ "Qwen3_1.7B_layer_2_to_Qwen3_4B_layer_4": 1.6670496463775635,
110
+ "Qwen3_1.7B_layer_4_to_Qwen3_4B_layer_4": 1.674801230430603,
111
+ "Qwen3_4B_layer_2_to_Qwen3_4B_layer_4": 1.665151834487915,
112
+ "Qwen3_4B_layer_4_to_Qwen3_4B_layer_4": 1.5619523525238037
113
+ },
114
+ "alignment": {
115
+ "Qwen3_0.6B_layer_2_to_Qwen3_0.6B_layer_4": {
116
+ "mse": 1.2421875,
117
+ "mean_cosine_similarity": 0.2119140625,
118
+ "std_cosine_similarity": 0.05712890625,
119
+ "mean_l2_distance": 63.5,
120
+ "std_l2_distance": 2.28125,
121
+ "mean_dimension_correlation": 0.1991757216863334,
122
+ "std_dimension_correlation": 0.1752726942140706,
123
+ "linear_cka": 0.98046875
124
+ },
125
+ "Qwen3_0.6B_layer_2_to_Qwen3_1.7B_layer_2": {
126
+ "mse": 1.2734375,
127
+ "mean_cosine_similarity": 0.171875,
128
+ "std_cosine_similarity": 0.061279296875,
129
+ "mean_l2_distance": 65.0,
130
+ "std_l2_distance": 2.40625,
131
+ "mean_dimension_correlation": 0.1659273698925972,
132
+ "std_dimension_correlation": 0.1742108812082249,
133
+ "linear_cka": 0.98046875
134
+ },
135
+ "Qwen3_0.6B_layer_2_to_Qwen3_1.7B_layer_4": {
136
+ "mse": 1.2421875,
137
+ "mean_cosine_similarity": 0.212890625,
138
+ "std_cosine_similarity": 0.048583984375,
139
+ "mean_l2_distance": 63.5,
140
+ "std_l2_distance": 1.9296875,
141
+ "mean_dimension_correlation": 0.2012754407711327,
142
+ "std_dimension_correlation": 0.1694794786130474,
143
+ "linear_cka": 0.9765625
144
+ },
145
+ "Qwen3_0.6B_layer_2_to_Qwen3_4B_layer_2": {
146
+ "mse": 1.28125,
147
+ "mean_cosine_similarity": 0.1640625,
148
+ "std_cosine_similarity": 0.051513671875,
149
+ "mean_l2_distance": 65.5,
150
+ "std_l2_distance": 2.03125,
151
+ "mean_dimension_correlation": 0.15531851844862105,
152
+ "std_dimension_correlation": 0.17483813595973072,
153
+ "linear_cka": 0.97265625
154
+ },
155
+ "Qwen3_0.6B_layer_2_to_Qwen3_4B_layer_4": {
156
+ "mse": 1.25,
157
+ "mean_cosine_similarity": 0.203125,
158
+ "std_cosine_similarity": 0.0556640625,
159
+ "mean_l2_distance": 63.75,
160
+ "std_l2_distance": 2.21875,
161
+ "mean_dimension_correlation": 0.19491876736283303,
162
+ "std_dimension_correlation": 0.17604292602805996,
163
+ "linear_cka": 0.98046875
164
+ },
165
+ "Qwen3_0.6B_layer_4_to_Qwen3_0.6B_layer_2": {
166
+ "mse": 1.2421875,
167
+ "mean_cosine_similarity": 0.2119140625,
168
+ "std_cosine_similarity": 0.05712890625,
169
+ "mean_l2_distance": 63.5,
170
+ "std_l2_distance": 2.28125,
171
+ "mean_dimension_correlation": 0.19918374745175244,
172
+ "std_dimension_correlation": 0.1752806618143252,
173
+ "linear_cka": 0.98046875
174
+ },
175
+ "Qwen3_0.6B_layer_4_to_Qwen3_1.7B_layer_2": {
176
+ "mse": 1.25,
177
+ "mean_cosine_similarity": 0.201171875,
178
+ "std_cosine_similarity": 0.0576171875,
179
+ "mean_l2_distance": 64.0,
180
+ "std_l2_distance": 2.3125,
181
+ "mean_dimension_correlation": 0.19001486599445344,
182
+ "std_dimension_correlation": 0.17611495516941666,
183
+ "linear_cka": 0.98046875
184
+ },
185
+ "Qwen3_0.6B_layer_4_to_Qwen3_1.7B_layer_4": {
186
+ "mse": 1.2421875,
187
+ "mean_cosine_similarity": 0.216796875,
188
+ "std_cosine_similarity": 0.056884765625,
189
+ "mean_l2_distance": 63.25,
190
+ "std_l2_distance": 2.296875,
191
+ "mean_dimension_correlation": 0.19663777351379394,
192
+ "std_dimension_correlation": 0.1757753968790023,
193
+ "linear_cka": 0.9765625
194
+ },
195
+ "Qwen3_0.6B_layer_4_to_Qwen3_4B_layer_2": {
196
+ "mse": 1.28125,
197
+ "mean_cosine_similarity": 0.162109375,
198
+ "std_cosine_similarity": 0.05517578125,
199
+ "mean_l2_distance": 65.5,
200
+ "std_l2_distance": 2.15625,
201
+ "mean_dimension_correlation": 0.15222867031116039,
202
+ "std_dimension_correlation": 0.17404282759337691,
203
+ "linear_cka": 0.96875
204
+ },
205
+ "Qwen3_0.6B_layer_4_to_Qwen3_4B_layer_4": {
206
+ "mse": 1.28125,
207
+ "mean_cosine_similarity": 0.1728515625,
208
+ "std_cosine_similarity": 0.050048828125,
209
+ "mean_l2_distance": 65.0,
210
+ "std_l2_distance": 1.9609375,
211
+ "mean_dimension_correlation": 0.16632139831781387,
212
+ "std_dimension_correlation": 0.17677271492333846,
213
+ "linear_cka": 0.984375
214
+ },
215
+ "Qwen3_1.7B_layer_2_to_Qwen3_0.6B_layer_2": {
216
+ "mse": 1.2734375,
217
+ "mean_cosine_similarity": 0.171875,
218
+ "std_cosine_similarity": 0.061279296875,
219
+ "mean_l2_distance": 65.0,
220
+ "std_l2_distance": 2.40625,
221
+ "mean_dimension_correlation": 0.16591697484254836,
222
+ "std_dimension_correlation": 0.17420606946071834,
223
+ "linear_cka": 0.98046875
224
+ },
225
+ "Qwen3_1.7B_layer_2_to_Qwen3_0.6B_layer_4": {
226
+ "mse": 1.25,
227
+ "mean_cosine_similarity": 0.201171875,
228
+ "std_cosine_similarity": 0.0576171875,
229
+ "mean_l2_distance": 64.0,
230
+ "std_l2_distance": 2.3125,
231
+ "mean_dimension_correlation": 0.1900260180234909,
232
+ "std_dimension_correlation": 0.17610506497287914,
233
+ "linear_cka": 0.98046875
234
+ },
235
+ "Qwen3_1.7B_layer_2_to_Qwen3_1.7B_layer_4": {
236
+ "mse": 1.2734375,
237
+ "mean_cosine_similarity": 0.1796875,
238
+ "std_cosine_similarity": 0.058349609375,
239
+ "mean_l2_distance": 65.0,
240
+ "std_l2_distance": 2.3125,
241
+ "mean_dimension_correlation": 0.16930750142782927,
242
+ "std_dimension_correlation": 0.17924016132496154,
243
+ "linear_cka": 0.984375
244
+ },
245
+ "Qwen3_1.7B_layer_2_to_Qwen3_4B_layer_2": {
246
+ "mse": 1.28125,
247
+ "mean_cosine_similarity": 0.171875,
248
+ "std_cosine_similarity": 0.060791015625,
249
+ "mean_l2_distance": 65.0,
250
+ "std_l2_distance": 2.390625,
251
+ "mean_dimension_correlation": 0.15887711457908155,
252
+ "std_dimension_correlation": 0.1775669214564653,
253
+ "linear_cka": 0.97265625
254
+ },
255
+ "Qwen3_1.7B_layer_2_to_Qwen3_4B_layer_4": {
256
+ "mse": 1.2265625,
257
+ "mean_cosine_similarity": 0.2421875,
258
+ "std_cosine_similarity": 0.060302734375,
259
+ "mean_l2_distance": 62.25,
260
+ "std_l2_distance": 2.453125,
261
+ "mean_dimension_correlation": 0.22514247596263887,
262
+ "std_dimension_correlation": 0.17232934079449927,
263
+ "linear_cka": 0.98046875
264
+ },
265
+ "Qwen3_1.7B_layer_4_to_Qwen3_0.6B_layer_2": {
266
+ "mse": 1.2421875,
267
+ "mean_cosine_similarity": 0.212890625,
268
+ "std_cosine_similarity": 0.048583984375,
269
+ "mean_l2_distance": 63.5,
270
+ "std_l2_distance": 1.9296875,
271
+ "mean_dimension_correlation": 0.20127090597525238,
272
+ "std_dimension_correlation": 0.16948777576467297,
273
+ "linear_cka": 0.9765625
274
+ },
275
+ "Qwen3_1.7B_layer_4_to_Qwen3_0.6B_layer_4": {
276
+ "mse": 1.2421875,
277
+ "mean_cosine_similarity": 0.216796875,
278
+ "std_cosine_similarity": 0.056884765625,
279
+ "mean_l2_distance": 63.25,
280
+ "std_l2_distance": 2.296875,
281
+ "mean_dimension_correlation": 0.1966501235961914,
282
+ "std_dimension_correlation": 0.1757826105994209,
283
+ "linear_cka": 0.9765625
284
+ },
285
+ "Qwen3_1.7B_layer_4_to_Qwen3_1.7B_layer_2": {
286
+ "mse": 1.2734375,
287
+ "mean_cosine_similarity": 0.1796875,
288
+ "std_cosine_similarity": 0.058349609375,
289
+ "mean_l2_distance": 65.0,
290
+ "std_l2_distance": 2.3125,
291
+ "mean_dimension_correlation": 0.16931568142026662,
292
+ "std_dimension_correlation": 0.17925366106210552,
293
+ "linear_cka": 0.984375
294
+ },
295
+ "Qwen3_1.7B_layer_4_to_Qwen3_4B_layer_2": {
296
+ "mse": 1.2890625,
297
+ "mean_cosine_similarity": 0.15625,
298
+ "std_cosine_similarity": 0.048828125,
299
+ "mean_l2_distance": 65.5,
300
+ "std_l2_distance": 1.90625,
301
+ "mean_dimension_correlation": 0.1511871140450239,
302
+ "std_dimension_correlation": 0.17476462218981592,
303
+ "linear_cka": 0.97265625
304
+ },
305
+ "Qwen3_1.7B_layer_4_to_Qwen3_4B_layer_4": {
306
+ "mse": 1.234375,
307
+ "mean_cosine_similarity": 0.220703125,
308
+ "std_cosine_similarity": 0.052490234375,
309
+ "mean_l2_distance": 63.25,
310
+ "std_l2_distance": 2.09375,
311
+ "mean_dimension_correlation": 0.2026286849519238,
312
+ "std_dimension_correlation": 0.17765936039692512,
313
+ "linear_cka": 0.9765625
314
+ },
315
+ "Qwen3_4B_layer_2_to_Qwen3_0.6B_layer_2": {
316
+ "mse": 1.28125,
317
+ "mean_cosine_similarity": 0.1640625,
318
+ "std_cosine_similarity": 0.051513671875,
319
+ "mean_l2_distance": 65.5,
320
+ "std_l2_distance": 2.03125,
321
+ "mean_dimension_correlation": 0.15531704826280474,
322
+ "std_dimension_correlation": 0.1748249456034516,
323
+ "linear_cka": 0.97265625
324
+ },
325
+ "Qwen3_4B_layer_2_to_Qwen3_0.6B_layer_4": {
326
+ "mse": 1.28125,
327
+ "mean_cosine_similarity": 0.162109375,
328
+ "std_cosine_similarity": 0.05517578125,
329
+ "mean_l2_distance": 65.5,
330
+ "std_l2_distance": 2.15625,
331
+ "mean_dimension_correlation": 0.1522526470012963,
332
+ "std_dimension_correlation": 0.17404197310947275,
333
+ "linear_cka": 0.96875
334
+ },
335
+ "Qwen3_4B_layer_2_to_Qwen3_1.7B_layer_2": {
336
+ "mse": 1.28125,
337
+ "mean_cosine_similarity": 0.171875,
338
+ "std_cosine_similarity": 0.060791015625,
339
+ "mean_l2_distance": 65.0,
340
+ "std_l2_distance": 2.390625,
341
+ "mean_dimension_correlation": 0.15887909792363644,
342
+ "std_dimension_correlation": 0.1775497121320674,
343
+ "linear_cka": 0.97265625
344
+ },
345
+ "Qwen3_4B_layer_2_to_Qwen3_1.7B_layer_4": {
346
+ "mse": 1.2890625,
347
+ "mean_cosine_similarity": 0.15625,
348
+ "std_cosine_similarity": 0.048828125,
349
+ "mean_l2_distance": 65.5,
350
+ "std_l2_distance": 1.90625,
351
+ "mean_dimension_correlation": 0.15117020402103662,
352
+ "std_dimension_correlation": 0.17475352199577512,
353
+ "linear_cka": 0.97265625
354
+ },
355
+ "Qwen3_4B_layer_2_to_Qwen3_4B_layer_4": {
356
+ "mse": 1.2890625,
357
+ "mean_cosine_similarity": 0.1591796875,
358
+ "std_cosine_similarity": 0.052490234375,
359
+ "mean_l2_distance": 65.5,
360
+ "std_l2_distance": 2.046875,
361
+ "mean_dimension_correlation": 0.14662780333310366,
362
+ "std_dimension_correlation": 0.17160536584318073,
363
+ "linear_cka": 0.9765625
364
+ },
365
+ "Qwen3_4B_layer_4_to_Qwen3_0.6B_layer_2": {
366
+ "mse": 1.25,
367
+ "mean_cosine_similarity": 0.203125,
368
+ "std_cosine_similarity": 0.0556640625,
369
+ "mean_l2_distance": 63.75,
370
+ "std_l2_distance": 2.21875,
371
+ "mean_dimension_correlation": 0.19489949941635132,
372
+ "std_dimension_correlation": 0.17601193178810715,
373
+ "linear_cka": 0.98046875
374
+ },
375
+ "Qwen3_4B_layer_4_to_Qwen3_0.6B_layer_4": {
376
+ "mse": 1.28125,
377
+ "mean_cosine_similarity": 0.1728515625,
378
+ "std_cosine_similarity": 0.050048828125,
379
+ "mean_l2_distance": 65.0,
380
+ "std_l2_distance": 1.9609375,
381
+ "mean_dimension_correlation": 0.16633510272949933,
382
+ "std_dimension_correlation": 0.17675989044615031,
383
+ "linear_cka": 0.984375
384
+ },
385
+ "Qwen3_4B_layer_4_to_Qwen3_1.7B_layer_2": {
386
+ "mse": 1.2265625,
387
+ "mean_cosine_similarity": 0.2421875,
388
+ "std_cosine_similarity": 0.060302734375,
389
+ "mean_l2_distance": 62.25,
390
+ "std_l2_distance": 2.453125,
391
+ "mean_dimension_correlation": 0.22512691989541053,
392
+ "std_dimension_correlation": 0.17232222187158686,
393
+ "linear_cka": 0.98046875
394
+ },
395
+ "Qwen3_4B_layer_4_to_Qwen3_1.7B_layer_4": {
396
+ "mse": 1.234375,
397
+ "mean_cosine_similarity": 0.220703125,
398
+ "std_cosine_similarity": 0.052490234375,
399
+ "mean_l2_distance": 63.25,
400
+ "std_l2_distance": 2.09375,
401
+ "mean_dimension_correlation": 0.20264770851936192,
402
+ "std_dimension_correlation": 0.1776996694007629,
403
+ "linear_cka": 0.9765625
404
+ },
405
+ "Qwen3_4B_layer_4_to_Qwen3_4B_layer_2": {
406
+ "mse": 1.2890625,
407
+ "mean_cosine_similarity": 0.1591796875,
408
+ "std_cosine_similarity": 0.052490234375,
409
+ "mean_l2_distance": 65.5,
410
+ "std_l2_distance": 2.046875,
411
+ "mean_dimension_correlation": 0.14663737285882233,
412
+ "std_dimension_correlation": 0.17162838967231733,
413
+ "linear_cka": 0.9765625
414
+ },
415
+ "avg_mse": 1.2625,
416
+ "std_mse": 0.02111784888824301,
417
+ "avg_mean_cosine_similarity": 0.18977864583333334,
418
+ "std_mean_cosine_similarity": 0.026181266453470867,
419
+ "avg_std_cosine_similarity": 0.05514322916666667,
420
+ "std_std_cosine_similarity": 0.004130588740067507,
421
+ "avg_mean_l2_distance": 64.36666666666666,
422
+ "std_mean_l2_distance": 1.0241527663824812,
423
+ "avg_std_l2_distance": 2.1864583333333334,
424
+ "std_std_l2_distance": 0.1757810570986596,
425
+ "avg_mean_dimension_correlation": 0.17837394241786875,
426
+ "std_mean_dimension_correlation": 0.02338007192514464,
427
+ "avg_std_dimension_correlation": 0.17504746274293098,
428
+ "std_std_dimension_correlation": 0.0024286700198214582,
429
+ "avg_linear_cka": 0.9776041666666667,
430
+ "std_linear_cka": 0.004388619673529352
431
+ }
432
+ }
433
+ }
evaluation/metrics_tokens_2000896.json ADDED
@@ -0,0 +1,433 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 1,
3
+ "n_tokens": 2000896,
4
+ "global_step": 1954,
5
+ "training_metrics": {
6
+ "train/loss": 2.609375,
7
+ "train/contrastive": 2.5,
8
+ "train/recons_loss": 0.71484375,
9
+ "train/balance_loss": 3.671875,
10
+ "train/balance_loss_contrastive": 2.625,
11
+ "train/balance_loss_recons": 1.046875,
12
+ "train/contrastive_std": 3.234375,
13
+ "train/recons_std": 0.171875,
14
+ "train/contrastive_min": 0.224609375,
15
+ "train/contrastive_max": 7.0,
16
+ "train/recons_min": 0.5859375,
17
+ "train/recons_max": 1.046875,
18
+ "train/Qwen3_0.6B_layer_2": 1.046875,
19
+ "train/Qwen3_0.6B_layer_4": 0.63671875,
20
+ "train/Qwen3_1.7B_layer_2": 0.62109375,
21
+ "train/Qwen3_1.7B_layer_4": 0.75,
22
+ "train/Qwen3_4B_layer_2": 0.5859375,
23
+ "train/Qwen3_4B_layer_4": 0.65234375,
24
+ "train/contrastives": null,
25
+ "train/epoch": 1,
26
+ "train/n_tokens": 2000896,
27
+ "train/step": 1954
28
+ },
29
+ "eval_metrics": {
30
+ "global_step": 1954,
31
+ "n_tokens": 2000896,
32
+ "kl_divergence": {
33
+ "Qwen3_0.6B_layer_2_to_Qwen3_0.6B_layer_2": 11.937955856323242,
34
+ "Qwen3_0.6B_layer_2_to_Qwen3_0.6B_layer_4": 7.9807448387146,
35
+ "Qwen3_0.6B_layer_2_to_Qwen3_1.7B_layer_2": 7.929330348968506,
36
+ "Qwen3_0.6B_layer_2_to_Qwen3_1.7B_layer_4": 7.9499993324279785,
37
+ "Qwen3_0.6B_layer_2_to_Qwen3_4B_layer_2": 8.048929214477539,
38
+ "Qwen3_0.6B_layer_2_to_Qwen3_4B_layer_4": 7.740438938140869,
39
+ "Qwen3_0.6B_layer_2_to_uniform": 9.070773124694824,
40
+ "Qwen3_0.6B_layer_4_to_Qwen3_0.6B_layer_2": 10.935715675354004,
41
+ "Qwen3_0.6B_layer_4_to_Qwen3_0.6B_layer_4": 3.0637950897216797,
42
+ "Qwen3_0.6B_layer_4_to_Qwen3_1.7B_layer_2": 3.0137126445770264,
43
+ "Qwen3_0.6B_layer_4_to_Qwen3_1.7B_layer_4": 2.9935271739959717,
44
+ "Qwen3_0.6B_layer_4_to_Qwen3_4B_layer_2": 3.0705885887145996,
45
+ "Qwen3_0.6B_layer_4_to_Qwen3_4B_layer_4": 3.0900540351867676,
46
+ "Qwen3_0.6B_layer_4_to_uniform": 9.070773124694824,
47
+ "Qwen3_1.7B_layer_2_to_Qwen3_0.6B_layer_2": 9.36762523651123,
48
+ "Qwen3_1.7B_layer_2_to_Qwen3_0.6B_layer_4": 6.4384565353393555,
49
+ "Qwen3_1.7B_layer_2_to_Qwen3_1.7B_layer_2": 6.606346130371094,
50
+ "Qwen3_1.7B_layer_2_to_Qwen3_1.7B_layer_4": 6.644039154052734,
51
+ "Qwen3_1.7B_layer_2_to_Qwen3_4B_layer_2": 6.400282859802246,
52
+ "Qwen3_1.7B_layer_2_to_Qwen3_4B_layer_4": 6.326376438140869,
53
+ "Qwen3_1.7B_layer_2_to_uniform": 9.88111686706543,
54
+ "Qwen3_1.7B_layer_4_to_Qwen3_0.6B_layer_2": 12.973535537719727,
55
+ "Qwen3_1.7B_layer_4_to_Qwen3_0.6B_layer_4": 2.9450173377990723,
56
+ "Qwen3_1.7B_layer_4_to_Qwen3_1.7B_layer_2": 3.160464286804199,
57
+ "Qwen3_1.7B_layer_4_to_Qwen3_1.7B_layer_4": 2.980670928955078,
58
+ "Qwen3_1.7B_layer_4_to_Qwen3_4B_layer_2": 3.0805249214172363,
59
+ "Qwen3_1.7B_layer_4_to_Qwen3_4B_layer_4": 3.193880319595337,
60
+ "Qwen3_1.7B_layer_4_to_uniform": 9.88111686706543,
61
+ "Qwen3_4B_layer_2_to_Qwen3_0.6B_layer_2": 7.039875030517578,
62
+ "Qwen3_4B_layer_2_to_Qwen3_0.6B_layer_4": 2.9839415550231934,
63
+ "Qwen3_4B_layer_2_to_Qwen3_1.7B_layer_2": 2.829629421234131,
64
+ "Qwen3_4B_layer_2_to_Qwen3_1.7B_layer_4": 2.9013402462005615,
65
+ "Qwen3_4B_layer_2_to_Qwen3_4B_layer_2": 2.699265241622925,
66
+ "Qwen3_4B_layer_2_to_Qwen3_4B_layer_4": 2.6922624111175537,
67
+ "Qwen3_4B_layer_2_to_uniform": 10.104096412658691,
68
+ "Qwen3_4B_layer_4_to_Qwen3_0.6B_layer_2": 7.345184326171875,
69
+ "Qwen3_4B_layer_4_to_Qwen3_0.6B_layer_4": 3.6716532707214355,
70
+ "Qwen3_4B_layer_4_to_Qwen3_1.7B_layer_2": 3.481139659881592,
71
+ "Qwen3_4B_layer_4_to_Qwen3_1.7B_layer_4": 3.7798919677734375,
72
+ "Qwen3_4B_layer_4_to_Qwen3_4B_layer_2": 3.6100268363952637,
73
+ "Qwen3_4B_layer_4_to_Qwen3_4B_layer_4": 3.4427032470703125,
74
+ "Qwen3_4B_layer_4_to_uniform": 10.104096412658691
75
+ },
76
+ "mae_hidden_states": {
77
+ "Qwen3_0.6B_layer_2_to_Qwen3_0.6B_layer_2": 3.8776168823242188,
78
+ "Qwen3_0.6B_layer_4_to_Qwen3_0.6B_layer_2": 1.3933213949203491,
79
+ "Qwen3_1.7B_layer_2_to_Qwen3_0.6B_layer_2": 1.383978009223938,
80
+ "Qwen3_1.7B_layer_4_to_Qwen3_0.6B_layer_2": 1.3917444944381714,
81
+ "Qwen3_4B_layer_2_to_Qwen3_0.6B_layer_2": 1.38372004032135,
82
+ "Qwen3_4B_layer_4_to_Qwen3_0.6B_layer_2": 1.364012598991394,
83
+ "Qwen3_0.6B_layer_2_to_Qwen3_0.6B_layer_4": 4.228211402893066,
84
+ "Qwen3_0.6B_layer_4_to_Qwen3_0.6B_layer_4": 1.2656141519546509,
85
+ "Qwen3_1.7B_layer_2_to_Qwen3_0.6B_layer_4": 1.232149362564087,
86
+ "Qwen3_1.7B_layer_4_to_Qwen3_0.6B_layer_4": 1.249756097793579,
87
+ "Qwen3_4B_layer_2_to_Qwen3_0.6B_layer_4": 1.2561695575714111,
88
+ "Qwen3_4B_layer_4_to_Qwen3_0.6B_layer_4": 1.2504116296768188,
89
+ "Qwen3_0.6B_layer_2_to_Qwen3_1.7B_layer_2": 7.011510372161865,
90
+ "Qwen3_0.6B_layer_4_to_Qwen3_1.7B_layer_2": 1.1570074558258057,
91
+ "Qwen3_1.7B_layer_2_to_Qwen3_1.7B_layer_2": 1.123882532119751,
92
+ "Qwen3_1.7B_layer_4_to_Qwen3_1.7B_layer_2": 1.1837798357009888,
93
+ "Qwen3_4B_layer_2_to_Qwen3_1.7B_layer_2": 1.1635627746582031,
94
+ "Qwen3_4B_layer_4_to_Qwen3_1.7B_layer_2": 1.1612880229949951,
95
+ "Qwen3_0.6B_layer_2_to_Qwen3_1.7B_layer_4": 3.041527032852173,
96
+ "Qwen3_0.6B_layer_4_to_Qwen3_1.7B_layer_4": 1.6266475915908813,
97
+ "Qwen3_1.7B_layer_2_to_Qwen3_1.7B_layer_4": 1.6221141815185547,
98
+ "Qwen3_1.7B_layer_4_to_Qwen3_1.7B_layer_4": 1.5699771642684937,
99
+ "Qwen3_4B_layer_2_to_Qwen3_1.7B_layer_4": 1.6569658517837524,
100
+ "Qwen3_4B_layer_4_to_Qwen3_1.7B_layer_4": 1.6420214176177979,
101
+ "Qwen3_0.6B_layer_2_to_Qwen3_4B_layer_2": 8.062718391418457,
102
+ "Qwen3_0.6B_layer_4_to_Qwen3_4B_layer_2": 1.3707829713821411,
103
+ "Qwen3_1.7B_layer_2_to_Qwen3_4B_layer_2": 1.2881925106048584,
104
+ "Qwen3_1.7B_layer_4_to_Qwen3_4B_layer_2": 1.2928853034973145,
105
+ "Qwen3_4B_layer_2_to_Qwen3_4B_layer_2": 1.213749885559082,
106
+ "Qwen3_4B_layer_4_to_Qwen3_4B_layer_2": 1.2994227409362793,
107
+ "Qwen3_0.6B_layer_2_to_Qwen3_4B_layer_4": 7.752654075622559,
108
+ "Qwen3_0.6B_layer_4_to_Qwen3_4B_layer_4": 1.4750916957855225,
109
+ "Qwen3_1.7B_layer_2_to_Qwen3_4B_layer_4": 1.4292265176773071,
110
+ "Qwen3_1.7B_layer_4_to_Qwen3_4B_layer_4": 1.4410502910614014,
111
+ "Qwen3_4B_layer_2_to_Qwen3_4B_layer_4": 1.4525748491287231,
112
+ "Qwen3_4B_layer_4_to_Qwen3_4B_layer_4": 1.3758857250213623
113
+ },
114
+ "alignment": {
115
+ "Qwen3_0.6B_layer_2_to_Qwen3_0.6B_layer_4": {
116
+ "mse": 1.3515625,
117
+ "mean_cosine_similarity": 0.08251953125,
118
+ "std_cosine_similarity": 0.05322265625,
119
+ "mean_l2_distance": 68.5,
120
+ "std_l2_distance": 2.0,
121
+ "mean_dimension_correlation": 0.062478048354387285,
122
+ "std_dimension_correlation": 0.18795068082189698,
123
+ "linear_cka": 0.6640625
124
+ },
125
+ "Qwen3_0.6B_layer_2_to_Qwen3_1.7B_layer_2": {
126
+ "mse": 1.3515625,
127
+ "mean_cosine_similarity": 0.07666015625,
128
+ "std_cosine_similarity": 0.0595703125,
129
+ "mean_l2_distance": 68.5,
130
+ "std_l2_distance": 2.25,
131
+ "mean_dimension_correlation": 0.06342285592108965,
132
+ "std_dimension_correlation": 0.18283416080924633,
133
+ "linear_cka": 0.6640625
134
+ },
135
+ "Qwen3_0.6B_layer_2_to_Qwen3_1.7B_layer_4": {
136
+ "mse": 1.3359375,
137
+ "mean_cosine_similarity": 0.0859375,
138
+ "std_cosine_similarity": 0.057373046875,
139
+ "mean_l2_distance": 68.5,
140
+ "std_l2_distance": 2.171875,
141
+ "mean_dimension_correlation": 0.06745534756919369,
142
+ "std_dimension_correlation": 0.1829785716985573,
143
+ "linear_cka": 0.65234375
144
+ },
145
+ "Qwen3_0.6B_layer_2_to_Qwen3_4B_layer_2": {
146
+ "mse": 1.359375,
147
+ "mean_cosine_similarity": 0.0654296875,
148
+ "std_cosine_similarity": 0.051025390625,
149
+ "mean_l2_distance": 69.0,
150
+ "std_l2_distance": 1.9140625,
151
+ "mean_dimension_correlation": 0.0574939165264368,
152
+ "std_dimension_correlation": 0.1857303062299582,
153
+ "linear_cka": 0.66015625
154
+ },
155
+ "Qwen3_0.6B_layer_2_to_Qwen3_4B_layer_4": {
156
+ "mse": 1.3515625,
157
+ "mean_cosine_similarity": 0.0703125,
158
+ "std_cosine_similarity": 0.0615234375,
159
+ "mean_l2_distance": 69.0,
160
+ "std_l2_distance": 2.3125,
161
+ "mean_dimension_correlation": 0.060864800889976325,
162
+ "std_dimension_correlation": 0.18444010568801722,
163
+ "linear_cka": 0.66015625
164
+ },
165
+ "Qwen3_0.6B_layer_4_to_Qwen3_0.6B_layer_2": {
166
+ "mse": 1.3515625,
167
+ "mean_cosine_similarity": 0.08251953125,
168
+ "std_cosine_similarity": 0.05322265625,
169
+ "mean_l2_distance": 68.5,
170
+ "std_l2_distance": 2.0,
171
+ "mean_dimension_correlation": 0.062481947243213654,
172
+ "std_dimension_correlation": 0.18797583962876385,
173
+ "linear_cka": 0.6640625
174
+ },
175
+ "Qwen3_0.6B_layer_4_to_Qwen3_1.7B_layer_2": {
176
+ "mse": 1.15625,
177
+ "mean_cosine_similarity": 0.31640625,
178
+ "std_cosine_similarity": 0.10888671875,
179
+ "mean_l2_distance": 59.0,
180
+ "std_l2_distance": 4.84375,
181
+ "mean_dimension_correlation": 0.29400850236415865,
182
+ "std_dimension_correlation": 0.16629643255508064,
183
+ "linear_cka": 0.984375
184
+ },
185
+ "Qwen3_0.6B_layer_4_to_Qwen3_1.7B_layer_4": {
186
+ "mse": 1.140625,
187
+ "mean_cosine_similarity": 0.333984375,
188
+ "std_cosine_similarity": 0.1083984375,
189
+ "mean_l2_distance": 58.25,
190
+ "std_l2_distance": 4.84375,
191
+ "mean_dimension_correlation": 0.3036854453384876,
192
+ "std_dimension_correlation": 0.1640714460889962,
193
+ "linear_cka": 0.98046875
194
+ },
195
+ "Qwen3_0.6B_layer_4_to_Qwen3_4B_layer_2": {
196
+ "mse": 1.1953125,
197
+ "mean_cosine_similarity": 0.2734375,
198
+ "std_cosine_similarity": 0.109375,
199
+ "mean_l2_distance": 61.0,
200
+ "std_l2_distance": 4.6875,
201
+ "mean_dimension_correlation": 0.2504168091341853,
202
+ "std_dimension_correlation": 0.16876857548561683,
203
+ "linear_cka": 0.9765625
204
+ },
205
+ "Qwen3_0.6B_layer_4_to_Qwen3_4B_layer_4": {
206
+ "mse": 1.1796875,
207
+ "mean_cosine_similarity": 0.2890625,
208
+ "std_cosine_similarity": 0.10791015625,
209
+ "mean_l2_distance": 60.25,
210
+ "std_l2_distance": 4.6875,
211
+ "mean_dimension_correlation": 0.2720076544210315,
212
+ "std_dimension_correlation": 0.16885400186672725,
213
+ "linear_cka": 0.98828125
214
+ },
215
+ "Qwen3_1.7B_layer_2_to_Qwen3_0.6B_layer_2": {
216
+ "mse": 1.3515625,
217
+ "mean_cosine_similarity": 0.07666015625,
218
+ "std_cosine_similarity": 0.0595703125,
219
+ "mean_l2_distance": 68.5,
220
+ "std_l2_distance": 2.25,
221
+ "mean_dimension_correlation": 0.06342689506709576,
222
+ "std_dimension_correlation": 0.18284259799255337,
223
+ "linear_cka": 0.6640625
224
+ },
225
+ "Qwen3_1.7B_layer_2_to_Qwen3_0.6B_layer_4": {
226
+ "mse": 1.15625,
227
+ "mean_cosine_similarity": 0.31640625,
228
+ "std_cosine_similarity": 0.10888671875,
229
+ "mean_l2_distance": 59.0,
230
+ "std_l2_distance": 4.84375,
231
+ "mean_dimension_correlation": 0.29399659037590026,
232
+ "std_dimension_correlation": 0.16628270680485127,
233
+ "linear_cka": 0.984375
234
+ },
235
+ "Qwen3_1.7B_layer_2_to_Qwen3_1.7B_layer_4": {
236
+ "mse": 1.171875,
237
+ "mean_cosine_similarity": 0.302734375,
238
+ "std_cosine_similarity": 0.1162109375,
239
+ "mean_l2_distance": 59.5,
240
+ "std_l2_distance": 5.09375,
241
+ "mean_dimension_correlation": 0.2821845322847366,
242
+ "std_dimension_correlation": 0.1658260527951995,
243
+ "linear_cka": 0.984375
244
+ },
245
+ "Qwen3_1.7B_layer_2_to_Qwen3_4B_layer_2": {
246
+ "mse": 1.1875,
247
+ "mean_cosine_similarity": 0.279296875,
248
+ "std_cosine_similarity": 0.1083984375,
249
+ "mean_l2_distance": 60.75,
250
+ "std_l2_distance": 4.6875,
251
+ "mean_dimension_correlation": 0.2564893037080765,
252
+ "std_dimension_correlation": 0.1700593172594885,
253
+ "linear_cka": 0.98828125
254
+ },
255
+ "Qwen3_1.7B_layer_2_to_Qwen3_4B_layer_4": {
256
+ "mse": 1.125,
257
+ "mean_cosine_similarity": 0.3515625,
258
+ "std_cosine_similarity": 0.107421875,
259
+ "mean_l2_distance": 57.5,
260
+ "std_l2_distance": 4.875,
261
+ "mean_dimension_correlation": 0.32521353638730943,
262
+ "std_dimension_correlation": 0.16142420298822246,
263
+ "linear_cka": 0.98828125
264
+ },
265
+ "Qwen3_1.7B_layer_4_to_Qwen3_0.6B_layer_2": {
266
+ "mse": 1.3359375,
267
+ "mean_cosine_similarity": 0.0859375,
268
+ "std_cosine_similarity": 0.057373046875,
269
+ "mean_l2_distance": 68.5,
270
+ "std_l2_distance": 2.171875,
271
+ "mean_dimension_correlation": 0.06747776636620983,
272
+ "std_dimension_correlation": 0.18297839209554984,
273
+ "linear_cka": 0.65234375
274
+ },
275
+ "Qwen3_1.7B_layer_4_to_Qwen3_0.6B_layer_4": {
276
+ "mse": 1.140625,
277
+ "mean_cosine_similarity": 0.333984375,
278
+ "std_cosine_similarity": 0.1083984375,
279
+ "mean_l2_distance": 58.25,
280
+ "std_l2_distance": 4.84375,
281
+ "mean_dimension_correlation": 0.3036851711571217,
282
+ "std_dimension_correlation": 0.16406197803154954,
283
+ "linear_cka": 0.98046875
284
+ },
285
+ "Qwen3_1.7B_layer_4_to_Qwen3_1.7B_layer_2": {
286
+ "mse": 1.171875,
287
+ "mean_cosine_similarity": 0.302734375,
288
+ "std_cosine_similarity": 0.1162109375,
289
+ "mean_l2_distance": 59.5,
290
+ "std_l2_distance": 5.09375,
291
+ "mean_dimension_correlation": 0.2821806937456131,
292
+ "std_dimension_correlation": 0.16583393871178487,
293
+ "linear_cka": 0.984375
294
+ },
295
+ "Qwen3_1.7B_layer_4_to_Qwen3_4B_layer_2": {
296
+ "mse": 1.1953125,
297
+ "mean_cosine_similarity": 0.267578125,
298
+ "std_cosine_similarity": 0.099609375,
299
+ "mean_l2_distance": 61.0,
300
+ "std_l2_distance": 4.25,
301
+ "mean_dimension_correlation": 0.25347145795822146,
302
+ "std_dimension_correlation": 0.1676427892496719,
303
+ "linear_cka": 0.9765625
304
+ },
305
+ "Qwen3_1.7B_layer_4_to_Qwen3_4B_layer_4": {
306
+ "mse": 1.140625,
307
+ "mean_cosine_similarity": 0.3359375,
308
+ "std_cosine_similarity": 0.10302734375,
309
+ "mean_l2_distance": 58.25,
310
+ "std_l2_distance": 4.5625,
311
+ "mean_dimension_correlation": 0.3062414702028036,
312
+ "std_dimension_correlation": 0.16808821448337685,
313
+ "linear_cka": 0.984375
314
+ },
315
+ "Qwen3_4B_layer_2_to_Qwen3_0.6B_layer_2": {
316
+ "mse": 1.359375,
317
+ "mean_cosine_similarity": 0.0654296875,
318
+ "std_cosine_similarity": 0.051025390625,
319
+ "mean_l2_distance": 69.0,
320
+ "std_l2_distance": 1.9140625,
321
+ "mean_dimension_correlation": 0.05748535506427288,
322
+ "std_dimension_correlation": 0.18570980538433127,
323
+ "linear_cka": 0.66015625
324
+ },
325
+ "Qwen3_4B_layer_2_to_Qwen3_0.6B_layer_4": {
326
+ "mse": 1.1953125,
327
+ "mean_cosine_similarity": 0.2734375,
328
+ "std_cosine_similarity": 0.109375,
329
+ "mean_l2_distance": 61.0,
330
+ "std_l2_distance": 4.6875,
331
+ "mean_dimension_correlation": 0.25039467196911575,
332
+ "std_dimension_correlation": 0.16873641618577465,
333
+ "linear_cka": 0.9765625
334
+ },
335
+ "Qwen3_4B_layer_2_to_Qwen3_1.7B_layer_2": {
336
+ "mse": 1.1875,
337
+ "mean_cosine_similarity": 0.279296875,
338
+ "std_cosine_similarity": 0.1083984375,
339
+ "mean_l2_distance": 60.75,
340
+ "std_l2_distance": 4.6875,
341
+ "mean_dimension_correlation": 0.25649560913443564,
342
+ "std_dimension_correlation": 0.17006732480563802,
343
+ "linear_cka": 0.98828125
344
+ },
345
+ "Qwen3_4B_layer_2_to_Qwen3_1.7B_layer_4": {
346
+ "mse": 1.1953125,
347
+ "mean_cosine_similarity": 0.267578125,
348
+ "std_cosine_similarity": 0.099609375,
349
+ "mean_l2_distance": 61.0,
350
+ "std_l2_distance": 4.25,
351
+ "mean_dimension_correlation": 0.2534836530685425,
352
+ "std_dimension_correlation": 0.16762209134956418,
353
+ "linear_cka": 0.9765625
354
+ },
355
+ "Qwen3_4B_layer_2_to_Qwen3_4B_layer_4": {
356
+ "mse": 1.1953125,
357
+ "mean_cosine_similarity": 0.267578125,
358
+ "std_cosine_similarity": 0.1044921875,
359
+ "mean_l2_distance": 61.0,
360
+ "std_l2_distance": 4.46875,
361
+ "mean_dimension_correlation": 0.2458049923181534,
362
+ "std_dimension_correlation": 0.16614846928894367,
363
+ "linear_cka": 0.984375
364
+ },
365
+ "Qwen3_4B_layer_4_to_Qwen3_0.6B_layer_2": {
366
+ "mse": 1.3515625,
367
+ "mean_cosine_similarity": 0.0703125,
368
+ "std_cosine_similarity": 0.0615234375,
369
+ "mean_l2_distance": 69.0,
370
+ "std_l2_distance": 2.3125,
371
+ "mean_dimension_correlation": 0.060881674400297923,
372
+ "std_dimension_correlation": 0.18442433029309818,
373
+ "linear_cka": 0.66015625
374
+ },
375
+ "Qwen3_4B_layer_4_to_Qwen3_0.6B_layer_4": {
376
+ "mse": 1.1796875,
377
+ "mean_cosine_similarity": 0.2890625,
378
+ "std_cosine_similarity": 0.10791015625,
379
+ "mean_l2_distance": 60.25,
380
+ "std_l2_distance": 4.6875,
381
+ "mean_dimension_correlation": 0.27202143501490356,
382
+ "std_dimension_correlation": 0.16887910421184668,
383
+ "linear_cka": 0.98828125
384
+ },
385
+ "Qwen3_4B_layer_4_to_Qwen3_1.7B_layer_2": {
386
+ "mse": 1.125,
387
+ "mean_cosine_similarity": 0.3515625,
388
+ "std_cosine_similarity": 0.107421875,
389
+ "mean_l2_distance": 57.5,
390
+ "std_l2_distance": 4.875,
391
+ "mean_dimension_correlation": 0.32520583919249474,
392
+ "std_dimension_correlation": 0.16142301505759077,
393
+ "linear_cka": 0.98828125
394
+ },
395
+ "Qwen3_4B_layer_4_to_Qwen3_1.7B_layer_4": {
396
+ "mse": 1.140625,
397
+ "mean_cosine_similarity": 0.3359375,
398
+ "std_cosine_similarity": 0.10302734375,
399
+ "mean_l2_distance": 58.25,
400
+ "std_l2_distance": 4.5625,
401
+ "mean_dimension_correlation": 0.30626075267791747,
402
+ "std_dimension_correlation": 0.16814174967857434,
403
+ "linear_cka": 0.984375
404
+ },
405
+ "Qwen3_4B_layer_4_to_Qwen3_4B_layer_2": {
406
+ "mse": 1.1953125,
407
+ "mean_cosine_similarity": 0.267578125,
408
+ "std_cosine_similarity": 0.1044921875,
409
+ "mean_l2_distance": 61.0,
410
+ "std_l2_distance": 4.46875,
411
+ "mean_dimension_correlation": 0.24580164328217508,
412
+ "std_dimension_correlation": 0.16616246993231076,
413
+ "linear_cka": 0.984375
414
+ },
415
+ "avg_mse": 1.2291666666666667,
416
+ "std_mse": 0.08795763263576896,
417
+ "avg_mean_cosine_similarity": 0.2265625,
418
+ "std_mean_cosine_similarity": 0.10912461458526285,
419
+ "avg_std_cosine_similarity": 0.0904296875,
420
+ "std_std_cosine_similarity": 0.02430735142446207,
421
+ "avg_mean_l2_distance": 62.666666666666664,
422
+ "std_mean_l2_distance": 4.391911757866827,
423
+ "avg_std_l2_distance": 3.8432291666666667,
424
+ "std_std_l2_distance": 1.2284684192595492,
425
+ "avg_mean_dimension_correlation": 0.20675061237125192,
426
+ "std_mean_dimension_correlation": 0.10433772738239133,
427
+ "avg_std_dimension_correlation": 0.17274183624909276,
428
+ "std_std_dimension_correlation": 0.008813959193590607,
429
+ "avg_linear_cka": 0.87578125,
430
+ "std_linear_cka": 0.15252860046015415
431
+ }
432
+ }
433
+ }
evaluation/metrics_tokens_3001344.json ADDED
@@ -0,0 +1,433 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 1,
3
+ "n_tokens": 3001344,
4
+ "global_step": 2931,
5
+ "training_metrics": {
6
+ "train/loss": 2.515625,
7
+ "train/contrastive": 2.421875,
8
+ "train/recons_loss": 0.671875,
9
+ "train/balance_loss": 3.75,
10
+ "train/balance_loss_contrastive": 2.71875,
11
+ "train/balance_loss_recons": 1.0390625,
12
+ "train/contrastive_std": 3.25,
13
+ "train/recons_std": 0.138671875,
14
+ "train/contrastive_min": 0.146484375,
15
+ "train/contrastive_max": 6.9375,
16
+ "train/recons_min": 0.56640625,
17
+ "train/recons_max": 0.9375,
18
+ "train/Qwen3_0.6B_layer_2": 0.9375,
19
+ "train/Qwen3_0.6B_layer_4": 0.59765625,
20
+ "train/Qwen3_1.7B_layer_2": 0.59375,
21
+ "train/Qwen3_1.7B_layer_4": 0.703125,
22
+ "train/Qwen3_4B_layer_2": 0.56640625,
23
+ "train/Qwen3_4B_layer_4": 0.6328125,
24
+ "train/contrastives": null,
25
+ "train/epoch": 1,
26
+ "train/n_tokens": 3001344,
27
+ "train/step": 2931
28
+ },
29
+ "eval_metrics": {
30
+ "global_step": 2931,
31
+ "n_tokens": 3001344,
32
+ "kl_divergence": {
33
+ "Qwen3_0.6B_layer_2_to_Qwen3_0.6B_layer_2": 11.318835258483887,
34
+ "Qwen3_0.6B_layer_2_to_Qwen3_0.6B_layer_4": 9.138021469116211,
35
+ "Qwen3_0.6B_layer_2_to_Qwen3_1.7B_layer_2": 9.61973762512207,
36
+ "Qwen3_0.6B_layer_2_to_Qwen3_1.7B_layer_4": 9.007281303405762,
37
+ "Qwen3_0.6B_layer_2_to_Qwen3_4B_layer_2": 8.960853576660156,
38
+ "Qwen3_0.6B_layer_2_to_Qwen3_4B_layer_4": 8.789403915405273,
39
+ "Qwen3_0.6B_layer_2_to_uniform": 9.070773124694824,
40
+ "Qwen3_0.6B_layer_4_to_Qwen3_0.6B_layer_2": 7.3046698570251465,
41
+ "Qwen3_0.6B_layer_4_to_Qwen3_0.6B_layer_4": 2.55082368850708,
42
+ "Qwen3_0.6B_layer_4_to_Qwen3_1.7B_layer_2": 2.5602962970733643,
43
+ "Qwen3_0.6B_layer_4_to_Qwen3_1.7B_layer_4": 2.592942714691162,
44
+ "Qwen3_0.6B_layer_4_to_Qwen3_4B_layer_2": 2.588857650756836,
45
+ "Qwen3_0.6B_layer_4_to_Qwen3_4B_layer_4": 2.6625943183898926,
46
+ "Qwen3_0.6B_layer_4_to_uniform": 9.070773124694824,
47
+ "Qwen3_1.7B_layer_2_to_Qwen3_0.6B_layer_2": 10.131369590759277,
48
+ "Qwen3_1.7B_layer_2_to_Qwen3_0.6B_layer_4": 5.891963481903076,
49
+ "Qwen3_1.7B_layer_2_to_Qwen3_1.7B_layer_2": 6.430274963378906,
50
+ "Qwen3_1.7B_layer_2_to_Qwen3_1.7B_layer_4": 6.0684638023376465,
51
+ "Qwen3_1.7B_layer_2_to_Qwen3_4B_layer_2": 5.9689507484436035,
52
+ "Qwen3_1.7B_layer_2_to_Qwen3_4B_layer_4": 6.356847286224365,
53
+ "Qwen3_1.7B_layer_2_to_uniform": 9.88111686706543,
54
+ "Qwen3_1.7B_layer_4_to_Qwen3_0.6B_layer_2": 8.19615364074707,
55
+ "Qwen3_1.7B_layer_4_to_Qwen3_0.6B_layer_4": 2.8310694694519043,
56
+ "Qwen3_1.7B_layer_4_to_Qwen3_1.7B_layer_2": 2.7546491622924805,
57
+ "Qwen3_1.7B_layer_4_to_Qwen3_1.7B_layer_4": 2.7474663257598877,
58
+ "Qwen3_1.7B_layer_4_to_Qwen3_4B_layer_2": 2.857220411300659,
59
+ "Qwen3_1.7B_layer_4_to_Qwen3_4B_layer_4": 2.925436019897461,
60
+ "Qwen3_1.7B_layer_4_to_uniform": 9.88111686706543,
61
+ "Qwen3_4B_layer_2_to_Qwen3_0.6B_layer_2": 7.565979957580566,
62
+ "Qwen3_4B_layer_2_to_Qwen3_0.6B_layer_4": 2.9663586616516113,
63
+ "Qwen3_4B_layer_2_to_Qwen3_1.7B_layer_2": 2.719478130340576,
64
+ "Qwen3_4B_layer_2_to_Qwen3_1.7B_layer_4": 2.741952657699585,
65
+ "Qwen3_4B_layer_2_to_Qwen3_4B_layer_2": 2.7755935192108154,
66
+ "Qwen3_4B_layer_2_to_Qwen3_4B_layer_4": 2.7375831604003906,
67
+ "Qwen3_4B_layer_2_to_uniform": 10.104096412658691,
68
+ "Qwen3_4B_layer_4_to_Qwen3_0.6B_layer_2": 7.4653778076171875,
69
+ "Qwen3_4B_layer_4_to_Qwen3_0.6B_layer_4": 3.67035174369812,
70
+ "Qwen3_4B_layer_4_to_Qwen3_1.7B_layer_2": 3.566011905670166,
71
+ "Qwen3_4B_layer_4_to_Qwen3_1.7B_layer_4": 3.7160496711730957,
72
+ "Qwen3_4B_layer_4_to_Qwen3_4B_layer_2": 3.552424907684326,
73
+ "Qwen3_4B_layer_4_to_Qwen3_4B_layer_4": 3.459855556488037,
74
+ "Qwen3_4B_layer_4_to_uniform": 10.104096412658691
75
+ },
76
+ "mae_hidden_states": {
77
+ "Qwen3_0.6B_layer_2_to_Qwen3_0.6B_layer_2": 15.485275268554688,
78
+ "Qwen3_0.6B_layer_4_to_Qwen3_0.6B_layer_2": 22.359243392944336,
79
+ "Qwen3_1.7B_layer_2_to_Qwen3_0.6B_layer_2": 21.841341018676758,
80
+ "Qwen3_1.7B_layer_4_to_Qwen3_0.6B_layer_2": 20.851577758789062,
81
+ "Qwen3_4B_layer_2_to_Qwen3_0.6B_layer_2": 23.41849136352539,
82
+ "Qwen3_4B_layer_4_to_Qwen3_0.6B_layer_2": 22.13389015197754,
83
+ "Qwen3_0.6B_layer_2_to_Qwen3_0.6B_layer_4": 8.68209457397461,
84
+ "Qwen3_0.6B_layer_4_to_Qwen3_0.6B_layer_4": 1.0910885334014893,
85
+ "Qwen3_1.7B_layer_2_to_Qwen3_0.6B_layer_4": 1.0663363933563232,
86
+ "Qwen3_1.7B_layer_4_to_Qwen3_0.6B_layer_4": 1.1295608282089233,
87
+ "Qwen3_4B_layer_2_to_Qwen3_0.6B_layer_4": 1.096497654914856,
88
+ "Qwen3_4B_layer_4_to_Qwen3_0.6B_layer_4": 1.0976781845092773,
89
+ "Qwen3_0.6B_layer_2_to_Qwen3_1.7B_layer_2": 9.745889663696289,
90
+ "Qwen3_0.6B_layer_4_to_Qwen3_1.7B_layer_2": 1.073387622833252,
91
+ "Qwen3_1.7B_layer_2_to_Qwen3_1.7B_layer_2": 1.0651912689208984,
92
+ "Qwen3_1.7B_layer_4_to_Qwen3_1.7B_layer_2": 1.1097475290298462,
93
+ "Qwen3_4B_layer_2_to_Qwen3_1.7B_layer_2": 1.102055311203003,
94
+ "Qwen3_4B_layer_4_to_Qwen3_1.7B_layer_2": 1.1042507886886597,
95
+ "Qwen3_0.6B_layer_2_to_Qwen3_1.7B_layer_4": 6.085488319396973,
96
+ "Qwen3_0.6B_layer_4_to_Qwen3_1.7B_layer_4": 1.443469762802124,
97
+ "Qwen3_1.7B_layer_2_to_Qwen3_1.7B_layer_4": 1.407573938369751,
98
+ "Qwen3_1.7B_layer_4_to_Qwen3_1.7B_layer_4": 1.394163966178894,
99
+ "Qwen3_4B_layer_2_to_Qwen3_1.7B_layer_4": 1.4274914264678955,
100
+ "Qwen3_4B_layer_4_to_Qwen3_1.7B_layer_4": 1.423639178276062,
101
+ "Qwen3_0.6B_layer_2_to_Qwen3_4B_layer_2": 6.723683834075928,
102
+ "Qwen3_0.6B_layer_4_to_Qwen3_4B_layer_2": 1.2199777364730835,
103
+ "Qwen3_1.7B_layer_2_to_Qwen3_4B_layer_2": 1.1646456718444824,
104
+ "Qwen3_1.7B_layer_4_to_Qwen3_4B_layer_2": 1.1640838384628296,
105
+ "Qwen3_4B_layer_2_to_Qwen3_4B_layer_2": 1.1155877113342285,
106
+ "Qwen3_4B_layer_4_to_Qwen3_4B_layer_2": 1.1568272113800049,
107
+ "Qwen3_0.6B_layer_2_to_Qwen3_4B_layer_4": 8.499415397644043,
108
+ "Qwen3_0.6B_layer_4_to_Qwen3_4B_layer_4": 1.2985743284225464,
109
+ "Qwen3_1.7B_layer_2_to_Qwen3_4B_layer_4": 1.2958557605743408,
110
+ "Qwen3_1.7B_layer_4_to_Qwen3_4B_layer_4": 1.2903549671173096,
111
+ "Qwen3_4B_layer_2_to_Qwen3_4B_layer_4": 1.2823046445846558,
112
+ "Qwen3_4B_layer_4_to_Qwen3_4B_layer_4": 1.2616506814956665
113
+ },
114
+ "alignment": {
115
+ "Qwen3_0.6B_layer_2_to_Qwen3_0.6B_layer_4": {
116
+ "mse": 1.421875,
117
+ "mean_cosine_similarity": -0.03369140625,
118
+ "std_cosine_similarity": 0.109375,
119
+ "mean_l2_distance": 72.5,
120
+ "std_l2_distance": 3.90625,
121
+ "mean_dimension_correlation": 0.254237837344408,
122
+ "std_dimension_correlation": 0.16181929189675745,
123
+ "linear_cka": 0.5859375
124
+ },
125
+ "Qwen3_0.6B_layer_2_to_Qwen3_1.7B_layer_2": {
126
+ "mse": 1.421875,
127
+ "mean_cosine_similarity": -0.0284423828125,
128
+ "std_cosine_similarity": 0.10888671875,
129
+ "mean_l2_distance": 72.5,
130
+ "std_l2_distance": 3.890625,
131
+ "mean_dimension_correlation": 0.25683254674077033,
132
+ "std_dimension_correlation": 0.16029215327593901,
133
+ "linear_cka": 0.57421875
134
+ },
135
+ "Qwen3_0.6B_layer_2_to_Qwen3_1.7B_layer_4": {
136
+ "mse": 1.4140625,
137
+ "mean_cosine_similarity": -0.0252685546875,
138
+ "std_cosine_similarity": 0.1083984375,
139
+ "mean_l2_distance": 72.5,
140
+ "std_l2_distance": 3.875,
141
+ "mean_dimension_correlation": 0.25395019352436066,
142
+ "std_dimension_correlation": 0.15926056622745546,
143
+ "linear_cka": 0.578125
144
+ },
145
+ "Qwen3_0.6B_layer_2_to_Qwen3_4B_layer_2": {
146
+ "mse": 1.421875,
147
+ "mean_cosine_similarity": -0.03271484375,
148
+ "std_cosine_similarity": 0.1064453125,
149
+ "mean_l2_distance": 72.5,
150
+ "std_l2_distance": 3.796875,
151
+ "mean_dimension_correlation": 0.24886183738708495,
152
+ "std_dimension_correlation": 0.15849261736593726,
153
+ "linear_cka": 0.55859375
154
+ },
155
+ "Qwen3_0.6B_layer_2_to_Qwen3_4B_layer_4": {
156
+ "mse": 1.421875,
157
+ "mean_cosine_similarity": -0.033203125,
158
+ "std_cosine_similarity": 0.109375,
159
+ "mean_l2_distance": 72.5,
160
+ "std_l2_distance": 3.890625,
161
+ "mean_dimension_correlation": 0.256584095954895,
162
+ "std_dimension_correlation": 0.15873214025442897,
163
+ "linear_cka": 0.57421875
164
+ },
165
+ "Qwen3_0.6B_layer_4_to_Qwen3_0.6B_layer_2": {
166
+ "mse": 1.4296875,
167
+ "mean_cosine_similarity": -0.03369140625,
168
+ "std_cosine_similarity": 0.109375,
169
+ "mean_l2_distance": 72.5,
170
+ "std_l2_distance": 3.90625,
171
+ "mean_dimension_correlation": 0.2542317323386669,
172
+ "std_dimension_correlation": 0.16183266276519212,
173
+ "linear_cka": 0.5859375
174
+ },
175
+ "Qwen3_0.6B_layer_4_to_Qwen3_1.7B_layer_2": {
176
+ "mse": 0.734375,
177
+ "mean_cosine_similarity": 0.65625,
178
+ "std_cosine_similarity": 0.28515625,
179
+ "mean_l2_distance": 37.25,
180
+ "std_l2_distance": 19.5,
181
+ "mean_dimension_correlation": 0.6187647342681885,
182
+ "std_dimension_correlation": 0.11470426666838326,
183
+ "linear_cka": 0.984375
184
+ },
185
+ "Qwen3_0.6B_layer_4_to_Qwen3_1.7B_layer_4": {
186
+ "mse": 0.7265625,
187
+ "mean_cosine_similarity": 0.66015625,
188
+ "std_cosine_similarity": 0.279296875,
189
+ "mean_l2_distance": 37.0,
190
+ "std_l2_distance": 19.25,
191
+ "mean_dimension_correlation": 0.6220208525657653,
192
+ "std_dimension_correlation": 0.11040039509848326,
193
+ "linear_cka": 0.984375
194
+ },
195
+ "Qwen3_0.6B_layer_4_to_Qwen3_4B_layer_2": {
196
+ "mse": 0.76171875,
197
+ "mean_cosine_similarity": 0.62890625,
198
+ "std_cosine_similarity": 0.302734375,
199
+ "mean_l2_distance": 38.75,
200
+ "std_l2_distance": 20.125,
201
+ "mean_dimension_correlation": 0.592758321762085,
202
+ "std_dimension_correlation": 0.11886540241980308,
203
+ "linear_cka": 0.98046875
204
+ },
205
+ "Qwen3_0.6B_layer_4_to_Qwen3_4B_layer_4": {
206
+ "mse": 0.74609375,
207
+ "mean_cosine_similarity": 0.63671875,
208
+ "std_cosine_similarity": 0.302734375,
209
+ "mean_l2_distance": 38.25,
210
+ "std_l2_distance": 20.25,
211
+ "mean_dimension_correlation": 0.6037769317626953,
212
+ "std_dimension_correlation": 0.11647753822253991,
213
+ "linear_cka": 0.984375
214
+ },
215
+ "Qwen3_1.7B_layer_2_to_Qwen3_0.6B_layer_2": {
216
+ "mse": 1.4296875,
217
+ "mean_cosine_similarity": -0.0284423828125,
218
+ "std_cosine_similarity": 0.10888671875,
219
+ "mean_l2_distance": 72.5,
220
+ "std_l2_distance": 3.890625,
221
+ "mean_dimension_correlation": 0.25684744566679,
222
+ "std_dimension_correlation": 0.16032274573798164,
223
+ "linear_cka": 0.57421875
224
+ },
225
+ "Qwen3_1.7B_layer_2_to_Qwen3_0.6B_layer_4": {
226
+ "mse": 0.734375,
227
+ "mean_cosine_similarity": 0.65625,
228
+ "std_cosine_similarity": 0.28515625,
229
+ "mean_l2_distance": 37.25,
230
+ "std_l2_distance": 19.5,
231
+ "mean_dimension_correlation": 0.6187384128570557,
232
+ "std_dimension_correlation": 0.11471572089316741,
233
+ "linear_cka": 0.984375
234
+ },
235
+ "Qwen3_1.7B_layer_2_to_Qwen3_1.7B_layer_4": {
236
+ "mse": 0.734375,
237
+ "mean_cosine_similarity": 0.6484375,
238
+ "std_cosine_similarity": 0.30078125,
239
+ "mean_l2_distance": 37.5,
240
+ "std_l2_distance": 20.375,
241
+ "mean_dimension_correlation": 0.6119367599487304,
242
+ "std_dimension_correlation": 0.1157440646478159,
243
+ "linear_cka": 0.99609375
244
+ },
245
+ "Qwen3_1.7B_layer_2_to_Qwen3_4B_layer_2": {
246
+ "mse": 0.75390625,
247
+ "mean_cosine_similarity": 0.63671875,
248
+ "std_cosine_similarity": 0.30078125,
249
+ "mean_l2_distance": 38.25,
250
+ "std_l2_distance": 20.375,
251
+ "mean_dimension_correlation": 0.5996460914611816,
252
+ "std_dimension_correlation": 0.11944124129277625,
253
+ "linear_cka": 0.98046875
254
+ },
255
+ "Qwen3_1.7B_layer_2_to_Qwen3_4B_layer_4": {
256
+ "mse": 0.70703125,
257
+ "mean_cosine_similarity": 0.67578125,
258
+ "std_cosine_similarity": 0.27734375,
259
+ "mean_l2_distance": 36.0,
260
+ "std_l2_distance": 19.5,
261
+ "mean_dimension_correlation": 0.638215160369873,
262
+ "std_dimension_correlation": 0.10975611081697591,
263
+ "linear_cka": 0.984375
264
+ },
265
+ "Qwen3_1.7B_layer_4_to_Qwen3_0.6B_layer_2": {
266
+ "mse": 1.4140625,
267
+ "mean_cosine_similarity": -0.0252685546875,
268
+ "std_cosine_similarity": 0.1083984375,
269
+ "mean_l2_distance": 72.5,
270
+ "std_l2_distance": 3.875,
271
+ "mean_dimension_correlation": 0.25395837128162385,
272
+ "std_dimension_correlation": 0.15926372552177567,
273
+ "linear_cka": 0.578125
274
+ },
275
+ "Qwen3_1.7B_layer_4_to_Qwen3_0.6B_layer_4": {
276
+ "mse": 0.7265625,
277
+ "mean_cosine_similarity": 0.66015625,
278
+ "std_cosine_similarity": 0.279296875,
279
+ "mean_l2_distance": 37.0,
280
+ "std_l2_distance": 19.25,
281
+ "mean_dimension_correlation": 0.6219659209251404,
282
+ "std_dimension_correlation": 0.11032863879333923,
283
+ "linear_cka": 0.984375
284
+ },
285
+ "Qwen3_1.7B_layer_4_to_Qwen3_1.7B_layer_2": {
286
+ "mse": 0.734375,
287
+ "mean_cosine_similarity": 0.6484375,
288
+ "std_cosine_similarity": 0.30078125,
289
+ "mean_l2_distance": 37.5,
290
+ "std_l2_distance": 20.375,
291
+ "mean_dimension_correlation": 0.6119108200073242,
292
+ "std_dimension_correlation": 0.1157383378132106,
293
+ "linear_cka": 0.99609375
294
+ },
295
+ "Qwen3_1.7B_layer_4_to_Qwen3_4B_layer_2": {
296
+ "mse": 0.7578125,
297
+ "mean_cosine_similarity": 0.6328125,
298
+ "std_cosine_similarity": 0.298828125,
299
+ "mean_l2_distance": 38.5,
300
+ "std_l2_distance": 20.125,
301
+ "mean_dimension_correlation": 0.5979020118713378,
302
+ "std_dimension_correlation": 0.1151705814719715,
303
+ "linear_cka": 0.984375
304
+ },
305
+ "Qwen3_1.7B_layer_4_to_Qwen3_4B_layer_4": {
306
+ "mse": 0.71875,
307
+ "mean_cosine_similarity": 0.6640625,
308
+ "std_cosine_similarity": 0.28125,
309
+ "mean_l2_distance": 36.75,
310
+ "std_l2_distance": 19.5,
311
+ "mean_dimension_correlation": 0.6274345874786377,
312
+ "std_dimension_correlation": 0.11253210388812478,
313
+ "linear_cka": 0.98046875
314
+ },
315
+ "Qwen3_4B_layer_2_to_Qwen3_0.6B_layer_2": {
316
+ "mse": 1.4296875,
317
+ "mean_cosine_similarity": -0.03271484375,
318
+ "std_cosine_similarity": 0.1064453125,
319
+ "mean_l2_distance": 72.5,
320
+ "std_l2_distance": 3.796875,
321
+ "mean_dimension_correlation": 0.24887723177671434,
322
+ "std_dimension_correlation": 0.15850834600861563,
323
+ "linear_cka": 0.55859375
324
+ },
325
+ "Qwen3_4B_layer_2_to_Qwen3_0.6B_layer_4": {
326
+ "mse": 0.76171875,
327
+ "mean_cosine_similarity": 0.62890625,
328
+ "std_cosine_similarity": 0.302734375,
329
+ "mean_l2_distance": 38.75,
330
+ "std_l2_distance": 20.125,
331
+ "mean_dimension_correlation": 0.5927883148193359,
332
+ "std_dimension_correlation": 0.11887853166661289,
333
+ "linear_cka": 0.98046875
334
+ },
335
+ "Qwen3_4B_layer_2_to_Qwen3_1.7B_layer_2": {
336
+ "mse": 0.75390625,
337
+ "mean_cosine_similarity": 0.63671875,
338
+ "std_cosine_similarity": 0.30078125,
339
+ "mean_l2_distance": 38.25,
340
+ "std_l2_distance": 20.375,
341
+ "mean_dimension_correlation": 0.5995779991149902,
342
+ "std_dimension_correlation": 0.1193691675179003,
343
+ "linear_cka": 0.98046875
344
+ },
345
+ "Qwen3_4B_layer_2_to_Qwen3_1.7B_layer_4": {
346
+ "mse": 0.7578125,
347
+ "mean_cosine_similarity": 0.6328125,
348
+ "std_cosine_similarity": 0.298828125,
349
+ "mean_l2_distance": 38.5,
350
+ "std_l2_distance": 20.125,
351
+ "mean_dimension_correlation": 0.5978128433227539,
352
+ "std_dimension_correlation": 0.11512506347641102,
353
+ "linear_cka": 0.984375
354
+ },
355
+ "Qwen3_4B_layer_2_to_Qwen3_4B_layer_4": {
356
+ "mse": 0.75390625,
357
+ "mean_cosine_similarity": 0.62890625,
358
+ "std_cosine_similarity": 0.3046875,
359
+ "mean_l2_distance": 38.5,
360
+ "std_l2_distance": 20.625,
361
+ "mean_dimension_correlation": 0.5955796241760254,
362
+ "std_dimension_correlation": 0.11906185378925987,
363
+ "linear_cka": 0.98828125
364
+ },
365
+ "Qwen3_4B_layer_4_to_Qwen3_0.6B_layer_2": {
366
+ "mse": 1.4296875,
367
+ "mean_cosine_similarity": -0.033203125,
368
+ "std_cosine_similarity": 0.109375,
369
+ "mean_l2_distance": 72.5,
370
+ "std_l2_distance": 3.890625,
371
+ "mean_dimension_correlation": 0.2565764158964157,
372
+ "std_dimension_correlation": 0.1587071816624074,
373
+ "linear_cka": 0.57421875
374
+ },
375
+ "Qwen3_4B_layer_4_to_Qwen3_0.6B_layer_4": {
376
+ "mse": 0.74609375,
377
+ "mean_cosine_similarity": 0.63671875,
378
+ "std_cosine_similarity": 0.302734375,
379
+ "mean_l2_distance": 38.25,
380
+ "std_l2_distance": 20.25,
381
+ "mean_dimension_correlation": 0.6037120819091797,
382
+ "std_dimension_correlation": 0.11639985412027169,
383
+ "linear_cka": 0.984375
384
+ },
385
+ "Qwen3_4B_layer_4_to_Qwen3_1.7B_layer_2": {
386
+ "mse": 0.70703125,
387
+ "mean_cosine_similarity": 0.67578125,
388
+ "std_cosine_similarity": 0.27734375,
389
+ "mean_l2_distance": 36.0,
390
+ "std_l2_distance": 19.5,
391
+ "mean_dimension_correlation": 0.6382188320159912,
392
+ "std_dimension_correlation": 0.10972459865917429,
393
+ "linear_cka": 0.984375
394
+ },
395
+ "Qwen3_4B_layer_4_to_Qwen3_1.7B_layer_4": {
396
+ "mse": 0.71875,
397
+ "mean_cosine_similarity": 0.6640625,
398
+ "std_cosine_similarity": 0.28125,
399
+ "mean_l2_distance": 36.75,
400
+ "std_l2_distance": 19.5,
401
+ "mean_dimension_correlation": 0.6273346900939941,
402
+ "std_dimension_correlation": 0.1124933006393999,
403
+ "linear_cka": 0.98046875
404
+ },
405
+ "Qwen3_4B_layer_4_to_Qwen3_4B_layer_2": {
406
+ "mse": 0.75390625,
407
+ "mean_cosine_similarity": 0.62890625,
408
+ "std_cosine_similarity": 0.3046875,
409
+ "mean_l2_distance": 38.5,
410
+ "std_l2_distance": 20.625,
411
+ "mean_dimension_correlation": 0.5955384254455567,
412
+ "std_dimension_correlation": 0.11905433194805992,
413
+ "linear_cka": 0.98828125
414
+ },
415
+ "avg_mse": 0.9674479166666666,
416
+ "std_mse": 0.32276016873074226,
417
+ "avg_mean_cosine_similarity": 0.4210286458333333,
418
+ "std_mean_cosine_similarity": 0.31965592389258923,
419
+ "avg_std_cosine_similarity": 0.23173828125,
420
+ "std_std_cosine_similarity": 0.08757208624967457,
421
+ "avg_mean_l2_distance": 49.28333333333333,
422
+ "std_mean_l2_distance": 16.43189648890907,
423
+ "avg_std_l2_distance": 14.598958333333334,
424
+ "std_std_l2_distance": 7.594291533045653,
425
+ "avg_mean_dimension_correlation": 0.49188637080291897,
426
+ "std_mean_dimension_correlation": 0.16857047305704442,
427
+ "avg_std_dimension_correlation": 0.1300404178186724,
428
+ "std_std_dimension_correlation": 0.021172306029780062,
429
+ "avg_linear_cka": 0.8479166666666667,
430
+ "std_linear_cka": 0.19363585907609904
431
+ }
432
+ }
433
+ }
evaluation/metrics_tokens_4001792.json ADDED
@@ -0,0 +1,433 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 1,
3
+ "n_tokens": 4001792,
4
+ "global_step": 3908,
5
+ "training_metrics": {
6
+ "train/loss": 2.5,
7
+ "train/contrastive": 2.40625,
8
+ "train/recons_loss": 0.671875,
9
+ "train/balance_loss": 3.828125,
10
+ "train/balance_loss_contrastive": 2.78125,
11
+ "train/balance_loss_recons": 1.046875,
12
+ "train/contrastive_std": 3.265625,
13
+ "train/recons_std": 0.1513671875,
14
+ "train/contrastive_min": 0.1162109375,
15
+ "train/contrastive_max": 6.9375,
16
+ "train/recons_min": 0.55859375,
17
+ "train/recons_max": 0.96484375,
18
+ "train/Qwen3_0.6B_layer_2": 0.96484375,
19
+ "train/Qwen3_0.6B_layer_4": 0.6015625,
20
+ "train/Qwen3_1.7B_layer_2": 0.58203125,
21
+ "train/Qwen3_1.7B_layer_4": 0.69140625,
22
+ "train/Qwen3_4B_layer_2": 0.55859375,
23
+ "train/Qwen3_4B_layer_4": 0.625,
24
+ "train/contrastives": null,
25
+ "train/epoch": 1,
26
+ "train/n_tokens": 4001792,
27
+ "train/step": 3908
28
+ },
29
+ "eval_metrics": {
30
+ "global_step": 3908,
31
+ "n_tokens": 4001792,
32
+ "kl_divergence": {
33
+ "Qwen3_0.6B_layer_2_to_Qwen3_0.6B_layer_2": 10.677733421325684,
34
+ "Qwen3_0.6B_layer_2_to_Qwen3_0.6B_layer_4": 10.070417404174805,
35
+ "Qwen3_0.6B_layer_2_to_Qwen3_1.7B_layer_2": 10.500988960266113,
36
+ "Qwen3_0.6B_layer_2_to_Qwen3_1.7B_layer_4": 10.254755973815918,
37
+ "Qwen3_0.6B_layer_2_to_Qwen3_4B_layer_2": 10.141581535339355,
38
+ "Qwen3_0.6B_layer_2_to_Qwen3_4B_layer_4": 10.209218978881836,
39
+ "Qwen3_0.6B_layer_2_to_uniform": 9.070773124694824,
40
+ "Qwen3_0.6B_layer_4_to_Qwen3_0.6B_layer_2": 6.944426536560059,
41
+ "Qwen3_0.6B_layer_4_to_Qwen3_0.6B_layer_4": 2.526094675064087,
42
+ "Qwen3_0.6B_layer_4_to_Qwen3_1.7B_layer_2": 2.448215961456299,
43
+ "Qwen3_0.6B_layer_4_to_Qwen3_1.7B_layer_4": 2.5273706912994385,
44
+ "Qwen3_0.6B_layer_4_to_Qwen3_4B_layer_2": 2.518568515777588,
45
+ "Qwen3_0.6B_layer_4_to_Qwen3_4B_layer_4": 2.5949084758758545,
46
+ "Qwen3_0.6B_layer_4_to_uniform": 9.070773124694824,
47
+ "Qwen3_1.7B_layer_2_to_Qwen3_0.6B_layer_2": 10.105497360229492,
48
+ "Qwen3_1.7B_layer_2_to_Qwen3_0.6B_layer_4": 6.14721155166626,
49
+ "Qwen3_1.7B_layer_2_to_Qwen3_1.7B_layer_2": 6.3550543785095215,
50
+ "Qwen3_1.7B_layer_2_to_Qwen3_1.7B_layer_4": 6.340244293212891,
51
+ "Qwen3_1.7B_layer_2_to_Qwen3_4B_layer_2": 6.490333557128906,
52
+ "Qwen3_1.7B_layer_2_to_Qwen3_4B_layer_4": 6.536875247955322,
53
+ "Qwen3_1.7B_layer_2_to_uniform": 9.88111686706543,
54
+ "Qwen3_1.7B_layer_4_to_Qwen3_0.6B_layer_2": 9.303935050964355,
55
+ "Qwen3_1.7B_layer_4_to_Qwen3_0.6B_layer_4": 2.7055323123931885,
56
+ "Qwen3_1.7B_layer_4_to_Qwen3_1.7B_layer_2": 2.6092498302459717,
57
+ "Qwen3_1.7B_layer_4_to_Qwen3_1.7B_layer_4": 2.6337990760803223,
58
+ "Qwen3_1.7B_layer_4_to_Qwen3_4B_layer_2": 2.66951322555542,
59
+ "Qwen3_1.7B_layer_4_to_Qwen3_4B_layer_4": 2.742098569869995,
60
+ "Qwen3_1.7B_layer_4_to_uniform": 9.88111686706543,
61
+ "Qwen3_4B_layer_2_to_Qwen3_0.6B_layer_2": 8.335909843444824,
62
+ "Qwen3_4B_layer_2_to_Qwen3_0.6B_layer_4": 2.7109298706054688,
63
+ "Qwen3_4B_layer_2_to_Qwen3_1.7B_layer_2": 2.387141704559326,
64
+ "Qwen3_4B_layer_2_to_Qwen3_1.7B_layer_4": 2.432076930999756,
65
+ "Qwen3_4B_layer_2_to_Qwen3_4B_layer_2": 2.466850519180298,
66
+ "Qwen3_4B_layer_2_to_Qwen3_4B_layer_4": 2.3831467628479004,
67
+ "Qwen3_4B_layer_2_to_uniform": 10.104096412658691,
68
+ "Qwen3_4B_layer_4_to_Qwen3_0.6B_layer_2": 8.078448295593262,
69
+ "Qwen3_4B_layer_4_to_Qwen3_0.6B_layer_4": 3.5085513591766357,
70
+ "Qwen3_4B_layer_4_to_Qwen3_1.7B_layer_2": 3.5161335468292236,
71
+ "Qwen3_4B_layer_4_to_Qwen3_1.7B_layer_4": 3.7921173572540283,
72
+ "Qwen3_4B_layer_4_to_Qwen3_4B_layer_2": 3.47990345954895,
73
+ "Qwen3_4B_layer_4_to_Qwen3_4B_layer_4": 3.2069830894470215,
74
+ "Qwen3_4B_layer_4_to_uniform": 10.104096412658691
75
+ },
76
+ "mae_hidden_states": {
77
+ "Qwen3_0.6B_layer_2_to_Qwen3_0.6B_layer_2": 9.193017959594727,
78
+ "Qwen3_0.6B_layer_4_to_Qwen3_0.6B_layer_2": 5.069667816162109,
79
+ "Qwen3_1.7B_layer_2_to_Qwen3_0.6B_layer_2": 5.365467071533203,
80
+ "Qwen3_1.7B_layer_4_to_Qwen3_0.6B_layer_2": 5.159329414367676,
81
+ "Qwen3_4B_layer_2_to_Qwen3_0.6B_layer_2": 5.38785982131958,
82
+ "Qwen3_4B_layer_4_to_Qwen3_0.6B_layer_2": 5.0614728927612305,
83
+ "Qwen3_0.6B_layer_2_to_Qwen3_0.6B_layer_4": 5.8135833740234375,
84
+ "Qwen3_0.6B_layer_4_to_Qwen3_0.6B_layer_4": 1.0534567832946777,
85
+ "Qwen3_1.7B_layer_2_to_Qwen3_0.6B_layer_4": 1.0686990022659302,
86
+ "Qwen3_1.7B_layer_4_to_Qwen3_0.6B_layer_4": 1.093401312828064,
87
+ "Qwen3_4B_layer_2_to_Qwen3_0.6B_layer_4": 1.055011510848999,
88
+ "Qwen3_4B_layer_4_to_Qwen3_0.6B_layer_4": 1.0553429126739502,
89
+ "Qwen3_0.6B_layer_2_to_Qwen3_1.7B_layer_2": 8.750565528869629,
90
+ "Qwen3_0.6B_layer_4_to_Qwen3_1.7B_layer_2": 1.014384150505066,
91
+ "Qwen3_1.7B_layer_2_to_Qwen3_1.7B_layer_2": 1.0321710109710693,
92
+ "Qwen3_1.7B_layer_4_to_Qwen3_1.7B_layer_2": 1.048608422279358,
93
+ "Qwen3_4B_layer_2_to_Qwen3_1.7B_layer_2": 1.0492010116577148,
94
+ "Qwen3_4B_layer_4_to_Qwen3_1.7B_layer_2": 1.053753137588501,
95
+ "Qwen3_0.6B_layer_2_to_Qwen3_1.7B_layer_4": 5.545437812805176,
96
+ "Qwen3_0.6B_layer_4_to_Qwen3_1.7B_layer_4": 1.3332059383392334,
97
+ "Qwen3_1.7B_layer_2_to_Qwen3_1.7B_layer_4": 1.340571641921997,
98
+ "Qwen3_1.7B_layer_4_to_Qwen3_1.7B_layer_4": 1.3302825689315796,
99
+ "Qwen3_4B_layer_2_to_Qwen3_1.7B_layer_4": 1.3458808660507202,
100
+ "Qwen3_4B_layer_4_to_Qwen3_1.7B_layer_4": 1.3539550304412842,
101
+ "Qwen3_0.6B_layer_2_to_Qwen3_4B_layer_2": 7.418992042541504,
102
+ "Qwen3_0.6B_layer_4_to_Qwen3_4B_layer_2": 1.1341036558151245,
103
+ "Qwen3_1.7B_layer_2_to_Qwen3_4B_layer_2": 1.0986826419830322,
104
+ "Qwen3_1.7B_layer_4_to_Qwen3_4B_layer_2": 1.1034446954727173,
105
+ "Qwen3_4B_layer_2_to_Qwen3_4B_layer_2": 1.0604543685913086,
106
+ "Qwen3_4B_layer_4_to_Qwen3_4B_layer_2": 1.0890880823135376,
107
+ "Qwen3_0.6B_layer_2_to_Qwen3_4B_layer_4": 6.221832275390625,
108
+ "Qwen3_0.6B_layer_4_to_Qwen3_4B_layer_4": 1.2409595251083374,
109
+ "Qwen3_1.7B_layer_2_to_Qwen3_4B_layer_4": 1.2512269020080566,
110
+ "Qwen3_1.7B_layer_4_to_Qwen3_4B_layer_4": 1.2450522184371948,
111
+ "Qwen3_4B_layer_2_to_Qwen3_4B_layer_4": 1.2315609455108643,
112
+ "Qwen3_4B_layer_4_to_Qwen3_4B_layer_4": 1.2183749675750732
113
+ },
114
+ "alignment": {
115
+ "Qwen3_0.6B_layer_2_to_Qwen3_0.6B_layer_4": {
116
+ "mse": 1.3515625,
117
+ "mean_cosine_similarity": 0.052001953125,
118
+ "std_cosine_similarity": 0.19140625,
119
+ "mean_l2_distance": 69.5,
120
+ "std_l2_distance": 7.3125,
121
+ "mean_dimension_correlation": 0.46465563774108887,
122
+ "std_dimension_correlation": 0.1362067287833464,
123
+ "linear_cka": 0.5859375
124
+ },
125
+ "Qwen3_0.6B_layer_2_to_Qwen3_1.7B_layer_2": {
126
+ "mse": 1.34375,
127
+ "mean_cosine_similarity": 0.056396484375,
128
+ "std_cosine_similarity": 0.19140625,
129
+ "mean_l2_distance": 69.0,
130
+ "std_l2_distance": 7.34375,
131
+ "mean_dimension_correlation": 0.46726187616586684,
132
+ "std_dimension_correlation": 0.13268670396178475,
133
+ "linear_cka": 0.578125
134
+ },
135
+ "Qwen3_0.6B_layer_2_to_Qwen3_1.7B_layer_4": {
136
+ "mse": 1.34375,
137
+ "mean_cosine_similarity": 0.055908203125,
138
+ "std_cosine_similarity": 0.1904296875,
139
+ "mean_l2_distance": 69.0,
140
+ "std_l2_distance": 7.34375,
141
+ "mean_dimension_correlation": 0.4647917509078979,
142
+ "std_dimension_correlation": 0.13471787446655337,
143
+ "linear_cka": 0.578125
144
+ },
145
+ "Qwen3_0.6B_layer_2_to_Qwen3_4B_layer_2": {
146
+ "mse": 1.3515625,
147
+ "mean_cosine_similarity": 0.056640625,
148
+ "std_cosine_similarity": 0.1884765625,
149
+ "mean_l2_distance": 69.0,
150
+ "std_l2_distance": 7.21875,
151
+ "mean_dimension_correlation": 0.4657045602798462,
152
+ "std_dimension_correlation": 0.13361017606712636,
153
+ "linear_cka": 0.57421875
154
+ },
155
+ "Qwen3_0.6B_layer_2_to_Qwen3_4B_layer_4": {
156
+ "mse": 1.34375,
157
+ "mean_cosine_similarity": 0.05615234375,
158
+ "std_cosine_similarity": 0.1904296875,
159
+ "mean_l2_distance": 69.0,
160
+ "std_l2_distance": 7.3125,
161
+ "mean_dimension_correlation": 0.4670211374759674,
162
+ "std_dimension_correlation": 0.13379253598505308,
163
+ "linear_cka": 0.57421875
164
+ },
165
+ "Qwen3_0.6B_layer_4_to_Qwen3_0.6B_layer_2": {
166
+ "mse": 1.359375,
167
+ "mean_cosine_similarity": 0.052001953125,
168
+ "std_cosine_similarity": 0.19140625,
169
+ "mean_l2_distance": 69.5,
170
+ "std_l2_distance": 7.3125,
171
+ "mean_dimension_correlation": 0.4646653652191162,
172
+ "std_dimension_correlation": 0.13617385784126732,
173
+ "linear_cka": 0.5859375
174
+ },
175
+ "Qwen3_0.6B_layer_4_to_Qwen3_1.7B_layer_2": {
176
+ "mse": 0.5,
177
+ "mean_cosine_similarity": 0.8046875,
178
+ "std_cosine_similarity": 0.255859375,
179
+ "mean_l2_distance": 25.5,
180
+ "std_l2_distance": 19.375,
181
+ "mean_dimension_correlation": 0.7786048889160156,
182
+ "std_dimension_correlation": 0.079747066045607,
183
+ "linear_cka": 0.984375
184
+ },
185
+ "Qwen3_0.6B_layer_4_to_Qwen3_1.7B_layer_4": {
186
+ "mse": 0.498046875,
187
+ "mean_cosine_similarity": 0.8046875,
188
+ "std_cosine_similarity": 0.25390625,
189
+ "mean_l2_distance": 25.375,
190
+ "std_l2_distance": 19.25,
191
+ "mean_dimension_correlation": 0.7792343139648438,
192
+ "std_dimension_correlation": 0.07860444177664169,
193
+ "linear_cka": 0.98828125
194
+ },
195
+ "Qwen3_0.6B_layer_4_to_Qwen3_4B_layer_2": {
196
+ "mse": 0.51953125,
197
+ "mean_cosine_similarity": 0.78515625,
198
+ "std_cosine_similarity": 0.275390625,
199
+ "mean_l2_distance": 26.5,
200
+ "std_l2_distance": 20.25,
201
+ "mean_dimension_correlation": 0.7631843566894532,
202
+ "std_dimension_correlation": 0.08458475161101357,
203
+ "linear_cka": 0.98828125
204
+ },
205
+ "Qwen3_0.6B_layer_4_to_Qwen3_4B_layer_4": {
206
+ "mse": 0.5078125,
207
+ "mean_cosine_similarity": 0.79296875,
208
+ "std_cosine_similarity": 0.2734375,
209
+ "mean_l2_distance": 26.0,
210
+ "std_l2_distance": 20.125,
211
+ "mean_dimension_correlation": 0.7681190490722656,
212
+ "std_dimension_correlation": 0.08387350384855204,
213
+ "linear_cka": 0.984375
214
+ },
215
+ "Qwen3_1.7B_layer_2_to_Qwen3_0.6B_layer_2": {
216
+ "mse": 1.3515625,
217
+ "mean_cosine_similarity": 0.056396484375,
218
+ "std_cosine_similarity": 0.19140625,
219
+ "mean_l2_distance": 69.0,
220
+ "std_l2_distance": 7.34375,
221
+ "mean_dimension_correlation": 0.46729940325021746,
222
+ "std_dimension_correlation": 0.13270665905666312,
223
+ "linear_cka": 0.578125
224
+ },
225
+ "Qwen3_1.7B_layer_2_to_Qwen3_0.6B_layer_4": {
226
+ "mse": 0.5,
227
+ "mean_cosine_similarity": 0.8046875,
228
+ "std_cosine_similarity": 0.255859375,
229
+ "mean_l2_distance": 25.5,
230
+ "std_l2_distance": 19.375,
231
+ "mean_dimension_correlation": 0.7785774230957031,
232
+ "std_dimension_correlation": 0.07977299719796638,
233
+ "linear_cka": 0.984375
234
+ },
235
+ "Qwen3_1.7B_layer_2_to_Qwen3_1.7B_layer_4": {
236
+ "mse": 0.49609375,
237
+ "mean_cosine_similarity": 0.796875,
238
+ "std_cosine_similarity": 0.2734375,
239
+ "mean_l2_distance": 25.25,
240
+ "std_l2_distance": 20.375,
241
+ "mean_dimension_correlation": 0.7738082885742188,
242
+ "std_dimension_correlation": 0.08244148252527034,
243
+ "linear_cka": 0.98828125
244
+ },
245
+ "Qwen3_1.7B_layer_2_to_Qwen3_4B_layer_2": {
246
+ "mse": 0.50390625,
247
+ "mean_cosine_similarity": 0.79296875,
248
+ "std_cosine_similarity": 0.275390625,
249
+ "mean_l2_distance": 25.75,
250
+ "std_l2_distance": 20.5,
251
+ "mean_dimension_correlation": 0.7685455322265625,
252
+ "std_dimension_correlation": 0.08636337823761847,
253
+ "linear_cka": 0.98828125
254
+ },
255
+ "Qwen3_1.7B_layer_2_to_Qwen3_4B_layer_4": {
256
+ "mse": 0.48046875,
257
+ "mean_cosine_similarity": 0.8125,
258
+ "std_cosine_similarity": 0.251953125,
259
+ "mean_l2_distance": 24.5,
260
+ "std_l2_distance": 19.375,
261
+ "mean_dimension_correlation": 0.789703369140625,
262
+ "std_dimension_correlation": 0.07704774466213117,
263
+ "linear_cka": 0.984375
264
+ },
265
+ "Qwen3_1.7B_layer_4_to_Qwen3_0.6B_layer_2": {
266
+ "mse": 1.34375,
267
+ "mean_cosine_similarity": 0.055908203125,
268
+ "std_cosine_similarity": 0.1904296875,
269
+ "mean_l2_distance": 69.0,
270
+ "std_l2_distance": 7.34375,
271
+ "mean_dimension_correlation": 0.4647957801818848,
272
+ "std_dimension_correlation": 0.1347461643666133,
273
+ "linear_cka": 0.578125
274
+ },
275
+ "Qwen3_1.7B_layer_4_to_Qwen3_0.6B_layer_4": {
276
+ "mse": 0.49609375,
277
+ "mean_cosine_similarity": 0.8046875,
278
+ "std_cosine_similarity": 0.25390625,
279
+ "mean_l2_distance": 25.375,
280
+ "std_l2_distance": 19.25,
281
+ "mean_dimension_correlation": 0.779193115234375,
282
+ "std_dimension_correlation": 0.07862977772942846,
283
+ "linear_cka": 0.98828125
284
+ },
285
+ "Qwen3_1.7B_layer_4_to_Qwen3_1.7B_layer_2": {
286
+ "mse": 0.4921875,
287
+ "mean_cosine_similarity": 0.796875,
288
+ "std_cosine_similarity": 0.2734375,
289
+ "mean_l2_distance": 25.25,
290
+ "std_l2_distance": 20.375,
291
+ "mean_dimension_correlation": 0.773846435546875,
292
+ "std_dimension_correlation": 0.08246401911605972,
293
+ "linear_cka": 0.98828125
294
+ },
295
+ "Qwen3_1.7B_layer_4_to_Qwen3_4B_layer_2": {
296
+ "mse": 0.5078125,
297
+ "mean_cosine_similarity": 0.79296875,
298
+ "std_cosine_similarity": 0.271484375,
299
+ "mean_l2_distance": 26.0,
300
+ "std_l2_distance": 20.125,
301
+ "mean_dimension_correlation": 0.7682723999023438,
302
+ "std_dimension_correlation": 0.08173679476643078,
303
+ "linear_cka": 0.984375
304
+ },
305
+ "Qwen3_1.7B_layer_4_to_Qwen3_4B_layer_4": {
306
+ "mse": 0.486328125,
307
+ "mean_cosine_similarity": 0.80859375,
308
+ "std_cosine_similarity": 0.255859375,
309
+ "mean_l2_distance": 24.875,
310
+ "std_l2_distance": 19.5,
311
+ "mean_dimension_correlation": 0.7830284118652344,
312
+ "std_dimension_correlation": 0.07756386958443834,
313
+ "linear_cka": 0.98046875
314
+ },
315
+ "Qwen3_4B_layer_2_to_Qwen3_0.6B_layer_2": {
316
+ "mse": 1.359375,
317
+ "mean_cosine_similarity": 0.056640625,
318
+ "std_cosine_similarity": 0.1884765625,
319
+ "mean_l2_distance": 69.0,
320
+ "std_l2_distance": 7.21875,
321
+ "mean_dimension_correlation": 0.46567630767822266,
322
+ "std_dimension_correlation": 0.13364195702919346,
323
+ "linear_cka": 0.57421875
324
+ },
325
+ "Qwen3_4B_layer_2_to_Qwen3_0.6B_layer_4": {
326
+ "mse": 0.51953125,
327
+ "mean_cosine_similarity": 0.78515625,
328
+ "std_cosine_similarity": 0.275390625,
329
+ "mean_l2_distance": 26.5,
330
+ "std_l2_distance": 20.25,
331
+ "mean_dimension_correlation": 0.7631195068359375,
332
+ "std_dimension_correlation": 0.08451723229099471,
333
+ "linear_cka": 0.98828125
334
+ },
335
+ "Qwen3_4B_layer_2_to_Qwen3_1.7B_layer_2": {
336
+ "mse": 0.50390625,
337
+ "mean_cosine_similarity": 0.79296875,
338
+ "std_cosine_similarity": 0.275390625,
339
+ "mean_l2_distance": 25.75,
340
+ "std_l2_distance": 20.5,
341
+ "mean_dimension_correlation": 0.7685020446777344,
342
+ "std_dimension_correlation": 0.08637723380547804,
343
+ "linear_cka": 0.98828125
344
+ },
345
+ "Qwen3_4B_layer_2_to_Qwen3_1.7B_layer_4": {
346
+ "mse": 0.5078125,
347
+ "mean_cosine_similarity": 0.79296875,
348
+ "std_cosine_similarity": 0.271484375,
349
+ "mean_l2_distance": 26.0,
350
+ "std_l2_distance": 20.125,
351
+ "mean_dimension_correlation": 0.7681938171386719,
352
+ "std_dimension_correlation": 0.08170402844520411,
353
+ "linear_cka": 0.984375
354
+ },
355
+ "Qwen3_4B_layer_2_to_Qwen3_4B_layer_4": {
356
+ "mse": 0.5,
357
+ "mean_cosine_similarity": 0.7890625,
358
+ "std_cosine_similarity": 0.27734375,
359
+ "mean_l2_distance": 25.5,
360
+ "std_l2_distance": 20.75,
361
+ "mean_dimension_correlation": 0.7680191040039063,
362
+ "std_dimension_correlation": 0.08532466419571123,
363
+ "linear_cka": 0.98828125
364
+ },
365
+ "Qwen3_4B_layer_4_to_Qwen3_0.6B_layer_2": {
366
+ "mse": 1.3515625,
367
+ "mean_cosine_similarity": 0.05615234375,
368
+ "std_cosine_similarity": 0.1904296875,
369
+ "mean_l2_distance": 69.0,
370
+ "std_l2_distance": 7.3125,
371
+ "mean_dimension_correlation": 0.4670826017856598,
372
+ "std_dimension_correlation": 0.13384197305399426,
373
+ "linear_cka": 0.57421875
374
+ },
375
+ "Qwen3_4B_layer_4_to_Qwen3_0.6B_layer_4": {
376
+ "mse": 0.5078125,
377
+ "mean_cosine_similarity": 0.79296875,
378
+ "std_cosine_similarity": 0.2734375,
379
+ "mean_l2_distance": 26.0,
380
+ "std_l2_distance": 20.125,
381
+ "mean_dimension_correlation": 0.7681541442871094,
382
+ "std_dimension_correlation": 0.08380469982339137,
383
+ "linear_cka": 0.984375
384
+ },
385
+ "Qwen3_4B_layer_4_to_Qwen3_1.7B_layer_2": {
386
+ "mse": 0.48046875,
387
+ "mean_cosine_similarity": 0.8125,
388
+ "std_cosine_similarity": 0.251953125,
389
+ "mean_l2_distance": 24.5,
390
+ "std_l2_distance": 19.375,
391
+ "mean_dimension_correlation": 0.7896865844726563,
392
+ "std_dimension_correlation": 0.07704049416139637,
393
+ "linear_cka": 0.984375
394
+ },
395
+ "Qwen3_4B_layer_4_to_Qwen3_1.7B_layer_4": {
396
+ "mse": 0.48828125,
397
+ "mean_cosine_similarity": 0.80859375,
398
+ "std_cosine_similarity": 0.255859375,
399
+ "mean_l2_distance": 24.875,
400
+ "std_l2_distance": 19.5,
401
+ "mean_dimension_correlation": 0.7829521179199219,
402
+ "std_dimension_correlation": 0.07745501980282286,
403
+ "linear_cka": 0.98046875
404
+ },
405
+ "Qwen3_4B_layer_4_to_Qwen3_4B_layer_2": {
406
+ "mse": 0.5,
407
+ "mean_cosine_similarity": 0.7890625,
408
+ "std_cosine_similarity": 0.27734375,
409
+ "mean_l2_distance": 25.5,
410
+ "std_l2_distance": 20.75,
411
+ "mean_dimension_correlation": 0.7680793762207031,
412
+ "std_dimension_correlation": 0.08533230341332358,
413
+ "linear_cka": 0.98828125
414
+ },
415
+ "avg_mse": 0.783203125,
416
+ "std_mse": 0.4008924066126565,
417
+ "avg_mean_cosine_similarity": 0.5505045572916667,
418
+ "std_mean_cosine_similarity": 0.3501489988188365,
419
+ "avg_std_cosine_similarity": 0.24108072916666667,
420
+ "std_std_cosine_similarity": 0.036733753214407784,
421
+ "avg_mean_l2_distance": 40.05,
422
+ "std_mean_l2_distance": 20.54668464740723,
423
+ "avg_std_l2_distance": 15.74375,
424
+ "std_std_l2_distance": 5.980928892460323,
425
+ "avg_mean_dimension_correlation": 0.6713259566823642,
426
+ "std_mean_dimension_correlation": 0.14540630815784578,
427
+ "avg_std_dimension_correlation": 0.09921700445503585,
428
+ "std_std_dimension_correlation": 0.024890230217464338,
429
+ "avg_linear_cka": 0.85,
430
+ "std_linear_cka": 0.19227216969593736
431
+ }
432
+ }
433
+ }
evaluation/metrics_tokens_5002240.json ADDED
@@ -0,0 +1,433 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 1,
3
+ "n_tokens": 5002240,
4
+ "global_step": 4885,
5
+ "training_metrics": {
6
+ "train/loss": 2.515625,
7
+ "train/contrastive": 2.421875,
8
+ "train/recons_loss": 0.65234375,
9
+ "train/balance_loss": 3.84375,
10
+ "train/balance_loss_contrastive": 2.796875,
11
+ "train/balance_loss_recons": 1.0390625,
12
+ "train/contrastive_std": 3.296875,
13
+ "train/recons_std": 0.1279296875,
14
+ "train/contrastive_min": 0.10791015625,
15
+ "train/contrastive_max": 7.0,
16
+ "train/recons_min": 0.546875,
17
+ "train/recons_max": 0.89453125,
18
+ "train/Qwen3_0.6B_layer_2": 0.89453125,
19
+ "train/Qwen3_0.6B_layer_4": 0.58984375,
20
+ "train/Qwen3_1.7B_layer_2": 0.578125,
21
+ "train/Qwen3_1.7B_layer_4": 0.69140625,
22
+ "train/Qwen3_4B_layer_2": 0.546875,
23
+ "train/Qwen3_4B_layer_4": 0.61328125,
24
+ "train/contrastives": null,
25
+ "train/epoch": 1,
26
+ "train/n_tokens": 5002240,
27
+ "train/step": 4885
28
+ },
29
+ "eval_metrics": {
30
+ "global_step": 4885,
31
+ "n_tokens": 5002240,
32
+ "kl_divergence": {
33
+ "Qwen3_0.6B_layer_2_to_Qwen3_0.6B_layer_2": 9.649429321289062,
34
+ "Qwen3_0.6B_layer_2_to_Qwen3_0.6B_layer_4": 8.296281814575195,
35
+ "Qwen3_0.6B_layer_2_to_Qwen3_1.7B_layer_2": 8.075584411621094,
36
+ "Qwen3_0.6B_layer_2_to_Qwen3_1.7B_layer_4": 8.38884162902832,
37
+ "Qwen3_0.6B_layer_2_to_Qwen3_4B_layer_2": 8.30383014678955,
38
+ "Qwen3_0.6B_layer_2_to_Qwen3_4B_layer_4": 8.307902336120605,
39
+ "Qwen3_0.6B_layer_2_to_uniform": 9.070773124694824,
40
+ "Qwen3_0.6B_layer_4_to_Qwen3_0.6B_layer_2": 5.658719062805176,
41
+ "Qwen3_0.6B_layer_4_to_Qwen3_0.6B_layer_4": 2.32064151763916,
42
+ "Qwen3_0.6B_layer_4_to_Qwen3_1.7B_layer_2": 2.324888229370117,
43
+ "Qwen3_0.6B_layer_4_to_Qwen3_1.7B_layer_4": 2.3024439811706543,
44
+ "Qwen3_0.6B_layer_4_to_Qwen3_4B_layer_2": 2.259655714035034,
45
+ "Qwen3_0.6B_layer_4_to_Qwen3_4B_layer_4": 2.363274574279785,
46
+ "Qwen3_0.6B_layer_4_to_uniform": 9.070773124694824,
47
+ "Qwen3_1.7B_layer_2_to_Qwen3_0.6B_layer_2": 12.87409782409668,
48
+ "Qwen3_1.7B_layer_2_to_Qwen3_0.6B_layer_4": 6.341550350189209,
49
+ "Qwen3_1.7B_layer_2_to_Qwen3_1.7B_layer_2": 5.935274600982666,
50
+ "Qwen3_1.7B_layer_2_to_Qwen3_1.7B_layer_4": 6.224505424499512,
51
+ "Qwen3_1.7B_layer_2_to_Qwen3_4B_layer_2": 6.194956302642822,
52
+ "Qwen3_1.7B_layer_2_to_Qwen3_4B_layer_4": 6.298384189605713,
53
+ "Qwen3_1.7B_layer_2_to_uniform": 9.88111686706543,
54
+ "Qwen3_1.7B_layer_4_to_Qwen3_0.6B_layer_2": 5.595909595489502,
55
+ "Qwen3_1.7B_layer_4_to_Qwen3_0.6B_layer_4": 2.6820993423461914,
56
+ "Qwen3_1.7B_layer_4_to_Qwen3_1.7B_layer_2": 2.6489415168762207,
57
+ "Qwen3_1.7B_layer_4_to_Qwen3_1.7B_layer_4": 2.6376867294311523,
58
+ "Qwen3_1.7B_layer_4_to_Qwen3_4B_layer_2": 2.660701274871826,
59
+ "Qwen3_1.7B_layer_4_to_Qwen3_4B_layer_4": 2.7599291801452637,
60
+ "Qwen3_1.7B_layer_4_to_uniform": 9.88111686706543,
61
+ "Qwen3_4B_layer_2_to_Qwen3_0.6B_layer_2": 7.069366455078125,
62
+ "Qwen3_4B_layer_2_to_Qwen3_0.6B_layer_4": 2.4930596351623535,
63
+ "Qwen3_4B_layer_2_to_Qwen3_1.7B_layer_2": 2.338548421859741,
64
+ "Qwen3_4B_layer_2_to_Qwen3_1.7B_layer_4": 2.4058313369750977,
65
+ "Qwen3_4B_layer_2_to_Qwen3_4B_layer_2": 2.42093825340271,
66
+ "Qwen3_4B_layer_2_to_Qwen3_4B_layer_4": 2.3004584312438965,
67
+ "Qwen3_4B_layer_2_to_uniform": 10.104096412658691,
68
+ "Qwen3_4B_layer_4_to_Qwen3_0.6B_layer_2": 6.583094120025635,
69
+ "Qwen3_4B_layer_4_to_Qwen3_0.6B_layer_4": 3.2262253761291504,
70
+ "Qwen3_4B_layer_4_to_Qwen3_1.7B_layer_2": 3.1968085765838623,
71
+ "Qwen3_4B_layer_4_to_Qwen3_1.7B_layer_4": 3.333820343017578,
72
+ "Qwen3_4B_layer_4_to_Qwen3_4B_layer_2": 3.3650805950164795,
73
+ "Qwen3_4B_layer_4_to_Qwen3_4B_layer_4": 3.0115933418273926,
74
+ "Qwen3_4B_layer_4_to_uniform": 10.104096412658691
75
+ },
76
+ "mae_hidden_states": {
77
+ "Qwen3_0.6B_layer_2_to_Qwen3_0.6B_layer_2": 3.5094382762908936,
78
+ "Qwen3_0.6B_layer_4_to_Qwen3_0.6B_layer_2": 2.214106798171997,
79
+ "Qwen3_1.7B_layer_2_to_Qwen3_0.6B_layer_2": 2.274066925048828,
80
+ "Qwen3_1.7B_layer_4_to_Qwen3_0.6B_layer_2": 2.3673298358917236,
81
+ "Qwen3_4B_layer_2_to_Qwen3_0.6B_layer_2": 2.390550136566162,
82
+ "Qwen3_4B_layer_4_to_Qwen3_0.6B_layer_2": 2.258884906768799,
83
+ "Qwen3_0.6B_layer_2_to_Qwen3_0.6B_layer_4": 3.714834690093994,
84
+ "Qwen3_0.6B_layer_4_to_Qwen3_0.6B_layer_4": 1.0173228979110718,
85
+ "Qwen3_1.7B_layer_2_to_Qwen3_0.6B_layer_4": 1.0794646739959717,
86
+ "Qwen3_1.7B_layer_4_to_Qwen3_0.6B_layer_4": 1.068953275680542,
87
+ "Qwen3_4B_layer_2_to_Qwen3_0.6B_layer_4": 1.0350369215011597,
88
+ "Qwen3_4B_layer_4_to_Qwen3_0.6B_layer_4": 1.035721778869629,
89
+ "Qwen3_0.6B_layer_2_to_Qwen3_1.7B_layer_2": 3.772433042526245,
90
+ "Qwen3_0.6B_layer_4_to_Qwen3_1.7B_layer_2": 1.0048426389694214,
91
+ "Qwen3_1.7B_layer_2_to_Qwen3_1.7B_layer_2": 1.02411949634552,
92
+ "Qwen3_1.7B_layer_4_to_Qwen3_1.7B_layer_2": 1.0449342727661133,
93
+ "Qwen3_4B_layer_2_to_Qwen3_1.7B_layer_2": 1.0301669836044312,
94
+ "Qwen3_4B_layer_4_to_Qwen3_1.7B_layer_2": 1.027207612991333,
95
+ "Qwen3_0.6B_layer_2_to_Qwen3_1.7B_layer_4": 2.8531670570373535,
96
+ "Qwen3_0.6B_layer_4_to_Qwen3_1.7B_layer_4": 1.316890001296997,
97
+ "Qwen3_1.7B_layer_2_to_Qwen3_1.7B_layer_4": 1.3466463088989258,
98
+ "Qwen3_1.7B_layer_4_to_Qwen3_1.7B_layer_4": 1.3155006170272827,
99
+ "Qwen3_4B_layer_2_to_Qwen3_1.7B_layer_4": 1.3215066194534302,
100
+ "Qwen3_4B_layer_4_to_Qwen3_1.7B_layer_4": 1.3193069696426392,
101
+ "Qwen3_0.6B_layer_2_to_Qwen3_4B_layer_2": 2.85866117477417,
102
+ "Qwen3_0.6B_layer_4_to_Qwen3_4B_layer_2": 1.078108787536621,
103
+ "Qwen3_1.7B_layer_2_to_Qwen3_4B_layer_2": 1.092585802078247,
104
+ "Qwen3_1.7B_layer_4_to_Qwen3_4B_layer_2": 1.0788657665252686,
105
+ "Qwen3_4B_layer_2_to_Qwen3_4B_layer_2": 1.037369728088379,
106
+ "Qwen3_4B_layer_4_to_Qwen3_4B_layer_2": 1.057699203491211,
107
+ "Qwen3_0.6B_layer_2_to_Qwen3_4B_layer_4": 3.228097438812256,
108
+ "Qwen3_0.6B_layer_4_to_Qwen3_4B_layer_4": 1.1583912372589111,
109
+ "Qwen3_1.7B_layer_2_to_Qwen3_4B_layer_4": 1.1884386539459229,
110
+ "Qwen3_1.7B_layer_4_to_Qwen3_4B_layer_4": 1.1869480609893799,
111
+ "Qwen3_4B_layer_2_to_Qwen3_4B_layer_4": 1.176148772239685,
112
+ "Qwen3_4B_layer_4_to_Qwen3_4B_layer_4": 1.147931694984436
113
+ },
114
+ "alignment": {
115
+ "Qwen3_0.6B_layer_2_to_Qwen3_0.6B_layer_4": {
116
+ "mse": 0.9609375,
117
+ "mean_cosine_similarity": 0.5078125,
118
+ "std_cosine_similarity": 0.2080078125,
119
+ "mean_l2_distance": 49.25,
120
+ "std_l2_distance": 10.375,
121
+ "mean_dimension_correlation": 0.656490707397461,
122
+ "std_dimension_correlation": 0.0958554699318414,
123
+ "linear_cka": 0.78125
124
+ },
125
+ "Qwen3_0.6B_layer_2_to_Qwen3_1.7B_layer_2": {
126
+ "mse": 0.9609375,
127
+ "mean_cosine_similarity": 0.50390625,
128
+ "std_cosine_similarity": 0.208984375,
129
+ "mean_l2_distance": 49.25,
130
+ "std_l2_distance": 10.4375,
131
+ "mean_dimension_correlation": 0.6543472290039063,
132
+ "std_dimension_correlation": 0.09762869101522263,
133
+ "linear_cka": 0.78125
134
+ },
135
+ "Qwen3_0.6B_layer_2_to_Qwen3_1.7B_layer_4": {
136
+ "mse": 0.9609375,
137
+ "mean_cosine_similarity": 0.5078125,
138
+ "std_cosine_similarity": 0.2080078125,
139
+ "mean_l2_distance": 49.25,
140
+ "std_l2_distance": 10.375,
141
+ "mean_dimension_correlation": 0.6573211669921875,
142
+ "std_dimension_correlation": 0.09694915743193826,
143
+ "linear_cka": 0.76953125
144
+ },
145
+ "Qwen3_0.6B_layer_2_to_Qwen3_4B_layer_2": {
146
+ "mse": 0.97265625,
147
+ "mean_cosine_similarity": 0.498046875,
148
+ "std_cosine_similarity": 0.20703125,
149
+ "mean_l2_distance": 49.75,
150
+ "std_l2_distance": 10.25,
151
+ "mean_dimension_correlation": 0.6492362976074219,
152
+ "std_dimension_correlation": 0.09888349567655164,
153
+ "linear_cka": 0.78125
154
+ },
155
+ "Qwen3_0.6B_layer_2_to_Qwen3_4B_layer_4": {
156
+ "mse": 0.96484375,
157
+ "mean_cosine_similarity": 0.50390625,
158
+ "std_cosine_similarity": 0.2099609375,
159
+ "mean_l2_distance": 49.5,
160
+ "std_l2_distance": 10.4375,
161
+ "mean_dimension_correlation": 0.6535564422607422,
162
+ "std_dimension_correlation": 0.09991360994754159,
163
+ "linear_cka": 0.7734375
164
+ },
165
+ "Qwen3_0.6B_layer_4_to_Qwen3_0.6B_layer_2": {
166
+ "mse": 0.9609375,
167
+ "mean_cosine_similarity": 0.5078125,
168
+ "std_cosine_similarity": 0.2080078125,
169
+ "mean_l2_distance": 49.25,
170
+ "std_l2_distance": 10.375,
171
+ "mean_dimension_correlation": 0.6565372467041015,
172
+ "std_dimension_correlation": 0.09589836585224641,
173
+ "linear_cka": 0.78125
174
+ },
175
+ "Qwen3_0.6B_layer_4_to_Qwen3_1.7B_layer_2": {
176
+ "mse": 0.39453125,
177
+ "mean_cosine_similarity": 0.8671875,
178
+ "std_cosine_similarity": 0.212890625,
179
+ "mean_l2_distance": 20.25,
180
+ "std_l2_distance": 16.875,
181
+ "mean_dimension_correlation": 0.8475028991699218,
182
+ "std_dimension_correlation": 0.0568063510608605,
183
+ "linear_cka": 0.984375
184
+ },
185
+ "Qwen3_0.6B_layer_4_to_Qwen3_1.7B_layer_4": {
186
+ "mse": 0.388671875,
187
+ "mean_cosine_similarity": 0.87109375,
188
+ "std_cosine_similarity": 0.2138671875,
189
+ "mean_l2_distance": 19.875,
190
+ "std_l2_distance": 17.0,
191
+ "mean_dimension_correlation": 0.8486709594726562,
192
+ "std_dimension_correlation": 0.05617761489018725,
193
+ "linear_cka": 0.984375
194
+ },
195
+ "Qwen3_0.6B_layer_4_to_Qwen3_4B_layer_2": {
196
+ "mse": 0.404296875,
197
+ "mean_cosine_similarity": 0.859375,
198
+ "std_cosine_similarity": 0.23046875,
199
+ "mean_l2_distance": 20.75,
200
+ "std_l2_distance": 17.75,
201
+ "mean_dimension_correlation": 0.837774658203125,
202
+ "std_dimension_correlation": 0.06036525651201483,
203
+ "linear_cka": 0.984375
204
+ },
205
+ "Qwen3_0.6B_layer_4_to_Qwen3_4B_layer_4": {
206
+ "mse": 0.3984375,
207
+ "mean_cosine_similarity": 0.859375,
208
+ "std_cosine_similarity": 0.2294921875,
209
+ "mean_l2_distance": 20.375,
210
+ "std_l2_distance": 17.75,
211
+ "mean_dimension_correlation": 0.8405960083007813,
212
+ "std_dimension_correlation": 0.06082946585887382,
213
+ "linear_cka": 0.98828125
214
+ },
215
+ "Qwen3_1.7B_layer_2_to_Qwen3_0.6B_layer_2": {
216
+ "mse": 0.9609375,
217
+ "mean_cosine_similarity": 0.50390625,
218
+ "std_cosine_similarity": 0.208984375,
219
+ "mean_l2_distance": 49.25,
220
+ "std_l2_distance": 10.4375,
221
+ "mean_dimension_correlation": 0.6544017791748047,
222
+ "std_dimension_correlation": 0.09771714306924677,
223
+ "linear_cka": 0.78125
224
+ },
225
+ "Qwen3_1.7B_layer_2_to_Qwen3_0.6B_layer_4": {
226
+ "mse": 0.39453125,
227
+ "mean_cosine_similarity": 0.8671875,
228
+ "std_cosine_similarity": 0.212890625,
229
+ "mean_l2_distance": 20.25,
230
+ "std_l2_distance": 16.875,
231
+ "mean_dimension_correlation": 0.8475006103515625,
232
+ "std_dimension_correlation": 0.05680066543049433,
233
+ "linear_cka": 0.984375
234
+ },
235
+ "Qwen3_1.7B_layer_2_to_Qwen3_1.7B_layer_4": {
236
+ "mse": 0.38671875,
237
+ "mean_cosine_similarity": 0.86328125,
238
+ "std_cosine_similarity": 0.2255859375,
239
+ "mean_l2_distance": 19.875,
240
+ "std_l2_distance": 17.75,
241
+ "mean_dimension_correlation": 0.8454933166503906,
242
+ "std_dimension_correlation": 0.05831595958380969,
243
+ "linear_cka": 0.9921875
244
+ },
245
+ "Qwen3_1.7B_layer_2_to_Qwen3_4B_layer_2": {
246
+ "mse": 0.396484375,
247
+ "mean_cosine_similarity": 0.859375,
248
+ "std_cosine_similarity": 0.2275390625,
249
+ "mean_l2_distance": 20.25,
250
+ "std_l2_distance": 17.75,
251
+ "mean_dimension_correlation": 0.840838623046875,
252
+ "std_dimension_correlation": 0.06208702710996684,
253
+ "linear_cka": 0.984375
254
+ },
255
+ "Qwen3_1.7B_layer_2_to_Qwen3_4B_layer_4": {
256
+ "mse": 0.37890625,
257
+ "mean_cosine_similarity": 0.875,
258
+ "std_cosine_similarity": 0.208984375,
259
+ "mean_l2_distance": 19.375,
260
+ "std_l2_distance": 16.875,
261
+ "mean_dimension_correlation": 0.8549148559570312,
262
+ "std_dimension_correlation": 0.05559449933392313,
263
+ "linear_cka": 0.98828125
264
+ },
265
+ "Qwen3_1.7B_layer_4_to_Qwen3_0.6B_layer_2": {
266
+ "mse": 0.9609375,
267
+ "mean_cosine_similarity": 0.5078125,
268
+ "std_cosine_similarity": 0.2080078125,
269
+ "mean_l2_distance": 49.25,
270
+ "std_l2_distance": 10.375,
271
+ "mean_dimension_correlation": 0.6572914123535156,
272
+ "std_dimension_correlation": 0.09700915659232981,
273
+ "linear_cka": 0.76953125
274
+ },
275
+ "Qwen3_1.7B_layer_4_to_Qwen3_0.6B_layer_4": {
276
+ "mse": 0.388671875,
277
+ "mean_cosine_similarity": 0.87109375,
278
+ "std_cosine_similarity": 0.2138671875,
279
+ "mean_l2_distance": 19.875,
280
+ "std_l2_distance": 17.0,
281
+ "mean_dimension_correlation": 0.8486358642578125,
282
+ "std_dimension_correlation": 0.05619399135067562,
283
+ "linear_cka": 0.984375
284
+ },
285
+ "Qwen3_1.7B_layer_4_to_Qwen3_1.7B_layer_2": {
286
+ "mse": 0.38671875,
287
+ "mean_cosine_similarity": 0.86328125,
288
+ "std_cosine_similarity": 0.2255859375,
289
+ "mean_l2_distance": 19.875,
290
+ "std_l2_distance": 17.75,
291
+ "mean_dimension_correlation": 0.8454788208007813,
292
+ "std_dimension_correlation": 0.05838818713320479,
293
+ "linear_cka": 0.9921875
294
+ },
295
+ "Qwen3_1.7B_layer_4_to_Qwen3_4B_layer_2": {
296
+ "mse": 0.396484375,
297
+ "mean_cosine_similarity": 0.86328125,
298
+ "std_cosine_similarity": 0.2255859375,
299
+ "mean_l2_distance": 20.375,
300
+ "std_l2_distance": 17.625,
301
+ "mean_dimension_correlation": 0.8422294616699219,
302
+ "std_dimension_correlation": 0.058264678286021505,
303
+ "linear_cka": 0.984375
304
+ },
305
+ "Qwen3_1.7B_layer_4_to_Qwen3_4B_layer_4": {
306
+ "mse": 0.37890625,
307
+ "mean_cosine_similarity": 0.87109375,
308
+ "std_cosine_similarity": 0.216796875,
309
+ "mean_l2_distance": 19.5,
310
+ "std_l2_distance": 17.25,
311
+ "mean_dimension_correlation": 0.8510147094726562,
312
+ "std_dimension_correlation": 0.05579116262302701,
313
+ "linear_cka": 0.98828125
314
+ },
315
+ "Qwen3_4B_layer_2_to_Qwen3_0.6B_layer_2": {
316
+ "mse": 0.97265625,
317
+ "mean_cosine_similarity": 0.498046875,
318
+ "std_cosine_similarity": 0.20703125,
319
+ "mean_l2_distance": 49.75,
320
+ "std_l2_distance": 10.25,
321
+ "mean_dimension_correlation": 0.6493141174316406,
322
+ "std_dimension_correlation": 0.09894686111103042,
323
+ "linear_cka": 0.78125
324
+ },
325
+ "Qwen3_4B_layer_2_to_Qwen3_0.6B_layer_4": {
326
+ "mse": 0.404296875,
327
+ "mean_cosine_similarity": 0.859375,
328
+ "std_cosine_similarity": 0.23046875,
329
+ "mean_l2_distance": 20.75,
330
+ "std_l2_distance": 17.75,
331
+ "mean_dimension_correlation": 0.837823486328125,
332
+ "std_dimension_correlation": 0.060340047867445096,
333
+ "linear_cka": 0.984375
334
+ },
335
+ "Qwen3_4B_layer_2_to_Qwen3_1.7B_layer_2": {
336
+ "mse": 0.396484375,
337
+ "mean_cosine_similarity": 0.859375,
338
+ "std_cosine_similarity": 0.2275390625,
339
+ "mean_l2_distance": 20.25,
340
+ "std_l2_distance": 17.75,
341
+ "mean_dimension_correlation": 0.8408378601074219,
342
+ "std_dimension_correlation": 0.06205699551364581,
343
+ "linear_cka": 0.984375
344
+ },
345
+ "Qwen3_4B_layer_2_to_Qwen3_1.7B_layer_4": {
346
+ "mse": 0.396484375,
347
+ "mean_cosine_similarity": 0.86328125,
348
+ "std_cosine_similarity": 0.2255859375,
349
+ "mean_l2_distance": 20.375,
350
+ "std_l2_distance": 17.625,
351
+ "mean_dimension_correlation": 0.8422599792480469,
352
+ "std_dimension_correlation": 0.05822862763556895,
353
+ "linear_cka": 0.984375
354
+ },
355
+ "Qwen3_4B_layer_2_to_Qwen3_4B_layer_4": {
356
+ "mse": 0.384765625,
357
+ "mean_cosine_similarity": 0.86328125,
358
+ "std_cosine_similarity": 0.2314453125,
359
+ "mean_l2_distance": 19.75,
360
+ "std_l2_distance": 18.25,
361
+ "mean_dimension_correlation": 0.842401123046875,
362
+ "std_dimension_correlation": 0.06118176940514725,
363
+ "linear_cka": 0.98828125
364
+ },
365
+ "Qwen3_4B_layer_4_to_Qwen3_0.6B_layer_2": {
366
+ "mse": 0.96484375,
367
+ "mean_cosine_similarity": 0.50390625,
368
+ "std_cosine_similarity": 0.2099609375,
369
+ "mean_l2_distance": 49.5,
370
+ "std_l2_distance": 10.4375,
371
+ "mean_dimension_correlation": 0.6536331176757812,
372
+ "std_dimension_correlation": 0.09992718456511775,
373
+ "linear_cka": 0.7734375
374
+ },
375
+ "Qwen3_4B_layer_4_to_Qwen3_0.6B_layer_4": {
376
+ "mse": 0.3984375,
377
+ "mean_cosine_similarity": 0.859375,
378
+ "std_cosine_similarity": 0.2294921875,
379
+ "mean_l2_distance": 20.375,
380
+ "std_l2_distance": 17.75,
381
+ "mean_dimension_correlation": 0.8406883239746094,
382
+ "std_dimension_correlation": 0.06084754257089133,
383
+ "linear_cka": 0.98828125
384
+ },
385
+ "Qwen3_4B_layer_4_to_Qwen3_1.7B_layer_2": {
386
+ "mse": 0.37890625,
387
+ "mean_cosine_similarity": 0.875,
388
+ "std_cosine_similarity": 0.208984375,
389
+ "mean_l2_distance": 19.375,
390
+ "std_l2_distance": 16.875,
391
+ "mean_dimension_correlation": 0.8549110412597656,
392
+ "std_dimension_correlation": 0.05561355528314661,
393
+ "linear_cka": 0.98828125
394
+ },
395
+ "Qwen3_4B_layer_4_to_Qwen3_1.7B_layer_4": {
396
+ "mse": 0.37890625,
397
+ "mean_cosine_similarity": 0.87109375,
398
+ "std_cosine_similarity": 0.216796875,
399
+ "mean_l2_distance": 19.5,
400
+ "std_l2_distance": 17.25,
401
+ "mean_dimension_correlation": 0.851116943359375,
402
+ "std_dimension_correlation": 0.05580195396667784,
403
+ "linear_cka": 0.98828125
404
+ },
405
+ "Qwen3_4B_layer_4_to_Qwen3_4B_layer_2": {
406
+ "mse": 0.384765625,
407
+ "mean_cosine_similarity": 0.86328125,
408
+ "std_cosine_similarity": 0.2314453125,
409
+ "mean_l2_distance": 19.75,
410
+ "std_l2_distance": 18.25,
411
+ "mean_dimension_correlation": 0.8424659729003906,
412
+ "std_dimension_correlation": 0.06116790445559799,
413
+ "linear_cka": 0.98828125
414
+ },
415
+ "avg_mse": 0.5819010416666667,
416
+ "std_mse": 0.27032309960351786,
417
+ "avg_mean_cosine_similarity": 0.744921875,
418
+ "std_mean_cosine_similarity": 0.17021541336381552,
419
+ "avg_std_cosine_similarity": 0.21764322916666667,
420
+ "std_std_cosine_similarity": 0.00918084175662793,
421
+ "avg_mean_l2_distance": 29.825,
422
+ "std_mean_l2_distance": 13.846163728628952,
423
+ "avg_std_l2_distance": 15.116666666666667,
424
+ "std_std_l2_distance": 3.3721335564034565,
425
+ "avg_mean_dimension_correlation": 0.781509501139323,
426
+ "std_mean_dimension_correlation": 0.09012204602156858,
427
+ "avg_std_dimension_correlation": 0.07165274636880824,
428
+ "std_std_dimension_correlation": 0.018653236358734372,
429
+ "avg_linear_cka": 0.9169270833333333,
430
+ "std_linear_cka": 0.09876420102706185
431
+ }
432
+ }
433
+ }
evaluation/metrics_tokens_6002688.json ADDED
@@ -0,0 +1,433 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 1,
3
+ "n_tokens": 6002688,
4
+ "global_step": 5862,
5
+ "training_metrics": {
6
+ "train/loss": 2.546875,
7
+ "train/contrastive": 2.453125,
8
+ "train/recons_loss": 0.6015625,
9
+ "train/balance_loss": 3.875,
10
+ "train/balance_loss_contrastive": 2.84375,
11
+ "train/balance_loss_recons": 1.0234375,
12
+ "train/contrastive_std": 3.359375,
13
+ "train/recons_std": 0.1044921875,
14
+ "train/contrastive_min": 0.0927734375,
15
+ "train/contrastive_max": 7.125,
16
+ "train/recons_min": 0.50390625,
17
+ "train/recons_max": 0.7890625,
18
+ "train/Qwen3_0.6B_layer_2": 0.7890625,
19
+ "train/Qwen3_0.6B_layer_4": 0.546875,
20
+ "train/Qwen3_1.7B_layer_2": 0.5390625,
21
+ "train/Qwen3_1.7B_layer_4": 0.65234375,
22
+ "train/Qwen3_4B_layer_2": 0.50390625,
23
+ "train/Qwen3_4B_layer_4": 0.578125,
24
+ "train/contrastives": null,
25
+ "train/epoch": 1,
26
+ "train/n_tokens": 6002688,
27
+ "train/step": 5862
28
+ },
29
+ "eval_metrics": {
30
+ "global_step": 5862,
31
+ "n_tokens": 6002688,
32
+ "kl_divergence": {
33
+ "Qwen3_0.6B_layer_2_to_Qwen3_0.6B_layer_2": 8.11236572265625,
34
+ "Qwen3_0.6B_layer_2_to_Qwen3_0.6B_layer_4": 7.918520927429199,
35
+ "Qwen3_0.6B_layer_2_to_Qwen3_1.7B_layer_2": 8.107908248901367,
36
+ "Qwen3_0.6B_layer_2_to_Qwen3_1.7B_layer_4": 7.823939323425293,
37
+ "Qwen3_0.6B_layer_2_to_Qwen3_4B_layer_2": 7.930037498474121,
38
+ "Qwen3_0.6B_layer_2_to_Qwen3_4B_layer_4": 7.921452522277832,
39
+ "Qwen3_0.6B_layer_2_to_uniform": 9.070773124694824,
40
+ "Qwen3_0.6B_layer_4_to_Qwen3_0.6B_layer_2": 3.8407585620880127,
41
+ "Qwen3_0.6B_layer_4_to_Qwen3_0.6B_layer_4": 2.2737579345703125,
42
+ "Qwen3_0.6B_layer_4_to_Qwen3_1.7B_layer_2": 2.3659729957580566,
43
+ "Qwen3_0.6B_layer_4_to_Qwen3_1.7B_layer_4": 2.320258617401123,
44
+ "Qwen3_0.6B_layer_4_to_Qwen3_4B_layer_2": 2.256859302520752,
45
+ "Qwen3_0.6B_layer_4_to_Qwen3_4B_layer_4": 2.30106782913208,
46
+ "Qwen3_0.6B_layer_4_to_uniform": 9.070773124694824,
47
+ "Qwen3_1.7B_layer_2_to_Qwen3_0.6B_layer_2": 5.883870601654053,
48
+ "Qwen3_1.7B_layer_2_to_Qwen3_0.6B_layer_4": 5.919462203979492,
49
+ "Qwen3_1.7B_layer_2_to_Qwen3_1.7B_layer_2": 6.107792377471924,
50
+ "Qwen3_1.7B_layer_2_to_Qwen3_1.7B_layer_4": 6.342267036437988,
51
+ "Qwen3_1.7B_layer_2_to_Qwen3_4B_layer_2": 6.5884199142456055,
52
+ "Qwen3_1.7B_layer_2_to_Qwen3_4B_layer_4": 6.350939750671387,
53
+ "Qwen3_1.7B_layer_2_to_uniform": 9.88111686706543,
54
+ "Qwen3_1.7B_layer_4_to_Qwen3_0.6B_layer_2": 3.649061918258667,
55
+ "Qwen3_1.7B_layer_4_to_Qwen3_0.6B_layer_4": 2.6456005573272705,
56
+ "Qwen3_1.7B_layer_4_to_Qwen3_1.7B_layer_2": 2.6709065437316895,
57
+ "Qwen3_1.7B_layer_4_to_Qwen3_1.7B_layer_4": 2.648075580596924,
58
+ "Qwen3_1.7B_layer_4_to_Qwen3_4B_layer_2": 2.622471332550049,
59
+ "Qwen3_1.7B_layer_4_to_Qwen3_4B_layer_4": 2.786158800125122,
60
+ "Qwen3_1.7B_layer_4_to_uniform": 9.88111686706543,
61
+ "Qwen3_4B_layer_2_to_Qwen3_0.6B_layer_2": 4.197429656982422,
62
+ "Qwen3_4B_layer_2_to_Qwen3_0.6B_layer_4": 2.494438648223877,
63
+ "Qwen3_4B_layer_2_to_Qwen3_1.7B_layer_2": 2.0957608222961426,
64
+ "Qwen3_4B_layer_2_to_Qwen3_1.7B_layer_4": 2.3116352558135986,
65
+ "Qwen3_4B_layer_2_to_Qwen3_4B_layer_2": 2.3204939365386963,
66
+ "Qwen3_4B_layer_2_to_Qwen3_4B_layer_4": 2.3190274238586426,
67
+ "Qwen3_4B_layer_2_to_uniform": 10.104096412658691,
68
+ "Qwen3_4B_layer_4_to_Qwen3_0.6B_layer_2": 4.347363471984863,
69
+ "Qwen3_4B_layer_4_to_Qwen3_0.6B_layer_4": 3.295062303543091,
70
+ "Qwen3_4B_layer_4_to_Qwen3_1.7B_layer_2": 2.980980634689331,
71
+ "Qwen3_4B_layer_4_to_Qwen3_1.7B_layer_4": 3.255995512008667,
72
+ "Qwen3_4B_layer_4_to_Qwen3_4B_layer_2": 3.1774487495422363,
73
+ "Qwen3_4B_layer_4_to_Qwen3_4B_layer_4": 2.962179183959961,
74
+ "Qwen3_4B_layer_4_to_uniform": 10.104096412658691
75
+ },
76
+ "mae_hidden_states": {
77
+ "Qwen3_0.6B_layer_2_to_Qwen3_0.6B_layer_2": 1.7902801036834717,
78
+ "Qwen3_0.6B_layer_4_to_Qwen3_0.6B_layer_2": 1.5620601177215576,
79
+ "Qwen3_1.7B_layer_2_to_Qwen3_0.6B_layer_2": 1.5911345481872559,
80
+ "Qwen3_1.7B_layer_4_to_Qwen3_0.6B_layer_2": 1.6545490026474,
81
+ "Qwen3_4B_layer_2_to_Qwen3_0.6B_layer_2": 1.5992227792739868,
82
+ "Qwen3_4B_layer_4_to_Qwen3_0.6B_layer_2": 1.5955195426940918,
83
+ "Qwen3_0.6B_layer_2_to_Qwen3_0.6B_layer_4": 1.649550199508667,
84
+ "Qwen3_0.6B_layer_4_to_Qwen3_0.6B_layer_4": 0.9683128595352173,
85
+ "Qwen3_1.7B_layer_2_to_Qwen3_0.6B_layer_4": 1.0230571031570435,
86
+ "Qwen3_1.7B_layer_4_to_Qwen3_0.6B_layer_4": 1.0230211019515991,
87
+ "Qwen3_4B_layer_2_to_Qwen3_0.6B_layer_4": 0.976743221282959,
88
+ "Qwen3_4B_layer_4_to_Qwen3_0.6B_layer_4": 0.9846528768539429,
89
+ "Qwen3_0.6B_layer_2_to_Qwen3_1.7B_layer_2": 1.6651275157928467,
90
+ "Qwen3_0.6B_layer_4_to_Qwen3_1.7B_layer_2": 0.9568565487861633,
91
+ "Qwen3_1.7B_layer_2_to_Qwen3_1.7B_layer_2": 0.9617317318916321,
92
+ "Qwen3_1.7B_layer_4_to_Qwen3_1.7B_layer_2": 0.9807920455932617,
93
+ "Qwen3_4B_layer_2_to_Qwen3_1.7B_layer_2": 0.968661904335022,
94
+ "Qwen3_4B_layer_4_to_Qwen3_1.7B_layer_2": 0.9733489751815796,
95
+ "Qwen3_0.6B_layer_2_to_Qwen3_1.7B_layer_4": 1.803094506263733,
96
+ "Qwen3_0.6B_layer_4_to_Qwen3_1.7B_layer_4": 1.2474844455718994,
97
+ "Qwen3_1.7B_layer_2_to_Qwen3_1.7B_layer_4": 1.2767094373703003,
98
+ "Qwen3_1.7B_layer_4_to_Qwen3_1.7B_layer_4": 1.2572760581970215,
99
+ "Qwen3_4B_layer_2_to_Qwen3_1.7B_layer_4": 1.2457572221755981,
100
+ "Qwen3_4B_layer_4_to_Qwen3_1.7B_layer_4": 1.2488363981246948,
101
+ "Qwen3_0.6B_layer_2_to_Qwen3_4B_layer_2": 1.6203731298446655,
102
+ "Qwen3_0.6B_layer_4_to_Qwen3_4B_layer_2": 1.0437383651733398,
103
+ "Qwen3_1.7B_layer_2_to_Qwen3_4B_layer_2": 1.0250771045684814,
104
+ "Qwen3_1.7B_layer_4_to_Qwen3_4B_layer_2": 1.0189640522003174,
105
+ "Qwen3_4B_layer_2_to_Qwen3_4B_layer_2": 0.9739342927932739,
106
+ "Qwen3_4B_layer_4_to_Qwen3_4B_layer_2": 0.9979578852653503,
107
+ "Qwen3_0.6B_layer_2_to_Qwen3_4B_layer_4": 1.6929349899291992,
108
+ "Qwen3_0.6B_layer_4_to_Qwen3_4B_layer_4": 1.1436792612075806,
109
+ "Qwen3_1.7B_layer_2_to_Qwen3_4B_layer_4": 1.1596219539642334,
110
+ "Qwen3_1.7B_layer_4_to_Qwen3_4B_layer_4": 1.1562455892562866,
111
+ "Qwen3_4B_layer_2_to_Qwen3_4B_layer_4": 1.1323540210723877,
112
+ "Qwen3_4B_layer_4_to_Qwen3_4B_layer_4": 1.1177374124526978
113
+ },
114
+ "alignment": {
115
+ "Qwen3_0.6B_layer_2_to_Qwen3_0.6B_layer_4": {
116
+ "mse": 0.55859375,
117
+ "mean_cosine_similarity": 0.80859375,
118
+ "std_cosine_similarity": 0.1884765625,
119
+ "mean_l2_distance": 28.625,
120
+ "std_l2_distance": 13.4375,
121
+ "mean_dimension_correlation": 0.8190200805664063,
122
+ "std_dimension_correlation": 0.05411914097024187,
123
+ "linear_cka": 0.8828125
124
+ },
125
+ "Qwen3_0.6B_layer_2_to_Qwen3_1.7B_layer_2": {
126
+ "mse": 0.56640625,
127
+ "mean_cosine_similarity": 0.80078125,
128
+ "std_cosine_similarity": 0.1943359375,
129
+ "mean_l2_distance": 29.0,
130
+ "std_l2_distance": 13.6875,
131
+ "mean_dimension_correlation": 0.8142623901367188,
132
+ "std_dimension_correlation": 0.05641095638371379,
133
+ "linear_cka": 0.8828125
134
+ },
135
+ "Qwen3_0.6B_layer_2_to_Qwen3_1.7B_layer_4": {
136
+ "mse": 0.55859375,
137
+ "mean_cosine_similarity": 0.80859375,
138
+ "std_cosine_similarity": 0.1884765625,
139
+ "mean_l2_distance": 28.75,
140
+ "std_l2_distance": 13.4375,
141
+ "mean_dimension_correlation": 0.8182327270507812,
142
+ "std_dimension_correlation": 0.05454624510823859,
143
+ "linear_cka": 0.875
144
+ },
145
+ "Qwen3_0.6B_layer_2_to_Qwen3_4B_layer_2": {
146
+ "mse": 0.56640625,
147
+ "mean_cosine_similarity": 0.80078125,
148
+ "std_cosine_similarity": 0.1953125,
149
+ "mean_l2_distance": 29.0,
150
+ "std_l2_distance": 13.8125,
151
+ "mean_dimension_correlation": 0.8126785278320312,
152
+ "std_dimension_correlation": 0.0564578377173453,
153
+ "linear_cka": 0.8828125
154
+ },
155
+ "Qwen3_0.6B_layer_2_to_Qwen3_4B_layer_4": {
156
+ "mse": 0.56640625,
157
+ "mean_cosine_similarity": 0.80078125,
158
+ "std_cosine_similarity": 0.1943359375,
159
+ "mean_l2_distance": 29.0,
160
+ "std_l2_distance": 13.75,
161
+ "mean_dimension_correlation": 0.8143714904785156,
162
+ "std_dimension_correlation": 0.05675514125293357,
163
+ "linear_cka": 0.8828125
164
+ },
165
+ "Qwen3_0.6B_layer_4_to_Qwen3_0.6B_layer_2": {
166
+ "mse": 0.55859375,
167
+ "mean_cosine_similarity": 0.80859375,
168
+ "std_cosine_similarity": 0.1884765625,
169
+ "mean_l2_distance": 28.625,
170
+ "std_l2_distance": 13.4375,
171
+ "mean_dimension_correlation": 0.818988037109375,
172
+ "std_dimension_correlation": 0.05413218674674489,
173
+ "linear_cka": 0.8828125
174
+ },
175
+ "Qwen3_0.6B_layer_4_to_Qwen3_1.7B_layer_2": {
176
+ "mse": 0.34375,
177
+ "mean_cosine_similarity": 0.8984375,
178
+ "std_cosine_similarity": 0.1865234375,
179
+ "mean_l2_distance": 17.625,
180
+ "std_l2_distance": 15.1875,
181
+ "mean_dimension_correlation": 0.8818618774414062,
182
+ "std_dimension_correlation": 0.04443958868205486,
183
+ "linear_cka": 0.9921875
184
+ },
185
+ "Qwen3_0.6B_layer_4_to_Qwen3_1.7B_layer_4": {
186
+ "mse": 0.337890625,
187
+ "mean_cosine_similarity": 0.8984375,
188
+ "std_cosine_similarity": 0.185546875,
189
+ "mean_l2_distance": 17.375,
190
+ "std_l2_distance": 15.1875,
191
+ "mean_dimension_correlation": 0.8834381103515625,
192
+ "std_dimension_correlation": 0.04373309871215753,
193
+ "linear_cka": 0.984375
194
+ },
195
+ "Qwen3_0.6B_layer_4_to_Qwen3_4B_layer_2": {
196
+ "mse": 0.349609375,
197
+ "mean_cosine_similarity": 0.890625,
198
+ "std_cosine_similarity": 0.2001953125,
199
+ "mean_l2_distance": 18.0,
200
+ "std_l2_distance": 15.9375,
201
+ "mean_dimension_correlation": 0.875439453125,
202
+ "std_dimension_correlation": 0.04714301734748471,
203
+ "linear_cka": 0.984375
204
+ },
205
+ "Qwen3_0.6B_layer_4_to_Qwen3_4B_layer_4": {
206
+ "mse": 0.345703125,
207
+ "mean_cosine_similarity": 0.89453125,
208
+ "std_cosine_similarity": 0.19921875,
209
+ "mean_l2_distance": 17.75,
210
+ "std_l2_distance": 15.875,
211
+ "mean_dimension_correlation": 0.8766807556152344,
212
+ "std_dimension_correlation": 0.048109971697749936,
213
+ "linear_cka": 0.98828125
214
+ },
215
+ "Qwen3_1.7B_layer_2_to_Qwen3_0.6B_layer_2": {
216
+ "mse": 0.56640625,
217
+ "mean_cosine_similarity": 0.80078125,
218
+ "std_cosine_similarity": 0.1943359375,
219
+ "mean_l2_distance": 29.0,
220
+ "std_l2_distance": 13.6875,
221
+ "mean_dimension_correlation": 0.814276123046875,
222
+ "std_dimension_correlation": 0.05643128448459502,
223
+ "linear_cka": 0.8828125
224
+ },
225
+ "Qwen3_1.7B_layer_2_to_Qwen3_0.6B_layer_4": {
226
+ "mse": 0.34375,
227
+ "mean_cosine_similarity": 0.8984375,
228
+ "std_cosine_similarity": 0.1865234375,
229
+ "mean_l2_distance": 17.625,
230
+ "std_l2_distance": 15.1875,
231
+ "mean_dimension_correlation": 0.88189697265625,
232
+ "std_dimension_correlation": 0.044490526216888565,
233
+ "linear_cka": 0.9921875
234
+ },
235
+ "Qwen3_1.7B_layer_2_to_Qwen3_1.7B_layer_4": {
236
+ "mse": 0.3359375,
237
+ "mean_cosine_similarity": 0.89453125,
238
+ "std_cosine_similarity": 0.197265625,
239
+ "mean_l2_distance": 17.25,
240
+ "std_l2_distance": 15.9375,
241
+ "mean_dimension_correlation": 0.880517578125,
242
+ "std_dimension_correlation": 0.0455623185686599,
243
+ "linear_cka": 0.9921875
244
+ },
245
+ "Qwen3_1.7B_layer_2_to_Qwen3_4B_layer_2": {
246
+ "mse": 0.34375,
247
+ "mean_cosine_similarity": 0.89453125,
248
+ "std_cosine_similarity": 0.19921875,
249
+ "mean_l2_distance": 17.5,
250
+ "std_l2_distance": 16.0,
251
+ "mean_dimension_correlation": 0.8772689819335937,
252
+ "std_dimension_correlation": 0.04894074305932899,
253
+ "linear_cka": 0.9921875
254
+ },
255
+ "Qwen3_1.7B_layer_2_to_Qwen3_4B_layer_4": {
256
+ "mse": 0.328125,
257
+ "mean_cosine_similarity": 0.90234375,
258
+ "std_cosine_similarity": 0.18359375,
259
+ "mean_l2_distance": 16.875,
260
+ "std_l2_distance": 15.1875,
261
+ "mean_dimension_correlation": 0.8873504638671875,
262
+ "std_dimension_correlation": 0.04405998009297244,
263
+ "linear_cka": 0.98828125
264
+ },
265
+ "Qwen3_1.7B_layer_4_to_Qwen3_0.6B_layer_2": {
266
+ "mse": 0.55859375,
267
+ "mean_cosine_similarity": 0.80859375,
268
+ "std_cosine_similarity": 0.1884765625,
269
+ "mean_l2_distance": 28.75,
270
+ "std_l2_distance": 13.4375,
271
+ "mean_dimension_correlation": 0.8182823181152343,
272
+ "std_dimension_correlation": 0.054536269965662375,
273
+ "linear_cka": 0.875
274
+ },
275
+ "Qwen3_1.7B_layer_4_to_Qwen3_0.6B_layer_4": {
276
+ "mse": 0.337890625,
277
+ "mean_cosine_similarity": 0.8984375,
278
+ "std_cosine_similarity": 0.185546875,
279
+ "mean_l2_distance": 17.375,
280
+ "std_l2_distance": 15.1875,
281
+ "mean_dimension_correlation": 0.8834640502929687,
282
+ "std_dimension_correlation": 0.043707021156640234,
283
+ "linear_cka": 0.984375
284
+ },
285
+ "Qwen3_1.7B_layer_4_to_Qwen3_1.7B_layer_2": {
286
+ "mse": 0.3359375,
287
+ "mean_cosine_similarity": 0.89453125,
288
+ "std_cosine_similarity": 0.197265625,
289
+ "mean_l2_distance": 17.25,
290
+ "std_l2_distance": 15.9375,
291
+ "mean_dimension_correlation": 0.880511474609375,
292
+ "std_dimension_correlation": 0.04552510428270067,
293
+ "linear_cka": 0.9921875
294
+ },
295
+ "Qwen3_1.7B_layer_4_to_Qwen3_4B_layer_2": {
296
+ "mse": 0.34375,
297
+ "mean_cosine_similarity": 0.89453125,
298
+ "std_cosine_similarity": 0.197265625,
299
+ "mean_l2_distance": 17.625,
300
+ "std_l2_distance": 15.875,
301
+ "mean_dimension_correlation": 0.878497314453125,
302
+ "std_dimension_correlation": 0.045715874336397684,
303
+ "linear_cka": 0.984375
304
+ },
305
+ "Qwen3_1.7B_layer_4_to_Qwen3_4B_layer_4": {
306
+ "mse": 0.330078125,
307
+ "mean_cosine_similarity": 0.90234375,
308
+ "std_cosine_similarity": 0.189453125,
309
+ "mean_l2_distance": 17.0,
310
+ "std_l2_distance": 15.4375,
311
+ "mean_dimension_correlation": 0.8847824096679687,
312
+ "std_dimension_correlation": 0.0437299377718827,
313
+ "linear_cka": 0.98828125
314
+ },
315
+ "Qwen3_4B_layer_2_to_Qwen3_0.6B_layer_2": {
316
+ "mse": 0.56640625,
317
+ "mean_cosine_similarity": 0.80078125,
318
+ "std_cosine_similarity": 0.1953125,
319
+ "mean_l2_distance": 29.0,
320
+ "std_l2_distance": 13.8125,
321
+ "mean_dimension_correlation": 0.8127197265625,
322
+ "std_dimension_correlation": 0.05649289036639174,
323
+ "linear_cka": 0.8828125
324
+ },
325
+ "Qwen3_4B_layer_2_to_Qwen3_0.6B_layer_4": {
326
+ "mse": 0.349609375,
327
+ "mean_cosine_similarity": 0.890625,
328
+ "std_cosine_similarity": 0.2001953125,
329
+ "mean_l2_distance": 18.0,
330
+ "std_l2_distance": 15.9375,
331
+ "mean_dimension_correlation": 0.8753738403320312,
332
+ "std_dimension_correlation": 0.047172969623106395,
333
+ "linear_cka": 0.984375
334
+ },
335
+ "Qwen3_4B_layer_2_to_Qwen3_1.7B_layer_2": {
336
+ "mse": 0.34375,
337
+ "mean_cosine_similarity": 0.89453125,
338
+ "std_cosine_similarity": 0.19921875,
339
+ "mean_l2_distance": 17.5,
340
+ "std_l2_distance": 16.0,
341
+ "mean_dimension_correlation": 0.8773391723632813,
342
+ "std_dimension_correlation": 0.048965809084662616,
343
+ "linear_cka": 0.9921875
344
+ },
345
+ "Qwen3_4B_layer_2_to_Qwen3_1.7B_layer_4": {
346
+ "mse": 0.34375,
347
+ "mean_cosine_similarity": 0.89453125,
348
+ "std_cosine_similarity": 0.197265625,
349
+ "mean_l2_distance": 17.625,
350
+ "std_l2_distance": 15.875,
351
+ "mean_dimension_correlation": 0.8785049438476562,
352
+ "std_dimension_correlation": 0.04566486947653503,
353
+ "linear_cka": 0.984375
354
+ },
355
+ "Qwen3_4B_layer_2_to_Qwen3_4B_layer_4": {
356
+ "mse": 0.330078125,
357
+ "mean_cosine_similarity": 0.89453125,
358
+ "std_cosine_similarity": 0.2021484375,
359
+ "mean_l2_distance": 16.875,
360
+ "std_l2_distance": 16.5,
361
+ "mean_dimension_correlation": 0.8791839599609375,
362
+ "std_dimension_correlation": 0.04851861504925165,
363
+ "linear_cka": 0.99609375
364
+ },
365
+ "Qwen3_4B_layer_4_to_Qwen3_0.6B_layer_2": {
366
+ "mse": 0.56640625,
367
+ "mean_cosine_similarity": 0.80078125,
368
+ "std_cosine_similarity": 0.1943359375,
369
+ "mean_l2_distance": 29.0,
370
+ "std_l2_distance": 13.75,
371
+ "mean_dimension_correlation": 0.8144371032714843,
372
+ "std_dimension_correlation": 0.056772418466086064,
373
+ "linear_cka": 0.8828125
374
+ },
375
+ "Qwen3_4B_layer_4_to_Qwen3_0.6B_layer_4": {
376
+ "mse": 0.345703125,
377
+ "mean_cosine_similarity": 0.89453125,
378
+ "std_cosine_similarity": 0.19921875,
379
+ "mean_l2_distance": 17.75,
380
+ "std_l2_distance": 15.875,
381
+ "mean_dimension_correlation": 0.8767280578613281,
382
+ "std_dimension_correlation": 0.04809051339837505,
383
+ "linear_cka": 0.98828125
384
+ },
385
+ "Qwen3_4B_layer_4_to_Qwen3_1.7B_layer_2": {
386
+ "mse": 0.328125,
387
+ "mean_cosine_similarity": 0.90234375,
388
+ "std_cosine_similarity": 0.18359375,
389
+ "mean_l2_distance": 16.875,
390
+ "std_l2_distance": 15.1875,
391
+ "mean_dimension_correlation": 0.88740234375,
392
+ "std_dimension_correlation": 0.04400411105457499,
393
+ "linear_cka": 0.98828125
394
+ },
395
+ "Qwen3_4B_layer_4_to_Qwen3_1.7B_layer_4": {
396
+ "mse": 0.330078125,
397
+ "mean_cosine_similarity": 0.90234375,
398
+ "std_cosine_similarity": 0.189453125,
399
+ "mean_l2_distance": 17.0,
400
+ "std_l2_distance": 15.4375,
401
+ "mean_dimension_correlation": 0.8847671508789062,
402
+ "std_dimension_correlation": 0.04373389353658331,
403
+ "linear_cka": 0.98828125
404
+ },
405
+ "Qwen3_4B_layer_4_to_Qwen3_4B_layer_2": {
406
+ "mse": 0.330078125,
407
+ "mean_cosine_similarity": 0.89453125,
408
+ "std_cosine_similarity": 0.2021484375,
409
+ "mean_l2_distance": 16.875,
410
+ "std_l2_distance": 16.5,
411
+ "mean_dimension_correlation": 0.8792098999023438,
412
+ "std_dimension_correlation": 0.04842937345071404,
413
+ "linear_cka": 0.99609375
414
+ },
415
+ "avg_mse": 0.413671875,
416
+ "std_mse": 0.10597438593952982,
417
+ "avg_mean_cosine_similarity": 0.865625,
418
+ "std_mean_cosine_similarity": 0.04379647828783345,
419
+ "avg_std_cosine_similarity": 0.19342447916666666,
420
+ "std_std_cosine_similarity": 0.005743944010446035,
421
+ "avg_mean_l2_distance": 21.216666666666665,
422
+ "std_mean_l2_distance": 5.424187087071717,
423
+ "avg_std_l2_distance": 15.016666666666667,
424
+ "std_std_l2_distance": 1.0459412294940647,
425
+ "avg_mean_dimension_correlation": 0.858916244506836,
426
+ "std_mean_dimension_correlation": 0.03071649327798749,
427
+ "avg_std_dimension_correlation": 0.04921305693535582,
428
+ "std_std_dimension_correlation": 0.004870255293926663,
429
+ "avg_linear_cka": 0.953125,
430
+ "std_linear_cka": 0.05095123792248166
431
+ }
432
+ }
433
+ }
evaluation/metrics_tokens_7003136.json ADDED
@@ -0,0 +1,433 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 1,
3
+ "n_tokens": 7003136,
4
+ "global_step": 6839,
5
+ "training_metrics": {
6
+ "train/loss": 2.546875,
7
+ "train/contrastive": 2.453125,
8
+ "train/recons_loss": 0.5859375,
9
+ "train/balance_loss": 3.875,
10
+ "train/balance_loss_contrastive": 2.859375,
11
+ "train/balance_loss_recons": 1.015625,
12
+ "train/contrastive_std": 3.390625,
13
+ "train/recons_std": 0.07177734375,
14
+ "train/contrastive_min": 0.0849609375,
15
+ "train/contrastive_max": 7.1875,
16
+ "train/recons_min": 0.49609375,
17
+ "train/recons_max": 0.68359375,
18
+ "train/Qwen3_0.6B_layer_2": 0.68359375,
19
+ "train/Qwen3_0.6B_layer_4": 0.55859375,
20
+ "train/Qwen3_1.7B_layer_2": 0.5390625,
21
+ "train/Qwen3_1.7B_layer_4": 0.65625,
22
+ "train/Qwen3_4B_layer_2": 0.49609375,
23
+ "train/Qwen3_4B_layer_4": 0.57421875,
24
+ "train/contrastives": null,
25
+ "train/epoch": 1,
26
+ "train/n_tokens": 7003136,
27
+ "train/step": 6839
28
+ },
29
+ "eval_metrics": {
30
+ "global_step": 6839,
31
+ "n_tokens": 7003136,
32
+ "kl_divergence": {
33
+ "Qwen3_0.6B_layer_2_to_Qwen3_0.6B_layer_2": 7.436762809753418,
34
+ "Qwen3_0.6B_layer_2_to_Qwen3_0.6B_layer_4": 7.0658464431762695,
35
+ "Qwen3_0.6B_layer_2_to_Qwen3_1.7B_layer_2": 7.481184482574463,
36
+ "Qwen3_0.6B_layer_2_to_Qwen3_1.7B_layer_4": 7.2030930519104,
37
+ "Qwen3_0.6B_layer_2_to_Qwen3_4B_layer_2": 7.240965366363525,
38
+ "Qwen3_0.6B_layer_2_to_Qwen3_4B_layer_4": 7.4969305992126465,
39
+ "Qwen3_0.6B_layer_2_to_uniform": 9.070773124694824,
40
+ "Qwen3_0.6B_layer_4_to_Qwen3_0.6B_layer_2": 2.213552474975586,
41
+ "Qwen3_0.6B_layer_4_to_Qwen3_0.6B_layer_4": 2.2100319862365723,
42
+ "Qwen3_0.6B_layer_4_to_Qwen3_1.7B_layer_2": 2.3322105407714844,
43
+ "Qwen3_0.6B_layer_4_to_Qwen3_1.7B_layer_4": 2.236471652984619,
44
+ "Qwen3_0.6B_layer_4_to_Qwen3_4B_layer_2": 2.2212750911712646,
45
+ "Qwen3_0.6B_layer_4_to_Qwen3_4B_layer_4": 2.308291435241699,
46
+ "Qwen3_0.6B_layer_4_to_uniform": 9.070773124694824,
47
+ "Qwen3_1.7B_layer_2_to_Qwen3_0.6B_layer_2": 5.611138343811035,
48
+ "Qwen3_1.7B_layer_2_to_Qwen3_0.6B_layer_4": 5.940827369689941,
49
+ "Qwen3_1.7B_layer_2_to_Qwen3_1.7B_layer_2": 6.0901408195495605,
50
+ "Qwen3_1.7B_layer_2_to_Qwen3_1.7B_layer_4": 6.319899559020996,
51
+ "Qwen3_1.7B_layer_2_to_Qwen3_4B_layer_2": 6.241495609283447,
52
+ "Qwen3_1.7B_layer_2_to_Qwen3_4B_layer_4": 6.30285120010376,
53
+ "Qwen3_1.7B_layer_2_to_uniform": 9.88111686706543,
54
+ "Qwen3_1.7B_layer_4_to_Qwen3_0.6B_layer_2": 2.640443801879883,
55
+ "Qwen3_1.7B_layer_4_to_Qwen3_0.6B_layer_4": 2.575563907623291,
56
+ "Qwen3_1.7B_layer_4_to_Qwen3_1.7B_layer_2": 2.661689043045044,
57
+ "Qwen3_1.7B_layer_4_to_Qwen3_1.7B_layer_4": 2.5703558921813965,
58
+ "Qwen3_1.7B_layer_4_to_Qwen3_4B_layer_2": 2.555753707885742,
59
+ "Qwen3_1.7B_layer_4_to_Qwen3_4B_layer_4": 2.7401225566864014,
60
+ "Qwen3_1.7B_layer_4_to_uniform": 9.88111686706543,
61
+ "Qwen3_4B_layer_2_to_Qwen3_0.6B_layer_2": 2.5014476776123047,
62
+ "Qwen3_4B_layer_2_to_Qwen3_0.6B_layer_4": 2.3176639080047607,
63
+ "Qwen3_4B_layer_2_to_Qwen3_1.7B_layer_2": 2.024665355682373,
64
+ "Qwen3_4B_layer_2_to_Qwen3_1.7B_layer_4": 2.2373054027557373,
65
+ "Qwen3_4B_layer_2_to_Qwen3_4B_layer_2": 2.237010955810547,
66
+ "Qwen3_4B_layer_2_to_Qwen3_4B_layer_4": 2.1157214641571045,
67
+ "Qwen3_4B_layer_2_to_uniform": 10.104096412658691,
68
+ "Qwen3_4B_layer_4_to_Qwen3_0.6B_layer_2": 3.1809849739074707,
69
+ "Qwen3_4B_layer_4_to_Qwen3_0.6B_layer_4": 3.101025104522705,
70
+ "Qwen3_4B_layer_4_to_Qwen3_1.7B_layer_2": 2.89253568649292,
71
+ "Qwen3_4B_layer_4_to_Qwen3_1.7B_layer_4": 3.0308847427368164,
72
+ "Qwen3_4B_layer_4_to_Qwen3_4B_layer_2": 3.043917179107666,
73
+ "Qwen3_4B_layer_4_to_Qwen3_4B_layer_4": 2.7547640800476074,
74
+ "Qwen3_4B_layer_4_to_uniform": 10.104096412658691
75
+ },
76
+ "mae_hidden_states": {
77
+ "Qwen3_0.6B_layer_2_to_Qwen3_0.6B_layer_2": 1.3392760753631592,
78
+ "Qwen3_0.6B_layer_4_to_Qwen3_0.6B_layer_2": 1.2929567098617554,
79
+ "Qwen3_1.7B_layer_2_to_Qwen3_0.6B_layer_2": 1.3323848247528076,
80
+ "Qwen3_1.7B_layer_4_to_Qwen3_0.6B_layer_2": 1.3409219980239868,
81
+ "Qwen3_4B_layer_2_to_Qwen3_0.6B_layer_2": 1.3193416595458984,
82
+ "Qwen3_4B_layer_4_to_Qwen3_0.6B_layer_2": 1.3225743770599365,
83
+ "Qwen3_0.6B_layer_2_to_Qwen3_0.6B_layer_4": 0.9926112294197083,
84
+ "Qwen3_0.6B_layer_4_to_Qwen3_0.6B_layer_4": 0.9511648416519165,
85
+ "Qwen3_1.7B_layer_2_to_Qwen3_0.6B_layer_4": 0.9907838106155396,
86
+ "Qwen3_1.7B_layer_4_to_Qwen3_0.6B_layer_4": 1.0008413791656494,
87
+ "Qwen3_4B_layer_2_to_Qwen3_0.6B_layer_4": 0.9629853367805481,
88
+ "Qwen3_4B_layer_4_to_Qwen3_0.6B_layer_4": 0.9665719270706177,
89
+ "Qwen3_0.6B_layer_2_to_Qwen3_1.7B_layer_2": 0.9707884788513184,
90
+ "Qwen3_0.6B_layer_4_to_Qwen3_1.7B_layer_2": 0.9194974899291992,
91
+ "Qwen3_1.7B_layer_2_to_Qwen3_1.7B_layer_2": 0.9195350408554077,
92
+ "Qwen3_1.7B_layer_4_to_Qwen3_1.7B_layer_2": 0.9375813603401184,
93
+ "Qwen3_4B_layer_2_to_Qwen3_1.7B_layer_2": 0.9312270283699036,
94
+ "Qwen3_4B_layer_4_to_Qwen3_1.7B_layer_2": 0.9314996600151062,
95
+ "Qwen3_0.6B_layer_2_to_Qwen3_1.7B_layer_4": 1.2487989664077759,
96
+ "Qwen3_0.6B_layer_4_to_Qwen3_1.7B_layer_4": 1.2148901224136353,
97
+ "Qwen3_1.7B_layer_2_to_Qwen3_1.7B_layer_4": 1.2322641611099243,
98
+ "Qwen3_1.7B_layer_4_to_Qwen3_1.7B_layer_4": 1.2202345132827759,
99
+ "Qwen3_4B_layer_2_to_Qwen3_1.7B_layer_4": 1.212104320526123,
100
+ "Qwen3_4B_layer_4_to_Qwen3_1.7B_layer_4": 1.2153819799423218,
101
+ "Qwen3_0.6B_layer_2_to_Qwen3_4B_layer_2": 0.9843270182609558,
102
+ "Qwen3_0.6B_layer_4_to_Qwen3_4B_layer_2": 0.9733158349990845,
103
+ "Qwen3_1.7B_layer_2_to_Qwen3_4B_layer_2": 0.9586274027824402,
104
+ "Qwen3_1.7B_layer_4_to_Qwen3_4B_layer_2": 0.9659514427185059,
105
+ "Qwen3_4B_layer_2_to_Qwen3_4B_layer_2": 0.9286855459213257,
106
+ "Qwen3_4B_layer_4_to_Qwen3_4B_layer_2": 0.941694438457489,
107
+ "Qwen3_0.6B_layer_2_to_Qwen3_4B_layer_4": 1.0964877605438232,
108
+ "Qwen3_0.6B_layer_4_to_Qwen3_4B_layer_4": 1.0766807794570923,
109
+ "Qwen3_1.7B_layer_2_to_Qwen3_4B_layer_4": 1.0825374126434326,
110
+ "Qwen3_1.7B_layer_4_to_Qwen3_4B_layer_4": 1.0911656618118286,
111
+ "Qwen3_4B_layer_2_to_Qwen3_4B_layer_4": 1.0675519704818726,
112
+ "Qwen3_4B_layer_4_to_Qwen3_4B_layer_4": 1.0499017238616943
113
+ },
114
+ "alignment": {
115
+ "Qwen3_0.6B_layer_2_to_Qwen3_0.6B_layer_4": {
116
+ "mse": 0.37890625,
117
+ "mean_cosine_similarity": 0.89453125,
118
+ "std_cosine_similarity": 0.1748046875,
119
+ "mean_l2_distance": 19.375,
120
+ "std_l2_distance": 13.5,
121
+ "mean_dimension_correlation": 0.88087158203125,
122
+ "std_dimension_correlation": 0.03891650382660663,
123
+ "linear_cka": 0.96875
124
+ },
125
+ "Qwen3_0.6B_layer_2_to_Qwen3_1.7B_layer_2": {
126
+ "mse": 0.384765625,
127
+ "mean_cosine_similarity": 0.890625,
128
+ "std_cosine_similarity": 0.1806640625,
129
+ "mean_l2_distance": 19.625,
130
+ "std_l2_distance": 13.8125,
131
+ "mean_dimension_correlation": 0.8772735595703125,
132
+ "std_dimension_correlation": 0.040613200120000074,
133
+ "linear_cka": 0.96875
134
+ },
135
+ "Qwen3_0.6B_layer_2_to_Qwen3_1.7B_layer_4": {
136
+ "mse": 0.380859375,
137
+ "mean_cosine_similarity": 0.89453125,
138
+ "std_cosine_similarity": 0.1728515625,
139
+ "mean_l2_distance": 19.5,
140
+ "std_l2_distance": 13.25,
141
+ "mean_dimension_correlation": 0.8814620971679688,
142
+ "std_dimension_correlation": 0.03743361738689456,
143
+ "linear_cka": 0.96875
144
+ },
145
+ "Qwen3_0.6B_layer_2_to_Qwen3_4B_layer_2": {
146
+ "mse": 0.380859375,
147
+ "mean_cosine_similarity": 0.890625,
148
+ "std_cosine_similarity": 0.1787109375,
149
+ "mean_l2_distance": 19.625,
150
+ "std_l2_distance": 13.6875,
151
+ "mean_dimension_correlation": 0.8784408569335938,
152
+ "std_dimension_correlation": 0.03935897183860133,
153
+ "linear_cka": 0.96875
154
+ },
155
+ "Qwen3_0.6B_layer_2_to_Qwen3_4B_layer_4": {
156
+ "mse": 0.37890625,
157
+ "mean_cosine_similarity": 0.89453125,
158
+ "std_cosine_similarity": 0.177734375,
159
+ "mean_l2_distance": 19.5,
160
+ "std_l2_distance": 13.5625,
161
+ "mean_dimension_correlation": 0.880218505859375,
162
+ "std_dimension_correlation": 0.03827009184402313,
163
+ "linear_cka": 0.96875
164
+ },
165
+ "Qwen3_0.6B_layer_4_to_Qwen3_0.6B_layer_2": {
166
+ "mse": 0.37890625,
167
+ "mean_cosine_similarity": 0.89453125,
168
+ "std_cosine_similarity": 0.1748046875,
169
+ "mean_l2_distance": 19.375,
170
+ "std_l2_distance": 13.5,
171
+ "mean_dimension_correlation": 0.8809066772460937,
172
+ "std_dimension_correlation": 0.038947862226715424,
173
+ "linear_cka": 0.96875
174
+ },
175
+ "Qwen3_0.6B_layer_4_to_Qwen3_1.7B_layer_2": {
176
+ "mse": 0.3125,
177
+ "mean_cosine_similarity": 0.9140625,
178
+ "std_cosine_similarity": 0.171875,
179
+ "mean_l2_distance": 16.0,
180
+ "std_l2_distance": 14.1875,
181
+ "mean_dimension_correlation": 0.900384521484375,
182
+ "std_dimension_correlation": 0.03780791654866101,
183
+ "linear_cka": 0.984375
184
+ },
185
+ "Qwen3_0.6B_layer_4_to_Qwen3_1.7B_layer_4": {
186
+ "mse": 0.30859375,
187
+ "mean_cosine_similarity": 0.9140625,
188
+ "std_cosine_similarity": 0.171875,
189
+ "mean_l2_distance": 15.75,
190
+ "std_l2_distance": 14.1875,
191
+ "mean_dimension_correlation": 0.9015731811523438,
192
+ "std_dimension_correlation": 0.037591582433134804,
193
+ "linear_cka": 0.984375
194
+ },
195
+ "Qwen3_0.6B_layer_4_to_Qwen3_4B_layer_2": {
196
+ "mse": 0.314453125,
197
+ "mean_cosine_similarity": 0.91015625,
198
+ "std_cosine_similarity": 0.18359375,
199
+ "mean_l2_distance": 16.125,
200
+ "std_l2_distance": 14.75,
201
+ "mean_dimension_correlation": 0.8960525512695312,
202
+ "std_dimension_correlation": 0.0391310064588162,
203
+ "linear_cka": 0.984375
204
+ },
205
+ "Qwen3_0.6B_layer_4_to_Qwen3_4B_layer_4": {
206
+ "mse": 0.3125,
207
+ "mean_cosine_similarity": 0.91015625,
208
+ "std_cosine_similarity": 0.1826171875,
209
+ "mean_l2_distance": 15.9375,
210
+ "std_l2_distance": 14.75,
211
+ "mean_dimension_correlation": 0.8973068237304688,
212
+ "std_dimension_correlation": 0.040051008367224694,
213
+ "linear_cka": 0.984375
214
+ },
215
+ "Qwen3_1.7B_layer_2_to_Qwen3_0.6B_layer_2": {
216
+ "mse": 0.384765625,
217
+ "mean_cosine_similarity": 0.890625,
218
+ "std_cosine_similarity": 0.1806640625,
219
+ "mean_l2_distance": 19.625,
220
+ "std_l2_distance": 13.8125,
221
+ "mean_dimension_correlation": 0.877227783203125,
222
+ "std_dimension_correlation": 0.04062615493819983,
223
+ "linear_cka": 0.96875
224
+ },
225
+ "Qwen3_1.7B_layer_2_to_Qwen3_0.6B_layer_4": {
226
+ "mse": 0.3125,
227
+ "mean_cosine_similarity": 0.9140625,
228
+ "std_cosine_similarity": 0.171875,
229
+ "mean_l2_distance": 16.0,
230
+ "std_l2_distance": 14.1875,
231
+ "mean_dimension_correlation": 0.9004119873046875,
232
+ "std_dimension_correlation": 0.037833126797664623,
233
+ "linear_cka": 0.984375
234
+ },
235
+ "Qwen3_1.7B_layer_2_to_Qwen3_1.7B_layer_4": {
236
+ "mse": 0.302734375,
237
+ "mean_cosine_similarity": 0.9140625,
238
+ "std_cosine_similarity": 0.181640625,
239
+ "mean_l2_distance": 15.5625,
240
+ "std_l2_distance": 14.875,
241
+ "mean_dimension_correlation": 0.8998611450195313,
242
+ "std_dimension_correlation": 0.03902960080299192,
243
+ "linear_cka": 0.984375
244
+ },
245
+ "Qwen3_1.7B_layer_2_to_Qwen3_4B_layer_2": {
246
+ "mse": 0.306640625,
247
+ "mean_cosine_similarity": 0.91015625,
248
+ "std_cosine_similarity": 0.1826171875,
249
+ "mean_l2_distance": 15.6875,
250
+ "std_l2_distance": 14.9375,
251
+ "mean_dimension_correlation": 0.8979522705078125,
252
+ "std_dimension_correlation": 0.04081550083822763,
253
+ "linear_cka": 0.984375
254
+ },
255
+ "Qwen3_1.7B_layer_2_to_Qwen3_4B_layer_4": {
256
+ "mse": 0.296875,
257
+ "mean_cosine_similarity": 0.91796875,
258
+ "std_cosine_similarity": 0.1689453125,
259
+ "mean_l2_distance": 15.1875,
260
+ "std_l2_distance": 14.1875,
261
+ "mean_dimension_correlation": 0.9056289672851563,
262
+ "std_dimension_correlation": 0.037629372700621964,
263
+ "linear_cka": 0.984375
264
+ },
265
+ "Qwen3_1.7B_layer_4_to_Qwen3_0.6B_layer_2": {
266
+ "mse": 0.380859375,
267
+ "mean_cosine_similarity": 0.89453125,
268
+ "std_cosine_similarity": 0.1728515625,
269
+ "mean_l2_distance": 19.5,
270
+ "std_l2_distance": 13.25,
271
+ "mean_dimension_correlation": 0.881549072265625,
272
+ "std_dimension_correlation": 0.037422879211554134,
273
+ "linear_cka": 0.96875
274
+ },
275
+ "Qwen3_1.7B_layer_4_to_Qwen3_0.6B_layer_4": {
276
+ "mse": 0.30859375,
277
+ "mean_cosine_similarity": 0.9140625,
278
+ "std_cosine_similarity": 0.171875,
279
+ "mean_l2_distance": 15.75,
280
+ "std_l2_distance": 14.1875,
281
+ "mean_dimension_correlation": 0.9015762329101562,
282
+ "std_dimension_correlation": 0.03752165553415462,
283
+ "linear_cka": 0.984375
284
+ },
285
+ "Qwen3_1.7B_layer_4_to_Qwen3_1.7B_layer_2": {
286
+ "mse": 0.302734375,
287
+ "mean_cosine_similarity": 0.9140625,
288
+ "std_cosine_similarity": 0.181640625,
289
+ "mean_l2_distance": 15.5625,
290
+ "std_l2_distance": 14.875,
291
+ "mean_dimension_correlation": 0.8998123168945312,
292
+ "std_dimension_correlation": 0.03903639037032718,
293
+ "linear_cka": 0.984375
294
+ },
295
+ "Qwen3_1.7B_layer_4_to_Qwen3_4B_layer_2": {
296
+ "mse": 0.30859375,
297
+ "mean_cosine_similarity": 0.9140625,
298
+ "std_cosine_similarity": 0.1806640625,
299
+ "mean_l2_distance": 15.8125,
300
+ "std_l2_distance": 14.75,
301
+ "mean_dimension_correlation": 0.8988906860351562,
302
+ "std_dimension_correlation": 0.038209415172214704,
303
+ "linear_cka": 0.9921875
304
+ },
305
+ "Qwen3_1.7B_layer_4_to_Qwen3_4B_layer_4": {
306
+ "mse": 0.298828125,
307
+ "mean_cosine_similarity": 0.91796875,
308
+ "std_cosine_similarity": 0.1748046875,
309
+ "mean_l2_distance": 15.3125,
310
+ "std_l2_distance": 14.4375,
311
+ "mean_dimension_correlation": 0.9034744262695312,
312
+ "std_dimension_correlation": 0.03725401269453125,
313
+ "linear_cka": 0.9921875
314
+ },
315
+ "Qwen3_4B_layer_2_to_Qwen3_0.6B_layer_2": {
316
+ "mse": 0.380859375,
317
+ "mean_cosine_similarity": 0.890625,
318
+ "std_cosine_similarity": 0.1787109375,
319
+ "mean_l2_distance": 19.625,
320
+ "std_l2_distance": 13.6875,
321
+ "mean_dimension_correlation": 0.8785064697265625,
322
+ "std_dimension_correlation": 0.0393290152752092,
323
+ "linear_cka": 0.96875
324
+ },
325
+ "Qwen3_4B_layer_2_to_Qwen3_0.6B_layer_4": {
326
+ "mse": 0.314453125,
327
+ "mean_cosine_similarity": 0.91015625,
328
+ "std_cosine_similarity": 0.18359375,
329
+ "mean_l2_distance": 16.125,
330
+ "std_l2_distance": 14.75,
331
+ "mean_dimension_correlation": 0.8960662841796875,
332
+ "std_dimension_correlation": 0.039119882279186446,
333
+ "linear_cka": 0.984375
334
+ },
335
+ "Qwen3_4B_layer_2_to_Qwen3_1.7B_layer_2": {
336
+ "mse": 0.306640625,
337
+ "mean_cosine_similarity": 0.91015625,
338
+ "std_cosine_similarity": 0.1826171875,
339
+ "mean_l2_distance": 15.6875,
340
+ "std_l2_distance": 14.9375,
341
+ "mean_dimension_correlation": 0.8979080200195313,
342
+ "std_dimension_correlation": 0.04092076296017269,
343
+ "linear_cka": 0.984375
344
+ },
345
+ "Qwen3_4B_layer_2_to_Qwen3_1.7B_layer_4": {
346
+ "mse": 0.30859375,
347
+ "mean_cosine_similarity": 0.9140625,
348
+ "std_cosine_similarity": 0.1806640625,
349
+ "mean_l2_distance": 15.8125,
350
+ "std_l2_distance": 14.75,
351
+ "mean_dimension_correlation": 0.8988739013671875,
352
+ "std_dimension_correlation": 0.03830457081641783,
353
+ "linear_cka": 0.9921875
354
+ },
355
+ "Qwen3_4B_layer_2_to_Qwen3_4B_layer_4": {
356
+ "mse": 0.29296875,
357
+ "mean_cosine_similarity": 0.9140625,
358
+ "std_cosine_similarity": 0.1845703125,
359
+ "mean_l2_distance": 15.0,
360
+ "std_l2_distance": 15.1875,
361
+ "mean_dimension_correlation": 0.9007888793945312,
362
+ "std_dimension_correlation": 0.03971286220373166,
363
+ "linear_cka": 0.9921875
364
+ },
365
+ "Qwen3_4B_layer_4_to_Qwen3_0.6B_layer_2": {
366
+ "mse": 0.37890625,
367
+ "mean_cosine_similarity": 0.89453125,
368
+ "std_cosine_similarity": 0.177734375,
369
+ "mean_l2_distance": 19.5,
370
+ "std_l2_distance": 13.5625,
371
+ "mean_dimension_correlation": 0.8801483154296875,
372
+ "std_dimension_correlation": 0.038289717180632434,
373
+ "linear_cka": 0.96875
374
+ },
375
+ "Qwen3_4B_layer_4_to_Qwen3_0.6B_layer_4": {
376
+ "mse": 0.3125,
377
+ "mean_cosine_similarity": 0.91015625,
378
+ "std_cosine_similarity": 0.1826171875,
379
+ "mean_l2_distance": 15.9375,
380
+ "std_l2_distance": 14.75,
381
+ "mean_dimension_correlation": 0.8973480224609375,
382
+ "std_dimension_correlation": 0.0400727561743654,
383
+ "linear_cka": 0.984375
384
+ },
385
+ "Qwen3_4B_layer_4_to_Qwen3_1.7B_layer_2": {
386
+ "mse": 0.296875,
387
+ "mean_cosine_similarity": 0.91796875,
388
+ "std_cosine_similarity": 0.1689453125,
389
+ "mean_l2_distance": 15.1875,
390
+ "std_l2_distance": 14.1875,
391
+ "mean_dimension_correlation": 0.9055801391601562,
392
+ "std_dimension_correlation": 0.0376429470480909,
393
+ "linear_cka": 0.984375
394
+ },
395
+ "Qwen3_4B_layer_4_to_Qwen3_1.7B_layer_4": {
396
+ "mse": 0.298828125,
397
+ "mean_cosine_similarity": 0.91796875,
398
+ "std_cosine_similarity": 0.1748046875,
399
+ "mean_l2_distance": 15.3125,
400
+ "std_l2_distance": 14.4375,
401
+ "mean_dimension_correlation": 0.9034759521484375,
402
+ "std_dimension_correlation": 0.03727643893244147,
403
+ "linear_cka": 0.9921875
404
+ },
405
+ "Qwen3_4B_layer_4_to_Qwen3_4B_layer_2": {
406
+ "mse": 0.29296875,
407
+ "mean_cosine_similarity": 0.9140625,
408
+ "std_cosine_similarity": 0.1845703125,
409
+ "mean_l2_distance": 15.0,
410
+ "std_l2_distance": 15.1875,
411
+ "mean_dimension_correlation": 0.9008010864257813,
412
+ "std_dimension_correlation": 0.03971378852358426,
413
+ "linear_cka": 0.9921875
414
+ },
415
+ "avg_mse": 0.33059895833333336,
416
+ "std_mse": 0.0360100791855751,
417
+ "avg_mean_cosine_similarity": 0.9067708333333333,
418
+ "std_mean_cosine_similarity": 0.010072437294694645,
419
+ "avg_std_cosine_similarity": 0.17786458333333333,
420
+ "std_std_cosine_similarity": 0.0048221052570680215,
421
+ "avg_mean_l2_distance": 16.933333333333334,
422
+ "std_mean_l2_distance": 1.8555191696365978,
423
+ "avg_std_l2_distance": 14.270833333333334,
424
+ "std_std_l2_distance": 0.5816941254263752,
425
+ "avg_mean_dimension_correlation": 0.8933457438151041,
426
+ "std_mean_dimension_correlation": 0.009972265018242275,
427
+ "avg_std_dimension_correlation": 0.0387960870501666,
428
+ "std_std_dimension_correlation": 0.0011301372196425136,
429
+ "avg_linear_cka": 0.9807291666666667,
430
+ "std_linear_cka": 0.008960755486502733
431
+ }
432
+ }
433
+ }
evaluation/metrics_tokens_8003584.json ADDED
@@ -0,0 +1,433 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 1,
3
+ "n_tokens": 8003584,
4
+ "global_step": 7816,
5
+ "training_metrics": {
6
+ "train/loss": 2.546875,
7
+ "train/contrastive": 2.453125,
8
+ "train/recons_loss": 0.5703125,
9
+ "train/balance_loss": 3.84375,
10
+ "train/balance_loss_contrastive": 2.84375,
11
+ "train/balance_loss_recons": 1.0078125,
12
+ "train/contrastive_std": 3.359375,
13
+ "train/recons_std": 0.0703125,
14
+ "train/contrastive_min": 0.083984375,
15
+ "train/contrastive_max": 7.125,
16
+ "train/recons_min": 0.48828125,
17
+ "train/recons_max": 0.671875,
18
+ "train/Qwen3_0.6B_layer_2": 0.671875,
19
+ "train/Qwen3_0.6B_layer_4": 0.54296875,
20
+ "train/Qwen3_1.7B_layer_2": 0.52734375,
21
+ "train/Qwen3_1.7B_layer_4": 0.640625,
22
+ "train/Qwen3_4B_layer_2": 0.48828125,
23
+ "train/Qwen3_4B_layer_4": 0.5625,
24
+ "train/contrastives": null,
25
+ "train/epoch": 1,
26
+ "train/n_tokens": 8003584,
27
+ "train/step": 7816
28
+ },
29
+ "eval_metrics": {
30
+ "global_step": 7816,
31
+ "n_tokens": 8003584,
32
+ "kl_divergence": {
33
+ "Qwen3_0.6B_layer_2_to_Qwen3_0.6B_layer_2": 6.801623344421387,
34
+ "Qwen3_0.6B_layer_2_to_Qwen3_0.6B_layer_4": 6.516300201416016,
35
+ "Qwen3_0.6B_layer_2_to_Qwen3_1.7B_layer_2": 6.550345420837402,
36
+ "Qwen3_0.6B_layer_2_to_Qwen3_1.7B_layer_4": 6.498440742492676,
37
+ "Qwen3_0.6B_layer_2_to_Qwen3_4B_layer_2": 6.312735080718994,
38
+ "Qwen3_0.6B_layer_2_to_Qwen3_4B_layer_4": 6.4551262855529785,
39
+ "Qwen3_0.6B_layer_2_to_uniform": 9.070773124694824,
40
+ "Qwen3_0.6B_layer_4_to_Qwen3_0.6B_layer_2": 2.2260851860046387,
41
+ "Qwen3_0.6B_layer_4_to_Qwen3_0.6B_layer_4": 2.1856892108917236,
42
+ "Qwen3_0.6B_layer_4_to_Qwen3_1.7B_layer_2": 2.254146099090576,
43
+ "Qwen3_0.6B_layer_4_to_Qwen3_1.7B_layer_4": 2.229769468307495,
44
+ "Qwen3_0.6B_layer_4_to_Qwen3_4B_layer_2": 2.2037243843078613,
45
+ "Qwen3_0.6B_layer_4_to_Qwen3_4B_layer_4": 2.2896828651428223,
46
+ "Qwen3_0.6B_layer_4_to_uniform": 9.070773124694824,
47
+ "Qwen3_1.7B_layer_2_to_Qwen3_0.6B_layer_2": 5.400465965270996,
48
+ "Qwen3_1.7B_layer_2_to_Qwen3_0.6B_layer_4": 5.9340386390686035,
49
+ "Qwen3_1.7B_layer_2_to_Qwen3_1.7B_layer_2": 5.794930458068848,
50
+ "Qwen3_1.7B_layer_2_to_Qwen3_1.7B_layer_4": 5.900982856750488,
51
+ "Qwen3_1.7B_layer_2_to_Qwen3_4B_layer_2": 6.348906517028809,
52
+ "Qwen3_1.7B_layer_2_to_Qwen3_4B_layer_4": 6.4423675537109375,
53
+ "Qwen3_1.7B_layer_2_to_uniform": 9.88111686706543,
54
+ "Qwen3_1.7B_layer_4_to_Qwen3_0.6B_layer_2": 2.5666661262512207,
55
+ "Qwen3_1.7B_layer_4_to_Qwen3_0.6B_layer_4": 2.535998821258545,
56
+ "Qwen3_1.7B_layer_4_to_Qwen3_1.7B_layer_2": 2.4926912784576416,
57
+ "Qwen3_1.7B_layer_4_to_Qwen3_1.7B_layer_4": 2.476747989654541,
58
+ "Qwen3_1.7B_layer_4_to_Qwen3_4B_layer_2": 2.408336877822876,
59
+ "Qwen3_1.7B_layer_4_to_Qwen3_4B_layer_4": 2.6492466926574707,
60
+ "Qwen3_1.7B_layer_4_to_uniform": 9.88111686706543,
61
+ "Qwen3_4B_layer_2_to_Qwen3_0.6B_layer_2": 2.4851021766662598,
62
+ "Qwen3_4B_layer_2_to_Qwen3_0.6B_layer_4": 2.303314685821533,
63
+ "Qwen3_4B_layer_2_to_Qwen3_1.7B_layer_2": 2.0016140937805176,
64
+ "Qwen3_4B_layer_2_to_Qwen3_1.7B_layer_4": 2.184553384780884,
65
+ "Qwen3_4B_layer_2_to_Qwen3_4B_layer_2": 2.121729850769043,
66
+ "Qwen3_4B_layer_2_to_Qwen3_4B_layer_4": 2.000966787338257,
67
+ "Qwen3_4B_layer_2_to_uniform": 10.104096412658691,
68
+ "Qwen3_4B_layer_4_to_Qwen3_0.6B_layer_2": 3.442514419555664,
69
+ "Qwen3_4B_layer_4_to_Qwen3_0.6B_layer_4": 3.1136765480041504,
70
+ "Qwen3_4B_layer_4_to_Qwen3_1.7B_layer_2": 2.937788486480713,
71
+ "Qwen3_4B_layer_4_to_Qwen3_1.7B_layer_4": 3.0111327171325684,
72
+ "Qwen3_4B_layer_4_to_Qwen3_4B_layer_2": 3.0196948051452637,
73
+ "Qwen3_4B_layer_4_to_Qwen3_4B_layer_4": 2.7799510955810547,
74
+ "Qwen3_4B_layer_4_to_uniform": 10.104096412658691
75
+ },
76
+ "mae_hidden_states": {
77
+ "Qwen3_0.6B_layer_2_to_Qwen3_0.6B_layer_2": 1.2630091905593872,
78
+ "Qwen3_0.6B_layer_4_to_Qwen3_0.6B_layer_2": 1.2069993019104004,
79
+ "Qwen3_1.7B_layer_2_to_Qwen3_0.6B_layer_2": 1.2386506795883179,
80
+ "Qwen3_1.7B_layer_4_to_Qwen3_0.6B_layer_2": 1.2585456371307373,
81
+ "Qwen3_4B_layer_2_to_Qwen3_0.6B_layer_2": 1.212580919265747,
82
+ "Qwen3_4B_layer_4_to_Qwen3_0.6B_layer_2": 1.2229262590408325,
83
+ "Qwen3_0.6B_layer_2_to_Qwen3_0.6B_layer_4": 1.0233924388885498,
84
+ "Qwen3_0.6B_layer_4_to_Qwen3_0.6B_layer_4": 0.9251772165298462,
85
+ "Qwen3_1.7B_layer_2_to_Qwen3_0.6B_layer_4": 0.9622151255607605,
86
+ "Qwen3_1.7B_layer_4_to_Qwen3_0.6B_layer_4": 0.9760592579841614,
87
+ "Qwen3_4B_layer_2_to_Qwen3_0.6B_layer_4": 0.9428697824478149,
88
+ "Qwen3_4B_layer_4_to_Qwen3_0.6B_layer_4": 0.9486178159713745,
89
+ "Qwen3_0.6B_layer_2_to_Qwen3_1.7B_layer_2": 1.0079174041748047,
90
+ "Qwen3_0.6B_layer_4_to_Qwen3_1.7B_layer_2": 0.9031265377998352,
91
+ "Qwen3_1.7B_layer_2_to_Qwen3_1.7B_layer_2": 0.9057611227035522,
92
+ "Qwen3_1.7B_layer_4_to_Qwen3_1.7B_layer_2": 0.9231780767440796,
93
+ "Qwen3_4B_layer_2_to_Qwen3_1.7B_layer_2": 0.9179145097732544,
94
+ "Qwen3_4B_layer_4_to_Qwen3_1.7B_layer_2": 0.9312993884086609,
95
+ "Qwen3_0.6B_layer_2_to_Qwen3_1.7B_layer_4": 1.2595539093017578,
96
+ "Qwen3_0.6B_layer_4_to_Qwen3_1.7B_layer_4": 1.169715166091919,
97
+ "Qwen3_1.7B_layer_2_to_Qwen3_1.7B_layer_4": 1.1957802772521973,
98
+ "Qwen3_1.7B_layer_4_to_Qwen3_1.7B_layer_4": 1.1877433061599731,
99
+ "Qwen3_4B_layer_2_to_Qwen3_1.7B_layer_4": 1.179739236831665,
100
+ "Qwen3_4B_layer_4_to_Qwen3_1.7B_layer_4": 1.1788952350616455,
101
+ "Qwen3_0.6B_layer_2_to_Qwen3_4B_layer_2": 1.0426846742630005,
102
+ "Qwen3_0.6B_layer_4_to_Qwen3_4B_layer_2": 0.9591526985168457,
103
+ "Qwen3_1.7B_layer_2_to_Qwen3_4B_layer_2": 0.9619539380073547,
104
+ "Qwen3_1.7B_layer_4_to_Qwen3_4B_layer_2": 0.9698508977890015,
105
+ "Qwen3_4B_layer_2_to_Qwen3_4B_layer_2": 0.9279893636703491,
106
+ "Qwen3_4B_layer_4_to_Qwen3_4B_layer_2": 0.9385145902633667,
107
+ "Qwen3_0.6B_layer_2_to_Qwen3_4B_layer_4": 1.1462980508804321,
108
+ "Qwen3_0.6B_layer_4_to_Qwen3_4B_layer_4": 1.0632051229476929,
109
+ "Qwen3_1.7B_layer_2_to_Qwen3_4B_layer_4": 1.0799243450164795,
110
+ "Qwen3_1.7B_layer_4_to_Qwen3_4B_layer_4": 1.0858067274093628,
111
+ "Qwen3_4B_layer_2_to_Qwen3_4B_layer_4": 1.0611412525177002,
112
+ "Qwen3_4B_layer_4_to_Qwen3_4B_layer_4": 1.0519263744354248
113
+ },
114
+ "alignment": {
115
+ "Qwen3_0.6B_layer_2_to_Qwen3_0.6B_layer_4": {
116
+ "mse": 0.388671875,
117
+ "mean_cosine_similarity": 0.89453125,
118
+ "std_cosine_similarity": 0.15625,
119
+ "mean_l2_distance": 19.875,
120
+ "std_l2_distance": 12.375,
121
+ "mean_dimension_correlation": 0.890447998046875,
122
+ "std_dimension_correlation": 0.03419125987740356,
123
+ "linear_cka": 0.96484375
124
+ },
125
+ "Qwen3_0.6B_layer_2_to_Qwen3_1.7B_layer_2": {
126
+ "mse": 0.39453125,
127
+ "mean_cosine_similarity": 0.89453125,
128
+ "std_cosine_similarity": 0.162109375,
129
+ "mean_l2_distance": 20.125,
130
+ "std_l2_distance": 12.625,
131
+ "mean_dimension_correlation": 0.8867477416992188,
132
+ "std_dimension_correlation": 0.035491939390515204,
133
+ "linear_cka": 0.96484375
134
+ },
135
+ "Qwen3_0.6B_layer_2_to_Qwen3_1.7B_layer_4": {
136
+ "mse": 0.39453125,
137
+ "mean_cosine_similarity": 0.89453125,
138
+ "std_cosine_similarity": 0.154296875,
139
+ "mean_l2_distance": 20.125,
140
+ "std_l2_distance": 12.1875,
141
+ "mean_dimension_correlation": 0.889697265625,
142
+ "std_dimension_correlation": 0.03374281347550432,
143
+ "linear_cka": 0.96484375
144
+ },
145
+ "Qwen3_0.6B_layer_2_to_Qwen3_4B_layer_2": {
146
+ "mse": 0.390625,
147
+ "mean_cosine_similarity": 0.89453125,
148
+ "std_cosine_similarity": 0.1591796875,
149
+ "mean_l2_distance": 20.0,
150
+ "std_l2_distance": 12.5,
151
+ "mean_dimension_correlation": 0.8883514404296875,
152
+ "std_dimension_correlation": 0.035164283126066044,
153
+ "linear_cka": 0.96484375
154
+ },
155
+ "Qwen3_0.6B_layer_2_to_Qwen3_4B_layer_4": {
156
+ "mse": 0.388671875,
157
+ "mean_cosine_similarity": 0.89453125,
158
+ "std_cosine_similarity": 0.1591796875,
159
+ "mean_l2_distance": 20.0,
160
+ "std_l2_distance": 12.4375,
161
+ "mean_dimension_correlation": 0.8896194458007812,
162
+ "std_dimension_correlation": 0.03421083254072828,
163
+ "linear_cka": 0.96484375
164
+ },
165
+ "Qwen3_0.6B_layer_4_to_Qwen3_0.6B_layer_2": {
166
+ "mse": 0.388671875,
167
+ "mean_cosine_similarity": 0.89453125,
168
+ "std_cosine_similarity": 0.15625,
169
+ "mean_l2_distance": 19.875,
170
+ "std_l2_distance": 12.375,
171
+ "mean_dimension_correlation": 0.8904556274414063,
172
+ "std_dimension_correlation": 0.034210556225841876,
173
+ "linear_cka": 0.96484375
174
+ },
175
+ "Qwen3_0.6B_layer_4_to_Qwen3_1.7B_layer_2": {
176
+ "mse": 0.26953125,
177
+ "mean_cosine_similarity": 0.93359375,
178
+ "std_cosine_similarity": 0.1513671875,
179
+ "mean_l2_distance": 13.8125,
180
+ "std_l2_distance": 12.625,
181
+ "mean_dimension_correlation": 0.923016357421875,
182
+ "std_dimension_correlation": 0.029236331051580345,
183
+ "linear_cka": 0.984375
184
+ },
185
+ "Qwen3_0.6B_layer_4_to_Qwen3_1.7B_layer_4": {
186
+ "mse": 0.263671875,
187
+ "mean_cosine_similarity": 0.9375,
188
+ "std_cosine_similarity": 0.150390625,
189
+ "mean_l2_distance": 13.5625,
190
+ "std_l2_distance": 12.625,
191
+ "mean_dimension_correlation": 0.9244888305664063,
192
+ "std_dimension_correlation": 0.02919239611161659,
193
+ "linear_cka": 0.984375
194
+ },
195
+ "Qwen3_0.6B_layer_4_to_Qwen3_4B_layer_2": {
196
+ "mse": 0.271484375,
197
+ "mean_cosine_similarity": 0.93359375,
198
+ "std_cosine_similarity": 0.16015625,
199
+ "mean_l2_distance": 13.875,
200
+ "std_l2_distance": 13.0625,
201
+ "mean_dimension_correlation": 0.9205032348632812,
202
+ "std_dimension_correlation": 0.029844860484086543,
203
+ "linear_cka": 0.984375
204
+ },
205
+ "Qwen3_0.6B_layer_4_to_Qwen3_4B_layer_4": {
206
+ "mse": 0.267578125,
207
+ "mean_cosine_similarity": 0.93359375,
208
+ "std_cosine_similarity": 0.158203125,
209
+ "mean_l2_distance": 13.6875,
210
+ "std_l2_distance": 13.0,
211
+ "mean_dimension_correlation": 0.9218185424804688,
212
+ "std_dimension_correlation": 0.030954341854338954,
213
+ "linear_cka": 0.984375
214
+ },
215
+ "Qwen3_1.7B_layer_2_to_Qwen3_0.6B_layer_2": {
216
+ "mse": 0.39453125,
217
+ "mean_cosine_similarity": 0.89453125,
218
+ "std_cosine_similarity": 0.162109375,
219
+ "mean_l2_distance": 20.125,
220
+ "std_l2_distance": 12.625,
221
+ "mean_dimension_correlation": 0.8868682861328125,
222
+ "std_dimension_correlation": 0.03559183889902671,
223
+ "linear_cka": 0.96484375
224
+ },
225
+ "Qwen3_1.7B_layer_2_to_Qwen3_0.6B_layer_4": {
226
+ "mse": 0.26953125,
227
+ "mean_cosine_similarity": 0.93359375,
228
+ "std_cosine_similarity": 0.1513671875,
229
+ "mean_l2_distance": 13.8125,
230
+ "std_l2_distance": 12.625,
231
+ "mean_dimension_correlation": 0.9229568481445313,
232
+ "std_dimension_correlation": 0.029229316660619842,
233
+ "linear_cka": 0.984375
234
+ },
235
+ "Qwen3_1.7B_layer_2_to_Qwen3_1.7B_layer_4": {
236
+ "mse": 0.2578125,
237
+ "mean_cosine_similarity": 0.93359375,
238
+ "std_cosine_similarity": 0.16015625,
239
+ "mean_l2_distance": 13.25,
240
+ "std_l2_distance": 13.25,
241
+ "mean_dimension_correlation": 0.923333740234375,
242
+ "std_dimension_correlation": 0.030134410337098863,
243
+ "linear_cka": 0.984375
244
+ },
245
+ "Qwen3_1.7B_layer_2_to_Qwen3_4B_layer_2": {
246
+ "mse": 0.26171875,
247
+ "mean_cosine_similarity": 0.93359375,
248
+ "std_cosine_similarity": 0.16015625,
249
+ "mean_l2_distance": 13.4375,
250
+ "std_l2_distance": 13.25,
251
+ "mean_dimension_correlation": 0.9219314575195312,
252
+ "std_dimension_correlation": 0.03136389625872561,
253
+ "linear_cka": 0.984375
254
+ },
255
+ "Qwen3_1.7B_layer_2_to_Qwen3_4B_layer_4": {
256
+ "mse": 0.25390625,
257
+ "mean_cosine_similarity": 0.9375,
258
+ "std_cosine_similarity": 0.1484375,
259
+ "mean_l2_distance": 13.0625,
260
+ "std_l2_distance": 12.625,
261
+ "mean_dimension_correlation": 0.92755126953125,
262
+ "std_dimension_correlation": 0.02898992593261031,
263
+ "linear_cka": 0.984375
264
+ },
265
+ "Qwen3_1.7B_layer_4_to_Qwen3_0.6B_layer_2": {
266
+ "mse": 0.39453125,
267
+ "mean_cosine_similarity": 0.89453125,
268
+ "std_cosine_similarity": 0.154296875,
269
+ "mean_l2_distance": 20.125,
270
+ "std_l2_distance": 12.1875,
271
+ "mean_dimension_correlation": 0.8896469116210938,
272
+ "std_dimension_correlation": 0.03377379140546021,
273
+ "linear_cka": 0.96484375
274
+ },
275
+ "Qwen3_1.7B_layer_4_to_Qwen3_0.6B_layer_4": {
276
+ "mse": 0.263671875,
277
+ "mean_cosine_similarity": 0.9375,
278
+ "std_cosine_similarity": 0.150390625,
279
+ "mean_l2_distance": 13.5625,
280
+ "std_l2_distance": 12.625,
281
+ "mean_dimension_correlation": 0.9245574951171875,
282
+ "std_dimension_correlation": 0.029099754782990043,
283
+ "linear_cka": 0.984375
284
+ },
285
+ "Qwen3_1.7B_layer_4_to_Qwen3_1.7B_layer_2": {
286
+ "mse": 0.2578125,
287
+ "mean_cosine_similarity": 0.93359375,
288
+ "std_cosine_similarity": 0.16015625,
289
+ "mean_l2_distance": 13.25,
290
+ "std_l2_distance": 13.25,
291
+ "mean_dimension_correlation": 0.9233123779296875,
292
+ "std_dimension_correlation": 0.030156395218800952,
293
+ "linear_cka": 0.984375
294
+ },
295
+ "Qwen3_1.7B_layer_4_to_Qwen3_4B_layer_2": {
296
+ "mse": 0.263671875,
297
+ "mean_cosine_similarity": 0.93359375,
298
+ "std_cosine_similarity": 0.1572265625,
299
+ "mean_l2_distance": 13.5625,
300
+ "std_l2_distance": 13.0625,
301
+ "mean_dimension_correlation": 0.9226715087890625,
302
+ "std_dimension_correlation": 0.02929662688468137,
303
+ "linear_cka": 0.984375
304
+ },
305
+ "Qwen3_1.7B_layer_4_to_Qwen3_4B_layer_4": {
306
+ "mse": 0.255859375,
307
+ "mean_cosine_similarity": 0.9375,
308
+ "std_cosine_similarity": 0.1533203125,
309
+ "mean_l2_distance": 13.125,
310
+ "std_l2_distance": 12.8125,
311
+ "mean_dimension_correlation": 0.9262313842773438,
312
+ "std_dimension_correlation": 0.029011160291782537,
313
+ "linear_cka": 0.984375
314
+ },
315
+ "Qwen3_4B_layer_2_to_Qwen3_0.6B_layer_2": {
316
+ "mse": 0.390625,
317
+ "mean_cosine_similarity": 0.89453125,
318
+ "std_cosine_similarity": 0.1591796875,
319
+ "mean_l2_distance": 20.0,
320
+ "std_l2_distance": 12.5,
321
+ "mean_dimension_correlation": 0.8883377075195312,
322
+ "std_dimension_correlation": 0.03512599620173197,
323
+ "linear_cka": 0.96484375
324
+ },
325
+ "Qwen3_4B_layer_2_to_Qwen3_0.6B_layer_4": {
326
+ "mse": 0.271484375,
327
+ "mean_cosine_similarity": 0.93359375,
328
+ "std_cosine_similarity": 0.16015625,
329
+ "mean_l2_distance": 13.875,
330
+ "std_l2_distance": 13.0625,
331
+ "mean_dimension_correlation": 0.9205001831054688,
332
+ "std_dimension_correlation": 0.02990616928878693,
333
+ "linear_cka": 0.984375
334
+ },
335
+ "Qwen3_4B_layer_2_to_Qwen3_1.7B_layer_2": {
336
+ "mse": 0.26171875,
337
+ "mean_cosine_similarity": 0.93359375,
338
+ "std_cosine_similarity": 0.16015625,
339
+ "mean_l2_distance": 13.4375,
340
+ "std_l2_distance": 13.25,
341
+ "mean_dimension_correlation": 0.922039794921875,
342
+ "std_dimension_correlation": 0.03143896607512693,
343
+ "linear_cka": 0.984375
344
+ },
345
+ "Qwen3_4B_layer_2_to_Qwen3_1.7B_layer_4": {
346
+ "mse": 0.263671875,
347
+ "mean_cosine_similarity": 0.93359375,
348
+ "std_cosine_similarity": 0.1572265625,
349
+ "mean_l2_distance": 13.5625,
350
+ "std_l2_distance": 13.0625,
351
+ "mean_dimension_correlation": 0.9226806640625,
352
+ "std_dimension_correlation": 0.029339070768690877,
353
+ "linear_cka": 0.984375
354
+ },
355
+ "Qwen3_4B_layer_2_to_Qwen3_4B_layer_4": {
356
+ "mse": 0.24609375,
357
+ "mean_cosine_similarity": 0.9375,
358
+ "std_cosine_similarity": 0.1591796875,
359
+ "mean_l2_distance": 12.625,
360
+ "std_l2_distance": 13.375,
361
+ "mean_dimension_correlation": 0.9257278442382812,
362
+ "std_dimension_correlation": 0.030489491126206747,
363
+ "linear_cka": 0.984375
364
+ },
365
+ "Qwen3_4B_layer_4_to_Qwen3_0.6B_layer_2": {
366
+ "mse": 0.388671875,
367
+ "mean_cosine_similarity": 0.89453125,
368
+ "std_cosine_similarity": 0.1591796875,
369
+ "mean_l2_distance": 20.0,
370
+ "std_l2_distance": 12.4375,
371
+ "mean_dimension_correlation": 0.8896011352539063,
372
+ "std_dimension_correlation": 0.034245117741804325,
373
+ "linear_cka": 0.96484375
374
+ },
375
+ "Qwen3_4B_layer_4_to_Qwen3_0.6B_layer_4": {
376
+ "mse": 0.267578125,
377
+ "mean_cosine_similarity": 0.93359375,
378
+ "std_cosine_similarity": 0.158203125,
379
+ "mean_l2_distance": 13.6875,
380
+ "std_l2_distance": 13.0,
381
+ "mean_dimension_correlation": 0.9218338012695313,
382
+ "std_dimension_correlation": 0.03096110466803191,
383
+ "linear_cka": 0.984375
384
+ },
385
+ "Qwen3_4B_layer_4_to_Qwen3_1.7B_layer_2": {
386
+ "mse": 0.25390625,
387
+ "mean_cosine_similarity": 0.9375,
388
+ "std_cosine_similarity": 0.1484375,
389
+ "mean_l2_distance": 13.0625,
390
+ "std_l2_distance": 12.625,
391
+ "mean_dimension_correlation": 0.9275863647460938,
392
+ "std_dimension_correlation": 0.029019101935420444,
393
+ "linear_cka": 0.984375
394
+ },
395
+ "Qwen3_4B_layer_4_to_Qwen3_1.7B_layer_4": {
396
+ "mse": 0.255859375,
397
+ "mean_cosine_similarity": 0.9375,
398
+ "std_cosine_similarity": 0.1533203125,
399
+ "mean_l2_distance": 13.125,
400
+ "std_l2_distance": 12.8125,
401
+ "mean_dimension_correlation": 0.9262100219726562,
402
+ "std_dimension_correlation": 0.029023808376502022,
403
+ "linear_cka": 0.984375
404
+ },
405
+ "Qwen3_4B_layer_4_to_Qwen3_4B_layer_2": {
406
+ "mse": 0.24609375,
407
+ "mean_cosine_similarity": 0.9375,
408
+ "std_cosine_similarity": 0.1591796875,
409
+ "mean_l2_distance": 12.625,
410
+ "std_l2_distance": 13.375,
411
+ "mean_dimension_correlation": 0.9256805419921875,
412
+ "std_dimension_correlation": 0.030472261122601613,
413
+ "linear_cka": 0.984375
414
+ },
415
+ "avg_mse": 0.3045572916666667,
416
+ "std_mse": 0.061728089131668586,
417
+ "avg_mean_cosine_similarity": 0.9216145833333333,
418
+ "std_mean_cosine_similarity": 0.01921444452676741,
419
+ "avg_std_cosine_similarity": 0.156640625,
420
+ "std_std_cosine_similarity": 0.003999537123283247,
421
+ "avg_mean_l2_distance": 15.608333333333333,
422
+ "std_mean_l2_distance": 3.137845819808374,
423
+ "avg_std_l2_distance": 12.7875,
424
+ "std_std_l2_distance": 0.34746102898982306,
425
+ "avg_mean_dimension_correlation": 0.9121468607584635,
426
+ "std_mean_dimension_correlation": 0.016489903679962933,
427
+ "avg_std_dimension_correlation": 0.03143026060381273,
428
+ "std_std_dimension_correlation": 0.0023529554482308356,
429
+ "avg_linear_cka": 0.9778645833333334,
430
+ "std_linear_cka": 0.009207119546699838
431
+ }
432
+ }
433
+ }
evaluation/metrics_tokens_9004032.json ADDED
@@ -0,0 +1,433 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 1,
3
+ "n_tokens": 9004032,
4
+ "global_step": 8793,
5
+ "training_metrics": {
6
+ "train/loss": 2.546875,
7
+ "train/contrastive": 2.453125,
8
+ "train/recons_loss": 0.56640625,
9
+ "train/balance_loss": 3.84375,
10
+ "train/balance_loss_contrastive": 2.84375,
11
+ "train/balance_loss_recons": 1.0078125,
12
+ "train/contrastive_std": 3.359375,
13
+ "train/recons_std": 0.06787109375,
14
+ "train/contrastive_min": 0.08935546875,
15
+ "train/contrastive_max": 7.125,
16
+ "train/recons_min": 0.482421875,
17
+ "train/recons_max": 0.65234375,
18
+ "train/Qwen3_0.6B_layer_2": 0.640625,
19
+ "train/Qwen3_0.6B_layer_4": 0.546875,
20
+ "train/Qwen3_1.7B_layer_2": 0.515625,
21
+ "train/Qwen3_1.7B_layer_4": 0.65234375,
22
+ "train/Qwen3_4B_layer_2": 0.482421875,
23
+ "train/Qwen3_4B_layer_4": 0.5625,
24
+ "train/contrastives": null,
25
+ "train/epoch": 1,
26
+ "train/n_tokens": 9004032,
27
+ "train/step": 8793
28
+ },
29
+ "eval_metrics": {
30
+ "global_step": 8793,
31
+ "n_tokens": 9004032,
32
+ "kl_divergence": {
33
+ "Qwen3_0.6B_layer_2_to_Qwen3_0.6B_layer_2": 6.006870746612549,
34
+ "Qwen3_0.6B_layer_2_to_Qwen3_0.6B_layer_4": 6.394875526428223,
35
+ "Qwen3_0.6B_layer_2_to_Qwen3_1.7B_layer_2": 5.912027359008789,
36
+ "Qwen3_0.6B_layer_2_to_Qwen3_1.7B_layer_4": 6.115749359130859,
37
+ "Qwen3_0.6B_layer_2_to_Qwen3_4B_layer_2": 6.151121616363525,
38
+ "Qwen3_0.6B_layer_2_to_Qwen3_4B_layer_4": 6.175347805023193,
39
+ "Qwen3_0.6B_layer_2_to_uniform": 9.070773124694824,
40
+ "Qwen3_0.6B_layer_4_to_Qwen3_0.6B_layer_2": 2.124427318572998,
41
+ "Qwen3_0.6B_layer_4_to_Qwen3_0.6B_layer_4": 2.121898651123047,
42
+ "Qwen3_0.6B_layer_4_to_Qwen3_1.7B_layer_2": 2.1416893005371094,
43
+ "Qwen3_0.6B_layer_4_to_Qwen3_1.7B_layer_4": 2.1991913318634033,
44
+ "Qwen3_0.6B_layer_4_to_Qwen3_4B_layer_2": 2.14923357963562,
45
+ "Qwen3_0.6B_layer_4_to_Qwen3_4B_layer_4": 2.216580390930176,
46
+ "Qwen3_0.6B_layer_4_to_uniform": 9.070773124694824,
47
+ "Qwen3_1.7B_layer_2_to_Qwen3_0.6B_layer_2": 5.9164533615112305,
48
+ "Qwen3_1.7B_layer_2_to_Qwen3_0.6B_layer_4": 5.773134708404541,
49
+ "Qwen3_1.7B_layer_2_to_Qwen3_1.7B_layer_2": 6.090576171875,
50
+ "Qwen3_1.7B_layer_2_to_Qwen3_1.7B_layer_4": 5.982679843902588,
51
+ "Qwen3_1.7B_layer_2_to_Qwen3_4B_layer_2": 6.148589134216309,
52
+ "Qwen3_1.7B_layer_2_to_Qwen3_4B_layer_4": 6.272560119628906,
53
+ "Qwen3_1.7B_layer_2_to_uniform": 9.88111686706543,
54
+ "Qwen3_1.7B_layer_4_to_Qwen3_0.6B_layer_2": 2.3844406604766846,
55
+ "Qwen3_1.7B_layer_4_to_Qwen3_0.6B_layer_4": 2.425341844558716,
56
+ "Qwen3_1.7B_layer_4_to_Qwen3_1.7B_layer_2": 2.334113597869873,
57
+ "Qwen3_1.7B_layer_4_to_Qwen3_1.7B_layer_4": 2.3682360649108887,
58
+ "Qwen3_1.7B_layer_4_to_Qwen3_4B_layer_2": 2.3439788818359375,
59
+ "Qwen3_1.7B_layer_4_to_Qwen3_4B_layer_4": 2.5122714042663574,
60
+ "Qwen3_1.7B_layer_4_to_uniform": 9.88111686706543,
61
+ "Qwen3_4B_layer_2_to_Qwen3_0.6B_layer_2": 2.4646520614624023,
62
+ "Qwen3_4B_layer_2_to_Qwen3_0.6B_layer_4": 2.1960129737854004,
63
+ "Qwen3_4B_layer_2_to_Qwen3_1.7B_layer_2": 1.9887456893920898,
64
+ "Qwen3_4B_layer_2_to_Qwen3_1.7B_layer_4": 2.074134111404419,
65
+ "Qwen3_4B_layer_2_to_Qwen3_4B_layer_2": 2.142500638961792,
66
+ "Qwen3_4B_layer_2_to_Qwen3_4B_layer_4": 1.9460818767547607,
67
+ "Qwen3_4B_layer_2_to_uniform": 10.104096412658691,
68
+ "Qwen3_4B_layer_4_to_Qwen3_0.6B_layer_2": 3.4264042377471924,
69
+ "Qwen3_4B_layer_4_to_Qwen3_0.6B_layer_4": 3.065612554550171,
70
+ "Qwen3_4B_layer_4_to_Qwen3_1.7B_layer_2": 2.84149169921875,
71
+ "Qwen3_4B_layer_4_to_Qwen3_1.7B_layer_4": 3.0016493797302246,
72
+ "Qwen3_4B_layer_4_to_Qwen3_4B_layer_2": 2.982909679412842,
73
+ "Qwen3_4B_layer_4_to_Qwen3_4B_layer_4": 2.7882883548736572,
74
+ "Qwen3_4B_layer_4_to_uniform": 10.104096412658691
75
+ },
76
+ "mae_hidden_states": {
77
+ "Qwen3_0.6B_layer_2_to_Qwen3_0.6B_layer_2": 1.144361138343811,
78
+ "Qwen3_0.6B_layer_4_to_Qwen3_0.6B_layer_2": 1.1407876014709473,
79
+ "Qwen3_1.7B_layer_2_to_Qwen3_0.6B_layer_2": 1.1702628135681152,
80
+ "Qwen3_1.7B_layer_4_to_Qwen3_0.6B_layer_2": 1.169557809829712,
81
+ "Qwen3_4B_layer_2_to_Qwen3_0.6B_layer_2": 1.164795160293579,
82
+ "Qwen3_4B_layer_4_to_Qwen3_0.6B_layer_2": 1.165663480758667,
83
+ "Qwen3_0.6B_layer_2_to_Qwen3_0.6B_layer_4": 0.9478356242179871,
84
+ "Qwen3_0.6B_layer_4_to_Qwen3_0.6B_layer_4": 0.9305350184440613,
85
+ "Qwen3_1.7B_layer_2_to_Qwen3_0.6B_layer_4": 0.9448918104171753,
86
+ "Qwen3_1.7B_layer_4_to_Qwen3_0.6B_layer_4": 0.9919092059135437,
87
+ "Qwen3_4B_layer_2_to_Qwen3_0.6B_layer_4": 0.9386879801750183,
88
+ "Qwen3_4B_layer_4_to_Qwen3_0.6B_layer_4": 0.9315637350082397,
89
+ "Qwen3_0.6B_layer_2_to_Qwen3_1.7B_layer_2": 0.9601666331291199,
90
+ "Qwen3_0.6B_layer_4_to_Qwen3_1.7B_layer_2": 0.8851673007011414,
91
+ "Qwen3_1.7B_layer_2_to_Qwen3_1.7B_layer_2": 0.8906123042106628,
92
+ "Qwen3_1.7B_layer_4_to_Qwen3_1.7B_layer_2": 0.8979656100273132,
93
+ "Qwen3_4B_layer_2_to_Qwen3_1.7B_layer_2": 0.8988674283027649,
94
+ "Qwen3_4B_layer_4_to_Qwen3_1.7B_layer_2": 0.900534451007843,
95
+ "Qwen3_0.6B_layer_2_to_Qwen3_1.7B_layer_4": 1.154961109161377,
96
+ "Qwen3_0.6B_layer_4_to_Qwen3_1.7B_layer_4": 1.1417714357376099,
97
+ "Qwen3_1.7B_layer_2_to_Qwen3_1.7B_layer_4": 1.147143840789795,
98
+ "Qwen3_1.7B_layer_4_to_Qwen3_1.7B_layer_4": 1.1556771993637085,
99
+ "Qwen3_4B_layer_2_to_Qwen3_1.7B_layer_4": 1.14786696434021,
100
+ "Qwen3_4B_layer_4_to_Qwen3_1.7B_layer_4": 1.148809790611267,
101
+ "Qwen3_0.6B_layer_2_to_Qwen3_4B_layer_2": 0.9560009837150574,
102
+ "Qwen3_0.6B_layer_4_to_Qwen3_4B_layer_2": 0.9207914471626282,
103
+ "Qwen3_1.7B_layer_2_to_Qwen3_4B_layer_2": 0.9233508110046387,
104
+ "Qwen3_1.7B_layer_4_to_Qwen3_4B_layer_2": 0.93439781665802,
105
+ "Qwen3_4B_layer_2_to_Qwen3_4B_layer_2": 0.894271194934845,
106
+ "Qwen3_4B_layer_4_to_Qwen3_4B_layer_2": 0.9094542264938354,
107
+ "Qwen3_0.6B_layer_2_to_Qwen3_4B_layer_4": 1.068742275238037,
108
+ "Qwen3_0.6B_layer_4_to_Qwen3_4B_layer_4": 1.0256458520889282,
109
+ "Qwen3_1.7B_layer_2_to_Qwen3_4B_layer_4": 1.0398327112197876,
110
+ "Qwen3_1.7B_layer_4_to_Qwen3_4B_layer_4": 1.056915283203125,
111
+ "Qwen3_4B_layer_2_to_Qwen3_4B_layer_4": 1.0318653583526611,
112
+ "Qwen3_4B_layer_4_to_Qwen3_4B_layer_4": 1.016662359237671
113
+ },
114
+ "alignment": {
115
+ "Qwen3_0.6B_layer_2_to_Qwen3_0.6B_layer_4": {
116
+ "mse": 0.328125,
117
+ "mean_cosine_similarity": 0.921875,
118
+ "std_cosine_similarity": 0.1474609375,
119
+ "mean_l2_distance": 16.875,
120
+ "std_l2_distance": 11.6875,
121
+ "mean_dimension_correlation": 0.911785888671875,
122
+ "std_dimension_correlation": 0.029143728520497736,
123
+ "linear_cka": 0.97265625
124
+ },
125
+ "Qwen3_0.6B_layer_2_to_Qwen3_1.7B_layer_2": {
126
+ "mse": 0.333984375,
127
+ "mean_cosine_similarity": 0.91796875,
128
+ "std_cosine_similarity": 0.1533203125,
129
+ "mean_l2_distance": 17.0,
130
+ "std_l2_distance": 12.0,
131
+ "mean_dimension_correlation": 0.908795166015625,
132
+ "std_dimension_correlation": 0.030533706065498583,
133
+ "linear_cka": 0.97265625
134
+ },
135
+ "Qwen3_0.6B_layer_2_to_Qwen3_1.7B_layer_4": {
136
+ "mse": 0.333984375,
137
+ "mean_cosine_similarity": 0.921875,
138
+ "std_cosine_similarity": 0.1455078125,
139
+ "mean_l2_distance": 17.0,
140
+ "std_l2_distance": 11.4375,
141
+ "mean_dimension_correlation": 0.9114944458007812,
142
+ "std_dimension_correlation": 0.02791911734622361,
143
+ "linear_cka": 0.97265625
144
+ },
145
+ "Qwen3_0.6B_layer_2_to_Qwen3_4B_layer_2": {
146
+ "mse": 0.333984375,
147
+ "mean_cosine_similarity": 0.91796875,
148
+ "std_cosine_similarity": 0.150390625,
149
+ "mean_l2_distance": 17.0,
150
+ "std_l2_distance": 11.8125,
151
+ "mean_dimension_correlation": 0.910113525390625,
152
+ "std_dimension_correlation": 0.029502623650495614,
153
+ "linear_cka": 0.97265625
154
+ },
155
+ "Qwen3_0.6B_layer_2_to_Qwen3_4B_layer_4": {
156
+ "mse": 0.330078125,
157
+ "mean_cosine_similarity": 0.921875,
158
+ "std_cosine_similarity": 0.1513671875,
159
+ "mean_l2_distance": 17.0,
160
+ "std_l2_distance": 11.75,
161
+ "mean_dimension_correlation": 0.910638427734375,
162
+ "std_dimension_correlation": 0.029250348976490634,
163
+ "linear_cka": 0.97265625
164
+ },
165
+ "Qwen3_0.6B_layer_4_to_Qwen3_0.6B_layer_2": {
166
+ "mse": 0.328125,
167
+ "mean_cosine_similarity": 0.921875,
168
+ "std_cosine_similarity": 0.1474609375,
169
+ "mean_l2_distance": 16.875,
170
+ "std_l2_distance": 11.6875,
171
+ "mean_dimension_correlation": 0.9118026733398438,
172
+ "std_dimension_correlation": 0.02911178290188815,
173
+ "linear_cka": 0.97265625
174
+ },
175
+ "Qwen3_0.6B_layer_4_to_Qwen3_1.7B_layer_2": {
176
+ "mse": 0.259765625,
177
+ "mean_cosine_similarity": 0.9375,
178
+ "std_cosine_similarity": 0.1484375,
179
+ "mean_l2_distance": 13.3125,
180
+ "std_l2_distance": 12.4375,
181
+ "mean_dimension_correlation": 0.9281707763671875,
182
+ "std_dimension_correlation": 0.027613594267907524,
183
+ "linear_cka": 0.984375
184
+ },
185
+ "Qwen3_0.6B_layer_4_to_Qwen3_1.7B_layer_4": {
186
+ "mse": 0.25390625,
187
+ "mean_cosine_similarity": 0.94140625,
188
+ "std_cosine_similarity": 0.1474609375,
189
+ "mean_l2_distance": 13.0625,
190
+ "std_l2_distance": 12.375,
191
+ "mean_dimension_correlation": 0.929296875,
192
+ "std_dimension_correlation": 0.027428457660098507,
193
+ "linear_cka": 0.984375
194
+ },
195
+ "Qwen3_0.6B_layer_4_to_Qwen3_4B_layer_2": {
196
+ "mse": 0.26171875,
197
+ "mean_cosine_similarity": 0.9375,
198
+ "std_cosine_similarity": 0.1572265625,
199
+ "mean_l2_distance": 13.375,
200
+ "std_l2_distance": 12.8125,
201
+ "mean_dimension_correlation": 0.9258895874023437,
202
+ "std_dimension_correlation": 0.02807925327640673,
203
+ "linear_cka": 0.984375
204
+ },
205
+ "Qwen3_0.6B_layer_4_to_Qwen3_4B_layer_4": {
206
+ "mse": 0.2578125,
207
+ "mean_cosine_similarity": 0.9375,
208
+ "std_cosine_similarity": 0.15625,
209
+ "mean_l2_distance": 13.1875,
210
+ "std_l2_distance": 12.875,
211
+ "mean_dimension_correlation": 0.9261764526367188,
212
+ "std_dimension_correlation": 0.029308957111961503,
213
+ "linear_cka": 0.984375
214
+ },
215
+ "Qwen3_1.7B_layer_2_to_Qwen3_0.6B_layer_2": {
216
+ "mse": 0.333984375,
217
+ "mean_cosine_similarity": 0.91796875,
218
+ "std_cosine_similarity": 0.1533203125,
219
+ "mean_l2_distance": 17.0,
220
+ "std_l2_distance": 12.0,
221
+ "mean_dimension_correlation": 0.9088623046875,
222
+ "std_dimension_correlation": 0.030521200956836466,
223
+ "linear_cka": 0.97265625
224
+ },
225
+ "Qwen3_1.7B_layer_2_to_Qwen3_0.6B_layer_4": {
226
+ "mse": 0.259765625,
227
+ "mean_cosine_similarity": 0.9375,
228
+ "std_cosine_similarity": 0.1484375,
229
+ "mean_l2_distance": 13.3125,
230
+ "std_l2_distance": 12.4375,
231
+ "mean_dimension_correlation": 0.9282363891601563,
232
+ "std_dimension_correlation": 0.02761614875613791,
233
+ "linear_cka": 0.984375
234
+ },
235
+ "Qwen3_1.7B_layer_2_to_Qwen3_1.7B_layer_4": {
236
+ "mse": 0.248046875,
237
+ "mean_cosine_similarity": 0.9375,
238
+ "std_cosine_similarity": 0.15625,
239
+ "mean_l2_distance": 12.6875,
240
+ "std_l2_distance": 13.0,
241
+ "mean_dimension_correlation": 0.9286865234375,
242
+ "std_dimension_correlation": 0.028394499325967187,
243
+ "linear_cka": 1.0
244
+ },
245
+ "Qwen3_1.7B_layer_2_to_Qwen3_4B_layer_2": {
246
+ "mse": 0.251953125,
247
+ "mean_cosine_similarity": 0.9375,
248
+ "std_cosine_similarity": 0.1572265625,
249
+ "mean_l2_distance": 12.875,
250
+ "std_l2_distance": 13.0,
251
+ "mean_dimension_correlation": 0.9273910522460938,
252
+ "std_dimension_correlation": 0.029792982191153054,
253
+ "linear_cka": 1.0
254
+ },
255
+ "Qwen3_1.7B_layer_2_to_Qwen3_4B_layer_4": {
256
+ "mse": 0.2451171875,
257
+ "mean_cosine_similarity": 0.94140625,
258
+ "std_cosine_similarity": 0.146484375,
259
+ "mean_l2_distance": 12.5625,
260
+ "std_l2_distance": 12.4375,
261
+ "mean_dimension_correlation": 0.9318832397460938,
262
+ "std_dimension_correlation": 0.02779797256144542,
263
+ "linear_cka": 0.984375
264
+ },
265
+ "Qwen3_1.7B_layer_4_to_Qwen3_0.6B_layer_2": {
266
+ "mse": 0.333984375,
267
+ "mean_cosine_similarity": 0.921875,
268
+ "std_cosine_similarity": 0.1455078125,
269
+ "mean_l2_distance": 17.0,
270
+ "std_l2_distance": 11.4375,
271
+ "mean_dimension_correlation": 0.9115066528320312,
272
+ "std_dimension_correlation": 0.02783942438110558,
273
+ "linear_cka": 0.97265625
274
+ },
275
+ "Qwen3_1.7B_layer_4_to_Qwen3_0.6B_layer_4": {
276
+ "mse": 0.25390625,
277
+ "mean_cosine_similarity": 0.94140625,
278
+ "std_cosine_similarity": 0.1474609375,
279
+ "mean_l2_distance": 13.0625,
280
+ "std_l2_distance": 12.375,
281
+ "mean_dimension_correlation": 0.929364013671875,
282
+ "std_dimension_correlation": 0.027418246966595963,
283
+ "linear_cka": 0.984375
284
+ },
285
+ "Qwen3_1.7B_layer_4_to_Qwen3_1.7B_layer_2": {
286
+ "mse": 0.248046875,
287
+ "mean_cosine_similarity": 0.9375,
288
+ "std_cosine_similarity": 0.15625,
289
+ "mean_l2_distance": 12.6875,
290
+ "std_l2_distance": 13.0,
291
+ "mean_dimension_correlation": 0.9286041259765625,
292
+ "std_dimension_correlation": 0.028414978282929146,
293
+ "linear_cka": 1.0
294
+ },
295
+ "Qwen3_1.7B_layer_4_to_Qwen3_4B_layer_2": {
296
+ "mse": 0.25390625,
297
+ "mean_cosine_similarity": 0.9375,
298
+ "std_cosine_similarity": 0.154296875,
299
+ "mean_l2_distance": 13.0,
300
+ "std_l2_distance": 12.8125,
301
+ "mean_dimension_correlation": 0.927911376953125,
302
+ "std_dimension_correlation": 0.027630500633115,
303
+ "linear_cka": 0.984375
304
+ },
305
+ "Qwen3_1.7B_layer_4_to_Qwen3_4B_layer_4": {
306
+ "mse": 0.24609375,
307
+ "mean_cosine_similarity": 0.94140625,
308
+ "std_cosine_similarity": 0.1513671875,
309
+ "mean_l2_distance": 12.6875,
310
+ "std_l2_distance": 12.625,
311
+ "mean_dimension_correlation": 0.9304595947265625,
312
+ "std_dimension_correlation": 0.027486156310205404,
313
+ "linear_cka": 0.984375
314
+ },
315
+ "Qwen3_4B_layer_2_to_Qwen3_0.6B_layer_2": {
316
+ "mse": 0.333984375,
317
+ "mean_cosine_similarity": 0.91796875,
318
+ "std_cosine_similarity": 0.150390625,
319
+ "mean_l2_distance": 17.0,
320
+ "std_l2_distance": 11.8125,
321
+ "mean_dimension_correlation": 0.910162353515625,
322
+ "std_dimension_correlation": 0.029493359043523553,
323
+ "linear_cka": 0.97265625
324
+ },
325
+ "Qwen3_4B_layer_2_to_Qwen3_0.6B_layer_4": {
326
+ "mse": 0.26171875,
327
+ "mean_cosine_similarity": 0.9375,
328
+ "std_cosine_similarity": 0.1572265625,
329
+ "mean_l2_distance": 13.375,
330
+ "std_l2_distance": 12.8125,
331
+ "mean_dimension_correlation": 0.9259017944335938,
332
+ "std_dimension_correlation": 0.02807177297712858,
333
+ "linear_cka": 0.984375
334
+ },
335
+ "Qwen3_4B_layer_2_to_Qwen3_1.7B_layer_2": {
336
+ "mse": 0.251953125,
337
+ "mean_cosine_similarity": 0.9375,
338
+ "std_cosine_similarity": 0.1572265625,
339
+ "mean_l2_distance": 12.875,
340
+ "std_l2_distance": 13.0,
341
+ "mean_dimension_correlation": 0.9274307250976562,
342
+ "std_dimension_correlation": 0.029833198285711512,
343
+ "linear_cka": 1.0
344
+ },
345
+ "Qwen3_4B_layer_2_to_Qwen3_1.7B_layer_4": {
346
+ "mse": 0.25390625,
347
+ "mean_cosine_similarity": 0.9375,
348
+ "std_cosine_similarity": 0.154296875,
349
+ "mean_l2_distance": 13.0,
350
+ "std_l2_distance": 12.8125,
351
+ "mean_dimension_correlation": 0.9279266357421875,
352
+ "std_dimension_correlation": 0.027642045164903522,
353
+ "linear_cka": 0.984375
354
+ },
355
+ "Qwen3_4B_layer_2_to_Qwen3_4B_layer_4": {
356
+ "mse": 0.236328125,
357
+ "mean_cosine_similarity": 0.94140625,
358
+ "std_cosine_similarity": 0.158203125,
359
+ "mean_l2_distance": 12.125,
360
+ "std_l2_distance": 13.25,
361
+ "mean_dimension_correlation": 0.93009033203125,
362
+ "std_dimension_correlation": 0.029206890514525005,
363
+ "linear_cka": 1.0
364
+ },
365
+ "Qwen3_4B_layer_4_to_Qwen3_0.6B_layer_2": {
366
+ "mse": 0.330078125,
367
+ "mean_cosine_similarity": 0.921875,
368
+ "std_cosine_similarity": 0.1513671875,
369
+ "mean_l2_distance": 17.0,
370
+ "std_l2_distance": 11.75,
371
+ "mean_dimension_correlation": 0.910687255859375,
372
+ "std_dimension_correlation": 0.02925704219094372,
373
+ "linear_cka": 0.97265625
374
+ },
375
+ "Qwen3_4B_layer_4_to_Qwen3_0.6B_layer_4": {
376
+ "mse": 0.2578125,
377
+ "mean_cosine_similarity": 0.9375,
378
+ "std_cosine_similarity": 0.15625,
379
+ "mean_l2_distance": 13.1875,
380
+ "std_l2_distance": 12.875,
381
+ "mean_dimension_correlation": 0.9263031005859375,
382
+ "std_dimension_correlation": 0.0292820509917565,
383
+ "linear_cka": 0.984375
384
+ },
385
+ "Qwen3_4B_layer_4_to_Qwen3_1.7B_layer_2": {
386
+ "mse": 0.2451171875,
387
+ "mean_cosine_similarity": 0.94140625,
388
+ "std_cosine_similarity": 0.146484375,
389
+ "mean_l2_distance": 12.5625,
390
+ "std_l2_distance": 12.4375,
391
+ "mean_dimension_correlation": 0.9319442749023438,
392
+ "std_dimension_correlation": 0.02774844077379742,
393
+ "linear_cka": 0.984375
394
+ },
395
+ "Qwen3_4B_layer_4_to_Qwen3_1.7B_layer_4": {
396
+ "mse": 0.24609375,
397
+ "mean_cosine_similarity": 0.94140625,
398
+ "std_cosine_similarity": 0.1513671875,
399
+ "mean_l2_distance": 12.6875,
400
+ "std_l2_distance": 12.625,
401
+ "mean_dimension_correlation": 0.9305908203125,
402
+ "std_dimension_correlation": 0.027491914687628318,
403
+ "linear_cka": 0.984375
404
+ },
405
+ "Qwen3_4B_layer_4_to_Qwen3_4B_layer_2": {
406
+ "mse": 0.236328125,
407
+ "mean_cosine_similarity": 0.94140625,
408
+ "std_cosine_similarity": 0.158203125,
409
+ "mean_l2_distance": 12.125,
410
+ "std_l2_distance": 13.25,
411
+ "mean_dimension_correlation": 0.9300765991210938,
412
+ "std_dimension_correlation": 0.029167152542476576,
413
+ "linear_cka": 1.0
414
+ },
415
+ "avg_mse": 0.2783203125,
416
+ "std_mse": 0.03847375333442295,
417
+ "avg_mean_cosine_similarity": 0.9328125,
418
+ "std_mean_cosine_similarity": 0.009043622580304863,
419
+ "avg_std_cosine_similarity": 0.15208333333333332,
420
+ "std_std_cosine_similarity": 0.004211187924165684,
421
+ "avg_mean_l2_distance": 14.25,
422
+ "std_mean_l2_distance": 1.9497596005661826,
423
+ "avg_std_l2_distance": 12.420833333333333,
424
+ "std_std_l2_distance": 0.5426913385054979,
425
+ "avg_mean_dimension_correlation": 0.9226060994466144,
426
+ "std_mean_dimension_correlation": 0.00864781898546122,
427
+ "avg_std_dimension_correlation": 0.02859991824384515,
428
+ "std_std_dimension_correlation": 0.00095096984819575,
429
+ "avg_linear_cka": 0.98359375,
430
+ "std_linear_cka": 0.009695057535930357
431
+ }
432
+ }
433
+ }
evaluation/plots/kl_divergences_step_1954_tokens_2000896.png ADDED

Git LFS Details

  • SHA256: b8f67d95679d86ba324a4b58ffacd48c5d06ac3d5d6eca08118f9cab94a1dfd9
  • Pointer size: 131 Bytes
  • Size of remote file: 235 kB
evaluation/plots/kl_divergences_step_2931_tokens_3001344.png ADDED

Git LFS Details

  • SHA256: d4ca3eab66613eb7a817141a20691f4b32da5dfb541f232d33413c9e7e74c220
  • Pointer size: 131 Bytes
  • Size of remote file: 238 kB
evaluation/plots/kl_divergences_step_3908_tokens_4001792.png ADDED

Git LFS Details

  • SHA256: 1141b4ca96036a91c792fec742937a08c739c06fd19b026652a38fe9cf13de3d
  • Pointer size: 131 Bytes
  • Size of remote file: 235 kB
evaluation/plots/kl_divergences_step_4885_tokens_5002240.png ADDED

Git LFS Details

  • SHA256: 4f0778e4b9154fd8781bb93e28d6e59bfb6435173d070d159c048ecb93ddce8e
  • Pointer size: 131 Bytes
  • Size of remote file: 240 kB
evaluation/plots/kl_divergences_step_5862_tokens_6002688.png ADDED

Git LFS Details

  • SHA256: b083f84b8512adeb143c89fbe99d82af4fff3b657c057c1e181614b348bb882b
  • Pointer size: 131 Bytes
  • Size of remote file: 235 kB
evaluation/plots/kl_divergences_step_6839_tokens_7003136.png ADDED

Git LFS Details

  • SHA256: 8c4e010c7c58b3343707bdaf7872940a0c9312d569097fd10d063b5f402141e9
  • Pointer size: 131 Bytes
  • Size of remote file: 235 kB
evaluation/plots/kl_divergences_step_7816_tokens_8003584.png ADDED

Git LFS Details

  • SHA256: e161e6327685f14b68b10751ff86ecce66e54036e4582892248252edd165d663
  • Pointer size: 131 Bytes
  • Size of remote file: 240 kB
evaluation/plots/kl_divergences_step_8793_tokens_9004032.png ADDED

Git LFS Details

  • SHA256: 0d43380ee414f89f6802367778738982c01218b65e3be87ef00599812b0f07d6
  • Pointer size: 131 Bytes
  • Size of remote file: 234 kB
evaluation/plots/kl_divergences_step_977_tokens_1000448.png ADDED

Git LFS Details

  • SHA256: 4df14aecd7771a6c797aa87644f99b38be105b78e024e97fc98eca35749d627b
  • Pointer size: 131 Bytes
  • Size of remote file: 234 kB
evaluation/plots/mae_hidden_states_step_1954_tokens_2000896.png ADDED

Git LFS Details

  • SHA256: dad3676b329a55bbc2c6dd1b5bfd9574ce16bce5256bd99c84226d12f05a1973
  • Pointer size: 131 Bytes
  • Size of remote file: 217 kB
evaluation/plots/mae_hidden_states_step_2931_tokens_3001344.png ADDED

Git LFS Details

  • SHA256: c730ecfeb50fe133289a7440a5b07f7c566cd147865f4c3ad1bd16d08a3659e7
  • Pointer size: 131 Bytes
  • Size of remote file: 211 kB
evaluation/plots/mae_hidden_states_step_3908_tokens_4001792.png ADDED

Git LFS Details

  • SHA256: 0359b254db6807d070df0d993109bed270def46653da75dbb2a6e0e22d716206
  • Pointer size: 131 Bytes
  • Size of remote file: 209 kB
evaluation/plots/mae_hidden_states_step_4885_tokens_5002240.png ADDED

Git LFS Details

  • SHA256: 5b1b27af9fe9786f239dafa160d19a168233302a5346f92c1060a3f3a2194ac8
  • Pointer size: 131 Bytes
  • Size of remote file: 218 kB
evaluation/plots/mae_hidden_states_step_5862_tokens_6002688.png ADDED

Git LFS Details

  • SHA256: 979a478df2a47a18b58733ea9b83643b2b4a5e26756da271b45074852a2c05f7
  • Pointer size: 131 Bytes
  • Size of remote file: 238 kB
evaluation/plots/mae_hidden_states_step_6839_tokens_7003136.png ADDED

Git LFS Details

  • SHA256: 598dcc665c026d3a83ac7b3e4d23d12376c290b88a9712a3d51dec9289ddedf5
  • Pointer size: 131 Bytes
  • Size of remote file: 250 kB
evaluation/plots/mae_hidden_states_step_7816_tokens_8003584.png ADDED

Git LFS Details

  • SHA256: 8b0f573a1b745e9d8f7b609c32702b1860ffe467fcf491f550149feef6449d13
  • Pointer size: 131 Bytes
  • Size of remote file: 243 kB
evaluation/plots/mae_hidden_states_step_8793_tokens_9004032.png ADDED

Git LFS Details

  • SHA256: b7cd6ff801a60da3d9cffda3a4aa3993c2b76d0ce0fc2d1e601793d36761272f
  • Pointer size: 131 Bytes
  • Size of remote file: 246 kB
evaluation/plots/mae_hidden_states_step_977_tokens_1000448.png ADDED

Git LFS Details

  • SHA256: 808689ca0ffa0a1c473f01ae09c03e3b66756c9c6730805c16c4f15431b87544
  • Pointer size: 131 Bytes
  • Size of remote file: 217 kB
evaluation/plots/multi_dataset_alignment_step_1954_tokens_2000896.png ADDED

Git LFS Details

  • SHA256: c4a11f913d88be35cba8994c6f1a24b74be9af6b97a42ae4d01f608dcb7b1373
  • Pointer size: 132 Bytes
  • Size of remote file: 1.32 MB
evaluation/plots/multi_dataset_alignment_step_2931_tokens_3001344.png ADDED

Git LFS Details

  • SHA256: f097e937a2f55df03254326bb1c6446c4a83622a43e57e297e3232101ce9429c
  • Pointer size: 132 Bytes
  • Size of remote file: 1.62 MB
evaluation/plots/multi_dataset_alignment_step_3908_tokens_4001792.png ADDED

Git LFS Details

  • SHA256: e1f63608777e9c7c186f021143bdac2673cc70156b187ab7a6582abd4b99982b
  • Pointer size: 132 Bytes
  • Size of remote file: 2.03 MB
evaluation/plots/multi_dataset_alignment_step_4885_tokens_5002240.png ADDED

Git LFS Details

  • SHA256: fa9c4a291e4fa12c4a8e70aa5974adfbee714a113ce0959f4c79f9d7808b8d1c
  • Pointer size: 132 Bytes
  • Size of remote file: 2.23 MB
evaluation/plots/multi_dataset_alignment_step_5862_tokens_6002688.png ADDED

Git LFS Details

  • SHA256: ee66187ce6798b32cf1d4e784ad0fa4000bac446f46b7e9a332e7846ed10778f
  • Pointer size: 132 Bytes
  • Size of remote file: 2.4 MB
evaluation/plots/multi_dataset_alignment_step_6839_tokens_7003136.png ADDED

Git LFS Details

  • SHA256: f7c0ed120b3db4846aecbef6c547b215e7223ec5d11094ac3819767ee6167ed9
  • Pointer size: 132 Bytes
  • Size of remote file: 2.37 MB
evaluation/plots/multi_dataset_alignment_step_7816_tokens_8003584.png ADDED

Git LFS Details

  • SHA256: 2520d0285e93edd11b39ff50ed9ea2c2485f7b1fe0edb9feaf521faffb0b2791
  • Pointer size: 132 Bytes
  • Size of remote file: 2.32 MB
evaluation/plots/multi_dataset_alignment_step_8793_tokens_9004032.png ADDED

Git LFS Details

  • SHA256: 7388e23cbe52e590e7342708b1e2845bb6c903c9ef290d05b0e7062b00577668
  • Pointer size: 132 Bytes
  • Size of remote file: 2.3 MB
evaluation/plots/multi_dataset_alignment_step_977_tokens_1000448.png ADDED

Git LFS Details

  • SHA256: 0a15c21d23ddefbd44ad4c6615167a2bf81dc349eb20ba1c0dfb2c7663ec2c94
  • Pointer size: 132 Bytes
  • Size of remote file: 1.45 MB
metrics_tokens_1000448.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "train/loss": 0.3984375,
3
+ "train/contrastive": 0.310546875,
4
+ "train/recons_loss": 0.68359375,
5
+ "train/balance_loss": 2.015625,
6
+ "train/balance_loss_contrastive": 1.0,
7
+ "train/balance_loss_recons": 1.015625,
8
+ "train/contrastive_std": 0.015625,
9
+ "train/recons_std": 0.0888671875,
10
+ "train/contrastive_min": 0.27734375,
11
+ "train/contrastive_max": 0.3359375,
12
+ "train/recons_min": 0.58203125,
13
+ "train/recons_max": 0.84375,
14
+ "train/Qwen3_0.6B_layer_2": 0.58203125,
15
+ "train/Qwen3_0.6B_layer_4": 0.69140625,
16
+ "train/Qwen3_1.7B_layer_2": 0.65234375,
17
+ "train/Qwen3_1.7B_layer_4": 0.84375,
18
+ "train/Qwen3_4B_layer_2": 0.63671875,
19
+ "train/Qwen3_4B_layer_4": 0.6953125,
20
+ "train/contrastives": null,
21
+ "train/epoch": 1,
22
+ "train/n_tokens": 1000448,
23
+ "train/step": 977
24
+ }
metrics_tokens_2000896.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "train/loss": 2.609375,
3
+ "train/contrastive": 2.5,
4
+ "train/recons_loss": 0.71484375,
5
+ "train/balance_loss": 3.671875,
6
+ "train/balance_loss_contrastive": 2.625,
7
+ "train/balance_loss_recons": 1.046875,
8
+ "train/contrastive_std": 3.234375,
9
+ "train/recons_std": 0.171875,
10
+ "train/contrastive_min": 0.224609375,
11
+ "train/contrastive_max": 7.0,
12
+ "train/recons_min": 0.5859375,
13
+ "train/recons_max": 1.046875,
14
+ "train/Qwen3_0.6B_layer_2": 1.046875,
15
+ "train/Qwen3_0.6B_layer_4": 0.63671875,
16
+ "train/Qwen3_1.7B_layer_2": 0.62109375,
17
+ "train/Qwen3_1.7B_layer_4": 0.75,
18
+ "train/Qwen3_4B_layer_2": 0.5859375,
19
+ "train/Qwen3_4B_layer_4": 0.65234375,
20
+ "train/contrastives": null,
21
+ "train/epoch": 1,
22
+ "train/n_tokens": 2000896,
23
+ "train/step": 1954
24
+ }
metrics_tokens_3001344.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "train/loss": 2.515625,
3
+ "train/contrastive": 2.421875,
4
+ "train/recons_loss": 0.671875,
5
+ "train/balance_loss": 3.75,
6
+ "train/balance_loss_contrastive": 2.71875,
7
+ "train/balance_loss_recons": 1.0390625,
8
+ "train/contrastive_std": 3.25,
9
+ "train/recons_std": 0.138671875,
10
+ "train/contrastive_min": 0.146484375,
11
+ "train/contrastive_max": 6.9375,
12
+ "train/recons_min": 0.56640625,
13
+ "train/recons_max": 0.9375,
14
+ "train/Qwen3_0.6B_layer_2": 0.9375,
15
+ "train/Qwen3_0.6B_layer_4": 0.59765625,
16
+ "train/Qwen3_1.7B_layer_2": 0.59375,
17
+ "train/Qwen3_1.7B_layer_4": 0.703125,
18
+ "train/Qwen3_4B_layer_2": 0.56640625,
19
+ "train/Qwen3_4B_layer_4": 0.6328125,
20
+ "train/contrastives": null,
21
+ "train/epoch": 1,
22
+ "train/n_tokens": 3001344,
23
+ "train/step": 2931
24
+ }